# self_example/pytorch_example/RUL/otherIdea/adaHDctLSTM/train.py
# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/15 16:32
@Usage :
@Desc :
'''
import torch.nn as nn
import torch
import torch.optim as optim
import os
import argparse
import datetime
import numpy as np
import random
from tqdm import tqdm
from RUL.otherIdea.adaHDctLSTM.utils import utils
from RUL.otherIdea.adaHDctLSTM.model import AdaRNN
import RUL.otherIdea.adaHDctLSTM.dataset_vibrate.data_process as data_process
from RUL.baseModel.loss.ffd import fft_mse, dct_mse
from RUL.otherIdea.adaHDctLSTM.test import test
import matplotlib.pyplot as plt
import time
from RUL.baseModel.CommonFunction import IsStopTraining
'''
Hyperparameter settings:
'''
# Data preparation
is_norm = False
is_single = True
tdc_loss_type = 'cos'
num_domain = 2  # number of source/target domains to split the data into
# RNN-related
hidden_num = 10  # number of LSTM cells (also used as the sequence length)
feature = 2  # dimensionality of a single point
predict_num = 200  # number of points to predict
batch_size = 32
model_name = "AdaRNN"
hidden_list = [64, 64]  # hidden dimension of each RNN layer
### bottleneck_list: (dim, is_BatchNorm, is_ReLu, drop_out)
bottleneck = [(64, False, False, 0), (64, True, True, 0.5)]
# bottleneck = [(128, False, True, 0),
#               (64, True, True, 0.2),
#               (32, True, True, 0.2),
#               (16, False, False, 0)]
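# Presumably each tuple builds one bottleneck layer, e.g. (64, True, True, 0.5)
# would correspond to Linear(..., 64) + BatchNorm1d + ReLU + Dropout(0.5); the
# exact construction depends on AdaRNN's implementation (an assumption, not
# verified here).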
# Training-related
pre_epoch = 40
epochs = 1000
transfer_loss_type = 'cos'  # 'cos' has tested best so far
dw = 0.5
lr = 0.01
fft_dw = 0.1  # overall this seems to make the Hard Predict more accurate; 0.01 and 0.1 work well
m_dw = 0.8
len_win = 0  # window size; what a value of 0 does is unclear for now
seed = 250
# Initialization
out_dir = './outputs'
output_path = out_dir + '/{0}_tdc({1})_transfer({2})_domain{3}_dw{4}_fdw{5}_lr{6}_Norm{7}'.format(
    model_name, tdc_loss_type, transfer_loss_type, num_domain, dw, fft_dw, lr, is_norm)
save_model_name = 'parameters/seed{0}_hidden{1}_feature{2}_predict{3}_dimList{4}'.format(
    seed, hidden_num, feature, predict_num, str(hidden_list[0]) + "-" + str(hidden_list[1]))
save_fig_name = 'fig/seed{0}_hidden{1}_feature{2}_predict{3}_dimList{4}'.format(
    seed, hidden_num, feature, predict_num, str(hidden_list[0]) + "-" + str(hidden_list[1]))
utils.dir_exist(output_path)
utils.dir_exist(os.path.join(output_path, 'parameters'))
utils.dir_exist(os.path.join(output_path, 'fig'))
log_file = os.path.join(output_path, 'run.log')
def pprint(*text):
    # print with a UTC+8 timestamp; renamed local from `time` to `timestamp`
    # to avoid shadowing the imported `time` module
    timestamp = '[' + str(datetime.datetime.utcnow() +
                          datetime.timedelta(hours=8))[:19] + '] -'
    print(timestamp, *text, flush=True)
    if log_file is None:
        return
    with open(log_file, 'a') as f:
        print(timestamp, *text, flush=True, file=f)

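# Example output (timestamp illustrative):
#   pprint('train model...')  ->  [2023-11-15 16:32:00] - train model...
# The same line is also appended to run.log under output_path.
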
def get_model(name='AdaRNN'):
    # Empirically, with the bottleneck the output tends to oscillate more,
    # whereas without it only the overall trend remains.
    # bottleneck_list: (dim, is_BatchNorm, is_ReLu, drop_out)
    return AdaRNN(use_bottleneck=True, bottleneck_list=bottleneck, n_input=feature, n_hiddens=hidden_list,
                  n_output=1, dropout=0.0, model_type=name, len_seq=hidden_num,
                  trans_loss=transfer_loss_type, mdw=m_dw)

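# Training proceeds in two phases (see train_AdaRNN below):
#   1) epoch < pre_epoch: forward_pre_train with a fixed transfer-loss weighting;
#   2) epoch >= pre_epoch: forward_Boosting, which accumulates per-layer/per-step
#      distribution distances (dist_mat) and re-weights them each epoch via
#      update_weight_Boosting, following the AdaRNN boosting scheme.
# Per-step total loss: MSE + dw * transfer loss + fft_dw * DCT-domain MSE (dct_mse).
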
def train_AdaRNN(model, optimizer, train_loader_list, epoch, dist_old=None, weight_mat=None):
    model.train()
    criterion = nn.MSELoss()
    criterion_1 = nn.L1Loss()
    loss_all = []
    loss_1_all = []
    dist_mat = torch.zeros(len(hidden_list), hidden_num)
    len_loader = np.inf
    for loader in train_loader_list:
        if len(loader) < len_loader:
            len_loader = len(loader)
    for data_all in tqdm(zip(*train_loader_list), total=len_loader):
        optimizer.zero_grad()
        # If the batch sizes of the training domains are not aligned, the loss cannot
        # be computed. To avoid discarding all samples, the smallest domain batch size
        # is used as this step's batch size.
        min_batch_size = 10000
        for data in data_all:
            min_batch_size = min(min_batch_size, data[0].shape[0])
        list_feat = []
        list_label = []
        for data in data_all:
            feature, label, label_reg = data[0].float(), data[1].float(), data[2].float()
            list_feat.append(feature[:min_batch_size])
            list_label.append(label_reg[:min_batch_size])
        index = get_index(len(data_all) - 1)
        loss_mse = torch.zeros(1)
        loss_fft = torch.zeros(1)
        loss_transfer = torch.zeros(1)
        total_loss_l1 = torch.zeros(1)
        for i in range(len(index)):
            feature_s = list_feat[index[i][0]]
            feature_t = list_feat[index[i][1]]
            label_reg_s = list_label[index[i][0]]
            label_reg_t = list_label[index[i][1]]
            # concatenate along the batch dimension
            feature_all = torch.cat((feature_s, feature_t), 0)
            if epoch < pre_epoch:
                pred_all, each_loss_transfer, out_weight_list = model.forward_pre_train(
                    feature_all, len_win=len_win)
            else:
                pred_all, each_loss_transfer, dist, weight_mat = model.forward_Boosting(
                    feature_all, weight_mat)
                dist_mat = dist_mat + dist
            pred_s = pred_all[0:feature_s.size(0)]
            pred_t = pred_all[feature_s.size(0):]
            loss_s = criterion(pred_s, label_reg_s)
            loss_t = criterion(pred_t, label_reg_t)
            lossf_s = dct_mse(pred_s, label_reg_s)
            lossf_t = dct_mse(pred_t, label_reg_t)
            loss_l1 = criterion_1(pred_s, label_reg_s)
            loss_mse += loss_s + loss_t
            loss_fft += (lossf_s + lossf_t) * fft_dw
            loss_transfer += dw * each_loss_transfer
            total_loss_l1 += loss_l1
        total_loss = loss_mse + loss_transfer + loss_fft
        loss_all.append([total_loss.item(), loss_mse.item(), loss_transfer.item(), loss_fft.item()])
        loss_1_all.append(total_loss_l1.item())
        # backpropagation
        total_loss.backward()
        # clip each gradient element to [-3, 3]
        # (note: clip_grad_value_ clips element values, not the gradient norm)
        torch.nn.utils.clip_grad_value_(model.parameters(), 3.)
        optimizer.step()
    loss = np.array(loss_all).mean(axis=0)
    loss_l1 = np.array(loss_1_all).mean()
    if epoch >= pre_epoch:
        if epoch > pre_epoch:
            weight_mat = model.update_weight_Boosting(
                weight_mat, dist_old, dist_mat)
        return loss, loss_l1, weight_mat, dist_mat
    else:
        weight_mat = transform_type(out_weight_list)
        return loss, loss_l1, weight_mat, None

def get_index(num_domain=2):
    index = []
    for i in range(num_domain):
        for j in range(i + 1, num_domain + 1):
            index.append((i, j))
    return index

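# e.g. get_index(2) -> [(0, 1), (0, 2), (1, 2)]: every pair (i, j) with i < j
# among three domains. train_AdaRNN calls it with len(data_all) - 1, so `index`
# covers all source/target loader pairings.
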
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def val_epoch(model, val_loader, device, scheduler):
    model.eval()
    val_loss = 0
    val_loss_1 = 0
    val_loss_r = 0
    criterion = nn.MSELoss()
    criterion_1 = nn.L1Loss()
    with torch.no_grad():
        for val_batch_idx, (val_data, val_continue, val_label) in enumerate(val_loader):
            # NOTE: the model itself is never moved to `device` in this script,
            # so in practice everything runs on CPU; this .to(device) would
            # mismatch the model's device on a CUDA machine.
            val_data, val_label = val_data.to(device), val_label.to(device)
            val_predict_data = model.predict(val_data)
            loss = criterion(val_predict_data, val_label)
            loss_r = torch.sqrt(loss)
            loss_1 = criterion_1(val_predict_data, val_label)
            val_loss += loss.item()
            val_loss_1 += loss_1.item()
            val_loss_r += loss_r.item()
    # ReduceLROnPlateau steps on the summed validation loss
    scheduler.step(val_loss)
    loss = val_loss / len(val_loader)
    loss_1 = val_loss_1 / len(val_loader)
    loss_r = val_loss_r / len(val_loader)
    return loss, loss_1, loss_r

def test_epoch_inference(model, test_loader, prefix='Test'):
    model.eval()
    total_loss = 0
    total_loss_1 = 0
    total_loss_r = 0
    criterion = nn.MSELoss()
    criterion_1 = nn.L1Loss()
    i = 0
    for feature, label, label_reg in tqdm(test_loader, desc=prefix, total=len(test_loader)):
        feature, label_reg = feature.float(), label_reg.float()
        with torch.no_grad():
            pred = model.predict(feature)
            loss = criterion(pred, label_reg)
            loss_r = torch.sqrt(loss)
            loss_1 = criterion_1(pred, label_reg)
        total_loss += loss.item()
        total_loss_1 += loss_1.item()
        total_loss_r += loss_r.item()
        if i == 0:
            label_list = label_reg.cpu().numpy()
            predict_list = pred.cpu().numpy()
        else:
            label_list = np.hstack((label_list, label_reg.cpu().numpy()))
            predict_list = np.hstack((predict_list, pred.cpu().numpy()))
        i = i + 1
    loss = total_loss / len(test_loader)
    loss_1 = total_loss_1 / len(test_loader)
    loss_r = total_loss_r / len(test_loader)
    return loss, loss_1, loss_r, label_list, predict_list

def inference(model, data_loader):
    loss, loss_1, loss_r, label_list, predict_list = test_epoch_inference(
        model, data_loader, prefix='Inference')
    return loss, loss_1, loss_r, label_list, predict_list

def inference_all(output_path, model, model_path, loaders):
    pprint('inference...')
    loss_list = []
    loss_l1_list = []
    loss_r_list = []
    model.load_state_dict(torch.load(model_path))
    for loader in loaders:
        loss, loss_1, loss_r, label_list, predict_list = inference(
            model, loader)
        loss_list.append(loss)
        loss_l1_list.append(loss_1)
        loss_r_list.append(loss_r)
    return loss_list, loss_l1_list, loss_r_list

def transform_type(init_weight):
    weight = torch.ones(len(hidden_list), hidden_num)
    for i in range(weight.shape[0]):
        for j in range(weight.shape[1]):
            weight[i, j] = init_weight[i][j].item()
    return weight

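# transform_type copies the nested list of scalar tensors returned by
# forward_pre_train (out_weight_list) into a fresh (len(hidden_list), hidden_num)
# tensor, which train_AdaRNN then passes back to forward_Boosting as weight_mat
# in subsequent epochs.
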
def loadData():
    train_loader_list, valid_loader, test_loader = data_process.load_weather_data_multi_domain(
        hidden_num=hidden_num, feature=feature, predict_num=predict_num, is_norm=is_norm, batch_size=batch_size,
        number_domain=num_domain, mode='tdc', dis_type=tdc_loss_type
    )
    return train_loader_list, valid_loader, test_loader

def train(model, train_loader_list, valid_loader, lr_patience, early_stop_patience, device):
    optimizer = optim.SGD(model.parameters(), lr=lr)
    scheduler_model = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5,
                                                                 patience=lr_patience)
    best_score = np.inf
    best_epoch, stop_round = 0, 0
    weight_mat, dist_mat = None, None
    train_loss_list = []
    val_loss_list = []
    best_save_path = None
    for epoch in range(epochs):
        epoch_start_time = time.time()
        train_loss, loss1, weight_mat, dist_mat = train_AdaRNN(
            model, optimizer, train_loader_list, epoch, dist_mat, weight_mat)
        val_loss, val_loss_l1, val_loss_r = val_epoch(
            model, valid_loader, device=device, scheduler=scheduler_model)
        pprint(
            "[{:03d}/{:03d}] {:2.2f} sec(s) train_total_loss: {:3.9f} | train_mse_loss: {:3.9f} | train_fft_loss: {:3.9f} | train_transfer_loss: {:3.9f} "
            " | val_loss: {:3.9f} | Learning rate : {:3.6f}".format(
                epoch + 1, epochs, time.time() - epoch_start_time,
                train_loss[0], train_loss[1], train_loss[3], train_loss[2],
                val_loss,
                optimizer.state_dict()['param_groups'][0]['lr']))
        if len(val_loss_list) == 0 or val_loss < min(val_loss_list):
            pprint("Saved new best model")
            best_epoch = epoch
            best_score = val_loss
            # save the model parameters, deleting the previous best checkpoint first
            if best_save_path is not None:
                utils.delete_file(os.path.join(output_path, best_save_path))
            best_save_path = save_model_name + "_epoch" + str(epoch) + \
                             "_trainLoss" + str('%.5f' % train_loss[1]) + \
                             "_valLoss" + str('%.5f' % val_loss) + ".pkl"
            print(os.path.join(output_path, best_save_path))
            torch.save(model.state_dict(),
                       os.path.join(output_path, best_save_path))
        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)
        if IsStopTraining(history_loss=val_loss_list, patience=early_stop_patience):
            pprint("Loss has not improved for {0} epochs; stopping training".format(early_stop_patience))
            break
    pprint('best val score:', best_score, '@', best_epoch)
    return best_save_path

def main_transfer():
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    pprint('create DataLoaders...')
    train_loader_list, valid_loader, test_loader = loadData()
    pprint('create AdaRNN model...')
    model = get_model(model_name)
    num_model = count_parameters(model)
    print(model)
    print('#model params:', num_model)
    pprint('train model...')
    best_save_path = train(model=model, train_loader_list=train_loader_list, valid_loader=valid_loader,
                           lr_patience=20, early_stop_patience=50, device=device)
    end = time.time()
    print("Training took {:3.2f}s".format(end - begin))
    pprint('evaluating model...')
    loaders = train_loader_list[0], valid_loader, test_loader
    loss_list, loss_l1_list, loss_r_list = inference_all(output_path, model, os.path.join(
        output_path, best_save_path), loaders)
    pprint('MSE: train %.6f, valid %.6f, test %.6f' %
           (loss_list[0], loss_list[1], loss_list[2]))
    pprint('L1: train %.6f, valid %.6f, test %.6f' %
           (loss_l1_list[0], loss_l1_list[1], loss_l1_list[2]))
    pprint('RMSE: train %.6f, valid %.6f, test %.6f' %
           (loss_r_list[0], loss_r_list[1], loss_r_list[2]))
    pprint('Finished.')
    # load the best checkpoint and run the final test
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.load_state_dict(torch.load(os.path.join(output_path, best_save_path), map_location=device))
    test(hidden_num=hidden_num, feature=feature, predict_num=predict_num, batch_size=batch_size, model=model,
         is_single=is_single, is_norm=is_norm, save_fig_name=os.path.join(output_path, save_fig_name))

def after_test(save_name):
    model = get_model(model_name)
    # load the network weights
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.load_state_dict(torch.load(save_name, map_location=device))
    test(hidden_num=hidden_num, feature=feature, predict_num=predict_num, batch_size=batch_size, model=model,
         is_single=is_single, is_norm=is_norm)

def get_args():
    parser = argparse.ArgumentParser()
    # model
    parser.add_argument('--model_name', default='AdaRNN')
    parser.add_argument('--d_feat', type=int, default=feature)
    parser.add_argument('--hidden_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--class_num', type=int, default=1)
    parser.add_argument('--pre_epoch', type=int, default=40)  # 20, 30, 50
    # training
    parser.add_argument('--n_epochs', type=int, default=200)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--early_stop', type=int, default=40)
    parser.add_argument('--smooth_steps', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=36)
    parser.add_argument('--dw', type=float, default=0.5)  # 0.01, 0.05, 5.0
    parser.add_argument('--loss_type', type=str, default='cos')
    parser.add_argument('--data_mode', type=str, default='tdc')
    parser.add_argument('--num_domain', type=int, default=2)
    parser.add_argument('--len_seq', type=int, default=hidden_num)
    # other
    parser.add_argument('--seed', type=int, default=10)
    # raw string so the backslashes in the Windows path are kept literal
    parser.add_argument('--data_path', default=r"E:\self_example\pytorch_example\RUL\otherIdea/adaRNN\dataset/")
    parser.add_argument('--outdir', default='./outputs')
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--log_file', type=str, default='run.log')
    parser.add_argument('--gpu_id', type=int, default=0)
    parser.add_argument('--len_win', type=int, default=0)
    args = parser.parse_args()
    return args

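# Note: get_args is defined but never called in this script; the module-level
# hyperparameters at the top of the file are what actually drive training.
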
if __name__ == '__main__':
    begin = time.time()
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    # train and test
    main_transfer()
    '''Post-hoc testing:'''
    # after_test(save_name="E:\self_example\pytorch_example\RUL\otherIdea/adaHDctEmdLSTM\outputs\AdaRNN_tdc(cos)_transfer(cos)_domain2_dw0.5_fdw0.1_lr0.01_NormFalse\parameters\seed450_hidden10_feature2_predict50_dimList64-64_epoch34_trainLoss0.00751_valLoss0.01469.pkl")