# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date   : 2023/11/15 16:32
@Usage  :
@Desc   :
'''
import torch.nn as nn
import torch
import torch.optim as optim
import os
import argparse
import datetime
import numpy as np
import random
from tqdm import tqdm
from RUL.otherIdea.adaNHDctLSTM.utils import utils
from RUL.otherIdea.adaNHDctLSTM.model import AdaRNN
import RUL.otherIdea.adaNHDctLSTM.dataset_vibrate.data_process as data_process
from RUL.baseModel.loss.ffd import fft_mse, dct_mse
from RUL.otherIdea.adaNHDctLSTM.test import test
import matplotlib.pyplot as plt
import time
from RUL.baseModel.CommonFunction import IsStopTraining

'''
Hyperparameter settings:
'''
# Data preparation
is_norm = True
is_single = True
tdc_loss_type = 'cos'
num_domain = 2  # number of source/target domains the series is split into

# RNN-related
hidden_num = 10    # number of LSTM cells (sequence length)
feature = 20       # dimensionality of a single point
predict_num = 200  # number of points to predict
batch_size = 32
model_name = "AdaRNN"
hidden_list = [64, 64]  # hidden dimension of each RNN layer
# bottleneck_list entries: (dim, is_BatchNorm, is_ReLU, drop_out)
bottleneck = [(64, False, False, 0),
              (64, True, True, 0.5)]
# bottleneck = [(128, False, True, 0),
#               (64, True, True, 0.2),
#               (32, True, True, 0.2),
#               (16, False, False, 0)]

# Training-related
pre_epoch = 40
epochs = 1000
transfer_loss_type = 'cos'  # 'cos' has performed best in tests so far
dw = 0.5       # weight of the transfer loss
lr = 0.01
fft_dw = 0.01  # weight of the fft/dct loss; seems to sharpen the hard-to-predict parts, 0.01 and 0.1 work well
mdw = 0.0      # weight of the handcrafted features; 0.3 and 0.5 work well overall
len_win = 0    # window size of 0; its effect is unclear for now
seed = 5

# Initialization
out_dir = './outputs'
output_path = out_dir + '/{0}_tdc({1})_transfer({2})_domain{3}_dw{4}_fdw{5}_lr{6}_Norm{7}'.format(
    model_name, tdc_loss_type, transfer_loss_type, num_domain, dw, fft_dw, lr, is_norm)
save_model_name = 'parameters/seed{0}_hidden{1}_feature{2}_predict{3}_dimList{4}'.format(
    seed, hidden_num, feature, predict_num, str(hidden_list[0]) + "-" + str(hidden_list[1]))
save_fig_name = 'fig/seed{0}_hidden{1}_feature{2}_predict{3}_dimList{4}'.format(
    seed, hidden_num, feature, predict_num, str(hidden_list[0]) + "-" + str(hidden_list[1]))
utils.dir_exist(output_path)
utils.dir_exist(os.path.join(output_path, 'parameters'))
utils.dir_exist(os.path.join(output_path, 'fig'))
log_file = os.path.join(output_path, 'run.log')


def pprint(*text):
    # print with a UTC+8 timestamp (time_str avoids shadowing the `time` module)
    time_str = '[' + str(datetime.datetime.utcnow() + datetime.timedelta(hours=8))[:19] + '] -'
    print(time_str, *text, flush=True)
    if log_file is None:
        return
    with open(log_file, 'a') as f:
        print(time_str, *text, flush=True, file=f)


def get_model(name='AdaRNN'):
    # Observed in tests: with a bottleneck the prediction oscillates more,
    # without it the output only follows the overall trend
    # bottleneck_list entries: (dim, is_BatchNorm, is_ReLU, drop_out)
    return AdaRNN(use_bottleneck=True, bottleneck_list=bottleneck, n_input=feature,
                  n_hiddens=hidden_list, n_output=1, dropout=0.0, model_type=name,
                  len_seq=hidden_num, trans_loss=transfer_loss_type, mdw=mdw)
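

# For intuition only: a minimal sketch of how a bottleneck spec tuple
# (dim, is_BatchNorm, is_ReLU, drop_out) from `bottleneck` above could expand
# into layers. The actual construction happens inside AdaRNN and may differ;
# this helper is illustrative and is not called by the training code.
def _bottleneck_from_spec(in_dim, spec):
    layers = []
    for dim, use_bn, use_relu, p_drop in spec:
        layers.append(nn.Linear(in_dim, dim))   # project to the configured width
        if use_bn:
            layers.append(nn.BatchNorm1d(dim))  # optional batch normalization
        if use_relu:
            layers.append(nn.ReLU())            # optional non-linearity
        if p_drop > 0:
            layers.append(nn.Dropout(p_drop))   # optional dropout
        in_dim = dim
    return nn.Sequential(*layers)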


def train_AdaRNN(model, optimizer, train_loader_list, epoch, dist_old=None, weight_mat=None):
    model.train()
    criterion = nn.MSELoss()
    criterion_1 = nn.L1Loss()
    loss_all = []
    loss_1_all = []
    dist_mat = torch.zeros(len(hidden_list), hidden_num)
    # iterate for as many batches as the shortest loader provides
    len_loader = np.inf
    for loader in train_loader_list:
        if len(loader) < len_loader:
            len_loader = len(loader)
    for data_all in tqdm(zip(*train_loader_list), total=len_loader):
        optimizer.zero_grad()
        # If the batch sizes of the domains are not aligned, the pairwise losses
        # cannot be computed. Instead of dropping the whole step, truncate every
        # domain batch to the smallest batch size of this round.
        min_batch_size = 10000
        for data in data_all:
            min_batch_size = min(min_batch_size, data[0].shape[0])
        list_feat = []
        list_label = []
        for data in data_all:
            feat, label, label_reg = data[0].float(), data[1].float(), data[2].float()
            list_feat.append(feat[:min_batch_size])
            list_label.append(label_reg[:min_batch_size])
        index = get_index(len(data_all) - 1)
        loss_mse = torch.zeros(1)
        loss_fft = torch.zeros(1)
        loss_transfer = torch.zeros(1)
        total_loss_l1 = torch.zeros(1)
        for i in range(len(index)):
            feature_s = list_feat[index[i][0]]
            feature_t = list_feat[index[i][1]]
            label_reg_s = list_label[index[i][0]]
            label_reg_t = list_label[index[i][1]]
            # concatenate source and target along the batch dimension
            feature_all = torch.cat((feature_s, feature_t), 0)
            if epoch < pre_epoch:
                pred_all, each_loss_transfer, out_weight_list = model.forward_pre_train(
                    feature_all, len_win=len_win)
            else:
                pred_all, each_loss_transfer, dist, weight_mat = model.forward_Boosting(
                    feature_all, weight_mat)
                dist_mat = dist_mat + dist
            pred_s = pred_all[0:feature_s.size(0)]
            pred_t = pred_all[feature_s.size(0):]
            loss_s = criterion(pred_s, label_reg_s)
            loss_t = criterion(pred_t, label_reg_t)
            lossf_s = dct_mse(pred_s, label_reg_s)
            lossf_t = dct_mse(pred_t, label_reg_t)
            loss_l1 = criterion_1(pred_s, label_reg_s)
            loss_mse += loss_s + loss_t
            loss_fft += (lossf_s + lossf_t) * fft_dw
            loss_transfer += dw * each_loss_transfer
            total_loss_l1 += loss_l1
        total_loss = loss_mse + loss_transfer + loss_fft
        loss_all.append([total_loss.item(), loss_mse.item(), loss_transfer.item(), loss_fft.item()])
        loss_1_all.append(total_loss_l1.item())
        # backpropagation
        total_loss.backward()
        # gradient clipping: clamp each gradient value to [-3, 3]
        torch.nn.utils.clip_grad_value_(model.parameters(), 3.)
        optimizer.step()
    loss = np.array(loss_all).mean(axis=0)
    loss_l1 = np.array(loss_1_all).mean()
    if epoch >= pre_epoch:
        if epoch > pre_epoch:
            weight_mat = model.update_weight_Boosting(weight_mat, dist_old, dist_mat)
        return loss, loss_l1, weight_mat, dist_mat
    else:
        weight_mat = transform_type(out_weight_list)
        return loss, loss_l1, weight_mat, None


def get_index(num_domain=2):
    # all pairs (i, j) with i < j over num_domain + 1 domain loaders
    index = []
    for i in range(num_domain):
        for j in range(i + 1, num_domain + 1):
            index.append((i, j))
    return index


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def val_epoch(model, val_loader, device, scheduler):
    model.eval()
    val_loss = 0
    val_loss_1 = 0
    val_loss_r = 0
    criterion = nn.MSELoss()
    criterion_1 = nn.L1Loss()
    with torch.no_grad():
        for val_batch_idx, (val_data, val_continue, val_label) in enumerate(val_loader):
            val_data, val_label = val_data.to(device), val_label.to(device)
            val_predict_data = model.predict(val_data)
            loss = criterion(val_predict_data, val_label)
            loss_r = torch.sqrt(loss)
            loss_1 = criterion_1(val_predict_data, val_label)
            val_loss += loss.item()
            val_loss_1 += loss_1.item()
            val_loss_r += loss_r.item()
    scheduler.step(val_loss)
    loss = val_loss / len(val_loader)
    loss_1 = val_loss_1 / len(val_loader)
    loss_r = val_loss_r / len(val_loader)
    return loss, loss_1, loss_r


def test_epoch_inference(model, test_loader, prefix='Test'):
    model.eval()
    total_loss = 0
    total_loss_1 = 0
    total_loss_r = 0
    criterion = nn.MSELoss()
    criterion_1 = nn.L1Loss()
    i = 0
    for feature, label, label_reg in tqdm(test_loader, desc=prefix, total=len(test_loader)):
        feature, label_reg = feature.float(), label_reg.float()
        with torch.no_grad():
            pred = model.predict(feature)
        loss = criterion(pred, label_reg)
        loss_r = torch.sqrt(loss)
        loss_1 = criterion_1(pred, label_reg)
        total_loss += loss.item()
        total_loss_1 += loss_1.item()
        total_loss_r += loss_r.item()
        if i == 0:
            label_list = label_reg.cpu().numpy()
            predict_list = pred.cpu().numpy()
        else:
            label_list = np.hstack((label_list, label_reg.cpu().numpy()))
            predict_list = np.hstack((predict_list, pred.cpu().numpy()))
        i = i + 1
    loss = total_loss / len(test_loader)
    loss_1 = total_loss_1 / len(test_loader)
    loss_r = total_loss_r / len(test_loader)
    return loss, loss_1, loss_r, label_list, predict_list
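

# Sanity check for the source/target pairing used in train_AdaRNN: with
# num_domain = 2 loaders the call is get_index(len(data_all) - 1) = get_index(1),
# which yields a single source/target pair; three loaders would give all three
# pairs. Illustration of the pairing scheme only.
assert get_index(1) == [(0, 1)]
assert get_index(2) == [(0, 1), (0, 2), (1, 2)]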


def inference(model, data_loader):
    loss, loss_1, loss_r, label_list, predict_list = test_epoch_inference(
        model, data_loader, prefix='Inference')
    return loss, loss_1, loss_r, label_list, predict_list


def inference_all(output_path, model, model_path, loaders):
    pprint('inference...')
    loss_list = []
    loss_l1_list = []
    loss_r_list = []
    model.load_state_dict(torch.load(model_path))
    for loader in loaders:
        loss, loss_1, loss_r, label_list, predict_list = inference(model, loader)
        loss_list.append(loss)
        loss_l1_list.append(loss_1)
        loss_r_list.append(loss_r)
    return loss_list, loss_l1_list, loss_r_list


def transform_type(init_weight):
    weight = torch.ones(len(hidden_list), hidden_num)
    for i in range(weight.shape[0]):
        for j in range(weight.shape[1]):
            weight[i, j] = init_weight[i][j].item()
    return weight


def loadData():
    train_loader_list, valid_loader, test_loader = data_process.load_weather_data_multi_domain(
        hidden_num=hidden_num,
        feature=feature,
        predict_num=predict_num,
        is_norm=is_norm,
        batch_size=batch_size,
        number_domain=num_domain,
        mode='tdc',
        dis_type=tdc_loss_type)
    return train_loader_list, valid_loader, test_loader


def train(model, train_loader_list, valid_loader, lr_patience, early_stop_patience, device):
    optimizer = optim.SGD(model.parameters(), lr=lr)
    scheduler_model = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=lr_patience)
    best_score = np.inf
    best_epoch, stop_round = 0, 0
    weight_mat, dist_mat = None, None
    train_loss_list = []
    val_loss_list = []
    best_save_path = None
    for epoch in range(epochs):
        epoch_start_time = time.time()
        train_loss, loss1, weight_mat, dist_mat = train_AdaRNN(
            model, optimizer, train_loader_list, epoch, dist_mat, weight_mat)
        val_loss, val_loss_l1, val_loss_r = val_epoch(
            model, valid_loader, device=device, scheduler=scheduler_model)
        pprint(
            "[{:03d}/{:03d}] {:2.2f} sec(s) train_total_loss: {:3.9f} | train_mse_loss: {:3.9f}"
            " | train_fft_loss: {:3.9f} | train_transfer_loss: {:3.9f}"
            " | val_loss: {:3.9f} | Learning rate : {:3.6f}".format(
                epoch + 1, epochs, time.time() - epoch_start_time,
                train_loss[0], train_loss[1], train_loss[3], train_loss[2],
                val_loss, optimizer.state_dict()['param_groups'][0]['lr']))
        if len(val_loss_list) == 0 or val_loss < min(val_loss_list):
            pprint("Saved new best model")
            best_epoch = epoch
            best_score = val_loss
            # save the model parameters, replacing the previous best checkpoint
            if best_save_path is not None:
                utils.delete_file(os.path.join(output_path, best_save_path))
            best_save_path = save_model_name + "_epoch" + str(epoch) + \
                "_trainLoss" + str('%.5f' % train_loss[1]) + \
                "_valLoss" + str('%.5f' % val_loss) + ".pkl"
            print(os.path.join(output_path, best_save_path))
            torch.save(model.state_dict(), os.path.join(output_path, best_save_path))
        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)
        if IsStopTraining(history_loss=val_loss_list, patience=early_stop_patience):
            pprint("Loss has not improved for {0} epochs, stopping training".format(early_stop_patience))
            break
    pprint('best val score:', best_score, '@', best_epoch)
    return best_save_path
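

# Illustration only: a rough sketch of an MSE computed in the DCT domain,
# approximating what the imported dct_mse (RUL.baseModel.loss.ffd) used in
# train_AdaRNN may compute; the real implementation can differ, e.g. in
# normalization. Builds an explicit DCT-II basis to stay in pure torch.
def _dct_mse_sketch(pred, target):
    n = pred.shape[-1]
    idx = torch.arange(n, dtype=torch.float32)
    # DCT-II basis: basis[k, t] = cos(pi / n * k * (t + 0.5))
    basis = torch.cos(np.pi / n * idx[:, None] * (idx[None, :] + 0.5))
    # penalize the difference coefficient-by-coefficient in the frequency domain
    return torch.mean((pred @ basis.t() - target @ basis.t()) ** 2)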


# Save the checkpoint only when the combined train + validation loss reaches a new global best
def trainForTotal(model, train_loader_list, valid_loader, lr_patience, early_stop_patience, device):
    optimizer = optim.SGD(model.parameters(), lr=lr)
    scheduler_model = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=lr_patience)
    best_score = np.inf
    best_epoch, stop_round = 0, 0
    weight_mat, dist_mat = None, None
    train_loss_list = []
    val_loss_list = []
    best_save_path = None
    for epoch in range(epochs):
        epoch_start_time = time.time()
        train_loss, loss1, weight_mat, dist_mat = train_AdaRNN(
            model, optimizer, train_loader_list, epoch, dist_mat, weight_mat)
        val_loss, val_loss_l1, val_loss_r = val_epoch(
            model, valid_loader, device=device, scheduler=scheduler_model)
        pprint(
            "[{:03d}/{:03d}] {:2.2f} sec(s) train_total_loss: {:3.9f} | train_mse_loss: {:3.9f}"
            " | train_fft_loss: {:3.9f} | train_transfer_loss: {:3.9f}"
            " | val_loss: {:3.9f} | Learning rate : {:3.6f}".format(
                epoch + 1, epochs, time.time() - epoch_start_time,
                train_loss[0], train_loss[1], train_loss[3], train_loss[2],
                val_loss, optimizer.state_dict()['param_groups'][0]['lr']))
        totalLoss = train_loss[1] + val_loss
        if len(val_loss_list) == 0 or totalLoss < min(val_loss_list):
            pprint(f"Saved new best model, loss: {totalLoss}")
            best_epoch = epoch
            best_score = val_loss
            # save the model parameters, replacing the previous best checkpoint
            if best_save_path is not None:
                utils.delete_file(os.path.join(output_path, best_save_path))
            best_save_path = save_model_name + "_epoch" + str(epoch) + \
                "_trainLoss" + str('%.5f' % train_loss[1]) + \
                "_valLoss" + str('%.5f' % val_loss) + ".pkl"
            print(os.path.join(output_path, best_save_path))
            torch.save(model.state_dict(), os.path.join(output_path, best_save_path))
        train_loss_list.append(train_loss)
        val_loss_list.append(totalLoss)  # early stopping tracks the combined loss here
        if IsStopTraining(history_loss=val_loss_list, patience=early_stop_patience):
            pprint("Loss has not improved for {0} epochs, stopping training".format(early_stop_patience))
            break
    pprint('best val score:', best_score, '@', best_epoch)
    return best_save_path


def main_transfer():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    pprint('create DataLoaders...')
    train_loader_list, valid_loader, test_loader = loadData()
    pprint('create AdaRNN model...')
    model = get_model(model_name)
    num_model = count_parameters(model)
    print(model)
    print('#model params:', num_model)
    pprint('train model...')
    best_save_path = train(model=model, train_loader_list=train_loader_list, valid_loader=valid_loader,
                           lr_patience=20, early_stop_patience=50, device=device)
    # best_save_path = trainForTotal(model=model, train_loader_list=train_loader_list, valid_loader=valid_loader,
    #                                lr_patience=20, early_stop_patience=50, device=device)
    end = time.time()
    print("Training time: {:3.2f}s".format(end - begin))
    pprint('evaluating model...')
    loaders = train_loader_list[0], valid_loader, test_loader
    loss_list, loss_l1_list, loss_r_list = inference_all(
        output_path, model, os.path.join(output_path, best_save_path), loaders)
    pprint('MSE: train %.6f, valid %.6f, test %.6f' % (loss_list[0], loss_list[1], loss_list[2]))
    pprint('L1: train %.6f, valid %.6f, test %.6f' % (loss_l1_list[0], loss_l1_list[1], loss_l1_list[2]))
    pprint('RMSE: train %.6f, valid %.6f, test %.6f' % (loss_r_list[0], loss_r_list[1], loss_r_list[2]))
    pprint('Finished.')
    # reload the best checkpoint and run the final test
    model.load_state_dict(torch.load(os.path.join(output_path, best_save_path), map_location=device))
    test(hidden_num=hidden_num, feature=feature, predict_num=predict_num, batch_size=batch_size,
         model=model, is_single=is_single, is_norm=is_norm,
         save_fig_name=os.path.join(output_path, save_fig_name))


def after_test(save_name):
    model = get_model(model_name)
    # load the saved network weights
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.load_state_dict(torch.load(save_name, map_location=device))
    test(hidden_num=hidden_num, feature=feature, predict_num=predict_num, batch_size=batch_size,
         model=model, is_single=is_single, is_norm=is_norm)
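

# Illustration only: a plausible minimal version of the IsStopTraining helper
# imported from RUL.baseModel.CommonFunction (an assumption, the real helper
# may differ): stop once the best loss so far lies more than `patience`
# epochs in the past.
def _is_stop_training_sketch(history_loss, patience):
    if len(history_loss) <= patience:
        return False
    # keep training only while the minimum appears among the last `patience` entries
    return min(history_loss) not in history_loss[-patience:]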


def get_args():
    parser = argparse.ArgumentParser()
    # model
    parser.add_argument('--model_name', default='AdaRNN')
    parser.add_argument('--d_feat', type=int, default=feature)
    parser.add_argument('--hidden_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--class_num', type=int, default=1)
    parser.add_argument('--pre_epoch', type=int, default=40)  # 20, 30, 50
    # training
    parser.add_argument('--n_epochs', type=int, default=200)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--early_stop', type=int, default=40)
    parser.add_argument('--smooth_steps', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=36)
    parser.add_argument('--dw', type=float, default=0.5)  # 0.01, 0.05, 5.0
    parser.add_argument('--loss_type', type=str, default='cos')
    parser.add_argument('--data_mode', type=str, default='tdc')
    parser.add_argument('--num_domain', type=int, default=2)
    parser.add_argument('--len_seq', type=int, default=hidden_num)
    # other
    parser.add_argument('--seed', type=int, default=10)
    parser.add_argument('--data_path', default=r"E:\self_example\pytorch_example\RUL\otherIdea/adaRNN\dataset/")
    parser.add_argument('--outdir', default='./outputs')
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--log_file', type=str, default='run.log')
    parser.add_argument('--gpu_id', type=int, default=0)
    parser.add_argument('--len_win', type=int, default=0)
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    begin = time.time()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # fix the random seeds for reproducibility
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    # train and test
    main_transfer()
    '''post-hoc testing'''
    # after_test(save_name=r"E:\self_example\pytorch_example\RUL\otherIdea/adaNHDctLSTM\outputs\AdaRNN_tdc(cos)_transfer(cos)_domain2_dw0.5_fdw0.1_lr0.01_NormFalse\parameters\seed250_hidden10_feature2_predict50_dimList64-64_epoch18_trainLoss0.03446_valLoss0.01136.pkl")
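
# Note: get_args() is defined but never called above; the run is driven by the
# module-level constants instead. A hypothetical way to wire the CLI in would be
# to parse the arguments at startup and override those constants, e.g.:
#   args = get_args()
#   lr, dw, seed = args.lr, args.dw, args.seed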