# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date   : 2023/11/15 16:32
@Usage  :
@Desc   : Train and evaluate AdaRNN for RUL prediction with TDC domain splitting.
'''
import torch.nn as nn
import torch
import torch.optim as optim

import os
import argparse
import datetime
import numpy as np
import random
from tqdm import tqdm
import RUL.baseModel.utils.utils as utils
from RUL.otherIdea.adaRNN.model import AdaRNN

import RUL.otherIdea.adaRNN.dataset_vibrate.data_process as data_process
from RUL.otherIdea.adaRNN.test import test
import matplotlib.pyplot as plt
import time
from RUL.baseModel.CommonFunction import IsStopTraining

'''
Hyperparameter settings:
'''
# Data preparation
is_norm = False
is_single = True
tdc_loss_type = 'cos'
num_domain = 2  # number of source/target domains to split the series into

# RNN-related
hidden_num = 10  # number of LSTM cells (also passed to the model as len_seq)
feature = 2  # dimensionality of a single point
predict_num = 200  # number of points to predict
batch_size = 32
model_name = "AdaRNN"
hidden_list = [64, 64]  # hidden dimension of each RNN layer
bottleneck = [(64, False, False, 0), (64, True, True, 0.5)]
# bottleneck = [(128, False, True, 0),
#               (64, True, True, 0.2),
#               (32, True, True, 0.2),
#               (16, False, False, 0)]
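
# Each bottleneck entry reads as (dim, is_BatchNorm, is_ReLU, dropout) -- see the
# note in get_model() below. As an illustration (assuming AdaRNN builds one
# Linear stage per tuple; this is an assumption, not verified against model.py):
# (64, True, True, 0.5) would mean Linear(-> 64) + BatchNorm1d + ReLU + Dropout(p=0.5).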

# Training-related
pre_epoch = 40  # pre-training epochs before the boosting phase starts
epochs = 1000
transfer_loss_type = 'cos'
dw = 0.5  # weight of the transfer loss
lr = 0.01
len_win = 0  # window size; set to 0, purpose unclear for now
seed = 5

# Initialization
out_dir = './outputs'
output_path = out_dir + '/{0}_tdcLoss({1})_transferLoss({2})_domain{3}_dw{4}_lr{5}'.format(
    model_name, tdc_loss_type, transfer_loss_type, num_domain, dw, lr)
save_model_name = 'parameters/{0}_hidden{1}_feature{2}_predict{3}_dimList{4}'.format(
    model_name, hidden_num, feature, predict_num,
    str(hidden_list[0]) + "-" + str(hidden_list[1]))
save_fig_name = 'fig/{0}_hidden{1}_feature{2}_predict{3}_dimList{4}.png'.format(
    model_name, hidden_num, feature, predict_num,
    str(hidden_list[0]) + "-" + str(hidden_list[1]))
utils.dir_exist(output_path)
utils.dir_exist(os.path.join(output_path, 'parameters'))
utils.dir_exist(os.path.join(output_path, 'fig'))
log_file = os.path.join(output_path, 'run.log')


def pprint(*text):
    # print with UTC+8 time; `timestamp` avoids shadowing the `time` module
    timestamp = '[' + str(datetime.datetime.utcnow() +
                          datetime.timedelta(hours=8))[:19] + '] -'
    print(timestamp, *text, flush=True)
    if log_file is None:
        return
    with open(log_file, 'a') as f:
        print(timestamp, *text, flush=True, file=f)


def get_model(name='AdaRNN'):
    # Empirically, with the bottleneck the overall output tends to oscillate
    # more, whereas without it only the trend remains.
    # bottleneck_list entries: (dim, is_BatchNorm, is_ReLU, dropout)
    return AdaRNN(use_bottleneck=True, bottleneck_list=bottleneck, n_input=feature, n_hiddens=hidden_list,
                  n_output=1, dropout=0.0, model_type=name, len_seq=hidden_num,
                  trans_loss=transfer_loss_type)


def train_AdaRNN(model, optimizer, train_loader_list, epoch, dist_old=None, weight_mat=None):
    model.train()
    criterion = nn.MSELoss()
    criterion_1 = nn.L1Loss()
    loss_all = []
    loss_1_all = []
    dist_mat = torch.zeros(len(hidden_list), hidden_num)
    # iterate only as many batches as the shortest domain loader provides
    len_loader = np.inf
    for loader in train_loader_list:
        if len(loader) < len_loader:
            len_loader = len(loader)
    for data_all in tqdm(zip(*train_loader_list), total=len_loader):
        optimizer.zero_grad()

        # If the batch sizes of the domain loaders are misaligned, the pairwise
        # losses cannot be computed. Rather than discarding all the samples,
        # use the smallest domain batch size as this step's batch size.
        min_batch_size = min(data[0].shape[0] for data in data_all)

        list_feat = []
        list_label = []
        for data in data_all:
            # `feat` rather than `feature`, to avoid shadowing the global config
            feat, label, label_reg = data[0].float(), data[1].float(), data[2].float()
            list_feat.append(feat[:min_batch_size])
            list_label.append(label_reg[:min_batch_size])

        index = get_index(len(data_all) - 1)

        loss_mse = torch.zeros(1)
        loss_transfer = torch.zeros(1)
        total_loss_l1 = torch.zeros(1)
        for i in range(len(index)):
            feature_s = list_feat[index[i][0]]
            feature_t = list_feat[index[i][1]]
            label_reg_s = list_label[index[i][0]]
            label_reg_t = list_label[index[i][1]]
            # concatenate along the batch dimension
            feature_all = torch.cat((feature_s, feature_t), 0)

            if epoch < pre_epoch:
                pred_all, each_loss_transfer, out_weight_list = model.forward_pre_train(
                    feature_all, len_win=len_win)
            else:
                pred_all, each_loss_transfer, dist, weight_mat = model.forward_Boosting(
                    feature_all, weight_mat)
                dist_mat = dist_mat + dist
            pred_s = pred_all[0:feature_s.size(0)]
            pred_t = pred_all[feature_s.size(0):]

            loss_s = criterion(pred_s, label_reg_s)
            loss_t = criterion(pred_t, label_reg_t)
            loss_l1 = criterion_1(pred_s, label_reg_s)

            loss_mse += loss_s + loss_t
            loss_transfer += dw * each_loss_transfer
            total_loss_l1 += loss_l1

        total_loss = loss_mse + loss_transfer
        loss_all.append([total_loss.item(), loss_mse.item(), loss_transfer.item()])
        loss_1_all.append(total_loss_l1.item())

        # backpropagation
        total_loss.backward()
        # clip each gradient element to [-3, 3] (clip_grad_value_ clips by
        # value, not by norm)
        torch.nn.utils.clip_grad_value_(model.parameters(), 3.)
        optimizer.step()
    loss = np.array(loss_all).mean(axis=0)
    loss_l1 = np.array(loss_1_all).mean()

    if epoch >= pre_epoch:
        if epoch > pre_epoch:
            weight_mat = model.update_weight_Boosting(
                weight_mat, dist_old, dist_mat)
        return loss, loss_l1, weight_mat, dist_mat
    else:
        weight_mat = transform_type(out_weight_list)
        return loss, loss_l1, weight_mat, None
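
# Two-phase schedule, as the branches in train_AdaRNN() encode it: while
# epoch < pre_epoch the model runs forward_pre_train() and the transfer loss
# aligns hidden states of each (source, target) domain pair; from pre_epoch
# onward forward_Boosting() accumulates per-layer, per-step distribution
# distances into dist_mat, and update_weight_Boosting() raises the weight of
# positions whose distance grew relative to the previous epoch (dist_old).
# This follows the pre-training + boosting-style reweighting of the AdaRNN paper.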


def get_index(num_domain=2):
    index = []
    for i in range(num_domain):
        for j in range(i + 1, num_domain + 1):
            index.append((i, j))
    return index
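
# For example: train_AdaRNN() calls get_index(len(data_all) - 1), so with three
# domain loaders this is get_index(2), which returns [(0, 1), (0, 2), (1, 2)] --
# every earlier domain is paired as source with every later domain as target.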


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def val_epoch(model, val_loader, device, scheduler):
    model.eval()
    val_loss = 0
    val_loss_1 = 0
    val_loss_r = 0

    criterion = nn.MSELoss()
    criterion_1 = nn.L1Loss()

    with torch.no_grad():
        for val_batch_idx, (val_data, val_continue, val_label) in enumerate(val_loader):
            # note: the model itself is never moved to `device`, so on a CUDA
            # machine this .to(device) would mismatch the CPU-resident model
            val_data, val_label = val_data.to(device), val_label.to(device)
            val_predict_data = model.predict(val_data)

            loss = criterion(val_predict_data, val_label)
            loss_r = torch.sqrt(loss)
            loss_1 = criterion_1(val_predict_data, val_label)
            val_loss += loss.item()
            val_loss_1 += loss_1.item()
            val_loss_r += loss_r.item()

    # scheduler.step(val_loss)

    # note: loss_r averages per-batch RMSE, which differs from the RMSE over
    # the whole validation set
    loss = val_loss / len(val_loader)
    loss_1 = val_loss_1 / len(val_loader)
    loss_r = val_loss_r / len(val_loader)

    return loss, loss_1, loss_r


def test_epoch_inference(model, test_loader, prefix='Test'):
    model.eval()
    total_loss = 0
    total_loss_1 = 0
    total_loss_r = 0
    criterion = nn.MSELoss()
    criterion_1 = nn.L1Loss()
    label_list, predict_list = None, None
    i = 0
    for feat, label, label_reg in tqdm(test_loader, desc=prefix, total=len(test_loader)):
        feat, label_reg = feat.float(), label_reg.float()
        with torch.no_grad():
            pred = model.predict(feat)
            loss = criterion(pred, label_reg)
            loss_r = torch.sqrt(loss)
            loss_1 = criterion_1(pred, label_reg)
        total_loss += loss.item()
        total_loss_1 += loss_1.item()
        total_loss_r += loss_r.item()
        # stack labels and predictions across batches for later plotting
        if i == 0:
            label_list = label_reg.cpu().numpy()
            predict_list = pred.cpu().numpy()
        else:
            label_list = np.hstack((label_list, label_reg.cpu().numpy()))
            predict_list = np.hstack((predict_list, pred.cpu().numpy()))

        i = i + 1
    loss = total_loss / len(test_loader)
    loss_1 = total_loss_1 / len(test_loader)
    loss_r = total_loss_r / len(test_loader)
    return loss, loss_1, loss_r, label_list, predict_list


def inference(model, data_loader):
    loss, loss_1, loss_r, label_list, predict_list = test_epoch_inference(
        model, data_loader, prefix='Inference')
    return loss, loss_1, loss_r, label_list, predict_list


def inference_all(output_path, model, model_path, loaders):
    pprint('inference...')
    loss_list = []
    loss_l1_list = []
    loss_r_list = []
    model.load_state_dict(torch.load(model_path))

    for loader in loaders:
        loss, loss_1, loss_r, label_list, predict_list = inference(
            model, loader)
        loss_list.append(loss)
        loss_l1_list.append(loss_1)
        loss_r_list.append(loss_r)
    return loss_list, loss_l1_list, loss_r_list


def transform_type(init_weight):
    # copy the per-layer weight list returned by forward_pre_train into a
    # (num_layers, len_seq) tensor
    weight = torch.ones(len(hidden_list), hidden_num)
    for i in range(weight.shape[0]):
        for j in range(weight.shape[1]):
            weight[i, j] = init_weight[i][j].item()
    return weight
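
# With the settings above (hidden_list = [64, 64], hidden_num = 10) this yields
# a 2 x 10 tensor -- one weight per RNN layer and sequence position -- matching
# the shape of dist_mat in train_AdaRNN() and the weight_mat consumed by
# forward_Boosting().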


def loadData():
    train_loader_list, valid_loader, test_loader = data_process.load_weather_data_multi_domain(
        hidden_num=hidden_num, feature=feature, predict_num=predict_num, is_norm=is_norm, batch_size=batch_size,
        number_domain=num_domain, mode='tdc', dis_type=tdc_loss_type
    )
    return train_loader_list, valid_loader, test_loader
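
# mode='tdc' requests AdaRNN's temporal distribution characterization from the
# data pipeline: the training series is split into num_domain segments whose
# distributions differ most under dis_type ('cos' here), and each segment
# becomes one "domain" loader in train_loader_list. (Reading based on the
# AdaRNN paper; the split itself lives in data_process.load_weather_data_multi_domain.)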


def train(model, train_loader_list, valid_loader, lr_patience, early_stop_patience, device):
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # note: this scheduler is created but never stepped (the call is commented
    # out in val_epoch), so the learning rate stays fixed
    scheduler_model = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.2,
                                                                 patience=lr_patience)

    best_score = np.inf
    best_epoch = 0
    weight_mat, dist_mat = None, None
    train_loss_list = []
    val_loss_list = []
    best_save_path = None
    for epoch in range(epochs):
        epoch_start_time = time.time()
        train_loss, loss1, weight_mat, dist_mat = train_AdaRNN(
            model, optimizer, train_loader_list, epoch, dist_mat, weight_mat)

        val_loss, val_loss_l1, val_loss_r = val_epoch(
            model, valid_loader, device=device, scheduler=scheduler_model)

        pprint(
            "[{:03d}/{:03d}] {:2.2f} sec(s) train_total_loss: {:3.9f} | train_mse_loss: {:3.9f} | train_transfer_loss: {:3.9f} "
            " | val_loss: {:3.9f} | Learning rate : {:3.6f}".format(
                epoch + 1, epochs, time.time() - epoch_start_time,
                train_loss[0], train_loss[1], train_loss[2],
                val_loss,
                optimizer.state_dict()['param_groups'][0]['lr']))

        if len(val_loss_list) == 0 or val_loss < min(val_loss_list):
            pprint("Best model saved successfully")
            best_epoch = epoch
            best_score = val_loss
            # save model parameters, keeping only the latest best checkpoint
            # (join with output_path, since that is where torch.save writes)
            if best_save_path is not None:
                utils.delete_file(os.path.join(output_path, best_save_path))
            best_save_path = save_model_name + "_epoch" + str(epoch) + \
                "_trainLoss" + str(train_loss[1]) + \
                "_valLoss" + str(val_loss) + ".pkl"
            print(os.path.join(output_path, best_save_path))
            torch.save(model.state_dict(),
                       os.path.join(output_path, best_save_path))

        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)

        if IsStopTraining(history_loss=val_loss_list, patience=early_stop_patience):
            pprint("Loss has not improved for {0} epochs; stopping training".format(early_stop_patience))
            break

    pprint('best val score:', best_score, '@', best_epoch)
    return best_save_path
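
# With the current hyperparameters the saved checkpoint name resolves to
# something like (epoch and loss values here are illustrative only):
# parameters/AdaRNN_hidden10_feature2_predict200_dimList64-64_epoch62_trainLoss0.51_valLoss0.13.pkl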


def main_transfer():
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    pprint('create DataLoaders...')
    train_loader_list, valid_loader, test_loader = loadData()

    pprint('create AdaRNN model...')
    model = get_model(model_name)

    num_model = count_parameters(model)

    print(model)
    print('#model params:', num_model)

    pprint('train model...')
    best_save_path = train(model=model, train_loader_list=train_loader_list, valid_loader=valid_loader, lr_patience=20,
                           early_stop_patience=50, device=device)

    end = time.time()

    # `begin` is set in the __main__ block below
    print("Training took {:3.2f}s".format(end - begin))

    pprint('Evaluating model...')
    loaders = train_loader_list[0], valid_loader, test_loader
    loss_list, loss_l1_list, loss_r_list = inference_all(output_path, model, os.path.join(
        output_path, best_save_path), loaders)
    pprint('MSE: train %.6f, valid %.6f, test %.6f' %
           (loss_list[0], loss_list[1], loss_list[2]))
    pprint('L1: train %.6f, valid %.6f, test %.6f' %
           (loss_l1_list[0], loss_l1_list[1], loss_l1_list[2]))
    pprint('RMSE: train %.6f, valid %.6f, test %.6f' %
           (loss_r_list[0], loss_r_list[1], loss_r_list[2]))
    pprint('Finished.')

    # load the best checkpoint
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model.load_state_dict(torch.load(os.path.join(output_path, best_save_path), map_location=device))

    test(hidden_num=hidden_num, feature=feature, predict_num=predict_num, batch_size=batch_size, model=model,
         is_single=is_single, is_norm=is_norm, save_fig_name=os.path.join(output_path, save_fig_name))


def after_test(save_name):
    model = get_model(model_name)
    # load the saved network
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model.load_state_dict(torch.load(save_name, map_location=device))

    test(hidden_num=hidden_num, feature=feature, predict_num=predict_num, batch_size=batch_size, model=model,
         is_single=is_single, is_norm=is_norm)


def get_args():
    # CLI mirror of the module-level hyperparameters; defined but never called
    # by the __main__ block below
    parser = argparse.ArgumentParser()

    # model
    parser.add_argument('--model_name', default='AdaRNN')
    parser.add_argument('--d_feat', type=int, default=feature)

    parser.add_argument('--hidden_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--class_num', type=int, default=1)
    parser.add_argument('--pre_epoch', type=int, default=40)  # 20, 30, 50

    # training
    parser.add_argument('--n_epochs', type=int, default=200)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--early_stop', type=int, default=40)
    parser.add_argument('--smooth_steps', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=36)
    parser.add_argument('--dw', type=float, default=0.5)  # 0.01, 0.05, 5.0
    parser.add_argument('--loss_type', type=str, default='cos')

    parser.add_argument('--data_mode', type=str, default='tdc')

    parser.add_argument('--num_domain', type=int, default=2)
    parser.add_argument('--len_seq', type=int, default=hidden_num)

    # other
    parser.add_argument('--seed', type=int, default=10)
    # raw string: the path mixes backslashes and forward slashes
    parser.add_argument('--data_path', default=r"E:\self_example\pytorch_example\RUL\otherIdea/adaRNN\dataset/")
    parser.add_argument('--outdir', default='./outputs')
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--log_file', type=str, default='run.log')
    parser.add_argument('--gpu_id', type=int, default=0)
    parser.add_argument('--len_win', type=int, default=0)
    args = parser.parse_args()

    return args
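
# get_args() is currently dead code; wiring it in would look roughly like this
# (a sketch, not what the script does today):
#     args = get_args()
#     lr, epochs, seed = args.lr, args.n_epochs, args.seed
# after which the module-level constants above would come from the CLI instead.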


if __name__ == '__main__':
    begin = time.time()

    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    # fix the random seeds for reproducibility
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # training and testing
    main_transfer()

    '''Post-hoc testing'''
    # after_test(save_name=r"E:\self_example\pytorch_example\RUL\otherIdea/adaRNN\outputs\AdaRNN_tdcLoss(cos)_transferLoss(cos)_domain2_dw0.5_lr0.0005\parameters\AdaRNN_hidden24_feature10_predict50_dimList64-64_epoch62_trainLoss0.5115623474121094_valLoss0.12946119904518127.pkl")