# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/10 15:21
@Usage  :
@Desc   : Load the HI_merge_data series and build sliding-window datasets
'''
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

# Default location of the source array; resolved relative to the current
# working directory, exactly as the original hard-coded path was.
DEFAULT_DATA_PATH = "../../dataset/HI_merge_data.npy"


class Nor_Dataset(Dataset):
    """Plain in-memory dataset over samples and optional labels.

    Args:
        datas: array-like of samples; converted (copied) with ``torch.tensor``.
        labels: optional array-like of labels indexed the same way as
            ``datas``. When omitted, items are returned without a label.
    """

    def __init__(self, datas, labels=None):
        self.datas = torch.tensor(datas)
        self.labels = None if labels is None else torch.tensor(labels)

    def __getitem__(self, index):
        """Return ``(data, label)`` when labels exist, otherwise ``data``."""
        data = self.datas[index]
        if self.labels is None:
            return data
        return data, self.labels[index]

    def __len__(self):
        """Return the number of samples (length of ``datas``)."""
        return len(self.datas)


def standardization(data):
    """Standardize ``data`` along axis 0 to zero mean and unit std.

    Columns whose standard deviation is exactly 0 (constant columns) are
    divided by 1 instead, so they become all zeros rather than NaN/inf.
    """
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    # Guard against division by zero for constant columns.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    return (data - mu) / safe_sigma


def normalization(data):
    """Min-max scale ``data`` into [0, 1] using its global min and max.

    If every value is identical (range 0) the result is all zeros instead
    of NaN.
    """
    low = np.min(data)
    _range = np.max(data) - low
    # Guard against division by zero for constant input.
    divisor = _range if _range != 0 else 1
    return (data - low) / divisor


# number of LSTM cells (window length), dims, whether to normalize
def getData(filter_num, dims, if_norm: bool = False,
            data_path: str = DEFAULT_DATA_PATH):
    """Load the series and build overlapping sliding-window samples.

    The array at ``data_path`` is loaded, and column 1 of its first 1250
    rows is used as the 1-D series ``x``. With
    ``n = len(x) - filter_num - dims`` the outputs are:

    * ``train_data[s, f, d] = x[s + d + f]``, shape ``(n, filter_num, dims)``
    * ``train_label[s, f] = x[s + dims + f]``, shape ``(n, filter_num)``
    * ``train_label_single[s] = x[s + dims + filter_num - 1]`` (the last
      point of ``train_label[s]``), shape ``(n,)``

    Args:
        filter_num: length of each sliding window.
        dims: number of consecutive, one-step-shifted windows stacked on the
            last axis of every sample.
        if_norm: when True, min-max normalize the series first.
        data_path: path of the ``.npy`` file to load.

    Returns:
        ``(total_data, train_data, train_label, train_label_single)`` where
        ``total_data`` is the (optionally normalized) 1-D series.

    Raises:
        ValueError: if ``filter_num`` or ``dims`` is negative, or
            ``filter_num + dims`` exceeds the series length.
    """
    # Load the data.
    HI_merge_data_origin = np.load(data_path)

    # Keep column 1 of the first 1250 rows as the working series.
    HI_merge_data = HI_merge_data_origin[0:1250, 1]

    # Optionally normalize.
    if if_norm:
        HI_merge_data = normalization(HI_merge_data)

    (total_dims,) = HI_merge_data.shape

    window_count = total_dims - filter_num
    sample_count = window_count - dims
    if filter_num < 0 or dims < 0 or sample_count < 0:
        raise ValueError(
            "filter_num and dims must be non-negative and satisfy "
            "filter_num + dims <= series length "
            "(got filter_num=%r, dims=%r, series length=%r)"
            % (filter_num, dims, total_dims)
        )

    # Overlapping sliding windows: predict_data[i] = x[i:i + filter_num].
    window_idx = np.arange(window_count)[:, None] + np.arange(filter_num)[None, :]
    predict_data = HI_merge_data[window_idx].astype(np.float64, copy=False)

    train_label = predict_data[dims:, :]
    train_label_single = HI_merge_data[dims + filter_num - 1:-1]

    # Overlap-sample the windows again so every sample carries `dims`
    # consecutive windows:
    # (sample, filter_num) -> (sample, dims, filter_num).
    sample_idx = np.arange(sample_count)[:, None] + np.arange(dims)[None, :]
    # Move the stacked-window axis last:
    # (sample, dims, filter_num) -> (sample, filter_num, dims).
    train_data = np.transpose(predict_data[sample_idx], [0, 2, 1])

    # TODO: solve the problem that `query` cannot be serialized when the
    # model is saved.

    total_data = HI_merge_data

    print("total_data.shape:", total_data.shape)
    print("train_data.shape:", train_data.shape)  # (sample, filter_num, dims)
    print("train_label.shape:", train_label.shape)  # (sample, filter_num)
    print("train_label_single.shape:", train_label_single.shape)  # (sample,)

    # All raw data; all training data; all training labels (predict a
    # sequence); all training labels (predict a single point).
    return total_data, train_data, train_label, train_label_single


def splitValData(data, label, label_single, predict_num=50):
    """Split off the last ``predict_num`` samples as the validation set.

    Args:
        data: 3-D array ``(sample, hidden, feature)``.
        label: 2-D array of sequence labels, one row per sample.
        label_single: array of single-point labels, one entry per sample.
        predict_num: number of trailing samples used for validation.

    Returns:
        ``(train_data, val_data, train_label, val_label,
        train_label_single, val_label_single)``.

    Raises:
        ValueError: if ``predict_num`` is negative or larger than the number
            of samples (a negative split index would otherwise silently
            produce a wrong split).
    """
    sample, _, _ = data.shape
    if not 0 <= predict_num <= sample:
        raise ValueError(
            "predict_num must be in [0, %d], got %r" % (sample, predict_num)
        )

    split = sample - predict_num
    return (
        data[:split, :, :],
        data[split:, :, :],
        label[:split, :],
        label[split:, :],
        label_single[:split],
        label_single[split:],
    )


def getTotalData(hidden_num, feature, is_single=True, is_norm=False,
                 data_path: str = DEFAULT_DATA_PATH):
    """Return the raw series and one dataset over all samples (no split).

    Args:
        hidden_num: window length, passed to ``getData`` as ``filter_num``.
        feature: passed to ``getData`` as ``dims``.
        is_single: use single-point labels when True, sequence labels
            otherwise.
        is_norm: whether to min-max normalize the series.
        data_path: path of the ``.npy`` file to load.

    Returns:
        ``(total_data, total_dataset)``.
    """
    total_data, train_data, train_label, train_label_single = getData(
        hidden_num, feature, is_norm, data_path
    )
    labels = train_label_single if is_single else train_label
    return total_data, Nor_Dataset(train_data, labels)


# number of LSTM cells, number of channels, number of points to predict,
# whether to normalize
def getDataset(hidden_num, feature, predict_num, is_single=True, is_norm=False,
               data_path: str = DEFAULT_DATA_PATH):
    """Build the training and validation datasets.

    Args:
        hidden_num: window length, passed to ``getData`` as ``filter_num``.
        feature: passed to ``getData`` as ``dims``.
        predict_num: number of trailing samples held out for validation.
        is_single: use single-point labels when True, sequence labels
            otherwise.
        is_norm: whether to min-max normalize the series.
        data_path: path of the ``.npy`` file to load.

    Returns:
        ``(train_dataset, val_dataset)``.
    """
    total_data, train_data, train_label, train_label_single = getData(
        hidden_num, feature, is_norm, data_path
    )

    # Split into training and validation sets by the number of points to
    # predict.
    (train_data, val_data,
     train_label, val_label,
     train_label_single, val_label_single) = splitValData(
        train_data, train_label, train_label_single, predict_num=predict_num
    )

    if is_single:
        train_dataset = Nor_Dataset(train_data, train_label_single)
        val_dataset = Nor_Dataset(val_data, val_label_single)
    else:
        train_dataset = Nor_Dataset(train_data, train_label)
        val_dataset = Nor_Dataset(val_data, val_label)

    return train_dataset, val_dataset