20231108爬虫学习更新

This commit is contained in:
kevinding1125 2023-11-08 19:14:57 +08:00
parent df0f410f8f
commit d8746d192f
20 changed files with 3133 additions and 0 deletions

View File

@ -0,0 +1,244 @@
# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/8 16:08
@Usage :
@Desc :参考 https://github.com/Python3WebSpider/BeautifulSoupTest
'''
# Sample document used by baseUse() and parent(), which do not define a local
# ``html`` of their own. The markup ends without closing </body></html> tags.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
def baseUse():
    """Show basic tag, name and attribute access on the module-level ``html``.

    Everything is printed; the trailing comments give the expected output.
    """
    soup = BeautifulSoup(html, 'lxml')

    title = soup.title
    print(title)  # <title>The Dormouse's story</title>
    print(type(title))  # <class 'bs4.element.Tag'>
    print(title.string)  # The Dormouse's story
    print(soup.head)  # <head><title>The Dormouse's story</title></head>

    first_p = soup.p
    print(first_p)  # <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    print(first_p.name)  # node name: p
    print(first_p.attrs)  # all attributes: {'class': ['title'], 'name': 'dromouse'}
    print(first_p.attrs['name'])  # attribute value: dromouse
    print(first_p['name'])  # same value via item access: dromouse
    print(soup.body.p['name'])  # nested selection: dromouse
    print("==========================")
def child():
    """Print the children and then the descendants of the first <p> node."""
    html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
    paragraph = BeautifulSoup(html, 'lxml').p

    # Child nodes.
    for position, node in enumerate(paragraph.children):
        print(position, node)
    print("===============================")

    # Descendant nodes.
    for position, node in enumerate(paragraph.descendants):
        print(position, node)
    print("===============================")
def parent():
    """Print the parent and the enumerated ancestors of the first <a> node.

    Parses the module-level ``html`` sample.
    """
    first_link = BeautifulSoup(html, 'lxml').a

    # Direct parent.
    print(first_link.parent)
    print("===============================")

    # All ancestors.
    ancestors = first_link.parents
    print(type(ancestors))
    print(list(enumerate(ancestors)))
    print("=============================")
def brother():
    """Print the next/previous sibling(s) of the first <a> node."""
    html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
Hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
"""
    # Sibling nodes of the first link.
    first_link = BeautifulSoup(html, 'lxml').a
    print('Next Sibling', first_link.next_sibling)
    print('Prev Sibling', first_link.previous_sibling)
    following = list(enumerate(first_link.next_siblings))
    print('Next Siblings', following)
    preceding = list(enumerate(first_link.previous_siblings))
    print('Prev Siblings', preceding)
# 找到所有满足条件的
def findAll():
    """Demonstrate ``find_all`` by tag name, including a nested search.

    Prints every <ul>, the type of the first match, the <li> list of each
    <ul> (twice, as in the original walkthrough) and finally each <li> text.
    """
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    soup = BeautifulSoup(html, 'lxml')
    lists = soup.find_all(name='ul')
    print(lists)
    print(type(lists[0]))

    for ul in lists:
        print(ul.find_all(name='li'))

    # Same listing again, this time followed by the text of every item.
    for ul in lists:
        entries = ul.find_all(name='li')
        print(entries)
        for li in entries:
            print(li.string)
# 找属性满足匹配得到
def attrs():
    """Demonstrate ``find_all`` filtered by attributes and by text."""
    import re

    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    soup = BeautifulSoup(html, 'lxml')
    for criteria in ({'id': 'list-1'}, {'name': 'elements'}):
        print(soup.find_all(attrs=criteria))

    # Common attributes can be passed as keywords instead of via ``attrs``.
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(id='list-1'))
    print(soup.find_all(class_='element'))

    # ``string`` matches against the text content of the nodes.
    foo_pattern = re.compile('Foo')
    print(soup.find_all(string=foo_pattern))
# 返回匹配到的第一个元素
def find():
    """Demonstrate ``find``, which returns only the first matching element."""
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    soup = BeautifulSoup(html, 'lxml')
    first_ul = soup.find(name='ul')
    print(first_ul)
    print(type(first_ul))
    first_list = soup.find(class_='list')
    print(first_list)
# css选择器
def cssSelect():
    """Demonstrate CSS selectors with ``select``.

    Covers plain selection, nested selection, attribute access and text
    extraction; the document is re-parsed before each section.
    """
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    soup = BeautifulSoup(html, 'lxml')
    for selector in ('.panel .panel-heading', 'ul li', '#list-2 .element'):
        print(soup.select(selector))
    print(type(soup.select('ul')[0]))

    # Nested selection.
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul.select('li'))

    # Attribute access, via item lookup and via ``attrs``.
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul['id'])
        print(ul.attrs['id'])

    # Text extraction.
    soup = BeautifulSoup(html, 'lxml')
    for li in soup.select('li'):
        print('Get Text:', li.get_text())
        print('String:', li.string)
# Script entry point: only the CSS-selector demo is run.
if __name__ == '__main__':
    cssSelect()

View File

@ -0,0 +1,8 @@
#-*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/8 16:07
@Usage :
@Desc :
'''

View File

@ -0,0 +1,8 @@
#-*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/8 16:54
@Usage :
@Desc :
'''

View File

@ -0,0 +1,329 @@
# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/8 16:54
@Usage :
@Desc :Pyquery学习 参考: https://github.com/Python3WebSpider/PyQueryTest
'''
from pyquery import PyQuery as pq
# 字符串初始化
def stringBase():
    """Build a PyQuery document from an HTML string and print its <li> nodes."""
    html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
    document = pq(html)
    list_items = document('li')
    print(list_items)
# URL初始化
def URLBase():
    """Build a PyQuery document from a URL and print its <title> node."""
    document = pq(url='https://cuiqingcai.com')
    title = document('title')
    print(title)
    # The original notes an equivalent form: fetch the page with
    # requests.get('https://cuiqingcai.com').text and pass that text to pq().
# 文件初始化
def fileBase():
    """Build a PyQuery document from the file ``demo.html`` and print its <li> nodes."""
    document = pq(filename='demo.html')
    list_items = document('li')
    print(list_items)
# 基本的css选择器
def cssSelect():
    """Select nodes with a CSS selector and print the text of each match."""
    html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
    selector = '#container .list li'
    doc = pq(html)
    matches = doc(selector)
    print(matches)
    print(type(matches))

    # items() yields each match wrapped as its own PyQuery object.
    for entry in doc(selector).items():
        print(entry.text())
# 寻找子节点
def child():
    """Demonstrate ``find`` and ``children`` on the ``.list`` node."""
    html = '''
<div>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
    items = pq(html)('.list')

    # The list itself, its <li> matches from find(), then its children().
    for selection in (items, items.find('li'), items.children()):
        print(type(selection))
        print(selection)

    # children() narrowed by a selector.
    print(items.children('.active'))
def parent():
    """Demonstrate ``parent``, ``parents`` and ``siblings``."""
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    doc = pq(html)
    items = doc('.list')

    # Result of parent(), then of parents().
    for selection in (items.parent(), items.parents()):
        print(type(selection))
        print(selection)

    # parents() narrowed by a selector.
    print(items.parents('.wrap'))

    # Siblings of the active item-0 entry.
    print(doc('.list .item-0.active').siblings())
def brother():
    """Demonstrate filtered siblings and iteration over multiple matches."""
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    doc = pq(html)

    # Siblings narrowed by a selector.
    print(doc('.list .item-0.active').siblings('.active'))

    # Print the selection directly and via str().
    active = doc('.item-0.active')
    print(active)
    print(str(active))

    # A selector may match several nodes; items() iterates over them.
    entries = doc('li').items()
    print(type(entries))
    for entry in entries:
        print(entry, type(entry))
def attrs():
    """Demonstrate reading attributes with ``attr`` and text with ``text``."""
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    doc = pq(html)

    # First a selector for the active item-0 link, then one for every link.
    for selector in ('.item-0.active a', 'a'):
        a = doc(selector)
        print(a, type(a))
        print(a.attr('href'))
    # ``a`` is doc('a') here; attribute-style access to the same value.
    print(a.attr.href)

    for item in doc('a').items():
        # Attribute and text of each link.
        print(item.attr('href'), item.text())
def getHTML():
    """Compare ``html()`` and ``text()`` on a multi-node selection."""
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    li = pq(html)('li')
    # HTML of the first match: <a href="link2.html">second item</a>
    print(li.html())
    # Text of all matches: second item third item fourth item fifth item
    joined_text = li.text()
    print(joined_text)
    print(type(li.text()))
# 增加或者删除节点的class
def operateNode():
    """Remove and re-add a class on a node, printing the node at each step."""
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    li = pq(html)('.item-0.active')
    print(li)
    for change_class in (li.removeClass, li.addClass):
        change_class('active')
        print(li)
    # Expected output noted in the original:
    # <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    # <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
    # <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
def operateNodeInformation():
    """Set an attribute, the text and the inner HTML of a node in turn."""
    html = '''
<ul class="list">
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
</ul>
'''
    li = pq(html)('.item-0.active')
    print(li)

    li.attr('name', 'link')
    print(li)

    li.text('changed item')
    print(li)

    li.html('<span>changed item</span>')
    print(li)
    # Expected output after each of the three modifications, as noted in the
    # original:
    # <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
    # <li class="item-0 active" name="link">changed item</li>
    # <li class="item-0 active" name="link"><span>changed item</span></li>
def removeInformation():
    """Print a node's text before and after removing its <p> child."""
    html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
    wrap = pq(html)('.wrap')

    # Expected per the original notes: "Hello, World" and
    # "This is a paragraph." on separate lines.
    print(wrap.text())

    paragraph = wrap.find('p')
    paragraph.remove()

    # Expected per the original notes: "Hello, World".
    print(wrap.text())
# 伪类选择器
def fakeCSSSelect():
    """Print the result of several pseudo-class selectors, one after another."""
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    doc = pq(html)
    selectors = (
        'li:first-child',
        'li:last-child',
        'li:nth-child(2)',
        'li:gt(2)',
        'li:nth-child(2n)',
        'li:contains(second)',
    )
    for selector in selectors:
        print(doc(selector))
# Script entry point: only the pseudo-class selector demo is run.
if __name__ == '__main__':
    fakeCSSSelect()

View File

@ -0,0 +1,195 @@
# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/8 15:15
@Usage :
@Desc :
'''
from lxml import etree
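'''
Basic XPath rules:
1) nodename: selects all child nodes of the named node
2) /: selects direct children of the current node
3) //: selects descendants of the current node
4) .: selects the current node
5) ..: selects the parent of the current node
6) @: selects attributes
Example:
//title[@lang='eng'] selects every node named title whose lang attribute equals eng
'''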
def htmlByString():
    """Parse an HTML string with ``etree.HTML`` and print the serialised tree.

    The last <li> in the sample is left unclosed.
    """
    text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    tree = etree.HTML(text)
    serialised = etree.tostring(tree)
    print(serialised.decode('utf-8'))
def htmlByFile():
    """Parse ``./test.html`` with an HTML parser and print the serialised tree."""
    tree = etree.parse('./test.html', etree.HTMLParser())
    print(etree.tostring(tree).decode('utf-8'))
def allNode():
    """Print all nodes, then all <li> nodes, of ``./test.html``.

    For each query the full result list and its first element are printed.
    """
    html = etree.parse('./test.html', etree.HTMLParser())
    # '//*' matches every node; '//li' matches every <li>.
    for expression in ('//*', '//li'):
        result = html.xpath(expression)
        print(result)
        print(result[0])
# 子节点匹配
def childNode():
    """Print <a> nodes selected as children of <li> and as descendants of <ul>."""
    html = etree.parse('./test.html', etree.HTMLParser())
    # '//li/a'  : <a> nodes that are direct children of any <li>.
    # '//ul//a' : <a> nodes anywhere below any <ul>.
    for expression in ('//li/a', '//ul//a'):
        matches = html.xpath(expression)
        print(matches)
        print(matches[0])
# 父节点匹配
def fatherNode():
    """Print the class of the parent of the link whose href is link4.html.

    The same lookup is done twice: with ``..`` and with the ``parent::`` axis.
    """
    html = etree.parse('./test.html', etree.HTMLParser())
    link = '//a[@href="link4.html"]'
    for parent_step in ('/../@class', '/parent::*/@class'):
        print(html.xpath(link + parent_step))
# 文本获取
def textGet():
    """Print text nodes below the <li class="item-0"> nodes of ``./test.html``."""
    html = etree.parse('./test.html', etree.HTMLParser())
    item = '//li[@class="item-0"]'

    # Text of the direct <a> child.
    direct_text = html.xpath(item + '/a/text()')
    print(direct_text)  # ['first item', 'fifth item']

    # Text of every descendant.
    all_text = html.xpath(item + '//text()')
    print(all_text)  # ['first item', 'fifth item', '\r\n ']
# 属性获取
def fieldGet():
    """Print the href attribute of every <a> that is a child of an <li>."""
    html = etree.parse('./test.html', etree.HTMLParser())
    hrefs = html.xpath('//li/a/@href')
    print(hrefs)  # ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
# 属性多值匹配
def fieldsGet():
    """Match an attribute that holds several values.

    An exact comparison on ``@class`` finds nothing, ``contains`` does.
    """
    text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
    html = etree.HTML(text)
    exact = html.xpath('//li[@class="li"]/a/text()')
    print(exact)  # [] - no match
    partial = html.xpath('//li[contains(@class, "li")]/a/text()')
    print(partial)  # ['first item'] - matched through contains
# 多属性匹配
def fieldssGet():
    """Match on two attributes at once by joining the predicates with ``and``."""
    text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
    tree = etree.HTML(text)
    query = '//li[contains(@class, "li") and @name="item"]/a/text()'
    print(tree.xpath(query))
# 按序选择
def orderGet():
    """Select <li> nodes by position and print the text of their links."""
    text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    html = etree.HTML(text)
    # Expected results per the original notes, in order:
    # ['first item'], ['fifth item'], ['first item', 'second item'], ['third item']
    for predicate in ('1', 'last()', 'position()<3', 'last()-2'):
        print(html.xpath('//li[' + predicate + ']/a/text()'))
def nodeSelect():
    """Print the result of several XPath axes applied to ``//li[1]``."""
    text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    html = etree.HTML(text)
    axis_steps = (
        'ancestor::*',  # all ancestors
        'ancestor::div',  # only <div> ancestors
        'attribute::*',  # all attributes
        'child::a[@href="link1.html"]',  # matching child nodes
        'descendant::span',  # matching descendants
        'following::*[2]',  # second node of the following axis
        'following-sibling::*',  # following nodes on the same level
    )
    for step in axis_steps:
        print(html.xpath('//li[1]/' + step))
# Script entry point: only the XPath axis demo is run.
if __name__ == '__main__':
    nodeSelect()

View File

@ -0,0 +1,8 @@
#-*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/8 15:15
@Usage :
@Desc :
'''

View File

@ -0,0 +1,9 @@
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>

View File

@ -0,0 +1,8 @@
#-*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/8 15:12
@Usage :
@Desc :
'''

View File

@ -0,0 +1,153 @@
# _*_ coding: UTF-8 _*_
'''
@Author : dingjiawen
@Date : 2022/7/11 12:55
@Usage :
@Desc :
'''
import numpy as np
import pandas as pd
import time
# 只计算了该程序运行CPU的时间
import timeit
# cat_sale = pd.read_excel('data/catering_sale.xls')
# Input CSV, loaded at import time; execute() and test() work on the resulting
# module-level frame. A raw string keeps the Windows backslashes literal
# without relying on unrecognised escape sequences.
path = r"G:\data\SCADA数据\jb4q_8.csv"
cat_sale = pd.read_csv(path)
# Treat 0 readings as missing values. The mask is assigned on the frame itself:
# assigning through an intermediate ``cat_sale[:]`` slice is chained assignment
# and may only modify a temporary object.
cat_sale[cat_sale == 0] = np.nan
# 分别定义求插商与求w的函数
'''
:param x:差值前后的索引值
:param y:差值前后的数值
'''
def cal_f(x, y):
    """Build the Newton divided-difference table for the nodes ``(x[i], y[i])``.

    :param x: node positions (the index values around the gap); they must be
        pairwise distinct, otherwise a division by zero occurs
    :param y: node values, expected to have the same length as ``x``
    :return: array of shape ``(len(x), len(y))`` whose entry ``[i, k]`` is the
        k-th order divided difference f[x[i-k], ..., x[i]]; entries with
        ``k > i`` stay 0. The diagonal holds the Newton coefficients.
    """
    f0 = np.zeros((len(x), len(y)))  # table of divided differences
    for k in range(len(y)):  # column = order of the difference
        for i in range(k, len(x)):  # row = last node taking part
            if k == 0:
                f0[i, k] = y[i]
            else:
                # A k-th order difference spans the nodes x[i-k] .. x[i], so
                # the denominator is the width of that whole span, not the
                # distance between neighbouring nodes.
                f0[i, k] = (f0[i, k - 1] - f0[i - 1, k - 1]) / (x[i] - x[i - k])
    return f0
'''
:param x:差值前后的索引值
:param y:差值前后的数值
:param x_j:需要差值的索引
'''
def newton(x, y, x_j):
    """Evaluate the Newton interpolation polynomial at ``x_j``.

    :param x: node positions
    :param y: node values
    :param x_j: position at which the polynomial is evaluated
    :return: sum over i of c[i] * (x_j - x[0]) * ... * (x_j - x[i-1]), where
        c is the diagonal of the table returned by ``cal_f``
    """
    coefficients = cal_f(x, y).diagonal()
    total = 0
    basis = 1  # running product (x_j - x[0]) * ... * (x_j - x[order - 1])
    for order, coefficient in enumerate(coefficients):
        if order:
            basis = basis * (x_j - x[order - 1])
        total = total + coefficient * basis
    return total
# 自定义列向量插值函数,获取需差值的前后几个数
'''
:param s:整个差值的序列
:param n:需要差值的索引
:param x_j:需要差值的索引
:param is_fast:是否需要快速差值(无论前后是否是零值均采用);反之则一直找到不为0值的进行计算
:param k:取前后多少个数
'''
def ployinterp_columns(s, n, x_j, is_fast: bool = False, k=3):
    """Interpolate one missing value of a column with Newton's polynomial.

    :param s: the whole column (a pandas Series); it is indexed with integer
        labels, so a default 0..len-1 index is presumably expected
    :param n: position of the value to interpolate
    :param x_j: x-coordinate at which the polynomial is evaluated
    :param is_fast: if True, take the ``k`` entries directly before and after
        ``n`` and drop the missing ones; if False, walk outwards from ``n``
        until enough non-missing values have been collected
    :param k: number of neighbours wanted on each side
    :return: the value returned by ``newton`` for the collected nodes
    """
    X = []
    Y = []
    if is_fast:
        # Window of up to k labels on each side, clamped to the column bounds
        # so a gap near either end does not request labels that do not exist.
        lower = max(0, n - k)
        upper = min(len(s), n + k + 1)
        y = s[list(range(lower, n)) + list(range(n + 1, upper))]
        y = y[y.notnull()]  # drop missing neighbours
        X = y.index
        Y = list(y)
    else:
        # Walk backwards for up to k non-missing values.
        index = n - 1
        while len(X) < k and index >= 0:
            if not np.isnan(s[index]):
                Y.append(s[index])
                X.append(index)
            index -= 1
        # Restore ascending order before appending the values after the gap.
        X.reverse()
        Y.reverse()
        # Walk forwards until 2*k nodes are collected or the column ends;
        # ``index`` must stay strictly below len(s) to remain a valid position.
        index = n + 1
        while len(X) < 2 * k and index < len(s):
            if not np.isnan(s[index]):
                Y.append(s[index])
                X.append(index)
            index += 1
    return newton(X, Y, x_j)  # interpolate and return the result
def execute():
    """Fill every missing value of the module-level ``cat_sale`` and save it.

    Zeros are first turned into NaN. Each column containing NaN is then
    filled row by row with ``ployinterp_columns`` (values filled earlier feed
    later interpolations), and the result is printed and written to CSV.
    """
    # Assign on the frame itself; going through ``cat_sale[:]`` is chained
    # assignment and may only modify a temporary object.
    cat_sale[cat_sale == 0] = np.nan
    for i in cat_sale.columns:
        # Computed once per column: filling row j never changes whether a
        # later row is missing.
        missing = cat_sale[i].isnull()
        if not missing.any():
            continue
        print("{0}列处理前空行数:{1}".format(i, missing.sum()))
        for j in range(len(cat_sale)):
            if missing[j]:
                x_j = cat_sale.index[j]
                cat_sale.loc[j, i] = ployinterp_columns(cat_sale[i], j, x_j)
                print('{0}行牛顿插值为{1}'.format(j, cat_sale.loc[j, i]))
        print("{0}列处理后空行数:{1}".format(i, cat_sale[i].isnull().sum()))
        print("========================================")
    print(cat_sale)
    # Raw string: same path, without unrecognised escape sequences.
    cat_sale.to_csv(r"G:\data\SCADA数据\jb4q_8_dealed.csv")
def test():
    """Fill the missing values of one column using the fast neighbour window.

    Works in place on the 'num_gearbox_sumptemp' column of the module-level
    ``cat_sale`` frame; nothing is returned or saved.
    """
    column = 'num_gearbox_sumptemp'
    # Assign on the frame itself; going through ``cat_sale[:]`` is chained
    # assignment and may only modify a temporary object.
    cat_sale[cat_sale == 0] = np.nan
    # Computed once: filling row j never changes whether a later row is missing.
    missing = cat_sale[column].isnull()
    for j in range(len(cat_sale[column])):
        if missing[j]:
            x_j = cat_sale.index[j]
            cat_sale.loc[j, column] = ployinterp_columns(cat_sale[column], j, x_j, is_fast=True)
# Script entry point: time one run of test() with timeit.default_timer and
# print the elapsed time (the difference is a float).
if __name__ == '__main__':
    start = timeit.default_timer()
    # execute()
    test()
    end = timeit.default_timer()
    print('Running time: %s Seconds' % (end - start))

View File

@ -0,0 +1,96 @@
# _*_ coding: UTF-8 _*_
'''
@Author : dingjiawen
@Date : 2022/7/11 11:43
@Usage :
@Desc :
'''
import numpy as np
import pandas as pd
# 拉格朗日插值算法
def LagrangeInterpolation(slices, x, k=5):
    """Estimate the value at position ``x`` with a Lagrange polynomial.

    :param slices: the sequence of values (e.g. a pandas Series or a list),
        indexed by integer position; missing entries are NaN
    :param x: position whose value is wanted
    :param k: number of non-missing neighbours wanted on each side of ``x``;
        if fewer than ``k`` exist before ``x``, more are taken after it, up to
        ``2 * k`` nodes in total
    :return: the interpolated value, or 0 when no non-missing neighbour exists
    """
    X = []  # positions of the supporting points
    Y = []  # values of the supporting points

    # Walk backwards for up to k non-missing values.
    index = x - 1
    while len(X) < k and index >= 0:
        if not np.isnan(slices[index]):
            Y.append(slices[index])
            X.append(index)
        index -= 1
    # Restore ascending order before appending the values after the gap.
    X.reverse()
    Y.reverse()

    # Walk forwards until 2*k nodes are collected or the sequence ends;
    # ``index`` must stay strictly below len(slices) to remain valid.
    index = x + 1
    while len(X) < 2 * k and index < len(slices):
        if not np.isnan(slices[index]):
            Y.append(slices[index])
            X.append(index)
        index += 1

    result = 0
    for j in range(len(X)):
        # Basis polynomial of node j evaluated at x.
        basis = 1
        for i in range(len(X)):
            if i != j:
                basis = basis * (x - X[i]) / (X[j] - X[i])
        # Weight with the value of supporting point j (Y[j]); slices[j] would
        # be the j-th entry of the whole sequence, which is unrelated.
        result = result + Y[j] * basis
    return result
# Script entry point: load the SCADA CSV, treat zeros as missing, fill one
# column and then every column by Lagrange interpolation, and save the result.
if __name__ == '__main__':
    # Raw strings: same paths, without unrecognised escape sequences.
    path = r"G:\data\SCADA数据\jb4q_8.csv"
    df = pd.read_csv(path)
    columns = df.columns
    print(df.columns)

    # Turn 0 into NaN. Assign on the frame itself; going through ``df[:]`` is
    # chained assignment and may only modify a temporary object.
    df[df == 0] = np.nan

    target = 'num_gearbox_sumptemp'
    print(df[target].isnull())

    # Single-column run.
    print("之前的空值数量:", df[target].isnull().sum())
    # Computed once: filling row j never changes whether a later row is missing.
    missing = df[target].isnull()
    for j in range(len(df)):
        if missing[j]:
            df.loc[j, target] = LagrangeInterpolation(df[target], j, 5)
    print("插值之后的空值数量:", df[target].isnull().sum())

    # Whole-frame run.
    print("之前的空值数量:", df.isnull().sum())
    for i in columns:
        missing = df[i].isnull()
        if missing.any():
            for j in range(len(df)):
                if missing[j]:
                    # ``i`` already is the column name; columns[i] would index
                    # the Index object with a label and fail.
                    df.loc[j, i] = LagrangeInterpolation(df[i], j, 3)
    print("插值之后的空值数量:", df.isnull().sum())
    df.to_csv(r"G:\实验室/2022项目中期\数据治理算法\jb4q_8_lagrange.csv")

View File

@ -0,0 +1,149 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 7 09:23:31 2020
@author: AlbertHu
"""
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 5 21:33:46 2020
@author: AlbertHu
"""
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 5 10:40:27 2020
@author: AlbertHu
"""
import os
import time
import numpy as np
import pandas as pd
import datetime
def findallfiles(cmsfilesfatherpath):
    """Return the paths of the entries directly inside a directory that are not directories.

    Sub-directories are skipped, not descended into.

    :param cmsfilesfatherpath: directory to list
    :return: list of ``os.path.join(cmsfilesfatherpath, name)`` paths, in the
        order given by ``os.listdir``
    """
    candidates = (
        os.path.join(cmsfilesfatherpath, name)
        for name in os.listdir(cmsfilesfatherpath)
    )
    return [candidate for candidate in candidates if not os.path.isdir(candidate)]
def findIndexOfExceptPoint(data):
    """Collect the row labels that violate each of five plausibility rules.

    :param data: DataFrame with the SCADA columns read below (the column
        names start with a space)
    :return: list of five lists of row labels, one list per rule, in rule
        order; a row may appear in several lists
    """
    wind = data[' 瞬时风速']
    pitch = data[' 1#叶片变桨角度']
    power = data[' 有功功率']
    front_temp = data[' 齿轮箱高速轴前端温度']
    rear_temp = data[' 齿轮箱高速轴后端温度']
    inlet_press = data[' 齿轮箱进口压力']
    pump_press = data[' 齿轮箱泵出口压力']
    temperatures = (
        front_temp,
        rear_temp,
        data[' 齿轮箱冷却水温'],
        data[' 齿轮箱进口油温'],
        data[' 齿轮箱油池温度'],
        data[' 环境温度'],
    )

    indexList1 = []
    indexList2 = []
    indexList3 = []
    indexList4 = []
    indexList5 = []
    print("开始清洗")
    for i in data.index:
        if i % 10000 == 0:
            print("已处理了{}组数据".format(i))

        # Rule 1: pitch angle / power against wind speed. The ranges are
        # disjoint, so the original if/elif chain is a plain "any of these".
        # NOTE(review): wind speeds in (10, 11) match none of the ranges -
        # confirm whether that gap is intended.
        v, angle, p = wind[i], pitch[i], power[i]
        if ((v < 3.5 and angle > 89)
                or (3.5 <= v <= 10 and angle > 0.5)
                or (11 <= v <= 25 and p < 1800 and angle > 1.5)
                or (v > 25 and p > 0)):
            indexList1.append(i)

        # Rule 2: any temperature beyond +/-200. The magnitude is taken before
        # the comparison for every column, the ambient temperature included.
        if any(abs(column[i]) > 200 for column in temperatures):
            indexList2.append(i)

        # Rules 3 and 6: high-speed-shaft bearing above 80, or front and rear
        # bearing more than 20 apart.
        if front_temp[i] > 80 or rear_temp[i] > 80 or abs(front_temp[i] - rear_temp[i]) > 20:
            indexList3.append(i)

        # Rule 4: power above 100 while the inlet pressure is not positive.
        if p > 100 and inlet_press[i] <= 0:
            indexList4.append(i)

        # Rule 5: inlet and pump-outlet pressure more than 5 apart.
        if abs(inlet_press[i] - pump_press[i]) > 5:
            indexList5.append(i)

    indexList2D = [indexList1, indexList2, indexList3, indexList4, indexList5]
    return indexList2D
# #条件6
# if data[' 齿轮箱高速轴前端温度'][i] > 80 or data[' 齿轮箱高速轴后端温度'][i]) > 80:
# Driver script: for every file in ``fathpath``, find the rows that violate
# the plausibility rules, log their labels, drop them and save the rest.
fathpath = r'D:\1.SCADA_风电数据\靖边二期2019_已处理'
allfilepaths = findallfiles(fathpath)
# Alternative inputs kept from the original:
# allfilepaths = [r'F:\scada_ewma本地数据2(重要)\data\DataResult(靖边二期2019)\风机7.csv']
# testpath = r'F:\scada_ewma本地数据2(重要)\data\DataResult(粤水电达坂城2020.1月-5月)\风机1.csv'
for testpath in allfilepaths:
    data = pd.read_csv(testpath, encoding='gbk', parse_dates=['时间'])
    indexList2D = findIndexOfExceptPoint(data)
    # Output locations are derived from the turbine number in the row labelled 1.
    savePath = r'./cleanScada/JB2Q615/风机{}'.format(data['风机号'][1])
    os.makedirs(savePath, exist_ok=True)
    # One log line per rule: the comma-terminated row labels followed by the
    # 1-based rule number. ``with`` closes the file even if a write fails.
    with open(savePath + '/IndexOfExceptPoint.txt', 'w') as report:
        for rule_number, row_labels in enumerate(indexList2D, start=1):
            report.write(''.join(str(i) + ',' for i in row_labels))
            report.write('{}\n'.format(rule_number))
            # A row flagged by an earlier rule is already gone; ignore it
            # instead of swallowing every exception with a bare ``except``.
            data.drop(index=row_labels, inplace=True, errors='ignore')
    data.to_csv(savePath + '.csv', encoding='gbk')

View File

@ -0,0 +1,67 @@
# _*_ coding: UTF-8 _*_
'''
@Author : dingjiawen
@Date : 2022/7/7 10:29
@Usage : 对SCADA数据进行基础的清洗工作
@Desc :
'''
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import time
from condition_monitoring.lib.IOBase import ioLib
'''
超参数设置
'''
# Parent directory of the files to process. Raw strings keep the Windows
# backslashes literal without relying on unrecognised escape sequences.
fatherPath = r"G:\data\SCADA数据\华能三塘湖"
# Parent directory for the processed files.
fatherDealedPath = r"G:\data\SCADA数据\华能三塘湖\dealed"
# Columns read from the raw CSV files.
baseUseCols = ["时间", "风机号", "发电机转矩", "发电机无功功率", "发电机转速", "发电机有功功率", "发电机绕组最高温度", "齿轮箱油池温度", "齿轮箱进口油温", "齿轮箱进口压力",
               "齿轮箱油泵出口压力", "齿轮箱冷却水温度", "有功功率", "60s平均有功功率", "10min平均有功功率", "10s平均有功功率", "10s平均无功功率", "无功功率", "瞬时风速",
               "机舱温度"]
baseWinds = []
# 列出父目录下所有文件
def listFile(fatherPath = fatherPath):
    """Return the paths of the entries directly inside a directory that are not directories.

    Sub-directories are skipped, not descended into.

    :param fatherPath: directory to list; defaults to the module-level
        ``fatherPath`` as it was when this function was defined
    :return: list of joined paths, in the order given by ``os.listdir``
    """
    candidates = (os.path.join(fatherPath, name) for name in os.listdir(fatherPath))
    return [candidate for candidate in candidates if not os.path.isdir(candidate)]
def dropNa(filePath):
    """Read one raw CSV, drop every row with a missing value and save the rest.

    The frame is printed before and after the rows are dropped.

    :param filePath: path of the gbk-encoded CSV file to clean
    """
    data = pd.read_csv(filePath, low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
    print(data)
    data.dropna(axis=0, how='any', inplace=True)
    print(data)
    # The original called data.append() with no argument here, which always
    # raises; the call is removed so that the save below is reached.
    ioLib.saveCSV(data=data, savePath=fatherDealedPath)
def separateByWindNum(data):
    """Unfinished stub: creates three empty lists and returns ``None``.

    :param data: not used yet
    """
    indexLists, windList1, windList2 = [], [], []
# Script entry point: only a sample input path is set; nothing is processed.
# NOTE(review): the backslashes in this non-raw literal survive only because
# they do not form recognised escape sequences.
if __name__ == '__main__':
    filePath = "G:\data\SCADA数据\华能三塘湖/1华能三塘湖20180730-20180803.csv"

View File

@ -0,0 +1,228 @@
import pandas as pd
import numpy as np
import tensorflow as tf
import csv
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Path of the source data file. Raw string, like ``save_path`` below, so the
# backslashes do not depend on unrecognised escape sequences.
# source_path = r'G:\data\SCADA数据\jb4q_8.csv'
source_path = r"G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"
# Path under which the modified data is stored.
save_path = r'G:\data\SCADA数据\jb4q_8_delete_total_zero.csv'
'''需要的列'''
# baseUseCols = ["num_gearbox_sumptemp","num_gearbox_inletoiltemp","num_gearbox_inletpress","num_gearbox_coolingwatertemp"]
# target_path = r'G:\data\SCADA数据\华能三塘湖/dealed/后十万2018.01.16.csv'
# target_folder = r'G:\data\SCADA数据\华能三塘湖/dealed'
# 生成文件夹
def folderGenerate(folder_name):
    """Create ``folder_name`` (with missing parents) unless the path already exists."""
    if os.path.exists(folder_name):
        return
    os.makedirs(folder_name)
# 皮尔逊相关系数
def cal_correlation_coefficient(data, label):
    """Print and return the correlation matrix of the columns of ``data``.

    :param data: 2-D data (must expose ``shape``), one variable per column
    :param label: not used by the active code; the disabled heatmap used it
        for the tick labels
    :return: ``pd.DataFrame(data).corr()``
    """
    print("计算皮尔逊相关系数")
    print(data)
    print(data.shape)
    person = pd.DataFrame(data).corr()
    print(person)
    # A seaborn heatmap of ``person`` (annot=True, tick labels from ``label``)
    # with a 'Times New Roman' font setup, a title and a restyled colour bar
    # was drawn here in an earlier version; that code is disabled.
    return person
def get_most_N_correlation_coefficient(person, N=10):
    """Return the indices of the ``N`` variables most correlated with the rest.

    Each variable is scored with the sum of ``1 - |r|`` over its row of the
    correlation matrix; a smaller score means a stronger overall correlation.

    :param person: square correlation matrix (``execute`` passes a NumPy array)
    :param N: number of indices wanted; if it is not smaller than the number
        of variables, all indices are returned
    :return: array with the indices of the ``N`` smallest scores (in no
        guaranteed order when ``N`` is smaller than the number of variables)
    """
    print("获得相关度最高的{}个值".format(N))
    abs_correlation = np.abs(person)
    one = np.ones(shape=abs_correlation.shape)
    two = np.subtract(one, abs_correlation)
    rows, cols = two.shape
    total_sum = [np.sum(two[i]) for i in range(cols)]
    print("total_sum:", total_sum)

    # np.argpartition requires kth < len(total_sum); when every variable is
    # wanted anyway, a full sort is used instead of raising.
    if N >= len(total_sum):
        order = np.argsort(total_sum)
    else:
        order = np.argpartition(total_sum, N)
    print("arg:", order)

    # Smallest scores = largest correlation, because the scores are 1 - |r|.
    smallest = order[:N]
    print("min:", smallest)
    return smallest
# 过滤或者线性填充
def findIndexOfExceptPoint(data: pd.DataFrame):
    """Collect the labels of rows whose three gearbox readings are all 0.

    Rows labelled 416166 to 432766 (inclusive) are never collected. Two
    stricter variants (dropping rows where any of several signals is 0) exist
    only as disabled code in the original.

    :param data: frame with the three ``num_gearbox_*`` columns read below
    :return: set of row labels to remove
    """
    zero_columns = (
        'num_gearbox_sumptemp',
        'num_gearbox_inletoiltemp',
        'num_gearbox_inletpress',
    )
    flagged = []
    print("开始清洗")
    for i in data.index:
        if i % 10000 == 0:
            print("已处理了{}条数据".format(i))
        if 416166 <= i <= 432766:
            continue  # protected label range
        # Only rows where every one of the three readings is 0 are dropped.
        if all(data[column][i] == 0 for column in zero_columns):
            flagged.append(i)
    indexList2D = set(flagged)
    print("要移除的index:", indexList2D)
    return indexList2D
# Drop abnormal rows by their index labels
def removeDataByIndex(indexList, data):
    """Remove the rows labelled by ``indexList`` from ``data`` in place.

    Args:
        indexList: index labels of the rows to drop.
        data: frame to modify.

    Returns:
        The same (mutated) ``data`` object.
    """
    print("开始移除异常index的数据")
    data.drop(labels=indexList, axis=0, inplace=True)
    return data
# Clean the data (drop rows, reassign values, or other operations)
def dealData(scada_data: pd.DataFrame):
    """Drop the all-zero rows from ``scada_data`` in place and save the result.

    The rows reported by ``findIndexOfExceptPoint`` are removed with
    ``removeDataByIndex``; the cleaned frame is then written to the
    module-level ``save_path`` as a GBK-encoded CSV without the index column.

    Returns:
        The same, now cleaned, ``scada_data`` frame.
    """
    # Whether the cleaned data is written to disk
    Is_save = True
    rows_to_drop = findIndexOfExceptPoint(scada_data)
    removeDataByIndex(indexList=rows_to_drop, data=scada_data)
    print("处理后的数据为:")
    print(scada_data)
    if not Is_save:
        return scada_data
    print("============保存处理好的数据,路径为{}============".format(save_path))
    scada_data.to_csv(save_path, index=False, encoding='gbk')
    return scada_data
# Read the data and convert it to a NumPy array
def read_data(file_name, isNew: bool = False):
    """Load a SCADA CSV file and return its numeric block plus short labels.

    Args:
        file_name: path of the CSV file.
        isNew: when True the raw file is parsed with pandas and cleaned by
            ``dealData`` first; when False the file is read as text with
            ``np.loadtxt`` and its first line is treated as the header.

    Returns:
        Tuple ``(needed_data, label)``: a float array built from column 3
        onwards and the hard-coded list of 18 short variable names.
    """
    with open(file_name, 'r') as f:
        if isNew:
            # scada_data = pd.read_csv(f,low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
            scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', parse_dates=['realtime'])
            print(scada_data)
            scada_data = dealData(scada_data=scada_data)
            print(scada_data.head())
            scada_data = np.array(scada_data)
            # np.array(DataFrame) holds values only (no header row), so every
            # row is data.
            first_data_row = 0
        else:
            scada_data = np.loadtxt(f, str, delimiter=",")
            # Row 0 of the text table is the CSV header.
            first_data_row = 1
    label = ['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt',
             'Vx', 'Vy']
    print("导入数据成功,将数据转为numpy或tf数组...")
    # np.float was removed in NumPy 1.24; the builtin float is the same dtype.
    needed_data = scada_data[first_data_row:, 3:].astype(dtype=float)
    # A tf tensor is not used here because it cannot be turned into a pd.DataFrame.
    print(needed_data)
    print("转换成功,并返回...")
    return needed_data, label
def plot_original_data(data):
    """Plot every column of the 2-D array ``data`` in its own figure."""
    _, n_cols = data.shape
    print("开始画图...")
    for col in range(n_cols):
        plt.figure(col)
        series = data[:, col]
        plt.plot(series)
        plt.show()
def execute(file_name=source_path, N=10):
    """Load the data file and keep only its N most correlated columns.

    Args:
        file_name: CSV file passed to ``read_data`` (read with ``isNew=False``).
        N: number of columns to keep, chosen by
            ``get_most_N_correlation_coefficient``.

    Returns:
        2-D float array with one column per selected variable, in selection
        order.
    """
    needed_data, label = read_data(file_name=file_name, isNew=False)
    print(needed_data)
    print(needed_data.shape)
    # plot_original_data(needed_data)
    person = np.array(cal_correlation_coefficient(needed_data, label))
    selected = get_most_N_correlation_coefficient(person, N=N)
    # One fancy-index gathers all selected columns at once instead of growing
    # the result with one concatenate (and one full copy) per column.
    total_data = np.ascontiguousarray(needed_data[:, selected])
    return total_data
def deal_data(file_name=source_path):
    """Load the raw SCADA CSV, clean it with ``dealData`` and return numeric data.

    Args:
        file_name: path of the raw CSV file; it must contain a ``realtime``
            column, which is parsed as dates.

    Returns:
        Tuple ``(needed_data, label)``: a float array built from column 3
        onwards of the cleaned frame and the hard-coded list of 18 short
        variable names.
    """
    with open(file_name, 'r') as f:
        # scada_data = pd.read_csv(f,low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
        scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', parse_dates=['realtime'])
    print(scada_data)
    scada_data = dealData(scada_data=scada_data)
    print(scada_data.head())
    # Use the cleaned frame directly: reading the same handle again with
    # np.loadtxt after pandas has consumed it yields an empty array, which
    # cannot be sliced in two dimensions.
    scada_data = np.array(scada_data)
    label = ['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt',
             'Vx', 'Vy']
    print("导入数据成功,将数据转为numpy或tf数组...")
    # The array holds values only (no header row), so every row is kept.
    # np.float was removed in NumPy 1.24; the builtin float is the same dtype.
    needed_data = scada_data[:, 3:].astype(dtype=float)
    print(needed_data)
    print("转换成功,并返回...")
    return needed_data, label
if __name__ == "__main__":
    # Build the matrix of the 10 most correlated columns from the default file.
    total_data = execute(file_name=source_path, N=10)
    # print(total_data)
    # print(total_data.shape)
    # plot_original_data()

View File

@ -0,0 +1,207 @@
import pandas as pd
import numpy as np
import tensorflow as tf
import csv
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Path of the source data file
# source_path = r'G:\data\SCADA数据\jb4q_8.csv'
# Raw string: "\d", "\S" and "\j" are not valid escape sequences, so the
# non-raw spelling only works by accident and triggers a syntax warning.
source_path = r"G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"
# Path the cleaned data is written to
save_path = r'G:\data\SCADA数据\jb4q_8_delete_total_zero.csv'
# Columns of interest (currently disabled)
# baseUseCols = ["num_gearbox_sumptemp","num_gearbox_inletoiltemp","num_gearbox_inletpress","num_gearbox_coolingwatertemp"]
# target_path = r'G:\data\SCADA数据\华能三塘湖/dealed/后十万2018.01.16.csv'
# target_folder = r'G:\data\SCADA数据\华能三塘湖/dealed'
#96748 107116
# Create a folder
def folderGenerate(folder_name):
    """Create ``folder_name`` (and any missing parents) if it does not exist.

    ``exist_ok=True`` makes the call safe when the directory already exists
    or is created by someone else between a check and the creation.
    """
    os.makedirs(folder_name, exist_ok=True)
# Pearson correlation coefficients
def cal_correlation_coefficient(data, label):
    """Return the pairwise correlation matrix of the columns of ``data``.

    Args:
        data: 2-D array-like; each column is one variable.
        label: axis labels for the (disabled) heat-map below; not used
            otherwise.

    Returns:
        ``pd.DataFrame`` produced by ``DataFrame.corr()``.
    """
    print("计算皮尔逊相关系数")
    person = pd.DataFrame(data).corr()
    print(person)
    # Heat-map of the matrix (disabled)
    # cmap = sns.heatmap(person, annot=True, xticklabels=label, yticklabels=label)
    # plt.figure(1, figsize=(6.0, 2.68))
    # plt.subplots_adjust(left=0.1, right=0.94, bottom=0.2, top=0.9, wspace=None,
    #                     hspace=None)
    # plt.tight_layout()
    # font1 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 10}  # font of the axis labels
    # font2 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 15}  # font of the colour-bar label
    # plt.xlabel("X", size=10,fontdict=font1)
    # plt.ylabel("Y", size=10,fontdict=font1)
    # plt.title("Heatmap of correlation coefficient matrix", size=20,fontdict=font1)
    #
    # # Adjust the colour-bar labels:
    # cbar = cmap.collections[0].colorbar
    # cbar.ax.tick_params(labelsize=15, labelcolor="black")
    # cbar.ax.set_ylabel(ylabel="color scale", color="red", loc="center",fontdict=font2)
    #
    # plt.show()
    return person
def get_most_N_correlation_coefficient(person, N=10):
    """Pick the N variables that are most correlated with all the others.

    For every row of the correlation matrix the function sums ``1 - |r|``;
    the smaller that sum, the stronger the variable correlates with the rest.

    Args:
        person: square 2-D correlation matrix as a NumPy array (row ``i`` is
            read with ``matrix[i]``).
        N: number of variables to select.

    Returns:
        1-D integer array with the indices of the N smallest sums, in the
        (unsorted) order produced by ``np.argpartition``.  When ``N`` is not
        smaller than the number of variables every index is returned instead
        of raising ``ValueError``.
    """
    print("获得相关度最高的{}个值".format(N))
    abs_correlation = np.abs(person)
    distance = np.subtract(np.ones(shape=abs_correlation.shape), abs_correlation)
    rows, cols = distance.shape
    total_sum = [np.sum(distance[i]) for i in range(cols)]
    print("total_sum:", total_sum)
    if not total_sum:
        return np.array([], dtype=np.intp)
    # Smallest sums win because the values were subtracted from 1.
    # argpartition needs kth < len(total_sum); clamping keeps N >= len valid
    # while leaving the result for smaller N unchanged.
    kth = min(N, len(total_sum) - 1)
    order = np.argpartition(total_sum, kth)
    print("arg:", order)
    selected = order[:N]
    print("min:", selected)
    return selected
# Filter (or linearly fill) abnormal samples
def findIndexOfExceptPoint(data: pd.DataFrame, keep_start: int = 416166, keep_end: int = 432766):
    """Find the index labels of rows whose three gearbox signals are all zero.

    Rows whose index label lies inside ``[keep_start, keep_end]`` are never
    reported, so that stretch of the recording is kept even when it is zero.
    Only the "all three signals are zero" rule is applied; rows with a single
    zero signal are kept.

    Args:
        data: frame with a numeric index and the columns
            ``num_gearbox_sumptemp``, ``num_gearbox_inletoiltemp`` and
            ``num_gearbox_inletpress``.
        keep_start: first index label of the protected range.
        keep_end: last index label of the protected range.

    Returns:
        ``set`` of index labels that should be removed.
    """
    print("开始清洗")
    labels = data.index
    # Whole-column comparisons replace the per-row label lookups.
    outside_protected = np.asarray((labels < keep_start) | (labels > keep_end))
    all_zero = (
        (data['num_gearbox_sumptemp'] == 0)
        & (data['num_gearbox_inletoiltemp'] == 0)
        & (data['num_gearbox_inletpress'] == 0)
    ).to_numpy()
    indexList2D = set(labels[outside_protected & all_zero].tolist())
    print("要移除的index:", indexList2D)
    return indexList2D
# Drop abnormal rows by their index labels
def removeDataByIndex(indexList, data):
    """Remove the rows labelled by ``indexList`` from ``data`` in place.

    Args:
        indexList: index labels of the rows to drop.
        data: frame to modify.

    Returns:
        The same (mutated) ``data`` object.
    """
    print("开始移除异常index的数据")
    data.drop(labels=indexList, axis=0, inplace=True)
    return data
# Clean the data (drop rows, reassign values, or other operations)
def dealData(scada_data: pd.DataFrame):
    """Drop the all-zero rows from ``scada_data`` in place and save the result.

    The rows reported by ``findIndexOfExceptPoint`` are removed with
    ``removeDataByIndex``; the cleaned frame is then written to the
    module-level ``save_path`` as a GBK-encoded CSV without the index column.

    Returns:
        The same, now cleaned, ``scada_data`` frame.
    """
    # Whether the cleaned data is written to disk
    Is_save = True
    rows_to_drop = findIndexOfExceptPoint(scada_data)
    removeDataByIndex(indexList=rows_to_drop, data=scada_data)
    print("处理后的数据为:")
    print(scada_data)
    if not Is_save:
        return scada_data
    print("============保存处理好的数据,路径为{}============".format(save_path))
    scada_data.to_csv(save_path, index=False, encoding='gbk')
    return scada_data
# Read the data and convert it to a NumPy array
def read_data(file_name, isNew: bool = False):
    """Load a SCADA CSV file and return its numeric block plus short labels.

    Args:
        file_name: path of the CSV file.
        isNew: when True the raw file is parsed with pandas and cleaned by
            ``dealData`` first; when False the file is read as text with
            ``np.loadtxt`` and its first line is treated as the header.

    Returns:
        Tuple ``(needed_data, label)``: a float array built from column 4
        onwards and the hard-coded list of 18 short variable names.
    """
    with open(file_name, 'r') as f:
        if isNew:
            # scada_data = pd.read_csv(f,low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
            scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', parse_dates=['realtime'])
            print(scada_data)
            scada_data = dealData(scada_data=scada_data)
            print(scada_data.head())
            scada_data = np.array(scada_data)
            # np.array(DataFrame) holds values only (no header row), so every
            # row is data.
            first_data_row = 0
        else:
            scada_data = np.loadtxt(f, str, delimiter=",")
            # Row 0 of the text table is the CSV header.
            first_data_row = 1
    label = ['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt',
             'Vx', 'Vy']
    print("导入数据成功,将数据转为numpy或tf数组...")
    # np.float was removed in NumPy 1.24; the builtin float is the same dtype.
    needed_data = scada_data[first_data_row:, 4:].astype(dtype=float)
    # A tf tensor is not used here because it cannot be turned into a pd.DataFrame.
    print(needed_data)
    print("转换成功,并返回...")
    return needed_data, label
def plot_original_data(data):
    """Plot every column of the 2-D array ``data`` in its own figure."""
    _, n_cols = data.shape
    print("开始画图...")
    for col in range(n_cols):
        plt.figure(col)
        series = data[:, col]
        plt.plot(series)
        plt.show()
def execute(file_name=source_path, N=10):
    """Load the data file and keep only its N most correlated columns.

    Args:
        file_name: CSV file passed to ``read_data`` (read with ``isNew=False``).
        N: number of columns to keep, chosen by
            ``get_most_N_correlation_coefficient``.

    Returns:
        2-D float array with one column per selected variable, in selection
        order.
    """
    needed_data, label = read_data(file_name=file_name, isNew=False)
    print(needed_data)
    print(needed_data.shape)
    # plot_original_data(needed_data)
    person = np.array(cal_correlation_coefficient(needed_data, label))
    selected = get_most_N_correlation_coefficient(person, N=N)
    # One fancy-index gathers all selected columns at once instead of growing
    # the result with one concatenate (and one full copy) per column.
    total_data = np.ascontiguousarray(needed_data[:, selected])
    return total_data
if __name__ == '__main__':
    # total_data = execute(N=10, file_name=source_path)
    # print(total_data)
    # print(total_data.shape)
    # Author's note: "7 10 13" (presumably turbine numbers -- TODO confirm)
    # Turbine 15 has a stretch in the middle that differs a lot.
    # Raw string: "\d" and "\S" are not valid escape sequences.
    file_name = r'H:\data\SCADA数据\SCADA_已处理_粤水电达坂城2020.1月-5月/风机15.csv'
    needed_data, label = read_data(file_name=file_name, isNew=False)
    print(needed_data.shape)
    plot_original_data(needed_data)

View File

@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
# coding: utf-8
'''
@Author : dingjiawen
@Date : 2022/11/2 12:59
@Usage : 画原始数据
@Desc :
'''
import pandas as pd
import numpy as np
# Raw string: "\d", "\S" and "\j" are not valid escape sequences.
source_path = r"G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"
def deal_data(file_name=source_path):
    """Load the first 36999 data rows of a SCADA CSV file as floats.

    The file is read as text; row 0 is the header and is skipped, and only
    columns from index 3 onwards are converted.

    Args:
        file_name: path of the comma-separated file.

    Returns:
        Tuple ``(needed_data, label)``: the float array
        ``table[1:37000, 3:]`` and the hard-coded list of 18 short variable
        names.
    """
    with open(file_name, 'r') as f:
        scada_data = np.loadtxt(f, str, delimiter=",")
    label = ['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt',
             'Vx', 'Vy']
    print("导入数据成功,将数据转为numpy或tf数组...")
    # np.float was removed in NumPy 1.24; the builtin float is the same dtype.
    needed_data = scada_data[1:37000, 3:].astype(dtype=float)
    # A tf tensor is not used here because it cannot be turned into a pd.DataFrame.
    print(needed_data)
    print("转换成功,并返回...")
    return needed_data, label
# Min-max normalisation
def normalization(data):
    """Scale every column of the 2-D array ``data`` to the range [0, 1].

    Each column is transformed as ``(x - min) / (max - min)``.  A constant
    column (``max == min``) is mapped to 0 instead of producing NaN from a
    0/0 division.

    Returns:
        New array of the same shape as ``data``.
    """
    rows, cols = data.shape
    print("归一化之前:", data)
    print(data.shape)
    print("======================")
    col_min = np.min(data, axis=0)
    span = np.max(data, axis=0) - col_min
    # Dividing a constant column by 1 leaves its (zero) numerator untouched.
    safe_span = np.where(span == 0, 1, span)
    data = (data - col_min) / safe_span
    print("归一化之后:", data)
    print(data.shape)
    return data
if __name__ == '__main__':
    # Load the data, min-max scale it and store the result as CSV.
    needed_data, label = deal_data()
    data = normalization(data=needed_data)
    # Raw string: "\d" and "\S" are not valid escape sequences.
    np.savetxt(r'G:\data\SCADA数据/normalization.csv', data, delimiter=',')
    print(data.shape)

View File

@ -0,0 +1,262 @@
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from condition_monitoring.data_deal import loadData
from keras.callbacks import EarlyStopping
import os
import shutil
# Idea from senior labmate Kong: CNN + GRU
# Hyper-parameters
time_stamp = 120
feature_num = 10
batch_size = 8
learning_rate = 0.01
EPOCH = 101
model_name = "CNN_GRU"
# EWMA hyper-parameters
K = 18
namuda = 0.01
# File the model is saved to / loaded from
save_name = "../model/{0}_timestamp{1}_featureNum{2}_batch_size{3}_Epoch{4}.h5".format(model_name,
                                                                                        time_stamp, feature_num,
                                                                                        batch_size, EPOCH)
# Data file.  Raw string: "\d", "\S" and "\j" are not valid escape sequences.
file_name = r"G:\data\SCADA数据\jb4q_8_delete_all_zero.csv"
def remove(data, time_stamp=time_stamp):
    """Trim ``data`` so that its row count is a whole multiple of ``time_stamp``.

    Returns the leading rows of the 2-D array; trailing rows that do not fill
    a complete window are cut off.
    """
    rows, cols = data.shape
    print("remove_data.shape:", data.shape)
    whole_windows = int(rows / time_stamp)
    usable_rows = whole_windows * time_stamp
    return data[:usable_rows, :]
# Non-overlapping sampling
def get_training_data(data, time_stamp=time_stamp):
    """Cut ``data`` into consecutive, non-overlapping windows with labels.

    The label of a window is the first row of the following window, so the
    last window has no label and is dropped from the returned samples.

    Args:
        data: 2-D array (rows are time steps).
        time_stamp: window length in rows.

    Returns:
        Tuple ``(train_data, train_label)`` with shapes
        ``(windows - 1, time_stamp, cols)`` and ``(windows - 1, cols)``.
    """
    # Forward time_stamp so trimming and reshaping agree on the window length.
    removed_data = remove(data=data, time_stamp=time_stamp)
    rows, cols = removed_data.shape
    train_data = np.reshape(removed_data, [-1, time_stamp, cols])
    # First row of every window except the first; a single slice replaces the
    # per-window concatenate and also works when there is only one window.
    train_label = train_data[1:, 0, :].copy()
    return train_data[:-1, :], train_label
# Overlapping (sliding-window) sampling
def get_training_data_overlapping(data, time_stamp=time_stamp):
    """Build sliding windows of ``time_stamp`` rows with a one-step-ahead label.

    Window ``i`` holds rows ``i .. i + time_stamp - 1`` and its label is row
    ``i + time_stamp``.  ``rows - time_stamp - 1`` samples are produced.

    Returns:
        Tuple ``(train_data, train_label)``.
    """
    rows, cols = data.shape
    sample_count = rows - time_stamp - 1
    train_data = np.empty(shape=[sample_count, time_stamp, cols])
    train_label = np.empty(shape=[sample_count, cols])
    for start in range(sample_count):
        stop = start + time_stamp
        train_data[start] = data[start:stop]
        train_label[start] = data[stop]
    print("重叠采样以后:")
    print("data:", train_data)
    print("label:", train_label)
    return train_data, train_label
def condition_monitoring_model():
    """Build the Conv1D -> GRU -> Dense -> Dense prediction model.

    The input has shape ``(time_stamp, feature_num)`` per sample and the
    output layer has 10 units.
    """
    inputs = tf.keras.Input(shape=[time_stamp, feature_num])
    features = tf.keras.layers.Conv1D(filters=256, kernel_size=1)(inputs)
    features = tf.keras.layers.GRU(128, return_sequences=False)(features)
    features = tf.keras.layers.Dense(300)(features)
    outputs = tf.keras.layers.Dense(10)(features)
    return tf.keras.Model(inputs=inputs, outputs=outputs)
# Min-max normalisation
def normalization(data):
    """Scale every column of the 2-D array ``data`` to the range [0, 1].

    Each column is transformed as ``(x - min) / (max - min)``.  A constant
    column (``max == min``) is mapped to 0 instead of producing NaN from a
    0/0 division.

    Returns:
        New array of the same shape as ``data``.
    """
    rows, cols = data.shape
    print("归一化之前:", data)
    print(data.shape)
    print("======================")
    col_min = np.min(data, axis=0)
    span = np.max(data, axis=0) - col_min
    # Dividing a constant column by 1 leaves its (zero) numerator untouched.
    safe_span = np.where(span == 0, 1, span)
    data = (data - col_min) / safe_span
    print("归一化之后:", data)
    print(data.shape)
    return data
# Standardisation (z-score)
def Regularization(data):
    """Standardise every column of the 2-D array ``data``.

    Each column is transformed as ``(x - mean) / std`` using the population
    standard deviation.  A constant column (``std == 0``) is mapped to 0
    instead of producing NaN from a 0/0 division.

    Returns:
        New array of the same shape as ``data``.
    """
    rows, cols = data.shape
    print("正则化之前:", data)
    print(data.shape)
    print("======================")
    mean = np.mean(data, axis=0)
    dst = np.sqrt(np.var(data, axis=0))
    # Dividing a constant column by 1 leaves its (zero) numerator untouched.
    safe_dst = np.where(dst == 0, 1, dst)
    data = (data - mean) / safe_dst
    print("正则化之后:", data)
    print(data.shape)
    return data
def EWMA(data, K=K, namuda=namuda, t=None):
    """Return the centre line and the EWMA control limits of ``data``.

    The limits are
    ``mean +/- K * std * sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** (2 * t)))``.

    Args:
        data: array whose mean and standard deviation are taken along axis 0.
        K: width of the control band in standard deviations.
        namuda: EWMA smoothing factor (lambda).
        t: sample number used for time-varying limits.  ``None`` (default)
            gives the steady-state limits, i.e. the time factor equals 1.

    Returns:
        Tuple ``(mid, UCL, LCL)``.
    """
    mid = np.mean(data, axis=0)
    standard = np.sqrt(np.var(data, axis=0))
    # The exponent is 2 * t; writing "** 2 * t" would multiply by t instead.
    time_factor = 1 if t is None else 1 - (1 - namuda) ** (2 * t)
    half_width = K * standard * np.sqrt(namuda / (2 - namuda) * time_factor)
    UCL = mid + half_width
    LCL = mid - half_width
    return mid, UCL, LCL
def get_MSE(data, label, new_model):
    """Score every sample by its standardised prediction error.

    The absolute error ``|predict(data) - label|`` is standardised per column
    (mean and standard deviation over all samples); the squared standardised
    values are summed per sample.

    Args:
        data: model input.
        label: expected output, same shape as the prediction.
        new_model: object with a ``predict`` method.

    Returns:
        Tuple ``(mse, mean, max)``: the per-sample score, and two arrays of
        the same length filled with the mean score and with
        ``mean + 3 * std`` of the score.
    """
    predicted_data = new_model.predict(data)
    abs_error = np.abs(predicted_data - label)
    centred = abs_error - np.broadcast_to(np.mean(abs_error, axis=0), shape=predicted_data.shape)
    spread = np.broadcast_to(np.sqrt(np.var(abs_error, axis=0)), shape=predicted_data.shape)
    mse = np.sum((centred / spread) ** 2, axis=1)
    print("z:", mse)
    print(mse.shape)
    # mse=np.mean((predicted_data-label)**2,axis=1)
    print("mse", mse)
    dims, = mse.shape
    centre = np.mean(mse)
    upper = centre + 3 * np.sqrt(np.var(mse))
    upper_line = np.broadcast_to(upper, shape=[dims, ])
    centre_line = np.broadcast_to(centre, shape=[dims, ])
    return mse, centre_line, upper_line
if __name__ == '__main__':
    # Experiment script: load and scale the data, score it with the saved
    # model and plot the scores together with their control limits.
    total_data = loadData.execute(N=feature_num,file_name=file_name)
    total_data = normalization(data=total_data)
    # Sliding-window samples from the first 300455 rows (only used by the
    # disabled training code below).
    train_data, train_label = get_training_data_overlapping(total_data[:300455, :])
    ## TODO training
    # model = condition_monitoring_model()
    # checkpoint = tf.keras.callbacks.ModelCheckpoint(
    #     filepath=save_name,
    #     monitor='val_loss',
    #     verbose=1,
    #     save_best_only=True,
    #     mode='min',
    #     period=1)
    # lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)
    # early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=30, mode='min', verbose=1)
    # model.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate), loss=tf.losses.mse)
    # model.summary()
    # model.fit(train_data, train_label, batch_size=batch_size, epochs=EPOCH, validation_split=0.1,
    #           callbacks=[checkpoint, lr_scheduler, early_stop])
    ## TODO testing
    print("===============================")
    print(total_data.shape)
    print("===============================")
    # Reference scores: non-overlapping windows of the first 300455 rows give
    # the mean line and the mean + 3*std line used in the first plot.
    test_data, test_label = get_training_data(total_data[:300455, :])
    newModel = tf.keras.models.load_model(save_name)
    mse,mean,max = get_MSE(test_data, test_label, new_model=newModel)
    print("===============================")
    print("mse:",mse)
    print(mse.shape)
    print("===============================")
    # Scores for everything from row 20000 onwards, computed with the same
    # formula as get_MSE but standardised on this slice.
    test_data, test_label = get_training_data(total_data[20000:, :])
    predicted_data = newModel.predict(test_data)
    rows, cols = predicted_data.shape
    print("=====================================")
    print(predicted_data)
    print(predicted_data.shape)
    print("=====================================")
    temp = np.abs(predicted_data - test_label)
    temp1 = (temp - np.broadcast_to(np.mean(temp, axis=0), shape=predicted_data.shape))
    temp2 = np.broadcast_to(np.sqrt(np.var(temp, axis=0)), shape=predicted_data.shape)
    temp3 = temp1 / temp2
    mse = np.sum((temp1 / temp2) ** 2, axis=1)
    print("====================")
    print("new_mse:",mse)
    print(mse.shape)
    # Written to a file literally named "mse" in the working directory.
    np.savetxt("mse", mse, delimiter=',')
    print("===================")
    # NOTE(review): mean/max come from the reference run above and have that
    # run's length, not the length of mse[2000:] -- confirm this is intended.
    plt.plot(mse[2000:])
    plt.plot(mean)
    plt.plot(max)
    plt.show()
    # Smooth the scores with an exponentially weighted mean (span=3).
    data = pd.DataFrame(mse).ewm(span=3).mean()
    print(data)
    data =np.array(data)
    index,_ = data.shape
    # Replace smoothed values above 5 by the previous value.  The bound 2396 is
    # hard-coded (presumably the number of windows -- TODO confirm); for i == 0
    # the "previous" row is data[-1].
    for i in range(2396):
        if data[i,0] >5:
            data[i,0] = data[i-1,:]
    print(data)
    # Control lines from rows 2000..2395 of the smoothed scores; the lines are
    # drawn with a hard-coded length of 500 points.
    mean = data[2000:2396,:].mean()
    std = data[2000:2396,:].std()
    mean=np.broadcast_to(mean,shape=[500,])
    std=np.broadcast_to(std,shape=[500,])
    plt.plot(data[2000:2396,:])
    plt.plot(mean)
    plt.plot(mean+3*std)
    plt.plot(mean-3*std)
    plt.show()

View File

@ -0,0 +1,526 @@
# -*- coding: utf-8 -*-
# coding: utf-8
import tensorflow as tf
import tensorflow.keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from model.DepthwiseCon1D.DepthwiseConv1D import DepthwiseConv1D
from model.Dynamic_channelAttention.Dynamic_channelAttention import DynamicChannelAttention
from condition_monitoring.data_deal import loadData
from model.Joint_Monitoring.Joint_Monitoring2 import Joint_Monitoring
from model.CommonFunction.CommonFunction import *
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model, save_model
'''
@Author : dingjiawen
@Date : 2022/7/8 10:29
@Usage : 尝试将预测和分类两种方式相结合,联合监测
@Desc :REPVGG+unsampling+GRU进行重构,后面接GDP=全局动态池化+分类器
随epoch衰减的MSELoss+随epoch增强的crossEntropy
'''
# Hyper-parameters
time_stamp = 120
feature_num = 10
batch_size = 16
learning_rate = 0.001
EPOCH = 101
model_name = "joint"
# EWMA hyper-parameters
K = 18
namuda = 0.01
# Weight files (batch_size is passed to format() but not used by the templates)
save_name = "../model/weight/{0}_timestamp{1}_feature{2}_Epoch{4}_weight/weight".format(model_name,
                                                                                         time_stamp,
                                                                                         feature_num,
                                                                                         batch_size,
                                                                                         EPOCH)
save_step_two_name = "../model/two_weight/{0}_timestamp{1}_feature{2}_weight/weight".format(model_name,
                                                                                             time_stamp,
                                                                                             feature_num,
                                                                                             batch_size,
                                                                                             EPOCH)
# save_name = "../model/joint/{0}_timestamp{1}_feature{2}.h5".format(model_name,
#                                                                    time_stamp,
#                                                                    feature_num,
#                                                                    batch_size,
#                                                                    EPOCH)
# save_step_two_name = "../model/joint_two/{0}_timestamp{1}_feature{2}.h5".format(model_name,
#                                                                                 time_stamp,
#                                                                                 feature_num,
#                                                                                 batch_size,
#                                                                                 EPOCH)
# Data file.  Raw string: "\d", "\S" and "\j" are not valid escape sequences.
file_name = r"G:\data\SCADA数据\jb4q_8_delete_all_zero.csv"
# About jb4q_8_delete_all_zero.csv: every zero value outside the abnormal
# period has been deleted from this file.
# Rows 0:300454 are normal (2019/7/30 00:00:00 - 2019/9/18 11:21:00).
# Rows 300455:317052 are abnormal (2019/9/18 11:21:01 - 2019/9/29 23:59:00).
# File parameters
# Last normal row
healthy_date = 300454
# Last abnormal row
unhealthy_date = 317052
# Tolerance for abnormal samples
unhealthy_patience = 5
def remove(data, time_stamp=time_stamp):
    """Trim ``data`` so that its row count is a whole multiple of ``time_stamp``.

    Returns the leading rows of the 2-D array; trailing rows that do not fill
    a complete window are cut off.
    """
    rows, cols = data.shape
    print("remove_data.shape:", data.shape)
    whole_windows = int(rows / time_stamp)
    usable_rows = whole_windows * time_stamp
    return data[:usable_rows, :]
# Non-overlapping sampling
def get_training_data(data, time_stamp: int = time_stamp):
    """Cut ``data`` into consecutive, non-overlapping windows with labels.

    The label of a window is the first row of the following window, so the
    last window has no label and is dropped from the returned samples.

    Args:
        data: 2-D array (rows are time steps).
        time_stamp: window length in rows.

    Returns:
        Tuple ``(train_data, train_label)`` with shapes
        ``(windows - 1, time_stamp, cols)`` and ``(windows - 1, cols)``.
    """
    # Forward time_stamp so trimming and reshaping agree on the window length.
    removed_data = remove(data=data, time_stamp=time_stamp)
    rows, cols = removed_data.shape
    print("removed_data.shape:", data.shape)
    print("removed_data:", removed_data)
    train_data = np.reshape(removed_data, [-1, time_stamp, cols])
    print("train_data:", train_data)
    # First row of every window except the first; a single slice replaces the
    # per-window concatenate and also works when there is only one window.
    train_label = train_data[1:, 0, :].copy()
    print("train_data.shape:", train_data.shape)
    print("train_label.shape", train_label.shape)
    return train_data[:-1, :], train_label
# Overlapping (sliding-window) sampling
def get_training_data_overlapping(data, time_stamp: int = time_stamp, is_Healthy: bool = True):
    """Build sliding windows with a one-step-ahead label and a class label.

    Window ``i`` holds rows ``i .. i + time_stamp - 1`` and its regression
    label is row ``i + time_stamp``.  ``rows - time_stamp - 1`` samples are
    produced.  The class label is 1 for every sample when ``is_Healthy`` is
    true and 0 otherwise.

    Returns:
        Tuple ``(train_data, train_label, train_label2)``.
    """
    rows, cols = data.shape
    sample_count = rows - time_stamp - 1
    train_data = np.empty(shape=[sample_count, time_stamp, cols])
    train_label = np.empty(shape=[sample_count, cols])
    for start in range(sample_count):
        stop = start + time_stamp
        train_data[start] = data[start:stop]
        train_label[start] = data[stop]
    print("重叠采样以后:")
    print("data:", train_data)
    print("label:", train_label)
    fill = np.ones if is_Healthy else np.zeros
    train_label2 = fill(shape=[train_label.shape[0]])
    print("label2:", train_label2)
    return train_data, train_label, train_label2
# RepConv: re-parameterisable convolution
def RepConv(input_tensor, k=3):
    """Apply a three-branch convolution block to a rank-3 tensor.

    The branches are a ``k``-wide Conv1D + BatchNorm, a 1-wide Conv1D +
    BatchNorm and a BatchNorm of the input itself; they are added and passed
    through ReLU.  The channel count is preserved.
    """
    _, _, channels = input_tensor.shape
    wide = tf.keras.layers.Conv1D(filters=channels, kernel_size=k, strides=1, padding='SAME')(input_tensor)
    wide_bn = tf.keras.layers.BatchNormalization()(wide)
    point = tf.keras.layers.Conv1D(filters=channels, kernel_size=1, strides=1, padding='SAME')(input_tensor)
    point_bn = tf.keras.layers.BatchNormalization()(point)
    identity_bn = tf.keras.layers.BatchNormalization()(input_tensor)
    merged = tf.keras.layers.Add()([wide_bn, point_bn, identity_bn])
    return tf.nn.relu(merged)
# RepBlock module
def RepBlock(input_tensor, num: int = 3):
    """Chain ``num`` RepConv blocks and return the final tensor."""
    output = input_tensor
    for _ in range(num):
        output = RepConv(output)
    return output
# GAP: global average pooling channel attention
def Global_avg_channelAttention(input_tensor):
    """Re-weight the channels of ``input_tensor`` with a GAP-based gate.

    A depthwise convolution is pooled over the time axis, passed through a
    1-wide Conv1D and a sigmoid, and the resulting per-channel gate is
    multiplied onto the input.

    Args:
        input_tensor: rank-3 tensor (batch, length, channel).

    Returns:
        Tensor of the same shape as ``input_tensor``.
    """
    _, length, channel = input_tensor.shape
    # assumes DepthwiseConv1D keeps the (batch, length, channel) layout -- TODO confirm
    DWC1 = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
    GAP = tf.keras.layers.GlobalAvgPool1D()(DWC1)
    # Global pooling removes the time axis; Conv1D needs a rank-3 input, so a
    # length-1 time axis is added back.
    GAP = tf.expand_dims(GAP, axis=1)
    c1 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GAP)
    s1 = tf.nn.sigmoid(c1)
    # The (batch, 1, channel) gate broadcasts over the time axis.
    output = tf.multiply(input_tensor, s1)
    return output
# GDP: global dynamic pooling channel attention
def Global_Dynamic_channelAttention(input_tensor):
    """Re-weight the channels of ``input_tensor`` with a pooled gate.

    A depthwise convolution is pooled over the time axis with both average
    and max pooling; each pooled vector goes through a 1-wide Conv1D and a
    sigmoid.  Only the average-pooling gate is multiplied onto the input.

    Args:
        input_tensor: rank-3 tensor (batch, length, channel).

    Returns:
        Tensor of the same shape as ``input_tensor``.
    """
    _, length, channel = input_tensor.shape
    # assumes DepthwiseConv1D keeps the (batch, length, channel) layout -- TODO confirm
    DWC1 = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
    # GAP branch.  Global pooling removes the time axis; Conv1D needs a rank-3
    # input, so a length-1 time axis is added back.
    GAP = tf.expand_dims(tf.keras.layers.GlobalAvgPool1D()(DWC1), axis=1)
    c1 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GAP)
    s1 = tf.nn.sigmoid(c1)
    # GMP branch
    GMP = tf.expand_dims(tf.keras.layers.GlobalMaxPool1D()(DWC1), axis=1)
    c2 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GMP)
    s3 = tf.nn.sigmoid(c2)
    # NOTE(review): the GMP gate s3 is computed but never applied; only s1 is
    # used -- confirm how the two gates are meant to be combined.
    output = tf.multiply(input_tensor, s1)
    return output
# Min-max normalisation
def normalization(data):
    """Scale every column of the 2-D array ``data`` to the range [0, 1].

    Each column is transformed as ``(x - min) / (max - min)``.  A constant
    column (``max == min``) is mapped to 0 instead of producing NaN from a
    0/0 division.

    Returns:
        New array of the same shape as ``data``.
    """
    rows, cols = data.shape
    print("归一化之前:", data)
    print(data.shape)
    print("======================")
    col_min = np.min(data, axis=0)
    span = np.max(data, axis=0) - col_min
    # Dividing a constant column by 1 leaves its (zero) numerator untouched.
    safe_span = np.where(span == 0, 1, span)
    data = (data - col_min) / safe_span
    print("归一化之后:", data)
    print(data.shape)
    return data
# Standardisation (z-score)
def Regularization(data):
    """Standardise every column of the 2-D array ``data``.

    Each column is transformed as ``(x - mean) / std`` using the population
    standard deviation.  A constant column (``std == 0``) is mapped to 0
    instead of producing NaN from a 0/0 division.

    Returns:
        New array of the same shape as ``data``.
    """
    rows, cols = data.shape
    print("正则化之前:", data)
    print(data.shape)
    print("======================")
    mean = np.mean(data, axis=0)
    dst = np.sqrt(np.var(data, axis=0))
    # Dividing a constant column by 1 leaves its (zero) numerator untouched.
    safe_dst = np.where(dst == 0, 1, dst)
    data = (data - mean) / safe_dst
    print("正则化之后:", data)
    print(data.shape)
    return data
def EWMA(data, K=K, namuda=namuda, t=None):
    """Return the centre line and the EWMA control limits of ``data``.

    The limits are
    ``mean +/- K * std * sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** (2 * t)))``.

    Args:
        data: array whose mean and standard deviation are taken along axis 0.
        K: width of the control band in standard deviations.
        namuda: EWMA smoothing factor (lambda).
        t: sample number used for time-varying limits.  ``None`` (default)
            gives the steady-state limits, i.e. the time factor equals 1.

    Returns:
        Tuple ``(mid, UCL, LCL)``.
    """
    mid = np.mean(data, axis=0)
    standard = np.sqrt(np.var(data, axis=0))
    # The exponent is 2 * t; writing "** 2 * t" would multiply by t instead.
    time_factor = 1 if t is None else 1 - (1 - namuda) ** (2 * t)
    half_width = K * standard * np.sqrt(namuda / (2 - namuda) * time_factor)
    UCL = mid + half_width
    LCL = mid - half_width
    return mid, UCL, LCL
def get_MSE(data, label, new_model):
    """Score every sample by its standardised prediction error.

    The absolute error ``|predict(data) - label|`` is standardised per column
    (mean and standard deviation over all samples); the squared standardised
    values are summed per sample.

    Args:
        data: model input.
        label: expected output, same shape as the prediction.
        new_model: object with a ``predict`` method.

    Returns:
        Tuple ``(mse, mean, max)``: the per-sample score, and two arrays of
        the same length filled with the mean score and with
        ``mean + 3 * std`` of the score.
    """
    predicted_data = new_model.predict(data)
    abs_error = np.abs(predicted_data - label)
    centred = abs_error - np.broadcast_to(np.mean(abs_error, axis=0), shape=predicted_data.shape)
    spread = np.broadcast_to(np.sqrt(np.var(abs_error, axis=0)), shape=predicted_data.shape)
    mse = np.sum((centred / spread) ** 2, axis=1)
    print("z:", mse)
    print(mse.shape)
    # mse=np.mean((predicted_data-label)**2,axis=1)
    print("mse", mse)
    dims, = mse.shape
    centre = np.mean(mse)
    upper = centre + 3 * np.sqrt(np.var(mse))
    upper_line = np.broadcast_to(upper, shape=[dims, ])
    centre_line = np.broadcast_to(centre, shape=[dims, ])
    return mse, centre_line, upper_line
def condition_monitoring_model():
    """Build the Conv1D -> GRU -> Dense -> Dense prediction model.

    The input has shape ``(time_stamp, feature_num)`` per sample and the
    output layer has 10 units.
    """
    inputs = tf.keras.Input(shape=[time_stamp, feature_num])
    features = tf.keras.layers.Conv1D(filters=256, kernel_size=1)(inputs)
    features = tf.keras.layers.GRU(128, return_sequences=False)(features)
    features = tf.keras.layers.Dense(300)(features)
    outputs = tf.keras.layers.Dense(10)(features)
    return tf.keras.Model(inputs=inputs, outputs=outputs)
# Author's shape notes (not checked by the code):
# train_data:(300455,120,10)
# train_label1:(300455,10)
# train_label2:(300455,)
def shuffle(train_data, train_label1, train_label2, is_split: bool = False, split_size: float = 0.2):
    """Shuffle three aligned arrays, optionally returning a hold-out split.

    The arrays are permuted together by ``train_test_split`` with a fixed
    ``random_state`` of 100.

    Args:
        train_data, train_label1, train_label2: arrays with the same length
            along axis 0.
        is_split: when True return the two parts separately.
        split_size: fraction that goes into the second part.

    Returns:
        ``(data, label1, label2)`` with the two parts concatenated again, or
        ``(data, label1, label2, held_data, held_label1, held_label2)`` when
        ``is_split`` is true.
    """
    parts = train_test_split(train_data,
                             train_label1,
                             train_label2,
                             test_size=split_size,
                             shuffle=True,
                             random_state=100)
    (kept_data, held_data, kept_label1, held_label1, kept_label2, held_label2) = parts
    if is_split:
        return kept_data, kept_label1, kept_label2, held_data, held_label1, held_label2
    merged_data = np.concatenate([kept_data, held_data], axis=0)
    merged_label1 = np.concatenate([kept_label1, held_label1], axis=0)
    merged_label2 = np.concatenate([kept_label2, held_label2], axis=0)
    return merged_data, merged_label1, merged_label2
def split_test_data(healthy_data, healthy_label1, healthy_label2, unhealthy_data, unhealthy_label1, unhealthy_label2,
                    split_size: float = 0.2):
    """Merge healthy and unhealthy samples and split them into train/test.

    The healthy arrays are placed before the unhealthy ones, then the merged
    arrays are shuffled and split by ``train_test_split`` with a fixed
    ``random_state`` of 100.

    Returns:
        ``(train_data, train_label1, train_label2, test_data, test_label1,
        test_label2)``; the test part holds the ``split_size`` fraction.
    """
    merged = [
        np.concatenate([healthy_part, unhealthy_part], axis=0)
        for healthy_part, unhealthy_part in (
            (healthy_data, unhealthy_data),
            (healthy_label1, unhealthy_label1),
            (healthy_label2, unhealthy_label2),
        )
    ]
    (train_data, test_data, train_label1, test_label1, train_label2, test_label2) = train_test_split(
        merged[0],
        merged[1],
        merged[2],
        test_size=split_size,
        shuffle=True,
        random_state=100)
    return train_data, train_label1, train_label2, test_data, test_label1, test_label2
# Author's shape notes (not checked by the code):
# train_data:(300455,120,10)
# train_label1:(300455,10)
# train_label2:(300455,)
def train_step_one(train_data, train_label1, train_label2):
    """Run the first training stage of a fresh ``Joint_Monitoring`` model.

    For up to ``EPOCH`` epochs the data is reshuffled, fed to ``model.train``
    in batches of ``batch_size`` samples and validated on a split that is
    taken once, in epoch 0 (20 % by the default of ``shuffle``).  Samples left
    over at the end of an epoch that do not fill a whole batch are not
    trained on.

    Args:
        train_data: samples, iterated along axis 0 (rank 3).
        train_label1: first label array, aligned with ``train_data``.
        train_label2: second label array, aligned with ``train_data``.

    Returns:
        None.  Saving is delegated to ``SaveBestModel`` with ``save_name``.
    """
    model = Joint_Monitoring()
    # TODO: the model has to be built once before model.summary() can print.
    # model.build(input_shape=(batch_size, filter_num, dims))
    # model.summary()
    history_loss = []
    history_val_loss = []
    # Local starting learning rate; it shadows the module-level constant.
    learning_rate = 1e-3
    for epoch in range(EPOCH):
        print()
        print("EPOCH:", epoch, "/", EPOCH, ":")
        train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
        if epoch == 0:
            # Carve the validation set out once; later epochs train on the rest.
            train_data, train_label1, train_label2, val_data, val_label1, val_label2 = shuffle(train_data, train_label1,
                                                                                               train_label2,
                                                                                               is_split=True)
            # print()
            # print("EPOCH:", epoch, "/", EPOCH, ":")
        # z counts the batches already trained in this epoch
        z = 0
        # k counts the samples collected for the current batch (1-based)
        k = 1
        for data_1, label_1, label_2 in zip(train_data, train_label1, train_label2):
            size, _, _ = train_data.shape
            data_1 = tf.expand_dims(data_1, axis=0)
            label_1 = tf.expand_dims(label_1, axis=0)
            label_2 = tf.expand_dims(label_2, axis=0)
            if batch_size != 1:
                if k % batch_size == 1:
                    # First sample of a batch starts new batch tensors.
                    data = data_1
                    label1 = label_1
                    label2 = label_2
                else:
                    data = tf.concat([data, data_1], axis=0)
                    label1 = tf.concat([label1, label_1], axis=0)
                    label2 = tf.concat([label2, label_2], axis=0)
            else:
                data = data_1
                label1 = label_1
                label2 = label_2
            if k % batch_size == 0:
                # The batch is full: run one training step.
                # label = tf.expand_dims(label, axis=-1)
                loss_value = model.train(input_tensor=data, label1=label1, label2=label2, learning_rate=learning_rate,
                                         is_first_time=True)
                print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy())
                k = 0
                z = z + 1
            k = k + 1
        val_loss = model.get_val_loss(val_data=val_data, val_label1=val_label1, val_label2=val_label2,
                                      is_first_time=True)
        # Called before the new loss is appended, so the helper sees only the
        # losses of earlier epochs in history_loss.
        SaveBestModel(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
        # SaveBestH5Model(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
        history_val_loss.append(val_loss)
        # loss_value is the loss of the last full batch of this epoch.
        history_loss.append(loss_value.numpy())
        print('Training loss is :', loss_value.numpy())
        print('Validating loss is :', val_loss.numpy())
        if IsStopTraining(history_loss=history_val_loss, patience=7):
            break
        if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
            # Shrink the rate tenfold, but only while it is still >= 1e-4.
            if learning_rate >= 1e-4:
                learning_rate = learning_rate * 0.1
    pass
def train_step_two(step_one_model, step_two_model, train_data, train_label1, train_label2):
    """Run the second training stage using the outputs of the first model.

    For up to ``EPOCH`` epochs the data is reshuffled and fed in batches of
    ``batch_size`` samples.  For every full batch the first three outputs of
    ``step_one_model.call`` are passed to ``step_two_model.train`` as
    ``pred_3``/``pred_4``/``pred_5``.  Validation uses a split taken once, in
    epoch 0.  Samples left over at the end of an epoch that do not fill a
    whole batch are not trained on.

    Args:
        step_one_model: model whose ``call`` supplies the intermediate outputs.
        step_two_model: model that is trained and saved.
        train_data: samples, iterated along axis 0 (rank 3).
        train_label1: first label array, aligned with ``train_data``.
        train_label2: second label array, aligned with ``train_data``.

    Returns:
        None.  Saving is delegated to ``SaveBestModelByAccuracy`` with
        ``save_step_two_name``.
    """
    # step_two_model = Joint_Monitoring()
    # step_two_model.build(input_shape=(batch_size, time_stamp, feature_num))
    # step_two_model.summary()
    history_loss = []
    history_val_loss = []
    # NOTE(review): nothing in this function appends to history_accuracy --
    # confirm whether SaveBestModelByAccuracy updates it itself.
    history_accuracy = []
    # Local starting learning rate; it shadows the module-level constant.
    learning_rate = 1e-3
    for epoch in range(EPOCH):
        print()
        print("EPOCH:", epoch, "/", EPOCH, ":")
        train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
        if epoch == 0:
            # Carve the validation set out once; later epochs train on the rest.
            train_data, train_label1, train_label2, val_data, val_label1, val_label2 = shuffle(train_data, train_label1,
                                                                                               train_label2,
                                                                                               is_split=True)
            # print()
            # print("EPOCH:", epoch, "/", EPOCH, ":")
        # z counts the batches already trained in this epoch
        z = 0
        # k counts the samples collected for the current batch (1-based)
        k = 1
        # Running sum of the accuracy values returned by train()
        accuracy_num = 0
        for data_1, label_1, label_2 in zip(train_data, train_label1, train_label2):
            size, _, _ = train_data.shape
            data_1 = tf.expand_dims(data_1, axis=0)
            label_1 = tf.expand_dims(label_1, axis=0)
            label_2 = tf.expand_dims(label_2, axis=0)
            if batch_size != 1:
                if k % batch_size == 1:
                    # First sample of a batch starts new batch tensors.
                    data = data_1
                    label1 = label_1
                    label2 = label_2
                else:
                    data = tf.concat([data, data_1], axis=0)
                    label1 = tf.concat([label1, label_1], axis=0)
                    label2 = tf.concat([label2, label_2], axis=0)
            else:
                data = data_1
                label1 = label_1
                label2 = label_2
            if k % batch_size == 0:
                # The batch is full: get the first model's outputs, then run
                # one training step of the second model.
                # label = tf.expand_dims(label, axis=-1)
                output1, output2, output3, _ = step_one_model.call(inputs=data, is_first_time=True)
                loss_value, accuracy_value = step_two_model.train(input_tensor=data, label1=label1, label2=label2,
                                                                  learning_rate=learning_rate,
                                                                  is_first_time=False, pred_3=output1, pred_4=output2,
                                                                  pred_5=output3)
                accuracy_num += accuracy_value
                print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy(), "| accuracy:",
                      accuracy_num / ((z + 1) * batch_size))
                k = 0
                z = z + 1
            k = k + 1
        val_loss, val_accuracy = step_two_model.get_val_loss(val_data=val_data, val_label1=val_label1,
                                                             val_label2=val_label2,
                                                             is_first_time=False, step_one_model=step_one_model)
        SaveBestModelByAccuracy(model=step_two_model, save_name=save_step_two_name, history_accuracy=history_accuracy,
                                accuracy_value=val_accuracy)
        history_val_loss.append(val_loss)
        # loss_value is the loss of the last full batch of this epoch.
        history_loss.append(loss_value.numpy())
        # NOTE(review): z already equals the number of trained batches here, so
        # dividing by (z + 1) * batch_size counts one batch too many -- confirm.
        print('Training loss is : {0} | Training accuracy is : {1}'.format(loss_value.numpy(),
                                                                           accuracy_num / ((z + 1) * batch_size)))
        print('Validating loss is : {0} | Validating accuracy is : {1}'.format(val_loss.numpy(), val_accuracy))
        if IsStopTraining(history_loss=history_val_loss, patience=7):
            break
        if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
            # Shrink the rate tenfold, but only while it is still >= 1e-4.
            if learning_rate >= 1e-4:
                learning_rate = learning_rate * 0.1
    pass
def test(step_one_model, step_two_model, test_data, test_label1, test_label2):
    """Evaluate the second-stage model on the test set and print the result.

    ``step_two_model.get_val_loss`` is called with ``is_first_time=False`` and
    the first-stage model; the returned accuracy and loss are printed.
    """
    evaluation = step_two_model.get_val_loss(val_data=test_data, val_label1=test_label1,
                                             val_label2=test_label2,
                                             is_first_time=False, step_one_model=step_one_model)
    val_loss, val_accuracy = evaluation
    print("val_accuracy:", val_accuracy)
    print("val_loss:", val_loss)
if __name__ == '__main__':
    # Experiment script: build healthy / unhealthy samples, load the trained
    # weights of both stages and evaluate on the held-out test split.
    total_data = loadData.execute(N=feature_num, file_name=file_name)
    total_data = normalization(data=total_data)
    # Healthy samples: sliding windows over the rows before healthy_date.
    train_data_healthy, train_label1_healthy, train_label2_healthy = get_training_data_overlapping(
        total_data[:healthy_date, :], is_Healthy=True)
    # Unhealthy samples: the slice starts time_stamp - unhealthy_patience rows
    # before healthy_date, so the first windows still contain healthy rows.
    train_data_unhealthy, train_label1_unhealthy, train_label2_unhealthy = get_training_data_overlapping(
        total_data[healthy_date - time_stamp + unhealthy_patience:unhealthy_date, :],
        is_Healthy=False)
    # TODO: stage-one training
    # Single smoke test
    # train_step_one(train_data=train_data_healthy[:32, :, :], train_label1=train_label1_healthy[:32, :],train_label2=train_label2_healthy[:32, ])
    # train_step_one(train_data=train_data_healthy, train_label1=train_label1_healthy,train_label2=train_label2_healthy)
    # Load the model trained in stage one; one copy continues training, the
    # other only produces outputs.
    step_one_model = Joint_Monitoring()
    step_one_model.load_weights(save_name)
    #
    # step_two_model = Joint_Monitoring()
    # step_two_model.load_weights(save_name)
    # TODO: stage-two training
    # Author's shape notes (not checked by the code):
    ### healthy_data.shape: (300333,120,10)
    ### unhealthy_data.shape: (16594,10)
    healthy_size, _, _ = train_data_healthy.shape
    unhealthy_size, _, _ = train_data_unhealthy.shape
    # Use only the last 2 * unhealthy_size healthy samples, i.e. two healthy
    # samples per unhealthy one.
    train_data, train_label1, train_label2, test_data, test_label1, test_label2 = split_test_data(
        healthy_data=train_data_healthy[healthy_size - 2 * unhealthy_size:, :, :],
        healthy_label1=train_label1_healthy[healthy_size - 2 * unhealthy_size:, :],
        healthy_label2=train_label2_healthy[healthy_size - 2 * unhealthy_size:, ], unhealthy_data=train_data_unhealthy,
        unhealthy_label1=train_label1_unhealthy, unhealthy_label2=train_label2_unhealthy)
    # train_step_two(step_one_model=step_one_model, step_two_model=step_two_model,
    #                train_data=train_data,
    #                train_label1=train_label1, train_label2=np.expand_dims(train_label2, axis=-1))
    # TODO: evaluate on the test set
    step_two_model = Joint_Monitoring()
    step_two_model.load_weights(save_step_two_name)
    # The class labels get a trailing axis of size 1 before evaluation.
    test(step_one_model=step_one_model, step_two_model=step_two_model, test_data=test_data, test_label1=test_label1,
         test_label2=np.expand_dims(test_label2, axis=-1))
    pass

View File

@ -0,0 +1,576 @@
# -*- coding: utf-8 -*-
# coding: utf-8
import tensorflow as tf
import tensorflow.keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from model.DepthwiseCon1D.DepthwiseConv1D import DepthwiseConv1D
from model.Dynamic_channelAttention.Dynamic_channelAttention import DynamicChannelAttention
from condition_monitoring.data_deal import loadData
from model.Joint_Monitoring.Joint_Monitoring3 import Joint_Monitoring
from model.CommonFunction.CommonFunction import *
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model, save_model
'''
@Author : dingjiawen
@Date : 2022/7/8 10:29
@Usage : 尝试将预测和分类两种方式相结合,联合监测
@Desc :REPVGG+unsampling+GRU进行重构,后面接GDP=全局动态池化+分类器
随epoch衰减的MSELoss+随epoch增强的crossEntropy
'''
# ---- Hyper-parameters ----
time_stamp = 120  # window length: number of consecutive rows in one sample
feature_num = 10  # passed to loadData.execute as N and used as the model input width
batch_size = 16
learning_rate = 0.001
EPOCH = 101
model_name = "joint"

# ---- EWMA hyper-parameters ----
K = 18  # multiplier applied to the standard deviation in the EWMA control limits
namuda = 0.01  # EWMA smoothing factor ("lambda")

# ---- Checkpoint locations ----
# Only the model name, window length and feature count are substituted into the
# paths; the epoch suffix is hard-coded to the checkpoint that gets loaded.
save_name = "../hard_model/weight/{0}_timestamp{1}_feature{2}_weight_epoch8/weight".format(
    model_name, time_stamp, feature_num)
save_step_two_name = "../hard_model/two_weight/{0}_timestamp{1}_feature{2}_weight_epoch14/weight".format(
    model_name, time_stamp, feature_num)
# Alternative single-file (.h5) layout, kept for reference:
# save_name = "../model/joint/{0}_timestamp{1}_feature{2}.h5".format(
#     model_name, time_stamp, feature_num)
# save_step_two_name = "../model/joint_two/{0}_timestamp{1}_feature{2}.h5".format(
#     model_name, time_stamp, feature_num)

# ---- Input file ----
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences.
file_name = r"G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"
# jb4q_8_delete_total_zero.csv is the export with only the all-zero columns removed.
#   rows 0:415548      are normal   (2019/7/30 00:00:00 - 2019/9/18 11:14:00)
#   rows 415549:432153 are abnormal (2019/9/18 11:21:01 - 2021/1/18 00:00:00)

# ---- File parameters ----
# Last normal row
healthy_date = 415548
# Last abnormal row
unhealthy_date = 432153
# Abnormality tolerance (row offset applied when slicing the abnormal segment)
unhealthy_patience = 5
def remove(data, time_stamp=time_stamp):
    """Trim trailing rows so the row count is a whole multiple of ``time_stamp``.

    :param data: 2-D array exposing ``.shape`` and 2-D slicing.
    :param time_stamp: window length; defaults to the module-level setting.
    :return: the leading rows of ``data`` that form complete windows.
    """
    n_rows, _ = data.shape
    print("remove_data.shape:", data.shape)
    whole_windows = int(n_rows / time_stamp)
    kept_rows = whole_windows * time_stamp
    return data[:kept_rows, :]
# 不重叠采样
def get_training_data(data, time_stamp: int = time_stamp):
    """Cut ``data`` into non-overlapping windows, each labelled with the row that follows it.

    The rows are first trimmed to a whole number of windows, then reshaped to
    ``(windows, time_stamp, cols)``. The label of window ``i`` is the first row
    of window ``i + 1``, so the last window has no label and is dropped.

    :param data: 2-D array ``(rows, cols)``.
    :param time_stamp: window length; also forwarded to ``remove`` so that the
        trimming and the reshape always agree.
    :return: ``(train_data, train_label)`` with shapes
        ``(windows - 1, time_stamp, cols)`` and ``(windows - 1, cols)``.
        With fewer than two windows both are empty.
    """
    removed_data = remove(data=data, time_stamp=time_stamp)
    _, cols = removed_data.shape
    print("removed_data.shape:", removed_data.shape)
    print("removed_data:", removed_data)
    train_data = np.reshape(removed_data, [-1, time_stamp, cols])
    print("train_data:", train_data)
    # One slice replaces the per-window concatenate loop; copy so the labels do
    # not alias the window array.
    train_label = train_data[1:, 0, :].copy()
    print("train_data.shape:", train_data.shape)
    print("train_label.shape", train_label.shape)
    return train_data[:-1, :], train_label
# 重叠采样
def get_training_data_overlapping(data, time_stamp: int = time_stamp, is_Healthy: bool = True):
    """Build stride-1 sliding windows, next-row labels and a constant class flag.

    Window ``i`` is ``data[i:i + time_stamp]`` and its regression label is
    ``data[i + time_stamp]``. ``rows - time_stamp - 1`` windows are produced.

    :param data: 2-D array ``(rows, cols)``.
    :param time_stamp: window length.
    :param is_Healthy: when true the class flags are all ones, otherwise all zeros.
    :return: ``(train_data, train_label, train_label2)``.
    """
    n_rows, n_cols = data.shape
    n_samples = n_rows - time_stamp - 1
    train_data = np.empty(shape=[n_samples, time_stamp, n_cols])
    train_label = np.empty(shape=[n_samples, n_cols])
    for start in range(n_samples):
        stop = start + time_stamp
        train_data[start] = data[start:stop]
        train_label[start] = data[stop]
    print("重叠采样以后:")
    print("data:", train_data)  # (300334,120,10)
    print("label:", train_label)  # (300334,10)
    make_flags = np.ones if is_Healthy else np.zeros
    train_label2 = make_flags(shape=[train_label.shape[0]])
    print("label2:", train_label2)
    return train_data, train_label, train_label2
# RepConv重参数化卷积
def RepConv(input_tensor, k=3):
    """Three-branch convolution stage: k-wide conv, 1-wide conv and identity.

    Each branch ends in its own BatchNormalization. The three results are
    summed and passed through ReLU. The channel count is kept unchanged.

    :param input_tensor: rank-3 tensor; its last dimension sets the filter count.
    :param k: kernel size of the wide branch.
    :return: the activated sum of the three branches.
    """
    _, _, channels = input_tensor.shape
    layers = tf.keras.layers

    wide_branch = layers.Conv1D(filters=channels, kernel_size=k, strides=1, padding='SAME')(input_tensor)
    wide_branch = layers.BatchNormalization()(wide_branch)

    point_branch = layers.Conv1D(filters=channels, kernel_size=1, strides=1, padding='SAME')(input_tensor)
    point_branch = layers.BatchNormalization()(point_branch)

    identity_branch = layers.BatchNormalization()(input_tensor)

    merged = layers.Add()([wide_branch, point_branch, identity_branch])
    return tf.nn.relu(merged)
# RepBlock模块
def RepBlock(input_tensor, num: int = 3):
    """Apply ``RepConv`` ``num`` times in sequence.

    :param input_tensor: tensor fed to the first stage.
    :param num: number of stacked stages; zero or less returns the input as is.
    :return: output of the last stage.
    """
    stacked = input_tensor
    for _ in range(num):
        stacked = RepConv(stacked)
    return stacked
# GAP 全局平均池化
def Global_avg_channelAttention(input_tensor):
    """Channel attention gated by global average pooling.

    The input goes through a depthwise convolution and is averaged over the
    length axis. A 1-wide convolution plus sigmoid turns that into one gate per
    channel, which is multiplied back onto the input.

    :param input_tensor: rank-3 tensor; its last dimension is the channel count.
    :return: ``input_tensor`` scaled channel-wise by the gate.
    """
    _, _, channel = input_tensor.shape
    depthwise = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
    pooled = tf.keras.layers.GlobalAvgPool1D()(depthwise)
    # Pooling removes the length axis, but Conv1D only accepts rank-3 input.
    # Re-insert a length-1 axis; it also lets the gate broadcast over the
    # input's length axis in the final multiply.
    pooled = tf.expand_dims(pooled, axis=1)
    gate = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(pooled)
    gate = tf.nn.sigmoid(gate)
    return tf.multiply(input_tensor, gate)
# GDP 全局动态池化
def Global_Dynamic_channelAttention(input_tensor):
    """Channel attention with an average-pooled and a max-pooled gate.

    Both gates are built from the same depthwise-convolved features: pool over
    the length axis, 1-wide convolution, sigmoid. Only the average-pooled gate
    is applied to the input.

    :param input_tensor: rank-3 tensor; its last dimension is the channel count.
    :return: ``input_tensor`` scaled channel-wise by the average-pooled gate.
    """
    _, _, channel = input_tensor.shape
    depthwise = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)

    # Average-pool branch. Pooling removes the length axis while Conv1D needs
    # rank-3 input, so a length-1 axis is re-inserted first.
    avg_pooled = tf.keras.layers.GlobalAvgPool1D()(depthwise)
    avg_pooled = tf.expand_dims(avg_pooled, axis=1)
    avg_gate = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(avg_pooled)
    avg_gate = tf.nn.sigmoid(avg_gate)

    # Max-pool branch, same treatment.
    max_pooled = tf.keras.layers.GlobalMaxPool1D()(depthwise)
    max_pooled = tf.expand_dims(max_pooled, axis=1)
    max_gate = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(max_pooled)
    max_gate = tf.nn.sigmoid(max_gate)

    # NOTE(review): max_gate is computed but never combined with avg_gate, so
    # this currently returns the same result as the average-only variant. The
    # intended way of mixing the two gates is not visible here - confirm and
    # wire it in.
    return tf.multiply(input_tensor, avg_gate)
# 归一化
def normalization(data):
    """Min-max scale every column of ``data`` into ``[0, 1]``.

    A column whose minimum equals its maximum has no range to scale by; such
    columns are mapped to 0 instead of producing NaN from a 0/0 division.

    :param data: 2-D array ``(rows, cols)``.
    :return: float array of the same shape.
    """
    _rows, _cols = data.shape  # also rejects input that is not 2-D
    print("归一化之前:", data)
    print(data.shape)
    print("======================")
    col_min = np.min(data, axis=0)
    col_range = np.max(data, axis=0) - col_min
    # Divide constant columns by 1: their numerator is already all zeros.
    safe_range = np.where(col_range == 0, 1, col_range)
    data = (data - col_min) / safe_range
    print("归一化之后:", data)
    print(data.shape)
    return data
# 正则化
def Regularization(data):
    """Standardise every column of ``data`` to zero mean and unit variance.

    Uses the population standard deviation. A column with zero variance is
    mapped to 0 instead of producing NaN from a 0/0 division.

    :param data: 2-D array ``(rows, cols)``.
    :return: float array of the same shape.
    """
    _rows, _cols = data.shape  # also rejects input that is not 2-D
    print("正则化之前:", data)
    print(data.shape)
    print("======================")
    col_mean = np.mean(data, axis=0)
    col_std = np.sqrt(np.var(data, axis=0))
    # Divide constant columns by 1: their centred values are already all zeros.
    safe_std = np.where(col_std == 0, 1, col_std)
    data = (data - col_mean) / safe_std
    print("正则化之后:", data)
    print(data.shape)
    return data
def EWMA(data, K=K, namuda=namuda, t=None):
    """Return the centre line and EWMA control limits of ``data`` along axis 0.

    The limits are ``mid +/- K * std * sqrt(factor)``, where ``factor`` is
    ``namuda / (2 - namuda)`` for the steady-state chart, additionally
    multiplied by ``1 - (1 - namuda) ** (2 * t)`` for the limits at step ``t``.

    :param data: array reduced along axis 0.
    :param K: width of the limits in standard deviations.
    :param namuda: EWMA smoothing factor.
    :param t: optional time index. ``None`` (default) gives the steady-state
        limits, which is what this function returned before ``t`` existed.
    :return: ``(mid, UCL, LCL)``.
    """
    mid = np.mean(data, axis=0)
    standard = np.sqrt(np.var(data, axis=0))
    variance_factor = namuda / (2 - namuda)
    if t is not None:
        # The exponent is 2 * t; writing `** 2 * t` would square first and then
        # multiply by t.
        variance_factor = variance_factor * (1 - (1 - namuda) ** (2 * t))
    half_width = K * standard * np.sqrt(variance_factor)
    UCL = mid + half_width
    LCL = mid - half_width
    return mid, UCL, LCL
def get_MSE(data, label, new_model):
    """Score each sample by its summed squared standardised prediction error.

    The absolute error between ``new_model.predict(data)`` and ``label`` is
    standardised per column (mean and population std over axis 0), squared and
    summed over axis 1.

    :param data: input handed to ``new_model.predict``.
    :param label: target values, same shape as the prediction.
    :param new_model: object exposing ``predict``.
    :return: ``(mse, mean, max)``: the per-sample scores, their mean and
        ``mean + 3 * std``, the last two broadcast to the length of ``mse``.
    """
    predicted_data = new_model.predict(data)
    target_shape = predicted_data.shape
    abs_error = np.abs(predicted_data - label)

    centred = abs_error - np.broadcast_to(np.mean(abs_error, axis=0), shape=target_shape)
    spread = np.broadcast_to(np.sqrt(np.var(abs_error, axis=0)), shape=target_shape)
    z_scores = centred / spread

    mse = np.sum(z_scores ** 2, axis=1)
    print("z:", mse)
    print(mse.shape)
    print("mse", mse)

    (dims,) = mse.shape
    score_mean = np.mean(mse)
    score_std = np.sqrt(np.var(mse))
    upper_line = np.broadcast_to(score_mean + 3 * score_std, shape=[dims, ])
    mean_line = np.broadcast_to(score_mean, shape=[dims, ])
    return mse, mean_line, upper_line
def condition_monitoring_model():
    """Build a Conv1D -> GRU -> Dense(300) -> Dense(10) Keras model.

    The input has shape ``(time_stamp, feature_num)`` per sample.

    :return: an uncompiled ``tf.keras.Model``.
    """
    layers = tf.keras.layers
    inputs = tf.keras.Input(shape=[time_stamp, feature_num])
    hidden = layers.Conv1D(filters=256, kernel_size=1)(inputs)
    hidden = layers.GRU(128, return_sequences=False)(hidden)
    hidden = layers.Dense(300)(hidden)
    outputs = layers.Dense(10)(hidden)
    return tf.keras.Model(inputs=inputs, outputs=outputs)
# train_data:(300455,120,10)
# train_label1:(300455,10)
# train_label2:(300455,)
def shuffle(train_data, train_label1, train_label2, is_split: bool = False, split_size: float = 0.2):
    """Shuffle three aligned arrays together, optionally returning a hold-out split.

    The permutation comes from ``train_test_split`` with a fixed
    ``random_state`` of 100.

    :param is_split: if true, return the train part followed by the held-out part.
    :param split_size: fraction handed to ``train_test_split`` as ``test_size``.
    :return: ``(data, label1, label2)`` re-joined in shuffled order, or the six
        arrays ``(data, label1, label2, test_data, test_label1, test_label2)``
        when ``is_split`` is true.
    """
    pieces = train_test_split(train_data,
                              train_label1,
                              train_label2,
                              test_size=split_size,
                              shuffle=True,
                              random_state=100)
    data_head, data_tail, label1_head, label1_tail, label2_head, label2_tail = pieces
    if is_split:
        return data_head, label1_head, label2_head, data_tail, label1_tail, label2_tail
    # No split requested: glue the two parts back together, now in shuffled order.
    pairs = ((data_head, data_tail), (label1_head, label1_tail), (label2_head, label2_tail))
    return tuple(np.concatenate([head, tail], axis=0) for head, tail in pairs)
def split_test_data(healthy_data, healthy_label1, healthy_label2, unhealthy_data, unhealthy_label1, unhealthy_label2,
                    split_size: float = 0.2, shuffle: bool = True):
    """Pool healthy and unhealthy samples, then split them into train and test parts.

    :param split_size: fraction handed to ``train_test_split`` as ``test_size``.
    :param shuffle: whether ``train_test_split`` shuffles before splitting.
    :return: ``(train_data, train_label1, train_label2,
        test_data, test_label1, test_label2)``.
    """
    pooled = [
        np.concatenate([healthy_part, unhealthy_part], axis=0)
        for healthy_part, unhealthy_part in (
            (healthy_data, unhealthy_data),
            (healthy_label1, unhealthy_label1),
            (healthy_label2, unhealthy_label2),
        )
    ]
    pieces = train_test_split(pooled[0],
                              pooled[1],
                              pooled[2],
                              test_size=split_size,
                              shuffle=shuffle,
                              random_state=100)
    train_data, test_data, train_label1, test_label1, train_label2, test_label2 = pieces
    return train_data, train_label1, train_label2, test_data, test_label1, test_label2
# train_data:(300455,120,10)
# train_label1:(300455,10)
# train_label2:(300455,)
def train_step_one(train_data, train_label1, train_label2):
    """Stage one: train a fresh ``Joint_Monitoring`` model with ``is_first_time=True``.

    Every epoch reshuffles the samples and trains on consecutive full batches of
    ``batch_size``; a trailing partial batch is skipped. A validation split is
    carved off once, in the first epoch. After each epoch the validation loss is
    handed to ``SaveBestModel`` together with ``save_name``, and to the
    early-stopping and learning-rate helpers.

    :param train_data: array ``(samples, time_stamp, features)``.
    :param train_label1: regression targets, one row per sample.
    :param train_label2: class flags, one entry per sample.
    :raises ValueError: if fewer than ``batch_size`` training samples remain
        after the validation split (no batch could be trained).
    """
    model = Joint_Monitoring()
    # TODO: the model has to be built once before model.summary() can be printed
    # model.build(input_shape=(batch_size, filter_num, dims))
    # model.summary()
    history_loss = []
    history_val_loss = []
    learning_rate = 1e-3
    for epoch in range(EPOCH):
        print()
        print("EPOCH:", epoch, "/", EPOCH, ":")
        train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
        if epoch == 0:
            # Hold out the validation set once; later epochs only reshuffle
            # what is left for training.
            (train_data, train_label1, train_label2,
             val_data, val_label1, val_label2) = shuffle(train_data, train_label1, train_label2,
                                                         is_split=True)

        size, _, _ = train_data.shape
        if size < batch_size:
            raise ValueError(
                "need at least batch_size={0} training samples, got {1}".format(batch_size, size))

        # Slice whole batches instead of growing them one sample at a time;
        # z is the index of the batch within this epoch.
        for z, start in enumerate(range(0, size - batch_size + 1, batch_size)):
            stop = start + batch_size
            data = tf.convert_to_tensor(train_data[start:stop])
            label1 = tf.convert_to_tensor(train_label1[start:stop])
            label2 = tf.convert_to_tensor(train_label2[start:stop])
            loss_value, _ = model.train(input_tensor=data, label1=label1, label2=label2,
                                        learning_rate=learning_rate,
                                        is_first_time=True)
            print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy())

        val_loss, _ = model.get_val_loss(val_data=val_data, val_label1=val_label1, val_label2=val_label2,
                                         is_first_time=True)
        # Called before val_loss is appended, so the helper sees only the
        # history of earlier epochs.
        SaveBestModel(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
        # SaveBestH5Model(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
        history_val_loss.append(val_loss)
        history_loss.append(loss_value.numpy())
        print('Training loss is :', loss_value.numpy())
        print('Validating loss is :', val_loss.numpy())
        if IsStopTraining(history_loss=history_val_loss, patience=7):
            break
        if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
            if learning_rate >= 1e-4:
                learning_rate = learning_rate * 0.1
def train_step_two(step_one_model, step_two_model, train_data, train_label1, train_label2):
    """Stage two: train ``step_two_model`` using ``step_one_model``'s outputs.

    Each full batch is first passed through ``step_one_model.call``; its first
    three outputs are handed to ``step_two_model.train`` as ``pred_3``,
    ``pred_4`` and ``pred_5``. A validation split is carved off once, in the
    first epoch. After each epoch the validation accuracy is handed to
    ``SaveBestModelByAccuracy`` together with ``save_step_two_name``.

    :param step_one_model: stage-one model, only called, never trained here.
    :param step_two_model: model being trained.
    :param train_data: array ``(samples, time_stamp, features)``.
    :param train_label1: regression targets, one row per sample.
    :param train_label2: class flags, one row per sample.
    :raises ValueError: if fewer than ``batch_size`` training samples remain
        after the validation split (no batch could be trained).
    """
    # step_two_model = Joint_Monitoring()
    # step_two_model.build(input_shape=(batch_size, time_stamp, feature_num))
    # step_two_model.summary()
    history_loss = []
    history_val_loss = []
    history_accuracy = []
    learning_rate = 1e-3
    for epoch in range(EPOCH):
        print()
        print("EPOCH:", epoch, "/", EPOCH, ":")
        train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
        if epoch == 0:
            # Hold out the validation set once; later epochs only reshuffle
            # what is left for training.
            (train_data, train_label1, train_label2,
             val_data, val_label1, val_label2) = shuffle(train_data, train_label1, train_label2,
                                                         is_split=True)

        size, _, _ = train_data.shape
        if size < batch_size:
            raise ValueError(
                "need at least batch_size={0} training samples, got {1}".format(batch_size, size))

        accuracy_num = 0
        samples_seen = 0
        # Slice whole batches instead of growing them one sample at a time;
        # z is the index of the batch within this epoch.
        for z, start in enumerate(range(0, size - batch_size + 1, batch_size)):
            stop = start + batch_size
            data = tf.convert_to_tensor(train_data[start:stop])
            label1 = tf.convert_to_tensor(train_label1[start:stop])
            label2 = tf.convert_to_tensor(train_label2[start:stop])
            output1, output2, output3, _ = step_one_model.call(inputs=data, is_first_time=True)
            loss_value, accuracy_value = step_two_model.train(input_tensor=data, label1=label1, label2=label2,
                                                              learning_rate=learning_rate,
                                                              is_first_time=False, pred_3=output1, pred_4=output2,
                                                              pred_5=output3)
            accuracy_num += accuracy_value
            samples_seen = (z + 1) * batch_size
            print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy(), "| accuracy:",
                  accuracy_num / samples_seen)

        val_loss, val_accuracy = step_two_model.get_val_loss(val_data=val_data, val_label1=val_label1,
                                                             val_label2=val_label2,
                                                             is_first_time=False, step_one_model=step_one_model)
        # Called before val_accuracy is appended, so the helper sees only the
        # history of earlier epochs.
        SaveBestModelByAccuracy(model=step_two_model, save_name=save_step_two_name, history_accuracy=history_accuracy,
                                accuracy_value=val_accuracy)
        history_val_loss.append(val_loss)
        history_loss.append(loss_value.numpy())
        history_accuracy.append(val_accuracy)
        # Divide by the samples actually trained on, the same denominator the
        # per-batch progress line uses.
        print('Training loss is : {0} | Training accuracy is : {1}'.format(loss_value.numpy(),
                                                                           accuracy_num / samples_seen))
        print('Validating loss is : {0} | Validating accuracy is : {1}'.format(val_loss.numpy(), val_accuracy))
        if IsStopTraining(history_loss=history_val_loss, patience=7):
            break
        if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
            if learning_rate >= 1e-4:
                learning_rate = learning_rate * 0.1
def test(step_one_model, step_two_model, test_data, test_label1, test_label2):
    """Evaluate the stage-two model on a test set and print accuracy and loss.

    :param step_one_model: stage-one model forwarded to ``get_val_loss``.
    :param step_two_model: model whose ``get_val_loss`` does the evaluation.
    :param test_data: test samples.
    :param test_label1: regression targets of the test samples.
    :param test_label2: class flags of the test samples.
    """
    evaluation = step_two_model.get_val_loss(
        val_data=test_data,
        val_label1=test_label1,
        val_label2=test_label2,
        is_first_time=False,
        step_one_model=step_one_model,
    )
    val_loss, val_accuracy = evaluation
    print("val_accuracy:", val_accuracy)
    print("val_loss:", val_loss)
def showResult(step_two_model: Joint_Monitoring, test_data, isPlot: bool = False):
    """Run the stage-two model over ``test_data`` in full batches and collect its fourth output.

    Samples are processed in consecutive batches of ``batch_size``; a trailing
    partial batch is skipped. With fewer than ``batch_size`` samples the result
    is an empty array.

    :param step_two_model: model whose ``call(..., is_first_time=False)`` returns
        four outputs; only the fourth is kept.
    :param test_data: array ``(samples, length, dims)``.
    :param isPlot: if true, scatter-plot the collected values and show the figure.
    :return: 1-D array of all collected values, in input order.
    """
    # Total parameter count of the model:
    # step_two_model.count_params()
    size, _, _ = test_data.shape
    batch_outputs = []
    for start in range(0, size - batch_size + 1, batch_size):
        each_test_data = test_data[start:start + batch_size, :, :]
        _, _, _, output4 = step_two_model.call(each_test_data, is_first_time=False)
        batch_outputs.append(output4)
    # One flatten replaces the two-step reshape and also works when no batch ran.
    total_result = np.reshape(batch_outputs, [-1, ])

    if isPlot:
        plt.scatter(list(range(total_result.shape[0])), total_result, c='black', s=10)
        # Horizontal line at the 0.5 threshold
        plt.axhline(0.5, c='red', label='Failure threshold')
        # Arrow pointing at the horizontal line above
        # plt.arrow(35000, 0.9, 33000, 0.75, head_width=0.02, head_length=0.1, shape="full", fc='red', ec='red',
        #           alpha=0.9, overhang=0.5)
        # plt.text(35000, 0.9, "Truth Fault", fontsize=10, color='black', verticalalignment='top')
        # Vertical marker at two thirds of the samples.
        # NOTE(review): presumably the normal/fault boundary of the data the
        # caller passes in - confirm.
        plt.axvline(test_data.shape[0] * 2 / 3, c='blue', ls='-.')
        plt.xlabel("time")
        plt.ylabel("confidence")
        plt.text(total_result.shape[0] * 4 / 5, 0.6, "Fault", fontsize=10, color='black', verticalalignment='top',
                 horizontalalignment='center',
                 bbox={'facecolor': 'grey',
                       'pad': 10})
        plt.text(total_result.shape[0] * 1 / 3, 0.4, "Norm", fontsize=10, color='black', verticalalignment='top',
                 horizontalalignment='center',
                 bbox={'facecolor': 'grey',
                       'pad': 10})
        plt.grid()
        # plt.ylim(0, 1)
        # plt.xlim(-50, 1300)
        # plt.legend("", loc='upper left')
        plt.show()
    return total_result
if __name__ == '__main__':
    # Load the table and min-max scale every column.
    total_data = normalization(data=loadData.execute(N=feature_num, file_name=file_name))

    # Healthy windows: everything before the last normal row.
    healthy_rows = total_data[:healthy_date, :]
    train_data_healthy, train_label1_healthy, train_label2_healthy = get_training_data_overlapping(
        healthy_rows, is_Healthy=True)

    # Unhealthy windows: start one window (minus the tolerance) before the last
    # normal row and run to the last abnormal row.
    unhealthy_start = healthy_date - time_stamp + unhealthy_patience
    unhealthy_rows = total_data[unhealthy_start:unhealthy_date, :]
    train_data_unhealthy, train_label1_unhealthy, train_label2_unhealthy = get_training_data_overlapping(
        unhealthy_rows, is_Healthy=False)

    #### TODO stage-one training
    # Quick smoke run:
    # train_step_one(train_data=train_data_healthy[:32, :, :], train_label1=train_label1_healthy[:32, :],train_label2=train_label2_healthy[:32, ])
    # train_step_one(train_data=train_data_healthy, train_label1=train_label1_healthy, train_label2=train_label2_healthy)

    # Load the trained stage-one weights twice: one copy keeps training, the
    # other only produces outputs.
    # step_one_model = Joint_Monitoring()
    # step_one_model.load_weights(save_name)
    #
    # step_two_model = Joint_Monitoring()
    # step_two_model.load_weights(save_name)

    #### TODO stage-two training
    ### healthy_data.shape: (300333,120,10)
    ### unhealthy_data.shape: (16594,10)
    healthy_size = train_data_healthy.shape[0]
    unhealthy_size = train_data_unhealthy.shape[0]
    # train_data, train_label1, train_label2, test_data, test_label1, test_label2 = split_test_data(
    #     healthy_data=train_data_healthy[healthy_size - 2 * unhealthy_size:, :, :],
    #     healthy_label1=train_label1_healthy[healthy_size - 2 * unhealthy_size:, :],
    #     healthy_label2=train_label2_healthy[healthy_size - 2 * unhealthy_size:, ], unhealthy_data=train_data_unhealthy,
    #     unhealthy_label1=train_label1_unhealthy, unhealthy_label2=train_label2_unhealthy)
    # train_step_two(step_one_model=step_one_model, step_two_model=step_two_model,
    #                train_data=train_data,
    #                train_label1=train_label1, train_label2=np.expand_dims(train_label2, axis=-1))

    ### TODO evaluate on the test set
    step_one_model = Joint_Monitoring()
    step_one_model.load_weights(save_name)
    step_two_model = Joint_Monitoring()
    step_two_model.load_weights(save_step_two_name)
    # test(step_one_model=step_one_model, step_two_model=step_two_model, test_data=test_data, test_label1=test_label1,
    #      test_label2=np.expand_dims(test_label2, axis=-1))

    ### TODO show the result over the whole evaluation range
    display_rows = total_data[healthy_size - 2 * unhealthy_size:unhealthy_date, :]
    all_data, _, _ = get_training_data_overlapping(display_rows, is_Healthy=True)
    # all_data = np.concatenate([])
    # Quick smoke run:
    # showResult(step_two_model, test_data=all_data[:32], isPlot=True)
    showResult(step_two_model, test_data=all_data, isPlot=True)