2023-11-08 crawler learning update
This commit is contained in:
parent df0f410f8f
commit d8746d192f
@@ -0,0 +1,244 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 16:08
@Usage  :
@Desc   : reference: https://github.com/Python3WebSpider/BeautifulSoupTest
'''

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

from bs4 import BeautifulSoup


def baseUse():
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title)            # <title>The Dormouse's story</title>
    print(type(soup.title))      # <class 'bs4.element.Tag'>
    print(soup.title.string)     # The Dormouse's story
    print(soup.head)             # <head><title>The Dormouse's story</title></head>
    print(soup.p)                # <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    print(soup.p.name)           # node name: p
    print(soup.p.attrs)          # attributes: {'class': ['title'], 'name': 'dromouse'}
    print(soup.p.attrs['name'])  # attribute value: dromouse
    print(soup.p['name'])        # attribute value: dromouse
    print(soup.body.p['name'])   # nested selection: dromouse

    print("==========================")


def child():
    html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html, 'lxml')
    # direct children
    for i, child in enumerate(soup.p.children):
        print(i, child)
    print("===============================")
    # descendants
    for i, child in enumerate(soup.p.descendants):
        print(i, child)
    print("===============================")


def parent():
    soup = BeautifulSoup(html, 'lxml')
    # parent node
    print(soup.a.parent)
    print("===============================")
    # ancestor nodes
    print(type(soup.a.parents))
    print(list(enumerate(soup.a.parents)))
    print("=============================")


def brother():
    html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
Hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
"""
    # sibling nodes
    soup = BeautifulSoup(html, 'lxml')
    print('Next Sibling', soup.a.next_sibling)
    print('Prev Sibling', soup.a.previous_sibling)
    print('Next Siblings', list(enumerate(soup.a.next_siblings)))
    print('Prev Siblings', list(enumerate(soup.a.previous_siblings)))


# find every node matching the condition
def findAll():
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(name='ul'))
    print(type(soup.find_all(name='ul')[0]))

    for ul in soup.find_all(name='ul'):
        print(ul.find_all(name='li'))

    for ul in soup.find_all(name='ul'):
        print(ul.find_all(name='li'))
        for li in ul.find_all(name='li'):
            print(li.string)


# find nodes whose attributes match
def attrs():
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(attrs={'id': 'list-1'}))
    print(soup.find_all(attrs={'name': 'elements'}))

    # common attributes can be passed directly, without attrs
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(id='list-1'))
    print(soup.find_all(class_='element'))
    import re
    print(soup.find_all(string=re.compile('Foo')))  # string is equivalent to text, i.e. the node's text content


# return the first matching element
def find():
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find(name='ul'))
    print(type(soup.find(name='ul')))
    print(soup.find(class_='list'))


# CSS selectors
def cssSelect():
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

    soup = BeautifulSoup(html, 'lxml')
    print(soup.select('.panel .panel-heading'))
    print(soup.select('ul li'))
    print(soup.select('#list-2 .element'))
    print(type(soup.select('ul')[0]))

    # nested selection
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul.select('li'))

    # get attributes
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul['id'])
        print(ul.attrs['id'])

    # get text
    soup = BeautifulSoup(html, 'lxml')
    for li in soup.select('li'):
        print('Get Text:', li.get_text())
        print('String:', li.string)


if __name__ == '__main__':
    cssSelect()
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 16:07
@Usage  :
@Desc   :
'''
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 16:54
@Usage  :
@Desc   :
'''
@@ -0,0 +1,329 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 16:54
@Usage  :
@Desc   : pyquery study, reference: https://github.com/Python3WebSpider/PyQueryTest
'''
from pyquery import PyQuery as pq


# initialize from a string
def stringBase():
    html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''

    doc = pq(html)
    print(doc('li'))


# initialize from a URL
def URLBase():
    doc = pq(url='https://cuiqingcai.com')
    print(doc('title'))

    # the code above is equivalent to the following
    # doc = pq(requests.get('https://cuiqingcai.com').text)
    # print(doc('title'))


# initialize from a file
def fileBase():
    doc = pq(filename='demo.html')
    print(doc('li'))


# basic CSS selectors
def cssSelect():
    html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
    doc = pq(html)
    print(doc('#container .list li'))
    print(type(doc('#container .list li')))

    # iterate over the matched nodes
    for item in doc('#container .list li').items():
        print(item.text())


# finding child nodes
def child():
    html = '''
<div>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
    doc = pq(html)
    items = doc('.list')
    print(type(items))
    print(items)
    lis = items.find('li')
    print(type(lis))
    print(lis)

    # children() returns the direct children
    lis = items.children()
    print(type(lis))
    print(lis)

    # children() can also take a selector
    lis = items.children('.active')
    print(lis)


def parent():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    container = items.parent()
    print(type(container))
    print(container)

    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    parents = items.parents()
    print(type(parents))
    print(parents)

    parent = items.parents('.wrap')
    print(parent)

    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.list .item-0.active')
    print(li.siblings())


def brother():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.list .item-0.active')
    print(li.siblings('.active'))

    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    print(str(li))

    from pyquery import PyQuery as pq
    doc = pq(html)
    # the selection may contain several nodes
    lis = doc('li').items()
    print(type(lis))
    for li in lis:
        print(li, type(li))


def attrs():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('.item-0.active a')
    print(a, type(a))
    print(a.attr('href'))

    a = doc('a')
    print(a, type(a))
    print(a.attr('href'))
    print(a.attr.href)

    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('a')
    for item in a.items():
        # get the attribute and the text
        print(item.attr('href'), item.text())


def getHTML():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('li')
    print(li.html())  # html of the first matched node: <a href="link2.html">second item</a>
    print(li.text())  # text of every matched node: second item third item fourth item fifth item
    print(type(li.text()))


# add or remove a node's class
def operateNode():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.removeClass('active')
    print(li)
    li.addClass('active')
    print(li)

    '''
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

    <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>

    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    '''


def operateNodeInformation():
    html = '''
<ul class="list">
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
</ul>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.attr('name', 'link')
    print(li)
    li.text('changed item')
    print(li)
    li.html('<span>changed item</span>')
    print(li)
    '''
    <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-0 active" name="link">changed item</li>
    <li class="item-0 active" name="link"><span>changed item</span></li>
    '''


def removeInformation():
    html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    wrap = doc('.wrap')
    print(wrap.text())
    '''
    Hello, World
    This is a paragraph.
    '''
    wrap.find('p').remove()
    print(wrap.text())
    '''
    Hello, World
    '''


# pseudo-class selectors
def fakeCSSSelect():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('li:first-child')
    print(li)
    li = doc('li:last-child')
    print(li)
    li = doc('li:nth-child(2)')
    print(li)
    li = doc('li:gt(2)')
    print(li)
    li = doc('li:nth-child(2n)')
    print(li)
    li = doc('li:contains(second)')
    print(li)


if __name__ == '__main__':
    fakeCSSSelect()
@@ -0,0 +1,195 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 15:15
@Usage  :
@Desc   :
'''

from lxml import etree

'''
XPath basic rules:

1) nodename: selects all child nodes of the named node
2) /: selects direct children of the current node
3) //: selects descendants of the current node
4) .: selects the current node
5) ..: selects the parent of the current node
6) @: selects attributes

Example:
//title[@lang='eng'] selects every node named title whose lang attribute equals eng
'''
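# A minimal sketch (not part of the original file) illustrating the rules above on a
# tiny inline document; the variable names are illustrative only.
#
# _demo = etree.HTML('<ul><li lang="eng"><a href="a.html">eng item</a></li>'
#                    '<li lang="zh"><a href="b.html">zh item</a></li></ul>')
# _demo.xpath('//li')                        # rule 3: every li descendant
# _demo.xpath('//li/a')                      # rule 2: direct a children of li
# _demo.xpath('//li[@lang="eng"]/a/text()')  # rule 6: filter li by the lang attribute
# _demo.xpath('//a/..')                      # rule 5: step back up to the parent li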
def htmlByString():
    text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    html = etree.HTML(text)
    result = etree.tostring(html)
    print(result.decode('utf-8'))


def htmlByFile():
    html = etree.parse('./test.html', etree.HTMLParser())
    result = etree.tostring(html)
    print(result.decode('utf-8'))


def allNode():
    html = etree.parse('./test.html', etree.HTMLParser())
    # match every node from the root
    result = html.xpath('//*')
    print(result)
    print(result[0])

    # match every li node
    result = html.xpath('//li')
    print(result)
    print(result[0])


# matching child nodes
def childNode():
    html = etree.parse('./test.html', etree.HTMLParser())

    # every a that is a direct child of a li
    result = html.xpath('//li/a')
    print(result)
    print(result[0])

    # every a that is a descendant of a ul, i.e. anything below the children also matches
    result = html.xpath('//ul//a')
    print(result)
    print(result[0])


# matching parent nodes
def fatherNode():
    html = etree.parse('./test.html', etree.HTMLParser())

    # class attribute of the parent of the a node whose href is link4.html
    result = html.xpath('//a[@href="link4.html"]/../@class')
    print(result)
    # the same result via parent::
    result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
    print(result)


# extracting text
def textGet():
    html = etree.parse('./test.html', etree.HTMLParser())

    # text of the a children of li nodes whose class is item-0
    result = html.xpath('//li[@class="item-0"]/a/text()')
    print(result)  # ['first item', 'fifth item']

    # text of every descendant of li nodes whose class is item-0
    result = html.xpath('//li[@class="item-0"]//text()')
    print(result)  # ['first item', 'fifth item', '\r\n     ']


# extracting attributes
def fieldGet():
    html = etree.parse('./test.html', etree.HTMLParser())

    # href attribute of the a children of li nodes
    result = html.xpath('//li/a/@href')
    print(result)  # ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']


# matching an attribute that holds several values
def fieldsGet():
    text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
    html = etree.HTML(text)
    result = html.xpath('//li[@class="li"]/a/text()')
    print(result)  # [] no match

    result = html.xpath('//li[contains(@class, "li")]/a/text()')
    print(result)  # ['first item'] contains() matches


# matching on several attributes
def fieldssGet():
    text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
    html = etree.HTML(text)
    # join multiple attribute conditions with and
    result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
    print(result)


# selecting by position
def orderGet():
    text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    html = etree.HTML(text)
    result = html.xpath('//li[1]/a/text()')
    print(result)  # ['first item']
    result = html.xpath('//li[last()]/a/text()')
    print(result)  # ['fifth item']
    result = html.xpath('//li[position()<3]/a/text()')
    print(result)  # ['first item', 'second item']
    result = html.xpath('//li[last()-2]/a/text()')
    print(result)  # ['third item']


def nodeSelect():
    text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    html = etree.HTML(text)
    result = html.xpath('//li[1]/ancestor::*')
    print(result)
    # ancestor: all ancestors
    result = html.xpath('//li[1]/ancestor::div')
    print(result)
    # attribute: all attributes
    result = html.xpath('//li[1]/attribute::*')
    print(result)
    # child: child nodes
    result = html.xpath('//li[1]/child::a[@href="link1.html"]')
    print(result)
    # descendant: descendant nodes
    result = html.xpath('//li[1]/descendant::span')
    print(result)
    # following: every node after the current node
    result = html.xpath('//li[1]/following::*[2]')
    print(result)
    # following-sibling: the siblings that come after the current node
    result = html.xpath('//li[1]/following-sibling::*')
    print(result)


if __name__ == '__main__':
    nodeSelect()
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 15:15
@Usage  :
@Desc   :
'''
@@ -0,0 +1,9 @@
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 15:12
@Usage  :
@Desc   :
'''
@@ -0,0 +1,153 @@
# _*_ coding: UTF-8 _*_

'''
@Author : dingjiawen
@Date   : 2022/7/11 12:55
@Usage  :
@Desc   :
'''

import numpy as np
import pandas as pd
import time
# only measures the time spent while this program runs on the CPU
import timeit

# cat_sale = pd.read_excel('data/catering_sale.xls')
path = "G:\data\SCADA数据\jb4q_8.csv"
cat_sale = pd.read_csv(path)
# cat_sale.drop('日期', axis=1, inplace=True)

# filter outliers and set them to NaN
# cat_sale['销量'][(cat_sale['销量'] < 400) | (cat_sale['销量'] > 5000)] = np.NAN
# turn zero values into NaN; double brackets index arbitrary positions
# print(df['realtime'][1])
cat_sale[:][cat_sale[:] == 0] = np.nan  # when comparing by index, convert to the same type first (astype)


# functions for the divided differences and for the w terms
'''
:param x: indices of the points around the value to interpolate
:param y: values at those points
'''
def cal_f(x, y):
    """
    Compute the divided-difference table.
    """
    f0 = np.zeros((len(x), len(y)))  # array that stores the divided differences
    for k in range(len(y)):          # iterate over columns
        for i in range(k, len(x)):   # iterate over rows
            if k == 0:
                f0[i, k] = y[i]
            else:
                # a k-th order divided difference spans the points x[i-k] .. x[i]
                f0[i, k] = (f0[i, k - 1] - f0[i - 1, k - 1]) / (x[i] - x[i - k])
    # print('divided-difference table', '\n', f0)
    return f0


'''
:param x: indices of the points around the value to interpolate
:param y: values at those points
:param x_j: index of the value to interpolate
'''
def newton(x, y, x_j):
    """
    Newton interpolation polynomial.
    """
    f0 = cal_f(x, y)    # divided differences
    f0 = f0.diagonal()  # diagonal of the divided-difference table
    # multiply by the w terms
    f1 = 0
    for i in range(len(f0)):
        s = 1
        k = 0
        while k < i:
            s = s * (x_j - x[k])
            k += 1
        f1 = f1 + f0[i] * s
    return f1
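# Quick sanity check (illustrative only, not part of the original script): interpolating
# y = x**2 from three points should reproduce the quadratic exactly at x = 1.5.
#
# _x = [0, 1, 2]
# _y = [0, 1, 4]
# print(newton(_x, _y, 1.5))  # expected 2.25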
# column interpolation helper: take the values before and after the point to fill
'''
:param s: the whole series being interpolated
:param n: index of the value to interpolate
:param x_j: index of the value to interpolate
:param is_fast: fast mode takes the neighbours regardless of whether they are zero;
                otherwise keep searching until non-null values are found
:param k: how many values to take on each side
'''
def ployinterp_columns(s, n, x_j, is_fast: bool = False, k=3):
    X = []
    Y = []
    if is_fast:
        # fewer than k values before the point
        if n < k:
            a = list(range(0, n)) + list(range(n + 1, n + k + 1))
            y = s[list(range(0, n)) + list(range(n + 1, n + k + 1))]
        # fewer than k values after the point
        elif n > len(s) - k - 1:
            y = s[list(range(n - k, n)) + list(range(n + 1, len(s)))]
        # k values available on both sides
        else:
            y = s[list(range(n - k, n)) + list(range(n + 1, n + k + 1))]  # take the values around the gap
        y = y[y.notnull()]  # drop nulls
        X = y.index
        Y = list(y)
    else:
        # take k non-null values on each side of the gap
        index = n - 1
        while len(X) < k and index >= 0:
            if not np.isnan(s[index]):
                Y.append(s[index])
                X.append(index)
            index -= 1
        index = n + 1
        X.reverse()
        Y.reverse()

        while len(X) < 2 * k and index <= len(s):
            if not np.isnan(s[index]):
                Y.append(s[index])
                X.append(index)
            index += 1
        # print(X)
        # print(Y)

    return newton(X, Y, x_j)  # interpolate and return the result


def execute():
    cat_sale[:][cat_sale[:] == 0] = np.nan  # when comparing by index, convert to the same type first (astype)
    for i in cat_sale.columns:
        temp = cat_sale[i].isnull()
        if temp[:][temp[:] == True].__len__() > 0:
            print("{0}列处理前空行数:{1}".format(i, cat_sale[i].isnull().sum()))
            for j in range(len(cat_sale)):
                if (cat_sale[i].isnull())[j]:
                    x_j = cat_sale.index[j]
                    cat_sale.loc[j, i] = ployinterp_columns(cat_sale[i], j, x_j)
                    print('第{0}行牛顿插值为{1}'.format(j, cat_sale.loc[j, i]))
            print("{0}列处理后空行数:{1}".format(i, cat_sale[i].isnull().sum()))
            print("========================================")
    print(cat_sale)
    cat_sale.to_csv("G:\data\SCADA数据\jb4q_8_dealed.csv")
    # cat_sale.to_excel('saless.xls')


def test():
    cat_sale[:][cat_sale[:] == 0] = np.nan  # when comparing by index, convert to the same type first (astype)
    for j in range(len(cat_sale['num_gearbox_sumptemp'])):
        if (cat_sale['num_gearbox_sumptemp'].isnull())[j]:
            x_j = cat_sale.index[j]
            cat_sale.loc[j, 'num_gearbox_sumptemp'] = ployinterp_columns(cat_sale['num_gearbox_sumptemp'], j, x_j, is_fast=True)
            # print('第{0}行牛顿插值为{1}'.format(j, cat_sale.loc[j, 'num_gearbox_sumptemp']))


if __name__ == '__main__':
    start = timeit.default_timer()
    # execute()
    test()
    end = timeit.default_timer()
    print('Running time: %s Seconds' % (end - start))
    # the return value is a float
@@ -0,0 +1,96 @@
# _*_ coding: UTF-8 _*_

'''
@Author : dingjiawen
@Date   : 2022/7/11 11:43
@Usage  :
@Desc   :
'''

import numpy as np
import pandas as pd


# Lagrange interpolation
def LagrangeInterpolation(slices, x, k=5):
    # slices (series): the defining points
    # k: number of defining points of the Lagrange polynomial taken on each side
    # slices index: the corresponding value at each defining point
    # x: the point whose value we are interested in
    # print(slices[x])
    # print(np.isnan(slices[x]))
    result = 0  # later holds the final result
    X = []
    Y = []
    # take k non-null values on each side of the gap
    index = x - 1
    while len(X) < k and index >= 0:
        if not np.isnan(slices[index]):
            Y.append(slices[index])
            X.append(index)
        index -= 1
    index = x + 1
    X.reverse()
    Y.reverse()

    while len(X) < 2 * k and index <= len(slices):
        if not np.isnan(slices[index]):
            Y.append(slices[index])
            X.append(index)
        index += 1
    # print(X)
    # print(Y)

    for j in range(len(X)):
        # result_l: the j-th Lagrange basis polynomial evaluated at x
        result_l = 1
        for i in range(len(X)):
            if i != j:
                result_l = result_l * (x - X[i]) / (X[j] - X[i])
        # weight the basis polynomial by the value at the j-th defining point
        result = result + Y[j] * result_l

    return result
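# Quick sanity check (illustrative only): a linear series with one gap should be
# filled with the exact linear value.
#
# _s = pd.Series([0.0, 1.0, np.nan, 3.0, 4.0, 5.0, 6.0, 7.0])
# print(LagrangeInterpolation(_s, 2, k=2))  # expected 2.0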
if __name__ == '__main__':
    path = "G:\data\SCADA数据\jb4q_8.csv"

    df = pd.read_csv(path)
    columns = df.columns
    print(df.columns)

    # turn zero values into NaN; double brackets index arbitrary positions
    # print(df['realtime'][1])
    df[:][df[:] == 0] = np.nan  # when comparing by index, convert to the same type first (astype)

    # TODO test single-point interpolation
    print(df['num_gearbox_sumptemp'].isnull())
    # print("插值为:", LagrangeInterpolation(df['num_gearbox_sumptemp'], 47, 2))

    # TODO test interpolation on a single column
    print("之前的空值数量:", df['num_gearbox_sumptemp'].isnull().sum())
    for j in range(len(df)):
        if (df['num_gearbox_sumptemp'].isnull())[j]:
            s = df['num_gearbox_sumptemp']
            df.loc[j, 'num_gearbox_sumptemp'] = LagrangeInterpolation(s, j, 5)
    print("插值之后的空值数量:", df['num_gearbox_sumptemp'].isnull().sum())

    # # TODO process every column
    print("之前的空值数量:", df.isnull().sum())
    for i in columns:
        temp = df[i].isnull()
        if temp[:][temp[:] == True].__len__() > 0:
            for j in range(len(df)):
                if (df[i].isnull())[j]:
                    s = df[i]  # the column currently being interpolated
                    df.loc[j, i] = LagrangeInterpolation(s, j, 3)

    print("插值之后的空值数量:", df.isnull().sum())
    df.to_csv("G:\实验室/2022项目中期\数据治理算法\jb4q_8_lagrange.csv")
@@ -0,0 +1,149 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 7 09:23:31 2020

@author: AlbertHu
"""

import os
import time
import numpy as np
import pandas as pd
import datetime


def findallfiles(cmsfilesfatherpath):  # return the paths of every file under the parent directory, including subdirectories
    cmsfilepaths = []
    files = os.listdir(cmsfilesfatherpath)
    for fi in files:
        fi_d = os.path.join(cmsfilesfatherpath, fi)
        if os.path.isdir(fi_d):
            # files.extend(findcmsfiles(fi_d))
            pass
        else:
            cmsfilepaths.append(fi_d)
    return cmsfilepaths


def findIndexOfExceptPoint(data):
    indexList2D = []
    indexList1 = []
    indexList2 = []
    indexList3 = []
    indexList4 = []
    indexList5 = []
    print("开始清洗")
    for i in data.index:
        if i % 10000 == 0:
            print("已处理了{}组数据".format(i))
        # condition 1
        if data[' 瞬时风速'][i] < 3.5 and data[' 1#叶片变桨角度'][i] > 89:
            indexList1.append(i)
        elif data[' 瞬时风速'][i] >= 3.5 and data[' 瞬时风速'][i] <= 10 and data[' 1#叶片变桨角度'][i] > 0.5:
            indexList1.append(i)
        elif data[' 瞬时风速'][i] >= 11 and data[' 瞬时风速'][i] <= 25 and (data[' 有功功率'][i] < 1800 and data[' 1#叶片变桨角度'][i] > 1.5):
            indexList1.append(i)
        elif data[' 瞬时风速'][i] > 25 and data[' 有功功率'][i] > 0:
            indexList1.append(i)
        else:
            pass
        # condition 2
        if abs(data[' 齿轮箱高速轴前端温度'][i]) > 200 or abs(data[' 齿轮箱高速轴后端温度'][i]) > 200 or abs(data[' 齿轮箱冷却水温'][i]) > 200 or abs(data[' 齿轮箱进口油温'][i]) > 200 or abs(data[' 齿轮箱油池温度'][i]) > 200 or abs(data[' 环境温度'][i]) > 200:
            indexList2.append(i)
        else:
            pass
        # condition 3  # condition 6
        if data[' 齿轮箱高速轴前端温度'][i] > 80 or data[' 齿轮箱高速轴后端温度'][i] > 80 or abs(data[' 齿轮箱高速轴前端温度'][i] - data[' 齿轮箱高速轴后端温度'][i]) > 20:
            indexList3.append(i)
        else:
            pass
        # condition 4
        if data[' 有功功率'][i] > 100 and data[' 齿轮箱进口压力'][i] <= 0:
            indexList4.append(i)
        else:
            pass
        # condition 5
        if abs(data[' 齿轮箱进口压力'][i] - data[' 齿轮箱泵出口压力'][i]) > 5:
            indexList5.append(i)
        else:
            pass
    indexList2D = [indexList1, indexList2, indexList3, indexList4, indexList5]
    return indexList2D
    # # condition 6
    # if data[' 齿轮箱高速轴前端温度'][i] > 80 or data[' 齿轮箱高速轴后端温度'][i] > 80:


fathpath = r'D:\1.SCADA_风电数据\靖边二期2019_已处理'
allfilepaths = findallfiles(fathpath)
testpath = allfilepaths[0]
# allfilepaths = [r'F:\scada_ewma本地数据2(重要)\data\DataResult(靖边二期2019)\风机7.csv']

# testpath=r'F:\scada_ewma本地数据2(重要)\data\DataResult(粤水电达坂城2020.1月-5月)\风机1.csv'
for testpath in allfilepaths:
    data = pd.read_csv(testpath, encoding='gbk', parse_dates=['时间'])
    data.columns

    indexList2D = findIndexOfExceptPoint(data)

    savePath = r'./cleanScada/JB2Q615/风机{}'.format(data['风机号'][1])
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    file = open(savePath + '/IndexOfExceptPoint.txt', 'w')
    a = 1
    for List in indexList2D:
        for i in List:
            file.write(str(i) + ',')
            try:
                data.drop([i], inplace=True)
            except:
                continue
        file.write('第{}组\n'.format(a))
        a += 1
    file.close()

    data.to_csv(savePath + '.csv', encoding='gbk')
@@ -0,0 +1,67 @@
# _*_ coding: UTF-8 _*_

'''
@Author : dingjiawen
@Date   : 2022/7/7 10:29
@Usage  : basic cleaning of the SCADA data
@Desc   :
'''

import tensorflow as tf
import pandas as pd
import numpy as np
import os
import time
from condition_monitoring.lib.IOBase import ioLib

'''
Hyperparameter settings
'''
# parent directory of the files to process
fatherPath = "G:\data\SCADA数据\华能三塘湖"
# parent directory for the processed files
fatherDealedPath = "G:\data\SCADA数据\华能三塘湖\dealed"

baseUseCols = ["时间", "风机号", "发电机转矩", "发电机无功功率", "发电机转速", "发电机有功功率", "发电机绕组最高温度", "齿轮箱油池温度", "齿轮箱进口油温", "齿轮箱进口压力",
               "齿轮箱油泵出口压力", "齿轮箱冷却水温度", "有功功率", "60s平均有功功率", "10min平均有功功率", "10s平均有功功率", "10s平均无功功率", "无功功率", "瞬时风速",
               "机舱温度"]

baseWinds = []


# list every file under the parent directory
def listFile(fatherPath=fatherPath):
    filepaths = []
    files = os.listdir(fatherPath)
    for file in files:
        fi_d = os.path.join(fatherPath, file)
        if os.path.isdir(fi_d):
            pass
            # files.extend(findcmsfiles(fi_d))
        else:
            filepaths.append(fi_d)

    return filepaths


def dropNa(filePath):
    data = pd.read_csv(filePath, low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
    print(data)
    data.dropna(axis=0, how='any', inplace=True)
    print(data)
    # data.append()  # incomplete call left over from development
    ioLib.saveCSV(data=data, savePath=fatherDealedPath)


def separateByWindNum(data):
    indexLists = []
    windList1 = []
    windList2 = []


if __name__ == '__main__':
    filePath = "G:\data\SCADA数据\华能三塘湖/1华能三塘湖20180730-20180803.csv"
@@ -0,0 +1,228 @@
import pandas as pd
import numpy as np
import tensorflow as tf
import csv
import os
import matplotlib.pyplot as plt
import seaborn as sns

'''path of the source data file'''
# source_path = r'G:\data\SCADA数据\jb4q_8.csv'
source_path = "G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"

'''path where the modified data is stored'''
save_path = r'G:\data\SCADA数据\jb4q_8_delete_total_zero.csv'

'''columns that are needed'''

# baseUseCols = ["num_gearbox_sumptemp","num_gearbox_inletoiltemp","num_gearbox_inletpress","num_gearbox_coolingwatertemp"]

# target_path = r'G:\data\SCADA数据\华能三塘湖/dealed/后十万2018.01.16.csv'
# target_folder = r'G:\data\SCADA数据\华能三塘湖/dealed'


# create a folder if it does not exist
def folderGenerate(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)


# Pearson correlation coefficient
def cal_correlation_coefficient(data, label):
    print("计算皮尔逊相关系数")
    print(data)
    print(data.shape)
    pd_data = pd.DataFrame(data)
    person = pd_data.corr()
    print(person)
    # draw a heatmap
    # cmap = sns.heatmap(person, annot=True, xticklabels=label, yticklabels=label)
    # plt.figure(1, figsize=(6.0, 2.68))
    # plt.subplots_adjust(left=0.1, right=0.94, bottom=0.2, top=0.9, wspace=None,
    #                     hspace=None)
    # plt.tight_layout()
    # font1 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 10}  # axis label font size and family
    # font2 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 15}  # axis label font size and family
    # plt.xlabel("X", size=10, fontdict=font1)
    # plt.ylabel("Y", size=10, fontdict=font1)
    # plt.title("Heatmap of correlation coefficient matrix", size=20, fontdict=font1)
    #
    # # adjust the colorbar labels:
    # cbar = cmap.collections[0].colorbar
    # cbar.ax.tick_params(labelsize=15, labelcolor="black")
    # cbar.ax.set_ylabel(ylabel="color scale", color="red", loc="center", fontdict=font2)
    #
    # plt.show()
    return person


def get_most_N_correlation_coefficient(person, N=10):
    print("获得相关度最高的{}个值".format(N))
    # total_correlation = person[1:, 1:]
    abs_correlation = np.abs(person)
    one = np.ones(shape=abs_correlation.shape)
    two = np.subtract(one, abs_correlation)
    rows, cols = two.shape
    total_sum = []
    for i in range(cols):
        # print(two[i])
        total = np.sum(two[i])
        total_sum.append(total)

    print("total_sum:", total_sum)
    # take the N smallest sums: since the values were subtracted from 1, smaller means more correlated
    print("arg:", np.argpartition(total_sum, N))
    min = np.argpartition(total_sum, N)[:N]
    max = np.argpartition(total_sum, N)[total_sum.__len__() - N:]
    print("min:", min)
    return min
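# Illustration only (not part of the original file): on a 3-column toy matrix the helper
# returns the indices of the N columns that are, overall, most correlated with the others.
#
# _toy = np.column_stack([np.arange(100.0), np.arange(100.0) * 2, np.random.rand(100)])
# _p = np.array(cal_correlation_coefficient(_toy, label=None))
# print(get_most_N_correlation_coefficient(_p, N=2))  # expected to contain columns 0 and 1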
# filter, or fill linearly
def findIndexOfExceptPoint(data: pd.DataFrame):
    # indexList2D = []
    # indexList = []
    # indexList2 = []
    # indexList3 = []
    # indexList4 = []
    indexList = []
    print("开始清洗")
    for i in data.index:
        if i % 10000 == 0:
            print("已处理了{}条数据".format(i))
        ## drop rows where most of the values are zero
        # if data['num_gearbox_sumptemp'][i] != 0 and (i < 416166 or i > 432766) and (
        #         data['num_gearbox_pumpoutletpress'][i] == 0 or data['num_activepower'][i] == 0 or
        #         data['num_gen_torque'][i] == 0):
        #     indexList.append(i)
        # drop rows that contain any zero
        # if (i < 416166 or i > 432766) and (
        #         data['num_gearbox_pumpoutletpress'][i] == 0 or data['num_activepower'][i] == 0 or
        #         data['num_gen_torque'][i] == 0):
        #     indexList.append(i)
        # only drop rows that are entirely zero
        if (i < 416166 or i > 432766) and (
                data['num_gearbox_sumptemp'][i] == 0 and data['num_gearbox_inletoiltemp'][i] == 0 and
                data['num_gearbox_inletpress'][i] == 0):
            indexList.append(i)
        else:
            pass

    # indexList2D = [indexList1, indexList2, indexList3, indexList4, indexList5]
    indexList2D = set(indexList)
    print("要移除的index:", indexList2D)
    return indexList2D


# remove abnormal rows by index
def removeDataByIndex(indexList, data):
    print("开始移除异常index的数据")
    a = 1
    data.drop(indexList, inplace=True)
    # for i in indexList:
    #     try:
    #         data.drop([i], inplace=True)
    #     except:
    #         continue
    #     # print('第{}组\n'.format(a))
    #     # a += 1
    return data


# process the data (remove, reassign, or other operations)
def dealData(scada_data: pd.DataFrame):
    # whether to save the processed data
    Is_save = True
    indexList = findIndexOfExceptPoint(scada_data)
    removeDataByIndex(indexList=indexList, data=scada_data)
    print("处理后的数据为:")
    print(scada_data)
    if Is_save:
        print("============保存处理好的数据,路径为{}============".format(save_path))
        scada_data.to_csv(save_path, index=False, encoding='gbk')

    return scada_data


# read the data and convert it to a numpy or tf array
def read_data(file_name, isNew: bool = False):
    '''load the data'''
    with open(file_name, 'r') as f:
        if isNew:
            # scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
            scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', parse_dates=['realtime'])
            print(scada_data)
            scada_data = dealData(scada_data=scada_data)
            print(scada_data.head)
            scada_data = np.array(scada_data)
        else:
            scada_data = np.loadtxt(f, str, delimiter=",")
        label = scada_data[0, 3:]
        label = list(['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt', 'Vx', 'Vy'])
        print("导入数据成功,将数据转为numpy或tf数组...")
        needed_data = scada_data[1:, 3:].astype(dtype=float)
        ## needed_data = tf.cast(needed_data, tf.float32)  a tensor cannot be converted to a pd.DataFrame
        print(needed_data)
        print("转换成功,并返回...")
        return needed_data, label


def plot_original_data(data):
    rows, cols = data.shape
    print("开始画图...")

    for i in range(cols):
        plt.figure(i)
        plt.plot(data[:, i])
        plt.show()


def execute(file_name=source_path, N=10):
    needed_data, label = read_data(file_name=file_name, isNew=False)
    print(needed_data)
    print(needed_data.shape)
    # plot_original_data(needed_data)
    person = cal_correlation_coefficient(needed_data, label)
    person = np.array(person)
    min = get_most_N_correlation_coefficient(person, N=N)

    for index in min:
        if index == min[0]:
            total_data = np.expand_dims(needed_data[:, index], axis=-1)
        else:
            total_data = np.concatenate([total_data, np.expand_dims(needed_data[:, index], axis=-1)], axis=-1)

    return total_data


def deal_data(file_name=source_path):
    '''load the data'''
    with open(file_name, 'r') as f:
        # scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
        scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', parse_dates=['realtime'])
        print(scada_data)
        scada_data = dealData(scada_data=scada_data)
        print(scada_data.head)
        scada_data = np.array(scada_data)

        scada_data = np.loadtxt(f, str, delimiter=",")
        label = scada_data[0, 3:]
        label = list(
            ['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt',
             'Vx', 'Vy'])
        print("导入数据成功,将数据转为numpy或tf数组...")
        needed_data = scada_data[1:, 3:].astype(dtype=float)
        ## needed_data = tf.cast(needed_data, tf.float32)  a tensor cannot be converted to a pd.DataFrame
        print(needed_data)
        print("转换成功,并返回...")
        return needed_data, label
    pass


if __name__ == '__main__':
    total_data = execute(N=10, file_name=source_path)
    # print(total_data)
    # print(total_data.shape)
    # plot_original_data()
@@ -0,0 +1,207 @@
import pandas as pd
import numpy as np
import tensorflow as tf
import csv
import os
import matplotlib.pyplot as plt
import seaborn as sns

'''path of the source data file'''
# source_path = r'G:\data\SCADA数据\jb4q_8.csv'
source_path = "G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"

'''path where the modified data is stored'''
save_path = r'G:\data\SCADA数据\jb4q_8_delete_total_zero.csv'

'''columns that are needed'''

# baseUseCols = ["num_gearbox_sumptemp","num_gearbox_inletoiltemp","num_gearbox_inletpress","num_gearbox_coolingwatertemp"]

# target_path = r'G:\data\SCADA数据\华能三塘湖/dealed/后十万2018.01.16.csv'
# target_folder = r'G:\data\SCADA数据\华能三塘湖/dealed'

# 96748 107116


# create a folder if it does not exist
def folderGenerate(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)


# Pearson correlation coefficient
def cal_correlation_coefficient(data, label):
    print("计算皮尔逊相关系数")
    pd_data = pd.DataFrame(data)
    person = pd_data.corr()
    print(person)
    # draw a heatmap
    # cmap = sns.heatmap(person, annot=True, xticklabels=label, yticklabels=label)
    # plt.figure(1, figsize=(6.0, 2.68))
    # plt.subplots_adjust(left=0.1, right=0.94, bottom=0.2, top=0.9, wspace=None,
    #                     hspace=None)
    # plt.tight_layout()
    # font1 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 10}  # axis label font size and family
    # font2 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 15}  # axis label font size and family
    # plt.xlabel("X", size=10, fontdict=font1)
    # plt.ylabel("Y", size=10, fontdict=font1)
    # plt.title("Heatmap of correlation coefficient matrix", size=20, fontdict=font1)
    #
    # # adjust the colorbar labels:
    # cbar = cmap.collections[0].colorbar
    # cbar.ax.tick_params(labelsize=15, labelcolor="black")
    # cbar.ax.set_ylabel(ylabel="color scale", color="red", loc="center", fontdict=font2)
    #
    # plt.show()
    return person


def get_most_N_correlation_coefficient(person, N=10):
    print("获得相关度最高的{}个值".format(N))
    # total_correlation = person[1:, 1:]
    abs_correlation = np.abs(person)
    one = np.ones(shape=abs_correlation.shape)
    two = np.subtract(one, abs_correlation)
    rows, cols = two.shape
    total_sum = []
    for i in range(cols):
        # print(two[i])
        total = np.sum(two[i])
        total_sum.append(total)

    print("total_sum:", total_sum)
    # take the N smallest sums: since the values were subtracted from 1, smaller means more correlated
    print("arg:", np.argpartition(total_sum, N))
    min = np.argpartition(total_sum, N)[:N]
    max = np.argpartition(total_sum, N)[total_sum.__len__() - N:]
    print("min:", min)
    return min


# filter, or fill linearly
def findIndexOfExceptPoint(data: pd.DataFrame):
    # indexList2D = []
    # indexList = []
    # indexList2 = []
    # indexList3 = []
    # indexList4 = []
    indexList = []
    print("开始清洗")
    for i in data.index:
        if i % 10000 == 0:
            print("已处理了{}条数据".format(i))
        ## drop rows where most of the values are zero
        # if data['num_gearbox_sumptemp'][i] != 0 and (i < 416166 or i > 432766) and (
        #         data['num_gearbox_pumpoutletpress'][i] == 0 or data['num_activepower'][i] == 0 or
        #         data['num_gen_torque'][i] == 0):
        #     indexList.append(i)
        # drop rows that contain any zero
        # if (i < 416166 or i > 432766) and (
        #         data['num_gearbox_pumpoutletpress'][i] == 0 or data['num_activepower'][i] == 0 or
        #         data['num_gen_torque'][i] == 0):
        #     indexList.append(i)
        # only drop rows that are entirely zero
        if (i < 416166 or i > 432766) and (
                data['num_gearbox_sumptemp'][i] == 0 and data['num_gearbox_inletoiltemp'][i] == 0 and
                data['num_gearbox_inletpress'][i] == 0):
            indexList.append(i)
        else:
            pass

    # indexList2D = [indexList1, indexList2, indexList3, indexList4, indexList5]
    indexList2D = set(indexList)
    print("要移除的index:", indexList2D)
    return indexList2D


# remove abnormal rows by index
def removeDataByIndex(indexList, data):
    print("开始移除异常index的数据")
    a = 1
    data.drop(indexList, inplace=True)
    # for i in indexList:
    #     try:
    #         data.drop([i], inplace=True)
    #     except:
    #         continue
    #     # print('第{}组\n'.format(a))
    #     # a += 1
    return data


# process the data (remove, reassign, or other operations)
def dealData(scada_data: pd.DataFrame):
    # whether to save the processed data
    Is_save = True
    indexList = findIndexOfExceptPoint(scada_data)
    removeDataByIndex(indexList=indexList, data=scada_data)
    print("处理后的数据为:")
    print(scada_data)
    if Is_save:
        print("============保存处理好的数据,路径为{}============".format(save_path))
        scada_data.to_csv(save_path, index=False, encoding='gbk')

    return scada_data


# read the data and convert it to a numpy or tf array
def read_data(file_name, isNew: bool = False):
    '''load the data'''
    with open(file_name, 'r') as f:
        if isNew:
            # scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
            scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', parse_dates=['realtime'])
            print(scada_data)
            scada_data = dealData(scada_data=scada_data)
            print(scada_data.head)
            scada_data = np.array(scada_data)
        else:
            scada_data = np.loadtxt(f, str, delimiter=",")
        label = scada_data[0, 4:]
        label = list(['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt', 'Vx', 'Vy'])
        print("导入数据成功,将数据转为numpy或tf数组...")
        needed_data = scada_data[1:, 4:].astype(dtype=float)
        ## needed_data = tf.cast(needed_data, tf.float32)  a tensor cannot be converted to a pd.DataFrame
        print(needed_data)
        print("转换成功,并返回...")
        return needed_data, label


def plot_original_data(data):
    rows, cols = data.shape
    print("开始画图...")

    for i in range(cols):
        plt.figure(i)
        plt.plot(data[:, i])
        plt.show()


def execute(file_name=source_path, N=10):
    needed_data, label = read_data(file_name=file_name, isNew=False)
    print(needed_data)
    print(needed_data.shape)
    # plot_original_data(needed_data)
    person = cal_correlation_coefficient(needed_data, label)
    person = np.array(person)
    min = get_most_N_correlation_coefficient(person, N=N)

    for index in min:
        if index == min[0]:
            total_data = np.expand_dims(needed_data[:, index], axis=-1)
        else:
            total_data = np.concatenate([total_data, np.expand_dims(needed_data[:, index], axis=-1)], axis=-1)

    return total_data


if __name__ == '__main__':
    # total_data = execute(N=10, file_name=source_path)
    # print(total_data)
    # print(total_data.shape)  # 7 10 13
    # turbine 15 has a stretch in the middle that differs a lot
    file_name = 'H:\data\SCADA数据\SCADA_已处理_粤水电达坂城2020.1月-5月/风机15.csv'
    needed_data, label = read_data(file_name=file_name, isNew=False)
    print(needed_data.shape)
    plot_original_data(needed_data)
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2022/11/2 12:59
@Usage  : plot the raw data
@Desc   :
'''
import pandas as pd
import numpy as np


source_path = "G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"


def deal_data(file_name=source_path):
    '''load the data'''
    with open(file_name, 'r') as f:
        scada_data = np.loadtxt(f, str, delimiter=",")
        label = scada_data[0, 3:]
        label = list(
            ['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt',
             'Vx', 'Vy'])
        print("导入数据成功,将数据转为numpy或tf数组...")
        needed_data = scada_data[1:37000, 3:].astype(dtype=float)
        ## needed_data = tf.cast(needed_data, tf.float32)  a tensor cannot be converted to a pd.DataFrame
        print(needed_data)
        print("转换成功,并返回...")
        return needed_data, label
    pass


# min-max normalization
def normalization(data):
    rows, cols = data.shape
    print("归一化之前:", data)
    print(data.shape)
    print("======================")

    # scale each column to [0, 1]
    max = np.max(data, axis=0)
    max = np.broadcast_to(max, [rows, cols])
    min = np.min(data, axis=0)
    min = np.broadcast_to(min, [rows, cols])

    data = (data - min) / (max - min)
    print("归一化之后:", data)
    print(data.shape)

    return data


if __name__ == '__main__':
    needed_data, label = deal_data()
    data = normalization(data=needed_data)
    np.savetxt('G:\data\SCADA数据/normalization.csv', data, delimiter=',')
    print(data.shape)
@@ -0,0 +1,262 @@
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from condition_monitoring.data_deal import loadData
from keras.callbacks import EarlyStopping
import os
import shutil

# senior Kong's idea: CNN + GRU


'''hyperparameter settings'''
time_stamp = 120
feature_num = 10
batch_size = 8
learning_rate = 0.01
EPOCH = 101
model_name = "CNN_GRU"
'''EWMA hyperparameters'''
K = 18
namuda = 0.01
'''save name'''
save_name = "../model/{0}_timestamp{1}_featureNum{2}_batch_size{3}_Epoch{4}.h5".format(model_name,
                                                                                       time_stamp, feature_num,
                                                                                       batch_size, EPOCH)
'''file name'''
file_name = "G:\data\SCADA数据\jb4q_8_delete_all_zero.csv"


def remove(data, time_stamp=time_stamp):
    rows, cols = data.shape
    print("remove_data.shape:", data.shape)
    num = int(rows / time_stamp)

    return data[:num * time_stamp, :]
    pass


# non-overlapping sampling
def get_training_data(data, time_stamp=time_stamp):
    removed_data = remove(data=data)
    rows, cols = removed_data.shape
    # print("removed_data.shape:", data.shape)
    # print("removed_data:", removed_data)
    train_data = np.reshape(removed_data, [-1, time_stamp, cols])
    # print("train_data:", train_data)
    batchs, time_stamp, cols = train_data.shape

    for i in range(1, batchs):
        each_label = np.expand_dims(train_data[i, 0, :], axis=0)
        if i == 1:
            train_label = each_label
        else:
            train_label = np.concatenate([train_label, each_label], axis=0)

    # print("train_data.shape:", train_data.shape)
    # print("train_label.shape", train_label.shape)
    return train_data[:-1, :], train_label


# overlapping sampling
def get_training_data_overlapping(data, time_stamp=time_stamp):
    rows, cols = data.shape
    train_data = np.empty(shape=[rows - time_stamp - 1, time_stamp, cols])
    train_label = np.empty(shape=[rows - time_stamp - 1, cols])
    for i in range(rows):
        if i + time_stamp >= rows:
            break
        if i + time_stamp < rows - 1:
            train_data[i] = data[i:i + time_stamp]
            train_label[i] = data[i + time_stamp]

    print("重叠采样以后:")
    print("data:", train_data)
    print("label:", train_label)

    return train_data, train_label
|
||||
|
||||
|
||||
def condition_monitoring_model():
|
||||
input = tf.keras.Input(shape=[time_stamp, feature_num])
|
||||
conv1 = tf.keras.layers.Conv1D(filters=256, kernel_size=1)(input)
|
||||
GRU1 = tf.keras.layers.GRU(128, return_sequences=False)(conv1)
|
||||
d1 = tf.keras.layers.Dense(300)(GRU1)
|
||||
output = tf.keras.layers.Dense(10)(d1)
|
||||
model = tf.keras.Model(inputs=input, outputs=output)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
# Min-max normalization
|
||||
def normalization(data):
|
||||
rows, cols = data.shape
|
||||
print("归一化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# scale each column to [0, 1]
|
||||
max = np.max(data, axis=0)
|
||||
max = np.broadcast_to(max, [rows, cols])
|
||||
min = np.min(data, axis=0)
|
||||
min = np.broadcast_to(min, [rows, cols])
|
||||
|
||||
data = (data - min) / (max - min)
|
||||
print("归一化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
# Standardization (z-score scaling)
|
||||
def Regularization(data):
|
||||
rows, cols = data.shape
|
||||
print("正则化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# subtract the column mean and divide by the column standard deviation
|
||||
mean = np.mean(data, axis=0)
|
||||
mean = np.broadcast_to(mean, shape=[rows, cols])
|
||||
dst = np.sqrt(np.var(data, axis=0))
|
||||
dst = np.broadcast_to(dst, shape=[rows, cols])
|
||||
data = (data - mean) / dst
|
||||
print("正则化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
pass
|
||||
|
||||
|
||||
def EWMA(data, K=K, namuda=namuda):
|
||||
# the meaning of t (the EWMA sample index) is not settled yet; fixed to 0 for now
|
||||
t = 0
|
||||
mid = np.mean(data, axis=0)
|
||||
standard = np.sqrt(np.var(data, axis=0))
|
||||
UCL = mid + K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
|
||||
LCL = mid - K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
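# Note (added): the textbook EWMA limit uses (1 - namuda) ** (2 * t); as written, (1 - namuda) ** 2 * t
# means ((1 - lambda)^2) * t, so with t = 0 above both limits reduce to the steady-state value mid +/- K * std * sqrt(namuda / (2 - namuda)).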
|
||||
return mid, UCL, LCL
|
||||
pass
|
||||
|
||||
|
||||
def get_MSE(data, label, new_model):
|
||||
predicted_data = new_model.predict(data)
|
||||
|
||||
temp = np.abs(predicted_data - label)
|
||||
temp1 = (temp - np.broadcast_to(np.mean(temp, axis=0), shape=predicted_data.shape))
|
||||
temp2 = np.broadcast_to(np.sqrt(np.var(temp, axis=0)), shape=predicted_data.shape)
|
||||
temp3 = temp1/temp2
|
||||
mse = np.sum((temp1 / temp2) ** 2, axis=1)
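# Despite the name, mse here is the sum over features of squared z-scores of the absolute residuals, i.e. a standardized health indicator rather than a plain mean squared error.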
|
||||
print("z:", mse)
|
||||
print(mse.shape)
|
||||
|
||||
# mse=np.mean((predicted_data-label)**2,axis=1)
|
||||
print("mse", mse)
|
||||
|
||||
dims, = mse.shape
|
||||
|
||||
mean = np.mean(mse)
|
||||
std = np.sqrt(np.var(mse))
|
||||
max = mean + 3 * std
|
||||
# min = mean-3*std
|
||||
max = np.broadcast_to(max, shape=[dims, ])
|
||||
# min = np.broadcast_to(min,shape=[dims,])
|
||||
mean = np.broadcast_to(mean, shape=[dims, ])
|
||||
|
||||
# plt.plot(max)
|
||||
# plt.plot(mse)
|
||||
# plt.plot(mean)
|
||||
# # plt.plot(min)
|
||||
# plt.show()
|
||||
#
|
||||
#
|
||||
return mse,mean,max
|
||||
# pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
total_data = loadData.execute(N=feature_num,file_name=file_name)
|
||||
total_data = normalization(data=total_data)
|
||||
train_data, train_label = get_training_data_overlapping(total_data[:300455, :])
|
||||
|
||||
## TODO training
|
||||
# model = condition_monitoring_model()
|
||||
# checkpoint = tf.keras.callbacks.ModelCheckpoint(
|
||||
# filepath=save_name,
|
||||
# monitor='val_loss',
|
||||
# verbose=1,
|
||||
# save_best_only=True,
|
||||
# mode='min',
|
||||
# period=1)
|
||||
# lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)
|
||||
# early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=30, mode='min', verbose=1)
|
||||
# model.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate), loss=tf.losses.mse)
|
||||
# model.summary()
|
||||
# model.fit(train_data, train_label, batch_size=batch_size, epochs=EPOCH, validation_split=0.1,
|
||||
# callbacks=[checkpoint, lr_scheduler, early_stop])
|
||||
|
||||
## TODO testing
|
||||
print("===============================")
|
||||
print(total_data.shape)
|
||||
print("===============================")
|
||||
test_data, test_label = get_training_data(total_data[:300455, :])
|
||||
newModel = tf.keras.models.load_model(save_name)
|
||||
mse,mean,max = get_MSE(test_data, test_label, new_model=newModel)
|
||||
print("===============================")
|
||||
print("mse:",mse)
|
||||
print(mse.shape)
|
||||
print("===============================")
|
||||
|
||||
|
||||
test_data, test_label = get_training_data(total_data[20000:, :])
|
||||
predicted_data = newModel.predict(test_data)
|
||||
rows, cols = predicted_data.shape
|
||||
print("=====================================")
|
||||
print(predicted_data)
|
||||
print(predicted_data.shape)
|
||||
print("=====================================")
|
||||
|
||||
temp = np.abs(predicted_data - test_label)
|
||||
temp1 = (temp - np.broadcast_to(np.mean(temp, axis=0), shape=predicted_data.shape))
|
||||
temp2 = np.broadcast_to(np.sqrt(np.var(temp, axis=0)), shape=predicted_data.shape)
|
||||
temp3 = temp1 / temp2
|
||||
mse = np.sum((temp1 / temp2) ** 2, axis=1)
|
||||
print("====================")
|
||||
print("new_mse:",mse)
|
||||
print(mse.shape)
|
||||
np.savetxt("mse", mse, delimiter=',')
|
||||
print("===================")
|
||||
|
||||
plt.plot(mse[2000:])
|
||||
plt.plot(mean)
|
||||
plt.plot(max)
|
||||
plt.show()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
data = pd.DataFrame(mse).ewm(span=3).mean()
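# Exponentially weighted moving average (span=3) smooths the health indicator before plotting/thresholding.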
|
||||
print(data)
|
||||
data =np.array(data)
|
||||
|
||||
index,_ = data.shape
|
||||
|
||||
|
||||
|
||||
for i in range(2396):
|
||||
if data[i,0] >5:
|
||||
data[i,0] = data[i-1,:]
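# Simple spike clipping: values above 5 within the first 2396 points are replaced with the previous smoothed value.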
|
||||
print(data)
|
||||
mean = data[2000:2396,:].mean()
|
||||
std = data[2000:2396,:].std()
|
||||
mean=np.broadcast_to(mean,shape=[500,])
|
||||
std=np.broadcast_to(std,shape=[500,])
|
||||
plt.plot(data[2000:2396,:])
|
||||
plt.plot(mean)
|
||||
plt.plot(mean+3*std)
|
||||
plt.plot(mean-3*std)
|
||||
plt.show()
|
||||
|
|
@ -0,0 +1,526 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# coding: utf-8
|
||||
import tensorflow as tf
|
||||
import tensorflow.keras
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from model.DepthwiseCon1D.DepthwiseConv1D import DepthwiseConv1D
|
||||
from model.Dynamic_channelAttention.Dynamic_channelAttention import DynamicChannelAttention
|
||||
from condition_monitoring.data_deal import loadData
|
||||
from model.Joint_Monitoring.Joint_Monitoring2 import Joint_Monitoring
|
||||
|
||||
from model.CommonFunction.CommonFunction import *
|
||||
from sklearn.model_selection import train_test_split
|
||||
from tensorflow.keras.models import load_model, save_model
|
||||
|
||||
'''
|
||||
@Author : dingjiawen
|
||||
@Date : 2022/7/8 10:29
|
||||
@Usage : Combine the prediction and classification approaches for joint condition monitoring
|
||||
@Desc : RepVGG + upsampling + GRU for reconstruction, followed by GDP (global dynamic pooling) and a classifier;
|
||||
an MSE loss whose weight decays with the epoch plus a cross-entropy loss whose weight grows with the epoch
|
||||
'''
|
||||
|
||||
'''Hyperparameter settings'''
|
||||
time_stamp = 120
|
||||
feature_num = 10
|
||||
batch_size = 16
|
||||
learning_rate = 0.001
|
||||
EPOCH = 101
|
||||
model_name = "joint"
|
||||
'''EWMA hyperparameters'''
|
||||
K = 18
|
||||
namuda = 0.01
|
||||
'''Save paths'''
|
||||
|
||||
save_name = "../model/weight/{0}_timestamp{1}_feature{2}_Epoch{4}_weight/weight".format(model_name,
|
||||
time_stamp,
|
||||
feature_num,
|
||||
batch_size,
|
||||
EPOCH)
|
||||
save_step_two_name = "../model/two_weight/{0}_timestamp{1}_feature{2}_weight/weight".format(model_name,
|
||||
time_stamp,
|
||||
feature_num,
|
||||
batch_size,
|
||||
EPOCH)
|
||||
|
||||
# save_name = "../model/joint/{0}_timestamp{1}_feature{2}.h5".format(model_name,
|
||||
# time_stamp,
|
||||
# feature_num,
|
||||
# batch_size,
|
||||
# EPOCH)
|
||||
# save_step_two_name = "../model/joint_two/{0}_timestamp{1}_feature{2}.h5".format(model_name,
|
||||
# time_stamp,
|
||||
# feature_num,
|
||||
# batch_size,
|
||||
# EPOCH)
|
||||
'''Data file'''
|
||||
file_name = "G:\data\SCADA数据\jb4q_8_delete_all_zero.csv"
|
||||
|
||||
'''
|
||||
File description: jb4q_8_delete_all_zero.csv is the file with all zero values removed except the anomalous ones.
|
||||
Rows 0:300454 are healthy data (2019/7/30 00:00:00 - 2019/9/18 11:21:00)
|
||||
Rows 300455:317052 are faulty data (2019/9/18 11:21:01 - 2019/9/29 23:59:00)
|
||||
'''
|
||||
'''File parameters'''
|
||||
# last healthy row index
|
||||
healthy_date = 300454
|
||||
# last faulty row index
|
||||
unhealthy_date = 317052
|
||||
# fault-onset tolerance (number of rows)
|
||||
unhealthy_patience = 5
|
||||
|
||||
|
||||
def remove(data, time_stamp=time_stamp):
|
||||
rows, cols = data.shape
|
||||
print("remove_data.shape:", data.shape)
|
||||
num = int(rows / time_stamp)
|
||||
|
||||
return data[:num * time_stamp, :]
|
||||
pass
|
||||
|
||||
|
||||
# Non-overlapping sampling
|
||||
def get_training_data(data, time_stamp: int = time_stamp):
|
||||
removed_data = remove(data=data)
|
||||
rows, cols = removed_data.shape
|
||||
print("removed_data.shape:", data.shape)
|
||||
print("removed_data:", removed_data)
|
||||
train_data = np.reshape(removed_data, [-1, time_stamp, cols])
|
||||
print("train_data:", train_data)
|
||||
batchs, time_stamp, cols = train_data.shape
|
||||
|
||||
for i in range(1, batchs):
|
||||
each_label = np.expand_dims(train_data[i, 0, :], axis=0)
|
||||
if i == 1:
|
||||
train_label = each_label
|
||||
else:
|
||||
train_label = np.concatenate([train_label, each_label], axis=0)
|
||||
|
||||
print("train_data.shape:", train_data.shape)
|
||||
print("train_label.shape", train_label.shape)
|
||||
return train_data[:-1, :], train_label
|
||||
|
||||
|
||||
# Overlapping (sliding-window) sampling
|
||||
def get_training_data_overlapping(data, time_stamp: int = time_stamp, is_Healthy: bool = True):
|
||||
rows, cols = data.shape
|
||||
train_data = np.empty(shape=[rows - time_stamp - 1, time_stamp, cols])
|
||||
train_label = np.empty(shape=[rows - time_stamp - 1, cols])
|
||||
for i in range(rows):
|
||||
if i + time_stamp >= rows:
|
||||
break
|
||||
if i + time_stamp < rows - 1:
|
||||
train_data[i] = data[i:i + time_stamp]
|
||||
train_label[i] = data[i + time_stamp]
|
||||
|
||||
print("重叠采样以后:")
|
||||
print("data:", train_data) # (300334,120,10)
|
||||
print("label:", train_label) # (300334,10)
|
||||
|
||||
if is_Healthy:
|
||||
train_label2 = np.ones(shape=[train_label.shape[0]])
|
||||
else:
|
||||
train_label2 = np.zeros(shape=[train_label.shape[0]])
|
||||
|
||||
print("label2:", train_label2)
|
||||
|
||||
return train_data, train_label, train_label2
|
||||
|
||||
|
||||
# RepConv: RepVGG-style re-parameterizable convolution (k-sized conv + 1x1 conv + identity, each batch-normalized, summed, then ReLU)
|
||||
def RepConv(input_tensor, k=3):
|
||||
_, _, output_dim = input_tensor.shape
|
||||
conv1 = tf.keras.layers.Conv1D(filters=output_dim, kernel_size=k, strides=1, padding='SAME')(input_tensor)
|
||||
b1 = tf.keras.layers.BatchNormalization()(conv1)
|
||||
|
||||
conv2 = tf.keras.layers.Conv1D(filters=output_dim, kernel_size=1, strides=1, padding='SAME')(input_tensor)
|
||||
b2 = tf.keras.layers.BatchNormalization()(conv2)
|
||||
|
||||
b3 = tf.keras.layers.BatchNormalization()(input_tensor)
|
||||
|
||||
out = tf.keras.layers.Add()([b1, b2, b3])
|
||||
out = tf.nn.relu(out)
|
||||
return out
|
||||
|
||||
|
||||
# RepBlock: a stack of num RepConv layers
|
||||
def RepBlock(input_tensor, num: int = 3):
|
||||
for i in range(num):
|
||||
input_tensor = RepConv(input_tensor)
|
||||
return input_tensor
|
||||
|
||||
|
||||
# GAP: global-average-pooling channel attention
|
||||
def Global_avg_channelAttention(input_tensor):
|
||||
_, length, channel = input_tensor.shape
|
||||
DWC1 = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
|
||||
GAP = tf.keras.layers.GlobalAvgPool1D()(DWC1)
|
||||
c1 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GAP)
|
||||
s1 = tf.nn.sigmoid(c1)
|
||||
output = tf.multiply(input_tensor, s1)
|
||||
return output
|
||||
|
||||
|
||||
# GDP: global dynamic pooling (channel attention)
|
||||
def Global_Dynamic_channelAttention(input_tensor):
|
||||
_, length, channel = input_tensor.shape
|
||||
DWC1 = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
|
||||
|
||||
# GAP
|
||||
GAP = tf.keras.layers.GlobalAvgPool1D()(DWC1)
|
||||
c1 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GAP)
|
||||
s1 = tf.nn.sigmoid(c1)
|
||||
|
||||
# GMP
|
||||
GMP = tf.keras.layers.GlobalMaxPool1D()(DWC1)
|
||||
c2 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GMP)
|
||||
s3 = tf.nn.sigmoid(c2)
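# Note (added): the GMP branch output s3 is computed but never used below; the output is weighted by the GAP branch (s1) only, so the two were presumably meant to be combined.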
|
||||
|
||||
output = tf.multiply(input_tensor, s1)
|
||||
return output
|
||||
|
||||
|
||||
# Min-max normalization
|
||||
def normalization(data):
|
||||
rows, cols = data.shape
|
||||
print("归一化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# scale each column to [0, 1]
|
||||
max = np.max(data, axis=0)
|
||||
max = np.broadcast_to(max, [rows, cols])
|
||||
min = np.min(data, axis=0)
|
||||
min = np.broadcast_to(min, [rows, cols])
|
||||
|
||||
data = (data - min) / (max - min)
|
||||
print("归一化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
# Standardization (z-score scaling)
|
||||
def Regularization(data):
|
||||
rows, cols = data.shape
|
||||
print("正则化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# subtract the column mean and divide by the column standard deviation
|
||||
mean = np.mean(data, axis=0)
|
||||
mean = np.broadcast_to(mean, shape=[rows, cols])
|
||||
dst = np.sqrt(np.var(data, axis=0))
|
||||
dst = np.broadcast_to(dst, shape=[rows, cols])
|
||||
data = (data - mean) / dst
|
||||
print("正则化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
pass
|
||||
|
||||
|
||||
def EWMA(data, K=K, namuda=namuda):
|
||||
# the meaning of t (the EWMA sample index) is not settled yet; fixed to 0 for now
|
||||
t = 0
|
||||
mid = np.mean(data, axis=0)
|
||||
standard = np.sqrt(np.var(data, axis=0))
|
||||
UCL = mid + K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
|
||||
LCL = mid - K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
|
||||
return mid, UCL, LCL
|
||||
pass
|
||||
|
||||
|
||||
def get_MSE(data, label, new_model):
|
||||
predicted_data = new_model.predict(data)
|
||||
|
||||
temp = np.abs(predicted_data - label)
|
||||
temp1 = (temp - np.broadcast_to(np.mean(temp, axis=0), shape=predicted_data.shape))
|
||||
temp2 = np.broadcast_to(np.sqrt(np.var(temp, axis=0)), shape=predicted_data.shape)
|
||||
temp3 = temp1 / temp2
|
||||
mse = np.sum((temp1 / temp2) ** 2, axis=1)
|
||||
print("z:", mse)
|
||||
print(mse.shape)
|
||||
|
||||
# mse=np.mean((predicted_data-label)**2,axis=1)
|
||||
print("mse", mse)
|
||||
|
||||
dims, = mse.shape
|
||||
|
||||
mean = np.mean(mse)
|
||||
std = np.sqrt(np.var(mse))
|
||||
max = mean + 3 * std
|
||||
# min = mean-3*std
|
||||
max = np.broadcast_to(max, shape=[dims, ])
|
||||
# min = np.broadcast_to(min,shape=[dims,])
|
||||
mean = np.broadcast_to(mean, shape=[dims, ])
|
||||
|
||||
# plt.plot(max)
|
||||
# plt.plot(mse)
|
||||
# plt.plot(mean)
|
||||
# # plt.plot(min)
|
||||
# plt.show()
|
||||
#
|
||||
#
|
||||
return mse, mean, max
|
||||
# pass
|
||||
|
||||
|
||||
def condition_monitoring_model():
|
||||
input = tf.keras.Input(shape=[time_stamp, feature_num])
|
||||
conv1 = tf.keras.layers.Conv1D(filters=256, kernel_size=1)(input)
|
||||
GRU1 = tf.keras.layers.GRU(128, return_sequences=False)(conv1)
|
||||
d1 = tf.keras.layers.Dense(300)(GRU1)
|
||||
output = tf.keras.layers.Dense(10)(d1)
|
||||
|
||||
model = tf.keras.Model(inputs=input, outputs=output)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
# train_data: (300455, 120, 10)
|
||||
# train_label1: (300455, 10)
|
||||
# train_label2: (300455,)
|
||||
def shuffle(train_data, train_label1, train_label2, is_split: bool = False, split_size: float = 0.2):
|
||||
(train_data, test_data, train_label1, test_label1, train_label2, test_label2) = train_test_split(train_data,
|
||||
train_label1,
|
||||
train_label2,
|
||||
test_size=split_size,
|
||||
shuffle=True,
|
||||
random_state=100)
|
||||
if is_split:
|
||||
return train_data, train_label1, train_label2, test_data, test_label1, test_label2
|
||||
train_data = np.concatenate([train_data, test_data], axis=0)
|
||||
train_label1 = np.concatenate([train_label1, test_label1], axis=0)
|
||||
train_label2 = np.concatenate([train_label2, test_label2], axis=0)
|
||||
# print(train_data.shape)
|
||||
# print(train_label1.shape)
|
||||
# print(train_label2.shape)
|
||||
# print(train_data.shape)
|
||||
|
||||
return train_data, train_label1, train_label2
|
||||
pass
|
||||
|
||||
|
||||
def split_test_data(healthy_data, healthy_label1, healthy_label2, unhealthy_data, unhealthy_label1, unhealthy_label2,
|
||||
split_size: float = 0.2):
|
||||
data = np.concatenate([healthy_data, unhealthy_data], axis=0)
|
||||
label1 = np.concatenate([healthy_label1, unhealthy_label1], axis=0)
|
||||
label2 = np.concatenate([healthy_label2, unhealthy_label2], axis=0)
|
||||
(train_data, test_data, train_label1, test_label1, train_label2, test_label2) = train_test_split(data,
|
||||
label1,
|
||||
label2,
|
||||
test_size=split_size,
|
||||
shuffle=True,
|
||||
random_state=100)
|
||||
|
||||
# print(train_data.shape)
|
||||
# print(train_label1.shape)
|
||||
# print(train_label2.shape)
|
||||
# print(train_data.shape)
|
||||
|
||||
return train_data, train_label1, train_label2, test_data, test_label1, test_label2
|
||||
|
||||
pass
|
||||
|
||||
|
||||
# train_data: (300455, 120, 10)
|
||||
# train_label1: (300455, 10)
|
||||
# train_label2: (300455,)
|
||||
def train_step_one(train_data, train_label1, train_label2):
|
||||
model = Joint_Monitoring()
|
||||
# # # # TODO the model must be built / called once before model.summary() can be printed
|
||||
# model.build(input_shape=(batch_size, filter_num, dims))
|
||||
# model.summary()
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
learning_rate = 1e-3
|
||||
for epoch in range(EPOCH):
|
||||
|
||||
print()
|
||||
print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
|
||||
if epoch == 0:
|
||||
train_data, train_label1, train_label2, val_data, val_label1, val_label2 = shuffle(train_data, train_label1,
|
||||
train_label2,
|
||||
is_split=True)
|
||||
# print()
|
||||
# print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
# z tells train() which mini-batch of this epoch we are on
|
||||
z = 0
|
||||
# k counts samples; a training step is run once every batch_size samples
|
||||
k = 1
|
||||
for data_1, label_1, label_2 in zip(train_data, train_label1, train_label2):
|
||||
size, _, _ = train_data.shape
|
||||
data_1 = tf.expand_dims(data_1, axis=0)
|
||||
label_1 = tf.expand_dims(label_1, axis=0)
|
||||
label_2 = tf.expand_dims(label_2, axis=0)
|
||||
if batch_size != 1:
|
||||
if k % batch_size == 1:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
else:
|
||||
data = tf.concat([data, data_1], axis=0)
|
||||
label1 = tf.concat([label1, label_1], axis=0)
|
||||
label2 = tf.concat([label2, label_2], axis=0)
|
||||
else:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
|
||||
if k % batch_size == 0:
|
||||
# label = tf.expand_dims(label, axis=-1)
|
||||
loss_value = model.train(input_tensor=data, label1=label1, label2=label2, learning_rate=learning_rate,
|
||||
is_first_time=True)
|
||||
print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy())
|
||||
k = 0
|
||||
z = z + 1
|
||||
k = k + 1
|
||||
val_loss = model.get_val_loss(val_data=val_data, val_label1=val_label1, val_label2=val_label2,
|
||||
is_first_time=True)
|
||||
SaveBestModel(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
|
||||
# SaveBestH5Model(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
|
||||
history_val_loss.append(val_loss)
|
||||
history_loss.append(loss_value.numpy())
|
||||
print('Training loss is :', loss_value.numpy())
|
||||
print('Validating loss is :', val_loss.numpy())
|
||||
if IsStopTraining(history_loss=history_val_loss, patience=7):
|
||||
break
|
||||
if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
|
||||
if learning_rate >= 1e-4:
|
||||
learning_rate = learning_rate * 0.1
|
||||
pass
|
||||
|
||||
|
||||
def train_step_two(step_one_model, step_two_model, train_data, train_label1, train_label2):
|
||||
# step_two_model = Joint_Monitoring()
|
||||
# step_two_model.build(input_shape=(batch_size, time_stamp, feature_num))
|
||||
# step_two_model.summary()
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
history_accuracy = []
|
||||
learning_rate = 1e-3
|
||||
for epoch in range(EPOCH):
|
||||
print()
|
||||
print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
|
||||
if epoch == 0:
|
||||
train_data, train_label1, train_label2, val_data, val_label1, val_label2 = shuffle(train_data, train_label1,
|
||||
train_label2,
|
||||
is_split=True)
|
||||
# print()
|
||||
# print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
# z tells train() which mini-batch of this epoch we are on
|
||||
z = 0
|
||||
# k counts samples; a training step is run once every batch_size samples
|
||||
k = 1
|
||||
accuracy_num = 0
|
||||
for data_1, label_1, label_2 in zip(train_data, train_label1, train_label2):
|
||||
size, _, _ = train_data.shape
|
||||
data_1 = tf.expand_dims(data_1, axis=0)
|
||||
label_1 = tf.expand_dims(label_1, axis=0)
|
||||
label_2 = tf.expand_dims(label_2, axis=0)
|
||||
if batch_size != 1:
|
||||
if k % batch_size == 1:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
else:
|
||||
data = tf.concat([data, data_1], axis=0)
|
||||
label1 = tf.concat([label1, label_1], axis=0)
|
||||
label2 = tf.concat([label2, label_2], axis=0)
|
||||
else:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
|
||||
if k % batch_size == 0:
|
||||
# label = tf.expand_dims(label, axis=-1)
|
||||
output1, output2, output3, _ = step_one_model.call(inputs=data, is_first_time=True)
|
||||
loss_value, accuracy_value = step_two_model.train(input_tensor=data, label1=label1, label2=label2,
|
||||
learning_rate=learning_rate,
|
||||
is_first_time=False, pred_3=output1, pred_4=output2,
|
||||
pred_5=output3)
|
||||
accuracy_num += accuracy_value
|
||||
print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy(), "| accuracy:",
|
||||
accuracy_num / ((z + 1) * batch_size))
|
||||
k = 0
|
||||
z = z + 1
|
||||
k = k + 1
|
||||
|
||||
val_loss, val_accuracy = step_two_model.get_val_loss(val_data=val_data, val_label1=val_label1,
|
||||
val_label2=val_label2,
|
||||
is_first_time=False, step_one_model=step_one_model)
|
||||
SaveBestModelByAccuracy(model=step_two_model, save_name=save_step_two_name, history_accuracy=history_accuracy,
|
||||
accuracy_value=val_accuracy)
|
||||
history_val_loss.append(val_loss)
|
||||
history_loss.append(loss_value.numpy())
|
||||
print('Training loss is : {0} | Training accuracy is : {1}'.format(loss_value.numpy(),
|
||||
accuracy_num / ((z + 1) * batch_size)))
|
||||
print('Validating loss is : {0} | Validating accuracy is : {1}'.format(val_loss.numpy(), val_accuracy))
|
||||
if IsStopTraining(history_loss=history_val_loss, patience=7):
|
||||
break
|
||||
if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
|
||||
if learning_rate >= 1e-4:
|
||||
learning_rate = learning_rate * 0.1
|
||||
pass
|
||||
|
||||
|
||||
def test(step_one_model, step_two_model, test_data, test_label1, test_label2):
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
|
||||
val_loss, val_accuracy = step_two_model.get_val_loss(val_data=test_data, val_label1=test_label1,
|
||||
val_label2=test_label2,
|
||||
is_first_time=False, step_one_model=step_one_model)
|
||||
|
||||
history_val_loss.append(val_loss)
|
||||
print("val_accuracy:", val_accuracy)
|
||||
print("val_loss:", val_loss)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
total_data = loadData.execute(N=feature_num, file_name=file_name)
|
||||
total_data = normalization(data=total_data)
|
||||
train_data_healthy, train_label1_healthy, train_label2_healthy = get_training_data_overlapping(
|
||||
total_data[:healthy_date, :], is_Healthy=True)
|
||||
train_data_unhealthy, train_label1_unhealthy, train_label2_unhealthy = get_training_data_overlapping(
|
||||
total_data[healthy_date - time_stamp + unhealthy_patience:unhealthy_date, :],
|
||||
is_Healthy=False)
|
||||
# TODO step-one training
|
||||
# quick single-batch test run
|
||||
# train_step_one(train_data=train_data_healthy[:32, :, :], train_label1=train_label1_healthy[:32, :],train_label2=train_label2_healthy[:32, ])
|
||||
# train_step_one(train_data=train_data_healthy, train_label1=train_label1_healthy,train_label2=train_label2_healthy)
|
||||
|
||||
# Load the model trained in step one: one copy continues training, the other is only used for inference
|
||||
step_one_model = Joint_Monitoring()
|
||||
step_one_model.load_weights(save_name)
|
||||
#
|
||||
# step_two_model = Joint_Monitoring()
|
||||
# step_two_model.load_weights(save_name)
|
||||
|
||||
# TODO step-two training
|
||||
### healthy_data.shape: (300333,120,10)
|
||||
### unhealthy_data.shape: (16594,10)
|
||||
healthy_size, _, _ = train_data_healthy.shape
|
||||
unhealthy_size, _, _ = train_data_unhealthy.shape
|
||||
train_data, train_label1, train_label2, test_data, test_label1, test_label2 = split_test_data(
|
||||
healthy_data=train_data_healthy[healthy_size - 2 * unhealthy_size:, :, :],
|
||||
healthy_label1=train_label1_healthy[healthy_size - 2 * unhealthy_size:, :],
|
||||
healthy_label2=train_label2_healthy[healthy_size - 2 * unhealthy_size:, ], unhealthy_data=train_data_unhealthy,
|
||||
unhealthy_label1=train_label1_unhealthy, unhealthy_label2=train_label2_unhealthy)
|
||||
# train_step_two(step_one_model=step_one_model, step_two_model=step_two_model,
|
||||
# train_data=train_data,
|
||||
# train_label1=train_label1, train_label2=np.expand_dims(train_label2, axis=-1))
|
||||
|
||||
# TODO evaluate on the test set
|
||||
step_two_model = Joint_Monitoring()
|
||||
step_two_model.load_weights(save_step_two_name)
|
||||
test(step_one_model=step_one_model, step_two_model=step_two_model, test_data=test_data, test_label1=test_label1,
|
||||
test_label2=np.expand_dims(test_label2, axis=-1))
|
||||
|
||||
pass
|
||||
|
|
@ -0,0 +1,576 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# coding: utf-8
|
||||
import tensorflow as tf
|
||||
import tensorflow.keras
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from model.DepthwiseCon1D.DepthwiseConv1D import DepthwiseConv1D
|
||||
from model.Dynamic_channelAttention.Dynamic_channelAttention import DynamicChannelAttention
|
||||
from condition_monitoring.data_deal import loadData
|
||||
from model.Joint_Monitoring.Joint_Monitoring3 import Joint_Monitoring
|
||||
|
||||
from model.CommonFunction.CommonFunction import *
|
||||
from sklearn.model_selection import train_test_split
|
||||
from tensorflow.keras.models import load_model, save_model
|
||||
|
||||
'''
|
||||
@Author : dingjiawen
|
||||
@Date : 2022/7/8 10:29
|
||||
@Usage : Combine the prediction and classification approaches for joint condition monitoring
|
||||
@Desc : RepVGG + upsampling + GRU for reconstruction, followed by GDP (global dynamic pooling) and a classifier;
|
||||
an MSE loss whose weight decays with the epoch plus a cross-entropy loss whose weight grows with the epoch
|
||||
'''
|
||||
|
||||
'''Hyperparameter settings'''
|
||||
time_stamp = 120
|
||||
feature_num = 10
|
||||
batch_size = 16
|
||||
learning_rate = 0.001
|
||||
EPOCH = 101
|
||||
model_name = "joint"
|
||||
'''EWMA hyperparameters'''
|
||||
K = 18
|
||||
namuda = 0.01
|
||||
'''Save paths'''
|
||||
|
||||
save_name = "../hard_model/weight/{0}_timestamp{1}_feature{2}_weight_epoch8/weight".format(model_name,
|
||||
time_stamp,
|
||||
feature_num,
|
||||
batch_size,
|
||||
EPOCH)
|
||||
save_step_two_name = "../hard_model/two_weight/{0}_timestamp{1}_feature{2}_weight_epoch14/weight".format(model_name,
|
||||
time_stamp,
|
||||
feature_num,
|
||||
batch_size,
|
||||
EPOCH)
|
||||
|
||||
# save_name = "../model/joint/{0}_timestamp{1}_feature{2}.h5".format(model_name,
|
||||
# time_stamp,
|
||||
# feature_num,
|
||||
# batch_size,
|
||||
# EPOCH)
|
||||
# save_step_two_name = "../model/joint_two/{0}_timestamp{1}_feature{2}.h5".format(model_name,
|
||||
# time_stamp,
|
||||
# feature_num,
|
||||
# batch_size,
|
||||
# EPOCH)
|
||||
'''Data file'''
|
||||
file_name = "G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"
|
||||
|
||||
'''
|
||||
File description: jb4q_8_delete_total_zero.csv only removes the columns that are entirely zero.
|
||||
Rows 0:415548 are healthy data (2019/7/30 00:00:00 - 2019/9/18 11:14:00)
|
||||
Rows 415549:432153 are faulty data (2019/9/18 11:21:01 - 2021/1/18 00:00:00)
|
||||
'''
|
||||
'''File parameters'''
|
||||
# last healthy row index
|
||||
healthy_date = 415548
|
||||
# last faulty row index
|
||||
unhealthy_date = 432153
|
||||
# fault-onset tolerance (number of rows)
|
||||
unhealthy_patience = 5
|
||||
|
||||
|
||||
def remove(data, time_stamp=time_stamp):
|
||||
rows, cols = data.shape
|
||||
print("remove_data.shape:", data.shape)
|
||||
num = int(rows / time_stamp)
|
||||
|
||||
return data[:num * time_stamp, :]
|
||||
pass
|
||||
|
||||
|
||||
# Non-overlapping sampling
|
||||
def get_training_data(data, time_stamp: int = time_stamp):
|
||||
removed_data = remove(data=data)
|
||||
rows, cols = removed_data.shape
|
||||
print("removed_data.shape:", data.shape)
|
||||
print("removed_data:", removed_data)
|
||||
train_data = np.reshape(removed_data, [-1, time_stamp, cols])
|
||||
print("train_data:", train_data)
|
||||
batchs, time_stamp, cols = train_data.shape
|
||||
|
||||
for i in range(1, batchs):
|
||||
each_label = np.expand_dims(train_data[i, 0, :], axis=0)
|
||||
if i == 1:
|
||||
train_label = each_label
|
||||
else:
|
||||
train_label = np.concatenate([train_label, each_label], axis=0)
|
||||
|
||||
print("train_data.shape:", train_data.shape)
|
||||
print("train_label.shape", train_label.shape)
|
||||
return train_data[:-1, :], train_label
|
||||
|
||||
|
||||
# Overlapping (sliding-window) sampling
|
||||
def get_training_data_overlapping(data, time_stamp: int = time_stamp, is_Healthy: bool = True):
|
||||
rows, cols = data.shape
|
||||
train_data = np.empty(shape=[rows - time_stamp - 1, time_stamp, cols])
|
||||
train_label = np.empty(shape=[rows - time_stamp - 1, cols])
|
||||
for i in range(rows):
|
||||
if i + time_stamp >= rows:
|
||||
break
|
||||
if i + time_stamp < rows - 1:
|
||||
train_data[i] = data[i:i + time_stamp]
|
||||
train_label[i] = data[i + time_stamp]
|
||||
|
||||
print("重叠采样以后:")
|
||||
print("data:", train_data) # (300334,120,10)
|
||||
print("label:", train_label) # (300334,10)
|
||||
|
||||
if is_Healthy:
|
||||
train_label2 = np.ones(shape=[train_label.shape[0]])
|
||||
else:
|
||||
train_label2 = np.zeros(shape=[train_label.shape[0]])
|
||||
|
||||
print("label2:", train_label2)
|
||||
|
||||
return train_data, train_label, train_label2
|
||||
|
||||
|
||||
# RepConv: RepVGG-style re-parameterizable convolution (k-sized conv + 1x1 conv + identity, each batch-normalized, summed, then ReLU)
|
||||
def RepConv(input_tensor, k=3):
|
||||
_, _, output_dim = input_tensor.shape
|
||||
conv1 = tf.keras.layers.Conv1D(filters=output_dim, kernel_size=k, strides=1, padding='SAME')(input_tensor)
|
||||
b1 = tf.keras.layers.BatchNormalization()(conv1)
|
||||
|
||||
conv2 = tf.keras.layers.Conv1D(filters=output_dim, kernel_size=1, strides=1, padding='SAME')(input_tensor)
|
||||
b2 = tf.keras.layers.BatchNormalization()(conv2)
|
||||
|
||||
b3 = tf.keras.layers.BatchNormalization()(input_tensor)
|
||||
|
||||
out = tf.keras.layers.Add()([b1, b2, b3])
|
||||
out = tf.nn.relu(out)
|
||||
return out
|
||||
|
||||
|
||||
# RepBlock: a stack of num RepConv layers
|
||||
def RepBlock(input_tensor, num: int = 3):
|
||||
for i in range(num):
|
||||
input_tensor = RepConv(input_tensor)
|
||||
return input_tensor
|
||||
|
||||
|
||||
# GAP: global-average-pooling channel attention
|
||||
def Global_avg_channelAttention(input_tensor):
|
||||
_, length, channel = input_tensor.shape
|
||||
DWC1 = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
|
||||
GAP = tf.keras.layers.GlobalAvgPool1D()(DWC1)
|
||||
c1 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GAP)
|
||||
s1 = tf.nn.sigmoid(c1)
|
||||
output = tf.multiply(input_tensor, s1)
|
||||
return output
|
||||
|
||||
|
||||
# GDP: global dynamic pooling (channel attention)
|
||||
def Global_Dynamic_channelAttention(input_tensor):
|
||||
_, length, channel = input_tensor.shape
|
||||
DWC1 = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
|
||||
|
||||
# GAP
|
||||
GAP = tf.keras.layers.GlobalAvgPool1D()(DWC1)
|
||||
c1 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GAP)
|
||||
s1 = tf.nn.sigmoid(c1)
|
||||
|
||||
# GMP
|
||||
GMP = tf.keras.layers.GlobalMaxPool1D()(DWC1)
|
||||
c2 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GMP)
|
||||
s3 = tf.nn.sigmoid(c2)
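# Note (added): as in the previous file, the GMP branch output s3 is never used below; only the GAP branch (s1) weights the output.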
|
||||
|
||||
output = tf.multiply(input_tensor, s1)
|
||||
return output
|
||||
|
||||
|
||||
# Min-max normalization
|
||||
def normalization(data):
|
||||
rows, cols = data.shape
|
||||
print("归一化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# scale each column to [0, 1]
|
||||
max = np.max(data, axis=0)
|
||||
max = np.broadcast_to(max, [rows, cols])
|
||||
min = np.min(data, axis=0)
|
||||
min = np.broadcast_to(min, [rows, cols])
|
||||
|
||||
data = (data - min) / (max - min)
|
||||
print("归一化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
# Standardization (z-score scaling)
|
||||
def Regularization(data):
|
||||
rows, cols = data.shape
|
||||
print("正则化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# subtract the column mean and divide by the column standard deviation
|
||||
mean = np.mean(data, axis=0)
|
||||
mean = np.broadcast_to(mean, shape=[rows, cols])
|
||||
dst = np.sqrt(np.var(data, axis=0))
|
||||
dst = np.broadcast_to(dst, shape=[rows, cols])
|
||||
data = (data - mean) / dst
|
||||
print("正则化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
pass
|
||||
|
||||
|
||||
def EWMA(data, K=K, namuda=namuda):
|
||||
# the meaning of t (the EWMA sample index) is not settled yet; fixed to 0 for now
|
||||
t = 0
|
||||
mid = np.mean(data, axis=0)
|
||||
standard = np.sqrt(np.var(data, axis=0))
|
||||
UCL = mid + K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
|
||||
LCL = mid - K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
|
||||
return mid, UCL, LCL
|
||||
pass
|
||||
|
||||
|
||||
def get_MSE(data, label, new_model):
|
||||
predicted_data = new_model.predict(data)
|
||||
|
||||
temp = np.abs(predicted_data - label)
|
||||
temp1 = (temp - np.broadcast_to(np.mean(temp, axis=0), shape=predicted_data.shape))
|
||||
temp2 = np.broadcast_to(np.sqrt(np.var(temp, axis=0)), shape=predicted_data.shape)
|
||||
temp3 = temp1 / temp2
|
||||
mse = np.sum((temp1 / temp2) ** 2, axis=1)
|
||||
print("z:", mse)
|
||||
print(mse.shape)
|
||||
|
||||
# mse=np.mean((predicted_data-label)**2,axis=1)
|
||||
print("mse", mse)
|
||||
|
||||
dims, = mse.shape
|
||||
|
||||
mean = np.mean(mse)
|
||||
std = np.sqrt(np.var(mse))
|
||||
max = mean + 3 * std
|
||||
# min = mean-3*std
|
||||
max = np.broadcast_to(max, shape=[dims, ])
|
||||
# min = np.broadcast_to(min,shape=[dims,])
|
||||
mean = np.broadcast_to(mean, shape=[dims, ])
|
||||
|
||||
# plt.plot(max)
|
||||
# plt.plot(mse)
|
||||
# plt.plot(mean)
|
||||
# # plt.plot(min)
|
||||
# plt.show()
|
||||
#
|
||||
#
|
||||
return mse, mean, max
|
||||
# pass
|
||||
|
||||
|
||||
def condition_monitoring_model():
|
||||
input = tf.keras.Input(shape=[time_stamp, feature_num])
|
||||
conv1 = tf.keras.layers.Conv1D(filters=256, kernel_size=1)(input)
|
||||
GRU1 = tf.keras.layers.GRU(128, return_sequences=False)(conv1)
|
||||
d1 = tf.keras.layers.Dense(300)(GRU1)
|
||||
output = tf.keras.layers.Dense(10)(d1)
|
||||
|
||||
model = tf.keras.Model(inputs=input, outputs=output)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
# train_data: (300455, 120, 10)
|
||||
# train_label1: (300455, 10)
|
||||
# train_label2: (300455,)
|
||||
def shuffle(train_data, train_label1, train_label2, is_split: bool = False, split_size: float = 0.2):
|
||||
(train_data, test_data, train_label1, test_label1, train_label2, test_label2) = train_test_split(train_data,
|
||||
train_label1,
|
||||
train_label2,
|
||||
test_size=split_size,
|
||||
shuffle=True,
|
||||
random_state=100)
|
||||
if is_split:
|
||||
return train_data, train_label1, train_label2, test_data, test_label1, test_label2
|
||||
train_data = np.concatenate([train_data, test_data], axis=0)
|
||||
train_label1 = np.concatenate([train_label1, test_label1], axis=0)
|
||||
train_label2 = np.concatenate([train_label2, test_label2], axis=0)
|
||||
# print(train_data.shape)
|
||||
# print(train_label1.shape)
|
||||
# print(train_label2.shape)
|
||||
# print(train_data.shape)
|
||||
|
||||
return train_data, train_label1, train_label2
|
||||
pass
|
||||
|
||||
|
||||
def split_test_data(healthy_data, healthy_label1, healthy_label2, unhealthy_data, unhealthy_label1, unhealthy_label2,
|
||||
split_size: float = 0.2, shuffle: bool = True):
|
||||
data = np.concatenate([healthy_data, unhealthy_data], axis=0)
|
||||
label1 = np.concatenate([healthy_label1, unhealthy_label1], axis=0)
|
||||
label2 = np.concatenate([healthy_label2, unhealthy_label2], axis=0)
|
||||
(train_data, test_data, train_label1, test_label1, train_label2, test_label2) = train_test_split(data,
|
||||
label1,
|
||||
label2,
|
||||
test_size=split_size,
|
||||
shuffle=shuffle,
|
||||
random_state=100)
|
||||
|
||||
# print(train_data.shape)
|
||||
# print(train_label1.shape)
|
||||
# print(train_label2.shape)
|
||||
# print(train_data.shape)
|
||||
|
||||
return train_data, train_label1, train_label2, test_data, test_label1, test_label2
|
||||
|
||||
pass
|
||||
|
||||
|
||||
# train_data: (300455, 120, 10)
|
||||
# train_label1: (300455, 10)
|
||||
# train_label2: (300455,)
|
||||
def train_step_one(train_data, train_label1, train_label2):
|
||||
model = Joint_Monitoring()
|
||||
# # # # TODO the model must be built / called once before model.summary() can be printed
|
||||
# model.build(input_shape=(batch_size, filter_num, dims))
|
||||
# model.summary()
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
learning_rate = 1e-3
|
||||
for epoch in range(EPOCH):
|
||||
|
||||
print()
|
||||
print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
|
||||
if epoch == 0:
|
||||
train_data, train_label1, train_label2, val_data, val_label1, val_label2 = shuffle(train_data, train_label1,
|
||||
train_label2,
|
||||
is_split=True)
|
||||
# print()
|
||||
# print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
# z tells train() which mini-batch of this epoch we are on
|
||||
z = 0
|
||||
# k counts samples; a training step is run once every batch_size samples
|
||||
k = 1
|
||||
for data_1, label_1, label_2 in zip(train_data, train_label1, train_label2):
|
||||
size, _, _ = train_data.shape
|
||||
data_1 = tf.expand_dims(data_1, axis=0)
|
||||
label_1 = tf.expand_dims(label_1, axis=0)
|
||||
label_2 = tf.expand_dims(label_2, axis=0)
|
||||
if batch_size != 1:
|
||||
if k % batch_size == 1:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
else:
|
||||
data = tf.concat([data, data_1], axis=0)
|
||||
label1 = tf.concat([label1, label_1], axis=0)
|
||||
label2 = tf.concat([label2, label_2], axis=0)
|
||||
else:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
|
||||
if k % batch_size == 0:
|
||||
# label = tf.expand_dims(label, axis=-1)
|
||||
loss_value, accuracy_value = model.train(input_tensor=data, label1=label1, label2=label2,
|
||||
learning_rate=learning_rate,
|
||||
is_first_time=True)
|
||||
print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy())
|
||||
k = 0
|
||||
z = z + 1
|
||||
k = k + 1
|
||||
val_loss, val_accuracy = model.get_val_loss(val_data=val_data, val_label1=val_label1, val_label2=val_label2,
|
||||
is_first_time=True)
|
||||
SaveBestModel(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
|
||||
# SaveBestH5Model(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
|
||||
history_val_loss.append(val_loss)
|
||||
history_loss.append(loss_value.numpy())
|
||||
print('Training loss is :', loss_value.numpy())
|
||||
print('Validating loss is :', val_loss.numpy())
|
||||
if IsStopTraining(history_loss=history_val_loss, patience=7):
|
||||
break
|
||||
if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
|
||||
if learning_rate >= 1e-4:
|
||||
learning_rate = learning_rate * 0.1
|
||||
pass
|
||||
|
||||
|
||||
def train_step_two(step_one_model, step_two_model, train_data, train_label1, train_label2):
|
||||
# step_two_model = Joint_Monitoring()
|
||||
# step_two_model.build(input_shape=(batch_size, time_stamp, feature_num))
|
||||
# step_two_model.summary()
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
history_accuracy = []
|
||||
learning_rate = 1e-3
|
||||
for epoch in range(EPOCH):
|
||||
print()
|
||||
print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
|
||||
if epoch == 0:
|
||||
train_data, train_label1, train_label2, val_data, val_label1, val_label2 = shuffle(train_data, train_label1,
|
||||
train_label2,
|
||||
is_split=True)
|
||||
# print()
|
||||
# print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
# z tells train() which mini-batch of this epoch we are on
|
||||
z = 0
|
||||
# k counts samples; a training step is run once every batch_size samples
|
||||
k = 1
|
||||
accuracy_num = 0
|
||||
for data_1, label_1, label_2 in zip(train_data, train_label1, train_label2):
|
||||
size, _, _ = train_data.shape
|
||||
data_1 = tf.expand_dims(data_1, axis=0)
|
||||
label_1 = tf.expand_dims(label_1, axis=0)
|
||||
label_2 = tf.expand_dims(label_2, axis=0)
|
||||
if batch_size != 1:
|
||||
if k % batch_size == 1:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
else:
|
||||
data = tf.concat([data, data_1], axis=0)
|
||||
label1 = tf.concat([label1, label_1], axis=0)
|
||||
label2 = tf.concat([label2, label_2], axis=0)
|
||||
else:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
|
||||
if k % batch_size == 0:
|
||||
# label = tf.expand_dims(label, axis=-1)
|
||||
output1, output2, output3, _ = step_one_model.call(inputs=data, is_first_time=True)
|
||||
loss_value, accuracy_value = step_two_model.train(input_tensor=data, label1=label1, label2=label2,
|
||||
learning_rate=learning_rate,
|
||||
is_first_time=False, pred_3=output1, pred_4=output2,
|
||||
pred_5=output3)
|
||||
accuracy_num += accuracy_value
|
||||
print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy(), "| accuracy:",
|
||||
accuracy_num / ((z + 1) * batch_size))
|
||||
k = 0
|
||||
z = z + 1
|
||||
k = k + 1
|
||||
|
||||
val_loss, val_accuracy = step_two_model.get_val_loss(val_data=val_data, val_label1=val_label1,
|
||||
val_label2=val_label2,
|
||||
is_first_time=False, step_one_model=step_one_model)
|
||||
SaveBestModelByAccuracy(model=step_two_model, save_name=save_step_two_name, history_accuracy=history_accuracy,
|
||||
accuracy_value=val_accuracy)
|
||||
history_val_loss.append(val_loss)
|
||||
history_loss.append(loss_value.numpy())
|
||||
history_accuracy.append(val_accuracy)
|
||||
print('Training loss is : {0} | Training accuracy is : {1}'.format(loss_value.numpy(),
|
||||
accuracy_num / ((z + 1) * batch_size)))
|
||||
print('Validating loss is : {0} | Validating accuracy is : {1}'.format(val_loss.numpy(), val_accuracy))
|
||||
if IsStopTraining(history_loss=history_val_loss, patience=7):
|
||||
break
|
||||
if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
|
||||
if learning_rate >= 1e-4:
|
||||
learning_rate = learning_rate * 0.1
|
||||
pass
|
||||
|
||||
|
||||
def test(step_one_model, step_two_model, test_data, test_label1, test_label2):
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
|
||||
val_loss, val_accuracy = step_two_model.get_val_loss(val_data=test_data, val_label1=test_label1,
|
||||
val_label2=test_label2,
|
||||
is_first_time=False, step_one_model=step_one_model)
|
||||
|
||||
history_val_loss.append(val_loss)
|
||||
print("val_accuracy:", val_accuracy)
|
||||
print("val_loss:", val_loss)
|
||||
|
||||
|
||||
def showResult(step_two_model: Joint_Monitoring, test_data, isPlot: bool = False):
|
||||
# count the model's total number of parameters
|
||||
# step_two_model.count_params()
|
||||
total_result = []
|
||||
size, length, dims = test_data.shape
|
||||
for epoch in range(0, size - batch_size + 1, batch_size):
|
||||
each_test_data = test_data[epoch:epoch + batch_size, :, :]
|
||||
_, _, _, output4 = step_two_model.call(each_test_data, is_first_time=False)
|
||||
total_result.append(output4)
|
||||
total_result = np.reshape(total_result, [total_result.__len__(), -1])
|
||||
total_result = np.reshape(total_result, [-1, ])
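# total_result is the flattened 1-D sequence of the classifier output (output4), i.e. one confidence value per test sample across the whole span.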
|
||||
if isPlot:
|
||||
plt.scatter(list(range(total_result.shape[0])), total_result, c='black', s=10)
|
||||
# draw the horizontal failure-threshold line (y = 0.5)
|
||||
plt.axhline(0.5, c='red', label='Failure threshold')
|
||||
# (optional) arrow pointing at the threshold line above
|
||||
# plt.arrow(35000, 0.9, 33000, 0.75, head_width=0.02, head_length=0.1, shape="full", fc='red', ec='red',
|
||||
# alpha=0.9, overhang=0.5)
|
||||
# plt.text(35000, 0.9, "Truth Fault", fontsize=10, color='black', verticalalignment='top')
|
||||
plt.axvline(test_data.shape[0] * 2 / 3, c='blue', ls='-.')
|
||||
plt.xlabel("time")
|
||||
plt.ylabel("confience")
|
||||
plt.text(total_result.shape[0] * 4 / 5, 0.6, "Fault", fontsize=10, color='black', verticalalignment='top',
|
||||
horizontalalignment='center',
|
||||
bbox={'facecolor': 'grey',
|
||||
'pad': 10})
|
||||
plt.text(total_result.shape[0] * 1 / 3, 0.4, "Norm", fontsize=10, color='black', verticalalignment='top',
|
||||
horizontalalignment='center',
|
||||
bbox={'facecolor': 'grey',
|
||||
'pad': 10})
|
||||
plt.grid()
|
||||
# plt.ylim(0, 1)
|
||||
# plt.xlim(-50, 1300)
|
||||
# plt.legend("", loc='upper left')
|
||||
plt.show()
|
||||
return total_result
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
total_data = loadData.execute(N=feature_num, file_name=file_name)
|
||||
total_data = normalization(data=total_data)
|
||||
train_data_healthy, train_label1_healthy, train_label2_healthy = get_training_data_overlapping(
|
||||
total_data[:healthy_date, :], is_Healthy=True)
|
||||
train_data_unhealthy, train_label1_unhealthy, train_label2_unhealthy = get_training_data_overlapping(
|
||||
total_data[healthy_date - time_stamp + unhealthy_patience:unhealthy_date, :],
|
||||
is_Healthy=False)
|
||||
#### TODO step-one training
|
||||
# quick single-batch test run
|
||||
# train_step_one(train_data=train_data_healthy[:32, :, :], train_label1=train_label1_healthy[:32, :],train_label2=train_label2_healthy[:32, ])
|
||||
# train_step_one(train_data=train_data_healthy, train_label1=train_label1_healthy, train_label2=train_label2_healthy)
|
||||
|
||||
# Load the model trained in step one: one copy continues training, the other is only used for inference
|
||||
# step_one_model = Joint_Monitoring()
|
||||
# step_one_model.load_weights(save_name)
|
||||
#
|
||||
# step_two_model = Joint_Monitoring()
|
||||
# step_two_model.load_weights(save_name)
|
||||
|
||||
#### TODO step-two training
|
||||
### healthy_data.shape: (300333,120,10)
|
||||
### unhealthy_data.shape: (16594,10)
|
||||
healthy_size, _, _ = train_data_healthy.shape
|
||||
unhealthy_size, _, _ = train_data_unhealthy.shape
|
||||
# train_data, train_label1, train_label2, test_data, test_label1, test_label2 = split_test_data(
|
||||
# healthy_data=train_data_healthy[healthy_size - 2 * unhealthy_size:, :, :],
|
||||
# healthy_label1=train_label1_healthy[healthy_size - 2 * unhealthy_size:, :],
|
||||
# healthy_label2=train_label2_healthy[healthy_size - 2 * unhealthy_size:, ], unhealthy_data=train_data_unhealthy,
|
||||
# unhealthy_label1=train_label1_unhealthy, unhealthy_label2=train_label2_unhealthy)
|
||||
# train_step_two(step_one_model=step_one_model, step_two_model=step_two_model,
|
||||
# train_data=train_data,
|
||||
# train_label1=train_label1, train_label2=np.expand_dims(train_label2, axis=-1))
|
||||
|
||||
### TODO evaluate on the held-out test set
|
||||
step_one_model = Joint_Monitoring()
|
||||
step_one_model.load_weights(save_name)
|
||||
step_two_model = Joint_Monitoring()
|
||||
step_two_model.load_weights(save_step_two_name)
|
||||
# test(step_one_model=step_one_model, step_two_model=step_two_model, test_data=test_data, test_label1=test_label1,
|
||||
# test_label2=np.expand_dims(test_label2, axis=-1))
|
||||
|
||||
### TODO plot the results over the full sequence
|
||||
all_data, _, _ = get_training_data_overlapping(
|
||||
total_data[healthy_size - 2 * unhealthy_size:unhealthy_date, :], is_Healthy=True)
|
||||
# all_data = np.concatenate([])
|
||||
# quick single-batch test run
|
||||
# showResult(step_two_model, test_data=all_data[:32], isPlot=True)
|
||||
showResult(step_two_model, test_data=all_data, isPlot=True)
|
||||
|
||||
pass
|
||||