2023-11-08 crawler learning update
This commit is contained in:
parent df0f410f8f
commit d8746d192f
@@ -0,0 +1,244 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 16:08
@Usage  :
@Desc   : reference: https://github.com/Python3WebSpider/BeautifulSoupTest
'''

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

from bs4 import BeautifulSoup


def baseUse():
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title)            # <title>The Dormouse's story</title>
    print(type(soup.title))      # <class 'bs4.element.Tag'>
    print(soup.title.string)     # The Dormouse's story
    print(soup.head)             # <head><title>The Dormouse's story</title></head>
    print(soup.p)                # <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    print(soup.p.name)           # node name: p
    print(soup.p.attrs)          # attributes: {'class': ['title'], 'name': 'dromouse'}
    print(soup.p.attrs['name'])  # attribute value: dromouse
    print(soup.p['name'])        # attribute value: dromouse
    print(soup.body.p['name'])   # nested selection: dromouse

    print("==========================")


def child():
    html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html, 'lxml')
    # direct children
    for i, child in enumerate(soup.p.children):
        print(i, child)
    print("===============================")
    # descendants
    for i, child in enumerate(soup.p.descendants):
        print(i, child)
    print("===============================")


def parent():
    soup = BeautifulSoup(html, 'lxml')
    # parent node
    print(soup.a.parent)
    print("===============================")
    # ancestor nodes
    print(type(soup.a.parents))
    print(list(enumerate(soup.a.parents)))
    print("=============================")


def brother():
    html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
Hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
"""
    # sibling nodes
    soup = BeautifulSoup(html, 'lxml')
    print('Next Sibling', soup.a.next_sibling)
    print('Prev Sibling', soup.a.previous_sibling)
    print('Next Siblings', list(enumerate(soup.a.next_siblings)))
    print('Prev Siblings', list(enumerate(soup.a.previous_siblings)))


# find every node matching the condition
def findAll():
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(name='ul'))
    print(type(soup.find_all(name='ul')[0]))

    for ul in soup.find_all(name='ul'):
        print(ul.find_all(name='li'))

    for ul in soup.find_all(name='ul'):
        print(ul.find_all(name='li'))
        for li in ul.find_all(name='li'):
            print(li.string)


# find nodes whose attributes match
def attrs():
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(attrs={'id': 'list-1'}))
    print(soup.find_all(attrs={'name': 'elements'}))

    # common attributes can be passed directly, without attrs
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(id='list-1'))
    print(soup.find_all(class_='element'))
    import re
    print(soup.find_all(string=re.compile('Foo')))  # string is equivalent to text, i.e. the node's text content


# return the first matching element
def find():
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find(name='ul'))
    print(type(soup.find(name='ul')))
    print(soup.find(class_='list'))


# CSS selectors
def cssSelect():
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''

    soup = BeautifulSoup(html, 'lxml')
    print(soup.select('.panel .panel-heading'))
    print(soup.select('ul li'))
    print(soup.select('#list-2 .element'))
    print(type(soup.select('ul')[0]))

    # nested selection
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul.select('li'))

    # get attributes
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul['id'])
        print(ul.attrs['id'])

    # get text
    soup = BeautifulSoup(html, 'lxml')
    for li in soup.select('li'):
        print('Get Text:', li.get_text())
        print('String:', li.string)


if __name__ == '__main__':
    cssSelect()
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 16:07
@Usage  :
@Desc   :
'''
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 16:54
@Usage  :
@Desc   :
'''
@@ -0,0 +1,329 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 16:54
@Usage  :
@Desc   : pyquery study, reference: https://github.com/Python3WebSpider/PyQueryTest
'''
from pyquery import PyQuery as pq


# initialize from a string
def stringBase():
    html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''

    doc = pq(html)
    print(doc('li'))


# initialize from a URL
def URLBase():
    doc = pq(url='https://cuiqingcai.com')
    print(doc('title'))

    # the code above is equivalent to the following
    # doc = pq(requests.get('https://cuiqingcai.com').text)
    # print(doc('title'))


# initialize from a file
def fileBase():
    doc = pq(filename='demo.html')
    print(doc('li'))


# basic CSS selectors
def cssSelect():
    html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
    doc = pq(html)
    print(doc('#container .list li'))
    print(type(doc('#container .list li')))

    # iterate over the matched nodes
    for item in doc('#container .list li').items():
        print(item.text())


# finding child nodes
def child():
    html = '''
<div>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
    doc = pq(html)
    items = doc('.list')
    print(type(items))
    print(items)
    lis = items.find('li')
    print(type(lis))
    print(lis)

    # children() returns the direct children
    lis = items.children()
    print(type(lis))
    print(lis)

    # children() can also take a selector
    lis = items.children('.active')
    print(lis)


def parent():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    container = items.parent()
    print(type(container))
    print(container)

    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    parents = items.parents()
    print(type(parents))
    print(parents)

    parent = items.parents('.wrap')
    print(parent)

    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.list .item-0.active')
    print(li.siblings())


def brother():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.list .item-0.active')
    print(li.siblings('.active'))

    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    print(str(li))

    from pyquery import PyQuery as pq
    doc = pq(html)
    # the selection may contain several nodes
    lis = doc('li').items()
    print(type(lis))
    for li in lis:
        print(li, type(li))


def attrs():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('.item-0.active a')
    print(a, type(a))
    print(a.attr('href'))

    a = doc('a')
    print(a, type(a))
    print(a.attr('href'))
    print(a.attr.href)

    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('a')
    for item in a.items():
        # get the attribute and the text
        print(item.attr('href'), item.text())


def getHTML():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('li')
    print(li.html())  # html of the first matched node: <a href="link2.html">second item</a>
    print(li.text())  # text of every matched node: second item third item fourth item fifth item
    print(type(li.text()))


# add or remove a node's class
def operateNode():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.removeClass('active')
    print(li)
    li.addClass('active')
    print(li)

    '''
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

    <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>

    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    '''


def operateNodeInformation():
    html = '''
<ul class="list">
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
</ul>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.attr('name', 'link')
    print(li)
    li.text('changed item')
    print(li)
    li.html('<span>changed item</span>')
    print(li)
    '''
    <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-0 active" name="link">changed item</li>
    <li class="item-0 active" name="link"><span>changed item</span></li>
    '''


def removeInformation():
    html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    wrap = doc('.wrap')
    print(wrap.text())
    '''
    Hello, World
    This is a paragraph.
    '''
    wrap.find('p').remove()
    print(wrap.text())
    '''
    Hello, World
    '''


# pseudo-class selectors
def fakeCSSSelect():
    html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('li:first-child')
    print(li)
    li = doc('li:last-child')
    print(li)
    li = doc('li:nth-child(2)')
    print(li)
    li = doc('li:gt(2)')
    print(li)
    li = doc('li:nth-child(2n)')
    print(li)
    li = doc('li:contains(second)')
    print(li)


if __name__ == '__main__':
    fakeCSSSelect()
@@ -0,0 +1,195 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 15:15
@Usage  :
@Desc   :
'''

from lxml import etree

'''
XPath basic rules:

1) nodename: selects all child nodes of the named node
2) /: selects direct children of the current node
3) //: selects descendants of the current node
4) .: selects the current node
5) ..: selects the parent of the current node
6) @: selects attributes

Example:
//title[@lang='eng'] selects every node named title whose lang attribute equals eng
'''
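# A minimal sketch (not part of the original file) illustrating the rules above on a
# tiny inline document; the variable names are illustrative only.
#
# _demo = etree.HTML('<ul><li lang="eng"><a href="a.html">eng item</a></li>'
#                    '<li lang="zh"><a href="b.html">zh item</a></li></ul>')
# _demo.xpath('//li')                        # rule 3: every li descendant
# _demo.xpath('//li/a')                      # rule 2: direct a children of li
# _demo.xpath('//li[@lang="eng"]/a/text()')  # rule 6: filter li by the lang attribute
# _demo.xpath('//a/..')                      # rule 5: step back up to the parent li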
def htmlByString():
    text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    html = etree.HTML(text)
    result = etree.tostring(html)
    print(result.decode('utf-8'))


def htmlByFile():
    html = etree.parse('./test.html', etree.HTMLParser())
    result = etree.tostring(html)
    print(result.decode('utf-8'))


def allNode():
    html = etree.parse('./test.html', etree.HTMLParser())
    # match every node from the root
    result = html.xpath('//*')
    print(result)
    print(result[0])

    # match every li node
    result = html.xpath('//li')
    print(result)
    print(result[0])


# matching child nodes
def childNode():
    html = etree.parse('./test.html', etree.HTMLParser())

    # every a that is a direct child of a li
    result = html.xpath('//li/a')
    print(result)
    print(result[0])

    # every a that is a descendant of a ul, i.e. anything below the children also matches
    result = html.xpath('//ul//a')
    print(result)
    print(result[0])


# matching parent nodes
def fatherNode():
    html = etree.parse('./test.html', etree.HTMLParser())

    # class attribute of the parent of the a node whose href is link4.html
    result = html.xpath('//a[@href="link4.html"]/../@class')
    print(result)
    # the same result via parent::
    result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
    print(result)


# extracting text
def textGet():
    html = etree.parse('./test.html', etree.HTMLParser())

    # text of the a children of li nodes whose class is item-0
    result = html.xpath('//li[@class="item-0"]/a/text()')
    print(result)  # ['first item', 'fifth item']

    # text of every descendant of li nodes whose class is item-0
    result = html.xpath('//li[@class="item-0"]//text()')
    print(result)  # ['first item', 'fifth item', '\r\n     ']


# extracting attributes
def fieldGet():
    html = etree.parse('./test.html', etree.HTMLParser())

    # href attribute of the a children of li nodes
    result = html.xpath('//li/a/@href')
    print(result)  # ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']


# matching an attribute that holds several values
def fieldsGet():
    text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
    html = etree.HTML(text)
    result = html.xpath('//li[@class="li"]/a/text()')
    print(result)  # [] no match

    result = html.xpath('//li[contains(@class, "li")]/a/text()')
    print(result)  # ['first item'] contains() matches


# matching on several attributes
def fieldssGet():
    text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
    html = etree.HTML(text)
    # join multiple attribute conditions with and
    result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
    print(result)


# selecting by position
def orderGet():
    text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    html = etree.HTML(text)
    result = html.xpath('//li[1]/a/text()')
    print(result)  # ['first item']
    result = html.xpath('//li[last()]/a/text()')
    print(result)  # ['fifth item']
    result = html.xpath('//li[position()<3]/a/text()')
    print(result)  # ['first item', 'second item']
    result = html.xpath('//li[last()-2]/a/text()')
    print(result)  # ['third item']


def nodeSelect():
    text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    html = etree.HTML(text)
    result = html.xpath('//li[1]/ancestor::*')
    print(result)
    # ancestor: all ancestors
    result = html.xpath('//li[1]/ancestor::div')
    print(result)
    # attribute: all attributes
    result = html.xpath('//li[1]/attribute::*')
    print(result)
    # child: child nodes
    result = html.xpath('//li[1]/child::a[@href="link1.html"]')
    print(result)
    # descendant: descendant nodes
    result = html.xpath('//li[1]/descendant::span')
    print(result)
    # following: every node after the current node
    result = html.xpath('//li[1]/following::*[2]')
    print(result)
    # following-sibling: the siblings that come after the current node
    result = html.xpath('//li[1]/following-sibling::*')
    print(result)


if __name__ == '__main__':
    nodeSelect()
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 15:15
@Usage  :
@Desc   :
'''
@@ -0,0 +1,9 @@
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/11/8 15:12
@Usage  :
@Desc   :
'''
@@ -0,0 +1,153 @@
# _*_ coding: UTF-8 _*_

'''
@Author : dingjiawen
@Date   : 2022/7/11 12:55
@Usage  :
@Desc   :
'''

import numpy as np
import pandas as pd
import time
# only measures the time spent while this program runs on the CPU
import timeit

# cat_sale = pd.read_excel('data/catering_sale.xls')
path = "G:\data\SCADA数据\jb4q_8.csv"
cat_sale = pd.read_csv(path)
# cat_sale.drop('日期', axis=1, inplace=True)

# filter outliers and set them to NaN
# cat_sale['销量'][(cat_sale['销量'] < 400) | (cat_sale['销量'] > 5000)] = np.NAN
# turn zero values into NaN; double brackets index arbitrary positions
# print(df['realtime'][1])
cat_sale[:][cat_sale[:] == 0] = np.nan  # when comparing by index, convert to the same type first (astype)


# functions for the divided differences and for the w terms
'''
:param x: indices of the points around the value to interpolate
:param y: values at those points
'''
def cal_f(x, y):
    """
    Compute the divided-difference table.
    """
    f0 = np.zeros((len(x), len(y)))  # array that stores the divided differences
    for k in range(len(y)):          # iterate over columns
        for i in range(k, len(x)):   # iterate over rows
            if k == 0:
                f0[i, k] = y[i]
            else:
                # a k-th order divided difference spans the points x[i-k] .. x[i]
                f0[i, k] = (f0[i, k - 1] - f0[i - 1, k - 1]) / (x[i] - x[i - k])
    # print('divided-difference table', '\n', f0)
    return f0


'''
:param x: indices of the points around the value to interpolate
:param y: values at those points
:param x_j: index of the value to interpolate
'''
def newton(x, y, x_j):
    """
    Newton interpolation polynomial.
    """
    f0 = cal_f(x, y)    # divided differences
    f0 = f0.diagonal()  # diagonal of the divided-difference table
    # multiply by the w terms
    f1 = 0
    for i in range(len(f0)):
        s = 1
        k = 0
        while k < i:
            s = s * (x_j - x[k])
            k += 1
        f1 = f1 + f0[i] * s
    return f1
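# Quick sanity check (illustrative only, not part of the original script): interpolating
# y = x**2 from three points should reproduce the quadratic exactly at x = 1.5.
#
# _x = [0, 1, 2]
# _y = [0, 1, 4]
# print(newton(_x, _y, 1.5))  # expected 2.25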
# column interpolation helper: take the values before and after the point to fill
'''
:param s: the whole series being interpolated
:param n: index of the value to interpolate
:param x_j: index of the value to interpolate
:param is_fast: fast mode takes the neighbours regardless of whether they are zero;
                otherwise keep searching until non-null values are found
:param k: how many values to take on each side
'''
def ployinterp_columns(s, n, x_j, is_fast: bool = False, k=3):
    X = []
    Y = []
    if is_fast:
        # fewer than k values before the point
        if n < k:
            a = list(range(0, n)) + list(range(n + 1, n + k + 1))
            y = s[list(range(0, n)) + list(range(n + 1, n + k + 1))]
        # fewer than k values after the point
        elif n > len(s) - k - 1:
            y = s[list(range(n - k, n)) + list(range(n + 1, len(s)))]
        # k values available on both sides
        else:
            y = s[list(range(n - k, n)) + list(range(n + 1, n + k + 1))]  # take the values around the gap
        y = y[y.notnull()]  # drop nulls
        X = y.index
        Y = list(y)
    else:
        # take k non-null values on each side of the gap
        index = n - 1
        while len(X) < k and index >= 0:
            if not np.isnan(s[index]):
                Y.append(s[index])
                X.append(index)
            index -= 1
        index = n + 1
        X.reverse()
        Y.reverse()

        while len(X) < 2 * k and index <= len(s):
            if not np.isnan(s[index]):
                Y.append(s[index])
                X.append(index)
            index += 1
        # print(X)
        # print(Y)

    return newton(X, Y, x_j)  # interpolate and return the result


def execute():
    cat_sale[:][cat_sale[:] == 0] = np.nan  # when comparing by index, convert to the same type first (astype)
    for i in cat_sale.columns:
        temp = cat_sale[i].isnull()
        if temp[:][temp[:] == True].__len__() > 0:
            print("{0}列处理前空行数:{1}".format(i, cat_sale[i].isnull().sum()))
            for j in range(len(cat_sale)):
                if (cat_sale[i].isnull())[j]:
                    x_j = cat_sale.index[j]
                    cat_sale.loc[j, i] = ployinterp_columns(cat_sale[i], j, x_j)
                    print('第{0}行牛顿插值为{1}'.format(j, cat_sale.loc[j, i]))
            print("{0}列处理后空行数:{1}".format(i, cat_sale[i].isnull().sum()))
            print("========================================")
    print(cat_sale)
    cat_sale.to_csv("G:\data\SCADA数据\jb4q_8_dealed.csv")
    # cat_sale.to_excel('saless.xls')


def test():
    cat_sale[:][cat_sale[:] == 0] = np.nan  # when comparing by index, convert to the same type first (astype)
    for j in range(len(cat_sale['num_gearbox_sumptemp'])):
        if (cat_sale['num_gearbox_sumptemp'].isnull())[j]:
            x_j = cat_sale.index[j]
            cat_sale.loc[j, 'num_gearbox_sumptemp'] = ployinterp_columns(cat_sale['num_gearbox_sumptemp'], j, x_j, is_fast=True)
            # print('第{0}行牛顿插值为{1}'.format(j, cat_sale.loc[j, 'num_gearbox_sumptemp']))


if __name__ == '__main__':
    start = timeit.default_timer()
    # execute()
    test()
    end = timeit.default_timer()
    print('Running time: %s Seconds' % (end - start))
    # the return value is a float
@@ -0,0 +1,96 @@
# _*_ coding: UTF-8 _*_

'''
@Author : dingjiawen
@Date   : 2022/7/11 11:43
@Usage  :
@Desc   :
'''

import numpy as np
import pandas as pd


# Lagrange interpolation
def LagrangeInterpolation(slices, x, k=5):
    # slices (series): the defining points
    # k: number of defining points of the Lagrange polynomial taken on each side
    # slices index: the corresponding value at each defining point
    # x: the point whose value we are interested in
    # print(slices[x])
    # print(np.isnan(slices[x]))
    result = 0  # later holds the final result
    X = []
    Y = []
    # take k non-null values on each side of the gap
    index = x - 1
    while len(X) < k and index >= 0:
        if not np.isnan(slices[index]):
            Y.append(slices[index])
            X.append(index)
        index -= 1
    index = x + 1
    X.reverse()
    Y.reverse()

    while len(X) < 2 * k and index <= len(slices):
        if not np.isnan(slices[index]):
            Y.append(slices[index])
            X.append(index)
        index += 1
    # print(X)
    # print(Y)

    for j in range(len(X)):
        # result_l: the j-th Lagrange basis polynomial evaluated at x
        result_l = 1
        for i in range(len(X)):
            if i != j:
                result_l = result_l * (x - X[i]) / (X[j] - X[i])
        # weight the basis polynomial by the value at the j-th defining point
        result = result + Y[j] * result_l

    return result
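# Quick sanity check (illustrative only): a linear series with one gap should be
# filled with the exact linear value.
#
# _s = pd.Series([0.0, 1.0, np.nan, 3.0, 4.0, 5.0, 6.0, 7.0])
# print(LagrangeInterpolation(_s, 2, k=2))  # expected 2.0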
if __name__ == '__main__':
    path = "G:\data\SCADA数据\jb4q_8.csv"

    df = pd.read_csv(path)
    columns = df.columns
    print(df.columns)

    # turn zero values into NaN; double brackets index arbitrary positions
    # print(df['realtime'][1])
    df[:][df[:] == 0] = np.nan  # when comparing by index, convert to the same type first (astype)

    # TODO test single-point interpolation
    print(df['num_gearbox_sumptemp'].isnull())
    # print("插值为:", LagrangeInterpolation(df['num_gearbox_sumptemp'], 47, 2))

    # TODO test interpolation on a single column
    print("之前的空值数量:", df['num_gearbox_sumptemp'].isnull().sum())
    for j in range(len(df)):
        if (df['num_gearbox_sumptemp'].isnull())[j]:
            s = df['num_gearbox_sumptemp']
            df.loc[j, 'num_gearbox_sumptemp'] = LagrangeInterpolation(s, j, 5)
    print("插值之后的空值数量:", df['num_gearbox_sumptemp'].isnull().sum())

    # # TODO process every column
    print("之前的空值数量:", df.isnull().sum())
    for i in columns:
        temp = df[i].isnull()
        if temp[:][temp[:] == True].__len__() > 0:
            for j in range(len(df)):
                if (df[i].isnull())[j]:
                    s = df[i]  # the column currently being interpolated
                    df.loc[j, i] = LagrangeInterpolation(s, j, 3)

    print("插值之后的空值数量:", df.isnull().sum())
    df.to_csv("G:\实验室/2022项目中期\数据治理算法\jb4q_8_lagrange.csv")
@@ -0,0 +1,149 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 7 09:23:31 2020

@author: AlbertHu
"""

import os
import time
import numpy as np
import pandas as pd
import datetime


def findallfiles(cmsfilesfatherpath):  # return the paths of every file under the parent directory, including subdirectories
    cmsfilepaths = []
    files = os.listdir(cmsfilesfatherpath)
    for fi in files:
        fi_d = os.path.join(cmsfilesfatherpath, fi)
        if os.path.isdir(fi_d):
            # files.extend(findcmsfiles(fi_d))
            pass
        else:
            cmsfilepaths.append(fi_d)
    return cmsfilepaths


def findIndexOfExceptPoint(data):
    indexList2D = []
    indexList1 = []
    indexList2 = []
    indexList3 = []
    indexList4 = []
    indexList5 = []
    print("开始清洗")
    for i in data.index:
        if i % 10000 == 0:
            print("已处理了{}组数据".format(i))
        # condition 1
        if data[' 瞬时风速'][i] < 3.5 and data[' 1#叶片变桨角度'][i] > 89:
            indexList1.append(i)
        elif data[' 瞬时风速'][i] >= 3.5 and data[' 瞬时风速'][i] <= 10 and data[' 1#叶片变桨角度'][i] > 0.5:
            indexList1.append(i)
        elif data[' 瞬时风速'][i] >= 11 and data[' 瞬时风速'][i] <= 25 and (data[' 有功功率'][i] < 1800 and data[' 1#叶片变桨角度'][i] > 1.5):
            indexList1.append(i)
        elif data[' 瞬时风速'][i] > 25 and data[' 有功功率'][i] > 0:
            indexList1.append(i)
        else:
            pass
        # condition 2
        if abs(data[' 齿轮箱高速轴前端温度'][i]) > 200 or abs(data[' 齿轮箱高速轴后端温度'][i]) > 200 or abs(data[' 齿轮箱冷却水温'][i]) > 200 or abs(data[' 齿轮箱进口油温'][i]) > 200 or abs(data[' 齿轮箱油池温度'][i]) > 200 or abs(data[' 环境温度'][i]) > 200:
            indexList2.append(i)
        else:
            pass
        # condition 3  # condition 6
        if data[' 齿轮箱高速轴前端温度'][i] > 80 or data[' 齿轮箱高速轴后端温度'][i] > 80 or abs(data[' 齿轮箱高速轴前端温度'][i] - data[' 齿轮箱高速轴后端温度'][i]) > 20:
            indexList3.append(i)
        else:
            pass
        # condition 4
        if data[' 有功功率'][i] > 100 and data[' 齿轮箱进口压力'][i] <= 0:
            indexList4.append(i)
        else:
            pass
        # condition 5
        if abs(data[' 齿轮箱进口压力'][i] - data[' 齿轮箱泵出口压力'][i]) > 5:
            indexList5.append(i)
        else:
            pass
    indexList2D = [indexList1, indexList2, indexList3, indexList4, indexList5]
    return indexList2D
    # # condition 6
    # if data[' 齿轮箱高速轴前端温度'][i] > 80 or data[' 齿轮箱高速轴后端温度'][i] > 80:


fathpath = r'D:\1.SCADA_风电数据\靖边二期2019_已处理'
allfilepaths = findallfiles(fathpath)
testpath = allfilepaths[0]
# allfilepaths = [r'F:\scada_ewma本地数据2(重要)\data\DataResult(靖边二期2019)\风机7.csv']

# testpath=r'F:\scada_ewma本地数据2(重要)\data\DataResult(粤水电达坂城2020.1月-5月)\风机1.csv'
for testpath in allfilepaths:
    data = pd.read_csv(testpath, encoding='gbk', parse_dates=['时间'])
    data.columns

    indexList2D = findIndexOfExceptPoint(data)

    savePath = r'./cleanScada/JB2Q615/风机{}'.format(data['风机号'][1])
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    file = open(savePath + '/IndexOfExceptPoint.txt', 'w')
    a = 1
    for List in indexList2D:
        for i in List:
            file.write(str(i) + ',')
            try:
                data.drop([i], inplace=True)
            except:
                continue
        file.write('第{}组\n'.format(a))
        a += 1
    file.close()

    data.to_csv(savePath + '.csv', encoding='gbk')
@@ -0,0 +1,67 @@
# _*_ coding: UTF-8 _*_

'''
@Author : dingjiawen
@Date   : 2022/7/7 10:29
@Usage  : basic cleaning of the SCADA data
@Desc   :
'''

import tensorflow as tf
import pandas as pd
import numpy as np
import os
import time
from condition_monitoring.lib.IOBase import ioLib

'''
Hyperparameter settings
'''
# parent directory of the files to process
fatherPath = "G:\data\SCADA数据\华能三塘湖"
# parent directory for the processed files
fatherDealedPath = "G:\data\SCADA数据\华能三塘湖\dealed"

baseUseCols = ["时间", "风机号", "发电机转矩", "发电机无功功率", "发电机转速", "发电机有功功率", "发电机绕组最高温度", "齿轮箱油池温度", "齿轮箱进口油温", "齿轮箱进口压力",
               "齿轮箱油泵出口压力", "齿轮箱冷却水温度", "有功功率", "60s平均有功功率", "10min平均有功功率", "10s平均有功功率", "10s平均无功功率", "无功功率", "瞬时风速",
               "机舱温度"]

baseWinds = []


# list every file under the parent directory
def listFile(fatherPath=fatherPath):
    filepaths = []
    files = os.listdir(fatherPath)
    for file in files:
        fi_d = os.path.join(fatherPath, file)
        if os.path.isdir(fi_d):
            pass
            # files.extend(findcmsfiles(fi_d))
        else:
            filepaths.append(fi_d)

    return filepaths


def dropNa(filePath):
    data = pd.read_csv(filePath, low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
    print(data)
    data.dropna(axis=0, how='any', inplace=True)
    print(data)
    # data.append()  # incomplete call left over from development
    ioLib.saveCSV(data=data, savePath=fatherDealedPath)


def separateByWindNum(data):
    indexLists = []
    windList1 = []
    windList2 = []


if __name__ == '__main__':
    filePath = "G:\data\SCADA数据\华能三塘湖/1华能三塘湖20180730-20180803.csv"
@@ -0,0 +1,228 @@
import pandas as pd
import numpy as np
import tensorflow as tf
import csv
import os
import matplotlib.pyplot as plt
import seaborn as sns

'''path of the source data file'''
# source_path = r'G:\data\SCADA数据\jb4q_8.csv'
source_path = "G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"

'''path where the modified data is stored'''
save_path = r'G:\data\SCADA数据\jb4q_8_delete_total_zero.csv'

'''columns that are needed'''

# baseUseCols = ["num_gearbox_sumptemp","num_gearbox_inletoiltemp","num_gearbox_inletpress","num_gearbox_coolingwatertemp"]

# target_path = r'G:\data\SCADA数据\华能三塘湖/dealed/后十万2018.01.16.csv'
# target_folder = r'G:\data\SCADA数据\华能三塘湖/dealed'


# create a folder if it does not exist
def folderGenerate(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)


# Pearson correlation coefficient
def cal_correlation_coefficient(data, label):
    print("计算皮尔逊相关系数")
    print(data)
    print(data.shape)
    pd_data = pd.DataFrame(data)
    person = pd_data.corr()
    print(person)
    # draw a heatmap
    # cmap = sns.heatmap(person, annot=True, xticklabels=label, yticklabels=label)
    # plt.figure(1, figsize=(6.0, 2.68))
    # plt.subplots_adjust(left=0.1, right=0.94, bottom=0.2, top=0.9, wspace=None,
    #                     hspace=None)
    # plt.tight_layout()
    # font1 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 10}  # axis label font size and family
    # font2 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 15}  # axis label font size and family
    # plt.xlabel("X", size=10, fontdict=font1)
    # plt.ylabel("Y", size=10, fontdict=font1)
    # plt.title("Heatmap of correlation coefficient matrix", size=20, fontdict=font1)
    #
    # # adjust the colorbar labels:
    # cbar = cmap.collections[0].colorbar
    # cbar.ax.tick_params(labelsize=15, labelcolor="black")
    # cbar.ax.set_ylabel(ylabel="color scale", color="red", loc="center", fontdict=font2)
    #
    # plt.show()
    return person


def get_most_N_correlation_coefficient(person, N=10):
    print("获得相关度最高的{}个值".format(N))
    # total_correlation = person[1:, 1:]
    abs_correlation = np.abs(person)
    one = np.ones(shape=abs_correlation.shape)
    two = np.subtract(one, abs_correlation)
    rows, cols = two.shape
    total_sum = []
    for i in range(cols):
        # print(two[i])
        total = np.sum(two[i])
        total_sum.append(total)

    print("total_sum:", total_sum)
    # take the N smallest sums: since the values were subtracted from 1, smaller means more correlated
    print("arg:", np.argpartition(total_sum, N))
    min = np.argpartition(total_sum, N)[:N]
    max = np.argpartition(total_sum, N)[total_sum.__len__() - N:]
    print("min:", min)
    return min
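# Illustration only (not part of the original file): on a 3-column toy matrix the helper
# returns the indices of the N columns that are, overall, most correlated with the others.
#
# _toy = np.column_stack([np.arange(100.0), np.arange(100.0) * 2, np.random.rand(100)])
# _p = np.array(cal_correlation_coefficient(_toy, label=None))
# print(get_most_N_correlation_coefficient(_p, N=2))  # expected to contain columns 0 and 1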
# filter, or fill linearly
def findIndexOfExceptPoint(data: pd.DataFrame):
    # indexList2D = []
    # indexList = []
    # indexList2 = []
    # indexList3 = []
    # indexList4 = []
    indexList = []
    print("开始清洗")
    for i in data.index:
        if i % 10000 == 0:
            print("已处理了{}条数据".format(i))
        ## drop rows where most of the values are zero
        # if data['num_gearbox_sumptemp'][i] != 0 and (i < 416166 or i > 432766) and (
        #         data['num_gearbox_pumpoutletpress'][i] == 0 or data['num_activepower'][i] == 0 or
        #         data['num_gen_torque'][i] == 0):
        #     indexList.append(i)
        # drop rows that contain any zero
        # if (i < 416166 or i > 432766) and (
        #         data['num_gearbox_pumpoutletpress'][i] == 0 or data['num_activepower'][i] == 0 or
        #         data['num_gen_torque'][i] == 0):
        #     indexList.append(i)
        # only drop rows that are entirely zero
        if (i < 416166 or i > 432766) and (
                data['num_gearbox_sumptemp'][i] == 0 and data['num_gearbox_inletoiltemp'][i] == 0 and
                data['num_gearbox_inletpress'][i] == 0):
            indexList.append(i)
        else:
            pass

    # indexList2D = [indexList1, indexList2, indexList3, indexList4, indexList5]
    indexList2D = set(indexList)
    print("要移除的index:", indexList2D)
    return indexList2D


# remove abnormal rows by index
def removeDataByIndex(indexList, data):
    print("开始移除异常index的数据")
    a = 1
    data.drop(indexList, inplace=True)
    # for i in indexList:
    #     try:
    #         data.drop([i], inplace=True)
    #     except:
    #         continue
    #     # print('第{}组\n'.format(a))
    #     # a += 1
    return data


# process the data (remove, reassign, or other operations)
def dealData(scada_data: pd.DataFrame):
    # whether to save the processed data
    Is_save = True
    indexList = findIndexOfExceptPoint(scada_data)
    removeDataByIndex(indexList=indexList, data=scada_data)
    print("处理后的数据为:")
    print(scada_data)
    if Is_save:
        print("============保存处理好的数据,路径为{}============".format(save_path))
        scada_data.to_csv(save_path, index=False, encoding='gbk')

    return scada_data


# read the data and convert it to a numpy or tf array
def read_data(file_name, isNew: bool = False):
    '''load the data'''
    with open(file_name, 'r') as f:
        if isNew:
            # scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
            scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', parse_dates=['realtime'])
            print(scada_data)
            scada_data = dealData(scada_data=scada_data)
            print(scada_data.head)
            scada_data = np.array(scada_data)
        else:
            scada_data = np.loadtxt(f, str, delimiter=",")
        label = scada_data[0, 3:]
        label = list(['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt', 'Vx', 'Vy'])
        print("导入数据成功,将数据转为numpy或tf数组...")
        needed_data = scada_data[1:, 3:].astype(dtype=float)
        ## needed_data = tf.cast(needed_data, tf.float32)  a tensor cannot be converted to a pd.DataFrame
        print(needed_data)
        print("转换成功,并返回...")
        return needed_data, label


def plot_original_data(data):
    rows, cols = data.shape
    print("开始画图...")

    for i in range(cols):
        plt.figure(i)
        plt.plot(data[:, i])
        plt.show()


def execute(file_name=source_path, N=10):
    needed_data, label = read_data(file_name=file_name, isNew=False)
    print(needed_data)
    print(needed_data.shape)
    # plot_original_data(needed_data)
    person = cal_correlation_coefficient(needed_data, label)
    person = np.array(person)
    min = get_most_N_correlation_coefficient(person, N=N)

    for index in min:
        if index == min[0]:
            total_data = np.expand_dims(needed_data[:, index], axis=-1)
        else:
            total_data = np.concatenate([total_data, np.expand_dims(needed_data[:, index], axis=-1)], axis=-1)

    return total_data


def deal_data(file_name=source_path):
    '''load the data'''
    with open(file_name, 'r') as f:
        # scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
        scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', parse_dates=['realtime'])
        print(scada_data)
        scada_data = dealData(scada_data=scada_data)
        print(scada_data.head)
        scada_data = np.array(scada_data)

        scada_data = np.loadtxt(f, str, delimiter=",")
        label = scada_data[0, 3:]
        label = list(
            ['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt',
             'Vx', 'Vy'])
        print("导入数据成功,将数据转为numpy或tf数组...")
        needed_data = scada_data[1:, 3:].astype(dtype=float)
        ## needed_data = tf.cast(needed_data, tf.float32)  a tensor cannot be converted to a pd.DataFrame
        print(needed_data)
        print("转换成功,并返回...")
        return needed_data, label
    pass


if __name__ == '__main__':
    total_data = execute(N=10, file_name=source_path)
    # print(total_data)
    # print(total_data.shape)
    # plot_original_data()
@@ -0,0 +1,207 @@
import pandas as pd
import numpy as np
import tensorflow as tf
import csv
import os
import matplotlib.pyplot as plt
import seaborn as sns

'''path of the source data file'''
# source_path = r'G:\data\SCADA数据\jb4q_8.csv'
source_path = "G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"

'''path where the modified data is stored'''
save_path = r'G:\data\SCADA数据\jb4q_8_delete_total_zero.csv'

'''columns that are needed'''

# baseUseCols = ["num_gearbox_sumptemp","num_gearbox_inletoiltemp","num_gearbox_inletpress","num_gearbox_coolingwatertemp"]

# target_path = r'G:\data\SCADA数据\华能三塘湖/dealed/后十万2018.01.16.csv'
# target_folder = r'G:\data\SCADA数据\华能三塘湖/dealed'

# 96748 107116


# create a folder if it does not exist
def folderGenerate(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)


# Pearson correlation coefficient
def cal_correlation_coefficient(data, label):
    print("计算皮尔逊相关系数")
    pd_data = pd.DataFrame(data)
    person = pd_data.corr()
    print(person)
    # draw a heatmap
    # cmap = sns.heatmap(person, annot=True, xticklabels=label, yticklabels=label)
    # plt.figure(1, figsize=(6.0, 2.68))
    # plt.subplots_adjust(left=0.1, right=0.94, bottom=0.2, top=0.9, wspace=None,
    #                     hspace=None)
    # plt.tight_layout()
    # font1 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 10}  # axis label font size and family
    # font2 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 15}  # axis label font size and family
    # plt.xlabel("X", size=10, fontdict=font1)
    # plt.ylabel("Y", size=10, fontdict=font1)
    # plt.title("Heatmap of correlation coefficient matrix", size=20, fontdict=font1)
    #
    # # adjust the colorbar labels:
    # cbar = cmap.collections[0].colorbar
    # cbar.ax.tick_params(labelsize=15, labelcolor="black")
    # cbar.ax.set_ylabel(ylabel="color scale", color="red", loc="center", fontdict=font2)
    #
    # plt.show()
    return person


def get_most_N_correlation_coefficient(person, N=10):
    print("获得相关度最高的{}个值".format(N))
    # total_correlation = person[1:, 1:]
    abs_correlation = np.abs(person)
    one = np.ones(shape=abs_correlation.shape)
    two = np.subtract(one, abs_correlation)
    rows, cols = two.shape
    total_sum = []
    for i in range(cols):
        # print(two[i])
        total = np.sum(two[i])
        total_sum.append(total)

    print("total_sum:", total_sum)
    # take the N smallest sums: since the values were subtracted from 1, smaller means more correlated
    print("arg:", np.argpartition(total_sum, N))
    min = np.argpartition(total_sum, N)[:N]
    max = np.argpartition(total_sum, N)[total_sum.__len__() - N:]
    print("min:", min)
    return min


# filter, or fill linearly
def findIndexOfExceptPoint(data: pd.DataFrame):
    # indexList2D = []
    # indexList = []
    # indexList2 = []
    # indexList3 = []
    # indexList4 = []
    indexList = []
    print("开始清洗")
    for i in data.index:
        if i % 10000 == 0:
            print("已处理了{}条数据".format(i))
        ## drop rows where most of the values are zero
        # if data['num_gearbox_sumptemp'][i] != 0 and (i < 416166 or i > 432766) and (
        #         data['num_gearbox_pumpoutletpress'][i] == 0 or data['num_activepower'][i] == 0 or
        #         data['num_gen_torque'][i] == 0):
        #     indexList.append(i)
        # drop rows that contain any zero
        # if (i < 416166 or i > 432766) and (
        #         data['num_gearbox_pumpoutletpress'][i] == 0 or data['num_activepower'][i] == 0 or
        #         data['num_gen_torque'][i] == 0):
        #     indexList.append(i)
        # only drop rows that are entirely zero
        if (i < 416166 or i > 432766) and (
                data['num_gearbox_sumptemp'][i] == 0 and data['num_gearbox_inletoiltemp'][i] == 0 and
                data['num_gearbox_inletpress'][i] == 0):
            indexList.append(i)
        else:
            pass

    # indexList2D = [indexList1, indexList2, indexList3, indexList4, indexList5]
    indexList2D = set(indexList)
    print("要移除的index:", indexList2D)
    return indexList2D


# remove abnormal rows by index
def removeDataByIndex(indexList, data):
    print("开始移除异常index的数据")
    a = 1
    data.drop(indexList, inplace=True)
    # for i in indexList:
    #     try:
    #         data.drop([i], inplace=True)
    #     except:
    #         continue
    #     # print('第{}组\n'.format(a))
    #     # a += 1
    return data


# process the data (remove, reassign, or other operations)
def dealData(scada_data: pd.DataFrame):
    # whether to save the processed data
    Is_save = True
    indexList = findIndexOfExceptPoint(scada_data)
    removeDataByIndex(indexList=indexList, data=scada_data)
    print("处理后的数据为:")
    print(scada_data)
    if Is_save:
        print("============保存处理好的数据,路径为{}============".format(save_path))
        scada_data.to_csv(save_path, index=False, encoding='gbk')

    return scada_data


# read the data and convert it to a numpy or tf array
def read_data(file_name, isNew: bool = False):
    '''load the data'''
    with open(file_name, 'r') as f:
        if isNew:
            # scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', usecols=baseUseCols, parse_dates=['时间'])
            scada_data = pd.read_csv(f, low_memory=False, encoding='gbk', parse_dates=['realtime'])
            print(scada_data)
            scada_data = dealData(scada_data=scada_data)
            print(scada_data.head)
            scada_data = np.array(scada_data)
        else:
            scada_data = np.loadtxt(f, str, delimiter=",")
        label = scada_data[0, 4:]
        label = list(['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt', 'Vx', 'Vy'])
        print("导入数据成功,将数据转为numpy或tf数组...")
        needed_data = scada_data[1:, 4:].astype(dtype=float)
        ## needed_data = tf.cast(needed_data, tf.float32)  a tensor cannot be converted to a pd.DataFrame
        print(needed_data)
        print("转换成功,并返回...")
        return needed_data, label


def plot_original_data(data):
    rows, cols = data.shape
    print("开始画图...")

    for i in range(cols):
        plt.figure(i)
        plt.plot(data[:, i])
        plt.show()


def execute(file_name=source_path, N=10):
    needed_data, label = read_data(file_name=file_name, isNew=False)
    print(needed_data)
    print(needed_data.shape)
    # plot_original_data(needed_data)
    person = cal_correlation_coefficient(needed_data, label)
    person = np.array(person)
    min = get_most_N_correlation_coefficient(person, N=N)

    for index in min:
        if index == min[0]:
            total_data = np.expand_dims(needed_data[:, index], axis=-1)
        else:
            total_data = np.concatenate([total_data, np.expand_dims(needed_data[:, index], axis=-1)], axis=-1)

    return total_data


if __name__ == '__main__':
    # total_data = execute(N=10, file_name=source_path)
    # print(total_data)
    # print(total_data.shape)  # 7 10 13
    # turbine 15 has a stretch in the middle that differs a lot
    file_name = 'H:\data\SCADA数据\SCADA_已处理_粤水电达坂城2020.1月-5月/风机15.csv'
    needed_data, label = read_data(file_name=file_name, isNew=False)
    print(needed_data.shape)
    plot_original_data(needed_data)
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2022/11/2 12:59
@Usage  : plot the raw data
@Desc   :
'''
import pandas as pd
import numpy as np


source_path = "G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"


def deal_data(file_name=source_path):
    '''load the data'''
    with open(file_name, 'r') as f:
        scada_data = np.loadtxt(f, str, delimiter=",")
        label = scada_data[0, 3:]
        label = list(
            ['Gs', 'Gio', 'Gip', 'Gp', 'Gwt', 'En', 'Gft', 'Grt', 'Gwt', 'Et', 'Rs', 'Ap', 'Ws', 'Dw', 'Ges', 'Gt',
             'Vx', 'Vy'])
        print("导入数据成功,将数据转为numpy或tf数组...")
        needed_data = scada_data[1:37000, 3:].astype(dtype=float)
        ## needed_data = tf.cast(needed_data, tf.float32)  a tensor cannot be converted to a pd.DataFrame
        print(needed_data)
        print("转换成功,并返回...")
        return needed_data, label
    pass


# min-max normalization
def normalization(data):
    rows, cols = data.shape
    print("归一化之前:", data)
    print(data.shape)
    print("======================")

    # scale each column to [0, 1]
    max = np.max(data, axis=0)
    max = np.broadcast_to(max, [rows, cols])
    min = np.min(data, axis=0)
    min = np.broadcast_to(min, [rows, cols])

    data = (data - min) / (max - min)
    print("归一化之后:", data)
    print(data.shape)

    return data


if __name__ == '__main__':
    needed_data, label = deal_data()
    data = normalization(data=needed_data)
    np.savetxt('G:\data\SCADA数据/normalization.csv', data, delimiter=',')
    print(data.shape)
@@ -0,0 +1,262 @@
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from condition_monitoring.data_deal import loadData
from keras.callbacks import EarlyStopping
import os
import shutil

# senior Kong's idea: CNN + GRU


'''hyperparameter settings'''
time_stamp = 120
feature_num = 10
batch_size = 8
learning_rate = 0.01
EPOCH = 101
model_name = "CNN_GRU"
'''EWMA hyperparameters'''
K = 18
namuda = 0.01
'''save name'''
save_name = "../model/{0}_timestamp{1}_featureNum{2}_batch_size{3}_Epoch{4}.h5".format(model_name,
                                                                                       time_stamp, feature_num,
                                                                                       batch_size, EPOCH)
'''file name'''
file_name = "G:\data\SCADA数据\jb4q_8_delete_all_zero.csv"


def remove(data, time_stamp=time_stamp):
    rows, cols = data.shape
    print("remove_data.shape:", data.shape)
    num = int(rows / time_stamp)

    return data[:num * time_stamp, :]
    pass


# non-overlapping sampling
def get_training_data(data, time_stamp=time_stamp):
    removed_data = remove(data=data)
    rows, cols = removed_data.shape
    # print("removed_data.shape:", data.shape)
    # print("removed_data:", removed_data)
    train_data = np.reshape(removed_data, [-1, time_stamp, cols])
    # print("train_data:", train_data)
    batchs, time_stamp, cols = train_data.shape

    for i in range(1, batchs):
        each_label = np.expand_dims(train_data[i, 0, :], axis=0)
        if i == 1:
            train_label = each_label
        else:
            train_label = np.concatenate([train_label, each_label], axis=0)

    # print("train_data.shape:", train_data.shape)
    # print("train_label.shape", train_label.shape)
    return train_data[:-1, :], train_label


# overlapping sampling
def get_training_data_overlapping(data, time_stamp=time_stamp):
    rows, cols = data.shape
    train_data = np.empty(shape=[rows - time_stamp - 1, time_stamp, cols])
    train_label = np.empty(shape=[rows - time_stamp - 1, cols])
    for i in range(rows):
        if i + time_stamp >= rows:
            break
        if i + time_stamp < rows - 1:
            train_data[i] = data[i:i + time_stamp]
            train_label[i] = data[i + time_stamp]

    print("重叠采样以后:")
    print("data:", train_data)
    print("label:", train_label)

    return train_data, train_label
|
||||
|
||||
|
||||
def condition_monitoring_model():
|
||||
input = tf.keras.Input(shape=[time_stamp, feature_num])
|
||||
conv1 = tf.keras.layers.Conv1D(filters=256, kernel_size=1)(input)
|
||||
GRU1 = tf.keras.layers.GRU(128, return_sequences=False)(conv1)
|
||||
d1 = tf.keras.layers.Dense(300)(GRU1)
|
||||
output = tf.keras.layers.Dense(10)(d1)
|
||||
model = tf.keras.Model(inputs=input, outputs=output)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
# Min-max normalization
|
||||
def normalization(data):
|
||||
rows, cols = data.shape
|
||||
print("归一化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# scale each column to [0, 1]
|
||||
max = np.max(data, axis=0)
|
||||
max = np.broadcast_to(max, [rows, cols])
|
||||
min = np.min(data, axis=0)
|
||||
min = np.broadcast_to(min, [rows, cols])
|
||||
|
||||
data = (data - min) / (max - min)
|
||||
print("归一化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
# Standardization (z-score scaling)
|
||||
def Regularization(data):
|
||||
rows, cols = data.shape
|
||||
print("正则化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# subtract the column mean and divide by the column standard deviation
|
||||
mean = np.mean(data, axis=0)
|
||||
mean = np.broadcast_to(mean, shape=[rows, cols])
|
||||
dst = np.sqrt(np.var(data, axis=0))
|
||||
dst = np.broadcast_to(dst, shape=[rows, cols])
|
||||
data = (data - mean) / dst
|
||||
print("正则化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
pass
|
||||
|
||||
|
||||
def EWMA(data, K=K, namuda=namuda):
|
||||
# the meaning of t (the EWMA sample index) is not settled yet; fixed to 0 for now
|
||||
t = 0
|
||||
mid = np.mean(data, axis=0)
|
||||
standard = np.sqrt(np.var(data, axis=0))
|
||||
UCL = mid + K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
|
||||
LCL = mid - K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
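# Note (added): the textbook EWMA limit uses (1 - namuda) ** (2 * t); as written, (1 - namuda) ** 2 * t
# means ((1 - lambda)^2) * t, so with t = 0 above both limits reduce to the steady-state value mid +/- K * std * sqrt(namuda / (2 - namuda)).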
|
||||
return mid, UCL, LCL
|
||||
pass
|
||||
|
||||
|
||||
def get_MSE(data, label, new_model):
|
||||
predicted_data = new_model.predict(data)
|
||||
|
||||
temp = np.abs(predicted_data - label)
|
||||
temp1 = (temp - np.broadcast_to(np.mean(temp, axis=0), shape=predicted_data.shape))
|
||||
temp2 = np.broadcast_to(np.sqrt(np.var(temp, axis=0)), shape=predicted_data.shape)
|
||||
temp3 = temp1/temp2
|
||||
mse = np.sum((temp1 / temp2) ** 2, axis=1)
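# Despite the name, mse here is the sum over features of squared z-scores of the absolute residuals, i.e. a standardized health indicator rather than a plain mean squared error.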
|
||||
print("z:", mse)
|
||||
print(mse.shape)
|
||||
|
||||
# mse=np.mean((predicted_data-label)**2,axis=1)
|
||||
print("mse", mse)
|
||||
|
||||
dims, = mse.shape
|
||||
|
||||
mean = np.mean(mse)
|
||||
std = np.sqrt(np.var(mse))
|
||||
max = mean + 3 * std
|
||||
# min = mean-3*std
|
||||
max = np.broadcast_to(max, shape=[dims, ])
|
||||
# min = np.broadcast_to(min,shape=[dims,])
|
||||
mean = np.broadcast_to(mean, shape=[dims, ])
|
||||
|
||||
# plt.plot(max)
|
||||
# plt.plot(mse)
|
||||
# plt.plot(mean)
|
||||
# # plt.plot(min)
|
||||
# plt.show()
|
||||
#
|
||||
#
|
||||
return mse,mean,max
|
||||
# pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
total_data = loadData.execute(N=feature_num,file_name=file_name)
|
||||
total_data = normalization(data=total_data)
|
||||
train_data, train_label = get_training_data_overlapping(total_data[:300455, :])
|
||||
|
||||
## TODO training
|
||||
# model = condition_monitoring_model()
|
||||
# checkpoint = tf.keras.callbacks.ModelCheckpoint(
|
||||
# filepath=save_name,
|
||||
# monitor='val_loss',
|
||||
# verbose=1,
|
||||
# save_best_only=True,
|
||||
# mode='min',
|
||||
# period=1)
|
||||
# lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)
|
||||
# early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=30, mode='min', verbose=1)
|
||||
# model.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate), loss=tf.losses.mse)
|
||||
# model.summary()
|
||||
# model.fit(train_data, train_label, batch_size=batch_size, epochs=EPOCH, validation_split=0.1,
|
||||
# callbacks=[checkpoint, lr_scheduler, early_stop])
|
||||
|
||||
## TODO testing
|
||||
print("===============================")
|
||||
print(total_data.shape)
|
||||
print("===============================")
|
||||
test_data, test_label = get_training_data(total_data[:300455, :])
|
||||
newModel = tf.keras.models.load_model(save_name)
|
||||
mse,mean,max = get_MSE(test_data, test_label, new_model=newModel)
|
||||
print("===============================")
|
||||
print("mse:",mse)
|
||||
print(mse.shape)
|
||||
print("===============================")
|
||||
|
||||
|
||||
test_data, test_label = get_training_data(total_data[20000:, :])
|
||||
predicted_data = newModel.predict(test_data)
|
||||
rows, cols = predicted_data.shape
|
||||
print("=====================================")
|
||||
print(predicted_data)
|
||||
print(predicted_data.shape)
|
||||
print("=====================================")
|
||||
|
||||
temp = np.abs(predicted_data - test_label)
|
||||
temp1 = (temp - np.broadcast_to(np.mean(temp, axis=0), shape=predicted_data.shape))
|
||||
temp2 = np.broadcast_to(np.sqrt(np.var(temp, axis=0)), shape=predicted_data.shape)
|
||||
temp3 = temp1 / temp2
|
||||
mse = np.sum((temp1 / temp2) ** 2, axis=1)
|
||||
print("====================")
|
||||
print("new_mse:",mse)
|
||||
print(mse.shape)
|
||||
np.savetxt("mse", mse, delimiter=',')
|
||||
print("===================")
|
||||
|
||||
plt.plot(mse[2000:])
|
||||
plt.plot(mean)
|
||||
plt.plot(max)
|
||||
plt.show()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
data = pd.DataFrame(mse).ewm(span=3).mean()
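# Exponentially weighted moving average (span=3) smooths the health indicator before plotting/thresholding.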
|
||||
print(data)
|
||||
data =np.array(data)
|
||||
|
||||
index,_ = data.shape
|
||||
|
||||
|
||||
|
||||
for i in range(2396):
|
||||
if data[i,0] >5:
|
||||
data[i,0] = data[i-1,:]
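# Simple spike clipping: values above 5 within the first 2396 points are replaced with the previous smoothed value.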
|
||||
print(data)
|
||||
mean = data[2000:2396,:].mean()
|
||||
std = data[2000:2396,:].std()
|
||||
mean=np.broadcast_to(mean,shape=[500,])
|
||||
std=np.broadcast_to(std,shape=[500,])
|
||||
plt.plot(data[2000:2396,:])
|
||||
plt.plot(mean)
|
||||
plt.plot(mean+3*std)
|
||||
plt.plot(mean-3*std)
|
||||
plt.show()
|
||||
|
|
@ -0,0 +1,526 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# coding: utf-8
|
||||
import tensorflow as tf
|
||||
import tensorflow.keras
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from model.DepthwiseCon1D.DepthwiseConv1D import DepthwiseConv1D
|
||||
from model.Dynamic_channelAttention.Dynamic_channelAttention import DynamicChannelAttention
|
||||
from condition_monitoring.data_deal import loadData
|
||||
from model.Joint_Monitoring.Joint_Monitoring2 import Joint_Monitoring
|
||||
|
||||
from model.CommonFunction.CommonFunction import *
|
||||
from sklearn.model_selection import train_test_split
|
||||
from tensorflow.keras.models import load_model, save_model
|
||||
|
||||
'''
|
||||
@Author : dingjiawen
|
||||
@Date : 2022/7/8 10:29
|
||||
@Usage : Combine the prediction and classification approaches for joint condition monitoring
|
||||
@Desc : RepVGG + upsampling + GRU for reconstruction, followed by GDP (global dynamic pooling) and a classifier;
|
||||
an MSE loss whose weight decays with the epoch plus a cross-entropy loss whose weight grows with the epoch
|
||||
'''
|
||||
|
||||
'''Hyperparameter settings'''
|
||||
time_stamp = 120
|
||||
feature_num = 10
|
||||
batch_size = 16
|
||||
learning_rate = 0.001
|
||||
EPOCH = 101
|
||||
model_name = "joint"
|
||||
'''EWMA hyperparameters'''
|
||||
K = 18
|
||||
namuda = 0.01
|
||||
'''Save paths'''
|
||||
|
||||
save_name = "../model/weight/{0}_timestamp{1}_feature{2}_Epoch{4}_weight/weight".format(model_name,
|
||||
time_stamp,
|
||||
feature_num,
|
||||
batch_size,
|
||||
EPOCH)
|
||||
save_step_two_name = "../model/two_weight/{0}_timestamp{1}_feature{2}_weight/weight".format(model_name,
|
||||
time_stamp,
|
||||
feature_num,
|
||||
batch_size,
|
||||
EPOCH)
|
||||
|
||||
# save_name = "../model/joint/{0}_timestamp{1}_feature{2}.h5".format(model_name,
|
||||
# time_stamp,
|
||||
# feature_num,
|
||||
# batch_size,
|
||||
# EPOCH)
|
||||
# save_step_two_name = "../model/joint_two/{0}_timestamp{1}_feature{2}.h5".format(model_name,
|
||||
# time_stamp,
|
||||
# feature_num,
|
||||
# batch_size,
|
||||
# EPOCH)
|
||||
'''Data file'''
|
||||
file_name = "G:\data\SCADA数据\jb4q_8_delete_all_zero.csv"
|
||||
|
||||
'''
|
||||
File description: jb4q_8_delete_all_zero.csv is the file with all zero values removed except the anomalous ones.
|
||||
Rows 0:300454 are healthy data (2019/7/30 00:00:00 - 2019/9/18 11:21:00)
|
||||
Rows 300455:317052 are faulty data (2019/9/18 11:21:01 - 2019/9/29 23:59:00)
|
||||
'''
|
||||
'''File parameters'''
|
||||
# last healthy row index
|
||||
healthy_date = 300454
|
||||
# last faulty row index
|
||||
unhealthy_date = 317052
|
||||
# fault-onset tolerance (number of rows)
|
||||
unhealthy_patience = 5
|
||||
|
||||
|
||||
def remove(data, time_stamp=time_stamp):
|
||||
rows, cols = data.shape
|
||||
print("remove_data.shape:", data.shape)
|
||||
num = int(rows / time_stamp)
|
||||
|
||||
return data[:num * time_stamp, :]
|
||||
pass
|
||||
|
||||
|
||||
# Non-overlapping sampling
|
||||
def get_training_data(data, time_stamp: int = time_stamp):
|
||||
removed_data = remove(data=data)
|
||||
rows, cols = removed_data.shape
|
||||
print("removed_data.shape:", data.shape)
|
||||
print("removed_data:", removed_data)
|
||||
train_data = np.reshape(removed_data, [-1, time_stamp, cols])
|
||||
print("train_data:", train_data)
|
||||
batchs, time_stamp, cols = train_data.shape
|
||||
|
||||
for i in range(1, batchs):
|
||||
each_label = np.expand_dims(train_data[i, 0, :], axis=0)
|
||||
if i == 1:
|
||||
train_label = each_label
|
||||
else:
|
||||
train_label = np.concatenate([train_label, each_label], axis=0)
|
||||
|
||||
print("train_data.shape:", train_data.shape)
|
||||
print("train_label.shape", train_label.shape)
|
||||
return train_data[:-1, :], train_label
|
||||
|
||||
|
||||
# Overlapping (sliding-window) sampling
|
||||
def get_training_data_overlapping(data, time_stamp: int = time_stamp, is_Healthy: bool = True):
|
||||
rows, cols = data.shape
|
||||
train_data = np.empty(shape=[rows - time_stamp - 1, time_stamp, cols])
|
||||
train_label = np.empty(shape=[rows - time_stamp - 1, cols])
|
||||
for i in range(rows):
|
||||
if i + time_stamp >= rows:
|
||||
break
|
||||
if i + time_stamp < rows - 1:
|
||||
train_data[i] = data[i:i + time_stamp]
|
||||
train_label[i] = data[i + time_stamp]
|
||||
|
||||
print("重叠采样以后:")
|
||||
print("data:", train_data) # (300334,120,10)
|
||||
print("label:", train_label) # (300334,10)
|
||||
|
||||
if is_Healthy:
|
||||
train_label2 = np.ones(shape=[train_label.shape[0]])
|
||||
else:
|
||||
train_label2 = np.zeros(shape=[train_label.shape[0]])
|
||||
|
||||
print("label2:", train_label2)
|
||||
|
||||
return train_data, train_label, train_label2
|
||||
|
||||
|
||||
# RepConv: RepVGG-style re-parameterizable convolution (k-sized conv + 1x1 conv + identity, each batch-normalized, summed, then ReLU)
|
||||
def RepConv(input_tensor, k=3):
|
||||
_, _, output_dim = input_tensor.shape
|
||||
conv1 = tf.keras.layers.Conv1D(filters=output_dim, kernel_size=k, strides=1, padding='SAME')(input_tensor)
|
||||
b1 = tf.keras.layers.BatchNormalization()(conv1)
|
||||
|
||||
conv2 = tf.keras.layers.Conv1D(filters=output_dim, kernel_size=1, strides=1, padding='SAME')(input_tensor)
|
||||
b2 = tf.keras.layers.BatchNormalization()(conv2)
|
||||
|
||||
b3 = tf.keras.layers.BatchNormalization()(input_tensor)
|
||||
|
||||
out = tf.keras.layers.Add()([b1, b2, b3])
|
||||
out = tf.nn.relu(out)
|
||||
return out
|
||||
|
||||
|
||||
# RepBlock: a stack of num RepConv layers
|
||||
def RepBlock(input_tensor, num: int = 3):
|
||||
for i in range(num):
|
||||
input_tensor = RepConv(input_tensor)
|
||||
return input_tensor
|
||||
|
||||
|
||||
# GAP: global-average-pooling channel attention
|
||||
def Global_avg_channelAttention(input_tensor):
|
||||
_, length, channel = input_tensor.shape
|
||||
DWC1 = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
|
||||
GAP = tf.keras.layers.GlobalAvgPool1D()(DWC1)
|
||||
c1 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GAP)
|
||||
s1 = tf.nn.sigmoid(c1)
|
||||
output = tf.multiply(input_tensor, s1)
|
||||
return output
|
||||
|
||||
|
||||
# GDP: global dynamic pooling (channel attention)
|
||||
def Global_Dynamic_channelAttention(input_tensor):
|
||||
_, length, channel = input_tensor.shape
|
||||
DWC1 = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
|
||||
|
||||
# GAP
|
||||
GAP = tf.keras.layers.GlobalAvgPool1D()(DWC1)
|
||||
c1 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GAP)
|
||||
s1 = tf.nn.sigmoid(c1)
|
||||
|
||||
# GMP
|
||||
GMP = tf.keras.layers.GlobalMaxPool1D()(DWC1)
|
||||
c2 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GMP)
|
||||
s3 = tf.nn.sigmoid(c2)
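# Note (added): the GMP branch output s3 is computed but never used below; the output is weighted by the GAP branch (s1) only, so the two were presumably meant to be combined.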
|
||||
|
||||
output = tf.multiply(input_tensor, s1)
|
||||
return output
|
||||
|
||||
|
||||
# Min-max normalization
|
||||
def normalization(data):
|
||||
rows, cols = data.shape
|
||||
print("归一化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# scale each column to [0, 1]
|
||||
max = np.max(data, axis=0)
|
||||
max = np.broadcast_to(max, [rows, cols])
|
||||
min = np.min(data, axis=0)
|
||||
min = np.broadcast_to(min, [rows, cols])
|
||||
|
||||
data = (data - min) / (max - min)
|
||||
print("归一化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
# Standardization (z-score scaling)
|
||||
def Regularization(data):
|
||||
rows, cols = data.shape
|
||||
print("正则化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# subtract the column mean and divide by the column standard deviation
|
||||
mean = np.mean(data, axis=0)
|
||||
mean = np.broadcast_to(mean, shape=[rows, cols])
|
||||
dst = np.sqrt(np.var(data, axis=0))
|
||||
dst = np.broadcast_to(dst, shape=[rows, cols])
|
||||
data = (data - mean) / dst
|
||||
print("正则化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
pass
|
||||
|
||||
|
||||
def EWMA(data, K=K, namuda=namuda):
|
||||
# the meaning of t (the EWMA sample index) is not settled yet; fixed to 0 for now
|
||||
t = 0
|
||||
mid = np.mean(data, axis=0)
|
||||
standard = np.sqrt(np.var(data, axis=0))
|
||||
UCL = mid + K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
|
||||
LCL = mid - K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
|
||||
return mid, UCL, LCL
|
||||
pass
|
||||
|
||||
|
||||
def get_MSE(data, label, new_model):
|
||||
predicted_data = new_model.predict(data)
|
||||
|
||||
temp = np.abs(predicted_data - label)
|
||||
temp1 = (temp - np.broadcast_to(np.mean(temp, axis=0), shape=predicted_data.shape))
|
||||
temp2 = np.broadcast_to(np.sqrt(np.var(temp, axis=0)), shape=predicted_data.shape)
|
||||
temp3 = temp1 / temp2
|
||||
mse = np.sum((temp1 / temp2) ** 2, axis=1)
|
||||
print("z:", mse)
|
||||
print(mse.shape)
|
||||
|
||||
# mse=np.mean((predicted_data-label)**2,axis=1)
|
||||
print("mse", mse)
|
||||
|
||||
dims, = mse.shape
|
||||
|
||||
mean = np.mean(mse)
|
||||
std = np.sqrt(np.var(mse))
|
||||
max = mean + 3 * std
|
||||
# min = mean-3*std
|
||||
max = np.broadcast_to(max, shape=[dims, ])
|
||||
# min = np.broadcast_to(min,shape=[dims,])
|
||||
mean = np.broadcast_to(mean, shape=[dims, ])
|
||||
|
||||
# plt.plot(max)
|
||||
# plt.plot(mse)
|
||||
# plt.plot(mean)
|
||||
# # plt.plot(min)
|
||||
# plt.show()
|
||||
#
|
||||
#
|
||||
return mse, mean, max
|
||||
# pass
|
||||
|
||||
|
||||
def condition_monitoring_model():
|
||||
input = tf.keras.Input(shape=[time_stamp, feature_num])
|
||||
conv1 = tf.keras.layers.Conv1D(filters=256, kernel_size=1)(input)
|
||||
GRU1 = tf.keras.layers.GRU(128, return_sequences=False)(conv1)
|
||||
d1 = tf.keras.layers.Dense(300)(GRU1)
|
||||
output = tf.keras.layers.Dense(10)(d1)
|
||||
|
||||
model = tf.keras.Model(inputs=input, outputs=output)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
# train_data: (300455, 120, 10)
|
||||
# train_label1: (300455, 10)
|
||||
# train_label2: (300455,)
|
||||
def shuffle(train_data, train_label1, train_label2, is_split: bool = False, split_size: float = 0.2):
|
||||
(train_data, test_data, train_label1, test_label1, train_label2, test_label2) = train_test_split(train_data,
|
||||
train_label1,
|
||||
train_label2,
|
||||
test_size=split_size,
|
||||
shuffle=True,
|
||||
random_state=100)
|
||||
if is_split:
|
||||
return train_data, train_label1, train_label2, test_data, test_label1, test_label2
|
||||
train_data = np.concatenate([train_data, test_data], axis=0)
|
||||
train_label1 = np.concatenate([train_label1, test_label1], axis=0)
|
||||
train_label2 = np.concatenate([train_label2, test_label2], axis=0)
|
||||
# print(train_data.shape)
|
||||
# print(train_label1.shape)
|
||||
# print(train_label2.shape)
|
||||
# print(train_data.shape)
|
||||
|
||||
return train_data, train_label1, train_label2
|
||||
pass
|
||||
|
||||
|
||||
def split_test_data(healthy_data, healthy_label1, healthy_label2, unhealthy_data, unhealthy_label1, unhealthy_label2,
|
||||
split_size: float = 0.2):
|
||||
data = np.concatenate([healthy_data, unhealthy_data], axis=0)
|
||||
label1 = np.concatenate([healthy_label1, unhealthy_label1], axis=0)
|
||||
label2 = np.concatenate([healthy_label2, unhealthy_label2], axis=0)
|
||||
(train_data, test_data, train_label1, test_label1, train_label2, test_label2) = train_test_split(data,
|
||||
label1,
|
||||
label2,
|
||||
test_size=split_size,
|
||||
shuffle=True,
|
||||
random_state=100)
|
||||
|
||||
# print(train_data.shape)
|
||||
# print(train_label1.shape)
|
||||
# print(train_label2.shape)
|
||||
# print(train_data.shape)
|
||||
|
||||
return train_data, train_label1, train_label2, test_data, test_label1, test_label2
|
||||
|
||||
pass
|
||||
|
||||
|
||||
# train_data: (300455, 120, 10)
|
||||
# train_label1: (300455, 10)
|
||||
# train_label2: (300455,)
|
||||
def train_step_one(train_data, train_label1, train_label2):
|
||||
model = Joint_Monitoring()
|
||||
# # # # TODO the model must be built / called once before model.summary() can be printed
|
||||
# model.build(input_shape=(batch_size, filter_num, dims))
|
||||
# model.summary()
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
learning_rate = 1e-3
|
||||
for epoch in range(EPOCH):
|
||||
|
||||
print()
|
||||
print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
|
||||
if epoch == 0:
|
||||
train_data, train_label1, train_label2, val_data, val_label1, val_label2 = shuffle(train_data, train_label1,
|
||||
train_label2,
|
||||
is_split=True)
|
||||
# print()
|
||||
# print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
# z tells train() which mini-batch of this epoch we are on
|
||||
z = 0
|
||||
# k counts samples; a training step is run once every batch_size samples
|
||||
k = 1
|
||||
for data_1, label_1, label_2 in zip(train_data, train_label1, train_label2):
|
||||
size, _, _ = train_data.shape
|
||||
data_1 = tf.expand_dims(data_1, axis=0)
|
||||
label_1 = tf.expand_dims(label_1, axis=0)
|
||||
label_2 = tf.expand_dims(label_2, axis=0)
|
||||
if batch_size != 1:
|
||||
if k % batch_size == 1:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
else:
|
||||
data = tf.concat([data, data_1], axis=0)
|
||||
label1 = tf.concat([label1, label_1], axis=0)
|
||||
label2 = tf.concat([label2, label_2], axis=0)
|
||||
else:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
|
||||
if k % batch_size == 0:
|
||||
# label = tf.expand_dims(label, axis=-1)
|
||||
loss_value = model.train(input_tensor=data, label1=label1, label2=label2, learning_rate=learning_rate,
|
||||
is_first_time=True)
|
||||
print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy())
|
||||
k = 0
|
||||
z = z + 1
|
||||
k = k + 1
|
||||
val_loss = model.get_val_loss(val_data=val_data, val_label1=val_label1, val_label2=val_label2,
|
||||
is_first_time=True)
|
||||
SaveBestModel(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
|
||||
# SaveBestH5Model(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
|
||||
history_val_loss.append(val_loss)
|
||||
history_loss.append(loss_value.numpy())
|
||||
print('Training loss is :', loss_value.numpy())
|
||||
print('Validating loss is :', val_loss.numpy())
|
||||
if IsStopTraining(history_loss=history_val_loss, patience=7):
|
||||
break
|
||||
if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
|
||||
if learning_rate >= 1e-4:
|
||||
learning_rate = learning_rate * 0.1
|
||||
pass
|
||||
|
||||
|
||||
def train_step_two(step_one_model, step_two_model, train_data, train_label1, train_label2):
|
||||
# step_two_model = Joint_Monitoring()
|
||||
# step_two_model.build(input_shape=(batch_size, time_stamp, feature_num))
|
||||
# step_two_model.summary()
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
history_accuracy = []
|
||||
learning_rate = 1e-3
|
||||
for epoch in range(EPOCH):
|
||||
print()
|
||||
print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
|
||||
if epoch == 0:
|
||||
train_data, train_label1, train_label2, val_data, val_label1, val_label2 = shuffle(train_data, train_label1,
|
||||
train_label2,
|
||||
is_split=True)
|
||||
# print()
|
||||
# print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
# z tells train() which mini-batch of this epoch we are on
|
||||
z = 0
|
||||
# k counts samples; a training step is run once every batch_size samples
|
||||
k = 1
|
||||
accuracy_num = 0
|
||||
for data_1, label_1, label_2 in zip(train_data, train_label1, train_label2):
|
||||
size, _, _ = train_data.shape
|
||||
data_1 = tf.expand_dims(data_1, axis=0)
|
||||
label_1 = tf.expand_dims(label_1, axis=0)
|
||||
label_2 = tf.expand_dims(label_2, axis=0)
|
||||
if batch_size != 1:
|
||||
if k % batch_size == 1:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
else:
|
||||
data = tf.concat([data, data_1], axis=0)
|
||||
label1 = tf.concat([label1, label_1], axis=0)
|
||||
label2 = tf.concat([label2, label_2], axis=0)
|
||||
else:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
|
||||
if k % batch_size == 0:
|
||||
# label = tf.expand_dims(label, axis=-1)
|
||||
output1, output2, output3, _ = step_one_model.call(inputs=data, is_first_time=True)
|
||||
loss_value, accuracy_value = step_two_model.train(input_tensor=data, label1=label1, label2=label2,
|
||||
learning_rate=learning_rate,
|
||||
is_first_time=False, pred_3=output1, pred_4=output2,
|
||||
pred_5=output3)
|
||||
accuracy_num += accuracy_value
|
||||
print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy(), "| accuracy:",
|
||||
accuracy_num / ((z + 1) * batch_size))
|
||||
k = 0
|
||||
z = z + 1
|
||||
k = k + 1
|
||||
|
||||
val_loss, val_accuracy = step_two_model.get_val_loss(val_data=val_data, val_label1=val_label1,
|
||||
val_label2=val_label2,
|
||||
is_first_time=False, step_one_model=step_one_model)
|
||||
SaveBestModelByAccuracy(model=step_two_model, save_name=save_step_two_name, history_accuracy=history_accuracy,
|
||||
accuracy_value=val_accuracy)
|
||||
history_val_loss.append(val_loss)
|
||||
history_loss.append(loss_value.numpy())
|
||||
print('Training loss is : {0} | Training accuracy is : {1}'.format(loss_value.numpy(),
|
||||
accuracy_num / ((z + 1) * batch_size)))
|
||||
print('Validating loss is : {0} | Validating accuracy is : {1}'.format(val_loss.numpy(), val_accuracy))
|
||||
if IsStopTraining(history_loss=history_val_loss, patience=7):
|
||||
break
|
||||
if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
|
||||
if learning_rate >= 1e-4:
|
||||
learning_rate = learning_rate * 0.1
|
||||
pass
|
||||
|
||||
|
||||
def test(step_one_model, step_two_model, test_data, test_label1, test_label2):
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
|
||||
val_loss, val_accuracy = step_two_model.get_val_loss(val_data=test_data, val_label1=test_label1,
|
||||
val_label2=test_label2,
|
||||
is_first_time=False, step_one_model=step_one_model)
|
||||
|
||||
history_val_loss.append(val_loss)
|
||||
print("val_accuracy:", val_accuracy)
|
||||
print("val_loss:", val_loss)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
total_data = loadData.execute(N=feature_num, file_name=file_name)
|
||||
total_data = normalization(data=total_data)
|
||||
train_data_healthy, train_label1_healthy, train_label2_healthy = get_training_data_overlapping(
|
||||
total_data[:healthy_date, :], is_Healthy=True)
|
||||
train_data_unhealthy, train_label1_unhealthy, train_label2_unhealthy = get_training_data_overlapping(
|
||||
total_data[healthy_date - time_stamp + unhealthy_patience:unhealthy_date, :],
|
||||
is_Healthy=False)
|
||||
# TODO step-one training
|
||||
# quick single-batch test run
|
||||
# train_step_one(train_data=train_data_healthy[:32, :, :], train_label1=train_label1_healthy[:32, :],train_label2=train_label2_healthy[:32, ])
|
||||
# train_step_one(train_data=train_data_healthy, train_label1=train_label1_healthy,train_label2=train_label2_healthy)
|
||||
|
||||
# Load the model trained in step one: one copy continues training, the other is only used for inference
|
||||
step_one_model = Joint_Monitoring()
|
||||
step_one_model.load_weights(save_name)
|
||||
#
|
||||
# step_two_model = Joint_Monitoring()
|
||||
# step_two_model.load_weights(save_name)
|
||||
|
||||
# TODO step-two training
|
||||
### healthy_data.shape: (300333,120,10)
|
||||
### unhealthy_data.shape: (16594,10)
|
||||
healthy_size, _, _ = train_data_healthy.shape
|
||||
unhealthy_size, _, _ = train_data_unhealthy.shape
|
||||
train_data, train_label1, train_label2, test_data, test_label1, test_label2 = split_test_data(
|
||||
healthy_data=train_data_healthy[healthy_size - 2 * unhealthy_size:, :, :],
|
||||
healthy_label1=train_label1_healthy[healthy_size - 2 * unhealthy_size:, :],
|
||||
healthy_label2=train_label2_healthy[healthy_size - 2 * unhealthy_size:, ], unhealthy_data=train_data_unhealthy,
|
||||
unhealthy_label1=train_label1_unhealthy, unhealthy_label2=train_label2_unhealthy)
|
||||
# train_step_two(step_one_model=step_one_model, step_two_model=step_two_model,
|
||||
# train_data=train_data,
|
||||
# train_label1=train_label1, train_label2=np.expand_dims(train_label2, axis=-1))
|
||||
|
||||
# TODO evaluate on the test set
|
||||
step_two_model = Joint_Monitoring()
|
||||
step_two_model.load_weights(save_step_two_name)
|
||||
test(step_one_model=step_one_model, step_two_model=step_two_model, test_data=test_data, test_label1=test_label1,
|
||||
test_label2=np.expand_dims(test_label2, axis=-1))
|
||||
|
||||
pass
|
||||
|
|
@ -0,0 +1,576 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# coding: utf-8
|
||||
import tensorflow as tf
|
||||
import tensorflow.keras
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from model.DepthwiseCon1D.DepthwiseConv1D import DepthwiseConv1D
|
||||
from model.Dynamic_channelAttention.Dynamic_channelAttention import DynamicChannelAttention
|
||||
from condition_monitoring.data_deal import loadData
|
||||
from model.Joint_Monitoring.Joint_Monitoring3 import Joint_Monitoring
|
||||
|
||||
from model.CommonFunction.CommonFunction import *
|
||||
from sklearn.model_selection import train_test_split
|
||||
from tensorflow.keras.models import load_model, save_model
|
||||
|
||||
'''
|
||||
@Author : dingjiawen
|
||||
@Date : 2022/7/8 10:29
|
||||
@Usage : Combine the prediction and classification approaches for joint condition monitoring
|
||||
@Desc : RepVGG + upsampling + GRU for reconstruction, followed by GDP (global dynamic pooling) and a classifier;
|
||||
an MSE loss whose weight decays with the epoch plus a cross-entropy loss whose weight grows with the epoch
|
||||
'''
|
||||
|
||||
'''Hyperparameter settings'''
|
||||
time_stamp = 120
|
||||
feature_num = 10
|
||||
batch_size = 16
|
||||
learning_rate = 0.001
|
||||
EPOCH = 101
|
||||
model_name = "joint"
|
||||
'''EWMA hyperparameters'''
|
||||
K = 18
|
||||
namuda = 0.01
|
||||
'''Save paths'''
|
||||
|
||||
save_name = "../hard_model/weight/{0}_timestamp{1}_feature{2}_weight_epoch8/weight".format(model_name,
|
||||
time_stamp,
|
||||
feature_num,
|
||||
batch_size,
|
||||
EPOCH)
|
||||
save_step_two_name = "../hard_model/two_weight/{0}_timestamp{1}_feature{2}_weight_epoch14/weight".format(model_name,
|
||||
time_stamp,
|
||||
feature_num,
|
||||
batch_size,
|
||||
EPOCH)
|
||||
|
||||
# save_name = "../model/joint/{0}_timestamp{1}_feature{2}.h5".format(model_name,
|
||||
# time_stamp,
|
||||
# feature_num,
|
||||
# batch_size,
|
||||
# EPOCH)
|
||||
# save_step_two_name = "../model/joint_two/{0}_timestamp{1}_feature{2}.h5".format(model_name,
|
||||
# time_stamp,
|
||||
# feature_num,
|
||||
# batch_size,
|
||||
# EPOCH)
|
||||
'''Data file'''
|
||||
file_name = "G:\data\SCADA数据\jb4q_8_delete_total_zero.csv"
|
||||
|
||||
'''
|
||||
File description: jb4q_8_delete_total_zero.csv only removes the columns that are entirely zero.
|
||||
Rows 0:415548 are healthy data (2019/7/30 00:00:00 - 2019/9/18 11:14:00)
|
||||
Rows 415549:432153 are faulty data (2019/9/18 11:21:01 - 2021/1/18 00:00:00)
|
||||
'''
|
||||
'''File parameters'''
|
||||
# last healthy row index
|
||||
healthy_date = 415548
|
||||
# last faulty row index
|
||||
unhealthy_date = 432153
|
||||
# fault-onset tolerance (number of rows)
|
||||
unhealthy_patience = 5
|
||||
|
||||
|
||||
def remove(data, time_stamp=time_stamp):
|
||||
rows, cols = data.shape
|
||||
print("remove_data.shape:", data.shape)
|
||||
num = int(rows / time_stamp)
|
||||
|
||||
return data[:num * time_stamp, :]
|
||||
pass
|
||||
|
||||
|
||||
# Non-overlapping sampling
|
||||
def get_training_data(data, time_stamp: int = time_stamp):
|
||||
removed_data = remove(data=data)
|
||||
rows, cols = removed_data.shape
|
||||
print("removed_data.shape:", data.shape)
|
||||
print("removed_data:", removed_data)
|
||||
train_data = np.reshape(removed_data, [-1, time_stamp, cols])
|
||||
print("train_data:", train_data)
|
||||
batchs, time_stamp, cols = train_data.shape
|
||||
|
||||
for i in range(1, batchs):
|
||||
each_label = np.expand_dims(train_data[i, 0, :], axis=0)
|
||||
if i == 1:
|
||||
train_label = each_label
|
||||
else:
|
||||
train_label = np.concatenate([train_label, each_label], axis=0)
|
||||
|
||||
print("train_data.shape:", train_data.shape)
|
||||
print("train_label.shape", train_label.shape)
|
||||
return train_data[:-1, :], train_label
|
||||
|
||||
|
||||
# Overlapping (sliding-window) sampling
|
||||
def get_training_data_overlapping(data, time_stamp: int = time_stamp, is_Healthy: bool = True):
|
||||
rows, cols = data.shape
|
||||
train_data = np.empty(shape=[rows - time_stamp - 1, time_stamp, cols])
|
||||
train_label = np.empty(shape=[rows - time_stamp - 1, cols])
|
||||
for i in range(rows):
|
||||
if i + time_stamp >= rows:
|
||||
break
|
||||
if i + time_stamp < rows - 1:
|
||||
train_data[i] = data[i:i + time_stamp]
|
||||
train_label[i] = data[i + time_stamp]
|
||||
|
||||
print("重叠采样以后:")
|
||||
print("data:", train_data) # (300334,120,10)
|
||||
print("label:", train_label) # (300334,10)
|
||||
|
||||
if is_Healthy:
|
||||
train_label2 = np.ones(shape=[train_label.shape[0]])
|
||||
else:
|
||||
train_label2 = np.zeros(shape=[train_label.shape[0]])
|
||||
|
||||
print("label2:", train_label2)
|
||||
|
||||
return train_data, train_label, train_label2
|
||||
|
||||
|
||||
# RepConv: RepVGG-style re-parameterizable convolution (k-sized conv + 1x1 conv + identity, each batch-normalized, summed, then ReLU)
|
||||
def RepConv(input_tensor, k=3):
|
||||
_, _, output_dim = input_tensor.shape
|
||||
conv1 = tf.keras.layers.Conv1D(filters=output_dim, kernel_size=k, strides=1, padding='SAME')(input_tensor)
|
||||
b1 = tf.keras.layers.BatchNormalization()(conv1)
|
||||
|
||||
conv2 = tf.keras.layers.Conv1D(filters=output_dim, kernel_size=1, strides=1, padding='SAME')(input_tensor)
|
||||
b2 = tf.keras.layers.BatchNormalization()(conv2)
|
||||
|
||||
b3 = tf.keras.layers.BatchNormalization()(input_tensor)
|
||||
|
||||
out = tf.keras.layers.Add()([b1, b2, b3])
|
||||
out = tf.nn.relu(out)
|
||||
return out
|
||||
|
||||
|
||||
# RepBlock: a stack of num RepConv layers
|
||||
def RepBlock(input_tensor, num: int = 3):
|
||||
for i in range(num):
|
||||
input_tensor = RepConv(input_tensor)
|
||||
return input_tensor
|
||||
|
||||
|
||||
# GAP: global-average-pooling channel attention
|
||||
def Global_avg_channelAttention(input_tensor):
|
||||
_, length, channel = input_tensor.shape
|
||||
DWC1 = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
|
||||
GAP = tf.keras.layers.GlobalAvgPool1D()(DWC1)
|
||||
c1 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GAP)
|
||||
s1 = tf.nn.sigmoid(c1)
|
||||
output = tf.multiply(input_tensor, s1)
|
||||
return output
|
||||
|
||||
|
||||
# GDP: global dynamic pooling (channel attention)
|
||||
def Global_Dynamic_channelAttention(input_tensor):
|
||||
_, length, channel = input_tensor.shape
|
||||
DWC1 = DepthwiseConv1D(kernel_size=1, padding='SAME')(input_tensor)
|
||||
|
||||
# GAP
|
||||
GAP = tf.keras.layers.GlobalAvgPool1D()(DWC1)
|
||||
c1 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GAP)
|
||||
s1 = tf.nn.sigmoid(c1)
|
||||
|
||||
# GMP
|
||||
GMP = tf.keras.layers.GlobalMaxPool1D()(DWC1)
|
||||
c2 = tf.keras.layers.Conv1D(filters=channel, kernel_size=1, padding='SAME')(GMP)
|
||||
s3 = tf.nn.sigmoid(c2)
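# Note (added): as in the previous file, the GMP branch output s3 is never used below; only the GAP branch (s1) weights the output.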
|
||||
|
||||
output = tf.multiply(input_tensor, s1)
|
||||
return output
|
||||
|
||||
|
||||
# Min-max normalization
|
||||
def normalization(data):
|
||||
rows, cols = data.shape
|
||||
print("归一化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# scale each column to [0, 1]
|
||||
max = np.max(data, axis=0)
|
||||
max = np.broadcast_to(max, [rows, cols])
|
||||
min = np.min(data, axis=0)
|
||||
min = np.broadcast_to(min, [rows, cols])
|
||||
|
||||
data = (data - min) / (max - min)
|
||||
print("归一化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
# Standardization (z-score scaling)
|
||||
def Regularization(data):
|
||||
rows, cols = data.shape
|
||||
print("正则化之前:", data)
|
||||
print(data.shape)
|
||||
print("======================")
|
||||
|
||||
# subtract the column mean and divide by the column standard deviation
|
||||
mean = np.mean(data, axis=0)
|
||||
mean = np.broadcast_to(mean, shape=[rows, cols])
|
||||
dst = np.sqrt(np.var(data, axis=0))
|
||||
dst = np.broadcast_to(dst, shape=[rows, cols])
|
||||
data = (data - mean) / dst
|
||||
print("正则化之后:", data)
|
||||
print(data.shape)
|
||||
|
||||
return data
|
||||
pass
|
||||
|
||||
|
||||
def EWMA(data, K=K, namuda=namuda):
|
||||
# the meaning of t (the EWMA sample index) is not settled yet; fixed to 0 for now
|
||||
t = 0
|
||||
mid = np.mean(data, axis=0)
|
||||
standard = np.sqrt(np.var(data, axis=0))
|
||||
UCL = mid + K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
|
||||
LCL = mid - K * standard * np.sqrt(namuda / (2 - namuda) * (1 - (1 - namuda) ** 2 * t))
|
||||
return mid, UCL, LCL
|
||||
pass
|
||||
|
||||
|
||||
def get_MSE(data, label, new_model):
|
||||
predicted_data = new_model.predict(data)
|
||||
|
||||
temp = np.abs(predicted_data - label)
|
||||
temp1 = (temp - np.broadcast_to(np.mean(temp, axis=0), shape=predicted_data.shape))
|
||||
temp2 = np.broadcast_to(np.sqrt(np.var(temp, axis=0)), shape=predicted_data.shape)
|
||||
temp3 = temp1 / temp2
|
||||
mse = np.sum((temp1 / temp2) ** 2, axis=1)
|
||||
print("z:", mse)
|
||||
print(mse.shape)
|
||||
|
||||
# mse=np.mean((predicted_data-label)**2,axis=1)
|
||||
print("mse", mse)
|
||||
|
||||
dims, = mse.shape
|
||||
|
||||
mean = np.mean(mse)
|
||||
std = np.sqrt(np.var(mse))
|
||||
max = mean + 3 * std
|
||||
# min = mean-3*std
|
||||
max = np.broadcast_to(max, shape=[dims, ])
|
||||
# min = np.broadcast_to(min,shape=[dims,])
|
||||
mean = np.broadcast_to(mean, shape=[dims, ])
|
||||
|
||||
# plt.plot(max)
|
||||
# plt.plot(mse)
|
||||
# plt.plot(mean)
|
||||
# # plt.plot(min)
|
||||
# plt.show()
|
||||
#
|
||||
#
|
||||
return mse, mean, max
|
||||
# pass
|
||||
|
||||
|
||||
def condition_monitoring_model():
|
||||
input = tf.keras.Input(shape=[time_stamp, feature_num])
|
||||
conv1 = tf.keras.layers.Conv1D(filters=256, kernel_size=1)(input)
|
||||
GRU1 = tf.keras.layers.GRU(128, return_sequences=False)(conv1)
|
||||
d1 = tf.keras.layers.Dense(300)(GRU1)
|
||||
output = tf.keras.layers.Dense(10)(d1)
|
||||
|
||||
model = tf.keras.Model(inputs=input, outputs=output)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
# train_data: (300455, 120, 10)
|
||||
# train_label1: (300455, 10)
|
||||
# train_label2: (300455,)
|
||||
def shuffle(train_data, train_label1, train_label2, is_split: bool = False, split_size: float = 0.2):
|
||||
(train_data, test_data, train_label1, test_label1, train_label2, test_label2) = train_test_split(train_data,
|
||||
train_label1,
|
||||
train_label2,
|
||||
test_size=split_size,
|
||||
shuffle=True,
|
||||
random_state=100)
|
||||
if is_split:
|
||||
return train_data, train_label1, train_label2, test_data, test_label1, test_label2
|
||||
train_data = np.concatenate([train_data, test_data], axis=0)
|
||||
train_label1 = np.concatenate([train_label1, test_label1], axis=0)
|
||||
train_label2 = np.concatenate([train_label2, test_label2], axis=0)
|
||||
# print(train_data.shape)
|
||||
# print(train_label1.shape)
|
||||
# print(train_label2.shape)
|
||||
# print(train_data.shape)
|
||||
|
||||
return train_data, train_label1, train_label2
|
||||
pass
|
||||
|
||||
|
||||
def split_test_data(healthy_data, healthy_label1, healthy_label2, unhealthy_data, unhealthy_label1, unhealthy_label2,
|
||||
split_size: float = 0.2, shuffle: bool = True):
|
||||
data = np.concatenate([healthy_data, unhealthy_data], axis=0)
|
||||
label1 = np.concatenate([healthy_label1, unhealthy_label1], axis=0)
|
||||
label2 = np.concatenate([healthy_label2, unhealthy_label2], axis=0)
|
||||
(train_data, test_data, train_label1, test_label1, train_label2, test_label2) = train_test_split(data,
|
||||
label1,
|
||||
label2,
|
||||
test_size=split_size,
|
||||
shuffle=shuffle,
|
||||
random_state=100)
|
||||
|
||||
# print(train_data.shape)
|
||||
# print(train_label1.shape)
|
||||
# print(train_label2.shape)
|
||||
# print(train_data.shape)
|
||||
|
||||
return train_data, train_label1, train_label2, test_data, test_label1, test_label2
|
||||
|
||||
pass
|
||||
|
||||
|
||||
# train_data: (300455, 120, 10)
|
||||
# train_label1: (300455, 10)
|
||||
# train_label2: (300455,)
|
||||
def train_step_one(train_data, train_label1, train_label2):
|
||||
model = Joint_Monitoring()
|
||||
# # # # TODO the model must be built / called once before model.summary() can be printed
|
||||
# model.build(input_shape=(batch_size, filter_num, dims))
|
||||
# model.summary()
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
learning_rate = 1e-3
|
||||
for epoch in range(EPOCH):
|
||||
|
||||
print()
|
||||
print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
|
||||
if epoch == 0:
|
||||
train_data, train_label1, train_label2, val_data, val_label1, val_label2 = shuffle(train_data, train_label1,
|
||||
train_label2,
|
||||
is_split=True)
|
||||
# print()
|
||||
# print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
# z tells train() which mini-batch of this epoch we are on
|
||||
z = 0
|
||||
# k counts samples; a training step is run once every batch_size samples
|
||||
k = 1
|
||||
for data_1, label_1, label_2 in zip(train_data, train_label1, train_label2):
|
||||
size, _, _ = train_data.shape
|
||||
data_1 = tf.expand_dims(data_1, axis=0)
|
||||
label_1 = tf.expand_dims(label_1, axis=0)
|
||||
label_2 = tf.expand_dims(label_2, axis=0)
|
||||
if batch_size != 1:
|
||||
if k % batch_size == 1:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
else:
|
||||
data = tf.concat([data, data_1], axis=0)
|
||||
label1 = tf.concat([label1, label_1], axis=0)
|
||||
label2 = tf.concat([label2, label_2], axis=0)
|
||||
else:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
|
||||
if k % batch_size == 0:
|
||||
# label = tf.expand_dims(label, axis=-1)
|
||||
loss_value, accuracy_value = model.train(input_tensor=data, label1=label1, label2=label2,
|
||||
learning_rate=learning_rate,
|
||||
is_first_time=True)
|
||||
print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy())
|
||||
k = 0
|
||||
z = z + 1
|
||||
k = k + 1
|
||||
val_loss, val_accuracy = model.get_val_loss(val_data=val_data, val_label1=val_label1, val_label2=val_label2,
|
||||
is_first_time=True)
|
||||
SaveBestModel(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
|
||||
# SaveBestH5Model(model=model, save_name=save_name, history_loss=history_val_loss, loss_value=val_loss.numpy())
|
||||
history_val_loss.append(val_loss)
|
||||
history_loss.append(loss_value.numpy())
|
||||
print('Training loss is :', loss_value.numpy())
|
||||
print('Validating loss is :', val_loss.numpy())
|
||||
if IsStopTraining(history_loss=history_val_loss, patience=7):
|
||||
break
|
||||
if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
|
||||
if learning_rate >= 1e-4:
|
||||
learning_rate = learning_rate * 0.1
|
||||
pass
|
||||
|
||||
|
||||
def train_step_two(step_one_model, step_two_model, train_data, train_label1, train_label2):
|
||||
# step_two_model = Joint_Monitoring()
|
||||
# step_two_model.build(input_shape=(batch_size, time_stamp, feature_num))
|
||||
# step_two_model.summary()
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
history_accuracy = []
|
||||
learning_rate = 1e-3
|
||||
for epoch in range(EPOCH):
|
||||
print()
|
||||
print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
train_data, train_label1, train_label2 = shuffle(train_data, train_label1, train_label2)
|
||||
if epoch == 0:
|
||||
train_data, train_label1, train_label2, val_data, val_label1, val_label2 = shuffle(train_data, train_label1,
|
||||
train_label2,
|
||||
is_split=True)
|
||||
# print()
|
||||
# print("EPOCH:", epoch, "/", EPOCH, ":")
|
||||
# z tells train() which mini-batch of this epoch we are on
|
||||
z = 0
|
||||
# k counts samples; a training step is run once every batch_size samples
|
||||
k = 1
|
||||
accuracy_num = 0
|
||||
for data_1, label_1, label_2 in zip(train_data, train_label1, train_label2):
|
||||
size, _, _ = train_data.shape
|
||||
data_1 = tf.expand_dims(data_1, axis=0)
|
||||
label_1 = tf.expand_dims(label_1, axis=0)
|
||||
label_2 = tf.expand_dims(label_2, axis=0)
|
||||
if batch_size != 1:
|
||||
if k % batch_size == 1:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
else:
|
||||
data = tf.concat([data, data_1], axis=0)
|
||||
label1 = tf.concat([label1, label_1], axis=0)
|
||||
label2 = tf.concat([label2, label_2], axis=0)
|
||||
else:
|
||||
data = data_1
|
||||
label1 = label_1
|
||||
label2 = label_2
|
||||
|
||||
if k % batch_size == 0:
|
||||
# label = tf.expand_dims(label, axis=-1)
|
||||
output1, output2, output3, _ = step_one_model.call(inputs=data, is_first_time=True)
|
||||
loss_value, accuracy_value = step_two_model.train(input_tensor=data, label1=label1, label2=label2,
|
||||
learning_rate=learning_rate,
|
||||
is_first_time=False, pred_3=output1, pred_4=output2,
|
||||
pred_5=output3)
|
||||
accuracy_num += accuracy_value
|
||||
print(z * batch_size, "/", size, ":===============>", "loss:", loss_value.numpy(), "| accuracy:",
|
||||
accuracy_num / ((z + 1) * batch_size))
|
||||
k = 0
|
||||
z = z + 1
|
||||
k = k + 1
|
||||
|
||||
val_loss, val_accuracy = step_two_model.get_val_loss(val_data=val_data, val_label1=val_label1,
|
||||
val_label2=val_label2,
|
||||
is_first_time=False, step_one_model=step_one_model)
|
||||
SaveBestModelByAccuracy(model=step_two_model, save_name=save_step_two_name, history_accuracy=history_accuracy,
|
||||
accuracy_value=val_accuracy)
|
||||
history_val_loss.append(val_loss)
|
||||
history_loss.append(loss_value.numpy())
|
||||
history_accuracy.append(val_accuracy)
|
||||
print('Training loss is : {0} | Training accuracy is : {1}'.format(loss_value.numpy(),
|
||||
accuracy_num / ((z + 1) * batch_size)))
|
||||
print('Validating loss is : {0} | Validating accuracy is : {1}'.format(val_loss.numpy(), val_accuracy))
|
||||
if IsStopTraining(history_loss=history_val_loss, patience=7):
|
||||
break
|
||||
if Is_Reduce_learning_rate(history_loss=history_val_loss, patience=3):
|
||||
if learning_rate >= 1e-4:
|
||||
learning_rate = learning_rate * 0.1
|
||||
pass
|
||||
|
||||
|
||||
def test(step_one_model, step_two_model, test_data, test_label1, test_label2):
|
||||
history_loss = []
|
||||
history_val_loss = []
|
||||
|
||||
val_loss, val_accuracy = step_two_model.get_val_loss(val_data=test_data, val_label1=test_label1,
|
||||
val_label2=test_label2,
|
||||
is_first_time=False, step_one_model=step_one_model)
|
||||
|
||||
history_val_loss.append(val_loss)
|
||||
print("val_accuracy:", val_accuracy)
|
||||
print("val_loss:", val_loss)
|
||||
|
||||
|
||||
def showResult(step_two_model: Joint_Monitoring, test_data, isPlot: bool = False):
|
||||
# count the model's total number of parameters
|
||||
# step_two_model.count_params()
|
||||
total_result = []
|
||||
size, length, dims = test_data.shape
|
||||
for epoch in range(0, size - batch_size + 1, batch_size):
|
||||
each_test_data = test_data[epoch:epoch + batch_size, :, :]
|
||||
_, _, _, output4 = step_two_model.call(each_test_data, is_first_time=False)
|
||||
total_result.append(output4)
|
||||
total_result = np.reshape(total_result, [total_result.__len__(), -1])
|
||||
total_result = np.reshape(total_result, [-1, ])
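# total_result is the flattened 1-D sequence of the classifier output (output4), i.e. one confidence value per test sample across the whole span.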
|
||||
if isPlot:
|
||||
plt.scatter(list(range(total_result.shape[0])), total_result, c='black', s=10)
|
||||
# draw the horizontal failure-threshold line (y = 0.5)
|
||||
plt.axhline(0.5, c='red', label='Failure threshold')
|
||||
# (optional) arrow pointing at the threshold line above
|
||||
# plt.arrow(35000, 0.9, 33000, 0.75, head_width=0.02, head_length=0.1, shape="full", fc='red', ec='red',
|
||||
# alpha=0.9, overhang=0.5)
|
||||
# plt.text(35000, 0.9, "Truth Fault", fontsize=10, color='black', verticalalignment='top')
|
||||
plt.axvline(test_data.shape[0] * 2 / 3, c='blue', ls='-.')
|
||||
plt.xlabel("time")
|
||||
plt.ylabel("confience")
|
||||
plt.text(total_result.shape[0] * 4 / 5, 0.6, "Fault", fontsize=10, color='black', verticalalignment='top',
|
||||
horizontalalignment='center',
|
||||
bbox={'facecolor': 'grey',
|
||||
'pad': 10})
|
||||
plt.text(total_result.shape[0] * 1 / 3, 0.4, "Norm", fontsize=10, color='black', verticalalignment='top',
|
||||
horizontalalignment='center',
|
||||
bbox={'facecolor': 'grey',
|
||||
'pad': 10})
|
||||
plt.grid()
|
||||
# plt.ylim(0, 1)
|
||||
# plt.xlim(-50, 1300)
|
||||
# plt.legend("", loc='upper left')
|
||||
plt.show()
|
||||
return total_result
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
total_data = loadData.execute(N=feature_num, file_name=file_name)
|
||||
total_data = normalization(data=total_data)
|
||||
train_data_healthy, train_label1_healthy, train_label2_healthy = get_training_data_overlapping(
|
||||
total_data[:healthy_date, :], is_Healthy=True)
|
||||
train_data_unhealthy, train_label1_unhealthy, train_label2_unhealthy = get_training_data_overlapping(
|
||||
total_data[healthy_date - time_stamp + unhealthy_patience:unhealthy_date, :],
|
||||
is_Healthy=False)
|
||||
#### TODO step-one training
|
||||
# quick single-batch test run
|
||||
# train_step_one(train_data=train_data_healthy[:32, :, :], train_label1=train_label1_healthy[:32, :],train_label2=train_label2_healthy[:32, ])
|
||||
# train_step_one(train_data=train_data_healthy, train_label1=train_label1_healthy, train_label2=train_label2_healthy)
|
||||
|
||||
# Load the model trained in step one: one copy continues training, the other is only used for inference
|
||||
# step_one_model = Joint_Monitoring()
|
||||
# step_one_model.load_weights(save_name)
|
||||
#
|
||||
# step_two_model = Joint_Monitoring()
|
||||
# step_two_model.load_weights(save_name)
|
||||
|
||||
#### TODO step-two training
|
||||
### healthy_data.shape: (300333,120,10)
|
||||
### unhealthy_data.shape: (16594,10)
|
||||
healthy_size, _, _ = train_data_healthy.shape
|
||||
unhealthy_size, _, _ = train_data_unhealthy.shape
|
||||
# train_data, train_label1, train_label2, test_data, test_label1, test_label2 = split_test_data(
|
||||
# healthy_data=train_data_healthy[healthy_size - 2 * unhealthy_size:, :, :],
|
||||
# healthy_label1=train_label1_healthy[healthy_size - 2 * unhealthy_size:, :],
|
||||
# healthy_label2=train_label2_healthy[healthy_size - 2 * unhealthy_size:, ], unhealthy_data=train_data_unhealthy,
|
||||
# unhealthy_label1=train_label1_unhealthy, unhealthy_label2=train_label2_unhealthy)
|
||||
# train_step_two(step_one_model=step_one_model, step_two_model=step_two_model,
|
||||
# train_data=train_data,
|
||||
# train_label1=train_label1, train_label2=np.expand_dims(train_label2, axis=-1))
|
||||
|
||||
### TODO evaluate on the held-out test set
|
||||
step_one_model = Joint_Monitoring()
|
||||
step_one_model.load_weights(save_name)
|
||||
step_two_model = Joint_Monitoring()
|
||||
step_two_model.load_weights(save_step_two_name)
|
||||
# test(step_one_model=step_one_model, step_two_model=step_two_model, test_data=test_data, test_label1=test_label1,
|
||||
# test_label2=np.expand_dims(test_label2, axis=-1))
|
||||
|
||||
### TODO plot the results over the full sequence
|
||||
all_data, _, _ = get_training_data_overlapping(
|
||||
total_data[healthy_size - 2 * unhealthy_size:unhealthy_date, :], is_Healthy=True)
|
||||
# all_data = np.concatenate([])
|
||||
# quick single-batch test run
|
||||
# showResult(step_two_model, test_data=all_data[:32], isPlot=True)
|
||||
showResult(step_two_model, test_data=all_data, isPlot=True)
|
||||
|
||||
pass
|
||||