196 lines
5.6 KiB
Python
196 lines
5.6 KiB
Python
# -*- encoding:utf-8 -*-
|
|
|
|
'''
|
|
@Author : dingjiawen
|
|
@Date : 2023/11/8 15:15
|
|
@Usage :
|
|
@Desc :
|
|
'''
|
|
|
|
from lxml import etree
|
|
|
|
'''
|
|
XPath基本规则:
|
|
|
|
1) nodename:选择此节点的所有子节点
|
|
2) /:从当前节点选取直接子节点
|
|
3) //: select descendant nodes (at any depth) of the current node
|
|
4) .:选取当前节点
|
|
5) ..:选取当前节点的父节点
|
|
6) @:选取属性
|
|
|
|
举例:
|
|
//title[@lang='eng'] selects every node named title whose lang attribute equals eng
|
|
'''
|
|
|
|
|
|
def htmlByString():
    """Parse an inline HTML snippet and print the serialized tree.

    The snippet deliberately leaves its last ``<li>`` unclosed; the printed
    output shows how the parser serializes it (lxml's HTML parser is expected
    to repair the markup -- confirm against the lxml documentation).
    """
    markup = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    root = etree.HTML(markup)
    # tostring() yields bytes, so decode before printing.
    serialized = etree.tostring(root).decode('utf-8')
    print(serialized)
|
|
|
|
|
|
def htmlByFile():
    """Parse ``./test.html`` with an HTML parser and print the serialized document."""
    parser = etree.HTMLParser()
    document = etree.parse('./test.html', parser)
    # tostring() yields bytes, so decode before printing.
    print(etree.tostring(document).decode('utf-8'))
|
|
|
|
|
|
def allNode():
    """Print the matches (and the first match) for ``//*`` and ``//li``.

    Both queries run against ``./test.html``. Indexing ``[0]`` raises
    ``IndexError`` if a query matches nothing.
    """
    tree = etree.parse('./test.html', etree.HTMLParser())

    queries = (
        '//*',   # every element in the document
        '//li',  # every <li> element
    )
    for query in queries:
        matches = tree.xpath(query)
        print(matches)
        print(matches[0])
|
|
|
|
|
|
# 子节点匹配
|
|
def childNode():
    """Print child and descendant ``<a>`` matches from ``./test.html``.

    Runs two queries and, for each, prints the full match list followed by
    its first element (``IndexError`` if the list is empty).
    """
    tree = etree.parse('./test.html', etree.HTMLParser())

    queries = (
        '//li/a',   # <a> elements that are direct children of an <li>
        '//ul//a',  # <a> elements at any depth below a <ul>
    )
    for query in queries:
        matches = tree.xpath(query)
        print(matches)
        print(matches[0])
|
|
|
|
|
|
# 父节点匹配
|
|
def fatherNode():
    """Print the ``class`` of the parent of the ``<a href="link4.html">`` node.

    The same lookup is performed twice against ``./test.html``: once with the
    ``..`` abbreviation and once with the explicit ``parent::`` axis.
    """
    tree = etree.parse('./test.html', etree.HTMLParser())

    # ".." steps up to the parent node.
    via_dotdot = tree.xpath('//a[@href="link4.html"]/../@class')
    print(via_dotdot)

    # "parent::*" is the explicit-axis spelling of the same step.
    via_axis = tree.xpath('//a[@href="link4.html"]/parent::*/@class')
    print(via_axis)
|
|
|
|
|
|
# 文本获取
|
|
def textGet():
    """Print text nodes under ``<li class="item-0">`` elements of ``./test.html``.

    The first query takes only the text directly inside the child ``<a>``;
    the second collects text from every descendant of the ``<li>``.
    """
    tree = etree.parse('./test.html', etree.HTMLParser())

    # Text of the <a> child of each li whose class is exactly "item-0".
    # Output recorded by the original author: ['first item', 'fifth item']
    anchor_text = tree.xpath('//li[@class="item-0"]/a/text()')
    print(anchor_text)

    # Text of all descendants of each li whose class is exactly "item-0".
    # Output recorded by the original author: ['first item', 'fifth item', '\r\n ']
    # (depends on the whitespace in test.html, which is not visible here).
    descendant_text = tree.xpath('//li[@class="item-0"]//text()')
    print(descendant_text)
|
|
|
|
|
|
# 属性获取
|
|
def fieldGet():
    """Print the ``href`` attribute of every ``<a>`` directly under an ``<li>``.

    The query applies no class filter; it runs against ``./test.html``.
    """
    tree = etree.parse('./test.html', etree.HTMLParser())

    # "@href" selects the attribute value rather than the element.
    # Output recorded by the original author:
    # ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
    hrefs = tree.xpath('//li/a/@href')
    print(hrefs)
|
|
|
|
|
|
# 属性多值匹配
|
|
def fieldsGet():
    """Contrast exact and ``contains()`` matching on a multi-valued ``class``.

    The inline ``<li>`` carries ``class="li li-first"``, so an equality test
    against ``"li"`` and a substring test give different results; both are
    printed.
    """
    markup = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
    root = etree.HTML(markup)

    # Equality compares the whole attribute string -> [] (no match).
    exact = root.xpath('//li[@class="li"]/a/text()')
    print(exact)

    # contains() is a substring test -> ['first item'].
    partial = root.xpath('//li[contains(@class, "li")]/a/text()')
    print(partial)
|
|
|
|
|
|
# 多属性匹配
|
|
def fieldssGet():
    """Print the link text of an ``<li>`` selected by two attribute conditions.

    The predicate joins a ``contains()`` test on ``class`` and an equality
    test on ``name`` with the XPath ``and`` operator.
    """
    markup = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
    root = etree.HTML(markup)

    # Both conditions must hold for the <li> to be selected.
    query = '//li[contains(@class, "li") and @name="item"]/a/text()'
    print(root.xpath(query))
|
|
|
|
|
|
# 按序选择
|
|
def orderGet():
    """Print link texts selected by positional predicates on ``<li>``.

    Runs four queries against an inline five-item list and prints each
    result list in turn.
    """
    markup = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    root = etree.HTML(markup)

    # XPath positions are 1-based.
    queries = (
        '//li[1]/a/text()',             # ['first item']
        '//li[last()]/a/text()',        # ['fifth item']
        '//li[position()<3]/a/text()',  # ['first item', 'second item']
        '//li[last()-2]/a/text()',      # ['third item']
    )
    for query in queries:
        print(root.xpath(query))
|
|
|
|
|
|
def nodeSelect():
    """Print the results of several XPath axis queries anchored at ``//li[1]``.

    The queries run against an inline five-item list whose first link wraps
    its text in a ``<span>``. Each result list is printed in turn.
    """
    markup = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    root = etree.HTML(markup)

    queries = (
        # ancestor: every ancestor element of the first <li>
        '//li[1]/ancestor::*',
        # ancestor: only ancestors named div
        '//li[1]/ancestor::div',
        # attribute: all attribute values of the first <li>
        '//li[1]/attribute::*',
        # child: direct <a> children with the given href
        '//li[1]/child::a[@href="link1.html"]',
        # descendant: <span> elements at any depth below the first <li>
        '//li[1]/descendant::span',
        # following: the second element after the first <li> in document order
        '//li[1]/following::*[2]',
        # following-sibling: all later siblings of the first <li>
        '//li[1]/following-sibling::*',
    )
    for query in queries:
        print(root.xpath(query))
|
|
|
|
# Script entry point: run the axis-selection demo when executed directly.
if __name__ == "__main__":
    nodeSelect()
|