self_example/Spider/Chapter03_网页数据的提取/XPath库/XpathLearning.py

196 lines
5.6 KiB
Python

# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/8 15:15
@Usage :
@Desc :
'''
from lxml import etree
'''
XPath基本规则:
1) nodename:选择此节点的所有子节点
2) /:从当前节点选取直接子节点
3) //:从当前节点选择子孙节点
4) .:选取当前节点
5) ..:选取当前节点的父节点
6) @:选取属性
举例:
//title[@lang='eng']代表选择所有名称为title,同时属性lang的值为eng的节点
'''
def htmlByString():
    """Build an element tree from an in-memory HTML snippet and print it.

    The last ``<li>`` of the sample markup has no closing tag; the snippet
    is passed to ``etree.HTML`` unchanged, and the serialized tree that is
    printed shows what the parser made of it.
    """
    # The literal is kept flush-left so its content is not altered by the
    # function's indentation.
    markup = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    document = etree.HTML(markup)
    # tostring() yields bytes, so decode before printing.
    print(etree.tostring(document).decode('utf-8'))
def htmlByFile():
    """Parse ``./test.html`` with an HTML parser and print the serialized tree."""
    parser = etree.HTMLParser()
    document = etree.parse('./test.html', parser)
    # tostring() yields bytes, so decode before printing.
    print(etree.tostring(document).decode('utf-8'))
def allNode():
    """Select all nodes, then all ``li`` nodes, from ``./test.html``.

    For each query the full result list is printed, followed by its first
    element.
    """
    document = etree.parse('./test.html', etree.HTMLParser())
    queries = (
        '//*',   # every node in the document
        '//li',  # every li node
    )
    for query in queries:
        matches = document.xpath(query)
        print(matches)
        print(matches[0])
# Child / descendant node matching
def childNode():
    """Contrast the child step (``/``) with the descendant step (``//``).

    For each query against ``./test.html`` the full result list is printed,
    followed by its first element.
    """
    document = etree.parse('./test.html', etree.HTMLParser())
    queries = (
        '//li/a',   # a nodes that are direct children of an li
        '//ul//a',  # a nodes anywhere below a ul, at any depth
    )
    for query in queries:
        matches = document.xpath(query)
        print(matches)
        print(matches[0])
# Parent node matching
def fatherNode():
    """Print the ``class`` attribute of the parent of the ``link4.html`` anchor.

    The same lookup is done twice against ``./test.html``: once with the
    ``..`` shorthand and once with the explicit ``parent::`` axis.
    """
    document = etree.parse('./test.html', etree.HTMLParser())
    queries = (
        # ".." steps up to the parent of the matched a node
        '//a[@href="link4.html"]/../@class',
        # the parent:: axis expresses the same step explicitly
        '//a[@href="link4.html"]/parent::*/@class',
    )
    for query in queries:
        print(document.xpath(query))
# Text extraction
def textGet():
    """Print text selected with ``text()`` below ``li`` nodes of class ``item-0``.

    The trailing comments record the output noted by the original author
    for their copy of ``./test.html``.
    """
    document = etree.parse('./test.html', etree.HTMLParser())

    # Text of the a nodes that are direct children of the matching li nodes.
    anchor_texts = document.xpath('//li[@class="item-0"]/a/text()')
    print(anchor_texts)  # ['first item', 'fifth item']

    # Text of every descendant of the matching li nodes.
    descendant_texts = document.xpath('//li[@class="item-0"]//text()')
    print(descendant_texts)  # ['first item', 'fifth item', '\r\n ']
# Attribute extraction
def fieldGet():
    """Print the ``href`` attribute of every ``a`` that is a child of an ``li``.

    The trailing comment records the output noted by the original author
    for their copy of ``./test.html``.
    """
    document = etree.parse('./test.html', etree.HTMLParser())
    hrefs = document.xpath('//li/a/@href')
    print(hrefs)  # ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
# Matching an attribute that holds several values
def fieldsGet():
    """Compare exact ``@class`` equality with ``contains()`` on a two-value class.

    The sample ``li`` has ``class="li li-first"``. The trailing comments
    record the output noted by the original author.
    """
    markup = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
    document = etree.HTML(markup)

    # Exact comparison against the whole attribute value.
    exact = document.xpath('//li[@class="li"]/a/text()')
    print(exact)  # [] -- nothing matched

    # Substring test on the attribute value.
    partial = document.xpath('//li[contains(@class, "li")]/a/text()')
    print(partial)  # ['first item'] -- contains() matched
# Matching on several attributes at once
def fieldssGet():
    """Print anchor text selected by a predicate that tests two attributes."""
    markup = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
    document = etree.HTML(markup)
    # Conditions on different attributes are joined with "and".
    query = '//li[contains(@class, "li") and @name="item"]/a/text()'
    print(document.xpath(query))
# Selecting by position
def orderGet():
    """Print anchor text selected with positional predicates on ``li``.

    The comments next to each query record the output noted by the
    original author.
    """
    markup = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    document = etree.HTML(markup)
    queries = (
        '//li[1]/a/text()',             # ['first item']
        '//li[last()]/a/text()',        # ['fifth item']
        '//li[position()<3]/a/text()',  # ['first item', 'second item']
        '//li[last()-2]/a/text()',      # ['third item']
    )
    for query in queries:
        print(document.xpath(query))
def nodeSelect():
    """Print the results of several XPath axis queries anchored at ``//li[1]``.

    Each query is run against the sample markup below and its result list
    is printed, in the order the queries are listed.
    """
    markup = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
    document = etree.HTML(markup)
    queries = (
        # ancestor: every ancestor node
        '//li[1]/ancestor::*',
        # ancestor: only the div ancestors
        '//li[1]/ancestor::div',
        # attribute: every attribute of the node
        '//li[1]/attribute::*',
        # child: direct a children whose href is link1.html
        '//li[1]/child::a[@href="link1.html"]',
        # descendant: span nodes at any depth below the node
        '//li[1]/descendant::span',
        # following: the second node on the following axis
        '//li[1]/following::*[2]',
        # following-sibling: the siblings that come after the node
        '//li[1]/following-sibling::*',
    )
    for query in queries:
        print(document.xpath(query))
if __name__ == '__main__':
    # Executed as a script: run the axis-selection demo.
    nodeSelect()