# -*- encoding:utf-8 -*- ''' @Author : dingjiawen @Date : 2023/11/8 15:15 @Usage : @Desc : ''' from lxml import etree ''' XPath基本规则: 1) nodename:选择此节点的所有子节点 2) /:从当前节点选取直接子节点 3) //:从当前阶段选择子孙节点 4) .:选取当前节点 5) ..:选取当前节点的父节点 6) @:选取属性 举例: //title[@lang='eng]代表选择所有名称为title,同时属性lang的值为eng的节点 ''' def htmlByString(): text = '''
''' html = etree.HTML(text) result = etree.tostring(html) print(result.decode('utf-8')) def htmlByFile(): html = etree.parse('./test.html', etree.HTMLParser()) result = etree.tostring(html) print(result.decode('utf-8')) def allNode(): html = etree.parse('./test.html', etree.HTMLParser()) # 从头开始匹配所有的 result = html.xpath('//*') print(result) print(result[0]) # 匹配所有li的 result = html.xpath('//li') print(result) print(result[0]) # 子节点匹配 def childNode(): html = etree.parse('./test.html', etree.HTMLParser()) # 匹配所有li的子节点a result = html.xpath('//li/a') print(result) print(result[0]) # 匹配所有li的子孙节点a 相当于只要是子节点下面的就可以匹配上 result = html.xpath('//ul//a') print(result) print(result[0]) # 父节点匹配 def fatherNode(): html = etree.parse('./test.html', etree.HTMLParser()) # 匹配a节点属性href是link4.html的父节点的class属性 result = html.xpath('//a[@href="link4.html"]/../@class') print(result) # 也可以通过parent::来获取 result = html.xpath('//a[@href="link4.html"]/parent::*/@class') print(result) # 文本获取 def textGet(): html = etree.parse('./test.html', etree.HTMLParser()) # 匹配li节点属性class是item-0的节点的子节点a的text result = html.xpath('//li[@class="item-0"]/a/text()') print(result) # ['first item', 'fifth item'] # 匹配li节点属性class是item-0的节点的子孙节点的text result = html.xpath('//li[@class="item-0"]//text()') print(result) # ['first item', 'fifth item', '\r\n '] # 属性获取 def fieldGet(): html = etree.parse('./test.html', etree.HTMLParser()) # 匹配li节点属性class是item-0的节点的子节点a的href属性 result = html.xpath('//li/a/@href') print(result) # ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html'] # 属性多值匹配 def fieldsGet(): text = '''