# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/11/8 15:15
@Usage :
@Desc : Small, runnable demonstrations of XPath queries with lxml.
'''

from lxml import etree

# XPath basics:
#   1) nodename : select all child nodes of the named node
#   2) /        : select direct children of the current node
#   3) //       : select descendants of the current node
#   4) .        : select the current node
#   5) ..       : select the parent of the current node
#   6) @        : select an attribute
# Example: //title[@lang='eng'] selects every <title> node whose "lang"
# attribute equals "eng".
#
# NOTE(review): the inline HTML fixtures in this module were blank in the
# reviewed copy (the markup had been stripped). They were rebuilt from the
# XPath queries and the expected-output comments that accompany them;
# confirm them against the original fixtures.


def _parse_test_file():
    """Parse ./test.html with lxml's HTML parser and return the tree."""
    return etree.parse('./test.html', etree.HTMLParser())


def htmlByString() -> None:
    """Parse an HTML fragment held in a string and print the serialized tree."""
    # NOTE(review): reconstructed fixture -- confirm.
    text = '''
    <div>
        <ul>
            <li class="item-0"><a href="link1.html">first item</a></li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-inactive"><a href="link3.html">third item</a></li>
            <li class="item-1"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    html = etree.HTML(text)
    # tostring() returns bytes, so decode before printing.
    result = etree.tostring(html)
    print(result.decode('utf-8'))


def htmlByFile() -> None:
    """Parse ./test.html and print the serialized tree."""
    html = _parse_test_file()
    result = etree.tostring(html)
    print(result.decode('utf-8'))


def allNode() -> None:
    """Select every node, then every <li> node, from ./test.html."""
    html = _parse_test_file()

    # '//*' matches every element in the document.
    result = html.xpath('//*')
    print(result)
    print(result[0])  # raises IndexError if nothing matched

    # '//li' matches every <li> element.
    result = html.xpath('//li')
    print(result)
    print(result[0])  # raises IndexError if nothing matched


def childNode() -> None:
    """Select child and descendant nodes from ./test.html."""
    html = _parse_test_file()

    # Every <a> that is a direct child of an <li>.
    result = html.xpath('//li/a')
    print(result)
    print(result[0])  # raises IndexError if nothing matched

    # Every <a> that is a descendant (at any depth) of a <ul>.
    result = html.xpath('//ul//a')
    print(result)
    print(result[0])  # raises IndexError if nothing matched


def fatherNode() -> None:
    """Select the parent of a node in ./test.html and read its class."""
    html = _parse_test_file()

    # "class" attribute of the parent of the <a> whose href is link4.html.
    result = html.xpath('//a[@href="link4.html"]/../@class')
    print(result)

    # The same selection, written with the explicit parent:: axis.
    result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
    print(result)


def textGet() -> None:
    """Extract text nodes from ./test.html."""
    html = _parse_test_file()

    # Text of <a> children of every <li class="item-0">.
    result = html.xpath('//li[@class="item-0"]/a/text()')
    print(result)  # recorded by the author: ['first item', 'fifth item']

    # Text of all descendants of every <li class="item-0">; this also picks
    # up whitespace-only text nodes.
    result = html.xpath('//li[@class="item-0"]//text()')
    print(result)  # recorded by the author: ['first item', 'fifth item', '\r\n ']


def fieldGet() -> None:
    """Extract attribute values from ./test.html."""
    html = _parse_test_file()

    # href attribute of every <a> that is a direct child of an <li>.
    result = html.xpath('//li/a/@href')
    # recorded by the author:
    # ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
    print(result)


def fieldsGet() -> None:
    """Match an attribute that holds several space-separated values."""
    # NOTE(review): reconstructed fixture -- confirm.
    text = '''
    <li class="li li-first"><a href="link.html">first item</a></li>
    '''
    html = etree.HTML(text)

    # An exact comparison fails because the class value is not exactly "li".
    result = html.xpath('//li[@class="li"]/a/text()')
    print(result)  # [] -- no match

    # contains() succeeds because "li" occurs inside the class value.
    result = html.xpath('//li[contains(@class, "li")]/a/text()')
    print(result)  # ['first item']


def fieldssGet() -> None:
    """Match on several attributes at once."""
    # NOTE(review): reconstructed fixture -- confirm.
    text = '''
    <li class="li li-first" name="item"><a href="link.html">first item</a></li>
    '''
    html = etree.HTML(text)

    # Combine several attribute predicates with "and".
    result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
    print(result)


def orderGet() -> None:
    """Select nodes by their position among siblings."""
    # NOTE(review): reconstructed fixture -- confirm.
    text = '''
    <div>
        <ul>
            <li class="item-0"><a href="link1.html">first item</a></li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-inactive"><a href="link3.html">third item</a></li>
            <li class="item-1"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    html = etree.HTML(text)

    # XPath positions are 1-based.
    result = html.xpath('//li[1]/a/text()')
    print(result)  # ['first item']

    result = html.xpath('//li[last()]/a/text()')
    print(result)  # ['fifth item']

    result = html.xpath('//li[position()<3]/a/text()')
    print(result)  # ['first item', 'second item']

    result = html.xpath('//li[last()-2]/a/text()')
    print(result)  # ['third item']


def nodeSelect() -> None:
    """Demonstrate the XPath axes (ancestor, attribute, child, ...)."""
    # NOTE(review): reconstructed fixture -- confirm.
    text = '''
    <div>
        <ul>
            <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-inactive"><a href="link3.html">third item</a></li>
            <li class="item-1"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    html = etree.HTML(text)

    # ancestor::* -- every ancestor of the first <li>.
    result = html.xpath('//li[1]/ancestor::*')
    print(result)

    # ancestor::div -- only the <div> ancestors.
    result = html.xpath('//li[1]/ancestor::div')
    print(result)

    # attribute::* -- every attribute value of the first <li>.
    result = html.xpath('//li[1]/attribute::*')
    print(result)

    # child:: -- direct children, here filtered to the <a> with this href.
    result = html.xpath('//li[1]/child::a[@href="link1.html"]')
    print(result)

    # descendant:: -- descendants at any depth, here only <span> elements.
    result = html.xpath('//li[1]/descendant::span')
    print(result)

    # following:: -- nodes after the current node in document order;
    # the [2] predicate keeps only the second of them.
    result = html.xpath('//li[1]/following::*[2]')
    print(result)

    # following-sibling:: -- siblings that come after the current node.
    result = html.xpath('//li[1]/following-sibling::*')
    print(result)


if __name__ == '__main__':
    nodeSelect()