# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/8 16:54
@Usage :
@Desc : PyQuery study notes
Reference: https://github.com/Python3WebSpider/PyQueryTest
'''
from pyquery import PyQuery as pq

# NOTE(review): most `html` literals below contain only whitespace. The sample
# markup they are meant to hold appears to be missing from this copy of the
# file -- presumably it should be restored from the reference project before
# these demos are run. The literals are kept exactly as found.


# Initialize from a string
def stringBase():
    """Build a document from the ``html`` string and print its ``li`` selection."""
    html = '''
'''
    doc = pq(html)
    print(doc('li'))


# Initialize from a URL
def URLBase():
    """Build a document from a URL and print its ``title`` selection."""
    doc = pq(url='https://cuiqingcai.com')
    print(doc('title'))
    # The code above is equivalent to the following:
    # doc = pq(requests.get('https://cuiqingcai.com').text)
    # print(doc('title'))


# Initialize from a file
def fileBase():
    """Build a document from the file ``demo.html`` and print its ``li`` selection."""
    doc = pq(filename='demo.html')
    print(doc('li'))


# Basic CSS selectors
def cssSelect():
    """Select with a nested CSS selector; print the match, its type, and each item's text."""
    html = '''
'''
    doc = pq(html)
    selector = '#container .list li'
    print(doc(selector))
    print(type(doc(selector)))

    # .items() yields the matched nodes one by one.
    for item in doc(selector).items():
        print(item.text())


# Find child nodes
def child():
    """Print the results of ``find`` and ``children`` called on the ``.list`` selection."""
    html = '''
'''
    doc = pq(html)
    items = doc('.list')
    print(type(items))
    print(items)

    lis = items.find('li')
    print(type(lis))
    print(lis)

    lis = items.children()
    print(type(lis))
    print(lis)

    lis = items.children('.active')
    print(lis)


def parent():
    """Print the results of ``parent``, ``parents`` and ``siblings`` lookups."""
    html = '''
'''
    doc = pq(html)
    items = doc('.list')

    container = items.parent()
    print(type(container))
    print(container)

    parents = items.parents()
    print(type(parents))
    print(parents)

    # Named `wrap_parent` so the local does not shadow this function's name.
    wrap_parent = items.parents('.wrap')
    print(wrap_parent)

    li = doc('.list .item-0.active')
    print(li.siblings())


def brother():
    """Print filtered siblings, a single selection (also as ``str``), and each ``li`` item."""
    html = '''
'''
    doc = pq(html)

    li = doc('.list .item-0.active')
    print(li.siblings('.active'))

    li = doc('.item-0.active')
    print(li)
    print(str(li))

    # The selection may contain multiple nodes
    lis = doc('li').items()
    print(type(lis))
    for li in lis:
        print(li, type(li))


def attrs():
    """Print ``href`` attributes of ``a`` selections via ``attr('href')`` and ``attr.href``."""
    html = '''
'''
    doc = pq(html)

    a = doc('.item-0.active a')
    print(a, type(a))
    print(a.attr('href'))

    a = doc('a')
    print(a, type(a))
    print(a.attr('href'))
    print(a.attr.href)

    for item in doc('a').items():
        # Get the attribute and the text
        print(item.attr('href'), item.text())


def getHTML():
    """Print ``html()`` and ``text()`` of the ``li`` selection, and the type of ``text()``."""
    html = '''
'''
    doc = pq(html)
    li = doc('li')
    # HTML of the first matched node; recorded output: second item
    print(li.html())
    # Text of all matched nodes; recorded output:
    # second item third item fourth item fifth item
    print(li.text())
    print(type(li.text()))


# Add or remove a node's class
def operateNode():
    """Print a selection before and after ``removeClass('active')`` / ``addClass('active')``."""
    html = '''
'''
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.removeClass('active')
    print(li)
    li.addClass('active')
    print(li)
    # Recorded output (as found in the notes; the markup is not preserved):
    #   • third item
    #   • third item
    #   • third item
    #   •


def operateNodeInformation():
    """Print a selection after setting an attribute, its text, and its HTML in turn."""
    html = ''' '''
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.attr('name', 'link')
    print(li)
    li.text('changed item')
    print(li)
    li.html('changed item')
    print(li)
    # Recorded output (as found in the notes; the markup is not preserved):
    #   • third item
    #   • changed item
    #   • changed item
    #   •


def removeInformation():
    """Print the text of ``.wrap`` before and after removing its ``p`` descendants."""
    html = '''
    Hello, World

    This is a paragraph.

    '''
    doc = pq(html)
    wrap = doc('.wrap')
    print(wrap.text())
    # Recorded output: Hello, World This is a paragraph.

    wrap.find('p').remove()
    print(wrap.text())
    # Recorded output: Hello, World


# Pseudo-class selectors
def fakeCSSSelect():
    """Print the ``li`` selection for each of several pseudo-class selectors."""
    html = '''
    '''
    doc = pq(html)
    selectors = (
        'li:first-child',
        'li:last-child',
        'li:nth-child(2)',
        'li:gt(2)',
        'li:nth-child(2n)',
        'li:contains(second)',
    )
    for selector in selectors:
        li = doc(selector)
        print(li)


if __name__ == '__main__':
    fakeCSSSelect()