330 lines
9.4 KiB
Python
330 lines
9.4 KiB
Python
# -*- encoding:utf-8 -*-
|
|
|
|
'''
@Author : dingjiawen
@Date   : 2023/11/8 16:54
@Usage  :
@Desc   : PyQuery study notes. Reference: https://github.com/Python3WebSpider/PyQueryTest
'''
|
|
from pyquery import PyQuery as pq
|
|
|
|
|
|
# Initialize from a string
|
|
def stringBase():
    """Parse an inline HTML string with PyQuery and print every ``<li>`` node."""
    markup = (
        '\n'
        '<div>\n'
        '<ul>\n'
        '<li class="item-0">first item</li>\n'
        '<li class="item-1"><a href="link2.html">second item</a></li>\n'
        '<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n'
        '<li class="item-1 active"><a href="link4.html">fourth item</a></li>\n'
        '<li class="item-0"><a href="link5.html">fifth item</a></li>\n'
        '</ul>\n'
        '</div>\n'
    )

    document = pq(markup)
    list_items = document('li')
    print(list_items)
|
|
|
|
|
|
# Initialize from a URL
|
|
def URLBase():
    """Build a PyQuery document from ``https://cuiqingcai.com`` and print its ``<title>``."""
    page = pq(url='https://cuiqingcai.com')
    title_node = page('title')
    print(title_node)

    # According to the original author, the code above is equivalent to:
    #     doc = pq(requests.get('https://cuiqingcai.com').text)
    #     print(doc('title'))
|
|
|
|
|
|
# Initialize from a file
|
|
def fileBase():
    """Build a PyQuery document from the file ``demo.html`` and print its ``<li>`` nodes."""
    page = pq(filename='demo.html')
    list_items = page('li')
    print(list_items)
|
|
|
|
# Basic CSS selectors
|
|
def cssSelect():
    """Query a document with the CSS selector ``#container .list li``.

    Prints the matched selection, then its type, then the text of each
    matched node on its own line.
    """
    markup = (
        '\n'
        '<div id="container">\n'
        '<ul class="list">\n'
        '<li class="item-0">first item</li>\n'
        '<li class="item-1"><a href="link2.html">second item</a></li>\n'
        '<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n'
        '<li class="item-1 active"><a href="link4.html">fourth item</a></li>\n'
        '<li class="item-0"><a href="link5.html">fifth item</a></li>\n'
        '</ul>\n'
        '</div>\n'
    )
    document = pq(markup)
    selector = '#container .list li'

    print(document(selector))
    print(type(document(selector)))

    # Walk the matches one by one and print each node's text.
    for node in document(selector).items():
        print(node.text())
|
|
|
|
# Find child nodes
|
|
def child():
    """Look up nodes below ``.list`` with ``find()`` and ``children()``.

    Prints the ``.list`` selection, the result of ``find('li')``, the result
    of ``children()``, and finally the result of ``children('.active')``.
    The first three are each preceded by their type.
    """
    markup = (
        '\n'
        '<div>\n'
        '<ul class="list">\n'
        '<li class="item-0">first item</li>\n'
        '<li class="item-1"><a href="link2.html">second item</a></li>\n'
        '<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n'
        '<li class="item-1 active"><a href="link4.html">fourth item</a></li>\n'
        '<li class="item-0"><a href="link5.html">fifth item</a></li>\n'
        '</ul>\n'
        '</div>\n'
    )
    document = pq(markup)

    list_node = document('.list')
    print(type(list_node))
    print(list_node)

    # Lookup by selector through find().
    found_items = list_node.find('li')
    print(type(found_items))
    print(found_items)

    # Lookup through children() without a filter.
    child_nodes = list_node.children()
    print(type(child_nodes))
    print(child_nodes)

    # Lookup through children() restricted to the ``.active`` class.
    active_children = list_node.children('.active')
    print(active_children)
|
|
|
|
|
|
def parent():
    """Demonstrate ``parent()``, ``parents()`` and ``siblings()`` on a fixed document.

    Prints, in order:

    1. the type and markup of ``.list``'s ``parent()``;
    2. the type and markup of ``.list``'s ``parents()``;
    3. the result of ``parents('.wrap')``;
    4. the ``siblings()`` of the ``.list .item-0.active`` item.

    The document is parsed once with the module-level ``pq`` import. Nothing
    here mutates the tree, so every query can share the same parsed document.
    """
    markup = (
        '\n'
        '<div class="wrap">\n'
        '<div id="container">\n'
        '<ul class="list">\n'
        '<li class="item-0">first item</li>\n'
        '<li class="item-1"><a href="link2.html">second item</a></li>\n'
        '<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n'
        '<li class="item-1 active"><a href="link4.html">fourth item</a></li>\n'
        '<li class="item-0"><a href="link5.html">fifth item</a></li>\n'
        '</ul>\n'
        '</div>\n'
        '</div>\n'
    )
    document = pq(markup)
    list_node = document('.list')

    # Result of parent() on the list.
    container = list_node.parent()
    print(type(container))
    print(container)

    # Result of parents() on the list, without a filter.
    ancestors = list_node.parents()
    print(type(ancestors))
    print(ancestors)

    # parents() filtered by selector. Named ``wrap_ancestor`` so the local
    # does not shadow this function's own name.
    wrap_ancestor = list_node.parents('.wrap')
    print(wrap_ancestor)

    # Siblings of the active ``item-0`` entry.
    active_item = document('.list .item-0.active')
    print(active_item.siblings())
|
|
|
|
def brother():
    """Demonstrate filtered ``siblings()`` and iterating a selection.

    Prints, in order:

    1. the ``siblings('.active')`` of the ``.list .item-0.active`` item;
    2. the ``.item-0.active`` selection, via ``print`` directly and via ``str()``;
    3. the type of ``doc('li').items()``, then each yielded node with its type.

    The document is parsed once with the module-level ``pq`` import. Nothing
    here mutates the tree, so every query can share the same parsed document.
    """
    markup = (
        '\n'
        '<div class="wrap">\n'
        '<div id="container">\n'
        '<ul class="list">\n'
        '<li class="item-0">first item</li>\n'
        '<li class="item-1"><a href="link2.html">second item</a></li>\n'
        '<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n'
        '<li class="item-1 active"><a href="link4.html">fourth item</a></li>\n'
        '<li class="item-0"><a href="link5.html">fifth item</a></li>\n'
        '</ul>\n'
        '</div>\n'
        '</div>\n'
    )
    document = pq(markup)

    # Siblings of the active item, restricted to those carrying ``.active``.
    active_item = document('.list .item-0.active')
    print(active_item.siblings('.active'))

    # The same selection printed directly and through str().
    selected = document('.item-0.active')
    print(selected)
    print(str(selected))

    # The selection may contain several nodes, so walk them via items().
    nodes = document('li').items()
    print(type(nodes))
    for node in nodes:
        print(node, type(node))
|
|
|
|
def attrs():
    """Demonstrate reading attributes from PyQuery selections.

    Prints, in order:

    1. the ``.item-0.active a`` selection with its type, then its ``href``;
    2. the ``a`` selection with its type, then its ``href`` read through both
       ``attr('href')`` and ``attr.href``;
    3. for each node yielded by ``items()`` on the ``a`` selection, its
       ``href`` and its text.

    The document is parsed once with the module-level ``pq`` import. Nothing
    here mutates the tree, so every query can share the same parsed document.
    """
    markup = (
        '\n'
        '<div class="wrap">\n'
        '<div id="container">\n'
        '<ul class="list">\n'
        '<li class="item-0">first item</li>\n'
        '<li class="item-1"><a href="link2.html">second item</a></li>\n'
        '<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n'
        '<li class="item-1 active"><a href="link4.html">fourth item</a></li>\n'
        '<li class="item-0"><a href="link5.html">fifth item</a></li>\n'
        '</ul>\n'
        '</div>\n'
        '</div>\n'
    )
    document = pq(markup)

    # Selection made with the narrower ``.item-0.active a`` selector.
    active_link = document('.item-0.active a')
    print(active_link, type(active_link))
    print(active_link.attr('href'))

    # Selection made with the bare ``a`` selector; two spellings of attribute access.
    all_links = document('a')
    print(all_links, type(all_links))
    print(all_links.attr('href'))
    print(all_links.attr.href)

    for link in all_links.items():
        # Read the attribute and the text of each link.
        print(link.attr('href'), link.text())
|
|
|
|
def getHTML():
    """Print ``html()`` and ``text()`` of an ``li`` selection, then the type of the text."""
    from pyquery import PyQuery as pq

    markup = (
        '\n'
        '<div class="wrap">\n'
        '<div id="container">\n'
        '<ul class="list">\n'
        '<li class="item-1"><a href="link2.html">second item</a></li>\n'
        '<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n'
        '<li class="item-1 active"><a href="link4.html">fourth item</a></li>\n'
        '<li class="item-0"><a href="link5.html">fifth item</a></li>\n'
        '</ul>\n'
        '</div>\n'
        '</div>\n'
    )
    document = pq(markup)
    list_items = document('li')

    # Original author's note: HTML of the first matched node, i.e.
    #   <a href="link2.html">second item</a>
    print(list_items.html())

    # Original author's note: text of all matched nodes, i.e.
    #   second item third item fourth item fifth item
    print(list_items.text())

    print(type(list_items.text()))
|
|
|
|
# Add or remove a node's class
|
|
def operateNode():
    """Remove and re-add the ``active`` class on a node, printing it at each step."""
    from pyquery import PyQuery as pq

    markup = (
        '\n'
        '<div class="wrap">\n'
        '<div id="container">\n'
        '<ul class="list">\n'
        '<li class="item-0">first item</li>\n'
        '<li class="item-1"><a href="link2.html">second item</a></li>\n'
        '<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n'
        '<li class="item-1 active"><a href="link4.html">fourth item</a></li>\n'
        '<li class="item-0"><a href="link5.html">fifth item</a></li>\n'
        '</ul>\n'
        '</div>\n'
        '</div>\n'
    )
    document = pq(markup)
    target = document('.item-0.active')

    print(target)

    target.removeClass('active')
    print(target)

    target.addClass('active')
    print(target)

    # Output recorded by the original author:
    #   <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    #
    #   <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
    #
    #   <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
|
|
|
|
|
|
|
|
def operateNodeInformation():
    """Set an attribute, the text, and the inner HTML of a node, printing it after each change."""
    from pyquery import PyQuery as pq

    markup = (
        '\n'
        '<ul class="list">\n'
        '<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n'
        '</ul>\n'
    )
    document = pq(markup)
    target = document('.item-0.active')
    print(target)

    # Two-argument attr() call with the name and a value.
    target.attr('name', 'link')
    print(target)

    # text() called with an argument.
    target.text('changed item')
    print(target)

    # html() called with an argument.
    target.html('<span>changed item</span>')
    print(target)

    # Output recorded by the original author:
    #   <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
    #   <li class="item-0 active" name="link">changed item</li>
    #   <li class="item-0 active" name="link"><span>changed item</span></li>
|
|
|
|
|
|
def removeInformation():
    """Print a wrapper's text before and after removing its ``<p>`` child."""
    from pyquery import PyQuery as pq

    markup = (
        '\n'
        '<div class="wrap">\n'
        'Hello, World\n'
        '<p>This is a paragraph.</p>\n'
        '</div>\n'
    )
    document = pq(markup)
    wrapper = document('.wrap')

    print(wrapper.text())
    # Output recorded by the original author:
    #   Hello, World
    #   This is a paragraph.

    paragraph = wrapper.find('p')
    paragraph.remove()

    print(wrapper.text())
    # Output recorded by the original author:
    #   Hello, World
|
|
|
|
# Pseudo-class selectors
|
|
def fakeCSSSelect():
    """Print the ``li`` selection produced by each of several pseudo-class selectors.

    The selectors are applied one after another, in the listed order, to the
    same parsed document. Each result is printed.
    """
    from pyquery import PyQuery as pq

    markup = (
        '\n'
        '<div class="wrap">\n'
        '<div id="container">\n'
        '<ul class="list">\n'
        '<li class="item-0">first item</li>\n'
        '<li class="item-1"><a href="link2.html">second item</a></li>\n'
        '<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>\n'
        '<li class="item-1 active"><a href="link4.html">fourth item</a></li>\n'
        '<li class="item-0"><a href="link5.html">fifth item</a></li>\n'
        '</ul>\n'
        '</div>\n'
        '</div>\n'
    )
    document = pq(markup)

    pseudo_selectors = (
        'li:first-child',
        'li:last-child',
        'li:nth-child(2)',
        'li:gt(2)',
        'li:nth-child(2n)',
        'li:contains(second)',
    )
    for selector in pseudo_selectors:
        matched = document(selector)
        print(matched)
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run the pseudo-class selector demo.
    fakeCSSSelect()
|