244 lines
7.4 KiB
Python
244 lines
7.4 KiB
Python
# -*- encoding:utf-8 -*-
|
|
|
|
'''
@Author : dingjiawen
@Date : 2023/11/8 16:08
@Usage :
@Desc : Reference: https://github.com/Python3WebSpider/BeautifulSoupTest
'''
|
|
|
|
# Sample document parsed by baseUse() and parent(), which define no local
# ``html`` of their own. The markup ends without closing </body> and </html>
# tags. Stray "|" separator lines that had leaked into the literal were
# removed so they are not parsed as text nodes of the document.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
def baseUse():
    """Show basic tag access on the module-level ``html`` sample.

    Parses ``html`` with the 'lxml' parser and prints the title tag, its
    type and string, the head tag, the first ``<p>`` tag together with its
    name and attributes, and finally a separator line.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tag = soup.title
    first_p = soup.p

    print(title_tag)  # <title>The Dormouse's story</title>
    print(type(title_tag))  # <class 'bs4.element.Tag'>
    print(title_tag.string)  # The Dormouse's story
    print(soup.head)  # <head><title>The Dormouse's story</title></head>
    print(first_p)  # <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    print(first_p.name)  # node name: p
    print(first_p.attrs)  # all attributes: {'class': ['title'], 'name': 'dromouse'}
    print(first_p.attrs['name'])  # attribute value via attrs: dromouse
    print(first_p['name'])  # attribute value via indexing: dromouse
    print(soup.body.p['name'])  # nested selection: dromouse

    print("==========================")
|
|
|
|
|
|
def child():
    """Print the children and then the descendants of the first ``<p>`` tag.

    Parses a local sample document with the 'lxml' parser. Each node is
    printed with its enumeration index, and a separator line follows each
    of the two listings.
    """
    html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html, 'lxml')
    first_p = soup.p

    # Direct children of the first <p>.
    for index, node in enumerate(first_p.children):
        print(index, node)
    print("===============================")

    # All descendants of the first <p>.
    for index, node in enumerate(first_p.descendants):
        print(index, node)
    print("===============================")
|
|
|
|
|
|
def parent():
    """Print the parent and the ancestor chain of the first ``<a>`` tag.

    Parses the module-level ``html`` sample with the 'lxml' parser. Prints
    ``.parent``, a separator, the type of ``.parents``, the enumerated
    ancestors as a list, and a closing separator.
    """
    soup = BeautifulSoup(html, 'lxml')
    first_anchor = soup.a

    # Direct parent.
    print(first_anchor.parent)
    print("===============================")

    # Ancestor nodes; ``.parents`` is read twice so the listing below gets
    # its own fresh iterable, exactly as two separate attribute accesses would.
    print(type(first_anchor.parents))
    ancestors = list(enumerate(first_anchor.parents))
    print(ancestors)
    print("=============================")
|
|
|
|
|
|
def brother():
    """Print the sibling nodes of the first ``<a>`` tag in a local sample.

    Parses the sample with the 'lxml' parser and prints the next sibling,
    the previous sibling, and the enumerated lists of all following and all
    preceding siblings.
    """
    html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
Hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
"""
    # Sibling nodes are all taken relative to the first <a> tag.
    soup = BeautifulSoup(html, 'lxml')
    first_anchor = soup.a

    print('Next Sibling', first_anchor.next_sibling)
    print('Prev Sibling', first_anchor.previous_sibling)

    following = list(enumerate(first_anchor.next_siblings))
    print('Next Siblings', following)
    preceding = list(enumerate(first_anchor.previous_siblings))
    print('Prev Siblings', preceding)
|
|
|
|
# Find all elements that satisfy the condition
|
|
def findAll():
    """Demonstrate ``find_all`` by tag name, including nested searches.

    Parses a local sample with the 'lxml' parser. Prints every ``<ul>``
    tag and the type of the first one, then the ``<li>`` tags of each
    ``<ul>``, then the same listing again followed by the string of each
    ``<li>``.
    """
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    soup = BeautifulSoup(html, 'lxml')
    ul_tags = soup.find_all(name='ul')

    print(ul_tags)
    print(type(ul_tags[0]))

    # Nested search: the <li> tags inside each <ul>.
    for ul_tag in ul_tags:
        print(ul_tag.find_all(name='li'))

    # Same listing, followed by the text of every <li>.
    for ul_tag in ul_tags:
        li_tags = ul_tag.find_all(name='li')
        print(li_tags)
        for li_tag in li_tags:
            print(li_tag.string)
|
|
|
|
|
|
# Find elements whose attributes match
|
|
def attrs():
    """Demonstrate ``find_all`` filtering by attributes and by text.

    Parses a local sample once with the 'lxml' parser and prints, in order:
    matches for ``attrs={'id': 'list-1'}``, matches for
    ``attrs={'name': 'elements'}``, matches for the ``id=`` and ``class_=``
    keyword shortcuts, and the strings matching the regex ``Foo``.
    """
    # Kept function-local as in the original, but placed first so the
    # dependency is visible before any work is done.
    import re

    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    # Parse once: the searches below only read the tree, so every query can
    # share the same soup instead of re-parsing the identical string.
    soup = BeautifulSoup(html, 'lxml')

    # Filter through an explicit attrs dict.
    print(soup.find_all(attrs={'id': 'list-1'}))
    print(soup.find_all(attrs={'name': 'elements'}))

    # Common attributes can be passed as keywords instead of through attrs
    # (``class_`` carries a trailing underscore because ``class`` is a keyword).
    print(soup.find_all(id='list-1'))
    print(soup.find_all(class_='element'))

    # ``string`` plays the role of ``text``: it matches the text content.
    print(soup.find_all(string=re.compile('Foo')))
|
|
|
|
|
|
# Return the first matched element
|
|
def find():
    """Demonstrate ``find``, which yields a single match rather than a list.

    Parses a local sample with the 'lxml' parser and prints the result of
    ``find(name='ul')``, the type of that result, and the result of
    ``find(class_='list')``.
    """
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    soup = BeautifulSoup(html, 'lxml')

    # Look up by tag name.
    first_ul = soup.find(name='ul')
    print(first_ul)
    print(type(first_ul))

    # Look up by CSS class.
    first_list = soup.find(class_='list')
    print(first_list)
|
|
|
|
# CSS selectors
|
|
def cssSelect():
    """Demonstrate CSS-selector queries with ``select``.

    Parses a local sample once with the 'lxml' parser and prints, in order:
    the results of three selectors and the type of the first ``ul`` match;
    the ``<li>`` tags selected inside each ``<ul>``; the ``id`` of each
    ``<ul>`` read two ways; and the text of each ``<li>`` via both
    ``get_text()`` and ``.string``.
    """
    html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
    # Parse once: every section below only reads the tree, so re-parsing the
    # identical string before each section was redundant work.
    soup = BeautifulSoup(html, 'lxml')

    print(soup.select('.panel .panel-heading'))
    print(soup.select('ul li'))
    print(soup.select('#list-2 .element'))
    print(type(soup.select('ul')[0]))

    # Nested selection: run a selector on each matched tag.
    for ul in soup.select('ul'):
        print(ul.select('li'))

    # Attribute access: indexing the tag and going through .attrs.
    for ul in soup.select('ul'):
        print(ul['id'])
        print(ul.attrs['id'])

    # Text access: get_text() and .string.
    for li in soup.select('li'):
        print('Get Text:', li.get_text())
        print('String:', li.string)
|
|
|
|
|
|
|
|
# Script entry point: when the file is executed directly, only the
# CSS-selector demo runs; the other demo functions are not called here.
if __name__ == '__main__':
    cssSelect()