self_example/Spider/Chapter07_动态渲染页面爬取/playwrightLearning/demo4常用操作.py

101 lines
2.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/7 15:12
@Usage :
@Desc : playwright常用操作
'''
from playwright.sync_api import sync_playwright
# Event listener
def on_response(response):
    """Print the status code and URL of a response.

    Written to be used as a callback for the page 'response' event.

    Args:
        response: Object exposing ``status`` and ``url`` attributes.
    """
    # The label was previously misspelled as "Statue" in the output.
    print(f'Status {response.status}: {response.url}')
# Capture Ajax responses
def on_response1(response):
    """Print the JSON body of successful movie-API responses.

    A response is printed only when its URL contains ``/api/movie/`` and
    its status is 200; every other response is ignored.

    Args:
        response: Object exposing ``url``, ``status`` and a ``json()``
            method that returns the decoded body.
    """
    is_movie_api = '/api/movie/' in response.url
    # The status is only inspected once the URL has matched.
    if not (is_movie_api and response.status == 200):
        return
    payload = response.json()
    print(payload)
with sync_playwright() as playwright:
    # headless=False launches Chromium with a visible window.
    chromium = playwright.chromium
    browser = chromium.launch(headless=False)
    page = browser.new_page()
    # Subscribe to the 'response' event, which fires every time a network
    # request receives a response. Swap in on_response to log every response:
    # page.on('response', on_response)
    page.on('response', on_response1)
    page.goto('https://spa6.scrape.center/')
    # Wait for network activity to settle so the Ajax responses are seen.
    page.wait_for_load_state('networkidle')
    browser.close()
# Fetch the page source
with sync_playwright() as playwright:
    chromium = playwright.chromium
    browser = chromium.launch(headless=False)
    page = browser.new_page()
    page.goto('https://spa6.scrape.center/')
    # Wait for the network to go idle before reading the page content.
    page.wait_for_load_state('networkidle')
    # content() returns the page's HTML, which is printed as-is.
    print(page.content())
    browser.close()
# Read an attribute from a node
with sync_playwright() as playwright:
    chromium = playwright.chromium
    browser = chromium.launch(headless=False)
    page = browser.new_page()
    page.goto('https://spa6.scrape.center/')
    page.wait_for_load_state('networkidle')
    # 'a.name' selects an <a> node with class "name"; the second argument
    # names the attribute to read, here the link target.
    link_target = page.get_attribute('a.name', 'href')
    print(link_target)
    browser.close()
# Read multiple nodes
with sync_playwright() as playwright:
    chromium = playwright.chromium
    browser = chromium.launch(headless=False)
    page = browser.new_page()
    page.goto('https://spa6.scrape.center/')
    page.wait_for_load_state('networkidle')
    # Visit every node matching 'a.name' and print its href, then its text.
    for anchor in page.query_selector_all('a.name'):
        print(anchor.get_attribute('href'))
        print(anchor.text_content())
    browser.close()
# Network interception: abort requests for .png/.jpg images
import re
with sync_playwright() as playwright:
    chromium = playwright.chromium
    browser = chromium.launch(headless=False)
    page = browser.new_page()

    def canel_request(route, request):
        """Abort the intercepted request; ``request`` is not used."""
        route.abort()

    # Route every URL containing ".png" or ".jpg" to the aborting handler.
    image_pattern = re.compile(r"(\.png)|(\.jpg)")
    page.route(image_pattern, canel_request)
    page.goto("https://spa6.scrape.center/")
    page.wait_for_load_state("networkidle")
    # Save a screenshot of the page as rendered with those requests aborted.
    page.screenshot(path='no_picture.png')
    browser.close()
# After intercepting a request, fulfill it with our own response
import time
with sync_playwright() as p:
    # headless=False launches Chromium with a visible window.
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()

    def modify_response(route, request):
        """Fulfill the intercepted request from a local HTML file.

        ``request`` is accepted but not used.
        """
        route.fulfill(path="./custom_response.html")

    # NOTE(review): '/' is passed as the URL pattern unchanged; how a bare
    # '/' is matched against the full URL may depend on the installed
    # Playwright version -- confirm the handler actually fires.
    page.route('/', modify_response)
    page.goto("https://spa6.scrape.center/")
    # Keep the window open for 10 seconds so the result can be inspected.
    # wait_for_timeout (milliseconds) is used instead of time.sleep because
    # the sync API only dispatches events while inside a Playwright call;
    # a plain sleep blocks that dispatching.
    page.wait_for_timeout(10000)
    browser.close()