53 lines
1.3 KiB
Python
53 lines
1.3 KiB
Python
# -*- encoding:utf-8 -*-
|
||
|
||
'''
@Author : dingjiawen
@Date : 2024/03/26 18:59
@Usage :
@Desc : Use playwright to bypass the encryption logic and scrape the website
'''
|
||
|
||
from playwright.sync_api import sync_playwright
|
||
import time
|
||
import requests
|
||
|
||
# Root URL of the target site; this is also the page the browser navigates to.
BASE_URL = 'https://spa6.scrape.center/'
# URL template for the movie list API. BASE_URL already ends with "/", so the
# trailing slash is stripped before the path is appended; otherwise the result
# would contain a double slash ("...center//api/movie").
INDEX_URL = (
    BASE_URL.rstrip('/')
    + "/api/movie?limit={limit}&offset={offset}&token={token}"
)
# Number of list pages requested by the crawl loop.
MAX_PAGE = 10
# Number of entries requested per page (also the offset step between pages).
LIMIT = 10
|
||
|
||
# Start the Playwright driver, launch Chromium and open a single page that is
# reused for every token computation.
content = sync_playwright().start()
browser = content.chromium.launch()
page = browser.new_page()


def _serve_local_chunk(intercepted):
    """Fulfil the intercepted request from the local file ``chunk.js``."""
    # NOTE(review): chunk.js is presumably a patched copy of the site's script
    # that exposes ``window.encrypt`` -- confirm against the file itself.
    return intercepted.fulfill(path='chunk.js')


# Note: the route pattern needs the leading "**".
page.route("**/js/chunk-19c920f8.c3a1129d.js", _serve_local_chunk)
page.goto(BASE_URL, wait_until='networkidle')
|
||
|
||
def get_token(path='/api/movie'):
    """Compute a request token by calling ``window.encrypt`` inside the page.

    Args:
        path: Value handed to ``window.encrypt``; defaults to the list API
            path ``/api/movie``.

    Returns:
        Whatever ``window.encrypt(path)`` returns in the browser, or ``None``
        if the evaluation fails (the error is printed).
    """
    # page.wait_for_function('window.encrypt !== undefined')
    try:
        # Pass the path as an evaluate() argument instead of splicing it into
        # the JavaScript source, so no quoting/escaping issues can arise.
        return page.evaluate('path => window.encrypt(path)', path)
    except Exception as e:
        # Best-effort: report the failure right away (no long sleep before
        # the message) and let the caller decide what to do with None.
        print(e)
        return None
|
||
|
||
def get_key():
    """Placeholder that is not implemented yet.

    NOTE(review): presumably intended to derive the key used by the detail
    pages -- confirm. Currently does nothing and returns ``None``.
    """
    return None
|
||
|
||
# Request the list API page by page, computing a fresh token for each request.
try:
    for i in range(MAX_PAGE):
        offset = LIMIT * i
        token = get_token()
        if token is None:
            # Formatting None into the URL would send the literal text "None"
            # as the token, so skip the page instead of issuing that request.
            continue
        result = requests.get(
            INDEX_URL.format(limit=LIMIT, offset=offset, token=token),
            timeout=30,  # do not hang forever on an unresponsive server
        )
        print(result.text)
finally:
    # Release the browser and the Playwright driver even if a request fails.
    browser.close()
    content.stop()
|
||
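# At this point the list pages can basically be scraped, but the logic that generates the key for the detail pages has not been reverse-engineered yet: neither keyword searches nor any other breakpoints manage to hit it.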