# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2024/03/26 18:59
@Usage :
@Desc : Use playwright to bypass the site's encryption logic while scraping
'''
import time

import requests
from playwright.sync_api import sync_playwright

BASE_URL = 'https://spa6.scrape.center/'
# BASE_URL already ends with '/', so strip it before appending the API path;
# otherwise every request would be sent to "...center//api/movie".
INDEX_URL = BASE_URL.rstrip('/') + "/api/movie?limit={limit}&offset={offset}&token={token}"
MAX_PAGE = 10
LIMIT = 10
# Seconds to wait for each API response before giving up instead of hanging.
REQUEST_TIMEOUT = 10


def get_token():
    """Return the token produced by the page's ``window.encrypt`` function.

    The call is evaluated inside the module-level ``page``, so the page must
    already have been loaded with the replaced JS chunk (presumably the local
    ``chunk.js`` is what exposes ``encrypt`` on ``window`` -- verify against
    that file).

    Returns:
        Whatever ``window.encrypt('/api/movie')`` returns in the browser.

    Raises:
        RuntimeError: if the in-page evaluation fails. Raising here prevents
            the caller from silently requesting the API with ``token=None``.
    """
    try:
        # Pass the path as an argument rather than interpolating it into the
        # JavaScript source, so no quoting/escaping issues can arise.
        return page.evaluate('(path) => window.encrypt(path)', '/api/movie')
    except Exception as e:
        raise RuntimeError('failed to compute token via window.encrypt') from e


def get_key():
    """Placeholder for the detail-page key logic (not implemented yet)."""
    pass


content = sync_playwright().start()
try:
    browser = content.chromium.launch()
    try:
        page = browser.new_page()
        # Note: the route pattern needs the leading "**" glob to match the
        # full request URL.
        page.route(
            "**/js/chunk-19c920f8.c3a1129d.js",
            lambda route: route.fulfill(path='chunk.js')
        )
        page.goto(BASE_URL, wait_until='networkidle')

        for i in range(MAX_PAGE):
            offset = LIMIT * i
            # A fresh token is computed for every page request.
            result = requests.get(
                INDEX_URL.format(limit=LIMIT, offset=offset, token=get_token()),
                timeout=REQUEST_TIMEOUT,
            )
            print(result.text)
    finally:
        # Always release the Chromium process, even if scraping failed.
        browser.close()
finally:
    # Shut down the Playwright driver started by sync_playwright().start().
    content.stop()

# At this point the list pages can basically be scraped. The key-generation
# logic for the detail pages has not been reverse-engineered yet: neither
# keyword searches nor anything else managed to make a breakpoint hit.