# -*- coding: utf-8 -*-
'''
@Author : dingjiawen
@Date   : 2023/12/7 19:15
@Usage  :
@Desc   :
'''

import asyncio
import json
import logging
from os import makedirs
from os.path import exists

from pyppeteer import launch
from pyppeteer.errors import TimeoutError

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

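# Crawl settings: target site, per-selector wait timeout (seconds),
# number of index pages to crawl, output directory, and browser window size.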
INDEX_URL = 'https://spa2.scrape.center/page/{page}'
TIMEOUT = 10
TOTAL_PAGE = 10
RESULTS_DIR = 'results'
WINDOW_WIDTH, WINDOW_HEIGHT = 1366, 768

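# Create the output directory on first run.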
exists(RESULTS_DIR) or makedirs(RESULTS_DIR)

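# The browser and tab are shared module-wide and populated by init().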
browser, tab = None, None
HEADLESS = True


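# Launch Chromium (headless by default) and open a single tab sized to the viewport.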
async def init():
    global browser, tab
    browser = await launch(headless=HEADLESS,
                           args=['--disable-infobars',
                                 f'--window-size={WINDOW_WIDTH},{WINDOW_HEIGHT}'])
    tab = await browser.newPage()
    await tab.setViewport({'width': WINDOW_WIDTH, 'height': WINDOW_HEIGHT})


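# Navigate to url and block until selector has rendered; a pyppeteer
# TimeoutError is logged rather than raised, so the caller simply moves on.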
async def scrape_page(url, selector):
    logging.info('scraping %s', url)
    try:
        await tab.goto(url)
        await tab.waitForSelector(selector, options={
            'timeout': TIMEOUT * 1000
        })
    except TimeoutError:
        logging.error('error occurred while scraping %s', url, exc_info=True)


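# Load one paginated index page and wait for the movie-name links.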
async def scrape_index(page):
    url = INDEX_URL.format(page=page)
    await scrape_page(url, '.item .name')


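# Collect the detail-page URLs from the current index page.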
async def parse_index():
    return await tab.querySelectorAllEval('.item .name',
                                          'nodes => nodes.map(node => node.href)')


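# Load a detail page and wait for its title to render.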
async def scrape_detail(url):
    await scrape_page(url, 'h2')


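# Extract the fields of interest from the rendered detail page.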
async def parse_detail():
    url = tab.url
    name = await tab.querySelectorEval('h2', 'node => node.innerText')
    categories = await tab.querySelectorAllEval('.categories button span',
                                                'nodes => nodes.map(node => node.innerText)')
    cover = await tab.querySelectorEval('.cover', 'node => node.src')
    score = await tab.querySelectorEval('.score', 'node => node.innerText')
    drama = await tab.querySelectorEval('.drama p', 'node => node.innerText')
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }


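# Persist one movie record as pretty-printed JSON named after the movie.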
async def save_data(data):
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


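# Crawl every index page, then visit and persist each detail page in turn;
# the browser is closed even if the crawl fails midway.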
async def main():
    await init()
    try:
        for page in range(1, TOTAL_PAGE + 1):
            await scrape_index(page)
            detail_urls = await parse_index()
            for detail_url in detail_urls:
                await scrape_detail(detail_url)
                detail_data = await parse_detail()
                logging.info('data %s', detail_data)
                await save_data(detail_data)
    finally:
        await browser.close()


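# Drive the whole crawl on the default event loop (the pre-asyncio.run
# style that pyppeteer examples conventionally use).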
if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())