self_example/Spider/spider_practice/起点/main.py

51 lines
2.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2024/03/27 15:33
@Usage :
@Desc : 正式爬取起点书籍
'''
import requests
from pyquery import PyQuery as pq
from crawel_detail import crawel_detail
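'''
Inspecting the site's requests shows two ways to obtain chapter_id:
the AJAX category endpoint, or the chapter links in the book page HTML.
'''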
# Alternative source of chapter ids: the AJAX category endpoint (not used;
# it needs a _csrfToken, a bookId and a w_tsfp value filled in).
# url = 'https://www.qidian.com/ajax/book/category?_csrfToken={_csrfToken}&bookId={bookId}&w_tsfp={w_tsfp}'
# Book page that is actually fetched; the chapter links are read from its HTML.
url = 'https://www.qidian.com/book/1031940621/'
# Browser-like request headers sent along with the request for `url`.
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# NOTE(review): `br` and `zstd` are advertised here; requests presumably only
# decodes them when the optional brotli/zstandard packages are installed —
# verify, otherwise response.text may be garbage.
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
# NOTE(review): hard-coded cookie captured from a browser session (includes
# _csrfToken and w_tsfp values); it will likely expire and probably should
# not live in source control — consider loading it from configuration.
'Cookie': '_yep_uuid=59404ab0-2696-4162-b763-1256a5ca1dca; e1=%7B%22l6%22%3A%221%22%2C%22l1%22%3A%22%22%2C%22pid%22%3A%22qd_P_xiangqing%22%2C%22eid%22%3A%22%22%7D; e2=%7B%22l6%22%3A%221%22%2C%22l1%22%3A%22%22%2C%22pid%22%3A%22qd_P_xiangqing%22%2C%22eid%22%3A%22%22%7D; newstatisticUUID=1669693998_1518827460; _csrfToken=9VUvhprKzOz80xLUYXqgOzIcm011iQ9DfhwSyUD2; Hm_lvt_f00f67093ce2f38f215010b699629083=1701842016; supportwebp=true; supportWebp=true; _ga=GA1.1.67407022.1669694001; _ga_FZMMH98S83=GS1.1.1701860853.4.1.1701860855.0.0.0; _ga_PFYW0QLV3P=GS1.1.1701860853.4.1.1701860855.0.0.0; fu=923381569; trkf=1; traffic_utm_referer=https%3A//cn.bing.com/; w_tsfp=ltvgWVEE2utBvS0Q6KvslUKvEj87Z2R7xFw0D+M9Os09AaYjV5iM2IZ+utfldCyCt5Mxutrd9MVxYnGAU9QgexgdRcSYb5tH1VPHx8NlntdKRQJtA5KOD1McdbpzvTJCL24LIRDu3mt3ItRJmONgj14K5yZ137ZlCa8hbMFbixsAqOPFm/97DxvSliPXAHGHM3wLc+6C6rgv8LlSgW2DugDuLi11A7lD2UGS0yoeG3pV8w2pJbsDal7wcpK9Uv8wrTPzwjn3apCs2RYj4VA3sB49AtX02TXKL3ZEIAtrZUqukO18Lv3wdaN4qzsLX/hITghGqlkd5usw+EBJWXnsZSOLAf8r4wEEQ/JcrZ6+NA==',
'Host': 'www.qidian.com',
'Pragma': 'no-cache',
# TODO: Referer is an anti-hotlinking check: it names the page that led to
# the current request, so the server can judge whether the visit is legitimate.
'Referer': 'https://www.qidian.com/all/',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
def _parse_chapter_href(href):
    """Extract ``(book_id, chapter_id)`` from a chapter link.

    The last two non-empty path segments of the link are taken as the book
    id and the chapter id, in that order, so the result is the same whether
    or not the link ends with a trailing slash.

    Args:
        href: Value of the anchor's ``href`` attribute; may be ``None``.

    Returns:
        A ``(book_id, chapter_id)`` tuple of strings, or ``None`` when
        ``href`` is missing or has fewer than two non-empty segments.
    """
    if not href:
        return None
    segments = [part for part in href.split('/') if part]
    if len(segments) < 2:
        return None
    return segments[-2], segments[-1]


# Fetch the book page. The timeout keeps the script from hanging forever on
# a stalled connection.
response = requests.get(url, headers=header, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page and silently
# finding zero chapters.
response.raise_for_status()
doc = pq(response.text)
# Unused at the moment (chapters are crawled immediately rather than
# collected); retained so the module-level name still exists.
result = []
for item in doc('.chapter-item a').items():
    ids = _parse_chapter_href(item.attr('href'))
    if ids is None:
        # Anchor without a usable href: nothing to crawl.
        continue
    book_id, chapter_id = ids
    # NOTE(review): `chatpter_id` is the keyword the original call used;
    # presumably it matches the parameter name in crawel_detail — verify
    # before correcting the spelling.
    crawel_detail(book_id, chatpter_id=chapter_id)