# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2024/03/27 15:33
@Usage :
@Desc : Production crawl of Qidian books (正式爬取起点书籍)
'''
|
||
|
||
import requests
|
||
from pyquery import PyQuery as pq
|
||
from crawel_detail import crawel_detail
|
||
|
||
# Request configuration for the book's catalogue page.
#
# Inspecting the site's requests shows two ways to obtain a chapter_id.
# One candidate is the AJAX category endpoint, which additionally needs a
# CSRF token and a w_tsfp value:
#   https://www.qidian.com/ajax/book/category?_csrfToken={_csrfToken}&bookId={bookId}&w_tsfp={w_tsfp}
# The URL below is the book's HTML page instead.
url = 'https://www.qidian.com/book/1031940621/'

# Browser-like request headers sent with the page request.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    # Only advertise encodings that requests can always decode. 'br' and
    # 'zstd' are decoded only when optional packages are installed; without
    # them the body would stay compressed and response.text would be garbage.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # NOTE(review): hard-coded cookie string that looks like it was captured
    # from a browser session (it carries _csrfToken and w_tsfp) -- presumably
    # it needs refreshing from time to time; confirm before relying on it.
    'Cookie': '_yep_uuid=59404ab0-2696-4162-b763-1256a5ca1dca; e1=%7B%22l6%22%3A%221%22%2C%22l1%22%3A%22%22%2C%22pid%22%3A%22qd_P_xiangqing%22%2C%22eid%22%3A%22%22%7D; e2=%7B%22l6%22%3A%221%22%2C%22l1%22%3A%22%22%2C%22pid%22%3A%22qd_P_xiangqing%22%2C%22eid%22%3A%22%22%7D; newstatisticUUID=1669693998_1518827460; _csrfToken=9VUvhprKzOz80xLUYXqgOzIcm011iQ9DfhwSyUD2; Hm_lvt_f00f67093ce2f38f215010b699629083=1701842016; supportwebp=true; supportWebp=true; _ga=GA1.1.67407022.1669694001; _ga_FZMMH98S83=GS1.1.1701860853.4.1.1701860855.0.0.0; _ga_PFYW0QLV3P=GS1.1.1701860853.4.1.1701860855.0.0.0; fu=923381569; trkf=1; traffic_utm_referer=https%3A//cn.bing.com/; w_tsfp=ltvgWVEE2utBvS0Q6KvslUKvEj87Z2R7xFw0D+M9Os09AaYjV5iM2IZ+utfldCyCt5Mxutrd9MVxYnGAU9QgexgdRcSYb5tH1VPHx8NlntdKRQJtA5KOD1McdbpzvTJCL24LIRDu3mt3ItRJmONgj14K5yZ137ZlCa8hbMFbixsAqOPFm/97DxvSliPXAHGHM3wLc+6C6rgv8LlSgW2DugDuLi11A7lD2UGS0yoeG3pV8w2pJbsDal7wcpK9Uv8wrTPzwjn3apCs2RYj4VA3sB49AtX02TXKL3ZEIAtrZUqukO18Lv3wdaN4qzsLX/hITghGqlkd5usw+EBJWXnsZSOLAf8r4wEEQ/JcrZ6+NA==',
    'Host': 'www.qidian.com',
    'Pragma': 'no-cache',
    # TODO: Referer is the anti-hotlinking check, i.e. the page one level
    # above the current request; it makes the visit to this page look
    # legitimate.
    'Referer': 'https://www.qidian.com/all/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
|
||
|
||
def _parse_chapter_href(href):
    """Extract ``(book_id, chapter_id)`` from a chapter link.

    The ids are taken to be the last two non-empty path segments of the
    link, so the result does not depend on whether the link ends with a
    trailing slash.

    Args:
        href: value of the anchor's ``href`` attribute, or ``None`` when the
            anchor has no such attribute.

    Returns:
        A ``(book_id, chapter_id)`` tuple of strings, or ``None`` when the
        link is missing or has fewer than two path segments.
    """
    if not href:
        return None
    segments = [part for part in href.split('/') if part]
    if len(segments) < 2:
        return None
    return segments[-2], segments[-1]


# Fetch the book page. The timeout keeps a stalled connection from hanging
# the script forever, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the chapter list.
response = requests.get(url, headers=header, timeout=10)
response.raise_for_status()
doc = pq(response.text)

# (book_id, chapter_id) pairs handed to crawel_detail, in page order.
result = []
for item in doc('.chapter-item a').items():
    ids = _parse_chapter_href(item.attr('href'))
    if ids is None:
        # Anchor without a usable link: nothing to crawl for it.
        continue
    book_id, chapter_id = ids
    result.append((book_id, chapter_id))
    # NOTE(review): the keyword is spelled 'chatpter_id' as in the original
    # call -- presumably it matches crawel_detail's parameter name; verify.
    crawel_detail(book_id, chatpter_id=chapter_id)
|
||
|