self_example/Spider/spider_practice/起点/crawel_list.py

34 lines
864 B
Python

# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2024/03/27 14:00
@Usage :
@Desc : Parse a saved Qidian page (./test1.html) and collect (book_id, chapter_id) pairs from its chapter links
'''
import requests
from pyquery import PyQuery as pq
# Read the saved HTML page from disk and parse it with pyquery.
with open("./test1.html", 'r', encoding='utf-8') as file:
    content = file.read()
doc = pq(content)

# Collect one (book_id, chapter_id) tuple per chapter link, in document order.
result = []
for item in doc('.chapter-item a').items():
    href = item.attr('href')
    if not href:
        # An anchor without an href carries no ids; skip it instead of
        # crashing on None.split().
        continue
    strs = href.split('/')
    if len(strs) < 3:
        # Too few '/'-separated segments to hold both ids.
        continue
    # The ids are read as the second- and third-to-last segments.
    # NOTE(review): this presumably relies on hrefs ending with a trailing
    # slash (".../<book_id>/<chapter_id>/") - confirm against the saved page.
    chapter_id, book_id = strs[-2], strs[-3]
    result.append((book_id, chapter_id))
# title = doc('.chapter-item')
# with open(f"./output/{title.text}.txt", 'w', encoding='utf-8') as file:
# for line in doc('.content p').items():
# file.write(line.text() + '\n') # write each line, followed by a newline character
# if __name__ == '__main__':
# book_id = 1031940621
# chapter_id = 705235484
# crawel_detail(book_id=book_id, chatpter_id=chapter_id)