self_example/Spider/spider_practice/起点/crawel_list.py

# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2024/03/27 14:00
@Usage :
@Desc : Parse a saved Qidian chapter-list page and collect (book_id, chapter_id) pairs from its chapter links
'''

import requests
from pyquery import PyQuery as pq

# Read a locally saved HTML page; this script only parses the file on disk.
with open("./test1.html", 'r', encoding='utf-8') as file:
    content = file.read()

doc = pq(content)

# Collect one (book_id, chapter_id) tuple per link found under `.chapter-item`.
# NOTE(review): presumably each href looks like ".../<book_id>/<chapter_id>/"
# -- verify against the saved page.
result = []
for item in doc('.chapter-item a').items():
    href = item.attr('href')
    if not href:
        # An anchor without an href yields None from .attr(); nothing to parse.
        continue
    # Drop empty path segments so the two trailing components are picked up
    # whether or not the href ends with '/'.
    strs = [part for part in href.split('/') if part]
    if len(strs) < 2:
        # Too few components to hold both IDs (e.g. "#"); skip instead of
        # raising IndexError.
        continue
    book_id, chapter_id = strs[-2], strs[-1]
    result.append((book_id, chapter_id))


# title = doc('.chapter-item')
# with open(f"./output/{title.text}.txt", 'w', encoding='utf-8') as file:
#     for line in doc('.content p').items():
#         file.write(line.text() + '\n')  # write each line, appending a trailing newline


# if __name__ == '__main__':
#     book_id = 1031940621
#     chapter_id = 705235484
#     crawel_detail(book_id=book_id, chatpter_id=chapter_id)