diff --git a/Spider/Chapter09_代理的使用/付费代理/__init__.py b/Spider/Chapter09_代理的使用/付费代理/__init__.py
new file mode 100644
index 0000000..faef866
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/付费代理/__init__.py
@@ -0,0 +1,8 @@
+#-*- encoding:utf-8 -*-
+
+'''
+@Author : dingjiawen
+@Date : 2023/12/14 16:39
+@Usage :
+@Desc :
+'''
\ No newline at end of file
diff --git a/Spider/Chapter09_代理的使用/付费代理/ip.json b/Spider/Chapter09_代理的使用/付费代理/ip.json
new file mode 100644
index 0000000..770a66a
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/付费代理/ip.json
@@ -0,0 +1,21 @@
+{
+    "code": 0,
+    "msg": "",
+    "data": {
+        "count": 10,
+        "dedup_count": 10,
+        "order_left_count": 990,
+        "proxy_list": [
+            "124.172.117.189:19812",
+            "219.133.31.120:26947",
+            "183.237.194.145:28436",
+            "183.62.172.50:23485",
+            "163.125.157.243:17503",
+            "183.57.42.79:26483",
+            "202.103.150.70:17251",
+            "182.254.129.124:15395",
+            "58.251.132.181:20659",
+            "112.95.241.76:21948"
+        ]
+    }
+}
\ No newline at end of file
diff --git a/Spider/Chapter09_代理的使用/付费代理/test.py b/Spider/Chapter09_代理的使用/付费代理/test.py
new file mode 100644
index 0000000..9bf34d3
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/付费代理/test.py
@@ -0,0 +1,44 @@
+# -*- encoding:utf-8 -*-
+
+'''
+@Author : dingjiawen
+@Date : 2023/12/14 16:40
+@Usage :
+@Desc :
+'''
+
+import requests
+import json
+
+PROXY_API = 'https://dps.kdlapi.com/api/getdps/?secret_id=oimi28znnx51x79f3r0d&num=10&signature=25zjft23etaeswom3ipa56bsyqnne347&pt=1&format=json&sep=2'
+
+
+def get_proxies():
+    response = requests.get(PROXY_API)
+    res = json.loads(response.text)
+    return res['data']['proxy_list']
+
+
+def test_proxies():
+    proxies = get_proxies()
+    # Note: the proxy username and password are required here; they can be found in the order center at https://www.kuaidaili.com/uc/dps/?orderid=930254289411869
+    auth = "d2118699212:bxb0p3l8"
+    for proxy in proxies:
+        proxy = proxy.strip()
+        print(f"using proxy {proxy}")
+        p = {
+            'http': f'http://{auth}@{proxy}',
+            'https': f'http://{auth}@{proxy}',
+
+        }
+        try:
+            response = requests.get('http://www.httpbin.org/ip', proxies=p)
+            # response = requests.get('http://www.baidu.com', proxies=p)
+            print(response.text)
+        except requests.ConnectionError as e:
+            print(e)
+            print(f"proxy {proxy} is invalid")
+
+
+if __name__ == '__main__':
+    test_proxies()
diff --git a/Spider/Chapter09_代理的使用/代理反爬实战/__init__.py b/Spider/Chapter09_代理的使用/代理反爬实战/__init__.py
new file mode 100644
index 0000000..e6578a0
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/代理反爬实战/__init__.py
@@ -0,0 +1,8 @@
+#-*- encoding:utf-8 -*-
+
+'''
+@Author : dingjiawen
+@Date : 2023/12/14 19:09
+@Usage :
+@Desc :
+'''
\ No newline at end of file
diff --git a/Spider/Chapter09_代理的使用/代理反爬实战/core/__init__.py b/Spider/Chapter09_代理的使用/代理反爬实战/core/__init__.py
new file mode 100644
index 0000000..7da6ace
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/代理反爬实战/core/__init__.py
@@ -0,0 +1,8 @@
+#-*- encoding:utf-8 -*-
+
+'''
+@Author : dingjiawen
+@Date : 2023/12/14 19:20
+@Usage :
+@Desc :
+'''
\ No newline at end of file
diff --git a/Spider/Chapter09_代理的使用/代理反爬实战/core/config.py b/Spider/Chapter09_代理的使用/代理反爬实战/core/config.py
new file mode 100644
index 0000000..f1b3380
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/代理反爬实战/core/config.py
@@ -0,0 +1,31 @@
+# -*- encoding:utf-8 -*-
+
+'''
+@Author : dingjiawen
+@Date : 2023/12/14 19:22
+@Usage :
+@Desc :
+'''
+
+from environs import Env
+
+env = Env()
+env.read_env()
+
+REDIS_HOST = env.str('REDIS_HOST', '192.168.118.202')
+
+REDIS_PORT = env.int('REDIS_PORT', 6379)
+
+REDIS_PASSWORD = env.str('REDIS_PASSWORD', None)
+
+REDIS_KEY = env.str('REDIS_KEY', 'antispider5')
+
+PROXY_POOL_URL = env.str('PROXY_POOL_URL', 'http://127.0.0.1:5555/random')
+
+IS_AUTH_PROXY = env.bool('IS_AUTH_PROXY', True)
+
+TIMEOUT = env.int('TIMEOUT', 10)
+
+MAX_FAILED_TIME = env.int('MAX_FAILED_TIME', 20)
+
+VALID_STATUSES = env.list('VALID_STATUSES', [200])
diff --git a/Spider/Chapter09_代理的使用/代理反爬实战/core/db.py b/Spider/Chapter09_代理的使用/代理反爬实战/core/db.py
new file mode 100644
index 0000000..824f1a9
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/代理反爬实战/core/db.py
@@ -0,0 +1,48 @@
+#-*- encoding:utf-8 -*-
+
+'''
+@Author : dingjiawen
+@Date : 2023/12/14 19:26
+@Usage :
+@Desc :
+'''
+
+from redis import StrictRedis
+from core.config import *
+from pickle import dumps, loads
+from core.request import MovieRequest
+
+# Request objects cannot be stored in Redis directly, so pickle's dumps and loads are used to serialize and deserialize them
+class RedisQueue():
+    def __init__(self):
+        """
+        init redis connection
+        """
+        self.db = StrictRedis(
+            host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)
+
+    def add(self, request):
+        """
+        add request to queue
+        :param request: request
+        :param fail_time: fail times
+        :return: result
+        """
+        if isinstance(request, MovieRequest):
+            return self.db.rpush(REDIS_KEY, dumps(request))
+        return False
+
+    def pop(self):
+        """
+        get next request
+        :return: Request or None
+        """
+        if self.db.llen(REDIS_KEY):
+            return loads(self.db.lpop(REDIS_KEY))
+        return False
+
+    def clear(self):
+        self.db.delete(REDIS_KEY)
+
+    def empty(self):
+        return self.db.llen(REDIS_KEY) == 0
\ No newline at end of file
diff --git a/Spider/Chapter09_代理的使用/代理反爬实战/core/request.py b/Spider/Chapter09_代理的使用/代理反爬实战/core/request.py
new file mode 100644
index 0000000..f671b1f
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/代理反爬实战/core/request.py
@@ -0,0 +1,20 @@
+# -*- encoding:utf-8 -*-
+
+'''
+@Author : dingjiawen
+@Date : 2023/12/14 19:21
+@Usage :
+@Desc :
+'''
+
+from core.config import *
+from requests import Request
+
+
+class MovieRequest(Request):
+    def __init__(self, url, callback, method='GET', headers=None, fail_time=0, timeout=TIMEOUT):
+        Request.__init__(self, method, url, headers)
+        # extra attributes on top of requests.Request: the callback, the failure count and the timeout
+        self.callback = callback
+        self.fail_time = fail_time
+        self.timeout = timeout
diff --git a/Spider/Chapter09_代理的使用/代理反爬实战/core/spider.py b/Spider/Chapter09_代理的使用/代理反爬实战/core/spider.py
new file mode 100644
index 0000000..cf9d92c
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/代理反爬实战/core/spider.py
@@ -0,0 +1,192 @@
+# -*- encoding:utf-8 -*-
+
+'''
+@Author : dingjiawen
+@Date : 2023/12/14 19:34
+@Usage : the actual crawl; relies on the proxy pool built earlier being up and running
+@Desc :
+'''
+
+import re
+import requests
+from urllib.parse import urljoin
+from requests import Session
+from requests.exceptions import RequestException
+from core.config import *
+from core.db import RedisQueue
+from core.request import MovieRequest
+from pyquery import PyQuery as pq
+from loguru import logger
+
+BASE_URL = 'https://antispider5.scrape.center/'
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
+}
+
+
+class Spider():
+    session = Session()
+    queue = RedisQueue()
+
+    @logger.catch
+    def get_proxy(self):
+        """
+        get proxy from proxypool
+        :return: proxy
+        """
+        response = requests.get(PROXY_POOL_URL)
+        if response.status_code == 200:
+            logger.debug(f'get proxy {response.text}')
+            return response.text
+
+    @logger.catch
+    def get_proxy_safe(self):
+        """
+        get proxy from proxypool
+        :return: proxy
+        """
+        response = requests.get(PROXY_POOL_URL)
+        if response.status_code == 200:
+            logger.debug(f'get proxy {response.text}')
+            return response.text
+
+    def start(self):
+        """
+        start request
+        """
+        self.session.headers.update(HEADERS)
+        start_url = BASE_URL
+        request = MovieRequest(
+            url=start_url, callback=self.parse_index)
+        # schedule first request
+        self.queue.add(request)
+
+    def parse_index(self, response):
+        """
+        parse index page
+        :param response: response
+        :return: new request
+        """
+        doc = pq(response.text)
+
+        # request for detail
+        items = doc('.item .name').items()
+        for item in items:
+            detail_url = urljoin(BASE_URL, item.attr('href'))
+            request = MovieRequest(
+                url=detail_url, callback=self.parse_detail)
+            yield request
+
+        # request for next page
+        next_href = doc('.next').attr('href')
+        if next_href:
+            next_url = urljoin(BASE_URL, next_href)
+            request = MovieRequest(
+                url=next_url, callback=self.parse_index)
+            yield request
+
+    def parse_detail(self, response):
+        """
+        parse detail
+        :param response: response of detail
+        :return: data
+        """
+        doc = pq(response.text)
+        cover = doc('img.cover').attr('src')
+        name = doc('a > h2').text()
+        categories = [item.text()
+                      for item in doc('.categories button span').items()]
+        published_at = doc('.info:contains(上映)').text()
+        published_at = re.search(r'(\d{4}-\d{2}-\d{2})', published_at).group(1) \
+            if published_at and re.search(r'\d{4}-\d{2}-\d{2}', published_at) else None
+        drama = doc('.drama p').text()
+        score = doc('p.score').text()
+        score = float(score) if score else None
+        yield {
+            'cover': cover,
+            'name': name,
+            'categories': categories,
+            'published_at': published_at,
+            'drama': drama,
+            'score': score
+        }
+
+    def request(self, request):
+        """
+        execute request
+        :param request: movie request
+        :return: response
+        """
+        try:
+            proxy = self.get_proxy()
+            logger.debug(f'get proxy {proxy}')
+            proxies = None
+            auth = "d2118699212:bxb0p3l8"
+            if proxy:
+                if IS_AUTH_PROXY:
+                    proxies = {
+                        'http': f'http://{auth}@{proxy}',
+                        'https': f'http://{auth}@{proxy}',
+                    }
+                else:
+                    proxies = {
+                        'http': 'http://' + proxy,
+                        'https': 'https://' + proxy
+                    }
+            return self.session.send(request.prepare(),
+                                     timeout=request.timeout,
+                                     proxies=proxies)
+        except RequestException:
+            logger.exception(f'requesting {request.url} failed')
+
+    def error(self, request):
+        """
+        error handling
+        :param request: request
+        :return:
+        """
+        request.fail_time = request.fail_time + 1
+        logger.debug(
+            f'request of {request.url} failed {request.fail_time} times')
+        if request.fail_time < MAX_FAILED_TIME:
+            self.queue.add(request)
+
+    def schedule(self):
+        """
+        schedule request
+        :return:
+        """
+        # keep popping requests from the queue and crawling until it is empty
+        while not self.queue.empty():
+            request = self.queue.pop()
+            callback = request.callback
+            logger.debug(f'executing request {request.url}')
+            response = self.request(request)
+            logger.debug(f'response status {response} of {request.url}')
+            if not response or response.status_code not in VALID_STATUSES:
+                self.error(request)
+                continue
+            results = list(callback(response))
+            if not results:
+                self.error(request)
+                continue
+            for result in results:
+                if isinstance(result, MovieRequest):
+                    logger.debug(f'generated new request {result.url}')
+                    self.queue.add(result)
+                if isinstance(result, dict):
+                    # reaching this point means the scrape succeeded; the data could be saved here
+                    logger.debug(f'scraped new data {result}')
+
+    def run(self):
+        """
+        run
+        :return:
+        """
+        self.start()
+        self.schedule()
+
+
+if __name__ == '__main__':
+    spider = Spider()
+    spider.run()
diff --git a/Spider/Chapter09_代理的使用/代理池的维护/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/__init__.py
new file mode 100644
index 0000000..ed8e7df
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/代理池的维护/__init__.py
@@ -0,0 +1,8 @@
+#-*- encoding:utf-8 -*-
+
+'''
+@Author : dingjiawen
+@Date : 2023/12/14 12:57
+@Usage :
+@Desc :
+'''
\ No newline at end of file
diff --git a/Spider/Chapter09_代理的使用/代理的设置/aiohttpDemo.py b/Spider/Chapter09_代理的使用/代理的设置/aiohttpDemo.py
index e067db2..f107bf4 100644
--- a/Spider/Chapter09_代理的使用/代理的设置/aiohttpDemo.py
+++ b/Spider/Chapter09_代理的使用/代理的设置/aiohttpDemo.py
@@ -22,5 +22,19 @@ async def main():
             print(await response.text())
 
 
+async def socks():
+    from aiohttp_socks import ProxyConnector, ProxyType
+    connector = ProxyConnector(
+        proxy_type=ProxyType.HTTP,
+        host='127.0.0.1',
+        port=7890,
+        # username='user',
+        # password='password',
+        # rdns=True
+    )
+    async with aiohttp.ClientSession(connector=connector) as session:
+        async with session.get('https://httpbin.org/get') as response:
+            print(await response.text())
+
 if __name__ == '__main__':
     asyncio.get_event_loop().run_until_complete(main())
diff --git a/Spider/Chapter09_代理的使用/代理的设置/playwrightDemo.py b/Spider/Chapter09_代理的使用/代理的设置/playwrightDemo.py
new file mode 100644
index 0000000..b410349
--- /dev/null
+++ b/Spider/Chapter09_代理的使用/代理的设置/playwrightDemo.py
@@ -0,0 +1,45 @@
+# -*- encoding:utf-8 -*-
+
+'''
+@Author : dingjiawen
+@Date : 2023/12/14 12:39
+@Usage :
+@Desc :
+'''
+
+from playwright.sync_api import sync_playwright
+
+
+def http():
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=False, proxy={
+            'server': 'http://127.0.0.1:7890'
+        })
+        page = browser.new_page()
+        page.goto('https://httpbin.org/get')
+        print(page.content())
+        browser.close()
+
+
+def http_auth():
+    with sync_playwright() as p:
+        browser = p.chromium.launch(proxy={
+            'server': 'http://127.0.0.1:7890',
+            'username': 'foo',
+            'password': 'bar'
+        })
+        page = browser.new_page()
+        page.goto('https://httpbin.org/get')
+        print(page.content())
+        browser.close()
+
+
+def socks():
+    with sync_playwright() as p:
+        browser = p.chromium.launch(proxy={
+            'server': 'socks5://127.0.0.1:7891'
+        })
+        page = browser.new_page()
+        page.goto('https://httpbin.org/get')
+        print(page.content())
+        browser.close()
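
Note: the isinstance(result, dict) branch in Spider.schedule only logs the scraped items. A minimal sketch of how they could be persisted at that point, assuming a local MongoDB instance; the connection string, database/collection names and the save_result helper are illustrative and not part of the commit above:

# illustrative sketch only, not part of the commit above
from pymongo import MongoClient

# assumed connection details; adjust to the actual environment
client = MongoClient('mongodb://localhost:27017')
collection = client['antispider5']['movies']


def save_result(result: dict):
    """Upsert one scraped movie item, keyed by its name."""
    collection.update_one({'name': result.get('name')},
                          {'$set': result}, upsert=True)

With such a helper imported into core/spider.py, the dict branch in schedule() could call save_result(result) right after the debug log.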
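
A quick round-trip check of the pickle-backed RedisQueue, assuming the Redis instance configured in core/config.py is reachable and the script is run from the 代理反爬实战 project root (the check itself is illustrative, not part of the commit):

# illustrative sketch only, not part of the commit above
from core.db import RedisQueue
from core.request import MovieRequest

queue = RedisQueue()
queue.clear()
# MovieRequest instances are pickled by RedisQueue.add and unpickled by pop
queue.add(MovieRequest(url='https://antispider5.scrape.center/', callback=None))
request = queue.pop()
print(request.url, request.fail_time)  # https://antispider5.scrape.center/ 0
assert queue.empty()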