diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/.dockerignore b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/.dockerignore new file mode 100644 index 0000000..3eab792 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/.dockerignore @@ -0,0 +1,135 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +proxypool/.env +.DS_Store +.vscode \ No newline at end of file diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/.gitignore b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/.gitignore new file mode 100644 index 0000000..16a7490 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/.gitignore @@ -0,0 +1,7 @@ +*.vscode +*.pyc +*.db +venv +/.idea +*.log +.DS_Store \ No newline at end of file diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/Dockerfile b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/Dockerfile new file mode 100644 index 0000000..c5ca544 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.7-alpine AS build +COPY requirements.txt . +RUN apk update &&\ + apk add --no-cache gcc g++ libffi-dev openssl-dev libxml2-dev libxslt-dev build-base musl-dev &&\ + pip install -U pip &&\ + pip install --timeout 30 --user --no-cache-dir --no-warn-script-location -r requirements.txt + +FROM python:3.7-alpine +ENV APP_ENV=prod +ENV LOCAL_PKG="/root/.local" +COPY --from=build ${LOCAL_PKG} ${LOCAL_PKG} +RUN apk update && apk add --no-cache libffi-dev openssl-dev libxslt-dev &&\ + ln -sf ${LOCAL_PKG}/bin/* /usr/local/bin/ +WORKDIR /app +COPY . . 
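+# Final runtime image below: expose the API port, allow mounting private crawlers, and start the processors (getter/tester/server) via supervisord.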
+EXPOSE 5555 +VOLUME ["/app/proxypool/crawlers/private"] +ENTRYPOINT ["supervisord", "-c", "supervisord.conf"] \ No newline at end of file diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/LICENSE b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/LICENSE new file mode 100644 index 0000000..89052ac --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Germey + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/README.md b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/README.md new file mode 100644 index 0000000..d01d7a0 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/README.md @@ -0,0 +1,359 @@ +# ProxyPool + +![build](https://github.com/Python3WebSpider/ProxyPool/workflows/build/badge.svg) +![deploy](https://github.com/Python3WebSpider/ProxyPool/workflows/deploy/badge.svg) +![](https://img.shields.io/badge/python-3.6%2B-brightgreen) +![Docker Pulls](https://img.shields.io/docker/pulls/germey/proxypool) + +简易高效的代理池,提供如下功能: + +- 定时抓取免费代理网站,简易可扩展。 +- 使用 Redis 对代理进行存储并对代理可用性进行排序。 +- 定时测试和筛选,剔除不可用代理,留下可用代理。 +- 提供代理 API,随机取用测试通过的可用代理。 + +代理池原理解析可见「[如何搭建一个高效的代理池](https://cuiqingcai.com/7048.html)」,建议使用之前阅读。 + +## 使用准备 + +首先当然是克隆代码并进入 ProxyPool 文件夹: + +``` +git clone https://github.com/Python3WebSpider/ProxyPool.git +cd ProxyPool +``` + +然后选用下面 Docker 和常规方式任意一个执行即可。 + +## 使用要求 + +可以通过两种方式来运行代理池,一种方式是使用 Docker(推荐),另一种方式是常规方式运行,要求如下: + +### Docker + +如果使用 Docker,则需要安装如下环境: + +- Docker +- Docker-Compose + +安装方法自行搜索即可。 + +官方 Docker Hub 镜像:[germey/proxypool](https://hub.docker.com/r/germey/proxypool) + +### 常规方式 + +常规方式要求有 Python 环境、Redis 环境,具体要求如下: + +- Python>=3.6 +- Redis + +## Docker 运行 + +如果安装好了 Docker 和 Docker-Compose,只需要一条命令即可运行。 + +```shell script +docker-compose up +``` + +运行结果类似如下: + +``` +redis | 1:M 19 Feb 2020 17:09:43.940 * DB loaded from disk: 0.000 seconds +redis | 1:M 19 Feb 2020 17:09:43.940 * Ready to accept connections +proxypool | 2020-02-19 17:09:44,200 CRIT Supervisor is running as root. Privileges were not dropped because no user is specified in the config file. If you intend to run as root, you can set user=root in the config file to avoid this message. 
+proxypool | 2020-02-19 17:09:44,203 INFO supervisord started with pid 1 +proxypool | 2020-02-19 17:09:45,209 INFO spawned: 'getter' with pid 10 +proxypool | 2020-02-19 17:09:45,212 INFO spawned: 'server' with pid 11 +proxypool | 2020-02-19 17:09:45,216 INFO spawned: 'tester' with pid 12 +proxypool | 2020-02-19 17:09:46,596 INFO success: getter entered RUNNING state, process has stayed up for > than 1 seconds (startsecs) +proxypool | 2020-02-19 17:09:46,596 INFO success: server entered RUNNING state, process has stayed up for > than 1 seconds (startsecs) +proxypool | 2020-02-19 17:09:46,596 INFO success: tester entered RUNNING state, process has stayed up for > than 1 seconds (startsecs) +``` + +可以看到 Redis、Getter、Server、Tester 都已经启动成功。 + +这时候访问 [http://localhost:5555/random](http://localhost:5555/random) 即可获取一个随机可用代理。 + +当然你也可以选择自己 Build,直接运行如下命令即可: + +``` +docker-compose -f build.yaml up +``` + +如果下载速度特别慢,可以自行修改 Dockerfile,修改: + +```diff +- RUN pip install -r requirements.txt ++ RUN pip install -r requirements.txt -i https://pypi.douban.com/simple +``` + +## 常规方式运行 + +如果不使用 Docker 运行,配置好 Python、Redis 环境之后也可运行,步骤如下。 + +### 安装和配置 Redis + +本地安装 Redis、Docker 启动 Redis、远程 Redis 都是可以的,只要能正常连接使用即可。 + +首先可以需要一下环境变量,代理池会通过环境变量读取这些值。 + +设置 Redis 的环境变量有两种方式,一种是分别设置 host、port、password,另一种是设置连接字符串,设置方法分别如下: + +设置 host、port、password,如果 password 为空可以设置为空字符串,示例如下: + +```shell script +export PROXYPOOL_REDIS_HOST='localhost' +export PROXYPOOL_REDIS_PORT=6379 +export PROXYPOOL_REDIS_PASSWORD='' +export PROXYPOOL_REDIS_DB=0 +``` + +或者只设置连接字符串: + +```shell script +export PROXYPOOL_REDIS_CONNECTION_STRING='redis://localhost' +``` + +这里连接字符串的格式需要符合 `redis://[:password@]host[:port][/database]` 的格式, +中括号参数可以省略,port 默认是 6379,database 默认是 0,密码默认为空。 + +以上两种设置任选其一即可。 + +### 安装依赖包 + +这里强烈推荐使用 [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands) +或 [virtualenv](https://virtualenv.pypa.io/en/latest/user_guide.html) 创建虚拟环境,Python 版本不低于 3.6。 + +然后 pip 安装依赖即可: + +```shell script +pip3 install -r requirements.txt +``` + +### 运行代理池 + +两种方式运行代理池,一种是 Tester、Getter、Server 全部运行,另一种是按需分别运行。 + +一般来说可以选择全部运行,命令如下: + +```shell script +python3 run.py +``` + +运行之后会启动 Tester、Getter、Server,这时访问 [http://localhost:5555/random](http://localhost:5555/random) 即可获取一个随机可用代理。 + +或者如果你弄清楚了代理池的架构,可以按需分别运行,命令如下: + +```shell script +python3 run.py --processor getter +python3 run.py --processor tester +python3 run.py --processor server +``` + +这里 processor 可以指定运行 Tester、Getter 还是 Server。 + +## 使用 + +成功运行之后可以通过 [http://localhost:5555/random](http://localhost:5555/random) 获取一个随机可用代理。 + +可以用程序对接实现,下面的示例展示了获取代理并爬取网页的过程: + +```python +import requests + +proxypool_url = 'http://127.0.0.1:5555/random' +target_url = 'http://httpbin.org/get' + +def get_random_proxy(): + """ + get random proxy from proxypool + :return: proxy + """ + return requests.get(proxypool_url).text.strip() + +def crawl(url, proxy): + """ + use proxy to crawl page + :param url: page url + :param proxy: proxy, such as 8.8.8.8:8888 + :return: html + """ + proxies = {'http': 'http://' + proxy} + return requests.get(url, proxies=proxies).text + + +def main(): + """ + main method, entry point + :return: none + """ + proxy = get_random_proxy() + print('get random proxy', proxy) + html = crawl(target_url, proxy) + print(html) + +if __name__ == '__main__': + main() +``` + +运行结果如下: + +``` +get random proxy 116.196.115.209:8080 +{ + "args": {}, + "headers": { + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate", 
+ "Host": "httpbin.org", + "User-Agent": "python-requests/2.22.0", + "X-Amzn-Trace-Id": "Root=1-5e4d7140-662d9053c0a2e513c7278364" + }, + "origin": "116.196.115.209", + "url": "https://httpbin.org/get" +} +``` + +可以看到成功获取了代理,并请求 httpbin.org 验证了代理的可用性。 + +## 可配置项 + +代理池可以通过设置环境变量来配置一些参数。 + +### 开关 + +- ENABLE_TESTER:允许 Tester 启动,默认 true +- ENABLE_GETTER:允许 Getter 启动,默认 true +- ENABLE_SERVER:运行 Server 启动,默认 true + +### 环境 + +- APP_ENV:运行环境,可以设置 dev、test、prod,即开发、测试、生产环境,默认 dev +- APP_DEBUG:调试模式,可以设置 true 或 false,默认 true +- APP_PROD_METHOD: 正式环境启动应用方式,默认是`gevent`, + 可选:`tornado`,`meinheld`(分别需要安装 tornado 或 meinheld 模块) + +### Redis 连接 + +- PROXYPOOL_REDIS_HOST / REDIS_HOST:Redis 的 Host,其中 PROXYPOOL_REDIS_HOST 会覆盖 REDIS_HOST 的值。 +- PROXYPOOL_REDIS_PORT / REDIS_PORT:Redis 的端口,其中 PROXYPOOL_REDIS_PORT 会覆盖 REDIS_PORT 的值。 +- PROXYPOOL_REDIS_PASSWORD / REDIS_PASSWORD:Redis 的密码,其中 PROXYPOOL_REDIS_PASSWORD 会覆盖 REDIS_PASSWORD 的值。 +- PROXYPOOL_REDIS_DB / REDIS_DB:Redis 的数据库索引,如 0、1,其中 PROXYPOOL_REDIS_DB 会覆盖 REDIS_DB 的值。 +- PROXYPOOL_REDIS_CONNECTION_STRING / REDIS_CONNECTION_STRING:Redis 连接字符串,其中 PROXYPOOL_REDIS_CONNECTION_STRING 会覆盖 REDIS_CONNECTION_STRING 的值。 +- PROXYPOOL_REDIS_KEY / REDIS_KEY:Redis 储存代理使用字典的名称,其中 PROXYPOOL_REDIS_KEY 会覆盖 REDIS_KEY 的值。 + +### 处理器 + +- CYCLE_TESTER:Tester 运行周期,即间隔多久运行一次测试,默认 20 秒 +- CYCLE_GETTER:Getter 运行周期,即间隔多久运行一次代理获取,默认 100 秒 +- TEST_URL:测试 URL,默认百度 +- TEST_TIMEOUT:测试超时时间,默认 10 秒 +- TEST_BATCH:批量测试数量,默认 20 个代理 +- TEST_VALID_STATUS:测试有效的状态码 +- API_HOST:代理 Server 运行 Host,默认 0.0.0.0 +- API_PORT:代理 Server 运行端口,默认 5555 +- API_THREADED:代理 Server 是否使用多线程,默认 true + +### 日志 + +- LOG_DIR:日志相对路径 +- LOG_RUNTIME_FILE:运行日志文件名称 +- LOG_ERROR_FILE:错误日志文件名称 +- LOG_ROTATION: 日志记录周转周期或大小,默认 500MB,见 [loguru - rotation](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression) +- LOG_RETENTION: 日志保留日期,默认 7 天,见 [loguru - retention](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression) +- ENABLE_LOG_FILE:是否输出 log 文件,默认 true,如果设置为 false,那么 ENABLE_LOG_RUNTIME_FILE 和 ENABLE_LOG_ERROR_FILE 都不会生效 +- ENABLE_LOG_RUNTIME_FILE:是否输出 runtime log 文件,默认 true +- ENABLE_LOG_ERROR_FILE:是否输出 error log 文件,默认 true + +以上内容均可使用环境变量配置,即在运行前设置对应环境变量值即可,如更改测试地址和 Redis 键名: + +```shell script +export TEST_URL=http://weibo.cn +export REDIS_KEY=proxies:weibo +``` + +即可构建一个专属于微博的代理池,有效的代理都是可以爬取微博的。 + +如果使用 Docker-Compose 启动代理池,则需要在 docker-compose.yml 文件里面指定环境变量,如: + +```yaml +version: "3" +services: + redis: + image: redis:alpine + container_name: redis + command: redis-server + ports: + - "6379:6379" + restart: always + proxypool: + build: . 
+ image: "germey/proxypool" + container_name: proxypool + ports: + - "5555:5555" + restart: always + environment: + REDIS_HOST: redis + TEST_URL: http://weibo.cn + REDIS_KEY: proxies:weibo +``` + +## 扩展代理爬虫 + +代理的爬虫均放置在 proxypool/crawlers 文件夹下,目前对接了有限几个代理的爬虫。 + +若扩展一个爬虫,只需要在 crawlers 文件夹下新建一个 Python 文件声明一个 Class 即可。 + +写法规范如下: + +```python +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + +BASE_URL = 'http://www.664ip.cn/{page}.html' +MAX_PAGE = 5 + +class Daili66Crawler(BaseCrawler): + """ + daili66 crawler, http://www.66ip.cn/1.html + """ + urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('.containerbox table tr:gt(0)').items() + for tr in trs: + host = tr.find('td:nth-child(1)').text() + port = int(tr.find('td:nth-child(2)').text()) + yield Proxy(host=host, port=port) +``` + +在这里只需要定义一个 Crawler 继承 BaseCrawler 即可,然后定义好 urls 变量和 parse 方法即可。 + +- urls 变量即为爬取的代理网站网址列表,可以用程序定义也可写成固定内容。 +- parse 方法接收一个参数即 html,代理网址的 html,在 parse 方法里只需要写好 html 的解析,解析出 host 和 port,并构建 Proxy 对象 yield 返回即可。 + +网页的爬取不需要实现,BaseCrawler 已经有了默认实现,如需更改爬取方式,重写 crawl 方法即可。 + +欢迎大家多多发 Pull Request 贡献 Crawler,使其代理源更丰富强大起来。 + +## 部署 + +本项目提供了 Kubernetes 部署脚本,如需部署到 Kubernetes,请参考 [kubernetes](./kubernetes)。 + +## 待开发 + +- [ ] 前端页面管理 +- [ ] 使用情况统计分析 + +如有一起开发的兴趣可以在 Issue 留言,非常感谢! + +## LICENSE + +MIT diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/build.yaml b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/build.yaml new file mode 100644 index 0000000..74b2fd0 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/build.yaml @@ -0,0 +1,18 @@ +version: "3" +services: + redis4proxypool: + image: redis:alpine + container_name: redis4proxypool + ports: + - "6374:6379" + proxypool: + build: . 
+ image: "germey/proxypool:master" + container_name: proxypool + ports: + - "5555:5555" + restart: always + # volumes: + # - proxypool/crawlers/private:/app/proxypool/crawlers/private + environment: + PROXYPOOL_REDIS_CONNECTION_STRING: redis://@redis4proxypool:6379/0 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/docker-compose.yml b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/docker-compose.yml new file mode 100644 index 0000000..cf367f4 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/docker-compose.yml @@ -0,0 +1,18 @@ +version: "3" +services: + redis4proxypool: + image: redis:alpine + container_name: redis4proxypool + # ports: + # - "6374:6379" + proxypool: + image: "germey/proxypool:master" + container_name: proxypool + ports: + - "5555:5555" + restart: always + # volumes: + # - proxypool/crawlers/private:/app/proxypool/crawlers/private + environment: + PROXYPOOL_REDIS_HOST: redis4proxypool + diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/examples/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/examples/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/examples/usage.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/examples/usage.py new file mode 100644 index 0000000..bc699ba --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/examples/usage.py @@ -0,0 +1,39 @@ +import requests + + +proxypool_url = 'http://127.0.0.1:5555/random' +target_url = 'https://antispider5.scrape.center/' + + +def get_random_proxy(): + """ + get random proxy from proxypool + :return: proxy + """ + return requests.get(proxypool_url).text.strip() + + +def crawl(url, proxy): + """ + use proxy to crawl page + :param url: page url + :param proxy: proxy, such as 8.8.8.8:8888 + :return: html + """ + proxies = {'http': 'http://' + proxy} + return requests.get(url, proxies=proxies).text + + +def main(): + """ + main method, entry point + :return: none + """ + proxy = get_random_proxy() + print('get random proxy', proxy) + html = crawl(target_url, proxy) + print(html) + + +if __name__ == '__main__': + main() diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/examples/usage2.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/examples/usage2.py new file mode 100644 index 0000000..918c5eb --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/examples/usage2.py @@ -0,0 +1,95 @@ +# -*- coding: UTF-8 -*- + +''' +''' +import requests +import time +import threading +import urllib3 +from fake_headers import Headers +import uuid +from geolite2 import geolite2 +ips = [] + +# 爬数据的线程类 + +def getChinaIP(ip='127.0.0.1'): + reader = geolite2.reader() + ip_info = reader.get(ip) + geolite2.close() + print(ip_info) + return True if ip_info['country']['iso_code'] == 'CN' else False + + + +class CrawlThread(threading.Thread): + def __init__(self, proxyip): + super(CrawlThread, self).__init__() + self.proxyip = proxyip + + def run(self): + # 开始计时 + pure_ip_address = self.proxyip.split(':')[0] + # 验证IP归属 + if not getChinaIP(pure_ip_address): + # pass + raise ValueError('不是有效IP') + # + start = time.time() + # 消除关闭证书验证的警告 + urllib3.disable_warnings() + headers = Headers(headers=True).generate() + headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676' + headers['Pragma'] = 'no-cache' + headers['Host'] = 'bb.cf08tp.cn' + headers['x-forward-for'] = pure_ip_address + headers['Cookie'] = 'PHPSESSID={}'.format( + ''.join(str(uuid.uuid1()).split('-'))) + print(headers) + html = requests.get(headers=headers, url=targetUrl, proxies={ + 
"http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode() + # 结束计时 + end = time.time() + # 输出内容 + print(threading.current_thread().getName() + "使用代理IP, 耗时 " + str(end - start) + + "毫秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************") + +# 获取代理IP的线程类 + + +class GetIpThread(threading.Thread): + def __init__(self, fetchSecond): + super(GetIpThread, self).__init__() + self.fetchSecond = fetchSecond + + def run(self): + global ips + while True: + # 获取IP列表 + res = requests.get(apiUrl).content.decode() + # 按照\n分割获取到的IP + ips = res.split('\n') + # 利用每一个IP + for proxyip in ips: + if proxyip.strip(): + # 开启一个线程 + # CrawlThread(proxyip).start() + try: + CrawlThread(proxyip).run() + time.sleep(1.5) + except Exception as e: + print(e) + # 休眠 + time.sleep(len(ips) /self.fetchSecond ) + + +if __name__ == '__main__': + # 获取IP的API接口 + # apiUrl = "http://127.0.0.1:5555/all" + apiUrl = "http://127.0.0.1:5555/random" + # 要抓取的目标网站地址 + targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp=" + # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp=' + fetchSecond = 5 + # 开始自动获取IP + GetIpThread(fetchSecond).start() diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/.helmignore b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/.helmignore new file mode 100644 index 0000000..9716c30 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/.helmignore @@ -0,0 +1,24 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ +image/ \ No newline at end of file diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/Chart.yaml b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/Chart.yaml new file mode 100644 index 0000000..58db2bc --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/Chart.yaml @@ -0,0 +1,27 @@ +apiVersion: v2 +name: proxypool +description: A Efficient Proxy Pool + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# Keywords about this application. +keywords: + - proxypool + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. 
+appVersion: 1.16.0 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/README.md b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/README.md new file mode 100644 index 0000000..327880d --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/README.md @@ -0,0 +1,42 @@ +# Kubernetes 部署 + +这是用来快速部署本代理池的 Helm Charts。 + +首先需要有一个 Kubernetes 集群,其次需要安装 Helm,确保 helm 命令可以正常运行。 + +安装参考: + +- Kubernetes:[https://setup.scrape.center/kubernetes](https://setup.scrape.center/kubernetes)。 +- Helm: [https://setup.scrape.center/helm](https://setup.scrape.center/helm)。 + +## 安装 + +安装直接使用 helm 命令在本文件夹运行即可,使用 `-n` 可以制定 NameSpace。 + +```shell +helm install proxypool-app . -n scrape +``` + +其中 proxypool-app 就是应用的名字,可以任意取名,它会用作代理池 Deplyment 的名称。 + +如果需要覆盖变量,可以修改 values.yaml 文件,执行如下命令安装: + +```shell +helm install proxypool-app . -f values.yaml -n scrape +``` + +## 更新 + +如果需要更新配置,可以修改 values.yaml 文件,执行如下命令更新版本: + +```shell +helm upgrade proxypool-app . -f values.yaml -n scrape +``` + +## 卸载 + +如果不想使用了,可以只用 uninstall 命令卸载: + +```shell +helm uninstall proxypool-app -n scrape +``` diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/_helpers.tpl b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/_helpers.tpl new file mode 100644 index 0000000..31911df --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/_helpers.tpl @@ -0,0 +1,53 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "proxypool.name" -}} +{{- default .Chart.Name .Values.name | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "proxypool.fullname" -}} +{{- if .Values.fullname }} +{{- .Values.fullname | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.name }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "proxypool.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "proxypool.labels" -}} +helm.sh/chart: {{ include "proxypool.chart" . }} +{{ include "proxypool.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "proxypool.selectorLabels" -}} +app.kubernetes.io/name: {{ include "proxypool.fullname" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/proxypool-deployment.yaml b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/proxypool-deployment.yaml new file mode 100644 index 0000000..a12854d --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/proxypool-deployment.yaml @@ -0,0 +1,37 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "proxypool.fullname" . }} + labels: + {{- include "proxypool.labels" . 
| nindent 4 }} +spec: + replicas: {{ .Values.deployment.replicas }} + revisionHistoryLimit: {{ .Values.deployment.revisionHistoryLimit }} + selector: + matchLabels: + {{- include "proxypool.labels" . | nindent 8 }} + template: + metadata: + labels: + {{- include "proxypool.labels" . | nindent 8 }} + spec: + restartPolicy: {{ .Values.deployment.restartPolicy }} + containers: + - name: {{ include "proxypool.fullname" . }} + image: {{ .Values.deployment.image }} + ports: + - containerPort: 5555 + protocol: TCP + imagePullPolicy: {{ .Values.deployment.imagePullPolicy }} + livenessProbe: + httpGet: + path: /random + port: 5555 + initialDelaySeconds: 60 + periodSeconds: 5 + failureThreshold: 5 + timeoutSeconds: 10 + resources: + {{- toYaml .Values.deployment.resources | nindent 12 }} + env: + {{- toYaml .Values.deployment.env | nindent 12 }} diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/proxypool-ingress.yaml b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/proxypool-ingress.yaml new file mode 100644 index 0000000..0706f5d --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/proxypool-ingress.yaml @@ -0,0 +1,41 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "proxypool.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "proxypool.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ . }} + backend: + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/proxypool-service.yaml b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/proxypool-service.yaml new file mode 100644 index 0000000..3d4285b --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/proxypool-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "proxypool.fullname" . }} + labels: + {{- include "proxypool.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: 5555 + protocol: TCP + name: http + selector: + {{- include "proxypool.selectorLabels" . 
| nindent 4 }} diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/redis-deployment.yaml b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/redis-deployment.yaml new file mode 100644 index 0000000..4acf435 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/redis-deployment.yaml @@ -0,0 +1,30 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: proxypool-redis + name: proxypool-redis +spec: + replicas: 1 + revisionHistoryLimit: 1 + selector: + matchLabels: + app: proxypool-redis + template: + metadata: + labels: + app: proxypool-redis + spec: + containers: + - image: redis:alpine + name: proxypool-redis + ports: + - containerPort: 6379 + resources: + limits: + memory: "100Mi" + cpu: "100m" + requests: + memory: "100Mi" + cpu: "100m" + restartPolicy: Always diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/redis-service.yaml b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/redis-service.yaml new file mode 100644 index 0000000..5dbda55 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/templates/redis-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: proxypool-redis + name: proxypool-redis +spec: + ports: + - name: "6379" + port: 6379 + targetPort: 6379 + selector: + app: proxypool-redis \ No newline at end of file diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/values.yaml b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/values.yaml new file mode 100644 index 0000000..15b2537 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/kubernetes/values.yaml @@ -0,0 +1,39 @@ +name: proxypool +fullname: proxypool-app + +deployment: + image: germey/proxypool:master + imagePullPolicy: Always + restartPolicy: Always + revisionHistoryLimit: 2 + successfulJobsHistoryLimit: 1 + replicas: 1 + resources: + limits: + memory: "200Mi" + cpu: "80m" + requests: + memory: "200Mi" + cpu: "80m" + env: + - name: PROXYPOOL_REDIS_HOST + value: "proxypool-redis" + - name: PROXYPOOL_REDIS_PORT + value: "6379" + +service: + type: ClusterIP + port: 80 + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx + hosts: + - host: proxypool.scrape.center + paths: + - "/" + tls: + - secretName: tls-wildcard-scrape-center + hosts: + - proxypool.scrape.center diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/.gitignore b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/.gitignore new file mode 100644 index 0000000..9f26378 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/.gitignore @@ -0,0 +1,134 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +.idea/ +*.log \ No newline at end of file diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/__init__.py new file mode 100644 index 0000000..e68db8d --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/__init__.py @@ -0,0 +1,15 @@ +import pkgutil +from .base import BasePaidCrawler +import inspect + + +# load classes subclass of BaseCrawler +classes = [] +for loader, name, is_pkg in pkgutil.walk_packages(__path__): + module = loader.find_module(name).load_module(name) + for name, value in inspect.getmembers(module): + globals()[name] = value + if inspect.isclass(value) and issubclass(value, BasePaidCrawler) and value is not BasePaidCrawler \ + and not getattr(value, 'ignore', False): + classes.append(value) +__all__ = __ALL__ = classes diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/base.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/base.py new file mode 100644 index 0000000..995ea67 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/base.py @@ -0,0 +1,93 @@ +from retrying import RetryError, retry +import requests +from loguru import logger +from proxypool.setting import GET_TIMEOUT +from fake_headers import Headers +import time + +# 免费的节点 +class BaseCrawler(object): + urls = [] + + @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000) + def fetch(self, url, **kwargs): + try: + headers = Headers(headers=True).generate() + kwargs.setdefault('timeout', GET_TIMEOUT) + kwargs.setdefault('verify', False) + kwargs.setdefault('headers', headers) + response = requests.get(url, **kwargs) + if response.status_code == 200: + response.encoding = 'utf-8' + return response.text + except (requests.ConnectionError, requests.ReadTimeout): + return + + def process(self, html, url): + """ + used for parse html + """ 
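+        # hand the fetched HTML to the subclass's parse() and log every proxy it yields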
+ for proxy in self.parse(html): + logger.info(f'fetched proxy {proxy.string()} from {url}') + yield proxy + + def crawl(self): + """ + crawl main method + """ + try: + for url in self.urls: + logger.info(f'fetching {url}') + html = self.fetch(url) + if not html: + continue + time.sleep(.5) + yield from self.process(html, url) + except RetryError: + logger.error( + f'crawler {self} crawled proxy unsuccessfully, ' + 'please check if target url is valid or network issue') + + +# 付费的节点 +class BasePaidCrawler(object): + urls = [] + + @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000) + def fetch(self, url, **kwargs): + try: + headers = Headers(headers=True).generate() + kwargs.setdefault('timeout', GET_TIMEOUT) + kwargs.setdefault('verify', False) + kwargs.setdefault('headers', headers) + response = requests.get(url, **kwargs) + if response.status_code == 200: + response.encoding = 'utf-8' + return response.text + except (requests.ConnectionError, requests.ReadTimeout): + return + + def process(self, response, url): + """ + used for parse html + """ + for proxy in self.parse(response): + logger.info(f'fetched proxy {proxy.string()} from {url}') + yield proxy + + def crawl(self): + """ + crawl main method + """ + try: + for url in self.urls: + logger.info(f'fetching {url}') + response = self.fetch(url) + if not response: + continue + time.sleep(.5) + yield from self.process(response, url) + except RetryError: + logger.error( + f'crawler {self} crawled proxy unsuccessfully, ' + 'please check if target url is valid or network issue') diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/private/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/private/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/daili66.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/daili66.py new file mode 100644 index 0000000..aec7ea6 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/daili66.py @@ -0,0 +1,32 @@ +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + + +BASE_URL = 'http://www.66ip.cn/{page}.html' +MAX_PAGE = 3 + + +class Daili66Crawler(BaseCrawler): + """ + daili66 crawler, http://www.66ip.cn/1.html + """ + urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('.containerbox table tr:gt(0)').items() + for tr in trs: + host = tr.find('td:nth-child(1)').text() + port = int(tr.find('td:nth-child(2)').text()) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = Daili66Crawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/data5u.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/data5u.py new file mode 100644 index 0000000..62158c2 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/data5u.py @@ -0,0 +1,31 @@ +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import 
BaseCrawler +from loguru import logger + +BASE_URL = 'http://www.data5u.com' + + +class Data5UCrawler(BaseCrawler): + """ + data5u crawler, http://www.data5u.com + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + items = doc('.wlist ul.l2').items() + for item in items: + host = item.find('span:first-child').text() + port = int(item.find('span:nth-child(2)').text()) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = Data5UCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/docip.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/docip.py new file mode 100644 index 0000000..26a6f8a --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/docip.py @@ -0,0 +1,40 @@ +import time +from retrying import RetryError +from loguru import logger +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import json + +BASE_URL = 'https://www.docip.net/data/free.json?t={date}' + + + +class DocipCrawler(BaseCrawler): + """ + Docip crawler, https://www.docip.net/data/free.json + """ + urls = [BASE_URL.format(date=time.strftime("%Y%m%d", time.localtime()))] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + try: + result = json.loads(html) + proxy_list = result['data'] + for proxy_item in proxy_list: + # TODO 这里的逻辑有变化,因为返回的ip变了 + ip_and_port =proxy_item['ip'] + host = ip_and_port.split(":")[0] + port = ip_and_port.split(":")[1] + yield Proxy(host=host, port=port) + except json.JSONDecodeError: + print("json.JSONDecodeError") + return + + +if __name__ == '__main__': + crawler = DocipCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/fatezero.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/fatezero.py new file mode 100644 index 0000000..681cf9e --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/fatezero.py @@ -0,0 +1,31 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +import json +BASE_URL = 'http://proxylist.fatezero.org/proxy.list' + + +class FatezeroCrawler(BaseCrawler): + """ + Fatezero crawler,http://proxylist.fatezero.org + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + + hosts_ports = html.split('\n') + for addr in hosts_ports: + if(addr): + ip_address = json.loads(addr) + host = ip_address['host'] + port = ip_address['port'] + yield Proxy(host=host, port=port) + +if __name__ == '__main__': + crawler = FatezeroCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/geonodedaili.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/geonodedaili.py new file mode 100644 index 0000000..f71f16e --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/geonodedaili.py @@ -0,0 +1,71 @@ +import time +from retrying import RetryError +from loguru import logger +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import json + +BASE_URL = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page={page}&sort_by=lastChecked&sort_type=desc' +MAX_PAGE = 18 + + +class 
GeonodeCrawler(BaseCrawler): + """ + Geonode crawler, https://proxylist.geonode.com/ + """ + urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + try: + result = json.loads(html) + proxy_list = result['data'] + for proxy_item in proxy_list: + host = proxy_item['ip'] + port = proxy_item['port'] + yield Proxy(host=host, port=port) + except json.JSONDecodeError: + print("json.JSONDecodeError") + return + + def crawl(self): + """ + override crawl main method + add headers + """ + headers = { + 'authority': 'proxylist.geonode.com', + 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"', + 'accept': 'application/json, text/plain, */*', + 'sec-ch-ua-mobile': '?0', + 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36', + 'sec-ch-ua-platform': '"macOS"', + 'origin': 'https://geonode.com', + 'sec-fetch-site': 'same-site', + 'sec-fetch-mode': 'cors', + 'sec-fetch-dest': 'empty', + 'referer': 'https://geonode.com/', + 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7', + 'if-none-match': 'W/"c25d-BXjLTmP+/yYXtIz4OEcmdOWSv88"', + } + try: + for url in self.urls: + logger.info(f'fetching {url}') + html = self.fetch(url, headers=headers) + if not html: + continue + time.sleep(.5) + yield from self.process(html, url) + except RetryError: + logger.error( + f'crawler {self} crawled proxy unsuccessfully, ' + 'please check if target url is valid or network issue') + + +if __name__ == '__main__': + crawler = GeonodeCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/goubanjia.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/goubanjia.py new file mode 100644 index 0000000..5715785 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/goubanjia.py @@ -0,0 +1,44 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +from pyquery import PyQuery as pq +import time +BASE_URL = 'http://www.goubanjia.com/' + + +class GoubanjiaCrawler(BaseCrawler): + """ + ip Goubanjia crawler, http://www.goubanjia.com/ + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html)('.ip').items() + # ''.join([*filter(lambda x: x != '',re.compile('\>([\d:\.]*)\<').findall(td.html()))]) + for td in doc: + trs = td.children() + ip_str = '' + for tr in trs: + attrib = tr.attrib + if 'style' in attrib and 'none' in tr.attrib['style']: + continue + ip_str+= '' if not tr.text else tr.text + addr_split = ip_str.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + else: + port = trs[-1].text + host = ip_str.replace(port,'') + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = GoubanjiaCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/ihuan.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/ihuan.py new file mode 100644 index 0000000..4ca5e52 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/ihuan.py @@ -0,0 +1,36 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +from pyquery import PyQuery as 
pq +import time +BASE_URL = 'https://ip.ihuan.me/today/{path}.html' + + +class IhuanCrawler(BaseCrawler): + """ + ip ihuan crawler, https://ip.ihuan.me + """ + path = time.strftime("%Y/%m/%d/%H", time.localtime()) + urls = [BASE_URL.format(path=path)] + ignore = False + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + # doc = pq(html)('.text-left') + ip_address = re.compile('([\d:\.]*).*?
') + hosts_ports = ip_address.findall(html) + for addr in hosts_ports: + addr_split = addr.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = IhuanCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/ip3366.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/ip3366.py new file mode 100644 index 0000000..dfbc06f --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/ip3366.py @@ -0,0 +1,32 @@ +from proxypool.crawlers.base import BaseCrawler +from proxypool.schemas.proxy import Proxy +import re + + +MAX_PAGE = 3 +BASE_URL = 'http://www.ip3366.net/free/?stype={stype}&page={page}' + + +class IP3366Crawler(BaseCrawler): + """ + ip3366 crawler, http://www.ip3366.net/ + """ + urls = [BASE_URL.format(stype=stype,page=i) for stype in range(1,3) for i in range(1, 8)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + ip_address = re.compile('\s*(.*?)\s*(.*?)') + # \s * 匹配空格,起到换行作用 + re_ip_address = ip_address.findall(html) + for address, port in re_ip_address: + proxy = Proxy(host=address.strip(), port=int(port.strip())) + yield proxy + + +if __name__ == '__main__': + crawler = IP3366Crawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/ip89.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/ip89.py new file mode 100644 index 0000000..f67c387 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/ip89.py @@ -0,0 +1,33 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re + +MAX_NUM = 9999 +BASE_URL = 'http://api.89ip.cn/tqdl.html?api=1&num={MAX_NUM}&port=&address=&isp='.format(MAX_NUM=MAX_NUM) + + +class Ip89Crawler(BaseCrawler): + """ + 89ip crawler, http://api.89ip.cn + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + ip_address = re.compile('([\d:\.]*)
') + hosts_ports = ip_address.findall(html) + for addr in hosts_ports: + addr_split = addr.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = Ip89Crawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/iphai.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/iphai.py new file mode 100644 index 0000000..baa7983 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/iphai.py @@ -0,0 +1,35 @@ +from proxypool.crawlers.base import BaseCrawler +from proxypool.schemas.proxy import Proxy +import re + + +BASE_URL = 'http://www.iphai.com/' + +class IPHaiCrawler(BaseCrawler): + """ + iphai crawler, http://www.iphai.com/ + """ + urls = [BASE_URL] + ignore = True + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + find_tr = re.compile('(.*?)', re.S) + trs = find_tr.findall(html) + for s in range(1, len(trs)): + find_ip = re.compile('\s+(\d+\.\d+\.\d+\.\d+)\s+', re.S) + re_ip_address = find_ip.findall(trs[s]) + find_port = re.compile('\s+(\d+)\s+', re.S) + re_port = find_port.findall(trs[s]) + for address, port in zip(re_ip_address, re_port): + proxy = Proxy(host=address.strip(), port=int(port.strip())) + yield proxy + +if __name__ == '__main__': + crawler = IPHaiCrawler() + for proxy in crawler.crawl(): + print(proxy) + diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/jiangxianli.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/jiangxianli.py new file mode 100644 index 0000000..861dd1e --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/jiangxianli.py @@ -0,0 +1,39 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import json + + +BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}' + +MAX_PAGE = 3 + + +class JiangxianliCrawler(BaseCrawler): + """ + jiangxianli crawler,https://ip.jiangxianli.com/ + """ + + urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + + result = json.loads(html) + if result['code'] != 0: + return + MAX_PAGE = int(result['data']['last_page']) + hosts_ports = result['data']['data'] + for ip_address in hosts_ports: + if(ip_address): + host = ip_address['ip'] + port = ip_address['port'] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = JiangxianliCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/kuaidaili.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/kuaidaili.py new file mode 100644 index 0000000..3602833 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/kuaidaili.py @@ -0,0 +1,33 @@ +from proxypool.crawlers.base import BaseCrawler +from proxypool.schemas.proxy import Proxy +import re +from pyquery import PyQuery as pq + + +BASE_URL = 'https://www.kuaidaili.com/free/{type}/{page}/' +MAX_PAGE = 3 + + +class KuaidailiCrawler(BaseCrawler): + """ + kuaidaili crawler, https://www.kuaidaili.com/ + """ + urls = [BASE_URL.format(type=type,page=page) for type in ('intr','inha') for page in range(1, MAX_PAGE + 1)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc 
= pq(html) + for item in doc('table tr').items(): + td_ip = item.find('td[data-title="IP"]').text() + td_port = item.find('td[data-title="PORT"]').text() + if td_ip and td_port: + yield Proxy(host=td_ip, port=td_port) + + +if __name__ == '__main__': + crawler = KuaidailiCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/seofangfa.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/seofangfa.py new file mode 100644 index 0000000..1293fc0 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/seofangfa.py @@ -0,0 +1,34 @@ +import requests +from pyquery import PyQuery as pq + +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + +requests.packages.urllib3.disable_warnings() +BASE_URL = "https://proxy.seofangfa.com/" +MAX_PAGE = 1 + + +class SeoFangFaCrawler(BaseCrawler): + """ + seo方法 crawler, https://proxy.seofangfa.com/ + """ + urls = ["https://proxy.seofangfa.com/"] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('.table tr:gt(0)').items() + for tr in trs: + host = tr.find('td:nth-child(1)').text() + port = int(tr.find('td:nth-child(2)').text()) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = SeoFangFaCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/taiyangdaili.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/taiyangdaili.py new file mode 100644 index 0000000..edbacad --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/taiyangdaili.py @@ -0,0 +1,31 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from pyquery import PyQuery as pq + +BaseUrl = 'http://www.taiyanghttp.com/free/page{num}' +MAX_PAGE = 3 + + +class TaiyangdailiCrawler(BaseCrawler): + """ + taiyangdaili crawler, http://www.taiyanghttp.com/free/ + """ + urls = [BaseUrl.format(num=i) for i in range(1, 6)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('#ip_list .tr.ip_tr').items() + for tr in trs: + host = tr.find('div:nth-child(1)').text() + port = tr.find('div:nth-child(2)').text() + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = TaiyangdailiCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/uqidata.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/uqidata.py new file mode 100644 index 0000000..3e54b2d --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/uqidata.py @@ -0,0 +1,49 @@ +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from loguru import logger + +BASE_URL = 'https://ip.uqidata.com/free/index.html' + + +class UqidataCrawler(BaseCrawler): + """ + Uqidata crawler, https://ip.uqidata.com/free/index.html + """ + urls = [BASE_URL] + ignore = True + + def encode(input_str): + tmp = [] + for i in range(len(input_str)): + tmp.append("ABCDEFGHIZ".find(input_str[i])) + result = "".join(str(i) for i in tmp) + result = int(result) >> 0x03 + return result + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = 
doc('#main_container .inner table tbody tr:nth-child(n+3)').items() + for tr in trs: + ip_html = tr('td.ip').find("*").items() + host = '' + for i in ip_html: + if i.attr('style') is not None and 'none' in i.attr('style'): + continue + if i.text() == '': + continue + host += i.text() + + port_code = tr('td.port').attr('class').split(' ')[1] + port = UqidataCrawler.encode(port_code) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = UqidataCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/xiaoshudaili.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/xiaoshudaili.py new file mode 100644 index 0000000..4476309 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/xiaoshudaili.py @@ -0,0 +1,54 @@ +import re +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + +BASE_URL = "http://www.xsdaili.cn/" +PAGE_BASE_URL = "http://www.xsdaili.cn/dayProxy/ip/{page}.html" +MAX_PAGE = 3 + + +class XiaoShuCrawler(BaseCrawler): + """ + 小舒代理 crawler, http://www.xsdaili.cn/ + """ + + def __init__(self): + """ + init urls + """ + try: + html = self.fetch(url=BASE_URL) + except: + self.urls = [] + return + doc = pq(html) + title = doc(".title:eq(0) a").items() + latest_page = 0 + for t in title: + res = re.search(r"/(\d+)\.html", t.attr("href")) + latest_page = int(res.group(1)) if res else 0 + if latest_page: + self.urls = [PAGE_BASE_URL.format(page=page) for page in range( + latest_page - MAX_PAGE, latest_page)] + else: + self.urls = [] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + contents = doc('.cont').text() + contents = contents.split("\n") + for content in contents: + c = content[:content.find("@")] + host, port = c.split(":") + yield Proxy(host=host, port=int(port)) + + +if __name__ == '__main__': + crawler = XiaoShuCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/xicidaili.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/xicidaili.py new file mode 100644 index 0000000..53a4872 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/xicidaili.py @@ -0,0 +1,35 @@ +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from loguru import logger + +BASE_URL = 'https://www.xicidaili.com/' + + +class XicidailiCrawler(BaseCrawler): + """ + xididaili crawler, https://www.xicidaili.com/ + """ + urls = [BASE_URL] + ignore = True + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + items = doc('#ip_list tr:contains(高匿)').items() + for item in items: + country = item.find('td.country').text() + if not country or country.strip() != '高匿': + continue + host = item.find('td:nth-child(2)').text() + port = int(item.find('td:nth-child(3)').text()) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = XicidailiCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/xiladaili.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/xiladaili.py new file mode 100644 index 0000000..70a75ff --- /dev/null +++ 
b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/xiladaili.py @@ -0,0 +1,32 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from lxml import etree + +BASE_URL = "http://www.xiladaili.com/" +MAX_PAGE = 5 + + +class XiladailiCrawler(BaseCrawler): + """ + xiladaili crawler, http://www.xiladaili.com/ + """ + urls = ["http://www.xiladaili.com/"] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + etree_html = etree.HTML(html) + ip_ports = etree_html.xpath("//tbody/tr/td[1]/text()") + + for ip_port in ip_ports: + host = ip_port.partition(":")[0] + port = ip_port.partition(":")[2] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = XiladailiCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/yqie.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/yqie.py new file mode 100644 index 0000000..fb3feaf --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/yqie.py @@ -0,0 +1,32 @@ +from pyquery import PyQuery as pq + +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler + +BASE_URL = "http://ip.yqie.com/ipproxy.htm" +MAX_PAGE = 1 + + +class YqIeCrawler(BaseCrawler): + """ + ip yqie crawler, http://ip.yqie.com/ipproxy.htm + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('#GridViewOrder tr:gt(0)').items() + for tr in trs: + host = tr.find('td:nth-child(1)').text() + port = int(tr.find('td:nth-child(2)').text()) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = YqIeCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/zhandaye.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/zhandaye.py new file mode 100644 index 0000000..1522cdf --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/crawlers/public/zhandaye.py @@ -0,0 +1,59 @@ +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from loguru import logger +import re + + +BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html' +MAX_PAGE = 5 * 2 + + +class ZhandayeCrawler(BaseCrawler): + """ + zhandaye crawler, https://www.zdaye.com/dayProxy/ + """ + urls_catalog = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE)] + headers = { + 'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36' + } + urls = [] + ignore = True + + def crawl(self): + self.crawl_catalog() + yield from super().crawl() + + def crawl_catalog(self): + for url in self.urls_catalog: + logger.info(f'fetching {url}') + html = self.fetch(url, headers=self.headers) + self.parse_catalog(html) + + def parse_catalog(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + for item in doc('#J_posts_list .thread_item div div p a').items(): + url = 'https://www.zdaye.com' + item.attr('href') + logger.info(f'get detail url: {url}') + self.urls.append(url) + + def parse(self, html): + doc = pq(html) + trs = doc('.cont br').items() + for tr in trs: + line = tr[0].tail + match = re.search(r'(\d+\.\d+\.\d+\.\d+):(\d+)', line) + if match: + host = match.group(1) + port = 
match.group(2) + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = ZhandayeCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/exceptions/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/exceptions/__init__.py new file mode 100644 index 0000000..b54b1e8 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/exceptions/__init__.py @@ -0,0 +1 @@ +from .empty import PoolEmptyException \ No newline at end of file diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/exceptions/empty.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/exceptions/empty.py new file mode 100644 index 0000000..255c7fb --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/exceptions/empty.py @@ -0,0 +1,7 @@ +class PoolEmptyException(Exception): + def __str__(self): + """ + proxypool is used out + :return: + """ + return repr('no proxy in proxypool') diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/getter.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/getter.py new file mode 100644 index 0000000..ca4105d --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/getter.py @@ -0,0 +1,43 @@ +from loguru import logger +from proxypool.storages.redis1 import RedisClient +from proxypool.setting import PROXY_NUMBER_MAX +from proxypool.crawlers import __all__ as crawlers_cls + + +class Getter(object): + """ + getter of proxypool + """ + + def __init__(self): + """ + init db and crawlers + """ + self.redis = RedisClient() + self.crawlers_cls = crawlers_cls + self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls] + + def is_full(self): + """ + if proxypool if full + return: bool + """ + return self.redis.count() >= PROXY_NUMBER_MAX + + @logger.catch + def run(self): + """ + run crawlers to get proxy + :return: + """ + if self.is_full(): + return + for crawler in self.crawlers: + logger.info(f'crawler {crawler} to get proxy') + for proxy in crawler.crawl(): + self.redis.add(proxy) + + +if __name__ == '__main__': + getter = Getter() + getter.run() diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/server.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/server.py new file mode 100644 index 0000000..a953e36 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/server.py @@ -0,0 +1,92 @@ +from flask import Flask, g, request +from proxypool.storages.redis1 import RedisClient +from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV +import functools + +__all__ = ['app'] + +app = Flask(__name__) +if IS_DEV: + app.debug = True + + +def auth_required(func): + @functools.wraps(func) + def decorator(*args, **kwargs): + # conditional decorator, when setting API_KEY is set, otherwise just ignore this decorator + if API_KEY == "": + return func(*args, **kwargs) + if request.headers.get('API-KEY', None) is not None: + api_key = request.headers.get('API-KEY') + else: + return {"message": "Please provide an API key in header"}, 400 + # Check if API key is correct and valid + if request.method == "GET" and api_key == API_KEY: + return func(*args, **kwargs) + else: + return {"message": "The provided API key is not valid"}, 403 
+ + return decorator + + +def get_conn(): + """ + get redis client object + :return: + """ + if not hasattr(g, 'redis'): + g.redis = RedisClient() + return g.redis + + +@app.route('/') +@auth_required +def index(): + """ + get home page, you can define your own templates + :return: + """ + return '
<h2>Welcome to Proxy Pool System</h2>
' + + +@app.route('/random') +@auth_required +def get_proxy(): + """ + get a random proxy + :return: get a random proxy + """ + conn = get_conn() + return conn.random().string() + + +@app.route('/all') +@auth_required +def get_proxy_all(): + """ + get a random proxy + :return: get a random proxy + """ + conn = get_conn() + proxies = conn.all() + proxies_string = '' + if proxies: + for proxy in proxies: + proxies_string += str(proxy) + '\n' + + return proxies_string + + +@app.route('/count') +@auth_required +def get_count(): + """ + get the count of proxies + :return: count, int + """ + conn = get_conn() + return str(conn.count()) + + +if __name__ == '__main__': + app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/tester.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/tester.py new file mode 100644 index 0000000..aa14910 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/processors/tester.py @@ -0,0 +1,108 @@ +import asyncio +import aiohttp +from loguru import logger +from proxypool.schemas import Proxy +from proxypool.storages.redis1 import RedisClient +from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, TEST_ANONYMOUS, \ + TEST_DONT_SET_MAX_SCORE,IS_PAID +from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError +from asyncio import TimeoutError + +EXCEPTIONS = ( + ClientProxyConnectionError, + ConnectionRefusedError, + TimeoutError, + ServerDisconnectedError, + ClientOSError, + ClientHttpProxyError, + AssertionError +) + + +class Tester(object): + """ + tester for testing proxies in queue + """ + + def __init__(self): + """ + init redis + """ + self.redis = RedisClient() + self.loop = asyncio.get_event_loop() + + async def test(self, proxy: Proxy): + """ + test single proxy + :param proxy: Proxy object + :return: + """ + async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: + try: + logger.debug(f'testing {proxy.string()}') + # if TEST_ANONYMOUS is True, make sure that + # the proxy has the effect of hiding the real IP + if TEST_ANONYMOUS: + url = 'https://httpbin.org/ip' + auth = "d2118699212:bxb0p3l8" + if IS_PAID: + proxys = f'http://{auth}@{proxy.string()}' + else: + proxys = f'http://{proxy.string()}' + async with session.get(url, timeout=TEST_TIMEOUT) as response: + resp_json = await response.json() + origin_ip = resp_json['origin'] + async with session.get(url, proxy=proxys, timeout=TEST_TIMEOUT) as response: + resp_json = await response.json() + anonymous_ip = resp_json['origin'] + # 通过去获取https://httpbin.org/ip返回ip是否相同来判断是否代理成功 + assert origin_ip != anonymous_ip + assert proxy.host == anonymous_ip + async with session.get(TEST_URL, proxy=proxys, timeout=TEST_TIMEOUT, + allow_redirects=False) as response: + if response.status in TEST_VALID_STATUS: + if TEST_DONT_SET_MAX_SCORE: + logger.debug(f'proxy {proxy.string()} is valid, remain current score') + else: + self.redis.max(proxy) + logger.debug(f'proxy {proxy.string()} is valid, set max score') + else: + self.redis.decrease(proxy) + logger.debug(f'proxy {proxy.string()} is invalid, decrease score') + except EXCEPTIONS: + # 如果报错了就是用redis减分 + self.redis.decrease(proxy) + logger.debug(f'proxy {proxy.string()} is invalid, decrease score') + + @logger.catch + def run(self): + """ + test main method + :return: + """ + # event loop of aiohttp + logger.info('stating tester...') + count = 
self.redis.count() + logger.debug(f'{count} proxies to test') + cursor = 0 + while True: + logger.debug(f'testing proxies use cursor {cursor}, count {TEST_BATCH}') + cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH) + if proxies: + tasks = [self.test(proxy) for proxy in proxies] + self.loop.run_until_complete(asyncio.wait(tasks)) + if not cursor: + break + + +def run_tester(): + host = '96.113.165.182' + port = '3128' + tasks = [tester.test(Proxy(host=host, port=port))] + tester.loop.run_until_complete(asyncio.wait(tasks)) + + +if __name__ == '__main__': + tester = Tester() + tester.run() + # run_tester() diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/scheduler.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/scheduler.py new file mode 100644 index 0000000..a2d18ab --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/scheduler.py @@ -0,0 +1,143 @@ +import time +import multiprocessing +from proxypool.processors.server import app +from proxypool.processors.getter import Getter +from proxypool.processors.tester import Tester +from proxypool.setting import APP_PROD_METHOD_GEVENT, APP_PROD_METHOD_MEINHELD, APP_PROD_METHOD_TORNADO, CYCLE_GETTER, CYCLE_TESTER, API_HOST, \ + API_THREADED, API_PORT, ENABLE_SERVER, IS_PROD, APP_PROD_METHOD, \ + ENABLE_GETTER, ENABLE_TESTER, IS_WINDOWS +from loguru import logger + + +if IS_WINDOWS: + multiprocessing.freeze_support() + +tester_process, getter_process, server_process = None, None, None + + +class Scheduler(): + """ + scheduler + """ + + def run_tester(self, cycle=CYCLE_TESTER): + """ + run tester + """ + if not ENABLE_TESTER: + logger.info('tester not enabled, exit') + return + tester = Tester() + loop = 0 + while True: + logger.debug(f'tester loop {loop} start...') + tester.run() + loop += 1 + time.sleep(cycle) + + def run_getter(self, cycle=CYCLE_GETTER): + """ + run getter + """ + if not ENABLE_GETTER: + logger.info('getter not enabled, exit') + return + getter = Getter() + loop = 0 + while True: + logger.debug(f'getter loop {loop} start...') + getter.run() + loop += 1 + time.sleep(cycle) + + def run_server(self): + """ + run server for api + """ + if not ENABLE_SERVER: + logger.info('server not enabled, exit') + return + if IS_PROD: + if APP_PROD_METHOD == APP_PROD_METHOD_GEVENT: + try: + from gevent.pywsgi import WSGIServer + except ImportError as e: + logger.exception(e) + else: + http_server = WSGIServer((API_HOST, API_PORT), app) + http_server.serve_forever() + + elif APP_PROD_METHOD == APP_PROD_METHOD_TORNADO: + try: + from tornado.wsgi import WSGIContainer + from tornado.httpserver import HTTPServer + from tornado.ioloop import IOLoop + except ImportError as e: + logger.exception(e) + else: + http_server = HTTPServer(WSGIContainer(app)) + http_server.listen(API_PORT) + IOLoop.instance().start() + + elif APP_PROD_METHOD == APP_PROD_METHOD_MEINHELD: + try: + import meinheld + except ImportError as e: + logger.exception(e) + else: + meinheld.listen((API_HOST, API_PORT)) + meinheld.run(app) + + else: + logger.error("unsupported APP_PROD_METHOD") + return + else: + app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED, use_reloader=False) + + def run(self): + global tester_process, getter_process, server_process + try: + logger.info('starting proxypool...') + if ENABLE_TESTER: + tester_process = multiprocessing.Process( + target=self.run_tester) + logger.info(f'starting tester, pid {tester_process.pid}...') + tester_process.start() + + if ENABLE_GETTER: + getter_process = 
multiprocessing.Process( + target=self.run_getter) + logger.info(f'starting getter, pid {getter_process.pid}...') + getter_process.start() + + if ENABLE_SERVER: + server_process = multiprocessing.Process( + target=self.run_server) + logger.info(f'starting server, pid {server_process.pid}...') + server_process.start() + + tester_process and tester_process.join() + getter_process and getter_process.join() + server_process and server_process.join() + except KeyboardInterrupt: + logger.info('received keyboard interrupt signal') + tester_process and tester_process.terminate() + getter_process and getter_process.terminate() + server_process and server_process.terminate() + finally: + # must call join method before calling is_alive + tester_process and tester_process.join() + getter_process and getter_process.join() + server_process and server_process.join() + logger.info( + f'tester is {"alive" if tester_process.is_alive() else "dead"}') + logger.info( + f'getter is {"alive" if getter_process.is_alive() else "dead"}') + logger.info( + f'server is {"alive" if server_process.is_alive() else "dead"}') + logger.info('proxy terminated') + + +if __name__ == '__main__': + scheduler = Scheduler() + scheduler.run() diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/schemas/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/schemas/__init__.py new file mode 100644 index 0000000..699f6dc --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/schemas/__init__.py @@ -0,0 +1 @@ +from .proxy import Proxy \ No newline at end of file diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/schemas/proxy.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/schemas/proxy.py new file mode 100644 index 0000000..8be3fb3 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/schemas/proxy.py @@ -0,0 +1,30 @@ +from attr import attrs, attr + + +@attrs +class Proxy(object): + """ + proxy schema + """ + host = attr(type=str, default=None) + port = attr(type=int, default=None) + + def __str__(self): + """ + to string, for print + :return: + """ + return f'{self.host}:{self.port}' + + def string(self): + """ + to string + :return: : + """ + return self.__str__() + + +if __name__ == '__main__': + proxy = Proxy(host='8.8.8.8', port=8888) + print('proxy', proxy) + print('proxy', proxy.string()) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/setting.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/setting.py new file mode 100644 index 0000000..98eee1a --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/setting.py @@ -0,0 +1,123 @@ +import platform +from os.path import dirname, abspath, join +from environs import Env +from loguru import logger +import shutil + + +env = Env() +env.read_env() + +# definition of flags +IS_WINDOWS = platform.system().lower() == 'windows' + +# definition of dirs +ROOT_DIR = dirname(dirname(abspath(__file__))) +LOG_DIR = join(ROOT_DIR, env.str('LOG_DIR', 'logs')) + +# definition of environments +DEV_MODE, TEST_MODE, PROD_MODE = 'dev', 'test', 'prod' +APP_ENV = env.str('APP_ENV', DEV_MODE).lower() +APP_DEBUG = env.bool('APP_DEBUG', True if APP_ENV == DEV_MODE else False) +APP_DEV = IS_DEV = APP_ENV == DEV_MODE +APP_PROD = IS_PROD = APP_ENV == PROD_MODE +APP_TEST = IS_TEST = APP_ENV == TEST_MODE + + +# Which WSGI container is used to run applications +# - gevent: pip install gevent +# - tornado: pip install tornado +# - meinheld: pip install meinheld +APP_PROD_METHOD_GEVENT = 'gevent' 
+APP_PROD_METHOD_TORNADO = 'tornado' +APP_PROD_METHOD_MEINHELD = 'meinheld' +APP_PROD_METHOD = env.str('APP_PROD_METHOD', APP_PROD_METHOD_GEVENT).lower() + +# redis host +# 更改了host的权限 +REDIS_HOST = env.str('PROXYPOOL_REDIS_HOST', + env.str('REDIS_HOST', '192.168.118.202')) +# redis port +REDIS_PORT = env.int('PROXYPOOL_REDIS_PORT', env.int('REDIS_PORT', 6379)) +# redis password, if no password, set it to None +REDIS_PASSWORD = env.str('PROXYPOOL_REDIS_PASSWORD', + env.str('REDIS_PASSWORD', None)) +# redis db, if no choice, set it to 0 +REDIS_DB = env.int('PROXYPOOL_REDIS_DB', env.int('REDIS_DB', 0)) +# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0, +# please refer to https://redis-py.readthedocs.io/en/stable/connections.html#redis.client.Redis.from_url +REDIS_CONNECTION_STRING = env.str( + 'PROXYPOOL_REDIS_CONNECTION_STRING', env.str('REDIS_CONNECTION_STRING', None)) + +# redis hash table key name +REDIS_KEY = env.str('PROXYPOOL_REDIS_KEY', env.str( + 'REDIS_KEY', 'proxies:universal')) + +# definition of proxy scores +IS_PAID =env.bool('IS_PAID', True) +PROXY_SCORE_MAX = env.int('PROXY_SCORE_MAX', 100) +PROXY_SCORE_MIN = env.int('PROXY_SCORE_MIN', 0) +PROXY_SCORE_INIT = env.int('PROXY_SCORE_INIT', 10) + +# definition of proxy number +PROXY_NUMBER_MAX = 50000 +PROXY_NUMBER_MIN = 0 + +# definition of tester cycle, it will test every CYCLE_TESTER second +CYCLE_TESTER = env.int('CYCLE_TESTER', 20) +# definition of getter cycle, it will get proxy every CYCLE_GETTER second +CYCLE_GETTER = env.int('CYCLE_GETTER', 100) +GET_TIMEOUT = env.int('GET_TIMEOUT', 10) + +# definition of tester +TEST_URL = env.str('TEST_URL', 'http://www.baidu.com') +TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10) +TEST_BATCH = env.int('TEST_BATCH', 20) +# only save anonymous proxy +TEST_ANONYMOUS = env.bool('TEST_ANONYMOUS', True) +# TEST_HEADERS = env.json('TEST_HEADERS', { +# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36', +# }) +TEST_VALID_STATUS = env.list('TEST_VALID_STATUS', [200, 206, 302]) +# whether to set max score when one proxy is tested valid +TEST_DONT_SET_MAX_SCORE = env.bool('TEST_DONT_SET_MAX_SCORE', False) + +# definition of api +API_HOST = env.str('API_HOST', '0.0.0.0') +API_PORT = env.int('API_PORT', 5555) +API_THREADED = env.bool('API_THREADED', True) +# add an api key to get proxy +# need a header of `API-KEY` in get request to pass the authenticate +# API_KEY='', do not need `API-KEY` header +API_KEY = env.str('API_KEY', '') + +# flags of enable +ENABLE_TESTER = env.bool('ENABLE_TESTER', True) +ENABLE_GETTER = env.bool('ENABLE_GETTER', True) +ENABLE_SERVER = env.bool('ENABLE_SERVER', True) + + +ENABLE_LOG_FILE = env.bool('ENABLE_LOG_FILE', True) +ENABLE_LOG_RUNTIME_FILE = env.bool('ENABLE_LOG_RUNTIME_FILE', True) +ENABLE_LOG_ERROR_FILE = env.bool('ENABLE_LOG_ERROR_FILE', True) + + +LOG_LEVEL_MAP = { + DEV_MODE: "DEBUG", + TEST_MODE: "INFO", + PROD_MODE: "ERROR" +} + +LOG_LEVEL = LOG_LEVEL_MAP.get(APP_ENV) +LOG_ROTATION = env.str('LOG_ROTATION', '500MB') +LOG_RETENTION = env.str('LOG_RETENTION', '1 week') + +if ENABLE_LOG_FILE: + if ENABLE_LOG_RUNTIME_FILE: + logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), + level=LOG_LEVEL, rotation=LOG_ROTATION, retention=LOG_RETENTION) + if ENABLE_LOG_ERROR_FILE: + logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), + level='ERROR', rotation=LOG_ROTATION) +else: + shutil.rmtree(LOG_DIR, 
ignore_errors=True) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/storages/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/storages/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/storages/redis1.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/storages/redis1.py new file mode 100644 index 0000000..0482fa1 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/storages/redis1.py @@ -0,0 +1,149 @@ +import redis +from proxypool.exceptions import PoolEmptyException +from proxypool.schemas.proxy import Proxy +from proxypool.setting import REDIS_CONNECTION_STRING, REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB, REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MIN, \ + PROXY_SCORE_INIT +from random import choice +from typing import List +from loguru import logger +from proxypool.utils.proxy import is_valid_proxy, convert_proxy_or_proxies + + +REDIS_CLIENT_VERSION = redis.__version__ +print(REDIS_CLIENT_VERSION) +IS_REDIS_VERSION_2 = REDIS_CLIENT_VERSION.startswith('2.') + + +class RedisClient(object): + """ + redis connection client of proxypool + """ + + def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=REDIS_DB, + connection_string=REDIS_CONNECTION_STRING, **kwargs): + """ + init redis client + :param host: redis host + :param port: redis port + :param password: redis password + :param connection_string: redis connection_string + """ + # if set connection_string, just use it + if connection_string: + self.db = redis.StrictRedis.from_url(connection_string, decode_responses=True, **kwargs) + else: + self.db = redis.StrictRedis( + host=host, port=port, password=password, db=db, decode_responses=True, **kwargs) + + # 增加可用代理 + def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int: + """ + add proxy and set it to init score + :param proxy: proxy, ip:port, like 8.8.8.8:88 + :param score: int score + :return: result + """ + if not is_valid_proxy(f'{proxy.host}:{proxy.port}'): + logger.info(f'invalid proxy {proxy}, throw it') + return + if not self.exists(proxy): + if IS_REDIS_VERSION_2: + # if False: + return self.db.zadd(REDIS_KEY, score, proxy.string()) + return self.db.zadd(REDIS_KEY, {proxy.string(): score}) + + # 随机挑选一个可以用代理 + def random(self) -> Proxy: + """ + get random proxy + firstly try to get proxy with max score + if not exists, try to get proxy by rank + if not exists, raise error + :return: proxy, like 8.8.8.8:8 + """ + # try to get proxy with max score + proxies = self.db.zrangebyscore( + REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX) + if len(proxies): + return convert_proxy_or_proxies(choice(proxies)) + # else get proxy by rank + proxies = self.db.zrevrange( + REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX) + if len(proxies): + return convert_proxy_or_proxies(choice(proxies)) + # else raise error + raise PoolEmptyException + + # 给不好的代理减分 + def decrease(self, proxy: Proxy) -> int: + """ + decrease score of proxy, if small than PROXY_SCORE_MIN, delete it + :param proxy: proxy + :return: new score + """ + if IS_REDIS_VERSION_2: + # if False: + self.db.zincrby(REDIS_KEY, proxy.string(), -1) + else: + self.db.zincrby(REDIS_KEY, -1, proxy.string()) + score = self.db.zscore(REDIS_KEY, proxy.string()) + logger.info(f'{proxy.string()} score decrease 1, current {score}') + # 小于最小值直接删除 + if score <= PROXY_SCORE_MIN: + logger.info(f'{proxy.string()} current score {score}, remove') + self.db.zrem(REDIS_KEY, proxy.string()) + + # 是否存在这个代理 + def 
exists(self, proxy: Proxy) -> bool: + """ + if proxy exists + :param proxy: proxy + :return: if exists, bool + """ + return not self.db.zscore(REDIS_KEY, proxy.string()) is None + + # 将代理的score设置成最大的 + def max(self, proxy: Proxy) -> int: + """ + set proxy to max score + :param proxy: proxy + :return: new score + """ + logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}') + if IS_REDIS_VERSION_2: + # if False: + return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string()) + return self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX}) + + # 有多少个代理 + def count(self) -> int: + """ + get count of proxies + :return: count, int + """ + return self.db.zcard(REDIS_KEY) + + # 返回所有代理 + def all(self) -> List[Proxy]: + """ + get all proxies + :return: list of proxies + """ + return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)) + + # 根据游标和数量,返回对应个数的代理 + def batch(self, cursor, count) -> List[Proxy]: + """ + get batch of proxies + :param cursor: scan cursor + :param count: scan count + :return: list of proxies + """ + cursor, proxies = self.db.zscan(REDIS_KEY, cursor, count=count) + return cursor, convert_proxy_or_proxies([i[0] for i in proxies]) + + +if __name__ == '__main__': + conn = RedisClient() + result = conn.random() + print(result) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/utils/__init__.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/utils/proxy.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/utils/proxy.py new file mode 100644 index 0000000..2a97ce4 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/proxypool/utils/proxy.py @@ -0,0 +1,94 @@ +from proxypool.schemas import Proxy + +import sys +import os + +# pythonpath= os.path.abspath(os.path.dirname(os.path.dirname(__file__))) +# print(pythonpath) +# sys.path.insert(0,pythonpath) + +def is_valid_proxy(data): + """ + check this string is within proxy format + """ + if is_auth_proxy(data): + host, port = extract_auth_proxy(data) + return is_ip_valid(host) and is_port_valid(port) + elif data.__contains__(':'): + ip = data.split(':')[0] + port = data.split(':')[1] + return is_ip_valid(ip) and is_port_valid(port) + else: + return is_ip_valid(data) + + +def is_ip_valid(ip): + """ + check this string is within ip format + """ + if is_auth_proxy(ip): + ip = ip.split('@')[1] + a = ip.split('.') + if len(a) != 4: + return False + for x in a: + if not x.isdigit(): + return False + i = int(x) + if i < 0 or i > 255: + return False + return True + + +def is_port_valid(port): + return port.isdigit() + + +def convert_proxy_or_proxies(data): + """ + convert list of str to valid proxies or proxy + :param data: + :return: + """ + if not data: + return None + # if list of proxies + if isinstance(data, list): + result = [] + for item in data: + # skip invalid item + item = item.strip() + if not is_valid_proxy(item): continue + if is_auth_proxy(item): + host, port = extract_auth_proxy(item) + else: + host, port = item.split(':') + result.append(Proxy(host=host, port=int(port))) + return result + if isinstance(data, str) and is_valid_proxy(data): + if is_auth_proxy(data): + host, port = extract_auth_proxy(data) + else: + host, port = data.split(':') + return Proxy(host=host, port=int(port)) + + +def is_auth_proxy(data: str) -> bool: + return '@' in data + + +def extract_auth_proxy(data: str) -> (str, str): + """ + extract host and 
port from a proxy with authentication + """ + auth = data.split('@')[0] + ip_port = data.split('@')[1] + ip = ip_port.split(':')[0] + port = ip_port.split(':')[1] + host = auth + '@' + ip + return host, port + + +if __name__ == '__main__': + proxy = 'test1234:test5678.@117.68.216.212:32425' + print(extract_auth_proxy(proxy)) diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/release.sh b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/release.sh new file mode 100644 index 0000000..342cd06 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/release.sh @@ -0,0 +1,2 @@ +git tag -a "`date +'%Y%m%d'`" -m "Release `date +'%Y%m%d'`" +git push origin --tags \ No newline at end of file diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/requirements.txt b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/requirements.txt new file mode 100644 index 0000000..c9407c7 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/requirements.txt @@ -0,0 +1,17 @@ +environs>=9.3.0,<10.0.0 +Flask>=1.1.2,<2.0.0 +attrs>=20.3.0,<21.0.0 +retrying>=1.3.3,<2.0.0 +aiohttp>=3.8.1,<4.0.0 +requests>=2.25.1,<3.0.0 +loguru>=0.5.3,<1.0.0 +pyquery>=1.4.3,<2.0.0 +supervisor>=4.2.1,<5.0.0 +redis>=3.5.3,<4.0.0 +lxml>=4.6.5,<5.0.0 +fake_headers>=1.0.2,<2.0.0 +maxminddb_geolite2==2018.703 +gevent>=21.8.0,<22.0.0 +tornado>=6.0,<7.0 +itsdangerous==0.24 +MarkupSafe<2.1.0 diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/run.py b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/run.py new file mode 100644 index 0000000..e858da9 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/run.py @@ -0,0 +1,14 @@ +from proxypool.scheduler import Scheduler +import argparse + + +parser = argparse.ArgumentParser(description='ProxyPool') +parser.add_argument('--processor', type=str, help='processor to run') +args = parser.parse_args() + +if __name__ == '__main__': + # if processor set, just run it + if args.processor: + getattr(Scheduler(), f'run_{args.processor}')() + else: + Scheduler().run() diff --git a/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/supervisord.conf b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/supervisord.conf new file mode 100644 index 0000000..aff2cd6 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/ProxyPool/supervisord.conf @@ -0,0 +1,40 @@ +[unix_http_server] +file=/run/supervisor.sock +chmod=0700 + +[supervisord] +pidfile=/run/supervisord.pid +nodaemon=true + +[supervisorctl] +serverurl=unix:///run/supervisor.sock + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory=supervisor.rpcinterface:make_main_rpcinterface + +[program:tester] +process_name=tester +command=python3 run.py --processor tester +directory=/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 + +[program:getter] +process_name=getter +command=python3 run.py --processor getter +directory=/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 + +[program:server] +process_name=server +command=python3 run.py --processor server +directory=/app +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 \ No newline at end of file diff --git a/Spider/Chapter09_代理的使用/代理池的维护/baseinformation.txt b/Spider/Chapter09_代理的使用/代理池的维护/baseinformation.txt new file mode 100644 index 0000000..35ae3f9 --- /dev/null +++ b/Spider/Chapter09_代理的使用/代理池的维护/baseinformation.txt @@ -0,0 +1,2 @@ +ProxyPool所有内容参考:https://github.com/Python3WebSpider/ProxyPool +上述详细解释了如何部署代理池,基于docker或者k8s \ No newline at end of file diff 
--git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/.github/ISSUE_TEMPLATE/bug_report.md b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..22003f1 --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,32 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: Germey + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Environments (please complete the following information):** + - OS: [e.g. macOS 10.15.2] + - Python [e.g. Python 3.6] + - Browser [e.g. Chrome 67 ] + +**Additional context** +Add any other context about the problem here. diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/.github/workflows/build.yml b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/.github/workflows/build.yml new file mode 100644 index 0000000..6b1b6c2 --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/.github/workflows/build.yml @@ -0,0 +1,24 @@ +name: build +on: + push: + branches: + - master + paths-ignore: + - .gitignore + - README.md + - '.github/ISSUE_TEMPLATE/**' +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout Source + uses: actions/checkout@v1 + - name: Docker Login + run: docker login -u germey -p ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} + - name: Build the Docker Image + run: docker-compose build + - name: Tag and Push Master Version + run: | + docker tag germey/accountpool germey/accountpool:master + docker push germey/accountpool:master + diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/.gitignore b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/.gitignore new file mode 100644 index 0000000..11b7b2c --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/.gitignore @@ -0,0 +1,144 @@ +/.idea +*.pyc +ghostdriver.log +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +logs/ \ No newline at end of file diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/Dockerfile b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/Dockerfile new file mode 100644 index 0000000..7770153 --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.6 +WORKDIR /app +COPY requirements.txt . +RUN pip install -r requirements.txt +COPY . . +CMD ["supervisord", "-c", "supervisord.conf"] diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/README.md b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/README.md new file mode 100644 index 0000000..32b27e6 --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/README.md @@ -0,0 +1,216 @@ +# AccountPool + +![build](https://github.com/Python3WebSpider/AccountPool/workflows/build/badge.svg) +![](https://img.shields.io/badge/python-3.6%2B-brightgreen) +![Docker Pulls](https://img.shields.io/docker/pulls/germey/accountpool) + +简易高效的账号池,提供如下功能: + +- 定时模拟登录账号,将 Cookies 或 JWT 等信息存储到 Redis 数据库。 +- 定时测试,剔除不可用 Cookies 或 JWT。 +- 提供 API,随机取用测试通过的可用 Cookies 或 JWT。 + +## 使用要求 + +可以通过两种方式来运行账号池,一种方式是使用 Docker(推荐),另一种方式是常规方式运行。 + +### Docker + +如果使用 Docker,则需要安装如下环境: + +- Docker +- Docker-Compose + +### 常规方式 + +常规方式要求有 Python 环境、Redis 环境,具体要求如下: + +- Python>=3.6 +- Redis + +## Docker 运行 + +如果安装好了 Docker 和 Docker-Compose,只需要一条命令即可运行。 + +```shell script +docker-compose up +``` + +运行结果类似如下: + +``` +redis4accountpool is up-to-date +Recreating accountpool ... done +Attaching to redis4accountpool, accountpool +redis4accountpool | 1:C 31 Aug 2023 03:53:10.335 * oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo +redis4accountpool | 1:C 31 Aug 2023 03:53:10.335 * Redis version=7.2.0, bits=64, commit=00000000, modified=0, pid=1, just started +redis4accountpool | 1:C 31 Aug 2023 03:53:10.335 # Warning: no config file specified, using the default config. 
In order to specify a config file use redis-server /path/to/redis.conf +redis4accountpool | 1:M 31 Aug 2023 03:53:10.335 * monotonic clock: POSIX clock_gettime +redis4accountpool | 1:M 31 Aug 2023 03:53:10.336 * Running mode=standalone, port=6379. +redis4accountpool | 1:M 31 Aug 2023 03:53:10.336 * Server initialized +redis4accountpool | 1:M 31 Aug 2023 03:53:10.336 * Ready to accept connections tcp +redis4accountpool | 1:C 31 Aug 2023 04:03:11.226 * oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo +redis4accountpool | 1:C 31 Aug 2023 04:03:11.226 * Redis version=7.2.0, bits=64, commit=00000000, modified=0, pid=1, just started +redis4accountpool | 1:C 31 Aug 2023 04:03:11.226 # Warning: no config file specified, using the default config. In order to specify a config file use redis-server /path/to/redis.conf +redis4accountpool | 1:M 31 Aug 2023 04:03:11.226 * monotonic clock: POSIX clock_gettime +redis4accountpool | 1:M 31 Aug 2023 04:03:11.227 * Running mode=standalone, port=6379. +redis4accountpool | 1:M 31 Aug 2023 04:03:11.227 * Server initialized +redis4accountpool | 1:M 31 Aug 2023 04:03:11.227 * Ready to accept connections tcp +accountpool | 2023-08-31 04:06:20,737 CRIT Supervisor is running as root. Privileges were not dropped because no user is specified in the config file. If you intend to run as root, you can set user=root in the config file to avoid this message. +accountpool | 2023-08-31 04:06:20,739 INFO supervisord started with pid 1 +accountpool | 2023-08-31 04:06:21,742 INFO spawned: 'generator' with pid 10 +accountpool | 2023-08-31 04:06:21,744 INFO spawned: 'server' with pid 11 +accountpool | 2023-08-31 04:06:21,746 INFO spawned: 'tester' with pid 12 +accountpool | 2023-08-31 04:06:21.990 | DEBUG | accountpool.scheduler:run_tester:31 - tester loop 0 start... +accountpool | 2023-08-31 04:06:21.990 | DEBUG | accountpool.scheduler:run_generator:46 - getter loop 0 start... +accountpool | * Running on all addresses. +accountpool | WARNING: This is a development server. Do not use it in a production deployment. 
+accountpool | * Running on http://172.24.0.3:6777/ (Press CTRL+C to quit) +accountpool | 2023-08-31 04:06:22.004 | DEBUG | accountpool.processors.generator:run:39 - start to run generator +accountpool | 2023-08-31 04:06:22.005 | DEBUG | accountpool.processors.generator:run:43 - start to generate credential of admin1 +accountpool | 2023-08-31 04:06:23,007 INFO success: generator entered RUNNING state, process has stayed up for > than 1 seconds (startsecs) +accountpool | 2023-08-31 04:06:23,007 INFO success: server entered RUNNING state, process has stayed up for > than 1 seconds (startsecs) +accountpool | 2023-08-31 04:06:23,007 INFO success: tester entered RUNNING state, process has stayed up for > than 1 seconds (startsecs) +``` + +可以看到 Redis、Generator、Server、Tester 都已经启动成功。 + +另外还需要导入一些账号信息到 Redis 数据库里面,由于已经用 Docker 启动了 Redis 数据库,运行在 6333 端口上。 + +这时候可以执行脚本: + +``` +export REDIS_PORT=6333 +python3 importer.py antispider7 +``` + +运行完成之后如果没有报错就说明账号导入成功了,可以自行连上 Redis 看下。 + +过一会访问 [http://localhost:6777/antispider7/random](http://localhost:6777/antispider7/random) 即可获取一个 [antispider7](https://antispider7.scrape.center) 的随机可用 Cookies。 + +## 常规方式运行 + +如果不使用 Docker 运行,配置好 Python、Redis 环境之后也可运行,步骤如下。 + +### 安装和配置 Redis + +本地安装 Redis、Docker 启动 Redis、远程 Redis 都是可以的,只要能正常连接使用即可。 + +首先可以需要一下环境变量,代理池会通过环境变量读取这些值。 + +设置 Redis 的环境变量有两种方式,一种是分别设置 host、port、password,另一种是设置连接字符串,设置方法分别如下: + +设置 host、port、password,如果 password 为空可以设置为空字符串,示例如下: + +```shell script +export REDIS_HOST='localhost' +export REDIS_PORT=6379 +export REDIS_PASSWORD='' +export REDIS_DB=0 +``` + +或者只设置连接字符串: + +```shell script +export REDIS_CONNECTION_STRING='redis://[password]@host:port/db' +``` + +如果没有密码也要设置为: + +```shell script +export REDIS_CONNECTION_STRING='redis://@host:port/db' +``` + +这里连接字符串的格式需要符合 `redis://[password]@host:port/db` 的格式,注意不要遗漏 `@`。 + +以上两种设置任选其一即可。 + +### 安装依赖包 + +这里强烈推荐使用 [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands) +或 [virtualenv](https://virtualenv.pypa.io/en/latest/user_guide.html) 创建虚拟环境,Python 版本不低于 3.6。 + +然后 pip 安装依赖即可: + +```shell script +pip3 install -r requirements.txt +``` + +### 运行代理池 + +两种方式运行账号池,一种是 Tester、Generator、Server 全部运行,另一种是按需分别运行。 + +一般来说可以选择全部运行,命令如下: + +```shell script +python3 run.py +``` + +运行之后会启动 Tester、Generator、Server,这时访问 [http://localhost:6777//random](http://localhost:6777//random) 即可获取一个随机可用代理。 + +或者如果你弄清楚了账号池的架构,可以按需分别运行,命令如下: + +```shell script +python3 run.py --processor getter +python3 run.py --processor tester +python3 run.py --processor server +``` + +这里 processor 可以指定运行 Tester、Generator 还是 Server。 + +## 可配置项 + +账号池可以通过设置环境变量来配置一些参数。 + +### 开关 + +- ENABLE_TESTER:允许 Tester 启动,默认 true +- ENABLE_GENERATOR:允许 Generator 启动,默认 true +- ENABLE_SERVER:运行 Server 启动,默认 true + +### 环境 + +- APP_ENV:运行环境,可以设置 dev、test、prod,即开发、测试、生产环境,默认 dev +- APP_DEBUG:调试模式,可以设置 true 或 false,默认 true + +### Redis 连接 + +- REDIS_HOST:Redis 的 Host +- REDIS_PORT:Redis 的端口 +- REDIS_PASSWORD:Redis 的密码 +- REDIS_DB:Redis 的数据库索引,如 0、1 +- REDIS_CONNECTION_STRING:Redis 连接字符串 +- REDIS_KEY:Redis 储存代理使用字典的名称 + +### 处理器 + +- CYCLE_TESTER:Tester 运行周期,即间隔多久运行一次测试,默认 20 秒 +- CYCLE_GETTER:Getter 运行周期,即间隔多久运行一次代理获取,默认 100 秒 +- API_HOST:代理 Server 运行 Host,默认 0.0.0.0 +- API_PORT:代理 Server 运行端口,默认 6777 +- API_THREADED:代理 Server 是否使用多线程,默认 true + +### 日志 + +- LOG_DIR:日志相对路径 +- LOG_RUNTIME_FILE:运行日志文件名称 +- LOG_ERROR_FILE:错误日志文件名称 + +## 部署 + +本项目提供了 Kubernetes 部署脚本,如需部署到 Kubernetes,执行如下命令即可: + +```shell script +cat deployment.yml | 
sed 's/\${TAG}/latest/g' | kubectl apply -f - +``` + +## 待开发 + +- [ ] 前端页面管理 +- [ ] 使用情况统计分析 + +如有一起开发的兴趣可以在 Issue 留言,非常感谢! + +## LICENSE + +MIT diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/__init__.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/exceptions/__init__.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/exceptions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/exceptions/init.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/exceptions/init.py new file mode 100644 index 0000000..bcf9b3e --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/exceptions/init.py @@ -0,0 +1,7 @@ +class InitException(Exception): + def __str__(self): + """ + init error + :return: + """ + return repr('init failed') diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/__init__.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/generator.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/generator.py new file mode 100644 index 0000000..7852131 --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/generator.py @@ -0,0 +1,111 @@ +from accountpool.exceptions.init import InitException +from accountpool.storages.redis import RedisClient +from loguru import logger + + +class BaseGenerator(object): + def __init__(self, website=None): + """ + init base generator + :param website: name of website + """ + self.website = website + if not self.website: + raise InitException + self.account_operator = RedisClient(type='account', website=self.website) + self.credential_operator = RedisClient(type='credential', website=self.website) + + def generate(self, username, password): + """ + generate method + :param username: username + :param password: password + :return: + """ + raise NotImplementedError + + def init(self): + """ + do init + """ + pass + + def run(self): + """ + run main process + :return: + """ + self.init() + logger.debug('start to run generator') + for username, password in self.account_operator.all().items(): + if self.credential_operator.get(username): + continue + logger.debug(f'start to generate credential of {username}') + self.generate(username, password) + + +import requests + + +class Antispider6Generator(BaseGenerator): + + def init(self): + """ + do init + """ + if self.account_operator.count() == 0: + self.account_operator.set('admin', 'admin') + self.account_operator.set('admin2', 'admin2') + + def generate(self, username, password): + """ + generate main process + """ + if self.credential_operator.get(username): + logger.debug(f'credential of {username} exists, skip') + return + login_url = 'https://antispider6.scrape.center/login' + s = requests.Session() + s.post(login_url, data={ + 'username': username, + 'password': password + }) + result = [] + for cookie in s.cookies: + print(cookie.name, cookie.value) + result.append(f'{cookie.name}={cookie.value}') + result = ';'.join(result) + logger.debug(f'get credential {result}') + self.credential_operator.set(username, result) + + +class Antispider7Generator(BaseGenerator): + + MAX_COUNT = 100 + + def init(self): + """ + do init + """ 
+ for i in range(1, self.MAX_COUNT + 1): + self.account_operator.set(f'admin{i}', f'admin{i}') + + def generate(self, username, password): + """ + generate main process + """ + if self.credential_operator.get(username): + logger.debug(f'credential of {username} exists, skip') + return + login_url = 'https://antispider7.scrape.center/api/login' + s = requests.Session() + r = s.post(login_url, json={ + 'username': username, + 'password': password + }) + if r.status_code != 200: + logger.error(f'error occurred while generating credential of {username}, error code {r.status_code}') + return + token = r.json().get('token') + logger.debug(f'get credential {token}') + self.credential_operator.set(username, token) diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/server.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/server.py new file mode 100644 index 0000000..74b7208 --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/server.py @@ -0,0 +1,69 @@ +import json +from flask import Flask, g +from accountpool.storages.redis import RedisClient +from accountpool.setting import GENERATOR_MAP +from loguru import logger + +__all__ = ['app'] + +app = Flask(__name__) + +account = 'account' +credential = 'credential' + + +@app.route('/') +def index(): + return '
<h2>Welcome to Account Pool System</h2>
' + + +def get_conn(): + """ + get connection + :return: + """ + for website in GENERATOR_MAP: + if not hasattr(g, website): + setattr(g, f'{website}_{credential}', RedisClient(credential, website)) + setattr(g, f'{website}_{account}', RedisClient(account, website)) + return g + + +@app.route('//random') +def random(website): + """ + ger random credential /weibo/random + :return: random credential + """ + g = get_conn() + result = getattr(g, f'{website}_{credential}').random() + logger.debug(f'get credential {result}') + return result + + +@app.route('//add//') +def add(website, username, password): + """ + add account /weibo/add/user/password + :param website: website + :param username: username + :param password: password + :return: + """ + g = get_conn() + getattr(g, f'{website}_{account}').set(username, password) + return json.dumps({'status': '1'}) + + +@app.route('//count') +def count(website): + """ + get count of credential + """ + g = get_conn() + count = getattr(g, f'{website}_{credential}').count() + return json.dumps({'status': 'ok', 'count': count}) + + +if __name__ == '__main__': + app.run(host='0.0.0.0') diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/tester.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/tester.py new file mode 100644 index 0000000..7cc44d9 --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/processors/tester.py @@ -0,0 +1,90 @@ +import json +import requests +from requests.exceptions import ConnectionError +from accountpool.storages.redis import * +from accountpool.exceptions.init import InitException +from loguru import logger + + +class BaseTester(object): + """ + base tester + """ + + def __init__(self, website=None): + """ + init base tester + """ + self.website = website + if not self.website: + raise InitException + self.account_operator = RedisClient(type='account', website=self.website) + self.credential_operator = RedisClient(type='credential', website=self.website) + + def test(self, username, credential): + """ + test single credential + """ + raise NotImplementedError + + def run(self): + """ + test all credentials + """ + credentials = self.credential_operator.all() + for username, credential in credentials.items(): + self.test(username, credential) + + +class Antispider6Tester(BaseTester): + """ + tester for antispider6 + """ + + def __init__(self, website=None): + BaseTester.__init__(self, website) + + def test(self, username, credential): + """ + test single credential + """ + logger.info(f'testing credential for {username}') + try: + test_url = TEST_URL_MAP[self.website] + response = requests.get(test_url, headers={ + 'Cookie': credential + }, timeout=5, allow_redirects=False) + if response.status_code == 200: + logger.info('credential is valid') + else: + logger.info('credential is not valid, delete it') + self.credential_operator.delete(username) + except ConnectionError: + logger.info('test failed') + + +class Antispider7Tester(BaseTester): + """ + tester for antispider7 + """ + + def __init__(self, website=None): + BaseTester.__init__(self, website) + + def test(self, username, credential): + """ + test single credential + """ + logger.info(f'testing credential for {username}') + try: + test_url = TEST_URL_MAP[self.website] + response = requests.get(test_url, headers={ + 'authorization': f'jwt {credential}' + }, timeout=5, allow_redirects=False) + if response.status_code == 200: + logger.info('credential is valid') + else: + logger.info('credential is not 
valid, delete it') + self.credential_operator.delete(username) + except ConnectionError: + logger.info('test failed') diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/scheduler.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/scheduler.py new file mode 100644 index 0000000..12f5b9d --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/scheduler.py @@ -0,0 +1,95 @@ +import time +import multiprocessing +from accountpool.processors.server import app +from accountpool.processors import generator as generators +from accountpool.processors import tester as testers +from accountpool.setting import CYCLE_GENERATOR, CYCLE_TESTER, API_HOST, API_THREADED, API_PORT, ENABLE_SERVER, \ + ENABLE_GENERATOR, ENABLE_TESTER, IS_WINDOWS, TESTER_MAP, GENERATOR_MAP +from loguru import logger + +if IS_WINDOWS: + multiprocessing.freeze_support() + +tester_process, generator_process, server_process = None, None, None + + +class Scheduler(object): + """ + scheduler + """ + + def run_tester(self, website, cycle=CYCLE_TESTER): + """ + run tester + """ + if not ENABLE_TESTER: + logger.info('tester not enabled, exit') + return + tester = getattr(testers, TESTER_MAP[website])(website) + loop = 0 + while True: + logger.debug(f'tester loop {loop} start...') + tester.run() + loop += 1 + time.sleep(cycle) + + def run_generator(self, website, cycle=CYCLE_GENERATOR): + """ + run getter + """ + if not ENABLE_GENERATOR: + logger.info('getter not enabled, exit') + return + generator = getattr(generators, GENERATOR_MAP[website])(website) + loop = 0 + while True: + logger.debug(f'getter loop {loop} start...') + generator.run() + loop += 1 + time.sleep(cycle) + + def run_server(self, _): + """ + run server for api + """ + if not ENABLE_SERVER: + logger.info('server not enabled, exit') + return + app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED) + + def run(self, website): + global tester_process, generator_process, server_process + try: + logger.info(f'starting account pool for website {website}...') + if ENABLE_TESTER: + tester_process = multiprocessing.Process(target=self.run_tester, args=(website,)) + logger.info(f'starting tester, pid {tester_process.pid}...') + tester_process.start() + + if ENABLE_GENERATOR: + generator_process = multiprocessing.Process(target=self.run_generator, args=(website,)) + logger.info(f'starting getter, pid{generator_process.pid}...') + generator_process.start() + + if ENABLE_SERVER: + server_process = multiprocessing.Process(target=self.run_server, args=(website,)) + logger.info(f'starting server, pid{server_process.pid}...') + server_process.start() + + tester_process.join() + generator_process.join() + server_process.join() + except KeyboardInterrupt: + logger.info('received keyboard interrupt signal') + tester_process.terminate() + generator_process.terminate() + server_process.terminate() + finally: + # must call join method before calling is_alive + tester_process.join() + generator_process.join() + server_process.join() + logger.info(f'tester is {"alive" if tester_process.is_alive() else "dead"}') + logger.info(f'getter is {"alive" if generator_process.is_alive() else "dead"}') + logger.info(f'server is {"alive" if server_process.is_alive() else "dead"}') + logger.info('accountpool terminated') diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/setting.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/setting.py new file mode 100644 index 0000000..3176275 --- /dev/null +++ 
b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/setting.py @@ -0,0 +1,83 @@ +import platform +from os.path import dirname, abspath, join +from environs import Env +from loguru import logger +from accountpool.utils.parse import parse_redis_connection_string + +env = Env() +env.read_env() + +# definition of flags +IS_WINDOWS = platform.system().lower() == 'windows' + +# definition of dirs +ROOT_DIR = dirname(dirname(abspath(__file__))) +LOG_DIR = join(ROOT_DIR, env.str('LOG_DIR', 'logs')) + +# definition of environments +DEV_MODE, TEST_MODE, PROD_MODE = 'dev', 'test', 'prod' +APP_ENV = env.str('APP_ENV', DEV_MODE).lower() +APP_DEBUG = env.bool('APP_DEBUG', True if APP_ENV == DEV_MODE else False) +APP_DEV = IS_DEV = APP_ENV == DEV_MODE +APP_PROD = IS_PROD = APP_ENV == PROD_MODE +APP_TEST = IS_TEST = APP_ENV == TEST_MODE + +# redis host +REDIS_HOST = env.str('REDIS_HOST', '127.0.0.1') +# redis port +REDIS_PORT = env.int('REDIS_PORT', 6379) +# redis password, if no password, set it to None +REDIS_PASSWORD = env.str('REDIS_PASSWORD', None) +# redis db, if no choice, set it to 0 +REDIS_DB = env.int('REDIS_DB', 0) +# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0 +REDIS_CONNECTION_STRING = env.str('REDIS_CONNECTION_STRING', None) + +if REDIS_CONNECTION_STRING: + REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB = parse_redis_connection_string(REDIS_CONNECTION_STRING) + +# redis hash table key name +REDIS_ACCOUNT_KEY = env.str('REDIS_ACCOUNT_KEY', 'accounts:%s') +REDIS_CREDENTIAL_KEY = env.str('REDIS_CREDENTIAL_KEY', 'credential:%s') + +# integrated generator +GENERATOR_MAP = { + 'antispider6': 'Antispider6Generator', + 'antispider7': 'Antispider7Generator' +} + +# integrated tester +TESTER_MAP = { + 'antispider6': 'Antispider6Tester', + 'antispider7': 'Antispider7Tester', +} + +# definition of tester cycle, it will test every CYCLE_TESTER second +CYCLE_TESTER = env.int('CYCLE_TESTER', 600) +# definition of getter cycle, it will get proxy every CYCLE_GENERATOR second +CYCLE_GENERATOR = env.int('CYCLE_GENERATOR', 600) +GET_TIMEOUT = env.int('GET_TIMEOUT', 10) + +# definition of tester +TEST_URL = env.str('TEST_URL', 'http://www.baidu.com') +TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10) +TEST_BATCH = env.int('TEST_BATCH', 20) +# test url +TEST_URL_MAP = { + 'antispider6': 'https://antispider6.scrape.center/', + 'antispider7': 'https://antispider7.scrape.center/' +} + +# definition of api +API_HOST = env.str('API_HOST', '0.0.0.0') +API_PORT = env.int('API_PORT', 6789) +API_THREADED = env.bool('API_THREADED', True) + +# flags of enable +ENABLE_TESTER = env.bool('ENABLE_TESTER', True) +ENABLE_GENERATOR = env.bool('ENABLE_GENERATOR', True) +ENABLE_SERVER = env.bool('ENABLE_SERVER', True) + +logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', + retention='20 days') +logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week') diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/storages/__init__.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/storages/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/storages/redis.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/storages/redis.py new file mode 100644 index 0000000..cd42903 --- /dev/null +++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/storages/redis.py @@ -0,0 +1,80 @@ +import random 
+import redis
+from accountpool.setting import *
+
+
+class RedisClient(object):
+    """
+    redis client
+    """
+
+    def __init__(self, type, website, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD):
+        """
+        init redis client
+        :param host: redis host
+        :param port: redis port
+        :param password: redis password
+        """
+        self.db = redis.StrictRedis(host=host, port=port, password=password, decode_responses=True)
+        self.type = type
+        self.website = website
+
+    def name(self):
+        """
+        get hash name
+        :return: name of hash
+        """
+        return f'{self.type}:{self.website}'
+
+    def set(self, username, value):
+        """
+        set key-value
+        :param username: username
+        :param value: password or cookies
+        :return:
+        """
+        return self.db.hset(self.name(), username, value)
+
+    def get(self, username):
+        """
+        get value
+        :param username: username
+        :return:
+        """
+        return self.db.hget(self.name(), username)
+
+    def delete(self, username):
+        """
+        delete key-value
+        :param username: username
+        :return: result
+        """
+        return self.db.hdel(self.name(), username)
+
+    def count(self):
+        """
+        get count
+        :return: count
+        """
+        return self.db.hlen(self.name())
+
+    def random(self):
+        """
+        get random cookies or password
+        :return: random cookies or password
+        """
+        return random.choice(self.db.hvals(self.name()))
+
+    def usernames(self):
+        """
+        get all usernames
+        :return: all usernames
+        """
+        return self.db.hkeys(self.name())
+
+    def all(self):
+        """
+        get all key-values
+        :return: map of key-values
+        """
+        return self.db.hgetall(self.name())
diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/utils/__init__.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/utils/parse.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/utils/parse.py
new file mode 100644
index 0000000..b3f42f5
--- /dev/null
+++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/accountpool/utils/parse.py
@@ -0,0 +1,17 @@
+import re
+
+def parse_redis_connection_string(connection_string):
+    """
+    parse a redis connection string, for example:
+    redis://[password]@host:port
+    rediss://[password]@host:port/db
+    :param connection_string: connection string to parse
+    :return: tuple of (host, port, password, db)
+    """
+    result = re.match(r'rediss?://(.*?)@(.*?):(\d+)(?:/(\d+))?', connection_string)
+    if not result:
+        return 'localhost', 6379, None, 0
+    host, port = result.group(2), int(result.group(3))
+    password = result.group(1) or None
+    db = int(result.group(4)) if result.group(4) else 0
+    return host, port, password, db
diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/deployment.yml b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/deployment.yml
new file mode 100644
index 0000000..1173c2e
--- /dev/null
+++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/deployment.yml
@@ -0,0 +1,99 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  creationTimestamp: null
+  name: accountpool
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: accountpool
+  namespace: accountpool
+spec:
+  storageClassName: azure-file
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 2Gi
+---
+apiVersion: v1
+items:
+  - apiVersion: v1
+    kind: Service
+    metadata:
+      annotations:
+        kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml
+        kompose.version: 1.20.0 ()
+      creationTimestamp: null
+      labels:
+        io.kompose.service: accountpool
+      name: accountpool
+      namespace: accountpool
+    spec:
+      ports:
+        - name: "6777"
+          port: 6777
+          targetPort: 6777
+      selector:
+        io.kompose.service: accountpool
+    status:
+      loadBalancer: {}
+  - apiVersion: apps/v1
+    kind: Deployment
+    metadata:
+      annotations:
+        kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml
+        kompose.version: 1.20.0 ()
+      creationTimestamp: null
+      labels:
+        io.kompose.service: accountpool
+      name: accountpool
+      namespace: accountpool
+    spec:
+      replicas: 2
+      revisionHistoryLimit: 1
+      strategy: {}
+      selector:
+        matchLabels:
+          io.kompose.service: accountpool
+      template:
+        metadata:
+          annotations:
+            kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml
+            kompose.version: 1.20.0 ()
+          creationTimestamp: null
+          labels:
+            io.kompose.service: accountpool
+        spec:
+          containers:
+            - env:
+                - name: REDIS_CONNECTION_STRING
+                  valueFrom:
+                    secretKeyRef:
+                      name: redis
+                      key: connection_string
+                - name: REDIS_PORT
+                  value: '6379'
+              image: germey/accountpool:${TAG}
+              name: accountpool
+              resources:
+                limits:
+                  memory: "500Mi"
+                  cpu: "300m"
+                requests:
+                  memory: "500Mi"
+                  cpu: "300m"
+              ports:
+                - containerPort: 6777
+              volumeMounts:
+                - mountPath: "/app/accountpool/logs"
+                  name: accountpool
+          restartPolicy: Always
+          volumes:
+            - name: accountpool
+              persistentVolumeClaim:
+                claimName: accountpool
+    status: {}
+kind: List
+metadata: {}
diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/docker-compose.yml b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/docker-compose.yml
new file mode 100644
index 0000000..d2e7044
--- /dev/null
+++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/docker-compose.yml
@@ -0,0 +1,19 @@
+version: '3'
+services:
+  redis4accountpool:
+    image: redis:alpine
+    container_name: redis4accountpool
+    command: redis-server
+    ports:
+      - "6333:6379"
+  accountpool:
+    build: .
+    image: 'germey/accountpool'
+    container_name: accountpool
+    ports:
+      - "6777:6777"
+    environment:
+      REDIS_HOST: redis4accountpool
+      REDIS_PORT: "6379"
+      API_PORT: "6777"
+      WEBSITE: antispider7
\ No newline at end of file
diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/importer.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/importer.py
new file mode 100644
index 0000000..917d328
--- /dev/null
+++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/importer.py
@@ -0,0 +1,14 @@
+from accountpool.storages.redis import RedisClient
+import argparse
+
+parser = argparse.ArgumentParser(description='AccountPool')
+parser.add_argument('website', type=str, help='website')
+args = parser.parse_args()
+website = args.website
+
+conn = RedisClient('account', args.website)
+start = 1
+end = 100
+for i in range(start, end + 1):
+    username = password = f'admin{i}'
+    conn.set(username, password)
diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/register.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/register.py
new file mode 100644
index 0000000..782398a
--- /dev/null
+++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/register.py
@@ -0,0 +1,28 @@
+import argparse
+from acinonyx import run
+import requests
+from loguru import logger
+
+# This is a script for registering accounts for antispider7, using acinonyx to accelerate.
+
+parser = argparse.ArgumentParser(description='AccountPool')
+parser.add_argument('website', type=str, help='website')
+args = parser.parse_args()
+website = args.website
+
+
+@logger.catch()
+def register(username, password):
+    logger.debug(f'register using {username} and {password}')
+    response = requests.post(f'https://{website}.scrape.center/api/register', json={
+        'username': username,
+        'password': password
+    })
+    print(response.json())
+
+
+if __name__ == '__main__':
+    accounts = []
+    for index in range(1, 1000):
+        accounts.append((f'admin{index}', f'admin{index}'))
+    run(register, accounts)
diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/requirements.txt b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/requirements.txt
new file mode 100644
index 0000000..fdd46fd
--- /dev/null
+++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/requirements.txt
@@ -0,0 +1,8 @@
+requests==2.13.0
+selenium==3.4.0
+redis==2.10.5
+Flask==1.1.4
+environs==7.2.0
+loguru==0.3.2
+supervisor==4.1.0
+MarkupSafe==2.0.1
diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/run.py b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/run.py
new file mode 100644
index 0000000..3d88310
--- /dev/null
+++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/run.py
@@ -0,0 +1,15 @@
+from accountpool.scheduler import Scheduler
+import argparse
+
+parser = argparse.ArgumentParser(description='AccountPool')
+parser.add_argument('website', type=str, help='website')
+parser.add_argument('--processor', type=str, help='processor to run')
+args = parser.parse_args()
+website = args.website
+
+if __name__ == '__main__':
+    # if processor set, just run it
+    if args.processor:
+        getattr(Scheduler(), f'run_{args.processor}')(website)
+    else:
+        Scheduler().run(website)
diff --git a/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/supervisord.conf b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/supervisord.conf
new file mode 100644
index 0000000..6d9fd15
--- /dev/null
+++ b/Spider/Chapter10_模拟登录/大规模账号池的搭建/AccountPool/supervisord.conf
@@ -0,0 +1,29 @@
+[supervisord]
+nodaemon=true
+
+[program:tester]
+process_name=tester
+command=python3 run.py %(ENV_WEBSITE)s --processor tester
+directory=/app
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+
+[program:generator]
+process_name=generator
+command=python3 run.py %(ENV_WEBSITE)s --processor generator
+directory=/app
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+
+[program:server]
+process_name=server
+command=python3 run.py %(ENV_WEBSITE)s --processor server
+directory=/app
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
\ No newline at end of file
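
下面是 `accountpool/storages/redis.py` 中 `RedisClient` 的一个最小使用示例(仅为示意,假设本地 `127.0.0.1:6379` 运行着无密码的 Redis,即 setting.py 的默认值;website 取 `antispider7`,与 importer.py 一致,对应的 Hash 名由 `name()` 解析为 `account:antispider7`)。登录后得到的凭证由同一个客户端以另一个 type(如 `credential`)存入单独的 Hash,此处不展开。

```python
# A minimal usage sketch of the RedisClient above; assumes a local Redis
# on 127.0.0.1:6379 without password and uses 'antispider7' as website key.
from accountpool.storages.redis import RedisClient

conn = RedisClient('account', 'antispider7')

conn.set('admin1', 'admin1')   # store username -> password into hash account:antispider7
print(conn.count())            # number of stored accounts
print(conn.get('admin1'))      # password of a specific username
print(conn.random())           # a random stored value
print(conn.usernames())        # all usernames in the hash
```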