Crawler update

This commit is contained in:
kevinding1125 2023-12-15 17:44:05 +08:00
parent 0650cfb369
commit 408fcea992
91 changed files with 4104 additions and 0 deletions

View File

@ -0,0 +1,135 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
proxypool/.env
.DS_Store
.vscode

View File

@ -0,0 +1,7 @@
*.vscode
*.pyc
*.db
venv
/.idea
*.log
.DS_Store

View File

@ -0,0 +1,18 @@
FROM python:3.7-alpine AS build
COPY requirements.txt .
RUN apk update &&\
apk add --no-cache gcc g++ libffi-dev openssl-dev libxml2-dev libxslt-dev build-base musl-dev &&\
pip install -U pip &&\
pip install --timeout 30 --user --no-cache-dir --no-warn-script-location -r requirements.txt
FROM python:3.7-alpine
ENV APP_ENV=prod
ENV LOCAL_PKG="/root/.local"
COPY --from=build ${LOCAL_PKG} ${LOCAL_PKG}
RUN apk update && apk add --no-cache libffi-dev openssl-dev libxslt-dev &&\
ln -sf ${LOCAL_PKG}/bin/* /usr/local/bin/
WORKDIR /app
COPY . .
EXPOSE 5555
VOLUME ["/app/proxypool/crawlers/private"]
ENTRYPOINT ["supervisord", "-c", "supervisord.conf"]

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 Germey
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,359 @@
# ProxyPool
![build](https://github.com/Python3WebSpider/ProxyPool/workflows/build/badge.svg)
![deploy](https://github.com/Python3WebSpider/ProxyPool/workflows/deploy/badge.svg)
![](https://img.shields.io/badge/python-3.6%2B-brightgreen)
![Docker Pulls](https://img.shields.io/docker/pulls/germey/proxypool)
A simple and efficient proxy pool providing the following features:
- Periodically crawls free proxy sites; simple and extensible.
- Stores proxies in Redis and ranks them by availability.
- Periodically tests and filters proxies, removing unusable ones and keeping the usable ones.
- Provides a proxy API that returns a random proxy that has passed testing.
For an explanation of how the proxy pool works, see "[如何搭建一个高效的代理池](https://cuiqingcai.com/7048.html)" (How to Build an Efficient Proxy Pool); reading it before use is recommended.
## Preparation
First, clone the code and enter the ProxyPool folder:
```
git clone https://github.com/Python3WebSpider/ProxyPool.git
cd ProxyPool
```
Then run it using either Docker or the regular method described below.
## Requirements
The proxy pool can be run in two ways: with Docker (recommended) or the regular way. The requirements are as follows.
### Docker
To run it with Docker you need:
- Docker
- Docker-Compose
Search online for installation instructions.
Official Docker Hub image: [germey/proxypool](https://hub.docker.com/r/germey/proxypool)
### Regular
The regular way requires a Python environment and a Redis instance:
- Python>=3.6
- Redis
## Running with Docker
With Docker and Docker-Compose installed, a single command is enough to run the pool:
```shell script
docker-compose up
```
The output looks something like this:
```
redis | 1:M 19 Feb 2020 17:09:43.940 * DB loaded from disk: 0.000 seconds
redis | 1:M 19 Feb 2020 17:09:43.940 * Ready to accept connections
proxypool | 2020-02-19 17:09:44,200 CRIT Supervisor is running as root. Privileges were not dropped because no user is specified in the config file. If you intend to run as root, you can set user=root in the config file to avoid this message.
proxypool | 2020-02-19 17:09:44,203 INFO supervisord started with pid 1
proxypool | 2020-02-19 17:09:45,209 INFO spawned: 'getter' with pid 10
proxypool | 2020-02-19 17:09:45,212 INFO spawned: 'server' with pid 11
proxypool | 2020-02-19 17:09:45,216 INFO spawned: 'tester' with pid 12
proxypool | 2020-02-19 17:09:46,596 INFO success: getter entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)
proxypool | 2020-02-19 17:09:46,596 INFO success: server entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)
proxypool | 2020-02-19 17:09:46,596 INFO success: tester entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)
```
You can see that Redis, Getter, Server, and Tester have all started successfully.
Now visit [http://localhost:5555/random](http://localhost:5555/random) to get a random usable proxy.
You can also build the image yourself by running:
```
docker-compose -f build.yaml up
```
If downloads are very slow, edit the Dockerfile and change:
```diff
- RUN pip install -r requirements.txt
+ RUN pip install -r requirements.txt -i https://pypi.douban.com/simple
```
## Running the Regular Way
Without Docker, the proxy pool can also be run once Python and Redis are set up; the steps are as follows.
### Install and Configure Redis
A locally installed Redis, a Redis started with Docker, or a remote Redis all work, as long as it can be connected to normally.
First, set a few environment variables; the proxy pool reads these values from the environment.
There are two ways to configure Redis: either set host, port, and password separately, or set a single connection string.
To set host, port, and password (if there is no password, set it to an empty string):
```shell script
export PROXYPOOL_REDIS_HOST='localhost'
export PROXYPOOL_REDIS_PORT=6379
export PROXYPOOL_REDIS_PASSWORD=''
export PROXYPOOL_REDIS_DB=0
```
Or set only the connection string:
```shell script
export PROXYPOOL_REDIS_CONNECTION_STRING='redis://localhost'
```
The connection string must follow the format `redis://[:password@]host[:port][/database]`,
where the bracketed parts can be omitted: port defaults to 6379, database defaults to 0, and the password defaults to empty.
Either of the two approaches works; pick one.
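As a quick sanity check that these variables are picked up, you can connect to Redis yourself; the snippet below is only an illustrative sketch (it is not part of the project and assumes the `redis` Python package is installed):
```python
import os

import redis

# connect with either the connection string or the individual variables set above
conn_string = os.environ.get('PROXYPOOL_REDIS_CONNECTION_STRING')
if conn_string:
    client = redis.from_url(conn_string)
else:
    client = redis.Redis(
        host=os.environ.get('PROXYPOOL_REDIS_HOST', 'localhost'),
        port=int(os.environ.get('PROXYPOOL_REDIS_PORT', 6379)),
        password=os.environ.get('PROXYPOOL_REDIS_PASSWORD') or None,
        db=int(os.environ.get('PROXYPOOL_REDIS_DB', 0)),
    )

print(client.ping())  # True means the proxy pool will be able to reach Redis
```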
### Install Dependencies
It is strongly recommended to create a virtual environment with [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands)
or [virtualenv](https://virtualenv.pypa.io/en/latest/user_guide.html); the Python version must be 3.6 or later.
Then install the dependencies with pip:
```shell script
pip3 install -r requirements.txt
```
### Run the Proxy Pool
There are two ways to run the proxy pool: run Tester, Getter, and Server all together, or run them separately as needed.
Running everything together is usually fine:
```shell script
python3 run.py
```
This starts the Tester, Getter, and Server; then visit [http://localhost:5555/random](http://localhost:5555/random) to get a random usable proxy.
Alternatively, once you understand the proxy pool's architecture, you can run the components separately as needed:
```shell script
python3 run.py --processor getter
python3 run.py --processor tester
python3 run.py --processor server
```
The processor argument selects whether to run the Tester, Getter, or Server.
## Usage
Once the pool is running, you can get a random usable proxy from [http://localhost:5555/random](http://localhost:5555/random).
It can also be consumed programmatically; the example below fetches a proxy and uses it to crawl a page:
```python
import requests
proxypool_url = 'http://127.0.0.1:5555/random'
target_url = 'http://httpbin.org/get'
def get_random_proxy():
"""
get random proxy from proxypool
:return: proxy
"""
return requests.get(proxypool_url).text.strip()
def crawl(url, proxy):
"""
use proxy to crawl page
:param url: page url
:param proxy: proxy, such as 8.8.8.8:8888
:return: html
"""
proxies = {'http': 'http://' + proxy}
return requests.get(url, proxies=proxies).text
def main():
"""
main method, entry point
:return: none
"""
proxy = get_random_proxy()
print('get random proxy', proxy)
html = crawl(target_url, proxy)
print(html)
if __name__ == '__main__':
main()
```
The output looks like this:
```
get random proxy 116.196.115.209:8080
{
"args": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.22.0",
"X-Amzn-Trace-Id": "Root=1-5e4d7140-662d9053c0a2e513c7278364"
},
"origin": "116.196.115.209",
"url": "https://httpbin.org/get"
}
```
As you can see, a proxy was successfully fetched and used to request httpbin.org, which confirms the proxy works.
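Free proxies fail frequently, so in practice it helps to retry with a different proxy when a request fails. The helper below is only an illustrative sketch (`fetch_with_retry` is not part of the project):
```python
import requests
from requests.exceptions import ConnectionError, ProxyError, ReadTimeout

PROXYPOOL_URL = 'http://127.0.0.1:5555/random'

def fetch_with_retry(url, max_retries=5):
    """try up to max_retries different proxies from the pool before giving up"""
    for _ in range(max_retries):
        proxy = requests.get(PROXYPOOL_URL).text.strip()
        try:
            return requests.get(url, proxies={'http': 'http://' + proxy}, timeout=10)
        except (ConnectionError, ProxyError, ReadTimeout):
            continue  # this proxy is dead or too slow, ask the pool for another one
    raise RuntimeError('no working proxy found')

if __name__ == '__main__':
    print(fetch_with_retry('http://httpbin.org/get').text)
```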
## Configuration
The proxy pool can be configured through environment variables.
### Switches
- ENABLE_TESTER: whether to start the Tester, default true
- ENABLE_GETTER: whether to start the Getter, default true
- ENABLE_SERVER: whether to start the Server, default true
### Environment
- APP_ENV: runtime environment, one of dev, test, prod (development, testing, production), default dev
- APP_DEBUG: debug mode, true or false, default true
- APP_PROD_METHOD: how the app is served in production, default `gevent`;
  alternatives: `tornado`, `meinheld` (these require installing the tornado or meinheld module respectively)
### Redis Connection
- PROXYPOOL_REDIS_HOST / REDIS_HOST: Redis host; PROXYPOOL_REDIS_HOST overrides REDIS_HOST.
- PROXYPOOL_REDIS_PORT / REDIS_PORT: Redis port; PROXYPOOL_REDIS_PORT overrides REDIS_PORT.
- PROXYPOOL_REDIS_PASSWORD / REDIS_PASSWORD: Redis password; PROXYPOOL_REDIS_PASSWORD overrides REDIS_PASSWORD.
- PROXYPOOL_REDIS_DB / REDIS_DB: Redis database index, e.g. 0 or 1; PROXYPOOL_REDIS_DB overrides REDIS_DB.
- PROXYPOOL_REDIS_CONNECTION_STRING / REDIS_CONNECTION_STRING: Redis connection string; PROXYPOOL_REDIS_CONNECTION_STRING overrides REDIS_CONNECTION_STRING.
- PROXYPOOL_REDIS_KEY / REDIS_KEY: name of the Redis hash that stores the proxies; PROXYPOOL_REDIS_KEY overrides REDIS_KEY.
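The precedence above is implemented in setting.py (included in this commit) roughly as follows; this is a condensed excerpt with simplified default values:
```python
from environs import Env

env = Env()
env.read_env()

# the PROXYPOOL_-prefixed variable wins; the unprefixed one is only a fallback
REDIS_HOST = env.str('PROXYPOOL_REDIS_HOST', env.str('REDIS_HOST', 'localhost'))
REDIS_PORT = env.int('PROXYPOOL_REDIS_PORT', env.int('REDIS_PORT', 6379))
REDIS_PASSWORD = env.str('PROXYPOOL_REDIS_PASSWORD', env.str('REDIS_PASSWORD', None))
REDIS_DB = env.int('PROXYPOOL_REDIS_DB', env.int('REDIS_DB', 0))
REDIS_CONNECTION_STRING = env.str(
    'PROXYPOOL_REDIS_CONNECTION_STRING', env.str('REDIS_CONNECTION_STRING', None))
REDIS_KEY = env.str('PROXYPOOL_REDIS_KEY', env.str('REDIS_KEY', 'proxies:universal'))
```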
### Processors
- CYCLE_TESTER: Tester cycle, i.e. how often tests run, default 20 seconds
- CYCLE_GETTER: Getter cycle, i.e. how often proxies are fetched, default 100 seconds
- TEST_URL: test URL, defaults to Baidu
- TEST_TIMEOUT: test timeout, default 10 seconds
- TEST_BATCH: number of proxies tested per batch, default 20
- TEST_VALID_STATUS: status codes considered valid during testing
- API_HOST: host the proxy Server binds to, default 0.0.0.0
- API_PORT: port the proxy Server listens on, default 5555
- API_THREADED: whether the proxy Server uses multiple threads, default true
### Logging
- LOG_DIR: relative path of the log directory
- LOG_RUNTIME_FILE: runtime log file name
- LOG_ERROR_FILE: error log file name
- LOG_ROTATION: log rotation period or size, default 500MB, see [loguru - rotation](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression)
- LOG_RETENTION: log retention period, default 7 days, see [loguru - retention](https://github.com/Delgan/loguru#easier-file-logging-with-rotation--retention--compression)
- ENABLE_LOG_FILE: whether to write log files, default true; if set to false, ENABLE_LOG_RUNTIME_FILE and ENABLE_LOG_ERROR_FILE have no effect
- ENABLE_LOG_RUNTIME_FILE: whether to write the runtime log file, default true
- ENABLE_LOG_ERROR_FILE: whether to write the error log file, default true
All of the above can be configured with environment variables: just set the corresponding variables before starting. For example, to change the test URL and the Redis key name:
```shell script
export TEST_URL=http://weibo.cn
export REDIS_KEY=proxies:weibo
```
This builds a proxy pool dedicated to Weibo: every valid proxy in it can crawl Weibo.
If you start the proxy pool with Docker-Compose, specify the environment variables in the docker-compose.yml file, for example:
```yaml
version: "3"
services:
redis:
image: redis:alpine
container_name: redis
command: redis-server
ports:
- "6379:6379"
restart: always
proxypool:
build: .
image: "germey/proxypool"
container_name: proxypool
ports:
- "5555:5555"
restart: always
environment:
REDIS_HOST: redis
TEST_URL: http://weibo.cn
REDIS_KEY: proxies:weibo
```
## Extending the Proxy Crawlers
The proxy crawlers live in the proxypool/crawlers folder; currently only a handful of proxy sources are included.
To add a new crawler, create a Python file under the crawlers folder and declare a class in it.
The expected structure is as follows:
```python
from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
BASE_URL = 'http://www.664ip.cn/{page}.html'
MAX_PAGE = 5
class Daili66Crawler(BaseCrawler):
"""
daili66 crawler, http://www.66ip.cn/1.html
"""
urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
trs = doc('.containerbox table tr:gt(0)').items()
for tr in trs:
host = tr.find('td:nth-child(1)').text()
port = int(tr.find('td:nth-child(2)').text())
yield Proxy(host=host, port=port)
```
Here you only need to define a Crawler class that inherits from BaseCrawler and then define the urls variable and the parse method.
- The urls variable is the list of proxy site URLs to crawl; it can be generated programmatically or written out as fixed content.
- The parse method takes a single argument, html, the HTML of the proxy page. Inside parse you only need to parse the HTML, extract host and port, build a Proxy object, and yield it.
Fetching the pages themselves does not need to be implemented; BaseCrawler already provides a default implementation. To change how pages are fetched, override the crawl method, as sketched below.
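For example, if a source requires custom request headers, you can override crawl while still reusing fetch and process from BaseCrawler, which is the pattern GeonodeCrawler in this repository uses. The class below is only an illustrative sketch with a placeholder URL:
```python
import json

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

class ExampleJsonCrawler(BaseCrawler):
    """
    hypothetical crawler for a JSON API that needs custom headers
    """
    urls = ['https://proxy-source.example.com/api/free.json']  # placeholder URL

    def crawl(self):
        headers = {'Referer': 'https://proxy-source.example.com/'}
        for url in self.urls:
            # fetch() forwards extra keyword arguments to requests.get
            html = self.fetch(url, headers=headers)
            if not html:
                continue
            yield from self.process(html, url)

    def parse(self, html):
        for item in json.loads(html):
            yield Proxy(host=item['ip'], port=int(item['port']))
```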
Pull Requests contributing new Crawlers are very welcome, to make the proxy sources richer and more powerful.
## Deployment
This project ships Kubernetes deployment scripts. To deploy to Kubernetes, see [kubernetes](./kubernetes).
## TODO
- [ ] Frontend management page
- [ ] Usage statistics and analysis
If you are interested in contributing, leave a message in an Issue. Thanks a lot!
## LICENSE
MIT

View File

@ -0,0 +1,18 @@
version: "3"
services:
redis4proxypool:
image: redis:alpine
container_name: redis4proxypool
ports:
- "6374:6379"
proxypool:
build: .
image: "germey/proxypool:master"
container_name: proxypool
ports:
- "5555:5555"
restart: always
# volumes:
# - proxypool/crawlers/private:/app/proxypool/crawlers/private
environment:
PROXYPOOL_REDIS_CONNECTION_STRING: redis://@redis4proxypool:6379/0

View File

@ -0,0 +1,18 @@
version: "3"
services:
redis4proxypool:
image: redis:alpine
container_name: redis4proxypool
# ports:
# - "6374:6379"
proxypool:
image: "germey/proxypool:master"
container_name: proxypool
ports:
- "5555:5555"
restart: always
# volumes:
# - proxypool/crawlers/private:/app/proxypool/crawlers/private
environment:
PROXYPOOL_REDIS_HOST: redis4proxypool

View File

@ -0,0 +1,39 @@
import requests
proxypool_url = 'http://127.0.0.1:5555/random'
target_url = 'https://antispider5.scrape.center/'
def get_random_proxy():
"""
get random proxy from proxypool
:return: proxy
"""
return requests.get(proxypool_url).text.strip()
def crawl(url, proxy):
"""
use proxy to crawl page
:param url: page url
:param proxy: proxy, such as 8.8.8.8:8888
:return: html
"""
proxies = {'http': 'http://' + proxy}
return requests.get(url, proxies=proxies).text
def main():
"""
main method, entry point
:return: none
"""
proxy = get_random_proxy()
print('get random proxy', proxy)
html = crawl(target_url, proxy)
print(html)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,95 @@
# -*- coding: UTF-8 -*-
'''
demo: fetch proxy IPs from the pool API and request the target page through each proxy
'''
import requests
import time
import threading
import urllib3
from fake_headers import Headers
import uuid
from geolite2 import geolite2
ips = []
# helper: check whether an IP is geolocated in mainland China
def getChinaIP(ip='127.0.0.1'):
reader = geolite2.reader()
ip_info = reader.get(ip)
geolite2.close()
print(ip_info)
return True if ip_info['country']['iso_code'] == 'CN' else False
class CrawlThread(threading.Thread):
def __init__(self, proxyip):
super(CrawlThread, self).__init__()
self.proxyip = proxyip
def run(self):
        # start timing
pure_ip_address = self.proxyip.split(':')[0]
        # verify that the proxy IP is located in China
if not getChinaIP(pure_ip_address):
# pass
            raise ValueError('proxy IP is not located in CN')
#
start = time.time()
        # suppress the warning caused by disabling certificate verification
urllib3.disable_warnings()
headers = Headers(headers=True).generate()
headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
headers['Pragma'] = 'no-cache'
headers['Host'] = 'bb.cf08tp.cn'
headers['x-forward-for'] = pure_ip_address
headers['Cookie'] = 'PHPSESSID={}'.format(
''.join(str(uuid.uuid1()).split('-')))
print(headers)
html = requests.get(headers=headers, url=targetUrl, proxies={
"http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode()
        # stop timing
end = time.time()
        # print the result
        print(threading.current_thread().getName() + " used proxy IP, took " + str(end - start) +
              " seconds, " + self.proxyip + " fetched the following HTML:\n" + html + "\n*************")
# thread class that fetches proxy IPs from the pool API
class GetIpThread(threading.Thread):
def __init__(self, fetchSecond):
super(GetIpThread, self).__init__()
self.fetchSecond = fetchSecond
def run(self):
global ips
while True:
            # fetch the proxy list
res = requests.get(apiUrl).content.decode()
            # split the response into individual proxies by newline
ips = res.split('\n')
            # use each proxy
for proxyip in ips:
if proxyip.strip():
                    # run the crawl for this proxy (the threaded start below is commented out)
# CrawlThread(proxyip).start()
try:
CrawlThread(proxyip).run()
time.sleep(1.5)
except Exception as e:
print(e)
            # sleep before fetching the next batch
            time.sleep(len(ips) / self.fetchSecond)
if __name__ == '__main__':
    # API endpoint for fetching proxies
# apiUrl = "http://127.0.0.1:5555/all"
apiUrl = "http://127.0.0.1:5555/random"
    # target URL to crawl
targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
# targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
fetchSecond = 5
    # start fetching proxies automatically
GetIpThread(fetchSecond).start()

View File

@ -0,0 +1,24 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
image/

View File

@ -0,0 +1,27 @@
apiVersion: v2
name: proxypool
description: An Efficient Proxy Pool
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# Keywords about this application.
keywords:
- proxypool
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
appVersion: 1.16.0

View File

@ -0,0 +1,42 @@
# Kubernetes Deployment
These Helm Charts are used to quickly deploy this proxy pool.
You first need a Kubernetes cluster, and Helm installed so that the helm command runs normally.
Installation references:
- Kubernetes: [https://setup.scrape.center/kubernetes](https://setup.scrape.center/kubernetes)
- Helm: [https://setup.scrape.center/helm](https://setup.scrape.center/helm)
## Install
To install, run helm in this folder; use `-n` to specify the Namespace:
```shell
helm install proxypool-app . -n scrape
```
Here proxypool-app is the application name; you can pick any name, and it will be used as the name of the proxy pool Deployment.
If you need to override variables, edit the values.yaml file and install with:
```shell
helm install proxypool-app . -f values.yaml -n scrape
```
## Upgrade
To change the configuration, edit the values.yaml file and upgrade the release with:
```shell
helm upgrade proxypool-app . -f values.yaml -n scrape
```
## Uninstall
If you no longer need it, uninstall it with the uninstall command:
```shell
helm uninstall proxypool-app -n scrape
```

View File

@ -0,0 +1,53 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "proxypool.name" -}}
{{- default .Chart.Name .Values.name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "proxypool.fullname" -}}
{{- if .Values.fullname }}
{{- .Values.fullname | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.name }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "proxypool.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "proxypool.labels" -}}
helm.sh/chart: {{ include "proxypool.chart" . }}
{{ include "proxypool.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "proxypool.selectorLabels" -}}
app.kubernetes.io/name: {{ include "proxypool.fullname" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

View File

@ -0,0 +1,37 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "proxypool.fullname" . }}
labels:
{{- include "proxypool.labels" . | nindent 4 }}
spec:
replicas: {{ .Values.deployment.replicas }}
revisionHistoryLimit: {{ .Values.deployment.revisionHistoryLimit }}
selector:
matchLabels:
{{- include "proxypool.labels" . | nindent 8 }}
template:
metadata:
labels:
{{- include "proxypool.labels" . | nindent 8 }}
spec:
restartPolicy: {{ .Values.deployment.restartPolicy }}
containers:
- name: {{ include "proxypool.fullname" . }}
image: {{ .Values.deployment.image }}
ports:
- containerPort: 5555
protocol: TCP
imagePullPolicy: {{ .Values.deployment.imagePullPolicy }}
livenessProbe:
httpGet:
path: /random
port: 5555
initialDelaySeconds: 60
periodSeconds: 5
failureThreshold: 5
timeoutSeconds: 10
resources:
{{- toYaml .Values.deployment.resources | nindent 12 }}
env:
{{- toYaml .Values.deployment.env | nindent 12 }}

View File

@ -0,0 +1,41 @@
{{- if .Values.ingress.enabled -}}
{{- $fullName := include "proxypool.fullname" . -}}
{{- $svcPort := .Values.service.port -}}
{{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
name: {{ $fullName }}
labels:
{{- include "proxypool.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if .Values.ingress.tls }}
tls:
{{- range .Values.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- range .Values.ingress.hosts }}
- host: {{ .host | quote }}
http:
paths:
{{- range .paths }}
- path: {{ . }}
backend:
serviceName: {{ $fullName }}
servicePort: {{ $svcPort }}
{{- end }}
{{- end }}
{{- end }}

View File

@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "proxypool.fullname" . }}
labels:
{{- include "proxypool.labels" . | nindent 4 }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
targetPort: 5555
protocol: TCP
name: http
selector:
{{- include "proxypool.selectorLabels" . | nindent 4 }}

View File

@ -0,0 +1,30 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: proxypool-redis
name: proxypool-redis
spec:
replicas: 1
revisionHistoryLimit: 1
selector:
matchLabels:
app: proxypool-redis
template:
metadata:
labels:
app: proxypool-redis
spec:
containers:
- image: redis:alpine
name: proxypool-redis
ports:
- containerPort: 6379
resources:
limits:
memory: "100Mi"
cpu: "100m"
requests:
memory: "100Mi"
cpu: "100m"
restartPolicy: Always

View File

@ -0,0 +1,13 @@
apiVersion: v1
kind: Service
metadata:
labels:
app: proxypool-redis
name: proxypool-redis
spec:
ports:
- name: "6379"
port: 6379
targetPort: 6379
selector:
app: proxypool-redis

View File

@ -0,0 +1,39 @@
name: proxypool
fullname: proxypool-app
deployment:
image: germey/proxypool:master
imagePullPolicy: Always
restartPolicy: Always
revisionHistoryLimit: 2
successfulJobsHistoryLimit: 1
replicas: 1
resources:
limits:
memory: "200Mi"
cpu: "80m"
requests:
memory: "200Mi"
cpu: "80m"
env:
- name: PROXYPOOL_REDIS_HOST
value: "proxypool-redis"
- name: PROXYPOOL_REDIS_PORT
value: "6379"
service:
type: ClusterIP
port: 80
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx
hosts:
- host: proxypool.scrape.center
paths:
- "/"
tls:
- secretName: tls-wildcard-scrape-center
hosts:
- proxypool.scrape.center

View File

@ -0,0 +1,134 @@
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.idea/
*.log

View File

@ -0,0 +1,15 @@
import pkgutil
from .base import BasePaidCrawler
import inspect
# load classes subclass of BaseCrawler
classes = []
for loader, name, is_pkg in pkgutil.walk_packages(__path__):
module = loader.find_module(name).load_module(name)
for name, value in inspect.getmembers(module):
globals()[name] = value
if inspect.isclass(value) and issubclass(value, BasePaidCrawler) and value is not BasePaidCrawler \
and not getattr(value, 'ignore', False):
classes.append(value)
__all__ = __ALL__ = classes

View File

@ -0,0 +1,93 @@
from retrying import RetryError, retry
import requests
from loguru import logger
from proxypool.setting import GET_TIMEOUT
from fake_headers import Headers
import time
# base crawler for free proxy sources
class BaseCrawler(object):
urls = []
@retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
def fetch(self, url, **kwargs):
try:
headers = Headers(headers=True).generate()
kwargs.setdefault('timeout', GET_TIMEOUT)
kwargs.setdefault('verify', False)
kwargs.setdefault('headers', headers)
response = requests.get(url, **kwargs)
if response.status_code == 200:
response.encoding = 'utf-8'
return response.text
except (requests.ConnectionError, requests.ReadTimeout):
return
def process(self, html, url):
"""
used for parse html
"""
for proxy in self.parse(html):
logger.info(f'fetched proxy {proxy.string()} from {url}')
yield proxy
def crawl(self):
"""
crawl main method
"""
try:
for url in self.urls:
logger.info(f'fetching {url}')
html = self.fetch(url)
if not html:
continue
time.sleep(.5)
yield from self.process(html, url)
except RetryError:
logger.error(
f'crawler {self} crawled proxy unsuccessfully, '
'please check if target url is valid or network issue')
# base crawler for paid proxy sources
class BasePaidCrawler(object):
urls = []
@retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
def fetch(self, url, **kwargs):
try:
headers = Headers(headers=True).generate()
kwargs.setdefault('timeout', GET_TIMEOUT)
kwargs.setdefault('verify', False)
kwargs.setdefault('headers', headers)
response = requests.get(url, **kwargs)
if response.status_code == 200:
response.encoding = 'utf-8'
return response.text
except (requests.ConnectionError, requests.ReadTimeout):
return
def process(self, response, url):
"""
used for parse html
"""
for proxy in self.parse(response):
logger.info(f'fetched proxy {proxy.string()} from {url}')
yield proxy
def crawl(self):
"""
crawl main method
"""
try:
for url in self.urls:
logger.info(f'fetching {url}')
response = self.fetch(url)
if not response:
continue
time.sleep(.5)
yield from self.process(response, url)
except RetryError:
logger.error(
f'crawler {self} crawled proxy unsuccessfully, '
'please check if target url is valid or network issue')

View File

@ -0,0 +1,32 @@
from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
BASE_URL = 'http://www.66ip.cn/{page}.html'
MAX_PAGE = 3
class Daili66Crawler(BaseCrawler):
"""
daili66 crawler, http://www.66ip.cn/1.html
"""
urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
trs = doc('.containerbox table tr:gt(0)').items()
for tr in trs:
host = tr.find('td:nth-child(1)').text()
port = int(tr.find('td:nth-child(2)').text())
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = Daili66Crawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,31 @@
from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from loguru import logger
BASE_URL = 'http://www.data5u.com'
class Data5UCrawler(BaseCrawler):
"""
data5u crawler, http://www.data5u.com
"""
urls = [BASE_URL]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
items = doc('.wlist ul.l2').items()
for item in items:
host = item.find('span:first-child').text()
port = int(item.find('span:nth-child(2)').text())
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = Data5UCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,40 @@
import time
from retrying import RetryError
from loguru import logger
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import json
BASE_URL = 'https://www.docip.net/data/free.json?t={date}'
class DocipCrawler(BaseCrawler):
"""
Docip crawler, https://www.docip.net/data/free.json
"""
urls = [BASE_URL.format(date=time.strftime("%Y%m%d", time.localtime()))]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
try:
result = json.loads(html)
proxy_list = result['data']
for proxy_item in proxy_list:
                # TODO: the parsing logic changed because the API now returns ip as host:port
                ip_and_port = proxy_item['ip']
host = ip_and_port.split(":")[0]
port = ip_and_port.split(":")[1]
yield Proxy(host=host, port=port)
except json.JSONDecodeError:
print("json.JSONDecodeError")
return
if __name__ == '__main__':
crawler = DocipCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,31 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
import json
BASE_URL = 'http://proxylist.fatezero.org/proxy.list'
class FatezeroCrawler(BaseCrawler):
"""
Fatezero crawler,http://proxylist.fatezero.org
"""
urls = [BASE_URL]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
hosts_ports = html.split('\n')
for addr in hosts_ports:
if(addr):
ip_address = json.loads(addr)
host = ip_address['host']
port = ip_address['port']
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = FatezeroCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,71 @@
import time
from retrying import RetryError
from loguru import logger
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import json
BASE_URL = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page={page}&sort_by=lastChecked&sort_type=desc'
MAX_PAGE = 18
class GeonodeCrawler(BaseCrawler):
"""
Geonode crawler, https://proxylist.geonode.com/
"""
urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
try:
result = json.loads(html)
proxy_list = result['data']
for proxy_item in proxy_list:
host = proxy_item['ip']
port = proxy_item['port']
yield Proxy(host=host, port=port)
except json.JSONDecodeError:
print("json.JSONDecodeError")
return
def crawl(self):
"""
override crawl main method
add headers
"""
headers = {
'authority': 'proxylist.geonode.com',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
'accept': 'application/json, text/plain, */*',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'origin': 'https://geonode.com',
'sec-fetch-site': 'same-site',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://geonode.com/',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
'if-none-match': 'W/"c25d-BXjLTmP+/yYXtIz4OEcmdOWSv88"',
}
try:
for url in self.urls:
logger.info(f'fetching {url}')
html = self.fetch(url, headers=headers)
if not html:
continue
time.sleep(.5)
yield from self.process(html, url)
except RetryError:
logger.error(
f'crawler {self} crawled proxy unsuccessfully, '
'please check if target url is valid or network issue')
if __name__ == '__main__':
crawler = GeonodeCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,44 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
from pyquery import PyQuery as pq
import time
BASE_URL = 'http://www.goubanjia.com/'
class GoubanjiaCrawler(BaseCrawler):
"""
ip Goubanjia crawler, http://www.goubanjia.com/
"""
urls = [BASE_URL]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)('.ip').items()
# ''.join([*filter(lambda x: x != '',re.compile('\>([\d:\.]*)\<').findall(td.html()))])
for td in doc:
trs = td.children()
ip_str = ''
for tr in trs:
attrib = tr.attrib
if 'style' in attrib and 'none' in tr.attrib['style']:
continue
ip_str+= '' if not tr.text else tr.text
addr_split = ip_str.split(':')
if(len(addr_split) == 2):
host = addr_split[0]
port = addr_split[1]
yield Proxy(host=host, port=port)
else:
port = trs[-1].text
host = ip_str.replace(port,'')
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = GoubanjiaCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,36 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
from pyquery import PyQuery as pq
import time
BASE_URL = 'https://ip.ihuan.me/today/{path}.html'
class IhuanCrawler(BaseCrawler):
"""
ip ihuan crawler, https://ip.ihuan.me
"""
path = time.strftime("%Y/%m/%d/%H", time.localtime())
urls = [BASE_URL.format(path=path)]
ignore = False
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
# doc = pq(html)('.text-left')
ip_address = re.compile('([\d:\.]*).*?<br>')
hosts_ports = ip_address.findall(html)
for addr in hosts_ports:
addr_split = addr.split(':')
if(len(addr_split) == 2):
host = addr_split[0]
port = addr_split[1]
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = IhuanCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,32 @@
from proxypool.crawlers.base import BaseCrawler
from proxypool.schemas.proxy import Proxy
import re
MAX_PAGE = 3
BASE_URL = 'http://www.ip3366.net/free/?stype={stype}&page={page}'
class IP3366Crawler(BaseCrawler):
"""
ip3366 crawler, http://www.ip3366.net/
"""
urls = [BASE_URL.format(stype=stype,page=i) for stype in range(1,3) for i in range(1, 8)]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
        # \s* matches whitespace, including the line breaks between table cells
re_ip_address = ip_address.findall(html)
for address, port in re_ip_address:
proxy = Proxy(host=address.strip(), port=int(port.strip()))
yield proxy
if __name__ == '__main__':
crawler = IP3366Crawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,33 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
MAX_NUM = 9999
BASE_URL = 'http://api.89ip.cn/tqdl.html?api=1&num={MAX_NUM}&port=&address=&isp='.format(MAX_NUM=MAX_NUM)
class Ip89Crawler(BaseCrawler):
"""
89ip crawler, http://api.89ip.cn
"""
urls = [BASE_URL]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
ip_address = re.compile('([\d:\.]*)<br>')
hosts_ports = ip_address.findall(html)
for addr in hosts_ports:
addr_split = addr.split(':')
if(len(addr_split) == 2):
host = addr_split[0]
port = addr_split[1]
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = Ip89Crawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,35 @@
from proxypool.crawlers.base import BaseCrawler
from proxypool.schemas.proxy import Proxy
import re
BASE_URL = 'http://www.iphai.com/'
class IPHaiCrawler(BaseCrawler):
"""
iphai crawler, http://www.iphai.com/
"""
urls = [BASE_URL]
ignore = True
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
find_tr = re.compile('<tr>(.*?)</tr>', re.S)
trs = find_tr.findall(html)
for s in range(1, len(trs)):
find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
re_ip_address = find_ip.findall(trs[s])
find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
re_port = find_port.findall(trs[s])
for address, port in zip(re_ip_address, re_port):
proxy = Proxy(host=address.strip(), port=int(port.strip()))
yield proxy
if __name__ == '__main__':
crawler = IPHaiCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,39 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import json
BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}'
MAX_PAGE = 3
class JiangxianliCrawler(BaseCrawler):
"""
jiangxianli crawler,https://ip.jiangxianli.com/
"""
urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
result = json.loads(html)
if result['code'] != 0:
return
MAX_PAGE = int(result['data']['last_page'])
hosts_ports = result['data']['data']
for ip_address in hosts_ports:
if(ip_address):
host = ip_address['ip']
port = ip_address['port']
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = JiangxianliCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,33 @@
from proxypool.crawlers.base import BaseCrawler
from proxypool.schemas.proxy import Proxy
import re
from pyquery import PyQuery as pq
BASE_URL = 'https://www.kuaidaili.com/free/{type}/{page}/'
MAX_PAGE = 3
class KuaidailiCrawler(BaseCrawler):
"""
kuaidaili crawler, https://www.kuaidaili.com/
"""
urls = [BASE_URL.format(type=type,page=page) for type in ('intr','inha') for page in range(1, MAX_PAGE + 1)]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
for item in doc('table tr').items():
td_ip = item.find('td[data-title="IP"]').text()
td_port = item.find('td[data-title="PORT"]').text()
if td_ip and td_port:
yield Proxy(host=td_ip, port=td_port)
if __name__ == '__main__':
crawler = KuaidailiCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,34 @@
import requests
from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
requests.packages.urllib3.disable_warnings()
BASE_URL = "https://proxy.seofangfa.com/"
MAX_PAGE = 1
class SeoFangFaCrawler(BaseCrawler):
"""
seo方法 crawler, https://proxy.seofangfa.com/
"""
urls = ["https://proxy.seofangfa.com/"]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
trs = doc('.table tr:gt(0)').items()
for tr in trs:
host = tr.find('td:nth-child(1)').text()
port = int(tr.find('td:nth-child(2)').text())
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = SeoFangFaCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,31 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from pyquery import PyQuery as pq
BaseUrl = 'http://www.taiyanghttp.com/free/page{num}'
MAX_PAGE = 3
class TaiyangdailiCrawler(BaseCrawler):
"""
taiyangdaili crawler, http://www.taiyanghttp.com/free/
"""
urls = [BaseUrl.format(num=i) for i in range(1, 6)]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
trs = doc('#ip_list .tr.ip_tr').items()
for tr in trs:
host = tr.find('div:nth-child(1)').text()
port = tr.find('div:nth-child(2)').text()
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = TaiyangdailiCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,49 @@
from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from loguru import logger
BASE_URL = 'https://ip.uqidata.com/free/index.html'
class UqidataCrawler(BaseCrawler):
"""
Uqidata crawler, https://ip.uqidata.com/free/index.html
"""
urls = [BASE_URL]
ignore = True
def encode(input_str):
tmp = []
for i in range(len(input_str)):
tmp.append("ABCDEFGHIZ".find(input_str[i]))
result = "".join(str(i) for i in tmp)
result = int(result) >> 0x03
return result
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
trs = doc('#main_container .inner table tbody tr:nth-child(n+3)').items()
for tr in trs:
ip_html = tr('td.ip').find("*").items()
host = ''
for i in ip_html:
if i.attr('style') is not None and 'none' in i.attr('style'):
continue
if i.text() == '':
continue
host += i.text()
port_code = tr('td.port').attr('class').split(' ')[1]
port = UqidataCrawler.encode(port_code)
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = UqidataCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,54 @@
import re
from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
BASE_URL = "http://www.xsdaili.cn/"
PAGE_BASE_URL = "http://www.xsdaili.cn/dayProxy/ip/{page}.html"
MAX_PAGE = 3
class XiaoShuCrawler(BaseCrawler):
"""
小舒代理 crawler, http://www.xsdaili.cn/
"""
def __init__(self):
"""
init urls
"""
try:
html = self.fetch(url=BASE_URL)
except:
self.urls = []
return
doc = pq(html)
title = doc(".title:eq(0) a").items()
latest_page = 0
for t in title:
res = re.search(r"/(\d+)\.html", t.attr("href"))
latest_page = int(res.group(1)) if res else 0
if latest_page:
self.urls = [PAGE_BASE_URL.format(page=page) for page in range(
latest_page - MAX_PAGE, latest_page)]
else:
self.urls = []
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
contents = doc('.cont').text()
contents = contents.split("\n")
for content in contents:
c = content[:content.find("@")]
host, port = c.split(":")
yield Proxy(host=host, port=int(port))
if __name__ == '__main__':
crawler = XiaoShuCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,35 @@
from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from loguru import logger
BASE_URL = 'https://www.xicidaili.com/'
class XicidailiCrawler(BaseCrawler):
"""
xididaili crawler, https://www.xicidaili.com/
"""
urls = [BASE_URL]
ignore = True
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
items = doc('#ip_list tr:contains(高匿)').items()
for item in items:
country = item.find('td.country').text()
if not country or country.strip() != '高匿':
continue
host = item.find('td:nth-child(2)').text()
port = int(item.find('td:nth-child(3)').text())
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = XicidailiCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,32 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from lxml import etree
BASE_URL = "http://www.xiladaili.com/"
MAX_PAGE = 5
class XiladailiCrawler(BaseCrawler):
"""
xiladaili crawler, http://www.xiladaili.com/
"""
urls = ["http://www.xiladaili.com/"]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
etree_html = etree.HTML(html)
ip_ports = etree_html.xpath("//tbody/tr/td[1]/text()")
for ip_port in ip_ports:
host = ip_port.partition(":")[0]
port = ip_port.partition(":")[2]
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = XiladailiCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,32 @@
from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
BASE_URL = "http://ip.yqie.com/ipproxy.htm"
MAX_PAGE = 1
class YqIeCrawler(BaseCrawler):
"""
ip yqie crawler, http://ip.yqie.com/ipproxy.htm
"""
urls = [BASE_URL]
def parse(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
trs = doc('#GridViewOrder tr:gt(0)').items()
for tr in trs:
host = tr.find('td:nth-child(1)').text()
port = int(tr.find('td:nth-child(2)').text())
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = YqIeCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1,59 @@
from pyquery import PyQuery as pq
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from loguru import logger
import re
BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
MAX_PAGE = 5 * 2
class ZhandayeCrawler(BaseCrawler):
"""
zhandaye crawler, https://www.zdaye.com/dayProxy/
"""
urls_catalog = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE)]
headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
urls = []
ignore = True
def crawl(self):
self.crawl_catalog()
yield from super().crawl()
def crawl_catalog(self):
for url in self.urls_catalog:
logger.info(f'fetching {url}')
html = self.fetch(url, headers=self.headers)
self.parse_catalog(html)
def parse_catalog(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
for item in doc('#J_posts_list .thread_item div div p a').items():
url = 'https://www.zdaye.com' + item.attr('href')
logger.info(f'get detail url: {url}')
self.urls.append(url)
def parse(self, html):
doc = pq(html)
trs = doc('.cont br').items()
for tr in trs:
line = tr[0].tail
match = re.search(r'(\d+\.\d+\.\d+\.\d+):(\d+)', line)
if match:
host = match.group(1)
port = match.group(2)
yield Proxy(host=host, port=port)
if __name__ == '__main__':
crawler = ZhandayeCrawler()
for proxy in crawler.crawl():
print(proxy)

View File

@ -0,0 +1 @@
from .empty import PoolEmptyException

View File

@ -0,0 +1,7 @@
class PoolEmptyException(Exception):
def __str__(self):
"""
proxypool is used out
:return:
"""
return repr('no proxy in proxypool')

View File

@ -0,0 +1,43 @@
from loguru import logger
from proxypool.storages.redis1 import RedisClient
from proxypool.setting import PROXY_NUMBER_MAX
from proxypool.crawlers import __all__ as crawlers_cls
class Getter(object):
"""
getter of proxypool
"""
def __init__(self):
"""
init db and crawlers
"""
self.redis = RedisClient()
self.crawlers_cls = crawlers_cls
self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
def is_full(self):
"""
        check whether the proxy pool is full
return: bool
"""
return self.redis.count() >= PROXY_NUMBER_MAX
@logger.catch
def run(self):
"""
run crawlers to get proxy
:return:
"""
if self.is_full():
return
for crawler in self.crawlers:
logger.info(f'crawler {crawler} to get proxy')
for proxy in crawler.crawl():
self.redis.add(proxy)
if __name__ == '__main__':
getter = Getter()
getter.run()

View File

@ -0,0 +1,92 @@
from flask import Flask, g, request
from proxypool.storages.redis1 import RedisClient
from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV
import functools
__all__ = ['app']
app = Flask(__name__)
if IS_DEV:
app.debug = True
def auth_required(func):
@functools.wraps(func)
def decorator(*args, **kwargs):
        # only enforce the check when API_KEY is configured; otherwise skip authentication
if API_KEY == "":
return func(*args, **kwargs)
if request.headers.get('API-KEY', None) is not None:
api_key = request.headers.get('API-KEY')
else:
return {"message": "Please provide an API key in header"}, 400
# Check if API key is correct and valid
if request.method == "GET" and api_key == API_KEY:
return func(*args, **kwargs)
else:
return {"message": "The provided API key is not valid"}, 403
return decorator
def get_conn():
"""
get redis client object
:return:
"""
if not hasattr(g, 'redis'):
g.redis = RedisClient()
return g.redis
@app.route('/')
@auth_required
def index():
"""
get home page, you can define your own templates
:return:
"""
return '<h2>Welcome to Proxy Pool System</h2>'
@app.route('/random')
@auth_required
def get_proxy():
"""
get a random proxy
:return: get a random proxy
"""
conn = get_conn()
return conn.random().string()
@app.route('/all')
@auth_required
def get_proxy_all():
"""
    get all proxies in the pool
    :return: all proxies, newline separated
"""
conn = get_conn()
proxies = conn.all()
proxies_string = ''
if proxies:
for proxy in proxies:
proxies_string += str(proxy) + '\n'
return proxies_string
@app.route('/count')
@auth_required
def get_count():
"""
get the count of proxies
:return: count, int
"""
conn = get_conn()
return str(conn.count())
if __name__ == '__main__':
app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED)

View File

@ -0,0 +1,108 @@
import asyncio
import aiohttp
from loguru import logger
from proxypool.schemas import Proxy
from proxypool.storages.redis1 import RedisClient
from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, TEST_ANONYMOUS, \
TEST_DONT_SET_MAX_SCORE,IS_PAID
from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError
from asyncio import TimeoutError
EXCEPTIONS = (
ClientProxyConnectionError,
ConnectionRefusedError,
TimeoutError,
ServerDisconnectedError,
ClientOSError,
ClientHttpProxyError,
AssertionError
)
class Tester(object):
"""
tester for testing proxies in queue
"""
def __init__(self):
"""
init redis
"""
self.redis = RedisClient()
self.loop = asyncio.get_event_loop()
async def test(self, proxy: Proxy):
"""
test single proxy
:param proxy: Proxy object
:return:
"""
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
try:
logger.debug(f'testing {proxy.string()}')
# if TEST_ANONYMOUS is True, make sure that
# the proxy has the effect of hiding the real IP
if TEST_ANONYMOUS:
url = 'https://httpbin.org/ip'
auth = "d2118699212:bxb0p3l8"
if IS_PAID:
proxys = f'http://{auth}@{proxy.string()}'
else:
proxys = f'http://{proxy.string()}'
async with session.get(url, timeout=TEST_TIMEOUT) as response:
resp_json = await response.json()
origin_ip = resp_json['origin']
async with session.get(url, proxy=proxys, timeout=TEST_TIMEOUT) as response:
resp_json = await response.json()
anonymous_ip = resp_json['origin']
                    # compare the IP returned by https://httpbin.org/ip with and without the proxy to confirm the proxy hides the real IP
assert origin_ip != anonymous_ip
assert proxy.host == anonymous_ip
async with session.get(TEST_URL, proxy=proxys, timeout=TEST_TIMEOUT,
allow_redirects=False) as response:
if response.status in TEST_VALID_STATUS:
if TEST_DONT_SET_MAX_SCORE:
logger.debug(f'proxy {proxy.string()} is valid, remain current score')
else:
self.redis.max(proxy)
logger.debug(f'proxy {proxy.string()} is valid, set max score')
else:
self.redis.decrease(proxy)
logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
except EXCEPTIONS:
                # on any of the expected exceptions, decrease the proxy's score in redis
self.redis.decrease(proxy)
logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
@logger.catch
def run(self):
"""
test main method
:return:
"""
# event loop of aiohttp
        logger.info('starting tester...')
count = self.redis.count()
logger.debug(f'{count} proxies to test')
cursor = 0
while True:
logger.debug(f'testing proxies use cursor {cursor}, count {TEST_BATCH}')
cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH)
if proxies:
tasks = [self.test(proxy) for proxy in proxies]
self.loop.run_until_complete(asyncio.wait(tasks))
if not cursor:
break
def run_tester():
host = '96.113.165.182'
port = '3128'
tasks = [tester.test(Proxy(host=host, port=port))]
tester.loop.run_until_complete(asyncio.wait(tasks))
if __name__ == '__main__':
tester = Tester()
tester.run()
# run_tester()

View File

@ -0,0 +1,143 @@
import time
import multiprocessing
from proxypool.processors.server import app
from proxypool.processors.getter import Getter
from proxypool.processors.tester import Tester
from proxypool.setting import APP_PROD_METHOD_GEVENT, APP_PROD_METHOD_MEINHELD, APP_PROD_METHOD_TORNADO, CYCLE_GETTER, CYCLE_TESTER, API_HOST, \
API_THREADED, API_PORT, ENABLE_SERVER, IS_PROD, APP_PROD_METHOD, \
ENABLE_GETTER, ENABLE_TESTER, IS_WINDOWS
from loguru import logger
if IS_WINDOWS:
multiprocessing.freeze_support()
tester_process, getter_process, server_process = None, None, None
class Scheduler():
"""
scheduler
"""
def run_tester(self, cycle=CYCLE_TESTER):
"""
run tester
"""
if not ENABLE_TESTER:
logger.info('tester not enabled, exit')
return
tester = Tester()
loop = 0
while True:
logger.debug(f'tester loop {loop} start...')
tester.run()
loop += 1
time.sleep(cycle)
def run_getter(self, cycle=CYCLE_GETTER):
"""
run getter
"""
if not ENABLE_GETTER:
logger.info('getter not enabled, exit')
return
getter = Getter()
loop = 0
while True:
logger.debug(f'getter loop {loop} start...')
getter.run()
loop += 1
time.sleep(cycle)
def run_server(self):
"""
run server for api
"""
if not ENABLE_SERVER:
logger.info('server not enabled, exit')
return
if IS_PROD:
if APP_PROD_METHOD == APP_PROD_METHOD_GEVENT:
try:
from gevent.pywsgi import WSGIServer
except ImportError as e:
logger.exception(e)
else:
http_server = WSGIServer((API_HOST, API_PORT), app)
http_server.serve_forever()
elif APP_PROD_METHOD == APP_PROD_METHOD_TORNADO:
try:
from tornado.wsgi import WSGIContainer
from tornado.httpserver import HTTPServer
from tornado.ioloop import IOLoop
except ImportError as e:
logger.exception(e)
else:
http_server = HTTPServer(WSGIContainer(app))
http_server.listen(API_PORT)
IOLoop.instance().start()
elif APP_PROD_METHOD == APP_PROD_METHOD_MEINHELD:
try:
import meinheld
except ImportError as e:
logger.exception(e)
else:
meinheld.listen((API_HOST, API_PORT))
meinheld.run(app)
else:
logger.error("unsupported APP_PROD_METHOD")
return
else:
app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED, use_reloader=False)
def run(self):
global tester_process, getter_process, server_process
try:
logger.info('starting proxypool...')
if ENABLE_TESTER:
tester_process = multiprocessing.Process(
target=self.run_tester)
logger.info(f'starting tester, pid {tester_process.pid}...')
tester_process.start()
if ENABLE_GETTER:
getter_process = multiprocessing.Process(
target=self.run_getter)
logger.info(f'starting getter, pid {getter_process.pid}...')
getter_process.start()
if ENABLE_SERVER:
server_process = multiprocessing.Process(
target=self.run_server)
logger.info(f'starting server, pid {server_process.pid}...')
server_process.start()
tester_process and tester_process.join()
getter_process and getter_process.join()
server_process and server_process.join()
except KeyboardInterrupt:
logger.info('received keyboard interrupt signal')
tester_process and tester_process.terminate()
getter_process and getter_process.terminate()
server_process and server_process.terminate()
finally:
# must call join method before calling is_alive
tester_process and tester_process.join()
getter_process and getter_process.join()
server_process and server_process.join()
            # guard against components that were never started (process is None)
            logger.info(
                f'tester is {"alive" if tester_process and tester_process.is_alive() else "dead"}')
            logger.info(
                f'getter is {"alive" if getter_process and getter_process.is_alive() else "dead"}')
            logger.info(
                f'server is {"alive" if server_process and server_process.is_alive() else "dead"}')
logger.info('proxy terminated')
if __name__ == '__main__':
scheduler = Scheduler()
scheduler.run()

View File

@ -0,0 +1 @@
from .proxy import Proxy

View File

@ -0,0 +1,30 @@
from attr import attrs, attr
@attrs
class Proxy(object):
"""
proxy schema
"""
host = attr(type=str, default=None)
port = attr(type=int, default=None)
def __str__(self):
"""
to string, for print
:return:
"""
return f'{self.host}:{self.port}'
def string(self):
"""
to string
:return: <host>:<port>
"""
return self.__str__()
if __name__ == '__main__':
proxy = Proxy(host='8.8.8.8', port=8888)
print('proxy', proxy)
print('proxy', proxy.string())

View File

@ -0,0 +1,123 @@
import platform
from os.path import dirname, abspath, join
from environs import Env
from loguru import logger
import shutil
env = Env()
env.read_env()
# definition of flags
IS_WINDOWS = platform.system().lower() == 'windows'
# definition of dirs
ROOT_DIR = dirname(dirname(abspath(__file__)))
LOG_DIR = join(ROOT_DIR, env.str('LOG_DIR', 'logs'))
# definition of environments
DEV_MODE, TEST_MODE, PROD_MODE = 'dev', 'test', 'prod'
APP_ENV = env.str('APP_ENV', DEV_MODE).lower()
APP_DEBUG = env.bool('APP_DEBUG', True if APP_ENV == DEV_MODE else False)
APP_DEV = IS_DEV = APP_ENV == DEV_MODE
APP_PROD = IS_PROD = APP_ENV == PROD_MODE
APP_TEST = IS_TEST = APP_ENV == TEST_MODE
# Which WSGI container is used to run applications
# - gevent: pip install gevent
# - tornado: pip install tornado
# - meinheld: pip install meinheld
APP_PROD_METHOD_GEVENT = 'gevent'
APP_PROD_METHOD_TORNADO = 'tornado'
APP_PROD_METHOD_MEINHELD = 'meinheld'
APP_PROD_METHOD = env.str('APP_PROD_METHOD', APP_PROD_METHOD_GEVENT).lower()
# redis host
# note: the default host has been changed here
REDIS_HOST = env.str('PROXYPOOL_REDIS_HOST',
env.str('REDIS_HOST', '192.168.118.202'))
# redis port
REDIS_PORT = env.int('PROXYPOOL_REDIS_PORT', env.int('REDIS_PORT', 6379))
# redis password, if no password, set it to None
REDIS_PASSWORD = env.str('PROXYPOOL_REDIS_PASSWORD',
env.str('REDIS_PASSWORD', None))
# redis db, if no choice, set it to 0
REDIS_DB = env.int('PROXYPOOL_REDIS_DB', env.int('REDIS_DB', 0))
# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0,
# please refer to https://redis-py.readthedocs.io/en/stable/connections.html#redis.client.Redis.from_url
REDIS_CONNECTION_STRING = env.str(
'PROXYPOOL_REDIS_CONNECTION_STRING', env.str('REDIS_CONNECTION_STRING', None))
# redis hash table key name
REDIS_KEY = env.str('PROXYPOOL_REDIS_KEY', env.str(
'REDIS_KEY', 'proxies:universal'))
# definition of proxy scores
IS_PAID = env.bool('IS_PAID', True)
PROXY_SCORE_MAX = env.int('PROXY_SCORE_MAX', 100)
PROXY_SCORE_MIN = env.int('PROXY_SCORE_MIN', 0)
PROXY_SCORE_INIT = env.int('PROXY_SCORE_INIT', 10)
# definition of proxy number
PROXY_NUMBER_MAX = 50000
PROXY_NUMBER_MIN = 0
# definition of tester cycle, it will test every CYCLE_TESTER seconds
CYCLE_TESTER = env.int('CYCLE_TESTER', 20)
# definition of getter cycle, it will get proxies every CYCLE_GETTER seconds
CYCLE_GETTER = env.int('CYCLE_GETTER', 100)
GET_TIMEOUT = env.int('GET_TIMEOUT', 10)
# definition of tester
TEST_URL = env.str('TEST_URL', 'http://www.baidu.com')
TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10)
TEST_BATCH = env.int('TEST_BATCH', 20)
# only save anonymous proxy
TEST_ANONYMOUS = env.bool('TEST_ANONYMOUS', True)
# TEST_HEADERS = env.json('TEST_HEADERS', {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
# })
TEST_VALID_STATUS = env.list('TEST_VALID_STATUS', [200, 206, 302])
# whether to skip setting the max score when a proxy is tested valid
TEST_DONT_SET_MAX_SCORE = env.bool('TEST_DONT_SET_MAX_SCORE', False)
# definition of api
API_HOST = env.str('API_HOST', '0.0.0.0')
API_PORT = env.int('API_PORT', 5555)
API_THREADED = env.bool('API_THREADED', True)
# add an api key to get proxy
# need a header of `API-KEY` in get request to pass the authenticate
# API_KEY='', do not need `API-KEY` header
API_KEY = env.str('API_KEY', '')
# flags of enable
ENABLE_TESTER = env.bool('ENABLE_TESTER', True)
ENABLE_GETTER = env.bool('ENABLE_GETTER', True)
ENABLE_SERVER = env.bool('ENABLE_SERVER', True)
ENABLE_LOG_FILE = env.bool('ENABLE_LOG_FILE', True)
ENABLE_LOG_RUNTIME_FILE = env.bool('ENABLE_LOG_RUNTIME_FILE', True)
ENABLE_LOG_ERROR_FILE = env.bool('ENABLE_LOG_ERROR_FILE', True)
LOG_LEVEL_MAP = {
DEV_MODE: "DEBUG",
TEST_MODE: "INFO",
PROD_MODE: "ERROR"
}
LOG_LEVEL = LOG_LEVEL_MAP.get(APP_ENV)
LOG_ROTATION = env.str('LOG_ROTATION', '500MB')
LOG_RETENTION = env.str('LOG_RETENTION', '1 week')
if ENABLE_LOG_FILE:
if ENABLE_LOG_RUNTIME_FILE:
logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')),
level=LOG_LEVEL, rotation=LOG_ROTATION, retention=LOG_RETENTION)
if ENABLE_LOG_ERROR_FILE:
logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')),
level='ERROR', rotation=LOG_ROTATION)
else:
shutil.rmtree(LOG_DIR, ignore_errors=True)

View File

@ -0,0 +1,149 @@
import redis
from proxypool.exceptions import PoolEmptyException
from proxypool.schemas.proxy import Proxy
from proxypool.setting import REDIS_CONNECTION_STRING, REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB, REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MIN, \
PROXY_SCORE_INIT
from random import choice
from typing import List
from loguru import logger
from proxypool.utils.proxy import is_valid_proxy, convert_proxy_or_proxies
REDIS_CLIENT_VERSION = redis.__version__
print(REDIS_CLIENT_VERSION)
IS_REDIS_VERSION_2 = REDIS_CLIENT_VERSION.startswith('2.')
class RedisClient(object):
"""
redis connection client of proxypool
"""
def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=REDIS_DB,
connection_string=REDIS_CONNECTION_STRING, **kwargs):
"""
init redis client
:param host: redis host
:param port: redis port
:param password: redis password
:param connection_string: redis connection_string
"""
# if set connection_string, just use it
if connection_string:
self.db = redis.StrictRedis.from_url(connection_string, decode_responses=True, **kwargs)
else:
self.db = redis.StrictRedis(
host=host, port=port, password=password, db=db, decode_responses=True, **kwargs)
    # add a usable proxy
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
"""
add proxy and set it to init score
:param proxy: proxy, ip:port, like 8.8.8.8:88
:param score: int score
:return: result
"""
if not is_valid_proxy(f'{proxy.host}:{proxy.port}'):
            logger.info(f'invalid proxy {proxy}, discard it')
return
if not self.exists(proxy):
if IS_REDIS_VERSION_2:
# if False:
return self.db.zadd(REDIS_KEY, score, proxy.string())
return self.db.zadd(REDIS_KEY, {proxy.string(): score})
    # randomly pick a usable proxy
def random(self) -> Proxy:
"""
get random proxy
        first, try to get a proxy with the max score
        if none exists, try to get a proxy by rank
        if none exists, raise an error
        :return: proxy, like 8.8.8.8:8888
"""
# try to get proxy with max score
proxies = self.db.zrangebyscore(
REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
if len(proxies):
return convert_proxy_or_proxies(choice(proxies))
# else get proxy by rank
proxies = self.db.zrevrange(
REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
if len(proxies):
return convert_proxy_or_proxies(choice(proxies))
# else raise error
raise PoolEmptyException
    # decrease the score of a bad proxy
def decrease(self, proxy: Proxy) -> int:
"""
        decrease score of proxy; if smaller than PROXY_SCORE_MIN, delete it
:param proxy: proxy
:return: new score
"""
if IS_REDIS_VERSION_2:
# if False:
self.db.zincrby(REDIS_KEY, proxy.string(), -1)
else:
self.db.zincrby(REDIS_KEY, -1, proxy.string())
score = self.db.zscore(REDIS_KEY, proxy.string())
logger.info(f'{proxy.string()} score decrease 1, current {score}')
        # delete the proxy once its score is at or below the minimum
if score <= PROXY_SCORE_MIN:
logger.info(f'{proxy.string()} current score {score}, remove')
self.db.zrem(REDIS_KEY, proxy.string())
    # whether this proxy exists
def exists(self, proxy: Proxy) -> bool:
"""
if proxy exists
:param proxy: proxy
:return: if exists, bool
"""
        return self.db.zscore(REDIS_KEY, proxy.string()) is not None
    # set the score of a proxy to the maximum
def max(self, proxy: Proxy) -> int:
"""
set proxy to max score
:param proxy: proxy
:return: new score
"""
logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
if IS_REDIS_VERSION_2:
# if False:
return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string())
return self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX})
    # how many proxies there are
def count(self) -> int:
"""
get count of proxies
:return: count, int
"""
return self.db.zcard(REDIS_KEY)
    # return all proxies
def all(self) -> List[Proxy]:
"""
get all proxies
:return: list of proxies
"""
return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))
    # return a batch of proxies according to the cursor and count
def batch(self, cursor, count) -> List[Proxy]:
"""
get batch of proxies
:param cursor: scan cursor
:param count: scan count
:return: list of proxies
"""
cursor, proxies = self.db.zscan(REDIS_KEY, cursor, count=count)
return cursor, convert_proxy_or_proxies([i[0] for i in proxies])
if __name__ == '__main__':
conn = RedisClient()
result = conn.random()
print(result)

View File

@ -0,0 +1,94 @@
from proxypool.schemas import Proxy
import sys
import os
# pythonpath= os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# print(pythonpath)
# sys.path.insert(0,pythonpath)
def is_valid_proxy(data):
"""
    check whether this string matches the proxy format
"""
if is_auth_proxy(data):
host, port = extract_auth_proxy(data)
return is_ip_valid(host) and is_port_valid(port)
    elif ':' in data:
ip = data.split(':')[0]
port = data.split(':')[1]
return is_ip_valid(ip) and is_port_valid(port)
else:
return is_ip_valid(data)
def is_ip_valid(ip):
"""
    check whether this string matches the IP format
"""
if is_auth_proxy(ip):
ip = ip.split('@')[1]
a = ip.split('.')
if len(a) != 4:
return False
for x in a:
if not x.isdigit():
return False
i = int(x)
if i < 0 or i > 255:
return False
return True
def is_port_valid(port):
return port.isdigit()
def convert_proxy_or_proxies(data):
"""
convert list of str to valid proxies or proxy
:param data:
:return:
"""
if not data:
return None
# if list of proxies
if isinstance(data, list):
result = []
for item in data:
# skip invalid item
item = item.strip()
            if not is_valid_proxy(item):
                continue
if is_auth_proxy(item):
host, port = extract_auth_proxy(item)
else:
host, port = item.split(':')
result.append(Proxy(host=host, port=int(port)))
return result
if isinstance(data, str) and is_valid_proxy(data):
if is_auth_proxy(data):
host, port = extract_auth_proxy(data)
else:
host, port = data.split(':')
return Proxy(host=host, port=int(port))
def is_auth_proxy(data: str) -> bool:
return '@' in data
def extract_auth_proxy(data: str) -> tuple:
"""
extract host and port from a proxy with authentication
"""
auth = data.split('@')[0]
ip_port = data.split('@')[1]
ip = ip_port.split(':')[0]
port = ip_port.split(':')[1]
host = auth + '@' + ip
return host, port
if __name__ == '__main__':
proxy = 'test1234:test5678.@117.68.216.212:32425'
print(extract_auth_proxy(proxy))

View File

@ -0,0 +1,2 @@
git tag -a "`date +'%Y%m%d'`" -m "Release `date +'%Y%m%d'`"
git push origin --tags

View File

@ -0,0 +1,17 @@
environs>=9.3.0,<10.0.0
Flask>=1.1.2,<2.0.0
attrs>=20.3.0,<21.0.0
retrying>=1.3.3,<2.0.0
aiohttp>=3.8.1,<4.0.0
requests>=2.25.1,<3.0.0
loguru>=0.5.3,<1.0.0
pyquery>=1.4.3,<2.0.0
supervisor>=4.2.1,<5.0.0
redis>=3.5.3,<4.0.0
lxml>=4.6.5,<5.0.0
fake_headers>=1.0.2,<2.0.0
maxminddb_geolite2==2018.703
gevent>=21.8.0,<22.0.0
tornado>=6.0,<7.0
itsdangerous==0.24
MarkupSafe<2.1.0

View File

@ -0,0 +1,14 @@
from proxypool.scheduler import Scheduler
import argparse
parser = argparse.ArgumentParser(description='ProxyPool')
parser.add_argument('--processor', type=str, help='processor to run')
args = parser.parse_args()
if __name__ == '__main__':
# if processor set, just run it
if args.processor:
getattr(Scheduler(), f'run_{args.processor}')()
else:
Scheduler().run()

View File

@ -0,0 +1,40 @@
[unix_http_server]
file=/run/supervisor.sock
chmod=0700
[supervisord]
pidfile=/run/supervisord.pid
nodaemon=true
[supervisorctl]
serverurl=unix:///run/supervisor.sock
[rpcinterface:supervisor]
supervisor.rpcinterface_factory=supervisor.rpcinterface:make_main_rpcinterface
[program:tester]
process_name=tester
command=python3 run.py --processor tester
directory=/app
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:getter]
process_name=getter
command=python3 run.py --processor getter
directory=/app
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:server]
process_name=server
command=python3 run.py --processor server
directory=/app
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

View File

@ -0,0 +1,2 @@
All of the ProxyPool content is based on https://github.com/Python3WebSpider/ProxyPool.
The project above explains in detail how to deploy the proxy pool, either with Docker or with Kubernetes.
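For quick reference, a minimal client sketch is included below. It assumes the API server is running locally on the default port 5555 (see `proxypool/setting.py`) and that it exposes a `/random` endpoint returning a `host:port` string, as in the upstream ProxyPool project; adjust the URL to your own deployment.
```python
import requests

# assumed address of the local ProxyPool API server (default API_HOST/API_PORT)
PROXYPOOL_API = 'http://localhost:5555/random'


def get_random_proxy():
    # fetch one usable proxy in host:port form from the pool
    return requests.get(PROXYPOOL_API, timeout=10).text.strip()


if __name__ == '__main__':
    proxy = get_random_proxy()
    print('using proxy:', proxy)
    # route a request through the fetched proxy
    response = requests.get(
        'https://httpbin.org/ip',
        proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
        timeout=10,
    )
    print(response.text)
```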

View File

@ -0,0 +1,32 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: bug
assignees: Germey
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Environments (please complete the following information):**
- OS: [e.g. macOS 10.15.2]
- Python [e.g. Python 3.6]
- Browser [e.g. Chrome 67]
**Additional context**
Add any other context about the problem here.

View File

@ -0,0 +1,24 @@
name: build
on:
push:
branches:
- master
paths-ignore:
- .gitignore
- README.md
- '.github/ISSUE_TEMPLATE/**'
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Checkout Source
uses: actions/checkout@v1
- name: Docker Login
run: docker login -u germey -p ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }}
- name: Build the Docker Image
run: docker-compose build
- name: Tag and Push Master Version
run: |
docker tag germey/accountpool germey/accountpool:master
docker push germey/accountpool:master

View File

@ -0,0 +1,144 @@
/.idea
*.pyc
ghostdriver.log
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
logs/

View File

@ -0,0 +1,6 @@
FROM python:3.6
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
CMD ["supervisord", "-c", "supervisord.conf"]

View File

@ -0,0 +1,216 @@
# AccountPool
![build](https://github.com/Python3WebSpider/AccountPool/workflows/build/badge.svg)
![](https://img.shields.io/badge/python-3.6%2B-brightgreen)
![Docker Pulls](https://img.shields.io/docker/pulls/germey/accountpool)
A simple and efficient account pool that provides the following features:
- Periodically simulates account logins and stores the resulting Cookies or JWTs in a Redis database.
- Periodically tests the stored credentials and removes Cookies or JWTs that are no longer usable.
- Provides an API for randomly fetching a tested, usable Cookie or JWT (see the client sketch right after this list).
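As an illustration, a minimal client for the random-credential API might look like the sketch below. It assumes the server is reachable on port 6777 (as used throughout this README), that the website is antispider7, and that its credential is a JWT passed in an `authorization: jwt <token>` header, mirroring `Antispider7Tester`.
```python
import requests

# assumed AccountPool API address and target website
API_BASE = 'http://localhost:6777'
WEBSITE = 'antispider7'


def random_credential():
    # fetch one tested credential from the pool
    return requests.get(f'{API_BASE}/{WEBSITE}/random', timeout=10).text.strip()


if __name__ == '__main__':
    token = random_credential()
    # use the JWT the same way Antispider7Tester does
    response = requests.get(
        f'https://{WEBSITE}.scrape.center/',
        headers={'authorization': f'jwt {token}'},
        timeout=10,
    )
    print(response.status_code)
```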
## Requirements
The account pool can be run in two ways: with Docker (recommended) or in the conventional way.
### Docker
If you use Docker, the following environment is required:
- Docker
- Docker-Compose
### Conventional Way
The conventional way requires a Python environment and a Redis instance:
- Python>=3.6
- Redis
## Running with Docker
If Docker and Docker-Compose are installed, a single command is enough to start the pool:
```shell script
docker-compose up
```
The output looks similar to the following:
```
redis4accountpool is up-to-date
Recreating accountpool ... done
Attaching to redis4accountpool, accountpool
redis4accountpool | 1:C 31 Aug 2023 03:53:10.335 * oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo
redis4accountpool | 1:C 31 Aug 2023 03:53:10.335 * Redis version=7.2.0, bits=64, commit=00000000, modified=0, pid=1, just started
redis4accountpool | 1:C 31 Aug 2023 03:53:10.335 # Warning: no config file specified, using the default config. In order to specify a config file use redis-server /path/to/redis.conf
redis4accountpool | 1:M 31 Aug 2023 03:53:10.335 * monotonic clock: POSIX clock_gettime
redis4accountpool | 1:M 31 Aug 2023 03:53:10.336 * Running mode=standalone, port=6379.
redis4accountpool | 1:M 31 Aug 2023 03:53:10.336 * Server initialized
redis4accountpool | 1:M 31 Aug 2023 03:53:10.336 * Ready to accept connections tcp
redis4accountpool | 1:C 31 Aug 2023 04:03:11.226 * oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo
redis4accountpool | 1:C 31 Aug 2023 04:03:11.226 * Redis version=7.2.0, bits=64, commit=00000000, modified=0, pid=1, just started
redis4accountpool | 1:C 31 Aug 2023 04:03:11.226 # Warning: no config file specified, using the default config. In order to specify a config file use redis-server /path/to/redis.conf
redis4accountpool | 1:M 31 Aug 2023 04:03:11.226 * monotonic clock: POSIX clock_gettime
redis4accountpool | 1:M 31 Aug 2023 04:03:11.227 * Running mode=standalone, port=6379.
redis4accountpool | 1:M 31 Aug 2023 04:03:11.227 * Server initialized
redis4accountpool | 1:M 31 Aug 2023 04:03:11.227 * Ready to accept connections tcp
accountpool | 2023-08-31 04:06:20,737 CRIT Supervisor is running as root. Privileges were not dropped because no user is specified in the config file. If you intend to run as root, you can set user=root in the config file to avoid this message.
accountpool | 2023-08-31 04:06:20,739 INFO supervisord started with pid 1
accountpool | 2023-08-31 04:06:21,742 INFO spawned: 'generator' with pid 10
accountpool | 2023-08-31 04:06:21,744 INFO spawned: 'server' with pid 11
accountpool | 2023-08-31 04:06:21,746 INFO spawned: 'tester' with pid 12
accountpool | 2023-08-31 04:06:21.990 | DEBUG | accountpool.scheduler:run_tester:31 - tester loop 0 start...
accountpool | 2023-08-31 04:06:21.990 | DEBUG | accountpool.scheduler:run_generator:46 - getter loop 0 start...
accountpool | * Running on all addresses.
accountpool | WARNING: This is a development server. Do not use it in a production deployment.
accountpool | * Running on http://172.24.0.3:6777/ (Press CTRL+C to quit)
accountpool | 2023-08-31 04:06:22.004 | DEBUG | accountpool.processors.generator:run:39 - start to run generator
accountpool | 2023-08-31 04:06:22.005 | DEBUG | accountpool.processors.generator:run:43 - start to generate credential of admin1
accountpool | 2023-08-31 04:06:23,007 INFO success: generator entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)
accountpool | 2023-08-31 04:06:23,007 INFO success: server entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)
accountpool | 2023-08-31 04:06:23,007 INFO success: tester entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)
```
You can see that Redis, Generator, Server and Tester have all started successfully.
You also need to import some account information into the Redis database. Since Redis was started with Docker, it is exposed on port 6333.
At this point you can run the following script:
```
export REDIS_PORT=6333
python3 importer.py antispider7
```
If the script finishes without errors, the accounts have been imported successfully; you can connect to Redis yourself to check (see the sketch below).
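As a sketch of that check in Python (assuming the 6333:6379 port mapping from docker-compose.yml and the `account:antispider7` hash name produced by `RedisClient.name()`):
```python
import redis

# connect to the Redis container exposed on the host (assumed mapping 6333 -> 6379)
db = redis.StrictRedis(host='localhost', port=6333, decode_responses=True)
# accounts are stored in a hash named '<type>:<website>'
print(db.hlen('account:antispider7'))            # number of imported accounts
print(db.hget('account:antispider7', 'admin1'))  # password stored for admin1
```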
After a short while, visit [http://localhost:6777/antispider7/random](http://localhost:6777/antispider7/random) to get a random usable credential for [antispider7](https://antispider7.scrape.center).
## Running the Conventional Way
If you do not use Docker, the pool can also be run once Python and Redis are set up. The steps are as follows.
### Install and Configure Redis
A locally installed Redis, a Redis started with Docker, or a remote Redis all work, as long as it can be connected to normally.
First, set the following environment variables; the account pool reads these values from them.
There are two ways to configure Redis via environment variables: set host, port and password separately, or set a single connection string. Both are shown below.
Set host, port and password (if there is no password, set it to an empty string), for example:
```shell script
export REDIS_HOST='localhost'
export REDIS_PORT=6379
export REDIS_PASSWORD=''
export REDIS_DB=0
```
Or set only the connection string:
```shell script
export REDIS_CONNECTION_STRING='redis://[password]@host:port/db'
```
If there is no password, it still needs to be set as:
```shell script
export REDIS_CONNECTION_STRING='redis://@host:port/db'
```
The connection string must follow the `redis://[password]@host:port/db` format; be careful not to omit the `@`.
Either of the two approaches works; pick one.
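If you go the connection-string route, a quick sanity check with the project's own parser can catch formatting mistakes before the pool starts. A minimal sketch, assuming you run it from the project root so `accountpool` is importable:
```python
from accountpool.utils.parse import parse_redis_connection_string

# a well-formed string yields (host, port, password, db)
print(parse_redis_connection_string('redis://secret@localhost:6379/0'))
# an empty password is returned as None
print(parse_redis_connection_string('redis://@localhost:6379/0'))
```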
### Install Dependencies
It is strongly recommended to use [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands)
or [virtualenv](https://virtualenv.pypa.io/en/latest/user_guide.html) to create a virtual environment, with Python 3.6 or later.
Then install the dependencies with pip:
```shell script
pip3 install -r requirements.txt
```
### Run the Account Pool
There are two ways to run the account pool: run the Tester, Generator and Server all together, or run each processor separately as needed.
In general you can run everything with the following command:
```shell script
python3 run.py <website>
```
This starts the Tester, Generator and Server; you can then visit [http://localhost:6777/<website>/random](http://localhost:6777/<website>/random) to get a random usable credential.
Alternatively, once you understand the architecture of the account pool, you can run the processors separately as needed:
```shell script
python3 run.py <website> --processor generator
python3 run.py <website> --processor tester
python3 run.py <website> --processor server
```
Here `processor` specifies whether to run the Tester, Generator or Server.
## Configurable Options
The account pool can be configured with environment variables; a quick way to check which values take effect is sketched after the lists below.
### Switches
- ENABLE_TESTER: whether to start the Tester, default true
- ENABLE_GENERATOR: whether to start the Generator, default true
- ENABLE_SERVER: whether to start the Server, default true
### Environment
- APP_ENV: runtime environment, one of dev, test, prod (development, test, production), default dev
- APP_DEBUG: debug mode, true or false, default true
### Redis Connection
- REDIS_HOST: Redis host
- REDIS_PORT: Redis port
- REDIS_PASSWORD: Redis password
- REDIS_DB: Redis database index, e.g. 0 or 1
- REDIS_CONNECTION_STRING: Redis connection string
- REDIS_ACCOUNT_KEY / REDIS_CREDENTIAL_KEY: names of the Redis hashes used to store accounts and credentials
### Processors
- CYCLE_TESTER: Tester run cycle, i.e. how often the test runs, default 600 seconds
- CYCLE_GENERATOR: Generator run cycle, i.e. how often credentials are generated, default 600 seconds
- API_HOST: host the API Server runs on, default 0.0.0.0
- API_PORT: port the API Server runs on, default 6789 (the Docker setup sets it to 6777)
- API_THREADED: whether the API Server handles requests in multiple threads, default true
### Logging
- LOG_DIR: relative path of the log directory
- LOG_RUNTIME_FILE: runtime log file name
- LOG_ERROR_FILE: error log file name
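A minimal sketch of how these variables feed into the settings module (assuming the project root is on the Python path); overrides must be set before `accountpool.setting` is imported:
```python
import os

# override a few settings via environment variables (hypothetical values)
os.environ['CYCLE_TESTER'] = '300'
os.environ['ENABLE_GENERATOR'] = 'false'

from accountpool import setting

print(setting.CYCLE_TESTER)                # 300
print(setting.ENABLE_GENERATOR)            # False
print(setting.API_HOST, setting.API_PORT)  # defaults unless overridden
```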
## Deployment
This project provides a Kubernetes deployment script. To deploy to Kubernetes, run the following command:
```shell script
cat deployment.yml | sed 's/\${TAG}/latest/g' | kubectl apply -f -
```
## TODO
- [ ] Front-end management page
- [ ] Usage statistics and analysis
If you are interested in contributing, please leave a message in an Issue. Thanks a lot!
## LICENSE
MIT

View File

@ -0,0 +1,7 @@
class InitException(Exception):
def __str__(self):
"""
init error
:return:
"""
return repr('init failed')

View File

@ -0,0 +1,111 @@
import requests
from accountpool.exceptions.init import InitException
from accountpool.storages.redis import RedisClient
from loguru import logger
class BaseGenerator(object):
def __init__(self, website=None):
"""
init base generator
:param website: name of website
"""
self.website = website
if not self.website:
raise InitException
self.account_operator = RedisClient(type='account', website=self.website)
self.credential_operator = RedisClient(type='credential', website=self.website)
def generate(self, username, password):
"""
generate method
:param username: username
:param password: password
:return:
"""
raise NotImplementedError
def init(self):
"""
do init
"""
pass
def run(self):
"""
run main process
:return:
"""
self.init()
logger.debug('start to run generator')
for username, password in self.account_operator.all().items():
if self.credential_operator.get(username):
continue
logger.debug(f'start to generate credential of {username}')
self.generate(username, password)
class Antispider6Generator(BaseGenerator):
def init(self):
"""
do init
"""
if self.account_operator.count() == 0:
self.account_operator.set('admin', 'admin')
self.account_operator.set('admin2', 'admin2')
def generate(self, username, password):
"""
generate main process
"""
if self.credential_operator.get(username):
logger.debug(f'credential of {username} exists, skip')
return
login_url = 'https://antispider6.scrape.center/login'
s = requests.Session()
s.post(login_url, data={
'username': username,
'password': password
})
result = []
for cookie in s.cookies:
print(cookie.name, cookie.value)
result.append(f'{cookie.name}={cookie.value}')
result = ';'.join(result)
logger.debug(f'get credential {result}')
self.credential_operator.set(username, result)
class Antispider7Generator(BaseGenerator):
MAX_COUNT = 100
def init(self):
"""
do init
"""
for i in range(1, self.MAX_COUNT + 1):
self.account_operator.set(f'admin{i}', f'admin{i}')
def generate(self, username, password):
"""
generate main process
"""
if self.credential_operator.get(username):
logger.debug(f'credential of {username} exists, skip')
return
login_url = 'https://antispider7.scrape.center/api/login'
s = requests.Session()
r = s.post(login_url, json={
'username': username,
'password': password
})
if r.status_code != 200:
logger.error(f'error occurred while generating credential of {username}, error code {r.status_code}')
return
token = r.json().get('token')
logger.debug(f'get credential {token}')
self.credential_operator.set(username, token)

View File

@ -0,0 +1,69 @@
import json
from flask import Flask, g
from accountpool.storages.redis import RedisClient
from accountpool.setting import GENERATOR_MAP
from loguru import logger
__all__ = ['app']
app = Flask(__name__)
account = 'account'
credential = 'credential'
@app.route('/')
def index():
return '<h2>Welcome to Account Pool System</h2>'
def get_conn():
"""
get connection
:return:
"""
for website in GENERATOR_MAP:
        if not hasattr(g, f'{website}_{credential}'):
setattr(g, f'{website}_{credential}', RedisClient(credential, website))
setattr(g, f'{website}_{account}', RedisClient(account, website))
return g
@app.route('/<website>/random')
def random(website):
"""
    get a random credential, route: /<website>/random
:return: random credential
"""
g = get_conn()
result = getattr(g, f'{website}_{credential}').random()
logger.debug(f'get credential {result}')
return result
@app.route('/<website>/add/<username>/<password>')
def add(website, username, password):
"""
    add an account, route: /<website>/add/<username>/<password>
:param website: website
:param username: username
:param password: password
:return:
"""
g = get_conn()
getattr(g, f'{website}_{account}').set(username, password)
return json.dumps({'status': '1'})
@app.route('/<website>/count')
def count(website):
"""
get count of credential
"""
g = get_conn()
count = getattr(g, f'{website}_{credential}').count()
return json.dumps({'status': 'ok', 'count': count})
if __name__ == '__main__':
app.run(host='0.0.0.0')

View File

@ -0,0 +1,90 @@
import json
import requests
from requests.exceptions import ConnectionError
# the star import also brings in the settings (e.g. TEST_URL_MAP) re-exported by storages.redis
from accountpool.storages.redis import *
from accountpool.exceptions.init import InitException
from loguru import logger
class BaseTester(object):
"""
base tester
"""
def __init__(self, website=None):
"""
init base tester
"""
self.website = website
if not self.website:
raise InitException
self.account_operator = RedisClient(type='account', website=self.website)
self.credential_operator = RedisClient(type='credential', website=self.website)
def test(self, username, credential):
"""
test single credential
"""
raise NotImplementedError
def run(self):
"""
test all credentials
"""
credentials = self.credential_operator.all()
for username, credential in credentials.items():
self.test(username, credential)
class Antispider6Tester(BaseTester):
"""
tester for antispider6
"""
def __init__(self, website=None):
BaseTester.__init__(self, website)
def test(self, username, credential):
"""
test single credential
"""
logger.info(f'testing credential for {username}')
try:
test_url = TEST_URL_MAP[self.website]
response = requests.get(test_url, headers={
'Cookie': credential
}, timeout=5, allow_redirects=False)
if response.status_code == 200:
logger.info('credential is valid')
else:
logger.info('credential is not valid, delete it')
self.credential_operator.delete(username)
except ConnectionError:
logger.info('test failed')
class Antispider7Tester(BaseTester):
"""
tester for antispider7
"""
def __init__(self, website=None):
BaseTester.__init__(self, website)
def test(self, username, credential):
"""
test single credential
"""
logger.info(f'testing credential for {username}')
try:
test_url = TEST_URL_MAP[self.website]
response = requests.get(test_url, headers={
'authorization': f'jwt {credential}'
}, timeout=5, allow_redirects=False)
if response.status_code == 200:
logger.info('credential is valid')
else:
logger.info('credential is not valid, delete it')
self.credential_operator.delete(username)
except ConnectionError:
logger.info('test failed')

View File

@ -0,0 +1,95 @@
import time
import multiprocessing
from accountpool.processors.server import app
from accountpool.processors import generator as generators
from accountpool.processors import tester as testers
from accountpool.setting import CYCLE_GENERATOR, CYCLE_TESTER, API_HOST, API_THREADED, API_PORT, ENABLE_SERVER, \
ENABLE_GENERATOR, ENABLE_TESTER, IS_WINDOWS, TESTER_MAP, GENERATOR_MAP
from loguru import logger
if IS_WINDOWS:
multiprocessing.freeze_support()
tester_process, generator_process, server_process = None, None, None
class Scheduler(object):
"""
scheduler
"""
def run_tester(self, website, cycle=CYCLE_TESTER):
"""
run tester
"""
if not ENABLE_TESTER:
logger.info('tester not enabled, exit')
return
tester = getattr(testers, TESTER_MAP[website])(website)
loop = 0
while True:
logger.debug(f'tester loop {loop} start...')
tester.run()
loop += 1
time.sleep(cycle)
def run_generator(self, website, cycle=CYCLE_GENERATOR):
"""
        run generator
        """
        if not ENABLE_GENERATOR:
            logger.info('generator not enabled, exit')
return
generator = getattr(generators, GENERATOR_MAP[website])(website)
loop = 0
while True:
logger.debug(f'getter loop {loop} start...')
generator.run()
loop += 1
time.sleep(cycle)
def run_server(self, _):
"""
run server for api
"""
if not ENABLE_SERVER:
logger.info('server not enabled, exit')
return
app.run(host=API_HOST, port=API_PORT, threaded=API_THREADED)
def run(self, website):
global tester_process, generator_process, server_process
try:
logger.info(f'starting account pool for website {website}...')
            if ENABLE_TESTER:
                tester_process = multiprocessing.Process(target=self.run_tester, args=(website,))
                tester_process.start()
                logger.info(f'started tester, pid {tester_process.pid}...')
            if ENABLE_GENERATOR:
                generator_process = multiprocessing.Process(target=self.run_generator, args=(website,))
                generator_process.start()
                logger.info(f'started generator, pid {generator_process.pid}...')
            if ENABLE_SERVER:
                server_process = multiprocessing.Process(target=self.run_server, args=(website,))
                server_process.start()
                logger.info(f'started server, pid {server_process.pid}...')
            tester_process and tester_process.join()
            generator_process and generator_process.join()
            server_process and server_process.join()
        except KeyboardInterrupt:
            logger.info('received keyboard interrupt signal')
            tester_process and tester_process.terminate()
            generator_process and generator_process.terminate()
            server_process and server_process.terminate()
        finally:
            # must call join method before calling is_alive
            tester_process and tester_process.join()
            generator_process and generator_process.join()
            server_process and server_process.join()
            logger.info(f'tester is {"alive" if tester_process and tester_process.is_alive() else "dead"}')
            logger.info(f'generator is {"alive" if generator_process and generator_process.is_alive() else "dead"}')
            logger.info(f'server is {"alive" if server_process and server_process.is_alive() else "dead"}')
logger.info('accountpool terminated')

View File

@ -0,0 +1,83 @@
import platform
from os.path import dirname, abspath, join
from environs import Env
from loguru import logger
from accountpool.utils.parse import parse_redis_connection_string
env = Env()
env.read_env()
# definition of flags
IS_WINDOWS = platform.system().lower() == 'windows'
# definition of dirs
ROOT_DIR = dirname(dirname(abspath(__file__)))
LOG_DIR = join(ROOT_DIR, env.str('LOG_DIR', 'logs'))
# definition of environments
DEV_MODE, TEST_MODE, PROD_MODE = 'dev', 'test', 'prod'
APP_ENV = env.str('APP_ENV', DEV_MODE).lower()
APP_DEBUG = env.bool('APP_DEBUG', True if APP_ENV == DEV_MODE else False)
APP_DEV = IS_DEV = APP_ENV == DEV_MODE
APP_PROD = IS_PROD = APP_ENV == PROD_MODE
APP_TEST = IS_TEST = APP_ENV == TEST_MODE
# redis host
REDIS_HOST = env.str('REDIS_HOST', '127.0.0.1')
# redis port
REDIS_PORT = env.int('REDIS_PORT', 6379)
# redis password, if no password, set it to None
REDIS_PASSWORD = env.str('REDIS_PASSWORD', None)
# redis db, if no choice, set it to 0
REDIS_DB = env.int('REDIS_DB', 0)
# redis connection string, like redis://[password]@host:port or rediss://[password]@host:port/0
REDIS_CONNECTION_STRING = env.str('REDIS_CONNECTION_STRING', None)
if REDIS_CONNECTION_STRING:
REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_DB = parse_redis_connection_string(REDIS_CONNECTION_STRING)
# redis hash table key name
REDIS_ACCOUNT_KEY = env.str('REDIS_ACCOUNT_KEY', 'accounts:%s')
REDIS_CREDENTIAL_KEY = env.str('REDIS_CREDENTIAL_KEY', 'credential:%s')
# integrated generator
GENERATOR_MAP = {
'antispider6': 'Antispider6Generator',
'antispider7': 'Antispider7Generator'
}
# integrated tester
TESTER_MAP = {
'antispider6': 'Antispider6Tester',
'antispider7': 'Antispider7Tester',
}
# definition of tester cycle, it will test every CYCLE_TESTER seconds
CYCLE_TESTER = env.int('CYCLE_TESTER', 600)
# definition of generator cycle, it will generate credentials every CYCLE_GENERATOR seconds
CYCLE_GENERATOR = env.int('CYCLE_GENERATOR', 600)
GET_TIMEOUT = env.int('GET_TIMEOUT', 10)
# definition of tester
TEST_URL = env.str('TEST_URL', 'http://www.baidu.com')
TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10)
TEST_BATCH = env.int('TEST_BATCH', 20)
# test url
TEST_URL_MAP = {
'antispider6': 'https://antispider6.scrape.center/',
'antispider7': 'https://antispider7.scrape.center/'
}
# definition of api
API_HOST = env.str('API_HOST', '0.0.0.0')
API_PORT = env.int('API_PORT', 6789)
API_THREADED = env.bool('API_THREADED', True)
# flags of enable
ENABLE_TESTER = env.bool('ENABLE_TESTER', True)
ENABLE_GENERATOR = env.bool('ENABLE_GENERATOR', True)
ENABLE_SERVER = env.bool('ENABLE_SERVER', True)
logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week',
retention='20 days')
logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week')

View File

@ -0,0 +1,80 @@
import random
import redis
from accountpool.setting import *
class RedisClient(object):
"""
redis client
"""
def __init__(self, type, website, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD):
"""
init redis client
:param host: redis host
:param port: redis port
:param password: redis password
"""
self.db = redis.StrictRedis(host=host, port=port, password=password, decode_responses=True)
self.type = type
self.website = website
def name(self):
"""
get hash name
:return: name of hash
"""
return f'{self.type}:{self.website}'
def set(self, username, value):
"""
set key-value
:param username: username
:param value: password or cookies
:return:
"""
return self.db.hset(self.name(), username, value)
def get(self, username):
"""
get value
:param username: username
:return:
"""
return self.db.hget(self.name(), username)
def delete(self, username):
"""
delete key-value
:param username: username
:return: result
"""
return self.db.hdel(self.name(), username)
def count(self):
"""
get count
:return: count
"""
return self.db.hlen(self.name())
def random(self):
"""
get random cookies or password
:return: random cookies or password
"""
return random.choice(self.db.hvals(self.name()))
def usernames(self):
"""
get all usernames
:return: all usernames
"""
return self.db.hkeys(self.name())
def all(self):
"""
get all key-values
:return: map of key-values
"""
return self.db.hgetall(self.name())

View File

@ -0,0 +1,13 @@
import re
def parse_redis_connection_string(connection_string):
    """
    parse a redis connection string, for example:
    redis://[password]@host:port/db
    rediss://[password]@host:port/db
    :param connection_string:
    :return: host, port, password, db
    """
    result = re.match(r'rediss?://(.*?)@(.*?):(\d+)/(\d+)', connection_string)
    if not result:
        return 'localhost', 6379, None, 0
    return result.group(2), int(result.group(3)), (result.group(1) or None), int(result.group(4))

View File

@ -0,0 +1,99 @@
apiVersion: v1
kind: Namespace
metadata:
creationTimestamp: null
name: accountpool
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: accountpool
namespace: accountpool
spec:
storageClassName: azure-file
accessModes:
- ReadWriteMany
resources:
requests:
storage: 2Gi
---
apiVersion: v1
items:
- apiVersion: v1
kind: Service
metadata:
annotations:
kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml
kompose.version: 1.20.0 ()
creationTimestamp: null
labels:
io.kompose.service: accountpool
name: accountpool
namespace: accountpool
spec:
ports:
- name: "6777"
port: 6777
targetPort: 6777
selector:
io.kompose.service: accountpool
status:
loadBalancer: {}
- apiVersion: apps/v1
kind: Deployment
metadata:
annotations:
kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml
kompose.version: 1.20.0 ()
creationTimestamp: null
labels:
io.kompose.service: accountpool
name: accountpool
namespace: accountpool
spec:
replicas: 2
revisionHistoryLimit: 1
strategy: {}
selector:
matchLabels:
io.kompose.service: accountpool
template:
metadata:
annotations:
kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml
kompose.version: 1.20.0 ()
creationTimestamp: null
labels:
io.kompose.service: accountpool
spec:
containers:
- env:
- name: REDIS_CONNECTION_STRING
valueFrom:
secretKeyRef:
name: redis
key: connection_string
- name: REDIS_PORT
value: '6379'
image: germey/accountpool:${TAG}
name: accountpool
resources:
limits:
memory: "500Mi"
cpu: "300m"
requests:
memory: "500Mi"
cpu: "300m"
ports:
- containerPort: 6777
volumeMounts:
- mountPath: "/app/accountpool/logs"
name: accountpool
restartPolicy: Always
volumes:
- name: accountpool
persistentVolumeClaim:
          claimName: accountpool
status: {}
kind: List
metadata: {}

View File

@ -0,0 +1,19 @@
version: '3'
services:
redis4accountpool:
image: redis:alpine
container_name: redis4accountpool
command: redis-server
ports:
- "6333:6379"
accountpool:
build: .
image: 'germey/accountpool'
container_name: accountpool
ports:
- "6777:6777"
environment:
REDIS_HOST: redis4accountpool
REDIS_PORT: "6379"
API_PORT: "6777"
WEBSITE: antispider7

View File

@ -0,0 +1,14 @@
from accountpool.storages.redis import RedisClient
import argparse
parser = argparse.ArgumentParser(description='AccountPool')
parser.add_argument('website', type=str, help='website')
args = parser.parse_args()
website = args.website
conn = RedisClient('account', args.website)
start = 1
end = 100
for i in range(start, end + 1):
username = password = f'admin{i}'
conn.set(username, password)

View File

@ -0,0 +1,28 @@
import argparse
from acinonyx import run
import requests
from loguru import logger
# This is a script for registering account for antispider7, using acinonyx to accelerate.
parser = argparse.ArgumentParser(description='AccountPool')
parser.add_argument('website', type=str, help='website')
args = parser.parse_args()
website = args.website
@logger.catch()
def register(username, password):
logger.debug(f'register using {username} and {password}')
response = requests.post(f'https://{website}.scrape.center/api/register', json={
'username': username,
'password': password
})
print(response.json())
if __name__ == '__main__':
accounts = []
for index in range(1, 1000):
accounts.append((f'admin{index}', f'admin{index}'))
run(register, accounts)

View File

@ -0,0 +1,8 @@
requests==2.13.0
selenium==3.4.0
redis==2.10.5
Flask==1.1.4
environs==7.2.0
loguru==0.3.2
supervisor==4.1.0
MarkupSafe==2.0.1

View File

@ -0,0 +1,15 @@
from accountpool.scheduler import Scheduler
import argparse
parser = argparse.ArgumentParser(description='AccountPool')
parser.add_argument('website', type=str, help='website')
parser.add_argument('--processor', type=str, help='processor to run')
args = parser.parse_args()
website = args.website
if __name__ == '__main__':
# if processor set, just run it
if args.processor:
getattr(Scheduler(), f'run_{args.processor}')(website)
else:
Scheduler().run(website)

View File

@ -0,0 +1,29 @@
[supervisord]
nodaemon=true
[program:tester]
process_name=tester
command=python3 run.py %(ENV_WEBSITE)s --processor tester
directory=/app
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:generator]
process_name=generator
command=python3 run.py %(ENV_WEBSITE)s --processor generator
directory=/app
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:server]
process_name=server
command=python3 run.py %(ENV_WEBSITE)s --processor server
directory=/app
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0