2023-12-15 crawler: update proxy pool related content

This commit is contained in:
kevinding1125 2023-12-15 14:29:31 +08:00
parent 9e4b2f4679
commit b428606ea5
12 changed files with 448 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
#-*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/14 16:39
@Usage :
@Desc :
'''

View File

@@ -0,0 +1,21 @@
{
"code": 0,
"msg": "",
"data": {
"count": 10,
"dedup_count": 10,
"order_left_count": 990,
"proxy_list": [
"124.172.117.189:19812",
"219.133.31.120:26947",
"183.237.194.145:28436",
"183.62.172.50:23485",
"163.125.157.243:17503",
"183.57.42.79:26483",
"202.103.150.70:17251",
"182.254.129.124:15395",
"58.251.132.181:20659",
"112.95.241.76:21948"
]
}
}

View File

@@ -0,0 +1,45 @@
# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/14 16:40
@Usage :
@Desc :
'''
import requests
import json
PROXY_API = 'https://dps.kdlapi.com/api/getdps/?secret_id=oimi28znnx51x79f3r0d&num=10&signature=25zjft23etaeswom3ipa56bsyqnne347&pt=1&format=json&sep=2'
def get_proxies():
response = requests.get(PROXY_API)
res = json.loads(response.text)
return res['data']['proxy_list']
def test_proxies():
proxies = get_proxies()
    # Note: the proxy requires a username and password here; they can be found in the order center: https://www.kuaidaili.com/uc/dps/?orderid=930254289411869
auth = "d2118699212:bxb0p3l8"
for proxy in proxies:
proxy = proxy.strip()
print(f"using proxy {proxy}")
p = {
'http': f'http://{auth}@{proxy}',
'https': f'http://{auth}@{proxy}',
}
try:
response = requests.get('http://www.httpbin.org/ip', proxies=p)
# response = requests.get('http://www.baidu.com', proxies=p)
print(response.text)
except requests.ConnectionError as e:
print(e)
print(f"proxy {proxy} is invalid")
if __name__ == '__main__':
test_proxies()

View File

@@ -0,0 +1,8 @@
#-*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/14 19:09
@Usage :
@Desc :
'''

View File

@@ -0,0 +1,8 @@
#-*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/14 19:20
@Usage :
@Desc :
'''

View File

@@ -0,0 +1,31 @@
# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/14 19:22
@Usage :
@Desc :
'''
from environs import Env
env = Env()
env.read_env()
REDIS_HOST = env.str('REDIS_HOST', '192.168.118.202')
REDIS_PORT = env.int('REDIS_PORT', 6379)
REDIS_PASSWORD = env.str('REDIS_PASSWORD', None)
REDIS_KEY = env.str('REDIS_KEY', 'antispider5')
PROXY_POOL_URL = env.str('PROXY_POOL_URL', 'http://127.0.0.1:5555/random')
IS_AUTH_PROXY = env.bool('IS_AUTH_PROXY', True)
TIMEOUT = env.int('TIMEOUT', 10)
MAX_FAILED_TIME = env.int('MAX_FAILED_TIME', 20)
VALID_STATUSES = env.list('VALID_STATUSES', [200], subcast=int)  # subcast keeps environment overrides as ints so status-code checks keep working
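
These settings all have defaults, so the module works out of the box; environs resolves overrides from the process environment (and from a local .env file loaded by read_env()). A minimal sketch of overriding them, with hypothetical values:

# Sketch (hypothetical values): variables set in the environment before
# core.config is imported take precedence over the defaults above.
import os

os.environ['REDIS_HOST'] = '127.0.0.1'
os.environ['MAX_FAILED_TIME'] = '5'

from core.config import REDIS_HOST, MAX_FAILED_TIME

print(REDIS_HOST, MAX_FAILED_TIME)  # -> 127.0.0.1 5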

View File

@@ -0,0 +1,48 @@
#-*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/14 19:26
@Usage :
@Desc :
'''
from redis import StrictRedis
from core.config import *
from pickle import dumps, loads
from core.request import MovieRequest
# Request objects cannot be stored in Redis directly; they are serialized and deserialized with pickle's dumps and loads.
class RedisQueue():
def __init__(self):
"""
init redis connection
"""
self.db = StrictRedis(
host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)
def add(self, request):
"""
add request to queue
        :param request: MovieRequest to enqueue
        :return: push result, or False if request is not a MovieRequest
"""
if isinstance(request, MovieRequest):
return self.db.rpush(REDIS_KEY, dumps(request))
return False
def pop(self):
"""
get next request
        :return: MovieRequest, or False if the queue is empty
"""
if self.db.llen(REDIS_KEY):
return loads(self.db.lpop(REDIS_KEY))
return False
def clear(self):
self.db.delete(REDIS_KEY)
def empty(self):
return self.db.llen(REDIS_KEY) == 0
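
The comment above is the point of this module: a MovieRequest (including the reference to its callback) is pickled on the way into Redis and unpickled on the way out. A minimal usage sketch, assuming Redis is reachable with the settings from core.config (the URL and the parse function below are placeholders for illustration):

from core.db import RedisQueue
from core.request import MovieRequest

def parse(response):
    # placeholder callback; pickle stores it by reference, so it has to be
    # importable again when the request is popped and deserialized
    print(response.status_code)

queue = RedisQueue()
queue.clear()
queue.add(MovieRequest(url='https://antispider5.scrape.center/', callback=parse))

restored = queue.pop()
print(restored.url, restored.fail_time, restored.timeout)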

View File

@@ -0,0 +1,20 @@
# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/14 19:21
@Usage :
@Desc :
'''
from core.config import *
from requests import Request
class MovieRequest(Request):
def __init__(self, url, callback, method='GET', headers=None, fail_time=0, timeout=TIMEOUT):
Request.__init__(self, method, url, headers)
        # extra attributes: the callback function, the failure count and the timeout
self.callback = callback
self.fail_time = fail_time
self.timeout = timeout
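
Because MovieRequest subclasses requests.Request, it can be prepared and sent like any other request while the extra attributes ride along for the scheduler. A minimal sketch of that flow (the URL and callback are placeholders, not part of the project):

from requests import Session
from core.request import MovieRequest

session = Session()
req = MovieRequest(url='https://httpbin.org/get', callback=print)

# prepare_request() merges session-level settings such as headers into the
# PreparedRequest; callback, fail_time and timeout stay on the MovieRequest
response = session.send(session.prepare_request(req), timeout=req.timeout)
req.callback(response.status_code)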

View File

@@ -0,0 +1,192 @@
# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/14 19:34
@Usage : the actual crawler; relies on the previously built proxy pool service being up
@Desc :
'''
import re
import requests
from urllib.parse import urljoin
from requests import Session
from requests.exceptions import RequestException
from core.config import *
from core.db import RedisQueue
from core.request import MovieRequest
from pyquery import PyQuery as pq
from loguru import logger
BASE_URL = 'https://antispider5.scrape.center/'
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
}
class Spider():
session = Session()
queue = RedisQueue()
@logger.catch
def get_proxy(self):
"""
get proxy from proxypool
:return: proxy
"""
response = requests.get(PROXY_POOL_URL)
if response.status_code == 200:
logger.debug(f'get proxy {response.text}')
return response.text
@logger.catch
def get_proxy_safe(self):
"""
get proxy from proxypool
:return: proxy
"""
response = requests.get(PROXY_POOL_URL)
if response.status_code == 200:
logger.debug(f'get proxy {response.text}')
return response.text
def start(self):
"""
start request
"""
self.session.headers.update(HEADERS)
start_url = BASE_URL
request = MovieRequest(
url=start_url, callback=self.parse_index)
# schedule first request
self.queue.add(request)
def parse_index(self, response):
"""
parse index page
:param response: response
:return: new request
"""
doc = pq(response.text)
# request for detail
items = doc('.item .name').items()
for item in items:
detail_url = urljoin(BASE_URL, item.attr('href'))
request = MovieRequest(
url=detail_url, callback=self.parse_detail)
yield request
# request for next page
next_href = doc('.next').attr('href')
if next_href:
next_url = urljoin(BASE_URL, next_href)
request = MovieRequest(
url=next_url, callback=self.parse_index)
yield request
def parse_detail(self, response):
"""
parse detail
:param response: response of detail
:return: data
"""
doc = pq(response.text)
cover = doc('img.cover').attr('src')
name = doc('a > h2').text()
categories = [item.text()
for item in doc('.categories button span').items()]
published_at = doc('.info:contains(上映)').text()
        published_at = re.search(r'(\d{4}-\d{2}-\d{2})', published_at).group(1) \
            if published_at and re.search(r'\d{4}-\d{2}-\d{2}', published_at) else None
drama = doc('.drama p').text()
score = doc('p.score').text()
score = float(score) if score else None
yield {
'cover': cover,
'name': name,
'categories': categories,
'published_at': published_at,
'drama': drama,
'score': score
}
def request(self, request):
"""
execute request
        :param request: MovieRequest to execute
:return: response
"""
try:
proxy = self.get_proxy()
logger.debug(f'get proxy {proxy}')
proxies = None
auth = "d2118699212:bxb0p3l8"
if proxy:
if IS_AUTH_PROXY:
proxies = {
'http': f'http://{auth}@{proxy}',
'https': f'http://{auth}@{proxy}',
}
else:
proxies = {
'http': 'http://' + proxy,
'https': 'https://' + proxy
}
            # prepare through the session so session-level headers (the UA set in start()) are merged in
            return self.session.send(self.session.prepare_request(request),
                                     timeout=request.timeout,
                                     proxies=proxies)
except RequestException:
logger.exception(f'requesting {request.url} failed')
def error(self, request):
"""
error handling
:param request: request
:return:
"""
request.fail_time = request.fail_time + 1
logger.debug(
f'request of {request.url} failed {request.fail_time} times')
if request.fail_time < MAX_FAILED_TIME:
self.queue.add(request)
def schedule(self):
"""
schedule request
:return:
"""
        # keep pulling requests from the queue and crawling until it is empty
while not self.queue.empty():
request = self.queue.pop()
callback = request.callback
logger.debug(f'executing request {request.url}')
response = self.request(request)
logger.debug(f'response status {response} of {request.url}')
            if not response or response.status_code not in VALID_STATUSES:
self.error(request)
continue
results = list(callback(response))
if not results:
self.error(request)
continue
for result in results:
if isinstance(result, MovieRequest):
logger.debug(f'generated new request {result.url}')
self.queue.add(result)
if isinstance(result, dict):
                    # reaching this point means the item was scraped successfully; it could be saved here
logger.debug(f'scraped new data {result}')
def run(self):
"""
run
:return:
"""
self.start()
self.schedule()
if __name__ == '__main__':
spider = Spider()
spider.run()
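
The dict branch in schedule() currently only logs scraped items; the comment there notes they could be saved. One minimal, hypothetical way to persist them as JSON lines (save() is not part of this commit):

import json

def save(item, path='movies.jsonl'):
    """Append one scraped item as a JSON line (hypothetical helper)."""
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

Calling save(result) next to the logger.debug call in the dict branch would be enough to keep every scraped movie.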

View File

@@ -0,0 +1,8 @@
#-*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/14 12:57
@Usage :
@Desc :
'''

View File

@@ -22,5 +22,19 @@ async def main():
print(await response.text())
async def socks():
from aiohttp_socks import ProxyConnector, ProxyType
connector = ProxyConnector(
proxy_type=ProxyType.HTTP,
host='127.0.0.1',
port=7890,
# username='user',
# password='password',
# rdns=True
)
async with aiohttp.ClientSession(connector=connector) as session:
async with session.get('https://httpbin.org/get') as response:
print(await response.text())
if __name__ == '__main__':
asyncio.get_event_loop().run_until_complete(main())

View File

@@ -0,0 +1,45 @@
# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/14 12:39
@Usage :
@Desc :
'''
from playwright.sync_api import sync_playwright
def http():
with sync_playwright() as p:
browser = p.chromium.launch(headless=False, proxy={
'server': 'http://127.0.0.1:7890'
})
page = browser.new_page()
page.goto('https://httpbin.org/get')
print(page.content())
browser.close()
def http_auth():
with sync_playwright() as p:
browser = p.chromium.launch(proxy={
'server': 'http://127.0.0.1:7890',
'username': 'foo',
'password': 'bar'
})
page = browser.new_page()
page.goto('https://httpbin.org/get')
print(page.content())
browser.close()
def socks():
with sync_playwright() as p:
browser = p.chromium.launch(proxy={
'server': 'socks5://127.0.0.1:7891'
})
page = browser.new_page()
page.goto('https://httpbin.org/get')
print(page.content())
browser.close()