20231215 Crawler: update proxy-pool related content
This commit is contained in:
parent
9e4b2f4679
commit
b428606ea5
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/12/14 16:39
@Usage :
@Desc :
'''

@@ -0,0 +1,21 @@
{
    "code": 0,
    "msg": "",
    "data": {
        "count": 10,
        "dedup_count": 10,
        "order_left_count": 990,
        "proxy_list": [
            "124.172.117.189:19812",
            "219.133.31.120:26947",
            "183.237.194.145:28436",
            "183.62.172.50:23485",
            "163.125.157.243:17503",
            "183.57.42.79:26483",
            "202.103.150.70:17251",
            "182.254.129.124:15395",
            "58.251.132.181:20659",
            "112.95.241.76:21948"
        ]
    }
}

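The file above is a sample response from the Kuaidaili extraction API, kept for reference. A minimal sketch of consuming this shape defensively follows (field names are taken from the sample; treating a non-zero "code" as failure is an assumed convention of the API):

def parse_proxy_response(payload: dict) -> list:
    # The sample shows code == 0 on success; anything else is treated as an error.
    if payload.get('code') != 0:
        raise RuntimeError(f"proxy API error: {payload.get('msg')}")
    return payload.get('data', {}).get('proxy_list', [])
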
@@ -0,0 +1,45 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/12/14 16:40
@Usage :
@Desc :
'''

import requests
import json

PROXY_API = 'https://dps.kdlapi.com/api/getdps/?secret_id=oimi28znnx51x79f3r0d&num=10&signature=25zjft23etaeswom3ipa56bsyqnne347&pt=1&format=json&sep=2'


def get_proxies():
    response = requests.get(PROXY_API, timeout=10)  # timeout so a slow API call cannot hang
    res = json.loads(response.text)
    return res['data']['proxy_list']


def test_proxies():
    proxies = get_proxies()
    # Note: the proxy username and password are required here; they can be found
    # in the order center at https://www.kuaidaili.com/uc/dps/?orderid=930254289411869
    auth = "d2118699212:bxb0p3l8"
    for proxy in proxies:
        proxy = proxy.strip()
        print(f"using proxy {proxy}")
        p = {
            'http': f'http://{auth}@{proxy}',
            'https': f'http://{auth}@{proxy}',
        }
        try:
            response = requests.get('http://www.httpbin.org/ip', proxies=p, timeout=10)
            # response = requests.get('http://www.baidu.com', proxies=p)
            print(response.text)
        except requests.RequestException as e:  # also covers timeouts, not just connection errors
            print(e)
            print(f"proxy {proxy} is invalid")


if __name__ == '__main__':
    test_proxies()

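Checking proxies one at a time is slow once the list grows. A concurrent variant is sketched below (a hypothetical helper meant to live in the same script, reusing get_proxies and the same auth string; it is not part of the commit):

from concurrent.futures import ThreadPoolExecutor

import requests


def check_proxy(proxy, auth="d2118699212:bxb0p3l8"):
    # Returns (proxy, ok) so results can be printed as they complete.
    p = {'http': f'http://{auth}@{proxy}', 'https': f'http://{auth}@{proxy}'}
    try:
        requests.get('http://www.httpbin.org/ip', proxies=p, timeout=10)
        return proxy, True
    except requests.RequestException:
        return proxy, False


def test_proxies_concurrently():
    proxies = [proxy.strip() for proxy in get_proxies()]
    with ThreadPoolExecutor(max_workers=10) as pool:
        for proxy, ok in pool.map(check_proxy, proxies):
            print(f"proxy {proxy} is {'valid' if ok else 'invalid'}")
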
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/12/14 19:09
@Usage :
@Desc :
'''

@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/12/14 19:20
@Usage :
@Desc :
'''

@@ -0,0 +1,31 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/12/14 19:22
@Usage :
@Desc :
'''

from environs import Env

env = Env()
env.read_env()

REDIS_HOST = env.str('REDIS_HOST', '192.168.118.202')

REDIS_PORT = env.int('REDIS_PORT', 6379)

REDIS_PASSWORD = env.str('REDIS_PASSWORD', None)

REDIS_KEY = env.str('REDIS_KEY', 'antispider5')

PROXY_POOL_URL = env.str('PROXY_POOL_URL', 'http://127.0.0.1:5555/random')

IS_AUTH_PROXY = env.bool('IS_AUTH_PROXY', True)

TIMEOUT = env.int('TIMEOUT', 10)

MAX_FAILED_TIME = env.int('MAX_FAILED_TIME', 20)

VALID_STATUSES = env.list('VALID_STATUSES', [200])

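Each setting falls back to its default unless overridden through the environment or a .env file picked up by env.read_env(). A quick illustration of overriding one value before the module is first imported (the host name here is made up):

import os

os.environ['PROXY_POOL_URL'] = 'http://proxypool.internal:5555/random'  # hypothetical host

from core.config import PROXY_POOL_URL

print(PROXY_POOL_URL)  # -> http://proxypool.internal:5555/random
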
@@ -0,0 +1,48 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/12/14 19:26
@Usage :
@Desc :
'''

from redis import StrictRedis
from core.config import *
from pickle import dumps, loads
from core.request import MovieRequest


# Request objects cannot be stored in Redis directly; pickle's dumps and loads
# are used to serialize and deserialize them.
class RedisQueue():
    def __init__(self):
        """
        init redis connection
        """
        self.db = StrictRedis(
            host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)

    def add(self, request):
        """
        add request to queue
        :param request: request
        :return: result of rpush, or False if request is not a MovieRequest
        """
        if isinstance(request, MovieRequest):
            return self.db.rpush(REDIS_KEY, dumps(request))
        return False

    def pop(self):
        """
        get next request
        :return: Request, or False if the queue is empty
        """
        if self.db.llen(REDIS_KEY):
            return loads(self.db.lpop(REDIS_KEY))
        return False

    def clear(self):
        self.db.delete(REDIS_KEY)

    def empty(self):
        return self.db.llen(REDIS_KEY) == 0

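A quick round trip through the queue (illustrative only; assumes a reachable Redis configured by core.config):

from core.db import RedisQueue
from core.request import MovieRequest

queue = RedisQueue()
queue.clear()
queue.add(MovieRequest(url='https://antispider5.scrape.center/', callback=None))
request = queue.pop()
print(request.url, queue.empty())  # -> https://antispider5.scrape.center/ True
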
@@ -0,0 +1,20 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/12/14 19:21
@Usage :
@Desc :
'''

from core.config import *
from requests import Request


class MovieRequest(Request):
    def __init__(self, url, callback, method='GET', headers=None, fail_time=0, timeout=TIMEOUT):
        Request.__init__(self, method, url, headers)
        # Extra attributes on top of requests.Request: the callback that parses
        # the response, the failure count, and the request timeout.
        self.callback = callback
        self.fail_time = fail_time
        self.timeout = timeout

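How the spider consumes one of these (a sketch, with an example URL): Request.prepare() turns the object into a PreparedRequest that Session.send() accepts, while the extra callback/fail_time/timeout attributes travel with the object through the Redis queue.

from requests import Session
from core.request import MovieRequest

session = Session()
request = MovieRequest(url='https://httpbin.org/get', callback=print, timeout=5)
response = session.send(request.prepare(), timeout=request.timeout)
print(response.status_code)
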
@@ -0,0 +1,192 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/12/14 19:34
@Usage : The actual crawl; relies on the proxy pool service running beforehand
@Desc :
'''

import re
import requests
from urllib.parse import urljoin
from requests import Session
from requests.exceptions import RequestException
from core.config import *
from core.db import RedisQueue
from core.request import MovieRequest
from pyquery import PyQuery as pq
from loguru import logger

BASE_URL = 'https://antispider5.scrape.center/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
}


class Spider():
    session = Session()
    queue = RedisQueue()

    @logger.catch
    def get_proxy(self):
        """
        get proxy from proxypool
        :return: proxy
        """
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            logger.debug(f'get proxy {response.text}')
            return response.text

    @logger.catch
    def get_proxy_safe(self):
        """
        get proxy from proxypool
        :return: proxy
        """
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            logger.debug(f'get proxy {response.text}')
            return response.text

    def start(self):
        """
        start request
        """
        self.session.headers.update(HEADERS)
        start_url = BASE_URL
        request = MovieRequest(
            url=start_url, callback=self.parse_index)
        # schedule first request
        self.queue.add(request)

    def parse_index(self, response):
        """
        parse index page
        :param response: response
        :return: new request
        """
        doc = pq(response.text)

        # request for detail
        items = doc('.item .name').items()
        for item in items:
            detail_url = urljoin(BASE_URL, item.attr('href'))
            request = MovieRequest(
                url=detail_url, callback=self.parse_detail)
            yield request

        # request for next page
        next_href = doc('.next').attr('href')
        if next_href:
            next_url = urljoin(BASE_URL, next_href)
            request = MovieRequest(
                url=next_url, callback=self.parse_index)
            yield request

    def parse_detail(self, response):
        """
        parse detail
        :param response: response of detail
        :return: data
        """
        doc = pq(response.text)
        cover = doc('img.cover').attr('src')
        name = doc('a > h2').text()
        categories = [item.text()
                      for item in doc('.categories button span').items()]
        published_at = doc('.info:contains(上映)').text()
        published_at = re.search(r'(\d{4}-\d{2}-\d{2})', published_at).group(1) \
            if published_at and re.search(r'\d{4}-\d{2}-\d{2}', published_at) else None
        drama = doc('.drama p').text()
        score = doc('p.score').text()
        score = float(score) if score else None
        yield {
            'cover': cover,
            'name': name,
            'categories': categories,
            'published_at': published_at,
            'drama': drama,
            'score': score
        }

    def request(self, request):
        """
        execute request
        :param request: movie request
        :return: response
        """
        try:
            proxy = self.get_proxy()
            logger.debug(f'get proxy {proxy}')
            proxies = None
            auth = "d2118699212:bxb0p3l8"
            if proxy:
                if IS_AUTH_PROXY:
                    proxies = {
                        'http': f'http://{auth}@{proxy}',
                        'https': f'http://{auth}@{proxy}',
                    }
                else:
                    proxies = {
                        # the proxy itself is a plain HTTP endpoint for both schemes
                        'http': 'http://' + proxy,
                        'https': 'http://' + proxy
                    }
            return self.session.send(request.prepare(),
                                     timeout=request.timeout,
                                     proxies=proxies)
        except RequestException:
            logger.exception(f'requesting {request.url} failed')

    def error(self, request):
        """
        error handling
        :param request: request
        :return:
        """
        request.fail_time = request.fail_time + 1
        logger.debug(
            f'request of {request.url} failed {request.fail_time} times')
        if request.fail_time < MAX_FAILED_TIME:
            self.queue.add(request)

    def schedule(self):
        """
        schedule request
        :return:
        """
        # keep popping requests from the queue and crawling until it is empty
        while not self.queue.empty():
            request = self.queue.pop()
            callback = request.callback
            logger.debug(f'executing request {request.url}')
            response = self.request(request)
            logger.debug(f'response {response} of {request.url}')
            if not response or response.status_code not in VALID_STATUSES:
                self.error(request)
                continue
            results = list(callback(response))
            if not results:
                self.error(request)
                continue
            for result in results:
                if isinstance(result, MovieRequest):
                    logger.debug(f'generated new request {result.url}')
                    self.queue.add(result)
                if isinstance(result, dict):
                    # reaching this point means the scrape succeeded; the data
                    # can be persisted here
                    logger.debug(f'scraped new data {result}')

    def run(self):
        """
        run
        :return:
        """
        self.start()
        self.schedule()


if __name__ == '__main__':
    spider = Spider()
    spider.run()

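The schedule loop above only logs scraped dicts. A minimal persistence hook for the spot marked "can be persisted here" might look like this (writing JSON lines to a local file; the file name is illustrative, not part of the commit):

import json

def save_item(result, path='movies.jsonl'):
    # Append one scraped movie per line so partial runs are never lost.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
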
@@ -0,0 +1,8 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/12/14 12:57
@Usage :
@Desc :
'''

@@ -22,5 +22,19 @@ async def main():
            print(await response.text())


async def socks():
    from aiohttp_socks import ProxyConnector, ProxyType
    connector = ProxyConnector(
        proxy_type=ProxyType.HTTP,
        host='127.0.0.1',
        port=7890,
        # username='user',
        # password='password',
        # rdns=True
    )
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get('https://httpbin.org/get') as response:
            print(await response.text())

if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())

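For a plain HTTP proxy, aiohttp can also take the proxy directly on the request, without aiohttp_socks (a sketch assuming the same local proxy address; the proxy argument does not support SOCKS):

import asyncio

import aiohttp


async def http_proxy():
    async with aiohttp.ClientSession() as session:
        async with session.get('https://httpbin.org/get',
                               proxy='http://127.0.0.1:7890') as response:
            print(await response.text())

# asyncio.run(http_proxy())
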
@@ -0,0 +1,45 @@
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date : 2023/12/14 12:39
@Usage :
@Desc :
'''

from playwright.sync_api import sync_playwright


def http():
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False, proxy={
            'server': 'http://127.0.0.1:7890'
        })
        page = browser.new_page()
        page.goto('https://httpbin.org/get')
        print(page.content())
        browser.close()


def http_auth():
    with sync_playwright() as p:
        browser = p.chromium.launch(proxy={
            'server': 'http://127.0.0.1:7890',
            'username': 'foo',
            'password': 'bar'
        })
        page = browser.new_page()
        page.goto('https://httpbin.org/get')
        print(page.content())
        browser.close()


def socks():
    with sync_playwright() as p:
        browser = p.chromium.launch(proxy={
            'server': 'socks5://127.0.0.1:7891'
        })
        page = browser.new_page()
        page.goto('https://httpbin.org/get')
        print(page.content())
        browser.close()

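None of the three helpers is invoked by the file itself; a typical entry point would be (an assumption, not part of the commit):

if __name__ == '__main__':
    http()
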