96 lines
3.0 KiB
Python
96 lines
3.0 KiB
Python
# -*- coding: UTF-8 -*-
|
||
|
||
'''
|
||
'''
|
||
import requests
|
||
import time
|
||
import threading
|
||
import urllib3
|
||
from fake_headers import Headers
|
||
import uuid
|
||
from geolite2 import geolite2
|
||
ips = []
|
||
|
||
# Thread class that crawls data
|
||
|
||
def getChinaIP(ip='127.0.0.1', reader=None):
    """Return True when the GeoLite2 record for *ip* has country code 'CN'.

    Args:
        ip: IP address string to look up.
        reader: Optional object exposing ``get(ip)`` that returns the record
            dict (or ``None`` when there is no record). When omitted, the
            shared ``geolite2`` reader is opened for this call and closed
            again afterwards.

    Returns:
        bool: True only if the record contains ``country -> iso_code == 'CN'``.
        False when the lookup yields no record or the record carries no
        country entry.
    """
    owns_reader = reader is None
    if owns_reader:
        reader = geolite2.reader()
    try:
        ip_info = reader.get(ip)
    finally:
        # Release the reader we opened even if the lookup itself raises.
        if owns_reader:
            geolite2.close()

    print(ip_info)

    # A lookup may come back empty, or without a 'country' key; treat both
    # as "not CN" instead of failing with TypeError / KeyError.
    country = (ip_info or {}).get('country') or {}
    return country.get('iso_code') == 'CN'
|
||
|
||
|
||
|
||
class CrawlThread(threading.Thread):
    """Thread that issues one GET request to ``targetUrl`` through a proxy.

    Args:
        proxyip: Proxy address as ``"host:port"``. Surrounding whitespace
            (including a trailing ``'\\r'``) is removed.
    """

    def __init__(self, proxyip):
        super(CrawlThread, self).__init__()
        # Entries come from splitting a text response on '\n', so they can
        # carry stray whitespace that would corrupt the proxy URL.
        self.proxyip = proxyip.strip()

    def run(self):
        """Check the proxy's country, send the request and print the result.

        Raises:
            ValueError: if ``getChinaIP`` reports the proxy host as not CN.
            Exceptions raised by ``requests`` or by decoding the response
            body are not handled here and propagate to the caller.
        """
        pure_ip_address = self.proxyip.split(':')[0]

        # Verify which country the IP belongs to.
        if not getChinaIP(pure_ip_address):
            raise ValueError('不是有效IP')

        # Start timing.
        start = time.time()

        # Silence the warning emitted because certificate verification is off.
        urllib3.disable_warnings()

        # Start from headers generated by fake_headers, then set the
        # request-specific fields.
        headers = Headers(headers=True).generate()
        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
        headers['Pragma'] = 'no-cache'
        headers['Host'] = 'bb.cf08tp.cn'
        headers['x-forward-for'] = pure_ip_address
        headers['Cookie'] = 'PHPSESSID={}'.format(
            ''.join(str(uuid.uuid1()).split('-')))
        print(headers)

        response = requests.get(
            headers=headers,
            url=targetUrl,
            proxies={
                "http": 'http://' + self.proxyip,
                "https": 'https://' + self.proxyip,
            },
            verify=False,
            timeout=2,
        )
        html = response.content.decode()

        # Stop timing. time.time() differences are in seconds, while the
        # message below is labelled in milliseconds, so convert.
        end = time.time()
        elapsed_ms = round((end - start) * 1000, 1)

        # Print the outcome. The name is that of the thread executing this
        # method, which is not this object when run() is called directly.
        print(threading.current_thread().name + "使用代理IP, 耗时 " + str(elapsed_ms) +
              "毫秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************")
|
||
|
||
# Thread class that fetches proxy IPs
|
||
|
||
|
||
class GetIpThread(threading.Thread):
    """Thread that repeatedly fetches proxy addresses from ``apiUrl``.

    Each non-blank line of the API response is handed to a ``CrawlThread``,
    which is executed synchronously (``run()``, not ``start()``).

    Args:
        fetchSecond: Divisor for the pause after each round; the loop sleeps
            ``len(ips) / fetchSecond`` seconds. Must be non-zero.
    """

    def __init__(self, fetchSecond):
        super(GetIpThread, self).__init__()
        self.fetchSecond = fetchSecond

    def run(self):
        """Loop forever: fetch the proxy list, use each entry, then sleep."""
        global ips
        while True:
            # Fetch the IP list. A timeout and an error handler keep one
            # failed API call from ending the whole loop.
            try:
                response = requests.get(apiUrl, timeout=10)
            except requests.RequestException as e:
                print(e)
                time.sleep(1.5)
                continue
            res = response.content.decode()

            # Split the response into one entry per line.
            ips = res.split('\n')

            # Use every IP in turn.
            for line in ips:
                # Strip before use: entries may end in '\r' or spaces.
                proxyip = line.strip()
                if not proxyip:
                    continue
                try:
                    CrawlThread(proxyip).run()
                    time.sleep(1.5)
                except Exception as e:
                    # Best effort: report the failure and move on.
                    print(e)

            # Pause before the next round.
            time.sleep(len(ips) / self.fetchSecond)
|
||
|
||
|
||
if __name__ == '__main__':
    # Value passed to GetIpThread, which divides its per-round pause by it.
    fetchSecond = 5

    # Address of the target page to request.
    # Alternative: 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
    targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp='

    # API endpoint that returns proxy IPs.
    # Alternative: 'http://127.0.0.1:5555/all'
    apiUrl = 'http://127.0.0.1:5555/random'

    # Start fetching IPs automatically.
    GetIpThread(fetchSecond).start()
|