65 lines
2.1 KiB
Python
65 lines
2.1 KiB
Python
# -*- encoding:utf-8 -*-
|
|
|
|
'''
|
|
@Author : dingjiawen
|
|
@Date : 2023/12/11 19:53
|
|
@Usage :
|
|
@Desc : 尝试使用selenium和ocr技术去爬取 https://captcha7.scrape.center/
|
|
@参考:https://github.com/Python3WebSpider/CrackImageCaptcha/blob/master/main.py
|
|
'''
|
|
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.wait import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.common.exceptions import TimeoutException
|
|
from io import BytesIO
|
|
from PIL import Image
|
|
import numpy as np
|
|
import tesserocr
|
|
import re
|
|
from retrying import retry
|
|
import time
|
|
|
|
# Chrome WebDriver instance shared by login(); it is created as soon as this
# module is executed, because the assignment sits at module level.
browser = webdriver.Chrome()
|
|
|
|
|
|
# 预处理,提高图片识别率
|
|
def preProcess(image, threshold=50):
    """Binarise a captcha image so that OCR has a cleaner input.

    The image is converted to 8-bit greyscale; every pixel strictly brighter
    than ``threshold`` becomes white (255) and every other pixel becomes
    black (0).

    Args:
        image: A PIL image (any object exposing ``convert('L')``).
        threshold: Greyscale cut-off, 0-255. Defaults to 50, the value that
            used to be hard-coded in this function.

    Returns:
        A new PIL image built from the thresholded uint8 array.
    """
    grey = np.array(image.convert('L'))
    binary = np.where(grey > threshold, 255, 0)
    # np.where produces a default-width integer array; cast down to uint8
    # before handing the data back to Pillow.
    return Image.fromarray(binary.astype('uint8'))
|
|
|
|
|
|
@retry(stop_max_attempt_number=10, retry_on_result=lambda x: x is False)
def login():
    """Run one login attempt against https://captcha7.scrape.center/.

    Fills in the demo credentials, screenshots the captcha element, reads it
    with tesserocr, submits the form and waits for the success heading.
    The ``retry`` decorator re-invokes this function (at most 10 attempts)
    whenever it returns False.

    Returns:
        True when the success heading appeared within 10 seconds; the browser
        window is closed before returning. False when the OCR produced no
        characters or the success heading did not appear in time.
    """
    browser.get('https://captcha7.scrape.center/')

    # Type the credentials into the form.
    browser.find_element(By.CSS_SELECTOR, ".username input[type='text']").send_keys('admin')
    browser.find_element(By.CSS_SELECTOR, ".password input[type='password']").send_keys('admin')

    # Screenshot only the captcha element and load it as a PIL image.
    captcha_element = browser.find_element(By.CSS_SELECTOR, "#captcha")
    image = Image.open(BytesIO(captcha_element.screenshot_as_png))

    # NOTE: binarisation through preProcess() is currently disabled, so the
    # "before" and "after" texts are the same OCR result; run the OCR once.
    raw_text = tesserocr.image_to_text(image)
    print("处理前:", raw_text)
    print("处理后:", raw_text)

    # Strip every kind of whitespace, not just spaces: a newline left in the
    # OCR text would be typed into the field, where it acts as pressing Enter.
    captcha = re.sub(r'\s+', '', raw_text)
    print("模式匹配后", captcha)

    if not captcha:
        # Nothing was recognised; submitting would only burn the 10 s wait.
        return False

    browser.find_element(By.CSS_SELECTOR, ".captcha input[type='text']").send_keys(captcha)
    # Click the login button.
    browser.find_element(By.CSS_SELECTOR, '.login').click()

    try:
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, '//h2[contains(.,"登录成功")]'))
        )
    except TimeoutException:
        # Success heading never showed up; report failure so retry runs again.
        return False

    # Pause before closing (presumably so the success page can be seen).
    time.sleep(5)
    browser.close()
    return True
|
|
|
|
|
|
if __name__ == '__main__':
    try:
        login()
    finally:
        # Always shut down the browser and its driver process, including when
        # every retry attempt failed or an exception propagated out of login().
        browser.quit()