self_example/Spider/Chapter08_验证码的识别/OCR技术识别图形验证码/demo3爬取实战.py

65 lines
2.1 KiB
Python

# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/11 19:53
@Usage :
@Desc : Use Selenium and OCR to attempt to crawl https://captcha7.scrape.center/
@Reference: https://github.com/Python3WebSpider/CrackImageCaptcha/blob/master/main.py
'''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from io import BytesIO
from PIL import Image
import numpy as np
import tesserocr
import re
from retrying import retry
import time
# Module-level Chrome WebDriver used by login(); it is created as soon as
# this module is executed, including when it is merely imported.
browser = webdriver.Chrome()
# Pre-processing step intended to improve the OCR recognition rate.
def preProcess(image, threshold=50):
    """Binarise an image to pure black and white before OCR.

    The image is converted to 8-bit greyscale (mode ``'L'``); every pixel
    brighter than ``threshold`` becomes white (255) and every other pixel
    becomes black (0).

    :param image: PIL image to clean up.
    :param threshold: greyscale cut-off (0-255). Defaults to 50, the value
        that used to be hard-coded, so existing callers are unaffected.
    :return: a new PIL image built from the thresholded ``uint8`` array.
    """
    grey = np.array(image.convert('L'))
    # Strictly greater-than: a pixel exactly at the threshold turns black.
    binary = np.where(grey > threshold, 255, 0)
    # np.where produces a wide integer dtype; cast to uint8 for PIL.
    return Image.fromarray(binary.astype('uint8'))
@retry(stop_max_attempt_number=10, retry_on_result=lambda x: x is False)
def login():
    """Make one attempt to log in to https://captcha7.scrape.center/ via OCR.

    Loads the login page in the module-level ``browser``, types ``admin`` as
    both username and password, screenshots the ``#captcha`` element, runs
    the screenshot through tesserocr, types the recognised text into the
    captcha field and clicks the login button.

    The ``@retry`` decorator re-runs this function whenever it returns
    ``False``, for at most 10 attempts.

    :return: ``True`` once the "登录成功" heading is found within 10 seconds
        (the browser window is then closed after a 5-second pause);
        ``False`` if OCR produced no usable text or the wait timed out.
    """
    browser.get('https://captcha7.scrape.center/')

    # Fill in the credentials with send_keys.
    browser.find_element(By.CSS_SELECTOR, ".username input[type='text']").send_keys('admin')
    browser.find_element(By.CSS_SELECTOR, ".password input[type='password']").send_keys('admin')

    # Screenshot just the captcha element and load it as a PIL image.
    captcha_element = browser.find_element(By.CSS_SELECTOR, "#captcha")
    image = Image.open(BytesIO(captcha_element.screenshot_as_png))

    # Run OCR once and reuse the result for both debug prints.
    captcha = tesserocr.image_to_text(image)
    print("处理前:", captcha)
    # Binarising with preProcess(image) before OCR is currently disabled,
    # so the "处理后" line shows the same text as the line above.
    print("处理后:", captcha)

    # Strip every whitespace character, not only spaces: OCR output can
    # contain newlines/tabs, and a newline handed to send_keys may be typed
    # as Enter and submit the form prematurely.
    captcha = re.sub(r'\s+', '', captcha)
    print("模式匹配后", captcha)

    if not captcha:
        # Nothing was recognised; skip the pointless submit and 10-second
        # wait and let @retry start a fresh attempt.
        return False

    browser.find_element(By.CSS_SELECTOR, ".captcha input[type='text']").send_keys(captcha)
    # Click the login button.
    browser.find_element(By.CSS_SELECTOR, '.login').click()

    try:
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, '//h2[contains(.,"登录成功")]')))
    except TimeoutException:
        # Success heading never appeared; returning False triggers a retry.
        return False

    time.sleep(5)
    browser.close()
    return True
if __name__ == '__main__':
    try:
        login()
    finally:
        # Always end the WebDriver session, even if login() raises (e.g.
        # once its retries are exhausted). login() only close()s the window
        # on success; quit() is what shuts the driver down.
        browser.quit()