# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/12/11 19:53
@Usage :
@Desc : Try to use selenium and OCR to log in to https://captcha7.scrape.center/
@Reference: https://github.com/Python3WebSpider/CrackImageCaptcha/blob/master/main.py
'''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from io import BytesIO
from PIL import Image
import numpy as np
import tesserocr
import re
from retrying import retry
import time

# Shared WebDriver instance used by login(). It is created at import time,
# exactly as in the original script, so existing users of `browser` keep working.
browser = webdriver.Chrome()


def preProcess(image, threshold=50):
    """Binarize a captcha image to improve the OCR recognition rate.

    The image is converted to greyscale ('L' mode); every pixel brighter than
    ``threshold`` becomes 255 and every other pixel becomes 0.

    Args:
        image: PIL image to clean up.
        threshold: Grey level used as the cut-off (default 50).

    Returns:
        A new 8-bit PIL image containing only the values 0 and 255.
    """
    grey = image.convert('L')
    pixels = np.array(grey)
    binary = np.where(pixels > threshold, 255, 0)
    return Image.fromarray(binary.astype('uint8'))


@retry(stop_max_attempt_number=10, retry_on_result=lambda x: x is False)
def login(preprocess=False):
    """Fill in the login form, solve the captcha with OCR and submit it.

    The ``retry`` decorator calls this function again (up to 10 attempts in
    total) whenever it returns ``False``.

    Args:
        preprocess: When true, run ``preProcess`` on the captcha screenshot
            before the OCR pass whose result is submitted. Defaults to
            ``False``, which submits the OCR result of the raw screenshot.

    Returns:
        ``True`` once the success heading shows up (the browser window is
        closed in that case), ``False`` when OCR produced no usable text or
        the success heading did not appear within 10 seconds.
    """
    browser.get('https://captcha7.scrape.center/')

    # Type the credentials with send_keys.
    browser.find_element(By.CSS_SELECTOR, ".username input[type='text']").send_keys('admin')
    browser.find_element(By.CSS_SELECTOR, ".password input[type='password']").send_keys('admin')

    # Screenshot only the captcha element and load it as a PIL image.
    captcha_element = browser.find_element(By.CSS_SELECTOR, "#captcha")
    image = Image.open(BytesIO(captcha_element.screenshot_as_png))

    # OCR the raw screenshot once and reuse the result instead of running the
    # engine twice on the same image.
    raw_text = tesserocr.image_to_text(image)
    print("处理前:", raw_text)

    if preprocess:
        # Optional clean-up pass to improve the recognition rate.
        captcha = tesserocr.image_to_text(preProcess(image))
    else:
        captcha = raw_text
    print("处理后:", captcha)

    # Strip ALL whitespace, not only spaces: a newline left in the OCR output
    # would be typed into the input as Enter and could submit the form before
    # the login button is clicked.
    captcha = re.sub(r'\s+', '', captcha)
    print("模式匹配后", captcha)

    if not captcha:
        # Nothing was recognised; skip the pointless submit and 10 s wait and
        # let the retry decorator start a fresh attempt.
        return False

    browser.find_element(By.CSS_SELECTOR, ".captcha input[type='text']").send_keys(captcha)

    # Click the login button.
    browser.find_element(By.CSS_SELECTOR, '.login').click()

    try:
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, '//h2[contains(.,"登录成功")]'))
        )
    except TimeoutException:
        return False

    # Keep the success page visible for a moment before closing the window.
    time.sleep(5)
    browser.close()
    return True


if __name__ == '__main__':
    try:
        login()
    finally:
        # Always end the WebDriver session, even when every attempt failed or
        # an exception escaped; close() alone is only reached on success.
        browser.quit()