# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date   : 2023/12/7 19:32
@Usage  : run as a script; opens Chrome, loads the page and prints each item name
@Desc   : rebuilds item names whose characters are emitted out of order in the
          HTML and positioned visually through per-character inline styles
'''
import re

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Compiled once; a raw string avoids the invalid "\d" escape warning.
_PX_VALUE_RE = re.compile(r'(\d+)px')


def _left_offset(char):
    """Return the first ``<digits>px`` value of *char*'s inline style as an int.

    The rest of this module treats that value as the character's horizontal
    (left) position.

    Raises:
        ValueError: if the element has no ``style`` attribute or the style
            contains no ``<digits>px`` value.
    """
    style = char.attr('style') or ''
    match = _PX_VALUE_RE.search(style)
    if match is None:
        raise ValueError(
            'no "<digits>px" offset found in style: {!r}'.format(style))
    return int(match.group(1))


# Parse the name: sort the characters to recover the correct order.
def parse_name(name_html):
    """Rebuild a name from its ``.char`` children, ordered by pixel offset.

    Args:
        name_html: PyQuery selection wrapping one name element.

    Returns:
        The stripped text of every ``.char`` descendant, concatenated in
        ascending offset order. Characters sharing an offset keep their
        document order because the sort is stable.

    Raises:
        ValueError: if a ``.char`` element carries no pixel offset.
    """
    items = [
        {'text': char.text().strip(), 'left': _left_offset(char)}
        for char in name_html('.char').items()
    ]
    items.sort(key=lambda item: item['left'])
    return ''.join(item['text'] for item in items)


# If the name is already whole, skip the per-character reordering.
def parse_name_whole(name_html):
    """Return the name text, reordering characters only when necessary.

    When the selection contains a ``.whole`` element the text of
    ``name_html`` is returned as-is; otherwise the name is rebuilt from its
    ``.char`` children via :func:`parse_name`.
    """
    if name_html('.whole'):
        return name_html.text()
    return parse_name(name_html)


browser = webdriver.Chrome()
try:
    browser.get('https://antispider3.scrape.center/')
    # Block (up to 10 s) until at least one ".item" element is in the DOM.
    WebDriverWait(browser, 10) \
        .until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.item')))
    html = browser.page_source
    doc = pq(html)
    names = doc('.item .name')
    for name_html in names.items():
        name = parse_name_whole(name_html)
        print(name)
finally:
    # quit() ends the WebDriver session (not just the current window), and
    # the finally clause guarantees this even when the wait times out.
    browser.quit()