60 lines
1.7 KiB
Python
60 lines
1.7 KiB
Python
# -*- encoding:utf-8 -*-
|
|
|
|
'''
@Author : dingjiawen
@Date : 2023/12/7 19:32
@Usage :
@Desc :
'''
|
|
|
|
from selenium import webdriver
|
|
from pyquery import PyQuery as pq
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.webdriver.support.wait import WebDriverWait
|
|
import re
|
|
|
|
|
|
# Parse a name: sort its characters to recover the correct order
|
|
def parse_name(name_html):
    """Rebuild a name whose characters appear out of order in the markup.

    Every ``.char`` element selected from ``name_html`` contributes its
    stripped text and the first ``<digits>px`` value found in its inline
    ``style``. The texts are concatenated in ascending order of that value.

    Args:
        name_html: Selection for one name element (the script passes PyQuery
            objects). It is called with the selector ``'.char'``; the result
            must provide ``items()`` yielding elements that have ``text()``
            and ``attr()``.

    Returns:
        The character texts joined in ascending pixel-offset order, or an
        empty string when no ``.char`` element is selected.

    Raises:
        ValueError: If a ``.char`` element has no ``style`` attribute or its
            style contains no ``<digits>px`` value.
    """
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    offset_pattern = re.compile(r'(\d+)px')

    positioned = []
    for char in name_html('.char').items():
        text = char.text().strip()
        # attr() may return None when the attribute is absent; normalise it
        # so the regex search never receives a non-string.
        style = char.attr('style') or ''
        match = offset_pattern.search(style)
        if match is None:
            raise ValueError(
                'no pixel offset in style {!r} of char {!r}'.format(style, text)
            )
        positioned.append((int(match.group(1)), text))

    # Sort on the offset only, so equal offsets keep their document order.
    positioned.sort(key=lambda pair: pair[0])
    return ''.join(text for _, text in positioned)
|
|
|
|
|
|
# If the name is already whole, skip the character reordering below
|
|
def parse_name_whole(name_html):
    """Return the text of a name element, reordering characters if needed.

    When the element contains a ``.whole`` match, its text is returned as-is.
    Otherwise the name is rebuilt from its ``.char`` pieces by ``parse_name``
    rather than by a second copy of the same sorting code.

    Args:
        name_html: Selection for one name element (the script passes PyQuery
            objects). It must be callable with a CSS selector and provide
            ``text()``.

    Returns:
        ``name_html.text()`` when a ``.whole`` element is present, otherwise
        the result of ``parse_name(name_html)``.
    """
    # A non-empty '.whole' selection is truthy: no reordering is required.
    if name_html('.whole'):
        return name_html.text()

    # Scrambled case: delegate to the shared per-character implementation.
    return parse_name(name_html)
|
|
|
|
|
|
# Script body: load the listing page in Chrome, wait for it to render, then
# print every item name.
browser = webdriver.Chrome()
try:
    browser.get('https://antispider3.scrape.center/')

    # Wait (up to 10 seconds) until '.item' elements are present in the DOM
    # before taking the page source.
    WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.item'))
    )

    html = browser.page_source
    doc = pq(html)
    names = doc('.item .name')

    for name_html in names.items():
        name = parse_name_whole(name_html)
        print(name)
finally:
    # Always end the WebDriver session, even when the wait times out or
    # parsing fails. quit() is used instead of close() so the whole session
    # is shut down rather than just the current window.
    browser.quit()
|