# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/12/15 16:17
@Usage  : python spider1.py
@Desc   : JWT-based simulated login: POST credentials to the demo site's
          login API, then call a protected endpoint with the returned token
          in the Authorization header.
'''

import requests
from urllib.parse import urljoin

BASE_URL = 'https://login3.scrape.center/'
LOGIN_URL = urljoin(BASE_URL, '/api/login')
INDEX_URL = urljoin(BASE_URL, '/api/book')
USERNAME = 'admin'
PASSWORD = 'admin'
# Seconds before an HTTP call is aborted. Requests issued without a timeout
# can block forever if the server stalls, so every call below passes it.
TIMEOUT = 10

# Log in: the API takes the credentials as JSON and answers with a JSON
# body whose 'token' field is the JWT.
response_login = requests.post(LOGIN_URL, json={
    'username': USERNAME,
    'password': PASSWORD
}, timeout=TIMEOUT)
data = response_login.json()
print('Response JSON', data)
jwt = data.get('token')
print('JWT', jwt)

# This site expects the token in the form 'Authorization: jwt <token>'.
headers = {
    'Authorization': f'jwt {jwt}'
}
response_index = requests.get(INDEX_URL, params={
    'limit': 18,
    'offset': 0
}, headers=headers, timeout=TIMEOUT)
print('Response Status', response_index.status_code)
print('Response URL', response_index.url)
print('Response Data', response_index.json())
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/12/15 15:37
@Usage  : python spider1.py
@Desc   : Negative demo for Session/Cookie login: logging in with a plain
          requests.post does NOT help a later requests.get, because each
          bare requests call uses fresh state with no shared cookies. The
          index request therefore comes back unauthenticated (redirected
          to the login page).
'''
import requests
from urllib.parse import urljoin

BASE_URL = 'https://login2.scrape.center/'
LOGIN_URL = urljoin(BASE_URL, '/login')
INDEX_URL = urljoin(BASE_URL, '/page/1')
USERNAME = 'admin'
PASSWORD = 'admin'
# Seconds before an HTTP call is aborted; without a timeout a stalled
# server makes the script hang forever.
TIMEOUT = 10

# Log in with form-encoded credentials. NOTE: the session cookie set by
# this response is deliberately thrown away — that is the point of this demo.
response_login = requests.post(LOGIN_URL, data={
    'username': USERNAME,
    'password': PASSWORD
}, timeout=TIMEOUT)

# A fresh requests.get carries none of the login cookies, so this request
# is NOT authenticated.
response_index = requests.get(INDEX_URL, timeout=TIMEOUT)
print('Response Status', response_index.status_code)
print('Response URL', response_index.url)


# ---------------------------------------------------------------------------
# spider2.py
#
# @Author : dingjiawen
# @Date   : 2023/12/15 15:43
# @Desc   : Manual cookie transfer: log in with allow_redirects=False so the
#           Set-Cookie of the login response itself is captured, then pass
#           those cookies explicitly to the next request.
# ---------------------------------------------------------------------------

# With allow_redirects=False the login response (a 302) is returned as-is,
# so its cookies are still attached to this response object; following the
# redirect would lose them.
response_login = requests.post(LOGIN_URL, data={
    'username': USERNAME,
    'password': PASSWORD
}, allow_redirects=False, timeout=TIMEOUT)

cookies = response_login.cookies
print('Cookies', cookies)

# Forward the captured cookies by hand — this request IS authenticated.
response_index = requests.get(INDEX_URL, cookies=cookies, timeout=TIMEOUT)
print('Response Status', response_index.status_code)
print('Response URL', response_index.url)
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/12/15 15:48
@Usage  : python spider3.py
@Desc   : Working Session/Cookie login flow: requests.Session persists the
          cookies set by the login response, so the follow-up index request
          is sent with them and succeeds.
'''

import requests
from urllib.parse import urljoin

BASE_URL = 'https://login2.scrape.center/'
LOGIN_URL = urljoin(BASE_URL, '/login')
INDEX_URL = urljoin(BASE_URL, '/page/1')
USERNAME = 'admin'
PASSWORD = 'admin'
# Seconds before an HTTP call is aborted; never issue a network request
# without a timeout or a stalled server hangs the script indefinitely.
TIMEOUT = 10

# A Session stores cookies from every response and sends them on every
# subsequent request automatically.
session = requests.Session()

response_login = session.post(LOGIN_URL, data={
    'username': USERNAME,
    'password': PASSWORD
}, timeout=TIMEOUT)

cookies = session.cookies
print('Cookies', cookies)

# Same session -> the login cookie rides along, so this is authenticated.
response_index = session.get(INDEX_URL, timeout=TIMEOUT)
print('Response Status', response_index.status_code)
print('Response URL', response_index.url)
# -*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/12/15 15:53
@Usage  : python spider4.py  (requires a local Chrome + chromedriver)
@Desc   : Log in through a real browser with Selenium, harvest the cookies
          the browser obtained, and transplant them into a requests.Session
          so the rest of the crawl can run without the browser.
'''

import time
from urllib.parse import urljoin

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

BASE_URL = 'https://login2.scrape.center/'
LOGIN_URL = urljoin(BASE_URL, '/login')
INDEX_URL = urljoin(BASE_URL, '/page/1')
USERNAME = 'admin'
PASSWORD = 'admin'
TIMEOUT = 10  # seconds for the final requests call

browser = webdriver.Chrome()
try:
    # Fill in and submit the login form in the real browser.
    browser.get(BASE_URL)
    browser.find_element(By.CSS_SELECTOR, 'input[name="username"]').send_keys(USERNAME)
    browser.find_element(By.CSS_SELECTOR, 'input[name="password"]').send_keys(PASSWORD)
    browser.find_element(By.CSS_SELECTOR, 'input[type="submit"]').click()
    # Crude wait for the post-login redirect to settle; an explicit
    # WebDriverWait on a post-login element would be more robust.
    time.sleep(10)

    # Cookies now held by the browser include the authenticated session cookie.
    cookies = browser.get_cookies()
    print('Cookies', cookies)
finally:
    # quit() (unlike close(), which only closes the window) also shuts down
    # the chromedriver process, so nothing leaks even if a step above raised.
    browser.quit()

# Transplant the browser's cookies into a requests.Session.
session = requests.Session()
for cookie in cookies:
    session.cookies.set(cookie['name'], cookie['value'])

# Authenticated request made without the browser.
response_index = session.get(INDEX_URL, timeout=TIMEOUT)
print('Response Status', response_index.status_code)
print('Response URL', response_index.url)