# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date   : 2023/11/7 15:51
@Usage  :
@Desc   : requests library study notes
'''

import requests
import re
from requests.packages import urllib3
from requests.auth import HTTPBasicAuth
from requests_oauthlib import OAuth1
# Basic GET usage.


def get():
    """Send a GET request with query parameters and inspect the JSON body."""
    params = {
        'name': 'germey',
        'age': 25,
    }
    resp = requests.get('https://httpbin.org/get', params=params)

    print(resp.text)
    print(type(resp.json()))  # dict
    print(resp.json())
# Scrape a page: extract every title with a regular expression.


def getPattern():
    """Fetch the demo site and print all <h2> titles found via regex."""
    resp = requests.get('https://ssr1.scrape.center/')
    # re.S lets '.' match newlines so titles spanning lines are caught.
    h2_re = re.compile('<h2.*?>(.*?)</h2>', re.S)
    titles = h2_re.findall(resp.text)
    print(titles)
# Fetch binary data (the site favicon) and save it to disk.


def getBinary():
    """Download favicon.ico; .text garbles binary data, .content keeps raw bytes."""
    resp = requests.get('https://scrape.center/favicon.ico')
    print(resp.text)      # decoded as text: unreadable for binary payloads
    print(resp.content)   # raw bytes

    with open('favicon.ico', 'wb') as icon_file:
        icon_file.write(resp.content)
# Basic attributes of a Response object.


def getResponse():
    """Print the type and value of the common Response attributes."""
    resp = requests.get('https://ssr1.scrape.center/')

    print(type(resp.status_code), resp.status_code)
    print(type(resp.headers), resp.headers)
    print(type(resp.cookies), resp.cookies)
    print(type(resp.history), resp.history)

    # Bail out of the process unless the request came back 200 OK.
    if resp.status_code == requests.codes.ok:
        print('Request Success!')
    else:
        exit()
# Basic POST usage.


def post():
    """POST form data to httpbin and print the echoed response.

    Bug fix: the original posted to https://httpbin.org/get, which only
    accepts GET and answers 405 Method Not Allowed; the POST echo
    endpoint is /post.
    """
    data = {
        'name': 'germey',
        'age': 25
    }
    response = requests.post('https://httpbin.org/post', data=data)

    print(response.text)
# Advanced usage: uploading a file.


def postFile():
    """Upload favicon.ico as multipart/form-data and print the echo.

    Bug fix: the original opened the file without ever closing it,
    leaking the handle; a `with` block guarantees it is closed even
    if the request raises.
    """
    with open('favicon.ico', 'rb') as icon_file:
        files = {
            'file': icon_file
        }
        response = requests.post('https://httpbin.org/post', files=files)

    print(response.text)
# Advanced usage: cookies.
# Sending the right cookies simulates a logged-in session, which allows
# scraping pages that are only visible after login.


def postCookie():
    """Print the cookies a site returns, then each key/value pair."""
    resp = requests.get('https://www.baidu.com')
    print(resp.cookies)

    for name, value in resp.cookies.items():
        print(name, "=", value)
# Session persistence:
# two bare requests.get/post calls are as independent as two separate
# browsers — a Session carries cookies from one request to the next,
# so state set by the first call (e.g. a login) is visible to the second.


def session():
    """Set a cookie in one request and read it back in the next via a Session."""
    sess = requests.Session()
    sess.get("https://www.httpbin.org/cookies/set/number/123456789")
    reply = sess.get('https://www.httpbin.org/cookies')
    print(reply.text)  # {"cookies": {"number": "123456789"}}
# SSL certificate verification:
# some sites serve an HTTPS certificate not trusted by any CA,
# which makes requests raise an SSL error by default.


def SSL():
    """Fetch a site with an untrusted certificate by disabling verification."""
    # Without verify=False this raises requests.exceptions.SSLError:
    # HTTPSConnectionPool(host='ssr2.scrape.center', port=443):
    # Max retries exceeded with url
    #
    # disable_warnings() silences the InsecureRequestWarning that
    # verify=False would otherwise print on every request.
    urllib3.disable_warnings()
    resp = requests.get("https://ssr2.scrape.center/", verify=False)
    print(resp.status_code)  # 200
# Timeouts: avoid hanging forever on a server that never responds.


def timeout():
    """GET with a one-second timeout."""
    # timeout=1 caps the total wait at one second.  Omitting timeout waits
    # forever; timeout=(5, 30) means 5 s to connect and 30 s to read.
    resp = requests.get("https://www.httpbin.org/get", timeout=1)
    print(resp.status_code)
# Authentication: sites behind HTTP Basic auth pop up a credentials
# prompt in the browser; requests passes them via the auth parameter.


def Auth():
    """Access a Basic-auth protected page (username and password are both admin)."""
    # HTTPBasicAuth('admin', 'admin') is exactly what the (user, pass)
    # tuple shorthand expands to inside requests.
    resp = requests.get("https://ssr3.scrape.center/",
                        auth=HTTPBasicAuth('admin', 'admin'))

    print(resp.status_code)
# requests supports other auth schemes too, e.g. OAuth 1, which needs
# the requests_oauthlib package.


def OAuth():
    """Authenticate an API call with OAuth 1 credentials.

    Bug fix: the original built the auth object with OAuth(...) — this
    very function — recursively calling itself with four positional
    arguments, a guaranteed TypeError.  The intended constructor is
    OAuth1 from requests_oauthlib.
    """
    url = 'https://api.twitter.com/1.1/account/verify_credentials.json'
    # NOTE(review): 'your_app_sercet' looks like a typo for 'your_app_secret',
    # but it is only a placeholder credential, so it is left untouched.
    auth = OAuth1('your_app_key', 'your_app_sercet', 'user_oauth_token', 'user_oauth_token_secret')
    response = requests.get(url, auth=auth)

    print(response.status_code)
# Proxy settings:
# a few requests to a site may work fine, but large-scale scraping can
# trigger captchas or forced logins; routing through proxies helps.


def proxy():
    """Send a request through HTTP/HTTPS proxies.

    Bug fix: the original passed `proxy=proxies`, but requests.get has
    no `proxy` keyword (TypeError: unexpected keyword argument); the
    correct parameter name is `proxies`.
    """
    url = 'https://api.twitter.com/1.1/account/verify_credentials.json'

    # Scheme -> proxy URL; the https entry shows user:password auth.
    proxies = {
        'http': 'http://10.10.10.10:1080',
        'https': 'http://user:password@10.10.10.10:1080/'
    }

    response = requests.get(url, proxies=proxies)

    print(response.status_code)
# Script entry point: run the basic-auth demo.
if __name__ == '__main__':
    Auth()