# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/7 15:51
@Usage :
@Desc : Study notes for the requests library
'''
import requests
import re
import urllib3
from requests.auth import HTTPBasicAuth
from requests_oauthlib import OAuth1
'''
Basic GET usage
'''
def get():
    data = {
        'name': 'germey',
        'age': 25
    }
    # params is URL-encoded into the query string
    response = requests.get('https://httpbin.org/get', params=data)
    print(response.text)
    print(type(response.json()))  # <class 'dict'>
    print(response.json())
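'''
Many sites reject requests that lack a browser User-Agent. A minimal sketch of
passing custom headers (the UA string below is just an example value):
'''
def getWithHeaders():
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)'}
    response = requests.get('https://httpbin.org/get', headers=headers)
    print(response.json()['headers'].get('User-Agent'))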
'''
Scraping a page: use a regex to extract the movie titles
'''
def getPattern():
    response = requests.get('https://ssr1.scrape.center/')
    # the titles on this page sit inside <h2> tags
    pattern = re.compile('<h2.*?>(.*?)</h2>', re.S)
    titles = re.findall(pattern, response.text)
    print(titles)
'''
Fetching binary data: download the site favicon and save it to a file
'''
def getBinary():
    response = requests.get('https://scrape.center/favicon.ico')
    print(response.text)     # decoding bytes as text garbles binary data
    print(response.content)  # the raw bytes
    with open('favicon.ico', 'wb') as f:
        f.write(response.content)
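'''
A sketch of a streaming download for larger binary files: stream=True avoids
loading the whole body into memory, and iter_content yields it in chunks
('favicon_stream.ico' is an arbitrary output name):
'''
def getBinaryStream():
    response = requests.get('https://scrape.center/favicon.ico', stream=True)
    with open('favicon_stream.ico', 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)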
'''
Common attributes of the Response object
'''
def getResponse():
    response = requests.get('https://ssr1.scrape.center/')
    print(type(response.status_code), response.status_code)
    print(type(response.headers), response.headers)
    print(type(response.cookies), response.cookies)
    print(type(response.history), response.history)
    # requests.codes.ok == 200
    if response.status_code != requests.codes.ok:
        exit()
    print('Request Success!')
'''
Basic POST usage
'''
def post():
    data = {
        'name': 'germey',
        'age': 25
    }
    # the form data must go to the /post endpoint; POSTing to /get returns 405
    response = requests.post('https://httpbin.org/post', data=data)
    print(response.text)
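'''
A related sketch: to send a JSON body instead of form data, requests offers the
json parameter, which serializes the dict and sets the Content-Type header:
'''
def postJson():
    response = requests.post('https://httpbin.org/post', json={'name': 'germey', 'age': 25})
    print(response.json()['json'])  # httpbin echoes the parsed JSON body back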
'''
Advanced usage: uploading files
'''
def postFile():
    # open in binary mode; requests builds the multipart/form-data body
    with open('favicon.ico', 'rb') as f:
        files = {'file': f}
        response = requests.post('https://httpbin.org/post', files=files)
    print(response.text)
'''
Advanced usage: cookies
A valid cookie reproduces a logged-in state, which makes pages that require
login scrapable
'''
def postCookie():
    response = requests.get('https://www.baidu.com')
    print(response.cookies)
    for key, value in response.cookies.items():
        print(key, "=", value)
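'''
The function above only reads cookies; to reproduce a logged-in state you send
your own. A minimal sketch using the cookies parameter, with a placeholder
cookie value (httpbin simply echoes back whatever cookies it receives):
'''
def sendCookie():
    cookies = {'number': '123456789'}  # placeholder; a real site needs a cookie copied from the browser
    r = requests.get('https://www.httpbin.org/cookies', cookies=cookies)
    print(r.text)  # {"cookies": {"number": "123456789"}}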
'''
Session persistence:
Suppose the first request logs in to a site with requests.post, and a second
requests.get then fetches the logged-in profile page. Those are two completely
independent requests, like opening two separate browsers, so the login is lost.
A Session carries cookies across requests.
'''
def session():
    s = requests.Session()
    s.get("https://www.httpbin.org/cookies/set/number/123456789")
    r = s.get('https://www.httpbin.org/cookies')
    print(r.text)  # {"cookies": {"number": "123456789"}}
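'''
For contrast, a minimal sketch of the problem described above: without a
Session, the cookie set by the first request is not sent with the second one.
'''
def sessionless():
    requests.get("https://www.httpbin.org/cookies/set/number/123456789")
    r = requests.get('https://www.httpbin.org/cookies')
    print(r.text)  # {"cookies": {}} -- the cookie was not carried over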
'''
SSL certificate verification:
Some sites serve HTTPS certificates that are not signed by a trusted CA,
which raises an SSLError
'''
def SSL():
    # response = requests.get("https://ssr2.scrape.center/")
    # print(response.status_code)  # requests.exceptions.SSLError: HTTPSConnectionPool(host='ssr2.scrape.center', port=443): Max retries exceeded with url
    # verify=False skips certificate checks; silence the InsecureRequestWarning
    urllib3.disable_warnings()
    response = requests.get("https://ssr2.scrape.center/", verify=False)
    print(response.status_code)  # 200
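'''
Instead of disabling verification, verify can point at a trusted CA bundle.
A sketch, assuming a hypothetical local certificate path:
'''
def SSLWithCert():
    # '/path/to/ca_bundle.pem' is a placeholder; supply a real CA bundle file
    response = requests.get("https://ssr2.scrape.center/", verify='/path/to/ca_bundle.pem')
    print(response.status_code)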
'''
Timeouts:
Avoid blocking forever when the server does not respond promptly
'''
def timeout():
    # wait at most 1 second; without timeout, requests waits indefinitely
    # timeout=(5, 30) would set a 5 s connect timeout and a 30 s read timeout
    response = requests.get("https://www.httpbin.org/get", timeout=1)
    print(response.status_code)
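'''
A short sketch of handling the timeout instead of letting the exception
propagate:
'''
def timeoutHandled():
    try:
        response = requests.get("https://www.httpbin.org/get", timeout=(5, 30))
        print(response.status_code)
    except requests.exceptions.Timeout:
        print('request timed out')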
'''
Authentication:
Visiting a site protected by HTTP Basic auth normally pops up a login prompt
'''
def Auth():
    # both the username and the password are admin
    response = requests.get("https://ssr3.scrape.center/", auth=('admin', 'admin'))
    print(response.status_code)
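'''
The tuple above is shorthand; the explicit equivalent uses HTTPBasicAuth,
already imported at the top of this file:
'''
def AuthExplicit():
    response = requests.get("https://ssr3.scrape.center/", auth=HTTPBasicAuth('admin', 'admin'))
    print(response.status_code)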
'''
requests also supports other authentication schemes, such as OAuth, which
requires the requests_oauthlib package (pip install requests_oauthlib)
'''
def OAuth():
    url = 'https://api.twitter.com/1.1/account/verify_credentials.json'
    # OAuth1 comes from requests_oauthlib; the four arguments are placeholders
    auth = OAuth1('your_app_key', 'your_app_secret', 'user_oauth_token', 'user_oauth_token_secret')
    response = requests.get(url, auth=auth)
    print(response.status_code)
'''
Proxy settings:
Some sites answer a few requests normally, but large-scale crawling can
trigger captchas or a redirect to a login page. A proxy helps work around
this
'''
def proxy():
    url = 'https://api.twitter.com/1.1/account/verify_credentials.json'
    # placeholder proxy addresses; replace with a reachable proxy before running
    proxies = {
        'http': 'http://10.10.10.10:1080',
        'https': 'http://user:password@10.10.10.10:1080/'
    }
    # the keyword argument is proxies, not proxy
    response = requests.get(url, proxies=proxies)
    print(response.status_code)
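'''
requests also supports SOCKS proxies once the optional dependency is installed
(pip install "requests[socks]"). A sketch with placeholder proxy addresses:
'''
def socksProxy():
    proxies = {
        'http': 'socks5://user:password@10.10.10.10:1080',
        'https': 'socks5://user:password@10.10.10.10:1080'
    }
    response = requests.get('https://www.httpbin.org/get', proxies=proxies)
    print(response.status_code)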
if __name__ == '__main__':
    Auth()