self_example/Spider/chapter02_爬虫基本库/request库/requestLearning.py

220 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date : 2023/11/7 15:51
@Usage :
@Desc : request库学习
'''
import requests
import re
from requests.packages import urllib3
from requests.auth import HTTPBasicAuth
from requests_oauthlib import OAuth1
'''
基本get使用
'''
def get():
    """Send a basic GET request with query-string parameters and print the reply."""
    # Appended to the URL as ?name=germey&age=25 by requests.
    params = {
        'name': 'germey',
        'age': 25
    }
    response = requests.get('https://httpbin.org/get', params=params)
    print(response.text)
    print(type(response.json()))  # dict
    print(response.json())
'''
抓取网页:使用模式匹配,抓取标题
'''
def getPattern():
    """Fetch a page and print every <h2> title extracted with a regex."""
    html = requests.get('https://ssr1.scrape.center/').text
    # re.S makes '.' match newlines, so titles split across lines are captured.
    h2_regex = re.compile('<h2.*?>(.*?)</h2>', re.S)
    print(h2_regex.findall(html))
'''
抓取二进制数据:使用模式匹配,抓取标题
'''
def getBinary():
    """Download a favicon and save its raw bytes to a local file."""
    resp = requests.get('https://scrape.center/favicon.ico')
    print(resp.text)     # the binary payload decoded as text (mostly garbage)
    print(resp.content)  # the raw bytes
    with open('favicon.ico', 'wb') as icon_file:
        icon_file.write(resp.content)
'''
基本response的相关参数
'''
def getResponse():
    """Print the common Response attributes; exit the process on a non-200 status."""
    response = requests.get('https://ssr1.scrape.center/')
    print(type(response.status_code), response.status_code)
    print(type(response.headers), response.headers)
    print(type(response.cookies), response.cookies)
    print(type(response.history), response.history)
    # requests.codes.ok is 200; anything else aborts the script.
    if response.status_code == requests.codes.ok:
        print('Request Success!')
    else:
        exit()
'''
基本post使用
'''
def post():
    """Send a basic POST request with form data and print the echoed response.

    Bug fix: the original posted to https://httpbin.org/get, which only
    accepts GET and answers 405 Method Not Allowed; httpbin's POST echo
    endpoint is /post.
    """
    data = {
        'name': 'germey',
        'age': 25
    }
    response = requests.post('https://httpbin.org/post', data=data)
    print(response.text)
'''
高级用法:上传文件
'''
def postFile():
    """Upload favicon.ico as a multipart/form-data file and print the echo.

    Fix: the original opened the file inline and never closed it, leaking
    the handle; a ``with`` block guarantees it is closed after the upload.
    """
    with open('favicon.ico', 'rb') as icon:
        files = {
            'file': icon
        }
        response = requests.post('https://httpbin.org/post', files=files)
    print(response.text)
'''
高级用法:cookie
cookie成功模拟了登录状态这样就能爬取登录之后才能看到的页面了
'''
def postCookie():
    """Print the cookie jar a site returns, then each cookie as key = value."""
    resp = requests.get('https://www.baidu.com')
    print(resp.cookies)
    for name, val in resp.cookies.items():
        print(name, "=", val)
'''
Session维持:
如果第一次请求利用request库的post方法登录了某个网站第二次想获取成功登录后自己的个人信息
于是又用requests库的get方法区请求个人信息页面这实际上相当于打开了两个浏览器是两个完全独立的操作这时需要维持Session
'''
def session():
    """Show that a Session carries cookies across consecutive requests.

    Fix: the original never closed the Session, leaking its connection
    pool; using it as a context manager releases the connections.
    """
    with requests.Session() as sess:
        # The first request sets a cookie on the session...
        sess.get("https://www.httpbin.org/cookies/set/number/123456789")
        # ...which the second request automatically sends back.
        reply = sess.get('https://www.httpbin.org/cookies')
    print(reply.text)  # {"cookies": {"number": "123456789"}}
'''
SSL证书验证:
有些网站的HTTPS证书可能并不被CA机构认可出现SSL证书错误
'''
def SSL():
    """Fetch a site whose certificate is untrusted by skipping verification."""
    # Without verify=False this URL raises requests.exceptions.SSLError.
    # disable_warnings() silences the InsecureRequestWarning that
    # verify=False would otherwise print on every request.
    urllib3.disable_warnings()
    response = requests.get("https://ssr2.scrape.center/", verify=False)
    print(response.status_code)  # 200
'''
超时验证:
防止服务器不能即时响应
'''
def timeout():
    """GET with a 1-second timeout so a slow server raises instead of hanging."""
    # timeout=1 bounds both connect and read phases; timeout=(5, 30) would
    # allow 5 s to connect and 30 s to read. Omitting it waits forever.
    response = requests.get("https://www.httpbin.org/get", timeout=1)
    print(response.status_code)
'''
身份认证:
在访问启用了基本身份认证的网站时,首先会弹出一个认证窗口
'''
def Auth():
    """Access a site behind HTTP Basic auth (username and password are both admin)."""
    # A plain (user, password) tuple is requests' shorthand for
    # HTTPBasicAuth('admin', 'admin').
    credentials = ('admin', 'admin')
    response = requests.get("https://ssr3.scrape.center/", auth=credentials)
    print(response.status_code)
'''
request还提供了其他认证方式如OAuth认证不过此时需要安装requests_oauthlib包
'''
def OAuth():
    """Authenticate a request with OAuth 1 via the requests_oauthlib package.

    Bug fix: the original called ``OAuth(...)`` — this function itself,
    which takes no arguments — raising TypeError. The authenticator class
    imported at the top of the file is ``OAuth1``.
    """
    url = 'https://api.twitter.com/1.1/account/verify_credentials.json'
    # Placeholder credentials — substitute real app/user tokens before use.
    auth = OAuth1('your_app_key', 'your_app_sercet', 'user_oauth_token', 'user_oauth_token_secret')
    response = requests.get(url, auth=auth)
    print(response.status_code)
'''
代理设置:
某些网站请求几次可以正常获取内容。但一旦开始大规模爬取,可能弹出验证码或者挑战到登录认证页面等
可以使用代理来解决这个问题
'''
def proxy():
    """Route a request through per-scheme HTTP/HTTPS proxies.

    Bug fix: ``requests.get`` accepts ``proxies=``, not ``proxy=`` — the
    original raised ``TypeError: request() got an unexpected keyword
    argument 'proxy'``.
    """
    url = 'https://api.twitter.com/1.1/account/verify_credentials.json'
    proxies = {
        'http': 'http://10.10.10.10:1080',
        # A proxy URL may embed basic-auth credentials as user:password@host.
        'https': 'http://user:password@10.10.10.10:1080/'
    }
    response = requests.get(url, proxies=proxies)
    print(response.status_code)
# Script entry point: only the basic-auth demo runs when executed directly.
if __name__ == '__main__':
    Auth()