# -*- encoding:utf-8 -*-
'''
@Author : dingjiawen
@Date   : 2023/11/7 15:51
@Usage  :
@Desc   : Notes on the requests library
'''
import re

import requests
import urllib3
from requests.auth import HTTPBasicAuth
from requests_oauthlib import OAuth1

'''
Basic GET usage
'''
def get():
    data = {
        'name': 'germey',
        'age': 25
    }
    response = requests.get('https://httpbin.org/get', params=data)
    print(response.text)
    print(type(response.json()))  # dict
    print(response.json())

'''
Scraping a page: extract the titles with a regular expression
'''
def getPattern():
    response = requests.get('https://ssr1.scrape.center/')
    # The original pattern lost its HTML tags; titles on this page sit inside
    # <h2> elements, so the capture group is wrapped accordingly
    pattern = re.compile('<h2.*?>(.*?)</h2>', re.S)
    titles = re.findall(pattern, response.text)
    print(titles)

'''
Scraping binary data: download the site favicon and save it to disk
'''
def getBinary():
    response = requests.get('https://scrape.center/favicon.ico')
    print(response.text)     # the bytes decoded as text: mostly garbled
    print(response.content)  # the raw bytes
    with open('favicon.ico', 'wb') as f:
        f.write(response.content)

'''
Basic attributes of the Response object
'''
def getResponse():
    response = requests.get('https://ssr1.scrape.center/')
    print(type(response.status_code), response.status_code)
    print(type(response.headers), response.headers)
    print(type(response.cookies), response.cookies)
    print(type(response.history), response.history)
    if response.status_code != requests.codes.ok:
        exit()
    else:
        print('Request Success!')

'''
Basic POST usage
'''
def post():
    data = {
        'name': 'germey',
        'age': 25
    }
    # POST requests belong on the /post endpoint; the original posted to /get,
    # which httpbin rejects with 405 Method Not Allowed
    response = requests.post('https://httpbin.org/post', data=data)
    print(response.text)

'''
Advanced usage: uploading files
'''
def postFile():
    files = {
        'file': open('favicon.ico', 'rb')
    }
    response = requests.post('https://httpbin.org/post', files=files)
    print(response.text)

'''
Advanced usage: cookies
Valid cookies reproduce a logged-in state, which makes it possible to scrape
pages that are only visible after logging in
'''
def postCookie():
    response = requests.get('https://www.baidu.com')
    print(response.cookies)
    for key, value in response.cookies.items():
        print(key, '=', value)

'''
Session persistence:
If the first request logs in to a site via requests.post and a second request
then fetches the logged-in user's profile page via requests.get, the two calls
are completely independent: it is effectively like opening two separate
browsers. A Session carries the cookies across requests
'''
def session():
    s = requests.Session()
    s.get('https://www.httpbin.org/cookies/set/number/123456789')
    r = s.get('https://www.httpbin.org/cookies')
    print(r.text)  # {"cookies": {"number": "123456789"}}

'''
SSL certificate verification:
Some sites serve HTTPS certificates that no CA has signed, so requests raises
an SSLError
'''
def SSL():
    # response = requests.get('https://ssr2.scrape.center/')
    # print(response.status_code)
    # requests.exceptions.SSLError: HTTPSConnectionPool(host='ssr2.scrape.center', port=443): Max retries exceeded with url
    urllib3.disable_warnings()  # silence the InsecureRequestWarning
    response = requests.get('https://ssr2.scrape.center/', verify=False)
    print(response.status_code)  # 200

'''
Timeouts:
Guard against a server that cannot respond promptly
'''
def timeout():
    # Time out after 1 second. Without a timeout, requests waits forever;
    # timeout=(5, 30) sets a 5-second connect timeout and a 30-second read timeout
    response = requests.get('https://www.httpbin.org/get', timeout=1)
    print(response.status_code)
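'''
Illustrative addition (not part of the original notes): an exceeded timeout
surfaces as requests.exceptions.Timeout. The function name handleTimeout and
the deliberately tiny 0.01-second timeout are assumptions chosen to force the
exception
'''
def handleTimeout():
    try:
        response = requests.get('https://www.httpbin.org/get', timeout=0.01)
        print(response.status_code)
    except requests.exceptions.Timeout:
        # The server did not answer within the window
        print('Request timed out')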
'''
Authentication:
Visiting a site protected by HTTP Basic auth first pops up a credentials prompt
'''
def Auth():
    # Username and password are both admin; the tuple is shorthand for
    # auth=HTTPBasicAuth('admin', 'admin')
    response = requests.get('https://ssr3.scrape.center/', auth=('admin', 'admin'))
    print(response.status_code)

'''
requests also supports other authentication schemes, such as OAuth, which
requires installing the requests_oauthlib package
'''
def OAuth():
    url = 'https://api.twitter.com/1.1/account/verify_credentials.json'
    # The imported class is OAuth1; the original called the undefined name OAuth
    auth = OAuth1('your_app_key', 'your_app_secret',
                  'user_oauth_token', 'user_oauth_token_secret')
    response = requests.get(url, auth=auth)
    print(response.status_code)

'''
Proxies:
Some sites answer a handful of requests normally, but once large-scale scraping
starts they may show a CAPTCHA or redirect to a login page. Proxies are one way
around this
'''
def proxy():
    url = 'https://www.httpbin.org/get'
    # Placeholder proxy addresses and credentials: substitute real ones
    proxies = {
        'http': 'http://10.10.10.10:1080',
        'https': 'http://user:password@10.10.10.10:1080/'
    }
    # The keyword argument is proxies, not proxy
    response = requests.get(url, proxies=proxies)
    print(response.status_code)

if __name__ == '__main__':
    Auth()
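# Illustrative addition: requests can also route traffic through SOCKS proxies
# once the optional dependency is installed (pip install "requests[socks]").
# The address and credentials below are placeholders, not working values:
# proxies = {
#     'http': 'socks5://user:password@10.10.10.10:1080',
#     'https': 'socks5://user:password@10.10.10.10:1080'
# }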