python爬虫-requests库
1.1安装requests库
pip install requests
#cmd安装requests库
若多个版本python
pythonx -m pip install requests
#x为指定版本
import requests
#测试是否安装
1.2基本用法
requests.get()
用于请求目标网站,类型为HTTPresponse
print(response.status_code)
#打印状态码
print(response.url)
#打印请求url
print(response.headers)
#打印头信息
print(response.cookies)
#打印cookie信息
print(response.text)
#以文本形式打印网页源码
print(response.content)
#以字节流形式打印
打印状态码时
状态码:200,证明请求目标网站正常
若状态码为403一般是目标存有防火墙,触发了反爬策略被限制了IP
1.3请求方式
requests.get('http://...../')
requests.post('http://...../')
requests.put('http://...../')
requests.delete('http://...../')
requests.head('http://..../')
requests.options('http://..../')
1.4基本的get请求
response=requests.get('http://..../')
print(response.text)
1.5带有参数的get请求
1.5.1直接将参数放在url中
response=requests.get("http://...params=1")
print(response.text)
1.5.2先将参数写在data中,发起请求时将params参数指定为data
data = {'params': '1'}
response = requests.get('https://....', params=data)
print(response.text)
1.6基本post请求
response = requests.post('http://...../')
1.6.1解析json
response = requests.get('http://..../')
print(response.text)
print(response.json()) #response.json()方法同json.loads(response.text)
print(type(response.json()))
1.6.2保存一个二进制文件
response = requests.get('http://.....')
b = reponse.content
with open('D://.../,'wrba') as
f: #w写,r读,b二进制,a追加到文件后面,若无文件创建新文件
f.write(b)
1.6.3为请求添加头信息
heads['User-Agent'] = 'Mozilla/5.0 ' \
'(Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 ' \
'(KHTML, like Gecko) Version/5.1 Safari/534.50'
response = requests.get( 'http://www.baidu.com',headers = heads )
print(response)
#可避开防火墙,隐藏身份
1.6.4使用代理高匿查询
import requests
import re
def get_html(url):
proxy = {
'http': '47.92.113.71:80',
}
heads = {}
heads['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
req = requests.get(url,headers=heads,proxies=proxy)
html = req.text
return html
def get_ipport(html):
regex = r'<td data-title="IP">(.+)</td>'
iplist = re.findall(regex, html)
regex2 = '<td data-title="PORT">(.+)</td>'
portlist = re.findall(regex2, html)
regex3 = r'<td data-title="类型">(.+)</td>'
typelist = re.findall(regex3, html)
sumray = []
for i in iplist:
for p in portlist:
for t in typelist:
pass
pass
a = t+','+i + ':' + p
sumray.append(a)
print('高匿代理')
print(sumray)
if __ name __ == '__ main __':
url = 'http://www.baidu.com'
get_ipport(get_html(url))
1.6.5获取cookie
import requests
response = requests.get('http://www.baidu.com')
print(response.cookies)
print(type(response.cookies))
for k,v in response.cookies.items():
print(k + ':' + v)
1.6.6会话维持
import requests
session = requests.Session()
session.get('https://www.crrcgo.cc/admin/crr_supplier.html')
response = session.get('https://www.crrcgo.cc/admin/')
print (response.text)
1.6.7证书验证设置
import requests
from requests.packages import urllib3
urllib3.disable_warnings() #从urllib3中消除警告
response = requests.get('https://www.baidu.com',verify=False) #证书验证设为FALSE
print(response.status_code)
1.6.8超时异常捕获
import requests
from requests.exceptions import ReadTimeout
try:
res = requests.get('http://www.baidu.com', timeout=0.1)
print(res.status_code)
except ReadTimeout:
print(timeout)
1.6.9异常处理
import requests
from requests.exceptions import ReadTimeout,HTTPError,RequestException
try:
response = requests.get('http://www.baidu.com',timeout=1)
print(response.status_code)
except ReadTimeout:
print('timeout')
except HTTPError:
print('httperror')
except RequestException:
print('reqerror')
#使用try…except来捕获异常
1.7异常
requests.ConnectionError #网络连接异常,如DNS查询失败,拒绝连接等
requests.HTTPError #http错误异常
requests.URLRequired #url缺失异常
requests.TooManyRedirects #超过最大重定向次数,产生重定向异常
requests.ConnectTimeout #连接远程服务器超时异常
requests.Timeout #请求URL超时,产生超时异常
1.8测试
1.8.1返回状态码判断
from re import X
from telnetlib import STATUS
from urllib import response
import requests
X = requests.get('https://www.baidu.com/')
if X.status_code == 200 :
print(X.status_code)
else:
print("gg")
1.8.2加入params字典
import requests header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36', } param ={ 'wd':'55' #百度搜索参数 } X = requests.get('http://www.baidu.com', headers= header,params=param) print(X.url)
r"(.*?)"
pattern = re.compile(  # NOTE: this regex was mangled in transcription (HTML fragments lost); only the curly quotes are normalized here
    '.?>(. ?).?star">(. ?)'
    + '.?integer">(. ?).?fraction">(. ?).?', re.S)
'<td"+"*?IP*? >(.*?)</td>'