Python爬虫请求模块
urllib.request模块
python2 :urllib2、urllib
python3 :把urllib和urllib2合并,urllib.request
urllib.request.urlopen(“⽹址”)
作⽤ :向⽹站发起⼀个请求并获取响应
字节流 = response.read()
字符串 = response.read().decode(“utf-8”)
urllib.request.Request("⽹址", headers="字典")
urlopen()不⽀持重构 User-Agent
read() 读取服务器响应的内容
getcode() 返回HTTP的响应码
geturl() 返回实际数据的URL(防⽌重定向问题)
import urllib.request

# Browser-like request headers captured from a real session; urlopen() alone
# cannot override User-Agent, which is why a Request object is built below.
headers = {
    'Connection': 'keep-alive',
    'Cookie': '_ga=GA1.2.1534949298.1588228249; _gat=1; UM_distinctid=171c9c7d7b194-0999c7998ea8b9-34594f7d-e1000-171c9c7d7b273d; CNZZDATA30085487=cnzz_eid%3D851745317-1588225928-null%26ntime%3D1588225928; HTTP_REFERER=www.baidu.com; Hm_lvt_b2e5ac9401b5820ffa4e9fa608593a5b=1588228250; Hm_lpvt_b2e5ac9401b5820ffa4e9fa608593a5b=1588228250',
    'Host': 'book.km.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3722.400 QQBrowser/10.5.3751.400'
}

# Step 1: build the request object so the custom headers are attached.
request_obj = urllib.request.Request('http://book.km.com/', headers=headers)
# Step 2: send the request and obtain the response object.
resp = urllib.request.urlopen(request_obj)
# Step 3: read the raw bytes and decode them to a string.
# (decode: bytes -> str; encode: str -> bytes.)
# 'gbk' is used here because this site serves GBK-encoded HTML — TODO confirm
# against the page's actual charset if it stops decoding cleanly.
html_con = resp.read().decode('gbk')
print(html_con)
使用流程: 1.利用Request()方法构建请求对象; 2.利用urlopen()方法获取响应对象; 3.利用read().decode('utf-8')读取响应对象的内容
urllib.parse模块
常⽤⽅法
urlencode(字典)
import urllib.parse

# urlencode() converts a dict into a percent-encoded query string:
# non-ASCII characters become %XX escape sequences of their UTF-8 bytes.
query = {'wd': '火影忍者'}
en_name = urllib.parse.urlencode(query)
# Prints: wd=%E7%81%AB%E5%BD%B1%E5%BF%8D%E8%80%85
print(en_name)
例子:
import urllib.parse
import urllib.request

# Target URL pattern captured from the browser — note the search term is
# percent-encoded directly into the URL *path*, not sent as a query string:
# https://www.kuaikanmanhua.com/s/result/%E7%81%AB%E5%BD%B1%E5%BF%8D%E8%80%85
#str_name = input('请录入想搜素的动漫名字')
str_name = '飞驰人生'
str_base_url = 'https://www.kuaikanmanhua.com/s/result/'
# BUG FIX: the original used urlencode({'result': str_name}), which yields
# ".../s/result/result=%E9%A3%9E..." and does not match the path-style URL
# shown above.  quote() percent-encodes just the term itself, producing
# ".../s/result/%E9%A3%9E%E9%A9%B0%E4%BA%BA%E7%94%9F" as the site expects.
all_url = str_base_url + urllib.parse.quote(str_name)
print(all_url)
headers = {
    'cookie': 'nickname=%257F; sajssdk_2015_cross_new_user=1;' \
    ' Hm_lvt_c826b0776d05b85d834c5936296dc1d5=1588232406; ' \
    'kk_s_t=1588232406495; sensorsdata2015jssdkcross=' \
    '%7B%22distinct_id' \
    '%22%3A%22171ca073fc7a5c-0c7652610d3f8b-34594f7d-921600-171ca073fc84d9%22%2C%22' \
    'first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type' \
    '%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword' \
    '%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer' \
    '%22%3A%22%22%7D%2C%22%24device_id%22%3A%22171ca073fc7a5c-0c7652610d3f8b-34594f7d-921600-171ca073fc84d9%22%7D; ' \
    'Hm_lpvt_c826b0776d05b85d834c5936296dc1d5=1588232590',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3722.400 QQBrowser/10.5.3751.400'
}
# 1. Build the request object with browser-like headers.
req = urllib.request.Request(all_url, headers=headers)
# 2. Send the request and get the response object.
response = urllib.request.urlopen(req)
# 3. Read and decode the response body (this site serves UTF-8).
html = response.read().decode('utf-8')
print(html)
# 4. Save the page to disk so the result can be inspected in a browser.
with open(str_name + '.html', 'w', encoding='utf-8') as f:
    f.write(html)
请求方式
get方式:查询参数在url地址中显示:
POST请求:
在Request函数添加data参数
req = urllib.request.Request(url,data=data,headers=headers)
注意:data数据必须转换为bytes类型提交
import urllib.parse
import urllib.request
import json

# Youdao translate demo: POST form data to the translate endpoint.
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
}
str_word = input('请输入您要翻译的内容:\n')
# Form fields copied from a captured browser request; salt / sign / ts / bv
# look like anti-scraping tokens the server expects — TODO confirm whether
# stale values are still accepted by the endpoint.
data = {
    'i': str_word,
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15886863154849',
    'sign': 'a7c28b2eec9fadfe406e74c05463a0a0',
    'ts': '1588686315484',
    'bv': 'ec579abcd509567b8d56407a80835950',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action':' FY_BY_REALTlME'
}
# A POST body must be bytes: urlencode() builds the form string and
# encode() converts it to UTF-8 bytes.
payload = urllib.parse.urlencode(data).encode('utf-8')
request_obj = urllib.request.Request(url, data=payload, headers=headers)
resp = urllib.request.urlopen(request_obj)
raw_json = resp.read().decode('utf-8')
# json.loads() parses the JSON text into a dict; the response observed here
# had the shape {'translateResult': [[{'src': ..., 'tgt': ...}]], ...}.
dict_html = json.loads(raw_json)
print('翻译的结果为:')
print(dict_html['translateResult'][0][0]['tgt'])
requests模块常用用法
requests模块get实例
import requests
# requests is third-party: pip3 install requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
}
url_serch = 'https://y.qq.com/download/download.js?'
response_s = requests.get(url_serch, headers=headers)
# .text returns the body decoded to str; .content returns the raw bytes.
# Setting .encoding forces the charset used when .text decodes the body.
response_s.encoding = 'utf-8'
print(response_s.url)   # the URL that was actually fetched
print(response_s.text)
requests模块POST实例
# Youdao translate, requests POST version: unlike urllib, requests accepts
# the form dict directly — no manual urlencode()/encode() step is needed.
import requests
import json

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
}
str_word = input('请输入您要翻译的内容:\n')
# Form fields copied from a captured browser request; salt / sign / ts / bv
# appear to be anti-scraping tokens — TODO confirm they are still accepted.
data = {
    'i': str_word,
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15886863154849',
    'sign': 'a7c28b2eec9fadfe406e74c05463a0a0',
    'ts': '1588686315484',
    'bv': 'ec579abcd509567b8d56407a80835950',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action':' FY_BY_REALTlME'
}
resp = requests.post(url, headers=headers, data=data)
resp.encoding = 'utf-8'
# Parse the JSON body and pull out the translated text.
dict_html = json.loads(resp.text)
print('翻译的结果为:')
print(dict_html['translateResult'][0][0]['tgt'])
request设置代理
import requests

# httpbin echoes the caller's public IP, which makes it a handy way to
# verify whether a proxy is actually being used.
url = 'http://httpbin.org/ip'

# Without a proxy: the response shows our own IP, e.g.
# { "origin": "xxx.101.129.xxx" }
resp = requests.get(url)
print(resp.text)

# Free proxy lists:
#   西刺免费代理IP: https://www.xicidaili.com/
#   快代理:        https://www.kuaidaili.com/free/
#   代理云:        http://www.dailiyun.com/

# With an HTTP proxy: the 'proxies' mapping routes plain-http requests
# through the given host:port, so httpbin now reports the proxy's IP, e.g.
# { "origin": "223.68.190.130" }
proxy = {
    'http': '223.68.190.130:8181'
}
resp = requests.get(url, proxies=proxy)
print(resp.text)
cookie
cookie :通过在客户端记录的信息确定⽤户身份。(反爬)
HTTP是⼀种⽆连接协议,客户端和服务器交互仅仅限于 请求/响应过程,结束后断开,下⼀次请求时,服务器会认为是⼀个新的客户端,为了维护他们之间的连接,让服务器知道这是前⼀个⽤户发起的请求,必须在⼀个地⽅保存客户端信息(就是cookie)。
session
session :通过在服务端记录的信息确定⽤户身份 这⾥这个session就是⼀个指的是会话.
import requests
import json
#目标
# {result_message: "验证码校验成功", result_code: "4"}
# 1.获取12306登录图片
def get12306_pic(req):
    """Fetch the 12306 login captcha image via *req* (a shared session)
    and save it to picture12306.png so the user can read the coordinates."""
    print('login print log')
    hdrs = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
        'Host': 'kyfw.12306.cn'
    }
    #url_re_image = 'https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login'
    captcha_url = 'https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login'
    # NOTE(review): the captcha is requested with POST here; presumably the
    # endpoint accepts it, but GET may be the canonical verb — confirm.
    resp = req.post(captcha_url, headers=hdrs)
    print('print log')
    print(resp.status_code)
    # Image data is binary, so write the raw .content bytes in 'wb' mode.
    with open('picture12306.png', 'wb') as f:
        f.write(resp.content)
def post_pic_data(req):
    """Ask the user for the captcha coordinates, submit them to the 12306
    captcha-check endpoint through *req*, and return the entered string."""
    hdrs = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
        'Host': 'kyfw.12306.cn'
    }
    codeStr = input('请输入验证码坐标:')
    check_url = 'https://kyfw.12306.cn/passport/captcha/captcha-check'
    # Field names match the browser-captured request; 'answer' carries the
    # clicked coordinates as a comma-separated string.
    form = {
        'answer': codeStr,
        'rand': 'sjrand',
        'login_site': 'E'
    }
    resp = req.post(check_url, data=form, headers=hdrs)
    resp.encoding = 'utf-8'
    print(resp.text)
    # Returned so the caller can reuse the same answer in the login request.
    return codeStr
def login(req, codeStr):
    """Submit username/password plus the captcha answer *codeStr* to the
    12306 web-login endpoint through the shared session *req*."""
    hdrs = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
        'Host': 'kyfw.12306.cn',
        'Origin': 'https://kyfw.12306.cn',
        # Hard-coded cookie captured from a browser session — presumably
        # stale by now; verify against a fresh capture before relying on it.
        'Cookie':'_passport_ct=f5ef22f598bb4c4a8fbcb4a678043164t9699; _passport_session=05706b8dc52a4fe2812c5b50f6f75b7d3485; BIGipServerpassport=870842634.50215.0000; RAIL_EXPIRATION=1589113745971; RAIL_DEVICEID=DKBT_G74qXb-tbP19-55WHMutkmzYqNrU7Xbkl0LDUr7aPrMo2fTpYb-0_Awaez51rY64B3vUkEUYwXyQJTcNxmAa89Wwb7kI1_LUsGGksYAZO7uC4_lu3evIY3dT82mwCUT5fGJ5IooZbrytsA3tyLj14f53LrX; route=495c805987d0f5c8c84b14f60212447d; BIGipServerotn=451936778.64545.0000; BIGipServerpool_passport=384631306.50215.0000; _jc_save_toStation=%u4E0A%u6D77%2CSHH; _jc_save_toDate=2020-05-07; _jc_save_wfdc_flag=dc; _jc_save_fromStation=%u5E7F%u5DDE%2CGZQ; _jc_save_fromDate=2020-05-08; BIGipServerportal=2949906698.16671.0000'
    }
    username = input('请输入用户名:')
    password = input('请输入密码:')
    login_url = 'https://kyfw.12306.cn/passport/web/login'
    form = {
        'answer': codeStr,
        'username': username,
        'password': password
    }
    resp = req.post(login_url, data=form, headers=hdrs)
    resp.encoding = 'utf-8'
    print(resp.text)
if __name__ == '__main__':
    # One shared session so cookies set by the captcha request (12306 ties
    # the captcha to the session) carry over into the login request.
    session = requests.session()
    get12306_pic(session)
    captcha_answer = post_pic_data(session)
    login(session, captcha_answer)
SSL
什么是SSL证书?
SSL证书是数字证书的⼀种,类似于驾驶证、护照和营业执照的电⼦副本。因为配置在服务器上,也称为SSL服务器证书。SSL 证书就是遵守 SSL协议,由受信任的数字证书颁发机构CA,在验证服务器身份后颁发,具有服务器身份验证和数据传输加密功能
import requests

url = 'https://inv-veri.chinatax.gov.cn/'
# A plain requests.get(url) failed here with:
#   requests.exceptions.SSLError: HTTPSConnectionPool(host='inv-veri.chinatax.gov.cn', port=443):
#   Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1,
#   '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed:
#   self signed certificate in certificate chain (_ssl.c:1108)')))
# i.e. the site presents a certificate the local trust store does not accept.
#
# Workaround: verify=False skips certificate validation entirely.
# SECURITY NOTE: this disables TLS authentication (man-in-the-middle is
# possible) — acceptable for a demo, not for production code.
res = requests.get(url, verify=False)
print(res.status_code)