爬虫基本操作
requests 是 Python 中基于 HTTP 协议进行网络请求的第三方库
# The third-party package is named "requests" (plural); every call below uses that name.
import requests
1.发送请求
requests.get(url, *, headers, params, proxies) - 发送GET请求
requests.post(url, *, headers, params, proxies) - 发送POST请求
参数:
url - 请求地址(一个网站的网址,接口的地址,图片地址)
headers - 设置请求头(设置cookie和User-Agent的时候使用)
params - 设置参数
proxies - 设置代理
# Without a timeout the call can block forever if the server never answers.
response = requests.get('https://dict.youdao.com/', timeout=10)
2.获取响应信息
设置编码方式(乱码的时候才设置)
# The encoding is a property of the Response object (it controls how
# response.text is decoded), not of the requests module; the string must use
# plain ASCII quotes.
response.encoding = 'GBK'
获取响应头信息
# Response headers live on the Response object returned by requests.get(),
# not on the requests module.
print(response.headers)
获取响应体
a.获取text值(用于请求网页,直接拿到网页源代码)
print(response.text)
b.获取json解析结果(用于返回json数据的数据接口)
# json() is a method of the Response object; the requests module has no json().
print(response.json())
c.获取content值(获取二进制类型的原数据,用于图片,视频,音频的下载)
print(response.content)
添加cookie
# Request headers that make the request look like a logged-in browser session.
#
# The cookie is written as adjacent string literals (one per cookie pair) that
# Python joins into a single string: a literal line break inside the quotes is
# a SyntaxError and would also corrupt the header value. The two backslashes in
# the gdxidpyhxdE pair are escaped so they stay literal backslashes.
#
# NOTE(review): this cookie looks like a real logged-in session credential
# (z_c0 / SESSIONID); prefer loading it from an environment variable or a
# local, untracked file instead of committing it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67',
    'cookie': (
        '_zap=4c2801b2-54dc-461b-889e-07eef34acee5; '
        'd_c0="AFDQk3kcfBOPTiCFdisXpPevlXSrNeYWtPQ=|1627526478"; '
        '_xsrf=OKRhVGNTb5hXz8MwTMCP8n26g3fexYre; '
        '__snaker__id=DqUuixU49an0Hawy; '
        '_9755xjdesxxd_=32; '
        'gdxidpyhxdE=EhAylW6HWo13p9L5e/4WBE6NvP4QtlY/bOv8AHYwV7bKUlTRtGHqjiBVNU1/cgcbXyo5hacKKxA+9OnsYrpql3tQ1U9b7kJzkVCJ8HwXTDbgeQvb2Z/JKKZo4epB+hC+H3EaYH0EVMA9TRxP5O8V1Ilp\\Twp+bVxXjZ\\9R7qNI/Nz2Ba:1627998713354; '
        'YD00517437729195:WM_NI=UXQ8v0GQVSwTAAXeyr0wh05ZrMxvJJ8idtca+e0jaueS8CeuliccBSYdwLG9/10CNGRE16AKF8sXDwpku/HLbzmCvHQqTVNqHuL8/El8SINFy5MpHb2UO/ansRJ2D9p7RlA=; '
        'YD00517437729195:WM_NIKE=9ca17ae2e6ffcda170e2e6ee96c839eda89b97d773928e8fb6c55e878e9ebbf173a99c00a5f265a7a79f8dc92af0fea7c3b92a97889fd9f53c858f9ab4f660839f9886fc7af4a9a8d1b852b293b897c942fcb7ac95e26be9bb008be47fa78df8a4e87fb0b7ad84c1599bbcb9adc633edef00a4cd4d929bffd7c44186e8bdb9c77db4b5bbd0cb48a2a6a3a4f54493af9cb7cd7b8cab84d9ae4fb0e89daacd4ebbeb8391cb6af28b81d0e466b18e86d3c733b4e99e8dc437e2a3; '
        'YD00517437729195:WM_TID=BX7OEmBIrQ5EAEVFFQNrnhD1Xt9LyAhZ; '
        'captcha_session_v2="2|1:0|10:1627997817|18:captcha_session_v2|88:ODlVTEt1WW5mNTFNSjBKYmhTVTE3OWhUaGVOOUtMRmI2ZzFZdDltNnI1OWRHZENPeVdVZVo1MFJncWpsWS8vVA==|b977344dbfa2b84134b8cb5653d560f14385849e7d69f6dea519f64afe264d3c"; '
        'captcha_ticket_v2="2|1:0|10:1627997831|17:captcha_ticket_v2|704:eyJ2YWxpZGF0ZSI6IkNOMzFfeFhNd2V1U0RDc2RBTjJlMkUtUl9jQmxleTFUTVlKRHFFZHplYlNWdWUybG1naUxsS0VtZF9EZl9NWUhYWkdiVmJ6bTRTQXpVTUc0TEs5UTdSaDQ5eS5mU2tYYUkubU9MbXJ0Ty1TMDFjZzc4Qy02LXc2cFNrVHpydEYySHdPb0hla1A3LWs4OXlGa29UbThPRDhHLkVmWTFWSGVnX2hSdXI2U3JodVphY05LWWp2OTg2b3V1dXZnY05uYlpkU2lCTVAweTI1N2lDWjFwNDdGZ0o5aWlWRnJhLkkuWWUyaC1EQVlUZUxHRDB5aXhQbXZpcW0wRUhET2k3VWNtTDVPZjFRcHhmWU9mdlV1ZnE2NmZuTWF0ZjhmU1lNSEJQTUt3Wmc2VFFNRGJLYVRVWTdKOGNyVzkyd0lYa2t0NElKRGxXNjRqSnZScklQTC11U1VKclhZaHdBbFlKdTZIMmdQeXJWUm5nSmpPUFlPemdpcl9ldmhWeXZ0TGhpVXp1bVl0QmJBT2JjTXZjR3k1VktNWmZKazZVcW5vb3RLYS1CNncuQy5jRFg5THdfVU9nUjRGckF0N05KZjY4bXBsZml2Lk5jWGw4V2ouNHpKQ25lbmRqSkllTF9FVkVwLnZsZmVCNEtkNW91QWlPazRFUGhKQU1VcDB2b2VUbTFBMyJ9|4be46b954f9a70ca9ce9ade0dcaff69edb82c8a1036e0e394121e1952e611bed"; '
        'z_c0="2|1:0|10:1627997845|4:z_c0|92:Mi4xMnlzVUNnQUFBQUFBVU5DVGVSeDhFeVlBQUFCZ0FsVk5sWlQyWVFCQ3F6bzRxaWh3eVJSNDR6MHRweFB3NWtTbUtR|79ceee947f39fbfa737c051771da3ed354c00fc918200deea4d9f29fea72d0c4"; '
        'Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1627526514,1627997812,1628587974,1628664263; '
        'SESSIONID=Im8Wt3OeoVKimQpEjw9toVmEkgadsBQOoJh7dW2HpBI; '
        'JOID=U10RCkplalokAXfIfG0OAOB8XU9qKAZve3VAhhQKOBBAZCa8OKcXTE0FcsR54Seb4vHQB7PX5WHvNe65x7wALxQ=; '
        'osd=Wl8WBktsaF0oAH7Ke2EPCeJ7UU5jKgFjenxCgRgLMRJHaCe1OqAbTUQHdch46CWc7vDZBbTb5GjtMuK4zr4HIxU=; '
        'tst=r; '
        'Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1628665013; '
        'KLBRSID=e42bab774ac0012482937540873c03cf|1628666008|1628664260'
    ),
}
# Send the cookie and User-Agent with the request; the timeout keeps the script
# from hanging forever if the server never answers.
response = requests.get('https://www.zhihu.com/', headers=headers, timeout=10)
print(response.text)
json解析
获取今日头条的json数据接口,然后再发送请求
response