requests
基本使用
# Fetch the QQ homepage and print the text of the last <a> in each
# topic-list <li>. (Original had the loop body unindented — a syntax
# error; reformatted to valid Python.)
from lxml import etree
import requests

r = requests.get('https://www.qq.com/')
html = etree.HTML(r.text)
li = html.xpath('//div[@class="layout qq-main cf"]/div[@class="col col-2 fl"]/div[@class="mod m-topic"]/div[2]/ul/li')
print(len(li))
for item in li:
    # Relative XPath: text of the last <a> inside this <li>.
    x = item.xpath('a[last()]/text()')
    print(x)
1.GET请求
(1)GET请求添加额外信息
import requests

# GET request carrying query parameters and a custom User-Agent.
query = {
    'name': 'lu',
    'age': '22',
}
ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
}
# params= serializes the dict into the URL's query string for us.
r = requests.get('http://httpbin.org/get', params=query, headers=ua_headers)
print(r.text)
如果返回的是json数据,可以直接调用r.json()得到一个字典
(2)抓取二进制数据
图片,音频,视频这些文件都是二进制数据
import requests

# Images, audio and video are binary data: read r.content (bytes),
# not r.text, and write with mode 'wb'.
resp = requests.get('https://img.ivsky.com/img/bizhi/pre/201906/27/haian_shatan.jpg')
with open('immm.jpg', 'wb') as f:
    f.write(resp.content)
2.POST请求
import requests

# POST an empty form, then inspect the Response object's attributes.
payload = {}
r = requests.post('http://httpbin.org/post', data=payload)
print(r.text)
print(r.status_code)  # 200 means success
print(r.headers)      # response headers
print(r.cookies)
print(r.url)
print(r.history)      # redirect chain, if any
3.高级用法
(1)文件上传
文件上传会单独有一个files字段来标识
# Upload a file; the server reports it under a separate "files" field.
# Use a context manager so the file handle is closed after the request
# (the original opened the file and never closed it).
with open('immm.jpg', 'rb') as fh:
    files = {'file': fh}
    r = requests.post('http://httpbin.org/post', files=files)
(2)Cookies
获取Cookies
import requests

# Print every cookie the server sets on the response. (Original had
# the loop body unindented — a syntax error; reformatted.)
r = requests.post('https://www.baidu.com')
for key, value in r.cookies.items():
    print(key, value)
设置Cookies
import requests

cookies = 'a=3;b=4'

# Approach 1: send the raw cookie string via the Cookie header.
headers = {
    'Cookie': cookies,
    'User-Agent': '',
}
r = requests.post('https://www.baidu.com', headers=headers)
#1

# Approach 2: build a RequestsCookieJar and pass it as cookies=.
# (Original had the loop body unindented — a syntax error; fixed.)
jar = requests.cookies.RequestsCookieJar()
for cookie in cookies.split(';'):
    # strip() tolerates 'a=3; b=4'-style spacing; maxsplit=1 keeps any
    # '=' characters inside the value intact.
    key, value = cookie.strip().split('=', 1)
    jar.set(key, value)
r = requests.post('https://www.baidu.com', cookies=jar)
#2
(3)代理设置
使用proxies参数
import requests

# Route traffic through proxy servers, one entry per URL scheme.
proxy_map = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}
r = requests.get('https://www.baidu.com', proxies=proxy_map)
(4)Prepared Request
可以将请求表示为数据结构
import requests

# A request can be represented as a plain data structure (Request),
# converted to a PreparedRequest, and sent explicitly via a Session.
url = 'http://httpbin.org/get'
data = {}
headers = {}
session = requests.Session()
# Describe the request...
req = requests.Request('GET', url, data=data, headers=headers)
# ...prepare it (merges Session state, encodes the body)...
preped = session.prepare_request(req)
# ...and send it.
r = session.send(preped)
print(r.text)
urllib
基本使用
import urllib.request
from lxml import etree

# urlopen() returns raw bytes — decode before parsing as HTML.
resp = urllib.request.urlopen('https://blog.csdn.net/ljq1998/article/details/99423615')
doc = etree.HTML(resp.read().decode('utf-8'))
x = doc.xpath('//h1[@class="title-article"]/text()')
print(x)
1.urlencode()
将字典序列化为GET请求参数
2.quote()
将中文字符转化为URL编码
import urllib.parse

# quote() on its own converts non-ASCII text to percent-encoding.
print(urllib.parse.quote('刘嘉强'))

# Build a GET URL from a dict of parameters.
url = 'http://www.baidu.com?'
print(url)
data = {
    # Pass the raw string: urlencode() already percent-encodes values.
    # Calling quote() first (as the original did) double-encodes,
    # turning every '%' into '%25' and corrupting the URL.
    'name': '刘嘉强',
    'age': 22,
}
url = url + urllib.parse.urlencode(data)
print(url)