Basic Python Web Crawler
Requests
1. requests.get()     # fetch an HTML page
2. requests.head()    # fetch only the headers of an HTML page
3. requests.post()    # send a POST request
4. requests.put()     # send a PUT request
5. requests.patch()   # send a partial-update (PATCH) request
6. requests.delete()  # send a DELETE request
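Only get() and post() are walked through in detail below. As a quick sketch of the remaining verbs (using http://httpbin.org, a public echo service, as a stand-in URL; this is an illustrative assumption, not part of the examples that follow):

import requests
r = requests.head('http://httpbin.org/get')        # headers only, no response body
print(r.headers)
r = requests.put('http://httpbin.org/put', data={'key1': 'value1'})      # replace a resource
r = requests.patch('http://httpbin.org/patch', data={'key1': 'value1'})  # partial modification
r = requests.delete('http://httpbin.org/delete')   # delete a resource
print(r.status_code)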
requests.get()
import requests
r = requests.get('http://www.baidu.com')
r.status_code        # HTTP status code of the response
r.headers            # response headers
r.encoding           # encoding guessed from the response headers
r.apparent_encoding  # encoding inferred from the response content
r.content            # response body as raw bytes
r.text               # response body as text (r.content decoded to unicode using r.encoding)
import requests
r = requests.get('http://www.baidu.com')
r.encoding = r.apparent_encoding
r.text
r.json()             # parse the response body as JSON
r = requests.get('https://www.github.com/timeline.json')
r.headers['Content-Type']
r.json()
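Before calling r.json(), it is usually worth confirming that the request succeeded, since r.json() fails on a non-JSON body. A minimal sketch, assuming http://httpbin.org/get as an example JSON endpoint:

import requests
r = requests.get('http://httpbin.org/get')
r.raise_for_status()   # raises requests.HTTPError for 4xx/5xx status codes
data = r.json()        # raises ValueError if the body is not valid JSON
print(data)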
User-Agent identity
identity = {'user-agent': 'yourUserAgent'}
import requests
url = "http://www.qiushibaike.com/text/"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
r = requests.get(url=url, headers=headers)
r.encoding = "utf8"
print(r.text)
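To check that the custom User-Agent actually reached the server, http://httpbin.org/get (which echoes back the request headers) can be used; the header value here is just the placeholder from above:

import requests
headers = {'User-Agent': 'yourUserAgent'}
r = requests.get('http://httpbin.org/get', headers=headers)
print(r.json()['headers']['User-Agent'])   # echoes back 'yourUserAgent'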
proxies (proxy IPs)
import requests
proxies = {
"https": "https://114.113.126.87:80",
"http": "http://114.250.25.19:80",
}
r = requests.get("https://www.taobao.com", proxies=proxies)
print(r.content.decode("utf-8"))
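Free proxy IPs like the ones above go stale quickly, so in practice a timeout and exception handling are worth adding. A sketch reusing the same (possibly dead) proxies:

import requests
proxies = {
    "https": "https://114.113.126.87:80",
    "http": "http://114.250.25.19:80",
}
try:
    r = requests.get("https://www.taobao.com", proxies=proxies, timeout=5)
    print(r.status_code)
except requests.exceptions.RequestException as e:
    print("proxy request failed:", e)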
requests.post()
kv = {'key1':'value1', 'key2':'value2'}
r = requests.post('http://httpbin.org/post', data = kv)
print(r.text)
{
"args":{
},
"data":"",
"files":{
},
"form":{
"key1":"value1",
"key2":"value2"
},
"headers":{
"Accept":"*/*",
"Accept-Encoding":"gzip, deflate",
"Connection":"close",
"Content-Length":"23",
"Content-Type":"application/x-www-form-urlencoded",
"Host":"httpbin.org",
"User-Agent":"python-requests/2.18.4"
},
"json":null,
"origin":"183.240.196.81",
"url":"http://httpbin.org/post"
}
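The "form" field above shows how data= is encoded as form fields. To send a JSON body instead, requests also accepts a json= argument, which sets Content-Type to application/json and shows up in httpbin's "json" field rather than "form":

import requests
kv = {'key1': 'value1', 'key2': 'value2'}
r = requests.post('http://httpbin.org/post', json=kv)
print(r.json()['json'])   # {'key1': 'value1', 'key2': 'value2'}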