1、request解析网页
# --- Fetch a page with a plain GET request ---
import urllib.request

# Some sites detect crawlers and simply never respond, leaving the call
# hanging forever — so always cap the wait with a timeout.
resp = urllib.request.urlopen("http://www.baidu.com", timeout=1)
# What we get back is the raw page source; pasting the decoded text into
# an .html file reproduces the Baidu page.
print(resp.read().decode("utf-8"))
# --- Send a POST request ---
import urllib.parse
import urllib.request

# httpbin.org echoes back whatever it receives — handy for inspecting requests.
# A POST needs a payload; real crawlers typically put a username, password,
# or cookie data here to mimic a logged-in user.
payload = bytes(urllib.parse.urlencode({"Hello": "world"}), encoding="utf-8")
resp = urllib.request.urlopen("http://httpbin.org/post", data=payload)
print(resp.read().decode("utf-8"))
///
{
"args": {},
"data": "",
"files": {},
"form": {
"Hello": "world"
},
"headers": {
"Accept-Encoding": "identity",
"Content-Length": "11",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "httpbin.org",
"User-Agent": "Python-urllib/3.7",
"X-Amzn-Trace-Id": "Root=1-5f3a4f2c-57550ddce6fa664460e4078c"
},
"json": null,
"origin": "111.61.124.9",
"url": "http://httpbin.org/post"
}
///
2、超时处理
# --- Timeout handling ---
# Two bugs fixed versus the original notes:
#   1. `timeout=1` was passed to print() instead of urlopen() — print() has no
#      such parameter (TypeError), and no timeout was actually applied.
#   2. The exception class is urllib.error.URLError; `urllib.URLError` does not
#      exist, so the except clause itself would raise AttributeError.
import urllib.error
import urllib.request

try:
    # Abort if the server does not respond within 1 second — some sites
    # detect crawlers and stall the connection indefinitely.
    response = urllib.request.urlopen("http://www.baidu.com", timeout=1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    # Raised on connection failure, including the timeout expiring.
    print("time out")
3、响应头
# --- Inspect the response status and headers ---
import urllib.request

response = urllib.request.urlopen("http://httpbin.org/get")
print(response.read().decode("utf-8"))
# HTTP status code: 200 = OK, 404 = not found, 418 = server spotted the crawler.
print(response.status)
# Every response header the server sent back, as a list of (name, value) pairs.
print(response.getheaders())
# Look up one specific header by name:
print(response.getheader("Server"))
4、浏览器伪装
要是直接访问会被当成爬虫,返回418。
关键是请求头里的User-Agent要是浏览器的User-Agent。
正常网站里,network里的headers有user-Agent
# --- Disguise the crawler as a browser via a Request object ---
import urllib.parse
import urllib.request

#url="https://www.douban.com"
url = "http://httpbin.org/post"
# Form data (e.g. username and password) urlencoded and converted to bytes
# for the POST body.
form = bytes(urllib.parse.urlencode({"Hello": "world"}), encoding="utf-8")
# A browser-like User-Agent keeps the server from rejecting us as a bot;
# Accept, Referer, etc. can be added to this dict the same way.
ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}
# Wrap url + body + headers into a single request object, then open it.
req = urllib.request.Request(url=url, data=form, headers=ua_headers, method="POST")
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
///
{
"args": {},
"data": "",
"files": {},
"form": {
"Hello": "world"
},
"headers": {
"Accept-Encoding": "identity",
"Content-Length": "11",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "httpbin.org",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
"X-Amzn-Trace-Id": "Root=1-5f3a569a-8451026cca89456099efd4d0"
},
"json": null,
"origin": "111.61.124.9",
"url": "http://httpbin.org/post"
}
///
#这是被当成了人访问
访问豆瓣试一试,不用登录,去掉data,访问方式也就变成了get
# --- Fetch Douban: no login needed, so the body is dropped and GET is used ---
import urllib.parse
import urllib.request

target = "https://www.douban.com"
# Same browser User-Agent as before — without it Douban returns 418.
browser_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}
page_request = urllib.request.Request(url=target, headers=browser_headers, method="GET")
response = urllib.request.urlopen(page_request)
# Succeeds now that the request looks like a real browser.
print(response.read().decode("utf-8"))