# Issue a simple GET request.
# (Was a bare text line — without a leading '#' it is parsed as a name
# expression and raises NameError at runtime.)
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen("http://www.baidu.com", timeout=1)
    # Decode the fetched page body as UTF-8 before printing.
    print(response.read().decode('utf-8'))
except urllib.error.URLError as e:
    # urlopen raises URLError on timeout or network failure.
    print("time out\n")
# Issue a POST request: urlopen() switches from GET to POST as soon as a
# `data` payload is supplied.
import urllib.request
import urllib.parse

# urlencode() builds the form-encoded string; bytes() converts it to the
# binary body that urlopen() requires for POST data.
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
# Decode the server's response body as UTF-8 before printing.
print(response.read().decode("utf-8"))
# Timeout handling and response inspection.
import urllib.request
import urllib.error

try:
    # `timeout` is the deadline in seconds; exceeding it raises URLError.
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("time out!")

response = urllib.request.urlopen("http://www.baidu.com", timeout=2)
# HTTP status code of the response.
print(response.status)
# getheader(name) fetches a single response header;
# getheaders() would return all of them.
print(response.getheader("Server"))
# Anti-anti-crawler: impersonate a browser via the User-Agent header.
import urllib.request
import urllib.parse

url = "http://httpbin.org/post"
# More fields (cookies, etc.) can be added; the more realistic the headers,
# the harder the request is to distinguish from a real browser.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36"}
# Form payload, binary-encoded as urlopen() requires.
data = bytes(urllib.parse.urlencode({'name': 'eric'}), encoding="utf-8")
# Request bundles url, body, headers, and method into one object that can be
# handed to urlopen(); the method must match what the endpoint expects.
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
# Example: use a spoofed browser User-Agent to get past Douban's
# anti-crawler check.
import urllib.request

url = "https://www.douban.com"
headers = {"User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36"}
req = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))