发送一个GET请求
import urllib.request

# Send a plain GET request. urlopen() returns a response object rather than
# the page text; call read() on it to obtain the body.
# The with-statement closes the underlying connection when the block ends.
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response)
<http.client.HTTPResponse object at 0x000002888933D970>
import urllib.request

# Fetch the page and decode the raw bytes as UTF-8 so that Chinese text
# prints correctly. The with-statement closes the connection afterwards.
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.read().decode('utf-8'))
将所有数据保存到一个txt文件后,将后缀改为html,再打开就可以看到百度的界面了
file:///C:/Users/93983/Desktop/新建文本文档.html
发送一个POST请求:模拟用户真实登录
httpbin.org
{ "args": {}, "data": "", "files": {}, "form": {}, "headers": { "Accept": "application/json", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2", "Content-Length": "0", "Host": "httpbin.org", "Origin": "http://httpbin.org", "Referer": "http://httpbin.org/", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0", "X-Amzn-Trace-Id": "Root=1-61de3a9d-2ec682387d0423344d03124b" }, "json": null, "origin": "117.182.125.206", "url": "http://httpbin.org/post" }
import urllib.request
import urllib.parse  # URL-encoding helpers

# urlopen() needs the POST body as bytes: URL-encode the form fields first,
# then encode the resulting string as UTF-8.
data = urllib.parse.urlencode({"hello": "world"}).encode("utf-8")

# Passing data= makes urlopen() send a POST request instead of a GET.
# The with-statement closes the connection once the body has been printed.
with urllib.request.urlopen("http://httpbin.org/post", data=data) as response:
    print(response.read().decode('utf-8'))
{
"args": {},
"data": "",
"files": {},
"form": {
"hello": "world"
},
"headers": {
"Accept-Encoding": "identity",
"Content-Length": "11",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "httpbin.org",
"User-Agent": "Python-urllib/3.8", #我们是爬虫
"X-Amzn-Trace-Id": "Root=1-61de3b8b-332f7b365dd4e94c6e9f5d48"
},
"json": null,
"origin": "117.182.125.206",
"url": "http://httpbin.org/post"
}
访问时间过长,超时处理
import socket
import urllib.error
import urllib.request

# Use a deliberately tiny timeout so the request to httpbin fails fast.
try:
    with urllib.request.urlopen("http://httpbin.org/get", timeout=0.01) as response:
        print(response.read().decode('utf-8'))
except (urllib.error.URLError, socket.timeout):
    # A timeout while connecting is wrapped in URLError; a timeout while
    # reading the body surfaces as socket.timeout, so catch both.
    # NOTE: URLError also covers non-timeout failures (e.g. DNS errors); this
    # demo reports all of them with the same message, as the original did.
    print("time out!")
time out!
返回状态码
import urllib.request

# Print the HTTP status code of the response (200 means success).
# The with-statement closes the connection when the block ends.
with urllib.request.urlopen("http://httpbin.org/get") as response:
    print(response.status)
200
import urllib.error
import urllib.request

# urlopen() raises HTTPError for error status codes instead of returning a
# response. Per the recorded output, douban answers this plain urllib request
# with HTTP 418 (the request was recognised as a crawler), so catch the error
# and print it rather than letting the script die with a traceback.
try:
    with urllib.request.urlopen("http://douban.com") as response:
        print(response.status)
except urllib.error.HTTPError as e:
    print(e)
HTTP Error 418: #你被发现是一个爬虫
得到响应表头
import urllib.request

# Print every response header as a list of (name, value) tuples.
# The with-statement closes the connection when the block ends.
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.getheaders())
[('Bdpagetype', '1'), ('Bdqid', '0xa481ca57001e6640'), ('Cache-Control', 'private'), ('Content-Type', 'text/html;charset=utf-8'), ('Date', 'Wed, 12 Jan 2022 02:34:58 GMT'), ('Expires', 'Wed, 12 Jan 2022 02:34:01 GMT'), ('P3p', 'CP=" OTI DSP COR IVA OUR IND COM "'), ('P3p', 'CP=" OTI DSP COR IVA OUR IND COM "'), ('Server', 'BWS/1.1'), ('Set-Cookie', 'BAIDUID=B5C5DE0D5D5A79F3D3CA8EBABED35998:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'BIDUPSID=B5C5DE0D5D5A79F3D3CA8EBABED35998; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'PSTM=1641954898; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'BAIDUID=B5C5DE0D5D5A79F3C7F8D7876F4D4BDB:FG=1; max-age=31536000; expires=Thu, 12-Jan-23 02:34:58 GMT; domain=.baidu.com; path=/; version=1; comment=bd'), ('Set-Cookie', 'BDSVRTM=0; path=/'), ('Set-Cookie', 'BD_HOME=1; path=/'), ('Set-Cookie', 'H_PS_PSSID=35639_35104_31253_35627_34968_34584_35491_35580_35245_35688_26350_35623_35514_35562; path=/; domain=.baidu.com'), ('Traceid', '1641954898019408308211853978169228813888'), ('Vary', 'Accept-Encoding'), ('Vary', 'Accept-Encoding'), ('X-Frame-Options', 'sameorigin'), ('X-Ua-Compatible', 'IE=Edge,chrome=1'), ('Connection', 'close'), ('Transfer-Encoding', 'chunked')]
尝试访问豆瓣
访问豆瓣所需要的全部信息
用测试网址模拟浏览器访问
import urllib.request
import urllib.parse

# The eventual target is https://www.douban.com; use the httpbin test URL
# here because it echoes the request back, which lets us inspect the headers
# that were actually sent.
url = "http://httpbin.org/post"  # test URL

# Present a browser User-Agent instead of the default "Python-urllib/x.y".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
}

# The POST body must be bytes: URL-encode the form fields, then encode as UTF-8.
data = urllib.parse.urlencode({'name': 'eric'}).encode('utf-8')

# Build the request object that carries the URL, body, headers and method.
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")

# Send the request; the with-statement closes the connection afterwards.
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))
{
"args": {},
"data": "",
"files": {},
"form": {
"name": "eric"
},
"headers": {
"Accept-Encoding": "identity",
"Content-Length": "9",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "httpbin.org",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
"X-Amzn-Trace-Id": "Root=1-61de4241-466f806f46f051653362756d"
},
"json": null,
"origin": "117.182.125.206",
"url": "http://httpbin.org/post"
}
用get请求访问豆瓣
import urllib.request

url = "https://www.douban.com"

# Present a browser User-Agent instead of the default "Python-urllib/x.y".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
}

# No data= and no method= are given, so this request is sent as a GET.
req = urllib.request.Request(url=url, headers=headers)

# Send the request and print the page; the with-statement closes the
# connection once the body has been read.
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))