Urllib库
环境 python 3.x
导入
import urllib.parse
import urllib.request
打开一个网页
# Open a web page. urlopen() returns an http.client.HTTPResponse.
file = urllib.request.urlopen("http://www.baidu.com")
# Inspect the response: headers, HTTP status code, and the final URL
# (after any redirects). The bare prose line in the original notes
# was a syntax error and is now a comment.
print(file.info())
print(file.getcode())
print(file.geturl())
编码解码,URL标准不允许中文,冒号等
# Percent-encode / decode: the URL standard forbids characters such as
# ':' and non-ASCII text, so they must be escaped.
# quote()/unquote() are documented in urllib.parse; urllib.request.quote
# only works as an undocumented re-export, so use the real home module.
print(urllib.parse.quote("http://www.sina.com.cn"))
print(urllib.parse.unquote("http%3A//www.sina.com.cn"))
读所有信息,读一行信息
# Read the entire response body as bytes.
data = file.read()
# NOTE(review): read() above has already consumed the stream, so this
# readline() returns b'' — call readline() first if a real line is wanted.
dataline = file.readline()
写文件
# Save the downloaded bytes to disk. 'wb' because read() returned bytes.
# Use a context manager so the handle is closed even if write() raises
# (the original open()/close() pair leaked the handle on error).
with open("../1.html", "wb") as fhandle:
    fhandle.write(data)
或
# Alternative: urlretrieve() downloads straight into a local file and
# returns a (local_filename, headers) tuple.
filename=urllib.request.urlretrieve(
"http://edu.51cto.com",filename="../2.html")
# Clean up any cached temporary data left behind by urlretrieve().
urllib.request.urlcleanup()
模拟浏览器
build_opener()
# Simulate a browser: attach a User-Agent header to a custom opener
# built with build_opener(), then fetch through that opener.
url = "http://www.baidu.com"
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
)
opener = urllib.request.build_opener()
opener.addheaders = [headers]
file = opener.open(url)
或 add_header()
# Alternative: set the header on a Request object via add_header().
url = "http://www.baidu.com"
req = urllib.request.Request(url)
# Two fixes vs. the original notes: the header name must be
# "User-Agent" (hyphen, not "User_Agent"), and the value must be a
# single string literal — it was split across two lines, which is a
# Python syntax error.
req.add_header("User-Agent",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36")
file = urllib.request.urlopen(req)
print(file.getcode())
设置超时时间,并捕捉超时异常
# Set a timeout (in seconds) and catch the exception raised when the
# server does not answer in time. The original notes lost all
# indentation (a syntax error); structure restored here.
for i in range(1, 100):
    try:
        file = urllib.request.urlopen("http://www.baidu.com",
                                      timeout=1)
        data = file.read()
        print(len(data))
    except Exception as e:
        # Broad catch kept from the notes; the specific timeout
        # exceptions are urllib.error.URLError / socket.timeout.
        print("出现异常-->"+str(e))
构造GET请求:
# Build a GET request by appending the query string to the URL,
# then save the response body to disk.
keywd = "hello"
url = "http://www.baidu.com/s?wd=" + keywd
req = urllib.request.Request(url)
data = urllib.request.urlopen(req).read()
# Context manager so the output file is closed even if write() raises.
with open("../3.html", "wb") as fhandle:
    fhandle.write(data)
构造中文GET请求:
# GET with a non-ASCII keyword: percent-encode it first, since raw
# Chinese characters are not legal inside a URL. urllib.parse.quote is
# the documented API (urllib.request.quote is only an alias).
keywd = "你好"
url = "http://www.baidu.com/s?wd=" + urllib.parse.quote(keywd)
构造POST请求:
# Build a POST request: urlencode the form fields and pass the encoded
# bytes as the Request data argument (presence of data makes it a POST).
url = "http://www.iqianyue.com/mypost/"
postdata = urllib.parse.urlencode({
    "name": "1",
    "pass": "2"
}).encode('utf-8')
req = urllib.request.Request(url, postdata)
# Fixed UA string: the original dropped the leading 'M' ("ozilla/5.0").
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36")
data = urllib.request.urlopen(req).read()
# Context manager so the output file is closed even if write() raises.
with open("../4.html", "wb") as fhandle:
    fhandle.write(data)
使用代理服务器,防止ip屏蔽
这里是免费代理ip
def use_proxy(proxy_addr, url):
    """Fetch *url* through the HTTP proxy at *proxy_addr* and return
    the response body decoded as UTF-8.

    Note: the proxy-enabled opener is installed globally, so any later
    urlopen() call in this process also goes through the proxy.
    """
    import urllib.request
    handler = urllib.request.ProxyHandler({'http': proxy_addr})
    proxy_opener = urllib.request.build_opener(
        handler, urllib.request.HTTPHandler)
    urllib.request.install_opener(proxy_opener)
    response = urllib.request.urlopen(url)
    return response.read().decode('utf-8')
# Example call through a free public proxy. NOTE(review): free proxy
# addresses expire quickly — expect this to fail once it goes offline.
proxy_addr = '101.236.23.202:8866'
data = use_proxy(proxy_addr, "http://www.baidu.com")
print(len(data))
开启DebugLog
# Enable DebugLog: handlers built with debuglevel=1 print the raw
# HTTP(S) traffic of every request routed through the opener.
import urllib.request

debug_http = urllib.request.HTTPHandler(debuglevel=1)
debug_https = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(debug_http, debug_https)
urllib.request.install_opener(opener)
data = urllib.request.urlopen("http://edu.51cto.com")
URLError
# Handle request failures with URLError. HTTPError (a subclass of
# URLError) carries a .code attribute; plain network failures only
# have .reason — hence the hasattr() checks.
import urllib.request
import urllib.error
try:
    urllib.request.urlopen("http://blog.csdn.net")
except urllib.error.URLError as e:
    # Fixed: the original was missing the ':' here (syntax error).
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)