Urllib库
环境 python 3.x
导入
import urllib.parse
import urllib.request
打开一个网页
# Open a web page. urlopen() returns an http.client.HTTPResponse.
file = urllib.request.urlopen("http://www.baidu.com")
# Inspect the response: headers, HTTP status code, and the final URL
# (after any redirects). The bare prose line in the original notes
# was a syntax error and is now a comment.
print(file.info())
print(file.getcode())
print(file.geturl())
编码解码,URL标准不允许中文,冒号等
# Percent-encode / decode: the URL standard forbids characters such as
# ':' and non-ASCII text, so they must be escaped.
# quote()/unquote() are documented in urllib.parse; urllib.request.quote
# only works as an undocumented re-export, so use the real home module.
print(urllib.parse.quote("http://www.sina.com.cn"))
print(urllib.parse.unquote("http%3A//www.sina.com.cn"))
读所有信息,读一行信息
# Read the entire response body as bytes.
data = file.read()
# NOTE(review): read() above has already consumed the stream, so this
# readline() returns b'' — call readline() first if a real line is wanted.
dataline = file.readline()
写文件
# Save the downloaded bytes to disk. 'wb' because read() returned bytes.
# Use a context manager so the handle is closed even if write() raises
# (the original open()/close() pair leaked the handle on error).
with open("../1.html", "wb") as fhandle:
    fhandle.write(data)
或
# Alternative: urlretrieve() downloads straight into a local file and
# returns a (local_filename, headers) tuple.
filename=urllib.request.urlretrieve(
"http://edu.51cto.com",filename="../2.html")
# Clean up any cached temporary data left behind by urlretrieve().
urllib.request.urlcleanup()
模拟浏览器
build_opener()
# Simulate a browser: attach a User-Agent header to a custom opener
# built with build_opener(), then fetch through that opener.
url = "http://www.baidu.com"
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
)
opener = urllib.request.build_opener()
opener.addheaders = [headers]
file = opener.open(url)
或 add_header()
# Alternative: set the header on a Request object via add_header().
url = "http://www.baidu.com"
req = urllib.request.Request(url)
# Two fixes vs. the original notes: the header name must be
# "User-Agent" (hyphen, not "User_Agent"), and the value must be a
# single string literal — it was split across two lines, which is a
# Python syntax error.
req.add_header("User-Agent",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36")
file = urllib.request.urlopen(req)
print(file.getcode())
设置超时时间,并捕捉超时异常
# Set a timeout (in seconds) and catch the exception raised when the
# server does not answer in time. The original notes lost all
# indentation (a syntax error); structure restored here.
for i in range(1, 100):
    try:
        file = urllib.request.urlopen("http://www.baidu.com",
                                      timeout=1)
        data = file.read()
        print(len(data))
    except Exception as e:
        # Broad catch kept from the notes; the specific timeout
        # exceptions are urllib.error.URLError / socket.timeout.
        print("出现异常-->"+str(e))
构造GET请求:
# Build a GET request by appending the query string to the URL,
# then save the response body to disk.
keywd = "hello"
url = "http://www.baidu.com/s?wd=" + keywd
req = urllib.request.Request(url)
data = urllib.request.urlopen(req).read()
# Context manager so the output file is closed even if write() raises.
with open("../3.html", "wb") as fhandle:
    fhandle.write(data)
构造中文GET请求:
# GET with a non-ASCII keyword: percent-encode it first, since raw
# Chinese characters are not legal inside a URL. urllib.parse.quote is
# the documented API (urllib.request.quote is only an alias).
keywd = "你好"
url = "http://www.baidu.com/s?wd=" + urllib.parse.quote(keywd)
构造POST请求:
# Build a POST request: urlencode the form fields and pass the encoded
# bytes as the Request data argument (presence of data makes it a POST).
url = "http://www.iqianyue.com/mypost/"
postdata = urllib.parse.urlencode({
    "name": "1",
    "pass": "2"
}).encode('utf-8')
req = urllib.request.Request(url, postdata)
# Fixed UA string: the original dropped the leading 'M' ("ozilla/5.0").
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36")
data = urllib.request.urlopen(req).read()
# Context manager so the output file is closed even if write() raises.
with open("../4.html", "wb") as fhandle:
    fhandle.write(data)
使用代理服务器,防止ip屏蔽
这里是免费代理ip
def use_proxy(proxy_addr, url):
    """Fetch *url* through the HTTP proxy at *proxy_addr* and return
    the response body decoded as UTF-8.

    Note: the proxy-enabled opener is installed globally, so any later
    urlopen() call in this process also goes through the proxy.
    """
    import urllib.request
    handler = urllib.request.ProxyHandler({'http': proxy_addr})
    proxy_opener = urllib.request.build_opener(
        handler, urllib.request.HTTPHandler)
    urllib.request.install_opener(proxy_opener)
    response = urllib.request.urlopen(url)
    return response.read().decode('utf-8')
# Example call through a free public proxy. NOTE(review): free proxy
# addresses expire quickly — expect this to fail once it goes offline.
proxy_addr = '101.236.23.202:8866'
data = use_proxy(proxy_addr, "http://www.baidu.com")
print(len(data))
开启DebugLog
# Enable DebugLog: handlers built with debuglevel=1 print the raw
# HTTP(S) traffic of every request routed through the opener.
import urllib.request

debug_http = urllib.request.HTTPHandler(debuglevel=1)
debug_https = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(debug_http, debug_https)
urllib.request.install_opener(opener)
data = urllib.request.urlopen("http://edu.51cto.com")
URLError
# Handle request failures with URLError. HTTPError (a subclass of
# URLError) carries a .code attribute; plain network failures only
# have .reason — hence the hasattr() checks.
import urllib.request
import urllib.error
try:
    urllib.request.urlopen("http://blog.csdn.net")
except urllib.error.URLError as e:
    # Fixed: the original was missing the ':' here (syntax error).
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)