1.使用 urllib 库爬取百度首页
# Fetch the Baidu homepage with urllib and print the first line plus the body.
# BUG FIX: the original called read() and then readline(); after read() the
# response stream is exhausted, so readline() always returned b''.  Read the
# body once and derive the first line from the bytes instead.
import urllib.request

with urllib.request.urlopen("http://www.baidu.com") as file:
    data = file.read()                  # entire response body (bytes)
dataline = data.split(b"\n", 1)[0]      # first line of the body
print(dataline)
print(data)
2.将爬取的百度网页保存到本地
第一种方法,读写文件
# Fetch the Baidu homepage and save it to a local file (method 1: read/write).
# BUG FIX: readline() after read() always returned b'' (stream exhausted), so
# the first line is now derived from the bytes already read.  The output file
# is opened with a context manager so it is closed even if write() raises.
import urllib.request

with urllib.request.urlopen("http://www.baidu.com") as file:
    data = file.read()                  # entire response body (bytes)
dataline = data.split(b"\n", 1)[0]      # first line of the body
print(dataline)
print(data)
with open("baidu.html", "wb") as filehandle:
    filehandle.write(data)
第二种方法,通过 urlretrieve
# Download the Baidu homepage straight to disk (method 2: urlretrieve).
import urllib.request

file = urllib.request.urlretrieve(
    "http://www.baidu.com", filename="baidu1.html"
)
# Remove any cache files urlretrieve created while downloading.
urllib.request.urlcleanup()
3.其他常用的 urllib 库方法
# Other commonly used urllib helpers: response metadata and URL (un)quoting.
import urllib.request

response = urllib.request.urlopen("http://www.baidu.com")
print(response.info())      # response headers
print(response.getcode())   # HTTP status code
print(response.geturl())    # the URL that was actually fetched
print(urllib.request.quote("http://www.baidu.com"))     # percent-encode
print(urllib.request.unquote("http://www.baidu.com"))   # percent-decode
输出:
Date: Sat, 13 Jan 2018 05:49:19 GMT
Content-Type: text/html; charset=utf-8
Transfer-Encoding: chunked
Connection: Close
Vary: Accept-Encoding
Set-Cookie: BAIDUID=9981F2EFCB230C032146C2145B91631E:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: BIDUPSID=9981F2EFCB230C032146C2145B91631E; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: PSTM=1515822559; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: BDSVRTM=0; path=/
Set-Cookie: BD_HOME=0; path=/
Set-Cookie: H_PS_PSSID=1450_25548_21113_18559_17001; path=/; domain=.baidu.com
P3P: CP=" OTI DSP COR IVA OUR IND COM "
Cache-Control: private
Cxy_all: baidu+8cff54495dd050c33c72b1048fd134d8
Expires: Sat, 13 Jan 2018 05:49:18 GMT
X-Powered-By: HPHP
Server: BWS/1.1
X-UA-Compatible: IE=Edge,chrome=1
BDPAGETYPE: 1
BDQID: 0xac3620350004e8ef
BDUSERID: 0
200
http://www.baidu.com
http%3A//www.baidu.com
http://www.baidu.com
4.添加 User-Agent模拟浏览器访问
使用 build_opener() 修改报头
# Pretend to be a browser by attaching a User-Agent header via build_opener().
import urllib.request

url = "http://www.baidu.com"
ua_header = ("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
custom_opener = urllib.request.build_opener()
# addheaders is a list of (name, value) pairs sent with every request.
custom_opener.addheaders = [ua_header]
data = custom_opener.open(url).read()
print(data)
使用 add_header()添加报头
# Pretend to be a browser by attaching a User-Agent header via add_header().
import urllib.request

url = "http://www.baidu.com"
request = urllib.request.Request(url)
request.add_header("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
page_bytes = urllib.request.urlopen(request).read()
print(page_bytes)
5.通过 timeout控制超时时间
# Control request timeouts with the `timeout` argument.
# FIX: the original snippet had lost all indentation (a syntax error);
# structure reconstructed from the obvious try/except-inside-loop intent.
import urllib.request

for _ in range(100):
    try:
        url = "http://www.baidu.com"
        # Abort any request that takes longer than 0.3 seconds.
        data = urllib.request.urlopen(url, timeout=0.3)
        print(len(data.read()))
    except Exception as e:
        print("出错{}".format(e))
6.HTTP 协议请求之 GET
#百度搜索关键词 get 请求,拼凑 url即可实现
# HTTP GET: a Baidu keyword search is just the base URL plus an encoded query.
# FIX: the output file is now opened with a context manager so the handle is
# closed even if write() raises (the original's explicit close() was skipped
# on error).
import urllib.request

url = "http://www.baidu.com/s?wd="
key = "python 学习"
key_code = urllib.request.quote(key)    # percent-encode the keyword
url_all = url + key_code
req = urllib.request.Request(url_all)
data = urllib.request.urlopen(req).read()
with open("baidu2.html", "wb") as filebaidu:
    filebaidu.write(data)
7.HTTP 协议请求之 POST
#百度搜索关键词 post 请求
# HTTP POST: send form data to a test endpoint.
# FIX: the output file is now opened with a context manager so the handle is
# closed even if write() raises (the original's explicit close() was skipped
# on error).
import urllib.request
import urllib.parse

url = "http://www.iqianyue.com/mypost/"
# urlencode the form fields, then encode() the result to utf-8 bytes,
# since Request requires a bytes payload for POST data.
postdata = urllib.parse.urlencode({
    "name": "zhangkun", "pass": "zhangkun"
}).encode('utf-8')
req = urllib.request.Request(url, postdata)  # supplying data makes this a POST
req.add_header("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
data = urllib.request.urlopen(req).read()
with open("post.html", "wb") as filebaidu:
    filebaidu.write(data)
8.使用代理服务器
使用代理 IP 防止本地 IP 被封
西刺免费代理:http://www.xicidaili.com/
def use_proxy(proxy_addr, url):
    """Fetch *url* through the HTTPS proxy at *proxy_addr* and return the
    page decoded as UTF-8 text.

    FIX: the original snippet had lost all indentation (a syntax error);
    the function body has been reconstructed.
    """
    import urllib.request
    # Route https traffic through the given proxy ("host:port").
    proxy = urllib.request.ProxyHandler({'https': proxy_addr})
    # Build a custom opener that knows about the proxy.
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # Install it as the global default so plain urlopen() uses the proxy.
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data

proxy_addr = "219.138.58.117:3128"
data = use_proxy(proxy_addr, "http://www.baidu.com")
print(data)
9.开启 DebugLog
# Turn on DebugLog: handlers built with debuglevel=1 print the raw HTTP
# conversation to stdout for every request made through the opener.
import urllib.request

http_handler = urllib.request.HTTPHandler(debuglevel=1)
https_handler = urllib.request.HTTPSHandler(debuglevel=1)
debug_opener = urllib.request.build_opener(http_handler, https_handler)
urllib.request.install_opener(debug_opener)
data = urllib.request.urlopen("http://www.baidu.com")
10.异常处理 URLError
# Exception handling with URLError.
# FIX: the original snippet had lost all indentation (a syntax error);
# the try/except structure has been reconstructed.
import urllib.request
import urllib.error

try:
    urllib.request.urlopen("http://")   # deliberately malformed URL
except urllib.error.URLError as e:
    # HTTPError (a URLError subclass) carries .code; plain URLError may only
    # carry .reason — probe with hasattr before printing each.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)