"""Tutorial notes: fetching URLs with urllib.

Covers: basic fetch, percent-encoding, three ways to set a User-Agent,
GET/POST parameter encoding, proxies, debug logging, and the error classes.
Each demo_* function is self-contained; nothing runs on import.
"""
import urllib.error
import urllib.parse
import urllib.request

# Mobile Chrome User-Agent used by all header examples below (defined once
# instead of repeating the literal three times).
UA = ('Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
      'AppleWebKit/537.36 (KHTML, like Gecko) '
      'Chrome/63.0.3239.84 Mobile Safari/537.36')


def demo_basic_fetch():
    """Open a URL and inspect the response object."""
    data = urllib.request.urlopen(url='http://www.sina.com')
    print(data.info())     # response headers / metadata
    print(data.getcode())  # HTTP status code of the fetch
    print(data.geturl())   # the URL that was actually fetched
    print(data.read())     # raw body bytes


def demo_quote_unquote():
    """Percent-encode and decode a URL string.

    quote() escapes special characters (including non-ASCII such as
    Chinese).  Note ':' is escaped too:
        'http://www.baidu.com' -> 'http%3A//www.baidu.com'
    Returns (encoded, decoded) so the round trip can be checked.
    """
    # NOTE: canonical home of quote/unquote is urllib.parse (the original
    # used urllib.request, which only works via a re-export).
    encoded = urllib.parse.quote('http://www.baidu.com')
    decoded = urllib.parse.unquote(encoded)  # -> 'http://www.baidu.com'
    return encoded, decoded


# --- Three ways to attach a User-Agent header.  Mind the data shape:
# --- style 1 wants a list of (name, value) tuples, style 2 wants a dict.

def demo_ua_opener(url):
    """Style 1: set the UA on an opener via its addheaders attribute."""
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', UA)]  # list of (name, value) tuples
    data = opener.open(url)                   # fetch carries the UA
    print(data.read())


def demo_ua_request_headers(url):
    """Style 2: pass the UA through Request's headers= dict."""
    request = urllib.request.Request(url, headers={'User-Agent': UA})
    data = urllib.request.urlopen(request)
    print(data.read())


def demo_ua_add_header(url):
    """Style 3: Request.add_header() takes two positional arguments.

    (Fixed: the original called add_header('User-Agent': ...) with a
    colon, which is a SyntaxError — it is a method call, not a dict.)
    """
    request = urllib.request.Request(url)
    request.add_header('User-Agent', UA)
    data = urllib.request.urlopen(request)
    print(data.read())


def demo_get():
    """GET: percent-encode a Chinese keyword into the query string."""
    key = '周星驰电影全集'
    new_key = urllib.parse.quote(key)  # encode non-ASCII for use in a URL
    url = 'http://www.baidu.com/ssid=e1156d6c/from=844b/s?word=' + new_key
    request = urllib.request.Request(url)
    return urllib.request.urlopen(request)


def demo_post(url, params=None):
    """POST: a params dict needs two encoding steps.

    urlencode() turns the dict into 'k=v&...' and .encode() turns that
    str into the bytes urlopen requires for data=.
    (Fixed: urlencode is in urllib.parse, not urllib.request.)
    """
    params = {} if params is None else params  # avoid mutable default
    body = urllib.parse.urlencode(params).encode('utf-8')
    request = urllib.request.Request(url, data=body)
    return urllib.request.urlopen(request)


def demo_proxy():
    """Route requests through an HTTP proxy.

    Free proxy lists: e.g. http://www.xicidaili.com/
    """
    proxy_addr = '118.114.77.47:8080'
    url = 'http://www.baidu.com'
    # ProxyHandler takes a {scheme: 'host:port'} mapping — note the shape.
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    # HTTPHandler must be added alongside the proxy handler.
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)  # make it the global opener
    # With a global opener installed, plain urlopen() uses the proxy.
    data = urllib.request.urlopen(url)
    print(data.read().decode('utf8'))


def demo_debuglog(url):
    """Print wire-level debug logs by building handlers with debuglevel=1."""
    httphd = urllib.request.HTTPHandler(debuglevel=1)
    httpshd = urllib.request.HTTPSHandler(debuglevel=1)
    opener = urllib.request.build_opener(httphd, httpshd)
    urllib.request.install_opener(opener)
    return urllib.request.urlopen(url)


# Error handling — two exception classes:
#   urllib.error.URLError   raised when the request itself fails
#   urllib.error.HTTPError  subclass of URLError, for HTTP error statuses
# Typical causes of URLError: (1) cannot reach the server, (2) the remote
# URL does not exist, (3) no network, (4) an HTTPError was triggered.