python爬虫之urllib

最新推荐文章于 2020-10-23 11:15:36 发布

fonyer

最新推荐文章于 2020-10-23 11:15:36 发布

阅读量321

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/fonyer/article/details/79256220

版权

python 专栏收录该内容

12 篇文章

订阅专栏

#coding=utf-8    
#urllib操作类  

import time
import urllib.request
import urllib.parse
from urllib.error import HTTPError, URLError
import sys
class myUrllib:
    """Thin helper around ``urllib.request`` for simple GET / POST requests.

    All methods are static; the class only serves as a namespace.
    Response bodies are decoded as UTF-8 and returned as ``str``.
    """

    @staticmethod
    def get_headers(headers):
        """Return the request headers: built-in defaults overridden by *headers*.

        Args:
            headers: Mapping of extra/override header names to values,
                or a falsy value (``None`` / ``{}``) to use the defaults only.

        Returns:
            A new dict containing the default headers updated with *headers*.
        """
        default_headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            #'Referer': r'http://www.baidu.com/',
            'Connection': 'keep-alive',
            # Hard-coded cookie string sent with every request.
            'Cookie':'uuid_tt_dd=2845574184150781887; _ga=GA1.2.1608505838; dc_tos=p308'
        }
        if not headers:
            return default_headers
        return {**default_headers, **headers}

    @staticmethod
    def _fetch(request):
        """Open *request* and return the body decoded as UTF-8.

        HTTP and URL errors are printed (same output as before) and
        reported to the caller as ``None`` instead of falling through to
        an unbound local variable.
        """
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(request) as response:
                return response.read().decode('utf-8')
        except HTTPError as e:
            # HTTPError is a subclass of URLError, so it must be caught first.
            print (e.code,e.reason)
        except URLError as e:
            print (e.reason)
        return None

    @staticmethod
    def get(url,headers=None):
        """Send a GET request and return the response body.

        Args:
            url: Target URL (converted with ``str()``).
            headers: Optional mapping of headers merged over the defaults
                from :meth:`get_headers`.

        Returns:
            The response body as ``str``, or ``None`` if the request
            failed with an ``HTTPError`` or ``URLError``.
        """
        request = urllib.request.Request(
            str(url),
            headers=myUrllib.get_headers(headers),
            method='GET',
        )
        return myUrllib._fetch(request)

    @staticmethod
    def post(url,data=None,headers=None):
        """Send a form-encoded POST request and return the response body.

        Args:
            url: Target URL (converted with ``str()``).
            data: Optional mapping of form fields; it is URL-encoded and
                sent as UTF-8 bytes. ``None`` / ``{}`` sends an empty body.
            headers: Optional mapping of headers merged over the defaults
                from :meth:`get_headers`.

        Returns:
            The response body as ``str``, or ``None`` if the request
            failed with an ``HTTPError`` or ``URLError``.
        """
        binary_data = urllib.parse.urlencode(data or {}).encode('utf-8')
        request = urllib.request.Request(
            str(url),
            data=binary_data,
            headers=myUrllib.get_headers(headers),
            method='POST',
        )
        # Useful accessors on the response object returned by urlopen():
        #   info()    - the headers sent back by the remote server
        #   getcode() - the HTTP status code (200 = OK, 404 = not found)
        #   geturl()  - the URL actually retrieved (after redirects)
        return myUrllib._fetch(request)

		

# Demo: issue a GET (and, further down, a POST) against a local test
# endpoint, sending an extra Referer header on top of the defaults.
referer_header = {'Referer': r'https://www.baidu.com/'}

getInfo = myUrllib.get(
    'http://localhost:88/test/c.php?act=category',
    referer_header,
)
print(getInfo)

# The script stops here, so the POST demo below is not executed while
# this call is in place.
sys.exit()

postInfo = myUrllib.post(
    'http://localhost:88/test/c.php',
    {'id':1010},
    referer_header,
)
print(postInfo)

d:\python\crawler>python urllib01.py
HTTP_HOST:
localhost:88

HTTP_USER_AGENT:
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/63.0.3239.108 Safari/537.36

HTTP_COOKIE:
uuid_tt_dd=2845574184150781887; _ga=GA1.2.1608505838; dc_tos=p308

HTTP_REFERER:
https://www.baidu.com/

REQUEST_METHOD:
GET

GET DATA:
array(1) {
["act"]=>
string(8) "category"
}

#设置代理

#coding=utf-8
import urllib.request
import random
from urllib.error import HTTPError, URLError

def proxy_handler(url,iplist,wfile):
    """Fetch *url* through the first working HTTP proxy and save the page.

    Each proxy in *iplist* is tried in order. For the first one that
    answers with status 200, the body is decoded as UTF-8 and written to
    *wfile*; the remaining proxies are not tried. Progress and errors are
    printed to stdout.

    Args:
        url: URL to download.
        iplist: Iterable of ``'host:port'`` strings used as HTTP proxies.
        wfile: Path of the output file (written as UTF-8, overwritten).

    Returns:
        None.
    """
    user_agent = r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
    #ip = random.choice(iplist)
    for ip in iplist:
        print('*'*20,'\n ip:',ip)
        proxy_support = urllib.request.ProxyHandler({'http':ip})
        opener = urllib.request.build_opener(proxy_support)
        opener.addheaders = [('User-Agent',user_agent)]
        # Kept from the original: the opener also becomes the process-wide
        # default used by urllib.request.urlopen().
        urllib.request.install_opener(opener)
        try:
            # The context manager closes the connection on every path.
            with opener.open(url) as response:
                code = response.getcode()
                # Stored separately so a redirect does not change the URL
                # requested through the next proxy.
                final_url = response.geturl()
                print('*'*20,'\n url:',final_url)
                print('*'*20,'\n code:',code)
                info = response.info()
                print('*'*20,'\n info:',info)
                if code != 200:
                    continue
                page = str(response.read(), encoding='utf-8')
        except HTTPError as e:
            # HTTPError is a subclass of URLError, so it must be caught first.
            print (e.code,e.reason)
            continue
        except URLError as e:
            print (e.reason)
            continue
        except OSError as e:
            # Connection resets / timeouts while talking to a bad proxy are
            # not always wrapped in URLError; move on to the next proxy.
            print (e)
            continue

        # Write outside the try block so file-system errors are not
        # mistaken for proxy failures.
        with open(wfile,'w',encoding='UTF-8') as fw:
            fw.write(page)
        print('*'*20,'\n write file:',wfile)
        break
	

# Demo: download the page through the first proxy in the list that works
# and save it to page.txt.
wfile = 'page.txt'
iplist = [
    '182.42.244.169:808',
    '122.72.18.34:80',
    '52.44.16.168:3129',
]
url = r'http://ip.chinaz.com/'
proxy_handler(url=url, iplist=iplist, wfile=wfile)

d:\python\crawler>python proxy01.py
********************
ip: 182.42.244.169:808
[WinError 10061] 由于目标计算机积极拒绝，无法连接。
********************
ip: 122.72.18.34:80
********************
url: http://ip.chinaz.com/
********************
code: 200
********************
info: Cache-Control: private
Content-Length: 33900
Content-Type: text/html; charset=utf-8
Server: Microsoft-IIS/7.5
X-AspNet-Version: 4.0.30319
Set-Cookie: qHistory=aHR0cDovL2lwLmNoaW5hei5jb20rSVAv5pyN5Yqh5Zmo5Zyw5Z2A5p+l6K
i; domain=.chinaz.com; expires=Tue, 05-Feb-2019 15:03:42 GMT; path=/
X-Powered-By: ASP.NET
Date: Mon, 05 Feb 2018 15:03:42 GMT
X-Cache: MISS from GD-SZ-WEB-01
X-Cache-Lookup: MISS from GD-SZ-WEB-01:80
Connection: close

********************
write file: page.txt