A big time sink from last year.
I wrote my first page crawler last year; this version builds on that original, with some fixes and a few added features:
1. It can fetch pages through proxy IPs.
2. It can check whether a proxy IP is still usable.
3. It can return the raw page, or an object parsed by BeautifulSoup (see the usage sketch below).
All of this is purely for my own amusement.
Still, I recommend using a crawler framework; it will save you a lot of detours (a quick comparison sketch follows the listing).
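Roughly how the class below is meant to be used (a minimal sketch against the full listing that follows; the target URL is only an example):

# Minimal usage sketch for the RequestURL class defined below.
fetcher = RequestURL()  # no argument: fall back to the built-in proxy list
result = fetcher.request2bs('http://www.baidu.com/', checkip=True)
if result:
    soup, statusCode = result  # BeautifulSoup object plus HTTP status code
    print soup.title
else:
    print 'gave up; the URL was logged to NoneDataUrl.txt'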
# -*- coding:utf8 -*-
#-------------------------------------------------------------------------------
# Name:         02_Source.ReadURL
# Abstract:     Fetch a web page, parse it with BS4, and return the parsed result
# Description:
#
# Created:      2012-12-10
# Author:       baihc
# Contact:      baihc@esrichina.com.cn
#
# Copyright:    (c) 2012 Gistech All Rights Reserved.
# License:      license
#-------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import urllib2
import time
import sys
import random
import copy

# Python 2 hack: force UTF-8 as the default string encoding.
reload(sys)
sys.setdefaultencoding('utf-8')

class RequestURL(object):
    def __init__(self, proxy=None):
        if proxy is None:
            # Default HTTP proxy list, "IP:port" format (these samples may no longer be live).
            self.proxy = ['114.80.136.112:7780', '125.39.66.150:80',
                          '61.55.141.11:81', '61.153.98.6:8080',
                          '202.171.253.111:80']
        else:
            self.proxy = proxy
        self.url = copy.copy(self.proxy)  # working copy of the proxy pool

    def checkIP(self, ip):
        '''Return True if the proxy IP is NOT usable, False otherwise.'''
        url = "http://www.baidu.com/"
        # Route the probe request through the candidate proxy.
        opener = urllib2.build_opener(urllib2.ProxyHandler({'http': ip}),
                                      urllib2.HTTPHandler(debuglevel=0))
        urllib2.install_opener(opener)
        # Pretend to be a regular browser.
        i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
                     "Referer": 'http://www.baidu.com'}
        req = urllib2.Request(url, headers=i_headers)
        try:
            readers = urllib2.urlopen(req, timeout=10)
        except Exception:
            return True
        # A 404 through this proxy also counts as unusable.
        return readers.getcode() == 404

    def request2bs(self, url, data=None, checkip=False, useProxy=True):
        '''Given a URL, return [BeautifulSoup-parsed page, status code], or None.'''
        result = self.request(url, data, checkip, useProxy)
        if result:
            return [BeautifulSoup(result[0], 'html.parser'), result[1]]
        else:
            return None

    def request(self, url, data=None, checkip=False, useProxy=True):
        '''Given a URL, return [raw page, status code], or None on failure.'''
        count = 0                # number of failed attempts so far
        sleep_download_time = 0  # how long to sleep after a failure
        time_out = 10            # per-request timeout in seconds; on expiry we sleep and retry
        flag = True
        while flag:
            # Refill the working proxy pool when it is nearly exhausted.
            if len(self.url) <= 1:
                self.url = copy.copy(self.proxy)
            http_dl = copy.copy(self.url)
            http_num = random.randrange(len(http_dl))  # index of a random proxy
            ipAddress = http_dl[http_num]
            if checkip:
                # Keep drawing proxies until one responds.
                ipflag = True
                while ipflag:
                    if self.checkIP(ipAddress):
                        http_dl.remove(ipAddress)
                        print '%s is invalid' % ipAddress
                        if len(http_dl) <= 1:
                            # Pool nearly exhausted: wait five minutes, then start over.
                            time.sleep(300)
                            http_dl = copy.copy(self.url)
                        http_num = random.randrange(len(http_dl))
                        ipAddress = http_dl[http_num]
                    else:
                        ipflag = False
            # Pool of User-Agent strings to rotate through.
            user_agents = [
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
                'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
                'IBM WebExplorer /v0.94',
                'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
                'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
                'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
                'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
                'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
                'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5',
            ]
            # Route through the chosen proxy (only installed when useProxy is set).
            opener = urllib2.build_opener(urllib2.ProxyHandler({'http': ipAddress}),
                                          urllib2.HTTPHandler(debuglevel=0))
            if useProxy:
                urllib2.install_opener(opener)
            # Pretend to be a regular browser, with a randomly chosen User-Agent.
            i_headers = {"User-Agent": user_agents[random.randrange(len(user_agents))],
                         "Referer": 'http://www.baidu.com'}
            req = urllib2.Request(url, data, headers=i_headers)
            try:
                # If the server takes longer than time_out, urlopen raises and we retry.
                readers = urllib2.urlopen(req, timeout=time_out)
                statusCode = readers.getcode()
                reader = readers.read()
                flag = False
                return [reader, statusCode]
            except Exception:
                count += 1  # one more failed attempt
                err = sys.exc_info()[1]
                # A 404 means the page itself is missing: log the URL and give up.
                if getattr(err, 'code', None) == 404:
                    f = open('NoneDataUrl.txt', 'a')
                    f.write(url + '\n')
                    f.close()
                    return None
                # Otherwise back off for a random interval and try another proxy.
                sleep_download_time = random.randrange(10)
                print 'sleep.....%s, proxy IP: %s' % (sleep_download_time, ipAddress)
                time.sleep(sleep_download_time)
                if count >= 10:
                    # Ten failures in a row: log the URL and give up.
                    f = open('NoneDataUrl.txt', 'a')
                    f.write(url + '\n')
                    f.close()
                    return None
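
Echoing the advice at the top: a crawler framework or a mature HTTP library removes most of this boilerplate. As a comparison only, here is a minimal sketch of the same fetch-through-a-proxy using the requests library (assuming requests is installed; the proxy address is one of the sample entries above and may no longer work):

# Rough equivalent of RequestURL.request() using the requests library.
import requests
from bs4 import BeautifulSoup

proxies = {'http': 'http://114.80.136.112:7780'}  # sample proxy; likely dead by now
try:
    resp = requests.get('http://www.baidu.com/', proxies=proxies, timeout=10)
    soup = BeautifulSoup(resp.content, 'html.parser')
    print soup.title
except requests.RequestException as e:
    print 'request failed: %s' % e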