Crawler Program (Part 2) --- Reading Web Pages

An old project from last year.

I wrote the first version of this page-fetching crawler last year; this version builds on it and adds a few features:

1. It can fetch pages through a proxy IP.

2. It can check whether a proxy IP is still usable.

3. It can return either the raw page, or an object parsed by BeautifulSoup.

The above is purely for my own amusement.

I still recommend using a proper crawler framework instead; it will save you a lot of detours. The full listing follows, with a short usage sketch after it.
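
Before the listing, a minimal sketch of the idea behind features 1 and 2, assuming only the standard urllib2 module: route a single request through an HTTP proxy with ProxyHandler, and treat a proxy that errors out or times out as unusable. The proxy address here is just a placeholder.

import urllib2

# Route one request through an HTTP proxy (placeholder address).
proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8080'})
opener = urllib2.build_opener(proxy)
try:
    resp = opener.open('http://www.baidu.com/', timeout=10)
    print resp.getcode()  # 200 if the proxy relayed the request
except Exception as e:
    print 'proxy failed:', e  # treat this proxy as unusable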

# -*- coding:utf8 -*-
#-------------------------------------------------------------------------------
# Name:         02_Source.ReadURL
# Abstract:      Fetch a web page, parse it with BS4, and return the parsed result
# Description: 
#
# Created:      2012-12-10
# Author:        baihc
# Contact:       baihc@esrichina.com.cn
#
# Copyright:    (c) 2012 Gistech All Rights Reserved.
# License:        license
#-------------------------------------------------------------------------------

from bs4 import BeautifulSoup
import urllib2
import time
import sys
import random
import copy

reload(sys)
sys.setdefaultencoding('utf-8')   


class RequestURL():
    def __init__(self, poxy=None):
        if poxy is None:
            # Default list of usable HTTP proxies, format "IP:port"
            self.poxy = ['114.80.136.112:7780', '125.39.66.150:80', '61.55.141.11:81', '61.153.98.6:8080', '202.171.253.111:80']
        else:
            self.poxy = poxy
        # Working copy of the proxy list; refilled from self.poxy when it runs low
        self.url = copy.copy(self.poxy)
    
    def checkIP(self, ip):
        '''Probe a proxy against a known URL. Returns True if the proxy is INVALID, False if it works.'''
        url = "http://www.baidu.com/"
        # Route the probe request through the candidate proxy
        opener = urllib2.build_opener(urllib2.ProxyHandler({'http': ip}), urllib2.HTTPHandler(debuglevel=0))
        urllib2.install_opener(opener)
        # Pretend to be a regular browser
        i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
                     "Referer": 'http://www.baidu.com'}
        req = urllib2.Request(url, headers=i_headers)
        try:
            readers = urllib2.urlopen(req, timeout=10)
        except Exception:
            return True
        # A 404 from the probe URL also counts as an unusable proxy
        return readers.getcode() == 404
        
    def request2bs(self, url, data=None, checkip=False, usePoxy=True):
        '''Fetch a URL and return [BeautifulSoup-parsed page, status code], or None on failure.'''
        result = self.request(url, data, checkip, usePoxy)
        if result:
            return [BeautifulSoup(result[0]), result[1]]
        else:
            return None
        
    
    def request(self, url, data=None, checkip=False, usePoxy=True):
        '''Fetch a URL and return [raw page content, status code], or None on failure.'''
        count = 0  # number of failed attempts so far
        sleep_download_time = 0  # how long to sleep after a failed attempt
        time_out = 10  # request timeout in seconds; a slower response raises an error and we sleep

        flag = True
        while flag:
            # Refill the working proxy list when it runs low
            if len(self.url) <= 1:
                self.url = copy.copy(self.poxy)
            http_dl = copy.copy(self.url)
            http_num = random.randrange(len(http_dl))  # index of the proxy to use
            ipAdress = http_dl[http_num]
            if checkip:
                # Keep drawing proxies until one passes the availability check
                while self.checkIP(ipAdress):
                    http_dl.remove(ipAdress)
                    print '%s is invalid' % ipAdress
                    if len(http_dl) <= 1:
                        # All candidates exhausted: wait 5 minutes, then start over with the full list
                        time.sleep(300)
                        http_dl = copy.copy(self.url)
                    http_num = random.randrange(len(http_dl))
                    ipAdress = http_dl[http_num]
                        
            user_agents = ['Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
                           'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
                           'IBM WebExplorer /v0.94',
                           'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
                           'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
                           'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
                           'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
                           'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5']
            
            # Build an opener that routes traffic through the chosen proxy
            opener = urllib2.build_opener(urllib2.ProxyHandler({'http': ipAdress}), urllib2.HTTPHandler(debuglevel=0))
            if usePoxy:
                urllib2.install_opener(opener)  # enable the proxy
            # Pick a random User-Agent and set a Referer to look like a browser
            i_headers = {"User-Agent": user_agents[random.randrange(len(user_agents))], "Referer": 'http://www.baidu.com'}
            req = urllib2.Request(url, data, headers=i_headers)
            try:
                readers = urllib2.urlopen(req, timeout=time_out)
                statusCode = readers.getcode()
                reader = readers.read()  # read the page; if the response exceeds the timeout, an error is raised and handled below
                flag = False
                return [reader, statusCode]
            except Exception:
                count += 1  # one more failed attempt
                s = sys.exc_info()
                try:
                    # A 404 means the page does not exist: log the URL and give up
                    if s[1].code == 404:
                        f = open(ur'NoneDataUrl.txt', 'a')
                        f.write(url + '\n')
                        f.close()
                        return None
                except AttributeError:
                    # The exception was not an HTTPError (no .code attribute)
                    print type(s[1])
                # Sleep for a random interval before retrying with another proxy
                sleep_download_time = random.randrange(10)
                print 'sleep.....%s,proxyIP:%s' % (sleep_download_time, ipAdress)
                time.sleep(sleep_download_time)
                if count >= 10:  # give up after 10 failed attempts and log the URL
                    f = open(ur'NoneDataUrl.txt', 'a')
                    f.write(url + '\n')
                    f.close()
                    return None
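
A minimal usage sketch, assuming the class above is available in the current module; the target URL is just an example.

if __name__ == '__main__':
    # Use the built-in proxy list; pass poxy=['IP:port', ...] to supply your own.
    fetcher = RequestURL()

    # Raw page plus HTTP status code, checking the proxy first.
    result = fetcher.request('http://www.baidu.com/', checkip=True)
    if result:
        html, status = result
        print status, len(html)

    # Or get a BeautifulSoup object directly.
    parsed = fetcher.request2bs('http://www.baidu.com/')
    if parsed:
        soup, status = parsed
        print soup.title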

