I'm writing a crawler (gevent + requests + redis-py) and have run into some problems. Does anyone have a good solution?

My crawler works roughly like this: I want to collect all the listing URLs from certain list pages, and there are a lot of pages. I walk through the pages and scrape their contents; whenever a page request fails, I save that URL to a database, and on the next pass I pull the failed URLs back out of the database and process them again, looping like this until everything has been handled. Enough talk, straight to the code (a more detailed description of the problem is in the code comments). The code has gotten a bit messy; gist: https://gist.github.com/penkzhou/a657720be302f72269ca :

# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")

from gevent import monkey
monkey.patch_all()  # must run before the network libraries below are imported

import requests
import redis
import gevent
from gevent.pool import Pool
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient, ReadPreference
import json
import redis.connection
from requests.exceptions import ConnectionError  # needed below: ConnectionError is not a builtin in Python 2

# Point redis-py at gevent's cooperative socket so redis calls only block the
# current greenlet (monkey.patch_all above already patches the stdlib socket;
# this assignment makes the intent explicit).
redis.connection.socket = gevent.socket

# JobProjectConfiguration is presumably the project's config object; its
# import is omitted from this snippet.
mongo_connection = MongoClient(
    '%s:%d' % (
        JobProjectConfiguration.save_mongo_host,
        JobProjectConfiguration.save_mongo_port),
    read_preference=ReadPreference.SECONDARY,
    max_pool_size=10, use_greenlets=True)
mongo_db = mongo_connection.jobdigg

redis_connection = redis.ConnectionPool(
    host=JobProjectConfiguration.url_queue_redis_host,
    port=JobProjectConfiguration.url_queue_redis_port,
    db=JobProjectConfiguration.url_queue_redis_db
)

redis_proxy_pool = redis.ConnectionPool(
    host=JobProjectConfiguration.proxy_queue_redis_host,
    port=JobProjectConfiguration.proxy_queue_redis_port,
    db=JobProjectConfiguration.proxy_queue_redis_db
)

# In the full code proxy_pool is presumably a proxy-pool object with a
# getProxy() method (see GenerateUrl below); this list is a placeholder.
proxy_pool = []
pool_num = 100  # greenlets per gevent Pool

header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36"
}

def WYUrlGenerator():
    print '51 Dig start : the url...'
    start = time.time()
    redis_db = redis.Redis(connection_pool=redis_connection)
    urllist = WYJobUrlYield()
    gpool = Pool(pool_num)
    for uargs in urllist:
        gpool.spawn(GenerateUrl, uargs)
    gpool.join()
    # From here on, keep pulling URLs out of the error set and reprocessing
    # them, until the set has been emptied
    length = redis_db.scard("error_url_list")
    while length > 0:
        errorlist = ErrorUrlGenerator()
        epool = Pool(pool_num)
        for url in errorlist:
            epool.spawn(GenerateUrl, url)
        epool.join()
        length = redis_db.scard("error_url_list")
    end = time.time()
    print 'dig end : the url...all spend time is %0.2f' % (end - start)

def WYJobUrlYield():
    for page in xrange(3000):
        page += 1
        url = "http://some.crawl.url with page num %s" % page
        jobitem = {
            "url": url,
            "type": "jobtype"
        }
        jobvalue = json.dumps(jobitem)
        yield jobvalue

# Pull the URLs back out of the error set so they can be processed again
def ErrorUrlGenerator():
    redis_db = redis.Redis(connection_pool=redis_connection)
    urllist = redis_db.smembers("error_url_list")
    for url in urllist:
        yield url

def GenerateUrl(sourcejob):
    redis_db = redis.StrictRedis(connection_pool=redis_connection)
    pipe = redis_db.pipeline()
    newitem = json.loads(sourcejob)
    url = newitem["url"]
    urltype = newitem["type"]
    try:
        ip = proxy_pool.getProxy()
        proxy = {"http": "http://" + ip["proxy"]}
        # Set a timeout here so no single request drags on and blocks the
        # requests behind it; raise ConnectionError after 5 seconds
        timeout = gevent.Timeout(5, ConnectionError)
        timeout.start()
        r = requests.get(url, headers=header, proxies=proxy)
        jobs = BeautifulSoup(r.text)
        if urltype == "urltype":  # collect all the URLs on the page, then save them to a redis set
            results = jobs.findAll("a", {"class": "classname"})
            for result in results:
                url = result["href"]
                urlitem = {
                    "url": url,
                    "type": "urltype"
                }
                urlvalue = json.dumps(urlitem)
                # save the collected URL into the url_list redis set
                # (sadd the JSON string, not the dict)
                pipe.sadd("url_list", urlvalue)
        # (the branch that handles "jobtype" items and saves parsed results
        # to mongo appears to be omitted from this snippet)
        # Reaching this point means the current URL has been handled, so if
        # it was an error URL, remove it from the error set here
        pipe.srem("error_url_list", sourcejob)
        pipe.execute()
    except Exception as e:
        error_name = e.__class__.__name__
        if error_name == "ConnectionError" or error_name == "ProxyError":
            # On connection or proxy errors I throw the URL straight back
            # into the dedicated error set, to be taken out and retried on
            # the next pass
            redis_db.sadd('error_url_list', sourcejob)
            # This is where my most maddening problem lives. Everything else
            # is fairly normal, but occasionally, while the program is running,
            # this sadd itself raises an exception. Since this except block is
            # the place where failed URLs get saved to error_url_list for
            # reprocessing, an exception here means a large share of the
            # failed URLs never reach the database, and in the end far too
            # little data gets crawled.

            # The exception looks roughly like this:
            # ConnectionError
            # <Greenlet ... args=(,)> failed with ConnectionError
            # Traceback (most recent call last):
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/greenlet.py", line 327, in run
            #     result = self._run(*self.args, **self.kwargs)
            #   File "61.py", line 147, in GenerateUrl
            #     redis_db.sadd('error_url_list', sourcejob)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 1248, in sadd
            #     return self.execute_command('SADD', name, *values)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 461, in execute_command
            #     return self.parse_response(connection, command_name, **options)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 471, in parse_response
            #     response = connection.read_response()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 339, in read_response
            #     response = self._parser.read_response()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 110, in read_response
            #     response = self.read()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 103, in read
            #     return self._fp.readline()[:-2]
            #   File "/usr/local/lib/python2.7/socket.py", line 447, in readline
            #     data = self._sock.recv(self._rbufsize)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 392, in recv
            #     self._wait(self._read_event)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 298, in _wait
            #     self.hub.wait(watcher)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 341, in wait
            #     result = waiter.get()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 568, in get
            #     return self.hub.switch()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 331, in switch
            #     return greenlet.switch(self)

if __name__ == '__main__':
    st = time.time()
    time.sleep(5)
    WYUrlGenerator()
    et = time.time()
    print "**************end****************,the spend time is %0.2f" % (et - st)
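One note on the gevent.Timeout used in GenerateUrl above: a Timeout that is started but never cancelled stays armed after requests.get returns, so it can fire later, inside whatever that greenlet happens to be doing at the moment it expires. Below is a minimal sketch of a request-scoped timeout; fetch_with_timeout is a hypothetical helper name, not part of the code above.

import gevent
import requests
from requests.exceptions import ConnectionError

def fetch_with_timeout(url, headers=None, proxies=None, seconds=5):
    # Scope the timeout strictly to this one request: cancel() runs whether
    # the request succeeds, fails, or the timeout itself fires.
    timeout = gevent.Timeout(seconds, ConnectionError)
    timeout.start()
    try:
        return requests.get(url, headers=headers, proxies=proxies)
    finally:
        timeout.cancel()

The with-statement form, "with gevent.Timeout(5, ConnectionError):", is equivalent; requests' own timeout= parameter is another option, though it bounds the connect and read waits rather than total elapsed time.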

I'd like to hear what you all think of this code, and roasting is welcome too. I've dug up some related material myself, but without much success.
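For comparison with ErrorUrlGenerator above, which re-reads the whole error set with smembers and relies on a later srem to clean up: a different pattern (not the one this post uses) is to drain the set with SPOP, which removes and returns one member atomically, so no URL is ever handed to two greenlets at once. A sketch against the same error_url_list key, with drain_error_urls as a hypothetical name:

def drain_error_urls(redis_db, key="error_url_list"):
    # SPOP atomically removes and returns a random member, or None once
    # the set is empty, so each URL is claimed by exactly one consumer.
    while True:
        sourcejob = redis_db.spop(key)
        if sourcejob is None:
            break
        yield sourcejob

The trade-off is that a URL popped by a greenlet that then crashes is lost unless it is explicitly re-added, whereas the smembers + srem approach only forgets a URL after a successful srem.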
