Python async crawler framework: a simple reusable self-built crawler framework (async with gevent)

A crawler can generally be broken down into the following steps:

1. Open the target page

2. Parse the page

3. Process/store the data and add newly discovered pages as tasks

For asynchronous crawling you additionally need a scheduler.

For a simple crawler (no CAPTCHA handling needed, and the pages are reachable with requests/urllib once the cookies and headers are set), one component that opens pages and one that parses them is enough. Processing the data and generating new task pages can live directly in the parser class, and gevent can then run the whole thing asynchronously, as sketched below.
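As a minimal sketch of those steps (the names fetch/parse/crawl are hypothetical placeholders, not part of the scrapy_tools package built below):

import requests
from lxml import html

def fetch(url):
    # step 1: open the page
    return requests.get(url).text

def parse(page_html):
    # step 2: parse; return the extracted records and any newly found urls
    tree = html.document_fromstring(page_html)
    records = [a.text_content() for a in tree.xpath('//a')]
    new_urls = tree.xpath('//a/@href')
    return records, new_urls

def crawl(start_url, max_pages=10):
    # step 3 plus a naive "scheduler": a plain list used as the task queue
    todo, seen, results = [start_url], set(), []
    while todo and len(seen) < max_pages:
        url = todo.pop()
        if url in seen:
            continue
        seen.add(url)
        records, new_urls = parse(fetch(url))
        results.extend(records)   # process/store
        todo.extend(new_urls)     # enqueue newly found task pages
    return results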

Project path: ur'D:\python_py\my_scrapy/scrapy_tools'

# add an __init__.py under scrapy_tools so it can be imported as a package
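Assuming the layout implied by the paths and imports used in this post, the project looks roughly like this (the example script name is hypothetical):

D:\python_py\my_scrapy\
    scrapy_tools\
        __init__.py      # empty; marks scrapy_tools as a package
        itemparse.py     # ItemParse: parsing, paging, gevent scheduling, saving
        web_opener.py    # SessionFopener: requests.Session wrapper
    dzdp_demo.py         # example script that subclasses ItemParse (name hypothetical)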

itemparse.py

Build the xpath definitions to mirror the structure of the data you want to extract:

#-*- coding: utf-8 -*-
"""Created on Fri Jul 07 17:24:34 2017

@author: willowj
"""
import sys

stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')

import time
import codecs
import json

import gevent
import pandas as pd
import numpy as np
from lxml import html


def list_0e(list_):
    # return the first element of a list (warn if there are more), None if empty
    if isinstance(list_, list):
        if not list_:
            return None
        if len(list_) > 1:
            print 'warning : list>1, list[1]:', list_[1]  # , len(list_)
        return list_[0]
    else:
        return list_


class ItemParse(object):
    """docstring for zhihu_topi"""
    name = 'ItemParse'
    base_url = 'https://www.zhihu.com/topic/19551147/top-answers'
    # xpath that yields the total number of pages
    pageN_x = '//div[@class="zm-invite-pager"]//span[last()-1]/a/text()'
    new_urls_x = None
    # node of one record; each field below is searched inside it
    items_node_x = '//div[@class="feed-main"]'
    # note: every field is searched within a single record node,
    # so its xpath must start with '.'
    item_xs = dict(
        question_name='''.//a[@class='question_link']/text()''',
        # question_href = '''.//a[@class='question_link']/@href''',
        author='.//div[@data-action="/answer/content"]/@data-author-name',
        author_href='''.//a[@class='author-link']/@href''',
        ups_x='.//div[@class="zm-item-vote-info"]/@data-votecount',
        answers_text=".//textarea/text()",
        commentN='.//a[@name="addcomment"]/text()[last()]',
        entry_url='.//div[@data-action="/answer/content"]/@data-entry-url',
        # re patterns are also accepted, e.g.:
        # z = re.compile('\.')
    )

    # paging url pattern; override per site. Returns None when there is only one page
    def getnextpages(self):
        if self.pageN > 1:
            urls = [self.base_url + '?page=%s' % n
                    for n in range(self.pageN, 1, -1)]
            return urls

    def __init__(self, html_):
        # self.item_atrr_xpath()
        self.results = []
        self.new_urls = []
        self.pageN = self.update_page_n(html_)
        self.nextpages = self.getnextpages()
        self.parase(html_)

    def parase(self, html_):
        # prefer xpath, fall back to re; items that are not found become None
        etree = html.document_fromstring(html_)
        items_nodes = etree.xpath(self.items_node_x)
        for ee in items_nodes:
            ee_str = None
            ite = {}
            for item, itemx in self.item_xs.items():
                # re pattern
                if hasattr(itemx, 'findall'):
                    if ee_str is None:
                        ee_str = html.tostring(ee)
                    ite[item] = itemx.findall(ee_str)
                # xpath
                elif isinstance(itemx, (str, unicode)):
                    if itemx.startswith('./'):
                        ite[item] = ee.xpath(itemx)
                    else:
                        print item
                        raise ValueError('xpath does not start with ./')
                else:
                    print item
                    raise TypeError('not a re pattern object or an xpath string')

                if len(ite[item]) == 0:
                    ite[item] = None
                elif len(ite[item]) == 1:
                    ite[item] = ite[item][0]
                else:
                    ite[item] = '\n'.join([str(__i) for __i in ite[item]])
            self.results.append(ite)
        # collect new task urls if a pattern was given
        if self.new_urls_x:
            self.new_urls.extend(etree.xpath(self.new_urls_x))

    # find out how many pages there are
    def update_page_n(self, html_):
        if self.pageN_x:
            etree = html.document_fromstring(html_)
            pages = etree.xpath(self.pageN_x)
            pages = list_0e(pages)
            if isinstance(pages, basestring):
                pages = pages.strip()
            if pages and pages.isdigit():
                return int(pages)
        return 1

    # plain synchronous fetch of all remaining pages
    def get_nextpages(self, opener, sleep_sec=None):
        for url in self.nextpages:
            if sleep_sec:
                time.sleep(sleep_sec)
            # if not hasattr(opener, 'get')
            _re = opener.get(url)
            print _re.status_code, _re.url
            self.parase(_re.text)
            print time.time()

    # the async control and the save methods live here for now
    # gevent worker: each greenlet keeps popping urls until the queue is empty
    def __gevent_get_nextpages(self, opener):
        print id(opener)
        while self.nextpages:
            # start_time = time.time()
            url = self.nextpages.pop()
            print gevent.getcurrent()
            zhihu_re = opener.get(url)
            # gevent.sleep(5)
            print zhihu_re.status_code, url
            self.parase(zhihu_re.text)
            print time.time()

    # gevent entry point
    def get_nextpages_by_gevent(self, opener_class, g_n=4):
        '''param: opener_class: class that builds a page opener (one per greenlet)
                  g_n: number of greenlets, 4 by default'''
        from gevent import monkey; monkey.patch_all()
        start_time = time.time()
        gs = [gevent.spawn(self.__gevent_get_nextpages, opener_class())
              for i in range(g_n)]
        gevent.joinall(gs)
        print time.time() - start_time
        self.save_to_excel()

    def save_to_excel(self, path=None):
        if path:
            save_name = path
        else:
            save_name = u'' + self.name \
                        + time.strftime('%Y%m%d_%H_%M', time.localtime()) \
                        + '.xlsx'
        print save_name
        result_pd = pd.DataFrame(self.results)
        print 'pd ok'
        result_pd.to_excel(u'' + save_name, encoding='gb18030')
        print 'saved to ' + save_name

    def save_to_json(self, path=None):
        if path:
            save_name = path
        else:
            save_name = u'' + self.name \
                        + time.strftime('%Y%m%d_%H_%M', time.localtime()) \
                        + '.json'
        print save_name
        with codecs.open(save_name, 'w', encoding='gb18030') as f:
            f.write(json.dumps(self.results))
        print 'saved to ' + save_name


To use it, subclass ItemParse and override the class attributes (base_url, the xpaths) and the getnextpages paging method, as in the skeleton below.
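A minimal skeleton of such a subclass (the site, xpaths and field names here are hypothetical placeholders; the real Dianping example follows further down):

from scrapy_tools.itemparse import ItemParse

class MySiteParse(ItemParse):
    # only the class attributes and the paging rule need to change
    name = 'MySiteParse'
    base_url = 'http://example.com/list'                    # placeholder url
    pageN_x = '//div[@class="pager"]/a[last()]/text()'      # xpath yielding the page count
    new_urls_x = None
    items_node_x = '//div[@class="item"]'                   # one node per record
    item_xs = dict(
        title='.//h2/a/text()',                             # field xpaths start with '.'
        link='.//h2/a/@href',
    )

    def getnextpages(self):
        # this site's paging rule: ?page=2 .. ?page=pageN
        if self.pageN > 1:
            return [self.base_url + '?page=%s' % n
                    for n in range(self.pageN, 1, -1)]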

web_opener.py

Using requests.Session to keep the connection alive makes fetching roughly twice as fast as opening a new connection for every request.

For the gevent version, one session is created per greenlet, so each coroutine opens pages with its own session and they do not interfere with each other. That logic currently lives in itemparse.py (get_nextpages_by_gevent instantiates one opener per greenlet).
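A quick way to check the session-reuse claim against your own target (httpbin.org is used here only as a stand-in url):

import time
import requests

URL = 'http://httpbin.org/get'   # stand-in; point this at your own target
N = 10

start = time.time()
for _ in range(N):
    requests.get(URL)            # a new connection for every request
print('no session: %.2fs' % (time.time() - start))

start = time.time()
s = requests.Session()           # keep-alive: the underlying connection is reused
for _ in range(N):
    s.get(URL)
print('session:    %.2fs' % (time.time() - start))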

#-*- coding: utf-8 -*-
"""Created on Thursday, 17 Aug 2017, 17:22

@author: willowj
"""
import sys

reload(sys)
sys.setdefaultencoding('utf8')

import requests
# from requests.cookies import (
#     cookiejar_from_dict, extract_cookies_to_jar, RequestsCookieJar, merge_cookies)


class SessionFopener(object):
    """A page opener wrapping requests.Session.

    param: headers    defaults to the class attribute; a custom dict can be
                      passed in at instantiation
           cookie_dic empty by default
           proxies    none by default
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # 'Cookie': 'q',
        # 'Host': 'www.zhihu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    }

    def __init__(self, headers=None, cookie_dic=None, proxies=None):
        self.req_s = requests.Session()
        self.req_s.adapters.DEFAULT_RETRIES = 3
        self.req_s.keep_alive = True
        if headers:
            self.req_s.headers = headers
        else:
            self.req_s.headers = self.headers
        if not cookie_dic:
            cookie_dic = {}
        self.req_s.cookies = requests.cookies.cookiejar_from_dict(cookie_dic)
        if proxies:
            self.req_s.proxies = proxies

    def close(self):
        self.req_s.close()

    def get(self, *arg, **karg):
        return self.req_s.get(*arg, **karg)

    def post(self, *arg, **karg):
        return self.req_s.post(*arg, **karg)

    def set_cookiejar(self, cookie_dic={}):
        self.req_s.cookies = requests.cookies.cookiejar_from_dict(cookie_dic)

    def add_cookiejar(self, cookie_dic):
        self.req_s.cookies = requests.cookies.merge_cookies(self.req_s.cookies, cookie_dic)

    def set_headers(self, headers={}):
        self.req_s.headers = headers

    def add_headers(self, headers_dic):
        for k, v in headers_dic.items():
            self.req_s.headers[k] = v

    def set_proxies(self, proxies):
        self.req_s.proxies = proxies

    @classmethod
    def cookiejar_from_dict(cls, cookie_dic):
        return requests.cookies.cookiejar_from_dict(cookie_dic)

    def __enter__(self):
        print 'enter'
        return self

    def __exit__(self, *used):
        self.req_s.close()
        del self.req_s
        print 'exit'


if __name__ == '__main__':
    with SessionFopener() as req_o:
        res_p = req_o.get('http://httpbin.org/get')
        print res_p.json()


Example: crawling a Dianping (大众点评) shop's reviews.

Only the parsing nodes and the paging url pattern need to be overridden in a subclass.

External links are not handled for now.

#-*- coding: utf-8 -*-
"""Created on Thursday, 17 Aug 2017, 19:33

@author: Administrator
"""
import sys

stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')

sys.path.append(ur'D:\python_py\my_scrapy')

from scrapy_tools.web_opener import SessionFopener
from scrapy_tools.itemparse import ItemParse


class DzdpItemParse(ItemParse):
    """Reviews of 广州酒家 (文昌店)"""
    name = u'DzdpItemParse广州酒家'
    base_url = 'https://www.dianping.com/shop/516983/review_more'
    pageN_x = ".//a[@class='PageLink'][last()]/text()"
    new_urls_x = None
    # node of one review; each field below is searched inside it
    items_node_x = './/div[@class="comment-list"]/ul/li'
    # note: every field xpath starts with '.' so it is searched within one record node
    item_xs = dict(
        user_id='''.//*[@class="J_card"]/@user-id''',
        # question_href = '''.//a[@class='question_link']/@href''',
        comm_per=""".//span[@class='comm-per']/text()""",
        total_mark=""".//*[@class="user-info"]/span[1]/@class""",
        taste=""".//*[@class="comment-rst"]/span[1]/text()""",
        environment=""".//*[@class="comment-rst"]/span[2]/text()""",
        sevice=""".//*[@class="comment-rst"]/span[3]/text()""",
        comments_agree='''.//span[@class="heart-num"]/text()''',
        comment_text=""".//*[@class="J_brief-cont"]/text()""",
        comment_date='''.//*[@class="time"]/text()''',
        recommend_food=u'''.//*[@class="comment-recommend"
                                and (contains(text(), "推荐")
                                     or contains(text(), "喜欢"))][1]/a/text()''',
        # Chinese inside an xpath must be given as a unicode literal
        # re patterns are also accepted, e.g.:
        # z = re.compile('\.')
    )

    # paging rule for this shop: ?pageno=2 .. ?pageno=pageN; only one page -> None
    def getnextpages(self):
        if self.pageN > 1:
            urls = [self.base_url + '?pageno=%s' % n
                    for n in range(self.pageN, 1, -1)]
            return urls


open_s = SessionFopener()                       # instantiate an opener
respon_ = open_s.get(DzdpItemParse.base_url)    # fetch the first page
gzjj_item = DzdpItemParse(respon_.text)         # build the parser from the first page's html

# synchronous crawl:
gzjj_item.get_nextpages(open_s, sleep_sec=None)
# asynchronous crawl:
# gzjj_item.get_nextpages_by_gevent(SessionFopener)


Result: fetching a single page took about 0.53 s; with four greenlets, 613 pages were crawled in 77.71 s, i.e. 77.71 / 613 ≈ 0.13 s per page, roughly a 4x speedup.

Console output (tail):

200 https://www.dianping.com/shop/516983/review_more?pageno=600
1503074965.07
200 https://www.dianping.com/shop/516983/review_more?pageno=602
1503074965.1
200 https://www.dianping.com/shop/516983/review_more?pageno=601
1503074965.14
200 https://www.dianping.com/shop/516983/review_more?pageno=604
1503074965.54
200 https://www.dianping.com/shop/516983/review_more?pageno=607
1503074965.59
200 https://www.dianping.com/shop/516983/review_more?pageno=605
1503074965.64
200 https://www.dianping.com/shop/516983/review_more?pageno=606
1503074965.67
200 https://www.dianping.com/shop/516983/review_more?pageno=611
1503074966.1
200 https://www.dianping.com/shop/516983/review_more?pageno=609
1503074966.15
200 https://www.dianping.com/shop/516983/review_more?pageno=610
1503074966.18
200 https://www.dianping.com/shop/516983/review_more?pageno=608
1503074966.22
200 https://www.dianping.com/shop/516983/review_more?pageno=612
1503074966.7
200 https://www.dianping.com/shop/516983/review_more?pageno=614
1503074966.74
200 https://www.dianping.com/shop/516983/review_more?pageno=615
1503074967.05
200 https://www.dianping.com/shop/516983/review_more?pageno=613
1503074967.09
77.7100000381
DzdpItemParse广州酒家20170819_00_49.xlsx
pd ok
saved to DzdpItemParse广州酒家20170819_00_49.xlsx


For distributed multi-process crawling, or for writing results into a database, a separate scheduler and a data-access module would still have to be written; a rough sketch of the storage side is below.
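As a rough sketch of what that data-access module could look like (sqlite3 is chosen here only as the simplest stand-in; the function, table and column names are hypothetical and not part of the package above):

import json
import sqlite3

def save_results_to_db(results, db_path='items.db', table='items'):
    # persist ItemParse.results (a list of dicts) into sqlite3 instead of Excel/JSON
    conn = sqlite3.connect(db_path)
    # one row per record; the whole dict is stored as a JSON string for simplicity
    conn.execute('CREATE TABLE IF NOT EXISTS %s (data TEXT)' % table)
    conn.executemany('INSERT INTO %s (data) VALUES (?)' % table,
                     [(json.dumps(r),) for r in results])
    conn.commit()
    conn.close()

# usage: save_results_to_db(gzjj_item.results)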
