A minimal Python crawler: a simple spider example

Goal: crawl 300 journal papers from 知网空间 (CNKI Space).

See the reference links at the end for detailed explanations of the principles and a full tutorial.

1. Scheduler: controls the overall crawl workflow

spider_main.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'spider_main - scheduler'

import url_manager
import html_downloader
import html_parser
import html_outputer
from gevent import monkey; monkey.patch_all()
import gevent


class SpiderMain(object):
    """docstring for SpiderMain"""

    def __init__(self):
        super(SpiderMain, self).__init__()
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
        self.count = 1  # number of pages crawled so far

    def gevent_01(self):
        sum_ge = 1
        temp = []
        while sum_ge <= 50:  # spawn 50 coroutines to crawl concurrently (works very well)
            temp.append(gevent.spawn(self.craw, sum_ge))
            sum_ge = sum_ge + 1
        gevent.joinall(temp)

    def craw(self, n):
        while True:
            if self.count > 300:  # stop after 300 pages
                return
            if self.urls.has_new_url():
                try:
                    new_url = self.urls.get_new_url()
                    html_cont = self.downloader.downloader(new_url)
                    new_urls, new_data = self.parser.parser(new_url, html_cont)
                    self.urls.add_new_urls(new_urls)
                    if new_data is None:
                        continue  # nothing worth scraping on this page, move on to the next one
                    self.outputer.collect_data(new_data)
                    print 'coroutine %d' % n
                    print 'craw %d : %s' % (self.count, new_url)
                    self.count = self.count + 1  # increment for each page crawled
                except Exception as e:
                    print 'craw failed'
            else:
                gevent.sleep(0)  # no new URL to crawl yet, yield control to another coroutine


if __name__ == '__main__':
    root_url = 'http://www.cnki.com.cn/index.htm'
    obj_spider = SpiderMain()
    obj_spider.urls.add_new_url(root_url)
    obj_spider.gevent_01()
    obj_spider.outputer.output_html()
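To make the coroutine part easier to follow, here is a minimal standalone sketch of the same gevent pattern the scheduler uses (spawn a batch of greenlets, then joinall); the worker function and its sleep time are made up purely for illustration:

# illustrative only: a made-up worker standing in for self.craw
from gevent import monkey; monkey.patch_all()
import gevent

def worker(n):
    gevent.sleep(0.1)  # stands in for blocking network I/O; other coroutines run while this one waits
    print 'coroutine %d done' % n

jobs = [gevent.spawn(worker, i) for i in range(1, 4)]  # spawn 3 coroutines
gevent.joinall(jobs)  # block until every coroutine has finished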

2. URL manager (database, Redis cache, or in-memory): maintains the set of URLs still to be crawled and the set already crawled, to prevent duplicate and circular crawling

url_manager.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'url_manager - URL manager'


class UrlManager(object):
    """docstring for UrlManager"""

    def __init__(self):
        super(UrlManager, self).__init__()
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def has_new_url(self):
        return len(self.new_urls) != 0
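The heading above also mentions a database or a Redis cache as possible backends for the URL manager. Below is a minimal sketch of a Redis-backed variant, assuming a local Redis server and the redis-py package; the class name and the key names new_urls / old_urls are made up, but the four methods mirror UrlManager so SpiderMain could use it unchanged:

import redis

class RedisUrlManager(object):
    """Hypothetical Redis-backed variant of UrlManager (not part of the original post)."""

    def __init__(self, host='localhost', port=6379):
        self.r = redis.StrictRedis(host=host, port=port)

    def add_new_url(self, url):
        if url is None:
            return
        # only enqueue a URL that is in neither the pending nor the crawled set
        if not self.r.sismember('new_urls', url) and not self.r.sismember('old_urls', url):
            self.r.sadd('new_urls', url)

    def add_new_urls(self, urls):
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        new_url = self.r.spop('new_urls')  # pop an arbitrary pending URL
        self.r.sadd('old_urls', new_url)
        return new_url

    def has_new_url(self):
        return self.r.scard('new_urls') != 0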

3. Page downloader (the standard library's urllib2, or the third-party requests): downloads the page behind a URL to the local machine

html_downloader.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'html_downloader - page downloader'

import urllib2


class HtmlDownloader(object):
    """docstring for html_downloader"""

    def __init__(self):
        super(HtmlDownloader, self).__init__()

    def downloader(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)  # the simplest, most direct way to download (see the reference links)
        if response.getcode() != 200:
            return None
        return response.read()
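The heading also mentions the third-party requests library as an alternative to urllib2. A minimal sketch of a requests-based downloader is below (an assumption, not part of the original example); it keeps the same downloader(url) interface so it could be swapped into SpiderMain:

import requests

class RequestsDownloader(object):
    """Hypothetical downloader built on requests (not part of the original post)."""

    def downloader(self, url):
        if url is None:
            return None
        try:
            response = requests.get(url, timeout=10)  # the timeout value is an arbitrary choice
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        return response.content  # raw bytes, same as urllib2's read()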

4. Page parser (regular expressions, lxml, the built-in html.parser, or the third-party BeautifulSoup): extracts the valuable data from a downloaded page

html_parser.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'html_parser - page parser'

from bs4 import BeautifulSoup
import re
import urlparse


class HtmlParser(object):
    """docstring for html_parser"""

    def __init__(self):
        super(HtmlParser, self).__init__()

    def __get_new_urls__(self, page_url, soup):
        new_urls = set()
        links_01 = soup.find_all('a', href=re.compile(r'/Journal/'))  # links to journal pages
        #links = soup.find_all('a',href=re.compile(r'/item/'))
        for link in links_01:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)  # https://baike.baidu.com
            new_urls.add(new_full_url)
        links = soup.find_all('a', class_='zt_name', href=re.compile(r'/Article/'))  # links to article pages
        #links = soup.find_all('a',href=re.compile(r'/item/'))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)  # https://baike.baidu.com
            new_urls.add(new_full_url)
        return new_urls

    def __get_new_data__(self, page_url, soup):
        res_data = {}
        title_node = soup.find('h1', class_='xx_title')
        if title_node is None:
            return
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_='xx_font')
        res_data['summary'] = summary_node.get_text()
        res_data['url'] = page_url
        return res_data

    def parser(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self.__get_new_urls__(page_url, soup)
        new_data = self.__get_new_data__(page_url, soup)
        return new_urls, new_data
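As a quick sanity check of the extraction rules above, the snippet below runs the same find_all / urljoin logic against a hand-written HTML fragment; the fragment and its two href values are made up for illustration:

from bs4 import BeautifulSoup
import re
import urlparse

html = '''
<a href="/Journal/J001.htm">Some journal</a>
<a class="zt_name" href="/Article/CJFD-ABC123.htm">Some article</a>
'''
soup = BeautifulSoup(html, 'html.parser')
for a in soup.find_all('a', href=re.compile(r'/Journal/|/Article/')):
    # resolve each relative link against the page it was found on
    print urlparse.urljoin('http://www.cnki.com.cn/index.htm', a['href'])
# prints the two absolute URLs the crawler would enqueue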

5. Outputer: writes out the collected data

html_outputer.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'html_outputer - outputer'


class HtmlOutputer(object):
    """docstring for html_outputer"""

    def __init__(self):
        super(HtmlOutputer, self).__init__()
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # write the collected records into a simple HTML table
        fout = open('output.html', 'w')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
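If an HTML report is not needed, the same collected dicts can just as easily be written to a CSV file. Below is a minimal sketch of such an alternative output routine (an assumption, not part of the original post):

import csv

def output_csv(datas, path='output.csv'):
    # datas: the same list of dicts that HtmlOutputer.collect_data builds up
    fout = open(path, 'wb')
    writer = csv.writer(fout)
    writer.writerow(['url', 'title', 'summary'])
    for data in datas:
        writer.writerow([data['url'].encode('utf-8'),
                         data['title'].encode('utf-8'),
                         data['summary'].encode('utf-8')])
    fout.close()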

6. Issues

Using multiple coroutines for concurrency hides the time spent waiting on network I/O (the downloader fetching pages), but there is no telling which coroutine finishes first and which finishes last, so the pages end up being crawled in no particular order. That should not be a big problem; a small sketch of one possible workaround follows.
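If a stable order ever mattered, one simple workaround (an assumption, not part of the original code) is to record the crawl counter in each collected dict, e.g. new_data['order'] = self.count inside craw, and sort by it before output:

def sort_by_crawl_order(datas):
    # datas: list of dicts that each carry an 'order' key set during crawling
    return sorted(datas, key=lambda d: d['order'])

records = [{'order': 3, 'title': 'c'}, {'order': 1, 'title': 'a'}, {'order': 2, 'title': 'b'}]
print [r['title'] for r in sort_by_crawl_order(records)]  # prints ['a', 'b', 'c']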

7. Demo

8. References

https://www.liaoxuefeng.com/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000/001407503089986d175822da68d4d6685fbe849a0e0ca35000

https://www.imooc.com/learn/563
