结合调度器完成爬虫任务

最新推荐文章于 2023-08-04 17:30:07 发布

飞飞翼

最新推荐文章于 2023-08-04 17:30:07 发布

阅读量1.6k

点赞数 1

分类专栏： PyThon

本文链接：https://blog.csdn.net/qq_36935391/article/details/78795130

版权

PyThon 专栏收录该内容

25 篇文章 1 订阅

订阅专栏

学习任务
1.编写一个简单的爬虫程序

爬虫的结果

crow 1:https://baike.baidu.com/view/21087.html
crow 2:http://v.baidu.com/
crow failed
crow 2:http://map.baidu.com/
crow failed
crow 2:https://baike.baidu.com/ziran
crow failed
crow 2:https://baike.baidu.com/kedou/
crow failed
crow 2:https://baike.baidu.com/uc/favolemma
crow failed
crow 2:https://baike.baidu.com/redirect/bd57D_QYRJrWtaYXMHfo4iHnUkk-10lbsmyUNTIBCyStIVCwG90uXOqxPuW_YBCkKIYcdmiuJg
crow failed
crow 2:https://baike.baidu.com/item/%E9%A9%AC%E7%89%B9%E5%88%A9
crow 3:https://baike.baidu.com/item/%E5%BA%94%E7%94%A8%E7%A8%8B%E5%BA%8F%E7%BC%96%E7%A8%8B%E6%8E%A5%E5%8F%A3
crow 4:https://baike.baidu.com/feiyi?fr=dhlfeiyi

学习目标
知识目标
1.熟悉爬虫的调度程序
2.掌握简单的爬虫架构
能力目标
1.能够自主编写一个简单的爬虫
初始化函数

#初始化个个器材
    def __init__(self):
        """Create the collaborators the scheduler drives.

        One instance each of the URL manager, the downloader, the parser
        and the outputer is stored on the scheduler.
        """
        self.urls = url_manage.UrlManager()
        self.downloder = html_downloader.HtmlDownloader()
        self.parser = html_parser.Html_Parser()
        self.outputer = html_outputer.Html_Outputer()

爬虫调度函数

 #爬虫的调度程序
    def craw(self, root_url):
        """Crawl pages starting from ``root_url`` and write the collected data.

        Each pending URL is downloaded, parsed for new links and data, and
        the results are handed to the URL manager and the outputer. The loop
        ends when no pending URL is left or after 100 pages have been
        processed successfully. A page that fails is reported with
        ``crow failed`` and skipped. ``output_html`` is called once at the end.

        :param root_url: URL the crawl starts from.
        """
        count = 1
        self.urls.add_new_url(root_url)
        # has_new_url is a method: it must be *called*. Testing the bound
        # method itself is always true and made the loop spin forever once
        # the pending set was empty.
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('crow %d:%s' % (count, new_url))
                html_cont = self.downloder.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 100:
                    break
                # Only successfully processed pages advance the counter.
                count = count + 1
            except Exception:
                # Best-effort crawl: skip the failing page. Catching Exception
                # (not a bare except) keeps Ctrl-C / SystemExit working.
                print('crow failed')
        self.outputer.output_html()

爬虫开始函数

if __name__ == '__main__':
    # Entry URL: the page the crawl starts from.
    root_url = 'https://baike.baidu.com/view/21087.html'
    # Build the scheduler, then start crawling from the entry URL.
    spider = SpiderMain()
    spider.craw(root_url)

知识点总结
1.爬虫调度程序
2.调度程序调度三个模块

问题
1.什么是爬虫的调度程序
2.爬虫调度程序最重要的一个函数
3.自己写一个简单的爬虫程序

答案
1.爬虫调度器询问URL管理器，判断是否有待爬取的URL，如果有的话，
调度器从URL管理器获取一个待爬取的URL，URL管理器将这个URL返回给调度器，
调度器得到这个URL，将URL传给下载器，下载URL内容，下载好后下
载器将URL的内容返回给调度器，调度器将URL的内容传给解析器，解析
其内容然后得到有价值的数据和新的URL列表，并传给调度器。调度器收集
价值数据，传给应用；另一方面，调度器会将新的URL列表传给URL管理器，
如果还有待爬取的URL，上面步骤会循环。
最后，调度器会调用应用的方法，输出有价值的数据。

2.最重要的函数是

if __name__ == '__main__':
    # Entry URL: the page the crawl starts from.
    root_url = 'https://baike.baidu.com/view/21087.html'
    # Build the scheduler, then start crawling from the entry URL.
    spider = SpiderMain()
    spider.craw(root_url)

爬虫的入口函数

3.
URL管理器模块

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
#URL管理器
class UrlManager(object):
    """Track which URLs still have to be crawled and which were handed out.

    ``new_urls`` holds pending URLs; ``odd_urls`` holds URLs that were
    already returned by :meth:`get_new_url`. A URL present in either set is
    never queued again.
    """

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.odd_urls = set()   # URLs already handed out for crawling

    def add_new_url(self, url):
        """Queue ``url`` unless it is ``None``, already pending, or already handed out."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.odd_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urs):
        """Queue every URL in ``urs``; ``None`` or an empty collection is a no-op.

        The original guard used ``and``, so ``None`` reached ``len()`` and
        raised ``TypeError`` instead of being ignored.
        """
        if urs is None or len(urs) == 0:
            return
        for url in urs:
            self.add_new_url(url)

    def has_new_url(self):
        """Return ``True`` when at least one URL is still pending."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Remove one pending URL, record it as handed out, and return it.

        Raises ``KeyError`` when no URL is pending (``set.pop`` on an empty
        set), so callers should check :meth:`has_new_url` first.
        """
        new_url = self.new_urls.pop()
        self.odd_urls.add(new_url)
        return new_url

网页下载器模块

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
#HTML下载器
import urllib2
class HtmlDownloader(object):
    """Fetch the raw content of a page with ``urllib2``."""

    def download(self, url):
        """Download ``url`` and return the response body.

        :param url: URL to fetch; ``None`` is accepted and yields ``None``.
        :returns: the body as returned by ``response.read()``, or ``None``
            when ``url`` is ``None`` or the status code is not 200.

        Errors raised by ``urllib2.urlopen`` propagate to the caller.
        """
        if url is None:
            return None
        response = urllib2.urlopen(url)
        try:
            if response.getcode() != 200:
                return None
            return response.read()
        finally:
            # Always release the connection; the original never closed it.
            response.close()

网页解析器模块

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
#html解析器
from bs4 import BeautifulSoup
import re,urlparse
class Html_Parser(object):
    """Extract outgoing links and the title/summary data from a downloaded page."""

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute URLs of every ``<a>`` tag that has an ``href``.

        Relative links are resolved against ``page_url`` with ``urljoin``.
        """
        new_urls = set()
        # The empty pattern matches any href value, so no link is filtered
        # out here (a stricter pattern such as /view/\d+\.htm could be used).
        links = soup.find_all('a', href=re.compile(r''))
        for link in links:
            new_full_url = urlparse.urljoin(page_url, link['href'])
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Return a dict with the page's ``url``, ``title`` and ``summary``.

        The title is the ``<h1>`` inside ``<dd class="lemmaWgt-lemmaTitle-title">``
        and the summary is ``<div class="lemma-summary">``. When a page lacks
        one of these nodes the lookup yields ``None`` and the following
        attribute access raises ``AttributeError``; the caller is expected
        to treat that as a failed page.
        """
        res_data = {}
        res_data['url'] = page_url
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        """Parse ``html_cont`` and return ``(new_urls, new_data)``.

        :param page_url: URL the content was downloaded from.
        :param html_cont: downloaded page content.
        :returns: a tuple of the set of links found on the page and the data
            dict from :meth:`_get_new_data`; ``None`` when either argument
            is ``None``.

        The original guard used ``and``, so a missing page body (for
        example a failed download) was still handed to BeautifulSoup.
        """
        if page_url is None or html_cont is None:
            return None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

爬虫调度模块

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import url_manage,html_downloader,html_parser,html_outputer
class SpiderMain(object):
    """Crawler scheduler: wires the URL manager, downloader, parser and outputer together."""

    def __init__(self):
        """Create one instance of each collaborator the scheduler drives."""
        self.urls = url_manage.UrlManager()
        self.downloder = html_downloader.HtmlDownloader()
        self.parser = html_parser.Html_Parser()
        self.outputer = html_outputer.Html_Outputer()

    def craw(self, root_url):
        """Crawl pages starting from ``root_url`` and write the collected data.

        Each pending URL is downloaded, parsed for new links and data, and
        the results are handed to the URL manager and the outputer. The loop
        ends when no pending URL is left or after 100 pages have been
        processed successfully. A page that fails is reported with
        ``crow failed`` and skipped. ``output_html`` is called once at the end.

        :param root_url: URL the crawl starts from.
        """
        count = 1
        self.urls.add_new_url(root_url)
        # has_new_url is a method: it must be *called*. Testing the bound
        # method itself is always true and made the loop spin forever once
        # the pending set was empty.
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('crow %d:%s' % (count, new_url))
                html_cont = self.downloder.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 100:
                    break
                # Only successfully processed pages advance the counter.
                count = count + 1
            except Exception:
                # Best-effort crawl: skip the failing page. Catching Exception
                # (not a bare except) keeps Ctrl-C / SystemExit working.
                print('crow failed')
        self.outputer.output_html()






if __name__ == '__main__':
    # Entry URL: the page the crawl starts from.
    root_url = 'https://baike.baidu.com/view/21087.html'
    # Build the scheduler, then start crawling from the entry URL.
    spider = SpiderMain()
    spider.craw(root_url)

输出爬取的信息

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
#输出爬出来的数据
class Html_Outputer(object):
    """Collect the data dicts produced by the parser and dump them as an HTML table."""

    def __init__(self):
        # Collected records, in the order they were added.
        self.datas = []

    def collect_data(self, data):
        """Store ``data`` for later output; ``None`` is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write all collected records to ``outhtml.html`` as a UTF-8 HTML table.

        Each record contributes one row with its ``url``, ``title`` and
        ``summary`` values. Compared with the original implementation:

        * the file is opened in a ``with`` block so it is closed even when
          a write fails;
        * cell text is HTML-escaped, because it comes from scraped pages and
          may contain ``<``, ``>`` or ``&``;
        * the page declares its charset so browsers decode the UTF-8 bytes
          correctly;
        * text is encoded once, at the point of writing.
        """
        # Function-scope import keeps the module's import block untouched;
        # xml.sax.saxutils.escape exists in both Python 2 and Python 3.
        from xml.sax.saxutils import escape

        with open('outhtml.html', 'wb') as out:
            def emit(text):
                # Encode at the I/O boundary so every cell is handled alike.
                out.write(text.encode('utf-8'))

            emit(u"<html>")
            emit(u"<head><meta charset='utf-8'></head>")
            emit(u"<body>")
            emit(u"<table>")
            for data in self.datas:
                emit(u"<tr>")
                emit(u"<td>%s</td>" % escape(data['url']))
                emit(u"<td>%s</td>" % escape(data['title']))
                emit(u"<td>%s</td>" % escape(data['summary']))
                emit(u"</tr>")
            emit(u"</table>")
            emit(u"</body>")
            emit(u"</html>")