python_慕课\Python开发简单爬虫\7-7 开始运行爬虫和爬取结果展.py



-- 7-3 URL manager  https://www.imooc.com/video/10690 






-- D:\project_py\py_001\baike_spider\url_manager.py
'''
Created on 2017-12-04

@author: Administrator
'''
class UrlManager(object):
    """Manages the queue of URLs to crawl and the set already crawled."""

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        # Only queue a URL we have never seen before.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Pop an arbitrary pending URL and mark it as crawled.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
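
A quick, self-contained check of the de-duplication behaviour (the example.com URLs below are made up for illustration):

if __name__ == '__main__':
    manager = UrlManager()
    manager.add_new_url('http://example.com/view/1.htm')
    # The duplicate in this batch is silently ignored by add_new_url.
    manager.add_new_urls(['http://example.com/view/1.htm',
                          'http://example.com/view/2.htm'])
    while manager.has_new_url():
        print(manager.get_new_url())   # each URL comes out exactly once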






--  7-4 HTML downloader html_downloader  https://www.imooc.com/video/10691 




--  D:\project_py\py_001\baike_spider\html_downloader.py




import urllib2


class HtmlDownloader(object):
    """Downloads the raw HTML of a page (Python 2, urllib2)."""

    def download(self, url):
        if url is None:
            return None

        response = urllib2.urlopen(url)
        # Treat anything other than HTTP 200 as a failed download.
        if response.getcode() != 200:
            return None

        return response.read()
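
Note that urllib2 exists only under Python 2. If the course code is run under Python 3, the equivalent module is urllib.request; a minimal sketch of the same downloader:

import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()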




-- 7-5 HTML parser html_parser  https://www.imooc.com/video/10692 




--  How to auto-import a needed library with a keyboard shortcut in PyCharm: https://segmentfault.com/q/1010000004340490 
Windows: Alt + Enter




-- D:\project_py\py_001\baike_spider\html_parser.py


import re
import urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # Lemma links look like /view/123.htm. Note: find_all (not find)
        # is needed here so that every matching <a> tag is returned.
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # Turn the relative link into an absolute URL.
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}

        # url
        res_data['url'] = page_url

        # <dd class="lemmaWgt-lemmaTitle-title">  <h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
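
A quick check of the parser against a hand-written snippet (the HTML fragment and the page URL below are made up to mirror the selectors above):

parser = HtmlParser()
html = ('<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>'
        '<div class="lemma-summary">A programming language.</div>'
        '<a href="/view/123.htm">link</a>')
new_urls, new_data = parser.parse('http://baike.baidu.com/view/21087.htm', html)
print(new_urls)            # set(['http://baike.baidu.com/view/123.htm'])
print(new_data['title'])   # Python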






--  7-6 HTML outputer  https://www.imooc.com/video/10693 


-- D:\project_py\py_001\baike_spider\html_outputer.py


class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')

        # Python 2 writes files as ascii by default, so the unicode
        # title and summary must be encoded to utf-8 explicitly.
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
            fout.write('</tr>')

        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
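
Under Python 3 the explicit .encode calls go away if the file itself is opened with an encoding; a minimal sketch of the same method:

    def output_html(self):
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html><body><table>')
            for data in self.datas:
                fout.write('<tr>')
                fout.write('<td>%s</td>' % data['url'])
                fout.write('<td>%s</td>' % data['title'])
                fout.write('<td>%s</td>' % data['summary'])
                fout.write('</tr>')
            fout.write('</table></body></html>')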






-- 7-7 Running the crawler and viewing the crawl results  https://www.imooc.com/video/10694 
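
A driver that wires the four modules above together might look like the following. This is only a sketch consistent with those modules; the file name spider_main.py, the seed URL, and the 1000-page cap are assumptions for illustration, not transcribed course code.

-- D:\project_py\py_001\baike_spider\spider_main.py (assumed file name)

import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 1000:   # assumed cap so the crawl terminates
                    break
                count = count + 1
            except:
                print('craw failed')

        self.outputer.output_html()


if __name__ == '__main__':
    root_url = 'http://baike.baidu.com/view/21087.htm'  # assumed seed: the Python lemma
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)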
































