-- 7-3 URL管理器 https://www.imooc.com/video/10690
-- D:\project_py\py_001\baike_spider\url_manager.py
'''
Created on 2017年12月4日
@author: Administrator
'''
class UrlManager(object):
    """Tracks URLs waiting to be crawled and URLs already crawled."""

    def __init__(self):
        # Pending (not yet fetched) pool and finished pool.
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue one URL unless it is None or already known (pending or done)."""
        if url is None:
            return
        already_seen = url in self.new_urls or url in self.old_urls
        if not already_seen:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue a batch of URLs; None or an empty collection is a no-op."""
        if urls is None or len(urls) == 0:
            return
        for candidate in urls:
            self.add_new_url(candidate)

    def has_new_url(self):
        """Return True while at least one URL is still pending."""
        return len(self.new_urls) > 0

    def get_new_url(self):
        """Pop an arbitrary pending URL, mark it as crawled, and return it."""
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url
-- 7-4 HTML下载器 html_downloader https://www.imooc.com/video/10691
-- D:\project_py\py_001\baike_spider\html_downloader.py
import urllib2
class HtmlDownloader(object):
    """Fetches the raw HTML body of a page over HTTP."""

    def download(self, url):
        """Return the page body for `url`, or None on bad input / non-200 status.

        BUG FIXES vs. the original:
        - the response object was never closed (socket leak);
        - the module-level `import urllib2` only exists on Python 2, so the
          import is done here with a Python 3 / Python 2 fallback.
        """
        if url is None:
            return None
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib2 import urlopen  # Python 2 fallback
        from contextlib import closing
        # closing() guarantees the connection is released on every path.
        with closing(urlopen(url)) as response:
            # Treat anything other than HTTP 200 as a failed download.
            if response.getcode() != 200:
                return None
            return response.read()
-- 7-5 HTML解析器html_parser https://www.imooc.com/video/10692
-- pycharm 中如何用快捷键自动import需要的库 https://segmentfault.com/q/1010000004340490
win: Alt + Enter
-- D:\project_py\py_001\baike_spider\html_parser.py
import urlparse
from bs4 import BeautifulSoup
import re
class HtmlParser(object):
    """Extracts follow-up /view/<id>.htm links and page data from a baike page."""

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute URLs of all /view/<id>.htm links in `soup`.

        BUG FIX: the original used soup.find(), which returns a SINGLE tag
        (iterating it yields its children, so link['href'] fails);
        find_all() returns the list of matching <a> tags.
        """
        # Python 3 / Python 2 fallback: module-level `import urlparse`
        # only works on Python 2.
        try:
            from urllib.parse import urljoin  # Python 3
        except ImportError:
            from urlparse import urljoin  # Python 2
        new_urls = set()
        # Entry links look like /view/123.htm
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            # Resolve the relative href against the current page URL.
            new_urls.add(urljoin(page_url, link['href']))
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Return {'url', 'title', 'summary'} scraped from the page markup."""
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        """Parse raw HTML into (new_urls, data); returns None on missing input."""
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
-- (stray `</div>` left over from the quoted lemma-summary HTML snippet; not part of the code)
-- 7-6 HTML输出器 https://www.imooc.com/video/10693
-- D:\project_py\py_001\baike_spider\html_outputer.py
class HtmlOutputer(object):
    """Accumulates crawled page records and renders them to output.html."""

    def __init__(self):
        # Each entry is a dict with 'url', 'title' and 'summary' keys.
        self.datas = []

    def collect_data(self, data):
        """Store one crawled record; silently ignores None."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write all collected records to output.html as one HTML table.

        BUG FIXES vs. the original:
        - the <table>, </tr> and </table> tags were missing (malformed HTML);
        - the file was not closed if a write raised mid-loop;
        - open('w') + manual .encode('utf-8') breaks on Python 3; io.open
          with an explicit encoding works on both Python 2 and 3 and makes
          the manual encode unnecessary.
        """
        import io  # local import: keeps the pasted notes file self-contained
        with io.open('output.html', 'w', encoding='utf-8') as fout:
            fout.write(u'<html>')
            fout.write(u'<body>')
            fout.write(u'<table>')
            for data in self.datas:
                fout.write(u'<tr>')
                fout.write(u'<td>%s</td>' % data['url'])
                fout.write(u'<td>%s</td>' % data['title'])
                fout.write(u'<td>%s</td>' % data['summary'])
                fout.write(u'</tr>')
            fout.write(u'</table>')
            fout.write(u'</body>')
            fout.write(u'</html>')
-- 7-7 开始运行爬虫和爬取结果展示 https://www.imooc.com/video/10694