A crawler starts from a seed URL and fetches the content of that page. From each page it extracts two kinds of data: links to further pages, and the data content of the page itself.
The crawler is made up of a few components: a downloader (Downloader), a URL manager (URL_manager), a page parser (parser), and an outputer (outputer). A minimal scheduler that ties them together is sketched at the end of this section.
What each part does:
The downloader fetches the source code of a page.
import urllib.request

class HtmlDownloader(object):
    def download(self, new_url):
        # Nothing to download without a URL.
        if new_url is None:
            return None
        response = urllib.request.urlopen(new_url)
        # Only accept successful HTTP responses.
        if response.getcode() != 200:
            return None
        # Return the raw page source (bytes).
        html_cont = response.read()
        return html_cont
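The downloader can be tried on its own; a quick, hedged check might look like the following (the URL is only an example, taken from the comments in the parser below).

# Standalone check of the downloader; the URL is just an example.
downloader = HtmlDownloader()
html_cont = downloader.download('http://news.baidu.com')
if html_cont is not None:
    print('downloaded %d bytes' % len(html_cont))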
The URL manager keeps two sets of addresses: new URLs waiting to be crawled, and old URLs that have already been crawled, so the same page is never fetched twice.
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled
    def add_new_url(self, url):
        if url is None:
            return
        # Only queue URLs we have not seen before.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)
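The crawl loop also needs a way to take a URL out of the waiting set and remember it as crawled; that part is not shown above. A minimal sketch of the two extra methods it would need, written as additional methods of UrlManager (the names has_new_url and get_new_url are assumptions):

    # Hypothetical additions to UrlManager (method names assumed):
    def has_new_url(self):
        # True while there are URLs still waiting to be crawled.
        return len(self.new_urls) != 0
    def get_new_url(self):
        # Hand out one waiting URL and move it to the crawled set.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url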
The parser extracts the useful data from the page source: links to other pages, and the content we actually want to keep.
import re
from bs4 import BeautifulSoup

class HtmlParser(object):
    def _get_new_urls(self, url, soup):
        urlset = set()
        # Example URLs seen while crawling:
        # http://lianghui.huanqiu.com/2016/roll/2016-03/8722543.html?from=bdwz
        # http://news.baidu.com
        # Follow only links whose href mentions lianghui.huanqiu or huanqiu.lianghui
        # (the dots in the pattern match any character).
        new_urls = soup.find_all('a', href=re.compile(r'.lianghui.huanqiu.|.huanqiu.lianghui.'))
        for link in new_urls:
            new_url = link['href']
            urlset.add(new_url)
        return urlset

    def _get_new_data(self, url, soup):
        res_data = {}
        res_data['url'] = url
        # The article title is the <h1> inside <div class="conText">.
        cont_node = soup.find('div', class_='conText')
        if cont_node is None:
            return None
        title_node = cont_node.find('h1')
        if title_node is None:
            return None
        res_data['title'] = title_node.get_text()
        # Strip navigation blocks that would pollute the article text.
        for cls in ('reTopics', 'text-c mg_t20'):
            node = soup.find('div', class_=cls)
            if node is not None:
                node.decompose()
        # Collect the article body paragraph by paragraph.
        content_nodes = soup.select('.conText .text > p')
        if not content_nodes:
            return None
        text = ''
        for content in content_nodes:
            text = text + content.get_text().strip() + '\n'
        res_data['content'] = text
        print(res_data)
        return res_data
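Both helpers above expect a BeautifulSoup object, so the parser also needs a public entry point that builds the soup from the downloaded bytes and calls them. That method is not shown above; a minimal sketch, assuming it is called parse and that the pages decode as UTF-8:

    # Hypothetical entry point of HtmlParser (method name and encoding assumed):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data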
The outputer saves the collected data to a file. Below it is shown wrapped in a small class so the snippet is self-contained; the class and method names (HtmlOutputer, collect_data, output) are assumptions, only the writing loop itself comes from the snippet.
# Wrapper class and method names are assumed; the writing loop is the original snippet.
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []  # records collected during the crawl
    def collect_data(self, data):
        if data is not None:
            self.datas.append(data)
    def output(self):
        fout = open("output.txt", 'w', encoding='utf-8')
        for data in self.datas:
            fout.write('%s' % data['title'])
            fout.write('\n')
            fout.write('%s' % data['url'])
            fout.write('\n')
            fout.write('%s' % data['content'])
            fout.write('\n=====NEXT PAGE=====\n')
        fout.close()
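Putting the four parts together, a minimal scheduler could look like the sketch below. It assumes the helper methods sketched earlier (has_new_url, get_new_url, parse) and the outputer's collect_data/output methods; the class name SpiderMain, the craw entry point, and the page limit are likewise assumptions, and the seed URL is simply the one from the comments in the parser.

# A minimal crawl loop (names and limits here are assumptions, not the original code).
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
    def craw(self, root_url, max_pages=50):
        self.urls.add_new_url(root_url)
        count = 0
        while self.urls.has_new_url() and count < max_pages:
            new_url = self.urls.get_new_url()
            html_cont = self.downloader.download(new_url)
            if html_cont is None:
                continue
            new_urls, new_data = self.parser.parse(new_url, html_cont)
            self.urls.add_new_urls(new_urls)
            if new_data is not None:
                self.outputer.collect_data(new_data)
            count += 1
        self.outputer.output()

if __name__ == '__main__':
    SpiderMain().craw('http://news.baidu.com')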