Before getting to the code, here is the idea behind this simple crawler framework:
Goal: crawl all tagged links under the Baidu Baike entry for Python
Modules and libraries used: urllib, BeautifulSoup
This simple crawler framework is split into 5 modules: a scheduler, a URL manager, an HTML downloader, an HTML parser, and an HTML outputter.
Scheduler: the entry point of the crawler.
URL manager: holds the URLs to be crawled. The entry URL is first handed to the downloader, which downloads the HTML and passes it to the parser; the parser extracts the data we need and stores it, and at the same time collects new URLs, which go back into the URL manager to drive the next download.
This repeats until there are no new URLs left or the crawler is stopped, at which point the results are written out.
Here is the code:
1. Scheduler
from MySpider import urls_manager, html_downloader, html_paser, html_outer


class LegendSpider(object):
    def __init__(self):
        # URL manager
        self.url_manager = urls_manager.UrlManager()
        # HTML downloader
        self.downloader = html_downloader.HtmlDownloader()
        # HTML parser
        self.parser = html_paser.HtmlParser()
        # HTML outputter
        self.outer = html_outer.Htmlouter()

    # Start crawling from the given entry URL
    def start_crow(self, root_url):
        count = 1
        # Seed the URL manager with the crawler's entry URL
        self.url_manager.add_new_url(root_url)
        # Keep crawling while the URL manager still has URLs
        while self.url_manager.has_new_url():
            try:
                # Get the next URL from the URL manager
                new_url = self.url_manager.get_new_url()
                print('craw: %d: %s' % (count, new_url))
                # Download the page content for that URL
                html_content = self.downloader.download_html(new_url)
                # Parse the downloaded HTML to get new URLs and the data we need
                new_urls, new_data = self.parser.parse(new_url, html_content)
                # Feed the new URLs back into the URL manager for later crawling
                self.url_manager.add_new_urls(new_urls)
                # Collect the extracted data
                self.outer.collect_datas(new_data)
                if count == 50:
                    break
                count += 1
            except Exception:
                print('craw failed')
        # Write out the collected data
        self.outer.out_put()


if __name__ == '__main__':
    # Crawler entry URL
    root_url = 'https://baike.baidu.com/item/Python'
    # Create the crawler scheduler
    spider_obj = LegendSpider()
    # Start crawling from the entry URL
    spider_obj.start_crow(root_url)
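The import at the top of the scheduler assumes the other four modules live in a package called MySpider next to this script. A layout like the one below would satisfy that import (the mapping of files to classes is inferred from the code here, so adjust it to your own project):

MySpider/
    __init__.py
    urls_manager.py       # contains UrlManager
    html_downloader.py    # contains HtmlDownloader
    html_paser.py         # contains HtmlParser
    html_outer.py         # contains Htmlouter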
2. URL manager
class UrlManager(object):
    def __init__(self):
        # Two sets: URLs not yet crawled and URLs already crawled
        self.new_urls = set()
        self.old_urls = set()

    # Add a single new URL
    def add_new_url(self, new_url):
        if new_url is None:
            return
        if new_url not in self.new_urls and new_url not in self.old_urls:
            self.new_urls.add(new_url)

    # Add a batch of new URLs
    def add_new_urls(self, new_urls):
        if new_urls is None or len(new_urls) == 0:
            return
        for new_url in new_urls:
            self.add_new_url(new_url)

    # Are there still URLs left to crawl?
    def has_new_url(self):
        return len(self.new_urls) != 0

    # Get an uncrawled URL and mark it as crawled
    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
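For reference, this is how the manager behaves when fed duplicates (a small sketch that just exercises the UrlManager class defined above):

manager = UrlManager()
manager.add_new_url('https://baike.baidu.com/item/Python')
manager.add_new_url('https://baike.baidu.com/item/Python')  # duplicate, silently ignored
print(manager.has_new_url())   # True
url = manager.get_new_url()    # the URL is moved into old_urls
manager.add_new_url(url)       # already crawled, so it is not re-queued
print(manager.has_new_url())   # False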
3. HTML downloader
from urllib import request


class HtmlDownloader(object):
    # Download the page body for the given URL
    def download_html(self, new_url):
        if new_url is None:
            return None
        with request.urlopen(new_url) as f:
            if f.status != 200:
                return None
            return f.read().decode('utf-8')
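The downloader above uses urllib's default request headers. Some sites reject urllib's default User-Agent or hang on slow connections, so in practice you may want to send a browser-like header and set a timeout. A minimal variant along those lines (the header string and the 10-second timeout are my own choices, not part of the original code):

from urllib import request


class HtmlDownloader(object):
    def download_html(self, new_url):
        if new_url is None:
            return None
        # Pretend to be a browser; some sites block the default urllib User-Agent
        req = request.Request(new_url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            with request.urlopen(req, timeout=10) as f:
                if f.status != 200:
                    return None
                return f.read().decode('utf-8')
        except Exception:
            return None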
4. HTML parser
import re
from urllib import parse
from bs4 import BeautifulSoup


class HtmlParser(object):
    def parse(self, page_url, html_content):
        if page_url is None or html_content is None:
            return
        # Parsing is done with the third-party library BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")
        new_urls = self.get_new_urls(page_url, soup)
        new_data = self.get_new_data(page_url, soup)
        return new_urls, new_data

    def get_new_urls(self, page_url, soup):
        new_urls = set()
        # e.g. <a target="_blank" href="/item/%E6%95%99%E5%AD%A6">教学</a>
        links = soup.find_all('a', href=re.compile(r'/item/[0-9a-zA-Z%]+'))
        for link in links:
            new_url = link['href']
            # Turn the relative href into an absolute URL
            full_url = parse.urljoin(page_url, new_url)
            new_urls.add(full_url)
        return new_urls

    def get_new_data(self, page_url, soup):
        new_data = {'url': page_url}
        # e.g. <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        new_data['title'] = title_node.get_text()
        # e.g. <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        new_data['summary'] = summary_node.get_text()
        return new_data
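To see what the link-extraction step does in isolation, here is a tiny self-contained example (the HTML snippet is made up for illustration; the regex and the urljoin call are the same as above):

import re
from urllib import parse
from bs4 import BeautifulSoup

html = '<a href="/item/%E6%95%99%E5%AD%A6">教学</a> <a href="/help">help</a>'
soup = BeautifulSoup(html, 'html.parser')
# Only hrefs matching /item/... are kept and then resolved against the current page URL
for link in soup.find_all('a', href=re.compile(r'/item/[0-9a-zA-Z%]+')):
    print(parse.urljoin('https://baike.baidu.com/item/Python', link['href']))
# prints: https://baike.baidu.com/item/%E6%95%99%E5%AD%A6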
5. HTML outputter
class Htmlouter(object):
    def __init__(self):
        self.datas = []

    def collect_datas(self, new_data):
        if new_data is None:
            return
        self.datas.append(new_data)

    # Write the collected data out as an HTML file
    def out_put(self):
        fout = open('out.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<head>')
        # Declare the page encoding as UTF-8
        fout.write('<meta charset="UTF-8"/>')
        fout.write('</head>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
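The same out_put method can also be written with a with block so the file is always closed, and with html.escape so that angle brackets in the scraped text cannot break the generated table. A sketch of that variant (functionally equivalent to the code above apart from the escaping):

import html

    def out_put(self):
        with open('out.html', 'w', encoding='utf-8') as fout:
            fout.write('<html><head><meta charset="UTF-8"/></head><body><table>')
            for data in self.datas:
                fout.write('<tr>')
                # Escape the scraped text so it cannot break the generated HTML
                fout.write('<td>%s</td>' % html.escape(data['url']))
                fout.write('<td>%s</td>' % html.escape(data['title']))
                fout.write('<td>%s</td>' % html.escape(data['summary']))
                fout.write('</tr>')
            fout.write('</table></body></html>')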
The result looks something like this: