1. Crawling steps
2. How to determine the crawl strategy:
- 1. URL format
- 2. Data format
- 3. Page encoding
Right-click the page and choose "Inspect Element" to look at the key tag pairs and the page's encoding; a quick sketch for checking all three points by hand follows below.
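A minimal sketch for confirming the three points against the live page (assuming beautifulsoup4 is installed; the seed URL is the one the crawler below starts from, and the site may reject the default urllib User-Agent):

# Minimal sketch: check URL format, data format and page encoding by hand.
import re
import urllib.request
from bs4 import BeautifulSoup

url = "https://baike.baidu.com/item/Python/407313"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

print(soup.original_encoding)                               # 3. page encoding
print(soup.find('a', href=re.compile(r"/item/")))           # 1. URL format of entry links
print(soup.find('dd', class_='lemmaWgt-lemmaTitle-title'))  # 2. data format: title tag
print(soup.find('div', class_='lemma-summary'))             # 2. data format: summary tag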
3. Baidu Baike entry information
Note: this is a focused (targeted) crawler; the scraping strategy may change as the website is updated.
4. Entry module - main.py
#!/usr/bin/python
# coding=utf-8
import url_manager, html_download, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        # URL manager
        self.urls = url_manager.UrlManager()
        # HTML downloader
        self.downloader = html_download.HtmlDownload()
        # HTML parser
        self.parser = html_parser.HtmlParser()
        # HTML outputter
        self.outputer = html_outputer.HtmlOutputer()

    # Scheduler that drives the whole crawl
    def craw(self, root_url):
        # number of entries crawled so far
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                # take the next URL waiting to be crawled
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                # download the page source
                html_content = self.downloader.downloader(new_url)
                if html_content is None:
                    print("html_content is None")
                    continue  # skip pages that failed to download
                # analyse the page structure to get its URLs, title and summary
                new_urls, new_data = self.parser.parse(new_url, html_content)
                # merge the newly found URLs into the pool
                self.urls.add_new_urls(new_urls)
                # store this page's data for later output
                self.outputer.collect_data(new_data)
                # only crawl 1000 entries
                if count == 1000:
                    break
                count = count + 1
            except Exception:
                # traceback.print_exc()
                print("craw failed")
        self.outputer.output_html()


if __name__ == '__main__':
    # seed URL: the Baidu Baike entry for Python
    root_url = "https://baike.baidu.com/item/Python/407313"
    obj_spider = SpiderMain()
    # start the crawler
    obj_spider.craw(root_url)
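Running the entry module drives the whole pipeline; the console lines below come straight from the print statement in craw() (every URL after the first depends on what the live site returns and on set ordering, so this transcript is illustrative only):

$ python main.py
craw 1 : https://baike.baidu.com/item/Python/407313
craw 2 : https://baike.baidu.com/item/...
...

After 1000 entries the loop stops and the collected results are written to output.html.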
5. URL manager - url_manager.py
#!/usr/bin/python
# coding=utf-8
# URL manager
class UrlManager(object):
    def __init__(self):
        # store URLs; sets guarantee there are no duplicates
        self.new_urls = set()  # URLs not yet crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        # add the URL only if it is in neither new_urls nor old_urls
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)
            # print("add_new_url", url)

    def add_new_urls(self, urls):
        """
        :param urls: an iterable of URLs
        :return: None
        """
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """
        Whether any uncrawled URLs are left
        :return: bool
        """
        return len(self.new_urls) != 0

    def get_new_url(self):
        """
        Take one URL out and mark it as crawled
        :return: the URL taken out
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        # print("get_new_url", new_url)
        return new_url
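A quick sanity check of the manager's dedup behavior (a hypothetical snippet, not part of the original project files):

# Hypothetical sanity check for UrlManager.
from url_manager import UrlManager

m = UrlManager()
m.add_new_urls(["https://baike.baidu.com/item/Python/407313",
                "https://baike.baidu.com/item/Python/407313"])  # duplicate is dropped
print(m.has_new_url())  # True
print(m.get_new_url())  # the single stored URL, now moved to old_urls
m.add_new_url("https://baike.baidu.com/item/Python/407313")  # already in old_urls, ignored
print(m.has_new_url())  # False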
6. HTML downloader - html_download.py
#!/usr/bin/python
# coding=utf-8
'''
HTML downloader
'''
import urllib.request


class HtmlDownload(object):
    def downloader(self, url):
        # fetch the page source for a URL
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            print("response.getcode() =", response.getcode())
            return None
        return response.read()
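Baidu Baike may reject requests carrying urllib's default User-Agent; if downloads come back empty or raise HTTP errors, sending a browser-like header and catching URLError usually helps. A sketch of a more defensive download step, under that assumption (the header value is illustrative):

# Sketch: a more defensive download step; the User-Agent string is illustrative.
import urllib.request
import urllib.error

def download(url, timeout=10):
    if url is None:
        return None
    request = urllib.request.Request(
        url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib.request.urlopen(request, timeout=timeout)
    except urllib.error.URLError as e:  # URLError also covers HTTPError
        print("download failed:", e)
        return None
    if response.getcode() != 200:
        return None
    return response.read()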
7. HTML parser - html_parser.py
#!/usr/bin/python
# coding=utf-8
'''
HTML parser
Given a URL and its source, extract the new URL list plus the title and summary
'''
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

"""
Three methods:
_get_new_urls(self, page_url, soup): collect outgoing URLs
_get_new_data(self, page_url, soup): collect the page's data
parse(self, page_url, html_content): run BeautifulSoup over html_content
"""


class HtmlParser(object):
    # analyse the downloaded page source for URLs and page data
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # collect every entry link, i.e. <a> tags such as:
        # <a class="lock-lemma" target="_blank" href="/view/10812319.htm">
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # join the relative href onto the page URL
            # (Python 2.x used urlparse.urljoin instead);
            # page_url supplies the host part, new_url is appended to it:
            #   page_url     https://baike.baidu.com/item/Python/407313
            #   new_url      /item/秒懂本尊答
            #   new_full_url https://baike.baidu.com/item/秒懂本尊答
            new_full_url = urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """
        Extract the entry title and its summary
        :param page_url:
        :param soup:
        :return: dict with url, title and summary
        """
        res_data = {}
        res_data['url'] = page_url
        # Press F12 / Inspect on the page; the title markup looks like:
        # <dd class="lemmaWgt-lemmaTitle-title">
        #     <h1>Python</h1>
        #     <h2>(计算机程序设计语言)</h2>
        #     ...
        # </dd>
        # take the <h1> inside that <dd> tag
        title_node = soup.find(
            'dd', class_='lemmaWgt-lemmaTitle-title').find("h1")
        res_data['title'] = title_node.get_text()
        # the summary markup looks like:
        # <div class="lemma-summary" label-module="lemmaSummary">
        #     <div class="para" label-module="para">Python ... </div>
        #     ...
        # </div>
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_content):
        if page_url is None or html_content is None:
            return None, None
        # print('parse html_content = ', html_content)
        soup = BeautifulSoup(
            html_content, 'html.parser', from_encoding='utf-8')
        # the links found in the page source
        new_urls = self._get_new_urls(page_url, soup)
        # the data found in the page source
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
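The parser can be exercised without touching the network; a hypothetical smoke test against a static snippet modelled on the markup above:

# Hypothetical offline smoke test for HtmlParser (not part of the original post).
from html_parser import HtmlParser

snippet = b'''
<html><body>
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is an interpreted language.</div>
<a href="/item/Guido%20van%20Rossum">Guido van Rossum</a>
</body></html>
'''
urls, data = HtmlParser().parse(
    "https://baike.baidu.com/item/Python/407313", snippet)
print(urls)            # {'https://baike.baidu.com/item/Guido%20van%20Rossum'}
print(data['title'])   # Python
print(data['summary'])# Python is an interpreted language.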
8. HTML outputter - html_outputer.py
#!/usr/bin/python
# coding=utf-8
'''
HTML outputter
'''


class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        # print("collect_data = ", data)
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write('<html>')
        # declare the charset so browsers render the UTF-8 text correctly
        fout.write('<head><meta charset="utf-8"></head>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
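If a scraped title or summary ever contains "<" or "&", the raw string interpolation above would corrupt the table markup; escaping the fields is a small hardening step (a sketch, not in the original code):

# Sketch: escape scraped text before writing it into output.html.
import html

def escaped_cell(value):
    # html.escape converts <, > and & so scraped text cannot break the table
    return "<td>%s</td>" % html.escape(value)

# e.g. inside output_html():
#   fout.write(escaped_cell(data['title']))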
9. Code structure
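The four modules above plus the entry script and the generated result page make up the whole project (the directory name here is arbitrary):

spider/
├── main.py            # scheduler / entry point
├── url_manager.py     # URL manager
├── html_download.py   # HTML downloader
├── html_parser.py     # HTML parser
├── html_outputer.py   # HTML outputter
└── output.html        # generated result table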
10. Run results
The crawler fetches 1000 related Baidu Baike pages and parses each one's title and summary.
This article is a repost; copyright remains with the original author.
Original article: https://blog.csdn.net/qq_27009517/article/details/