Think about how convenient Python's import mechanism is. A crawler can be organized into modules of its own in the same way: hand a URL to a module, and the module parses the page and returns its information, which is very handy when crawling a large site. Following this structure, our crawler program can be divided into a URL manager, an HTML downloader, an HTML parser, a data storage module, and a spider scheduler.
URL Manager
class UrlManager():
    # initialize two empty sets: URLs waiting to be crawled and URLs already crawled
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    # add a single URL to the set of URLs waiting to be crawled
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # check whether there are still URLs waiting to be crawled
    def has_new_url(self):
        return self.new_url_size() != 0

    # take one URL from the set of uncrawled URLs and move it to the crawled set
    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    # add a batch of links discovered during crawling to the waiting set
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    # number of URLs waiting to be crawled
    def new_url_size(self):
        return len(self.new_urls)

    # number of URLs already crawled
    def old_url_size(self):
        return len(self.old_urls)
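A minimal sketch of how the URL manager is meant to be used on its own; the URLs here are placeholders for illustration, not part of the crawler:

manager = UrlManager()
manager.add_new_url('https://example.com/a')                      # seed the waiting set
manager.add_new_urls({'https://example.com/b', 'https://example.com/c'})
while manager.has_new_url():
    url = manager.get_new_url()                                   # moves the URL into the crawled set
    print('crawling', url)
print('crawled %d urls in total' % manager.old_url_size())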
HTML Downloader
import requests

class HtmlDownloader():
    def download(self, url):
        # return None if the URL is empty
        if url is None:
            return None
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        # a timeout keeps the crawler from hanging on a slow page
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code == 200:
            res.encoding = 'utf-8'
            return res.text
        return None
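The downloader can be checked on its own before the rest of the pipeline exists; this quick sketch fetches the root entry page used by the scheduler below and prints a rough size check:

downloader = HtmlDownloader()
html = downloader.download('https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB')
print(len(html) if html else 'download failed')                   # length of the response body, or a failure notice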
HTML Parser
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup

class HtmlParser(object):
    def parser(self, page_url, html_cont):
        # return nothing if the URL or the page content is empty
        if page_url is None or html_cont is None:
            return
        # build the soup object (html_cont is already a decoded str, so no encoding hint is needed)
        soup = BeautifulSoup(html_cont, 'html.parser')
        new_urls = self._get_new_urls(soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, soup):
        new_urls = set()
        # extract the matching <a> tags; the book's original pattern no longer works because Baidu has changed its URL scheme
        # links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))
        links = soup.find_all('a', href=re.compile(r'/item/.*'))
        for link in links:
            # take the tag's href attribute and join it with the site root to get a full URL
            new_url = link['href']
            base_url = 'https://baike.baidu.com'
            new_full_url = urljoin(base_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        data = {}
        data['url'] = page_url
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        print(title.string)
        data['title'] = title.string
        summary = soup.find('div', class_='lemma-summary').find('div', class_='para').find('a')
        print(summary.string)
        data['summary'] = summary.string
        return data
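Downloader and parser can be exercised together before wiring up the full scheduler. A short sketch, assuming the Baidu Baike page structure targeted above still holds:

root_url = 'https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB'
downloader = HtmlDownloader()
parser = HtmlParser()
html = downloader.download(root_url)
new_urls, data = parser.parser(root_url, html)
print(len(new_urls), 'links found')                               # links discovered on the root page
print(data['title'], '->', data['summary'])                       # extracted title and summary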
Data Storage
import codecs

class DataOutput():
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # write the collected data out as an HTML page, formatted as a table
        fout = codecs.open('baike.html', 'w', encoding='utf-8')
        fout.write("<html>")
        fout.write("<head><meta charset='utf-8'></head>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        # clear the buffer after the loop; removing items while iterating would skip entries
        self.datas = []
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
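On its own the data store is just an in-memory list plus an HTML dump. A minimal sketch, with a made-up record purely for illustration:

output = DataOutput()
output.store_data({'url': 'https://baike.baidu.com/item/example',
                   'title': 'example',
                   'summary': 'placeholder summary'})
output.output_html()                                              # writes baike.html in the working directory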
Spider Scheduler
from spider_base.DataOutput import DataOutput
from spider_base.HtmlDownloader import HtmlDownloader
from spider_base.HtmlParser import HtmlParser
from spider_base.UrlManager import UrlManager

class SpiderMan():
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        # crawl 100 entries; without a limit the crawl would keep going forever
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('links crawled so far: %s' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed:', e)
        self.output.output_html()

if __name__ == '__main__':
    spider_man = SpiderMan()
    spider_man.crawl('https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB')
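The imports at the top of the scheduler assume each class lives in its own module inside a package named spider_base. A layout along these lines makes them resolve (the module file names follow from the import statements; the scheduler's own file name is arbitrary):

spider_base/
    __init__.py
    UrlManager.py        # class UrlManager
    HtmlDownloader.py    # class HtmlDownloader
    HtmlParser.py        # class HtmlParser
    DataOutput.py        # class DataOutput
SpiderMan.py             # the scheduler above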