A Simple Crawler Framework: Baidu Baike
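
This post walks through a minimal crawler framework for Baidu Baike, split into five small modules: URLManager (tracks pending and visited URLs), HTMLDownLoader (fetches pages with urllib), HTMLParser (extracts the entry title, summary, and outgoing links with lxml and XPath), FileWriter (writes the results to disk), and FrameManager (wires the pieces together and drives the crawl loop).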

URLManager.py
class URLManager:
	def __init__(self):
		self.new_urls = set()  # URLs waiting to be crawled
		self.old_urls = set()  # URLs that have already been crawled

	def is_new_urls_empty(self):
		return len(self.new_urls) == 0

	def add_new_url(self, url):
		if url not in self.old_urls:
			self.new_urls.add(url)

	def add_new_urls(self, urls):
		for url in urls:
			self.add_new_url(url)

	def get_new_url(self):
		url = self.new_urls.pop()
		self.old_urls.add(url)
		return url
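
A quick standalone check (a hypothetical sketch, not one of the framework files) shows how the manager deduplicates. Note that set.pop() returns an arbitrary element, so the crawl order is not first-in-first-out:

from URLManager import URLManager

manager = URLManager()
manager.add_new_url("https://baike.baidu.com/item/Python")
manager.add_new_urls([
	"https://baike.baidu.com/item/Python",     # already pending, the set keeps a single copy
	"https://baike.baidu.com/item/HTML",
])

while not manager.is_new_urls_empty():
	print(manager.get_new_url())               # each URL moves into old_urls as it is handed out

manager.add_new_url("https://baike.baidu.com/item/Python")
print(manager.is_new_urls_empty())             # True: already-crawled URLs are never re-queued
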
HTMLDownLoader.py
import urllib.request


class HTMLDownLoader:

	def get_page(self, url):
		headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
		}
		request = urllib.request.Request(url=url, headers=headers)
		response = urllib.request.urlopen(request)
		html = response.read().decode('utf-8')
		# print(html)
		return html
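
The downloader sends a plain GET with a desktop User-Agent so Baidu Baike serves the normal page. It does no error handling, so a failed request raises urllib.error.URLError and a non-UTF-8 response would fail to decode. A hypothetical standalone call (assuming network access):

from HTMLDownLoader import HTMLDownLoader

downloader = HTMLDownLoader()
html = downloader.get_page("https://baike.baidu.com/item/Python")
print(len(html))    # crude check that some HTML actually came back
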
HTMLParser.py
from lxml import etree
import urllib.parse


class HTMLParser:
	def parse_content(self, content):
		self.selector = etree.HTML(content, etree.HTMLParser())

	def get_url(self, page_url):
		# Collect the links inside the summary block and turn them into absolute URLs
		items = self.selector.xpath('//div[@class="lemma-summary"]//a')
		urls = []
		for item in items:
			url = urllib.parse.urljoin(page_url, item.get('href'))
			# print(url)
			urls.append(url)
		return urls

	def get_abstract(self):
		titles = self.selector.xpath('//h1/text()')
		abstracts = self.selector.xpath('//div[@class="lemma-summary"]//text()')
		title = titles[0] if titles else ""    # guard against pages without an <h1>
		abstract = "".join(t for t in abstracts if t != "\n")
		return title, abstract
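
The XPath expressions are tied to Baidu Baike's layout at the time of writing: the entry title sits in an h1 and the summary in a div with class lemma-summary, so they may need updating if the site changes. A minimal sketch (hypothetical, built on the two modules above) of how the parser is driven:

from HTMLDownLoader import HTMLDownLoader
from HTMLParser import HTMLParser

page_url = "https://baike.baidu.com/item/Python"
html = HTMLDownLoader().get_page(page_url)

parser = HTMLParser()
parser.parse_content(html)                 # must be called before the two getters
links = parser.get_url(page_url)           # absolute URLs found inside the summary block
title, abstract = parser.get_abstract()
print(title, len(links))
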
FileWriter.py
import os


class FileWriter:
	def __init__(self):
		self.path = None
		self.name = None
		self.file = None

	def set_path(self, path):
		"""
		Set the directory the output file will be saved in.
		:param path: the folder to save into
		:return:
		"""
		self.path = path

	def set_file_name(self, name):
		self.name = name

	def open_file(self):
		os.makedirs(self.path, exist_ok=True)
		self.file = open(os.path.join(self.path, self.name), "w", encoding='utf-8')

	def write_file(self, title, abstract):
		# One entry per line: title, a tab, then the abstract
		self.file.write(title + "\t" + abstract + "\n")

	def close_file(self):
		self.file.close()
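
The writer expects a fixed call order: set_path and set_file_name first, then open_file (which creates the directory if it is missing), then any number of write_file calls, and finally close_file. A hypothetical sketch with made-up content:

from FileWriter import FileWriter

writer = FileWriter()
writer.set_path(r"D:\tutorial\MyFrame")
writer.set_file_name("demo.txt")              # hypothetical file name
writer.open_file()
writer.write_file("Python", "An example abstract for the Python entry.")
writer.close_file()
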
FrameManager.py
from HTMLDownLoader import HTMLDownLoader
from HTMLParser import HTMLParser
from URLManager import URLManager
from FileWriter import FileWriter

import urllib.parse
import random
import time


class FrameManager:
	def __init__(self):
		self.downloader = HTMLDownLoader()
		self.parser = HTMLParser()
		self.urls = URLManager()
		self.file = FileWriter()

		self.num = 0

	def crawl(self, url):
		# Seed the URL manager with the start URL
		self.urls.add_new_url(url=url)
		self.file.set_path(r"D:\tutorial\MyFrame")
		self.file.set_file_name("百度百科.txt")
		self.file.open_file()
		# Stop when there are no pending URLs left, or after 5 pages have been crawled
		while not self.urls.is_new_urls_empty() and self.num < 5:
			current_url = self.urls.get_new_url()

			html = self.downloader.get_page(current_url)	# download the page HTML
			self.num += 1
			print("Finished crawling page", self.num)
			self.parser.parse_content(html)
			urls = self.parser.get_url(current_url)   # parse the page and collect the URLs it links to
			title, abstract = self.parser.get_abstract()
			print(title)
			print(abstract)
			self.file.write_file(title, abstract)
			print("Finished parsing page", self.num)
			self.urls.add_new_urls(urls=urls)               # queue the newly found URLs for later crawling

			# Be polite: pause 5 to 15 seconds between requests
			time.sleep(random.randint(5, 15))
		self.file.close_file()


if __name__ == "__main__":
	framemanager = FrameManager()
	keyword = input("Enter the Baidu Baike entry to crawl: ")
	base_url = "https://baike.baidu.com/item/"
	quote = urllib.parse.quote(keyword, encoding="utf-8")
	start_url = base_url + quote
	framemanager.crawl(start_url)
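
To try the crawler (it needs Python 3 and the lxml package), save the five files above in one directory and run FrameManager.py. It asks for a keyword, URL-encodes it into a Baike entry URL, crawls up to five linked entries, and writes each title and abstract to 百度百科.txt under D:\tutorial\MyFrame:

python FrameManager.py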

 
