"""Scraper for the Chinese classical poetry website (中国古诗词网, gushiwen.org).

Downloads ten listing pages, extracts each poem's title, dynasty, author and
content with regular expressions, prints the records, and stores them in the
MongoDB collection ``spider.poetry`` on 127.0.0.1:27017.
"""
import re

import requests
from pymongo import MongoClient


class Poetry:
    """Fetch poems from the listing pages and save them to MongoDB.

    Attributes:
        headers: HTTP headers sent with every request (browser user-agent).
        client: MongoDB client connected to 127.0.0.1:27017.
        poetry: The ``spider.poetry`` collection that receives the records.
        poetries: Poem dicts accumulated by ``parse_text``.
    """

    # With re.DOTALL, "." also matches "\n", i.e. it matches any character,
    # so a single pattern can span several lines of markup.
    _TITLE_RE = re.compile(r'<div class="cont">.*?<b>(.*?)</b>', re.DOTALL)
    # Text of the first link inside <p class="source">.
    _DYNASTY_RE = re.compile(
        r'<p class="source">.*?<a.*?>(.*?)</a>', re.DOTALL
    )
    # Text of the second link inside <p class="source">.
    _AUTHOR_RE = re.compile(
        r'<p class="source">.*?<a.*?<a.*?>(.*?)</a>', re.DOTALL
    )
    _CONTENT_RE = re.compile(
        r'<div class="contson".*?>(.*?)</div>', re.DOTALL
    )
    # Strips inline tags and newlines from a poem body (no DOTALL here, so a
    # tag is only matched when it sits on a single line).
    _TAG_OR_NEWLINE_RE = re.compile(r'<.*?>|\n')

    def __init__(self):
        """Set up the request headers, the MongoDB handle and the buffer."""
        self.headers = {
            "user-agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/80.0.3987.87 Safari/537.36"
            )
        }
        self.client = MongoClient(host="127.0.0.1", port=27017)
        self.poetry = self.client['spider']['poetry']
        self.poetries = []

    def get_text(self, url, timeout=10):
        """Download ``url`` and return the decoded response body.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up; without
                a timeout a stalled connection would block the crawl forever.

        Returns:
            The page text, decoded with the encoding detected from the body.

        Raises:
            requests.exceptions.RequestException: On connection failure or
                timeout.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Decode with the encoding guessed from the content rather than the
        # one taken from the HTTP headers.
        response.encoding = response.apparent_encoding
        return response.text

    def parse_text(self, text):
        """Extract poems from one page of HTML and append them to ``poetries``.

        Fields are matched independently and then paired by position, so the
        result is truncated to the shortest of the four match lists.
        NOTE(review): if a poem on the page lacks one of the fields, the
        records after it would be paired with the wrong values -- confirm
        against the live markup.

        Args:
            text: HTML source of a listing page.
        """
        titles = self._TITLE_RE.findall(text)
        dynasties = self._DYNASTY_RE.findall(text)
        authors = self._AUTHOR_RE.findall(text)
        contents = [
            self._TAG_OR_NEWLINE_RE.sub("", raw).strip()
            for raw in self._CONTENT_RE.findall(text)
        ]
        for title, dynasty, author, content in zip(
            titles, dynasties, authors, contents
        ):
            self.poetries.append({
                'title': title,
                'dynasty': dynasty,
                'author': author,
                'content': content,
            })

    def print_poetries(self):
        """Print every accumulated poem record, one per line."""
        for item in self.poetries:
            print(item)

    def save_to_db(self):
        """Insert all accumulated poems into the MongoDB collection.

        Does nothing when no poems were parsed, because ``insert_many``
        rejects an empty document list.
        """
        if not self.poetries:
            return
        self.poetry.insert_many(self.poetries)

    def run(self):
        """Crawl listing pages 1-10, then print and store the results."""
        template_url = "https://www.gushiwen.org/default_{}.aspx"
        for page in range(1, 11):
            self.parse_text(self.get_text(template_url.format(page)))
        # Print and insert once, after all pages are collected, so that no
        # record is written to the database twice.
        self.print_poetries()
        self.save_to_db()


if __name__ == "__main__":
    # Only crawl when executed as a script, not when imported.
    p = Poetry()
    p.run()