用正则表达式爬取古诗词网


在这里插入图片描述

import requests
import re
from pymongo import MongoClient


class Poetry:
    """Regex-based scraper for the poem listing pages of gushiwen.org.

    Pages are downloaded with ``requests``, poems are pulled out of the HTML
    with regular expressions, accumulated in ``self.poetries`` and finally
    inserted into a MongoDB collection.
    """

    def __init__(self):
        # Browser-like User-Agent header sent with every request.
        self.headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"}
        # Client for the MongoDB server at 127.0.0.1:27017.
        self.client = MongoClient(host="127.0.0.1", port=27017)
        # Target collection: database "spider", collection "poetry".
        self.poetry = self.client['spider']['poetry']
        # Poem dicts accumulated across all parse_text() calls.
        self.poetries = []

    def get_text(self, url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout a stalled server would block the crawl
                forever.

        Returns:
            The body decoded with the encoding detected from its content.
        """
        r = requests.get(url, headers=self.headers, timeout=timeout)
        # Trust the encoding guessed from the body over the declared one.
        r.encoding = r.apparent_encoding
        return r.text

    def parse_text(self, text):
        """Extract poems from one page of HTML and append them to ``self.poetries``.

        Each poem is stored as a dict with the keys ``title``, ``dynasty``,
        ``author`` and ``content``. The four fields are matched independently
        and paired by position, so pairing stops at the shortest match list.

        Args:
            text: HTML source of a listing page.
        """
        # re.DOTALL lets '.' match '\n' as well, so every pattern may span lines.
        titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
        # First link inside the "source" paragraph.
        dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
        # Second link inside the "source" paragraph.
        authors = re.findall(r'<p class="source">.*?<a.*?<a.*?>(.*?)</a>', text, re.DOTALL)
        raw_contents = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.DOTALL)
        # Remove leftover tags and newlines from each poem body.
        contents = [re.sub(r'<.*?>|\n', "", raw).strip() for raw in raw_contents]
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents):
            self.poetries.append({
                'title': title,
                'dynasty': dynasty,
                'author': author,
                'content': content,
            })

    def print_poetries(self):
        """Print every accumulated poem dict, one per line."""
        for item in self.poetries:
            print(item)

    def save_to_db(self):
        """Insert all accumulated poems into the MongoDB collection.

        Does nothing when no poems were collected: ``insert_many`` rejects an
        empty document list, which would otherwise crash a crawl that matched
        nothing.
        """
        if not self.poetries:
            return
        self.poetry.insert_many(self.poetries)

    def run(self, start_page=1, end_page=10):
        """Crawl a range of listing pages, then print and store the poems.

        Args:
            start_page: Number of the first page to fetch.
            end_page: Number of the last page to fetch (inclusive).
        """
        template_url = "https://www.gushiwen.org/default_{}.aspx"
        for i in range(start_page, end_page + 1):
            url = template_url.format(i)
            text = self.get_text(url)
            self.parse_text(text)
        self.print_poetries()
        self.save_to_db()


# Script entry point: build the scraper, then crawl the listing pages,
# print the collected poems and insert them into MongoDB.
p = Poetry()
p.run()
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值