爬虫之诗文传颂

# encoding: utf-8
# author: Batac

import requests
import re
import json

class ShiwenSpider:
    """Spider that pages through a poem-listing site and saves each poem.

    Scraped fields per poem: title, dynasty ("chaodai"), author ("name")
    and the poem body ("content"). Results are appended to movice.txt as
    pretty-printed JSON objects, one page at a time.
    """

    def __init__(self):
        """Initialise paging state, the first page URL and request headers."""
        self.current_page = 1
        # Lower bound on the page count; raised once the real total is
        # read from the page's "sumPage" label during parsing.
        self.total_page = 2
        self.base_url = self._page_url(self.current_page)
        self.header = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
        }

    @staticmethod
    def _page_url(page):
        """Return the listing URL for a 1-based *page* number."""
        return "https://www.***.org/default_" + str(page) + ".aspx"

    def parse_url(self):
        """Fetch the current page and return its HTML decoded as UTF-8.

        Raises:
            requests.HTTPError: on a non-2xx response (instead of silently
                parsing an error page).
            requests.Timeout: if the server does not answer within 10s.
        """
        response = requests.get(self.base_url, headers=self.header, timeout=10)
        response.raise_for_status()
        return response.content.decode('utf-8')

    def data_content(self, html_str):
        """Parse one listing page into a list of poem dicts.

        Also raises ``self.total_page`` when the page reports a larger
        "sumPage" value, so ``run`` knows how far to crawl.

        Args:
            html_str: decoded HTML of one listing page.

        Returns:
            List of dicts with keys "title", "chaodai", "name", "content".
        """
        sum_page = re.findall(r'<label id="sumPage".*?>(.*?)</label>', html_str)
        if sum_page:
            page = int(sum_page[0])
            if self.total_page < page:
                self.total_page = page
        titles = re.findall(r'<div\sclass="yizhu">.*?<b>(.*?)</b>', html_str, re.DOTALL)
        # NOTE(review): the first <a> inside p.source is stored as "chaodai"
        # (dynasty) and the second as "name" (author) — behaviour preserved
        # as-is; verify the order against the live page markup.
        chaodai = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', html_str)
        author = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', html_str)
        contents = re.findall(r'<div\sclass="contson"\sid=".*?">(.*?)</div>', html_str, re.DOTALL)
        # Strip residual markup tags from the poem bodies.
        poems = [re.sub('<.*?>', '', content).strip() for content in contents]
        return [
            {"title": title, "chaodai": chao, "name": name, "content": con}
            for title, name, chao, con in zip(titles, author, chaodai, poems)
        ]

    # Backward-compatible alias for the original (misspelled) method name.
    data_contetn = data_content

    def save_data(self, items):
        """Append each poem dict in *items* to movice.txt as JSON, one per line."""
        with open("movice.txt", "a", encoding="utf-8") as f:
            for content in items:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")
        print("第" + str(self.current_page) + "页保存结束")

    def run(self):
        """Crawl every listing page from current_page up to total_page."""
        while self.total_page >= self.current_page:
            print("第" + str(self.current_page) + "页开始查询数据")
            html = self.parse_url()
            poems = self.data_content(html)
            self.save_data(poems)
            self.current_page += 1
            self.base_url = self._page_url(self.current_page)




if __name__ == "__main__":
    sw = ShiwenSpider()
    sw.run()

备注:项目只用作学习交流使用;

发布了124 篇原创文章 · 获赞 11 · 访问量 4万+
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览