Crawling every novel on a site with Python 3, saving to local files and a MySQL database, and handling out-of-order local files and merging them


The full source code is linked at the end.

//                            _ooOoo_
//                           o8888888o
//                           88" . "88
//                           (| -_- |)
//                            O\ = /O
//                        ____/`---'\____
//                      .   ' \| |// `.
//                       / \||| : |||// \
//                     / _||||| -:- |||||- \
//                       | | \\\ - /// | |
//                     | \_| ''\---/'' | |
//                      \ .-\__ `-` ___/-. /
//                   ___`. .' /--.--\ `. . __
//                ."" '< `.___\_<|>_/___.' >'"".
//               | | : `- \`.;`\ _ /`;.`/ - ` : | |
//                 \ \ `-. \_ __\ /__ _/ .-` / /
//         ======`-.____`-.___\_____/___.-`____.-'======
//                            `=---='
//
//         .............................................
//                  佛祖镇楼                  BUG辟易
//          佛曰:
//                  写字楼里写字间,写字间里程序员;
//                  程序人员写程序,又拿程序换酒钱。
//                  酒醒只在网上坐,酒醉还来网下眠;
//                  酒醉酒醒日复日,网上网下年复年。
//                  但愿老死电脑间,不愿鞠躬老板前;
//                  奔驰宝马贵者趣,公交自行程序员。
//                  别人笑我忒疯癫,我笑自己命太贱;
//                  不见满街漂亮妹,哪个归得程序员?

1. Project setup

scrapy startproject xinnuan
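
After the command runs, Scrapy generates its standard project skeleton. Because the settings later in this article register the pipelines as 'xinnuan.pipelines.pipelines.XinnuanPipeline' and 'xinnuan.pipelines.mysqlPipelines.MysqldbPipeline', the generated single pipelines.py is assumed to have been replaced by a pipelines package; a possible layout (the file names inside pipelines/ are assumptions, not shown in the original):

xinnuan/
    scrapy.cfg
    xinnuan/
        __init__.py
        items.py
        settings.py
        spiders/
            __init__.py
            xinnuan2.py
        pipelines/
            __init__.py
            pipelines.py        # XinnuanPipeline (local files)
            mysqlPipelines.py   # MysqldbPipeline (MySQL)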

2. Implementation

1. Define the data to scrape in items.py

import scrapy


class XinnuanItem(scrapy.Item):
    name = scrapy.Field()     # novel title
    chapter = scrapy.Field()  # chapter title
    url = scrapy.Field()      # full chapter URL
    uri = scrapy.Field()      # URL path with the domain and ".html" stripped
    content = scrapy.Field()  # chapter text
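
Scrapy items behave like dictionaries, which is how the spider and pipelines below fill and read them; a tiny illustrative check, not part of the project:

# assuming XinnuanItem is importable from xinnuan.items
item = XinnuanItem()
item['name'] = '某小说'
item['chapter'] = '第一章'
print(dict(item))  # {'name': '某小说', 'chapter': '第一章'}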

2. Create xinnuan2.py under spiders and write the crawling logic

# -*- coding: utf-8 -*-
import scrapy
import unicodedata
from scrapy.http import Request
from ..items import XinnuanItem


class Xncwxw2Spider(scrapy.Spider):
    name = 'xncwxw2'
    allowed_domains = ['m.xncwxw2.com']
    start_urls = ['http://m.xncwxw2.com/xclass/0/1.html']
    base_url = 'http://m.xncwxw2.com'

    def parse(self, response):
        # num = response.xpath("//p[@class='page']/input/@value").extract()[0].split("/")[-1]
        # print("%s pages in total" % num)
        # next page of the novel list
        next_link = response.xpath("//p[@class='page']/a[text()='[下页]']/@href").extract()
        if next_link:
            yield Request(url=self.base_url + next_link[0], callback=self.parse)

        # links to each novel's detail page
        page_links = response.xpath('//div[@id="main"]/div/a/@href').extract()
        for link in page_links:
            link = self.base_url + link
            yield Request(url=link, callback=self.parse_chapter_list)


    # parse a novel's chapter list
    def parse_chapter_list(self, response):
        next_link = response.xpath('//span[@class="right"]/a/@href').extract()
        # next page of the chapter list
        if next_link:
            yield Request(url=self.base_url + next_link[0], callback=self.parse_chapter_list)

        chapter_links = response.xpath('//div[@class="info_chapters"]/ul[2]//a/@href').extract()
        if chapter_links:
            for link in chapter_links:
                yield Request(url=self.base_url + link, callback=self.parse_chapter)


    # parse a chapter page
    def parse_chapter(self, response):
        # a long chapter is split across pages; follow the "下一页" link
        next_link = response.xpath('//a[text()="下一页"]/@href').extract()
        if next_link:
            yield Request(url=self.base_url + next_link[0], callback=self.parse_chapter)

        item = XinnuanItem()
        item['url'] = response.url
        item['uri'] = response.url.replace(self.base_url, "").replace(".html", "")
        item['name'] = response.xpath('//div[@class="pagetitle"]/h1/text()').extract()[0]
        item['chapter'] = unicodedata.normalize('NFKC', response.xpath('//div[@id="nr_title"]/text()').extract()[0])
        contents = response.xpath('//div[@id="nr1"]/text()').extract()
        # drop the leading text nodes before the actual chapter body
        content = "".join(contents[6:])
        item['content'] = unicodedata.normalize('NFKC', content)
        yield item
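
Before launching the full crawl, the XPath expressions above can be sanity-checked interactively with scrapy shell against the start page; an illustrative session (output omitted):

scrapy shell "http://m.xncwxw2.com/xclass/0/1.html"
>>> response.xpath("//p[@class='page']/a[text()='[下页]']/@href").extract()
>>> response.xpath('//div[@id="main"]/div/a/@href').extract()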



3. Write the pipelines that save the content (the MySQL pipeline writes asynchronously)

  • Saving to local files
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

class XinnuanPipeline(object):

    def __init__(self, file_dir=None):
        # path separator used when building file paths
        self.separate = '/'
        # fall back to the current directory if FILE_DIR is not configured
        self.file_dir = file_dir if file_dir else './'

    # read the save directory from settings when the pipeline is created
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(settings['FILE_DIR'])

    def process_item(self, item, spider):
        # one folder per novel
        path = self.file_dir + item['name'] + self.separate
        if not os.path.exists(path):
            os.makedirs(path)

        # one .hj file per chapter, named from the tail of the uri (first 9 characters dropped)
        filePath = path + item['uri'][9:] + '.hj'
        if os.path.exists(filePath):
            os.remove(filePath)

        with open(filePath, mode='a', encoding='gbk') as f:
            f.write(item['chapter'])
            f.write(item['content'])
        return item
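
One caveat with the pipeline above: GBK cannot represent every Unicode character, so a single unusual character in a chapter would raise UnicodeEncodeError and drop the whole item. A minimal variant of the write step that degrades gracefully, using the built-in errors parameter of open (the rest of process_item stays unchanged):

        # replace characters that GBK cannot encode instead of failing the whole item
        with open(filePath, mode='a', encoding='gbk', errors='replace') as f:
            f.write(item['chapter'])
            f.write(item['content'])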

  • Saving to MySQL
import pymysql
from twisted.enterprise import adbapi


class MysqldbPipeline(object):

    def __init__(self,dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):  # fixed method name; Scrapy calls it and passes in the settings
        """
        Set up the database connection pool.
        :param settings: project settings
        :return: pipeline instance
        """
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            cursorclass=pymysql.cursors.DictCursor  # return rows as dicts
        )
        # twisted ConnectionPool backed by pymysql (MySQLdb would also work)
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        # return the pipeline instance
        return cls(dbpool)

    def process_item(self, item, spider):
        """
        Use twisted to make the MySQL insert asynchronous: the connection pool runs the
        actual SQL and returns a Deferred.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)  # run do_insert with a cursor and the item
        query.addErrback(self.handle_error)  # handle errors from the insert
        return item

    def do_insert(self, cursor, item):
        # no explicit commit needed: twisted commits the interaction automatically
        insert_sql = """
           INSERT INTO xinnuan2(url,uri,name,chapter,content) VALUES(%s,%s,%s,%s,%s)
                       """
        cursor.execute(insert_sql, (item['url'], item['uri'], item['name'], item['chapter'],
                                    item['content']))

    def handle_error(self, failure):
        if failure:
            # log the error
            print(failure)
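
The MySQL pipeline assumes a table named xinnuan2 already exists in the configured database. The article does not show its schema; a minimal sketch that would satisfy the INSERT above, using the connection values from settings.py (the column types and the utf8mb4 charset are assumptions):

import pymysql

# connection values taken from the MYSQL_* settings below
conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       db='python', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS xinnuan2 (
            id INT PRIMARY KEY AUTO_INCREMENT,
            url VARCHAR(255),
            uri VARCHAR(255),
            name VARCHAR(255),
            chapter VARCHAR(255),
            content LONGTEXT
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()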

4. Edit the project configuration file settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for xinnuan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'xinnuan'

SPIDER_MODULES = ['xinnuan.spiders']
NEWSPIDER_MODULE = 'xinnuan.spiders'

# Log level and log file location
LOG_LEVEL = "DEBUG"
LOG_FILE = "../logs/xiaoshuo.log"

# Directory the novel files are saved to
FILE_DIR = "F:/txt/xinnuan2/"

# Custom MySQL settings
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "python"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"

# Commented out by default, but important: without a User-Agent the site can easily
# tell the request comes from a script. Simply writing Mozilla/5.0 is enough.
USER_AGENT = 'Mozilla/5.0'

# Obey robots.txt? Defaults to True; set it to False, otherwise many pages cannot be crawled
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'xinnuan.middlewares.XinnuanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'xinnuan.middlewares.XinnuanDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'xinnuan.pipelines.pipelines.XinnuanPipeline': 300,
    'xinnuan.pipelines.mysqlPipelines.MysqldbPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

5. Write a script to run the project

from scrapy import cmdline

cmdline.execute('scrapy crawl xncwxw2'.split())
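
An equivalent way to start the crawl programmatically is Scrapy's CrawlerProcess; this is an alternative sketch, not part of the original project:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load the project's settings.py and run the spider by name
process = CrawlerProcess(get_project_settings())
process.crawl('xncwxw2')
process.start()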

6. Sort and merge the locally saved files

import os
import shutil

file_dir = 'F:/txt/xinnuan2'

# walk the save directory and merge each novel's folder
def merge(dir):
    if not os.path.exists(dir):
        print("Directory does not exist!")
        return
    listdir = os.listdir(dir)
    for name in listdir:
        path = getpath(dir, name)
        if ismerge(path):  # check that the folder only contains chapter files (optional)
            merging(path, name, dir)

# merge all chapter files in one folder into a single .txt
def merging(dir, fileName, savePath):
    if not os.path.exists(dir):
        return
    if not os.path.exists(savePath):
        return
    if not os.path.isdir(dir):
        return
    print("Merging folder %s ..." % dir)
    print("Generating file %s.txt ..." % fileName)
    filePath = getpath(savePath, fileName) + '.txt'
    if os.path.exists(filePath):
        os.remove(filePath)
    with open(filePath, mode='a', encoding='gbk') as f:
        fileList = os.listdir(dir)
        # NOTE: sort() orders the names lexicographically; see the numeric-sort sketch after this script
        fileList.sort()
        print(fileList)
        for name in fileList:
            if not name.endswith('.hj'):
                continue
            path = getpath(dir, name)
            if not os.path.isfile(path):
                continue
            with open(path, mode='r', encoding='gbk') as r:
                line = r.readline()
                while line:
                    f.write(line)
                    line = r.readline()

    print('File "%s.txt" generated successfully!' % fileName)
    # delete the folder once it has been merged
    shutil.rmtree(dir)



def getpath(dir,name):
    return dir + '/' + name


# check whether a folder contains only .hj chapter files
def ismerge(dir):
    if not os.path.exists(dir):
        return False
    if not os.path.isdir(dir):
        return False
    listdir = os.listdir(dir)
    for name in listdir:
        path = getpath(dir,name)
        if not os.path.isfile(path):
            return False
        if not path.endswith(".hj"):
            return False
    return True

merge(file_dir)
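
As noted in the script, fileList.sort() sorts names lexicographically, so '10.hj' comes before '2.hj'. Since the .hj files are named from the numeric tail of the chapter URI, a numeric sort key keeps the chapters in reading order; a small sketch, assuming the part of each file name before '.hj' is a plain integer:

def chapter_key(name):
    # '123.hj' -> 123; anything non-numeric sorts last
    stem = name[:-3] if name.endswith('.hj') else name
    return int(stem) if stem.isdigit() else float('inf')

# inside merging(), replace the plain sort with:
#     fileList.sort(key=chapter_key)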

3. Notes

  • Running the project as-is crawls every novel on the target site, which takes a long time
  • You can modify the script under spiders to drop the next-page requests and reduce how much is crawled
  • The script under final should be run on its own after the crawl has finished

4. Finally

If you found this useful, show some support however you can.


Source code: xinnuan
