Crawling every novel on a site with Python 3, saving to local files and a MySQL database, and handling out-of-order local files and merging them


The full source code is linked at the end.

//                            _ooOoo_
//                           o8888888o
//                           88" . "88
//                           (| -_- |)
//                            O\ = /O
//                        ____/`---'\____
//                      .   ' \| |// `.
//                       / \||| : |||// \
//                     / _||||| -:- |||||- \
//                       | | \\\ - /// | |
//                     | \_| ''\---/'' | |
//                      \ .-\__ `-` ___/-. /
//                   ___`. .' /--.--\ `. . __
//                ."" '< `.___\_<|>_/___.' >'"".
//               | | : `- \`.;`\ _ /`;.`/ - ` : | |
//                 \ \ `-. \_ __\ /__ _/ .-` / /
//         ======`-.____`-.___\_____/___.-`____.-'======
//                            `=---='
//
//         .............................................
//                  佛祖镇楼                  BUG辟易
//          佛曰:
//                  写字楼里写字间,写字间里程序员;
//                  程序人员写程序,又拿程序换酒钱。
//                  酒醒只在网上坐,酒醉还来网下眠;
//                  酒醉酒醒日复日,网上网下年复年。
//                  但愿老死电脑间,不愿鞠躬老板前;
//                  奔驰宝马贵者趣,公交自行程序员。
//                  别人笑我忒疯癫,我笑自己命太贱;
//                  不见满街漂亮妹,哪个归得程序员?

1. Project setup

scrapy startproject xinnuan
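
After the command runs, Scrapy generates its standard project skeleton. Because the settings later in this article register the pipelines as 'xinnuan.pipelines.pipelines.XinnuanPipeline' and 'xinnuan.pipelines.mysqlPipelines.MysqldbPipeline', the generated single pipelines.py is assumed to have been replaced by a pipelines package; a possible layout (the file names inside pipelines/ are assumptions, not shown in the original):

xinnuan/
    scrapy.cfg
    xinnuan/
        __init__.py
        items.py
        settings.py
        spiders/
            __init__.py
            xinnuan2.py
        pipelines/
            __init__.py
            pipelines.py        # XinnuanPipeline (local files)
            mysqlPipelines.py   # MysqldbPipeline (MySQL)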

2. Implementation

1. Define the data to scrape in items.py

import scrapy


class XinnuanItem(scrapy.Item):
    name = scrapy.Field()     # novel title
    chapter = scrapy.Field()  # chapter title
    url = scrapy.Field()      # full chapter URL
    uri = scrapy.Field()      # URL path with the domain and ".html" stripped
    content = scrapy.Field()  # chapter text
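
Scrapy items behave like dictionaries, which is how the spider and pipelines below fill and read them; a tiny illustrative check, not part of the project:

# assuming XinnuanItem is importable from xinnuan.items
item = XinnuanItem()
item['name'] = '某小说'
item['chapter'] = '第一章'
print(dict(item))  # {'name': '某小说', 'chapter': '第一章'}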

2. Create xinnuan2.py under spiders and write the crawling logic

# -*- coding: utf-8 -*-
import scrapy
import unicodedata
from scrapy.http import Request
from ..items import XinnuanItem


class Xncwxw2Spider(scrapy.Spider):
    name = 'xncwxw2'
    allowed_domains = ['m.xncwxw2.com']
    start_urls = ['http://m.xncwxw2.com/xclass/0/1.html']
    base_url = 'http://m.xncwxw2.com'

    def parse(self, response):
        # num = response.xpath("//p[@class='page']/input/@value").extract()[0].split("/")[-1]
        # print("%s pages in total" % num)
        # next page of the novel list
        next_link = response.xpath("//p[@class='page']/a[text()='[下页]']/@href").extract()
        if next_link:
            yield Request(url=self.base_url + next_link[0], callback=self.parse)

        # links to each novel's detail page
        page_links = response.xpath('//div[@id="main"]/div/a/@href').extract()
        for link in page_links:
            link = self.base_url + link
            yield Request(url=link, callback=self.parse_chapter_list)


    # parse a novel's chapter list
    def parse_chapter_list(self, response):
        next_link = response.xpath('//span[@class="right"]/a/@href').extract()
        # next page of the chapter list
        if next_link:
            yield Request(url=self.base_url + next_link[0], callback=self.parse_chapter_list)

        chapter_links = response.xpath('//div[@class="info_chapters"]/ul[2]//a/@href').extract()
        if chapter_links:
            for link in chapter_links:
                yield Request(url=self.base_url + link, callback=self.parse_chapter)


    # parse a chapter page
    def parse_chapter(self, response):
        # a long chapter is split across pages; follow the "下一页" link
        next_link = response.xpath('//a[text()="下一页"]/@href').extract()
        if next_link:
            yield Request(url=self.base_url + next_link[0], callback=self.parse_chapter)

        item = XinnuanItem()
        item['url'] = response.url
        item['uri'] = response.url.replace(self.base_url, "").replace(".html", "")
        item['name'] = response.xpath('//div[@class="pagetitle"]/h1/text()').extract()[0]
        item['chapter'] = unicodedata.normalize('NFKC', response.xpath('//div[@id="nr_title"]/text()').extract()[0])
        contents = response.xpath('//div[@id="nr1"]/text()').extract()
        # drop the leading text nodes before the actual chapter body
        content = "".join(contents[6:])
        item['content'] = unicodedata.normalize('NFKC', content)
        yield item
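
Before launching the full crawl, the XPath expressions above can be sanity-checked interactively with scrapy shell against the start page; an illustrative session (output omitted):

scrapy shell "http://m.xncwxw2.com/xclass/0/1.html"
>>> response.xpath("//p[@class='page']/a[text()='[下页]']/@href").extract()
>>> response.xpath('//div[@id="main"]/div/a/@href').extract()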



3. Write the pipelines that save the content (the MySQL pipeline writes asynchronously)

  • Saving to local files
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

class XinnuanPipeline(object):

    def __init__(self, file_dir=None):
        # path separator used when building file paths
        self.separate = '/'
        # fall back to the current directory if FILE_DIR is not configured
        self.file_dir = file_dir if file_dir else './'

    # read the save directory from settings when the pipeline is created
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(settings['FILE_DIR'])

    def process_item(self, item, spider):
        # one folder per novel
        path = self.file_dir + item['name'] + self.separate
        if not os.path.exists(path):
            os.makedirs(path)

        # one .hj file per chapter, named from the tail of the uri (first 9 characters dropped)
        filePath = path + item['uri'][9:] + '.hj'
        if os.path.exists(filePath):
            os.remove(filePath)

        with open(filePath, mode='a', encoding='gbk') as f:
            f.write(item['chapter'])
            f.write(item['content'])
        return item
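
One caveat with the pipeline above: GBK cannot represent every Unicode character, so a single unusual character in a chapter would raise UnicodeEncodeError and drop the whole item. A minimal variant of the write step that degrades gracefully, using the built-in errors parameter of open (the rest of process_item stays unchanged):

        # replace characters that GBK cannot encode instead of failing the whole item
        with open(filePath, mode='a', encoding='gbk', errors='replace') as f:
            f.write(item['chapter'])
            f.write(item['content'])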

  • Saving to MySQL
import pymysql
from twisted.enterprise import adbapi


class MysqldbPipeline(object):

    def __init__(self,dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):  # fixed method name; Scrapy calls it and passes in the settings
        """
        Set up the database connection pool.
        :param settings: project settings
        :return: pipeline instance
        """
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            cursorclass=pymysql.cursors.DictCursor  # return rows as dicts
        )
        # twisted ConnectionPool backed by pymysql (MySQLdb would also work)
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        # return the pipeline instance
        return cls(dbpool)

    def process_item(self, item, spider):
        """
        Use twisted to make the MySQL insert asynchronous: the connection pool runs the
        actual SQL and returns a Deferred.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)  # run do_insert with a cursor and the item
        query.addErrback(self.handle_error)  # handle errors from the insert
        return item

    def do_insert(self, cursor, item):
        # no explicit commit needed: twisted commits the interaction automatically
        insert_sql = """
           INSERT INTO xinnuan2(url,uri,name,chapter,content) VALUES(%s,%s,%s,%s,%s)
                       """
        cursor.execute(insert_sql, (item['url'], item['uri'], item['name'], item['chapter'],
                                    item['content']))

    def handle_error(self, failure):
        if failure:
            # log the error
            print(failure)
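
The MySQL pipeline assumes a table named xinnuan2 already exists in the configured database. The article does not show its schema; a minimal sketch that would satisfy the INSERT above, using the connection values from settings.py (the column types and the utf8mb4 charset are assumptions):

import pymysql

# connection values taken from the MYSQL_* settings below
conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       db='python', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS xinnuan2 (
            id INT PRIMARY KEY AUTO_INCREMENT,
            url VARCHAR(255),
            uri VARCHAR(255),
            name VARCHAR(255),
            chapter VARCHAR(255),
            content LONGTEXT
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()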

4. Edit the project configuration file settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for xinnuan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'xinnuan'

SPIDER_MODULES = ['xinnuan.spiders']
NEWSPIDER_MODULE = 'xinnuan.spiders'

# Log level and log file location
LOG_LEVEL = "DEBUG"
LOG_FILE = "../logs/xiaoshuo.log"

# Directory the novel files are saved to
FILE_DIR = "F:/txt/xinnuan2/"

# Custom MySQL settings
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "python"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"

# Commented out by default, but important: without a User-Agent the site can easily
# tell the request comes from a script. Simply writing Mozilla/5.0 is enough.
USER_AGENT = 'Mozilla/5.0'

# Obey robots.txt? Defaults to True; set it to False, otherwise many pages cannot be crawled
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'xinnuan.middlewares.XinnuanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'xinnuan.middlewares.XinnuanDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'xinnuan.pipelines.pipelines.XinnuanPipeline': 300,
    'xinnuan.pipelines.mysqlPipelines.MysqldbPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

5. Write a script to run the project

from scrapy import cmdline

cmdline.execute('scrapy crawl xncwxw2'.split())
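
An equivalent way to start the crawl programmatically is Scrapy's CrawlerProcess; this is an alternative sketch, not part of the original project:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load the project's settings.py and run the spider by name
process = CrawlerProcess(get_project_settings())
process.crawl('xncwxw2')
process.start()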

6. Sort and merge the locally saved files

import os
import shutil

file_dir = 'F:/txt/xinnuan2'

# walk the save directory and merge each novel's folder
def merge(dir):
    if not os.path.exists(dir):
        print("Directory does not exist!")
        return
    listdir = os.listdir(dir)
    for name in listdir:
        path = getpath(dir, name)
        if ismerge(path):  # check that the folder only contains chapter files (optional)
            merging(path, name, dir)

# merge all chapter files in one folder into a single .txt
def merging(dir, fileName, savePath):
    if not os.path.exists(dir):
        return
    if not os.path.exists(savePath):
        return
    if not os.path.isdir(dir):
        return
    print("Merging folder %s ..." % dir)
    print("Generating file %s.txt ..." % fileName)
    filePath = getpath(savePath, fileName) + '.txt'
    if os.path.exists(filePath):
        os.remove(filePath)
    with open(filePath, mode='a', encoding='gbk') as f:
        fileList = os.listdir(dir)
        # NOTE: sort() orders the names lexicographically; see the numeric-sort sketch after this script
        fileList.sort()
        print(fileList)
        for name in fileList:
            if not name.endswith('.hj'):
                continue
            path = getpath(dir, name)
            if not os.path.isfile(path):
                continue
            with open(path, mode='r', encoding='gbk') as r:
                line = r.readline()
                while line:
                    f.write(line)
                    line = r.readline()

    print('File "%s.txt" generated successfully!' % fileName)
    # delete the folder once it has been merged
    shutil.rmtree(dir)



def getpath(dir,name):
    return dir + '/' + name


# check whether a folder contains only .hj chapter files
def ismerge(dir):
    if not os.path.exists(dir):
        return False
    if not os.path.isdir(dir):
        return False
    listdir = os.listdir(dir)
    for name in listdir:
        path = getpath(dir,name)
        if not os.path.isfile(path):
            return False
        if not path.endswith(".hj"):
            return False
    return True

merge(file_dir)
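
As noted in the script, fileList.sort() sorts names lexicographically, so '10.hj' comes before '2.hj'. Since the .hj files are named from the numeric tail of the chapter URI, a numeric sort key keeps the chapters in reading order; a small sketch, assuming the part of each file name before '.hj' is a plain integer:

def chapter_key(name):
    # '123.hj' -> 123; anything non-numeric sorts last
    stem = name[:-3] if name.endswith('.hj') else name
    return int(stem) if stem.isdigit() else float('inf')

# inside merging(), replace the plain sort with:
#     fileList.sort(key=chapter_key)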

3. Notes

  • Running the project as-is crawls every novel on the target site, which takes a long time
  • You can modify the script under spiders to drop the next-page requests and reduce how much is crawled
  • The script under final should be run on its own after the crawl has finished

4. Finally

If you found this useful, show some support however you can.


Source code: xinnuan
