《Python笔记》Requests爬虫（2）爬取小说

最新推荐文章于 2023-09-15 21:16:55 发布

学弟不想努力了

最新推荐文章于 2023-09-15 21:16:55 发布

阅读量402

点赞数

分类专栏： Python

本文链接：https://blog.csdn.net/Eternal_Blue/article/details/103166655

版权

Python 专栏收录该内容

13 篇文章 1 订阅

订阅专栏

前言

这里只需要一个py文件就能实现数据采集

它区别于之前记录的方式，这里没有使用Scrapy框架，直接通过Requests提取

使用Requests前，需要提前安装好本文用到的第三方库：pip install requests lxml pymysql

代码注释我已经写得挺清晰的了~~~

目标：

1. 创建普通的python爬虫项目

2. 爬取正确的数据

(1) 对爬取的数据进行格式转换

3. 将爬取的数据存储到数据库

一、新建一个py文件

# 文件名
myCrawler.py

二、py代码如下

# coding:utf-8
import requests
from lxml import etree
import pymysql

#  执行命令：python myCrawler.py
def job() -> None:
    """Scrape one novel chapter page and store it in the MySQL ``fiction`` table.

    Steps:
      1. Download the hard-coded chapter URL and parse it with lxml.
      2. Read the heading text from ``div.title_txtbox`` and split it into
         a chapter label and a chapter title.
      3. Join all text nodes under ``div.content`` into the chapter body.
      4. Insert the row into ``fiction`` unless a row with the same chapter
         label already exists.

    Raises:
        requests.RequestException: if the page cannot be downloaded, times
            out, or the server answers with an HTTP error status.
        pymysql.MySQLError: if connecting or the existence query fails.
            Errors raised by the INSERT itself are rolled back and printed
            instead of propagated.
    """
    # Fetch the page source.
    url = 'http://book.zongheng.com/chapter/885037/58155562.html'
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10)
    # Fail fast on 4xx/5xx so an error page is never stored as chapter text.
    response.raise_for_status()
    selector = etree.HTML(response.content)

    # Heading div holds the chapter number and chapter name, e.g. ['第一章黑衣剑客'].
    # Join the text nodes directly instead of slicing the list's repr, which
    # produced "[]" for no match and "a', 'b" for several matches.
    title_nodes = selector.xpath("//div[@class='title_txtbox']/text()")
    raw_title = "".join(title_nodes).strip()
    chapter, title = _split_chapter_title(raw_title)

    # Chapter body: concatenate every text node and trim surrounding whitespace.
    content_nodes = selector.xpath("//div[@class='content']//text()")
    content = "".join(content_nodes).strip()

    # Keyword arguments are required by PyMySQL >= 1.0; utf8mb4 matches the
    # table definition so Chinese text round-trips correctly.
    db = pymysql.connect(
        host="127.0.0.1",
        user="root",
        password="123456",
        database="mypython",
        charset="utf8mb4",
    )
    try:
        print(" ---- 数据库连接成功 ---- ")
        with db.cursor() as cursor:
            # Parameterized queries: the driver escapes the values, so quotes
            # in scraped text cannot break or inject into the statement.
            cursor.execute(
                "SELECT 1 FROM fiction WHERE chapter = %s LIMIT 1",
                (chapter,),
            )
            if cursor.fetchone():
                print(" ---- 数据已存在 ---- ")
                return

            # Row does not exist yet: insert it.
            try:
                cursor.execute(
                    "INSERT INTO fiction(chapter, title, content) "
                    "VALUES (%s, %s, %s)",
                    (chapter, title, content),
                )
                db.commit()
                print(" ---- 新增成功 ---- ")
            except pymysql.MySQLError as e:
                db.rollback()  # undo the partial write
                print(e)
    finally:
        # Always release the connection, including on early return or error.
        db.close()


def _split_chapter_title(raw_title: str) -> "tuple[str, str]":
    """Split a heading such as '第一章黑衣剑客' into ('第一章', '黑衣剑客').

    The split happens at the first '章'. When the heading contains no '章',
    the chapter label is '-' and the whole heading is returned as the title.
    """
    prefix, marker, rest = raw_title.partition("章")
    if not marker:
        return "-", raw_title
    return prefix + marker, rest

# Run the crawler only when this file is executed as a script
# (python myCrawler.py), not when it is imported as a module.
if __name__ == "__main__":
    job()

三、执行代码

python myCrawler.py

四、本地MySQL数据库

-- Create the database used by the crawler (if missing) and select it, so the
-- table below is created in `mypython` rather than in whatever schema is current.
CREATE DATABASE IF NOT EXISTS mypython DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
USE mypython;

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for fiction
-- ----------------------------
DROP TABLE IF EXISTS `fiction`;
CREATE TABLE `fiction`  (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `chapter` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '小说章节数',
  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '小说标题',
  `content` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '小说内容',
  PRIMARY KEY (`id`) USING BTREE
-- The table collation must belong to the table character set: utf8_general_ci
-- is not valid for utf8mb4, so utf8mb4_general_ci is used (same as the columns).
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;