request爬取豆瓣网小说网页前10页所有小说的信息,并保存到mysql

最新推荐文章于 2022-09-05 17:01:59 发布

qq_51803767

最新推荐文章于 2022-09-05 17:01:59 发布

阅读量703

点赞数 3

分类专栏： 1 文章标签： mysql python 数据库爬虫

本文链接：https://blog.csdn.net/qq_51803767/article/details/115824090

版权

1 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

文章目录

导入文件

import requests
from lxml import etree
import MySQLdb
import datetime

初始化爬取网页和代理头

    def __init__(self):
        self.url="https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T"
        self.headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
                    }

获取网页内容

    def getData(self,url):#获取网页内容
        response=requests.get(url,headers=self.headers)
        response=response.content.decode()
        return response

解析处理数据

    def parseData(self,data,i):
        html=etree.HTML(data)
        el_list=html.xpath('//*[@id="subject_list"]/ul/li/div[2]')
        for el in el_list:
            data_dict={}
            data_dict['title']=el.xpath('./h2/a/text()')[0].strip()
            data_dict['author']=el.xpath('./div[1]/text()')[0].strip().split(" / ")[0]
            data_dict['publisher']=el.xpath('./div[1]/text()')[0].strip().split(" / ")[-3]
            #处理日期数据
            date=el.xpath('./div[1]/text()')[0].strip().split(" / ")[-2].replace("年", "-").replace("日", "-").replace("月", "-").replace(".", "-").strip("-")
            date=date.split("-") 
            if(len(date)<=2):
                date=datetime.datetime(int(date[0]),int(date[1]),1)
            else:
                date=datetime.datetime(int(date[0]),int(date[1]),int(date[2]))
            data_dict['publish_time']=date
            #处理数字数据
            data_dict['price']=el.xpath('./div[1]/text()')[0].strip().split(" / ")[-1].strip()
            rating=el.xpath('./div[2]/span[2]/text()')[0].strip()
            data_dict['rating']=float(rating)
            data_dict['comments']=el.xpath('./div[2]/span[3]/text()')[0].strip()
            #处理简介
            if(el.xpath('./p/text()')):
                data_dict['introduction']=el.xpath('./p/text()')[0].strip()
            else:
                data_dict['introduction']=""
            
            
            data_list.append(data_dict)
        #翻页
        self.url="https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={}&type=T".format(i*20)

数据爬取用xml定位数据
其中日期类型格式样式太乱，需要处理转成datetime类型

将爬出数据保存到mysql

    def savedb(self):
        db = MySQLdb.connect("localhost", "root", "root", "db1", charset='utf8mb4' )
        cursor = db.cursor()
        #创建表
        try:
        # 执行sql语句
            cursor.execute('''CREATE TABLE novel(
                    id INT PRIMARY KEY AUTO_INCREMENT,
                    title VARCHAR(32) NOT NULL,
                    author VARCHAR(32) NOT NULL,
                    publisher VARCHAR(32) NOT NULL,
                    publish_time DATE,
                    price varchar(32),
                    rating DOUBLE,
                    comments VARCHAR(32),
                    introduction VARCHAR(128)
                );''')
            for i in range(len(data_list)):
                sql = "INSERT INTO novel(title,author,publisher,publish_time,price,rating,comments,introduction) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s)" 
                cursor.execute(sql,[data_list[i]['title'],data_list[i]['author'],data_list[i]['publisher'],data_list[i]['publish_time'],data_list[i]['price'],data_list[i]['rating'],data_list[i]['comments'],data_list[i]['introduction']])
                db.commit()
        except:
        # Rollback in case there is any error
            db.rollback()

        # 关闭数据库连接
        db.close()

保存数据时遇到emoj表情数据，mysql无法保存
解决办法：将数据库字符集改为utf8mb4 注：需mysql5.53以上版本
保存结果展示：
在这里插入图片描述

查询数据库内容

    def query(self,sql):
        db = MySQLdb.connect("localhost", "root", "root", "db1", charset='utf8mb4' )
        # 使用cursor()方法获取操作游标 
        cursor = db.cursor()
        try:
        # 执行sql语句
            cursor.execute(sql)
            results = cursor.fetchall()
            for row in results:
                title=row[1]
                author=row[2]
                publisher=row[3]
                publish_time=row[4]
                price=row[5]
                rating=row[6]
                comments=row[7]
                introduction=row[8]
                print("书名:"+title+",作者:"+author+",出版社:"+publisher+",出版时间:"+str(publish_time)+",价格:"+price+",评分:"+str(rating)+",评价人数:"+comments)
                print("简介\n"+introduction)
        except:
        # Rollback in case there is any error
            db.rollback()
        # 关闭数据库连接
        db.close()

运行函数

    def run(self):
        for i in range(1,11):
            data=self.getData(self.url)
            self.parseData(data,i)
        self.savedb()

主函数

if __name__ =='__main__':
    douban=Douban()
    douban.run()
    douban.query("SELECT * FROM novel WHERE id=1;")

查询结果展示
在这里插入图片描述

qq_51803767

关注

3
点赞
踩
5

收藏

觉得还不错? 一键收藏
5
评论
request爬取豆瓣网小说网页前10页所有小说的信息,并保存到mysql

文章目录初始化爬取网页和代理头获取网页内容解析处理数据将爬出数据保存到mysql查询数据库内容运行函数主函数初始化爬取网页和代理头 def __init__(self): self.url="https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T" self.headers={ 'User-Agent': 'Mozilla/5.0 (Windows N
复制链接

扫一扫