爬取小说并保存到数据库中

最新推荐文章于 2024-01-20 14:31:26 发布

boy_china_tian

最新推荐文章于 2024-01-20 14:31:26 发布

阅读量3.9k

点赞数 11

本文链接：https://blog.csdn.net/weixin_48192346/article/details/113833087

版权

爬取小说并存入数据库中

小说网站

小说网站：https://www.qb50.com/fenlei
这个网站没有反爬虫措施，所以小说资源很容易爬取下来
这里用requests和xpath直接获取

创建数据库

创建两种类型的数据表：一种是存放小说章节内容的数据表，还有一种是记录小说名、作者及其对应数据表名的 books 表。前一种需要大量创建，所以在 Python 代码中实现；第二种直接在数据库中手动创建。

代码实现

import requests
from lxml import etree
import random
import pymysql

# Letter pool used to build random table names (the letter 'm' is not in it).
list1 = list('abcdefghijkl' 'nopqrstuvwxyz')

# Create a randomly named table that will hold one novel's chapters
def mysql():
    """Create a chapter table with a random name and return that name.

    The name is made of 3-9 distinct letters sampled from ``list1``.  The
    table has two columns: ``name varchar(20)`` and ``content text``.

    A random name may already be taken by an existing table, or may happen
    to be a MySQL reserved word (e.g. ``for``, ``and``, ``set``); in both
    cases CREATE TABLE fails.  A fresh name is then drawn, up to a bounded
    number of attempts, instead of aborting the whole crawl.

    Returns:
        str: name of the table that was created.

    Raises:
        pymysql.MySQLError: if no table could be created after all attempts.
    """
    max_attempts = 5
    wood = pymysql.Connect(host='localhost', user='root', password='root', db='we')
    try:
        with wood.cursor() as we:
            for attempt in range(1, max_attempts + 1):
                # random.sample already returns the letters in random order,
                # so the shared pool does not need to be shuffled in place.
                data = ''.join(random.sample(list1, random.randint(3, 9)))
                sql = 'create table {}(name varchar(20), content text)'.format(data)
                try:
                    we.execute(sql)
                except pymysql.MySQLError:
                    # Name clash or reserved word: retry with another name,
                    # but surface the error once the attempts are used up.
                    if attempt == max_attempts:
                        raise
                else:
                    return data
    finally:
        # Always release the connection, even when CREATE TABLE fails.
        wood.close()

# Record a novel and the name of its chapter table in the `books` table
def books(book_name,author,data):
    """Insert one row describing a novel into the ``books`` table.

    Args:
        book_name: title of the novel.
        author: author of the novel.
        data: name of the table that stores this novel's chapters.

    The inserted row has four values: title, author, chapter-table name and
    a cover placeholder (no cover is downloaded yet, so it is ``None``).
    """
    # NOTE(review): with a parameterized query ``None`` is stored as SQL NULL
    # rather than the text "None" -- confirm the fourth column is nullable.
    photo = None
    wood = pymysql.Connect(host='localhost', user='root', password='root', db='we')
    try:
        with wood.cursor() as we:
            # Let the driver escape the values: scraped titles and author
            # names may contain quotes or backslashes.
            we.execute(
                'insert into books values(%s,%s,%s,%s)',
                (book_name, author, data, photo),
            )
        wood.commit()
    finally:
        # Always release the connection, even when the insert fails.
        wood.close()

# Store one chapter in the novel's randomly named table
def dao(item,data):
    """Insert a chapter into the table named ``data``.

    Args:
        item: dict with the chapter title under ``'name'`` and the chapter
            body under ``'text'``.
        data: name of the target table.
    """
    # Identifiers cannot be bound as query parameters, so the table name is
    # backtick-quoted by hand (embedded backticks doubled).  '%' is doubled
    # too because the driver applies %-formatting to the statement text.
    table = '`{}`'.format(data.replace('`', '``')).replace('%', '%%')
    sql = 'insert into {} values (%s,%s)'.format(table)
    wood = pymysql.Connect(host='localhost', user='root', password='root', db='we')
    try:
        with wood.cursor() as we:
            # Chapter text routinely contains quotes; let the driver escape it.
            we.execute(sql, (item['name'], item['text']))
        wood.commit()
    finally:
        # Always release the connection, even when the insert fails.
        wood.close()

# Fetch the text of a single chapter
def fun3(url_,name):
    """Download one chapter page and return its title and body text.

    Uses the module-level ``headers`` dict for the request.

    Args:
        url_: URL of the chapter page.
        name: chapter title, copied into the result unchanged.

    Returns:
        dict: ``{'name': name, 'text': <chapter text>}`` where the text is
        the concatenated text nodes of ``<div id="content">`` with the
        site's watermark sentence removed.
    """
    wood = requests.get(url=url_, headers=headers)
    # Decode from the detected encoding, as fun2 does; otherwise the Chinese
    # watermark below may not match a mis-decoded page.
    wood.encoding = wood.apparent_encoding
    we = etree.HTML(wood.text)
    text = we.xpath('//div[@id="content"]/text()')
    text = ''.join(text).replace('全本小说网 www.qb50.com，最快更新最新章节！', '')
    return {'name': name, 'text': text}
# Crawl one novel: read its chapter index, then fetch and store each chapter
def fun2(next_url,book_name):
    """Scrape every chapter of one novel and store it in the database.

    Fetches the novel's index page, reads the author and the chapter list,
    creates a new chapter table via ``mysql()``, registers the novel via
    ``books()``, then downloads each chapter with ``fun3()`` and stores it
    with ``dao()``.

    Args:
        next_url: URL of the novel's index page; chapter hrefs are appended
            to it to form chapter URLs.
        book_name: title of the novel.

    Raises:
        IndexError: if the index page has no author element.
    """
    headers = {
        'User-Agent': 'zilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.63'}
    wwwd = requests.get(url=next_url, headers=headers)
    wwwd.encoding = wwwd.apparent_encoding
    we = etree.HTML(wwwd.text)

    author = we.xpath('//div[@id="info"]/h1/small/a/text()')[0]
    all_list = we.xpath('//div[@class="zjbox"]/dl[@class="zjlist"]/dd')

    # Create the random chapter table and remember its name
    data = mysql()
    books(book_name, author, data)
    for li in all_list:
        try:
            url_ = next_url + li.xpath('./a/@href')[0]
            name = li.xpath('./a/text()')[0]
            # Full data for this chapter
            item = fun3(url_, name)
            dao(item, data)
            print('爬取成功', item['name'])
        except Exception as exc:
            # Best effort: one bad chapter must not stop the novel, but the
            # failure is reported, and Ctrl-C / SystemExit still propagate.
            print('爬取失败', repr(exc))
            continue



# Script entry: walk the first listing page of categories 1-7 and crawl
# every novel found there.
# `headers` must remain a module-level name: fun3 reads it as a global.
headers={'User-Agent':'zilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.63'}
for i in range(1,8):
    url='https://www.qb50.com/fenlei/{}_1/'.format(i)
    wood=requests.get(url=url,headers=headers)
    # Decode from the detected encoding, as fun2 does, so that the book
    # titles stored in the database are not mis-decoded.
    wood.encoding=wood.apparent_encoding
    we=etree.HTML(wood.text)
    all_list=we.xpath('//div[@class="shu_cont"]/div[@class="shu_box"]')
    for li in all_list:
        book_name=li.xpath('./div[2]/h4/a/text()')[0]
        next_url=li.xpath('./div[2]/h4/a/@href')[0]
        # Cover image URL; currently unused.
        # TODO: download the cover once a photo helper exists.
        tu=li.xpath('./div[1]/a/img/@src')[0]
        fun2(next_url,book_name)