使用Python爬取笔趣阁小说列表并添加到数据库

最新推荐文章于 2024-08-14 08:38:25 发布

HI阡陌

最新推荐文章于 2024-08-14 08:38:25 发布

阅读量2.7k

点赞数 1

分类专栏： Python 文章标签： python

本文链接：https://blog.csdn.net/lzd1164961158/article/details/77145926

版权

Python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

1:首先需要安装爬虫所需要的第三方库

requests
BeautifulSoup
pymysql 或者 sqlite3(sqlite3 为 Python 自带,无需单独安装)
lxml 解析器

2:安装完之后接下来是代码部分:

首先新建一个sql帮助类 SqlServerHelper.py

# -*- coding: UTF-8 -*-
#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        Mir_LongZd
# Purpose:
# 操作数据库帮助类
# Author:      Long



import pymysql
import  sqlite3

class MSSQL:
    """Small helper that runs SQL statements through ``pymysql``.

    Every public call opens a new connection, runs one statement and closes
    the connection again, so instances hold no long-lived resources.
    """

    def __init__(self, host, user, pwd, db):
        """Store the connection settings; no connection is opened here.

        Args:
            host: Server host passed to ``pymysql.connect``.
            user: Login user name.
            pwd: Login password.
            db: Name of the database to use.
        """
        self.host = host
        self.user = user
        self.pwd = pwd
        self.db = db

    def __GetConnect(self):
        """Open a connection, keep it on ``self.conn`` and return a cursor.

        Raises:
            NameError: If no database name is configured or no cursor could
                be obtained.
        """
        if not self.db:
            # Raise a real exception instance; raising a tuple is a
            # TypeError on Python 3.
            raise NameError("没有设置数据库信息")
        self.conn = pymysql.connect(host=self.host, user=self.user,
                                    password=self.pwd, database=self.db,
                                    charset="utf8")
        cur = self.conn.cursor()
        if not cur:
            # Do not leave the connection open when bailing out.
            self.conn.close()
            raise NameError("连接数据库失败")
        return cur

    def ExecQuery(self, sql, params=None):
        """Run a query and return all rows.

        Args:
            sql: SQL text to execute.
            params: Optional arguments for placeholders in ``sql``. ``None``
                (the default) executes ``sql`` as-is.

        Returns:
            Whatever ``cursor.fetchall()`` returns for the query.
        """
        cur = self.__GetConnect()
        try:
            cur.execute(sql, params)
            return cur.fetchall()
        finally:
            # Close the connection even when the statement fails.
            self.conn.close()

    def ExecNonQuery(self, sql, params=None):
        """Run a data-modifying statement and commit it.

        Args:
            sql: SQL text to execute.
            params: Optional arguments for placeholders in ``sql``. ``None``
                (the default) executes ``sql`` as-is.
        """
        cur = self.__GetConnect()
        try:
            cur.execute(sql, params)
            self.conn.commit()
        finally:
            # Close the connection even when the statement fails.
            self.conn.close()


"""
sqllit3操作数据库
"""
class SQLLIT:
    """Small helper that runs SQL statements against a SQLite database file.

    Every public call opens a new connection, runs one statement and closes
    the connection again.
    """

    def __init__(self, dbpath):
        """Remember the database file path; no connection is opened here."""
        self.dbpath = dbpath

    def __Getcontent(self):
        """Open a connection, keep it on ``self.conn`` and return a cursor.

        Raises:
            NameError: If no database path is configured or no cursor could
                be obtained.
        """
        if self.dbpath is None or str(self.dbpath) == "":
            # Raise a real exception instance; raising a tuple is a
            # TypeError on Python 3.
            raise NameError("没有数据库")
        self.conn = sqlite3.connect(self.dbpath)
        self.cu = self.conn.cursor()
        if not self.cu:
            # Do not leave the connection open when bailing out.
            self.conn.close()
            raise NameError("连接数据库失败")
        return self.cu

    def GetList(self, sql, params=None):
        """Run a query and return all rows as a list of tuples.

        Args:
            sql: SQL text to execute.
            params: Optional values bound to the placeholders in ``sql``.
                ``None`` (the default) executes ``sql`` without bindings.
        """
        cur = self.__Getcontent()
        try:
            cur.execute(sql, () if params is None else params)
            return cur.fetchall()
        finally:
            # Close the connection even when the statement fails.
            self.conn.close()

    def ExecNonQuery(self, sql, params=None):
        """Run an insert/update/delete/DDL statement and commit it.

        Args:
            sql: SQL text to execute.
            params: Optional values bound to the placeholders in ``sql``.
                ``None`` (the default) executes ``sql`` without bindings.
        """
        cur = self.__Getcontent()
        try:
            cur.execute(sql, () if params is None else params)
            self.conn.commit()
        finally:
            # Close the connection even when the statement fails.
            self.conn.close()

然后我们再建一个发起请求的帮助类

# -*- coding: UTF-8 -*-
#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        Mir_LongZd
# Purpose:
# 爬取网页帮助类
# Author:      Long

import  requests
from bs4 import BeautifulSoup

class RequestHelper:
    """Download one page and run CSS selectors against its HTML."""

    def __init__(self, url, encoding, timeout=None):
        """Fetch ``url`` and set the encoding used to decode the response.

        Args:
            url: Address of the page to download.
            encoding: Encoding assigned to the response before its text is read.
            timeout: Optional timeout forwarded to ``requests.get``. The
                default ``None`` keeps the previous behaviour of waiting
                without a limit.

        Raises:
            NameError: If ``url`` is ``None`` or empty.
        """
        if url is None or str(url) == "":
            # Raise a real exception instance; raising a tuple is a
            # TypeError on Python 3.
            raise NameError("请输入爬取地址")
        self.wb_data = requests.get(url, timeout=timeout)
        self.wb_data.encoding = encoding

    def GetContent(self, select):
        """Parse the downloaded page with lxml and return elements matching ``select``.

        The parsed document and the last result are also kept on
        ``self.soup`` and ``self.select_data``.
        """
        self.soup = BeautifulSoup(self.wb_data.text, 'lxml')
        self.select_data = self.soup.select(select)
        return self.select_data

我们先看下笔趣阁网站 ,分析下网页

这里我们只爬取这些列表

f12可以看到文章名和作者所在的层级,找到元素之后我们就可以用代码实现爬取了

# -*- coding: UTF-8 -*-
#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        Mir_LongZd
# Purpose:
# 爬取笔趣阁小说网站
# Author:      Long

import  requests  #导入模块
from bs4 import  BeautifulSoup
import  datetime
from  Helper.RequestHe import  RequestHelper  #导入帮助类
import  uuid #guid
from Helper.SqlServerHelper import SQLLIT  #导入数据库帮助类



# Category listing page that is scraped for novels; its text is decoded as GBK.
url = "http://www.biquzi.com/xuanhuan/"
wb_data = RequestHelper(url, "gbk")

# Anchors that carry each book's title and link.
href_data = wb_data.GetContent("#newscontent .l li .s2 a")
# Elements that carry each book's author.
au_data = wb_data.GetContent("#newscontent .l li .s5")

# Declared for chapter lists; nothing in this part of the script fills it.
novellist = []

# One dict per novel, pairing each title anchor with its author element and
# tagging the pair with a freshly generated id for the database step below.
au_href = [
    {
        'guid': str(uuid.uuid1()),
        'href': book_link.get("href"),
        'title': book_link.string,
        'author': author_cell.string,
    }
    for book_link, author_cell in zip(href_data, au_data)
]





"""
  将循环的数据加入到数据库中
 """

def _sql_quote(value):
    """Escape ``value`` for use inside a single-quoted SQL string literal.

    A single quote inside a SQL string literal is written as two single
    quotes; scraped titles may contain quotes, so they must be escaped
    before being concatenated into a statement.
    """
    return str(value).replace("'", "''")


# One helper instance is enough: it opens and closes a connection per call.
data = SQLLIT(r"C:\Users\Mir_LongZd\Documents\py.db")

for item in au_href:  # walk every novel collected from the listing page
    t_url = item['href']
    novel_title = item['title']
    if not t_url or novel_title is None:
        # Without a link or a plain-text title the entry can neither be
        # stored nor followed, so skip it instead of crashing.
        continue

    # Look the novel up by title only. Filtering on the freshly generated
    # guid as well can never match a stored row, which made every run insert
    # the whole listing again.
    existing = data.GetList(
        "select ID FROM novel WHERE title='" + _sql_quote(novel_title) + "' "
    )
    if existing:
        # Reuse the stored id so chapters stay attached to the same novel.
        guid = str(existing[0][0])
        print("大于0")
    else:
        guid = item['guid']
        insertsql = (
            "insert INTO novel VALUES('" + _sql_quote(guid) + "','"
            + _sql_quote(novel_title) + "','"
            + _sql_quote(item['author'] or "") + "','"
            + _sql_quote(t_url) + "') "
        )
        data.ExecNonQuery(insertsql)

    # Chapter titles already stored for this novel, fetched once instead of
    # issuing one count query per chapter.
    known_titles = {
        row[0]
        for row in data.GetList(
            "select title FROM novel_list WHERE pid='" + _sql_quote(guid) + "' "
        )
    }

    # Fetch the novel's own page and collect its chapter links.
    t_data = RequestHelper(t_url, "gbk")
    for chapter in t_data.GetContent("#list dd a"):
        href = chapter.get("href")
        chapter_title = chapter.string
        if href is None or chapter_title is None:
            # Skip anchors that lack a link or a plain-text title.
            continue
        if chapter_title in known_titles:
            print("大于0")
            continue
        t_insertsql = (
            "insert INTO novel_list VALUES('" + str(uuid.uuid1()) + "','"
            + _sql_quote(chapter_title) + "','"
            + _sql_quote(guid) + "','"
            + _sql_quote(href) + "') "
        )
        data.ExecNonQuery(t_insertsql)
        known_titles.add(chapter_title)