python爬虫，以某小说网站为例

最新推荐文章于 2024-08-24 16:40:35 发布

每天一罐可乐

最新推荐文章于 2024-08-24 16:40:35 发布

阅读量2k

点赞数

分类专栏： python 文章标签： Python 爬虫小说排行榜 BeautifulSoup 数据存储

本文链接：https://blog.csdn.net/a417655340/article/details/124558942

版权

python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

Python 的简单爬虫，就是通过获取网页的源代码，分离出有用的数据，再进行分析、整理、保存的过程。

大体可以分为三步：

获取网页源代码
整理剥离有用代码
储存信息

1、通过requests库获取网站源码

'''
Description: html工具类
Version: 1.0
Autor: 李广凯
Date: 2022-02-24 20:23:23
LastEditors: 李广凯
LastEditTime: 2022-04-04 22:48:56
'''
import requests
# Fetch the source of a web page
def getHTML(start_url):
    """Download a page and return its body as text.

    Args:
        start_url: URL of the page to request.

    Returns:
        The response body, decoded with the encoding that requests
        detects from the content (``apparent_encoding``).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    html = requests.get(start_url)
    # raise_for_status is a method: it has to be called, otherwise HTTP
    # errors pass silently and the error page is returned as the source.
    html.raise_for_status()
    # Decode with the encoding detected from the body itself.
    html.encoding = html.apparent_encoding
    return html.text

2、整理网页源代码

# Get the ranking tab labels: daily, weekly and monthly ranking
def getListTag(html):
    """Return the label of every ``<li>`` inside the element with class 'tab'.

    Args:
        html: source of the ranking page.

    Returns:
        A list with the ``.string`` value of each ``<li>``, in document order.
    """
    tab = BeautifulSoup(html, 'html.parser').find(class_='tab')
    return [item.string for item in tab.find_all('li')]


# Get url, title and cover of every book on the ranking lists
def getBookTopList(html):
    """Parse the ranking page into a flat list of book records.

    Every ``<ul>`` inside the element with class 'tabCon' is paired, in
    order, with one label returned by ``getListTag``. Each child of the
    ``<ul>`` that carries both an ``href`` and a ``title`` attribute yields
    one record with the keys '榜单名', '书名', '封面' and 'url'.

    The result is also stored in the module-level ``list_book_top``, which
    is rebuilt from scratch on every call.

    Args:
        html: source of the ranking page.

    Returns:
        The list of book records (the same object as ``list_book_top``).
    """
    global list_book_top
    list_tag = getListTag(html)
    list_book_top = []
    soup = BeautifulSoup(html, 'html.parser')
    useful = soup.find(class_='tabCon')
    # zip() pairs the n-th label with the n-th <ul> and stops at the shorter
    # sequence, so a label without a matching <ul> no longer raises IndexError.
    for tag, ul in zip(list_tag, useful.find_all('ul')):
        for li in ul:
            li = str(li).replace('\n', '')
            book_url = re.search(r'href="(.*?)"', li)
            book_name = re.search(r'title="(.*?)"', li)
            # Iterating a <ul> also yields the text between its <li> items
            # (indentation, blank lines); skip anything that is not a book.
            if book_url is None or book_name is None:
                continue
            book_img = re.search(r'src="(.*?)"', li)
            # Build the record per item so a missing cover can never shift
            # the covers of the following books.
            list_book_top.append({
                '榜单名': str(tag),
                '书名': book_name.group(1),
                '封面': book_img.group(1) if book_img else '',
                'url': book_url.group(1),
            })
    print('获取排行榜列表成功!')
    return list_book_top


# Complete the ranking records (author, description, download link)
def insertBookListTop():
    """Add '作者', '简介' and '下载地址' to every record in ``list_book_top``.

    The detail page of each book is fetched through ``getBookSimpleInfo``
    using a pool of 5 workers; the records are updated in place.
    """
    url_list = [book['url'] for book in list_book_top]
    pool = Pool(5)
    try:
        result = pool.map(getBookSimpleInfo, url_list)
    finally:
        # Always release the workers, even when one of the fetches fails.
        pool.close()
        pool.join()
    # pool.map keeps the input order, so results line up with the records.
    for book, (author, info, download) in zip(list_book_top, result):
        # The first three characters of the author text are dropped --
        # presumably a label prefix on the page; verify against the site.
        book['作者'] = str(author[3:])
        book['简介'] = str(info)
        book['下载地址'] = str(download)


# Get the url of the novel file to download
def downloadBookFile(download_before_url):
    """Return the ``href`` of the first link in the 'down-btn-group' element.

    Args:
        download_before_url: URL of the page that holds the download buttons.

    Returns:
        The value of the first ``href`` attribute found in that link's markup.
    """
    page = BeautifulSoup(html_tool.getHTML(download_before_url), 'html.parser')
    first_link = page.find(class_='down-btn-group').find('a')
    # Work on the one-line markup of the tag so the regex sees it whole.
    link_markup = str(first_link).replace('\n', '')
    return re.search('href="(.*?)"', link_markup).group(1)


# Collect the extra details of a book
def getBookSimpleInfo(url):
    """Fetch a book page and return its author, description and file url.

    Args:
        url: URL of the book's detail page.

    Returns:
        A tuple ``(author, info, download)``: the ``.string`` of the first
        ``<h3>`` in the 'info2' element, the text of its first ``<p>`` with
        newlines and spaces removed, and the file url resolved by
        ``downloadBookFile``.
    """
    page = BeautifulSoup(html_tool.getHTML(url), 'html.parser')
    info_box = page.find(class_='info2')
    heading = info_box.find('h3')
    paragraph = info_box.find('p')
    author = heading.string
    summary = str(paragraph.string).replace('\n', '').replace(' ', '')
    # url of the download page: the href on the button is appended to the
    # module-level start_url.
    button_markup = str(page.find(class_='btn btn-danger')).replace('\n', '')
    relative_url = re.search('href="(.*?)"', button_markup).group(1)
    download = downloadBookFile(start_url + relative_url)
    return author, summary, download

3、储存文件或插入数据库

# Save the novel file
def saveFile(book_info):
    """Download a book and write it to ``./book_file/<title>.txt``.

    Args:
        book_info: book record; '下载地址' is the file url and '书名' is
            used as the file name.

    Raises:
        requests.HTTPError: if the download answers with a 4xx/5xx status.
    """
    import os  # local import: the file's top-level import block is not visible here

    url = book_info['下载地址']
    path = './book_file/' + book_info['书名'] + '.txt'
    r = requests.get(url)
    # Without this check an error page would be saved as if it were the book.
    r.raise_for_status()
    # The target directory may not exist on a first run.
    os.makedirs('./book_file', exist_ok=True)
    with open(path, "wb") as code:
        code.write(r.content)
    print(book_info['书名'] + '下载完成!')


# Insert into the database
def insertDB(book):
    """Insert one book record into the table that matches its ranking list.

    '日排行' goes to ``day_top``, '周排行' to ``week_top`` and '月排行' to
    ``month_top``. The row holds the title, cover, author, description, the
    local file path and today's date (``YYYY-MM-DD``).

    Args:
        book: book record with the keys '榜单名', '书名', '封面', '作者'
            and '简介'.

    Raises:
        ValueError: if '榜单名' is not one of the three known ranking lists.
    """
    tables = {'日排行': 'day_top', '周排行': 'week_top', '月排行': 'month_top'}
    table = tables.get(book['榜单名'])
    if table is None:
        # Fail with a clear message instead of hitting an unbound `sql` below.
        raise ValueError('unknown ranking list: ' + repr(book['榜单名']))
    file_path = './book_file/' + book['书名'] + '.txt'
    today = time.strftime("%Y-%m-%d", time.localtime())
    values = [
        book['书名'], book['封面'], book['作者'], book['简介'], file_path,
        today
    ]
    # NOTE(review): the values come from scraped pages and are spliced into
    # the statement unescaped -- a double quote in any field breaks the SQL
    # and allows injection. Switch to a parameterized query once
    # db.insertData accepts parameters.
    sql = ('INSERT INTO ' + table +
           '(bname,bimg,bauthor,binfo,bfile,bdate) VALUES("' +
           '","'.join(values) + '")')
    db.insertData(sql)

因为只是获取排行榜数据，所以就用一个全局列表list_book_top[] 来临时存储信息。代码整体还是比较简单，就是分析提取书籍信息部分比较麻烦。