python爬虫,以某小说网站为例

python的简单爬虫,通过获取网页的源代码分离出有用的数据再进行分析整理保存的过程。

大体可以分为三步:

  • 获取网页源代码
  • 整理剥离有用代码
  • 储存信息

1、通过requests库获取网站源码

'''
Description: html工具类
Version: 1.0
Autor: 李广凯
Date: 2022-02-24 20:23:23
LastEditors: 李广凯
LastEditTime: 2022-04-04 22:48:56
'''
import requests
# 取得网页源码
def getHTML(start_url, *, timeout=10):
    """Fetch a page and return its HTML text.

    Args:
        start_url: URL of the page to download.
        timeout: seconds to wait before aborting the request (keyword-only,
            default 10 — the original had none and could hang forever).

    Returns:
        The response body decoded with the server's apparent encoding
        (more reliable than the declared header for this site).

    Raises:
        requests.HTTPError: when the server answers with a 4xx/5xx status.
        requests.RequestException: on network failure or timeout.
    """
    html = requests.get(start_url, timeout=timeout)
    # BUG FIX: the original wrote `html.raise_for_status` without parentheses,
    # which only reads the bound method and never calls it — HTTP errors
    # were silently ignored.
    html.raise_for_status()
    html.encoding = html.apparent_encoding
    return html.text

2、整理网页源代码

#获取排行榜标签:日排行、周排行、月排行
# Extract the ranking-tab captions (daily / weekly / monthly rankings).
def getListTag(html):
    """Return the text of every <li> inside the element with class 'tab'.

    Args:
        html: page source as a string.

    Returns:
        List of tab captions (each is the <li> tag's string content).
    """
    tab_section = BeautifulSoup(html, 'html.parser').find(class_='tab')
    return [item.string for item in tab_section.find_all('li')]


# 获取排行榜的url,书名,封面
def getBookTopList(html):
    global list_book_top
    list_tag = getListTag(html)
    list_book_url = []
    list_book_name = []
    list_book_img = []
    list_book_top = []
    soup = BeautifulSoup(html, 'html.parser')
    useful = soup.find(class_='tabCon')
    ul_context = useful.find_all('ul')
    for ul in ul_context:
        list_book_img_cache = []
        list_book_name_cache = []
        list_book_url_cache = []
        for li in ul:
            li = str(li).replace('\n', '')
            book_img = re.findall('src="(.*?)"', li)
            if len(book_img):
                list_book_img_cache.append(book_img[0])
            if len(li):
                book_url = re.search('href="(.*?)"', li).group(1)
                list_book_url_cache.append(book_url)
                book_name = re.search('title="(.*?)"', li).group(1)
                list_book_name_cache.append(book_name)
        list_book_img.append(list_book_img_cache)
        list_book_name.append(list_book_name_cache)
        list_book_url.append(list_book_url_cache)
    print(list_book_name)
    for i in range(len(list_tag)):
        for k in range(len(list_book_name[i])):
            dic_book_top_cache = {
                '榜单名': str(list_tag[i]),
                '书名': str(list_book_name[i][k]),
                '封面': str(list_book_img[i][k]),
                'url': str(list_book_url[i][k])
            }
            list_book_top.append(dic_book_top_cache)
    print('获取排行榜列表成功!')
    return list_book_top


# 补充排行榜列表(作者、书籍简介)
# Enrich the ranking list with author, synopsis and download url.
def insertBookListTop():
    """Fetch per-book details concurrently and merge them into the global
    ``list_book_top``.

    Each entry gains the keys 作者 / 简介 / 下载地址, filled from
    ``getBookSimpleInfo`` which returns ``(author, info, download_url)``.
    The first 3 characters of the author string are a site-specific label
    prefix and are stripped.
    """
    url_list = [book['url'] for book in list_book_top]
    pool = Pool(5)
    try:
        result = pool.map(getBookSimpleInfo, url_list)
    finally:
        # BUG FIX: the original never closed the pool, leaking its workers.
        pool.close()
        pool.join()
    for entry, (author, info, download) in zip(list_book_top, result):
        entry['作者'] = str(author[3:])
        entry['简介'] = str(info)
        entry['下载地址'] = str(download)


#获取下载小说文件的url
# Resolve the real file url from the intermediate download page.
def downloadBookFile(download_before_url):
    """Return the href of the first <a> inside the 'down-btn-group' element.

    Args:
        download_before_url: URL of the page holding the download button.

    Returns:
        The download link (href attribute value) as a string.
    """
    page = html_tool.getHTML(download_before_url)
    button_group = BeautifulSoup(page, 'html.parser').find(class_='down-btn-group')
    anchor = str(button_group.find('a')).replace('\n', '')
    return re.search('href="(.*?)"', anchor).group(1)


# 添加书籍信息
# Scrape one book's detail page.
def getBookSimpleInfo(url):
    """Return ``(author, info, download_url)`` for the book page at *url*.

    The author comes from the <h3> and the synopsis from the <p> inside the
    'info2' element; the download url is resolved by following the
    'btn btn-danger' link (relative — prefixed with the global ``start_url``)
    through ``downloadBookFile``.
    """
    page = html_tool.getHTML(url)
    soup = BeautifulSoup(page, 'html.parser')
    info_block = soup.find(class_='info2')
    author = info_block.find('h3').string
    info = info_block.find('p').string
    # Locate the link to the download page.
    anchor = str(soup.find(class_='btn btn-danger')).replace('\n', '')
    relative_url = re.search('href="(.*?)"', anchor).group(1)
    download = downloadBookFile(start_url + relative_url)
    # Strip newlines and spaces from the synopsis text.
    info = str(info).replace('\n', '').replace(' ', '')
    return author, info, download

3、储存文件或插入数据库

#储存文件
# Save the book file to disk.
def saveFile(book_info):
    """Download the file at ``book_info['下载地址']`` and write it to
    ``./book_file/<书名>.txt``.

    Args:
        book_info: dict with at least the keys 下载地址 and 书名.

    Raises:
        requests.HTTPError: when the download URL answers with an error
            status — the original silently wrote the HTML error page to
            disk as if it were the book.
    """
    url = book_info['下载地址']
    # NOTE(review): 书名 comes from scraped HTML; a name containing '/' or
    # '..' would escape ./book_file/ — consider sanitizing before joining.
    path = './book_file/' + book_info['书名'] + '.txt'
    r = requests.get(url, timeout=30)
    r.raise_for_status()  # fail loudly instead of saving an error page
    with open(path, "wb") as code:
        code.write(r.content)
    print(book_info['书名'] + '下载完成!')


# 插入数据库
# Insert one book record into the ranking table matching its list name.
def insertDB(book):
    """Insert *book* into day_top / week_top / month_top based on 榜单名.

    Args:
        book: dict with keys 榜单名, 书名, 封面, 作者, 简介.

    Raises:
        ValueError: when 榜单名 is not one of the known ranking names —
            the original left ``sql`` unbound in that case and crashed
            with an UnboundLocalError at ``db.insertData(sql)``.
    """
    # One table per ranking list; deduplicates the original's three
    # near-identical INSERT statements.
    tables = {'日排行': 'day_top', '周排行': 'week_top', '月排行': 'month_top'}
    table = tables.get(book['榜单名'])
    if table is None:
        raise ValueError('未知榜单名: ' + str(book['榜单名']))
    file_path = './book_file/' + book['书名'] + '.txt'
    datetime = time.strftime("%Y-%m-%d", time.localtime())
    # SECURITY(review): values are concatenated straight into the SQL text;
    # any double quote in a title or synopsis breaks the statement and this
    # is injectable. Switch to parameterized queries if db.insertData
    # supports them — its interface is not visible here.
    sql = ("INSERT INTO " + table +
           "(bname,bimg,bauthor,binfo,bfile,bdate) VALUES(" + '"' +
           book['书名'] + '","' + book['封面'] + '","' + book['作者'] + '","' +
           book['简介'] + '","' + file_path + '","' + datetime + '")')
    db.insertData(sql)

因为只是获取排行榜数据,所以就用一个全局列表list_book_top[] 来临时存储信息。代码整体还是比较简单,就是分析提取书籍信息部分比较麻烦。

评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值