排行榜小说批量下载（未写入数据库，格式还需更改）

最新推荐文章于 2024-05-08 17:02:41 发布

酒中醉去梦中来

最新推荐文章于 2024-05-08 17:02:41 发布

阅读量306

点赞数

分类专栏： python爬虫

python爬虫专栏收录该内容

11 篇文章 0 订阅

订阅专栏

'''
url='http://www.qu.la/paihangbang/'
爬取笔趣阁的排行版小说
'''

import requests
import bs4


# 抓取整个网页
def get_html(url):
    try:
        r = requests.get(url, timeout=300)
        r.raise_for_status
        # 我手动测试了编码。并设置好，这样有助于效率的提升
        r.encoding = ('utf-8')
        # print(r.text)
        return r.text
    except:
        return "Someting Wrong！"


# 获取排行榜小说及其链接：
def get_content(url):
    '''
    爬取每一类型小说排行榜，
    按顺序写入文件，
    文件内容为 小说名字+小说链接
    将内容保存到列表
    并且返回一个装满url链接的列表
    '''
    url_list = []
    html = get_html(url)
    soup = bs4.BeautifulSoup(html, 'lxml')
    # 由于小说排版的原因，历史类和完本类小说不在一个div里
    # print(soup.prettify())

    category_list = soup.find_all('div', class_='index_toplist mright mbottom')

    history_finished_list = soup.find_all('div', class_='index_toplist mbottom')
    # print(category_list)
    for cate in category_list:#分别遍历
        name = cate.find('div', class_='toptab').span.string
        with open('小说/novel_list.csv', 'a+') as f:
            f.write("\n小说种类：{} \n".format(name))
            print("\n小说种类：{} \n".format(name))

        # 我们直接通过style属性来定位总排行榜
        general_list = cate.find(style='display: block;')
        # 找到全部的小说名字，发现他们全部都包含在li标签之中
        book_list = general_list.find_all('li')
        # 循环遍历出每一个小说的的名字，以及链接
        for book in book_list:
            link = 'http://www.qu.la/' + book.a['href']
            title = book.a['title']
            # 我们将所有文章的url地址保存在一个列表变量里
            url_list.append(link)
            # 这里使用a模式，防止清空文件
            with open('./小说/novel_list.csv', 'a') as f:
                f.write("小说名：{:<} \t 小说地址：{:<} \n".format(title, link))
                print("小说名：{:<} \t 小说地址：{:<} \n".format(title, link))
    # print(history_finished_list)
    for cate in history_finished_list:
        name = cate.find('div', class_='toptab').span.string
        with open('小说/novel_list.csv', 'a') as f:
            f.write("\n小说种类：{} \n".format(name))
            print("\n小说种类：{} \n".format(name))

        general_list = cate.find(style='display: block;')
        book_list = general_list.find_all('li')
        for book in book_list:
            link = 'http://www.qu.la' + book.a['href']
            title = book.a['title']
            url_list.append(link)
            with open('小说/novel_list.csv', 'a') as f:
                f.write("小说名：{:<} \t 小说地址：{:<} \n".format(title, link))
                print("小说名：{:<} \t 小说地址：{:<} \n".format(title, link))
    # print(url_list)
    return url_list


# 获取单本小说的所有章节链接:
def get_txt_url(url):
    '''
    获取该小说每个章节的url地址：
    并创建小说文件

    '''
    # print(url)
    url_list = []
    html = get_html(url)
    soup = bs4.BeautifulSoup(html, 'lxml')
    lista = soup.find_all('dd')
    txt_name = soup.find('h1').text#小说名字
    with open('小说/{}.txt'.format(txt_name), "a+") as f:
        f.write('小说标题：{} \n'.format(txt_name))
    for url1 in lista:
        url_list.append(str(url) + url1.a['href'])

    return url_list, txt_name


# 获取单页文章的内容并保存到本地:
def get_one_txt(url, txt_name):
    '''
    获取小说每个章节的文本
    并写入到本地
    '''
    html = get_html(url).replace('<br/>', "\n")
    soup = bs4.BeautifulSoup(html, 'lxml')
    #print(soup.prettify())
    try:
        print("==============================================1")
        txt = soup.find('div', id='content').text.replace('chaptererror();', '\n')
        txt = txt.replace('<br/>', '-----------------------------')
        #print(txt)
        #txt=txt.replace('&nbsp;&nbsp;&nbsp;&nbsp;', '   ')
        title = soup.find('title').text.split('_')[0]#以_拆分取第一个
#　　&nbsp;&nbsp;&nbsp;&nbsp;
        with open('小说/{}.txt'.format(txt_name), "a", encoding='gb18030', errors='ignore') as f:
            #print(txt)
            f.write(title + '\n')
            # print(txt)
            f.write(txt)
            print('当前小说：{} 当前章节{} 已经下载完毕'.format(txt_name, title))
    except NameError as e:
        print('---->', e)

def main():
     url="https://www.qu.la/paihangbang/"
     url_list=get_content(url)#获取排行榜的所有小说链接
     for url in url_list:#遍历这些链接
        url_list1,txtname=get_txt_url(url)#获取单本小说的所有章节链接，以及小说名称
        for url1 in url_list1:#遍历小说的所有章节
            #print(url1)
            get_one_txt(url1,txtname)#获取每章节文章的内容并保存到本地

if __name__ == "__main__":
    main()

酒中醉去梦中来

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
排行榜小说批量下载（未写入数据库，格式还需更改）

'''url='http://www.qu.la/paihangbang/'爬取笔趣阁的排行版小说'''import requestsimport bs4# 抓取整个网页def get_html(url): try: r = requests.get(url, timeout=300) r.raise_for_status ...
复制链接

扫一扫

专栏目录