"""笔趣阁 (biquge) novel scraper.

Beginner practice project — corrections welcome. Single-threaded (no
multithreading/multiprocessing yet), so downloads are slow; each request is
preceded by time.sleep(2) to avoid getting the IP banned. If speed matters
more than politeness, remove the time.sleep(2) lines.
"""
import requests
import re
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
# Example book URL: "http://www.biquge.info/10_10229/"
# Prompt the user for the biquge index (table-of-contents) page of the book.
url = input('请输入要下载书的笔趣阁链接:')
# Random fake User-Agent string so requests look like a normal browser.
ua = UserAgent().random
# Regex: one table-of-contents entry -> (relative chapter href, title attr, link text).
findurl = re.compile(r'<dd><a href="(.*?)" title="(.*?)">(.*?)</a></dd>')
# Regex: the book's base read URL, taken from the og:novel:read_url meta tag.
findbase = re.compile(r'<meta content="(.*?)" property="og:novel:read_url"/>')
# Regex: the book title, taken from the og:title meta tag.
findbook_name = re.compile(r'<meta content="(.*?)" property="og:title"/>')
# Absolute chapter URLs, filled in by askurl_1 (module-level, shared state).
urllist = []
# Fetch and parse the book's table-of-contents page.
def askurl_1(url):
    """Download the book's index page and collect all chapter URLs.

    Fills the module-level ``urllist`` with absolute chapter links and
    returns a tuple ``(book_name, urllist)``.

    Raises:
        ValueError: if the page does not look like a biquge book index.
    """
    head = {
        'User-Agent': ua
    }
    # Throttle to one request per 2 s to avoid an IP ban.
    time.sleep(2)
    # BUG FIX: headers must be passed via the `headers` keyword —
    # the second positional argument of requests.get() is `params`,
    # so the original code never actually sent the User-Agent.
    res = requests.get(url, headers=head)
    html = res.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    chapters = re.findall(findurl, str(soup))  # renamed: don't shadow builtin `list`
    baseurl = re.findall(findbase, str(soup))
    book_name = re.findall(findbook_name, str(soup))
    # Fail with a clear message instead of a bare IndexError on bad pages.
    if not baseurl or not book_name:
        raise ValueError('无法解析页面,请确认输入的是有效的笔趣阁书籍链接')
    book_name = book_name[0]
    for chapter in chapters:
        # The matched href is relative, so prepend the book's base URL.
        urllist.append(baseurl[0] + chapter[0])
    return book_name, urllist
# Fetch and parse one chapter page.
def askurl_2(url):
    """Download a single chapter page and return ``(title, content)``.

    ``title`` is the chapter heading (<h1> text); ``content`` is the text
    of the ``<div id="content">`` element.
    """
    head = {
        'User-Agent': ua
    }
    # Throttle to one request per 2 s to avoid an IP ban.
    time.sleep(2)
    # BUG FIX: pass the headers dict via the `headers` keyword; passed
    # positionally it becomes `params` and the User-Agent is never sent.
    res = requests.get(url, headers=head)
    html = res.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    post_content = soup.find('div', id="content")
    title = soup.find('h1').text  # chapter title
    content = post_content.text   # chapter body text
    return title, content
# Save a chapter; by default the file goes in the script's working directory.
def save(book_name, title, content):
    """Append one chapter (title followed by content) to ``./<book_name>.txt``.

    The text is written UTF-8 encoded in binary append mode, so repeated
    calls accumulate chapters in order.
    """
    book_name = str(book_name)
    # Context manager guarantees the handle is closed even if a write fails
    # (the original opened/closed manually and could leak on error).
    with open("./%s.txt" % book_name, 'ab') as fp:
        fp.write(title.encode('utf-8'))
        fp.write(content.encode('utf-8'))
# Main entry point.
def main():
    """Download every chapter of the book at the user-supplied URL."""
    book_name, chapter_urls = askurl_1(url)
    print(f'该书共有{len(chapter_urls)}章,请耐心等待')
    # enumerate replaces the manual `j = 1 ... j += 1` counter idiom.
    for index, chapter_url in enumerate(chapter_urls, start=1):
        title, content = askurl_2(chapter_url)
        save(book_name, title, content)
        print('第%d章下载完成' % index)
    print('下载完毕,感谢使用')


if __name__ == '__main__':
    # Guard so importing this module does not immediately start a download.
    main()