"""笔趣阁 (biquge) novel scraper.

Beginner practice project — corrections welcome. Single-threaded (no
multithreading/multiprocessing yet), so downloads are slow; each request is
preceded by time.sleep(2) to avoid getting the IP banned. If speed matters
more than politeness, remove the time.sleep(2) lines.
"""
import requests
import re
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
# Example book URL: "http://www.biquge.info/10_10229/"
# Prompt the user for the biquge index (table-of-contents) page of the book.
url = input('请输入要下载书的笔趣阁链接:')
# Random fake User-Agent string so requests look like a normal browser.
ua = UserAgent().random
# Regex: one table-of-contents entry -> (relative chapter href, title attr, link text).
findurl = re.compile(r'<dd><a href="(.*?)" title="(.*?)">(.*?)</a></dd>')
# Regex: the book's base read URL, taken from the og:novel:read_url meta tag.
findbase = re.compile(r'<meta content="(.*?)" property="og:novel:read_url"/>')
# Regex: the book title, taken from the og:title meta tag.
findbook_name = re.compile(r'<meta content="(.*?)" property="og:title"/>')
# Absolute chapter URLs, filled in by askurl_1 (module-level, shared state).
urllist = []
# Fetch and parse the book's table-of-contents page.
def askurl_1(url):
    """Download the book's index page and collect all chapter URLs.

    Fills the module-level ``urllist`` with absolute chapter links and
    returns a tuple ``(book_name, urllist)``.

    Raises:
        ValueError: if the page does not look like a biquge book index.
    """
    head = {
        'User-Agent': ua
    }
    # Throttle to one request per 2 s to avoid an IP ban.
    time.sleep(2)
    # BUG FIX: headers must be passed via the `headers` keyword —
    # the second positional argument of requests.get() is `params`,
    # so the original code never actually sent the User-Agent.
    res = requests.get(url, headers=head)
    html = res.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    chapters = re.findall(findurl, str(soup))  # renamed: don't shadow builtin `list`
    baseurl = re.findall(findbase, str(soup))
    book_name = re.findall(findbook_name, str(soup))
    # Fail with a clear message instead of a bare IndexError on bad pages.
    if not baseurl or not book_name:
        raise ValueError('无法解析页面,请确认输入的是有效的笔趣阁书籍链接')
    book_name = book_name[0]
    for chapter in chapters:
        # The matched href is relative, so prepend the book's base URL.
        urllist.append(baseurl[0] + chapter[0])
    return book_name, urllist
# Fetch and parse one chapter page.
def askurl_2(url):
    """Download a single chapter page and return ``(title, content)``.

    ``title`` is the chapter heading (<h1> text); ``content`` is the text
    of the ``<div id="content">`` element.
    """
    head = {
        'User-Agent': ua
    }
    # Throttle to one request per 2 s to avoid an IP ban.
    time.sleep(2)
    # BUG FIX: pass the headers dict via the `headers` keyword; passed
    # positionally it becomes `params` and the User-Agent is never sent.
    res = requests.get(url, headers=head)
    html = res.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    post_content = soup.find('div', id="content")
    title = soup.find('h1').text  # chapter title
    content = post_content.text   # chapter body text
    return title, content
# Save a chapter; by default the file goes in the script's working directory.
def save(book_name, title, content):
    """Append one chapter (title followed by content) to ``./<book_name>.txt``.

    The text is written UTF-8 encoded in binary append mode, so repeated
    calls accumulate chapters in order.
    """
    book_name = str(book_name)
    # Context manager guarantees the handle is closed even if a write fails
    # (the original opened/closed manually and could leak on error).
    with open("./%s.txt" % book_name, 'ab') as fp:
        fp.write(title.encode('utf-8'))
        fp.write(content.encode('utf-8'))
# Main entry point.
def main():
    """Download every chapter of the book at the user-supplied URL."""
    book_name, chapter_urls = askurl_1(url)
    print(f'该书共有{len(chapter_urls)}章,请耐心等待')
    # enumerate replaces the manual `j = 1 ... j += 1` counter idiom.
    for index, chapter_url in enumerate(chapter_urls, start=1):
        title, content = askurl_2(chapter_url)
        save(book_name, title, content)
        print('第%d章下载完成' % index)
    print('下载完毕,感谢使用')


if __name__ == '__main__':
    # Guard so importing this module does not immediately start a download.
    main()