I wanted to download novels from directory-style book sites like 笔趣阁. I tried Uncle小说 (from 52论坛), but it kept failing for no clear reason, so I wrote my own Python crawler. It's very crude and is offered for reference only.
exe build: request.exe - 蓝奏云
Thanks!
import datetime
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Tag

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

url = input("Enter the index URL (e.g. https://www.ibiquge.net/xx_xxxx/)\n")
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
ddlist = soup.find_all("dd")

# One output file per run, named by timestamp.
fname = "book_" + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
file = open(fname, "w", encoding="utf-8")

flag = 1
for cc in ddlist:
    flag = flag + 1
    # The first dozen <dd> entries are the "latest chapters" block
    # duplicated at the top of the index page, so skip them.
    if flag >= 14:
        for child in cc.contents:
            if isinstance(child, Tag):
                # Chapter title on its own line.
                file.write("\n" + str(child.string) + "\n")
                # Chapter links are relative; resolve them against the index URL.
                chapter_url = urljoin(url, child['href'])
                print(chapter_url)
                res = requests.get(chapter_url, headers=headers)
                res.encoding = res.apparent_encoding
                soupt = BeautifulSoup(res.text, 'lxml')
                plist = soupt.find(id="content")
                # Turn <br/> into newlines before stripping the remaining tags,
                # then break on the indent spaces between paragraphs.
                pl = str(plist).replace('<br/>', '\n')
                file.write(re.sub('<[^<]+?>', '', pl).replace(' ', '\n').strip())
file.close()
print('ok')
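
The flag >= 14 counter and the tag-stripping regex are both tied to one site template. If they break on another mirror, a sturdier variant is to pick the chapter links with a CSS selector and let BeautifulSoup extract the text. A minimal sketch, assuming the full catalogue sits under id="list" and the chapter body under id="content" (true on many biquge-style mirrors, but an assumption here):

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def chapter_links(index_url):
    resp = requests.get(index_url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, 'lxml')
    # Assumption: the full catalogue lives in <div id="list">, so the
    # duplicated "latest chapters" block at the top is skipped naturally.
    return [(a.get_text(strip=True), urljoin(index_url, a['href']))
            for a in soup.select('#list dd a[href]')]

def chapter_text(chapter_url):
    resp = requests.get(chapter_url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, 'lxml')
    node = soup.find(id="content")  # assumption: chapter body div id
    if node is None:
        return ''
    # get_text with a separator keeps the <br/>-delimited paragraph
    # breaks without a hand-rolled regex.
    return node.get_text('\n', strip=True)

Since soup.select takes a plain CSS selector, adapting to a different site template only means changing one string instead of re-tuning the dd counter.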