- 爬笔趣阁的脚本，或许有点小 bug，权当过过手瘾。
- 唉，居然打不出代码块，就先这样吧（注意：下面贴出的代码没有缩进）。
import random
import time
from urllib.parse import urljoin

import requests
from lxml import etree
# Request headers sent with every HTTP call in this script; the User-Agent
# is a desktop Chrome 73 string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/73.0.3683.86 Safari/537.36"
    ),
}
def getContent(url, retries=5, retry_delay=1.0):
    """Download one chapter page and return its title and body text.

    The body is built from the direct text nodes of ``<div id="content">``:
    all whitespace inside each node is removed, empty nodes are dropped, and
    every remaining node is emitted preceded by a newline. The title is the
    text of the first ``<h1>`` inside ``<div class="bookname">``.

    Args:
        url: Address of the chapter page.
        retries: Maximum number of attempts before giving up (at least one
            attempt is always made).
        retry_delay: Seconds to pause between a failed attempt and the next.

    Returns:
        A ``(title, content)`` tuple of strings.

    Raises:
        RuntimeError: If every attempt failed; chained to the last error.
    """
    attempts = max(1, retries)
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            req = requests.get(url=url, headers=headers, timeout=30)
            # An HTTP error page would not contain the chapter markup anyway;
            # fail early so it is retried like any other failure.
            req.raise_for_status()
            req.encoding = req.apparent_encoding
            html = etree.HTML(req.text)
            paragraphs = (
                "".join(node.split())
                for node in html.xpath("//div[@id ='content']/text()")
            )
            content = "".join("\n" + text for text in paragraphs if text)
            # ``.text`` is None when the heading has no leading text.
            title = html.xpath("//div[@class='bookname']/h1")[0].text or ""
        except Exception as err:
            # Retry boundary: network, parsing and missing-node errors are all
            # treated as transient. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            last_error = err
            if attempt < attempts:
                print("再次尝试")
                time.sleep(retry_delay)
        else:
            return title, content
    raise RuntimeError(
        "failed to fetch {} after {} attempts".format(url, attempts)
    ) from last_error
def getNovel(url, name):
    """Download every chapter linked from a novel's index page into one file.

    The chapter links are read from ``<div id="list">`` on the index page and
    fetched in page order with ``getContent``. Each chapter is written to
    ``./<name>.txt`` as a blank line, the title, a newline, and the body.

    Args:
        url: Address of the novel's index (table of contents) page.
        name: Output file name without the ``.txt`` extension.

    Raises:
        requests.RequestException: If the index page cannot be fetched.
    """
    # Fetch the index before opening the output file so that a failed request
    # does not truncate an existing file or leave an empty one behind.
    req = requests.get(url=url, headers=headers, timeout=30)
    req.raise_for_status()
    req.encoding = req.apparent_encoding
    html = etree.HTML(req.text)
    lis = html.xpath("//div[@id = 'list']/dl/dd/a/@href")
    length = len(lis)

    # Explicit UTF-8: the platform default codec may be unable to encode
    # characters that appear in the novel text.
    with open("./{}.txt".format(name), "w", encoding="utf-8") as f:
        for index, href in enumerate(lis):
            # Throttle requests: a long pause every 100 chapters, otherwise a
            # short random one.
            if index % 100 == 0:
                time.sleep(5)
            else:
                time.sleep(random.uniform(0, 1))
            # Resolve the link against the index URL instead of a hard-coded
            # host, so relative and absolute hrefs both work.
            con_url = urljoin(url, href)
            title, content = getContent(con_url)
            f.write("\n")
            f.write(title)
            f.write("\n")
            f.write(content)
            # Count the chapter just written so the final report is 1.0.
            print("已完成:" + str((index + 1) / length))
# Script entry point: download one hard-coded novel when run directly.
if __name__ == "__main__":
    url = "http://www.xbiquge.la/10/10489/"
    getNovel(url=url, name="三寸人间")