import requests
from lxml import etree
import re
import threading
def spider(link, sub, timeout=30):
    """Download one chapter and store its text in the module-level ``dic``.

    The chapter is read from ``link`` and from a second page whose URL is
    ``link`` with its last five characters replaced by ``_2.html`` (so
    ``link`` is assumed to end in ``.html``).  The content of both pages is
    concatenated, paragraph tags are turned into line breaks, anchor elements
    are removed, and the result is saved as ``dic[sub]`` with the chapter
    title on its first line.

    Args:
        link: Absolute URL of the first page of the chapter.
        sub: Key under which the chapter text is stored in ``dic``.
        timeout: Seconds to wait for each HTTP response; without a timeout a
            stalled connection would block the calling thread forever.

    Raises:
        IndexError: If the first page holds no content ``<div>``.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    content_pattern = r'<div class="content" id="content">(?P<contend>.*?)</div>'

    html1 = requests.get(link, headers=headers, timeout=timeout)
    contend = re.findall(content_pattern, html1.text)[0]

    next_url = link[:-5] + '_2' + '.html'
    html2 = requests.get(next_url, headers=headers, timeout=timeout)
    # A missing second page must not discard the first page's text.
    second_part = re.findall(content_pattern, html2.text)
    if second_part:
        contend += second_part[0]

    # Reduce the markup to plain text: paragraph boundaries become line
    # breaks, leftover <p> tags and whole <a> elements are dropped.
    contend = contend.replace('</p><p>', '\n ')
    contend = contend.replace('</p>', '')
    contend = contend.replace('<p>', '')
    contend = re.sub('<a.*?</a>', '', contend)

    # The page <title> comes in two layouts; try them in order and fall back
    # to an empty title when neither matches.
    title = ''
    for title_pattern in (
        r'<title>第.*?页\(./2\)(?P<title>.*?)新笔趣阁</title>',
        r'<title>(?P<title>.*?)\(1/2\).*?新笔趣阁</title>',
    ):
        found = re.findall(title_pattern, html1.text)
        if found:
            title = found[0]
            break

    dic[sub] = title + '\n' + contend
# Index (chapter list) page of the book to download, and the site root that
# is prepended to the relative links found on that page.
url = 'https://tvzhishi.com/1370_1370938/'
base_url = 'https://tvzhishi.com'
# Browser-like User-Agent sent with every request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}
urls = []    # chapter links collected from the index pages
dic = {}     # chapter number -> chapter text, filled in by spider()
flag = True  # cleared once the last index page has been parsed

# Fetch the index page once, with the same headers as every other request,
# and try both known <title> layouts ("全本" and "连载" variants) in order.
index_html = requests.get(url, headers=headers, timeout=30).text
for k2 in (
    '<title>(?P<title>.*?)小说章节列表_全本 - 新笔趣阁</title>',
    '<title>(?P<title>.*?)小说章节列表_连载 - 新笔趣阁</title>',
):
    found = re.findall(k2, index_html)
    if found:
        title = found[0]
        break
else:
    # Same exception type as the original failed `[0]` lookup, but with a
    # message that says what went wrong.
    raise IndexError(
        'no book title found in the <title> tag of {}'.format(url)
    )
# Walk the paginated chapter index, collecting the chapter links of each page
# until the "next" button no longer points at a real page.
while flag:
    print('正在解析:', url)
    html = requests.get(url, headers=headers, timeout=30)
    Html = etree.HTML(html.text)
    urls_ = Html.xpath('//ul[@class="section-list fix"]//a/@href')
    # The first six links of every page are skipped -- presumably a
    # "latest chapters" shortcut list; TODO confirm against the site.
    urls.extend(urls_[6:])
    k = r'<a class="index-container-btn.*?" href="(?P<next>.*?)"'
    nav_links = re.findall(k, html.text)
    # The second pagination button is used as the "next page" link.  Stop
    # when it is a javascript placeholder, or when the page has fewer than
    # two buttons (previously an IndexError that aborted the whole run).
    if len(nav_links) < 2 or 'javascript:void(0);' in nav_links[1]:
        flag = False
    else:
        url = base_url + nav_links[1]
# Download every chapter, at most `batch_size` at a time: each batch gets one
# thread per chapter, and the next batch starts only after all threads of the
# current one have finished.
batch_size = 50
urls = [base_url + it for it in urls]
print(len(urls))
sub = 1  # 1-based chapter number, used as the key in `dic`
for start in range(0, len(urls), batch_size):
    thread = []
    # Slicing yields a short final batch naturally, so no exception is
    # needed to detect the end of the list.
    for link in urls[start:start + batch_size]:
        thread.append(threading.Thread(target=spider, args=(link, sub)))
        print('正在下载:第', sub, '章')
        sub += 1
    for it in thread:
        it.start()
    for it in thread:
        it.join()
# Threads store chapters in completion order, so write them out by ascending
# chapter number, each followed by a blank line, into "<book title>.txt".
keys = sorted(dic)
with open('{}.txt'.format(title), 'w', encoding='utf-8') as f:
    f.writelines(dic[key] + '\n\n' for key in keys)
# tvzhishi download
# Latest recommended article published 2024-08-11 18:44:16