tvzhishi Download

A multithreaded scraper that downloads every chapter of a novel from tvzhishi.com (新笔趣阁) and writes the whole book to a single text file.

import requests
from lxml import etree
import re
import threading

def spider(link, sub):
    """Download one chapter (split across two pages) and store it in dic[sub]."""
    html1 = requests.get(link, headers=headers)
    # The chapter body sits in <div class="content" id="content">...</div>;
    # re.S lets '.' match newlines in case the div spans several lines.
    k = r'<div class="content" id="content">(?P<content>.*?)</div>'
    content = re.findall(k, html1.text, re.S)[0]

    # Each chapter is split across two pages: xxx.html and xxx_2.html.
    next_url = link[:-5] + '_2.html'
    html2 = requests.get(next_url, headers=headers)
    content += re.findall(k, html2.text, re.S)[0]

    # Turn the <p> markup into plain text with indented paragraphs.
    content = content.replace('</p><p>', '\n  ')
    content = content.replace('</p>', '')
    content = content.replace('<p>', '')

    # Strip leftover <a> tags (ads and "next page" links).
    content = re.sub(r'<a.*?</a>', '', content)

    # The chapter title lives in the page's <title> tag; the site uses two
    # slightly different formats, so try one and fall back to the other.
    k3 = r'<title>第.*?页\(./2\)(?P<title>.*?)新笔趣阁</title>'
    try:
        title = re.findall(k3, html1.text)[0]
    except IndexError:
        k3 = r'<title>(?P<title>.*?)\(1/2\).*?新笔趣阁</title>'
        found = re.findall(k3, html1.text)
        title = found[0] if found else ''

    dic[sub] = title + '\n' + content

url = 'https://tvzhishi.com/1370_1370938/'   # table-of-contents page of the novel
base_url = 'https://tvzhishi.com'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}

urls = []    # chapter links collected from the paginated index
dic = {}     # chapter number -> chapter text, filled in by the worker threads
flag = True  # keeps the index-crawling loop below running

# The book title sits in the index page's <title> tag; completed books are
# labelled 全本, ongoing ones 连载, so try both patterns.
index_text = requests.get(url, headers=headers).text
k2 = '<title>(?P<title>.*?)小说章节列表_全本 - 新笔趣阁</title>'
try:
    title = re.findall(k2, index_text)[0]
except IndexError:
    k2 = '<title>(?P<title>.*?)小说章节列表_连载 - 新笔趣阁</title>'
    title = re.findall(k2, index_text)[0]

# Crawl the paginated chapter index and collect every chapter link.
while flag:
    print('Parsing:', url)
    html = requests.get(url, headers=headers)
    Html = etree.HTML(html.text)
    urls_ = Html.xpath('//ul[@class="section-list fix"]//a/@href')
    urls.extend(urls_[6:])   # skip the first 6 entries (the "latest chapters" block)
    # The second index-container-btn anchor is the "next page" button; on the
    # last index page its href is 'javascript:void(0);'.
    k = r'<a class="index-container-btn.*?" href="(?P<next>.*?)"'
    url = base_url + re.findall(k, html.text)[1]
    if url.find('javascript:void(0);') != -1:
        flag = False

urls = [base_url + it for it in urls]   # the collected hrefs are site-relative

print('Total chapters:', len(urls))

url_cons = []   # current batch of up to 50 chapter links
sub = 1         # running chapter number, doubles as the sort key for output
# Download in batches: gather 50 links, start one thread per link, then join
# the whole batch before moving on, so at most 50 requests run at once.
for pos, link in enumerate(urls):
    url_cons.append(link)
    if (pos + 1) % 50 == 0 or pos + 1 == len(urls):
        thread = []
        for con in url_cons:
            thread.append(threading.Thread(target=spider, args=(con, sub)))
            print('Downloading chapter', sub)
            sub += 1
        for it in thread:
            it.start()
        for it in thread:
            it.join()
        url_cons.clear()

# Reassemble the chapters in order and write the whole book to one file.
with open('{}.txt'.format(title), 'w', encoding='utf-8') as f:
    for key in sorted(dic):
        f.write(dic[key] + '\n\n')
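
The hand-rolled 50-thread batching above works, but the standard library's concurrent.futures can express the same fan-out with less bookkeeping. A minimal sketch of an equivalent download step, assuming the spider function and urls list defined above:

from concurrent.futures import ThreadPoolExecutor

# Same effect as the 50-thread batches: the executor keeps a fixed pool of
# 50 workers and feeds it one chapter per task.
with ThreadPoolExecutor(max_workers=50) as pool:
    for sub, link in enumerate(urls, start=1):
        pool.submit(spider, link, sub)
# Leaving the 'with' block waits for every submitted task to finish, so dic
# is complete at this point and can be written out as above.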
