import requests
from bs4 import BeautifulSoup
import html
import re
import time
def get_html_txt(url, retries=3, delay=1.0, timeout=10.0):
    """Download a page and return the text of its ``#content`` element.

    The request goes through the module-level ``Sess`` session. The response
    encoding is set from ``apparent_encoding`` before the body is parsed.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of attempts before giving up.
        delay: Seconds to wait after a failed attempt before the next one.
        timeout: Per-request timeout in seconds passed to ``Sess.get``.

    Returns:
        The text of the first element matching ``#content``, or ``None`` when
        every attempt failed (request error or no ``#content`` element).
    """
    for attempt in range(1, retries + 1):
        try:
            # Named ``resp`` so the module-level ``html`` import is not shadowed.
            resp = Sess.get(url=url, timeout=timeout)
            resp.encoding = resp.apparent_encoding
            page = BeautifulSoup(resp.text, 'html.parser')
            return page.select('#content')[0].get_text()
        except (requests.RequestException, IndexError):
            # IndexError: the page was fetched but contains no #content node.
            # Retrying is bounded so a permanently broken URL cannot loop forever.
            if attempt < retries:
                time.sleep(delay)
    return None
def wt(txt, tit):
    """Write ``txt`` to ``./超神机械师/<title>.txt`` encoded as UTF-8.

    The characters ``* ~ / ( ) 【 】`` are removed from the title before it is
    used as a file name. The output directory is created when it is missing.

    Args:
        txt: Content to write; converted with ``str()``.
        tit: Title used to build the file name; converted with ``str()``.
    """
    import os

    out_dir = './超神机械师/'
    # A single translate pass deletes every unwanted character at once.
    safe_title = str(tit).translate(str.maketrans('', '', '*~/()【】'))
    # Without this, open() fails with FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + safe_title + '.txt', 'w', encoding='utf-8') as f:
        f.write(str(txt))
if __name__ == '__main__':
    # Script entry point: read the book's index page, collect every chapter
    # link listed under ``#list > dl > dd``, then download each chapter and
    # save it to disk, pausing between requests.
    from urllib.parse import urljoin

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'clickbids=40160; Hm_lvt_c979821d0eeb958aa7201d31a6991f34=1584758773; Hm_lvt_6dfe3c8f195b43b8e667a2a2e5936122=1584758818; Hm_lpvt_6dfe3c8f195b43b8e667a2a2e5936122=1584758818; Hm_lpvt_c979821d0eeb958aa7201d31a6991f34=1584758818',
        'Host': 'www.biquge.info',
        'Referer': 'http://www.biquge.info/40_40160/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }

    # ``Sess`` must stay a module-level name: get_html_txt() reads it.
    Sess = requests.session()
    # update() keeps the session's case-insensitive header mapping instead of
    # replacing it with a plain dict.
    Sess.headers.update(headers)

    url = 'http://www.biquge.info/40_40160/'
    index_resp = Sess.get(url=url, timeout=10)
    # Fail early on an HTTP error instead of parsing an error page.
    index_resp.raise_for_status()
    index_resp.encoding = index_resp.apparent_encoding
    index_page = BeautifulSoup(index_resp.text, 'html.parser')

    urls = []
    for entry in index_page.select('#list > dl > dd'):
        link = entry.a
        # Skip <dd> entries without a usable link rather than crashing.
        if link is None or not link.get('href'):
            continue
        title = link.get('title') or link.get_text(strip=True)
        # urljoin also resolves absolute and root-relative hrefs correctly.
        urls.append((urljoin(url, link['href']), title))

    for chapter_url, title in urls:
        txt = get_html_txt(chapter_url)
        if txt is None:
            # Do not write a file containing the literal text "None".
            print('skipped (download failed): ' + str(title))
        else:
            wt(txt, title)
        # Pause between chapter requests.
        time.sleep(3)