import requests
import re
from lxml import etree
import threading
import time
import random
import os
# Root URL of the novel site; chapter hrefs from the index page are relative to this.
BASE_URL = "https://www.x81zw1.com/"
# Explicitly disable any environment-configured HTTP(S) proxies for requests.
PROXIES = {"http":None,"https":None}
# Number of concurrent downloader threads.
THREAD_NUM = 10
# Directory where individual per-chapter .txt files are written.
SAVE_PATH = "2"
def get_zjml(zjml_path):
    """Fetch a book's table-of-contents page and list its chapters.

    Parameters:
        zjml_path: URL of the book's chapter-index page.

    Returns:
        A list of (relative_href, chapter_title, index) tuples, one per
        chapter link found under the second <dl> of the "#list" element
        (the first <dl> presumably holds "latest chapters" — TODO confirm
        against the live page).
    """
    rsp = requests.get(zjml_path, proxies=PROXIES)
    tree = etree.HTML(rsp.text)
    chapters = tree.xpath('//*[@id="list"]/dl[2]/dd')
    res = []
    for i, dd in enumerate(chapters):
        # Each <dd> wraps a single <a> whose href/text identify the chapter.
        link = dd.getchildren()[0]
        res.append((link.attrib['href'], link.text, i))
    return res
def get_content(item):
    """Download one chapter and save it as "<index>.txt" under SAVE_PATH.

    Parameters:
        item: (relative_url, chapter_title, index) tuple as produced by
            get_zjml().

    Skips the download if the output file already exists, so interrupted
    runs can resume. On an unrecognized page layout the chapter is skipped
    with a message instead of crashing the worker thread.
    """
    url, title, index = item
    out_path = os.path.join(SAVE_PATH, f"{index}.txt")
    if os.path.exists(out_path):  # already fetched — resume support
        return
    rsp = requests.get(BASE_URL + url, proxies=PROXIES)
    # Chapter body sits between the #content div and the bottom nav block.
    m = re.search('<div id="content">(.*?)<div class="bottem2">', rsp.text, re.S)
    if m is None:
        # Layout changed or the request was blocked; skip rather than raise
        # an IndexError inside a daemon-less worker thread.
        print(f"no content found for chapter {index}: {url}")
        return
    res = m.group(1)
    res = res.replace("<br><br>", "\n\n")
    res = res.replace("\u3000", " ")  # ideographic space -> ASCII space
    # Truncate at this marker phrase — presumably site-injected filler
    # appended after the real chapter text; TODO confirm.
    res = res.split("无尽的昏迷过后")[0]
    os.makedirs(SAVE_PATH, exist_ok=True)  # ensure output dir exists
    with open(out_path, "w", encoding='utf-8') as f:
        f.write(title)
        f.write("\n\n")
        f.write(res)
def get_xiaoshuo(items):
    """Worker loop: drain the shared chapter list, downloading each entry.

    Parameters:
        items: shared list of (url, title, index) tuples, consumed
            concurrently by multiple threads.

    Returns when the list is empty.
    """
    while True:
        try:
            # list.pop() is atomic, but "while len(items): items.pop()" is
            # not: another thread can empty the list between the length
            # check and the pop. EAFP makes the drain race-free.
            item = items.pop()
        except IndexError:
            return
        # time.sleep(random.randint(0, 3))  # optional politeness delay
        get_content(item)
if __name__ == '__main__':
    # Chapter table-of-contents URL of the target book.
    zj_ml = "https://www.x81zw1.com/book/17/17500/"
    res = get_zjml(zj_ml)
    # Entries look like ('/book/0/436/309558.html', '第1章 血脉重生', 0).

    # Capture the chapter count NOW: the worker threads consume `res` by
    # popping, so it is empty after the join (the old code hard-coded 1287).
    total = len(res)
    os.makedirs(SAVE_PATH, exist_ok=True)

    # Fan the downloads out across threads that share and drain `res`.
    thread_list = []
    for _ in range(THREAD_NUM):
        t = threading.Thread(target=get_xiaoshuo, args=(res,))
        t.start()
        thread_list.append(t)
    for t in thread_list:
        t.join()

    # Merge the per-chapter files into one book file.
    # (The original had a bare `合并` line here — a SyntaxError — and opened
    # the files without an encoding, which breaks on non-UTF-8 locales.)
    with open('炼气3000.txt', 'w', encoding='utf-8') as out:
        for i in range(total):
            print(i)
            with open(os.path.join(SAVE_PATH, f"{i}.txt"), "r", encoding='utf-8') as f_:
                out.write(f_.read())
                out.write("\n\n")