一、本次的项目思路
1.使用ThreadPoolExecutor(线程池)和requests(HTTP请求库)完成多线程爬取
2.用tkinter写界面
二、总体的程序运行如下
1.原始未运行的界面
2.运行时的界面
三、代码实现
1.多线程爬虫代码
# Return the URLs of every chapter listed on the novel's index page.
def return_urls(self):
    """Collect the chapter URLs of the novel whose index URL is in the entry box.

    Reads the index URL from ``self.e``, downloads that page, stores the
    novel title in ``self.title`` (creating a directory with that name in
    the current working directory if it does not exist yet) and fills
    ``self.new_urls`` with one absolute URL per chapter link.

    Returns:
        None. Results are left on the instance. If ``requests`` raises
        ``MissingSchema`` a warning dialog is shown and the method returns
        early with ``self.new_urls`` empty.
    """
    # Start from a clean list so a repeated download does not re-queue the
    # chapters collected by a previous run.
    self.new_urls = []
    url = self.e.get()
    try:
        res = requests.get(url=url, headers=headers)
    except MissingSchema:
        messagebox.showwarning(title="警告", message="缺少网址或网址错误")
        print("缺少网址或网址错误")
        return
    res.encoding = "utf-8"
    html = etree.HTML(res.text)
    # NOTE(review): assigning ``self.title`` shadows the ``title()`` method
    # that ``Ui`` calls on the same object -- confirm this is intended.
    try:
        self.title = html.xpath("//div[@id='info']/h1/text()")[0]
        if not os.path.exists("./{}".format(self.title)):
            os.mkdir(self.title)
    except (IndexError, OSError):
        # No title on the page, or the title is not usable as a directory
        # name: fall back to a fixed temporary folder.
        self.title = "临时文件夹(小说)"
        if not os.path.exists("./{}".format(self.title)):
            os.mkdir(self.title)
    urls = html.xpath("//div[@class='box_con']//dd/a/@href")
    # Chapter hrefs are prefixed with the site root to form absolute URLs.
    self.new_urls.extend("https://www.xxbiquge.net/" + u for u in urls)
# Fetch a single chapter page.
def get_txt(self, url):
    """Download one chapter page and return its heading and body text.

    Args:
        url: Address of the chapter page to fetch.

    Returns:
        A ``(name, content)`` tuple: the first text node of the ``<h1>``
        inside ``div.bookname``, and every text node under ``div#content``
        joined into one string.

    Raises:
        IndexError: If the page has no ``div.bookname`` heading text.
    """
    response = requests.get(url=url, headers=headers)
    response.encoding = "utf-8"
    tree = etree.HTML(response.text)
    name = tree.xpath("//div[@class='bookname']/h1/text()")[0]
    content = "".join(tree.xpath("//div[@id='content']//text()"))
    return name, content
# Main entry point that drives the multi-threaded download.
def main(self):
    """Download every chapter concurrently and save each one as a text file.

    Calls ``self.return_urls()`` to populate ``self.new_urls`` and
    ``self.title``, submits one ``self.get_txt`` call per URL to a thread
    pool, and, as each call completes, updates the ``self.show`` label and
    writes the chapter to ``./<title>/<name>.txt`` encoded as UTF-8.

    A chapter whose download or save fails is reported on stdout and
    skipped, so one bad chapter no longer aborts the remaining ones.
    """
    self.return_urls()
    total = len(self.new_urls)
    # The context manager shuts the pool down once the loop is finished;
    # without it the worker threads are never released.
    with ThreadPoolExecutor(max_workers=100) as executor:
        tasks = [executor.submit(self.get_txt, url) for url in self.new_urls]
        for count, future in enumerate(as_completed(tasks), start=1):
            self.show.config(text="总共章节/已下载===>{}/{}".format(total, count))
            self.update()
            try:
                name, content = future.result()
            except Exception as exc:
                # Boundary for worker errors (network failure, unexpected
                # page layout, ...): report and move on to the next chapter.
                print("章节下载失败:", exc)
                continue
            path = "./{}/{}.txt".format(self.title, name)
            try:
                with open(path, "w", encoding="utf-8") as out:
                    out.write(content)
            except OSError as exc:
                # E.g. the chapter name is not a valid file name.
                print(name, "保存失败:", exc)
                continue
            print(name, "下载完成...")
2.界面代码实现
def Ui(self):
    """Build the window: URL entry, download/quit buttons and a status label.

    Stores the entry widget in ``self.e``, its backing variable in
    ``self.v`` and the status label in ``self.show``.
    """
    self.title('趣阁网')
    self.geometry("300x200")
    self.resizable(0, 0)

    # Row 0: prompt plus the URL entry, pre-filled with a sample index page.
    prompt = tk.Label(self, text="输入网址:")
    prompt.grid(row=0, column=0, padx=10, pady=10)
    self.v = tk.StringVar()
    self.e = tk.Entry(self, width=25, textvariable=self.v)
    self.e.grid(row=0, column=1, pady=10)
    self.v.set("https://www.xxbiquge.net/9_9208/")

    # Row 1: the two action buttons share the same look.
    button_look = {"width": 10, "bg": "pink", "relief": "flat"}
    # The download button hands self.main to self.Tread_start
    # (presumably runs it on a worker thread -- defined elsewhere, confirm).
    download_btn = tk.Button(
        self,
        text="下 载",
        command=lambda: self.Tread_start(self.main),
        **button_look
    )
    download_btn.grid(row=1, column=0, padx=10)
    quit_btn = tk.Button(
        self,
        text="退 出",
        command=lambda: self.quit(),
        **button_look
    )
    quit_btn.grid(row=1, column=1, padx=10)

    # Row 2: status label. No explicit master is passed, so Tk attaches it
    # to the default root window.
    self.show = tk.Label(text="暂无下载任务!")
    self.show.grid(row=2, column=0, columnspan=2, pady=100)