import logging
import queue
import threading
import time

import requests
from lxml import etree
# "https://ishuo.cn/duanzi"
# header={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"}
# Fetcher thread: downloads the page that holds the joke list and puts it on a queue.
class Mythread1(threading.Thread):
    """Fetcher thread: downloads the joke list page and queues its HTML.

    Every item taken from ``pageQueue`` triggers one download of ``url``;
    the response text is put on ``dataQueue``.  The loop runs until the
    module-level flag ``vlog1`` becomes true.

    Args:
        threaName: Label printed when the thread starts and stops.
        pageQueue: Queue of page numbers waiting to be fetched.
        dataQueue: Queue that receives the downloaded HTML text.
    """

    # NOTE(review): the page number taken from the queue is not used to build
    # the URL, so every queued page fetches this same address -- confirm
    # whether the site supports paging.
    url = "https://ishuo.cn/duanzi"
    # Seconds to wait for a page number before re-checking the stop flag.
    poll_interval = 0.5
    # Seconds before an unresponsive HTTP request is abandoned.
    request_timeout = 10

    def __init__(self, threaName, pageQueue, dataQueue):
        super().__init__()
        self.threaName = threaName  # thread label
        self.pageQueue = pageQueue  # page-number queue
        self.dataQueue = dataQueue  # downloaded-HTML queue
        self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"}

    def run(self):
        """Fetch queued pages until ``vlog1`` is set."""
        print("启动线程", self.threaName)
        while not vlog1:
            try:
                # A timed get keeps the loop responsive: a plain get() would
                # block forever on an empty queue and never see the flag.
                page = self.pageQueue.get(timeout=self.poll_interval)
            except queue.Empty:
                continue
            try:
                data = requests.get(
                    self.url,
                    headers=self.headers,
                    timeout=self.request_timeout,
                ).text
                time.sleep(0.5)
                self.dataQueue.put(data)  # hand the HTML to the parser stage
            except Exception:
                # Best effort: report the failure and keep the thread alive.
                logging.getLogger(__name__).exception(
                    "%s: failed to fetch page %r", self.threaName, page
                )
        print("结束线程", self.threaName)
29
30
# Parser thread: takes list pages from the queue, parses them, and saves the result locally.
class Mythread2(threading.Thread):
    """Parser thread: extracts joke text from queued HTML and writes it out.

    Items are taken from ``dataQueue`` one at a time.  The loop ends when
    the module-level flag ``vlog2`` is true after an item has been handled,
    or when a ``None`` item is received (treated as a stop sentinel).

    Args:
        threaName: Label printed when the thread starts and stops.
        dataQueue: Queue that delivers downloaded HTML text.
        filename: Writable text file object (despite the name, not a path)
            that receives one line per extracted entry.
    """

    def __init__(self, threaName, dataQueue, filename):
        super().__init__()
        self.threaName = threaName
        self.dataQueue = dataQueue
        self.filename = filename

    def run(self):
        """Parse queued pages until told to stop."""
        print("启动线程", self.threaName)
        while not vlog2:
            # Blocking on purpose: the stop flag may already be raised while
            # a page is still being downloaded, and that page must not be lost.
            page_html = self.dataQueue.get()
            if page_html is None:
                break  # stop sentinel: no more pages will arrive
            try:
                self._save_entries(page_html)
            except Exception:
                # Best effort: report the failure and move on to the next page.
                logging.getLogger(__name__).exception(
                    "%s: failed to parse a page", self.threaName
                )
        print("结束线程", self.threaName)

    def _save_entries(self, page_html):
        """Write the text of every matching ``<a>`` element, one per line."""
        root = etree.HTML(page_html)
        if root is None:
            # presumably what lxml yields for a document with no content -- TODO confirm
            return
        for node in root.xpath('//div[@class="info"]/a'):
            text = node.text
            if text is None:
                # An <a> with no leading text would make the concatenation
                # below raise and abort the rest of the page.
                continue
            self.filename.write(text + "\n")
53
54
55
56
# Stop flags shared between main() and the worker threads.
vlog1 = False  # main() sets this to True once the page queue has been emptied
vlog2 = False  # main() sets this to True when the data stage should shut down
59
60
def main(output_path=r"D:\软件\python\python_work\Python_day18\123.txt"):
    """Run the fetch/parse pipeline and append the results to a file.

    One page number is queued, a fetcher thread and a parser thread are
    started, and the module-level flags ``vlog1`` / ``vlog2`` are raised in
    turn to shut the two stages down.

    Args:
        output_path: File the extracted text is appended to.  Defaults to
            the path this script has always written to.
    """
    global vlog1, vlog2
    vlog1 = False
    vlog2 = False

    # Page-number queue (capacity 1) holding the single page to fetch.
    pageQueue = queue.Queue(1)
    pageQueue.put(1)
    # Queue carrying downloaded pages from the fetcher to the parser.
    dataQueue = queue.Queue()

    # Explicit UTF-8 so text outside the platform's default code page can be
    # written; the context manager closes the file even if a step fails.
    with open(output_path, "a", encoding="utf-8") as filename:
        t1 = Mythread1("采集线程", pageQueue, dataQueue)
        t2 = Mythread2("解析线程", dataQueue, filename)
        t1.start()
        t2.start()

        # Stage 1: once every page number has been taken, stop the fetcher.
        # The sleep is kept very short so the flag is raised while the
        # fetcher is still busy with the page it just took.
        while not pageQueue.empty():
            time.sleep(0.001)
        vlog1 = True
        t1.join()  # after this, nothing more will be put on dataQueue

        # Stage 2: wait for the parser to take everything that was queued.
        # The liveness check avoids waiting on a parser that has already exited.
        while not dataQueue.empty() and t2.is_alive():
            time.sleep(0.05)
        vlog2 = True
        # Wake the parser in case it is blocked in dataQueue.get().
        dataQueue.put(None)
        t2.join()

    print("结束!")
91
92
93
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# Source: https://www.icode9.com/content-1-664001.html