#coding=utf-8
import requests
from retrying import retry
from lxml import etree
import time
from queue import Queue
import threading
# import chardet
class QiuBai:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
        # Create the queues (producer-consumer pattern): page-URL queues, parsed-page queues, and a save queue
        self.url_temp = "https://www.ximalaya.com/keji/p{}/"
        # Queue of first-level page URLs
        self.url_first_queue = Queue()
        # Queue of second-level (detail) page URLs
        self.url_second_queue = Queue()
        # Queue of parsed first-level pages
        self.html_first_queue = Queue()
        # Queue of parsed second-level pages
        self.html_second_queue = Queue()
        # Queue of extracted content waiting to be saved
        self.content_list_queue = Queue()
    # Build the first-level page URLs and put them into the queue
    def get_url_list(self):
        url_list = [self.url_temp.format(i) for i in range(1, 3)]
        for url in url_list:
            self.url_first_queue.put(url)
            print(url)
        return url_list
    # Fetch a page and parse it into an lxml element tree (retried up to 4 times on failure)
    @retry(stop_max_attempt_number=4)
    def _parse_url(self, url, return_str=False):
        r = requests.get(url, headers=self.headers, timeout=5)
        assert r.status_code == 200
        print(r.status_code)
        # print(chardet.detect(r.content))
        time.sleep(1)
        return etree.HTML(r.content.decode("utf-8"))
    # Parse the first-level pages and put the parsed results into the queue
    def parse_first_url(self):
        while 1:
            url = self.url_first_queue.get()
            print(url)
            try:
                html = self._parse_url(url)
            except Exception:
                html = None
                print('Error')
            self.html_first_queue.put(html)
            self.url_first_queue.task_done()  # decrement the queue counter to match the earlier get()
    # Extract the second-level (detail) page URLs from the parsed first-level pages
    def get_firstUrl_list(self):
        while 1:
            html = self.html_first_queue.get()
            # print(html.xpath('//text()'))
            if html is not None:
                html_first_list = html.xpath("//a[@class = 'u0jN album-title lg']/@href")
                for u in html_first_list:
                    url_second = "https://www.ximalaya.com" + u
                    self.url_second_queue.put(url_second)
            self.html_first_queue.task_done()  # decrement the queue counter to match the earlier get()
    # Parse the second-level pages and put the parsed results into the queue
    def parse_second_url(self):
        while 1:
            url = self.url_second_queue.get()
            print(url)
            try:
                html = self._parse_url(url)
            except Exception:
                html = None
                print('Error')
            self.html_second_queue.put(html)
            print(self.html_second_queue.qsize())
            print('@' * 100)
            self.url_second_queue.task_done()  # decrement the queue counter to match the earlier get()
            print('Detail pages still queued:', self.url_second_queue.qsize())
    # Extract the required content from the parsed second-level pages
    def get_content_list(self):
        while 1:
            html_1 = self.html_second_queue.get()
            print(html_1)
            if html_1 is not None:
                print(html_1.xpath('//h1/text()')[0])
                content_list = html_1.xpath('//h1/text()')[0]
                self.content_list_queue.put(content_list)
            self.html_second_queue.task_done()  # decrement the queue counter to match the earlier get()
            print('Parsed detail pages still queued:', self.html_second_queue.qsize())
    # Save the extracted content to a file
    def save_content_list(self):  # step 4: save
        while 1:
            content_list = self.content_list_queue.get()
            with open("C:/Users/JZG/Desktop/threads8-15/output/title.txt", "a", encoding="utf-8") as fp:
                fp.write(content_list + "\n")
            self.content_list_queue.task_done()  # decrement the queue counter to match the earlier get()
            print('Items still waiting to be saved:', self.content_list_queue.qsize())
    def run(self):
        lock = threading.RLock()
        thread_list = []
        # Build the first-level URL list
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        # Fetch the first-level pages and collect the responses
        for i in range(2):
            t_first_parse = threading.Thread(target=self.parse_first_url)
            thread_list.append(t_first_parse)
        # Extract the detail-page URLs
        t_get_firstUrl_list = threading.Thread(target=self.get_firstUrl_list)
        thread_list.append(t_get_firstUrl_list)
        # Fetch the detail pages and collect the responses
        for i in range(5):
            try:
                lock.acquire()
                t_second_parse = threading.Thread(target=self.parse_second_url)
                thread_list.append(t_second_parse)
            finally:
                lock.release()
        # Extract the required information from the detail pages
        for i in range(3):
            try:
                lock.acquire()
                t_get_content_list = threading.Thread(target=self.get_content_list)
                thread_list.append(t_get_content_list)
            finally:
                lock.release()
        # Save the results
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            t.daemon = True  # daemon threads: they are killed automatically when the main thread exits
            t.start()  # start each worker thread
        print('All worker threads started')
        time.sleep(5)
        for q in [self.url_first_queue, self.url_second_queue, self.html_first_queue, self.html_second_queue, self.content_list_queue]:
            q.join()  # block the main thread until each queue's unfinished-task count drops to zero
if __name__ == '__main__':
    print(time.ctime(time.time()))
    qiubai = QiuBai()
    qiubai.run()
    print(time.ctime(time.time()))
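
The whole script is one producer-consumer pipeline: every stage pulls work from one Queue with get(), pushes its result into the next Queue, and calls task_done() exactly once per get() so that join() can tell when the pipeline has drained; the workers run as daemon threads, so the process can exit as soon as the main thread finishes waiting on the queues. Below is a minimal, self-contained sketch of that pattern on its own (the queue names, the worker functions, and the "doubling" work are made up for illustration and are not part of the crawler above):

import threading
from queue import Queue

task_queue = Queue()    # hypothetical input queue
result_queue = Queue()  # hypothetical output queue

def worker():
    # Loop forever: take a task, do the "work", hand the result to the next stage,
    # then call task_done() exactly once per get() so join() can finish.
    while True:
        item = task_queue.get()
        result_queue.put(item * 2)  # stand-in for real parsing work
        task_queue.task_done()

def saver():
    while True:
        result = result_queue.get()
        print('saved:', result)  # stand-in for writing to a file
        result_queue.task_done()

for target in [worker, worker, saver]:
    t = threading.Thread(target=target)
    t.daemon = True  # daemon threads die with the main thread
    t.start()

for i in range(10):
    task_queue.put(i)

# join() blocks until every put() has been matched by a task_done(),
# i.e. until the whole pipeline has drained; the daemon workers then
# disappear when the main thread exits.
for q in [task_queue, result_queue]:
    q.join()

One design note: task_done() must be called even when a worker fails to produce anything useful (the crawler does this by putting None into the next queue and letting the consumer skip it), otherwise the corresponding join() would block forever.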