import re
from parse_url import parse_url
import requests
import time
import threading
from queue import Queue
# Wall-clock timestamp taken when the module is loaded; the __main__ section
# subtracts it from a later reading to print the total run time in seconds.
start = time.time()
class Neihan_Spider(object):
    """Multithreaded scraper for the text pages of budejie.com.

    Work flows through three queues, each drained by daemon worker threads:

    * ``url_queue``     -> ``parse_url``        (download a page)
    * ``html_queue``    -> ``get_content_list`` (extract the jokes by regex)
    * ``content_queue`` -> ``save_data``        (append the jokes to a file)

    Args:
        output_path: File the extracted jokes are appended to.
        request_timeout: Seconds to wait on each HTTP request before the
            request is abandoned.
        pages: Iterable of page numbers substituted into ``url_temp``.
    """

    # Compiled once instead of on every loop iteration.  Whitespace between
    # the wrapper <div> and the link is matched with \s* so the pattern does
    # not depend on the page's exact indentation, and the dot in ".html" is
    # escaped so it only matches a literal dot.
    _JOKE_PATTERN = re.compile(
        r"""<div class="j-r-list-c-desc">\s*<a href="/detail-\d+\.html">(.*?)</a>""",
        re.S)

    def __init__(self, output_path="neihan.txt", request_timeout=10,
                 pages=range(1, 30)):
        self.url_temp = "http://www.budejie.com/text/{}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
        self.output_path = output_path
        self.request_timeout = request_timeout
        self.pages = pages
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    def get_url_list(self):
        """Put one URL per configured page number onto ``url_queue``."""
        for page in self.pages:
            # Each put() raises the queue's unfinished-task count by one.
            self.url_queue.put(self.url_temp.format(page))

    def parse_url(self):
        """Worker loop: download every queued URL and queue its HTML.

        Runs forever; intended to be the target of a daemon thread.  A page
        that cannot be fetched or decoded is reported and skipped.
        """
        while True:
            url = self.url_queue.get()
            try:
                print(url)
                response = requests.get(
                    url, headers=self.headers, timeout=self.request_timeout)
                response.raise_for_status()
                self.html_queue.put(response.content.decode())
            except (requests.RequestException, UnicodeDecodeError) as exc:
                print("failed to fetch {}: {}".format(url, exc))
            finally:
                # Always balance the get(): a missing task_done() would make
                # url_queue.join() block forever.
                self.url_queue.task_done()

    def get_content_list(self):
        """Worker loop: extract the jokes from every queued HTML page.

        Each page yields one list of joke strings (possibly empty), which is
        put onto ``content_queue``.  Runs forever; intended for a daemon
        thread.
        """
        while True:
            html_str = self.html_queue.get()
            try:
                # One list per page; each element is the text of one joke.
                self.content_queue.put(self._JOKE_PATTERN.findall(html_str))
            finally:
                self.html_queue.task_done()

    def save_data(self):
        """Worker loop: append every queued list of jokes to the output file.

        Runs forever; intended for a daemon thread.  A failed write is
        reported and that batch is dropped.
        """
        while True:
            jokes = self.content_queue.get()
            try:
                # Explicit UTF-8 so non-ASCII text does not depend on the
                # platform's default encoding.
                with open(self.output_path, "a", encoding="utf-8") as f:
                    for joke in jokes:
                        # Turn HTML <br /> markers into real line breaks.
                        f.write(joke.replace("<br />", "\n") + "\n\n")
            except OSError as exc:
                print("failed to save to {}: {}".format(self.output_path, exc))
            finally:
                # Signalled only after the file is closed, so a completed
                # join() implies the data has been flushed.
                self.content_queue.task_done()

    def run(self):
        """Scrape all configured pages and block until everything is saved."""
        # 1. Enqueue the URLs synchronously.  Doing this in a background
        #    thread would let url_queue.join() below return before any URL
        #    had been queued, ending the program early.
        self.get_url_list()

        # 2. Download workers, 3. extraction workers, 4. one writer.
        workers = [threading.Thread(target=self.parse_url, daemon=True)
                   for _ in range(3)]
        workers += [threading.Thread(target=self.get_content_list, daemon=True)
                    for _ in range(2)]
        workers.append(threading.Thread(target=self.save_data, daemon=True))

        # Daemon threads are abandoned when the main thread exits, which is
        # what stops the endless worker loops.
        for worker in workers:
            worker.start()

        # Joined in pipeline order: every stage queues its output before it
        # calls task_done(), so once one queue is drained the next already
        # holds all of its pending work.
        for q in (self.url_queue, self.html_queue, self.content_queue):
            q.join()
        print("主线程结束")
if __name__ == '__main__':
    # Script entry point: run the full crawl, then print how many seconds
    # have passed since the module-level `start` timestamp was taken.
    spider = Neihan_Spider()
    spider.run()
    elapsed = time.time() - start
    print(elapsed)
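# Source article: "爬虫初阶(四)—— 百思不得姐多线程爬虫案例" (Crawler Basics, Part 4: a multithreaded budejie.com spider example).
# Blog page footer: most recently recommended article published 2023-03-25 13:36:16.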