Python Queue 与 threading 多线程实战

最新推荐文章于 2024-04-21 11:41:36 发布

weixin_39548972

最新推荐文章于 2024-04-21 11:41:36 发布

阅读量161

点赞数

文章标签： python queue threading

本文链接：https://blog.csdn.net/weixin_39548972/article/details/113963431

版权

# coding=utf-8

import json
import logging
import re
import threading
import time
from queue import Empty
from queue import Queue

import lxml.html
import requests

# Stop flag for the crawler threads: main() sets it to True once the
# page-number queue has been emptied, and each crawler's run() loop exits
# when it sees the flag.
CRAWL_EXIT = False

# Stop flag for the parser threads: main() sets it to True once the data
# queue has been emptied, and each parser's run() loop exits when it sees
# the flag.
PARSE_EXIT = False

# Short alias for the etree module reachable through lxml.html.
etree = lxml.html.etree

class ThreadCrawls(threading.Thread):
    """Worker thread that downloads review listing pages.

    Each worker repeatedly takes a page number from ``pageQueue``, fetches the
    corresponding Douban review listing page, strips tab/CR/LF characters from
    the source and puts the result on ``dataQueue``. The loop ends once the
    module-level ``CRAWL_EXIT`` flag becomes true.
    """

    # Listing URL; the ``start`` offset is appended per page.
    BASE_URL = "https://movie.douban.com/subject/30166972/reviews?start="
    # Offset step between two consecutive pages.
    PAGE_SIZE = 20
    # Seconds to wait for the server before giving up on a page, so a stalled
    # connection cannot hang the worker (and the final join) forever.
    REQUEST_TIMEOUT = 10
    # Seconds to block on an empty page queue before re-checking CRAWL_EXIT;
    # avoids spinning a CPU core while idle.
    POLL_INTERVAL = 0.1

    def __init__(self, threadName, pageQueue, dataQueue):
        """Store the worker's name and the two queues it works with.

        Args:
            threadName: Label used in the start/stop messages.
            pageQueue: Queue of page numbers still to download.
            dataQueue: Queue that receives the downloaded page sources.
        """
        threading.Thread.__init__(self)
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        # Browser-like User-Agent so the request is less likely to be blocked.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

    @classmethod
    def _page_url(cls, page):
        """Return the listing URL for 1-based page number *page*."""
        return cls.BASE_URL + str((page - 1) * cls.PAGE_SIZE)

    def _fetch(self, page):
        """Download page *page* and return its source without tabs/newlines."""
        response = requests.get(
            self._page_url(page),
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.encoding = "utf-8"
        return re.sub("[\t\r\n]", "", response.text)

    def run(self):
        """Fetch pages from the page queue until ``CRAWL_EXIT`` is set."""
        print("开启采集线程" + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Block briefly instead of busy-polling an empty queue.
                page = self.pageQueue.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue
            try:
                content = self._fetch(page)
            except requests.RequestException:
                # Report the failure instead of silently losing the page.
                logging.getLogger(__name__).exception(
                    "%s failed to fetch page %s", self.threadName, page
                )
                continue
            self.dataQueue.put(content)
        print("结束采集线程" + self.threadName)

class ThreadParses(threading.Thread):
    """Worker thread that extracts review fields from downloaded pages.

    Each worker repeatedly takes a page source from ``dataQueue``, extracts
    one record per review node and appends it to ``localFile`` as a JSON
    line. The loop ends once the module-level ``PARSE_EXIT`` flag becomes
    true.
    """

    # Seconds to block on an empty data queue before re-checking PARSE_EXIT;
    # avoids spinning a CPU core while idle.
    POLL_INTERVAL = 0.1

    # XPath selecting one node per review on a listing page.
    _REVIEW_PATH = ".//div[@class='main review-item']"

    # (output key, XPath relative to a review node), in output order.
    _FIELD_PATHS = (
        ("username", ".//a[@class='name']"),
        ("title", ".//div[@class='main-bd']/h2/a"),
        # NOTE (from the original author): the review body is loaded
        # dynamically by the site, so this field reportedly comes back
        # without data when the page is fetched statically.
        ("content", ".//div[@class='short-content']"),
        ("time", ".//span[@class='main-meta']"),
        ("zan", ".//div[@class='action']/a/span"),
    )

    def __init__(self, threadName, dataQueue, localFile, lock):
        """Store the worker's name, input queue, output file and write lock.

        Args:
            threadName: Label used in the start/stop messages.
            dataQueue: Queue of page sources waiting to be parsed.
            localFile: Writable text file object receiving the JSON lines.
            lock: Lock held around each write so that lines produced by
                different workers do not interleave.
        """
        threading.Thread.__init__(self)
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.localFile = localFile
        self.lock = lock

    def run(self):
        """Parse queued page sources until ``PARSE_EXIT`` is set."""
        print("开启解析线程" + self.threadName)
        while not PARSE_EXIT:
            try:
                # Block briefly instead of busy-polling an empty queue.
                html = self.dataQueue.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue
            try:
                self.parse(html)
            except Exception:
                # Worker boundary: report the bad page and keep the thread
                # alive for the remaining pages rather than hiding the error.
                logging.getLogger(__name__).exception(
                    "%s failed to parse a page", self.threadName
                )
        print("结束解析线程" + self.threadName)

    @staticmethod
    def _first_text(node, path):
        """Return the text of the first match of *path* under *node*, or None."""
        matches = node.xpath(path)
        return matches[0].text if matches else None

    def _extract(self, node):
        """Build the record for one review node; missing fields become None."""
        return {key: self._first_text(node, path) for key, path in self._FIELD_PATHS}

    def _save(self, items):
        """Append *items* to the output file as one JSON line, under the lock."""
        line = json.dumps(items, ensure_ascii=False) + "\n"
        with self.lock:
            self.localFile.write(line)

    def parse(self, html):
        """Extract every review found in *html* and write it to the output file.

        A review that lacks one of the expected elements is still written,
        with ``None`` for the missing field, instead of aborting the rest of
        the page.
        """
        text = etree.HTML(html)
        if text is None:
            # Presumably an empty/unparseable document (some lxml versions
            # return None here) -- nothing to extract.
            return
        for node in text.xpath(self._REVIEW_PATH):
            self._save(self._extract(node))

def _wait_until_empty(work_queue, poll_interval=0.1):
    """Block until *work_queue* reports empty, sleeping between checks."""
    while not work_queue.empty():
        time.sleep(poll_interval)


def main():
    """Crawl 20 review listing pages and append the extracted records to a file.

    Three crawler threads download the pages and three parser threads
    extract the records, which are appended to ``dbduanping.json`` as JSON
    lines. The workers are stopped through the module-level ``CRAWL_EXIT``
    and ``PARSE_EXIT`` flags.
    """
    global CRAWL_EXIT, PARSE_EXIT
    # Reset the flags so that a repeated call starts with running workers.
    CRAWL_EXIT = False
    PARSE_EXIT = False

    # 1. Page-number queue, pre-loaded with pages 1..20.
    pageQueue = Queue(20)
    for page in range(1, 21):
        pageQueue.put(page)

    # 2. Queue of downloaded page sources (at most 20 pending).
    dataQueue = Queue(20)

    # Lock shared by the parser threads to serialise writes to the file.
    lock = threading.Lock()

    threadCrawls = []
    threadParses = []

    # Open in append mode; the with-block closes the file on every exit path.
    with open("dbduanping.json", "a", encoding="utf-8") as localFile:
        try:
            # 3. Start the three crawler threads.
            for threadName in ['采集1号线程', '采集2号线程', '采集3号线程']:
                thread = ThreadCrawls(threadName, pageQueue, dataQueue)
                thread.start()
                threadCrawls.append(thread)

            # 4. Start the three parser threads.
            for threadName in ['解析1号线程', '解析2号线程', '解析3号线程']:
                thread = ThreadParses(threadName, dataQueue, localFile, lock)
                thread.start()
                threadParses.append(thread)

            # Wait until every page number has been taken, then let the
            # crawlers finish their in-flight page and exit.
            _wait_until_empty(pageQueue)
            CRAWL_EXIT = True
            print("pageQueue 为空")
            for thread in threadCrawls:
                thread.join()

            # All pages are queued by now; wait until the parsers took them.
            _wait_until_empty(dataQueue)
            print("dataQueue 为空")
            PARSE_EXIT = True
            for thread in threadParses:
                thread.join()
        finally:
            # Ordered shutdown that also runs on errors or Ctrl-C, so the
            # worker threads never outlive the open file. Crawlers go first
            # because the parsers must keep draining the data queue while a
            # crawler may still be putting a page on it. On the normal path
            # these steps are no-ops.
            CRAWL_EXIT = True
            for thread in threadCrawls:
                thread.join()
            PARSE_EXIT = True
            for thread in threadParses:
                thread.join()

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

weixin_39548972

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫