多线程
生产者
def produce():
    """Load the table and enqueue one task per ``update_province`` group.

    Reads every row of ``table_name`` through the connection returned by
    ``mysql_conn()``, groups the rows by the ``update_province`` column and
    puts each ``(name, group)`` tuple on the shared queue ``q``.
    """
    # A table name cannot be bound as a query parameter, so it is interpolated
    # into the statement; table_name must therefore never be user-supplied.
    sql = """SELECT * FROM %s"""
    conn, cursor = mysql_conn()
    try:
        df = pd.read_sql(sql % table_name, conn)
    finally:
        # read_sql has already materialised the whole result, so the database
        # handles can be released before the queue is filled.
        # NOTE(review): assumes mysql_conn() hands out a fresh DB-API
        # connection/cursor pair on each call -- confirm before merging.
        cursor.close()
        conn.close()
    for name, groups in df.groupby("update_province"):
        print("name %s %s %s" % (name, len(groups), groups.shape))
        q.put((name, groups), block=True)
        print("q.qsize() %s" % q.qsize())
消费者
def customer():
    """Drain the shared queue ``q``, writing one CSV dump per dequeued group.

    Each task is a ``(name, group)`` tuple. Groups with a truthy name are
    passed through ``ipv6_update_city`` first; groups with a falsy name are
    written as they are. The ``minip`` and ``maxip`` columns are converted
    with ``inet_ntoa6`` and the rows are appended, ``$``-separated and after a
    header line, to ``city_update_<unix seconds>.csv``.

    The function returns as soon as the queue is found empty. ``task_done()``
    is called for every dequeued task, even when processing raises, so a
    failing group cannot leave ``q.join()`` waiting forever.
    """
    # The Empty exception lives in ``queue`` on Python 3 and ``Queue`` on Python 2.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    header = "$".join(
        ["id", "minip", "maxip", "update_province", "city", "update_city", "update_city_cidr"]) + "\n"

    while True:
        # A non-blocking get replaces "check empty(), then blocking get()":
        # with several consumers another thread can take the last item
        # between the two calls, leaving this one blocked forever.
        try:
            deal = q.get(block=False)
        except Empty:
            break
        try:
            print("cur group start %s" % (deal[0],))
            if deal[0]:
                df = ipv6_update_city(deal[1])
            else:
                df = deal[1]
            df["minip"] = df["minip"].map(inet_ntoa6)
            df["maxip"] = df["maxip"].map(inet_ntoa6)
            data = ["$".join(['%s' % _item for _item in item]) + "\n" for item in df.values]
            # Drop the frame before the file I/O to keep peak memory down.
            del df
            # NOTE(review): the file name only has one-second resolution, so
            # two threads finishing in the same second append to the same
            # file, each with its own header line -- confirm this is intended.
            with open("city_update_" + str(int(time.time())) + ".csv", "a+") as f:
                f.write(header)
                f.writelines(data)
            print("cur group end %s %s" % (deal[0], q.qsize()))
            time.sleep(5)
        finally:
            q.task_done()
入口函数
def func():
    """Fill the queue, start four consumer threads and wait for the queue to drain."""
    produce()
    # A plain loop: the threads are started for their side effect only, so
    # there is no list worth building.
    for _ in range(4):
        Thread(target=customer).start()
    # Returns once task_done() has been called for every item put on the queue.
    q.join()
多进程
# -*- coding: utf-8 -*-
import codecs
import time
from multiprocessing import Process, Manager
import pandas as pd
class ProducerProcess(Process):
    """Producer process: puts one stored item on the task queue when run."""

    def __init__(self, group, task_queue):
        super(ProducerProcess, self).__init__()
        self.task_queue = task_queue
        self.group = group

    def run(self):
        """Enqueue the stored group, then sleep for one second."""
        payload = self.group
        self.task_queue.put(payload)
        time.sleep(1)
class ConsumerProcess(Process):
    """Consumer process: handles queued items until the task queue is empty."""

    def __init__(self, task_queue):
        Process.__init__(self)
        self.task_queue = task_queue

    def run(self):
        """Take items off the queue and pass each to ``process_data``.

        Returns as soon as the queue is found empty.
        """
        # The Empty exception lives in ``queue`` on Python 3 and ``Queue`` on Python 2.
        try:
            from queue import Empty
        except ImportError:
            from Queue import Empty

        while True:
            # A non-blocking get replaces "check empty(), then blocking
            # get()": another consumer can take the last item between the two
            # calls, which would leave this process blocked forever.
            try:
                data = self.task_queue.get(block=False)
            except Empty:
                break
            process_data(data)
            time.sleep(2)  # stands in for the time real processing would take
# Data-processing function: derive column "C" and append the rows to a CSV file.
def process_data(data, path="test.csv"):
    """Append the rows of ``data``, plus a derived ``C`` column, to a CSV file.

    ``C`` is column ``A`` with ``"hello"`` appended, so ``A`` must hold
    strings. Every cell is formatted with ``%s``, which lets non-string
    columns be written as well. The caller's DataFrame is not modified.

    :param data: DataFrame with a string column ``A``.
    :param path: file to append to (UTF-8); defaults to ``"test.csv"``.
    """
    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    frame = data.assign(C=data["A"] + "hello")
    ls = [",".join(['%s' % value for value in row]) + "\n" for row in frame.values.tolist()]
    with codecs.open(path, 'a+', 'utf8') as wfp:
        wfp.writelines(ls)
def func():
    """Run the producer/consumer demo on a small grouped DataFrame.

    One producer process is created per group of column ``B``; two consumer
    processes then drain the shared queue. Consumers exit as soon as they see
    an empty queue, so all producers are joined before any consumer starts.
    Starting both sets together would let a consumer quit before anything had
    been queued, leaving that work unprocessed.
    """
    # Shared task queue served by a manager process.
    manager = Manager()
    try:
        task_queue = manager.Queue()
        # Sample DataFrame that is split into groups by column "B".
        df = pd.DataFrame({'A': [1, 2, 3, 4, 5],
                           'B': ['a', 'b', 'a', 'b', 'a']})
        df = df.astype({"A": "string"})
        producer_processes = [ProducerProcess(group, task_queue) for _, group in df.groupby('B')]
        consumer_processes = [ConsumerProcess(task_queue) for _ in range(2)]
        # Fill the queue completely first.
        for process in producer_processes:
            process.start()
        for process in producer_processes:
            process.join()
        # Reported after the producers finish so it shows the queued task
        # count; before any producer has started it can only ever be zero.
        print("task_queue %s" % task_queue.qsize())
        # Every task is now queued, so consumers cannot exit prematurely.
        for process in consumer_processes:
            process.start()
        for process in consumer_processes:
            process.join()
    finally:
        # Stop the manager's server process instead of leaving it to exit time.
        manager.shutdown()
# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    func()