multiprocessing并发、大dataframe切分

要求:
1、要求读取一个大的csv文件(接近2000万行),用于机器学习形成关联
2、字段包括 itemid、timestamp
3、记录的dataframe,对记录做笛卡尔积,同时timestamp差值超出300秒部分不要,itemid相同时也不要
4、单个服务器处理生成数据

最终:
1、并发multiprocessing
2、切分成小dataframe做笛卡尔积(不可避免会有数据缺失)

multiprocessing实现数据并发处理

# Fan the chunks of df_result out over a fixed number of worker processes.
process_num = 100
split_count = 30 * 24
split_length = len(df_result) // split_count

# split_count equal-width chunks ...
list_of_splits = [
    df_result.iloc[n * split_length:(n + 1) * split_length]
    for n in range(split_count)
]
# ... plus one open-ended tail chunk. A fixed-width tail would silently drop
# rows whenever the remainder exceeds split_length (e.g. fewer rows than
# split_count, where split_length is 0).
list_of_splits.append(df_result.iloc[split_count * split_length:])

children = []
for i in range(process_num):
    # Worker i handles chunks i, i + process_num, i + 2 * process_num, ...
    childprocess = Process(target=inter_relation_gen, args=(list_of_splits[i::process_num],))
    childprocess.start()
    children.append(childprocess)
# Wait explicitly so the parent only continues once every worker has finished.
for childprocess in children:
    childprocess.join()

实现dataframe的笛卡尔积,同时对笛卡尔积的数据做条件过滤

def _collect_neighbour_pairs(item_ids, timestamps, step, max_gap=300, max_pairs=50):
    """Pair each row with nearby rows on one side of it.

    Args:
        item_ids: ids of the rows, ordered by ascending timestamp.
        timestamps: numeric timestamps aligned with ``item_ids``.
        step: ``1`` to pair each row with the rows after it, ``-1`` to pair it
            with the rows before it.
        max_gap: scanning for a row stops at the first neighbour whose
            timestamp is more than this far away in the scan direction.
        max_pairs: maximum number of pairs emitted per anchor row.

    Returns:
        A list of ``[anchor_id, neighbour_id, neighbour_timestamp]`` rows.
    """
    total = len(item_ids)
    # Anchors are visited in the scan direction so the output row order
    # matches a plain forward / backward sweep over the frame.
    anchors = range(total) if step > 0 else range(total - 1, -1, -1)
    pairs = []
    for i in anchors:
        anchor_id = item_ids[i]
        anchor_ts = timestamps[i]
        # Start at the row directly adjacent to the anchor, in both directions.
        neighbours = range(i + 1, total) if step > 0 else range(i - 1, -1, -1)
        taken = 0
        for j in neighbours:
            other_id = item_ids[j]
            other_ts = timestamps[j]
            if other_id == anchor_id:
                # Never pair an id with itself.
                continue
            if (other_ts - anchor_ts) * step > max_gap:
                # Rows are time-ordered, so everything further is out of range too.
                break
            pairs.append([anchor_id, other_id, other_ts])
            taken += 1
            if taken == max_pairs:
                break
    return pairs


def inter_relation_gen(inter_source_df_list, write_path=None):
    """Write forward and backward neighbour pairs for each dataframe chunk.

    Every frame is sorted by its ``'timestamp:float'`` column; column 0 is then
    read as the id and column 1 as the timestamp (both by position). Each row
    is paired with up to 50 rows of a different id within 300 timestamp units
    after it (forward pass) and before it (backward pass). Duplicates are
    dropped and each pass is written as a header-less CSV:
    ``<prefix>bert4recbole0.inter.<uuid>`` (forward) and
    ``<prefix>bert4recbole1.inter.<uuid>`` (backward).

    Args:
        inter_source_df_list: iterable of pandas DataFrames to process.
        write_path: string prefix prepended verbatim to the output file names
            (include the trailing separator). Defaults to the module-level
            ``Write_path``.
    """
    # Resolved lazily so the module-level constant is only needed when no
    # explicit prefix is supplied.
    prefix = Write_path if write_path is None else write_path
    columns = ['user_id:token', 'item_id:token', 'timestamp:float']

    for inter_source_df in inter_source_df_list:
        run_id = str(uuid.uuid4())
        print('started inter_relation_gen' + ':' + str(inter_source_df.shape) + '---' + run_id + '\n')
        df_1 = inter_source_df.sort_values(by=['timestamp:float'], ascending=True)
        df_1.reset_index(drop=True, inplace=True)

        # Pull the two columns out once; per-cell .iloc lookups inside the
        # nested loops are far slower than indexing plain lists.
        item_ids = df_1.iloc[:, 0].tolist()
        timestamps = df_1.iloc[:, 1].tolist()

        # Forward pass: each row paired with the rows that follow it.
        result = pd.DataFrame(_collect_neighbour_pairs(item_ids, timestamps, 1), columns=columns)
        result.drop_duplicates(inplace=True)
        print('result.shape 1:', result.shape)
        result.to_csv(path_or_buf=prefix + 'bert4recbole0.inter' + '.' + run_id,
                      sep=',', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='\\')

        # Backward pass: each row paired with the rows that precede it,
        # beginning with the immediately preceding one.
        result = pd.DataFrame(_collect_neighbour_pairs(item_ids, timestamps, -1), columns=columns)
        print('result.shape 2:', result.shape)
        result.drop_duplicates(inplace=True)
        result.to_csv(path_or_buf=prefix + 'bert4recbole1.inter' + '.' + run_id,
                      sep=',', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='\\')
        print('end inter_relation_gen' + ':' + str(inter_source_df.shape) + '---' + run_id + '\n')

实现大型Dataframe切分

split_count = 30 * 24  # number of equal-width chunks to cut
split_length = len(df_result) // split_count  # rows in each equal-width chunk
# Holds the resulting chunks: split_count equal-width slices ...
list_of_splits = [
    df_result.iloc[n * split_length:(n + 1) * split_length]
    for n in range(split_count)
]
# ... followed by one open-ended tail slice, so no row is lost when the
# remainder is larger than split_length (e.g. fewer rows than split_count).
list_of_splits.append(df_result.iloc[split_count * split_length:])

完整代码

import pandas as pd
import time,csv
# Widen pandas' console display limits for debugging output.
_DISPLAY_OPTIONS = (
    ('display.max_rows', 3000),
    ('display.max_columns', None),
    ('display.width', 3000),
    ('max_colwidth', 100),
)
for _option_name, _option_value in _DISPLAY_OPTIONS:
    pd.set_option(_option_name, _option_value)
import uuid
from sklearn.preprocessing import  LabelEncoder
from multiprocessing import Process
# from multiprocessing import Pool
# from multiprocessing.dummy import Pool as ThreadPool
# Module-level LabelEncoder instance.
# NOTE(review): nothing in the visible code references it — confirm before removing.
encoder = LabelEncoder()
def unix_timestamp_specified(specified_time):
    """Convert a ``DD/MM/YYYY HH:MM:SS[.ffffff]`` string to an integer Unix timestamp.

    The value is parsed with ``time.strptime`` and converted with
    ``time.mktime``. If neither layout can be applied, the offending value is
    printed and the current time is returned instead.
    """
    # The fractional-second layout is attempted first, then the plain one.
    layouts = ("%d/%m/%Y %H:%M:%S.%f", "%d/%m/%Y %H:%M:%S")
    for layout in layouts:
        try:
            parsed = time.strptime(specified_time, layout)
            return int(time.mktime(parsed))
        except Exception:
            # Fall through to the next layout (or to the fallback below).
            continue
    # Best-effort fallback: report the bad value and substitute "now".
    print (specified_time)
    return int(time.time())
# String prefix prepended to every output file name written by the workers.
Write_path = r"dataset/bert4recbole/"

# Load the source CSV with every column coerced to text, derive an integer
# 'timestamp' column from 'insert_time', then order the rows by it.
# NOTE(review): this runs at import time; under the multiprocessing 'spawn'
# start method each child re-imports the main module and would repeat this
# load — confirm the start method in use.
# NOTE(review): inter_relation_gen sorts by a 'timestamp:float' column and
# reads columns 0 and 1 by position, while only 'timestamp' is created here —
# confirm the columns are selected/renamed before the frame reaches it.
df = pd.read_csv(r'../../data/202405.csv').astype('str')
df['timestamp'] = df['insert_time'].apply(unix_timestamp_specified)
df = df.sort_values(by=['timestamp'], ascending=True).reset_index(drop=True)

def _collect_neighbour_pairs(item_ids, timestamps, step, max_gap=300, max_pairs=50):
    """Pair each row with nearby rows on one side of it.

    Args:
        item_ids: ids of the rows, ordered by ascending timestamp.
        timestamps: numeric timestamps aligned with ``item_ids``.
        step: ``1`` to pair each row with the rows after it, ``-1`` to pair it
            with the rows before it.
        max_gap: scanning for a row stops at the first neighbour whose
            timestamp is more than this far away in the scan direction.
        max_pairs: maximum number of pairs emitted per anchor row.

    Returns:
        A list of ``[anchor_id, neighbour_id, neighbour_timestamp]`` rows.
    """
    total = len(item_ids)
    # Anchors are visited in the scan direction so the output row order
    # matches a plain forward / backward sweep over the frame.
    anchors = range(total) if step > 0 else range(total - 1, -1, -1)
    pairs = []
    for i in anchors:
        anchor_id = item_ids[i]
        anchor_ts = timestamps[i]
        # Start at the row directly adjacent to the anchor, in both directions.
        neighbours = range(i + 1, total) if step > 0 else range(i - 1, -1, -1)
        taken = 0
        for j in neighbours:
            other_id = item_ids[j]
            other_ts = timestamps[j]
            if other_id == anchor_id:
                # Never pair an id with itself.
                continue
            if (other_ts - anchor_ts) * step > max_gap:
                # Rows are time-ordered, so everything further is out of range too.
                break
            pairs.append([anchor_id, other_id, other_ts])
            taken += 1
            if taken == max_pairs:
                break
    return pairs


def inter_relation_gen(inter_source_df_list, write_path=None):
    """Write forward and backward neighbour pairs for each dataframe chunk.

    Every frame is sorted by its ``'timestamp:float'`` column; column 0 is then
    read as the id and column 1 as the timestamp (both by position). Each row
    is paired with up to 50 rows of a different id within 300 timestamp units
    after it (forward pass) and before it (backward pass). Duplicates are
    dropped and each pass is written as a header-less CSV:
    ``<prefix>bert4recbole0.inter.<uuid>`` (forward) and
    ``<prefix>bert4recbole1.inter.<uuid>`` (backward).

    Args:
        inter_source_df_list: iterable of pandas DataFrames to process.
        write_path: string prefix prepended verbatim to the output file names
            (include the trailing separator). Defaults to the module-level
            ``Write_path``.
    """
    # Resolved lazily so the module-level constant is only needed when no
    # explicit prefix is supplied.
    prefix = Write_path if write_path is None else write_path
    columns = ['user_id:token', 'item_id:token', 'timestamp:float']

    for inter_source_df in inter_source_df_list:
        run_id = str(uuid.uuid4())
        print('started inter_relation_gen' + ':' + str(inter_source_df.shape) + '---' + run_id + '\n')
        df_1 = inter_source_df.sort_values(by=['timestamp:float'], ascending=True)
        df_1.reset_index(drop=True, inplace=True)

        # Pull the two columns out once; per-cell .iloc lookups inside the
        # nested loops are far slower than indexing plain lists.
        item_ids = df_1.iloc[:, 0].tolist()
        timestamps = df_1.iloc[:, 1].tolist()

        # Forward pass: each row paired with the rows that follow it.
        result = pd.DataFrame(_collect_neighbour_pairs(item_ids, timestamps, 1), columns=columns)
        result.drop_duplicates(inplace=True)
        print('result.shape 1:', result.shape)
        result.to_csv(path_or_buf=prefix + 'bert4recbole0.inter' + '.' + run_id,
                      sep=',', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='\\')

        # Backward pass: each row paired with the rows that precede it,
        # beginning with the immediately preceding one.
        result = pd.DataFrame(_collect_neighbour_pairs(item_ids, timestamps, -1), columns=columns)
        print('result.shape 2:', result.shape)
        result.drop_duplicates(inplace=True)
        result.to_csv(path_or_buf=prefix + 'bert4recbole1.inter' + '.' + run_id,
                      sep=',', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='\\')
        print('end inter_relation_gen' + ':' + str(inter_source_df.shape) + '---' + run_id + '\n')

if __name__ == '__main__':
    # Entry point: cut the loaded frame into chunks and process them in
    # parallel worker processes.
    df_result = df
    print ('df_result',df_result.shape)
    print ('----------------------\n')
    process_num = 100
    split_count = 30 * 24
    split_length = len(df_result) // split_count

    # split_count equal-width chunks ...
    list_of_splits = [
        df_result.iloc[n * split_length:(n + 1) * split_length]
        for n in range(split_count)
    ]
    # ... plus one open-ended tail chunk. A fixed-width tail would silently
    # drop rows whenever the remainder exceeds split_length (e.g. fewer rows
    # than split_count, where split_length is 0).
    list_of_splits.append(df_result.iloc[split_count * split_length:])

    children = []
    for i in range(process_num):
        # Worker i handles chunks i, i + process_num, i + 2 * process_num, ...
        childprocess = Process(target=inter_relation_gen, args=(list_of_splits[i::process_num],))
        childprocess.start()
        children.append(childprocess)
    # Wait explicitly so the script only finishes once every worker is done.
    for childprocess in children:
        childprocess.join()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值