要求:
1、要求读取一个大的csv文件(接近2000万行),用于机器学习形成关联
2、字段包括 itemid、timestamp
3、记录的dataframe,对记录做笛卡尔积,同时timestamp差值超出300秒部分不要,itemid相同时也不要
4、单个服务器处理生成数据
最终:
1、并发multiprocessing
2、切分成小dataframe做笛卡尔积(不可避免会有数据缺失)
multiprocessing实现数据并发处理
process_num = 100
split_count = 30 * 24
split_length = len(df_result) // split_count
list_of_splits = []
# split_count + 1 iterations: the final (shorter) slice picks up the rows
# left over by the floor division above.
for n in range(split_count + 1):
    list_of_splits.append(df_result.iloc[n * split_length:(n + 1) * split_length])
# Round-robin the chunks over process_num workers: worker i gets every
# process_num-th chunk starting at i.
for i in range(process_num):
    childprocess = Process(target=inter_relation_gen, args=(list_of_splits[i::process_num],))
    childprocess.start()
实现dataframe的笛卡尔积,同时对笛卡尔积的数据做条件过滤
def inter_relation_gen(inter_source_df_list):
    """Generate (user_id, item_id, timestamp) co-occurrence pairs per chunk.

    For each chunk, two passes are made over the time-sorted rows:

    * forward pass  -- pair row i with up to 50 later rows within 300 seconds;
    * backward pass -- pair row i with up to 50 earlier rows within 300 seconds.

    Pairs whose two ids are identical are skipped.  Each pass is de-duplicated
    and written to its own CSV file, suffixed with a fresh UUID so concurrent
    worker processes never collide on file names.

    Parameters
    ----------
    inter_source_df_list : list of pandas.DataFrame
        Chunks whose column 0 holds the id and column 1 the integer Unix
        timestamp.  Each chunk is sorted here by a 'timestamp:float' column.
        NOTE(review): the loader in this file only produces a column named
        'timestamp' -- confirm the chunks are renamed before this runs.
    """
    for inter_source_df in inter_source_df_list:
        random_id = str(uuid.uuid4())
        print('started inter_relation_gen' + ':' + str(inter_source_df.shape) + '---' + random_id + '\n')
        df_1 = inter_source_df.sort_values(by=['timestamp:float'], ascending=True)
        df_1.reset_index(drop=True, inplace=True)
        # Materialize both columns once: per-cell .iloc[...].item() calls inside
        # the O(n * 50) inner loops dominate the runtime on large chunks.
        ids = df_1.iloc[:, 0].to_numpy()
        stamps = df_1.iloc[:, 1].to_numpy()
        n_rows = len(df_1)

        # ---- forward pass: up to 50 later neighbours within 300 s ----
        result = []
        for i in range(n_rows):
            user_id = ids[i]
            timestamp_user = stamps[i]
            k = 0  # neighbours accepted for row i (capped at 50)
            for j in range(i + 1, n_rows):
                item_id = ids[j]
                timestamp = stamps[j]
                if user_id == item_id:
                    continue  # identical ids: not a useful pair
                if timestamp - timestamp_user > 300:
                    break     # rows are sorted, so all later rows are too far
                result.append([user_id, item_id, timestamp])
                k += 1
                if k == 50:
                    break
        result = pd.DataFrame(result, columns=['user_id:token', 'item_id:token', 'timestamp:float'])
        result.drop_duplicates(inplace=True)
        print('result.shape 1:', result.shape)
        result.to_csv(path_or_buf=Write_path + 'bert4recbole0.inter' + '.' + str(random_id),
                      sep=',', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='\\')

        # ---- backward pass: up to 50 earlier neighbours within 300 s ----
        # BUG FIX: the original inner loop iterated range(i - 1).__reversed__(),
        # which starts at i - 2 and silently skips the row immediately before i;
        # reversed(range(i)) starts at i - 1 as intended.
        result = []
        for i in reversed(range(n_rows)):
            user_id = ids[i]
            timestamp_user = stamps[i]
            k = 0
            for j in reversed(range(i)):
                item_id = ids[j]
                timestamp = stamps[j]
                if user_id == item_id:
                    continue
                if timestamp_user - timestamp > 300:
                    break
                result.append([user_id, item_id, timestamp])
                k += 1
                if k == 50:
                    break
        result = pd.DataFrame(result, columns=['user_id:token', 'item_id:token', 'timestamp:float'])
        print('result.shape 2:', result.shape)
        result.drop_duplicates(inplace=True)
        result.to_csv(
            path_or_buf=Write_path + 'bert4recbole1.inter' + '.' + str(random_id),
            sep=',', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='\\')
        print('end inter_relation_gen' + ':' + str(inter_source_df.shape) + '---' + random_id + '\n')
实现大型Dataframe切分
split_count = 30 * 24  # how many chunks to cut the frame into
split_length = len(df_result) // split_count  # rows per chunk (floor)
list_of_splits = []  # holds the resulting chunk DataFrames
# split_count + 1 iterations: the final (shorter, possibly empty) slice
# collects the remainder rows left over by the floor division above.
for n in range(split_count + 1):
    list_of_splits.append(df_result.iloc[n * split_length:(n + 1) * split_length])
完整代码
# Standard library.
import csv
import time
import uuid
from multiprocessing import Process
# from multiprocessing import Pool
# from multiprocessing.dummy import Pool as ThreadPool

# Third party.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Widen pandas display limits so debug prints of large frames are readable.
pd.set_option('display.max_rows', 3000)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 3000)
# Bare 'max_colwidth' is a deprecated alias; use the namespaced key like the
# three options above.
pd.set_option('display.max_colwidth', 100)

encoder = LabelEncoder()  # NOTE(review): unused in this file -- confirm before removing
def unix_timestamp_specified(specified_time):
    """Parse a 'DD/MM/YYYY HH:MM:SS[.ffffff]' local-time string to Unix seconds.

    The fractional-seconds format is tried first, then the plain one.  If
    neither matches, the offending string is printed and the current time is
    returned as a best-effort fallback.
    """
    for fmt in ("%d/%m/%Y %H:%M:%S.%f", "%d/%m/%Y %H:%M:%S"):
        try:
            return int(time.mktime(time.strptime(specified_time, fmt)))
        except Exception:
            continue
    # Neither format matched: log the raw value and fall back to "now".
    print(specified_time)
    return int(time.time())
# Output directory for the generated .inter files (must already exist).
Write_path = r"dataset/bert4recbole/"
# Source CSV (~20M rows per the notes above); assumes it contains at least an
# 'insert_time' column -- TODO confirm against the actual file.
df = pd.read_csv(r'../../data/202405.csv')
# Cast everything to str so ids compare as strings downstream.
df = df.astype('str')
# Parse 'insert_time' ('DD/MM/YYYY HH:MM:SS[.f]') into integer Unix seconds.
df['timestamp'] = df['insert_time'].apply(unix_timestamp_specified)
# Globally time-sort once; each worker chunk re-sorts itself anyway.
# NOTE(review): inter_relation_gen sorts chunks by 'timestamp:float', but this
# frame only has 'timestamp' -- a rename step appears to be missing; verify.
df = df.sort_values(by=['timestamp'], ascending=True)
df.reset_index(drop=True, inplace=True)
def inter_relation_gen(inter_source_df_list):
    """Generate (user_id, item_id, timestamp) co-occurrence pairs per chunk.

    For each chunk, two passes are made over the time-sorted rows:

    * forward pass  -- pair row i with up to 50 later rows within 300 seconds;
    * backward pass -- pair row i with up to 50 earlier rows within 300 seconds.

    Pairs whose two ids are identical are skipped.  Each pass is de-duplicated
    and written to its own CSV file, suffixed with a fresh UUID so concurrent
    worker processes never collide on file names.

    Parameters
    ----------
    inter_source_df_list : list of pandas.DataFrame
        Chunks whose column 0 holds the id and column 1 the integer Unix
        timestamp.  Each chunk is sorted here by a 'timestamp:float' column.
        NOTE(review): the loader in this file only produces a column named
        'timestamp' -- confirm the chunks are renamed before this runs.
    """
    for inter_source_df in inter_source_df_list:
        random_id = str(uuid.uuid4())
        print('started inter_relation_gen' + ':' + str(inter_source_df.shape) + '---' + random_id + '\n')
        df_1 = inter_source_df.sort_values(by=['timestamp:float'], ascending=True)
        df_1.reset_index(drop=True, inplace=True)
        # Materialize both columns once: per-cell .iloc[...].item() calls inside
        # the O(n * 50) inner loops dominate the runtime on large chunks.
        ids = df_1.iloc[:, 0].to_numpy()
        stamps = df_1.iloc[:, 1].to_numpy()
        n_rows = len(df_1)

        # ---- forward pass: up to 50 later neighbours within 300 s ----
        result = []
        for i in range(n_rows):
            user_id = ids[i]
            timestamp_user = stamps[i]
            k = 0  # neighbours accepted for row i (capped at 50)
            for j in range(i + 1, n_rows):
                item_id = ids[j]
                timestamp = stamps[j]
                if user_id == item_id:
                    continue  # identical ids: not a useful pair
                if timestamp - timestamp_user > 300:
                    break     # rows are sorted, so all later rows are too far
                result.append([user_id, item_id, timestamp])
                k += 1
                if k == 50:
                    break
        result = pd.DataFrame(result, columns=['user_id:token', 'item_id:token', 'timestamp:float'])
        result.drop_duplicates(inplace=True)
        print('result.shape 1:', result.shape)
        result.to_csv(path_or_buf=Write_path + 'bert4recbole0.inter' + '.' + str(random_id),
                      sep=',', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='\\')

        # ---- backward pass: up to 50 earlier neighbours within 300 s ----
        # BUG FIX: the original inner loop iterated range(i - 1).__reversed__(),
        # which starts at i - 2 and silently skips the row immediately before i;
        # reversed(range(i)) starts at i - 1 as intended.
        result = []
        for i in reversed(range(n_rows)):
            user_id = ids[i]
            timestamp_user = stamps[i]
            k = 0
            for j in reversed(range(i)):
                item_id = ids[j]
                timestamp = stamps[j]
                if user_id == item_id:
                    continue
                if timestamp_user - timestamp > 300:
                    break
                result.append([user_id, item_id, timestamp])
                k += 1
                if k == 50:
                    break
        result = pd.DataFrame(result, columns=['user_id:token', 'item_id:token', 'timestamp:float'])
        print('result.shape 2:', result.shape)
        result.drop_duplicates(inplace=True)
        result.to_csv(
            path_or_buf=Write_path + 'bert4recbole1.inter' + '.' + str(random_id),
            sep=',', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='\\')
        print('end inter_relation_gen' + ':' + str(inter_source_df.shape) + '---' + random_id + '\n')
if __name__ == '__main__':
    df_result = df
    print('df_result', df_result.shape)
    print('----------------------\n')

    process_num = 100
    split_count = 30 * 24  # number of chunks
    # Rows per chunk; the extra iteration (split_count + 1) below picks up the
    # remainder rows left over by the floor division.
    split_length = len(df_result) // split_count
    list_of_splits = [
        df_result.iloc[n * split_length:(n + 1) * split_length]
        for n in range(split_count + 1)
    ]

    # Round-robin the chunks over the workers (worker i gets every
    # process_num-th chunk), then wait for them all so the parent does not
    # exit while children are still writing output files.
    children = []
    for i in range(process_num):
        childprocess = Process(target=inter_relation_gen,
                               args=(list_of_splits[i::process_num],))
        childprocess.start()
        children.append(childprocess)
    for childprocess in children:
        childprocess.join()