记录第一次数据预处理过程
数据描述
item_idx 和 user_idx 分别是商品名和用户名对应的 id。train_pair.json 是一些二元组 [u, i] 的列表,表示用户 u 喜欢商品 i,是我们的训练数据(这里需要自己构造负例)。valid_pair 和 test_pair 的格式一样,也是一些二元组,每个用户对应 1 个正例商品和 100 个负例商品。我们任务的目的是给这个用户的这 101 个商品排序,然后看那一个正例商品的排序位置。
最终数据目标
类似于gowalla中 train.txt 是
每行第一个是user 后面是正例item
test.txt 中: user - 要测试的items
从list到txt的remap:
import torch
import numpy as np
from torch.utils.data import Dataset
import json
import string
# Raw strings keep the Windows backslashes literal; the resulting path values
# are the same as before, without relying on invalid escapes such as "\G".
org_path = r"D:\GraphNetworkCF-main\Data\men\user_idx.json"
des_path = r"D:\GraphNetworkCF-main\Data\men\user_list.txt"


def write_remap_list(src_path, dst_path, start=1):
    """Write an ``org_id remap_id`` table for the entries of a JSON file.

    The JSON document at ``src_path`` is loaded and iterated in order (for a
    JSON object this iterates its keys).  Each entry is written to
    ``dst_path`` as ``"<entry> <n>"`` where ``n`` counts up from ``start``,
    preceded by the header line ``org_id remap_id``.

    The destination is written in a single pass and overwritten, so running
    the script twice no longer appends a second header and a second copy of
    every row.  It is written as UTF-8 because the script that reads the list
    back opens it with ``encoding='utf-8'``.

    Args:
        src_path: Path of the JSON file holding the original ids.
        dst_path: Path of the text file to create.
        start: First remap id to assign (the original script used 1).

    Returns:
        The number of id rows written (header excluded).
    """
    with open(src_path, "r", encoding="utf8") as fp:
        org_ids = json.load(fp)

    rows = ["org_id remap_id\n"]
    for remap_id, org_id in enumerate(org_ids, start):
        rows.append(str(org_id) + " " + str(remap_id) + "\n")

    with open(dst_path, "w", encoding="utf-8") as fl:
        fl.writelines(rows)
    return len(rows) - 1


if __name__ == "__main__":
    write_remap_list(org_path, des_path)
映射,转化为索引应该比字符串好处理了
一共4807个user,43832个item
train_pair 到 user-items 初步
import torch
import numpy as np
from torch.utils.data import Dataset
import json
import string
import textwrap
import fileinput
import os
# Raw strings keep the Windows backslashes literal; the resulting path values
# are the same as before, without relying on invalid escapes such as "\G".
org_path_user = r"D:\GraphNetworkCF-main\Data\men\user_list.txt"
org_path_item = r"D:\GraphNetworkCF-main\Data\men\item_list.txt"
org_path_train_pari = r"D:\GraphNetworkCF-main\Data\men\train_pair.json"
des_path = r"D:\GraphNetworkCF-main\Data\men\train.txt"


def load_remap_table(path):
    """Read an ``org_id remap_id`` text file into a ``{org_id: remap_id}`` dict.

    Each non-empty line is split at its LAST space, so an original id that
    itself contains spaces is still parsed as one key.  Blank lines and the
    ``org_id remap_id`` header row are skipped.  Keys and values are kept as
    strings.

    Raises:
        ValueError: If a non-empty line contains no space at all.
    """
    table = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line or line == "org_id remap_id":
                continue
            org_id, remap_id = line.rsplit(" ", 1)
            table[org_id] = remap_id
    return table


def build_user_item_lines(pairs, user_dic, item_dic):
    """Group ``[user, item]`` pairs into one ``"uid item item ...\\n"`` line per user.

    Index 0 of every pair is the user and index 1 is the item.  Both are
    looked up through ``str(...)`` because the remap tables are keyed by the
    string form of the original id.  All items of a user are collected on a
    single line even when that user's pairs are not adjacent in ``pairs``;
    users appear in order of first occurrence (relies on dicts preserving
    insertion order).

    Raises:
        KeyError: If a user or item is missing from its remap table.
    """
    grouped = {}
    for pair in pairs:
        user_id = user_dic[str(pair[0])]
        item_id = item_dic[str(pair[1])]
        grouped.setdefault(user_id, []).append(item_id)
    return [
        " ".join([str(user_id)] + [str(item_id) for item_id in item_ids]) + "\n"
        for user_id, item_ids in grouped.items()
    ]


def convert_train_pairs(user_path, item_path, pair_path, out_path):
    """Convert a JSON list of ``[user, item]`` pairs into a user-items text file.

    The output is written once and overwritten, with no leading blank line
    and no per-pair reopening of the file, so re-running the script does not
    append a second copy of the data.

    Args:
        user_path: ``org_id remap_id`` list file for users.
        item_path: ``org_id remap_id`` list file for items.
        pair_path: JSON file holding the list of ``[user, item]`` pairs.
        out_path: Text file to create, one ``uid item item ...`` line per user.

    Returns:
        The number of user lines written.
    """
    user_dic = load_remap_table(user_path)
    item_dic = load_remap_table(item_path)
    with open(pair_path, "r", encoding="utf8") as fp:
        pairs = json.load(fp)

    lines = build_user_item_lines(pairs, user_dic, item_dic)
    with open(out_path, "w", encoding="utf-8") as fl:
        fl.writelines(lines)
    return len(lines)


if __name__ == "__main__":
    convert_train_pairs(org_path_user, org_path_item, org_path_train_pari, des_path)
进一步处理 train.txt(注意:下面代码中的路径指向的是 test.txt → test_1.txt;处理 train.txt 时需相应修改路径)
import torch
import numpy as np
from torch.utils.data import Dataset
import json
import string
import textwrap
import fileinput
import os
# Raw strings keep the Windows backslashes literal; the resulting path values
# are the same as before, without relying on invalid escapes such as "\G".
# NOTE(review): the variable is named "train" but points at test.txt.
org_path_train = r"D:\GraphNetworkCF-main\Data\men\test.txt"
des_path = r"D:\GraphNetworkCF-main\Data\men\test_1.txt"


def read_user_items(path):
    """Parse a ``uid item item ...`` text file into ``{uid: [items]}`` (ints).

    Lines are split on any run of whitespace, so blank lines (including a
    leading one) and trailing spaces are tolerated instead of failing on
    ``int('')``.  When the same uid appears on several lines the item lists
    are merged rather than the later line replacing the earlier one.

    Raises:
        ValueError: If a token is not an integer.
    """
    trains = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            uid = int(fields[0])
            trains.setdefault(uid, []).extend(int(tok) for tok in fields[1:])
    return trains


def renumber_user_items(trains):
    """Return output lines with users renumbered by rank and items sorted.

    Users are ordered by ascending original uid and written with their
    position in that order (0, 1, 2, ...) in place of the original uid.
    This equals ``uid - 1`` only when the original ids are exactly 1..n with
    no gaps; a missing id shifts every later user.  Each line has the form
    ``"<new_uid> <item> <item> ...\\n"`` with items in ascending order and no
    trailing space.
    """
    lines = []
    for new_uid, uid in enumerate(sorted(trains)):
        fields = [new_uid] + sorted(trains[uid])
        lines.append(" ".join(str(value) for value in fields) + "\n")
    return lines


def rewrite_user_items(src_path, dst_path):
    """Read ``src_path``, renumber/sort it, and write the result to ``dst_path``.

    The destination is overwritten in one pass, without a leading blank line,
    so the output can be read back by the same parser and a re-run does not
    append a second copy.

    Returns:
        The number of user lines written.
    """
    lines = renumber_user_items(read_user_items(src_path))
    with open(dst_path, "w", encoding="utf-8") as fl:
        fl.writelines(lines)
    return len(lines)


if __name__ == "__main__":
    rewrite_user_items(org_path_train, des_path)
最终效果