数据预处理
- 各种ID不用变,UserID,JobID(即OccupationID),MovieID
- 类别数据用字典转化为数字类型,Gender,Age,Genres
- Title先去掉结尾的年份,再按空格分词,把每个单词映射为整数索引,并用'<PAD>'补齐到固定长度15(这里只是单词到数字的转换,并非word2vec)
import os
import pickle
import re

import numpy as np
import pandas as pd
def load_data(data_dir='./ml-1m'):
    """Load the MovieLens ``.dat`` files and encode them as numeric features.

    Reads ``users.dat``, ``movies.dat`` and ``ratings.dat`` (``::``-separated,
    no header row) from *data_dir* and applies the following encoding:

    * ``UserID``, ``JobID`` and ``MovieID`` are kept unchanged.
    * ``Gender`` is mapped with ``{'F': 0, 'M': 1}``.
    * ``Age`` is mapped to the index of its value among the sorted distinct
      ages.
    * ``Genres`` becomes a list of genre ids padded with the ``'<PAD>'`` id.
    * ``Title`` loses its trailing ``(year)``, is split on whitespace and
      becomes a list of word ids padded with the ``'<PAD>'`` id up to
      ``title_count`` entries.

    All id tables are built from *sorted* vocabularies so that the encoding is
    identical on every run (enumerating a ``set`` of strings directly yields a
    different order from one interpreter process to the next).

    Args:
        data_dir: Directory containing the three ``.dat`` files.

    Returns:
        An 11-tuple ``(title_count, title_set, genres2int, features,
        targets_values, ratings, users, movies, data, movies_orig,
        users_orig)`` where ``features`` / ``targets_values`` are the
        ``.values`` arrays of the merged table without / with only the
        ``ratings`` column, and ``movies_orig`` / ``users_orig`` are the
        ``.values`` arrays taken before any encoding was applied.
    """
    # ---- users ---------------------------------------------------------
    users_title = ['UserID', 'Gender', 'Age', 'JobID', 'Zip-code']
    users = pd.read_csv(os.path.join(data_dir, 'users.dat'), sep='::',
                        header=None, names=users_title, engine='python')
    users = users.filter(regex='UserID|Gender|Age|JobID')  # drops Zip-code
    users_orig = users.values  # snapshot taken before encoding

    gender_map = {'F': 0, 'M': 1}
    users['Gender'] = users['Gender'].map(gender_map)

    age_map = {val: ii for ii, val in enumerate(sorted(set(users['Age'])))}
    users['Age'] = users['Age'].map(age_map)

    # ---- movies --------------------------------------------------------
    movies_title = ['MovieID', 'Title', 'Genres']
    movies = pd.read_csv(os.path.join(data_dir, 'movies.dat'), sep='::',
                         header=None, names=movies_title, engine='python')
    movies_orig = movies.values  # snapshot taken before encoding

    # Strip the trailing "(year)" from each title.
    pattern = re.compile(r'^(.*)\((\d+)\)$')

    def _strip_year(title):
        found = pattern.match(title)
        # A title without a "(year)" suffix is kept as-is instead of crashing.
        return found.group(1) if found else title

    movies['Title'] = movies['Title'].map(_strip_year)

    # Genre vocabulary -> ids.
    genres_set = set()
    for val in movies['Genres'].str.split('|'):
        genres_set.update(val)
    genres_set.add('<PAD>')
    genres2int = {val: ii for ii, val in enumerate(sorted(genres_set))}

    # Every genre list is padded to the largest id, i.e. the vocabulary size
    # (including '<PAD>') minus one; this length is kept from the original
    # implementation so downstream shapes do not change.
    genres_len = max(genres2int.values())
    genres_pad = genres2int['<PAD>']
    genres_map = {}
    for val in set(movies['Genres']):
        ids = [genres2int[name] for name in val.split('|')]
        ids.extend([genres_pad] * (genres_len - len(ids)))
        genres_map[val] = ids
    movies['Genres'] = movies['Genres'].map(genres_map)

    # Title-word vocabulary -> ids.
    title_set = set()
    for val in movies['Title'].str.split():
        title_set.update(val)
    title_set.add('<PAD>')
    title2int = {val: ii for ii, val in enumerate(sorted(title_set))}

    title_count = 15  # fixed number of word ids stored per title
    title_pad = title2int['<PAD>']
    title_map = {}
    for val in set(movies['Title']):
        ids = [title2int[word] for word in val.split()]
        ids.extend([title_pad] * (title_count - len(ids)))
        title_map[val] = ids
    movies['Title'] = movies['Title'].map(title_map)

    # ---- ratings -------------------------------------------------------
    ratings_title = ['UserID', 'MovieID', 'ratings', 'timestamps']
    ratings = pd.read_csv(os.path.join(data_dir, 'ratings.dat'), sep='::',
                          header=None, names=ratings_title, engine='python')
    ratings = ratings.filter(regex='UserID|MovieID|ratings')  # drops timestamps

    # ---- combine and split into X / y ----------------------------------
    data = pd.merge(pd.merge(ratings, users), movies)

    target_fields = ['ratings']
    features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields]

    features = features_pd.values
    targets_values = targets_pd.values

    return title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig
加载数据并保存到本地
- title_count:Title字段的长度(15)
- title_set:Title文本的集合
- genres2int:电影类型转数字的字典
- features:是输入X
- targets_values:是学习目标y
- ratings:评分数据集的Pandas对象
- users:用户数据集的Pandas对象
- movies:电影数据的Pandas对象
- data:三个数据集组合在一起的Pandas对象
- movies_orig:没有做数据处理的原始电影数据
- users_orig:没有做数据处理的原始用户数据
# Run the preprocessing once and cache every returned object in 'preprocess.p'.
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = load_data()

# Use a context manager so the file is flushed and closed right after the dump
# instead of leaving an open handle behind.
with open('preprocess.p', 'wb') as f:
    pickle.dump((title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig), f)
预处理后的数据
# Preview the first rows of the encoded users table.
users.head()
|   | UserID | Gender | Age | JobID |
|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 10 |
| 1 | 2 | 1 | 5 | 16 |
| 2 | 3 | 1 | 6 | 15 |
| 3 | 4 | 1 | 2 | 7 |
| 4 | 5 | 1 | 6 | 20 |
# Preview the first rows of the encoded movies table.
movies.head()
|   | MovieID | Title | Genres |
|---|---|---|---|
| 0 | 1 | [2194, 4563, 2402, 2402, 2402, 2402, 2402, 240... | [16, 18, 5, 13, 13, 13, 13, 13, 13, 13, 13, 13... |
| 1 | 2 | [2558, 2402, 2402, 2402, 2402, 2402, 2402, 240... | [10, 18, 1, 13, 13, 13, 13, 13, 13, 13, 13, 13... |
| 2 | 3 | [1335, 4290, 3288, 2402, 2402, 2402, 2402, 240... | [5, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13... |
| 3 | 4 | [2423, 5164, 3171, 2402, 2402, 2402, 2402, 240... | [5, 17, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13... |
| 4 | 5 | [4573, 2552, 1568, 2808, 2806, 1319, 2402, 240... | [5, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13... |
# Show the first movie row as a raw array: MovieID, padded title ids, padded genre ids.
movies.values[0]
array([1,
list([2194, 4563, 2402, 2402, 2402, 2402, 2402, 2402, 2402, 2402, 2402, 2402, 2402, 2402, 2402]),
list([16, 18, 5, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13])],
dtype=object)