4-协同过滤(推荐系统)

协同过滤

导入包,数据

from fastai.collab import *
from fastai.tabular import *
path = untar_data(URLs.ML_SAMPLE)
path
WindowsPath('C:/Users/Wither8848/.fastai/data/movie_lens_sample')
ratings = pd.read_csv(path/'ratings.csv')
ratings.head()
   userId  movieId  rating   timestamp
0      73     1097     4.0  1255504951
1     561      924     3.5  1172695223
2     157      260     3.5  1291598691
3     358     1210     5.0   957481884
4     130      316     2.0  1138999234

建立模型,导入数据

data = CollabDataBunch.from_df(ratings, seed=42)

定义后面要用到的数据集列名

user,item,title = 'userId','movieId','title'

建立模型

y_range = [0,5.5]#评分范围
learn = collab_learner(data, n_factors=50, y_range=y_range)#建立模型,嵌入矩阵宽度为50,嵌入其实就是和one-hot矩阵相乘,就是数组查询

训练

learn.fit_one_cycle(3, 5e-3)
epoch  train_loss  valid_loss  time
0      1.609871    0.941913    00:47
1      0.835381    0.670620    00:47
2      0.645065    0.660076    00:46

电影数据集

载入数据集

path=Config.data_path()/'ml-100k'
  • 设置分隔符,分隔符为tab,设置表格头
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                      names=[user,item,'rating','timestamp'])
ratings.head()
   userId  movieId  rating  timestamp
0     196      242       3  881250949
1     186      302       3  891717742
2      22      377       1  878887116
3     244       51       2  880606923
4     166      346       1  886397596
  • 设置编码方式,该数据集的文件使用的是latin-1编码
movies = pd.read_csv(path/'u.item',  delimiter='|', encoding='latin-1', header=None,
                    names=[item, 'title', 'date', 'N', 'url', *[f'g{i}' for i in range(19)]])
movies.head()
   movieId  title              date         N    url                                                 g0  g1  g2  g3  g4  ...  g9  g10  g11  g12  g13  g14  g15  g16  g17  g18
0        1  Toy Story (1995)   01-Jan-1995  NaN  http://us.imdb.com/M/title-exact?Toy%20Story%2...    0   0   0   1   1  ...   0    0    0    0    0    0    0    0    0    0
1        2  GoldenEye (1995)   01-Jan-1995  NaN  http://us.imdb.com/M/title-exact?GoldenEye%20(...    0   1   1   0   0  ...   0    0    0    0    0    0    0    1    0    0
2        3  Four Rooms (1995)  01-Jan-1995  NaN  http://us.imdb.com/M/title-exact?Four%20Rooms%...    0   0   0   0   0  ...   0    0    0    0    0    0    0    1    0    0
3        4  Get Shorty (1995)  01-Jan-1995  NaN  http://us.imdb.com/M/title-exact?Get%20Shorty%...    0   1   0   0   0  ...   0    0    0    0    0    0    0    0    0    0
4        5  Copycat (1995)     01-Jan-1995  NaN  http://us.imdb.com/M/title-exact?Copycat%20(1995)    0   0   0   0   0  ...   0    0    0    0    0    0    0    1    0    0

5 rows × 24 columns

  • 合并表格:按共有的movieId列,把电影标题(title)并入评分表
rating_movie = ratings.merge(movies[[item, title]])
rating_movie.head()
   userId  movieId  rating  timestamp  title
0     196      242       3  881250949  Kolya (1996)
1      63      242       3  875747190  Kolya (1996)
2     226      242       5  883888671  Kolya (1996)
3     154      242       3  879138235  Kolya (1996)
4     306      242       5  876503793  Kolya (1996)
  • 验证集占10%,并用item_name=title指定以电影名(title列)而不是movieId作为物品列
data = CollabDataBunch.from_df(rating_movie, seed=42, valid_pct=0.1, item_name=title)
  • 查看数据集
data.show_batch()
userId  title                            target
237     Quiz Show (1994)                 4.0
871     Replacement Killers, The (1998)  3.0
342     Roman Holiday (1953)             5.0
686     Raiders of the Lost Ark (1981)   4.0
312     Babe (1995)                      5.0

建立模型

  • 设置y的范围:sigmoid只能无限趋近上界而取不到,上界若设为5就预测不出满分5分,所以把上界设为5.5
y_range = [0,5.5]
learn = collab_learner(data, n_factors=40, y_range=y_range, wd=1e-1)

找到最佳学习率

learn.lr_find()
learn.recorder.plot(skip_end=15)
<div>
    <style>
        /* Turns off some styling */
        progress {
            /* gets rid of default border in Firefox and Opera. */
            border: none;
            /* Needs to be in here for Safari polyfill so background images work as expected. */
            background-size: auto;
        }
        .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {
            background: #F44336;
        }
    </style>
  <progress value='0' class='' max='1', style='width:300px; height:20px; vertical-align: middle;'></progress>
  0.00% [0/1 00:00<00:00]
</div>
epoch  train_loss  valid_loss  time

<div>
    <style>
        /* Turns off some styling */
        progress {
            /* gets rid of default border in Firefox and Opera. */
            border: none;
            /* Needs to be in here for Safari polyfill so background images work as expected. */
            background-size: auto;
        }
        .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {
            background: #F44336;
        }
    </style>
  <progress value='99' class='' max='1406', style='width:300px; height:20px; vertical-align: middle;'></progress>
  7.04% [99/1406 00:23<05:13 3.0871]
</div>



LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.

![学习率查找曲线(output_31_2.png)](output_31_2.png)

训练

learn.fit_one_cycle(5, 5e-3)
epoch  train_loss  valid_loss  time
0      0.934881    0.943414    00:53
1      0.886381    0.878961    00:53
2      0.799583    0.831571    00:54
3      0.635470    0.816326    00:55
4      0.534644    0.815207    00:55

保存

learn.save('dotprod')

理解数据

载入模型,提取评分次数最多的1000部电影

learn.load('dotprod');
learn.model
EmbeddingDotBias(
  (u_weight): Embedding(944, 40)
  (i_weight): Embedding(1654, 40)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1654, 1)
)
g = rating_movie.groupby(title)['rating'].count()
top_movies = g.sort_values(ascending=False).index.values[:1000]
top_movies[:10]
array(['Star Wars (1977)', 'Contact (1997)', 'Fargo (1996)', 'Return of the Jedi (1983)', 'Liar Liar (1997)',
       'English Patient, The (1996)', 'Scream (1996)', 'Toy Story (1995)', 'Air Force One (1997)',
       'Independence Day (ID4) (1996)'], dtype=object)

查看数据偏差

is_item=True可以看到电影的偏差,也就是电影的好坏

movie_bias = learn.bias(top_movies, is_item=True)
movie_bias.shape
torch.Size([1000])

计算每部电影的平均分,再按偏差排序

mean_ratings = rating_movie.groupby(title)['rating'].mean()
movie_ratings = [(b, i, mean_ratings.loc[i]) for i,b in zip(top_movies,movie_bias)]
item0 = lambda o:o[0]
sorted(movie_ratings, key=item0)[:15]
[(tensor(-0.3865),
  'Children of the Corn: The Gathering (1996)',
  1.3157894736842106),
 (tensor(-0.3158), 'Mortal Kombat: Annihilation (1997)', 1.9534883720930232),
 (tensor(-0.3114),
  'Lawnmower Man 2: Beyond Cyberspace (1996)',
  1.7142857142857142),
 (tensor(-0.2630), 'Island of Dr. Moreau, The (1996)', 2.1578947368421053),
 (tensor(-0.2620), 'Cable Guy, The (1996)', 2.339622641509434),
 (tensor(-0.2514), 'Bio-Dome (1996)', 1.903225806451613),
 (tensor(-0.2494), 'Striptease (1996)', 2.2388059701492535),
 (tensor(-0.2407), 'Barb Wire (1996)', 1.9333333333333333),
 (tensor(-0.2351), "McHale's Navy (1997)", 2.1884057971014492),
 (tensor(-0.2127), "Joe's Apartment (1996)", 2.2444444444444445),
 (tensor(-0.2112), 'Ready to Wear (Pret-A-Porter) (1994)', 1.8333333333333333),
 (tensor(-0.2108), 'Crow: City of Angels, The (1996)', 1.9487179487179487),
 (tensor(-0.2074), 'Beverly Hills Ninja (1997)', 2.3125),
 (tensor(-0.1996), 'Beautician and the Beast, The (1997)', 2.313953488372093),
 (tensor(-0.1941), 'Leave It to Beaver (1997)', 1.8409090909090908)]
sorted(movie_ratings, key=lambda o: o[0], reverse=True)[:15]
[(tensor(0.6122), 'Titanic (1997)', 4.2457142857142856),
 (tensor(0.6111), "Schindler's List (1993)", 4.466442953020135),
 (tensor(0.5523), 'L.A. Confidential (1997)', 4.161616161616162),
 (tensor(0.5477), 'Star Wars (1977)', 4.3584905660377355),
 (tensor(0.5465), 'Shawshank Redemption, The (1994)', 4.445229681978798),
 (tensor(0.5152), 'Rear Window (1954)', 4.3875598086124405),
 (tensor(0.5145), 'Silence of the Lambs, The (1991)', 4.28974358974359),
 (tensor(0.5138), 'Good Will Hunting (1997)', 4.262626262626263),
 (tensor(0.5129), 'As Good As It Gets (1997)', 4.196428571428571),
 (tensor(0.4850), 'Casablanca (1942)', 4.45679012345679),
 (tensor(0.4815), 'Close Shave, A (1995)', 4.491071428571429),
 (tensor(0.4799), 'Boot, Das (1981)', 4.203980099502488),
 (tensor(0.4724), 'Godfather, The (1972)', 4.283292978208232),
 (tensor(0.4709), 'Apt Pupil (1998)', 4.1),
 (tensor(0.4659), 'Usual Suspects, The (1995)', 4.385767790262173)]

查看数据权重

movie_w = learn.weight(top_movies, is_item=True)
movie_w.shape
torch.Size([1000, 40])

数据降维,pca主成分分析

movie_pca = movie_w.pca(3)
movie_pca.shape
torch.Size([1000, 3])
fac0,fac1,fac2 = movie_pca.t()
movie_comp = [(f, i) for f,i in zip(fac0, top_movies)]

排序

fac0排序
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]
[(tensor(1.2644), 'Home Alone 3 (1997)'),
 (tensor(1.2343), 'Jungle2Jungle (1997)'),
 (tensor(1.1393), 'Leave It to Beaver (1997)'),
 (tensor(1.1283), "McHale's Navy (1997)"),
 (tensor(1.0802), 'Bio-Dome (1996)'),
 (tensor(1.0767), 'D3: The Mighty Ducks (1996)'),
 (tensor(1.0556), 'Grease 2 (1982)'),
 (tensor(1.0416), 'Children of the Corn: The Gathering (1996)'),
 (tensor(1.0365), 'Batman & Robin (1997)'),
 (tensor(1.0237), 'Free Willy 3: The Rescue (1997)')]
sorted(movie_comp, key=itemgetter(0))[:10]
[(tensor(-1.0996), 'Casablanca (1942)'),
 (tensor(-1.0553), 'Wrong Trousers, The (1993)'),
 (tensor(-1.0370), 'Lawrence of Arabia (1962)'),
 (tensor(-1.0336), 'When We Were Kings (1996)'),
 (tensor(-1.0198), 'Close Shave, A (1995)'),
 (tensor(-1.0137), 'Shall We Dance? (1996)'),
 (tensor(-1.0088), 'Chinatown (1974)'),
 (tensor(-0.9817), 'Some Folks Call It a Sling Blade (1993)'),
 (tensor(-0.9786),
  'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)'),
 (tensor(-0.9571), 'Citizen Kane (1941)')]
fac1排序
movie_comp = [(f, i) for f,i in zip(fac1, top_movies)]
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]
[(tensor(0.8437), 'Ready to Wear (Pret-A-Porter) (1994)'),
 (tensor(0.7802), 'Keys to Tulsa (1997)'),
 (tensor(0.7405), 'Trainspotting (1996)'),
 (tensor(0.7400), 'Jude (1996)'),
 (tensor(0.7397), 'Big Night (1996)'),
 (tensor(0.7165), 'Brazil (1985)'),
 (tensor(0.7136), 'Exotica (1994)'),
 (tensor(0.7030), 'Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922)'),
 (tensor(0.7022), 'Sweet Hereafter, The (1997)'),
 (tensor(0.6906), 'Cable Guy, The (1996)')]
sorted(movie_comp, key=itemgetter(0))[:10]
[(tensor(-1.1935), 'Braveheart (1995)'),
 (tensor(-1.1555), 'Raiders of the Lost Ark (1981)'),
 (tensor(-1.0948), 'Titanic (1997)'),
 (tensor(-0.8792), 'Independence Day (ID4) (1996)'),
 (tensor(-0.8687), "It's a Wonderful Life (1946)"),
 (tensor(-0.8572), 'Return of the Jedi (1983)'),
 (tensor(-0.8526), 'Hunt for Red October, The (1990)'),
 (tensor(-0.8456), 'Star Wars (1977)'),
 (tensor(-0.8433), 'Forrest Gump (1994)'),
 (tensor(-0.8419), 'Shawshank Redemption, The (1994)')]

绘图

idxs = np.random.choice(len(top_movies), 50, replace=False)
idxs = list(range(50))
X = fac0[idxs]
Y = fac2[idxs]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(top_movies[idxs], X, Y):
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
plt.show()

![电影在fac0与fac2上的散点图(output_61_0.png)](output_61_0.png)

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值