3.亲和性分析推荐电影.ipynb — Affinity analysis (Apriori association rules) for movie recommendation, in Python

{

"cells": [

{

"cell_type": "code",

"execution_count": 1,

"metadata": {},

"outputs": [],

"source": [

"# data \"ml-100k\" from http://grouplens.org/datasets/movielens/"

]

},

{

"cell_type": "code",

"execution_count": 2,

"metadata": {},

"outputs": [],

"source": [

"import os\n",

"data_folder = os.path.join(\".\", \"data\")\n",

"ratings_filename = os.path.join(data_folder, \"u.data\")"

]

},

{

"cell_type": "code",

"execution_count": 3,

"metadata": {},

"outputs": [],

"source": [

"import pandas as pd"

]

},

{

"cell_type": "code",

"execution_count": 4,

"metadata": {},

"outputs": [

{

"data": {

"text/html": [

"

\n",

"

" .dataframe thead tr:only-child th {\n",

" text-align: right;\n",

" }\n",

"\n",

" .dataframe thead th {\n",

" text-align: left;\n",

" }\n",

"\n",

" .dataframe tbody tr th {\n",

" vertical-align: top;\n",

" }\n",

"\n",

"

" \n",

"

\n",

"

\n",

"

UserID\n",

"

MovieID\n",

"

Rating\n",

"

Datetime\n",

"

\n",

"

\n",

"

\n",

"

\n",

"

0\n",

"

196\n",

"

242\n",

"

3\n",

"

1997-12-04 15:55:49\n",

"

\n",

"

\n",

"

1\n",

"

186\n",

"

302\n",

"

3\n",

"

1998-04-04 19:22:22\n",

"

\n",

"

\n",

"

2\n",

"

22\n",

"

377\n",

"

1\n",

"

1997-11-07 07:18:36\n",

"

\n",

"

\n",

"

3\n",

"

244\n",

"

51\n",

"

2\n",

"

1997-11-27 05:02:03\n",

"

\n",

"

\n",

"

4\n",

"

166\n",

"

346\n",

"

1\n",

"

1998-02-02 05:33:16\n",

"

\n",

"

\n",

"

\n",

"

"

],

"text/plain": [

" UserID MovieID Rating Datetime\n",

"0 196 242 3 1997-12-04 15:55:49\n",

"1 186 302 3 1998-04-04 19:22:22\n",

"2 22 377 1 1997-11-07 07:18:36\n",

"3 244 51 2 1997-11-27 05:02:03\n",

"4 166 346 1 1998-02-02 05:33:16"

]

},

"execution_count": 4,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"all_ratings = pd.read_csv(ratings_filename, delimiter=\"\\t\", header=None, names = [\"UserID\", \"MovieID\", \"Rating\", \"Datetime\"])\n",

"all_ratings[\"Datetime\"] = pd.to_datetime(all_ratings['Datetime'],unit='s')\n",

"all_ratings[:5]"

]

},

{

"cell_type": "code",

"execution_count": 5,

"metadata": {},

"outputs": [

{

"data": {

"text/html": [

"

\n",

"

" .dataframe thead tr:only-child th {\n",

" text-align: right;\n",

" }\n",

"\n",

" .dataframe thead th {\n",

" text-align: left;\n",

" }\n",

"\n",

" .dataframe tbody tr th {\n",

" vertical-align: top;\n",

" }\n",

"\n",

"

" \n",

"

\n",

"

\n",

"

UserID\n",

"

MovieID\n",

"

Rating\n",

"

Datetime\n",

"

\n",

"

\n",

"

\n",

"

\n",

"

81098\n",

"

675\n",

"

86\n",

"

4\n",

"

1998-03-10 00:26:14\n",

"

\n",

"

\n",

"

90696\n",

"

675\n",

"

223\n",

"

1\n",

"

1998-03-10 00:35:51\n",

"

\n",

"

\n",

"

92650\n",

"

675\n",

"

235\n",

"

1\n",

"

1998-03-10 00:35:51\n",

"

\n",

"

\n",

"

95459\n",

"

675\n",

"

242\n",

"

4\n",

"

1998-03-10 00:08:42\n",

"

\n",

"

\n",

"

82845\n",

"

675\n",

"

244\n",

"

3\n",

"

1998-03-10 00:29:35\n",

"

\n",

"

\n",

"

53293\n",

"

675\n",

"

258\n",

"

3\n",

"

1998-03-10 00:11:19\n",

"

\n",

"

\n",

"

97286\n",

"

675\n",

"

269\n",

"

5\n",

"

1998-03-10 00:08:07\n",

"

\n",

"

\n",

"

93720\n",

"

675\n",

"

272\n",

"

3\n",

"

1998-03-10 00:07:11\n",

"

\n",

"

\n",

"

73389\n",

"

675\n",

"

286\n",

"

4\n",

"

1998-03-10 00:07:11\n",

"

\n",

"

\n",

"

77524\n",

"

675\n",

"

303\n",

"

5\n",

"

1998-03-10 00:08:42\n",

"

\n",

"

\n",

"

47367\n",

"

675\n",

"

305\n",

"

4\n",

"

1998-03-10 00:09:08\n",

"

\n",

"

\n",

"

44300\n",

"

675\n",

"

306\n",

"

5\n",

"

1998-03-10 00:08:07\n",

"

\n",

"

\n",

"

53730\n",

"

675\n",

"

311\n",

"

3\n",

"

1998-03-10 00:10:47\n",

"

\n",

"

\n",

"

54284\n",

"

675\n",

"

312\n",

"

2\n",

"

1998-03-10 00:10:24\n",

"

\n",

"

\n",

"

63291\n",

"

675\n",

"

318\n",

"

5\n",

"

1998-03-10 00:21:13\n",

"

\n",

"

\n",

"

87082\n",

"

675\n",

"

321\n",

"

2\n",

"

1998-03-10 00:11:48\n",

"

\n",

"

\n",

"

56108\n",

"

675\n",

"

344\n",

"

4\n",

"

1998-03-10 00:12:34\n",

"

\n",

"

\n",

"

53046\n",

"

675\n",

"

347\n",

"

4\n",

"

1998-03-10 00:07:11\n",

"

\n",

"

\n",

"

94617\n",

"

675\n",

"

427\n",

"

5\n",

"

1998-03-10 00:28:11\n",

"

\n",

"

\n",

"

69915\n",

"

675\n",

"

463\n",

"

5\n",

"

1998-03-10 00:16:43\n",

"

\n",

"

\n",

"

46744\n",

"

675\n",

"

509\n",

"

5\n",

"

1998-03-10 00:24:25\n",

"

\n",

"

\n",

"

46598\n",

"

675\n",

"

531\n",

"

5\n",

"

1998-03-10 00:18:28\n",

"

\n",

"

\n",

"

52962\n",

"

675\n",

"

650\n",

"

5\n",

"

1998-03-10 00:32:51\n",

"

\n",

"

\n",

"

94029\n",

"

675\n",

"

750\n",

"

4\n",

"

1998-03-10 00:08:07\n",

"

\n",

"

\n",

"

53223\n",

"

675\n",

"

874\n",

"

4\n",

"

1998-03-10 00:11:19\n",

"

\n",

"

\n",

"

62277\n",

"

675\n",

"

891\n",

"

2\n",

"

1998-03-10 00:12:59\n",

"

\n",

"

\n",

"

77274\n",

"

675\n",

"

896\n",

"

5\n",

"

1998-03-10 00:09:35\n",

"

\n",

"

\n",

"

66194\n",

"

675\n",

"

900\n",

"

4\n",

"

1998-03-10 00:10:24\n",

"

\n",

"

\n",

"

54994\n",

"

675\n",

"

937\n",

"

1\n",

"

1998-03-10 00:35:51\n",

"

\n",

"

\n",

"

61742\n",

"

675\n",

"

1007\n",

"

4\n",

"

1998-03-10 00:25:22\n",

"

\n",

"

\n",

"

49225\n",

"

675\n",

"

1101\n",

"

4\n",

"

1998-03-10 00:33:49\n",

"

\n",

"

\n",

"

50692\n",

"

675\n",

"

1255\n",

"

1\n",

"

1998-03-10 00:35:51\n",

"

\n",

"

\n",

"

74202\n",

"

675\n",

"

1628\n",

"

5\n",

"

1998-03-10 00:30:37\n",

"

\n",

"

\n",

"

47866\n",

"

675\n",

"

1653\n",

"

5\n",

"

1998-03-10 00:31:53\n",

"

\n",

"

\n",

"

\n",

"

"

],

"text/plain": [

" UserID MovieID Rating Datetime\n",

"81098 675 86 4 1998-03-10 00:26:14\n",

"90696 675 223 1 1998-03-10 00:35:51\n",

"92650 675 235 1 1998-03-10 00:35:51\n",

"95459 675 242 4 1998-03-10 00:08:42\n",

"82845 675 244 3 1998-03-10 00:29:35\n",

"53293 675 258 3 1998-03-10 00:11:19\n",

"97286 675 269 5 1998-03-10 00:08:07\n",

"93720 675 272 3 1998-03-10 00:07:11\n",

"73389 675 286 4 1998-03-10 00:07:11\n",

"77524 675 303 5 1998-03-10 00:08:42\n",

"47367 675 305 4 1998-03-10 00:09:08\n",

"44300 675 306 5 1998-03-10 00:08:07\n",

"53730 675 311 3 1998-03-10 00:10:47\n",

"54284 675 312 2 1998-03-10 00:10:24\n",

"63291 675 318 5 1998-03-10 00:21:13\n",

"87082 675 321 2 1998-03-10 00:11:48\n",

"56108 675 344 4 1998-03-10 00:12:34\n",

"53046 675 347 4 1998-03-10 00:07:11\n",

"94617 675 427 5 1998-03-10 00:28:11\n",

"69915 675 463 5 1998-03-10 00:16:43\n",

"46744 675 509 5 1998-03-10 00:24:25\n",

"46598 675 531 5 1998-03-10 00:18:28\n",

"52962 675 650 5 1998-03-10 00:32:51\n",

"94029 675 750 4 1998-03-10 00:08:07\n",

"53223 675 874 4 1998-03-10 00:11:19\n",

"62277 675 891 2 1998-03-10 00:12:59\n",

"77274 675 896 5 1998-03-10 00:09:35\n",

"66194 675 900 4 1998-03-10 00:10:24\n",

"54994 675 937 1 1998-03-10 00:35:51\n",

"61742 675 1007 4 1998-03-10 00:25:22\n",

"49225 675 1101 4 1998-03-10 00:33:49\n",

"50692 675 1255 1 1998-03-10 00:35:51\n",

"74202 675 1628 5 1998-03-10 00:30:37\n",

"47866 675 1653 5 1998-03-10 00:31:53"

]

},

"execution_count": 5,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"# As you can see, there are no review for most movies, such as #213\n",

"all_ratings[all_ratings[\"UserID\"] == 675].sort_values(\"MovieID\") "

]

},

{

"cell_type": "code",

"execution_count": 6,

"metadata": {},

"outputs": [

{

"data": {

"text/html": [

"

\n",

"

" .dataframe thead tr:only-child th {\n",

" text-align: right;\n",

" }\n",

"\n",

" .dataframe thead th {\n",

" text-align: left;\n",

" }\n",

"\n",

" .dataframe tbody tr th {\n",

" vertical-align: top;\n",

" }\n",

"\n",

"

" \n",

"

\n",

"

\n",

"

UserID\n",

"

MovieID\n",

"

Rating\n",

"

Datetime\n",

"

Favorable\n",

"

\n",

"

\n",

"

\n",

"

\n",

"

10\n",

"

62\n",

"

257\n",

"

2\n",

"

1997-11-12 22:07:14\n",

"

False\n",

"

\n",

"

\n",

"

11\n",

"

286\n",

"

1014\n",

"

5\n",

"

1997-11-17 15:38:45\n",

"

True\n",

"

\n",

"

\n",

"

12\n",

"

200\n",

"

222\n",

"

5\n",

"

1997-10-05 09:05:40\n",

"

True\n",

"

\n",

"

\n",

"

13\n",

"

210\n",

"

40\n",

"

3\n",

"

1998-03-27 21:59:54\n",

"

False\n",

"

\n",

"

\n",

"

14\n",

"

224\n",

"

29\n",

"

3\n",

"

1998-02-21 23:40:57\n",

"

False\n",

"

\n",

"

\n",

"

\n",

"

"

],

"text/plain": [

" UserID MovieID Rating Datetime Favorable\n",

"10 62 257 2 1997-11-12 22:07:14 False\n",

"11 286 1014 5 1997-11-17 15:38:45 True\n",

"12 200 222 5 1997-10-05 09:05:40 True\n",

"13 210 40 3 1998-03-27 21:59:54 False\n",

"14 224 29 3 1998-02-21 23:40:57 False"

]

},

"execution_count": 6,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"# Not all reviews are favourable! Our goal is \"other recommended books\", so we only want favourable reviews\n",

"all_ratings[\"Favorable\"] = all_ratings[\"Rating\"] > 3\n",

"all_ratings[10:15]"

]

},

{

"cell_type": "code",

"execution_count": 7,

"metadata": {},

"outputs": [

{

"data": {

"text/html": [

"

\n",

"

" .dataframe thead tr:only-child th {\n",

" text-align: right;\n",

" }\n",

"\n",

" .dataframe thead th {\n",

" text-align: left;\n",

" }\n",

"\n",

" .dataframe tbody tr th {\n",

" vertical-align: top;\n",

" }\n",

"\n",

"

" \n",

"

\n",

"

\n",

"

UserID\n",

"

MovieID\n",

"

Rating\n",

"

Datetime\n",

"

Favorable\n",

"

\n",

"

\n",

"

\n",

"

\n",

"

202\n",

"

1\n",

"

61\n",

"

4\n",

"

1997-11-03 07:33:40\n",

"

True\n",

"

\n",

"

\n",

"

305\n",

"

1\n",

"

189\n",

"

3\n",

"

1998-03-01 06:15:28\n",

"

False\n",

"

\n",

"

\n",

"

333\n",

"

1\n",

"

33\n",

"

4\n",

"

1997-11-03 07:38:19\n",

"

True\n",

"

\n",

"

\n",

"

334\n",

"

1\n",

"

160\n",

"

4\n",

"

1997-09-24 03:42:27\n",

"

True\n",

"

\n",

"

\n",

"

478\n",

"

1\n",

"

20\n",

"

4\n",

"

1998-02-14 04:51:23\n",

"

True\n",

"

\n",

"

\n",

"

\n",

"

"

],

"text/plain": [

" UserID MovieID Rating Datetime Favorable\n",

"202 1 61 4 1997-11-03 07:33:40 True\n",

"305 1 189 3 1998-03-01 06:15:28 False\n",

"333 1 33 4 1997-11-03 07:38:19 True\n",

"334 1 160 4 1997-09-24 03:42:27 True\n",

"478 1 20 4 1998-02-14 04:51:23 True"

]

},

"execution_count": 7,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"all_ratings[all_ratings[\"UserID\"] == 1][:5]"

]

},

{

"cell_type": "code",

"execution_count": 8,

"metadata": {},

"outputs": [],

"source": [

"# Sample the dataset. You can try increasing the size of the sample, but the run time will be considerably longer\n",

"ratings = all_ratings[all_ratings['UserID'].isin(range(200))] # & ratings[\"UserID\"].isin(range(100))]"

]

},

{

"cell_type": "code",

"execution_count": 9,

"metadata": {},

"outputs": [

{

"data": {

"text/html": [

"

\n",

"

" .dataframe thead tr:only-child th {\n",

" text-align: right;\n",

" }\n",

"\n",

" .dataframe thead th {\n",

" text-align: left;\n",

" }\n",

"\n",

" .dataframe tbody tr th {\n",

" vertical-align: top;\n",

" }\n",

"\n",

"

" \n",

"

\n",

"

\n",

"

UserID\n",

"

MovieID\n",

"

Rating\n",

"

Datetime\n",

"

Favorable\n",

"

\n",

"

\n",

"

\n",

"

\n",

"

16\n",

"

122\n",

"

387\n",

"

5\n",

"

1997-11-11 17:47:39\n",

"

True\n",

"

\n",

"

\n",

"

20\n",

"

119\n",

"

392\n",

"

4\n",

"

1998-01-30 16:13:34\n",

"

True\n",

"

\n",

"

\n",

"

21\n",

"

167\n",

"

486\n",

"

4\n",

"

1998-04-16 14:54:12\n",

"

True\n",

"

\n",

"

\n",

"

26\n",

"

38\n",

"

95\n",

"

5\n",

"

1998-04-13 01:14:54\n",

"

True\n",

"

\n",

"

\n",

"

28\n",

"

63\n",

"

277\n",

"

4\n",

"

1997-10-01 23:10:01\n",

"

True\n",

"

\n",

"

\n",

"

\n",

"

"

],

"text/plain": [

" UserID MovieID Rating Datetime Favorable\n",

"16 122 387 5 1997-11-11 17:47:39 True\n",

"20 119 392 4 1998-01-30 16:13:34 True\n",

"21 167 486 4 1998-04-16 14:54:12 True\n",

"26 38 95 5 1998-04-13 01:14:54 True\n",

"28 63 277 4 1997-10-01 23:10:01 True"

]

},

"execution_count": 9,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"# We start by creating a dataset of each user's favourable reviews\n",

"favorable_ratings = ratings[ratings[\"Favorable\"]]\n",

"favorable_ratings[:5]"

]

},

{

"cell_type": "code",

"execution_count": 10,

"metadata": {},

"outputs": [

{

"data": {

"text/plain": [

"199"

]

},

"execution_count": 10,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"# We are only interested in the reviewers who have more than one review\n",

"favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby(\"UserID\")[\"MovieID\"])\n",

"len(favorable_reviews_by_users)"

]

},

{

"cell_type": "code",

"execution_count": 11,

"metadata": {},

"outputs": [

{

"data": {

"text/html": [

"

\n",

"

" .dataframe thead tr:only-child th {\n",

" text-align: right;\n",

" }\n",

"\n",

" .dataframe thead th {\n",

" text-align: left;\n",

" }\n",

"\n",

" .dataframe tbody tr th {\n",

" vertical-align: top;\n",

" }\n",

"\n",

"

" \n",

"

\n",

"

\n",

"

Favorable\n",

"

\n",

"

\n",

"

MovieID\n",

"

\n",

"

\n",

"

\n",

"

\n",

"

\n",

"

50\n",

"

100.0\n",

"

\n",

"

\n",

"

100\n",

"

89.0\n",

"

\n",

"

\n",

"

258\n",

"

83.0\n",

"

\n",

"

\n",

"

181\n",

"

79.0\n",

"

\n",

"

\n",

"

174\n",

"

74.0\n",

"

\n",

"

\n",

"

\n",

"

"

],

"text/plain": [

" Favorable\n",

"MovieID \n",

"50 100.0\n",

"100 89.0\n",

"258 83.0\n",

"181 79.0\n",

"174 74.0"

]

},

"execution_count": 11,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"# Find out how many movies have favourable ratings\n",

"num_favorable_by_movie = ratings[[\"MovieID\", \"Favorable\"]].groupby(\"MovieID\").sum()\n",

"num_favorable_by_movie.sort_values(\"Favorable\", ascending=False)[:5]"

]

},

{

"cell_type": "code",

"execution_count": 12,

"metadata": {},

"outputs": [],

"source": [

from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    """One Apriori extension step: grow frequent itemsets by one movie.

    For every user, each frequent (k-1)-itemset contained in that user's
    favorable-review set is extended by each additional movie the user also
    reviewed favorably, and the resulting k-itemset's count is incremented.

    Parameters
    ----------
    favorable_reviews_by_users : dict
        Maps each user ID to a frozenset of favorably reviewed movie IDs.
    k_1_itemsets : iterable of frozenset
        The frequent itemsets of length k-1.
    min_support : int
        Minimum number of supporting users for an itemset to be kept
        (inclusive: counts >= min_support survive).

    Returns
    -------
    dict
        Maps each frequent k-itemset (frozenset) to its support count.
    """
    candidate_counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # Lazily enumerate every k-superset this user supports.
        supersets = (
            itemset | frozenset((extra,))
            for itemset in k_1_itemsets
            if itemset.issubset(reviews)
            for extra in reviews - itemset
        )
        for superset in supersets:
            candidate_counts[superset] += 1
    return {itemset: count
            for itemset, count in candidate_counts.items()
            if count >= min_support}

]

},

{

"cell_type": "code",

"execution_count": 13,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"There are 16 movies with more than 50 favorable reviews\n",

"I found 93 frequent itemsets of length 2\n",

"I found 295 frequent itemsets of length 3\n",

"I found 593 frequent itemsets of length 4\n",

"I found 785 frequent itemsets of length 5\n",

"I found 677 frequent itemsets of length 6\n",

"I found 373 frequent itemsets of length 7\n",

"I found 126 frequent itemsets of length 8\n",

"I found 24 frequent itemsets of length 9\n",

"I found 2 frequent itemsets of length 10\n",

"Did not find any frequent itemsets of length 11\n"

]

}

],

"source": [

"import sys\n",

"frequent_itemsets = {} # itemsets are sorted by length\n",

"min_support = 50\n",

"\n",

"# k=1 candidates are the isbns with more than min_support favourable reviews\n",

"frequent_itemsets[1] = dict((frozenset((movie_id,)), row[\"Favorable\"])\n",

" for movie_id, row in num_favorable_by_movie.iterrows()\n",

" if row[\"Favorable\"] > min_support)\n",

"\n",

"print(\"There are {} movies with more than {} favorable reviews\".format(len(frequent_itemsets[1]), min_support))\n",

"sys.stdout.flush()\n",

"for k in range(2, 20):\n",

" # Generate candidates of length k, using the frequent itemsets of length k-1\n",

" # Only store the frequent itemsets\n",

" cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],\n",

" min_support)\n",

" if len(cur_frequent_itemsets) == 0:\n",

" print(\"Did not find any frequent itemsets of length {}\".format(k))\n",

" sys.stdout.flush()\n",

" break\n",

" else:\n",

" print(\"I found {} frequent itemsets of length {}\".format(len(cur_frequent_itemsets), k))\n",

" #print(cur_frequent_itemsets)\n",

" sys.stdout.flush()\n",

" frequent_itemsets[k] = cur_frequent_itemsets\n",

"# We aren't interested in the itemsets of length 1, so remove those\n",

"del frequent_itemsets[1]"

]

},

{

"cell_type": "code",

"execution_count": 14,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"Found a total of 2968 frequent itemsets\n"

]

}

],

"source": [

"print(\"Found a total of {0} frequent itemsets\".format(sum(len(itemsets) for itemsets in frequent_itemsets.values())))"

]

},

{

"cell_type": "code",

"execution_count": 15,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"There are 15285 candidate rules\n"

]

}

],

"source": [

"# Now we create the association rules. First, they are candidates until the confidence has been tested\n",

"candidate_rules = []\n",

"for itemset_length, itemset_counts in frequent_itemsets.items():\n",

" for itemset in itemset_counts.keys():\n",

" for conclusion in itemset:\n",

" premise = itemset - set((conclusion,))\n",

" candidate_rules.append((premise, conclusion))\n",

"print(\"There are {} candidate rules\".format(len(candidate_rules)))"

]

},

{

"cell_type": "code",

"execution_count": 16,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"[(frozenset({79}), 258), (frozenset({258}), 79), (frozenset({50}), 64), (frozenset({64}), 50), (frozenset({127}), 181)]\n"

]

}

],

"source": [

"print(candidate_rules[:5])"

]

},

{

"cell_type": "code",

"execution_count": 17,

"metadata": {},

"outputs": [],

"source": [

"# Now, we compute the confidence of each of these rules. This is very similar to what we did in chapter 1\n",

"correct_counts = defaultdict(int)\n",

"incorrect_counts = defaultdict(int)\n",

"for user, reviews in favorable_reviews_by_users.items():\n",

" for candidate_rule in candidate_rules:\n",

" premise, conclusion = candidate_rule\n",

" if premise.issubset(reviews):\n",

" if conclusion in reviews:\n",

" correct_counts[candidate_rule] += 1\n",

" else:\n",

" incorrect_counts[candidate_rule] += 1\n",

"rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])\n",

" for candidate_rule in candidate_rules}"

]

},

{

"cell_type": "code",

"execution_count": 18,

"metadata": {},

"outputs": [],

"source": [

"# Choose only rules above a minimum confidence level\n",

"min_confidence = 0.9"

]

},

{

"cell_type": "code",

"execution_count": 19,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"5152\n"

]

}

],

"source": [

"# Filter out the rules with poor confidence\n",

"rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items() if confidence > min_confidence}\n",

"print(len(rule_confidence))"

]

},

{

"cell_type": "code",

"execution_count": 20,

"metadata": {},

"outputs": [],

"source": [

"from operator import itemgetter\n",

"sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)"

]

},

{

"cell_type": "code",

"execution_count": 21,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"Rule #1\n",

"Rule: 评论了 frozenset({64, 98, 56, 50, 7}) 的人,他也会评论 174\n",

" - 置信度Confidence: 1.000\n",

"\n",

"Rule #2\n",

"Rule: 评论了 frozenset({98, 100, 172, 79, 50, 56}) 的人,他也会评论 7\n",

" - 置信度Confidence: 1.000\n",

"\n",

"Rule #3\n",

"Rule: 评论了 frozenset({98, 172, 181, 174, 7}) 的人,他也会评论 50\n",

" - 置信度Confidence: 1.000\n",

"\n",

"Rule #4\n",

"Rule: 评论了 frozenset({64, 98, 100, 7, 172, 50}) 的人,他也会评论 174\n",

" - 置信度Confidence: 1.000\n",

"\n",

"Rule #5\n",

"Rule: 评论了 frozenset({64, 1, 7, 172, 79, 50}) 的人,他也会评论 181\n",

" - 置信度Confidence: 1.000\n",

"\n"

]

}

],

"source": [

"for index in range(5):\n",

" print(\"Rule #{0}\".format(index + 1))\n",

" (premise, conclusion) = sorted_confidence[index][0]\n",

" print(\"Rule: 评论了 {0} 的人,他也会评论 {1}\".format(premise, conclusion))\n",

" print(\" - 置信度Confidence: {0:.3f}\".format(rule_confidence[(premise, conclusion)]))\n",

" print(\"\")"

]

},

{

"cell_type": "code",

"execution_count": 22,

"metadata": {},

"outputs": [],

"source": [

"# Even better, we can get the movie titles themselves from the dataset\n",

"movie_name_filename = os.path.join(data_folder, \"u.item\")\n",

"movie_name_data = pd.read_csv(movie_name_filename, delimiter=\"|\", header=None, encoding = \"mac-roman\")\n",

"movie_name_data.columns = [\"MovieID\", \"Title\", \"Release Date\", \"Video Release\", \"IMDB\", \"\", \"Action\", \"Adventure\",\n",

" \"Animation\", \"Children's\", \"Comedy\", \"Crime\", \"Documentary\", \"Drama\", \"Fantasy\", \"Film-Noir\",\n",

" \"Horror\", \"Musical\", \"Mystery\", \"Romance\", \"Sci-Fi\", \"Thriller\", \"War\", \"Western\"]"

]

},

{

"cell_type": "code",

"execution_count": 23,

"metadata": {},

"outputs": [],

"source": [

def get_movie_name(movie_id, movie_data=None):
    """Return the title of the movie with the given MovieID.

    Parameters
    ----------
    movie_id : int
        The MovieID to look up.
    movie_data : pandas.DataFrame, optional
        A frame with "MovieID" and "Title" columns. Defaults to the
        module-level ``movie_name_data`` table loaded above, so existing
        single-argument calls behave exactly as before.

    Returns
    -------
    str
        The movie title.

    Raises
    ------
    KeyError
        If no row matches ``movie_id``. (The original code indexed
        ``.values[0]`` unconditionally, raising an uninformative
        IndexError on unknown IDs.)
    """
    if movie_data is None:
        movie_data = movie_name_data
    title_object = movie_data[movie_data["MovieID"] == movie_id]["Title"]
    if title_object.empty:
        raise KeyError("No movie found with MovieID {}".format(movie_id))
    return title_object.values[0]

]

},

{

"cell_type": "code",

"execution_count": 24,

"metadata": {},

"outputs": [

{

"data": {

"text/plain": [

"'Get Shorty (1995)'"

]

},

"execution_count": 24,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"get_movie_name(4)"

]

},

{

"cell_type": "code",

"execution_count": 25,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"Rule #1\n",

"Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Pulp Fiction (1994), Star Wars (1977), Twelve Monkeys (1995) 的人,他也会评论 Raiders of the Lost Ark (1981)\n",

" - 置信度Confidence: 1.000\n",

"\n",

"Rule #2\n",

"Rule: 评论了 Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977), Pulp Fiction (1994) 的人,他也会评论 Twelve Monkeys (1995)\n",

" - 置信度Confidence: 1.000\n",

"\n",

"Rule #3\n",

"Rule: 评论了 Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) 的人,他也会评论 Star Wars (1977)\n",

" - 置信度Confidence: 1.000\n",

"\n",

"Rule #4\n",

"Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Fargo (1996), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Star Wars (1977) 的人,他也会评论 Raiders of the Lost Ark (1981)\n",

" - 置信度Confidence: 1.000\n",

"\n",

"Rule #5\n",

"Rule: 评论了 Shawshank Redemption, The (1994), Toy Story (1995), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977) 的人,他也会评论 Return of the Jedi (1983)\n",

" - 置信度Confidence: 1.000\n",

"\n"

]

}

],

"source": [

"for index in range(5):\n",

" print(\"Rule #{0}\".format(index + 1))\n",

" (premise, conclusion) = sorted_confidence[index][0]\n",

" premise_names = \", \".join(get_movie_name(idx) for idx in premise)\n",

" conclusion_name = get_movie_name(conclusion)\n",

" print(\"Rule: 评论了 {0} 的人,他也会评论 {1}\".format(premise_names, conclusion_name))\n",

" print(\" - 置信度Confidence: {0:.3f}\".format(rule_confidence[(premise, conclusion)]))\n",

" print(\"\")"

]

},

{

"cell_type": "code",

"execution_count": 26,

"metadata": {},

"outputs": [],

"source": [

"# Evaluation using test data\n",

"test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]\n",

"test_favorable = test_dataset[test_dataset[\"Favorable\"]]\n",

"#test_not_favourable = test_dataset[~test_dataset[\"Favourable\"]]\n",

"test_favorable_by_users = dict((k, frozenset(v.values)) for k, v in test_favorable.groupby(\"UserID\")[\"MovieID\"])\n",

"#test_not_favourable_by_users = dict((k, frozenset(v.values)) for k, v in test_not_favourable.groupby(\"UserID\")[\"MovieID\"])\n",

"#test_users = test_dataset[\"UserID\"].unique()"

]

},

{

"cell_type": "code",

"execution_count": 27,

"metadata": {},

"outputs": [

{

"data": {

"text/html": [

"

\n",

"

" .dataframe thead tr:only-child th {\n",

" text-align: right;\n",

" }\n",

"\n",

" .dataframe thead th {\n",

" text-align: left;\n",

" }\n",

"\n",

" .dataframe tbody tr th {\n",

" vertical-align: top;\n",

" }\n",

"\n",

"

" \n",

"

\n",

"

\n",

"

UserID\n",

"

MovieID\n",

"

Rating\n",

"

Datetime\n",

"

Favorable\n",

"

\n",

"

\n",

"

\n",

"

\n",

"

3\n",

"

244\n",

"

51\n",

"

2\n",

"

1997-11-27 05:02:03\n",

"

False\n",

"

\n",

"

\n",

"

5\n",

"

298\n",

"

474\n",

"

4\n",

"

1998-01-07 14:20:06\n",

"

True\n",

"

\n",

"

\n",

"

7\n",

"

253\n",

"

465\n",

"

5\n",

"

1998-04-03 18:34:27\n",

"

True\n",

"

\n",

"

\n",

"

8\n",

"

305\n",

"

451\n",

"

3\n",

"

1998-02-01 09:20:17\n",

"

False\n",

"

\n",

"

\n",

"

11\n",

"

286\n",

"

1014\n",

"

5\n",

"

1997-11-17 15:38:45\n",

"

True\n",

"

\n",

"

\n",

"

\n",

"

"

],

"text/plain": [

" UserID MovieID Rating Datetime Favorable\n",

"3 244 51 2 1997-11-27 05:02:03 False\n",

"5 298 474 4 1998-01-07 14:20:06 True\n",

"7 253 465 5 1998-04-03 18:34:27 True\n",

"8 305 451 3 1998-02-01 09:20:17 False\n",

"11 286 1014 5 1997-11-17 15:38:45 True"

]

},

"execution_count": 27,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"test_dataset[:5]"

]

},

{

"cell_type": "code",

"execution_count": 28,

"metadata": {},

"outputs": [],

"source": [

"correct_counts = defaultdict(int)\n",

"incorrect_counts = defaultdict(int)\n",

"for user, reviews in test_favorable_by_users.items():\n",

" for candidate_rule in candidate_rules:\n",

" premise, conclusion = candidate_rule\n",

" if premise.issubset(reviews):\n",

" if conclusion in reviews:\n",

" correct_counts[candidate_rule] += 1\n",

" else:\n",

" incorrect_counts[candidate_rule] += 1"

]

},

{

"cell_type": "code",

"execution_count": 29,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"5152\n"

]

}

],

"source": [

"# Confidence of each surviving rule on the held-out users. Unlike the\n",
"# training pass, a rule's premise may never occur among test users, making\n",
"# the denominator zero; guard with max(1, ...) so such rules get confidence\n",
"# 0.0 instead of raising ZeroDivisionError (correct_counts is 0 whenever the\n",
"# total is 0, so the guarded ratio is exact for every covered rule).\n",
"test_confidence = {candidate_rule: correct_counts[candidate_rule] / float(max(1, correct_counts[candidate_rule] + incorrect_counts[candidate_rule]))\n",

"                   for candidate_rule in rule_confidence}\n",

"print(len(test_confidence))"

]

},

{

"cell_type": "code",

"execution_count": 30,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"[((frozenset({64, 1, 7, 172, 79, 50}), 174), 1.0), ((frozenset({64, 258, 98, 7, 174, 181}), 172), 1.0), ((frozenset({64, 1, 98, 7, 79, 181, 56}), 174), 1.0), ((frozenset({64, 1, 98, 7, 172, 79, 181}), 174), 1.0), ((frozenset({64, 258, 98, 7, 174, 50, 181}), 172), 1.0)]\n"

]

}

],

"source": [

"sorted_test_confidence = sorted(test_confidence.items(), key=itemgetter(1), reverse=True)\n",

"print(sorted_test_confidence[:5])"

]

},

{

"cell_type": "code",

"execution_count": 31,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"Rule #1\n",

"Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Pulp Fiction (1994), Star Wars (1977), Twelve Monkeys (1995) 的人,他也会评论 Raiders of the Lost Ark (1981)\n",

" - 训练集上的置信度: 1.000\n",

" - 测试集上的置信度: 0.909\n",

"\n",

"Rule #2\n",

"Rule: 评论了 Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977), Pulp Fiction (1994) 的人,他也会评论 Twelve Monkeys (1995)\n",

" - 训练集上的置信度: 1.000\n",

" - 测试集上的置信度: 0.609\n",

"\n",

"Rule #3\n",

"Rule: 评论了 Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) 的人,他也会评论 Star Wars (1977)\n",

" - 训练集上的置信度: 1.000\n",

" - 测试集上的置信度: 0.946\n",

"\n",

"Rule #4\n",

"Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Fargo (1996), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Star Wars (1977) 的人,他也会评论 Raiders of the Lost Ark (1981)\n",

" - 训练集上的置信度: 1.000\n",

" - 测试集上的置信度: 0.971\n",

"\n",

"Rule #5\n",

"Rule: 评论了 Shawshank Redemption, The (1994), Toy Story (1995), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977) 的人,他也会评论 Return of the Jedi (1983)\n",

" - 训练集上的置信度: 1.000\n",

" - 测试集上的置信度: 0.900\n",

"\n",

"Rule #6\n",

"Rule: 评论了 Toy Story (1995), Silence of the Lambs, The (1991), Fargo (1996), Raiders of the Lost Ark (1981), Godfather, The (1972) 的人,他也会评论 Pulp Fiction (1994)\n",

" - 训练集上的置信度: 1.000\n",

" - 测试集上的置信度: 0.750\n",

"\n",

"Rule #7\n",

"Rule: 评论了 Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Godfather, The (1972), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) 的人,他也会评论 Shawshank Redemption, The (1994)\n",

" - 训练集上的置信度: 1.000\n",

" - 测试集上的置信度: 0.854\n",

"\n",

"Rule #8\n",

"Rule: 评论了 Pulp Fiction (1994), Toy Story (1995), Shawshank Redemption, The (1994), Godfather, The (1972) 的人,他也会评论 Silence of the Lambs, The (1991)\n",

" - 训练集上的置信度: 1.000\n",

" - 测试集上的置信度: 0.870\n",

"\n",

"Rule #9\n",

"Rule: 评论了 Shawshank Redemption, The (1994), Fargo (1996), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Fugitive, The (1993) 的人,他也会评论 Pulp Fiction (1994)\n",

" - 训练集上的置信度: 1.000\n",

" - 测试集上的置信度: 0.756\n",

"\n",

"Rule #10\n",

"Rule: 评论了 Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Raiders of the Lost Ark (1981), Fugitive, The (1993), Star Wars (1977), Return of the Jedi (1983) 的人,他也会评论 Pulp Fiction (1994)\n",

" - 训练集上的置信度: 1.000\n",

" - 测试集上的置信度: 0.756\n",

"\n"

]

}

],

"source": [

"for index in range(10):\n",

" print(\"Rule #{0}\".format(index + 1))\n",

" (premise, conclusion) = sorted_confidence[index][0]\n",

" premise_names = \", \".join(get_movie_name(idx) for idx in premise)\n",

" conclusion_name = get_movie_name(conclusion)\n",

" print(\"Rule: 评论了 {0} 的人,他也会评论 {1}\".format(premise_names, conclusion_name))\n",

" print(\" - 训练集上的置信度: {0:.3f}\".format(rule_confidence.get((premise, conclusion), -1)))\n",

" print(\" - 测试集上的置信度: {0:.3f}\".format(test_confidence.get((premise, conclusion), -1)))\n",

" print(\"\")"

]

},

{

"cell_type": "code",

"execution_count": null,

"metadata": {},

"outputs": [],

"source": []

},

{

"cell_type": "code",

"execution_count": null,

"metadata": {},

"outputs": [],

"source": []

}

],

"metadata": {

"kernelspec": {

"display_name": "Python 3",

"language": "python",

"name": "python3"

},

"language_info": {

"codemirror_mode": {

"name": "ipython",

"version": 3

},

"file_extension": ".py",

"mimetype": "text/x-python",

"name": "python",

"nbconvert_exporter": "python",

"pygments_lexer": "ipython3",

"version": "3.5.2"

},

"toc": {

"colors": {

"hover_highlight": "#DAA520",

"navigate_num": "#000000",

"navigate_text": "#333333",

"running_highlight": "#FF0000",

"selected_highlight": "#FFD700",

"sidebar_border": "#EEEEEE",

"wrapper_background": "#FFFFFF"

},

"moveMenuLeft": true,

"nav_menu": {

"height": "12px",

"width": "252px"

},

"navigate_menu": true,

"number_sections": true,

"sideBar": true,

"threshold": 4,

"toc_cell": false,

"toc_section_display": "block",

"toc_window_display": false,

"widenNotebook": false

}

},

"nbformat": 4,

"nbformat_minor": 2

}

一键复制

编辑

Web IDE

原始数据

按行查看

历史

深度学习是机器学习的一个子领域,它基于人工神经网络的研究,特别是利用多层次的神经网络来进行学习和模式识别。深度学习模型能够学习数据的高层次特征,这些特征对于图像和语音识别、自然语言处理、医学图像分析等应用至关重要。以下是深度学习的一些关键概念和组成部分: 1. **神经网络(Neural Networks)**:深度学习的基础是人工神经网络,它是由多个层组成的网络结构,包括输入层、隐藏层和输出层。每个层由多个神经元组成,神经元之间通过权重连接。 2. **前馈神经网络(Feedforward Neural Networks)**:这是最常见的神经网络类型,信息从输入层流向隐藏层,最终到达输出层。 3. **卷积神经网络(Convolutional Neural Networks, CNNs)**:这种网络特别适合处理具有网格结构的数据,如图像。它们使用卷积层来提取图像的特征。 4. **循环神经网络(Recurrent Neural Networks, RNNs)**:这种网络能够处理序列数据,如时间序列或自然语言,因为它们具有记忆功能,能够捕捉数据中的时间依赖性。 5. **长短期记忆网络(Long Short-Term Memory, LSTM)**:LSTM 是一种特殊的 RNN,它能够学习长期依赖关系,非常适合复杂的序列预测任务。 6. **生成对抗网络(Generative Adversarial Networks, GANs)**:由两个网络组成,一个生成器和一个判别器,它们相互竞争,生成器生成数据,判别器评估数据的真实性。 7. **深度学习框架**:如 TensorFlow、Keras、PyTorch 等,这些框架提供了构建、训练和部署深度学习模型的工具和库。 8. **激活函数(Activation Functions)**:如 ReLU、Sigmoid、Tanh 等,它们在神经网络中用于添加非线性,使得网络能够学习复杂的函数。 9. **损失函数(Loss Functions)**:用于评估模型的预测与真实值之间的差异,常见的损失函数包括均方误差(MSE)、交叉熵(Cross-Entropy)等。 10. **优化算法(Optimization Algorithms)**:如梯度下降(Gradient Descent)、随机梯度下降(SGD)、Adam 等,用于更新网络权重,以最小化损失函数。 11. **正则化(Regularization)**:技术如 Dropout、L1/L2 正则化等,用于防止模型过拟合。 12. **迁移学习(Transfer Learning)**:利用在一个任务上训练好的模型来提高另一个相关任务的性能。 深度学习在许多领域都取得了显著的成就,但它也面临着一些挑战,如对大量数据的依赖、模型的解释性差、计算资源消耗大等。研究人员正在不断探索新的方法来解决这些问题。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值