//使用KNN推荐书籍
这是一种使用K最近邻（KNN）的图书推荐算法：给定一个书名，返回5本相似图书的列表以及它们各自的距离。为确保统计意义，我删除了评分数低于200的用户和评分数低于100的图书。然后，我使用KNN（算法“brute”、度量“cosine”）创建了模型。
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
},
"colab": {
"name": "book_recommendation_using_knn.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/emilyliublair/Machine-Learning-Projects/blob/main/book_recommendation_using_knn.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Y1onB6kUvo4Z"
},
"source": [
"# import libraries\n",
"import numpy as np\n",
"import pandas as pd\n",
"from scipy.sparse import csr_matrix\n",
"from sklearn.neighbors import NearestNeighbors\n",
"import matplotlib.pyplot as plt"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "iAQGqqO_vo4d"
},
"source": [
"# get data files (Book-Crossings dataset from freeCodeCamp's CDN)\n",
"!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip\n",
"\n",
"# -o: overwrite without prompting, so Restart & Run All never hangs\n",
"# waiting for interactive 'replace file?' input on a re-run\n",
"!unzip -o book-crossings.zip\n",
"\n",
"books_filename = 'BX-Books.csv'\n",
"ratings_filename = 'BX-Book-Ratings.csv'"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "NClILWOiEd6Q"
},
"source": [
"# load the CSV exports into typed dataframes; both files share the same\n",
"# Latin-1 encoding and ';' delimiter\n",
"csv_opts = dict(encoding='ISO-8859-1', sep=';', header=0)\n",
"\n",
"book_cols = ['isbn', 'title', 'author']\n",
"df_books = pd.read_csv(\n",
"    books_filename,\n",
"    names=book_cols,\n",
"    usecols=book_cols,\n",
"    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'},\n",
"    **csv_opts)\n",
"\n",
"rating_cols = ['user', 'isbn', 'rating']\n",
"df_ratings = pd.read_csv(\n",
"    ratings_filename,\n",
"    names=rating_cols,\n",
"    usecols=rating_cols,\n",
"    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'},\n",
"    **csv_opts)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "xAcXjkCFCh0A"
},
"source": [
"# merge ratings with book info, then annotate every row with how many\n",
"# ratings its user has given and its book has received (for filtering)\n",
"data = pd.merge(df_books, df_ratings, on='isbn')\n",
"\n",
"# groupby + transform broadcasts the per-group count back onto each row,\n",
"# replacing the original count-frame / merge round-trips\n",
"data['user_count'] = data.groupby('user')['rating'].transform('count')\n",
"data['book_count'] = data.groupby('isbn')['rating'].transform('count')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "H8arfkkb9gIP"
},
"source": [
"# keep only statistically significant rows: users with >= 200 ratings\n",
"# and books with >= 100 ratings; then collapse repeat (title, user) pairs\n",
"significant = (data['user_count'] >= 200) & (data['book_count'] >= 100)\n",
"data = data[significant]\n",
"data = data.drop_duplicates(subset=['title', 'user'])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Q6fwwHKV90eE"
},
"source": [
"# rearrange into a title x user matrix of ratings (0 = not rated)\n",
"# values='rating' (a scalar, not a list) keeps the columns a flat Index;\n",
"# the list form values=['rating'] adds a needless MultiIndex column level\n",
"data_pivot = data.pivot(index='title', columns='user', values='rating').fillna(0)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "rRg4gsHxbDXu"
},
"source": [
"# build the KNN model: brute-force search with cosine distance\n",
"model = NearestNeighbors(algorithm='brute', metric='cosine')\n",
"# fit on a sparse matrix -- the pivot is mostly zeros, so this saves a lot\n",
"# of memory and finally uses the csr_matrix import from the top of the file\n",
"model.fit(csr_matrix(data_pivot.values))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "f5ZUd-L1SQz7"
},
"source": [
"# function to return recommended books\n",
"def get_recommends(book=\"\"):\n",
"    \"\"\"Return [book, [[title, distance], ...]] for the given title.\n",
"\n",
"    The nearest neighbour of a book is the book itself, so index 0 is\n",
"    skipped; the remaining 4 neighbours are listed farthest-first to\n",
"    match the challenge's expected output format.\n",
"    \"\"\"\n",
"    # .loc[[book]] keeps a (1, n_users) frame and raises a clear KeyError\n",
"    # for an unknown title, instead of a cryptic reshape error downstream\n",
"    row = data_pivot.loc[[book]]\n",
"    distances, indices = model.kneighbors(row.values, n_neighbors=5)\n",
"    # pair each neighbouring title with its cosine distance (skip self)\n",
"    similar = [[data_pivot.index[indices[0][i]], distances[0][i]]\n",
"               for i in range(1, len(distances[0]))]\n",
"    return [book, similar[::-1]]\n",
"\n",
"\n",
"books = get_recommends(\"The Queen of the Damned (Vampire Chronicles (Paperback))\")\n",
"print(books)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "jd2SLCh8oxMh"
},
"source": [
"books = get_recommends(\"Where the Heart Is (Oprah's Book Club (Paperback))\")\n",
"print(books)\n",
"\n",
"def test_book_recommendation():\n",
"  \"\"\"freeCodeCamp challenge check: recommended titles must come from the\n",
"  expected set, with distances matching to within a 0.05 tolerance.\"\"\"\n",
"  test_pass = True\n",
"  recommends = get_recommends(\"Where the Heart Is (Oprah's Book Club (Paperback))\")\n",
"  # element 0 must echo the queried title back\n",
"  if recommends[0] != \"Where the Heart Is (Oprah's Book Club (Paperback))\":\n",
"    test_pass = False\n",
"  recommended_books = [\"I'll Be Seeing You\", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']\n",
"  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]\n",
"  # the challenge only verifies the first two recommendations\n",
"  for i in range(2): \n",
"    if recommends[1][i][0] not in recommended_books:\n",
"      test_pass = False\n",
"    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:\n",
"      test_pass = False\n",
"  if test_pass:\n",
"    print(\"You passed the challenge! 🎉🎉🎉🎉🎉\")\n",
"  else:\n",
"    print(\"You haven't passed yet. Keep trying!\")\n",
"\n",
"test_book_recommendation()"
],
"execution_count": null,
"outputs": []
}
]
}