机器学习KNN源代码- 书本推荐

//使用KNN推荐书籍
这是一种使用K最近邻的书推荐算法,该算法采用书名,并返回5个相似书的列表以及它们各自的距离。为确保统计意义,我删除了评分低于200的用户和图书评分低于100的图书。 然后,我使用KNN和算法“ brute”和指标“ cosine”创建了模型。

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.5"
    },
    "colab": {
      "name": "book_recommendation_using_knn.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/emilyliublair/Machine-Learning-Projects/blob/main/book_recommendation_using_knn.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Y1onB6kUvo4Z"
      },
      "source": [
        "# import libraries\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from scipy.sparse import csr_matrix\n",
        "from sklearn.neighbors import NearestNeighbors\n",
        "import matplotlib.pyplot as plt"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iAQGqqO_vo4d"
      },
      "source": [
        "# get data files\n",
        "!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip\n",
        "\n",
        "!unzip book-crossings.zip\n",
        "\n",
        "books_filename = 'BX-Books.csv'\n",
        "ratings_filename = 'BX-Book-Ratings.csv'"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NClILWOiEd6Q"
      },
      "source": [
        "# import csv data into dataframes\n",
        "df_books = pd.read_csv(\n",
        "    books_filename,\n",
        "    encoding = \"ISO-8859-1\",\n",
        "    sep=\";\",\n",
        "    header=0,\n",
        "    names=['isbn', 'title', 'author'],\n",
        "    usecols=['isbn', 'title', 'author'],\n",
        "    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})\n",
        "\n",
        "df_ratings = pd.read_csv(\n",
        "    ratings_filename,\n",
        "    encoding = \"ISO-8859-1\",\n",
        "    sep=\";\",\n",
        "    header=0,\n",
        "    names=['user', 'isbn', 'rating'],\n",
        "    usecols=['user', 'isbn', 'rating'],\n",
        "    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xAcXjkCFCh0A"
      },
      "source": [
        "#merge 2 datasets\n",
        "data=pd.merge(df_books, df_ratings, on='isbn')\n",
        "\n",
        "user_count = data.groupby('user')['rating'].count().reset_index().rename(columns = {'rating':'user_count'})\n",
        "book_count = data.groupby('isbn')['rating'].count().reset_index().rename(columns={'rating':'book_count'})\n",
        "\n",
        "data = data.merge(user_count, on='user')\n",
        "data=data.merge(book_count, on='isbn')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "H8arfkkb9gIP"
      },
      "source": [
        "#clean out dataset to only retain statistically significant data\n",
        "data = data.loc[data['user_count'] >=200]\n",
        "data = data.loc[data['book_count'] >=100]\n",
        "data = data.drop_duplicates(subset=['title', 'user'])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Q6fwwHKV90eE"
      },
      "source": [
        "#rearrange dataframe with title first and ratings as values\n",
        "data_pivot = data.pivot(index='title', columns='user', values=['rating']).fillna(0)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "rRg4gsHxbDXu"
      },
      "source": [
        "#build model\n",
        "model = NearestNeighbors(algorithm='brute', metric='cosine')\n",
        "model.fit(data_pivot)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "f5ZUd-L1SQz7"
      },
      "source": [
        "# function to return recommended books\n",
        "def get_recommends(book = \"\"):\n",
        "  row = data_pivot.loc[data_pivot.index == book]\n",
        "  distances, indices = model.kneighbors(np.reshape(row,[1,-1]),5, True)\n",
        "  recommended_books = []\n",
        "  list_books = []\n",
        "\n",
        "  for i in range(0, len(distances.flatten())):\n",
        "    if i==0:\n",
        "      recommended_books.append(book)\n",
        "    if not(i==0):\n",
        "      book = []\n",
        "      book.append(data_pivot.index[indices[0][i]])\n",
        "      book.append(distances[0][i])\n",
        "      list_books.append(book)\n",
        "  recommended_books.append(list_books[::-1])\n",
        "  return recommended_books\n",
        "\n",
        "\n",
        "books = get_recommends(\"The Queen of the Damned (Vampire Chronicles (Paperback))\")\n",
        "print(books)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jd2SLCh8oxMh"
      },
      "source": [
        "books = get_recommends(\"Where the Heart Is (Oprah's Book Club (Paperback))\")\n",
        "print(books)\n",
        "\n",
        "def test_book_recommendation():\n",
        "  test_pass = True\n",
        "  recommends = get_recommends(\"Where the Heart Is (Oprah's Book Club (Paperback))\")\n",
        "  if recommends[0] != \"Where the Heart Is (Oprah's Book Club (Paperback))\":\n",
        "    test_pass = False\n",
        "  recommended_books = [\"I'll Be Seeing You\", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']\n",
        "  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]\n",
        "  for i in range(2): \n",
        "    if recommends[1][i][0] not in recommended_books:\n",
        "      test_pass = False\n",
        "    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:\n",
        "      test_pass = False\n",
        "  if test_pass:\n",
        "    print(\"You passed the challenge! 🎉🎉🎉🎉🎉\")\n",
        "  else:\n",
        "    print(\"You havn't passed yet. Keep trying!\")\n",
        "\n",
        "test_book_recommendation()"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

打工人何苦为难打工人

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值