{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'surprise'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-1-002ce27085d1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0msurprise\u001b[0m  \u001b[1;31m# run 'pip install scikit-surprise' to install surprise\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'surprise'"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import surprise  # run 'pip install scikit-surprise' to install surprise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class MatrixFacto(surprise.AlgoBase):\n",
    "    '''A basic rating prediction algorithm based on matrix factorization.\n",
    "\n",
    "    A rating r_ui is predicted as the dot product of a user vector p_u and\n",
    "    an item vector q_i. Both sets of vectors are learned with plain SGD on\n",
    "    the squared error (no bias terms, no regularization).\n",
    "\n",
    "    Args:\n",
    "        learning_rate: step size of the SGD updates.\n",
    "        n_epochs: number of SGD passes over the training ratings.\n",
    "        n_factors: dimension of the vectors p_u and q_i.\n",
    "        random_state: optional integer seed for the initial factors. With\n",
    "            the default (None) the global numpy random state is used.\n",
    "    '''\n",
    "\n",
    "    def __init__(self, learning_rate, n_epochs, n_factors, random_state=None):\n",
    "\n",
    "        # Always initialize the base class first, as the Surprise guide on\n",
    "        # custom algorithms requires.\n",
    "        surprise.AlgoBase.__init__(self)\n",
    "\n",
    "        self.lr = learning_rate  # learning rate for SGD\n",
    "        self.n_epochs = n_epochs  # number of iterations of SGD\n",
    "        self.n_factors = n_factors  # number of factors\n",
    "        self.random_state = random_state  # seed for the initial factors\n",
    "\n",
    "    def fit(self, trainset):\n",
    "        '''Learn the vectors p_u and q_i with SGD.\n",
    "\n",
    "        Args:\n",
    "            trainset: the Surprise trainset whose ratings are fitted.\n",
    "\n",
    "        Returns:\n",
    "            self, so that calls can be chained.\n",
    "        '''\n",
    "\n",
    "        # Base-class bookkeeping, as the Surprise guide on custom algorithms\n",
    "        # requires.\n",
    "        surprise.AlgoBase.fit(self, trainset)\n",
    "\n",
    "        print('Fitting data with SGD...')\n",
    "\n",
    "        # Only build a dedicated generator when a seed was requested, so the\n",
    "        # default behaviour (global numpy state) is unchanged.\n",
    "        if self.random_state is None:\n",
    "            rng = np.random\n",
    "        else:\n",
    "            rng = np.random.RandomState(self.random_state)\n",
    "\n",
    "        # Randomly initialize the user and item factors.\n",
    "        p = rng.normal(0, .1, (trainset.n_users, self.n_factors))\n",
    "        q = rng.normal(0, .1, (trainset.n_items, self.n_factors))\n",
    "\n",
    "        # SGD procedure\n",
    "        for _ in range(self.n_epochs):\n",
    "            for u, i, r_ui in trainset.all_ratings():\n",
    "                err = r_ui - np.dot(p[u], q[i])\n",
    "                # Compute both steps from the current vectors before writing\n",
    "                # either one, so q_i is updated with the previous value of p_u.\n",
    "                step_p = self.lr * err * q[i]\n",
    "                step_q = self.lr * err * p[u]\n",
    "                p[u] += step_p\n",
    "                q[i] += step_q\n",
    "\n",
    "        self.p, self.q = p, q\n",
    "        self.trainset = trainset  # read by estimate()\n",
    "\n",
    "        return self\n",
    "\n",
    "    def estimate(self, u, i):\n",
    "        '''Return the estimated rating of user u for item i.'''\n",
    "\n",
    "        # Return the scalar product between p_u and q_i if both the user and\n",
    "        # the item are known, else return the average of all ratings.\n",
    "        if self.trainset.knows_user(u) and self.trainset.knows_item(i):\n",
    "            return np.dot(self.p[u], self.q[i])\n",
    "        return self.trainset.global_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Data loading. We'll use the movielens dataset (https://grouplens.org/datasets/movielens/100k/)\n",
    "# it will be downloaded automatically.\n",
    "data = surprise.Dataset.load_builtin('ml-100k')\n",
    "\n",
    "# Split data for 2-folds cross validation. Dataset.split() is deprecated and\n",
    "# missing from newer Surprise releases, so only call it when it is available.\n",
    "if hasattr(data, 'split'):\n",
    "    data.split(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating RMSE of algorithm MatrixFacto.\n",
      "\n",
      "------------\n",
      "Fold 1\n",
      "Fitting data with SGD...\n",
      "RMSE: 0.9826\n",
      "------------\n",
      "Fold 2\n",
      "Fitting data with SGD...\n",
      "RMSE: 0.9873\n",
      "------------\n",
      "------------\n",
      "Mean RMSE: 0.9849\n",
      "------------\n",
      "------------\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CaseInsensitiveDefaultDict(list,\n",
       "                           {'rmse': [0.98263312180825368, 0.9872549391926676]})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2-fold cross-validation of our algorithm. cross_validate() replaces the\n",
    "# deprecated evaluate(); the seeded KFold produces the same folds each time\n",
    "# it is used on this dataset.\n",
    "algo = MatrixFacto(learning_rate=.01, n_epochs=10, n_factors=10)\n",
    "surprise.model_selection.cross_validate(\n",
    "    algo, data, measures=['RMSE'],\n",
    "    cv=surprise.model_selection.KFold(n_splits=2, random_state=0),\n",
    "    n_jobs=1, verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating RMSE of algorithm KNNBasic.\n",
      "\n",
      "------------\n",
      "Fold 1\n",
      "Computing the msd similarity matrix...\n",
      "Done computing similarity matrix.\n",
      "RMSE: 1.0101\n",
      "------------\n",
      "Fold 2\n",
      "Computing the msd similarity matrix...\n",
      "Done computing similarity matrix.\n",
      "RMSE: 0.9982\n",
      "------------\n",
      "------------\n",
      "Mean RMSE: 1.0042\n",
      "------------\n",
      "------------\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CaseInsensitiveDefaultDict(list,\n",
       "                           {'rmse': [1.0101383334175613, 0.99823558896449016]})"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# try a neighborhood-based algorithm (on the same data)\n",
    "# cross_validate() replaces the deprecated evaluate(); the seeded KFold\n",
    "# produces the same folds each time it is used on this dataset.\n",
    "algo = surprise.KNNBasic()\n",
    "surprise.model_selection.cross_validate(\n",
    "    algo, data, measures=['RMSE'],\n",
    "    cv=surprise.model_selection.KFold(n_splits=2, random_state=0),\n",
    "    n_jobs=1, verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating RMSE of algorithm SVD.\n",
      "\n",
      "------------\n",
      "Fold 1\n",
      "RMSE: 0.9604\n",
      "------------\n",
      "Fold 2\n",
      "RMSE: 0.9538\n",
      "------------\n",
      "------------\n",
      "Mean RMSE: 0.9571\n",
      "------------\n",
      "------------\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CaseInsensitiveDefaultDict(list,\n",
       "                           {'rmse': [0.96042083843476056,\n",
       "                             0.95382688332712151]})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# try a more sophisticated matrix factorization algorithm (on the same data)\n",
    "# cross_validate() replaces the deprecated evaluate(); the seeded KFold\n",
    "# produces the same folds each time it is used on this dataset.\n",
    "algo = surprise.SVD()\n",
    "surprise.model_selection.cross_validate(\n",
    "    algo, data, measures=['RMSE'],\n",
    "    cv=surprise.model_selection.KFold(n_splits=2, random_state=0),\n",
    "    n_jobs=1, verbose=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}