|
|
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "ename": "ModuleNotFoundError",
- "evalue": "No module named 'surprise'",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32m<ipython-input-1-002ce27085d1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0msurprise\u001b[0m \u001b[1;31m# run 'pip install scikit-surprise' to install surprise\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'surprise'"
- ]
- }
- ],
- "source": [
- "import numpy as np\n",
- "import surprise # run 'pip install scikit-surprise' to install surprise"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "class MatrixFacto(surprise.AlgoBase):\n",
- "    '''A basic rating prediction algorithm based on matrix factorization.\n",
- "\n",
- "    Each rating r_ui is approximated by the dot product of a user factor\n",
- "    vector p_u and an item factor vector q_i. Both sets of vectors are learned\n",
- "    with plain (unregularized) stochastic gradient descent on the squared error.\n",
- "    '''\n",
- "\n",
- "    def __init__(self, learning_rate, n_epochs, n_factors, random_state=None):\n",
- "        '''Store the SGD hyper-parameters.\n",
- "\n",
- "        learning_rate: step size of each SGD update.\n",
- "        n_epochs: number of passes over all the training ratings.\n",
- "        n_factors: dimension of the factor vectors p_u and q_i.\n",
- "        random_state: optional integer seed for the factor initialization.\n",
- "            When None (default), numpy's global random state is used.\n",
- "        '''\n",
- "\n",
- "        # Surprise convention: always initialize the base class first.\n",
- "        surprise.AlgoBase.__init__(self)\n",
- "\n",
- "        self.lr = learning_rate  # learning rate for SGD\n",
- "        self.n_epochs = n_epochs  # number of iterations of SGD\n",
- "        self.n_factors = n_factors  # number of factors\n",
- "        self.random_state = random_state  # seed for the initial factors\n",
- "\n",
- "    def fit(self, trainset):\n",
- "        '''Learn the vectors p_u and q_i with SGD and return self.'''\n",
- "\n",
- "        # Surprise convention: the base method records the trainset\n",
- "        # (self.trainset) before any algorithm-specific work is done.\n",
- "        surprise.AlgoBase.fit(self, trainset)\n",
- "\n",
- "        print('Fitting data with SGD...')\n",
- "\n",
- "        # Randomly initialize the user and item factors. Without an explicit\n",
- "        # seed we keep drawing from numpy's global random state.\n",
- "        if self.random_state is None:\n",
- "            rng = np.random\n",
- "        else:\n",
- "            rng = np.random.RandomState(self.random_state)\n",
- "        p = rng.normal(0, .1, (trainset.n_users, self.n_factors))\n",
- "        q = rng.normal(0, .1, (trainset.n_items, self.n_factors))\n",
- "\n",
- "        # SGD procedure\n",
- "        for _ in range(self.n_epochs):\n",
- "            for u, i, r_ui in trainset.all_ratings():\n",
- "                err = r_ui - np.dot(p[u], q[i])\n",
- "                # Both gradients must be evaluated at the pre-update vectors.\n",
- "                # p[u] is a view that is modified in place, so copy it first.\n",
- "                p_u_old = p[u].copy()\n",
- "                # Update vectors p_u and q_i\n",
- "                p[u] += self.lr * err * q[i]\n",
- "                q[i] += self.lr * err * p_u_old\n",
- "\n",
- "        self.p, self.q = p, q\n",
- "\n",
- "        return self\n",
- "\n",
- "    def estimate(self, u, i):\n",
- "        '''Return the estimated rating of user u for item i.'''\n",
- "\n",
- "        # Return the scalar product between p_u and q_i if both the user and\n",
- "        # the item are known, else fall back to the average of all ratings.\n",
- "        if self.trainset.knows_user(u) and self.trainset.knows_item(i):\n",
- "            return np.dot(self.p[u], self.q[i])\n",
- "        else:\n",
- "            return self.trainset.global_mean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# data loading. We'll use the movielens dataset (https://grouplens.org/datasets/movielens/100k/)\n",
- "# it will be downloaded automatically.\n",
- "data = surprise.Dataset.load_builtin('ml-100k')\n",
- "\n",
- "# Dataset.split() was deprecated in Surprise 1.0.5 and removed in later releases,\n",
- "# so only pre-split the data when the method is still available.\n",
- "if hasattr(data, 'split'):\n",
- "    data.split(2)  # split data for 2-folds cross validation"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating RMSE of algorithm MatrixFacto.\n",
- "\n",
- "------------\n",
- "Fold 1\n",
- "Fitting data with SGD...\n",
- "RMSE: 0.9826\n",
- "------------\n",
- "Fold 2\n",
- "Fitting data with SGD...\n",
- "RMSE: 0.9873\n",
- "------------\n",
- "------------\n",
- "Mean RMSE: 0.9849\n",
- "------------\n",
- "------------\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "CaseInsensitiveDefaultDict(list,\n",
- " {'rmse': [0.98263312180825368, 0.9872549391926676]})"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# surprise.evaluate() was deprecated in Surprise 1.0.5 and removed in later releases;\n",
- "# cross_validate() is its replacement. cv=2 runs a 2-fold cross validation.\n",
- "algo = MatrixFacto(learning_rate=.01, n_epochs=10, n_factors=10)\n",
- "surprise.model_selection.cross_validate(algo, data, measures=['RMSE'], cv=2, verbose=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating RMSE of algorithm KNNBasic.\n",
- "\n",
- "------------\n",
- "Fold 1\n",
- "Computing the msd similarity matrix...\n",
- "Done computing similarity matrix.\n",
- "RMSE: 1.0101\n",
- "------------\n",
- "Fold 2\n",
- "Computing the msd similarity matrix...\n",
- "Done computing similarity matrix.\n",
- "RMSE: 0.9982\n",
- "------------\n",
- "------------\n",
- "Mean RMSE: 1.0042\n",
- "------------\n",
- "------------\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "CaseInsensitiveDefaultDict(list,\n",
- " {'rmse': [1.0101383334175613, 0.99823558896449016]})"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# try a neighborhood-based algorithm (on the same data)\n",
- "# cross_validate() replaces the deprecated (and later removed) surprise.evaluate();\n",
- "# cv=2 runs a 2-fold cross validation.\n",
- "algo = surprise.KNNBasic()\n",
- "surprise.model_selection.cross_validate(algo, data, measures=['RMSE'], cv=2, verbose=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating RMSE of algorithm SVD.\n",
- "\n",
- "------------\n",
- "Fold 1\n",
- "RMSE: 0.9604\n",
- "------------\n",
- "Fold 2\n",
- "RMSE: 0.9538\n",
- "------------\n",
- "------------\n",
- "Mean RMSE: 0.9571\n",
- "------------\n",
- "------------\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "CaseInsensitiveDefaultDict(list,\n",
- " {'rmse': [0.96042083843476056,\n",
- " 0.95382688332712151]})"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# try a more sophisticated matrix factorization algorithm (on the same data)\n",
- "# cross_validate() replaces the deprecated (and later removed) surprise.evaluate();\n",
- "# cv=2 runs a 2-fold cross validation.\n",
- "algo = surprise.SVD()\n",
- "surprise.model_selection.cross_validate(algo, data, measures=['RMSE'], cv=2, verbose=True)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.6"
- },
- "latex_envs": {
- "LaTeX_envs_menu_present": true,
- "autoclose": false,
- "autocomplete": true,
- "bibliofile": "biblio.bib",
- "cite_by": "apalike",
- "current_citInitial": 1,
- "eqLabelWithNumbers": true,
- "eqNumInitial": 1,
- "hotkeys": {
- "equation": "Ctrl-E",
- "itemize": "Ctrl-I"
- },
- "labels_anchors": false,
- "latex_user_defs": false,
- "report_style_numbering": false,
- "user_envs_cfg": false
- },
- "toc": {
- "base_numbering": 1,
- "nav_menu": {},
- "number_sections": true,
- "sideBar": true,
- "skip_h1_title": false,
- "title_cell": "Table of Contents",
- "title_sidebar": "Contents",
- "toc_cell": false,
- "toc_position": {},
- "toc_section_display": true,
- "toc_window_display": false
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|