{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"ename": "ModuleNotFoundError",
|
|
"evalue": "No module named 'surprise'",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
|
"\u001b[1;32m<ipython-input-1-002ce27085d1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0msurprise\u001b[0m \u001b[1;31m# run 'pip install scikit-surprise' to install surprise\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
|
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'surprise'"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import surprise # run 'pip install scikit-surprise' to install surprise"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"class MatrixFacto(surprise.AlgoBase):\n",
"    '''A basic rating prediction algorithm based on matrix factorization.\n",
"\n",
"    Every user u and every item i gets a vector of n_factors numbers\n",
"    (p_u and q_i). The vectors are learned with SGD and a rating is\n",
"    predicted as the dot product of p_u and q_i.\n",
"    '''\n",
"\n",
"    def __init__(self, learning_rate, n_epochs, n_factors):\n",
"        '''Store the SGD hyper-parameters.\n",
"\n",
"        learning_rate -- step size applied to every SGD update\n",
"        n_epochs      -- number of passes over the training ratings\n",
"        n_factors     -- length of each user / item vector\n",
"        '''\n",
"\n",
"        # Let the base class initialise its own state before we add ours.\n",
"        surprise.AlgoBase.__init__(self)\n",
"\n",
"        self.lr = learning_rate  # learning rate for SGD\n",
"        self.n_epochs = n_epochs  # number of iterations of SGD\n",
"        self.n_factors = n_factors  # number of factors\n",
"\n",
"    def fit(self, trainset):\n",
"        '''Learn the vectors p_u and q_i with SGD.\n",
"\n",
"        trainset -- the Surprise trainset to learn from\n",
"\n",
"        Returns self (the fitted algorithm).\n",
"        '''\n",
"\n",
"        # Run the base-class fit first, as the Surprise guide for custom\n",
"        # algorithms asks every fit() implementation to do.\n",
"        surprise.AlgoBase.fit(self, trainset)\n",
"\n",
"        print('Fitting data with SGD...')\n",
"\n",
"        # Randomly initialize the user and item factors.\n",
"        p = np.random.normal(0, .1, (trainset.n_users, self.n_factors))\n",
"        q = np.random.normal(0, .1, (trainset.n_items, self.n_factors))\n",
"\n",
"        # SGD procedure\n",
"        for _ in range(self.n_epochs):\n",
"            for u, i, r_ui in trainset.all_ratings():\n",
"                err = r_ui - np.dot(p[u], q[i])\n",
"                # Compute both steps from the current vectors *before*\n",
"                # touching either of them, so that the update of q_i uses\n",
"                # the previous (non-updated) value of p_u.\n",
"                step_p = self.lr * err * q[i]\n",
"                step_q = self.lr * err * p[u]\n",
"                p[u] += step_p\n",
"                q[i] += step_q\n",
"\n",
"        self.p, self.q = p, q\n",
"        self.trainset = trainset\n",
"\n",
"        return self\n",
"\n",
"    def estimate(self, u, i):\n",
"        '''Return the estimated rating of user u for item i.'''\n",
"\n",
"        # Unknown user or item: fall back to the average of all ratings.\n",
"        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):\n",
"            return self.trainset.global_mean\n",
"\n",
"        # Known user and item: scalar product between p_u and q_i.\n",
"        return np.dot(self.p[u], self.q[i])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Data loading: the MovieLens 100k dataset\n",
"# (https://grouplens.org/datasets/movielens/100k/).\n",
"# It will be downloaded automatically.\n",
"data = surprise.Dataset.load_builtin('ml-100k')\n",
"\n",
"# Split the data for 2-fold cross-validation.\n",
"data.split(2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Evaluating RMSE of algorithm MatrixFacto.\n",
|
|
"\n",
|
|
"------------\n",
|
|
"Fold 1\n",
|
|
"Fitting data with SGD...\n",
|
|
"RMSE: 0.9826\n",
|
|
"------------\n",
|
|
"Fold 2\n",
|
|
"Fitting data with SGD...\n",
|
|
"RMSE: 0.9873\n",
|
|
"------------\n",
|
|
"------------\n",
|
|
"Mean RMSE: 0.9849\n",
|
|
"------------\n",
|
|
"------------\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"CaseInsensitiveDefaultDict(list,\n",
|
|
" {'rmse': [0.98263312180825368, 0.9872549391926676]})"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Evaluate the hand-written matrix factorization, reporting RMSE.\n",
"algo = MatrixFacto(\n",
"    learning_rate=0.01,\n",
"    n_epochs=10,\n",
"    n_factors=10,\n",
")\n",
"surprise.evaluate(algo, data, measures=['RMSE'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Evaluating RMSE of algorithm KNNBasic.\n",
|
|
"\n",
|
|
"------------\n",
|
|
"Fold 1\n",
|
|
"Computing the msd similarity matrix...\n",
|
|
"Done computing similarity matrix.\n",
|
|
"RMSE: 1.0101\n",
|
|
"------------\n",
|
|
"Fold 2\n",
|
|
"Computing the msd similarity matrix...\n",
|
|
"Done computing similarity matrix.\n",
|
|
"RMSE: 0.9982\n",
|
|
"------------\n",
|
|
"------------\n",
|
|
"Mean RMSE: 1.0042\n",
|
|
"------------\n",
|
|
"------------\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"CaseInsensitiveDefaultDict(list,\n",
|
|
" {'rmse': [1.0101383334175613, 0.99823558896449016]})"
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# For comparison: a neighborhood-based algorithm, evaluated on the same data.\n",
"algo = surprise.KNNBasic()\n",
"surprise.evaluate(\n",
"    algo,\n",
"    data,\n",
"    measures=['RMSE'],\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Evaluating RMSE of algorithm SVD.\n",
|
|
"\n",
|
|
"------------\n",
|
|
"Fold 1\n",
|
|
"RMSE: 0.9604\n",
|
|
"------------\n",
|
|
"Fold 2\n",
|
|
"RMSE: 0.9538\n",
|
|
"------------\n",
|
|
"------------\n",
|
|
"Mean RMSE: 0.9571\n",
|
|
"------------\n",
|
|
"------------\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"CaseInsensitiveDefaultDict(list,\n",
|
|
" {'rmse': [0.96042083843476056,\n",
|
|
" 0.95382688332712151]})"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# For comparison: a more sophisticated matrix factorization algorithm,\n",
"# evaluated on the same data.\n",
"algo = surprise.SVD()\n",
"surprise.evaluate(\n",
"    algo,\n",
"    data,\n",
"    measures=['RMSE'],\n",
")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.6"
|
|
},
|
|
"latex_envs": {
|
|
"LaTeX_envs_menu_present": true,
|
|
"autoclose": false,
|
|
"autocomplete": true,
|
|
"bibliofile": "biblio.bib",
|
|
"cite_by": "apalike",
|
|
"current_citInitial": 1,
|
|
"eqLabelWithNumbers": true,
|
|
"eqNumInitial": 1,
|
|
"hotkeys": {
|
|
"equation": "Ctrl-E",
|
|
"itemize": "Ctrl-I"
|
|
},
|
|
"labels_anchors": false,
|
|
"latex_user_defs": false,
|
|
"report_style_numbering": false,
|
|
"user_envs_cfg": false
|
|
},
|
|
"toc": {
|
|
"base_numbering": 1,
|
|
"nav_menu": {},
|
|
"number_sections": true,
|
|
"sideBar": true,
|
|
"skip_h1_title": false,
|
|
"title_cell": "Table of Contents",
|
|
"title_sidebar": "Contents",
|
|
"toc_cell": false,
|
|
"toc_position": {},
|
|
"toc_section_display": true,
|
|
"toc_window_display": false
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|