25개 이상의 토픽을 선택하실 수 없습니다. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

274 lines
8.0 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'surprise'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-1-002ce27085d1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0msurprise\u001b[0m \u001b[1;31m# run 'pip install scikit-surprise' to install surprise\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'surprise'"
]
}
],
"source": [
"import numpy as np\n",
"import surprise # run 'pip install scikit-surprise' to install surprise"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class MatrixFacto(surprise.AlgoBase):\n",
" '''A basic rating prediction algorithm based on matrix factorization.'''\n",
" \n",
" def __init__(self, learning_rate, n_epochs, n_factors):\n",
" \n",
" self.lr = learning_rate # learning rate for SGD\n",
" self.n_epochs = n_epochs # number of iterations of SGD\n",
" self.n_factors = n_factors # number of factors\n",
" \n",
" def fit(self, trainset):\n",
" '''Learn the vectors p_u and q_i with SGD'''\n",
" \n",
" print('Fitting data with SGD...')\n",
" \n",
" # Randomly initialize the user and item factors.\n",
" p = np.random.normal(0, .1, (trainset.n_users, self.n_factors))\n",
" q = np.random.normal(0, .1, (trainset.n_items, self.n_factors))\n",
" \n",
" # SGD procedure\n",
" for _ in range(self.n_epochs):\n",
" for u, i, r_ui in trainset.all_ratings():\n",
" err = r_ui - np.dot(p[u], q[i])\n",
" # Update vectors p_u and q_i\n",
" p[u] += self.lr * err * q[i]\n",
" q[i] += self.lr * err * p[u]\n",
" # Note: in the update of q_i, we should actually use the previous (non-updated) value of p_u.\n",
" # In practice it makes almost no difference.\n",
" \n",
" self.p, self.q = p, q\n",
" self.trainset = trainset\n",
"\n",
" def estimate(self, u, i):\n",
" '''Return the estimated rating of user u for item i.'''\n",
" \n",
" # return scalar product between p_u and q_i if user and item are known,\n",
" # else return the average of all ratings\n",
" if self.trainset.knows_user(u) and self.trainset.knows_item(i):\n",
" return np.dot(self.p[u], self.q[i])\n",
" else:\n",
" return self.trainset.global_mean"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Data loading: we'll use the MovieLens 100k dataset (https://grouplens.org/datasets/movielens/100k/).\n",
"# It will be downloaded automatically.\n",
"data = surprise.Dataset.load_builtin('ml-100k')\n",
"data.split(2) # split data for 2-fold cross-validation"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluating RMSE of algorithm MatrixFacto.\n",
"\n",
"------------\n",
"Fold 1\n",
"Fitting data with SGD...\n",
"RMSE: 0.9826\n",
"------------\n",
"Fold 2\n",
"Fitting data with SGD...\n",
"RMSE: 0.9873\n",
"------------\n",
"------------\n",
"Mean RMSE: 0.9849\n",
"------------\n",
"------------\n"
]
},
{
"data": {
"text/plain": [
"CaseInsensitiveDefaultDict(list,\n",
" {'rmse': [0.98263312180825368, 0.9872549391926676]})"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"algo = MatrixFacto(learning_rate=.01, n_epochs=10, n_factors=10)\n",
"surprise.evaluate(algo, data, measures=['RMSE'])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluating RMSE of algorithm KNNBasic.\n",
"\n",
"------------\n",
"Fold 1\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"RMSE: 1.0101\n",
"------------\n",
"Fold 2\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"RMSE: 0.9982\n",
"------------\n",
"------------\n",
"Mean RMSE: 1.0042\n",
"------------\n",
"------------\n"
]
},
{
"data": {
"text/plain": [
"CaseInsensitiveDefaultDict(list,\n",
" {'rmse': [1.0101383334175613, 0.99823558896449016]})"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# try a neighborhood-based algorithm (on the same data)\n",
"algo = surprise.KNNBasic()\n",
"surprise.evaluate(algo, data, measures=['RMSE'])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluating RMSE of algorithm SVD.\n",
"\n",
"------------\n",
"Fold 1\n",
"RMSE: 0.9604\n",
"------------\n",
"Fold 2\n",
"RMSE: 0.9538\n",
"------------\n",
"------------\n",
"Mean RMSE: 0.9571\n",
"------------\n",
"------------\n"
]
},
{
"data": {
"text/plain": [
"CaseInsensitiveDefaultDict(list,\n",
" {'rmse': [0.96042083843476056,\n",
" 0.95382688332712151]})"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# try a more sophisticated matrix factorization algorithm (on the same data)\n",
"algo = surprise.SVD()\n",
"surprise.evaluate(algo, data, measures=['RMSE'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
"autoclose": false,
"autocomplete": true,
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 1,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
},
"labels_anchors": false,
"latex_user_defs": false,
"report_style_numbering": false,
"user_envs_cfg": false
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}