"""Build the repository co-activity graph for one GitHub event type.

Input is the ``<event>.txt`` file written by ``data_pull.ipynb``: two lines per
actor, ``actor_id: <id>`` followed by ``repo_id: <id> <id> ...``.  Two
repositories are connected by an undirected edge when the same actor touched
both of them.
"""
import heapq
import os
import subprocess
import sys
from itertools import combinations


def parse_repo_lists(lines):
    """Return one list of integer repo ids per ``repo_id:`` line.

    Lines with any other label (``actor_id:``, blank lines, ...) are skipped,
    so the parser does not depend on strict line alternation.  Tokens are split
    on arbitrary whitespace, which means *every* id on the line is kept whether
    or not the writer left a trailing space.

    Args:
        lines: iterable of text lines, with or without trailing newlines.

    Returns:
        list[list[int]]: repo ids per actor, in file order.
    """
    repo_lists = []
    for line in lines:
        label, _, payload = line.partition(':')
        if label.strip() != 'repo_id':
            continue
        repo_lists.append([int(token) for token in payload.split()])
    return repo_lists


def build_id_maps(node_ids):
    """Build the two-way mapping between dense vertex indices and repo ids.

    Keys are strings in both directions because the rest of the notebook looks
    vertices up with ``id_map1[str(index)]``.

    Args:
        node_ids: sorted list of unique repo ids.

    Returns:
        (id_map1, id_map2): ``id_map1[str(index)] -> repo_id`` and
        ``id_map2[str(repo_id)] -> index``.
    """
    id_map1 = {}
    id_map2 = {}
    for index, node_id in enumerate(node_ids):
        id_map1[str(index)] = node_id
        id_map2[str(node_id)] = index
    return id_map1, id_map2


def build_edges(repo_lists, id_map2):
    """Return the sorted list of unique undirected edges between vertex indices.

    Each edge is stored as ``(low, high)`` so that the same pair seen in a
    different order for another actor is not counted as a second, parallel
    edge (which would inflate vertex degrees).

    Args:
        repo_lists: output of :func:`parse_repo_lists`.
        id_map2: ``str(repo_id) -> vertex index`` mapping.

    Returns:
        list[tuple[int, int]]: unique edges in ascending order.
    """
    edges = set()
    for repos in repo_lists:
        indices = sorted({id_map2[str(repo)] for repo in repos})
        edges.update(combinations(indices, 2))
    return sorted(edges)


# The guard is always true inside the notebook kernel; it only keeps the
# helpers above importable (and testable) on machines without Colab/igraph.
if __name__ == '__main__':
    from google.colab import drive
    drive.mount('/content/drive')

    try:
        from igraph import Graph
    except ImportError:
        # Same version the original Colab session reported as installed.
        subprocess.check_call(
            [sys.executable, '-m', 'pip', 'install', 'igraph==0.9.10'])
        from igraph import Graph

    target = 'IssueCommentEvent'
    path = '/content/drive/My Drive/social_computing/data/' + target + '.txt'

    with open(path, encoding='utf-8') as f:
        file_data = f.readlines()
    print(len(file_data))

    # Feeding raw repo ids to igraph would allocate one vertex per id value up
    # to the largest id, so ids are first remapped to the dense range 0..n-1.
    repo_lists = parse_repo_lists(file_data)
    node_ids = sorted({repo for repos in repo_lists for repo in repos})
    print(node_ids[:100])

    id_map1, id_map2 = build_id_maps(node_ids)
    node_pairs = build_edges(repo_lists, id_map2)

    print(f"# node pairs:{len(node_pairs)}")
    print(f"# node ids:{len(node_ids)}")

    # Pass n explicitly so every mapped repo has a vertex even if it has no edge.
    g = Graph(n=len(node_ids), edges=node_pairs)

    Degree = g.degree()
    # Ten highest-degree repositories; ties are resolved by lowest vertex index.
    for index in heapq.nlargest(10, range(len(Degree)), key=Degree.__getitem__):
        print(id_map1[str(index)], '|', '|', Degree[index])
"""Degree distribution, PageRank and top-k export for the repository graph.

Relies on ``g`` (the igraph graph), ``id_map1`` (``str(vertex index) ->
repo id``) and ``target`` (event type name) defined earlier in the notebook.
"""
import heapq


def degree_histogram(degrees):
    """Count how many vertices have each degree.

    Args:
        degrees: sequence of non-negative integer degrees.

    Returns:
        list[int]: ``counts[d]`` is the number of vertices with degree ``d``;
        an empty list when ``degrees`` is empty.
    """
    if not degrees:
        return []
    counts = [0] * (max(degrees) + 1)
    for degree in degrees:
        counts[degree] += 1
    return counts


def top_k_indices(scores, k):
    """Return the indices of the ``k`` largest scores, best first.

    Ties are resolved in favour of the lower index, matching a repeated
    ``scores.index(max(scores))`` scan, but in a single selection pass and
    without modifying ``scores``.  At most ``len(scores)`` indices are returned.

    Args:
        scores: sequence of comparable values (degrees, PageRank values, ...).
        k: number of indices wanted.

    Returns:
        list[int]: indices into ``scores`` in descending score order.
    """
    return heapq.nlargest(k, range(len(scores)), key=scores.__getitem__)


# The guard is always true inside the notebook kernel; it only keeps the
# helpers above importable (and testable) without matplotlib or igraph.
if __name__ == '__main__':
    import matplotlib.pyplot as plt
    import numpy as np

    TOP_K = 10000  # number of repositories exported per ranking

    # --- Degree distribution (log-log) --------------------------------------
    plt.rcParams.update({'font.size': 25})
    degrees = g.degree()
    degree_counts = degree_histogram(degrees)
    x = list(range(len(degree_counts)))

    plt.figure(figsize=(40, 10))
    plt.loglog(x, degree_counts, linewidth=3.0)
    plt.ylabel('Number of vertices having the given degree')
    plt.xlabel('Degree')
    plt.title(f'Degree Distribution of Repositories in the {target} Graph')
    plt.grid(True)
    plt.show()

    # --- PageRank ------------------------------------------------------------
    rank = g.pagerank(directed=False)
    for index in top_k_indices(rank, 10):
        # Scaled by 1e4 purely for readability of the printed table.
        print(id_map1[str(index)], '|', '|', rank[index] * 10000)

    # --- Export the top repositories under both rankings ---------------------
    final_ra = [id_map1[str(index)] for index in top_k_indices(rank, TOP_K)]
    final_de = [id_map1[str(index)] for index in top_k_indices(degrees, TOP_K)]

    np.save(target + '_degree.npy', final_de)
    np.save(target + '_rank.npy', final_ra)
list \n", + " issue_assignees_login list \n", + " issue_created_at datetime \n", + " issue_updated_at datetime \n", + " issue_comments int16 \n", + " issue_closed_at datetime \n", + " issue_comment_id int64 \n", + " issue_comment_body string \n", + " issue_comment_created_at datetime \n", + " issue_comment_updated_at datetime \n", + " issue_comment_author_association string \n", + " issue_comment_author_id int64 \n", + " issue_comment_author_login string \n", + " issue_comment_author_type string \n", + " pull_commits int16 \n", + " pull_additions int16 \n", + " pull_deletions int16 \n", + " pull_changed_files int32 \n", + " pull_merged int8 \n", + " pull_merge_commit_sha string \n", + " pull_merged_at datetime \n", + " pull_merged_by_id int64 \n", + " pull_merged_by_login string \n", + " pull_merged_by_type string \n", + " pull_requested_reviewer_id int64 \n", + " pull_requested_reviewer_login string \n", + " pull_requested_reviewer_type string \n", + " pull_review_comments int16 \n", + " repo_description string \n", + " repo_size int32 \n", + " repo_stargazers_count int32 \n", + " repo_forks_count int32 \n", + " repo_language string \n", + " repo_has_issues int8 \n", + " repo_has_projects int8 \n", + " repo_has_downloads int8 \n", + " repo_has_wiki int8 \n", + " repo_has_pages int8 \n", + " repo_license string \n", + " repo_default_branch string \n", + " repo_created_at datetime \n", + " repo_updated_at datetime \n", + " repo_pushed_at datetime \n", + " pull_review_id int64 \n", + " pull_review_comment_id int64 \n", + " pull_review_comment_path string \n", + " pull_review_comment_position string \n", + " pull_review_comment_author_id int64 \n", + " pull_review_comment_author_login string \n", + " pull_review_comment_author_type string \n", + " pull_review_comment_author_association string \n", + " pull_review_comment_body string \n", + " pull_review_comment_created_at datetime \n", + " pull_review_comment_updated_at datetime \n", + " push_id int64 \n", + " 
push_size int32 \n", + " push_distinct_size int32 \n", + " push_ref string \n", + " push_head string \n", + " push_before string \n", + " push_commits_name list \n", + " push_commits_email list \n", + " push_commits_message list \n", + " fork_forkee_id int64 \n", + " fork_forkee_full_name string \n", + " fork_forkee_owner_id int64 \n", + " fork_forkee_owner_login string \n", + " fork_forkee_owner_type string \n", + " delete_ref string \n", + " delete_ref_type string \n", + " delete_pusher_type string \n", + " create_ref string \n", + " create_ref_type string \n", + " create_master_branch string \n", + " create_description string \n", + " create_pusher_type string \n", + " gollum_pages_page_name list \n", + " gollum_pages_title list \n", + " gollum_pages_action list \n", + " member_login string \n", + " member_type string \n", + " member_id int64 \n", + " release_id int64 \n", + " release_tag_name string \n", + " release_target_commitish string \n", + " release_name string \n", + " release_draft int8 \n", + " release_author_id int64 \n", + " release_author_login string \n", + " release_author_type string \n", + " release_prerelease int8 \n", + " release_created_at datetime \n", + " release_published_at datetime \n", + " release_body string \n", + " release_assets_name list \n", + " release_assets_uploader_login list \n", + " release_assets_uploader_id list \n", + " release_assets_content_type list \n", + " release_assets_state list \n", + " release_assets_size list \n", + " release_assets_download_count list \n", + " commit_comment_id int64 \n", + " commit_comment_author_id int64 \n", + " commit_comment_author_login string \n", + " commit_comment_author_type string \n", + " commit_comment_author_association string \n", + " commit_comment_body string \n", + " commit_comment_path string \n", + " commit_comment_position string \n", + " commit_comment_line string \n", + " commit_comment_created_at datetime \n", + " commit_comment_updated_at datetime \n", + " pt string 
"""Pull per-repository event counts and actor -> repository lists from MaxCompute.

Outputs (all under ``DATA_DIR``):
  * ``count.txt``      - one ``type: T, repo_id: R, count: N`` line per (type, repo).
  * ``<EventType>.txt`` - two lines per actor with at least two repositories:
    ``actor_id: A`` then ``repo_id: R1 R2 ...``.
"""
import os

EVENT_TYPES = ('PullRequestEvent', 'WatchEvent', 'ForkEvent', 'IssueCommentEvent')
PARTITION = '20151001'   # value of the ``pt`` partition column to read
DATA_DIR = 'data'        # output directory, relative to the notebook
SQL_HINTS = {'odps.sql.allow.fullscan': 'true', 'odps.sql.submit.mode': 'script'}


def format_count_line(event_type, repo_id, count):
    """Format one line of ``count.txt`` (the format parsed by learning.ipynb)."""
    return 'type: {type}, repo_id: {repo_id}, count: {count}\n'.format(
        type=event_type, repo_id=repo_id, count=count)


def group_repos_by_actor(records, event_types=EVENT_TYPES):
    """Group repository ids by event type and actor.

    Args:
        records: iterable of mapping-like rows exposing ``'type'``,
            ``'actor_id'`` and ``'repo_id'``.
        event_types: event types to pre-create buckets for; a record whose
            type is not listed raises ``KeyError``.

    Returns:
        dict: ``{event_type: {actor_id: [str(repo_id), ...]}}`` with repo ids
        in the order the records were read.
    """
    graph_dict = {event_type: {} for event_type in event_types}
    for record in records:
        actor_repos = graph_dict[record['type']]
        actor_repos.setdefault(record['actor_id'], []).append(str(record['repo_id']))
    return graph_dict


def format_actor_repo_lines(actor_repos, min_repos=2):
    """Yield the two-line text record for every actor with enough repositories.

    Actors with fewer than ``min_repos`` repositories cannot contribute an
    edge to the co-activity graph and are skipped.

    Args:
        actor_repos: ``{actor_id: [str(repo_id), ...]}``.
        min_repos: minimum number of repositories an actor must have.

    Yields:
        str: ``'actor_id: A\\nrepo_id: R1 R2 ...\\n'``.
    """
    for actor_id, repo_ids in actor_repos.items():
        if len(repo_ids) < min_repos:
            continue
        yield 'actor_id: {actor_id}\nrepo_id: {list}\n'.format(
            actor_id=actor_id, list=' '.join(repo_ids))


# The guard is always true inside the notebook kernel; it only keeps the
# helpers above importable (and testable) without PyODPS or credentials.
if __name__ == '__main__':
    from odps import ODPS
    from odps import options
    from odps.df import DataFrame
    import pandas as pd
    import numpy as np

    pd.set_option('display.max_rows', None)

    # SECURITY: credentials must never live in the notebook.  Export them in
    # the environment before starting Jupyter.  The key pair that was
    # previously committed here has to be treated as leaked and rotated.
    ACCESS_ID = os.environ['ODPS_ACCESS_ID']
    SECRET_ACCESS_KEY = os.environ['ODPS_SECRET_ACCESS_KEY']
    ODPS_PROJECT = 'OpenDigger_prod_dev'
    ODPS_ENDPOINT = 'http://service.cn-shanghai.maxcompute.aliyun.com/api'

    o = ODPS(ACCESS_ID, SECRET_ACCESS_KEY,
             project=ODPS_PROJECT, endpoint=ODPS_ENDPOINT)
    options.tunnel.limit_instance_tunnel = False
    # options.read_timeout = 10000000

    users = DataFrame(o.get_table('ods_github_users'))
    print(users.dtypes)

    github_log = DataFrame(o.get_table('ods_github_log'))
    print(github_log.dtypes)

    os.makedirs(DATA_DIR, exist_ok=True)
    type_list = ','.join("'{}'".format(event_type) for event_type in EVENT_TYPES)

    # --- Event count per (type, repo) ----------------------------------------
    sql = '''
        select type, count(repo_id), repo_id from ods_github_log
        where pt='{pt}'
        and type in ({types})
        group by type, repo_id;
    '''.format(pt=PARTITION, types=type_list)

    result = o.execute_sql(sql, hints=SQL_HINTS)
    with open(os.path.join(DATA_DIR, 'count.txt'), 'w', encoding='utf-8') as f:
        with result.open_reader() as reader:
            for record in reader:
                # '_c1' is the auto-generated name of the unaliased count() column.
                f.write(format_count_line(
                    record['type'], record['repo_id'], record['_c1']))

    # --- Repositories touched by each actor, per event type ------------------
    sql = '''
        select type, repo_id, actor_id
        from ods_github_log
        where pt='{pt}'
        and type in ({types})
        group by type, repo_id, actor_id;
    '''.format(pt=PARTITION, types=type_list)

    result = o.execute_sql(sql, hints=SQL_HINTS)
    with result.open_reader() as reader:
        graph_dict = group_repos_by_actor(reader)

    for event_type in EVENT_TYPES:
        out_path = os.path.join(DATA_DIR, event_type + '.txt')
        with open(out_path, 'w', encoding='utf-8') as f:
            f.writelines(format_actor_repo_lines(graph_dict[event_type]))
"""Predict whether a repository is in the top decile by pull requests.

Features are the per-repository Fork / Watch / IssueComment event counts read
from ``count.txt`` (written by ``data_pull.ipynb``); the label is 1 when the
repository's PullRequest count reaches the top-``TOP_FRACTION`` threshold.
"""
import os
import random

import torch
from torch import nn
from torch.optim import Adam

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; run without it if missing
    def tqdm(iterable):
        return iterable

SEED = 0
TOP_FRACTION = 0.1     # share of repositories labelled positive
TRAIN_FRACTION = 0.7   # remaining samples are held out for testing
BATCH_SIZE = 256
# The recorded run used learning_rate=11 and finished its first epoch with a
# training loss of ~68.6, i.e. Adam was diverging; use a conventional step size.
LEARNING_RATE = 1e-3
EPOCHS = 20

# Column of each event type inside a data_list row:
# data_list[repo_id] = [PullRequest, Fork, Watch, IssueComment, class]
EVENT_COLUMN = {
    'PullRequestEvent': 0,
    'ForkEvent': 1,
    'WatchEvent': 2,
    'IssueCommentEvent': 3,
}


def parse_count_lines(lines, event_column=EVENT_COLUMN):
    """Aggregate ``count.txt`` lines into one row per repository.

    Each line looks like ``type: WatchEvent, repo_id: 123, count: 5``.

    Args:
        lines: iterable of text lines; blank lines are ignored.
        event_column: ``event type -> column index`` mapping.

    Returns:
        dict[str, list[int]]: ``repo_id -> [pr, fork, watch, issue_comment, 0]``
        in first-seen order; the last slot is reserved for the class label.
    """
    data_list = {}
    for line in lines:
        raw = line.strip('\n').split(' ')
        if len(raw) < 6:
            continue
        event_type = raw[1][:-1]   # drop the trailing comma
        repo_id = raw[3][:-1]      # drop the trailing comma
        row = data_list.setdefault(repo_id, [0, 0, 0, 0, 0])
        row[event_column[event_type]] = int(raw[-1])
    return data_list


def build_samples(data_list, top_fraction=TOP_FRACTION):
    """Label every repository and return ``[fork, watch, issue_comment, label]`` rows.

    The label is also written back into slot 4 of each ``data_list`` row.
    Exactly one sample is produced per repository.  The threshold is never
    allowed to drop below one pull request; otherwise, when fewer than
    ``top_fraction`` of the repositories have any pull request, every
    repository would be labelled positive.

    Args:
        data_list: output of :func:`parse_count_lines`.
        top_fraction: share of repositories (by PullRequest count) to label 1.

    Returns:
        list[list[int]]: one row per repository, in ``data_list`` order.
    """
    if not data_list:
        return []
    pull_counts = sorted((row[0] for row in data_list.values()), reverse=True)
    threshold = max(1, pull_counts[int(len(pull_counts) * top_fraction)])
    samples = []
    for row in data_list.values():
        row[4] = 1 if row[0] >= threshold else 0
        samples.append(row[1:])
    return samples


class Dataset(torch.utils.data.Dataset):
    """Tensor-backed dataset: all columns but the last are features, the last is the label."""

    def __init__(self, data):
        self.labels = data[:, -1]
        self.x = data[:, 0:-1]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        x = self.x[idx].float()
        y = self.labels[idx]
        return x, y


def train(model, trainloader, learning_rate, epochs):
    """Train ``model`` with Adam and cross-entropy, printing per-epoch metrics.

    Args:
        model: module mapping a float feature batch to class logits.
        trainloader: iterable of ``(features, integer labels)`` batches with a
            usable ``len()``.
        learning_rate: Adam step size.
        epochs: number of passes over ``trainloader``.

    Returns:
        list[tuple[float, float]]: ``(mean loss, mean accuracy)`` per epoch.

    Raises:
        ValueError: if ``trainloader`` yields no batches.
    """
    if len(trainloader) == 0:
        raise ValueError('trainloader is empty; nothing to train on')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    history = []
    model.train()
    for epoch_num in range(epochs):
        loss_sum = 0.0
        acc_sum = 0.0
        for train_input, train_label in tqdm(trainloader):
            train_input = train_input.to(device)
            train_label = train_label.to(device)

            optimizer.zero_grad()
            output = model(train_input)
            batch_loss = criterion(output, train_label)
            batch_loss.backward()
            optimizer.step()

            loss_sum += batch_loss.item()
            acc_sum += (output.argmax(dim=1) == train_label).float().mean().item()

        loss_train = loss_sum / len(trainloader)
        acc_train = acc_sum / len(trainloader)
        history.append((loss_train, acc_train))
        print(f'training loss:{loss_train}, training accuracy:{acc_train}')
    return history


# The guard is always true inside the notebook kernel; it only keeps the
# definitions above importable (and testable) without Colab or Drive.
if __name__ == '__main__':
    from google.colab import drive
    drive.mount('/content/drive')

    random.seed(SEED)
    torch.manual_seed(SEED)

    path = '/content/drive/My Drive/social_computing/data/count.txt'
    with open(path, encoding='utf-8') as f:
        file_data = f.readlines()

    data_list = parse_count_lines(file_data)
    id_list = list(data_list)   # unique repository ids, first-seen order

    final_data = torch.tensor(build_samples(data_list))
    print(final_data.shape)

    # Shuffle once, then use the first 70% for training and the rest for testing.
    sample = list(range(len(final_data)))
    random.shuffle(sample)
    n_train = int(len(final_data) * TRAIN_FRACTION)

    train_dataset = Dataset(final_data[sample[:n_train]])
    test_dataset = Dataset(final_data[sample[n_train:]])
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = nn.Sequential(nn.Linear(3, 2))
    train(model, trainloader=train_dataloader,
          learning_rate=LEARNING_RATE, epochs=EPOCHS)
epochs)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0macc_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mloss_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mtrain_input\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_label\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrainloader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mtrain_input\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_input\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/tqdm/std.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1193\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1194\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1195\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1196\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1197\u001b[0m \u001b[0;31m# Update and possibly print the 
progressbar.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sampler_iter\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 530\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_next_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 531\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_num_yielded\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dataset_kind\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0m_DatasetKind\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIterable\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m_next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 568\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_next_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 569\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_next_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# may raise StopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 570\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dataset_fetcher\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfetch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# may raise StopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 571\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pin_memory\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 572\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpin_memory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpin_memory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py\u001b[0m in \u001b[0;36mfetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mpossibly_batched_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 52\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollate_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mKeyboardInterrupt\u001b[0m: 
"]}]},{"cell_type":"code","source":["test_data = final_data[sample[int(len(final_data)*0.7):]][:, :-1].float()\n","test_label = final_data[sample[int(len(final_data)*0.7):]][:, -1].long()\n","(model(test_data).argmax(dim=1)==test_label).float().mean()"],"metadata":{"id":"P6D3S8uaIZch","executionInfo":{"status":"ok","timestamp":1651652046350,"user_tz":-480,"elapsed":406,"user":{"displayName":"ming li","userId":"14148720490428311514"}}},"execution_count":76,"outputs":[]},{"cell_type":"code","source":[""],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2xjuln6OGqoM","executionInfo":{"status":"ok","timestamp":1651652080979,"user_tz":-480,"elapsed":391,"user":{"displayName":"ming li","userId":"14148720490428311514"}},"outputId":"af0457f4-ec9f-4c63-9f16-55a421cb3a08"},"execution_count":77,"outputs":[{"output_type":"execute_result","data":{"text/plain":["tensor(0.9002)"]},"metadata":{},"execution_count":77}]},{"cell_type":"code","source":[""],"metadata":{"id":"8Mj19ZN-IrH2"},"execution_count":null,"outputs":[]}]} \ No newline at end of file