|
|
@ -0,0 +1,306 @@ |
|
|
|
{ |
|
|
|
"cells": [ |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 1, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"odps.Schema {\n", |
|
|
|
" login string \n", |
|
|
|
" created_at date \n", |
|
|
|
" database_id int64 \n", |
|
|
|
" location string \n", |
|
|
|
" company string \n", |
|
|
|
" bio string \n", |
|
|
|
" is_employee boolean \n", |
|
|
|
" email string \n", |
|
|
|
" infoname string \n", |
|
|
|
" followers string \n", |
|
|
|
" following string \n", |
|
|
|
" time date \n", |
|
|
|
" name string \n", |
|
|
|
" lastupdatedat date \n", |
|
|
|
" nextupdateat date \n", |
|
|
|
"}\n", |
|
|
|
"\n", |
|
|
|
"odps.Schema {\n", |
|
|
|
" id string \n", |
|
|
|
" type string \n", |
|
|
|
" action string \n", |
|
|
|
" actor_id int64 \n", |
|
|
|
" actor_login string \n", |
|
|
|
" repo_id int64 \n", |
|
|
|
" repo_name string \n", |
|
|
|
" org_id int64 \n", |
|
|
|
" org_login string \n", |
|
|
|
" created_at datetime \n", |
|
|
|
" issue_id int64 \n", |
|
|
|
" issue_number int32 \n", |
|
|
|
" issue_title string \n", |
|
|
|
" issue_body string \n", |
|
|
|
" issue_labels_name list<string> \n", |
|
|
|
" issue_labels_color list<string> \n", |
|
|
|
" issue_labels_default list<string> \n", |
|
|
|
" issue_labels_description list<string> \n", |
|
|
|
" issue_author_id int64 \n", |
|
|
|
" issue_author_login string \n", |
|
|
|
" issue_author_type string \n", |
|
|
|
" issue_author_association string \n", |
|
|
|
" issue_assignee_id int64 \n", |
|
|
|
" issue_assignee_login string \n", |
|
|
|
" issue_assignees_id list<string> \n", |
|
|
|
" issue_assignees_login list<string> \n", |
|
|
|
" issue_created_at datetime \n", |
|
|
|
" issue_updated_at datetime \n", |
|
|
|
" issue_comments int16 \n", |
|
|
|
" issue_closed_at datetime \n", |
|
|
|
" issue_comment_id int64 \n", |
|
|
|
" issue_comment_body string \n", |
|
|
|
" issue_comment_created_at datetime \n", |
|
|
|
" issue_comment_updated_at datetime \n", |
|
|
|
" issue_comment_author_association string \n", |
|
|
|
" issue_comment_author_id int64 \n", |
|
|
|
" issue_comment_author_login string \n", |
|
|
|
" issue_comment_author_type string \n", |
|
|
|
" pull_commits int16 \n", |
|
|
|
" pull_additions int16 \n", |
|
|
|
" pull_deletions int16 \n", |
|
|
|
" pull_changed_files int32 \n", |
|
|
|
" pull_merged int8 \n", |
|
|
|
" pull_merge_commit_sha string \n", |
|
|
|
" pull_merged_at datetime \n", |
|
|
|
" pull_merged_by_id int64 \n", |
|
|
|
" pull_merged_by_login string \n", |
|
|
|
" pull_merged_by_type string \n", |
|
|
|
" pull_requested_reviewer_id int64 \n", |
|
|
|
" pull_requested_reviewer_login string \n", |
|
|
|
" pull_requested_reviewer_type string \n", |
|
|
|
" pull_review_comments int16 \n", |
|
|
|
" repo_description string \n", |
|
|
|
" repo_size int32 \n", |
|
|
|
" repo_stargazers_count int32 \n", |
|
|
|
" repo_forks_count int32 \n", |
|
|
|
" repo_language string \n", |
|
|
|
" repo_has_issues int8 \n", |
|
|
|
" repo_has_projects int8 \n", |
|
|
|
" repo_has_downloads int8 \n", |
|
|
|
" repo_has_wiki int8 \n", |
|
|
|
" repo_has_pages int8 \n", |
|
|
|
" repo_license string \n", |
|
|
|
" repo_default_branch string \n", |
|
|
|
" repo_created_at datetime \n", |
|
|
|
" repo_updated_at datetime \n", |
|
|
|
" repo_pushed_at datetime \n", |
|
|
|
" pull_review_id int64 \n", |
|
|
|
" pull_review_comment_id int64 \n", |
|
|
|
" pull_review_comment_path string \n", |
|
|
|
" pull_review_comment_position string \n", |
|
|
|
" pull_review_comment_author_id int64 \n", |
|
|
|
" pull_review_comment_author_login string \n", |
|
|
|
" pull_review_comment_author_type string \n", |
|
|
|
" pull_review_comment_author_association string \n", |
|
|
|
" pull_review_comment_body string \n", |
|
|
|
" pull_review_comment_created_at datetime \n", |
|
|
|
" pull_review_comment_updated_at datetime \n", |
|
|
|
" push_id int64 \n", |
|
|
|
" push_size int32 \n", |
|
|
|
" push_distinct_size int32 \n", |
|
|
|
" push_ref string \n", |
|
|
|
" push_head string \n", |
|
|
|
" push_before string \n", |
|
|
|
" push_commits_name list<string> \n", |
|
|
|
" push_commits_email list<string> \n", |
|
|
|
" push_commits_message list<string> \n", |
|
|
|
" fork_forkee_id int64 \n", |
|
|
|
" fork_forkee_full_name string \n", |
|
|
|
" fork_forkee_owner_id int64 \n", |
|
|
|
" fork_forkee_owner_login string \n", |
|
|
|
" fork_forkee_owner_type string \n", |
|
|
|
" delete_ref string \n", |
|
|
|
" delete_ref_type string \n", |
|
|
|
" delete_pusher_type string \n", |
|
|
|
" create_ref string \n", |
|
|
|
" create_ref_type string \n", |
|
|
|
" create_master_branch string \n", |
|
|
|
" create_description string \n", |
|
|
|
" create_pusher_type string \n", |
|
|
|
" gollum_pages_page_name list<string> \n", |
|
|
|
" gollum_pages_title list<string> \n", |
|
|
|
" gollum_pages_action list<string> \n", |
|
|
|
" member_login string \n", |
|
|
|
" member_type string \n", |
|
|
|
" member_id int64 \n", |
|
|
|
" release_id int64 \n", |
|
|
|
" release_tag_name string \n", |
|
|
|
" release_target_commitish string \n", |
|
|
|
" release_name string \n", |
|
|
|
" release_draft int8 \n", |
|
|
|
" release_author_id int64 \n", |
|
|
|
" release_author_login string \n", |
|
|
|
" release_author_type string \n", |
|
|
|
" release_prerelease int8 \n", |
|
|
|
" release_created_at datetime \n", |
|
|
|
" release_published_at datetime \n", |
|
|
|
" release_body string \n", |
|
|
|
" release_assets_name list<string> \n", |
|
|
|
" release_assets_uploader_login list<string> \n", |
|
|
|
" release_assets_uploader_id list<string> \n", |
|
|
|
" release_assets_content_type list<string> \n", |
|
|
|
" release_assets_state list<string> \n", |
|
|
|
" release_assets_size list<string> \n", |
|
|
|
" release_assets_download_count list<string> \n", |
|
|
|
" commit_comment_id int64 \n", |
|
|
|
" commit_comment_author_id int64 \n", |
|
|
|
" commit_comment_author_login string \n", |
|
|
|
" commit_comment_author_type string \n", |
|
|
|
" commit_comment_author_association string \n", |
|
|
|
" commit_comment_body string \n", |
|
|
|
" commit_comment_path string \n", |
|
|
|
" commit_comment_position string \n", |
|
|
|
" commit_comment_line string \n", |
|
|
|
" commit_comment_created_at datetime \n", |
|
|
|
" commit_comment_updated_at datetime \n", |
|
|
|
" pt string \n", |
|
|
|
"}\n", |
|
|
|
"\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"import os\n", |
|
|
|
"\n", |
|
|
|
"from odps import ODPS\n", |
|
|
|
"from odps import options\n", |
|
|
|
"from odps.df import DataFrame\n", |
|
|
|
"import pandas as pd\n", |
|
|
|
"import numpy as np\n", |
|
|
|
"\n", |
|
|
|
"# Show every row when pandas prints a frame instead of truncating it.\n", |
|
|
|
"pd.set_option('display.max_rows',None)\n", |
|
|
|
"\n", |
|
|
|
"# Credentials come from the environment so that no secret is stored in the\n", |
|
|
|
"# notebook; a missing variable raises KeyError before any client is built.\n", |
|
|
|
"ACCESS_ID = os.environ['ALIBABA_CLOUD_ACCESS_KEY_ID']\n", |
|
|
|
"SECRET_ACCESS_KEY = os.environ['ALIBABA_CLOUD_ACCESS_KEY_SECRET']\n", |
|
|
|
"ODPS_PROJECT = 'OpenDigger_prod_dev'\n", |
|
|
|
"ODPS_ENDPOINT = 'http://service.cn-shanghai.maxcompute.aliyun.com/api'\n", |
|
|
|
"\n", |
|
|
|
"# Client handle `o` is reused by the query cells of this notebook.\n", |
|
|
|
"o = ODPS(ACCESS_ID, SECRET_ACCESS_KEY,\n", |
|
|
|
"         project=ODPS_PROJECT, endpoint=ODPS_ENDPOINT)\n", |
|
|
|
"# NOTE(review): presumably lifts the instance-tunnel row limit - confirm in the PyODPS docs.\n", |
|
|
|
"options.tunnel.limit_instance_tunnel = False\n", |
|
|
|
"# options.read_timeout = 10000000\n", |
|
|
|
"\n", |
|
|
|
"# Print the column names and types of both source tables.\n", |
|
|
|
"users = DataFrame(o.get_table('ods_github_users'))\n", |
|
|
|
"print(users.dtypes)\n", |
|
|
|
"\n", |
|
|
|
"github_log = DataFrame(o.get_table('ods_github_log'))\n", |
|
|
|
"print(github_log.dtypes)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 8, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"import os\n", |
|
|
|
"\n", |
|
|
|
"# Count the selected event types per repository for partition 20151001\n", |
|
|
|
"# and write one line per (type, repo_id) pair to data/count.txt.\n", |
|
|
|
"sql = '''\n", |
|
|
|
"    select type, count(repo_id) as event_count, repo_id from ods_github_log\n", |
|
|
|
"    where pt='20151001'\n", |
|
|
|
"    and type in ('PullRequestEvent','WatchEvent','ForkEvent','IssueCommentEvent')\n", |
|
|
|
"    group by type, repo_id;\n", |
|
|
|
"'''\n", |
|
|
|
"\n", |
|
|
|
"result = o.execute_sql(sql, hints={'odps.sql.allow.fullscan': 'true', 'odps.sql.submit.mode': 'script'})\n", |
|
|
|
"\n", |
|
|
|
"# Build the path with os.path.join so it is valid on every OS, and create\n", |
|
|
|
"# the output directory up front so open() cannot fail on a missing folder.\n", |
|
|
|
"os.makedirs('data', exist_ok=True)\n", |
|
|
|
"with open(os.path.join('data', 'count.txt'), 'w') as f:\n", |
|
|
|
"    with result.open_reader() as reader:\n", |
|
|
|
"        for record in reader:\n", |
|
|
|
"            # Fields are read straight from the record: binding a local named\n", |
|
|
|
"            # `type` would rebind the builtin for the whole kernel session.\n", |
|
|
|
"            f.write('type: {type}, repo_id: {repo_id}, count: {count}\\n'.format(\n", |
|
|
|
"                type=record['type'],\n", |
|
|
|
"                repo_id=record['repo_id'],\n", |
|
|
|
"                count=record['event_count']))" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 6, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"import os\n", |
|
|
|
"\n", |
|
|
|
"# Event types that are exported; one output file is written per type.\n", |
|
|
|
"EVENT_TYPES = ('PullRequestEvent', 'WatchEvent', 'ForkEvent', 'IssueCommentEvent')\n", |
|
|
|
"\n", |
|
|
|
"# graph_dict[event_type][actor_id] -> list of repo ids (as strings) for that actor.\n", |
|
|
|
"graph_dict = {event_type: {} for event_type in EVENT_TYPES}\n", |
|
|
|
"sql = '''\n", |
|
|
|
"    select type, repo_id, actor_id\n", |
|
|
|
"    from ods_github_log\n", |
|
|
|
"    where pt='20151001'\n", |
|
|
|
"    and type in ('PullRequestEvent','WatchEvent','ForkEvent','IssueCommentEvent')\n", |
|
|
|
"    group by type, repo_id, actor_id;\n", |
|
|
|
"'''\n", |
|
|
|
"result = o.execute_sql(sql, hints={'odps.sql.allow.fullscan': 'true', 'odps.sql.submit.mode': 'script'})\n", |
|
|
|
"with result.open_reader() as reader:\n", |
|
|
|
"    for record in reader:\n", |
|
|
|
"        # No local named `type`: that would rebind the builtin for the kernel session.\n", |
|
|
|
"        repos_by_actor = graph_dict[record['type']]\n", |
|
|
|
"        repos_by_actor.setdefault(record['actor_id'], []).append(str(record['repo_id']))\n", |
|
|
|
"\n", |
|
|
|
"# print(graph_dict)\n", |
|
|
|
"\n", |
|
|
|
"# Portable paths via os.path.join; the directory is created if it is missing.\n", |
|
|
|
"os.makedirs('data', exist_ok=True)\n", |
|
|
|
"for event_type in EVENT_TYPES:\n", |
|
|
|
"    with open(os.path.join('data', event_type + '.txt'), 'w') as f:\n", |
|
|
|
"        for actor_id, repo_ids in graph_dict[event_type].items():\n", |
|
|
|
"            # Actors linked to fewer than two repositories are not written.\n", |
|
|
|
"            if len(repo_ids) < 2:\n", |
|
|
|
"                continue\n", |
|
|
|
"            f.write('actor_id: {actor_id}\\nrepo_id: {list}\\n'.format(actor_id=actor_id, list=' '.join(repo_ids)))" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"metadata": { |
|
|
|
"interpreter": { |
|
|
|
"hash": "caac794e4b8e34bcc9a4d9e1a06492e263031294735d822cbf2db7854bb6c6da" |
|
|
|
}, |
|
|
|
"kernelspec": { |
|
|
|
"display_name": "Python 3.10.4 64-bit (windows store)", |
|
|
|
"language": "python", |
|
|
|
"name": "python3" |
|
|
|
}, |
|
|
|
"language_info": { |
|
|
|
"codemirror_mode": { |
|
|
|
"name": "ipython", |
|
|
|
"version": 3 |
|
|
|
}, |
|
|
|
"file_extension": ".py", |
|
|
|
"mimetype": "text/x-python", |
|
|
|
"name": "python", |
|
|
|
"nbconvert_exporter": "python", |
|
|
|
"pygments_lexer": "ipython3", |
|
|
|
"version": "3.10.4" |
|
|
|
}, |
|
|
|
"orig_nbformat": 4 |
|
|
|
}, |
|
|
|
"nbformat": 4, |
|
|
|
"nbformat_minor": 2 |
|
|
|
} |