@ -0,0 +1,306 @@ |
{ |
"cells": [ |
{ |
"cell_type": "code", |
"execution_count": 1, |
"metadata": {}, |
"outputs": [ |
{ |
"name": "stdout", |
"output_type": "stream", |
"text": [ |
"odps.Schema {\n", |
" login string \n", |
" created_at date \n", |
" database_id int64 \n", |
" location string \n", |
" company string \n", |
" bio string \n", |
" is_employee boolean \n", |
" email string \n", |
" infoname string \n", |
" followers string \n", |
" following string \n", |
" time date \n", |
" name string \n", |
" lastupdatedat date \n", |
" nextupdateat date \n", |
"}\n", |
"\n", |
"odps.Schema {\n", |
" id string \n", |
" type string \n", |
" action string \n", |
" actor_id int64 \n", |
" actor_login string \n", |
" repo_id int64 \n", |
" repo_name string \n", |
" org_id int64 \n", |
" org_login string \n", |
" created_at datetime \n", |
" issue_id int64 \n", |
" issue_number int32 \n", |
" issue_title string \n", |
" issue_body string \n", |
" issue_labels_name list<string> \n", |
" issue_labels_color list<string> \n", |
" issue_labels_default list<string> \n", |
" issue_labels_description list<string> \n", |
" issue_author_id int64 \n", |
" issue_author_login string \n", |
" issue_author_type string \n", |
" issue_author_association string \n", |
" issue_assignee_id int64 \n", |
" issue_assignee_login string \n", |
" issue_assignees_id list<string> \n", |
" issue_assignees_login list<string> \n", |
" issue_created_at datetime \n", |
" issue_updated_at datetime \n", |
" issue_comments int16 \n", |
" issue_closed_at datetime \n", |
" issue_comment_id int64 \n", |
" issue_comment_body string \n", |
" issue_comment_created_at datetime \n", |
" issue_comment_updated_at datetime \n", |
" issue_comment_author_association string \n", |
" issue_comment_author_id int64 \n", |
" issue_comment_author_login string \n", |
" issue_comment_author_type string \n", |
" pull_commits int16 \n", |
" pull_additions int16 \n", |
" pull_deletions int16 \n", |
" pull_changed_files int32 \n", |
" pull_merged int8 \n", |
" pull_merge_commit_sha string \n", |
" pull_merged_at datetime \n", |
" pull_merged_by_id int64 \n", |
" pull_merged_by_login string \n", |
" pull_merged_by_type string \n", |
" pull_requested_reviewer_id int64 \n", |
" pull_requested_reviewer_login string \n", |
" pull_requested_reviewer_type string \n", |
" pull_review_comments int16 \n", |
" repo_description string \n", |
" repo_size int32 \n", |
" repo_stargazers_count int32 \n", |
" repo_forks_count int32 \n", |
" repo_language string \n", |
" repo_has_issues int8 \n", |
" repo_has_projects int8 \n", |
" repo_has_downloads int8 \n", |
" repo_has_wiki int8 \n", |
" repo_has_pages int8 \n", |
" repo_license string \n", |
" repo_default_branch string \n", |
" repo_created_at datetime \n", |
" repo_updated_at datetime \n", |
" repo_pushed_at datetime \n", |
" pull_review_id int64 \n", |
" pull_review_comment_id int64 \n", |
" pull_review_comment_path string \n", |
" pull_review_comment_position string \n", |
" pull_review_comment_author_id int64 \n", |
" pull_review_comment_author_login string \n", |
" pull_review_comment_author_type string \n", |
" pull_review_comment_author_association string \n", |
" pull_review_comment_body string \n", |
" pull_review_comment_created_at datetime \n", |
" pull_review_comment_updated_at datetime \n", |
" push_id int64 \n", |
" push_size int32 \n", |
" push_distinct_size int32 \n", |
" push_ref string \n", |
" push_head string \n", |
" push_before string \n", |
" push_commits_name list<string> \n", |
" push_commits_email list<string> \n", |
" push_commits_message list<string> \n", |
" fork_forkee_id int64 \n", |
" fork_forkee_full_name string \n", |
" fork_forkee_owner_id int64 \n", |
" fork_forkee_owner_login string \n", |
" fork_forkee_owner_type string \n", |
" delete_ref string \n", |
" delete_ref_type string \n", |
" delete_pusher_type string \n", |
" create_ref string \n", |
" create_ref_type string \n", |
" create_master_branch string \n", |
" create_description string \n", |
" create_pusher_type string \n", |
" gollum_pages_page_name list<string> \n", |
" gollum_pages_title list<string> \n", |
" gollum_pages_action list<string> \n", |
" member_login string \n", |
" member_type string \n", |
" member_id int64 \n", |
" release_id int64 \n", |
" release_tag_name string \n", |
" release_target_commitish string \n", |
" release_name string \n", |
" release_draft int8 \n", |
" release_author_id int64 \n", |
" release_author_login string \n", |
" release_author_type string \n", |
" release_prerelease int8 \n", |
" release_created_at datetime \n", |
" release_published_at datetime \n", |
" release_body string \n", |
" release_assets_name list<string> \n", |
" release_assets_uploader_login list<string> \n", |
" release_assets_uploader_id list<string> \n", |
" release_assets_content_type list<string> \n", |
" release_assets_state list<string> \n", |
" release_assets_size list<string> \n", |
" release_assets_download_count list<string> \n", |
" commit_comment_id int64 \n", |
" commit_comment_author_id int64 \n", |
" commit_comment_author_login string \n", |
" commit_comment_author_type string \n", |
" commit_comment_author_association string \n", |
" commit_comment_body string \n", |
" commit_comment_path string \n", |
" commit_comment_position string \n", |
" commit_comment_line string \n", |
" commit_comment_created_at datetime \n", |
" commit_comment_updated_at datetime \n", |
" pt string \n", |
"}\n", |
"\n" |
] |
} |
], |
"source": [ |
"import os\n", |
"from pathlib import Path\n", |
"\n", |
"import numpy as np\n", |
"import pandas as pd\n", |
"from odps import ODPS\n", |
"from odps import options\n", |
"from odps.df import DataFrame\n", |
"\n", |
"pd.set_option('display.max_rows',None)\n", |
"\n", |
"ACCESS_ID = 'LTAI5t9uwJrh5eJ7Q5E37D1s'\n", |
"SECRET_ACCESS_KEY = 'NCFHOAnvqfnTrpypgR4b3cNawP8fnB'\n", |
"ODPS_PROJECT = 'OpenDigger_prod_dev'\n", |
"ODPS_ENDPOINT = 'http://service.cn-shanghai.maxcompute.aliyun.com/api'\n", |
"\n", |
" project=ODPS_PROJECT, endpoint=ODPS_ENDPOINT)\n", |
"options.tunnel.limit_instance_tunnel = False\n", |
"# options.read_timeout = 10000000\n", |
"\n", |
"users = DataFrame(o.get_table('ods_github_users'))\n", |
"print(users.dtypes)\n", |
"\n", |
"github_log = DataFrame(o.get_table('ods_github_log'))\n", |
"print(github_log.dtypes)" |
] |
}, |
{ |
"cell_type": "code", |
"execution_count": 8, |
"metadata": {}, |
"outputs": [], |
"source": [ |
"sql = '''\n", |
" select type, count(repo_id), repo_id from ods_github_log\n", |
" where pt='20151001'\n", |
" and type in ('PullRequestEvent','WatchEvent','ForkEvent','IssueCommentEvent')\n", |
" group by type, repo_id;\n", |
"'''\n", |
"\n", |
"result = o.execute_sql(sql, hints={'odps.sql.allow.fullscan': 'true', 'odps.sql.submit.mode': 'script'})\n", |
"with open('data\\count.txt', 'w') as f:\n", |
" with result.open_reader() as reader:\n", |
" for record in reader:\n", |
" type = record['type']\n", |
" count = record['_c1']\n", |
" repo_id = record['repo_id'] \n", |
" f.write('type: {type}, repo_id: {repo_id}, count: {count}\\n'.format(\n", |
" type=type,\n", |
" repo_id=repo_id,\n", |
" count=count)) " |
] |
}, |
{ |
"cell_type": "code", |
"execution_count": 6, |
"metadata": {}, |
"outputs": [], |
"source": [ |
"graph_dict = {}\n", |
"graph_dict['PullRequestEvent'] = {}\n", |
"graph_dict['WatchEvent'] = {}\n", |
"graph_dict['ForkEvent'] = {}\n", |
"graph_dict['IssueCommentEvent'] = {}\n", |
"sql = '''\n", |
" select type, repo_id, actor_id\n", |
" from ods_github_log\n", |
" where pt='20151001'\n", |
" and type in ('PullRequestEvent','WatchEvent','ForkEvent','IssueCommentEvent')\n", |
" group by type, repo_id, actor_id;\n", |
"'''\n", |
"result = o.execute_sql(sql, hints={'odps.sql.allow.fullscan': 'true', 'odps.sql.submit.mode': 'script'})\n", |
"with result.open_reader() as reader:\n", |
" for record in reader:\n", |
" type = record['type']\n", |
" actor_id = record['actor_id']\n", |
" repo_id = record['repo_id']\n", |
" if actor_id not in graph_dict[type]:\n", |
" graph_dict[type][actor_id] = []\n", |
" graph_dict[type][actor_id].append(str(repo_id))\n", |
"\n", |
"# print(graph_dict)\n", |
"\n", |
"with open('data\\PullRequestEvent.txt', 'w') as f:\n", |
" for key in graph_dict['PullRequestEvent']:\n", |
" if len(graph_dict['PullRequestEvent'][key]) < 2:\n", |
" continue\n", |
" f.write('actor_id: {actor_id}\\nrepo_id: {list}\\n'.format(actor_id=key, list=' '.join(graph_dict['PullRequestEvent'][key])))\n", |
"\n", |
"with open('data\\WatchEvent.txt', 'w') as f:\n", |
" for key in graph_dict['WatchEvent']:\n", |
" if len(graph_dict['WatchEvent'][key]) < 2:\n", |
" continue\n", |
" f.write('actor_id: {actor_id}\\nrepo_id: {list}\\n'.format(actor_id=key, list=' '.join(graph_dict['WatchEvent'][key])))\n", |
"\n", |
"with open('data\\ForkEvent.txt', 'w') as f:\n", |
" for key in graph_dict['ForkEvent']:\n", |
" if len(graph_dict['ForkEvent'][key]) < 2:\n", |
" continue\n", |
" f.write('actor_id: {actor_id}\\nrepo_id: {list}\\n'.format(actor_id=key, list=' '.join(graph_dict['ForkEvent'][key])))\n", |
"\n", |
"with open('data\\IssueCommentEvent.txt', 'w') as f:\n", |
" for key in graph_dict['IssueCommentEvent']:\n", |
" if len(graph_dict['IssueCommentEvent'][key]) < 2:\n", |
" continue\n", |
" f.write('actor_id: {actor_id}\\nrepo_id: {list}\\n'.format(actor_id=key, list=' '.join(graph_dict['IssueCommentEvent'][key])))" |
] |
} |
], |
"metadata": { |
"interpreter": { |
"hash": "caac794e4b8e34bcc9a4d9e1a06492e263031294735d822cbf2db7854bb6c6da" |
}, |
"kernelspec": { |
"display_name": "Python 3.10.4 64-bit (windows store)", |
"language": "python", |
"name": "python3" |
}, |
"language_info": { |
"codemirror_mode": { |
"name": "ipython", |
"version": 3 |
}, |
"file_extension": ".py", |
"mimetype": "text/x-python", |
"name": "python", |
"nbconvert_exporter": "python", |
"pygments_lexer": "ipython3", |
"version": "3.10.4" |
}, |
"orig_nbformat": 4 |
}, |
"nbformat": 4, |
"nbformat_minor": 2 |
} |