{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "odps.Schema {\n", " login string \n", " created_at date \n", " database_id int64 \n", " location string \n", " company string \n", " bio string \n", " is_employee boolean \n", " email string \n", " infoname string \n", " followers string \n", " following string \n", " time date \n", " name string \n", " lastupdatedat date \n", " nextupdateat date \n", "}\n", "\n", "odps.Schema {\n", " id string \n", " type string \n", " action string \n", " actor_id int64 \n", " actor_login string \n", " repo_id int64 \n", " repo_name string \n", " org_id int64 \n", " org_login string \n", " created_at datetime \n", " issue_id int64 \n", " issue_number int32 \n", " issue_title string \n", " issue_body string \n", " issue_labels_name list<string> \n", " issue_labels_color list<string> \n", " issue_labels_default list<string> \n", " issue_labels_description list<string> \n", " issue_author_id int64 \n", " issue_author_login string \n", " issue_author_type string \n", " issue_author_association string \n", " issue_assignee_id int64 \n", " issue_assignee_login string \n", " issue_assignees_id list<string> \n", " issue_assignees_login list<string> \n", " issue_created_at datetime \n", " issue_updated_at datetime \n", " issue_comments int16 \n", " issue_closed_at datetime \n", " issue_comment_id int64 \n", " issue_comment_body string \n", " issue_comment_created_at datetime \n", " issue_comment_updated_at datetime \n", " issue_comment_author_association string \n", " issue_comment_author_id int64 \n", " issue_comment_author_login string \n", " issue_comment_author_type string \n", " pull_commits int16 \n", " pull_additions int16 \n", " pull_deletions int16 \n", " pull_changed_files int32 \n", " pull_merged int8 \n", " pull_merge_commit_sha string \n", " pull_merged_at datetime \n", " pull_merged_by_id int64 \n", " pull_merged_by_login string \n", " pull_merged_by_type string \n", " pull_requested_reviewer_id int64 \n", " pull_requested_reviewer_login string \n", " pull_requested_reviewer_type string \n", " pull_review_comments int16 \n", " repo_description string \n", " repo_size int32 \n", " repo_stargazers_count int32 \n", " repo_forks_count int32 \n", " repo_language string \n", " repo_has_issues int8 \n", " repo_has_projects int8 \n", " repo_has_downloads int8 \n", " repo_has_wiki int8 \n", " repo_has_pages int8 \n", " repo_license string \n", " repo_default_branch string \n", " repo_created_at datetime \n", " repo_updated_at datetime \n", " repo_pushed_at datetime \n", " pull_review_id int64 \n", " pull_review_comment_id int64 \n", " pull_review_comment_path string \n", " pull_review_comment_position string \n", " pull_review_comment_author_id int64 \n", " pull_review_comment_author_login string \n", " pull_review_comment_author_type string \n", " pull_review_comment_author_association string \n", " pull_review_comment_body string \n", " pull_review_comment_created_at datetime \n", " pull_review_comment_updated_at datetime \n", " push_id int64 \n", " push_size int32 \n", " push_distinct_size int32 \n", " push_ref string \n", " push_head string \n", " push_before string \n", " push_commits_name list<string> \n", " push_commits_email list<string> \n", " push_commits_message list<string> \n", " fork_forkee_id int64 \n", " fork_forkee_full_name string \n", " fork_forkee_owner_id int64 \n", " fork_forkee_owner_login string \n", " fork_forkee_owner_type string \n", " 
delete_ref string \n",
" delete_ref_type string \n",
" delete_pusher_type string \n",
" create_ref string \n",
" create_ref_type string \n",
" create_master_branch string \n",
" create_description string \n",
" create_pusher_type string \n",
" gollum_pages_page_name list<string> \n",
" gollum_pages_title list<string> \n",
" gollum_pages_action list<string> \n",
" member_login string \n",
" member_type string \n",
" member_id int64 \n",
" release_id int64 \n",
" release_tag_name string \n",
" release_target_commitish string \n",
" release_name string \n",
" release_draft int8 \n",
" release_author_id int64 \n",
" release_author_login string \n",
" release_author_type string \n",
" release_prerelease int8 \n",
" release_created_at datetime \n",
" release_published_at datetime \n",
" release_body string \n",
" release_assets_name list<string> \n",
" release_assets_uploader_login list<string> \n",
" release_assets_uploader_id list<string> \n",
" release_assets_content_type list<string> \n",
" release_assets_state list<string> \n",
" release_assets_size list<string> \n",
" release_assets_download_count list<string> \n",
" commit_comment_id int64 \n",
" commit_comment_author_id int64 \n",
" commit_comment_author_login string \n",
" commit_comment_author_type string \n",
" commit_comment_author_association string \n",
" commit_comment_body string \n",
" commit_comment_path string \n",
" commit_comment_position string \n",
" commit_comment_line string \n",
" commit_comment_created_at datetime \n",
" commit_comment_updated_at datetime \n",
" pt string \n",
"}\n",
"\n"
] } ],
"source": [
"# Connect to MaxCompute (ODPS) and inspect the schemas of the source tables.\n",
"from odps import ODPS\n",
"from odps import options\n",
"from odps.df import DataFrame\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"pd.set_option('display.max_rows', None)\n",
"\n",
"ACCESS_ID = 'LTAI5t9uwJrh5eJ7Q5E37D1s'\n",
"SECRET_ACCESS_KEY = 'NCFHOAnvqfnTrpypgR4b3cNawP8fnB'\n",
"ODPS_PROJECT = 'OpenDigger_prod_dev'\n",
"ODPS_ENDPOINT = 'http://service.cn-shanghai.maxcompute.aliyun.com/api'\n",
"\n",
"o = ODPS(ACCESS_ID, SECRET_ACCESS_KEY,\n",
"         project=ODPS_PROJECT, endpoint=ODPS_ENDPOINT)\n",
"options.tunnel.limit_instance_tunnel = False\n",
"# options.read_timeout = 10000000\n",
"\n",
"users = DataFrame(o.get_table('ods_github_users'))\n",
"print(users.dtypes)\n",
"\n",
"github_log = DataFrame(o.get_table('ods_github_log'))\n",
"print(github_log.dtypes)"
] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [
"# Count PullRequest / Watch / Fork / IssueComment events per repository for 2015-10-01.\n",
"sql = '''\n",
"    select type, count(repo_id) as cnt, repo_id from ods_github_log\n",
"    where pt='20151001'\n",
"    and type in ('PullRequestEvent','WatchEvent','ForkEvent','IssueCommentEvent')\n",
"    group by type, repo_id;\n",
"'''\n",
"\n",
"result = o.execute_sql(sql, hints={'odps.sql.allow.fullscan': 'true', 'odps.sql.submit.mode': 'script'})\n",
"with open('data/count.txt', 'w') as f:\n",
"    with result.open_reader() as reader:\n",
"        for record in reader:\n",
"            event_type = record['type']  # avoid shadowing the built-in type()\n",
"            count = record['cnt']\n",
"            repo_id = record['repo_id']\n",
"            f.write('type: {type}, repo_id: {repo_id}, count: {count}\\n'.format(\n",
"                type=event_type,\n",
"                repo_id=repo_id,\n",
"                count=count))"
] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
"# For each event type, collect the repos every actor touched on 2015-10-01.\n",
"event_types = ['PullRequestEvent', 'WatchEvent', 'ForkEvent', 'IssueCommentEvent']\n",
"graph_dict = {event_type: {} for event_type in event_types}\n",
"\n",
"sql = '''\n",
"    select type, repo_id, actor_id\n",
"    from ods_github_log\n",
"    where pt='20151001'\n",
"    and type in ('PullRequestEvent','WatchEvent','ForkEvent','IssueCommentEvent')\n",
"    group by type, repo_id, actor_id;\n",
"'''\n",
"result = o.execute_sql(sql, hints={'odps.sql.allow.fullscan': 'true', 'odps.sql.submit.mode': 'script'})\n",
"with result.open_reader() as reader:\n",
"    for record in reader:\n",
"        event_type = record['type']\n",
"        actor_id = record['actor_id']\n",
"        repo_id = record['repo_id']\n",
"        if actor_id not in graph_dict[event_type]:\n",
"            graph_dict[event_type][actor_id] = []\n",
"        graph_dict[event_type][actor_id].append(str(repo_id))\n",
"\n",
"# print(graph_dict)\n",
"\n",
"# Write one file per event type, keeping only actors that touched at least two repos.\n",
"for event_type in event_types:\n",
"    with open('data/{}.txt'.format(event_type), 'w') as f:\n",
"        for actor_id, repos in graph_dict[event_type].items():\n",
"            if len(repos) < 2:\n",
"                continue\n",
"            f.write('actor_id: {actor_id}\\nrepo_id: {list}\\n'.format(actor_id=actor_id, list=' '.join(repos)))"
] }
], "metadata": {
"interpreter": { "hash": "caac794e4b8e34bcc9a4d9e1a06492e263031294735d822cbf2db7854bb6c6da" },
"kernelspec": { "display_name": "Python 3.10.4 64-bit (windows store)", "language": "python", "name": "python3" },
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" },
"orig_nbformat": 4
}, "nbformat": 4, "nbformat_minor": 2 }