
Upload files to 'notebooks'

master · 郭腾 · 1 year ago · commit 5993035005
3 changed files with 308 additions and 0 deletions:

1. notebooks/data_mining.ipynb (+1, -0)
2. notebooks/data_pull.ipynb (+306, -0)
3. notebooks/learning.ipynb (+1, -0)

notebooks/data_mining.ipynb (+1, -0)
File diff suppressed because it is too large

notebooks/data_pull.ipynb (+306, -0)

@@ -0,0 +1,306 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"odps.Schema {\n",
" login string \n",
" created_at date \n",
" database_id int64 \n",
" location string \n",
" company string \n",
" bio string \n",
" is_employee boolean \n",
" email string \n",
" infoname string \n",
" followers string \n",
" following string \n",
" time date \n",
" name string \n",
" lastupdatedat date \n",
" nextupdateat date \n",
"}\n",
"\n",
"odps.Schema {\n",
" id string \n",
" type string \n",
" action string \n",
" actor_id int64 \n",
" actor_login string \n",
" repo_id int64 \n",
" repo_name string \n",
" org_id int64 \n",
" org_login string \n",
" created_at datetime \n",
" issue_id int64 \n",
" issue_number int32 \n",
" issue_title string \n",
" issue_body string \n",
" issue_labels_name list<string> \n",
" issue_labels_color list<string> \n",
" issue_labels_default list<string> \n",
" issue_labels_description list<string> \n",
" issue_author_id int64 \n",
" issue_author_login string \n",
" issue_author_type string \n",
" issue_author_association string \n",
" issue_assignee_id int64 \n",
" issue_assignee_login string \n",
" issue_assignees_id list<string> \n",
" issue_assignees_login list<string> \n",
" issue_created_at datetime \n",
" issue_updated_at datetime \n",
" issue_comments int16 \n",
" issue_closed_at datetime \n",
" issue_comment_id int64 \n",
" issue_comment_body string \n",
" issue_comment_created_at datetime \n",
" issue_comment_updated_at datetime \n",
" issue_comment_author_association string \n",
" issue_comment_author_id int64 \n",
" issue_comment_author_login string \n",
" issue_comment_author_type string \n",
" pull_commits int16 \n",
" pull_additions int16 \n",
" pull_deletions int16 \n",
" pull_changed_files int32 \n",
" pull_merged int8 \n",
" pull_merge_commit_sha string \n",
" pull_merged_at datetime \n",
" pull_merged_by_id int64 \n",
" pull_merged_by_login string \n",
" pull_merged_by_type string \n",
" pull_requested_reviewer_id int64 \n",
" pull_requested_reviewer_login string \n",
" pull_requested_reviewer_type string \n",
" pull_review_comments int16 \n",
" repo_description string \n",
" repo_size int32 \n",
" repo_stargazers_count int32 \n",
" repo_forks_count int32 \n",
" repo_language string \n",
" repo_has_issues int8 \n",
" repo_has_projects int8 \n",
" repo_has_downloads int8 \n",
" repo_has_wiki int8 \n",
" repo_has_pages int8 \n",
" repo_license string \n",
" repo_default_branch string \n",
" repo_created_at datetime \n",
" repo_updated_at datetime \n",
" repo_pushed_at datetime \n",
" pull_review_id int64 \n",
" pull_review_comment_id int64 \n",
" pull_review_comment_path string \n",
" pull_review_comment_position string \n",
" pull_review_comment_author_id int64 \n",
" pull_review_comment_author_login string \n",
" pull_review_comment_author_type string \n",
" pull_review_comment_author_association string \n",
" pull_review_comment_body string \n",
" pull_review_comment_created_at datetime \n",
" pull_review_comment_updated_at datetime \n",
" push_id int64 \n",
" push_size int32 \n",
" push_distinct_size int32 \n",
" push_ref string \n",
" push_head string \n",
" push_before string \n",
" push_commits_name list<string> \n",
" push_commits_email list<string> \n",
" push_commits_message list<string> \n",
" fork_forkee_id int64 \n",
" fork_forkee_full_name string \n",
" fork_forkee_owner_id int64 \n",
" fork_forkee_owner_login string \n",
" fork_forkee_owner_type string \n",
" delete_ref string \n",
" delete_ref_type string \n",
" delete_pusher_type string \n",
" create_ref string \n",
" create_ref_type string \n",
" create_master_branch string \n",
" create_description string \n",
" create_pusher_type string \n",
" gollum_pages_page_name list<string> \n",
" gollum_pages_title list<string> \n",
" gollum_pages_action list<string> \n",
" member_login string \n",
" member_type string \n",
" member_id int64 \n",
" release_id int64 \n",
" release_tag_name string \n",
" release_target_commitish string \n",
" release_name string \n",
" release_draft int8 \n",
" release_author_id int64 \n",
" release_author_login string \n",
" release_author_type string \n",
" release_prerelease int8 \n",
" release_created_at datetime \n",
" release_published_at datetime \n",
" release_body string \n",
" release_assets_name list<string> \n",
" release_assets_uploader_login list<string> \n",
" release_assets_uploader_id list<string> \n",
" release_assets_content_type list<string> \n",
" release_assets_state list<string> \n",
" release_assets_size list<string> \n",
" release_assets_download_count list<string> \n",
" commit_comment_id int64 \n",
" commit_comment_author_id int64 \n",
" commit_comment_author_login string \n",
" commit_comment_author_type string \n",
" commit_comment_author_association string \n",
" commit_comment_body string \n",
" commit_comment_path string \n",
" commit_comment_position string \n",
" commit_comment_line string \n",
" commit_comment_created_at datetime \n",
" commit_comment_updated_at datetime \n",
" pt string \n",
"}\n",
"\n"
]
}
],
"source": [
"from odps import ODPS\n",
"from odps import options\n",
"from odps.df import DataFrame\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"pd.set_option('display.max_rows',None)\n",
"\n",
"ACCESS_ID = 'LTAI5t9uwJrh5eJ7Q5E37D1s'\n",
"SECRET_ACCESS_KEY = 'NCFHOAnvqfnTrpypgR4b3cNawP8fnB'\n",
"ODPS_PROJECT = 'OpenDigger_prod_dev'\n",
"ODPS_ENDPOINT = 'http://service.cn-shanghai.maxcompute.aliyun.com/api'\n",
"\n",
"o = ODPS(ACCESS_ID, SECRET_ACCESS_KEY,\n",
" project=ODPS_PROJECT, endpoint=ODPS_ENDPOINT)\n",
"options.tunnel.limit_instance_tunnel = False\n",
"# options.read_timeout = 10000000\n",
"\n",
"users = DataFrame(o.get_table('ods_github_users'))\n",
"print(users.dtypes)\n",
"\n",
"github_log = DataFrame(o.get_table('ods_github_log'))\n",
"print(github_log.dtypes)"
]
},
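{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sanity-check sketch, not part of the original notebook: PyODPS tables support `head()`, which fetches a few records through the data tunnel without submitting a SQL job, so the connection and schema can be verified cheaply before running the partition-wide queries below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: peek at a few rows of the users table opened above.\n",
"# Table.head(n) reads n records directly via the tunnel.\n",
"for record in o.get_table('ods_github_users').head(3):\n",
"    print(record)"
]
},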
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"sql = '''\n",
" select type, count(repo_id), repo_id from ods_github_log\n",
" where pt='20151001'\n",
" and type in ('PullRequestEvent','WatchEvent','ForkEvent','IssueCommentEvent')\n",
" group by type, repo_id;\n",
"'''\n",
"\n",
"result = o.execute_sql(sql, hints={'odps.sql.allow.fullscan': 'true', 'odps.sql.submit.mode': 'script'})\n",
"with open('data\\count.txt', 'w') as f:\n",
" with result.open_reader() as reader:\n",
" for record in reader:\n",
" type = record['type']\n",
" count = record['_c1']\n",
" repo_id = record['repo_id'] \n",
" f.write('type: {type}, repo_id: {repo_id}, count: {count}\\n'.format(\n",
" type=type,\n",
" repo_id=repo_id,\n",
" count=count)) "
]
},
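{
"cell_type": "markdown",
"metadata": {},
"source": [
"The counts file written above uses a plain `key: value` line format. A minimal sketch, assuming the `data/count.txt` path from the previous cell, of parsing it back into a pandas DataFrame for later analysis:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Sketch: each line looks like\n",
"#   type: WatchEvent, repo_id: 123456, count: 42\n",
"line_re = re.compile(r'type: (\\S+), repo_id: (\\d+), count: (\\d+)')\n",
"rows = []\n",
"with open('data/count.txt') as f:\n",
"    for line in f:\n",
"        m = line_re.match(line)\n",
"        if m:  # skip malformed lines or missing repo_ids\n",
"            rows.append((m.group(1), int(m.group(2)), int(m.group(3))))\n",
"counts = pd.DataFrame(rows, columns=['type', 'repo_id', 'count'])\n",
"print(counts.groupby('type')['count'].describe())"
]
},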
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"graph_dict = {}\n",
"graph_dict['PullRequestEvent'] = {}\n",
"graph_dict['WatchEvent'] = {}\n",
"graph_dict['ForkEvent'] = {}\n",
"graph_dict['IssueCommentEvent'] = {}\n",
"sql = '''\n",
" select type, repo_id, actor_id\n",
" from ods_github_log\n",
" where pt='20151001'\n",
" and type in ('PullRequestEvent','WatchEvent','ForkEvent','IssueCommentEvent')\n",
" group by type, repo_id, actor_id;\n",
"'''\n",
"result = o.execute_sql(sql, hints={'odps.sql.allow.fullscan': 'true', 'odps.sql.submit.mode': 'script'})\n",
"with result.open_reader() as reader:\n",
" for record in reader:\n",
" type = record['type']\n",
" actor_id = record['actor_id']\n",
" repo_id = record['repo_id']\n",
" if actor_id not in graph_dict[type]:\n",
" graph_dict[type][actor_id] = []\n",
" graph_dict[type][actor_id].append(str(repo_id))\n",
"\n",
"# print(graph_dict)\n",
"\n",
"with open('data\\PullRequestEvent.txt', 'w') as f:\n",
" for key in graph_dict['PullRequestEvent']:\n",
" if len(graph_dict['PullRequestEvent'][key]) < 2:\n",
" continue\n",
" f.write('actor_id: {actor_id}\\nrepo_id: {list}\\n'.format(actor_id=key, list=' '.join(graph_dict['PullRequestEvent'][key])))\n",
"\n",
"with open('data\\WatchEvent.txt', 'w') as f:\n",
" for key in graph_dict['WatchEvent']:\n",
" if len(graph_dict['WatchEvent'][key]) < 2:\n",
" continue\n",
" f.write('actor_id: {actor_id}\\nrepo_id: {list}\\n'.format(actor_id=key, list=' '.join(graph_dict['WatchEvent'][key])))\n",
"\n",
"with open('data\\ForkEvent.txt', 'w') as f:\n",
" for key in graph_dict['ForkEvent']:\n",
" if len(graph_dict['ForkEvent'][key]) < 2:\n",
" continue\n",
" f.write('actor_id: {actor_id}\\nrepo_id: {list}\\n'.format(actor_id=key, list=' '.join(graph_dict['ForkEvent'][key])))\n",
"\n",
"with open('data\\IssueCommentEvent.txt', 'w') as f:\n",
" for key in graph_dict['IssueCommentEvent']:\n",
" if len(graph_dict['IssueCommentEvent'][key]) < 2:\n",
" continue\n",
" f.write('actor_id: {actor_id}\\nrepo_id: {list}\\n'.format(actor_id=key, list=' '.join(graph_dict['IssueCommentEvent'][key])))"
]
},
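{
"cell_type": "markdown",
"metadata": {},
"source": [
"A possible follow-up sketch, not part of the original analysis: collapse each actor's repo list in `graph_dict` into weighted repo-to-repo co-occurrence edges, a common starting point for downstream graph mining (shown here for `WatchEvent` only)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from itertools import combinations\n",
"from collections import Counter\n",
"\n",
"# Sketch: every actor who touched both repos of a pair adds weight 1\n",
"# to the undirected edge between them.\n",
"edge_weights = Counter()\n",
"for repos in graph_dict['WatchEvent'].values():\n",
"    for a, b in combinations(sorted(set(repos)), 2):\n",
"        edge_weights[(a, b)] += 1\n",
"\n",
"print(edge_weights.most_common(10))"
]
}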
],
"metadata": {
"interpreter": {
"hash": "caac794e4b8e34bcc9a4d9e1a06492e263031294735d822cbf2db7854bb6c6da"
},
"kernelspec": {
"display_name": "Python 3.10.4 64-bit (windows store)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

notebooks/learning.ipynb (+1, -0)
File diff suppressed because it is too large

