@@ -0,0 +1,182 @@
## 1 Platform Deployment
#### 1.1 Deployment procedure
1. Copy the /data directory from the original server to /data on the new server:
on the machine holding the data, change to the directory containing it and run `scp -r ./data <new username>@<new ip>:/`
2. Copy the docker-compose.yml file to the new server.
3. Change to the directory containing the yml file and run `sudo docker-compose up -d`
#### 1.2 docker-compose.yml
```
version: "3" | |||
services: | |||
oj-redis: | |||
image: redis:4.0-alpine | |||
container_name: oj-redis | |||
restart: always | |||
volumes: | |||
- /data/data/redis:/data | |||
oj-postgres: | |||
image: postgres:10-alpine | |||
container_name: oj-postgres | |||
restart: always | |||
command: postgres -c max_connections=1000 | |||
ports: | |||
- "127.0.0.1:12348:5432" | |||
volumes: | |||
- /data/data/postgres:/var/lib/postgresql/data | |||
environment: | |||
- POSTGRES_DB=onlinejudge | |||
- POSTGRES_USER=onlinejudge | |||
- POSTGRES_PASSWORD=onlinejudge | |||
judge-server: | |||
image: registry.cn-hangzhou.aliyuncs.com/wsl/judge_server | |||
container_name: judge-server | |||
restart: always | |||
read_only: true | |||
cap_drop: | |||
- SETPCAP | |||
- MKNOD | |||
- NET_BIND_SERVICE | |||
- SYS_CHROOT | |||
- SETFCAP | |||
- FSETID | |||
tmpfs: | |||
- /tmp | |||
volumes: | |||
- /data/backend/test_case:/test_case:ro | |||
- /data/judge_server/log:/log | |||
- /data/judge_server/run:/judger | |||
environment: | |||
- SERVICE_URL=http://judge-server:8080 | |||
- BACKEND_URL=http://oj-backend:8000/api/judge_server_heartbeat/ | |||
- TOKEN=DASETALENT | |||
oj-backend: | |||
image: registry.cn-hangzhou.aliyuncs.com/wsl/oj_backend | |||
container_name: oj-backend | |||
restart: always | |||
depends_on: | |||
- oj-redis | |||
- oj-postgres | |||
- judge-server | |||
volumes: | |||
- /data/data/backend:/data | |||
- /data/data/app:/app | |||
environment: | |||
- POSTGRES_DB=onlinejudge | |||
- POSTGRES_USER=onlinejudge | |||
- POSTGRES_PASSWORD=onlinejudge | |||
- JUDGE_SERVER_TOKEN=DASETALENT | |||
# - FORCE_HTTPS=1 | |||
# - STATIC_CDN_HOST=cdn.oj.com | |||
ports: | |||
- "0.0.0.0:80:8000" | |||
- "0.0.0.0:443:1443" | |||
```
## 2 Problem Creation Requirements
#### 2.1 Problem statement
The statement must be accurate and unambiguous.
The input data ranges must be given explicitly.
#### 2.2 Test cases
The samples given in the problem statement must not appear in the test cases.
Among the test cases, small cases should make up no more than 20%, medium-sized cases 40%~60%, and the rest should be close to the maximum data range.
#### 2.3 Reference solution
The problem setter must prepare a reference program whose correctness is assured.
## 3 Problem Verification Requirements
#### 3.1 Different languages
Check whether the same algorithm gives the same verdict in different languages.
Avoid cases where, for example, a C implementation passes while the same algorithm written in Python times out.
#### 3.2 Verification process
At least two people verify each problem.
The problem setter submits the reference solution to confirm that the test cases are correct.
A non-setter follows the normal solving workflow: read the statement, check for ambiguity or unclear wording, then write code and submit it for testing.
Implement the reference solution in other languages and test for the issue described in 3.1; if necessary, the problem's time limit can be adjusted.
## 4 Machine Exam Procedure
Preparation before the exam
1. Create the contest on the main exam OJ platform: set a password, turn off real time rank, and set the contest to invisible. Create a new (backup) server.
2. Create the problems, following section 2 (Problem Creation Requirements).
3. Verify the problems, following section 3 (Problem Verification Requirements). Verification requires the contest to be visible, so make sure the password is set first, and restore the invisible state as soon as verification is finished.
4. Back up the main platform to the new server, start the services, and verify that they work. See section 1 (Platform Deployment).
Shortly before the exam starts
5. Confirm that the password is set, real time rank is off, and the contest is invisible.
Immediately before the start
6. Set the contest to visible so that candidates can reach the contest password page.
7. Announce the contest password.
During the exam
8. Monitor platform availability. If the service misbehaves, first log into the server and run `sudo docker-compose restart` for a hot restart. If that does not recover the service, go to step 9; if it does, continue the exam and skip step 9.
9. Move candidates to the backup server to continue the exam.
Right after the exam ends
10. Change the contest password and set the contest to invisible.
Score statistics
11. See 机试结束后续流程.pdf.
The database-connection part of that document changes as follows:
Use an SSH tunnel:
<img src="image/image-20230328110921250.png" alt="image-20230328110921250" style="zoom:50%;" />
Connect to the database:
<img src="image/image-20230328111052680.png" alt="image-20230328111052680" style="zoom:50%;" />
The database name, user name, and password are all onlinejudge.
## After the exam, manually create a permanent snapshot.
@@ -0,0 +1,160 @@
The certificates are in the cert folder; the 9085819__XXX certificate expires on Saturday, January 13, 2024 at 07:59:59.
It must be replaced once a year!!!
Ask the student who manages this on the 水杉 side for a new certificate.
#### 1 天梯 certificate deployment
/src            天梯 project directory
/src/cert       certificate directory
/src/.env       天梯 environment variable file
Steps:
1. Put the valid certificate into /src/cert.
2. Edit the /src/.env file:
```
...
NGINX_PORT=80
SSL_PORT=443
## Change the file names in these two lines to the name of the new valid certificate
SSL_CERTIFICATE_N=/app/certs/9085819__shuishan.net.cn.pem
SSL_CERTIFICATE_KEY_N=/app/certs/9085819__shuishan.net.cn.key
##
...
```
3. Restart the 天梯 project: `sudo docker-compose stop && sudo docker-compose start`
4. Visit mladder.shuishan.net.cn and check that it is reachable.
#### 2 校场 certificate deployment
/jcdata                                        校场 data directory (use the actual path)
/jcdata/backend/ssl                            certificate directory
/jcdata/backend_app/deploy/nginx/nginx.conf    nginx configuration file
Steps:
1. Put the valid certificate into /jcdata/backend/ssl.
2. Edit the /jcdata/backend_app/deploy/nginx/nginx.conf file:
```
...
server {
    listen 1443 ssl http2 default_server;
    server_name _;
    ssl_certificate /data/ssl/9085819__shuishan.net.cn.pem;
    ssl_certificate_key /data/ssl/9085819__shuishan.net.cn.key;
    # Change the two lines above: /data/ssl is the path inside the container
    # (/jcdata/backend/ssl on the host is mounted as /data/ssl), so only the
    # file names at the end need to change.
    ssl_protocols TLSv1.2;
    ssl_ciphers ...
}
}
```
校场 docker-compose.yml, for reference:
```
version: "3" | |||
services: | |||
oj-redis: | |||
image: redis:4.0-alpine | |||
container_name: oj-redis | |||
restart: always | |||
volumes: | |||
- /jcdata/redis:/data | |||
oj-postgres: | |||
image: postgres:10-alpine | |||
container_name: oj-postgres | |||
restart: always | |||
volumes: | |||
- /data/jcdb:/var/lib/postgresql/data | |||
environment: | |||
- POSTGRES_DB=onlinejudge | |||
- POSTGRES_USER=onlinejudge | |||
- POSTGRES_PASSWORD=onlinejudge | |||
judge-server: | |||
image: dasetalent/judgeserver:v2.1 | |||
container_name: judge-server | |||
restart: always | |||
read_only: true | |||
cap_drop: | |||
- SETPCAP | |||
- MKNOD | |||
- NET_BIND_SERVICE | |||
- SYS_CHROOT | |||
- SETFCAP | |||
- FSETID | |||
tmpfs: | |||
- /tmp | |||
volumes: | |||
- /jcdata/backend/test_case:/test_case:ro | |||
- /jcdata/judge_server/log:/log | |||
- /jcdata/judge_server/run:/judger | |||
environment: | |||
- SERVICE_URL=http://judge-server:8080 | |||
- BACKEND_URL=http://oj-backend:8000/api/judge_server_heartbeat/ | |||
- TOKEN=CHANGE_THIS | |||
oj-backend: | |||
image: registry.cn-shanghai.aliyuncs.com/shuishan-data/shuishan-oj-backend:aliyun | |||
container_name: oj-backend | |||
restart: always | |||
depends_on: | |||
- oj-redis | |||
- oj-postgres | |||
- judge-server | |||
volumes: | |||
- /jcdata/backend_app:/app | |||
- /jcdata/backend:/data | |||
environment: | |||
- POSTGRES_DB=onlinejudge | |||
- POSTGRES_USER=onlinejudge | |||
- POSTGRES_PASSWORD=onlinejudge | |||
- JUDGE_SERVER_TOKEN=CHANGE_THIS | |||
# - FORCE_HTTPS=1 | |||
# - STATIC_CDN_HOST=cdn.oj.com | |||
ports: | |||
- "0.0.0.0:80:8000" | |||
- "0.0.0.0:443:1443" | |||
```
3. Restart 校场: `sudo docker-compose stop && sudo docker-compose start`
4. Visit judgefield.shuishan.net.cn and check that it is reachable.
@@ -0,0 +1,28 @@
## The S3 token is valid for one year and must be renewed every year
Go to https://edu.ucloud.cn/
The account credentials are listed in dasetalent_host.md.
1. Select the project 实验室-陆雪松.

Open 对象存储&CDN (Object Storage & CDN).

Click 令牌管理 (token management) in the tab bar.

Click the 查看/编辑 (view/edit) button.

Click 重新设置 (reset), set the validity to one year, and confirm.
@@ -0,0 +1,150 @@
# 水杉天梯: Teaching Assistant Manual
This manual walks through how a TA creates an assignment, uploads the evaluation file the assignment needs, and uploads the assignment's dataset, and explains how to update an uploaded dataset and edit an assignment that has already been created. For multi-phase result submission, it also defines the file naming convention and the packaging function used in the template file.
## 1. Creating an Assignment
### Assignment description
This part explains what each field in the form means and how to fill it in.

**作业名称——中文名** (Chinese name): the assignment's display name.
**作业名称——英文名** (English name): the name of the **zip archive** generated when the form is submitted.
**图标** (icon): if none is uploaded, the default logo is used as the assignment logo. (Only common image formats are supported.)
### Web Page


**概述 (Overview), 数据描述 (Get Data), 评估 (Evaluation) and 限制条件 (Terms and Conditions)** use a rich-text editor: the text in these boxes can be bolded, italicized, resized, colored, and so on.

The content of these text boxes is rendered as **HTML** on the generated assignment page. The mapping is:
**概述** corresponds to **Overview**; **评估** corresponds to **Evaluation**; **限制条件** corresponds to **Terms and Conditions**; **数据描述** corresponds to **Get Data**. They are circled in red in the figures below.


### Multi-phase
A multi-phase mechanism allows one assignment to be submitted in several phases.
**(Explanation: the phases can be thought of as parts of one assignment, for example implementing an SVM and a decision tree for the same classification task; each phase then has its own leaderboard.)**

**每日最高提交数** (max submissions per day): the maximum number of submissions allowed per day for this assignment.
**最高提交数** (max submissions): the total number of submissions allowed between the start and the end of the assignment.
**阶段数量** (number of phases): how many phases the assignment has. After filling it in, click confirm and the corresponding number of phase forms is generated.

After clicking confirm, one content box appears per phase; each phase requires:
**代码评测文件** (evaluation script): a .py file written by the TA; click the hyperlink for a reference example.
**作业的参考预测文件** (reference prediction file): the ground-truth result file the TA uploads for scoring.
**每个阶段的开始时间** (phase start time): when students may start submitting files for this phase.

The generated assignment is divided into the phases; clicking a phase lets students submit the files required for that phase.
**Additional notes on the example evaluation script**:
**File locating**: if the evaluation is run by this system, this piece of code needs no change; for local testing, simply change the paths.

**File reading**: the script reads the student result (prediction.txt) and the reference result (true.txt). Note that students must be required to use a fixed name for their result file (this can be written into the Jupyter template file), and the reference result's file name must match the file uploaded when the assignment was created.



Both files should use the same data format; adapt the custom read_txt function to read other formats.

**Scoring**: write a custom calculate_metric function for whatever metric is required (if several metrics are needed, extend it to produce several scores). The example below computes accuracy.


**Result output**: the results must be written to a file with the fixed name scores.txt; this cannot be changed. Each line of the output file holds the score of one metric, and the metric name and number of decimal places must match what was entered in the leaderboard section when creating the assignment.


**After writing Evaluate.py, please run it locally first and upload it only after the output file is correct!**
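For orientation, here is a minimal sketch of what such an evaluation script could look like. The file paths, the one-label-per-line format, and the metric label ACC are illustrative assumptions; keep the file-locating part of the real reference script (linked in the form) when the system runs the evaluation.

```
# Minimal sketch of an Evaluate.py (assumed layout, for a local dry run only).
import os

# --- file locating (assumption: plain local paths) ---
prediction_file = "prediction.txt"   # student result, fixed name required of students
truth_file = "true.txt"              # reference result uploaded when creating the assignment
output_file = "scores.txt"           # required fixed name for the score output

def read_txt(path):
    """Read one label per line; adapt this to the actual data format."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def calculate_metric(y_pred, y_true):
    """Example metric: accuracy. Extend to return several scores if needed."""
    correct = sum(p == t for p, t in zip(y_pred, y_true))
    return correct / len(y_true)

if __name__ == "__main__":
    pred = read_txt(prediction_file)
    true = read_txt(truth_file)
    acc = calculate_metric(pred, true)
    # One metric per line; label and decimal places must match the leaderboard settings.
    with open(output_file, "w") as f:
        f.write("ACC: %.4f\n" % acc)
```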
### Leaderboard

**评估指标个数** (number of metrics): how many evaluation metrics the assignment uses; some assignments need more than one. Filling this in generates the metric boxes shown below.
**评估标签** (metric label): the metric's name, e.g. ACC, Precision, Recall.
**数据格式** (numeric format): how many decimal places to keep.
**排序** (sorting): whether the metric is ranked in ascending or descending order; only **asc or desc** is accepted.
### Other

**作业权限** (assignment access): whether joining the assignment requires TA approval. If yes, participation must be approved by a TA in the backend; if no, join requests are approved automatically.
开始时间 / 截止时间 (start / end time): when the assignment starts and ends.

Click 生成zip文件 (generate zip file) and a zip archive is produced via the blue hyperlink shown above. (**Every field in the form except the icon is required.**)
After downloading the archive, click 上传zip文件 (upload zip file) to create a new assignment. (**The archive's file name must not contain Chinese characters, spaces, etc.**)

After a successful upload, click the blue view link to see the assignment's details.
## 2. Editing an Assignment
To edit an assignment, open its detail page and click 选项 → 编辑 (Options → Edit).

Note: the edit page contains a large number of forms; only the parts a TA is likely to change are described here. For deeper editing, see
https://github.com/codalab/codalab-competitions/wiki/Organizer_Codalab-competition-YAML-definition-language
and modify the fields accordingly.
### Assignment description

See the assignment description part of section 1 (Creating an Assignment).
**Title** corresponds to the **assignment's Chinese name**.
**Description** corresponds to **描述** (description).
### Web Page

Edit the Web Pages; the mapping is the same as in the Web Page part of section 1.
### Multi-phase

**Start Date (UTC)**: change the start time of each phase.
**Maximum Submissions (per User)**: change the total number of submissions allowed for the assignment.
**Max Submissions (per User) per day**: change the maximum number of submissions allowed per user per day.
### Leaderboard


**Key** and **Label** must be **identical**; they correspond to **评估标签** (metric label).
**Numeric format** corresponds to **数据格式** (numeric format).
**Sorting** corresponds to **排序** (sorting).
### Other

**Organizers need to approve the new teams** corresponds to 作业权限 (assignment access) from section 1.
**Anonymous leaderboard**: whether user names on the leaderboard are anonymized.

**Disallow leaderboard modifying**: whether submissions can be modified.
**Force submission to leaderboard**: whether students have to push their results to the leaderboard manually.

**Registration Required**: whether students need TA approval before they can join the assignment.

If a submission scores better than previous ones, the current result is automatically pushed to the leaderboard.
## 3. Uploading a Dataset
Click the 上传数据集 (upload dataset) button to open the panel.


Click 选择文件 (choose file) to select the dataset archive, then click the submit button.
Upload progress is shown below the submit button, and a dialog confirms when the upload is complete.

Dataset format:
The archive must contain an input folder holding the dataset, and it may also contain a template notebook file.
Reference layout:

Note that when zipping, select all files and folders at this level and compress them; do not compress the parent directory itself.
After uploading, the directory as seen by students looks like this:

The read_only_sample file is read-only and serves as a backup.
For multi-phase submission, note that an extra output folder is required; its structure is shown below:


Add as many phase_x folders as there are phases; the phase_x folder stores the output of the student's code for phase x (a sketch of building such an archive follows below).
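The sketch below shows one way to build such an archive with Python's zipfile module so that input/, the template notebook, and output/phase_x/ end up at the top level of the zip rather than inside an extra folder; the directory and file names are placeholders.

```
# Sketch: package a dataset directory so that input/, the template notebook and
# output/phase_x/ sit at the top level of the zip (names here are placeholders).
import os
import zipfile

dataset_dir = "my_dataset"      # contains input/, template.ipynb, output/phase_1/, ...
archive = "my_dataset.zip"

with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zf:
    for root, dirs, files in os.walk(dataset_dir):
        for name in files:
            path = os.path.join(root, name)
            # arcname is relative to dataset_dir, so the zip has no extra top-level folder
            zf.write(path, arcname=os.path.relpath(path, dataset_dir))
        for name in dirs:
            # keep empty folders such as output/phase_1/ in the archive
            path = os.path.join(root, name)
            if not os.listdir(path):
                zf.write(path, arcname=os.path.relpath(path, dataset_dir) + "/")
```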
## 4. Resetting the Dataset
Clicking the 重置Jupyter (reset Jupyter) button deletes the notebook associated with this assignment; when students open the notebook again, the most recently uploaded dataset is used as the template.

When to use it: the uploaded dataset was wrong and needs to be replaced, but students have already opened the notebook. Re-uploading the dataset does not update the files of notebooks that are already open; clicking this button solves that.


## 5. Downloading Student Code

Click the 学生代码 (student code) button to download.

The code of all students can be downloaded; each student's files are named after their student ID.

Any notebook or .py files written by a student are in that student's directory.
## 6. The Template Notebook File
The template file contains three parts:
1. The program logic for the specific problem.

2. The output code: output_string holds the program's output, phase_id is the phase ID, and each phase's output is stored separately.

3. The packaging code.

Only the first part normally needs to change; the last two parts are fixed code that makes students output and package their results the way the system expects.

Clicking the submit button submits prediction_phase_1.zip to the phase 1 leaderboard.
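As a rough illustration only (the real template cells shown in the screenshots are authoritative), the fixed output and packaging parts could look like the following sketch; the prediction.txt name and the output/phase_x path are assumptions.

```
# Rough illustration of the fixed output/packaging cells (names and paths are
# assumptions; follow the actual template file).
import os
import zipfile

phase_id = 1                          # phase this result belongs to
output_string = "0\n1\n1\n0\n"        # whatever the task-specific code produced

# each phase's output is stored separately under output/phase_<id>/
out_dir = os.path.join("output", "phase_%d" % phase_id)
os.makedirs(out_dir, exist_ok=True)
pred_path = os.path.join(out_dir, "prediction.txt")
with open(pred_path, "w") as f:
    f.write(output_string)

# pack it into prediction_phase_<id>.zip, which the submit button sends to that phase's leaderboard
with zipfile.ZipFile("prediction_phase_%d.zip" % phase_id, "w") as zf:
    zf.write(pred_path, arcname="prediction.txt")
```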
@@ -0,0 +1,189 @@
## 1 Resource Preparation
Based on the model sizes involved, set a reasonable resource quota per container and work out the total resources needed.
For example, with 100 students and 1 CPU core / 4 GB of memory per container,
the total requirement is 100 cores and 400 GB.
Add worker nodes to the K8S cluster until capacity covers the total requirement * 115%; the extra 15% is headroom and a little more does not hurt (see the sizing sketch below).
Manually label the exam nodes with cal_type:cpu/gpu and ntype:exam.
Exam images are all scheduled onto these nodes, while ordinary teaching images are only scheduled onto ntype:study nodes; this separates exams from teaching and keeps the exam stable.
Likewise, GPU images are only scheduled onto cal_type:gpu nodes, and CPU images onto cal_type:cpu nodes.
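The sizing can be checked with a quick calculation like the sketch below; the per-node capacity figures are assumptions, not values from this document.

```
# Quick sizing sketch for the exam nodes (example numbers from above;
# the per-node capacity is an assumption and should match the real node spec).
import math

students = 100
cpu_per_container = 1      # CPU cores per container
mem_per_container = 4      # GB of memory per container
headroom = 1.15            # total requirement * 115%

total_cpu = students * cpu_per_container * headroom   # about 115 cores
total_mem = students * mem_per_container * headroom   # about 460 GB

node_cpu, node_mem = 32, 128   # assumed capacity of one worker node
nodes_needed = max(math.ceil(total_cpu / node_cpu), math.ceil(total_mem / node_mem))
print(total_cpu, total_mem, nodes_needed)
```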
## 2 Environment Preparation
Build an image for what the exam actually needs.
Build process:
The files are in the jupyter-image-mladder folder.
base is the base image; build it locally first:
cd base
sudo docker build -t mld:v1 .
In the other image definitions, change the FROM line to the mld:v1 image just built.
Installing packages only requires following the code below, using conda or pip3; mirror sources are recommended because they are much faster.
A few packages such as cudnn cannot be installed with pip; use conda instead (search online for the exact command).
```
# FROM must point to the image you just built
FROM mld:v1
ARG NB_USER="jupyter"
ARG NB_UID="1000"
ARG NB_GID="100"
ARG NB_PORT=8888
USER root
###### Package installation goes below:
RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch \
    && conda install tensorflow
RUN pip3 install numpy seaborn sklearn h5py matplotlib pandas future imageio -i https://pypi.mirrors.ustc.edu.cn/simple/
RUN pip3 install lightgbm xgboost imblearn mindspore -i https://pypi.mirrors.ustc.edu.cn/simple/
RUN pip3 install keras -i https://pypi.mirrors.ustc.edu.cn/simple/
######
ENV HOME="/home/${NB_USER}"
USER ${NB_UID}
ENTRYPOINT [ "/enterpoint.sh" ]
```
For a different environment, only the content between the ###### markers needs to change; the rest of the file normally stays the same.
After the image is built, push it to Docker Hub.
The dasetalent account credentials are in the dasetalent_host.md file.
After `docker push`, log into the exam-only nodes (directly via ssh, or via the remote-connection feature of the Alibaba Cloud k8s workbench) and manually run `docker pull dasetalent/xxxxxx`, where xxxxxx is the uploaded image including its version tag.
The following steps are temporary:
Log into the 天梯 server.
The directory is /home/lwttest.
Edit the /home/lwttest/config.json file:
```
{"version": "v1.2.1", "images": { | |||
"old": {"image": "bnc1010/old_notebook:v1", "use_gpu": false, "workdir": "/home/public/", "node_select":{"ntype":"study"}}, | |||
"torch-gpu": {"image": "bnc1010/mladder_notebook_torchgpu:v0.3", "use_gpu": true, "workdir": "/home/jupyter/", "node_select":{"ntype":"study"}}, | |||
"tensorflow-gpu": {"image": "bnc1010/mladder_notebook_tensorflowgpu:v0.2", "use_gpu": true, "workdir": "/home/jupyter/", "node_select":{"ntype":"study"}}, | |||
"tensorflow-pytorch-cpu(exam)": {"image": "bnc1010/mladder_notebook_torch_tf_sk:v1.6", "use_gpu": false, "workdir": "/home/jupyter/", "node_select":{"ntype":"exam"}} | |||
}, | |||
"node_ips": ["47.100.69.138", "139.224.216.129"], | |||
"gpu_maxn": 0, | |||
"gpu_notebook": {}} | |||
```
Take tensorflow-pytorch-cpu(exam) as an example; it is an exam-only image with four parameters:
```
{ | |||
"image": "bnc1010/mladder_notebook_torch_tf_sk:v1.6", | |||
"use_gpu": false, | |||
"workdir": "/home/jupyter/", | |||
"node_select":{"ntype":"exam"} | |||
} | |||
```
image: the image name that can be pulled directly from Docker Hub
use_gpu: whether the image needs a GPU
workdir: the notebook working directory
node_select: node label selector
Here node_select contains ntype:exam, which means containers started from this image run only on the exam-only nodes.
Containers started from the other images run only on nodes labeled ntype:study.
**After adding the new image to this configuration file, restart the script /home/lwttest/workServer.py.**
It currently runs inside screen: `screen -x notebook` to attach, Ctrl-C to stop it, then `python3 /home/lwttest/workServer.py` to restart it.
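For reference, adding a new exam image entry to that file could look like the sketch below; the entry name and image tag are placeholders for whatever was actually pushed, and workServer.py still has to be restarted afterwards as described above.

```
# Sketch: register a newly pushed exam image in /home/lwttest/config.json
# (the entry name and image tag below are placeholders).
import json

CONFIG = "/home/lwttest/config.json"

with open(CONFIG) as f:
    config = json.load(f)

config["images"]["my-new-exam-env"] = {
    "image": "dasetalent/xxxxxx",          # pulled manually on the exam nodes beforehand
    "use_gpu": False,
    "workdir": "/home/jupyter/",
    "node_select": {"ntype": "exam"},      # schedule only onto exam-only nodes
}

with open(CONFIG, "w") as f:
    json.dump(config, f, indent=2, ensure_ascii=False)
```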
## 3 Contest Preparation
Prepare the contest the exam needs.
See the 天梯 TA manual for details.
**Set the exam contest's environment to the exam-only environment prepared above.**
Do not publish it yet; for testing, the assignment can be reached through its private link, and testers can simply jump over from their 水杉 accounts.
## 4 Exam Procedure
#### 4.1 Shortly before the start
Clear any containers already open on the k8s exam nodes.
Configure the per-container resource quota:
```
resources: | |||
requests: | |||
memory: 2Gi | |||
cpu: 800m | |||
limits: | |||
memory: 4Gi | |||
cpu: 2000m | |||
```
#### 4.2 Start
Officially publish the contest.
#### 4.3 During the exam
Monitor the k8s cluster and track resource usage in real time; if capacity runs short, add new nodes temporarily.
If other nodes are available, a simple and effective remedy is to manually edit the yml of any deployment that failed to start and change its ntype so that it is scheduled onto a non-exam node.
Exam content:
If files are wrong, instructions contain mistakes, or files need to be added, upload the fixes manually to the corresponding contest's input folder.
#### 4.4 After the exam
Remind candidates to save their notebooks so that code is not lost when the containers are shut down.
In 天梯, click the code button to download the candidates' code and the leaderboard ranking CSV file.
Once everything is saved, delete all deployments on the exam nodes.
Set the contest's environment to empty so that candidates can no longer re-enter the notebook from the contest page.
Create a snapshot of the data disk.
Delete the temporary worker nodes.
@@ -0,0 +1,39 @@
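# The commented-out script below appears to build the clone-candidate input for the
# pipeline: it reads an OJ submission dump (submission_202110142140.csv), groups the
# submissions by problem_id, writes every within-problem pair of row indices to
# data/c/id_pair.csv, and writes the re-indexed code list to data/c/newproblems.csv.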
# import pickle | |||
# import pandas as pd | |||
# title=[] | |||
# with open('../../submission_202110142140.csv', 'r') as f: | |||
# records = f.readlines() | |||
# title = records[0].replace("\"","").split(',') | |||
# records = pd.read_csv('./submission_202110142140.csv') | |||
# pros = records["code"].to_list() | |||
# id = range(len(pros)) | |||
# pm_id = records["problem_id"].to_list() | |||
# dic={} | |||
# for i in range(len(pros)): | |||
# problem_id = pm_id[i] | |||
# if not dic.get(problem_id): | |||
# dic[problem_id] = [] | |||
# dic[problem_id].append(i) | |||
# pairs_a = [] | |||
# pairs_b = [] | |||
# for k,v in dic.items(): | |||
# for i in range(len(v)): | |||
# for j in range(i+1, len(v)): | |||
# pairs_a.append(v[i]) | |||
# pairs_b.append(v[j]) | |||
# pair_data = {'id1': pairs_a, 'id2':pairs_b} | |||
# newpair = pd.DataFrame(pair_data, columns=["id1","id2"]) | |||
# newpair.to_csv('./data/c/id_pair.csv') | |||
# Data = {'0': id, '1': pros} | |||
# newdata = pd.DataFrame(Data, columns=["0","1"]) | |||
# newdata.to_csv('./data/c/newproblems.csv') |
@@ -0,0 +1,199 @@
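# This file defines the clone-detection model imported elsewhere in this dump as
# `from model import BatchProgramCC`: BatchTreeEncoder recursively encodes statement
# subtrees by embedding node tokens, accumulating child encodings with index_copy,
# and max-pooling over all visited nodes; BatchProgramCC runs the resulting block
# sequences of two programs through a bidirectional GRU, max-pools over time, and
# returns the cosine similarity of the two program vectors.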
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torch | |||
from torch.autograd import Variable | |||
import random | |||
class BatchTreeEncoder(nn.Module): | |||
def __init__(self, vocab_size, embedding_dim, encode_dim, batch_size, use_gpu, pretrained_weight=None): | |||
super(BatchTreeEncoder, self).__init__() | |||
self.embedding = nn.Embedding(vocab_size, embedding_dim) | |||
self.embedding_dim = embedding_dim | |||
self.encode_dim = encode_dim | |||
self.W_c = nn.Linear(embedding_dim, encode_dim) | |||
self.activation = F.relu | |||
self.stop = -1 | |||
self.batch_size = batch_size | |||
self.use_gpu = use_gpu | |||
self.node_list = [] | |||
self.th = torch.cuda if use_gpu else torch | |||
self.batch_node = None | |||
self.max_index = vocab_size | |||
# pretrained embedding | |||
if pretrained_weight is not None: | |||
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_weight)) | |||
# self.embedding.weight.requires_grad = False | |||
def create_tensor(self, tensor): | |||
if self.use_gpu: | |||
return tensor.cuda() | |||
return tensor | |||
def traverse_mul(self, node, batch_index): | |||
size = len(node) | |||
if not size: | |||
return None | |||
batch_current = self.create_tensor(Variable(torch.zeros(size, self.embedding_dim))) | |||
index, children_index = [], [] | |||
current_node, children = [], [] | |||
for i in range(size): | |||
# if node[i][0] is not -1: | |||
index.append(i) | |||
current_node.append(node[i][0]) | |||
temp = node[i][1:] | |||
c_num = len(temp) | |||
for j in range(c_num): | |||
                if temp[j][0] != -1:
if len(children_index) <= j: | |||
children_index.append([i]) | |||
children.append([temp[j]]) | |||
else: | |||
children_index[j].append(i) | |||
children[j].append(temp[j]) | |||
# else: | |||
# batch_index[i] = -1 | |||
batch_current = self.W_c(batch_current.index_copy(0, Variable(self.th.LongTensor(index)), | |||
self.embedding(Variable(self.th.LongTensor(current_node))))) | |||
for c in range(len(children)): | |||
zeros = self.create_tensor(Variable(torch.zeros(size, self.encode_dim))) | |||
batch_children_index = [batch_index[i] for i in children_index[c]] | |||
tree = self.traverse_mul(children[c], batch_children_index) | |||
if tree is not None: | |||
batch_current += zeros.index_copy(0, Variable(self.th.LongTensor(children_index[c])), tree) | |||
# batch_index = [i for i in batch_index if i is not -1] | |||
b_in = Variable(self.th.LongTensor(batch_index)) | |||
self.node_list.append(self.batch_node.index_copy(0, b_in, batch_current)) | |||
return batch_current | |||
def forward(self, x, bs): | |||
self.batch_size = bs | |||
self.batch_node = self.create_tensor(Variable(torch.zeros(self.batch_size, self.encode_dim))) | |||
self.node_list = [] | |||
self.traverse_mul(x, list(range(self.batch_size))) | |||
self.node_list = torch.stack(self.node_list) | |||
return torch.max(self.node_list, 0)[0] | |||
class BatchProgramCC(nn.Module): | |||
def __init__(self, embedding_dim, hidden_dim, vocab_size, encode_dim, label_size, batch_size, use_gpu=True, pretrained_weight=None): | |||
super(BatchProgramCC, self).__init__() | |||
self.stop = [vocab_size-1] | |||
self.hidden_dim = hidden_dim | |||
self.num_layers = 1 | |||
self.gpu = use_gpu | |||
self.batch_size = batch_size | |||
self.vocab_size = vocab_size | |||
self.embedding_dim = embedding_dim | |||
self.encode_dim = encode_dim | |||
self.label_size = label_size | |||
self.encoder = BatchTreeEncoder(self.vocab_size, self.embedding_dim, self.encode_dim, | |||
self.batch_size, self.gpu, pretrained_weight) | |||
self.root2label = nn.Linear(self.encode_dim, self.label_size) | |||
# gru | |||
self.bigru = nn.GRU(self.encode_dim, self.hidden_dim, num_layers=self.num_layers, bidirectional=True, | |||
batch_first=True) | |||
# linear | |||
self.hidden2label = nn.Linear(self.hidden_dim * 2, self.label_size) | |||
# hidden | |||
# self.hidden = self.init_hidden() | |||
self.dropout = nn.Dropout(0.2) | |||
def init_hidden(self): | |||
if self.gpu is True: | |||
if isinstance(self.bigru, nn.LSTM): | |||
h0 = Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim).cuda()) | |||
c0 = Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim).cuda()) | |||
return h0, c0 | |||
return Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim)).cuda() | |||
else: | |||
return Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim)) | |||
def get_zeros(self, num): | |||
zeros = Variable(torch.zeros(num, self.encode_dim)) | |||
if self.gpu: | |||
return zeros.cuda() | |||
return zeros | |||
def encode(self, x): | |||
# print(x) | |||
lens = [len(item) for item in x] | |||
max_len = max(lens) | |||
encodes = [] | |||
for i in range(self.batch_size): | |||
for j in range(lens[i]): | |||
encodes.append(x[i][j]) | |||
encodes = self.encoder(encodes, sum(lens)) | |||
seq, start, end = [], 0, 0 | |||
for i in range(self.batch_size): | |||
end += lens[i] | |||
if max_len - lens[i]: | |||
seq.append(self.get_zeros(max_len - lens[i])) | |||
seq.append(encodes[start:end]) | |||
start = end | |||
encodes = torch.cat(seq) | |||
encodes = encodes.view(self.batch_size, max_len, -1) | |||
# gru | |||
gru_out, hidden = self.bigru(encodes, self.hidden) | |||
gru_out = torch.transpose(gru_out, 1, 2) | |||
# pooling | |||
gru_out = F.max_pool1d(gru_out, gru_out.size(2)).squeeze(2) | |||
return gru_out | |||
def forward(self, x1, x2): | |||
lvec, rvec = self.encode(x1), self.encode(x2) | |||
# abs_dist = torch.abs(torch.add(lvec, -rvec)) | |||
y = F.cosine_similarity(rvec, lvec).view(-1) | |||
# y = torch.sigmoid(self.hidden2label(abs_dist)) | |||
return y | |||
# def encode(self, x): | |||
# bs = x.size(0) | |||
# lens = [len(item) for item in x] | |||
# max_len = max(lens) | |||
# encodes = x | |||
# # encodes = [] | |||
# # for i in range(self.batch_size): | |||
# # for j in range(lens[i]): | |||
# # encodes.append(x[i][j]) | |||
# # | |||
# # encodes = self.encoder(encodes, sum(lens)) | |||
# seq, start, end = [], 0, 0 | |||
# for i in range(bs): | |||
# end += lens[i] | |||
# if max_len-lens[i]: | |||
# seq.append(self.get_zeros(max_len-lens[i])) | |||
# seq.append(encodes[start:end]) | |||
# start = end | |||
# encodes = torch.cat(seq) | |||
# encodes = encodes.view(bs, max_len, -1) | |||
# # return encodes | |||
# | |||
# # gru_out, hidden = self.bigru(encodes, self.hidden) | |||
# gru_out, hidden = self.bigru(encodes) | |||
# gru_out = torch.transpose(gru_out, 1, 2) | |||
# # pooling | |||
# gru_out = F.max_pool1d(gru_out, gru_out.size(2)).squeeze(2) | |||
# | |||
# return gru_out | |||
# | |||
# def forward(self, x1, x2): | |||
# lvec, rvec = self.encode(x1), self.encode(x2) | |||
# | |||
# abs_dist = torch.abs(torch.add(lvec, -rvec)) | |||
# | |||
# y = torch.sigmoid(self.hidden2label(abs_dist)).view(x1.size(0), -1) | |||
# t = (~(y > 0.5)).float() | |||
# out = torch.cat([t, y],dim=1) | |||
# return out | |||
@@ -0,0 +1,193 @@
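# This appears to be the pipeline.py referred to in the notes at the end of this dump:
# it parses the source code into ASTs with pycparser (C) or javalang (Java), reads the
# id pairs, splits them into train/dev/test by the given ratio, optionally trains a
# word2vec vocabulary, converts each AST into nested block/index sequences, and merges
# the pairs with their block sequences into blocks.pkl for each split.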
import pandas as pd | |||
import os | |||
import sys | |||
import warnings | |||
warnings.filterwarnings('ignore') | |||
class Pipeline: | |||
def __init__(self, ratio, root, language): | |||
self.ratio = ratio | |||
self.root = root | |||
self.language = language | |||
self.sources = None | |||
self.blocks = None | |||
self.pairs = None | |||
self.train_file_path = None | |||
self.dev_file_path = None | |||
self.test_file_path = None | |||
self.size = None | |||
# parse source code | |||
def parse_source(self, output_file, option): | |||
path = self.root+self.language+'/'+output_file | |||
if os.path.exists(path) and option == 'existing': | |||
source = pd.read_pickle(path) | |||
else: | |||
            if self.language == 'c':
from pycparser import c_parser | |||
parser = c_parser.CParser() | |||
source = pd.read_pickle(self.root+self.language+'/programs.pkl') | |||
source.columns = ['id', 'code', 'label'] | |||
source['code'] = source['code'].apply(parser.parse) | |||
source.to_pickle(path) | |||
else: | |||
import javalang | |||
def parse_program(func): | |||
tokens = javalang.tokenizer.tokenize(func) | |||
parser = javalang.parser.Parser(tokens) | |||
tree = parser.parse_member_declaration() | |||
return tree | |||
# source = pd.read_csv(self.root+self.language+'/bcb_funcs_all.tsv', sep='\t', header=None, encoding='utf-8') | |||
source = pd.read_csv(self.root + self.language + '/codes.csv') | |||
source.columns = ['id', 'code'] | |||
source['code'] = source['code'].apply(parse_program) | |||
source.to_pickle(path) | |||
self.sources = source | |||
return source | |||
# create clone pairs | |||
def read_pairs(self, filename): | |||
pairs = pd.read_pickle(self.root+self.language+'/'+filename) | |||
self.pairs = pairs | |||
# split data for training, developing and testing | |||
def split_data(self): | |||
data_path = self.root+self.language+'/' | |||
data = self.pairs | |||
data_num = len(data) | |||
ratios = [int(r) for r in self.ratio.split(':')] | |||
train_split = int(ratios[0]/sum(ratios)*data_num) | |||
val_split = train_split + int(ratios[1]/sum(ratios)*data_num) | |||
data = data.sample(frac=1, random_state=666) | |||
train = data.iloc[:train_split] | |||
dev = data.iloc[train_split:val_split] | |||
test = data.iloc[val_split:] | |||
def check_or_create(path): | |||
if not os.path.exists(path): | |||
os.mkdir(path) | |||
train_path = data_path+'train/' | |||
check_or_create(train_path) | |||
self.train_file_path = train_path+'train_.pkl' | |||
train.to_pickle(self.train_file_path) | |||
dev_path = data_path+'dev/' | |||
check_or_create(dev_path) | |||
self.dev_file_path = dev_path+'dev_.pkl' | |||
dev.to_pickle(self.dev_file_path) | |||
test_path = data_path+'test/' | |||
check_or_create(test_path) | |||
self.test_file_path = test_path+'test_.pkl' | |||
test.to_pickle(self.test_file_path) | |||
# construct dictionary and train word embedding | |||
def dictionary_and_embedding(self, input_file, size): | |||
self.size = size | |||
data_path = self.root+self.language+'/' | |||
if not input_file: | |||
input_file = self.train_file_path | |||
pairs = pd.read_pickle(input_file) | |||
train_ids = pairs['id1'].append(pairs['id2']).unique() | |||
#trees = self.sources.set_index('id',drop=False).loc[train_ids] | |||
trees = self.sources.set_index('id',drop=False).loc[train_ids[0]] | |||
for i in train_ids[1:]: | |||
tmp_tt = self.sources.set_index('id',drop=False).loc[i] | |||
trees = pd.concat([trees,tmp_tt],axis=0) | |||
if not os.path.exists(data_path+'train/embedding'): | |||
os.mkdir(data_path+'train/embedding') | |||
        if self.language == 'c':
sys.path.append('../') | |||
from prepare_data import get_sequences as func | |||
else: | |||
from utils import get_sequence as func | |||
def trans_to_sequences(ast): | |||
sequence = [] | |||
func(ast, sequence) | |||
return sequence | |||
corpus = trees['code'].apply(trans_to_sequences) | |||
str_corpus = [' '.join(c) for c in corpus] | |||
trees['code'] = pd.Series(str_corpus) | |||
# trees.to_csv(data_path+'train/programs_ns.tsv') | |||
from gensim.models.word2vec import Word2Vec | |||
w2v = Word2Vec(corpus, size=size, workers=16, sg=1, max_final_vocab=3000) | |||
w2v.save(data_path+'train/embedding/node_w2v_' + str(size)) | |||
# generate block sequences with index representations | |||
def generate_block_seqs(self,size): | |||
self.size = size | |||
        if self.language == 'c':
from prepare_data import get_blocks as func | |||
else: | |||
from utils import get_blocks_v1 as func | |||
from gensim.models.word2vec import Word2Vec | |||
word2vec = Word2Vec.load(self.root+self.language+'node_w2v_' + str(self.size)).wv | |||
vocab = word2vec.vocab | |||
max_token = word2vec.syn0.shape[0] | |||
def tree_to_index(node): | |||
token = node.token | |||
result = [vocab[token].index if token in vocab else max_token] | |||
children = node.children | |||
for child in children: | |||
result.append(tree_to_index(child)) | |||
return result | |||
def trans2seq(r): | |||
blocks = [] | |||
func(r, blocks) | |||
tree = [] | |||
for b in blocks: | |||
btree = tree_to_index(b) | |||
tree.append(btree) | |||
return tree | |||
trees = pd.DataFrame(self.sources, copy=True) | |||
trees['code'] = trees['code'].apply(trans2seq) | |||
if 'label' in trees.columns: | |||
trees.drop('label', axis=1, inplace=True) | |||
self.blocks = trees | |||
# merge pairs | |||
def merge(self, data_path, part): | |||
pairs = pd.read_pickle(data_path) | |||
pairs['id1'] = pairs['id1'].astype(int) | |||
pairs['id2'] = pairs['id2'].astype(int) | |||
df = pd.merge(pairs, self.blocks, how='left', left_on='id1', right_on='id') | |||
df = pd.merge(df, self.blocks, how='left', left_on='id2', right_on='id') | |||
df.drop(['id_x', 'id_y'], axis=1,inplace=True) | |||
df.dropna(inplace=True) | |||
df.to_pickle(self.root+self.language+'/'+part+'/blocks.pkl') | |||
# run for processing data to train | |||
def run(self): | |||
print('parse source code...') | |||
self.parse_source(output_file='ast.pkl',option='existing') | |||
print('read id pairs...') | |||
        if self.language == 'c':
self.read_pairs('oj_clone_ids.pkl') | |||
else: | |||
self.read_pairs('id_pairs.pkl') | |||
# self.read_pairs('bcb_pair_ids.pkl') | |||
print('split data...') | |||
self.split_data() | |||
#print('train word embedding...') | |||
# self.dictionary_and_embedding(None, 128) | |||
print('generate block sequences...') | |||
self.generate_block_seqs(128) | |||
print('merge pairs and blocks...') | |||
self.merge(self.train_file_path, 'train') | |||
self.merge(self.dev_file_path, 'dev') | |||
self.merge(self.test_file_path, 'test') | |||
lang = "c" | |||
ppl = Pipeline('8:1:1', 'data/', lang) | |||
ppl.run() | |||
@@ -0,0 +1,45 @@
from pycparser import c_parser, c_ast | |||
import pandas as pd | |||
import os | |||
import re | |||
import sys | |||
from gensim.models.word2vec import Word2Vec | |||
import pickle | |||
from tree import ASTNode, SingleNode | |||
import numpy as np | |||
def get_sequences(node, sequence): | |||
current = SingleNode(node) | |||
sequence.append(current.get_token()) | |||
for _, child in node.children(): | |||
get_sequences(child, sequence) | |||
if current.get_token().lower() == 'compound': | |||
sequence.append('End') | |||
def get_blocks(node, block_seq): | |||
children = node.children() | |||
name = node.__class__.__name__ | |||
if name in ['FuncDef', 'If', 'For', 'While', 'DoWhile']: | |||
block_seq.append(ASTNode(node)) | |||
        if name != 'For':
skip = 1 | |||
else: | |||
skip = len(children) - 1 | |||
for i in range(skip, len(children)): | |||
child = children[i][1] | |||
if child.__class__.__name__ not in ['FuncDef', 'If', 'For', 'While', 'DoWhile', 'Compound']: | |||
block_seq.append(ASTNode(child)) | |||
get_blocks(child, block_seq) | |||
    elif name == 'Compound':
block_seq.append(ASTNode(name)) | |||
for _, child in node.children(): | |||
if child.__class__.__name__ not in ['If', 'For', 'While', 'DoWhile']: | |||
block_seq.append(ASTNode(child)) | |||
get_blocks(child, block_seq) | |||
block_seq.append(ASTNode('End')) | |||
else: | |||
for _, child in node.children(): | |||
get_blocks(child, block_seq) |
@@ -0,0 +1,90 @@
import pandas as pd | |||
import torch | |||
import time | |||
import numpy as np | |||
import warnings | |||
from gensim.models.word2vec import Word2Vec | |||
from model import BatchProgramCC | |||
from torch.autograd import Variable | |||
from sklearn.metrics import precision_recall_fscore_support | |||
from tqdm import tqdm | |||
warnings.filterwarnings('ignore') | |||
def get_batch(dataset, idx, bs): | |||
tmp = dataset.iloc[idx: idx+bs] | |||
x1, x2, labels = [], [], [] | |||
for _, item in tmp.iterrows(): | |||
x1.append(item['code_ids_x']) | |||
x2.append(item['code_ids_y']) | |||
labels.append([item['label']]) | |||
return x1, x2, torch.FloatTensor(labels) | |||
if __name__ == '__main__': | |||
lang = 'c' | |||
root = 'data/' | |||
test_data = pd.read_pickle(root+lang+'/test/blocks_new.pkl').sample(frac=1) | |||
word2vec = Word2Vec.load(root+lang+"/node_w2v_128").wv | |||
MAX_TOKENS = word2vec.syn0.shape[0] | |||
EMBEDDING_DIM = word2vec.syn0.shape[1] | |||
embeddings = np.zeros((MAX_TOKENS + 1, EMBEDDING_DIM), dtype="float32") | |||
embeddings[:word2vec.syn0.shape[0]] = word2vec.syn0 | |||
HIDDEN_DIM = 100 | |||
ENCODE_DIM = 128 | |||
LABELS = 1 | |||
EPOCHS = 5 | |||
BATCH_SIZE = 64 | |||
USE_GPU = True | |||
model = BatchProgramCC(EMBEDDING_DIM,HIDDEN_DIM,MAX_TOKENS+1,ENCODE_DIM,LABELS,BATCH_SIZE, | |||
USE_GPU, embeddings) | |||
if USE_GPU: | |||
model.cuda() | |||
parameters = model.parameters() | |||
optimizer = torch.optim.Adamax(parameters) | |||
loss_function = torch.nn.BCELoss() | |||
PATH = './model/model_clone_c.pkl' | |||
checkpoint = torch.load(PATH) | |||
start_epoch = checkpoint['epoch'] | |||
model.load_state_dict(checkpoint['model_state_dict']) | |||
test_data_t = test_data | |||
print("Testing..." ) | |||
# testing procedure | |||
predicts = [] | |||
trues = [] | |||
total_loss = 0.0 | |||
total = 0.0 | |||
i = 0 | |||
for i in tqdm(range(0, len(test_data_t), BATCH_SIZE)): | |||
if i + BATCH_SIZE > len(test_data_t): | |||
BATCH_SIZE = len(test_data_t) - i | |||
batch = get_batch(test_data_t, i, BATCH_SIZE) | |||
i += BATCH_SIZE | |||
test1_inputs, test2_inputs, test_labels = batch | |||
if USE_GPU: | |||
test_labels = test_labels.cuda() | |||
model.batch_size = len(test_labels) | |||
model.hidden = model.init_hidden() | |||
output = model(test1_inputs, test2_inputs) | |||
# loss = loss_function(output, Variable(test_labels)) | |||
# calc testing acc | |||
predicted = (output.data > 0.5).cpu().numpy() | |||
predicts.extend(predicted) | |||
trues.extend(test_labels.cpu().numpy()) | |||
# total += len(test_labels) | |||
# total_loss += loss.item() * len(test_labels) | |||
p, r, f, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||
print("Testing results(P,R,F1):%.3f, %.3f, %.3f" % (p, r, f)) |
@@ -0,0 +1,246 @@
import pandas as pd | |||
import torch | |||
import time | |||
import numpy as np | |||
import warnings | |||
from gensim.models.word2vec import Word2Vec | |||
from model import BatchProgramCC | |||
from torch.autograd import Variable | |||
from sklearn.metrics import precision_recall_fscore_support | |||
from tqdm import tqdm | |||
warnings.filterwarnings('ignore') | |||
from gensim.models.word2vec import Word2Vec | |||
# word2vec = Word2Vec.load("./train/embedding/node_w2v_128_new").wv | |||
# word2vec.index2word | |||
def get_batch(dataset, idx, bs): | |||
tmp = dataset.iloc[idx: idx+bs] | |||
x1, x2, labels = [], [], [] | |||
for _, item in tmp.iterrows(): | |||
x1.append(eval(item['code_ids_x'])) | |||
x2.append(eval(item['code_ids_y'])) | |||
labels.append([item['label']]) | |||
return x1, x2, torch.FloatTensor(labels) | |||
if __name__ == '__main__': | |||
# import argparse | |||
# | |||
# parser = argparse.ArgumentParser(description="Choose a dataset:[c|java]") | |||
# parser.add_argument('--lang') | |||
# args = parser.parse_args() | |||
# args.lang = 'java' | |||
# if not args.lang: | |||
# print("No specified dataset") | |||
# exit(1) | |||
root = 'data/' | |||
lang = 'java' | |||
categories = 1 | |||
if lang == 'java': | |||
categories = 5 | |||
print("Train for ", str.upper(lang)) | |||
# train_data = pd.read_pickle(root+lang+'/train/blocks_30w.pkl').sample(frac=1) | |||
train_data = pd.read_csv(root + lang + '/train/blocks_30w.csv').sample(frac=1) | |||
train_data = train_data.replace(-1, 0) | |||
# val_data = pd.read_pickle(root+lang+'/dev/blocks_30w.pkl').sample(frac=1) | |||
val_data = pd.read_csv(root + lang + '/dev/blocks_30w.csv').sample(frac=1) | |||
val_data = val_data.replace(-1, 0) | |||
# test_data = pd.read_pickle(root+lang+'/test/blocks_30w.pkl').sample(frac=1) | |||
test_data = pd.read_csv(root + lang + '/test/blocks_30w.csv').sample(frac=1) | |||
test_data = test_data.replace(-1, 0) | |||
test_data.loc[test_data['label'] > 0, 'label'] = 1 | |||
word2vec = Word2Vec.load("./data/java/train/embedding/node_w2v_128_new").wv | |||
MAX_TOKENS = word2vec.syn0.shape[0] | |||
EMBEDDING_DIM = word2vec.syn0.shape[1] | |||
embeddings = np.zeros((MAX_TOKENS + 1, EMBEDDING_DIM), dtype="float32") | |||
embeddings[:word2vec.syn0.shape[0]] = word2vec.syn0 | |||
HIDDEN_DIM = 100 | |||
ENCODE_DIM = 128 | |||
LABELS = 1 | |||
EPOCHS = 10 | |||
BATCH_SIZE = 64 | |||
USE_GPU = True | |||
model = BatchProgramCC(EMBEDDING_DIM,HIDDEN_DIM,MAX_TOKENS+1,ENCODE_DIM,LABELS,BATCH_SIZE, | |||
USE_GPU, embeddings) | |||
if USE_GPU: | |||
model.cuda() | |||
parameters = model.parameters() | |||
optimizer = torch.optim.Adamax(parameters) | |||
loss_function = torch.nn.BCELoss() | |||
PATH = './model/model_clone_java_30w.pkl' | |||
print(train_data) | |||
precision, recall, f1 = 0, 0, 0 | |||
print('Start training...') | |||
for t in range(5, categories+1): | |||
# if lang == 'java': | |||
# # train_data_t = train_data[train_data['label'].isin([t, 0])] | |||
# train_data_t = train_data | |||
# train_data_t.loc[train_data_t['label'] > 0, 'label'] = 1 | |||
# | |||
# # val_data_t = val_data[val_data['label'].isin([t, 0])] | |||
# val_data_t = val_data | |||
# val_data_t.loc[val_data_t['label'] > 0, 'label'] = 1 | |||
# | |||
# # test_data_t = test_data[test_data['label'].isin([t, 0])] | |||
# test_data_t = test_data | |||
# # test_data_t.loc[test_data_t['label'] > 0, 'label'] = 1 | |||
# else: | |||
train_data_t, val_data_t, test_data_t = train_data, val_data, test_data | |||
# training procedure | |||
train_loss_ = [] | |||
val_loss_ = [] | |||
for epoch in range(EPOCHS): | |||
start_time = time.time() | |||
# training epoch | |||
total_acc = 0.0 | |||
total_loss = 0.0 | |||
total = 0.0 | |||
i = 0 | |||
predicts = [] | |||
trues = [] | |||
model.train() | |||
bs = BATCH_SIZE | |||
# while i < len(train_data_t): | |||
for i in tqdm(range(0, len(train_data_t), bs)): | |||
if i + bs > len(train_data_t): | |||
bs = len(train_data_t) - i | |||
batch = get_batch(train_data_t, i, bs) | |||
# i += BATCH_SIZE | |||
train1_inputs, train2_inputs, train_labels = batch | |||
if USE_GPU: | |||
train1_inputs, train2_inputs, train_labels = train1_inputs, train2_inputs, train_labels.cuda() | |||
model.zero_grad() | |||
model.batch_size = len(train_labels) | |||
model.hidden = model.init_hidden() | |||
output = model(train1_inputs, train2_inputs) | |||
loss = loss_function(output, Variable(train_labels)) | |||
loss.backward() | |||
optimizer.step() | |||
total += len(train_labels) | |||
total_loss += loss.item() * len(train_labels) | |||
predicted = (output.data > 0.5).cpu().numpy() | |||
predicts.extend(predicted) | |||
trues.extend(train_labels.cpu().numpy()) | |||
train_loss_.append(total_loss / total) | |||
precision, recall, f1, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||
total_loss = 0.0 | |||
total = 0.0 | |||
i = 0 | |||
bs = BATCH_SIZE | |||
predicts = [] | |||
trues = [] | |||
model.eval() | |||
# while i < len(val_data_t): | |||
# batch = get_batch(val_data_t, i, BATCH_SIZE) | |||
# i += BATCH_SIZE | |||
for i in tqdm(range(0, len(val_data_t), bs)): | |||
if i + bs > len(val_data_t): | |||
bs = len(val_data_t) - i | |||
batch = get_batch(val_data_t, i, BATCH_SIZE) | |||
val1_inputs, val2_inputs, val_labels = batch | |||
if USE_GPU: | |||
val1_inputs, val2_inputs, val_labels = val1_inputs, val2_inputs, val_labels.cuda() | |||
model.batch_size = len(val_labels) | |||
model.hidden = model.init_hidden() | |||
output = model(val1_inputs, val2_inputs) | |||
loss = loss_function(output, Variable(val_labels)) | |||
total += len(val_labels) | |||
total_loss += loss.item() * len(val_labels) | |||
predicted = (output.data > 0.5).cpu().numpy() | |||
predicts.extend(predicted) | |||
trues.extend(val_labels.cpu().numpy()) | |||
val_loss_.append(total_loss / total) | |||
precision_, recall_, f1_, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||
print('categories-%d [Epoch: %3d/%3d] Training Loss: %.4f, Validation Loss: %.4f,' | |||
% (t, epoch + 1, EPOCHS, train_loss_[epoch], val_loss_[epoch])) | |||
print("Train results(P,R,F1):%.3f, %.3f, %.3f" % (precision, recall, f1)) | |||
print("Dev results(P,R,F1):%.3f, %.3f, %.3f" % (precision_, recall_, f1_)) | |||
torch.save({'epoch': epoch, | |||
'model_state_dict': model.state_dict() | |||
}, PATH) | |||
print("Testing-%d..." % t) | |||
# testing procedure | |||
predicts = [] | |||
trues = [] | |||
total_loss = 0.0 | |||
total = 0.0 | |||
i = 0 | |||
while i < len(test_data_t): | |||
batch = get_batch(test_data_t, i, BATCH_SIZE) | |||
i += BATCH_SIZE | |||
test1_inputs, test2_inputs, test_labels = batch | |||
if USE_GPU: | |||
test_labels = test_labels.cuda() | |||
model.batch_size = len(test_labels) | |||
model.hidden = model.init_hidden() | |||
output = model(test1_inputs, test2_inputs) | |||
# loss = loss_function(output, Variable(test_labels)) | |||
# calc testing acc | |||
predicted = (output.data > 0.5).cpu().numpy() | |||
predicts.extend(predicted) | |||
trues.extend(test_labels.cpu().numpy()) | |||
# total += len(test_labels) | |||
# total_loss += loss.item() * len(test_labels) | |||
precision_, recall_, f1_, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||
print("Test results(P,R,F1):%.3f, %.3f, %.3f" % (precision_, recall_, f1_)) | |||
# result = pd.DataFrame(np.array(predicts), columns=['predict']) | |||
# result['true'] = pd.DataFrame(np.array(trues)) | |||
# result['label'] = pd.DataFrame(np.array(trues)) | |||
# result.loc[result['label'] > 0, 'label'] = 1 | |||
# weights = [0, 0.005, 0.001, 0.002, 0.010, 0.982] | |||
# for k in range(1, categories+1): | |||
# trues_ = result[result['true'].isin([0, k])]['label'].values | |||
# predicts_ = result[result['true'].isin([0, k])]['predict'].values | |||
# p, r, f, _ = precision_recall_fscore_support(trues_, predicts_, average='binary') | |||
# precision += weights[k] * p | |||
# recall += weights[k] * r | |||
# f1 += weights[k] * f | |||
# print("Type-" + str(k) + ": " + str(p) + " " + str(r) + " " + str(f)) | |||
# | |||
# print("Total testing results(P,R,F1):%.3f, %.3f, %.3f" % (precision, recall, f1)) | |||
# if lang == 'java': | |||
# weights = [0, 0.005, 0.001, 0.002, 0.010, 0.982] | |||
# p, r, f, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||
# precision += weights[t] * p | |||
# recall += weights[t] * r | |||
# f1 += weights[t] * f | |||
# print("Type-" + str(t) + ": " + str(p) + " " + str(r) + " " + str(f)) | |||
# else: | |||
# precision, recall, f1, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||
# | |||
# print("Total testing results(P,R,F1):%.3f, %.3f, %.3f" % (precision, recall, f1)) |
@@ -0,0 +1,170 @@
from javalang.ast import Node | |||
class ASTNode(object): | |||
def __init__(self, node): | |||
self.node = node | |||
# self.vocab = word_map | |||
self.is_str = isinstance(self.node, str) | |||
self.token = self.get_token() | |||
# self.index = self.token_to_index(self.token) | |||
self.children = self.add_children() | |||
def is_leaf(self): | |||
if self.is_str: | |||
return True | |||
return len(self.node.children()) == 0 | |||
def get_token(self, lower=True): | |||
if self.is_str: | |||
return self.node | |||
name = self.node.__class__.__name__ | |||
token = name | |||
is_name = False | |||
if self.is_leaf(): | |||
attr_names = self.node.attr_names | |||
if attr_names: | |||
if 'names' in attr_names: | |||
token = self.node.names[0] | |||
elif 'name' in attr_names: | |||
token = self.node.name | |||
is_name = True | |||
else: | |||
token = self.node.value | |||
else: | |||
token = name | |||
else: | |||
if name == 'TypeDecl': | |||
token = self.node.declname | |||
if self.node.attr_names: | |||
attr_names = self.node.attr_names | |||
if 'op' in attr_names: | |||
if self.node.op[0] == 'p': | |||
token = self.node.op[1:] | |||
else: | |||
token = self.node.op | |||
if token is None: | |||
token = name | |||
if lower and is_name: | |||
token = token.lower() | |||
return token | |||
# def token_to_index(self, token): | |||
# self.index = self.vocab[token].index if token in self.vocab else MAX_TOKENS | |||
# return self.index | |||
# def get_index(self): | |||
# return self.index | |||
def add_children(self): | |||
if self.is_str: | |||
return [] | |||
children = self.node.children() | |||
if self.token in ['FuncDef', 'If', 'While', 'DoWhile']: | |||
return [ASTNode(children[0][1])] | |||
elif self.token == 'For': | |||
return [ASTNode(children[c][1]) for c in range(0, len(children)-1)] | |||
else: | |||
return [ASTNode(child) for _, child in children] | |||
class BlockNode(object): | |||
def __init__(self, node): | |||
self.node = node | |||
self.is_str = isinstance(self.node, str) | |||
self.token = self.get_token(node) | |||
self.children = self.add_children() | |||
def is_leaf(self): | |||
if self.is_str: | |||
return True | |||
return len(self.node.children) == 0 | |||
def get_token(self, node): | |||
if isinstance(node, str): | |||
token = node | |||
elif isinstance(node, set): | |||
token = 'Modifier' | |||
elif isinstance(node, Node): | |||
token = node.__class__.__name__ | |||
else: | |||
token = '' | |||
return token | |||
def ori_children(self, root): | |||
if isinstance(root, Node): | |||
if self.token in ['MethodDeclaration', 'ConstructorDeclaration']: | |||
children = root.children[:-1] | |||
else: | |||
children = root.children | |||
elif isinstance(root, set): | |||
children = list(root) | |||
else: | |||
children = [] | |||
def expand(nested_list): | |||
for item in nested_list: | |||
if isinstance(item, list): | |||
for sub_item in expand(item): | |||
yield sub_item | |||
elif item: | |||
yield item | |||
return list(expand(children)) | |||
def add_children(self): | |||
if self.is_str: | |||
return [] | |||
logic = ['SwitchStatement', 'IfStatement', 'ForStatement', 'WhileStatement', 'DoStatement'] | |||
children = self.ori_children(self.node) | |||
if self.token in logic: | |||
return [BlockNode(children[0])] | |||
elif self.token in ['MethodDeclaration', 'ConstructorDeclaration']: | |||
return [BlockNode(child) for child in children] | |||
else: | |||
return [BlockNode(child) for child in children if self.get_token( child) not in logic] | |||
class SingleNode(ASTNode): | |||
def __init__(self, node): | |||
self.node = node | |||
self.is_str = isinstance(self.node, str) | |||
self.token = self.get_token() | |||
self.children = [] | |||
def is_leaf(self): | |||
if self.is_str: | |||
return True | |||
return len(self.node.children()) == 0 | |||
def get_token(self, lower=True): | |||
if self.is_str: | |||
return self.node | |||
name = self.node.__class__.__name__ | |||
token = name | |||
is_name = False | |||
if self.is_leaf(): | |||
attr_names = self.node.attr_names | |||
if attr_names: | |||
if 'names' in attr_names: | |||
token = self.node.names[0] | |||
elif 'name' in attr_names: | |||
token = self.node.name | |||
is_name = True | |||
else: | |||
token = self.node.value | |||
else: | |||
token = name | |||
else: | |||
if name == 'TypeDecl': | |||
token = self.node.declname | |||
if self.node.attr_names: | |||
attr_names = self.node.attr_names | |||
if 'op' in attr_names: | |||
if self.node.op[0] == 'p': | |||
token = self.node.op[1:] | |||
else: | |||
token = self.node.op | |||
if token is None: | |||
token = name | |||
if lower and is_name: | |||
token = token.lower() | |||
return token |
@@ -0,0 +1,80 @@
import pandas as pd | |||
import javalang | |||
from javalang.ast import Node | |||
from tree import ASTNode, BlockNode | |||
import sys | |||
sys.setrecursionlimit(10000) | |||
def get_token(node): | |||
token = '' | |||
if isinstance(node, str): | |||
token = node | |||
elif isinstance(node, set): | |||
token = 'Modifier'#node.pop() | |||
elif isinstance(node, Node): | |||
token = node.__class__.__name__ | |||
return token | |||
def get_children(root): | |||
if isinstance(root, Node): | |||
children = root.children | |||
elif isinstance(root, set): | |||
children = list(root) | |||
else: | |||
children = [] | |||
def expand(nested_list): | |||
for item in nested_list: | |||
if isinstance(item, list): | |||
for sub_item in expand(item): | |||
yield sub_item | |||
elif item: | |||
yield item | |||
return list(expand(children)) | |||
def get_sequence(node, sequence): | |||
token, children = get_token(node), get_children(node) | |||
sequence.append(token) | |||
for child in children: | |||
get_sequence(child, sequence) | |||
if token in ['ForStatement', 'WhileStatement', 'DoStatement','SwitchStatement', 'IfStatement']: | |||
sequence.append('End') | |||
def get_blocks_v1(node, block_seq): | |||
name, children = get_token(node), get_children(node) | |||
logic = ['SwitchStatement','IfStatement', 'ForStatement', 'WhileStatement', 'DoStatement'] | |||
if name in ['MethodDeclaration', 'ConstructorDeclaration']: | |||
block_seq.append(BlockNode(node)) | |||
body = node.body | |||
for child in body: | |||
if get_token(child) not in logic and not hasattr(child, 'block'): | |||
block_seq.append(BlockNode(child)) | |||
else: | |||
get_blocks_v1(child, block_seq) | |||
elif name in logic: | |||
block_seq.append(BlockNode(node)) | |||
for child in children[1:]: | |||
token = get_token(child) | |||
if not hasattr(node, 'block') and token not in logic+['BlockStatement']: | |||
block_seq.append(BlockNode(child)) | |||
else: | |||
get_blocks_v1(child, block_seq) | |||
block_seq.append(BlockNode('End')) | |||
    elif name == 'BlockStatement' or hasattr(node, 'block'):
block_seq.append(BlockNode(name)) | |||
for child in children: | |||
if get_token(child)not in logic: | |||
block_seq.append(BlockNode(child)) | |||
else: | |||
get_blocks_v1(child, block_seq) | |||
else: | |||
for child in children: | |||
get_blocks_v1(child, block_seq) | |||
@@ -0,0 +1,7 @@
1. Prepare the data: python pipeline.py
- Modify the split function so that everything goes into the test set.
- The data lives under data/c/; the two pkl files must be replaced with new ones.
- The submitted code must be compilable.
2. Run the test: python test.py
- It can run with cuda on the 126 server.
- Clone detection is treated as a binary classification task; the decision threshold can be adjusted as needed (see the sketch below).
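For instance, the 0.5 threshold hard-coded in test.py (`output.data > 0.5`) can be tuned on a held-out set with a small sweep like this sketch; the scores and labels here are illustrative.

```
# Sketch: pick a clone-detection threshold on held-out cosine-similarity scores
# (scores/labels below are illustrative; in practice collect them from the model output).
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

scores = np.array([0.91, 0.42, 0.77, 0.15, 0.66])   # model outputs (cosine similarity)
labels = np.array([1, 0, 1, 0, 1])                  # ground-truth clone labels

best_t, best_f1 = 0.5, 0.0
for t in np.arange(0.1, 0.95, 0.05):
    preds = (scores > t).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    if f1 > best_f1:
        best_t, best_f1 = t, f1
print("best threshold %.2f, F1 %.3f" % (best_t, best_f1))
```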
@@ -0,0 +1,27 @@
-----BEGIN RSA PRIVATE KEY----- | |||
MIIEowIBAAKCAQEA2R5MIQo+9/oysDtsH0s9xGxBCvD0dtNgMnawXmt1ZJdzAvzf | |||
bQ3AmT2y40zLtehcqU1XZh1LXZMfqi8GFwznm7fgCM1DqAd/kOKdBCpANWgj+OvL | |||
Kxwum4bBbmuMg4IYpXoKaf94MWcx5axBmksYrnF2D95avcGYcSxoLHNL86+KY6xJ | |||
9rMvORs+gLpMQGrY39Cz1n7Ef/9u0LRuWRK+4LHGxP/P4lMC4FvXH2K90RhQln0j | |||
7RR4uxwkQn57Vhqf2cimTZpouma9+/hVpiaoVdaQbvRFFVRTRLcdsdftemQIqNhc | |||
2KV2LzsivhbFcOwsLj7jSIjcwOwR0fH7TIel7QIDAQABAoIBAARILS8voQtJ82r3 | |||
WQwK81Zm3ieFlgSr6YdFQPgzvVZ1CC8kZpGjhktfZKJH9vKI+R7bqCAa7swTJTo5 | |||
gDC/L+gpybDSv4VWVIU6eudEoAyNl7wGhnS8swydLT5sv5IuZCcLT55EjA3JX7oM | |||
WiTdW0jBcxcgBwEcCtIckpdh1LsjD1XU6m54w/c8gyfuZcWK+ByFbVzdk5m5sco+ | |||
wJso3AXFtxx9LhG82XMJW5BKuQfJH2bNI5YoSEGe1fzddZ+ugoCm7iNo4xf83JMT | |||
9fUBlYxSlBa16lOaA9gwZgQEze7PEfzFKLQLFYTBDp7/QL02+TXAf4/ZdcXUnENs | |||
8PrABS8CgYEA8Yr91FdDmxQrXwy8S2CYI3JuWt7LqqUipxduz6xyi6dZHlqXslAn | |||
ndCVp6ihwryrzWSk2KltRaQIdbVTq7YukLQ0uKk/xneaeOh5RC9PclpGivzXNRAX | |||
cEbro0pz/htPQ9ZVTVuvKhOpDclz1qxhhxR29I/oxU6NkBfGJf3G9TcCgYEA5h0Q | |||
QHbkcyp8uAyXXXtbJTzAWkvKYWVZ2wwOF4MJ7l670BZef26IpY368zjmjKgNNK07 | |||
WV4FmFIuZje2Wb9jW3gV5qRysrmlb8Vzks7GEUKLSQcoPvVsiDyh8VvW0qEirH3o | |||
rGpJeQBrrot9DirmQ+ehDKjGl1b1z3X+/XohD/sCgYBsX/2lsYW+5hzTp+YwN+Xr | |||
OaO0F/Tv2uoiaIwql+hJKsv8p48azYYI9BbBxBLYSkkXfgnMwLArp/63uaUSDUr1 | |||
WDWziRT5Wp6vkzcd1dBisYinQezZfR/XG6sMeBJ1OBGnkVpyvClqyql2ayYTcwLL | |||
Ve5Nqug45xbzSQd58lS7nwKBgQDGKkUynrCBlRcukHlRoceOO9ESca5pTZSiFLGW | |||
AdztkFuBCaJ7bz7yA2EXT+sLOjWVJZG0lkmPMUaP9G5uv2ja/oEtzHSSAVm89Xdf | |||
9/2OI5Y7X5SDE2tRr5Vuer53SRjJHuzeffGj6H7TI4CgUMVXuQNyGW5cKiEpdd4P | |||
f7s1PQKBgFrezClBhd8c72+Q3hZZKoK5VpbqZXMcfcRoObpQ4W2OTY2+jqrSwhOO | |||
12fWIG473Ok0pnTNxAsO4IdhKpWlXySMDwxS0Rns6TAcDnJa4sCahKnqIoMAqSTA | |||
VUT/kwEUPat2/zlUhfOl4LooLAW36GDC/nc2urj2uVopdwdOTwVW | |||
-----END RSA PRIVATE KEY----- |
@@ -0,0 +1,76 @@
-----BEGIN CERTIFICATE----- | |||
MIIHoTCCBYmgAwIBAgIQDOVX1N5YbGyIxnUGCW2zsDANBgkqhkiG9w0BAQsFADBc | |||
MQswCQYDVQQGEwJVUzEXMBUGA1UEChMORGlnaUNlcnQsIEluYy4xNDAyBgNVBAMT | |||
K1JhcGlkU1NMIEdsb2JhbCBUTFMgUlNBNDA5NiBTSEEyNTYgMjAyMiBDQTEwHhcN | |||
MjMwMTAyMDAwMDAwWhcNMjQwMTEyMjM1OTU5WjAcMRowGAYDVQQDDBEqLnNodWlz | |||
aGFuLm5ldC5jbjCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBANkeTCEK | |||
Pvf6MrA7bB9LPcRsQQrw9HbTYDJ2sF5rdWSXcwL8320NwJk9suNMy7XoXKlNV2Yd | |||
S12TH6ovBhcM55u34AjNQ6gHf5DinQQqQDVoI/jryyscLpuGwW5rjIOCGKV6Cmn/ | |||
eDFnMeWsQZpLGK5xdg/eWr3BmHEsaCxzS/OvimOsSfazLzkbPoC6TEBq2N/Qs9Z+ | |||
xH//btC0blkSvuCxxsT/z+JTAuBb1x9ivdEYUJZ9I+0UeLscJEJ+e1Yan9nIpk2a | |||
aLpmvfv4VaYmqFXWkG70RRVUU0S3HbHX7XpkCKjYXNildi87Ir4WxXDsLC4+40iI | |||
3MDsEdHx+0yHpe0CAwEAAaOCA50wggOZMB8GA1UdIwQYMBaAFPCchf2in32PyWi7 | |||
1dSJTR2+05D/MB0GA1UdDgQWBBQ8TpzGYRl5Mcx4zZ8subB5HviPVTAtBgNVHREE | |||
JjAkghEqLnNodWlzaGFuLm5ldC5jboIPc2h1aXNoYW4ubmV0LmNuMA4GA1UdDwEB | |||
/wQEAwIFoDAdBgNVHSUEFjAUBggrBgEFBQcDAQYIKwYBBQUHAwIwgZ8GA1UdHwSB | |||
lzCBlDBIoEagRIZCaHR0cDovL2NybDMuZGlnaWNlcnQuY29tL1JhcGlkU1NMR2xv | |||
YmFsVExTUlNBNDA5NlNIQTI1NjIwMjJDQTEuY3JsMEigRqBEhkJodHRwOi8vY3Js | |||
NC5kaWdpY2VydC5jb20vUmFwaWRTU0xHbG9iYWxUTFNSU0E0MDk2U0hBMjU2MjAy | |||
MkNBMS5jcmwwPgYDVR0gBDcwNTAzBgZngQwBAgEwKTAnBggrBgEFBQcCARYbaHR0 | |||
cDovL3d3dy5kaWdpY2VydC5jb20vQ1BTMIGHBggrBgEFBQcBAQR7MHkwJAYIKwYB | |||
BQUHMAGGGGh0dHA6Ly9vY3NwLmRpZ2ljZXJ0LmNvbTBRBggrBgEFBQcwAoZFaHR0 | |||
cDovL2NhY2VydHMuZGlnaWNlcnQuY29tL1JhcGlkU1NMR2xvYmFsVExTUlNBNDA5 | |||
NlNIQTI1NjIwMjJDQTEuY3J0MAkGA1UdEwQCMAAwggGABgorBgEEAdZ5AgQCBIIB | |||
cASCAWwBagB3AHb/iD8KtvuVUcJhzPWHujS0pM27KdxoQgqf5mdMWjp0AAABhXFA | |||
a8wAAAQDAEgwRgIhAI++QoPxPN2iOrxIQegcdgwWNzFPnZRoDFKXpBRKMBtlAiEA | |||
vw/HkDuckkDkfKvtFp1VxeS7GyaetlhEjQOK6ixcuP8AdgBIsONr2qZHNA/lagL6 | |||
nTDrHFIBy1bdLIHZu7+rOdiEcwAAAYVxQGuxAAAEAwBHMEUCIQDhcg/4dci0YtzM | |||
59uvgT4+2W780D6oRtCcX0IofxpnKwIgMliXM53/OAYXc0cpaKeotuoQE5ntDMCX | |||
FfojCPe3G9IAdwA7U3d1Pi25gE6LMFsG/kA7Z9hPw/THvQANLXJv4frUFwAAAYVx | |||
QGuvAAAEAwBIMEYCIQDvjMHZOQZQ08BLD5/XAHJ6Sw4HaEVwyd+lFpYHLi24vwIh | |||
AJO6f0RX/rG56cKjHWV/mQsRH94kxJDy7EjzU89uAV0XMA0GCSqGSIb3DQEBCwUA | |||
A4ICAQAq1H2pr19LU6VnkZGhGlIklQJJ4lhXfX6ciEP+9MSgbUyTTeohv0nUYl1i | |||
+8ehjD9yBnujUKgjxQZ3KsPn9LSgukvTU1WOEBWw05qywBg9UQ4nnc4tkghSVk35 | |||
YhJum5L6Xxr0U7ohnffFaFn07+Am/q0HlGtHUvSsrN2yh3idAupQmRWp3sLQl+LR | |||
VL/ynq2InSGoNcawFiIKd84CJMoHMyXW24iIop044yBvRl6v5DI74j6RUUno75rI | |||
G3HK1NUfREBeKGV7s7cTFYbR+bBFuIURHs05nGeHy+xHxFh7CwhY2Bg1Do8Mbqzb | |||
EAVV5yOvizkNqaVULcGg1+KEU92doK625dQ7iWqGLnX5gqFEAQaUgIX0MEgD4SDR | |||
kr73k5aEKvxCR2y89+7ieHyZM3sFX9SoCn8Az/WaNwNInqaE7uewodi+mKr7AQNH | |||
OoipoFvc5v7uZNnt+Ixv8VBB66jhNMYZ4YijXMpdqNYLerMVlsTZoavkaznkdQW3 | |||
jRKcjG35gN21vyKtao0tQC7CZpwGJMqKluDTU6qY8NbvCKEyRUKBH6FKh3FSj8tg | |||
t4zEnE+XLsKys3NNuDMhA+q+MCSmBE5rqz1l4z7O2a8UQ6vKc9fSULWTK4qJuSgq | |||
gkhh6LksuplrqG7E6yXHfRNMBuVQiMwgwATiRySDNuOvHJPaWw== | |||
-----END CERTIFICATE----- | |||
-----BEGIN CERTIFICATE----- | |||
MIIFyzCCBLOgAwIBAgIQCgWbJfVLPYeUzGYxR3U4ozANBgkqhkiG9w0BAQsFADBh | |||
MQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMRkwFwYDVQQLExB3 | |||
d3cuZGlnaWNlcnQuY29tMSAwHgYDVQQDExdEaWdpQ2VydCBHbG9iYWwgUm9vdCBD | |||
QTAeFw0yMjA1MDQwMDAwMDBaFw0zMTExMDkyMzU5NTlaMFwxCzAJBgNVBAYTAlVT | |||
MRcwFQYDVQQKEw5EaWdpQ2VydCwgSW5jLjE0MDIGA1UEAxMrUmFwaWRTU0wgR2xv | |||
YmFsIFRMUyBSU0E0MDk2IFNIQTI1NiAyMDIyIENBMTCCAiIwDQYJKoZIhvcNAQEB | |||
BQADggIPADCCAgoCggIBAKY5PJhwCX2UyBb1nelu9APen53D5+C40T+BOZfSFaB0 | |||
v0WJM3BGMsuiHZX2IHtwnjUhLL25d8tgLASaUNHCBNKKUlUGRXGztuDIeXb48d64 | |||
k7Gk7u7mMRSrj+yuLSWOKnK6OGKe9+s6oaVIjHXY+QX8p2I2S3uew0bW3BFpkeAr | |||
LBCU25iqeaoLEOGIa09DVojd3qc/RKqr4P11173R+7Ub05YYhuIcSv8e0d7qN1sO | |||
1+lfoNMVfV9WcqPABmOasNJ+ol0hAC2PTgRLy/VZo1L0HRMr6j8cbR7q0nKwdbn4 | |||
Ar+ZMgCgCcG9zCMFsuXYl/rqobiyV+8U37dDScAebZTIF/xPEvHcmGi3xxH6g+dT | |||
CjetOjJx8sdXUHKXGXC9ka33q7EzQIYlZISF7EkbT5dZHsO2DOMVLBdP1N1oUp0/ | |||
1f6fc8uTDduELoKBRzTTZ6OOBVHeZyFZMMdi6tA5s/jxmb74lqH1+jQ6nTU2/Mma | |||
hGNxUuJpyhUHezgBA6sto5lNeyqc+3Cr5ehFQzUuwNsJaWbDdQk1v7lqRaqOlYjn | |||
iomOl36J5txTs0wL7etCeMRfyPsmc+8HmH77IYVMUOcPJb+0gNuSmAkvf5QXbgPI | |||
Zursn/UYnP9obhNbHc/9LYdQkB7CXyX9mPexnDNO7pggNA2jpbEarLmZGi4grMmf | |||
AgMBAAGjggGCMIIBfjASBgNVHRMBAf8ECDAGAQH/AgEAMB0GA1UdDgQWBBTwnIX9 | |||
op99j8lou9XUiU0dvtOQ/zAfBgNVHSMEGDAWgBQD3lA1VtFMu2bwo+IbG8OXsj3R | |||
VTAOBgNVHQ8BAf8EBAMCAYYwHQYDVR0lBBYwFAYIKwYBBQUHAwEGCCsGAQUFBwMC | |||
MHYGCCsGAQUFBwEBBGowaDAkBggrBgEFBQcwAYYYaHR0cDovL29jc3AuZGlnaWNl | |||
cnQuY29tMEAGCCsGAQUFBzAChjRodHRwOi8vY2FjZXJ0cy5kaWdpY2VydC5jb20v | |||
RGlnaUNlcnRHbG9iYWxSb290Q0EuY3J0MEIGA1UdHwQ7MDkwN6A1oDOGMWh0dHA6 | |||
Ly9jcmwzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydEdsb2JhbFJvb3RDQS5jcmwwPQYD | |||
VR0gBDYwNDALBglghkgBhv1sAgEwBwYFZ4EMAQEwCAYGZ4EMAQIBMAgGBmeBDAEC | |||
AjAIBgZngQwBAgMwDQYJKoZIhvcNAQELBQADggEBAAfjh/s1f5dDdfm0sNm74/dW | |||
MbbsxfYV1LoTpFt+3MSUWvSbiPQfUkoV57b5rutRJvnPP9mSlpFwcZ3e1nSUbi2o | |||
ITGA7RCOj23I1F4zk0YJm42qAwJIqOVenR3XtyQ2VR82qhC6xslxtNf7f2Ndx2G7 | |||
Mem4wpFhyPDT2P6UJ2MnrD+FC//ZKH5/ERo96ghz8VqNlmL5RXo8Ks9rMr/Ad9xw | |||
Y4hyRvAz5920myUffwdUqc0SvPlFnahsZg15uT5HkK48tHR0TLuLH8aRpzh4KJ/Y | |||
p0sARNb+9i1R4Fg5zPNvHs2BbIve0vkwxAy+R4727qYzl3027w9jEFC6HMXRaDc= | |||
-----END CERTIFICATE----- |
@@ -0,0 +1,43 @@
FROM continuumio/miniconda3:4.12.0 | |||
RUN apt-get update \ | |||
&& apt-get -y upgrade \ | |||
&& apt-get -y install zip curl \ | |||
&& apt-get install -y \ | |||
&& apt-get autoremove -y \ | |||
&& apt-get clean -y \ | |||
&& rm -rf /var/lib/apt/lists/* | |||
SHELL ["/bin/bash", "-o", "pipefail", "-c"] | |||
RUN conda install -y jupyter notebook | |||
COPY notebook.html /tmp/ | |||
RUN rm /opt/conda/lib/python3.9/site-packages/notebook/templates/notebook.html \ | |||
&& mv /tmp/notebook.html /opt/conda/lib/python3.9/site-packages/notebook/templates/ | |||
ARG NB_USER="jupyter" | |||
ARG NB_UID="1000" | |||
ARG NB_GID="100" | |||
ARG NB_PORT=8888 | |||
EXPOSE ${NB_PORT} | |||
ENV SHELL=/bin/bash \ | |||
NB_USER="${NB_USER}" \ | |||
NB_UID=${NB_UID} \ | |||
NB_GID=${NB_GID} \ | |||
PYTHONPATH=$PYTHONPATH:/opt/conda/bin | |||
ENV PATH="${PYTHONPATH}:${PATH}" | |||
RUN useradd -l -m -s /bin/bash -N -u "${NB_UID}" "${NB_USER}" \ | |||
&& mkdir /home/${NB_USER}/.jupyter | |||
COPY jupyter_notebook_config.py /home/${NB_USER}/.jupyter/ | |||
RUN chmod 777 /home/${NB_USER}/.jupyter/jupyter_notebook_config.py \ | |||
&& chmod -R 777 /home/${NB_USER} | |||
ADD enterpoint.sh /enterpoint.sh |
@@ -0,0 +1,7 @@
#! /bin/bash | |||
# dir_name=`ls /home/jupyter` | |||
config_str="\nc.NotebookApp.notebook_dir = \"/home/jupyter\"" | |||
echo -e ${config_str} >> /home/jupyter/.jupyter/jupyter_notebook_config.py | |||
jupyter notebook --ip=0.0.0.0 |