@@ -0,0 +1,182 @@
## 1 Platform deployment
#### 1.1 Deployment procedure
1. Copy the /data directory from the old server to /data on the new server:
   on the machine holding the data, change into the directory containing data and run scp -r ./data <new username>@<new ip>:/
2. Copy the docker-compose.yml file to the new server.
3. Change into the directory containing the yml file and run sudo docker-compose up -d.
#### 1.2 docker-compose.yml
``` | |||||
version: "3" | |||||
services: | |||||
oj-redis: | |||||
image: redis:4.0-alpine | |||||
container_name: oj-redis | |||||
restart: always | |||||
volumes: | |||||
- /data/data/redis:/data | |||||
oj-postgres: | |||||
image: postgres:10-alpine | |||||
container_name: oj-postgres | |||||
restart: always | |||||
command: postgres -c max_connections=1000 | |||||
ports: | |||||
- "127.0.0.1:12348:5432" | |||||
volumes: | |||||
- /data/data/postgres:/var/lib/postgresql/data | |||||
environment: | |||||
- POSTGRES_DB=onlinejudge | |||||
- POSTGRES_USER=onlinejudge | |||||
- POSTGRES_PASSWORD=onlinejudge | |||||
judge-server: | |||||
image: registry.cn-hangzhou.aliyuncs.com/wsl/judge_server | |||||
container_name: judge-server | |||||
restart: always | |||||
read_only: true | |||||
cap_drop: | |||||
- SETPCAP | |||||
- MKNOD | |||||
- NET_BIND_SERVICE | |||||
- SYS_CHROOT | |||||
- SETFCAP | |||||
- FSETID | |||||
tmpfs: | |||||
- /tmp | |||||
volumes: | |||||
- /data/backend/test_case:/test_case:ro | |||||
- /data/judge_server/log:/log | |||||
- /data/judge_server/run:/judger | |||||
environment: | |||||
- SERVICE_URL=http://judge-server:8080 | |||||
- BACKEND_URL=http://oj-backend:8000/api/judge_server_heartbeat/ | |||||
- TOKEN=DASETALENT | |||||
oj-backend: | |||||
image: registry.cn-hangzhou.aliyuncs.com/wsl/oj_backend | |||||
container_name: oj-backend | |||||
restart: always | |||||
depends_on: | |||||
- oj-redis | |||||
- oj-postgres | |||||
- judge-server | |||||
volumes: | |||||
- /data/data/backend:/data | |||||
- /data/data/app:/app | |||||
environment: | |||||
- POSTGRES_DB=onlinejudge | |||||
- POSTGRES_USER=onlinejudge | |||||
- POSTGRES_PASSWORD=onlinejudge | |||||
- JUDGE_SERVER_TOKEN=DASETALENT | |||||
# - FORCE_HTTPS=1 | |||||
# - STATIC_CDN_HOST=cdn.oj.com | |||||
ports: | |||||
- "0.0.0.0:80:8000" | |||||
- "0.0.0.0:443:1443" | |||||
``` | |||||
## 2 Problem creation requirements
#### 2.1 Problem statement
The problem statement must be precise and unambiguous.
The ranges of the input data must be given explicitly.
#### 2.2 Test cases
The samples shown in the problem statement must not appear in the test cases.
Among the test cases, small cases should account for no more than 20%, medium-sized cases for 40%-60%, and the rest should be close to the maximum data range.
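A purely illustrative sketch of generating cases in those size tiers, assuming a problem whose input is an array of n random integers (make_case is a placeholder for the real input format):
```
import random

def make_case(n, max_value=10**9):
    # placeholder: replace with the real input format of the problem
    return [random.randint(1, max_value) for _ in range(n)]

def generate_cases(n_max, total=10):
    sizes = (
        [random.randint(1, max(1, n_max // 100)) for _ in range(2)]            # small cases: 2/10 = 20%
        + [random.randint(n_max // 10, n_max // 2) for _ in range(4)]          # medium cases: 40%
        + [n_max - random.randint(0, max(1, n_max // 100)) for _ in range(4)]  # near the maximum range
    )
    return [make_case(n) for n in sizes]
```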
#### 2.3 Reference solution
The problem setter must prepare a reference solution whose correctness is guaranteed.
## 3 Problem verification requirements
#### 3.1 Different languages
Check whether the same algorithm gives the same verdict in different languages.
Avoid cases where, for example, a C implementation passes while the same algorithm written in Python times out.
#### 3.2 Verification process
At least two people verify each problem.
The problem setter submits the reference solution to confirm that the test cases are correct.
A non-setter follows the normal solving workflow: read the statement and check for ambiguity or unclear wording, then write code and submit it for testing.
Implement the reference solution in other languages and test for the issue described in 3.1; adjust the problem's time limit if necessary.
## 4 Exam procedure
Preparation before the exam
1. On the main exam OJ platform, create the contest: set a password, turn off the real-time rank, and set the contest to invisible. Create a new (backup) server.
2. Create the problems, following the requirements in section 2.
3. Verify the problems, following section 3. Verification requires making the contest visible, so make sure the password is set first, and restore the invisible state as soon as verification is finished.
4. Back up the main platform to the new server, start the services, and verify availability. See section 1 (Platform deployment).
Shortly before the exam starts
5. Confirm that the password is set, the real-time rank is off, and the contest is invisible.
Right before the start
6. Set the contest to visible so that candidates can reach the password entry page.
7. Announce the contest password.
During the exam
8. Monitor platform availability. If the service misbehaves, first log into the server and run sudo docker-compose restart for a hot restart; if that does not recover the service, go to step 9. If it recovers, continue the exam and skip step 9.
9. Have the candidates switch to the backup server for the exam.
Immediately after the exam ends
10. Change the contest password and set the contest to invisible.
Score statistics
11. Follow 机试结束后续流程.pdf.
The database connection part changes as follows:
Use the SSH tunnel:
<img src="image/image-20230328110921250.png" alt="image-20230328110921250" style="zoom:50%;" /> | |||||
Connect to the database:
<img src="image/image-20230328111052680.png" alt="image-20230328111052680" style="zoom:50%;" /> | |||||
The database name, user name, and password are all onlinejudge.
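For the score statistics, a minimal sketch of that connection in Python is shown below, assuming the sshtunnel and psycopg2 packages are installed and that Postgres is published on 127.0.0.1:12348 on the OJ host (as in the docker-compose.yml above); the SSH host, SSH user, and the queried table are placeholders.
```
from sshtunnel import SSHTunnelForwarder
import psycopg2

with SSHTunnelForwarder(
    ("oj-server-ip", 22),                      # SSH host of the OJ server (placeholder)
    ssh_username="your-ssh-user",              # placeholder
    ssh_password="your-ssh-password",          # placeholder
    remote_bind_address=("127.0.0.1", 12348),  # Postgres port published on the OJ host
) as tunnel:
    conn = psycopg2.connect(
        host="127.0.0.1",
        port=tunnel.local_bind_port,
        dbname="onlinejudge",
        user="onlinejudge",
        password="onlinejudge",
    )
    with conn.cursor() as cur:
        cur.execute("SELECT count(*) FROM submission;")  # table name is an assumption
        print(cur.fetchone())
    conn.close()
```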
## After the exam ends, manually create a permanent snapshot.
@@ -0,0 +1,160 @@
The certificates are in the cert folder; the one named 9085819__XXX expires at 07:59:59 on Saturday, January 13, 2024.
They must be replaced every year!!!
Ask the student who manages this on the Shuishan side for new certificates.
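A small reminder script (standard library only) that prints when the certificate currently served by each site expires; handy before the yearly replacement:
```
import socket
import ssl
from datetime import datetime

def cert_expiry(host, port=443):
    ctx = ssl.create_default_context()
    with socket.create_connection((host, port), timeout=10) as sock:
        with ctx.wrap_socket(sock, server_hostname=host) as tls:
            cert = tls.getpeercert()
    # notAfter looks like 'Jan 12 23:59:59 2024 GMT'
    return datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")

for host in ("mladder.shuishan.net.cn", "judgefield.shuishan.net.cn"):
    print(host, cert_expiry(host))
```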
#### 1 mladder (天梯) certificate deployment
/src        mladder project directory
/src/cert   certificate directory
/src/.env   mladder environment variable file
Steps:
1. Put the valid certificate into /src/cert.
2. Edit the /src/.env file:
``` | |||||
... | |||||
NGINX_PORT=80 | |||||
SSL_PORT=443 | |||||
## change the file names in the two lines below to the new valid certificate names
SSL_CERTIFICATE_N=/app/certs/9085819__shuishan.net.cn.pem | |||||
SSL_CERTIFICATE_KEY_N=/app/certs/9085819__shuishan.net.cn.key | |||||
## | |||||
... | |||||
``` | |||||
3. Run sudo docker-compose stop && sudo docker-compose start to restart the mladder project.
4. Visit mladder.shuishan.net.cn and check that it is reachable.
#### 2 judgefield (校场) certificate deployment
/jcdata                                       judgefield data directory (check the actual path)
/jcdata/backend/ssl                           certificate directory
/jcdata/backend_app/deploy/nginx/nginx.conf   nginx configuration file
Steps:
1. Put the valid certificate into /jcdata/backend/ssl.
2. Edit the /jcdata/backend_app/deploy/nginx/nginx.conf file:
``` | |||||
... | |||||
server { | |||||
listen 1443 ssl http2 default_server; | |||||
server_name _; | |||||
ssl_certificate /data/ssl/9085819__shuishan.net.cn.pem; | |||||
ssl_certificate_key /data/ssl/9085819__shuishan.net.cn.key; | |||||
    # edit the two lines above: /data/ssl is the path inside the container (the host
    # directory /jcdata/backend/ssl is mounted to /data/ssl); only the file names need to change
ssl_protocols TLSv1.2; | |||||
ssl_ciphers ... | |||||
} | |||||
} | |||||
``` | |||||
judgefield docker-compose.yml:
``` | |||||
version: "3" | |||||
services: | |||||
oj-redis: | |||||
image: redis:4.0-alpine | |||||
container_name: oj-redis | |||||
restart: always | |||||
volumes: | |||||
- /jcdata/redis:/data | |||||
oj-postgres: | |||||
image: postgres:10-alpine | |||||
container_name: oj-postgres | |||||
restart: always | |||||
volumes: | |||||
- /data/jcdb:/var/lib/postgresql/data | |||||
environment: | |||||
- POSTGRES_DB=onlinejudge | |||||
- POSTGRES_USER=onlinejudge | |||||
- POSTGRES_PASSWORD=onlinejudge | |||||
judge-server: | |||||
image: dasetalent/judgeserver:v2.1 | |||||
container_name: judge-server | |||||
restart: always | |||||
read_only: true | |||||
cap_drop: | |||||
- SETPCAP | |||||
- MKNOD | |||||
- NET_BIND_SERVICE | |||||
- SYS_CHROOT | |||||
- SETFCAP | |||||
- FSETID | |||||
tmpfs: | |||||
- /tmp | |||||
volumes: | |||||
- /jcdata/backend/test_case:/test_case:ro | |||||
- /jcdata/judge_server/log:/log | |||||
- /jcdata/judge_server/run:/judger | |||||
environment: | |||||
- SERVICE_URL=http://judge-server:8080 | |||||
- BACKEND_URL=http://oj-backend:8000/api/judge_server_heartbeat/ | |||||
- TOKEN=CHANGE_THIS | |||||
oj-backend: | |||||
image: registry.cn-shanghai.aliyuncs.com/shuishan-data/shuishan-oj-backend:aliyun | |||||
container_name: oj-backend | |||||
restart: always | |||||
depends_on: | |||||
- oj-redis | |||||
- oj-postgres | |||||
- judge-server | |||||
volumes: | |||||
- /jcdata/backend_app:/app | |||||
- /jcdata/backend:/data | |||||
environment: | |||||
- POSTGRES_DB=onlinejudge | |||||
- POSTGRES_USER=onlinejudge | |||||
- POSTGRES_PASSWORD=onlinejudge | |||||
- JUDGE_SERVER_TOKEN=CHANGE_THIS | |||||
# - FORCE_HTTPS=1 | |||||
# - STATIC_CDN_HOST=cdn.oj.com | |||||
ports: | |||||
- "0.0.0.0:80:8000" | |||||
- "0.0.0.0:443:1443" | |||||
``` | |||||
3. Run sudo docker-compose stop && sudo docker-compose start to restart judgefield.
4. Visit judgefield.shuishan.net.cn and check that it is reachable.
@@ -0,0 +1,28 @@
## The S3 token is valid for 1 year and must be renewed once a year
Go to https://edu.ucloud.cn/
The account credentials are in dasetalent_host.md
1. Select the project 实验室-陆雪松
 | |||||
2. Open 对象存储&CDN (Object Storage & CDN)
 | |||||
3. Click 令牌管理 (token management) in the tab bar
 | |||||
4. Click the 查看/编辑 (view/edit) button
 | |||||
5. Click 重新设置 (reset), set the validity to 1 year, and click confirm
@@ -0,0 +1,150 @@
# 水杉天梯 (Shuishan mladder) TA manual
This manual walks a TA through creating an assignment, uploading the evaluation file the assignment needs, and uploading the dataset the assignment needs, as well as how to replace an uploaded dataset and how to edit an assignment that has already been created. For multi-phase result submission it gives the file naming conventions and specifies the packaging function used in the template file.
## 1. Creating an assignment
### Assignment description section
This part explains what each field of the form means and how to fill it in.
 | |||||
**Assignment name (Chinese)**: the assignment's display name.
**Assignment name (English)**: the name of the **zip archive** generated when the form is submitted.
**Icon**: if none is uploaded, the default logo is used as the assignment's logo. (Only common image formats are supported.)
### Web Page section
 | |||||
 | |||||
**Overview (概述), data description (数据描述), evaluation (评估) and terms (限制条件)** use a rich-text editor: the text in these boxes can be bolded, italicized, resized, colored, and so on.
 | |||||
The content of these boxes is rendered as **HTML** on the generated assignment page. The mapping is:
**概述** corresponds to **Overview**; **评估** corresponds to **Evaluation**; **限制条件** corresponds to **Terms and Conditions**; **数据描述** corresponds to **Get Data**. They are circled in red in the figures below.
 | |||||
 | |||||
### Multi-phase section
The multi-phase mechanism allows one assignment to be submitted in several phases.
**(Explanation: the phases can be thought of as parts of one assignment, for example implementing SVM and a decision tree for the same classification task; each phase then has its own leaderboard.)**
 | |||||
**Max submissions per day**: the maximum number of submissions allowed per day for this assignment.
**Max submissions**: the total number of submissions allowed between the start and the end of the assignment.
**Number of phases**: how many phases the assignment is split into. After filling it in, click confirm and the corresponding number of phase forms is generated automatically.
 | |||||
After clicking confirm, one content box appears per phase; each phase needs:
**Evaluation script**: a .py file written by the TA; click the hyperlink if a reference example is needed.
**Reference prediction file**: the ground-truth result file uploaded by the TA and used for scoring.
**Phase start time**: when students may start submitting files for this phase.
 | |||||
The generated assignment will be split into phases; click a phase to submit the files required for that phase.
**Additional notes on the evaluation example file**:
**File locations**: if evaluation is done by this system, this part of the code needs no change; for local testing, just adjust the paths.
 | |||||
**Reading the files**: the script must read the student result (prediction.txt) and the reference result (true.txt). The student result file must use a fixed name that students are required to keep (it can be written into the Jupyter template file); the reference result file name must match the file uploaded when the assignment was created.
 | |||||
 | |||||
 | |||||
The data in the two files must follow the same format; adapt the custom read_txt function to read other formats.
 | |||||
**Computing the metric**: write a custom calculate_metric function for whatever metric is needed (if several metrics are wanted, extend it to produce several scores). Below is an example that computes accuracy.
 | |||||
 | |||||
**Writing the result**: the output must be written to a file with the fixed name scores.txt; this name cannot be changed. Each line of the output file holds the score of one metric, and the metric names and decimal places must match what was filled in for the leaderboard when the assignment was created.
 | |||||
 | |||||
**After writing Evaluate.py, TAs should run it locally first and upload it only once the output file is correct!**
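A minimal local-testing sketch of such an Evaluate.py, assuming prediction.txt and true.txt contain one label per line and a single accuracy metric labelled ACC with three decimal places; the paths and the exact scores.txt line format are assumptions, so follow the example file linked in the form wherever it differs.
```
import os

prediction_file = "prediction.txt"   # student result (fixed name required of students)
truth_file = "true.txt"              # reference result uploaded when creating the assignment
output_dir = "."                     # directory where scores.txt must be written

def read_txt(path):
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def calculate_metric(pred, true):
    # accuracy; extend this to return more scores if the leaderboard has more metrics
    return sum(p == t for p, t in zip(pred, true)) / len(true)

pred = read_txt(prediction_file)
true = read_txt(truth_file)
acc = calculate_metric(pred, true)

# one line per metric; label and decimal places must match the leaderboard settings
with open(os.path.join(output_dir, "scores.txt"), "w") as f:
    f.write("ACC: %.3f\n" % acc)
```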
### Leaderboard section
 | |||||
**Number of metrics**: how many evaluation metrics the assignment reports; some assignments may need more than one. Filling this in generates the metric boxes shown below.
**Metric label**: the metric name, e.g. ACC, Precision, Recall.
**Number format**: how many decimal places to keep.
**Sorting**: whether the metric is ranked in ascending or descending order; only **asc or desc** is accepted.
### Other
 | |||||
**Assignment permission**: whether joining the assignment requires the TA's approval. If yes, participation must be approved in the admin backend; if no, requests to join are approved automatically.
**Start time, end time**: when the assignment starts and ends.
 | |||||
Click "generate zip file" (the blue hyperlink in the figure above) to produce the zip archive. (**Every field except the icon is required.**)
After downloading the archive, click "upload zip file" to create a new assignment. (**The archive file name must not contain Chinese characters, spaces, etc.**)
 | |||||
After a successful upload, click the blue "view" hyperlink to see the assignment details.
## 2. Editing an assignment
To edit an assignment, open the assignment detail page and click Options, then Edit.
 | |||||
Note: the edit page contains many form fields; only the parts a TA is likely to change are described here. For deeper editing, see
https://github.com/codalab/codalab-competitions/wiki/Organizer_Codalab-competition-YAML-definition-language
for the meaning of each field.
### Assignment description section
 | |||||
See the assignment description section in part 1 (Creating an assignment).
**Title** corresponds to the **assignment's Chinese name**.
**Description** corresponds to the **description**.
### Web Page section
 | |||||
Edit the Web Pages; the mapping is the same as in the Web Page section of part 1.
### Multi-phase section
 | |||||
**Start Date (UTC)**: change the start time of each phase.
**Maximum Submissions (per User)**: change the total number of submissions allowed for the assignment.
**Max Submissions (per User) per day**: change the number of submissions each person is allowed per day.
### Leaderboard section
 | |||||
 | |||||
The **Key Label** values must be kept **identical**; they correspond to the **metric labels**.
**Numeric format** corresponds to the **number format**.
**Sorting** corresponds to the **sorting order**.
### Other
 | |||||
**Organizers need to approve the new teams**: corresponds to the assignment permission in part 1.
**Anonymous leaderboard**: whether user names on the leaderboard are anonymized.
 | |||||
**Disallow leaderboard modifying**: whether a submission can be modified afterwards.
**Force submission to leaderboard**: whether students need to submit their results to the leaderboard manually.
 | |||||
**Registration Required**: whether students need the TA's approval before they can join the assignment.
 | |||||
If a submitted result is better than the previous scores, the current result is automatically pushed to the leaderboard.
## 3. Uploading a dataset
Click the upload-dataset button to open the panel.
 | |||||
 | |||||
Click "choose file" to select the dataset archive, then click the submit button to upload it.
The upload progress is shown below the submit button; a dialog box confirms when the upload is complete.
 | |||||
Dataset format:
The archive must contain an input folder that holds the dataset; it may also contain a template notebook file.
A reference layout is shown below:
 | |||||
Note that when zipping, select all the files and folders at this level and compress them; do not compress the parent directory itself.
After uploading, the directory as seen by students looks like this:
 | |||||
The read_only_sample file is read-only and serves as a backup.
For multi-phase submission, note that an extra output folder is required; its structure is shown below:
 | |||||
 | |||||
Add as many phase_x folders as there are phases; the phase_x folder stores the output of the student's code for phase x.
## 4. Resetting the dataset
Clicking the reset-Jupyter button deletes the notebooks associated with this assignment; the next time students open a notebook, it is created from the most recently uploaded dataset as the template.
 | |||||
Use case: the uploaded dataset was wrong and needs to be replaced, but some students have already opened the notebook. Re-uploading the dataset does not update files in notebooks that are already open; clicking this button solves that.
 | |||||
 | |||||
## 5. Downloading student code
 | |||||
Click the student-code button to download.
 | |||||
All students' code can be downloaded; the student files are named by student ID.
 | |||||
Each student's notebook or py files are in the corresponding directory.
## 6. The template notebook file
The template file contains three blocks:
1. The program logic for the specific problem.
 | |||||
2. The output block: output_string stores the program's output, phase_id is the phase id, and each phase's output is stored separately.
 | |||||
3. The packaging block.
 | |||||
Only the first block normally needs to change; the last two blocks are fixed code that make students write and package their results the way the system expects (a hedged sketch of these two blocks follows below).
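A hedged sketch of what the fixed output and packaging blocks (2 and 3) might look like; the code shipped in the real template is authoritative, and the folder and zip names below simply mirror the conventions described in this manual.
```
import os
import zipfile

phase_id = 1
output_string = "0\n1\n1\n0"          # produced by block 1 (the problem-specific logic)

# block 2: write this phase's output into its own folder
out_dir = "output/phase_%d" % phase_id
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, "prediction.txt"), "w") as f:
    f.write(output_string)

# block 3: package the prediction into the zip that the submit button uploads
zip_name = "prediction_phase_%d.zip" % phase_id
with zipfile.ZipFile(zip_name, "w") as zf:
    zf.write(os.path.join(out_dir, "prediction.txt"), arcname="prediction.txt")
```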
 | |||||
Clicking the submit button submits prediction_phase_1.zip to the phase 1 leaderboard.
@@ -0,0 +1,189 @@
## 1 Resource preparation
Based on the model sizes involved, set a reasonable resource quota for each container and work out the total resources required.
For example, with 100 students and 1 CPU core / 4 GB of memory per container,
the total requirement is 100 cores and 400 GB of memory.
Add worker nodes to the K8S cluster to cover 115% of the total requirement; the extra 15% is headroom and can be a bit more.
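Written out as a quick sanity check (the numbers are the example's):
```
students = 100
cpu_per_container, mem_per_container = 1, 4            # 1 core, 4 GB per container
headroom = 1.15                                        # 15% extra, can be more
total_cpu = students * cpu_per_container * headroom    # 115 cores
total_mem = students * mem_per_container * headroom    # 460 GB
print(total_cpu, total_mem)
```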
Manually label the exam nodes with cal_type:cpu/gpu and ntype:exam.
Exam images are all scheduled onto these nodes, while ordinary teaching images are only scheduled onto ntype:study nodes; this separates exams from teaching and keeps the exam stable.
GPU images are only scheduled onto cal_type:gpu nodes, and CPU images likewise onto cal_type:cpu nodes.
## 2 Environment preparation
Build an image according to what the exam actually needs.
Build process:
The files are in the jupyter-image-mladder folder.
base is the base image; build it locally first:
cd base
sudo docker build -t mld:v1 .
In the other image folders, change the FROM line to the mld:v1 image just built.
To install packages, just follow the code below and install with conda or pip3; using a mirror source is recommended because it is much faster.
Some packages such as cudnn cannot be installed with pip; use conda instead (search Google for details).
``` | |||||
# the FROM image must match the base image you just built
FROM mld:v1
ARG NB_USER="jupyter" | |||||
ARG NB_UID="1000" | |||||
ARG NB_GID="100" | |||||
ARG NB_PORT=8888 | |||||
USER root | |||||
###### package installation below:
RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch \ | |||||
&& conda install tensorflow | |||||
RUN pip3 install numpy seaborn sklearn h5py matplotlib pandas future imageio -i https://pypi.mirrors.ustc.edu.cn/simple/ | |||||
RUN pip3 install lightgbm xgboost imblearn mindspore -i https://pypi.mirrors.ustc.edu.cn/simple/ | |||||
RUN pip3 install keras -i https://pypi.mirrors.ustc.edu.cn/simple/ | |||||
###### | |||||
ENV HOME="/home/${NB_USER}" | |||||
USER ${NB_UID} | |||||
ENTRYPOINT [ "/enterpoint.sh" ] | |||||
``` | |||||
For a different environment, only the content between the ###### markers needs to change; the rest of the file normally stays the same.
Once the image is built, push it to Docker Hub.
The dasetalent account and password are in dasetalent_host.md.
After the docker push to Docker Hub, log into each exam-only node (directly over ssh, or via the remote-connection feature of the Aliyun K8S workbench) and manually run docker pull dasetalent/xxxxxx, where xxxxxx is the uploaded image name including its version tag.
The following steps are temporary:
Log into the mladder server.
The working directory is /home/lwttest.
Edit the /home/lwttest/config.json file:
``` | |||||
{"version": "v1.2.1", "images": { | |||||
"old": {"image": "bnc1010/old_notebook:v1", "use_gpu": false, "workdir": "/home/public/", "node_select":{"ntype":"study"}}, | |||||
"torch-gpu": {"image": "bnc1010/mladder_notebook_torchgpu:v0.3", "use_gpu": true, "workdir": "/home/jupyter/", "node_select":{"ntype":"study"}}, | |||||
"tensorflow-gpu": {"image": "bnc1010/mladder_notebook_tensorflowgpu:v0.2", "use_gpu": true, "workdir": "/home/jupyter/", "node_select":{"ntype":"study"}}, | |||||
"tensorflow-pytorch-cpu(exam)": {"image": "bnc1010/mladder_notebook_torch_tf_sk:v1.6", "use_gpu": false, "workdir": "/home/jupyter/", "node_select":{"ntype":"exam"}} | |||||
}, | |||||
"node_ips": ["47.100.69.138", "139.224.216.129"], | |||||
"gpu_maxn": 0, | |||||
"gpu_notebook": {}} | |||||
``` | |||||
Take tensorflow-pytorch-cpu(exam) as an example. It is an exam-only image and has four parameters:
``` | |||||
{ | |||||
"image": "bnc1010/mladder_notebook_torch_tf_sk:v1.6", | |||||
"use_gpu": false, | |||||
"workdir": "/home/jupyter/", | |||||
"node_select":{"ntype":"exam"} | |||||
} | |||||
``` | |||||
image: the image name that can be pulled directly from Docker Hub
use_gpu: whether the image needs a GPU
workdir: the notebook working directory
node_select: the node label selector
Here node_select contains ntype:exam, which means containers started from this image will only run on the exam-only nodes.
Containers started from the other images will only run on nodes labelled ntype:study.
**After adding the new image to this config file, restart the script /home/lwttest/workServer.py.**
It currently runs inside a screen session: run screen -x notebook to attach, press Ctrl-C to interrupt it, then run python3 /home/lwttest/workServer.py to restart it.
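A hedged sketch of adding the new exam image to /home/lwttest/config.json before that restart; the entry name and image tag are placeholders.
```
import json

path = "/home/lwttest/config.json"
with open(path) as f:
    cfg = json.load(f)

cfg["images"]["my-new-exam-env"] = {        # placeholder entry name
    "image": "dasetalent/xxxxxx",           # the image pushed to Docker Hub, with its version tag
    "use_gpu": False,
    "workdir": "/home/jupyter/",
    "node_select": {"ntype": "exam"},       # schedule only onto the exam-only nodes
}

with open(path, "w") as f:
    json.dump(cfg, f, indent=2)
# then restart /home/lwttest/workServer.py as described above
```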
## 3 Contest preparation
Prepare the contest (assignment) needed for the exam.
See the mladder TA manual for details.
**Set the exam's environment to the exam-only environment prepared above.**
Do not publish it yet; for testing, use the private link, and testers can come in with their Shuishan accounts by jumping over from Shuishan.
## 4 Exam procedure
#### 4.1 Shortly before the start
Clear any containers already open on the K8S exam nodes.
Configure the container resource quota:
``` | |||||
resources: | |||||
requests: | |||||
memory: 2Gi | |||||
cpu: 800m | |||||
limits: | |||||
memory: 4Gi | |||||
cpu: 2000m | |||||
``` | |||||
#### 4.2 Start
Officially publish the contest.
#### 4.3 During the exam
Monitor the K8S cluster and keep track of resource usage in real time; if resources run short, add new nodes temporarily.
If other nodes are available, a simple and effective remedy is to manually edit the yml of any deployment that failed to start and change its ntype so that it is scheduled onto a non-exam node.
Exam content:
If a file is wrong, instructions are wrong, or files need to be modified or added, upload the fixes manually into the corresponding contest's input folder.
#### 4.4 After the exam
Tell candidates to save their notebooks, so that no code is lost when the containers are shut down.
In mladder, click the code button to download the candidates' code and the leaderboard ranking CSV file.
Once everything is saved, delete all the deployments on the exam nodes (a sketch using the kubernetes Python client follows below).
Set the contest's environment to empty so that candidates can no longer enter a notebook from the contest page.
Create a snapshot of the data disk.
Delete the temporary worker nodes.
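A hedged sketch of that deployment cleanup using the official kubernetes Python client, assuming kubectl access is configured, the notebooks live in one namespace (placeholder below), and the exam notebooks carry the nodeSelector ntype=exam.
```
from kubernetes import client, config

config.load_kube_config()
apps = client.AppsV1Api()
namespace = "default"                      # adjust to the real notebook namespace

for dep in apps.list_namespaced_deployment(namespace).items:
    selector = dep.spec.template.spec.node_selector or {}
    if selector.get("ntype") == "exam":    # only notebooks scheduled onto exam nodes
        print("deleting", dep.metadata.name)
        apps.delete_namespaced_deployment(dep.metadata.name, namespace)
```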
@@ -0,0 +1,39 @@
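# One-off helper (kept commented out): it reads the OJ submission dump
# submission_202110142140.csv, groups the submission row indices by problem_id, writes
# every within-problem index pair to data/c/id_pair.csv, and writes the numbered code
# column to data/c/newproblems.csv as input for the clone-detection pipeline below.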
# import pickle | |||||
# import pandas as pd | |||||
# title=[] | |||||
# with open('../../submission_202110142140.csv', 'r') as f: | |||||
# records = f.readlines() | |||||
# title = records[0].replace("\"","").split(',') | |||||
# records = pd.read_csv('./submission_202110142140.csv') | |||||
# pros = records["code"].to_list() | |||||
# id = range(len(pros)) | |||||
# pm_id = records["problem_id"].to_list() | |||||
# dic={} | |||||
# for i in range(len(pros)): | |||||
# problem_id = pm_id[i] | |||||
# if not dic.get(problem_id): | |||||
# dic[problem_id] = [] | |||||
# dic[problem_id].append(i) | |||||
# pairs_a = [] | |||||
# pairs_b = [] | |||||
# for k,v in dic.items(): | |||||
# for i in range(len(v)): | |||||
# for j in range(i+1, len(v)): | |||||
# pairs_a.append(v[i]) | |||||
# pairs_b.append(v[j]) | |||||
# pair_data = {'id1': pairs_a, 'id2':pairs_b} | |||||
# newpair = pd.DataFrame(pair_data, columns=["id1","id2"]) | |||||
# newpair.to_csv('./data/c/id_pair.csv') | |||||
# Data = {'0': id, '1': pros} | |||||
# newdata = pd.DataFrame(Data, columns=["0","1"]) | |||||
# newdata.to_csv('./data/c/newproblems.csv') |
@@ -0,0 +1,199 @@
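# Model definitions (imported as "model" by the training and test scripts).
# BatchTreeEncoder recursively embeds AST statement subtrees and max-pools the node
# states; BatchProgramCC runs two code fragments through that encoder plus a
# bidirectional GRU with max pooling and returns their cosine similarity as the clone score.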
import torch.nn as nn | |||||
import torch.nn.functional as F | |||||
import torch | |||||
from torch.autograd import Variable | |||||
import random | |||||
class BatchTreeEncoder(nn.Module): | |||||
def __init__(self, vocab_size, embedding_dim, encode_dim, batch_size, use_gpu, pretrained_weight=None): | |||||
super(BatchTreeEncoder, self).__init__() | |||||
self.embedding = nn.Embedding(vocab_size, embedding_dim) | |||||
self.embedding_dim = embedding_dim | |||||
self.encode_dim = encode_dim | |||||
self.W_c = nn.Linear(embedding_dim, encode_dim) | |||||
self.activation = F.relu | |||||
self.stop = -1 | |||||
self.batch_size = batch_size | |||||
self.use_gpu = use_gpu | |||||
self.node_list = [] | |||||
self.th = torch.cuda if use_gpu else torch | |||||
self.batch_node = None | |||||
self.max_index = vocab_size | |||||
# pretrained embedding | |||||
if pretrained_weight is not None: | |||||
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_weight)) | |||||
# self.embedding.weight.requires_grad = False | |||||
def create_tensor(self, tensor): | |||||
if self.use_gpu: | |||||
return tensor.cuda() | |||||
return tensor | |||||
def traverse_mul(self, node, batch_index): | |||||
size = len(node) | |||||
if not size: | |||||
return None | |||||
batch_current = self.create_tensor(Variable(torch.zeros(size, self.embedding_dim))) | |||||
index, children_index = [], [] | |||||
current_node, children = [], [] | |||||
for i in range(size): | |||||
# if node[i][0] is not -1: | |||||
index.append(i) | |||||
current_node.append(node[i][0]) | |||||
temp = node[i][1:] | |||||
c_num = len(temp) | |||||
for j in range(c_num): | |||||
                if temp[j][0] != -1:
if len(children_index) <= j: | |||||
children_index.append([i]) | |||||
children.append([temp[j]]) | |||||
else: | |||||
children_index[j].append(i) | |||||
children[j].append(temp[j]) | |||||
# else: | |||||
# batch_index[i] = -1 | |||||
batch_current = self.W_c(batch_current.index_copy(0, Variable(self.th.LongTensor(index)), | |||||
self.embedding(Variable(self.th.LongTensor(current_node))))) | |||||
for c in range(len(children)): | |||||
zeros = self.create_tensor(Variable(torch.zeros(size, self.encode_dim))) | |||||
batch_children_index = [batch_index[i] for i in children_index[c]] | |||||
tree = self.traverse_mul(children[c], batch_children_index) | |||||
if tree is not None: | |||||
batch_current += zeros.index_copy(0, Variable(self.th.LongTensor(children_index[c])), tree) | |||||
# batch_index = [i for i in batch_index if i is not -1] | |||||
b_in = Variable(self.th.LongTensor(batch_index)) | |||||
self.node_list.append(self.batch_node.index_copy(0, b_in, batch_current)) | |||||
return batch_current | |||||
def forward(self, x, bs): | |||||
self.batch_size = bs | |||||
self.batch_node = self.create_tensor(Variable(torch.zeros(self.batch_size, self.encode_dim))) | |||||
self.node_list = [] | |||||
self.traverse_mul(x, list(range(self.batch_size))) | |||||
self.node_list = torch.stack(self.node_list) | |||||
return torch.max(self.node_list, 0)[0] | |||||
class BatchProgramCC(nn.Module): | |||||
def __init__(self, embedding_dim, hidden_dim, vocab_size, encode_dim, label_size, batch_size, use_gpu=True, pretrained_weight=None): | |||||
super(BatchProgramCC, self).__init__() | |||||
self.stop = [vocab_size-1] | |||||
self.hidden_dim = hidden_dim | |||||
self.num_layers = 1 | |||||
self.gpu = use_gpu | |||||
self.batch_size = batch_size | |||||
self.vocab_size = vocab_size | |||||
self.embedding_dim = embedding_dim | |||||
self.encode_dim = encode_dim | |||||
self.label_size = label_size | |||||
self.encoder = BatchTreeEncoder(self.vocab_size, self.embedding_dim, self.encode_dim, | |||||
self.batch_size, self.gpu, pretrained_weight) | |||||
self.root2label = nn.Linear(self.encode_dim, self.label_size) | |||||
# gru | |||||
self.bigru = nn.GRU(self.encode_dim, self.hidden_dim, num_layers=self.num_layers, bidirectional=True, | |||||
batch_first=True) | |||||
# linear | |||||
self.hidden2label = nn.Linear(self.hidden_dim * 2, self.label_size) | |||||
# hidden | |||||
# self.hidden = self.init_hidden() | |||||
self.dropout = nn.Dropout(0.2) | |||||
def init_hidden(self): | |||||
if self.gpu is True: | |||||
if isinstance(self.bigru, nn.LSTM): | |||||
h0 = Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim).cuda()) | |||||
c0 = Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim).cuda()) | |||||
return h0, c0 | |||||
return Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim)).cuda() | |||||
else: | |||||
return Variable(torch.zeros(self.num_layers * 2, self.batch_size, self.hidden_dim)) | |||||
def get_zeros(self, num): | |||||
zeros = Variable(torch.zeros(num, self.encode_dim)) | |||||
if self.gpu: | |||||
return zeros.cuda() | |||||
return zeros | |||||
def encode(self, x): | |||||
# print(x) | |||||
lens = [len(item) for item in x] | |||||
max_len = max(lens) | |||||
encodes = [] | |||||
for i in range(self.batch_size): | |||||
for j in range(lens[i]): | |||||
encodes.append(x[i][j]) | |||||
encodes = self.encoder(encodes, sum(lens)) | |||||
seq, start, end = [], 0, 0 | |||||
for i in range(self.batch_size): | |||||
end += lens[i] | |||||
if max_len - lens[i]: | |||||
seq.append(self.get_zeros(max_len - lens[i])) | |||||
seq.append(encodes[start:end]) | |||||
start = end | |||||
encodes = torch.cat(seq) | |||||
encodes = encodes.view(self.batch_size, max_len, -1) | |||||
# gru | |||||
gru_out, hidden = self.bigru(encodes, self.hidden) | |||||
gru_out = torch.transpose(gru_out, 1, 2) | |||||
# pooling | |||||
gru_out = F.max_pool1d(gru_out, gru_out.size(2)).squeeze(2) | |||||
return gru_out | |||||
def forward(self, x1, x2): | |||||
lvec, rvec = self.encode(x1), self.encode(x2) | |||||
# abs_dist = torch.abs(torch.add(lvec, -rvec)) | |||||
y = F.cosine_similarity(rvec, lvec).view(-1) | |||||
# y = torch.sigmoid(self.hidden2label(abs_dist)) | |||||
return y | |||||
# def encode(self, x): | |||||
# bs = x.size(0) | |||||
# lens = [len(item) for item in x] | |||||
# max_len = max(lens) | |||||
# encodes = x | |||||
# # encodes = [] | |||||
# # for i in range(self.batch_size): | |||||
# # for j in range(lens[i]): | |||||
# # encodes.append(x[i][j]) | |||||
# # | |||||
# # encodes = self.encoder(encodes, sum(lens)) | |||||
# seq, start, end = [], 0, 0 | |||||
# for i in range(bs): | |||||
# end += lens[i] | |||||
# if max_len-lens[i]: | |||||
# seq.append(self.get_zeros(max_len-lens[i])) | |||||
# seq.append(encodes[start:end]) | |||||
# start = end | |||||
# encodes = torch.cat(seq) | |||||
# encodes = encodes.view(bs, max_len, -1) | |||||
# # return encodes | |||||
# | |||||
# # gru_out, hidden = self.bigru(encodes, self.hidden) | |||||
# gru_out, hidden = self.bigru(encodes) | |||||
# gru_out = torch.transpose(gru_out, 1, 2) | |||||
# # pooling | |||||
# gru_out = F.max_pool1d(gru_out, gru_out.size(2)).squeeze(2) | |||||
# | |||||
# return gru_out | |||||
# | |||||
# def forward(self, x1, x2): | |||||
# lvec, rvec = self.encode(x1), self.encode(x2) | |||||
# | |||||
# abs_dist = torch.abs(torch.add(lvec, -rvec)) | |||||
# | |||||
# y = torch.sigmoid(self.hidden2label(abs_dist)).view(x1.size(0), -1) | |||||
# t = (~(y > 0.5)).float() | |||||
# out = torch.cat([t, y],dim=1) | |||||
# return out | |||||
@@ -0,0 +1,193 @@
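# Preprocessing pipeline (run as "python pipeline.py", see the README below): parses the
# source code into ASTs (pycparser for C, javalang for Java), reads the clone id pairs,
# splits train/dev/test, optionally trains a word2vec embedding, turns each AST into
# statement-block index sequences, and merges pairs with blocks into blocks.pkl files.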
import pandas as pd | |||||
import os | |||||
import sys | |||||
import warnings | |||||
warnings.filterwarnings('ignore') | |||||
class Pipeline: | |||||
def __init__(self, ratio, root, language): | |||||
self.ratio = ratio | |||||
self.root = root | |||||
self.language = language | |||||
self.sources = None | |||||
self.blocks = None | |||||
self.pairs = None | |||||
self.train_file_path = None | |||||
self.dev_file_path = None | |||||
self.test_file_path = None | |||||
self.size = None | |||||
# parse source code | |||||
def parse_source(self, output_file, option): | |||||
path = self.root+self.language+'/'+output_file | |||||
if os.path.exists(path) and option == 'existing': | |||||
source = pd.read_pickle(path) | |||||
else: | |||||
            if self.language == 'c':
from pycparser import c_parser | |||||
parser = c_parser.CParser() | |||||
source = pd.read_pickle(self.root+self.language+'/programs.pkl') | |||||
source.columns = ['id', 'code', 'label'] | |||||
source['code'] = source['code'].apply(parser.parse) | |||||
source.to_pickle(path) | |||||
else: | |||||
import javalang | |||||
def parse_program(func): | |||||
tokens = javalang.tokenizer.tokenize(func) | |||||
parser = javalang.parser.Parser(tokens) | |||||
tree = parser.parse_member_declaration() | |||||
return tree | |||||
# source = pd.read_csv(self.root+self.language+'/bcb_funcs_all.tsv', sep='\t', header=None, encoding='utf-8') | |||||
source = pd.read_csv(self.root + self.language + '/codes.csv') | |||||
source.columns = ['id', 'code'] | |||||
source['code'] = source['code'].apply(parse_program) | |||||
source.to_pickle(path) | |||||
self.sources = source | |||||
return source | |||||
# create clone pairs | |||||
def read_pairs(self, filename): | |||||
pairs = pd.read_pickle(self.root+self.language+'/'+filename) | |||||
self.pairs = pairs | |||||
# split data for training, developing and testing | |||||
def split_data(self): | |||||
data_path = self.root+self.language+'/' | |||||
data = self.pairs | |||||
data_num = len(data) | |||||
ratios = [int(r) for r in self.ratio.split(':')] | |||||
train_split = int(ratios[0]/sum(ratios)*data_num) | |||||
val_split = train_split + int(ratios[1]/sum(ratios)*data_num) | |||||
data = data.sample(frac=1, random_state=666) | |||||
train = data.iloc[:train_split] | |||||
dev = data.iloc[train_split:val_split] | |||||
test = data.iloc[val_split:] | |||||
def check_or_create(path): | |||||
if not os.path.exists(path): | |||||
os.mkdir(path) | |||||
train_path = data_path+'train/' | |||||
check_or_create(train_path) | |||||
self.train_file_path = train_path+'train_.pkl' | |||||
train.to_pickle(self.train_file_path) | |||||
dev_path = data_path+'dev/' | |||||
check_or_create(dev_path) | |||||
self.dev_file_path = dev_path+'dev_.pkl' | |||||
dev.to_pickle(self.dev_file_path) | |||||
test_path = data_path+'test/' | |||||
check_or_create(test_path) | |||||
self.test_file_path = test_path+'test_.pkl' | |||||
test.to_pickle(self.test_file_path) | |||||
# construct dictionary and train word embedding | |||||
def dictionary_and_embedding(self, input_file, size): | |||||
self.size = size | |||||
data_path = self.root+self.language+'/' | |||||
if not input_file: | |||||
input_file = self.train_file_path | |||||
pairs = pd.read_pickle(input_file) | |||||
train_ids = pairs['id1'].append(pairs['id2']).unique() | |||||
#trees = self.sources.set_index('id',drop=False).loc[train_ids] | |||||
trees = self.sources.set_index('id',drop=False).loc[train_ids[0]] | |||||
for i in train_ids[1:]: | |||||
tmp_tt = self.sources.set_index('id',drop=False).loc[i] | |||||
trees = pd.concat([trees,tmp_tt],axis=0) | |||||
if not os.path.exists(data_path+'train/embedding'): | |||||
os.mkdir(data_path+'train/embedding') | |||||
        if self.language == 'c':
sys.path.append('../') | |||||
from prepare_data import get_sequences as func | |||||
else: | |||||
from utils import get_sequence as func | |||||
def trans_to_sequences(ast): | |||||
sequence = [] | |||||
func(ast, sequence) | |||||
return sequence | |||||
corpus = trees['code'].apply(trans_to_sequences) | |||||
str_corpus = [' '.join(c) for c in corpus] | |||||
trees['code'] = pd.Series(str_corpus) | |||||
# trees.to_csv(data_path+'train/programs_ns.tsv') | |||||
from gensim.models.word2vec import Word2Vec | |||||
w2v = Word2Vec(corpus, size=size, workers=16, sg=1, max_final_vocab=3000) | |||||
w2v.save(data_path+'train/embedding/node_w2v_' + str(size)) | |||||
# generate block sequences with index representations | |||||
def generate_block_seqs(self,size): | |||||
self.size = size | |||||
        if self.language == 'c':
from prepare_data import get_blocks as func | |||||
else: | |||||
from utils import get_blocks_v1 as func | |||||
from gensim.models.word2vec import Word2Vec | |||||
        word2vec = Word2Vec.load(self.root+self.language+'/node_w2v_' + str(self.size)).wv
vocab = word2vec.vocab | |||||
max_token = word2vec.syn0.shape[0] | |||||
def tree_to_index(node): | |||||
token = node.token | |||||
result = [vocab[token].index if token in vocab else max_token] | |||||
children = node.children | |||||
for child in children: | |||||
result.append(tree_to_index(child)) | |||||
return result | |||||
def trans2seq(r): | |||||
blocks = [] | |||||
func(r, blocks) | |||||
tree = [] | |||||
for b in blocks: | |||||
btree = tree_to_index(b) | |||||
tree.append(btree) | |||||
return tree | |||||
trees = pd.DataFrame(self.sources, copy=True) | |||||
trees['code'] = trees['code'].apply(trans2seq) | |||||
if 'label' in trees.columns: | |||||
trees.drop('label', axis=1, inplace=True) | |||||
self.blocks = trees | |||||
# merge pairs | |||||
def merge(self, data_path, part): | |||||
pairs = pd.read_pickle(data_path) | |||||
pairs['id1'] = pairs['id1'].astype(int) | |||||
pairs['id2'] = pairs['id2'].astype(int) | |||||
df = pd.merge(pairs, self.blocks, how='left', left_on='id1', right_on='id') | |||||
df = pd.merge(df, self.blocks, how='left', left_on='id2', right_on='id') | |||||
df.drop(['id_x', 'id_y'], axis=1,inplace=True) | |||||
df.dropna(inplace=True) | |||||
df.to_pickle(self.root+self.language+'/'+part+'/blocks.pkl') | |||||
# run for processing data to train | |||||
def run(self): | |||||
print('parse source code...') | |||||
self.parse_source(output_file='ast.pkl',option='existing') | |||||
print('read id pairs...') | |||||
        if self.language == 'c':
self.read_pairs('oj_clone_ids.pkl') | |||||
else: | |||||
self.read_pairs('id_pairs.pkl') | |||||
# self.read_pairs('bcb_pair_ids.pkl') | |||||
print('split data...') | |||||
self.split_data() | |||||
#print('train word embedding...') | |||||
# self.dictionary_and_embedding(None, 128) | |||||
print('generate block sequences...') | |||||
self.generate_block_seqs(128) | |||||
print('merge pairs and blocks...') | |||||
self.merge(self.train_file_path, 'train') | |||||
self.merge(self.dev_file_path, 'dev') | |||||
self.merge(self.test_file_path, 'test') | |||||
lang = "c" | |||||
ppl = Pipeline('8:1:1', 'data/', lang) | |||||
ppl.run() | |||||
@@ -0,0 +1,45 @@
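# C-side helpers (imported as "prepare_data"): get_sequences flattens a pycparser AST
# into a token sequence, and get_blocks splits it into statement-level blocks for the
# tree encoder.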
from pycparser import c_parser, c_ast | |||||
import pandas as pd | |||||
import os | |||||
import re | |||||
import sys | |||||
from gensim.models.word2vec import Word2Vec | |||||
import pickle | |||||
from tree import ASTNode, SingleNode | |||||
import numpy as np | |||||
def get_sequences(node, sequence): | |||||
current = SingleNode(node) | |||||
sequence.append(current.get_token()) | |||||
for _, child in node.children(): | |||||
get_sequences(child, sequence) | |||||
if current.get_token().lower() == 'compound': | |||||
sequence.append('End') | |||||
def get_blocks(node, block_seq): | |||||
children = node.children() | |||||
name = node.__class__.__name__ | |||||
if name in ['FuncDef', 'If', 'For', 'While', 'DoWhile']: | |||||
block_seq.append(ASTNode(node)) | |||||
        if name != 'For':
skip = 1 | |||||
else: | |||||
skip = len(children) - 1 | |||||
for i in range(skip, len(children)): | |||||
child = children[i][1] | |||||
if child.__class__.__name__ not in ['FuncDef', 'If', 'For', 'While', 'DoWhile', 'Compound']: | |||||
block_seq.append(ASTNode(child)) | |||||
get_blocks(child, block_seq) | |||||
    elif name == 'Compound':
block_seq.append(ASTNode(name)) | |||||
for _, child in node.children(): | |||||
if child.__class__.__name__ not in ['If', 'For', 'While', 'DoWhile']: | |||||
block_seq.append(ASTNode(child)) | |||||
get_blocks(child, block_seq) | |||||
block_seq.append(ASTNode('End')) | |||||
else: | |||||
for _, child in node.children(): | |||||
get_blocks(child, block_seq) |
@@ -0,0 +1,90 @@
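# Test script ("python test.py"): loads the preprocessed C test pairs and a saved
# BatchProgramCC checkpoint, runs batched inference, and reports precision/recall/F1
# using a 0.5 clone threshold.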
import pandas as pd | |||||
import torch | |||||
import time | |||||
import numpy as np | |||||
import warnings | |||||
from gensim.models.word2vec import Word2Vec | |||||
from model import BatchProgramCC | |||||
from torch.autograd import Variable | |||||
from sklearn.metrics import precision_recall_fscore_support | |||||
from tqdm import tqdm | |||||
warnings.filterwarnings('ignore') | |||||
def get_batch(dataset, idx, bs): | |||||
tmp = dataset.iloc[idx: idx+bs] | |||||
x1, x2, labels = [], [], [] | |||||
for _, item in tmp.iterrows(): | |||||
x1.append(item['code_ids_x']) | |||||
x2.append(item['code_ids_y']) | |||||
labels.append([item['label']]) | |||||
return x1, x2, torch.FloatTensor(labels) | |||||
if __name__ == '__main__': | |||||
lang = 'c' | |||||
root = 'data/' | |||||
test_data = pd.read_pickle(root+lang+'/test/blocks_new.pkl').sample(frac=1) | |||||
word2vec = Word2Vec.load(root+lang+"/node_w2v_128").wv | |||||
MAX_TOKENS = word2vec.syn0.shape[0] | |||||
EMBEDDING_DIM = word2vec.syn0.shape[1] | |||||
embeddings = np.zeros((MAX_TOKENS + 1, EMBEDDING_DIM), dtype="float32") | |||||
embeddings[:word2vec.syn0.shape[0]] = word2vec.syn0 | |||||
HIDDEN_DIM = 100 | |||||
ENCODE_DIM = 128 | |||||
LABELS = 1 | |||||
EPOCHS = 5 | |||||
BATCH_SIZE = 64 | |||||
USE_GPU = True | |||||
model = BatchProgramCC(EMBEDDING_DIM,HIDDEN_DIM,MAX_TOKENS+1,ENCODE_DIM,LABELS,BATCH_SIZE, | |||||
USE_GPU, embeddings) | |||||
if USE_GPU: | |||||
model.cuda() | |||||
parameters = model.parameters() | |||||
optimizer = torch.optim.Adamax(parameters) | |||||
loss_function = torch.nn.BCELoss() | |||||
PATH = './model/model_clone_c.pkl' | |||||
checkpoint = torch.load(PATH) | |||||
start_epoch = checkpoint['epoch'] | |||||
model.load_state_dict(checkpoint['model_state_dict']) | |||||
test_data_t = test_data | |||||
print("Testing..." ) | |||||
# testing procedure | |||||
predicts = [] | |||||
trues = [] | |||||
total_loss = 0.0 | |||||
total = 0.0 | |||||
i = 0 | |||||
for i in tqdm(range(0, len(test_data_t), BATCH_SIZE)): | |||||
if i + BATCH_SIZE > len(test_data_t): | |||||
BATCH_SIZE = len(test_data_t) - i | |||||
batch = get_batch(test_data_t, i, BATCH_SIZE) | |||||
i += BATCH_SIZE | |||||
test1_inputs, test2_inputs, test_labels = batch | |||||
if USE_GPU: | |||||
test_labels = test_labels.cuda() | |||||
model.batch_size = len(test_labels) | |||||
model.hidden = model.init_hidden() | |||||
output = model(test1_inputs, test2_inputs) | |||||
# loss = loss_function(output, Variable(test_labels)) | |||||
# calc testing acc | |||||
predicted = (output.data > 0.5).cpu().numpy() | |||||
predicts.extend(predicted) | |||||
trues.extend(test_labels.cpu().numpy()) | |||||
# total += len(test_labels) | |||||
# total_loss += loss.item() * len(test_labels) | |||||
p, r, f, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||||
print("Testing results(P,R,F1):%.3f, %.3f, %.3f" % (p, r, f)) |
@@ -0,0 +1,246 @@
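# Training script: trains BatchProgramCC on the Java clone pairs with BCE loss and the
# Adamax optimizer, validates after every epoch, saves a checkpoint to ./model/, and
# finally evaluates on the test split.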
import pandas as pd | |||||
import torch | |||||
import time | |||||
import numpy as np | |||||
import warnings | |||||
from gensim.models.word2vec import Word2Vec | |||||
from model import BatchProgramCC | |||||
from torch.autograd import Variable | |||||
from sklearn.metrics import precision_recall_fscore_support | |||||
from tqdm import tqdm | |||||
warnings.filterwarnings('ignore') | |||||
from gensim.models.word2vec import Word2Vec | |||||
# word2vec = Word2Vec.load("./train/embedding/node_w2v_128_new").wv | |||||
# word2vec.index2word | |||||
def get_batch(dataset, idx, bs): | |||||
tmp = dataset.iloc[idx: idx+bs] | |||||
x1, x2, labels = [], [], [] | |||||
for _, item in tmp.iterrows(): | |||||
x1.append(eval(item['code_ids_x'])) | |||||
x2.append(eval(item['code_ids_y'])) | |||||
labels.append([item['label']]) | |||||
return x1, x2, torch.FloatTensor(labels) | |||||
if __name__ == '__main__': | |||||
# import argparse | |||||
# | |||||
# parser = argparse.ArgumentParser(description="Choose a dataset:[c|java]") | |||||
# parser.add_argument('--lang') | |||||
# args = parser.parse_args() | |||||
# args.lang = 'java' | |||||
# if not args.lang: | |||||
# print("No specified dataset") | |||||
# exit(1) | |||||
root = 'data/' | |||||
lang = 'java' | |||||
categories = 1 | |||||
if lang == 'java': | |||||
categories = 5 | |||||
print("Train for ", str.upper(lang)) | |||||
# train_data = pd.read_pickle(root+lang+'/train/blocks_30w.pkl').sample(frac=1) | |||||
train_data = pd.read_csv(root + lang + '/train/blocks_30w.csv').sample(frac=1) | |||||
train_data = train_data.replace(-1, 0) | |||||
# val_data = pd.read_pickle(root+lang+'/dev/blocks_30w.pkl').sample(frac=1) | |||||
val_data = pd.read_csv(root + lang + '/dev/blocks_30w.csv').sample(frac=1) | |||||
val_data = val_data.replace(-1, 0) | |||||
# test_data = pd.read_pickle(root+lang+'/test/blocks_30w.pkl').sample(frac=1) | |||||
test_data = pd.read_csv(root + lang + '/test/blocks_30w.csv').sample(frac=1) | |||||
test_data = test_data.replace(-1, 0) | |||||
test_data.loc[test_data['label'] > 0, 'label'] = 1 | |||||
word2vec = Word2Vec.load("./data/java/train/embedding/node_w2v_128_new").wv | |||||
MAX_TOKENS = word2vec.syn0.shape[0] | |||||
EMBEDDING_DIM = word2vec.syn0.shape[1] | |||||
embeddings = np.zeros((MAX_TOKENS + 1, EMBEDDING_DIM), dtype="float32") | |||||
embeddings[:word2vec.syn0.shape[0]] = word2vec.syn0 | |||||
HIDDEN_DIM = 100 | |||||
ENCODE_DIM = 128 | |||||
LABELS = 1 | |||||
EPOCHS = 10 | |||||
BATCH_SIZE = 64 | |||||
USE_GPU = True | |||||
model = BatchProgramCC(EMBEDDING_DIM,HIDDEN_DIM,MAX_TOKENS+1,ENCODE_DIM,LABELS,BATCH_SIZE, | |||||
USE_GPU, embeddings) | |||||
if USE_GPU: | |||||
model.cuda() | |||||
parameters = model.parameters() | |||||
optimizer = torch.optim.Adamax(parameters) | |||||
loss_function = torch.nn.BCELoss() | |||||
PATH = './model/model_clone_java_30w.pkl' | |||||
print(train_data) | |||||
precision, recall, f1 = 0, 0, 0 | |||||
print('Start training...') | |||||
for t in range(5, categories+1): | |||||
# if lang == 'java': | |||||
# # train_data_t = train_data[train_data['label'].isin([t, 0])] | |||||
# train_data_t = train_data | |||||
# train_data_t.loc[train_data_t['label'] > 0, 'label'] = 1 | |||||
# | |||||
# # val_data_t = val_data[val_data['label'].isin([t, 0])] | |||||
# val_data_t = val_data | |||||
# val_data_t.loc[val_data_t['label'] > 0, 'label'] = 1 | |||||
# | |||||
# # test_data_t = test_data[test_data['label'].isin([t, 0])] | |||||
# test_data_t = test_data | |||||
# # test_data_t.loc[test_data_t['label'] > 0, 'label'] = 1 | |||||
# else: | |||||
train_data_t, val_data_t, test_data_t = train_data, val_data, test_data | |||||
# training procedure | |||||
train_loss_ = [] | |||||
val_loss_ = [] | |||||
for epoch in range(EPOCHS): | |||||
start_time = time.time() | |||||
# training epoch | |||||
total_acc = 0.0 | |||||
total_loss = 0.0 | |||||
total = 0.0 | |||||
i = 0 | |||||
predicts = [] | |||||
trues = [] | |||||
model.train() | |||||
bs = BATCH_SIZE | |||||
# while i < len(train_data_t): | |||||
for i in tqdm(range(0, len(train_data_t), bs)): | |||||
if i + bs > len(train_data_t): | |||||
bs = len(train_data_t) - i | |||||
batch = get_batch(train_data_t, i, bs) | |||||
# i += BATCH_SIZE | |||||
train1_inputs, train2_inputs, train_labels = batch | |||||
if USE_GPU: | |||||
train1_inputs, train2_inputs, train_labels = train1_inputs, train2_inputs, train_labels.cuda() | |||||
model.zero_grad() | |||||
model.batch_size = len(train_labels) | |||||
model.hidden = model.init_hidden() | |||||
output = model(train1_inputs, train2_inputs) | |||||
loss = loss_function(output, Variable(train_labels)) | |||||
loss.backward() | |||||
optimizer.step() | |||||
total += len(train_labels) | |||||
total_loss += loss.item() * len(train_labels) | |||||
predicted = (output.data > 0.5).cpu().numpy() | |||||
predicts.extend(predicted) | |||||
trues.extend(train_labels.cpu().numpy()) | |||||
train_loss_.append(total_loss / total) | |||||
precision, recall, f1, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||||
total_loss = 0.0 | |||||
total = 0.0 | |||||
i = 0 | |||||
bs = BATCH_SIZE | |||||
predicts = [] | |||||
trues = [] | |||||
model.eval() | |||||
# while i < len(val_data_t): | |||||
# batch = get_batch(val_data_t, i, BATCH_SIZE) | |||||
# i += BATCH_SIZE | |||||
for i in tqdm(range(0, len(val_data_t), bs)): | |||||
if i + bs > len(val_data_t): | |||||
bs = len(val_data_t) - i | |||||
            batch = get_batch(val_data_t, i, bs)
val1_inputs, val2_inputs, val_labels = batch | |||||
if USE_GPU: | |||||
val1_inputs, val2_inputs, val_labels = val1_inputs, val2_inputs, val_labels.cuda() | |||||
model.batch_size = len(val_labels) | |||||
model.hidden = model.init_hidden() | |||||
output = model(val1_inputs, val2_inputs) | |||||
loss = loss_function(output, Variable(val_labels)) | |||||
total += len(val_labels) | |||||
total_loss += loss.item() * len(val_labels) | |||||
predicted = (output.data > 0.5).cpu().numpy() | |||||
predicts.extend(predicted) | |||||
trues.extend(val_labels.cpu().numpy()) | |||||
val_loss_.append(total_loss / total) | |||||
precision_, recall_, f1_, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||||
print('categories-%d [Epoch: %3d/%3d] Training Loss: %.4f, Validation Loss: %.4f,' | |||||
% (t, epoch + 1, EPOCHS, train_loss_[epoch], val_loss_[epoch])) | |||||
print("Train results(P,R,F1):%.3f, %.3f, %.3f" % (precision, recall, f1)) | |||||
print("Dev results(P,R,F1):%.3f, %.3f, %.3f" % (precision_, recall_, f1_)) | |||||
torch.save({'epoch': epoch, | |||||
'model_state_dict': model.state_dict() | |||||
}, PATH) | |||||
print("Testing-%d..." % t) | |||||
# testing procedure | |||||
predicts = [] | |||||
trues = [] | |||||
total_loss = 0.0 | |||||
total = 0.0 | |||||
i = 0 | |||||
while i < len(test_data_t): | |||||
batch = get_batch(test_data_t, i, BATCH_SIZE) | |||||
i += BATCH_SIZE | |||||
test1_inputs, test2_inputs, test_labels = batch | |||||
if USE_GPU: | |||||
test_labels = test_labels.cuda() | |||||
model.batch_size = len(test_labels) | |||||
model.hidden = model.init_hidden() | |||||
output = model(test1_inputs, test2_inputs) | |||||
# loss = loss_function(output, Variable(test_labels)) | |||||
# calc testing acc | |||||
predicted = (output.data > 0.5).cpu().numpy() | |||||
predicts.extend(predicted) | |||||
trues.extend(test_labels.cpu().numpy()) | |||||
# total += len(test_labels) | |||||
# total_loss += loss.item() * len(test_labels) | |||||
precision_, recall_, f1_, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||||
print("Test results(P,R,F1):%.3f, %.3f, %.3f" % (precision_, recall_, f1_)) | |||||
# result = pd.DataFrame(np.array(predicts), columns=['predict']) | |||||
# result['true'] = pd.DataFrame(np.array(trues)) | |||||
# result['label'] = pd.DataFrame(np.array(trues)) | |||||
# result.loc[result['label'] > 0, 'label'] = 1 | |||||
# weights = [0, 0.005, 0.001, 0.002, 0.010, 0.982] | |||||
# for k in range(1, categories+1): | |||||
# trues_ = result[result['true'].isin([0, k])]['label'].values | |||||
# predicts_ = result[result['true'].isin([0, k])]['predict'].values | |||||
# p, r, f, _ = precision_recall_fscore_support(trues_, predicts_, average='binary') | |||||
# precision += weights[k] * p | |||||
# recall += weights[k] * r | |||||
# f1 += weights[k] * f | |||||
# print("Type-" + str(k) + ": " + str(p) + " " + str(r) + " " + str(f)) | |||||
# | |||||
# print("Total testing results(P,R,F1):%.3f, %.3f, %.3f" % (precision, recall, f1)) | |||||
# if lang == 'java': | |||||
# weights = [0, 0.005, 0.001, 0.002, 0.010, 0.982] | |||||
# p, r, f, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||||
# precision += weights[t] * p | |||||
# recall += weights[t] * r | |||||
# f1 += weights[t] * f | |||||
# print("Type-" + str(t) + ": " + str(p) + " " + str(r) + " " + str(f)) | |||||
# else: | |||||
# precision, recall, f1, _ = precision_recall_fscore_support(trues, predicts, average='binary') | |||||
# | |||||
# print("Total testing results(P,R,F1):%.3f, %.3f, %.3f" % (precision, recall, f1)) |
@@ -0,0 +1,170 @@
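# AST node wrappers (imported as "tree"): ASTNode and SingleNode wrap pycparser (C)
# nodes, BlockNode wraps javalang (Java) nodes; each exposes a token string and its children.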
from javalang.ast import Node | |||||
class ASTNode(object): | |||||
def __init__(self, node): | |||||
self.node = node | |||||
# self.vocab = word_map | |||||
self.is_str = isinstance(self.node, str) | |||||
self.token = self.get_token() | |||||
# self.index = self.token_to_index(self.token) | |||||
self.children = self.add_children() | |||||
def is_leaf(self): | |||||
if self.is_str: | |||||
return True | |||||
return len(self.node.children()) == 0 | |||||
def get_token(self, lower=True): | |||||
if self.is_str: | |||||
return self.node | |||||
name = self.node.__class__.__name__ | |||||
token = name | |||||
is_name = False | |||||
if self.is_leaf(): | |||||
attr_names = self.node.attr_names | |||||
if attr_names: | |||||
if 'names' in attr_names: | |||||
token = self.node.names[0] | |||||
elif 'name' in attr_names: | |||||
token = self.node.name | |||||
is_name = True | |||||
else: | |||||
token = self.node.value | |||||
else: | |||||
token = name | |||||
else: | |||||
if name == 'TypeDecl': | |||||
token = self.node.declname | |||||
if self.node.attr_names: | |||||
attr_names = self.node.attr_names | |||||
if 'op' in attr_names: | |||||
if self.node.op[0] == 'p': | |||||
token = self.node.op[1:] | |||||
else: | |||||
token = self.node.op | |||||
if token is None: | |||||
token = name | |||||
if lower and is_name: | |||||
token = token.lower() | |||||
return token | |||||
# def token_to_index(self, token): | |||||
# self.index = self.vocab[token].index if token in self.vocab else MAX_TOKENS | |||||
# return self.index | |||||
# def get_index(self): | |||||
# return self.index | |||||
def add_children(self): | |||||
if self.is_str: | |||||
return [] | |||||
children = self.node.children() | |||||
if self.token in ['FuncDef', 'If', 'While', 'DoWhile']: | |||||
return [ASTNode(children[0][1])] | |||||
elif self.token == 'For': | |||||
return [ASTNode(children[c][1]) for c in range(0, len(children)-1)] | |||||
else: | |||||
return [ASTNode(child) for _, child in children] | |||||
class BlockNode(object): | |||||
def __init__(self, node): | |||||
self.node = node | |||||
self.is_str = isinstance(self.node, str) | |||||
self.token = self.get_token(node) | |||||
self.children = self.add_children() | |||||
def is_leaf(self): | |||||
if self.is_str: | |||||
return True | |||||
return len(self.node.children) == 0 | |||||
def get_token(self, node): | |||||
if isinstance(node, str): | |||||
token = node | |||||
elif isinstance(node, set): | |||||
token = 'Modifier' | |||||
elif isinstance(node, Node): | |||||
token = node.__class__.__name__ | |||||
else: | |||||
token = '' | |||||
return token | |||||
def ori_children(self, root): | |||||
if isinstance(root, Node): | |||||
if self.token in ['MethodDeclaration', 'ConstructorDeclaration']: | |||||
children = root.children[:-1] | |||||
else: | |||||
children = root.children | |||||
elif isinstance(root, set): | |||||
children = list(root) | |||||
else: | |||||
children = [] | |||||
def expand(nested_list): | |||||
for item in nested_list: | |||||
if isinstance(item, list): | |||||
for sub_item in expand(item): | |||||
yield sub_item | |||||
elif item: | |||||
yield item | |||||
return list(expand(children)) | |||||
def add_children(self): | |||||
if self.is_str: | |||||
return [] | |||||
logic = ['SwitchStatement', 'IfStatement', 'ForStatement', 'WhileStatement', 'DoStatement'] | |||||
children = self.ori_children(self.node) | |||||
if self.token in logic: | |||||
return [BlockNode(children[0])] | |||||
elif self.token in ['MethodDeclaration', 'ConstructorDeclaration']: | |||||
return [BlockNode(child) for child in children] | |||||
else: | |||||
return [BlockNode(child) for child in children if self.get_token( child) not in logic] | |||||
class SingleNode(ASTNode): | |||||
def __init__(self, node): | |||||
self.node = node | |||||
self.is_str = isinstance(self.node, str) | |||||
self.token = self.get_token() | |||||
self.children = [] | |||||
def is_leaf(self): | |||||
if self.is_str: | |||||
return True | |||||
return len(self.node.children()) == 0 | |||||
def get_token(self, lower=True): | |||||
if self.is_str: | |||||
return self.node | |||||
name = self.node.__class__.__name__ | |||||
token = name | |||||
is_name = False | |||||
if self.is_leaf(): | |||||
attr_names = self.node.attr_names | |||||
if attr_names: | |||||
if 'names' in attr_names: | |||||
token = self.node.names[0] | |||||
elif 'name' in attr_names: | |||||
token = self.node.name | |||||
is_name = True | |||||
else: | |||||
token = self.node.value | |||||
else: | |||||
token = name | |||||
else: | |||||
if name == 'TypeDecl': | |||||
token = self.node.declname | |||||
if self.node.attr_names: | |||||
attr_names = self.node.attr_names | |||||
if 'op' in attr_names: | |||||
if self.node.op[0] == 'p': | |||||
token = self.node.op[1:] | |||||
else: | |||||
token = self.node.op | |||||
if token is None: | |||||
token = name | |||||
if lower and is_name: | |||||
token = token.lower() | |||||
return token |
@@ -0,0 +1,80 @@
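# Java-side helpers (imported as "utils"): get_token/get_children/get_sequence walk a
# javalang AST, and get_blocks_v1 splits a method or constructor declaration into
# statement-level blocks.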
import pandas as pd | |||||
import javalang | |||||
from javalang.ast import Node | |||||
from tree import ASTNode, BlockNode | |||||
import sys | |||||
sys.setrecursionlimit(10000) | |||||
def get_token(node): | |||||
token = '' | |||||
if isinstance(node, str): | |||||
token = node | |||||
elif isinstance(node, set): | |||||
token = 'Modifier'#node.pop() | |||||
elif isinstance(node, Node): | |||||
token = node.__class__.__name__ | |||||
return token | |||||
def get_children(root): | |||||
if isinstance(root, Node): | |||||
children = root.children | |||||
elif isinstance(root, set): | |||||
children = list(root) | |||||
else: | |||||
children = [] | |||||
def expand(nested_list): | |||||
for item in nested_list: | |||||
if isinstance(item, list): | |||||
for sub_item in expand(item): | |||||
yield sub_item | |||||
elif item: | |||||
yield item | |||||
return list(expand(children)) | |||||
def get_sequence(node, sequence): | |||||
token, children = get_token(node), get_children(node) | |||||
sequence.append(token) | |||||
for child in children: | |||||
get_sequence(child, sequence) | |||||
if token in ['ForStatement', 'WhileStatement', 'DoStatement','SwitchStatement', 'IfStatement']: | |||||
sequence.append('End') | |||||
def get_blocks_v1(node, block_seq): | |||||
name, children = get_token(node), get_children(node) | |||||
logic = ['SwitchStatement','IfStatement', 'ForStatement', 'WhileStatement', 'DoStatement'] | |||||
if name in ['MethodDeclaration', 'ConstructorDeclaration']: | |||||
block_seq.append(BlockNode(node)) | |||||
body = node.body | |||||
for child in body: | |||||
if get_token(child) not in logic and not hasattr(child, 'block'): | |||||
block_seq.append(BlockNode(child)) | |||||
else: | |||||
get_blocks_v1(child, block_seq) | |||||
elif name in logic: | |||||
block_seq.append(BlockNode(node)) | |||||
for child in children[1:]: | |||||
token = get_token(child) | |||||
if not hasattr(node, 'block') and token not in logic+['BlockStatement']: | |||||
block_seq.append(BlockNode(child)) | |||||
else: | |||||
get_blocks_v1(child, block_seq) | |||||
block_seq.append(BlockNode('End')) | |||||
    elif name == 'BlockStatement' or hasattr(node, 'block'):
        block_seq.append(BlockNode(name))
        for child in children:
            if get_token(child) not in logic:
block_seq.append(BlockNode(child)) | |||||
else: | |||||
get_blocks_v1(child, block_seq) | |||||
else: | |||||
for child in children: | |||||
get_blocks_v1(child, block_seq) | |||||
@@ -0,0 +1,7 @@
1. Process the data: python pipeline.py
   - Modify the split function so that everything goes into the test split.
   - The data live under data/c/; the two pkl files there must be replaced with new ones.
   - The code must be compilable.
2. Run the test: python test.py
   - It can be run on the 126 server with CUDA.
   - Clone detection is treated as a binary classification task; the threshold can be adjusted.
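A small sketch of the adjustable threshold mentioned above: test.py binarizes the cosine-similarity output at 0.5, so the clone decision is a single comparison (the similarity values here are made up).
```
import numpy as np

THRESHOLD = 0.5                     # raise for higher precision, lower for higher recall
similarities = np.array([0.91, 0.42, 0.73, 0.18])   # model outputs for some code pairs
print(similarities > THRESHOLD)     # [ True False  True False]
```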
@@ -0,0 +1,27 @@
-----BEGIN RSA PRIVATE KEY----- | |||||
MIIEowIBAAKCAQEA2R5MIQo+9/oysDtsH0s9xGxBCvD0dtNgMnawXmt1ZJdzAvzf | |||||
bQ3AmT2y40zLtehcqU1XZh1LXZMfqi8GFwznm7fgCM1DqAd/kOKdBCpANWgj+OvL | |||||
Kxwum4bBbmuMg4IYpXoKaf94MWcx5axBmksYrnF2D95avcGYcSxoLHNL86+KY6xJ | |||||
9rMvORs+gLpMQGrY39Cz1n7Ef/9u0LRuWRK+4LHGxP/P4lMC4FvXH2K90RhQln0j | |||||
7RR4uxwkQn57Vhqf2cimTZpouma9+/hVpiaoVdaQbvRFFVRTRLcdsdftemQIqNhc | |||||
2KV2LzsivhbFcOwsLj7jSIjcwOwR0fH7TIel7QIDAQABAoIBAARILS8voQtJ82r3 | |||||
WQwK81Zm3ieFlgSr6YdFQPgzvVZ1CC8kZpGjhktfZKJH9vKI+R7bqCAa7swTJTo5 | |||||
gDC/L+gpybDSv4VWVIU6eudEoAyNl7wGhnS8swydLT5sv5IuZCcLT55EjA3JX7oM | |||||
WiTdW0jBcxcgBwEcCtIckpdh1LsjD1XU6m54w/c8gyfuZcWK+ByFbVzdk5m5sco+ | |||||
wJso3AXFtxx9LhG82XMJW5BKuQfJH2bNI5YoSEGe1fzddZ+ugoCm7iNo4xf83JMT | |||||
9fUBlYxSlBa16lOaA9gwZgQEze7PEfzFKLQLFYTBDp7/QL02+TXAf4/ZdcXUnENs | |||||
8PrABS8CgYEA8Yr91FdDmxQrXwy8S2CYI3JuWt7LqqUipxduz6xyi6dZHlqXslAn | |||||
ndCVp6ihwryrzWSk2KltRaQIdbVTq7YukLQ0uKk/xneaeOh5RC9PclpGivzXNRAX | |||||
cEbro0pz/htPQ9ZVTVuvKhOpDclz1qxhhxR29I/oxU6NkBfGJf3G9TcCgYEA5h0Q | |||||
QHbkcyp8uAyXXXtbJTzAWkvKYWVZ2wwOF4MJ7l670BZef26IpY368zjmjKgNNK07 | |||||
WV4FmFIuZje2Wb9jW3gV5qRysrmlb8Vzks7GEUKLSQcoPvVsiDyh8VvW0qEirH3o | |||||
rGpJeQBrrot9DirmQ+ehDKjGl1b1z3X+/XohD/sCgYBsX/2lsYW+5hzTp+YwN+Xr | |||||
OaO0F/Tv2uoiaIwql+hJKsv8p48azYYI9BbBxBLYSkkXfgnMwLArp/63uaUSDUr1 | |||||
WDWziRT5Wp6vkzcd1dBisYinQezZfR/XG6sMeBJ1OBGnkVpyvClqyql2ayYTcwLL | |||||
Ve5Nqug45xbzSQd58lS7nwKBgQDGKkUynrCBlRcukHlRoceOO9ESca5pTZSiFLGW | |||||
AdztkFuBCaJ7bz7yA2EXT+sLOjWVJZG0lkmPMUaP9G5uv2ja/oEtzHSSAVm89Xdf | |||||
9/2OI5Y7X5SDE2tRr5Vuer53SRjJHuzeffGj6H7TI4CgUMVXuQNyGW5cKiEpdd4P | |||||
f7s1PQKBgFrezClBhd8c72+Q3hZZKoK5VpbqZXMcfcRoObpQ4W2OTY2+jqrSwhOO | |||||
12fWIG473Ok0pnTNxAsO4IdhKpWlXySMDwxS0Rns6TAcDnJa4sCahKnqIoMAqSTA | |||||
VUT/kwEUPat2/zlUhfOl4LooLAW36GDC/nc2urj2uVopdwdOTwVW | |||||
-----END RSA PRIVATE KEY----- |
@@ -0,0 +1,76 @@
-----BEGIN CERTIFICATE----- | |||||
MIIHoTCCBYmgAwIBAgIQDOVX1N5YbGyIxnUGCW2zsDANBgkqhkiG9w0BAQsFADBc | |||||
MQswCQYDVQQGEwJVUzEXMBUGA1UEChMORGlnaUNlcnQsIEluYy4xNDAyBgNVBAMT | |||||
K1JhcGlkU1NMIEdsb2JhbCBUTFMgUlNBNDA5NiBTSEEyNTYgMjAyMiBDQTEwHhcN | |||||
MjMwMTAyMDAwMDAwWhcNMjQwMTEyMjM1OTU5WjAcMRowGAYDVQQDDBEqLnNodWlz | |||||
aGFuLm5ldC5jbjCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBANkeTCEK | |||||
Pvf6MrA7bB9LPcRsQQrw9HbTYDJ2sF5rdWSXcwL8320NwJk9suNMy7XoXKlNV2Yd | |||||
S12TH6ovBhcM55u34AjNQ6gHf5DinQQqQDVoI/jryyscLpuGwW5rjIOCGKV6Cmn/ | |||||
eDFnMeWsQZpLGK5xdg/eWr3BmHEsaCxzS/OvimOsSfazLzkbPoC6TEBq2N/Qs9Z+ | |||||
xH//btC0blkSvuCxxsT/z+JTAuBb1x9ivdEYUJZ9I+0UeLscJEJ+e1Yan9nIpk2a | |||||
aLpmvfv4VaYmqFXWkG70RRVUU0S3HbHX7XpkCKjYXNildi87Ir4WxXDsLC4+40iI | |||||
3MDsEdHx+0yHpe0CAwEAAaOCA50wggOZMB8GA1UdIwQYMBaAFPCchf2in32PyWi7 | |||||
1dSJTR2+05D/MB0GA1UdDgQWBBQ8TpzGYRl5Mcx4zZ8subB5HviPVTAtBgNVHREE | |||||
JjAkghEqLnNodWlzaGFuLm5ldC5jboIPc2h1aXNoYW4ubmV0LmNuMA4GA1UdDwEB | |||||
/wQEAwIFoDAdBgNVHSUEFjAUBggrBgEFBQcDAQYIKwYBBQUHAwIwgZ8GA1UdHwSB | |||||
lzCBlDBIoEagRIZCaHR0cDovL2NybDMuZGlnaWNlcnQuY29tL1JhcGlkU1NMR2xv | |||||
YmFsVExTUlNBNDA5NlNIQTI1NjIwMjJDQTEuY3JsMEigRqBEhkJodHRwOi8vY3Js | |||||
NC5kaWdpY2VydC5jb20vUmFwaWRTU0xHbG9iYWxUTFNSU0E0MDk2U0hBMjU2MjAy | |||||
MkNBMS5jcmwwPgYDVR0gBDcwNTAzBgZngQwBAgEwKTAnBggrBgEFBQcCARYbaHR0 | |||||
cDovL3d3dy5kaWdpY2VydC5jb20vQ1BTMIGHBggrBgEFBQcBAQR7MHkwJAYIKwYB | |||||
BQUHMAGGGGh0dHA6Ly9vY3NwLmRpZ2ljZXJ0LmNvbTBRBggrBgEFBQcwAoZFaHR0 | |||||
cDovL2NhY2VydHMuZGlnaWNlcnQuY29tL1JhcGlkU1NMR2xvYmFsVExTUlNBNDA5 | |||||
NlNIQTI1NjIwMjJDQTEuY3J0MAkGA1UdEwQCMAAwggGABgorBgEEAdZ5AgQCBIIB | |||||
cASCAWwBagB3AHb/iD8KtvuVUcJhzPWHujS0pM27KdxoQgqf5mdMWjp0AAABhXFA | |||||
a8wAAAQDAEgwRgIhAI++QoPxPN2iOrxIQegcdgwWNzFPnZRoDFKXpBRKMBtlAiEA | |||||
vw/HkDuckkDkfKvtFp1VxeS7GyaetlhEjQOK6ixcuP8AdgBIsONr2qZHNA/lagL6 | |||||
nTDrHFIBy1bdLIHZu7+rOdiEcwAAAYVxQGuxAAAEAwBHMEUCIQDhcg/4dci0YtzM | |||||
59uvgT4+2W780D6oRtCcX0IofxpnKwIgMliXM53/OAYXc0cpaKeotuoQE5ntDMCX | |||||
FfojCPe3G9IAdwA7U3d1Pi25gE6LMFsG/kA7Z9hPw/THvQANLXJv4frUFwAAAYVx | |||||
QGuvAAAEAwBIMEYCIQDvjMHZOQZQ08BLD5/XAHJ6Sw4HaEVwyd+lFpYHLi24vwIh | |||||
AJO6f0RX/rG56cKjHWV/mQsRH94kxJDy7EjzU89uAV0XMA0GCSqGSIb3DQEBCwUA | |||||
A4ICAQAq1H2pr19LU6VnkZGhGlIklQJJ4lhXfX6ciEP+9MSgbUyTTeohv0nUYl1i | |||||
+8ehjD9yBnujUKgjxQZ3KsPn9LSgukvTU1WOEBWw05qywBg9UQ4nnc4tkghSVk35 | |||||
YhJum5L6Xxr0U7ohnffFaFn07+Am/q0HlGtHUvSsrN2yh3idAupQmRWp3sLQl+LR | |||||
VL/ynq2InSGoNcawFiIKd84CJMoHMyXW24iIop044yBvRl6v5DI74j6RUUno75rI | |||||
G3HK1NUfREBeKGV7s7cTFYbR+bBFuIURHs05nGeHy+xHxFh7CwhY2Bg1Do8Mbqzb | |||||
EAVV5yOvizkNqaVULcGg1+KEU92doK625dQ7iWqGLnX5gqFEAQaUgIX0MEgD4SDR | |||||
kr73k5aEKvxCR2y89+7ieHyZM3sFX9SoCn8Az/WaNwNInqaE7uewodi+mKr7AQNH | |||||
OoipoFvc5v7uZNnt+Ixv8VBB66jhNMYZ4YijXMpdqNYLerMVlsTZoavkaznkdQW3 | |||||
jRKcjG35gN21vyKtao0tQC7CZpwGJMqKluDTU6qY8NbvCKEyRUKBH6FKh3FSj8tg | |||||
t4zEnE+XLsKys3NNuDMhA+q+MCSmBE5rqz1l4z7O2a8UQ6vKc9fSULWTK4qJuSgq | |||||
gkhh6LksuplrqG7E6yXHfRNMBuVQiMwgwATiRySDNuOvHJPaWw== | |||||
-----END CERTIFICATE----- | |||||
-----BEGIN CERTIFICATE----- | |||||
MIIFyzCCBLOgAwIBAgIQCgWbJfVLPYeUzGYxR3U4ozANBgkqhkiG9w0BAQsFADBh | |||||
MQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMRkwFwYDVQQLExB3 | |||||
d3cuZGlnaWNlcnQuY29tMSAwHgYDVQQDExdEaWdpQ2VydCBHbG9iYWwgUm9vdCBD | |||||
QTAeFw0yMjA1MDQwMDAwMDBaFw0zMTExMDkyMzU5NTlaMFwxCzAJBgNVBAYTAlVT | |||||
MRcwFQYDVQQKEw5EaWdpQ2VydCwgSW5jLjE0MDIGA1UEAxMrUmFwaWRTU0wgR2xv | |||||
YmFsIFRMUyBSU0E0MDk2IFNIQTI1NiAyMDIyIENBMTCCAiIwDQYJKoZIhvcNAQEB | |||||
BQADggIPADCCAgoCggIBAKY5PJhwCX2UyBb1nelu9APen53D5+C40T+BOZfSFaB0 | |||||
v0WJM3BGMsuiHZX2IHtwnjUhLL25d8tgLASaUNHCBNKKUlUGRXGztuDIeXb48d64 | |||||
k7Gk7u7mMRSrj+yuLSWOKnK6OGKe9+s6oaVIjHXY+QX8p2I2S3uew0bW3BFpkeAr | |||||
LBCU25iqeaoLEOGIa09DVojd3qc/RKqr4P11173R+7Ub05YYhuIcSv8e0d7qN1sO | |||||
1+lfoNMVfV9WcqPABmOasNJ+ol0hAC2PTgRLy/VZo1L0HRMr6j8cbR7q0nKwdbn4 | |||||
Ar+ZMgCgCcG9zCMFsuXYl/rqobiyV+8U37dDScAebZTIF/xPEvHcmGi3xxH6g+dT | |||||
CjetOjJx8sdXUHKXGXC9ka33q7EzQIYlZISF7EkbT5dZHsO2DOMVLBdP1N1oUp0/ | |||||
1f6fc8uTDduELoKBRzTTZ6OOBVHeZyFZMMdi6tA5s/jxmb74lqH1+jQ6nTU2/Mma | |||||
hGNxUuJpyhUHezgBA6sto5lNeyqc+3Cr5ehFQzUuwNsJaWbDdQk1v7lqRaqOlYjn | |||||
iomOl36J5txTs0wL7etCeMRfyPsmc+8HmH77IYVMUOcPJb+0gNuSmAkvf5QXbgPI | |||||
Zursn/UYnP9obhNbHc/9LYdQkB7CXyX9mPexnDNO7pggNA2jpbEarLmZGi4grMmf | |||||
AgMBAAGjggGCMIIBfjASBgNVHRMBAf8ECDAGAQH/AgEAMB0GA1UdDgQWBBTwnIX9 | |||||
op99j8lou9XUiU0dvtOQ/zAfBgNVHSMEGDAWgBQD3lA1VtFMu2bwo+IbG8OXsj3R | |||||
VTAOBgNVHQ8BAf8EBAMCAYYwHQYDVR0lBBYwFAYIKwYBBQUHAwEGCCsGAQUFBwMC | |||||
MHYGCCsGAQUFBwEBBGowaDAkBggrBgEFBQcwAYYYaHR0cDovL29jc3AuZGlnaWNl | |||||
cnQuY29tMEAGCCsGAQUFBzAChjRodHRwOi8vY2FjZXJ0cy5kaWdpY2VydC5jb20v | |||||
RGlnaUNlcnRHbG9iYWxSb290Q0EuY3J0MEIGA1UdHwQ7MDkwN6A1oDOGMWh0dHA6 | |||||
Ly9jcmwzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydEdsb2JhbFJvb3RDQS5jcmwwPQYD | |||||
VR0gBDYwNDALBglghkgBhv1sAgEwBwYFZ4EMAQEwCAYGZ4EMAQIBMAgGBmeBDAEC | |||||
AjAIBgZngQwBAgMwDQYJKoZIhvcNAQELBQADggEBAAfjh/s1f5dDdfm0sNm74/dW | |||||
MbbsxfYV1LoTpFt+3MSUWvSbiPQfUkoV57b5rutRJvnPP9mSlpFwcZ3e1nSUbi2o | |||||
ITGA7RCOj23I1F4zk0YJm42qAwJIqOVenR3XtyQ2VR82qhC6xslxtNf7f2Ndx2G7 | |||||
Mem4wpFhyPDT2P6UJ2MnrD+FC//ZKH5/ERo96ghz8VqNlmL5RXo8Ks9rMr/Ad9xw | |||||
Y4hyRvAz5920myUffwdUqc0SvPlFnahsZg15uT5HkK48tHR0TLuLH8aRpzh4KJ/Y | |||||
p0sARNb+9i1R4Fg5zPNvHs2BbIve0vkwxAy+R4727qYzl3027w9jEFC6HMXRaDc= | |||||
-----END CERTIFICATE----- |
@@ -0,0 +1,43 @@
FROM continuumio/miniconda3:4.12.0 | |||||
RUN apt-get update \ | |||||
&& apt-get -y upgrade \ | |||||
&& apt-get -y install zip curl \ | |||||
&& apt-get install -y \ | |||||
&& apt-get autoremove -y \ | |||||
&& apt-get clean -y \ | |||||
&& rm -rf /var/lib/apt/lists/* | |||||
SHELL ["/bin/bash", "-o", "pipefail", "-c"] | |||||
RUN conda install -y jupyter notebook | |||||
COPY notebook.html /tmp/ | |||||
RUN rm /opt/conda/lib/python3.9/site-packages/notebook/templates/notebook.html \ | |||||
&& mv /tmp/notebook.html /opt/conda/lib/python3.9/site-packages/notebook/templates/ | |||||
ARG NB_USER="jupyter" | |||||
ARG NB_UID="1000" | |||||
ARG NB_GID="100" | |||||
ARG NB_PORT=8888 | |||||
EXPOSE ${NB_PORT} | |||||
ENV SHELL=/bin/bash \ | |||||
NB_USER="${NB_USER}" \ | |||||
NB_UID=${NB_UID} \ | |||||
NB_GID=${NB_GID} \ | |||||
PYTHONPATH=$PYTHONPATH:/opt/conda/bin | |||||
ENV PATH="${PYTHONPATH}:${PATH}" | |||||
RUN useradd -l -m -s /bin/bash -N -u "${NB_UID}" "${NB_USER}" \ | |||||
&& mkdir /home/${NB_USER}/.jupyter | |||||
COPY jupyter_notebook_config.py /home/${NB_USER}/.jupyter/ | |||||
RUN chmod 777 /home/${NB_USER}/.jupyter/jupyter_notebook_config.py \ | |||||
&& chmod -R 777 /home/${NB_USER} | |||||
ADD enterpoint.sh /enterpoint.sh |
@@ -0,0 +1,7 @@
#! /bin/bash | |||||
# dir_name=`ls /home/jupyter` | |||||
config_str="\nc.NotebookApp.notebook_dir = \"/home/jupyter\"" | |||||
echo -e ${config_str} >> /home/jupyter/.jupyter/jupyter_notebook_config.py | |||||
jupyter notebook --ip=0.0.0.0 |