@@ -0,0 +1,59 @@ README.md
# Multilingual Text Recognition

## Introduction

Text localization uses a CTPN network built on Faster R-CNN, modified here for multilingual recognition. The network predicts the offsets between text lines and anchors: VGG16 extracts features, a sliding window over the feature map predicts the offset to each anchor, and the result is fed into a bidirectional LSTM to obtain sequence features. Because text lines vary widely in length, the model predicts only the anchor height, and finally links the resulting small-scale text boxes into full lines.

Text recognition uses a seq2seq model with an attention mechanism. The encoder is a CNN, chosen for its high degree of parallelism, with positional embeddings to carry location information; the decoder is an LSTM. Attention is introduced because recognizing mathematical formulas involves long-range dependencies.

The training data combines photographed images with images produced by our group that mix LaTeX formulas and ordinary text; image augmentation was applied during training to improve generalization.
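For a concrete picture of the attention step the decoder performs, here is a minimal numpy sketch. It is not taken from the repository code; the function and weight names are illustrative, and only the dimensions (`dim_e` = `dim_o` = 512) come from `configs/model.json`:

```python
import numpy as np

def attention_context(enc_feats, dec_state, W):
    """One attention step: score every encoder position against the decoder state.

    enc_feats: (T, dim_e) CNN encoder outputs (plus positional embeddings)
    dec_state: (dim_o,)   current LSTM decoder state
    W:         (dim_o, dim_e) learned scoring matrix (hypothetical name)
    """
    scores = enc_feats.dot(W.T.dot(dec_state))   # (T,) alignment scores
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                     # softmax over encoder positions
    return weights.dot(enc_feats)                # (dim_e,) context vector

# toy usage with the dimensions from configs/model.json
ctx = attention_context(np.random.randn(30, 512),
                        np.random.randn(512),
                        np.random.randn(512, 512) * 0.01)
```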
## Structure

- Text line detection: `main` and `nets` folders. Weights: `checkpoint_mlt`
- Text recognition: `model` folder. Weights: `results/full`
- Data generation scripts: `generate_data` folder
- Entry-point scripts:
  - `train`: trains the text recognition network
  - `evaluate_txt`: evaluates the text recognition network
  - `Main/train`: trains the text detection network
  - `predict`: single-line prediction
  - `demo2`: runs the two networks end to end on multi-line input images
  - `Classification`: per-image language classification (not used in the final model)
- Data sources:
  1. Self-generated line-level LaTeX and line-level English data, used to train the text recognition network.
     The LaTeX text comes from arXiv papers: http://www.cs.cornell.edu/projects/kddcup/datasets.html
     The English text comes from the Corpus of Contemporary American English (COCA).
     The processed text data used in the experiments is in `data2` and `data3`.
  2. A self-generated image-level dataset, used to train the text box detection network.
     The English and LaTeX text sources are the same as above.
- Text recognition results:

| Model           | BLEU-4 | Inverse Edit | Perplexity | Exact Match |
| --------------- | ------ | ------------ | ---------- | ----------- |
| Seq2seq mixed   | 86.36  | 88.69        | -1.44      | 36.20       |
| Seq2seq-LaTeX   | 90.10  | 84.12        | -1.32      | 37.21       |
| Seq2seq-English | 97.20  | 97.22        | -1.05      | 88.54       |
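For reference, my reading of the table's metrics (the scoring code in `model/evaluation/` is not shown here): BLEU-4 measures 4-gram overlap with the reference, Exact Match is the fraction of lines reproduced verbatim, and "Inverse Edit" I take to be one minus the normalized edit distance, scaled to percent. A minimal sketch of that assumed last metric:

```python
def inverse_edit(ref, hyp):
    """Assumed definition: 1 - levenshtein(ref, hyp) / max(len(ref), len(hyp))."""
    m, n = len(ref), len(hyp)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        d[i][0] = i
    for j in range(1, n + 1):
        d[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            d[i][j] = min(d[i - 1][j] + 1,      # deletion
                          d[i][j - 1] + 1,      # insertion
                          d[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1]))  # substitution
    return 1.0 - d[m][n] / max(m, n, 1)

assert inverse_edit("x^2", "x^2") == 1.0   # exact match scores 1.0
```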
@@ -0,0 +1,47 @@ build.py
import click

from model.utils.data_generator import DataGenerator
from model.utils.text import build_vocab, write_vocab
from model.utils.image import build_images
from model.utils.general import Config


@click.command()
@click.option('--data', default="configs/data.json",
              help='Path to data json config')
@click.option('--vocab', default="configs/vocab.json",
              help='Path to vocab json config')
def main(data, vocab):
    data_config = Config(data)

    # datasets
    train_set = DataGenerator(
        path_formulas=data_config.path_formulas_train,
        dir_images=data_config.dir_images_train,
        path_matching=data_config.path_matching_train)
    """
    test_set = DataGenerator(
        path_formulas=data_config.path_formulas_test,
        dir_images=data_config.dir_images_test,
        path_matching=data_config.path_matching_test)
    """
    val_set = DataGenerator(
        path_formulas=data_config.path_formulas_val,
        dir_images=data_config.dir_images_val,
        path_matching=data_config.path_matching_val)

    # produce images and matching files
    train_set.build(buckets=data_config.buckets)
    # test_set.build(buckets=data_config.buckets)
    val_set.build(buckets=data_config.buckets)

    # vocab
    vocab_config = Config(vocab)
    vocab = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(vocab, vocab_config.path_vocab)


if __name__ == "__main__":
    main()
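The Makefile below drives this script: `make build` runs `python build.py --data=configs/data.json --vocab=configs/vocab.json`, and `make build-small` does the same with the `*_small` configs.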
@@ -0,0 +1,29 @@ configs/data.json
{
    "export_name": "data.json",

    "dir_images_train": "data2/images_train/",
    "dir_images_test" : "data2/images_test/",
    "dir_images_val"  : "data2/images_val/",

    "path_matching_train": "data2/train.matching.txt",
    "path_matching_val"  : "data2/val.matching.txt",
    "path_matching_test" : "data2/test.matching.txt",

    "path_formulas_train": "data2/train.formulas.norm.txt",
    "path_formulas_test" : "data2/test.formulas.norm.txt",
    "path_formulas_val"  : "data2/val.formulas.norm.txt",

    "bucket_train": true,
    "bucket_val": true,
    "bucket_test": true,

    "max_iter" : null,
    "max_length_formula": 150,

    "buckets": [
        [80, 80], [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
        [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
        [720, 120], [720, 200], [800, 80], [800, 100], [1000, 80],
        [1000, 100], [1200, 100], [1600, 80], [1600, 100]
    ]
}
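The `buckets` list gives the (width, height) templates that generated images are padded to, so batches have uniform shapes. How an image is assigned to a bucket is internal to `DataGenerator`/`pad_image` and not shown in this diff; a plausible minimal rule, offered only as an assumption, is the smallest bucket the image fits in:

```python
def assign_bucket(w, h, buckets):
    """Assumed rule: smallest-area bucket that fits the image, or None."""
    fitting = [(bw, bh) for bw, bh in buckets if bw >= w and bh >= h]
    return min(fitting, key=lambda b: b[0] * b[1]) if fitting else None

buckets = [[240, 100], [320, 80], [400, 80]]
assert assign_bucket(300, 70, buckets) == (320, 80)
```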
@@ -0,0 +1,29 @@ configs/data_small.json
{
    "export_name": "data.json",

    "dir_images_train": "data/small/",
    "dir_images_test" : "data/small/",
    "dir_images_val"  : "data/small/",

    "path_matching_train": "data/small.matching.txt",
    "path_matching_val"  : "data/small.matching.txt",
    "path_matching_test" : "data/small.matching.txt",

    "path_formulas_train": "data/small.formulas.norm.txt",
    "path_formulas_test" : "data/small.formulas.norm.txt",
    "path_formulas_val"  : "data/small.formulas.norm.txt",

    "max_iter" : 20,
    "max_length_formula": 50,

    "bucket_train": true,
    "bucket_val": true,
    "bucket_test": true,

    "buckets": [
        [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
        [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
        [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200],
        [1000, 400], [1200, 200], [1600, 200], [1600, 1600]
    ]
}
@@ -0,0 +1,9 @@
# Minimal TensorFlow 1.x smoke test: one conv layer plus softmax on a dummy image.
import tensorflow as tf
import numpy as np

sess = tf.Session()
inputs = tf.placeholder(dtype=tf.float32, shape=(1, 300, 300, 3))
net = tf.layers.Conv2D(filters=2, kernel_size=3)(inputs)  # VALID padding -> (1, 298, 298, 2)
net = tf.nn.softmax(net, axis=-1)  # per-pixel 2-class probabilities
sess.run(tf.global_variables_initializer())
sess.run(net, feed_dict={inputs: np.zeros(shape=(1, 300, 300, 3), dtype=np.float32)})
@@ -0,0 +1,277 @@ demo2.py
# coding=utf-8
import json
import os
import shutil
import sys

import cv2
import numpy as np
import tensorflow as tf
from scipy.misc import imread
from PIL import Image

import locale
locale.setlocale(locale.LC_ALL, 'C')

sys.path.append(os.getcwd())

from main import preprocess
from nets import model_train as ctpnmodel
from utils.rpn_msr.proposal_layer import proposal_layer
from utils.text_connector.detectors import TextDetector
from model.img2seq import Img2SeqModel
from model.utils.general import Config, run
from model.utils.text import Vocab
from model.utils.image import greyscale, predictsize

tf.app.flags.DEFINE_string('test_data_path', '/app/image/1.png', '')
tf.app.flags.DEFINE_string('output_path', '/app/im2latex_master/results/predict/', '')
tf.app.flags.DEFINE_string('gpu', '0', '')
tf.app.flags.DEFINE_string('checkpoint_path', '/app/im2latex_master/checkpoints_mlt/', '')
tf.app.flags.DEFINE_integer('language', 2, '')
FLAGS = tf.app.flags.FLAGS
def get_images():
    files = []
    exts = ['jpg', 'png', 'jpeg', 'JPG']
    for parent, dirnames, filenames in os.walk(FLAGS.test_data_path):
        for filename in filenames:
            for ext in exts:
                if filename.endswith(ext):
                    files.append(os.path.join(parent, filename))
                    break
    print('Find {} images'.format(len(files)))
    return files


def resize_image(img):
    # Scale the short side to 600 px (long side capped at 1200 px), then round
    # both sides up to a multiple of 16 to match the stride of the VGG feature map.
    img_size = img.shape
    im_size_min = np.min(img_size[0:2])
    im_size_max = np.max(img_size[0:2])

    im_scale = float(600) / float(im_size_min)
    if np.round(im_scale * im_size_max) > 1200:
        im_scale = float(1200) / float(im_size_max)
    new_h = int(img_size[0] * im_scale)
    new_w = int(img_size[1] * im_scale)

    new_h = new_h if new_h % 16 == 0 else (new_h // 16 + 1) * 16
    new_w = new_w if new_w % 16 == 0 else (new_w // 16 + 1) * 16

    re_im = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return re_im, (new_h / img_size[0], new_w / img_size[1])
def get_box():
    if os.path.exists(FLAGS.output_path):
        shutil.rmtree(FLAGS.output_path)
    os.makedirs(FLAGS.output_path)
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    # CTPN detection graph
    input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
    input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')
    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    bbox_pred, cls_pred, cls_prob = ctpnmodel.model(input_image, 2)

    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
    model_path = os.path.join(FLAGS.checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    saver.restore(sess, model_path)

    # recognition model
    dir_output = "/app/im2latex_master/results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    img = cv2.imread(FLAGS.test_data_path)[:, :, ::-1]
    h, w, c = img.shape
    if h > 121:
        # correct the perspective distortion of photographed pages
        approx, image, (rh, rw) = preprocess.draw_rec(img)
        img = preprocess.Perspective(image, approx)
        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
    img, (rh, rw) = resize_image(img)
    h, w, c = img.shape
    im_info = np.array([h, w, c]).reshape([1, 3])
    bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
                                           feed_dict={input_image: [img],
                                                      input_im_info: im_info})

    textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info, img)
    scores = textsegs[:, 0:2]    # modified: two foreground class scores
    textsegs = textsegs[:, 2:6]  # modified

    textdetector = TextDetector(DETECT_MODE='H')
    boxes = textdetector.detect(textsegs, scores, img.shape[:2], img)
    boxes = np.array(boxes, dtype=np.int)

    # sort the boxes top-to-bottom, then left-to-right
    image_box = sorted(boxes, key=(lambda x: (x[1] + x[3], x[0] + x[6])))
    for i, box in enumerate(image_box):
        cv2.polylines(img, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
                      thickness=2)
    img = cv2.resize(img, None, None, fx=1.0 / rh, fy=1.0 / rw, interpolation=cv2.INTER_LINEAR)
    cv2.imshow("ss", img)
    cv2.waitKey(0)
    return 0
def save_to_file():
    if os.path.exists(FLAGS.output_path):
        shutil.rmtree(FLAGS.output_path)
    os.makedirs(FLAGS.output_path)
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    # CTPN detection graph
    input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
    input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')
    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    bbox_pred, cls_pred, cls_prob = ctpnmodel.model(input_image, 2.0)

    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
    model_path = os.path.join(FLAGS.checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    saver.restore(sess, model_path)

    dir_output = "/app/im2latex_master/results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)

    # English recognition model
    config_vocab_en = Config(dir_output + "vocabe.json")
    vocab_en = Vocab(config_vocab_en)
    model_en = Img2SeqModel(config_model, dir_output, vocab_en)
    model_en.build_pred()
    model_en.restore_session(dir_output + "model.weights_en/test-model.ckpt")

    img = imread(FLAGS.test_data_path)
    h, w, c = img.shape
    res = ""
    if h > 40:
        # multi-line image: correct perspective, detect text lines, then recognize each line
        approx, image, (rh, rw) = preprocess.draw_rec(img)
        img = preprocess.Perspective(image, approx)
        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
        img, (rh, rw) = resize_image(img)
        h, w, c = img.shape
        im_info = np.array([h, w, c]).reshape([1, 3])
        bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
                                               feed_dict={input_image: [img],
                                                          input_im_info: im_info})

        textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info, img)
        scores = textsegs[:, 0:2]    # modified: two foreground class scores
        textsegs = textsegs[:, 2:6]  # modified

        textdetector = TextDetector(DETECT_MODE='H')
        boxes = textdetector.detect(textsegs, scores, img.shape[:2], img)
        boxes = np.array(boxes, dtype=np.int)

        # draw the boxes colour-coded by predicted language (box[8])
        img2 = img.copy()
        for i, box in enumerate(boxes):
            if box[8] == 1:
                cv2.polylines(img2, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
                              thickness=2)
            else:
                cv2.polylines(img2, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(255, 0, 0),
                              thickness=2)
        img2 = cv2.resize(img2, None, None, fx=1.0 / rh, fy=1.0 / rw, interpolation=cv2.INTER_LINEAR)

        for i, b in enumerate(boxes):
            lan = b[8]
            box = boxes[i]
            # crop the detected line (1 px margin) and convert BGR -> RGB
            img0 = img[min(box[1], box[3]) - 1:max(box[5], box[7]) + 1,
                       min(box[0], box[2]) - 1:max(box[4], box[6]) + 1, ::-1]
            """
            if lan == 2:
                # LaTeX line: use the formula model
                img0 = predictsize(img0)
                img0 = greyscale(img0)
                hyp = model.predict(img0)
                res = res + hyp[0] + "\n"
                model.logger.info(hyp[0])
            else:
            """
            # language-specific branch disabled: every line goes through the English model
            img0 = predictsize(img0)
            img0 = greyscale(img0)
            hyp = model_en.predict(img0)
            res = res + hyp[0] + "\n"
            model_en.logger.info(hyp[0])
            # hyp = pytesseract.image_to_string(img0)
            # res = res + hyp + "\n"
            # model.logger.info(hyp)
        res = json.dumps({"res": res})
        model_en.logger.info(res)
    else:
        # single-line image: skip detection and recognize directly
        img = predictsize(img)
        img0 = greyscale(img)
        hyps = model_en.predict(img0)
        res = res + hyps[0] + "\n"
        model_en.logger.info(hyps[0])
        res = json.dumps({"res": res})
        model_en.logger.info(res)
    return 0
    # unreachable legacy code from the batch version (im_fn is not defined here):
    '''
    cv2.imwrite(os.path.join(FLAGS.output_path, str(i) +'.png'),img[min(box[1],box[3]):max(box[5],box[7]),min(box[0],box[2]) :max(box[4],box[6]), ::-1])
    cv2.polylines(img, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
                  thickness=2)
    img = cv2.resize(img, None, None, fx=1.0 / rh, fy=1.0 / rw, interpolation=cv2.INTER_LINEAR)
    cv2.imwrite(os.path.join(FLAGS.output_path, os.path.basename(im_fn)), img[:, :, ::-1])
    with open(os.path.join(FLAGS.output_path, os.path.splitext(os.path.basename(im_fn))[0]) + ".txt",
              "w") as f:
        for i, box in enumerate(boxes):
            line = ",".join(str(box[k]) for k in range(8))
            line += "," + str(scores[i]) + "\r\n"
            f.writelines(line)
    '''
def main(argv=None):
    res = save_to_file()
    # res = get_box()
    return res


if __name__ == '__main__':
    tf.app.run()
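Typical invocation, using the container paths baked into the flags above: `python demo2.py --test_data_path=/app/image/1.png --gpu=0`. `save_to_file()` logs the recognized lines as a JSON string; swap in `get_box()` in `main` to only visualize the detected boxes.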
@@ -0,0 +1,53 @@ evaluate_img.py
import click

from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.general import Config
from model.utils.text import Vocab, load_formulas
from model.utils.image import greyscale, build_images
from model.evaluation.text import score_files
from model.evaluation.image import score_dirs


@click.command()
@click.option('--results', default="results/full/", help='Dir to results')
def main(results):
    # restore config and model
    dir_output = results
    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights/")

    # load dataset
    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
                             dir_images=config_data.dir_images_test, img_prepro=greyscale,
                             max_iter=1, bucket=config_data.bucket_test,
                             path_matching=config_data.path_matching_test,
                             max_len=config_data.max_length_formula,
                             form_prepro=vocab.form_prepro, bucket_size=1)

    # render images from the reference and predicted formulas
    formula_ref = dir_output + "formulas_test/ref.txt"
    formula_hyp = dir_output + "formulas_test/hyp_0.txt"
    images_ref = dir_output + "images_test/ref/"
    images_test = dir_output + "images_test/hyp_0/"
    build_images(load_formulas(formula_ref), images_ref)
    build_images(load_formulas(formula_hyp), images_test)

    # score the image directories
    scores = score_dirs(images_ref, images_test, greyscale)
    msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in scores.items()])
    model.logger.info("- Eval Img: {}".format(msg))


if __name__ == "__main__":
    main()
@@ -0,0 +1,51 @@ evaluate_txt.py
import click

from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.general import Config
from model.utils.text import Vocab, load_formulas
from model.utils.image import greyscale
from model.evaluation.text import score_files


@click.command()
@click.option('--results', default="results/full/", help='Dir to results')
def main(results):
    # restore config and model
    dir_output = results
    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    # load dataset
    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
                             dir_images=config_data.dir_images_test,
                             max_iter=3000, bucket=config_data.bucket_test,
                             path_matching=config_data.path_matching_test,
                             max_len=config_data.max_length_formula,
                             form_prepro=vocab.form_prepro)

    # use model to write predictions in files
    config_eval = Config({"dir_answers": dir_output + "formulas_test/",
                          "batch_size": 20})
    files, perplexity = model.write_prediction(config_eval, test_set)
    formula_ref, formula_hyp = files[0], files[1]

    # score the ref and prediction files
    scores = score_files(formula_ref, formula_hyp)
    scores["perplexity"] = perplexity
    msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in scores.items()])
    model.logger.info("- Test Txt: {}".format(msg))


if __name__ == "__main__":
    main()
@@ -0,0 +1,52 @@ Makefile
install-linux:
	sudo pip install -r requirements.txt
	sudo apt-get install texlive-latex-base
	sudo apt-get install texlive-latex-extra
	sudo apt-get install ghostscript
	sudo apt-get install libgs-dev
	wget http://www.imagemagick.org/download/ImageMagick.tar.gz
	tar -xvf ImageMagick.tar.gz
	cd ImageMagick-7.*; \
	./configure --with-gslib=yes; \
	make; \
	sudo make install; \
	sudo ldconfig /usr/local/lib
	rm ImageMagick.tar.gz
	rm -r ImageMagick-7.*

install-mac:
	sudo pip install -r requirements.txt
	wget http://www.imagemagick.org/download/ImageMagick.tar.gz
	tar -xvf ImageMagick.tar.gz
	cd ImageMagick-7.*; \
	./configure --with-gslib=yes; \
	make; \
	sudo make install
	rm ImageMagick.tar.gz
	rm -r ImageMagick-7.*

build-small:
	python build.py --data=configs/data_small.json --vocab=configs/vocab_small.json

train-small:
	python train.py --data=configs/data_small.json --vocab=configs/vocab_small.json --training=configs/training_small.json --model=configs/model.json --output=results/small/

eval-small:
	python evaluate_txt.py --results=results/small/
	python evaluate_img.py --results=results/small/

small: build-small train-small eval-small

build:
	python build.py --data=configs/data.json --vocab=configs/vocab.json

train:
	python train.py --data=configs/data.json --vocab=configs/vocab.json --training=configs/training.json --model=configs/model.json --output=results/full/

eval:
	python evaluate_txt.py --results=results/full/
	python evaluate_img.py --results=results/full/

full: build train eval
@@ -0,0 +1,21 @@ configs/model.json
{
    "export_name": "model.json",

    "encoder_cnn": "vanilla",
    "positional_embeddings": true,

    "attn_cell_config": {
        "cell_type": "lstm",
        "num_units": 512,
        "dim_e" : 512,
        "dim_o" : 512,
        "dim_embeddings": 80
    },

    "decoding": "beam_search",
    "beam_size": 5,
    "div_gamma": 1,
    "div_prob": 0,

    "max_length_formula": 150
}
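`"decoding": "beam_search"` with `"beam_size": 5` keeps the five highest-scoring partial formulas at every decoding step instead of a single greedy choice (`div_gamma`/`div_prob` would add a diversity penalty, effectively disabled with `div_prob: 0`). A minimal sketch of one expansion step, not taken from the repository code:

```python
import numpy as np

def beam_step(beams, log_probs, beam_size=5):
    """beams: list of (token_ids, score); log_probs: (len(beams), vocab) next-token log-probs."""
    candidates = [(ids + [tok], score + log_probs[b, tok])
                  for b, (ids, score) in enumerate(beams)
                  for tok in range(log_probs.shape[1])]
    # keep only the beam_size best extended sequences
    return sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_size]

# toy usage: 2 live beams, vocabulary of 4 tokens, uniform next-token distribution
step = beam_step([([1], -0.1), ([2], -0.5)], np.log(np.full((2, 4), 0.25)))
```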
@@ -0,0 +1,168 @@ nets/model_train.py
import tensorflow as tf
from tensorflow.contrib import slim

from nets import vgg
from utils.rpn_msr.anchor_target_layer import anchor_target_layer as anchor_target_layer_py


def mean_image_subtraction(images, means=[123.68, 116.78, 103.94]):
    num_channels = images.get_shape().as_list()[-1]
    if len(means) != num_channels:
        raise ValueError('len(means) must match the number of channels')
    channels = tf.split(axis=3, num_or_size_splits=num_channels, value=images)
    for i in range(num_channels):
        channels[i] -= means[i]
    return tf.concat(axis=3, values=channels)


def make_var(name, shape, initializer=None):
    return tf.get_variable(name, shape, initializer=initializer)


def Bilstm(net, input_channel, hidden_unit_num, output_channel, scope_name):
    # width ---> time step
    with tf.variable_scope(scope_name) as scope:
        shape = tf.shape(net)
        N, H, W, C = shape[0], shape[1], shape[2], shape[3]
        net = tf.reshape(net, [N * H, W, C])
        net.set_shape([None, None, input_channel])

        lstm_fw_cell = tf.contrib.rnn.LSTMCell(hidden_unit_num, state_is_tuple=True)
        lstm_bw_cell = tf.contrib.rnn.LSTMCell(hidden_unit_num, state_is_tuple=True)
        lstm_out, last_state = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, net, dtype=tf.float32)
        lstm_out = tf.concat(lstm_out, axis=-1)

        lstm_out = tf.reshape(lstm_out, [N * H * W, 2 * hidden_unit_num])
        init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.01, mode='FAN_AVG', uniform=False)
        init_biases = tf.constant_initializer(0.0)
        weights = make_var('weights', [2 * hidden_unit_num, output_channel], init_weights)
        biases = make_var('biases', [output_channel], init_biases)

        outputs = tf.matmul(lstm_out, weights) + biases
        outputs = tf.reshape(outputs, [N, H, W, output_channel])
        return outputs


def lstm_fc(net, input_channel, output_channel, scope_name):
    with tf.variable_scope(scope_name) as scope:
        shape = tf.shape(net)
        N, H, W, C = shape[0], shape[1], shape[2], shape[3]
        net = tf.reshape(net, [N * H * W, C])

        init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.01, mode='FAN_AVG', uniform=False)
        init_biases = tf.constant_initializer(0.0)
        weights = make_var('weights', [input_channel, output_channel], init_weights)
        biases = make_var('biases', [output_channel], init_biases)

        output = tf.matmul(net, weights) + biases
        output = tf.reshape(output, [N, H, W, output_channel])
    return output


def model(image, language):  # modified: extra `language` argument (unused in this function)
    image = mean_image_subtraction(image)
    with slim.arg_scope(vgg.vgg_arg_scope()):
        conv5_3 = vgg.vgg_16(image)

    rpn_conv = slim.conv2d(conv5_3, 512, 3)
    lstm_output = Bilstm(rpn_conv, 512, 128, 512, scope_name='BiLSTM')

    bbox_pred = lstm_fc(lstm_output, 512, 10 * 4, scope_name="bbox_pred")
    cls_pred = lstm_fc(lstm_output, 512, 10 * 3, scope_name="cls_pred")  # modified: 10 anchors x 3 classes

    # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
    cls_pred_shape = tf.shape(cls_pred)
    cls_pred_reshape = tf.reshape(cls_pred, [cls_pred_shape[0], cls_pred_shape[1], -1, 3])  # modified: 3 classes

    cls_pred_reshape_shape = tf.shape(cls_pred_reshape)
    cls_prob = tf.reshape(tf.nn.softmax(tf.reshape(cls_pred_reshape, [-1, cls_pred_reshape_shape[3]])),
                          [-1, cls_pred_reshape_shape[1], cls_pred_reshape_shape[2], cls_pred_reshape_shape[3]],
                          name="cls_prob")

    return bbox_pred, cls_pred, cls_prob
def anchor_target_layer(cls_pred, bbox, im_info, scope_name):
    with tf.variable_scope(scope_name) as scope:
        # 'rpn_cls_score', 'gt_boxes', 'im_info'
        rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = \
            tf.py_func(anchor_target_layer_py,
                       [cls_pred, bbox, im_info, [16, ], [16]],
                       [tf.float32, tf.float32, tf.float32, tf.float32])

        rpn_labels = tf.convert_to_tensor(tf.cast(rpn_labels, tf.int32),
                                          name='rpn_labels')
        rpn_bbox_targets = tf.convert_to_tensor(rpn_bbox_targets,
                                                name='rpn_bbox_targets')
        rpn_bbox_inside_weights = tf.convert_to_tensor(rpn_bbox_inside_weights,
                                                       name='rpn_bbox_inside_weights')
        rpn_bbox_outside_weights = tf.convert_to_tensor(rpn_bbox_outside_weights,
                                                        name='rpn_bbox_outside_weights')

        return [rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights]


def smooth_l1_dist(deltas, sigma2=9.0, name='smooth_l1_dist'):
    # 0.5 * sigma2 * x^2 for |x| < 1/sigma2, and |x| - 0.5/sigma2 otherwise
    with tf.name_scope(name=name) as scope:
        deltas_abs = tf.abs(deltas)
        smoothL1_sign = tf.cast(tf.less(deltas_abs, 1.0 / sigma2), tf.float32)
        return tf.square(deltas) * 0.5 * sigma2 * smoothL1_sign + \
               (deltas_abs - 0.5 / sigma2) * tf.abs(smoothL1_sign - 1)
def loss(bbox_pred, cls_pred, bbox, im_info):
    # rpn_labels: (HxWxA, 1), for each anchor, 0 denotes bg, 1 fg, -1 dontcare
    # rpn_bbox_targets: (HxWxA, 4), distances of the anchors to the gt_boxes
    #     (possibly transformed) that are the regression objectives
    # rpn_bbox_inside_weights: (HxWxA, 4), weights of each box, mainly set from hyper params in cfg
    # rpn_bbox_outside_weights: (HxWxA, 4), used to balance fg/bg,
    #     because the numbers of bgs and fgs may differ significantly
    rpn_data = anchor_target_layer(cls_pred, bbox, im_info, "anchor_target_layer")  # modified

    # classification loss
    # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
    cls_pred_shape = tf.shape(cls_pred)
    cls_pred_reshape = tf.reshape(cls_pred, [cls_pred_shape[0], cls_pred_shape[1], -1, 3])  # modified: 3 classes
    rpn_cls_score = tf.reshape(cls_pred_reshape, [-1, 3])  # modified
    rpn_label = tf.reshape(rpn_data[0], [-1])
    # ignore_label(-1)
    fg_keep = tf.not_equal(rpn_label, -1) & tf.not_equal(rpn_label, 0)  # modified: foreground = any text class
    rpn_keep = tf.where(tf.not_equal(rpn_label, -1))
    rpn_cls_score = tf.gather(rpn_cls_score, rpn_keep)
    rpn_label = tf.gather(rpn_label, rpn_keep)
    rpn_cross_entropy_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=rpn_label, logits=rpn_cls_score)

    # box loss
    rpn_bbox_pred = bbox_pred
    rpn_bbox_targets = rpn_data[1]
    rpn_bbox_inside_weights = rpn_data[2]
    rpn_bbox_outside_weights = rpn_data[3]

    rpn_bbox_pred = tf.gather(tf.reshape(rpn_bbox_pred, [-1, 4]), rpn_keep)  # shape (N, 4)
    rpn_bbox_targets = tf.gather(tf.reshape(rpn_bbox_targets, [-1, 4]), rpn_keep)
    rpn_bbox_inside_weights = tf.gather(tf.reshape(rpn_bbox_inside_weights, [-1, 4]), rpn_keep)
    rpn_bbox_outside_weights = tf.gather(tf.reshape(rpn_bbox_outside_weights, [-1, 4]), rpn_keep)

    rpn_loss_box_n = tf.reduce_sum(rpn_bbox_outside_weights * smooth_l1_dist(
        rpn_bbox_inside_weights * (rpn_bbox_pred - rpn_bbox_targets)), reduction_indices=[1])

    rpn_loss_box = tf.reduce_sum(rpn_loss_box_n) / (tf.reduce_sum(tf.cast(fg_keep, tf.float32)) + 1)
    rpn_cross_entropy = tf.reduce_mean(rpn_cross_entropy_n)

    model_loss = rpn_cross_entropy + rpn_loss_box

    regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = tf.add_n(regularization_losses) + model_loss

    tf.summary.scalar('model_loss', model_loss)
    tf.summary.scalar('total_loss', total_loss)
    tf.summary.scalar('rpn_cross_entropy', rpn_cross_entropy)
    tf.summary.scalar('rpn_loss_box', rpn_loss_box)

    return total_loss, model_loss, rpn_cross_entropy, rpn_loss_box
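A quick numeric check of the `smooth_l1_dist` definition above: with `sigma2 = 9`, the loss is quadratic for |x| < 1/9 and linear beyond that threshold, so the two branches meet continuously.

```python
import numpy as np

def smooth_l1(x, sigma2=9.0):
    if abs(x) < 1.0 / sigma2:
        return 0.5 * sigma2 * x * x      # quadratic near zero
    return abs(x) - 0.5 / sigma2         # linear elsewhere

assert np.isclose(smooth_l1(0.05), 0.5 * 9 * 0.05 ** 2)   # 0.01125
assert np.isclose(smooth_l1(1.0), 1.0 - 0.5 / 9)          # ~0.9444
```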
@@ -0,0 +1,74 @@ predict.py
from scipy.misc import imread, imshow
import os
import PIL
from PIL import Image

from model.img2seq import Img2SeqModel
from model.utils.general import Config, run
from model.utils.text import Vocab
from model.utils.image import greyscale, crop_image, pad_image, predictsize, \
    downsample_image, TIMEOUT


def interactive_shell(model):
    """Creates interactive shell to play with model
    """
    model.logger.info("""
This is an interactive mode.
To exit, enter 'exit'.
Enter a path to a file
input> data/images_test/0.png""")

    while True:
        img_path = input("input> ")

        if img_path == "exit":
            break

        if img_path[-3:] == "png":
            img = imread(img_path)

        elif img_path[-3:] == "pdf":
            # call magick to convert the pdf into a png file
            buckets = [
                [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
                [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
                [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200],
                [1000, 400], [1200, 200], [1600, 200], [1600, 1600]
            ]

            dir_output = "tmp/"
            name = img_path.split('/')[-1].split('.')[0]
            run("magick convert -density {} -quality {} {} {}".format(200, 100,
                img_path, dir_output + "{}.png".format(name)), TIMEOUT)
            img_path = dir_output + "{}.png".format(name)
            crop_image(img_path, img_path)
            pad_image(img_path, img_path, buckets=buckets)
            downsample_image(img_path, img_path, 2)
            img = imread(img_path)

        img = predictsize(img)
        im_converted = PIL.Image.fromarray(img)
        im_converted.show()
        img = greyscale(img)
        hyps = model.predict(img)
        with open("norm_formula_val.txt", "w") as f:
            f.write(hyps[0])
        model.logger.info(hyps[0])


if __name__ == "__main__":
    # restore config and model
    dir_output = "results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)

    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    interactive_shell(model)
@@ -0,0 +1,62 @@ train.py
import click

from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.lr_schedule import LRSchedule
from model.utils.general import Config
from model.utils.text import Vocab
from model.utils.image import greyscale


@click.command()
@click.option('--data', default="configs/data.json",
              help='Path to data json config')
@click.option('--vocab', default="configs/vocab.json",
              help='Path to vocab json config')
@click.option('--training', default="configs/training.json",
              help='Path to training json config')
@click.option('--model', default="configs/model.json",
              help='Path to model json config')
@click.option('--output', default="results/full/",
              help='Dir for results and model weights')
def main(data, vocab, training, model, output):
    # Load configs
    dir_output = output
    config = Config([data, vocab, training, model])
    config.save(dir_output)
    vocab = Vocab(config)

    # Load datasets
    train_set = DataGenerator(path_formulas=config.path_formulas_train,
                              dir_images=config.dir_images_train,
                              max_iter=config.max_iter, bucket=config.bucket_train,
                              path_matching=config.path_matching_train,
                              max_len=config.max_length_formula,
                              form_prepro=vocab.form_prepro)
    val_set = DataGenerator(path_formulas=config.path_formulas_val,
                            dir_images=config.dir_images_val,
                            max_iter=config.max_iter, bucket=config.bucket_val,
                            path_matching=config.path_matching_val,
                            max_len=config.max_length_formula,
                            form_prepro=vocab.form_prepro)

    # Define learning rate schedule
    n_batches_epoch = ((len(train_set) + config.batch_size - 1) //
                       config.batch_size)
    lr_schedule = LRSchedule(lr_init=config.lr_init,
                             start_decay=config.start_decay*n_batches_epoch,
                             end_decay=config.end_decay*n_batches_epoch,
                             end_warm=config.end_warm*n_batches_epoch,
                             lr_warm=config.lr_warm,
                             lr_min=config.lr_min)

    # Build model and train
    model = Img2SeqModel(config, dir_output, vocab)
    model.build_train(config)
    # model.restore_session(dir_output + "model.weights/test-model.ckpt")
    model.train(config, train_set, val_set, lr_schedule)


if __name__ == "__main__":
    main()
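For intuition, the schedule parameters in `configs/training.json` below are given in epochs and converted to batch counts via `n_batches_epoch`. With a hypothetical dataset of 1000 training pairs at `batch_size: 20`:

```python
n_train, batch_size = 1000, 20                                # hypothetical dataset size
n_batches_epoch = (n_train + batch_size - 1) // batch_size    # 50 batches per epoch
end_warm = 2 * n_batches_epoch      # warm up at lr_warm=1e-4 for the first 100 batches
start_decay = 6 * n_batches_epoch   # hold lr_init=1e-3 until batch 300
end_decay = 13 * n_batches_epoch    # then decay toward lr_min=1e-4 until batch 650
```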
@@ -0,0 +1,17 @@ configs/training.json
{
    "export_name": "training.json",

    "lr_method"  : "Adam",
    "n_epochs"   : 50,
    "batch_size" : 20,
    "dropout"    : 1,
    "metric_val" : "perplexity",
    "clip"       : -1,

    "lr_init"     : 1e-3,
    "lr_min"      : 1e-4,
    "start_decay" : 6,
    "end_decay"   : 13,
    "lr_warm"     : 1e-4,
    "end_warm"    : 2
}
@@ -0,0 +1,17 @@ configs/training_small.json
{
    "export_name": "training.json",

    "lr_method"  : "Adam",
    "n_epochs"   : 50,
    "batch_size" : 3,
    "dropout"    : 1,
    "metric_val" : "perplexity",
    "clip"       : -1,

    "lr_init"     : 1e-3,
    "lr_min"      : 1e-3,
    "start_decay" : 6,
    "end_decay"   : 13,
    "lr_warm"     : 1e-3,
    "end_warm"    : 0
}
@@ -0,0 +1,28 @@ nets/vgg.py
import tensorflow as tf

slim = tf.contrib.slim


def vgg_arg_scope(weight_decay=0.0005):
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        biases_initializer=tf.zeros_initializer()):
        with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc:
            return arg_sc


def vgg_16(inputs, scope='vgg_16'):
    # VGG-16 truncated after conv5_3: no pool5 or fc layers, so the output
    # is a feature map with stride 16 relative to the input image.
    with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d]):
            net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
            net = slim.max_pool2d(net, [2, 2], scope='pool3')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
            net = slim.max_pool2d(net, [2, 2], scope='pool4')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
    return net
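The four 2x2 max-pools give conv5_3 a stride of 16, which is why `resize_image` in demo2.py rounds images up to multiples of 16 and `anchor_target_layer` is fed a feature stride of `[16]`. A quick shape check, assuming a TF 1.x environment with this repo on the path:

```python
import numpy as np
import tensorflow as tf
from nets import vgg

slim = tf.contrib.slim
images = tf.placeholder(tf.float32, shape=[1, 320, 480, 3])
with slim.arg_scope(vgg.vgg_arg_scope()):
    feats = vgg.vgg_16(images)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(feats, {images: np.zeros((1, 320, 480, 3), np.float32)})
print(out.shape)  # (1, 20, 30, 512): 320/16 by 480/16
```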
@@ -0,0 +1,9 @@ configs/vocab.json
{
    "export_name": "vocab.json",

    "unk": "_UNK",
    "pad": "_PAD",
    "end": "_END",
    "path_vocab": "/Users/iris/im2latex_master2/data2/vocab.txt",
    "min_count_tok": 10
}
@@ -0,0 +1,9 @@ configs/vocab_small.json
{
    "export_name": "vocab.json",

    "unk": "_UNK",
    "pad": "_PAD",
    "end": "_END",
    "path_vocab": "/Users/iris/im2latex_master/data/small_vocab.txt",
    "min_count_tok": 2
}