@@ -0,0 +1,59 @@
# Multilingual Text Recognition

## Introduction

This project uses a CTPN network (based on Faster R-CNN) for text-line detection, with the original network modified for multilingual recognition. A neural network predicts the offsets between text lines and anchors: VGG16 extracts features, a sliding window over the feature map predicts the offsets relative to the anchors, and the result is fed into a bidirectional LSTM to obtain sequence features. Because text-line lengths vary widely, the model predicts only the height of each anchor, and the small-scale text boxes are finally linked together into text lines.

The text recognition network is a seq2seq model with an attention mechanism. The encoder is a CNN, chosen for its high degree of parallelism, with positional embeddings to represent location information; the decoder is an LSTM. Since recognizing mathematical formulas involves long-range dependencies, an attention mechanism is introduced.
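As a rough illustration of one attention step (a sketch with assumed names and shapes; the repository's actual implementation in the model folder may differ):

```python
import numpy as np

def attention_step(encoder_feats, decoder_state, W_e, W_h, v):
    """One additive-attention step (illustrative sketch only).

    encoder_feats: (T, dim_e) CNN encoder outputs, flattened over spatial positions
    decoder_state: (dim_o,)   current LSTM decoder state
    W_e, W_h, v:   learned projections with assumed shapes (dim_e, d), (dim_o, d), (d,)
    """
    # Score every encoder position against the current decoder state.
    scores = np.tanh(encoder_feats @ W_e + decoder_state @ W_h) @ v   # (T,)
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                    # softmax over encoder positions
    context = weights @ encoder_feats           # weighted sum of encoder features
    return context, weights
```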
The training data combines photographed images with images produced by our group that mix LaTeX formulas and ordinary text; image augmentation was applied during training to improve generalization.
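The augmentation pipeline itself is not included in this snapshot; a minimal sketch of the kind of perturbations described (assumed operations, using OpenCV; the project's actual pipeline may differ):

```python
import cv2
import numpy as np

def augment(img):
    """Illustrative augmentation sketch; not the project's actual pipeline."""
    # Random brightness/contrast jitter.
    alpha = np.random.uniform(0.8, 1.2)   # contrast factor
    beta = np.random.uniform(-20, 20)     # brightness offset
    img = cv2.convertScaleAbs(img, alpha=alpha, beta=beta)
    # Small random rotation, mimicking photographed pages.
    h, w = img.shape[:2]
    M = cv2.getRotationMatrix2D((w / 2, h / 2), np.random.uniform(-2, 2), 1.0)
    return cv2.warpAffine(img, M, (w, h), borderValue=(255, 255, 255))
```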
## Structure

- Text-line detection: the main and nets folders. Weights: checkpoints_mlt
- Text recognition: the model folder. Weights: results/full
- Data generation scripts: the generate_data folder
- train: trains the text recognition network
- evaluate_txt: evaluates the text recognition network
- Main/train: trains the text-line detection network
- predict: single-line prediction
- demo2: demonstrates the two networks working together; takes a multi-line image as input for prediction
- Classification: per-image language classification (not used in the final model)
- Data sources:
  1. Self-generated line-level LaTeX and line-level English data, used to train the text recognition network.
     The LaTeX source texts are arXiv papers: http://www.cs.cornell.edu/projects/kddcup/datasets.html
     The English source texts come from the Corpus of Contemporary American English (COCA).
     The processed text data used in the experiments is in data2 and data3.
  2. A self-generated image-level dataset, used to train the text-box detection network.
     English and LaTeX text sources as above.
- Text recognition results:

| Model           | BLEU-4 | Inverse Edit | Perplexity | Exact Match |
| --------------- | ------ | ------------ | ---------- | ----------- |
| Seq2seq mixed   | 86.36  | 88.69        | -1.44      | 36.20       |
| Seq2seq-LaTeX   | 90.10  | 84.12        | -1.32      | 37.21       |
| Seq2seq-English | 97.20  | 97.22        | -1.05      | 88.54       |
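For reference, "Inverse Edit" here is assumed to be the usual token-level metric 1 − edit distance / max length, averaged over the test set and scaled to 100 (an assumption based on common im2latex evaluation practice, not confirmed by this snapshot):

```python
def edit_distance(ref, hyp):
    """Levenshtein distance between two token lists (single-row DP)."""
    d = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        prev, d[0] = d[0], i
        for j, h in enumerate(hyp, 1):
            prev, d[j] = d[j], min(d[j] + 1, d[j - 1] + 1, prev + (r != h))
    return d[-1]

def inverse_edit(refs, hyps):
    """Mean of 1 - dist / max_len over sentence pairs, as a percentage."""
    scores = [1 - edit_distance(r.split(), h.split())
                  / max(len(r.split()), len(h.split()), 1)
              for r, h in zip(refs, hyps)]
    return 100 * sum(scores) / len(scores)
```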
@@ -0,0 +1,47 @@
import click

from model.utils.data_generator import DataGenerator
from model.utils.text import build_vocab, write_vocab
from model.utils.image import build_images
from model.utils.general import Config


@click.command()
@click.option('--data', default="configs/data.json",
              help='Path to data json config')
@click.option('--vocab', default="configs/vocab.json",
              help='Path to vocab json config')
def main(data, vocab):
    data_config = Config(data)

    # datasets
    train_set = DataGenerator(
        path_formulas=data_config.path_formulas_train,
        dir_images=data_config.dir_images_train,
        path_matching=data_config.path_matching_train)
    """
    test_set = DataGenerator(
        path_formulas=data_config.path_formulas_test,
        dir_images=data_config.dir_images_test,
        path_matching=data_config.path_matching_test)
    """
    val_set = DataGenerator(
        path_formulas=data_config.path_formulas_val,
        dir_images=data_config.dir_images_val,
        path_matching=data_config.path_matching_val)

    # produce images and matching files
    train_set.build(buckets=data_config.buckets)
    # test_set.build(buckets=data_config.buckets)
    val_set.build(buckets=data_config.buckets)

    # vocab
    vocab_config = Config(vocab)
    vocab = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(vocab, vocab_config.path_vocab)


if __name__ == "__main__":
    main()
@@ -0,0 +1,29 @@
{
    "export_name": "data.json",

    "dir_images_train": "data2/images_train/",
    "dir_images_test": "data2/images_test/",
    "dir_images_val": "data2/images_val/",

    "path_matching_train": "data2/train.matching.txt",
    "path_matching_val": "data2/val.matching.txt",
    "path_matching_test": "data2/test.matching.txt",

    "path_formulas_train": "data2/train.formulas.norm.txt",
    "path_formulas_test": "data2/test.formulas.norm.txt",
    "path_formulas_val": "data2/val.formulas.norm.txt",

    "bucket_train": true,
    "bucket_val": true,
    "bucket_test": true,

    "max_iter": null,
    "max_length_formula": 150,

    "buckets": [
        [80, 80], [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
        [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
        [720, 120], [720, 200], [800, 80], [800, 100], [1000, 80],
        [1000, 100], [1200, 100], [1600, 80], [1600, 100]
    ]
}
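Each entry in `buckets` above is a candidate padded size `[width, height]`: rendered images are grouped by padding each one up to the smallest bucket that fits it, which keeps batch shapes uniform. A sketch of the selection rule (an assumption about the bucketing logic, with the actual padding done elsewhere):

```python
def pick_bucket(w, h, buckets):
    """Return the smallest bucket (width, height) that fits a w x h image, or None."""
    fitting = [(bw, bh) for bw, bh in buckets if bw >= w and bh >= h]
    return min(fitting, key=lambda b: b[0] * b[1]) if fitting else None
```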
@@ -0,0 +1,29 @@
{
    "export_name": "data.json",

    "dir_images_train": "data/small/",
    "dir_images_test": "data/small/",
    "dir_images_val": "data/small/",

    "path_matching_train": "data/small.matching.txt",
    "path_matching_val": "data/small.matching.txt",
    "path_matching_test": "data/small.matching.txt",

    "path_formulas_train": "data/small.formulas.norm.txt",
    "path_formulas_test": "data/small.formulas.norm.txt",
    "path_formulas_val": "data/small.formulas.norm.txt",

    "max_iter": 20,
    "max_length_formula": 50,

    "bucket_train": true,
    "bucket_val": true,
    "bucket_test": true,

    "buckets": [
        [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
        [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
        [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200],
        [1000, 400], [1200, 200], [1600, 200], [1600, 1600]
    ]
}
@@ -0,0 +1,9 @@
import tensorflow as tf
import numpy as np

# Minimal TF1 sanity check: one conv layer plus a channel-wise softmax,
# run on a dummy all-zero image to verify the installation works.
sess = tf.Session()
inputs = tf.placeholder(dtype=tf.float32, shape=(1, 300, 300, 3))
net = tf.layers.Conv2D(filters=2, kernel_size=3)(inputs)
net = tf.nn.softmax(net, axis=-1)
sess.run(tf.global_variables_initializer())
sess.run(net, feed_dict={inputs: np.zeros(shape=(1, 300, 300, 3), dtype=np.float32)})
@@ -0,0 +1,277 @@
# coding=utf-8
import json
import locale
import os
import shutil
import sys

import cv2
import numpy as np
import tensorflow as tf
from scipy.misc import imread

locale.setlocale(locale.LC_ALL, 'C')
sys.path.append(os.getcwd())

from main import preprocess
from nets import model_train as ctpnmodel
from utils.rpn_msr.proposal_layer import proposal_layer
from utils.text_connector.detectors import TextDetector
from model.img2seq import Img2SeqModel
from model.utils.general import Config, run
from model.utils.text import Vocab
from model.utils.image import greyscale, predictsize

tf.app.flags.DEFINE_string('test_data_path', '/app/image/1.png', '')
tf.app.flags.DEFINE_string('output_path', '/app/im2latex_master/results/predict/', '')
tf.app.flags.DEFINE_string('gpu', '0', '')
tf.app.flags.DEFINE_string('checkpoint_path', '/app/im2latex_master/checkpoints_mlt/', '')
tf.app.flags.DEFINE_integer('language', 2, '')
FLAGS = tf.app.flags.FLAGS


def get_images():
    # Collect all image files under test_data_path.
    files = []
    exts = ['jpg', 'png', 'jpeg', 'JPG']
    for parent, dirnames, filenames in os.walk(FLAGS.test_data_path):
        for filename in filenames:
            for ext in exts:
                if filename.endswith(ext):
                    files.append(os.path.join(parent, filename))
                    break
    print('Found {} images'.format(len(files)))
    return files


def resize_image(img):
    # Scale the shorter side to 600 (capping the longer side at 1200), then
    # round both sides up to multiples of 16, the VGG16 feature-map stride.
    img_size = img.shape
    im_size_min = np.min(img_size[0:2])
    im_size_max = np.max(img_size[0:2])

    im_scale = float(600) / float(im_size_min)
    if np.round(im_scale * im_size_max) > 1200:
        im_scale = float(1200) / float(im_size_max)
    new_h = int(img_size[0] * im_scale)
    new_w = int(img_size[1] * im_scale)

    new_h = new_h if new_h % 16 == 0 else (new_h // 16 + 1) * 16
    new_w = new_w if new_w % 16 == 0 else (new_w // 16 + 1) * 16

    re_im = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return re_im, (new_h / img_size[0], new_w / img_size[1])


def get_box():
    if os.path.exists(FLAGS.output_path):
        shutil.rmtree(FLAGS.output_path)
    os.makedirs(FLAGS.output_path)
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    # Detection graph (CTPN).
    input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
    input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')

    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

    bbox_pred, cls_pred, cls_prob = ctpnmodel.model(input_image, 2)

    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())

    ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
    model_path = os.path.join(FLAGS.checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    saver.restore(sess, model_path)

    # Recognition model (seq2seq with attention).
    dir_output = "/app/im2latex_master/results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    img = cv2.imread(FLAGS.test_data_path)[:, :, ::-1]
    h, w, c = img.shape
    if h > 121:
        # Correct perspective distortion of photographed pages before detection.
        approx, image, (rh, rw) = preprocess.draw_rec(img)
        img = preprocess.Perspective(image, approx)
        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)

    img, (rh, rw) = resize_image(img)
    h, w, c = img.shape
    im_info = np.array([h, w, c]).reshape([1, 3])
    bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
                                           feed_dict={input_image: [img],
                                                      input_im_info: im_info})

    textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info, img)
    scores = textsegs[:, 0:2]  # modified from the original CTPN: two foreground class scores
    textsegs = textsegs[:, 2:6]  # modified from the original CTPN

    textdetector = TextDetector(DETECT_MODE='H')
    boxes = textdetector.detect(textsegs, scores, img.shape[:2], img)
    boxes = np.array(boxes, dtype=np.int)

    # Sort boxes top-to-bottom, then left-to-right.
    image_box = sorted(boxes, key=(lambda x: (x[1] + x[3], x[0] + x[6])))
    for i, box in enumerate(image_box):
        cv2.polylines(img, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
                      thickness=2)
    # Undo the detection-time resize: fx scales width (1/rw), fy scales height (1/rh).
    img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
    cv2.imshow("ss", img)
    cv2.waitKey(0)
    return 0


def save_to_file():
    if os.path.exists(FLAGS.output_path):
        shutil.rmtree(FLAGS.output_path)
    os.makedirs(FLAGS.output_path)
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
    input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')

    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

    bbox_pred, cls_pred, cls_prob = ctpnmodel.model(input_image, 2)

    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())

    ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
    model_path = os.path.join(FLAGS.checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    saver.restore(sess, model_path)

    dir_output = "/app/im2latex_master/results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)

    # English recognition model
    config_vocab_en = Config(dir_output + "vocabe.json")
    vocab_en = Vocab(config_vocab_en)
    model_en = Img2SeqModel(config_model, dir_output, vocab_en)
    model_en.build_pred()
    model_en.restore_session(dir_output + "model.weights_en/test-model.ckpt")

    img = imread(FLAGS.test_data_path)
    h, w, c = img.shape
    res = ""
    if h > 40:
        # Multi-line image: run detection first, then recognize each box.
        approx, image, (rh, rw) = preprocess.draw_rec(img)
        img = preprocess.Perspective(image, approx)
        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
        img, (rh, rw) = resize_image(img)
        h, w, c = img.shape
        im_info = np.array([h, w, c]).reshape([1, 3])
        bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
                                               feed_dict={input_image: [img],
                                                          input_im_info: im_info})

        textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info, img)
        scores = textsegs[:, 0:2]  # modified from the original CTPN: two foreground class scores
        textsegs = textsegs[:, 2:6]  # modified from the original CTPN

        textdetector = TextDetector(DETECT_MODE='H')
        boxes = textdetector.detect(textsegs, scores, img.shape[:2], img)
        boxes = np.array(boxes, dtype=np.int)

        img2 = img.copy()
        for i, box in enumerate(boxes):
            # box[8] is the predicted language flag: 1 = text, otherwise formula.
            if box[8] == 1:
                cv2.polylines(img2, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
                              thickness=2)
            else:
                cv2.polylines(img2, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(255, 0, 0),
                              thickness=2)
        img2 = cv2.resize(img2, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)

        for i, b in enumerate(boxes):
            lan = b[8]
            box = boxes[i]
            img0 = img[min(box[1], box[3]) - 1:max(box[5], box[7]) + 1,
                       min(box[0], box[2]) - 1:max(box[4], box[6]) + 1, ::-1]
            """
            if lan == 2:
                img0 = predictsize(img0)
                img0 = greyscale(img0)
                hyp = model.predict(img0)
                res = res + hyp[0] + "\n"
                model.logger.info(hyp[0])
            else:
            """
            img0 = predictsize(img0)
            img0 = greyscale(img0)
            hyp = model_en.predict(img0)
            res = res + hyp[0] + "\n"
            model_en.logger.info(hyp[0])
            # Alternative: OCR with pytesseract instead of the seq2seq model:
            # hyp = pytesseract.image_to_string(img0)
            # res = res + hyp + "\n"
        res = json.dumps({"res": res})
        model_en.logger.info(res)
    else:
        # Single-line image: recognize directly without detection.
        img = predictsize(img)
        img0 = greyscale(img)
        hyps = model_en.predict(img0)
        res = res + hyps[0] + "\n"
        model_en.logger.info(hyps[0])
        res = json.dumps({"res": res})
        model_en.logger.info(res)
    return 0


'''
Unused variant that writes the detected boxes and crops to files:
cv2.imwrite(os.path.join(FLAGS.output_path, str(i) + '.png'),
            img[min(box[1], box[3]):max(box[5], box[7]), min(box[0], box[2]):max(box[4], box[6]), ::-1])
cv2.polylines(img, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
              thickness=2)
img = cv2.resize(img, None, None, fx=1.0 / rh, fy=1.0 / rw, interpolation=cv2.INTER_LINEAR)
cv2.imwrite(os.path.join(FLAGS.output_path, os.path.basename(im_fn)), img[:, :, ::-1])
with open(os.path.join(FLAGS.output_path, os.path.splitext(os.path.basename(im_fn))[0]) + ".txt",
          "w") as f:
    for i, box in enumerate(boxes):
        line = ",".join(str(box[k]) for k in range(8))
        line += "," + str(scores[i]) + "\r\n"
        f.writelines(line)
'''


def main(argv=None):
    res = save_to_file()
    # res = get_box()
    return res


if __name__ == '__main__':
    tf.app.run()
@@ -0,0 +1,53 @@
import click

from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.general import Config
from model.utils.text import Vocab, load_formulas
from model.utils.image import greyscale, build_images
from model.evaluation.text import score_files
from model.evaluation.image import score_dirs


@click.command()
@click.option('--results', default="results/full/", help='Dir to results')
def main(results):
    # restore config and model
    dir_output = results
    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")

    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights/")

    # load dataset
    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
                             dir_images=config_data.dir_images_test, img_prepro=greyscale,
                             max_iter=1, bucket=config_data.bucket_test,
                             path_matching=config_data.path_matching_test,
                             max_len=config_data.max_length_formula,
                             form_prepro=vocab.form_prepro, bucket_size=1)

    # render images from the reference and hypothesis formulas
    formula_ref = dir_output + "formulas_test/ref.txt"
    formula_hyp = dir_output + "formulas_test/hyp_0.txt"
    images_ref = dir_output + "images_test/ref/"
    images_test = dir_output + "images_test/hyp_0/"
    build_images(load_formulas(formula_ref), images_ref)
    build_images(load_formulas(formula_hyp), images_test)

    # score the two image directories
    scores = score_dirs(images_ref, images_test, greyscale)
    msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in scores.items()])
    model.logger.info("- Eval Img: {}".format(msg))


if __name__ == "__main__":
    main()
@@ -0,0 +1,51 @@
import click

from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.general import Config
from model.utils.text import Vocab, load_formulas
from model.utils.image import greyscale
from model.evaluation.text import score_files


@click.command()
@click.option('--results', default="results/full/", help='Dir to results')
def main(results):
    # restore config and model
    dir_output = results
    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")

    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    # load dataset
    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
                             dir_images=config_data.dir_images_test,
                             max_iter=3000, bucket=config_data.bucket_test,
                             path_matching=config_data.path_matching_test,
                             max_len=config_data.max_length_formula,
                             form_prepro=vocab.form_prepro)

    # use model to write predictions in files
    config_eval = Config({"dir_answers": dir_output + "formulas_test/",
                          "batch_size": 20})
    files, perplexity = model.write_prediction(config_eval, test_set)
    formula_ref, formula_hyp = files[0], files[1]

    # score the ref and prediction files
    scores = score_files(formula_ref, formula_hyp)
    scores["perplexity"] = perplexity

    msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in scores.items()])
    model.logger.info("- Test Txt: {}".format(msg))


if __name__ == "__main__":
    main()
@ -0,0 +1,52 @@ | |||||
install-linux: | |||||
sudo pip install -r requirements.txt | |||||
sudo apt-get install texlive-latex-base | |||||
sudo apt-get install texlive-latex-extra | |||||
sudo apt-get install ghostscript | |||||
sudo apt-get install libgs-dev | |||||
wget http://www.imagemagick.org/download/ImageMagick.tar.gz | |||||
tar -xvf ImageMagick.tar.gz | |||||
cd ImageMagick-7.*; \ | |||||
./configure --with-gslib=yes; \ | |||||
make; \ | |||||
sudo make install; \ | |||||
sudo ldconfig /usr/local/lib | |||||
rm ImageMagick.tar.gz | |||||
rm -r ImageMagick-7.* | |||||
install-mac: | |||||
sudo pip install -r requirements.txt | |||||
wget http://www.imagemagick.org/download/ImageMagick.tar.gz | |||||
tar -xvf ImageMagick.tar.gz | |||||
cd ImageMagick-7.*; \ | |||||
./configure --with-gslib=yes; \ | |||||
make;\ | |||||
sudo make install; \ | |||||
rm ImageMagick.tar.gz | |||||
rm -r ImageMagick-7.* | |||||
build-small: | |||||
python build.py --data=configs/data_small.json --vocab=configs/vocab_small.json | |||||
train-small: | |||||
python train.py --data=configs/data_small.json --vocab=configs/vocab_small.json --training=configs/training_small.json --model=configs/model.json --output=results/small/ | |||||
eval-small: | |||||
python evaluate_txt.py --results=results/small/ | |||||
python evaluate_img.py --results=results/small/ | |||||
small: build-small train-small eval-small | |||||
build: | |||||
python build.py --data=configs/data.json --vocab=configs/vocab.json | |||||
train: | |||||
python train.py --data=configs/data.json --vocab=configs/vocab.json --training=configs/training.json --model=configs/model.json --output=results/full/ | |||||
eval: | |||||
python evaluate_txt.py --results=results/full/ | |||||
python evaluate_img.py --results=results/full/ | |||||
full: build train eval |
@@ -0,0 +1,21 @@
{
    "export_name": "model.json",

    "encoder_cnn": "vanilla",
    "positional_embeddings": true,

    "attn_cell_config": {
        "cell_type": "lstm",
        "num_units": 512,
        "dim_e": 512,
        "dim_o": 512,
        "dim_embeddings": 80
    },

    "decoding": "beam_search",
    "beam_size": 5,
    "div_gamma": 1,
    "div_prob": 0,

    "max_length_formula": 150
}
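Decoding is configured as beam search with `beam_size` 5: at each step only the five best partial sequences by accumulated log probability are kept. A generic sketch of the idea (not this repository's implementation; `step_fn` is a hypothetical callback returning log probabilities over the vocabulary):

```python
import numpy as np

def beam_search(step_fn, start_id, end_id, beam_size=5, max_len=150):
    """Generic beam search sketch. step_fn(prefix) -> log-prob vector over the vocab."""
    beams = [([start_id], 0.0)]
    finished = []
    for _ in range(max_len):
        candidates = []
        for prefix, score in beams:
            log_probs = step_fn(prefix)
            # Expand each beam with its beam_size best next tokens.
            for tok in np.argsort(log_probs)[-beam_size:]:
                candidates.append((prefix + [int(tok)], score + float(log_probs[tok])))
        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_size]
        finished += [b for b in beams if b[0][-1] == end_id]
        beams = [b for b in beams if b[0][-1] != end_id]
        if not beams:
            break
    return max(finished + beams, key=lambda c: c[1])[0]
```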
@@ -0,0 +1,168 @@
import tensorflow as tf
from tensorflow.contrib import slim

from nets import vgg
from utils.rpn_msr.anchor_target_layer import anchor_target_layer as anchor_target_layer_py


def mean_image_subtraction(images, means=[123.68, 116.78, 103.94]):
    # Subtract the per-channel ImageNet means used to train VGG16.
    num_channels = images.get_shape().as_list()[-1]
    if len(means) != num_channels:
        raise ValueError('len(means) must match the number of channels')
    channels = tf.split(axis=3, num_or_size_splits=num_channels, value=images)
    for i in range(num_channels):
        channels[i] -= means[i]
    return tf.concat(axis=3, values=channels)


def make_var(name, shape, initializer=None):
    return tf.get_variable(name, shape, initializer=initializer)


def Bilstm(net, input_channel, hidden_unit_num, output_channel, scope_name):
    # width ---> time step
    with tf.variable_scope(scope_name) as scope:
        shape = tf.shape(net)
        N, H, W, C = shape[0], shape[1], shape[2], shape[3]
        # Treat each feature-map row as a sequence over the width dimension.
        net = tf.reshape(net, [N * H, W, C])
        net.set_shape([None, None, input_channel])

        lstm_fw_cell = tf.contrib.rnn.LSTMCell(hidden_unit_num, state_is_tuple=True)
        lstm_bw_cell = tf.contrib.rnn.LSTMCell(hidden_unit_num, state_is_tuple=True)

        lstm_out, last_state = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, net, dtype=tf.float32)
        lstm_out = tf.concat(lstm_out, axis=-1)

        lstm_out = tf.reshape(lstm_out, [N * H * W, 2 * hidden_unit_num])

        init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.01, mode='FAN_AVG', uniform=False)
        init_biases = tf.constant_initializer(0.0)
        weights = make_var('weights', [2 * hidden_unit_num, output_channel], init_weights)
        biases = make_var('biases', [output_channel], init_biases)

        outputs = tf.matmul(lstm_out, weights) + biases
        outputs = tf.reshape(outputs, [N, H, W, output_channel])
        return outputs


def lstm_fc(net, input_channel, output_channel, scope_name):
    with tf.variable_scope(scope_name) as scope:
        shape = tf.shape(net)
        N, H, W, C = shape[0], shape[1], shape[2], shape[3]
        net = tf.reshape(net, [N * H * W, C])

        init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.01, mode='FAN_AVG', uniform=False)
        init_biases = tf.constant_initializer(0.0)
        weights = make_var('weights', [input_channel, output_channel], init_weights)
        biases = make_var('biases', [output_channel], init_biases)

        output = tf.matmul(net, weights) + biases
        output = tf.reshape(output, [N, H, W, output_channel])
    return output


def model(image, language):  # modified: extra language argument; the classifier head now has 3 classes
    image = mean_image_subtraction(image)
    with slim.arg_scope(vgg.vgg_arg_scope()):
        conv5_3 = vgg.vgg_16(image)

    rpn_conv = slim.conv2d(conv5_3, 512, 3)

    lstm_output = Bilstm(rpn_conv, 512, 128, 512, scope_name='BiLSTM')

    # 10 anchors per position; 4 box coordinates and 3 class scores per anchor.
    bbox_pred = lstm_fc(lstm_output, 512, 10 * 4, scope_name="bbox_pred")
    cls_pred = lstm_fc(lstm_output, 512, 10 * 3, scope_name="cls_pred")  # modified: 3 classes

    # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
    cls_pred_shape = tf.shape(cls_pred)
    cls_pred_reshape = tf.reshape(cls_pred, [cls_pred_shape[0], cls_pred_shape[1], -1, 3])  # modified: 3 classes

    cls_pred_reshape_shape = tf.shape(cls_pred_reshape)
    cls_prob = tf.reshape(tf.nn.softmax(tf.reshape(cls_pred_reshape, [-1, cls_pred_reshape_shape[3]])),
                          [-1, cls_pred_reshape_shape[1], cls_pred_reshape_shape[2], cls_pred_reshape_shape[3]],
                          name="cls_prob")

    return bbox_pred, cls_pred, cls_prob


def anchor_target_layer(cls_pred, bbox, im_info, scope_name):
    with tf.variable_scope(scope_name) as scope:
        # 'rpn_cls_score', 'gt_boxes', 'im_info'
        rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = \
            tf.py_func(anchor_target_layer_py,
                       [cls_pred, bbox, im_info, [16, ], [16]],
                       [tf.float32, tf.float32, tf.float32, tf.float32])

        rpn_labels = tf.convert_to_tensor(tf.cast(rpn_labels, tf.int32),
                                          name='rpn_labels')
        rpn_bbox_targets = tf.convert_to_tensor(rpn_bbox_targets,
                                                name='rpn_bbox_targets')
        rpn_bbox_inside_weights = tf.convert_to_tensor(rpn_bbox_inside_weights,
                                                       name='rpn_bbox_inside_weights')
        rpn_bbox_outside_weights = tf.convert_to_tensor(rpn_bbox_outside_weights,
                                                        name='rpn_bbox_outside_weights')

        return [rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights]


def smooth_l1_dist(deltas, sigma2=9.0, name='smooth_l1_dist'):
    with tf.name_scope(name=name) as scope:
        deltas_abs = tf.abs(deltas)
        smoothL1_sign = tf.cast(tf.less(deltas_abs, 1.0 / sigma2), tf.float32)
        return tf.square(deltas) * 0.5 * sigma2 * smoothL1_sign + \
               (deltas_abs - 0.5 / sigma2) * tf.abs(smoothL1_sign - 1)
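
# For reference, smooth_l1_dist above implements (with sigma2 = sigma^2 = 9):
#   smooth_L1(x) = 0.5 * sigma^2 * x^2     if |x| < 1 / sigma^2
#                  |x| - 0.5 / sigma^2     otherwise
# i.e. the loss is quadratic near zero and linear for large residuals.
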
def loss(bbox_pred, cls_pred, bbox, im_info):
    # rpn_labels: (HxWxA, 1); for each anchor, 0 denotes bg, 1 fg, -1 dontcare
    # rpn_bbox_targets: (HxWxA, 4), distances of the anchors to the gt_boxes
    #     (possibly transformed), which are the regression objectives
    # rpn_bbox_inside_weights: (HxWxA, 4), per-box weights, mainly taken from
    #     hyperparameters in cfg
    # rpn_bbox_outside_weights: (HxWxA, 4), used to balance fg/bg, because the
    #     numbers of bg and fg anchors may differ significantly
    rpn_data = anchor_target_layer(cls_pred, bbox, im_info, "anchor_target_layer")  # modified

    # classification loss
    # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
    cls_pred_shape = tf.shape(cls_pred)
    cls_pred_reshape = tf.reshape(cls_pred, [cls_pred_shape[0], cls_pred_shape[1], -1, 3])  # modified: 3 classes
    rpn_cls_score = tf.reshape(cls_pred_reshape, [-1, 3])  # modified: 3 classes
    rpn_label = tf.reshape(rpn_data[0], [-1])

    # ignore_label(-1)
    fg_keep = tf.not_equal(rpn_label, -1) & tf.not_equal(rpn_label, 0)  # modified: foreground anchors only
    rpn_keep = tf.where(tf.not_equal(rpn_label, -1))
    rpn_cls_score = tf.gather(rpn_cls_score, rpn_keep)
    rpn_label = tf.gather(rpn_label, rpn_keep)
    rpn_cross_entropy_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=rpn_label, logits=rpn_cls_score)

    # box loss
    rpn_bbox_pred = bbox_pred
    rpn_bbox_targets = rpn_data[1]
    rpn_bbox_inside_weights = rpn_data[2]
    rpn_bbox_outside_weights = rpn_data[3]

    rpn_bbox_pred = tf.gather(tf.reshape(rpn_bbox_pred, [-1, 4]), rpn_keep)  # shape (N, 4)
    rpn_bbox_targets = tf.gather(tf.reshape(rpn_bbox_targets, [-1, 4]), rpn_keep)
    rpn_bbox_inside_weights = tf.gather(tf.reshape(rpn_bbox_inside_weights, [-1, 4]), rpn_keep)
    rpn_bbox_outside_weights = tf.gather(tf.reshape(rpn_bbox_outside_weights, [-1, 4]), rpn_keep)

    rpn_loss_box_n = tf.reduce_sum(rpn_bbox_outside_weights * smooth_l1_dist(
        rpn_bbox_inside_weights * (rpn_bbox_pred - rpn_bbox_targets)), reduction_indices=[1])

    rpn_loss_box = tf.reduce_sum(rpn_loss_box_n) / (tf.reduce_sum(tf.cast(fg_keep, tf.float32)) + 1)
    rpn_cross_entropy = tf.reduce_mean(rpn_cross_entropy_n)

    model_loss = rpn_cross_entropy + rpn_loss_box

    regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = tf.add_n(regularization_losses) + model_loss

    tf.summary.scalar('model_loss', model_loss)
    tf.summary.scalar('total_loss', total_loss)
    tf.summary.scalar('rpn_cross_entropy', rpn_cross_entropy)
    tf.summary.scalar('rpn_loss_box', rpn_loss_box)

    return total_loss, model_loss, rpn_cross_entropy, rpn_loss_box
@@ -0,0 +1,74 @@
from scipy.misc import imread
from PIL import Image

from model.img2seq import Img2SeqModel
from model.utils.general import Config, run
from model.utils.text import Vocab
from model.utils.image import greyscale, crop_image, pad_image, predictsize, \
    downsample_image, TIMEOUT


def interactive_shell(model):
    """Creates an interactive shell to play with the model."""
    model.logger.info("""
This is an interactive mode.
To exit, enter 'exit'.
Enter a path to a file
input> data/images_test/0.png""")

    while True:
        img_path = input("input> ")
        if img_path == "exit":
            break

        if img_path[-3:] == "png":
            img = imread(img_path)
        elif img_path[-3:] == "pdf":
            # call magick to convert the pdf into a png file
            buckets = [
                [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
                [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
                [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200],
                [1000, 400], [1200, 200], [1600, 200], [1600, 1600]
            ]

            dir_output = "tmp/"
            name = img_path.split('/')[-1].split('.')[0]
            run("magick convert -density {} -quality {} {} {}".format(200, 100,
                img_path, dir_output + "{}.png".format(name)), TIMEOUT)
            img_path = dir_output + "{}.png".format(name)
            crop_image(img_path, img_path)
            pad_image(img_path, img_path, buckets=buckets)
            downsample_image(img_path, img_path, 2)
            img = imread(img_path)

        # resize to the model's expected input size, show, and predict
        img = predictsize(img)
        im_converted = Image.fromarray(img)
        im_converted.show()
        img = greyscale(img)
        hyps = model.predict(img)
        with open("norm_formula_val.txt", "w") as f:
            f.write(hyps[0])
        model.logger.info(hyps[0])


if __name__ == "__main__":
    # restore config and model
    dir_output = "results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)

    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    interactive_shell(model)
@@ -0,0 +1,62 @@
import click

from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.lr_schedule import LRSchedule
from model.utils.general import Config
from model.utils.text import Vocab
from model.utils.image import greyscale


@click.command()
@click.option('--data', default="configs/data.json",
              help='Path to data json config')
@click.option('--vocab', default="configs/vocab.json",
              help='Path to vocab json config')
@click.option('--training', default="configs/training.json",
              help='Path to training json config')
@click.option('--model', default="configs/model.json",
              help='Path to model json config')
@click.option('--output', default="results/full/",
              help='Dir for results and model weights')
def main(data, vocab, training, model, output):
    # Load configs
    dir_output = output
    config = Config([data, vocab, training, model])
    config.save(dir_output)
    vocab = Vocab(config)

    # Load datasets
    train_set = DataGenerator(path_formulas=config.path_formulas_train,
                              dir_images=config.dir_images_train,
                              max_iter=config.max_iter, bucket=config.bucket_train,
                              path_matching=config.path_matching_train,
                              max_len=config.max_length_formula,
                              form_prepro=vocab.form_prepro)
    val_set = DataGenerator(path_formulas=config.path_formulas_val,
                            dir_images=config.dir_images_val,
                            max_iter=config.max_iter, bucket=config.bucket_val,
                            path_matching=config.path_matching_val,
                            max_len=config.max_length_formula,
                            form_prepro=vocab.form_prepro)

    # Define learning rate schedule
    n_batches_epoch = ((len(train_set) + config.batch_size - 1) //
                       config.batch_size)
    lr_schedule = LRSchedule(lr_init=config.lr_init,
                             start_decay=config.start_decay * n_batches_epoch,
                             end_decay=config.end_decay * n_batches_epoch,
                             end_warm=config.end_warm * n_batches_epoch,
                             lr_warm=config.lr_warm,
                             lr_min=config.lr_min)

    # Build model and train
    model = Img2SeqModel(config, dir_output, vocab)
    model.build_train(config)
    # model.restore_session(dir_output + "model.weights/test-model.ckpt")
    model.train(config, train_set, val_set, lr_schedule)


if __name__ == "__main__":
    main()
@@ -0,0 +1,17 @@
{
    "export_name": "training.json",

    "lr_method": "Adam",
    "n_epochs": 50,
    "batch_size": 20,
    "dropout": 1,
    "metric_val": "perplexity",
    "clip": -1,

    "lr_init": 1e-3,
    "lr_min": 1e-4,
    "start_decay": 6,
    "end_decay": 13,
    "lr_warm": 1e-4,
    "end_warm": 2
}
@@ -0,0 +1,17 @@
{
    "export_name": "training.json",

    "lr_method": "Adam",
    "n_epochs": 50,
    "batch_size": 3,
    "dropout": 1,
    "metric_val": "perplexity",
    "clip": -1,

    "lr_init": 1e-3,
    "lr_min": 1e-3,
    "start_decay": 6,
    "end_decay": 13,
    "lr_warm": 1e-3,
    "end_warm": 0
}
@@ -0,0 +1,28 @@
import tensorflow as tf

slim = tf.contrib.slim


def vgg_arg_scope(weight_decay=0.0005):
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        biases_initializer=tf.zeros_initializer()):
        with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc:
            return arg_sc


def vgg_16(inputs, scope='vgg_16'):
    # VGG16 truncated after conv5_3: four 2x2 poolings give a total stride of 16.
    with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d]):
            net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
            net = slim.max_pool2d(net, [2, 2], scope='pool3')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
            net = slim.max_pool2d(net, [2, 2], scope='pool4')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
    return net
@@ -0,0 +1,9 @@
{
    "export_name": "vocab.json",
    "unk": "_UNK",
    "pad": "_PAD",
    "end": "_END",
    "path_vocab": "/Users/iris/im2latex_master2/data2/vocab.txt",
    "min_count_tok": 10
}
@@ -0,0 +1,9 @@
{
    "export_name": "vocab.json",
    "unk": "_UNK",
    "pad": "_PAD",
    "end": "_END",
    "path_vocab": "/Users/iris/im2latex_master/data/small_vocab.txt",
    "min_count_tok": 2
}