diff --git a/im2latex_master/README.md b/im2latex_master/README.md
new file mode 100644
index 0000000..e4c3220
--- /dev/null
+++ b/im2latex_master/README.md
@@ -0,0 +1,59 @@
+# Multilingual Text Recognition
+
+## Introduction
+
+Text-line detection uses a CTPN network based on Faster R-CNN, modified here to support multilingual recognition. A neural network predicts the offsets between text lines and anchors: VGG16 extracts features, a sliding window over the feature map predicts the offset to each anchor, and the results are fed into a bidirectional LSTM to obtain sequence features. Because text-line lengths vary widely, the model predicts only the anchor heights and finally links the small-scale text proposals into complete lines.
+
+Text recognition uses a seq2seq model with an attention mechanism. The encoder is a CNN, chosen for its high degree of parallelism, with positional embeddings to represent location information; the decoder is an LSTM. Since recognizing mathematical formulas involves long-range dependencies, an attention mechanism is introduced (see the sketch below).
+
+The training data combines photographed images with images produced by our group that mix LaTeX formulas and ordinary text; image augmentation is applied during training to improve generalization.
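+The decoder attends over the flattened CNN feature grid at every step. Below is a
+minimal sketch of one additive-attention step (NumPy, illustrative only; the real
+implementation lives in `model/` and operates on LSTM states inside TensorFlow):
+
+```python
+import numpy as np
+
+def attention_step(enc_feats, h_dec, W_e, W_h, v):
+    """One step of additive attention over encoder features.
+
+    enc_feats: (T, dim_e) flattened CNN feature grid
+    h_dec:     (dim_o,)   current decoder state
+    W_e, W_h, v: learned projections (random placeholders here)
+    """
+    scores = np.tanh(enc_feats @ W_e + h_dec @ W_h) @ v   # (T,)
+    alpha = np.exp(scores - scores.max())
+    alpha /= alpha.sum()                                  # attention weights
+    context = alpha @ enc_feats                           # weighted sum, (dim_e,)
+    return context, alpha
+
+# toy shapes matching model.json (dim_e = dim_o = 512)
+rng = np.random.default_rng(0)
+T, dim_e, dim_o, dim_a = 60, 512, 512, 256
+context, alpha = attention_step(rng.normal(size=(T, dim_e)),
+                                rng.normal(size=dim_o),
+                                rng.normal(size=(dim_e, dim_a)),
+                                rng.normal(size=(dim_o, dim_a)),
+                                rng.normal(size=dim_a))
+```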
"path_matching_train": "data2/train.matching.txt", + "path_matching_val" : "data2/val.matching.txt", + "path_matching_test" : "data2/test.matching.txt", + + "path_formulas_train": "data2/train.formulas.norm.txt", + "path_formulas_test" : "data2/test.formulas.norm.txt", + "path_formulas_val" : "data2/val.formulas.norm.txt", + + "bucket_train": true, + "bucket_val": true, + "bucket_test": true, + + "max_iter" : null, + "max_length_formula": 150, + + "buckets": [ + [80,80],[240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100], + [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100], + [720, 120], [720, 200], [800, 80], [800, 100], [1000, 80], + [1000, 100], [1200, 100], [1600, 80], [1600, 100] + ] +} diff --git a/im2latex_master/data_small.json b/im2latex_master/data_small.json new file mode 100644 index 0000000..d16e451 --- /dev/null +++ b/im2latex_master/data_small.json @@ -0,0 +1,29 @@ +{ + "export_name": "data.json", + + "dir_images_train": "data/small/", + "dir_images_test" : "data/small/", + "dir_images_val" : "data/small/", + + "path_matching_train": "data/small.matching.txt", + "path_matching_val" : "data/small.matching.txt", + "path_matching_test" : "data/small.matching.txt", + + "path_formulas_train": "data/small.formulas.norm.txt", + "path_formulas_test" : "data/small.formulas.norm.txt", + "path_formulas_val" : "data/small.formulas.norm.txt", + + "max_iter" : 20, + "max_length_formula": 50, + + "bucket_train": true, + "bucket_val": true, + "bucket_test": true, + + "buckets": [ + [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100], + [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100], + [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200], + [1000, 400], [1200, 200], [1600, 200], [1600, 1600] + ] +} \ No newline at end of file diff --git a/im2latex_master/debug.py b/im2latex_master/debug.py new file mode 100644 index 0000000..4fa3f13 --- /dev/null +++ b/im2latex_master/debug.py @@ -0,0 +1,9 @@ +import tensorflow as tf +import numpy as np + +sess = tf.Session() +inputs = tf.placeholder(dtype=tf.float32, shape=(1, 300, 300, 3)) +net = tf.layers.Conv2D(filters=2, kernel_size=3)(inputs) +net = tf.nn.softmax(net, axis=-1) +sess.run(tf.global_variables_initializer()) +sess.run(net, feed_dict={inputs: np.zeros(shape=(1, 300, 300, 3), dtype=np.float32)}) diff --git a/im2latex_master/demo2.py b/im2latex_master/demo2.py new file mode 100644 index 0000000..3f01d43 --- /dev/null +++ b/im2latex_master/demo2.py @@ -0,0 +1,277 @@ +# coding=utf-8 +import os +import shutil +import sys +import time +import cv2 +import numpy as np +import tensorflow as tf +from main import preprocess +import json +import locale +locale.setlocale(locale.LC_ALL, 'C') + +from scipy.misc import imread +#current_directory = os.path.dirname(os.path.abspath(__file__)) +#root_path = os.path.abspath(os.path.dirname(current_directory) + os.path.sep + ".") +#sys.path.append(sys.path.append(os.getcwd())) + +sys.path.append(os.getcwd()) +from nets import model_train as ctpnmodel +from utils.rpn_msr.proposal_layer import proposal_layer +from utils.text_connector.detectors import TextDetector + +from scipy.misc import imread +import os +from PIL import Image + + +from model.img2seq import Img2SeqModel +from model.utils.general import Config, run +from model.utils.text import Vocab +from model.utils.image import greyscale,predictsize + + +tf.app.flags.DEFINE_string('test_data_path', '/app/image/1.png', '') +tf.app.flags.DEFINE_string('output_path', 
diff --git a/im2latex_master/debug.py b/im2latex_master/debug.py
new file mode 100644
index 0000000..4fa3f13
--- /dev/null
+++ b/im2latex_master/debug.py
@@ -0,0 +1,9 @@
+import tensorflow as tf
+import numpy as np
+
+sess = tf.Session()
+inputs = tf.placeholder(dtype=tf.float32, shape=(1, 300, 300, 3))
+net = tf.layers.Conv2D(filters=2, kernel_size=3)(inputs)
+net = tf.nn.softmax(net, axis=-1)
+sess.run(tf.global_variables_initializer())
+sess.run(net, feed_dict={inputs: np.zeros(shape=(1, 300, 300, 3), dtype=np.float32)})
diff --git a/im2latex_master/demo2.py b/im2latex_master/demo2.py
new file mode 100644
index 0000000..3f01d43
--- /dev/null
+++ b/im2latex_master/demo2.py
@@ -0,0 +1,277 @@
+# coding=utf-8
+import os
+import shutil
+import sys
+import time
+import json
+import locale
+
+import cv2
+import numpy as np
+import tensorflow as tf
+from scipy.misc import imread
+from PIL import Image
+
+from main import preprocess
+
+locale.setlocale(locale.LC_ALL, 'C')
+
+sys.path.append(os.getcwd())
+from nets import model_train as ctpnmodel
+from utils.rpn_msr.proposal_layer import proposal_layer
+from utils.text_connector.detectors import TextDetector
+
+from model.img2seq import Img2SeqModel
+from model.utils.general import Config, run
+from model.utils.text import Vocab
+from model.utils.image import greyscale, predictsize
+
+
+tf.app.flags.DEFINE_string('test_data_path', '/app/image/1.png', '')
+tf.app.flags.DEFINE_string('output_path', '/app/im2latex_master/results/predict/', '')
+tf.app.flags.DEFINE_string('gpu', '0', '')
+tf.app.flags.DEFINE_string('checkpoint_path', '/app/im2latex_master/checkpoints_mlt/', '')
+tf.app.flags.DEFINE_integer('language', 2, '')
+FLAGS = tf.app.flags.FLAGS
+
+
+def get_images():
+    files = []
+    exts = ['jpg', 'png', 'jpeg', 'JPG']
+    for parent, dirnames, filenames in os.walk(FLAGS.test_data_path):
+        for filename in filenames:
+            for ext in exts:
+                if filename.endswith(ext):
+                    files.append(os.path.join(parent, filename))
+                    break
+    print('Find {} images'.format(len(files)))
+    return files
+
+
+def resize_image(img):
+    img_size = img.shape
+    im_size_min = np.min(img_size[0:2])
+    im_size_max = np.max(img_size[0:2])
+
+    im_scale = float(600) / float(im_size_min)
+    if np.round(im_scale * im_size_max) > 1200:
+        im_scale = float(1200) / float(im_size_max)
+    new_h = int(img_size[0] * im_scale)
+    new_w = int(img_size[1] * im_scale)
+
+    # round height and width up to the nearest multiple of 16
+    new_h = new_h if new_h % 16 == 0 else (new_h // 16 + 1) * 16
+    new_w = new_w if new_w % 16 == 0 else (new_w // 16 + 1) * 16
+
+    re_im = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
+    return re_im, (new_h / img_size[0], new_w / img_size[1])
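+
+
+# Note: the CTPN feature map comes from VGG16 after four 2x2 poolings, so the
+# detector's effective stride is 16 pixels. resize_image therefore rounds both
+# dimensions up to multiples of 16 so the anchor grid tiles the image exactly.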
+
+
+def get_box():
+    if os.path.exists(FLAGS.output_path):
+        shutil.rmtree(FLAGS.output_path)
+    os.makedirs(FLAGS.output_path)
+    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
+
+    input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
+    input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')
+
+    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
+
+    bbox_pred, cls_pred, cls_prob = ctpnmodel.model(input_image, 2)
+
+    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
+    saver = tf.train.Saver(variable_averages.variables_to_restore())
+
+    ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
+    model_path = os.path.join(FLAGS.checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
+    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
+    saver.restore(sess, model_path)
+
+    dir_output = "/app/im2latex_master/results/full/"
+    config_vocab = Config(dir_output + "vocab.json")
+    config_model = Config(dir_output + "model.json")
+    vocab = Vocab(config_vocab)
+    model = Img2SeqModel(config_model, dir_output, vocab)
+    model.build_pred()
+    model.restore_session(dir_output + "model.weights4/test-model.ckpt")
+
+    img = cv2.imread(FLAGS.test_data_path)[:, :, ::-1]
+    h, w, c = img.shape
+    if h > 121:
+        approx, image, (rh, rw) = preprocess.draw_rec(img)
+
+        img = preprocess.Perspective(image, approx)
+        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
+
+        img, (rh, rw) = resize_image(img)
+        h, w, c = img.shape
+        im_info = np.array([h, w, c]).reshape([1, 3])
+        bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
+                                               feed_dict={input_image: [img],
+                                                          input_im_info: im_info})
+
+        textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info, img)
+        scores = textsegs[:, 0:2]    # changed from the original CTPN: two class scores
+        textsegs = textsegs[:, 2:6]  # changed from the original CTPN
+
+        textdetector = TextDetector(DETECT_MODE='H')
+        boxes = textdetector.detect(textsegs, scores, img.shape[:2], img)
+        boxes = np.array(boxes, dtype=np.int32)
+        image_box = sorted(boxes, key=(lambda x: (x[1] + x[3], x[0] + x[6])))
+        for i, box in enumerate(image_box):
+            cv2.polylines(img, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
+                          thickness=2)
+        img = cv2.resize(img, None, None, fx=1.0 / rh, fy=1.0 / rw, interpolation=cv2.INTER_LINEAR)
+        cv2.imshow("ss", img)
+        cv2.waitKey(0)
+    return 0
+
+
+def save_to_file():
+    if os.path.exists(FLAGS.output_path):
+        shutil.rmtree(FLAGS.output_path)
+    os.makedirs(FLAGS.output_path)
+    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
+
+    input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
+    input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')
+
+    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
+
+    bbox_pred, cls_pred, cls_prob = ctpnmodel.model(input_image, 2.0)
+
+    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
+    saver = tf.train.Saver(variable_averages.variables_to_restore())
+    ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
+    model_path = os.path.join(FLAGS.checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
+
+    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
+    saver.restore(sess, model_path)
+
+    dir_output = "/app/im2latex_master/results/full/"
+    config_vocab = Config(dir_output + "vocab.json")
+    config_model = Config(dir_output + "model.json")
+    vocab = Vocab(config_vocab)
+
+    # English model
+    config_vocab_en = Config(dir_output + "vocabe.json")
+    vocab_en = Vocab(config_vocab_en)
+    model_en = Img2SeqModel(config_model, dir_output, vocab_en)
+    model_en.build_pred()
+    model_en.restore_session(dir_output + "model.weights_en/test-model.ckpt")
+
+    img = imread(FLAGS.test_data_path)
+    h, w, c = img.shape
+    res = ""
+    if h > 40:
+        approx, image, (rh, rw) = preprocess.draw_rec(img)
+
+        img = preprocess.Perspective(image, approx)
+        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
+
+        img, (rh, rw) = resize_image(img)
+        h, w, c = img.shape
+        im_info = np.array([h, w, c]).reshape([1, 3])
+        bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
+                                               feed_dict={input_image: [img],
+                                                          input_im_info: im_info})
+
+        textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info, img)
+        scores = textsegs[:, 0:2]    # changed from the original CTPN: two class scores
+        textsegs = textsegs[:, 2:6]  # changed from the original CTPN
+
+        textdetector = TextDetector(DETECT_MODE='H')
+        boxes = textdetector.detect(textsegs, scores, img.shape[:2], img)
+
+        boxes = np.array(boxes, dtype=np.int32)
+        img2 = img.copy()
+        for i, box in enumerate(boxes):
+            if box[8] == 1:
+                cv2.polylines(img2, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
+                              thickness=2)
+            else:
+                cv2.polylines(img2, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(255, 0, 0),
+                              thickness=2)
+        img2 = cv2.resize(img2, None, None, fx=1.0 / rh, fy=1.0 / rw, interpolation=cv2.INTER_LINEAR)
+        for i, b in enumerate(boxes):
+            lan = b[8]
+            box = boxes[i]
+            img0 = img[min(box[1], box[3]) - 1:max(box[5], box[7]) + 1,
+                       min(box[0], box[2]) - 1:max(box[4], box[6]) + 1, ::-1]
+
+            """
+            if lan == 2:
+                img0 = predictsize(img0)
+                img0 = greyscale(img0)
+                hyp = model.predict(img0)
+                res = res + hyp[0] + "\n"
+                model.logger.info(hyp[0])
+            else:
+            """
+            img0 = predictsize(img0)
+            img0 = greyscale(img0)
+            hyp = model_en.predict(img0)
+            res = res + hyp[0] + "\n"
+            model_en.logger.info(hyp[0])
+            # hyp = pytesseract.image_to_string(img0)
+            # res = res + hyp + "\n"
+            # model.logger.info(hyp)
+        res = json.dumps({"res": res})
+        model_en.logger.info(res)
+    else:
+        img = predictsize(img)
+        img0 = greyscale(img)
+        hyps = model_en.predict(img0)
+        res = res + hyps[0] + "\n"
+        model_en.logger.info(hyps[0])
+        res = json.dumps({"res": res})
+        model_en.logger.info(res)
+
+    return 0
+
+
+'''
+    cv2.imwrite(os.path.join(FLAGS.output_path, str(i) + '.png'),
+                img[min(box[1], box[3]):max(box[5], box[7]), min(box[0], box[2]):max(box[4], box[6]), ::-1])
+    cv2.polylines(img, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
+                  thickness=2)
+    img = cv2.resize(img, None, None, fx=1.0 / rh, fy=1.0 / rw, interpolation=cv2.INTER_LINEAR)
+    cv2.imwrite(os.path.join(FLAGS.output_path, os.path.basename(im_fn)), img[:, :, ::-1])
+
+    with open(os.path.join(FLAGS.output_path, os.path.splitext(os.path.basename(im_fn))[0]) + ".txt",
+              "w") as f:
+        for i, box in enumerate(boxes):
+            line = ",".join(str(box[k]) for k in range(8))
+            line += "," + str(scores[i]) + "\r\n"
+            f.writelines(line)
+'''
+
+
+def main(argv=None):
+    res = save_to_file()
+    # res = get_box()
+    return res
+
+
+if __name__ == '__main__':
+    tf.app.run()
diff --git a/im2latex_master/encoder.pyc b/im2latex_master/encoder.pyc
new file mode 100644
index 0000000..156658f
Binary files /dev/null and b/im2latex_master/encoder.pyc differ
diff --git a/im2latex_master/evaluate_img.py b/im2latex_master/evaluate_img.py
new file mode 100644
index 0000000..f516704
--- /dev/null
+++ b/im2latex_master/evaluate_img.py
@@ -0,0 +1,53 @@
+import click
+
+from model.utils.data_generator import DataGenerator
+from model.img2seq import Img2SeqModel
+from model.utils.general import Config
+from model.utils.text import Vocab, load_formulas
+from model.utils.image import greyscale, build_images
+
+from model.evaluation.text import score_files
+from model.evaluation.image import score_dirs
+
+
+@click.command()
+@click.option('--results', default="results/full/", help='Dir to results')
+def main(results):
+    # restore config and model
+    dir_output = results
+
+    config_data = Config(dir_output + "data.json")
+    config_vocab = Config(dir_output + "vocab.json")
+    config_model = Config(dir_output + "model.json")
+
+    vocab = Vocab(config_vocab)
+    model = Img2SeqModel(config_model, dir_output, vocab)
+    model.build_pred()
+    model.restore_session(dir_output + "model.weights/")
+
+    # load dataset
+    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
+                             dir_images=config_data.dir_images_test, img_prepro=greyscale,
+                             max_iter=1, bucket=config_data.bucket_test,
+                             path_matching=config_data.path_matching_test,
+                             max_len=config_data.max_length_formula,
+                             form_prepro=vocab.form_prepro, bucket_size=1)
+
+    # build images from formulas
+    formula_ref = dir_output + "formulas_test/ref.txt"
+    formula_hyp = dir_output + "formulas_test/hyp_0.txt"
+    images_ref = dir_output + "images_test/ref/"
+    images_test = dir_output + "images_test/hyp_0/"
+    build_images(load_formulas(formula_ref), images_ref)
+    build_images(load_formulas(formula_hyp), images_test)
+
+    # score the directories
+    scores = score_dirs(images_ref, images_test, greyscale)
+    msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in scores.items()])
+    model.logger.info("- Eval Img: {}".format(msg))
+
+
+if __name__ == "__main__":
+    main()
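The evaluation scripts report the metrics from the README table (BLEU-4, Inverse Edit, perplexity, Exact Match). Reading "Inverse Edit" as one minus the normalized Levenshtein distance, a sketch of that metric would look like the following (an assumption on our part; the actual scoring code is in `model/evaluation/text.py`, which this diff does not include):

```python
def edit_distance(a, b):
    """Levenshtein distance between two token sequences (single-row DP)."""
    dp = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        prev, dp[0] = dp[0], i
        for j, y in enumerate(b, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (x != y))
    return dp[-1]

def inverse_edit(ref_tokens, hyp_tokens):
    """1 - normalized edit distance, in percent: 100 means a perfect match."""
    d = edit_distance(ref_tokens, hyp_tokens)
    return 100.0 * (1.0 - d / max(len(ref_tokens), len(hyp_tokens), 1))

print(inverse_edit("x ^ { 2 }".split(), "x ^ { 3 }".split()))  # 80.0
```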
diff --git a/im2latex_master/evaluate_txt.py b/im2latex_master/evaluate_txt.py
new file mode 100644
index 0000000..887acfb
--- /dev/null
+++ b/im2latex_master/evaluate_txt.py
@@ -0,0 +1,51 @@
+import click
+
+from model.utils.data_generator import DataGenerator
+from model.img2seq import Img2SeqModel
+from model.utils.general import Config
+from model.utils.text import Vocab
+from model.utils.image import greyscale
+
+from model.utils.text import load_formulas
+from model.evaluation.text import score_files
+
+
+@click.command()
+@click.option('--results', default="results/full/", help='Dir to results')
+def main(results):
+    # restore config and model
+    dir_output = results
+
+    config_data = Config(dir_output + "data.json")
+    config_vocab = Config(dir_output + "vocab.json")
+    config_model = Config(dir_output + "model.json")
+
+    vocab = Vocab(config_vocab)
+    model = Img2SeqModel(config_model, dir_output, vocab)
+    model.build_pred()
+    model.restore_session(dir_output + "model.weights4/test-model.ckpt")
+
+    # load dataset
+    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
+                             dir_images=config_data.dir_images_test,
+                             max_iter=3000, bucket=config_data.bucket_test,
+                             path_matching=config_data.path_matching_test,
+                             max_len=config_data.max_length_formula,
+                             form_prepro=vocab.form_prepro)
+
+    # use model to write predictions in files
+    config_eval = Config({"dir_answers": dir_output + "formulas_test/",
+                          "batch_size": 20})
+    files, perplexity = model.write_prediction(config_eval, test_set)
+    formula_ref, formula_hyp = files[0], files[1]
+
+    # score the ref and prediction files
+    scores = score_files(formula_ref, formula_hyp)
+    scores["perplexity"] = perplexity
+    msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in scores.items()])
+    model.logger.info("- Test Txt: {}".format(msg))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/im2latex_master/makefile b/im2latex_master/makefile
new file mode 100644
index 0000000..578f549
--- /dev/null
+++ b/im2latex_master/makefile
@@ -0,0 +1,52 @@
+install-linux:
+	sudo pip install -r requirements.txt
+	sudo apt-get install texlive-latex-base
+	sudo apt-get install texlive-latex-extra
+
+	sudo apt-get install ghostscript
+	sudo apt-get install libgs-dev
+
+	wget http://www.imagemagick.org/download/ImageMagick.tar.gz
+	tar -xvf ImageMagick.tar.gz
+	cd ImageMagick-7.*; \
+	./configure --with-gslib=yes; \
+	make; \
+	sudo make install; \
+	sudo ldconfig /usr/local/lib
+	rm ImageMagick.tar.gz
+	rm -r ImageMagick-7.*
+
+install-mac:
+	sudo pip install -r requirements.txt
+	wget http://www.imagemagick.org/download/ImageMagick.tar.gz
+	tar -xvf ImageMagick.tar.gz
+	cd ImageMagick-7.*; \
+	./configure --with-gslib=yes; \
+	make; \
+	sudo make install
+	rm ImageMagick.tar.gz
+	rm -r ImageMagick-7.*
+
+build-small:
+	python build.py --data=configs/data_small.json --vocab=configs/vocab_small.json
+
+train-small:
+	python train.py --data=configs/data_small.json --vocab=configs/vocab_small.json --training=configs/training_small.json --model=configs/model.json --output=results/small/
+
+eval-small:
+	python evaluate_txt.py --results=results/small/
+	python evaluate_img.py --results=results/small/
+
+small: build-small train-small eval-small
+
+build:
+	python build.py --data=configs/data.json --vocab=configs/vocab.json
+
+train:
+	python train.py --data=configs/data.json --vocab=configs/vocab.json --training=configs/training.json --model=configs/model.json --output=results/full/
+
+eval:
+	python evaluate_txt.py --results=results/full/
+	python evaluate_img.py --results=results/full/
+
+full: build train eval
diff --git a/im2latex_master/model.json b/im2latex_master/model.json
new file mode 100644
index 0000000..c78cb25
--- /dev/null
+++ b/im2latex_master/model.json
@@ -0,0 +1,21 @@
+{
+    "export_name": "model.json",
+
+    "encoder_cnn": "vanilla",
+    "positional_embeddings": true,
+
+    "attn_cell_config": {
+        "cell_type": "lstm",
+        "num_units": 512,
+        "dim_e"    : 512,
+        "dim_o"    : 512,
+        "dim_embeddings": 80
+    },
+
+    "decoding": "beam_search",
+    "beam_size": 5,
+    "div_gamma": 1,
+    "div_prob": 0,
+
+    "max_length_formula": 150
+}
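`model.json` selects beam-search decoding with a `beam_size` of 5. The sketch below shows the idea on a toy score table (illustrative only; the real decoder expands LSTM states and applies the `div_gamma`/`div_prob` diversity terms, none of which is shown here):

```python
import math

def beam_search(step_log_probs, beam_size=5, eos=0, max_len=3):
    """Toy beam search over a fixed table of per-step log-probs.

    step_log_probs(prefix) -> {token: log_prob} stands in for the decoder.
    """
    beams = [((), 0.0)]  # (token sequence, cumulative log-prob)
    for _ in range(max_len):
        candidates = []
        for seq, score in beams:
            if seq and seq[-1] == eos:
                candidates.append((seq, score))  # finished hypothesis
                continue
            for tok, lp in step_log_probs(seq).items():
                candidates.append((seq + (tok,), score + lp))
        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_size]
    return beams

# toy distribution: token 1 is likely first, then eos (token 0)
table = lambda seq: {1: math.log(0.6), 2: math.log(0.3), 0: math.log(0.1)} \
    if not seq else {0: math.log(0.7), 1: math.log(0.2), 2: math.log(0.1)}
print(beam_search(table, beam_size=2)[0])  # best hypothesis and its log-prob
```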
scope_name="cls_pred")#改 + + + # transpose: (1, H, W, A x d) -> (1, H, WxA, d) + cls_pred_shape = tf.shape(cls_pred) + cls_pred_reshape = tf.reshape(cls_pred, [cls_pred_shape[0], cls_pred_shape[1], -1, 3])#改 + + cls_pred_reshape_shape = tf.shape(cls_pred_reshape) + cls_prob = tf.reshape(tf.nn.softmax(tf.reshape(cls_pred_reshape, [-1, cls_pred_reshape_shape[3]])), + [-1, cls_pred_reshape_shape[1], cls_pred_reshape_shape[2], cls_pred_reshape_shape[3]], + name="cls_prob") + + return bbox_pred, cls_pred, cls_prob + + +def anchor_target_layer(cls_pred, bbox, im_info, scope_name): + with tf.variable_scope(scope_name) as scope: + # 'rpn_cls_score', 'gt_boxes', 'im_info' + rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = \ + tf.py_func(anchor_target_layer_py, + [cls_pred, bbox, im_info, [16, ], [16]], + [tf.float32, tf.float32, tf.float32, tf.float32]) + + rpn_labels = tf.convert_to_tensor(tf.cast(rpn_labels, tf.int32), + name='rpn_labels') + rpn_bbox_targets = tf.convert_to_tensor(rpn_bbox_targets, + name='rpn_bbox_targets') + rpn_bbox_inside_weights = tf.convert_to_tensor(rpn_bbox_inside_weights, + name='rpn_bbox_inside_weights') + rpn_bbox_outside_weights = tf.convert_to_tensor(rpn_bbox_outside_weights, + name='rpn_bbox_outside_weights') + + return [rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights] + + +def smooth_l1_dist(deltas, sigma2=9.0, name='smooth_l1_dist'): + with tf.name_scope(name=name) as scope: + deltas_abs = tf.abs(deltas) + smoothL1_sign = tf.cast(tf.less(deltas_abs, 1.0 / sigma2), tf.float32) + return tf.square(deltas) * 0.5 * sigma2 * smoothL1_sign + \ + (deltas_abs - 0.5 / sigma2) * tf.abs(smoothL1_sign - 1) + + +def loss(bbox_pred, cls_pred, bbox, im_info): + # rpn_labels : (HxWxA, 1), for each anchor, 0 denotes bg, 1 fg, -1 dontcare + #rpn_bbox_targets: (HxWxA, 4), distances of the anchors to the gt_boxes(may contains some transform) + # that are the regression objectives + #rpn_bbox_inside_weights: (HxWxA, 4) weights of each boxes, mainly accepts hyper param in cfg + #rpn_bbox_outside_weights: (HxWxA, 4) used to balance the fg/bg, + # beacuse the numbers of bgs and fgs mays significiantly different + + rpn_data = anchor_target_layer(cls_pred, bbox, im_info, "anchor_target_layer")#改 + + # classification loss + # transpose: (1, H, W, A x d) -> (1, H, WxA, d) + cls_pred_shape = tf.shape(cls_pred) + cls_pred_reshape = tf.reshape(cls_pred, [cls_pred_shape[0], cls_pred_shape[1], -1, 3])#改 + rpn_cls_score = tf.reshape(cls_pred_reshape, [-1, 3])#改 + rpn_label = tf.reshape(rpn_data[0], [-1]) + # ignore_label(-1) + fg_keep = tf.not_equal(rpn_label, -1)&tf.not_equal(rpn_label, 0)#改 + rpn_keep = tf.where(tf.not_equal(rpn_label, -1)) + rpn_cls_score = tf.gather(rpn_cls_score, rpn_keep) + rpn_label = tf.gather(rpn_label, rpn_keep) + rpn_cross_entropy_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=rpn_label, logits=rpn_cls_score) + + # box loss + rpn_bbox_pred = bbox_pred + rpn_bbox_targets = rpn_data[1] + rpn_bbox_inside_weights = rpn_data[2] + rpn_bbox_outside_weights = rpn_data[3] + + rpn_bbox_pred = tf.gather(tf.reshape(rpn_bbox_pred, [-1, 4]), rpn_keep) # shape (N, 4) + rpn_bbox_targets = tf.gather(tf.reshape(rpn_bbox_targets, [-1, 4]), rpn_keep) + rpn_bbox_inside_weights = tf.gather(tf.reshape(rpn_bbox_inside_weights, [-1, 4]), rpn_keep) + rpn_bbox_outside_weights = tf.gather(tf.reshape(rpn_bbox_outside_weights, [-1, 4]), rpn_keep) + + rpn_loss_box_n = tf.reduce_sum(rpn_bbox_outside_weights * smooth_l1_dist( + 
+
+
+def loss(bbox_pred, cls_pred, bbox, im_info):
+    # rpn_labels: (HxWxA, 1); for each anchor, 0 denotes background, 1 foreground, -1 don't care
+    # rpn_bbox_targets: (HxWxA, 4), distances of the anchors to the gt_boxes
+    #                   (may contain some transform) that are the regression objectives
+    # rpn_bbox_inside_weights: (HxWxA, 4), weights of each box, mainly accepts hyper params from cfg
+    # rpn_bbox_outside_weights: (HxWxA, 4), used to balance fg/bg,
+    #                           because the numbers of bgs and fgs may differ significantly
+
+    rpn_data = anchor_target_layer(cls_pred, bbox, im_info, "anchor_target_layer")  # changed from the original CTPN
+
+    # classification loss
+    # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
+    cls_pred_shape = tf.shape(cls_pred)
+    cls_pred_reshape = tf.reshape(cls_pred, [cls_pred_shape[0], cls_pred_shape[1], -1, 3])  # changed: d = 3
+    rpn_cls_score = tf.reshape(cls_pred_reshape, [-1, 3])  # changed: 3 classes
+    rpn_label = tf.reshape(rpn_data[0], [-1])
+    # ignore_label(-1)
+    fg_keep = tf.not_equal(rpn_label, -1) & tf.not_equal(rpn_label, 0)  # changed: foreground = any non-bg class
+    rpn_keep = tf.where(tf.not_equal(rpn_label, -1))
+    rpn_cls_score = tf.gather(rpn_cls_score, rpn_keep)
+    rpn_label = tf.gather(rpn_label, rpn_keep)
+    rpn_cross_entropy_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=rpn_label, logits=rpn_cls_score)
+
+    # box loss
+    rpn_bbox_pred = bbox_pred
+    rpn_bbox_targets = rpn_data[1]
+    rpn_bbox_inside_weights = rpn_data[2]
+    rpn_bbox_outside_weights = rpn_data[3]
+
+    rpn_bbox_pred = tf.gather(tf.reshape(rpn_bbox_pred, [-1, 4]), rpn_keep)  # shape (N, 4)
+    rpn_bbox_targets = tf.gather(tf.reshape(rpn_bbox_targets, [-1, 4]), rpn_keep)
+    rpn_bbox_inside_weights = tf.gather(tf.reshape(rpn_bbox_inside_weights, [-1, 4]), rpn_keep)
+    rpn_bbox_outside_weights = tf.gather(tf.reshape(rpn_bbox_outside_weights, [-1, 4]), rpn_keep)
+
+    rpn_loss_box_n = tf.reduce_sum(rpn_bbox_outside_weights * smooth_l1_dist(
+        rpn_bbox_inside_weights * (rpn_bbox_pred - rpn_bbox_targets)), reduction_indices=[1])
+
+    rpn_loss_box = tf.reduce_sum(rpn_loss_box_n) / (tf.reduce_sum(tf.cast(fg_keep, tf.float32)) + 1)
+    rpn_cross_entropy = tf.reduce_mean(rpn_cross_entropy_n)
+
+    model_loss = rpn_cross_entropy + rpn_loss_box
+
+    regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
+    total_loss = tf.add_n(regularization_losses) + model_loss
+
+    tf.summary.scalar('model_loss', model_loss)
+    tf.summary.scalar('total_loss', total_loss)
+    tf.summary.scalar('rpn_cross_entropy', rpn_cross_entropy)
+    tf.summary.scalar('rpn_loss_box', rpn_loss_box)
+
+    return total_loss, model_loss, rpn_cross_entropy, rpn_loss_box
diff --git a/im2latex_master/predict.py b/im2latex_master/predict.py
new file mode 100644
index 0000000..7f976e4
--- /dev/null
+++ b/im2latex_master/predict.py
@@ -0,0 +1,74 @@
+import PIL
+from PIL import Image
+from scipy.misc import imread
+
+from model.img2seq import Img2SeqModel
+from model.utils.general import Config, run
+from model.utils.text import Vocab
+from model.utils.image import greyscale, crop_image, pad_image, predictsize, \
+    downsample_image, TIMEOUT
+
+
+def interactive_shell(model):
+    """Creates interactive shell to play with model
+    """
+    model.logger.info("""
+This is an interactive mode.
+To exit, enter 'exit'.
+Enter a path to a file
+input> data/images_test/0.png""")
+
+    while True:
+        img_path = input("input> ")
+
+        if img_path == "exit":
+            break
+
+        if img_path[-3:] == "png":
+            img = imread(img_path)
+
+        elif img_path[-3:] == "pdf":
+            # call magick to convert the pdf into a png file
+            buckets = [
+                [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
+                [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
+                [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200],
+                [1000, 400], [1200, 200], [1600, 200], [1600, 1600]
+            ]
+
+            dir_output = "tmp/"
+            name = img_path.split('/')[-1].split('.')[0]
+            run("magick convert -density {} -quality {} {} {}".format(200, 100,
+                img_path, dir_output + "{}.png".format(name)), TIMEOUT)
+            img_path = dir_output + "{}.png".format(name)
+            crop_image(img_path, img_path)
+            pad_image(img_path, img_path, buckets=buckets)
+            downsample_image(img_path, img_path, 2)
+
+            img = imread(img_path)
+
+        img = predictsize(img)
+        im_converted = PIL.Image.fromarray(img)
+        im_converted.show()
+        img = greyscale(img)
+
+        hyps = model.predict(img)
+        with open("norm_formula_val.txt", "w") as f:
+            f.write(hyps[0])
+        model.logger.info(hyps[0])
+
+
+if __name__ == "__main__":
+    # restore config and model
+    dir_output = "results/full/"
+    config_vocab = Config(dir_output + "vocab.json")
+    config_model = Config(dir_output + "model.json")
+    vocab = Vocab(config_vocab)
+
+    model = Img2SeqModel(config_model, dir_output, vocab)
+    model.build_pred()
+    model.restore_session(dir_output + "model.weights4/test-model.ckpt")
+    interactive_shell(model)
default="configs/vocab.json", + help='Path to vocab json config') +@click.option('--training', default="configs/training.json", + help='Path to training json config') +@click.option('--model', default="configs/model.json", + help='Path to model json config') +@click.option('--output', default="results/full/", + help='Dir for results and model weights') +def main(data, vocab, training, model, output): + # Load configs + dir_output = output + config = Config([data, vocab, training, model]) + config.save(dir_output) + vocab = Vocab(config) + + # Load datasets + train_set = DataGenerator(path_formulas=config.path_formulas_train, + dir_images=config.dir_images_train, + max_iter=config.max_iter, bucket=config.bucket_train, + path_matching=config.path_matching_train, + max_len=config.max_length_formula, + form_prepro=vocab.form_prepro) + val_set = DataGenerator(path_formulas=config.path_formulas_val, + dir_images=config.dir_images_val, + max_iter=config.max_iter, bucket=config.bucket_val, + path_matching=config.path_matching_val, + max_len=config.max_length_formula, + form_prepro=vocab.form_prepro) + + # Define learning rate schedule + n_batches_epoch = ((len(train_set) + config.batch_size - 1) // + config.batch_size) + lr_schedule = LRSchedule(lr_init=config.lr_init, + start_decay=config.start_decay*n_batches_epoch, + end_decay=config.end_decay*n_batches_epoch, + end_warm=config.end_warm*n_batches_epoch, + lr_warm=config.lr_warm, + lr_min=config.lr_min) + + # Build model and train + model = Img2SeqModel(config, dir_output, vocab) + model.build_train(config) + #model.restore_session(dir_output + "model.weights/test-model.ckpt") + model.train(config, train_set, val_set, lr_schedule) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/im2latex_master/training.json b/im2latex_master/training.json new file mode 100644 index 0000000..dc6b47e --- /dev/null +++ b/im2latex_master/training.json @@ -0,0 +1,17 @@ +{ + "export_name": "training.json", + + "lr_method" : "Adam", + "n_epochs" : 50, + "batch_size" : 20, + "dropout" : 1, + "metric_val" : "perplexity", + "clip" : -1, + + "lr_init" : 1e-3, + "lr_min" : 1e-4, + "start_decay" : 6, + "end_decay" : 13, + "lr_warm" : 1e-4, + "end_warm" : 2 +} diff --git a/im2latex_master/training_small.json b/im2latex_master/training_small.json new file mode 100644 index 0000000..e47b935 --- /dev/null +++ b/im2latex_master/training_small.json @@ -0,0 +1,17 @@ +{ + "export_name": "training.json", + + "lr_method" : "Adam", + "n_epochs" : 50, + "batch_size" : 3, + "dropout" : 1, + "metric_val" : "perplexity", + "clip" : -1, + + "lr_init" : 1e-3, + "lr_min" : 1e-3, + "start_decay" : 6, + "end_decay" : 13, + "lr_warm" : 1e-3, + "end_warm" : 0 +} diff --git a/im2latex_master/vgg.py b/im2latex_master/vgg.py new file mode 100644 index 0000000..c3ff48d --- /dev/null +++ b/im2latex_master/vgg.py @@ -0,0 +1,28 @@ +import tensorflow as tf + +slim = tf.contrib.slim + + +def vgg_arg_scope(weight_decay=0.0005): + with slim.arg_scope([slim.conv2d, slim.fully_connected], + activation_fn=tf.nn.relu, + weights_regularizer=slim.l2_regularizer(weight_decay), + biases_initializer=tf.zeros_initializer()): + with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc: + return arg_sc + + +def vgg_16(inputs, scope='vgg_16'): + with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc: + with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d]): + net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1') + net = slim.max_pool2d(net, [2, 2], 
diff --git a/im2latex_master/vgg.py b/im2latex_master/vgg.py
new file mode 100644
index 0000000..c3ff48d
--- /dev/null
+++ b/im2latex_master/vgg.py
@@ -0,0 +1,28 @@
+import tensorflow as tf
+
+slim = tf.contrib.slim
+
+
+def vgg_arg_scope(weight_decay=0.0005):
+    with slim.arg_scope([slim.conv2d, slim.fully_connected],
+                        activation_fn=tf.nn.relu,
+                        weights_regularizer=slim.l2_regularizer(weight_decay),
+                        biases_initializer=tf.zeros_initializer()):
+        with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc:
+            return arg_sc
+
+
+def vgg_16(inputs, scope='vgg_16'):
+    # VGG16 truncated at conv5_3: only the convolutional feature extractor is
+    # needed here, so pool5 and the fully connected layers are dropped.
+    with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
+        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d]):
+            net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
+            net = slim.max_pool2d(net, [2, 2], scope='pool1')
+            net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
+            net = slim.max_pool2d(net, [2, 2], scope='pool2')
+            net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
+            net = slim.max_pool2d(net, [2, 2], scope='pool3')
+            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
+            net = slim.max_pool2d(net, [2, 2], scope='pool4')
+            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
+
+            return net
diff --git a/im2latex_master/vocab.json b/im2latex_master/vocab.json
new file mode 100644
index 0000000..ef12154
--- /dev/null
+++ b/im2latex_master/vocab.json
@@ -0,0 +1,9 @@
+{
+    "export_name": "vocab.json",
+
+    "unk": "_UNK",
+    "pad": "_PAD",
+    "end": "_END",
+    "path_vocab": "/Users/iris/im2latex_master2/data2/vocab.txt",
+    "min_count_tok": 10
+}
diff --git a/im2latex_master/vocab_small.json b/im2latex_master/vocab_small.json
new file mode 100644
index 0000000..86fe442
--- /dev/null
+++ b/im2latex_master/vocab_small.json
@@ -0,0 +1,9 @@
+{
+    "export_name": "vocab.json",
+
+    "unk": "_UNK",
+    "pad": "_PAD",
+    "end": "_END",
+    "path_vocab": "/Users/iris/im2latex_master/data/small_vocab.txt",
+    "min_count_tok": 2
+}
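`min_count_tok` controls how rare a token may be before it is mapped to `_UNK` when `build.py` builds the vocabulary. A minimal sketch of that filtering, assuming `build_vocab` counts token occurrences over whitespace-tokenized formulas (the real implementation is in `model/utils/text.py`, not shown in this diff):

```python
from collections import Counter

def build_vocab_sketch(formulas, min_count=10, unk="_UNK", pad="_PAD", end="_END"):
    """Keep tokens seen at least min_count times; reserve the special symbols."""
    counts = Counter(tok for f in formulas for tok in f.split())
    tokens = sorted(t for t, c in counts.items() if c >= min_count)
    return [unk, pad, end] + tokens

formulas = ["x ^ { 2 }", "x + y", "x ^ { 3 }"]
print(build_vocab_sketch(formulas, min_count=2))
# ['_UNK', '_PAD', '_END', '^', 'x', '{', '}']
```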