@@ -0,0 +1,59 @@ README.md
# Multilingual Text Recognition

## Introduction

Text localization uses a CTPN network built on Faster R-CNN, modified here for multilingual recognition. The network predicts the offsets between text lines and anchors: VGG16 extracts features, a sliding window over the feature map predicts the offset to each anchor, and the result is fed into a bidirectional LSTM to obtain sequence features. Because text lines vary widely in length, the model predicts only the anchor height, and finally links the resulting small-scale text boxes into full lines.

Text recognition uses a seq2seq model with an attention mechanism. The encoder is a CNN, chosen for its high degree of parallelism, with positional embeddings to carry location information; the decoder is an LSTM. Attention is introduced because recognizing mathematical formulas involves long-range dependencies.

The training data combines photographed images with images produced by our group that mix LaTeX formulas and ordinary text; image augmentation was applied during training to improve generalization.
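For a concrete picture of the attention step the decoder performs, here is a minimal numpy sketch. It is not taken from the repository code; the function and weight names are illustrative, and only the dimensions (`dim_e` = `dim_o` = 512) come from `configs/model.json`:

```python
import numpy as np

def attention_context(enc_feats, dec_state, W):
    """One attention step: score every encoder position against the decoder state.

    enc_feats: (T, dim_e) CNN encoder outputs (plus positional embeddings)
    dec_state: (dim_o,)   current LSTM decoder state
    W:         (dim_o, dim_e) learned scoring matrix (hypothetical name)
    """
    scores = enc_feats.dot(W.T.dot(dec_state))   # (T,) alignment scores
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                     # softmax over encoder positions
    return weights.dot(enc_feats)                # (dim_e,) context vector

# toy usage with the dimensions from configs/model.json
ctx = attention_context(np.random.randn(30, 512),
                        np.random.randn(512),
                        np.random.randn(512, 512) * 0.01)
```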
## Structure

- Text line detection: `main` and `nets` folders. Weights: `checkpoint_mlt`
- Text recognition: `model` folder. Weights: `results/full`
- Data generation scripts: `generate_data` folder
- Entry-point scripts:
  - `train`: trains the text recognition network
  - `evaluate_txt`: evaluates the text recognition network
  - `Main/train`: trains the text detection network
  - `predict`: single-line prediction
  - `demo2`: runs the two networks end to end on multi-line input images
  - `Classification`: per-image language classification (not used in the final model)
- Data sources:
  1. Self-generated line-level LaTeX and line-level English data, used to train the text recognition network.
     The LaTeX text comes from arXiv papers: http://www.cs.cornell.edu/projects/kddcup/datasets.html
     The English text comes from the Corpus of Contemporary American English (COCA).
     The processed text data used in the experiments is in `data2` and `data3`.
  2. A self-generated image-level dataset, used to train the text box detection network.
     The English and LaTeX text sources are the same as above.
- Text recognition results:

| Model           | BLEU-4 | Inverse Edit | Perplexity | Exact Match |
| --------------- | ------ | ------------ | ---------- | ----------- |
| Seq2seq mixed   | 86.36  | 88.69        | -1.44      | 36.20       |
| Seq2seq-LaTeX   | 90.10  | 84.12        | -1.32      | 37.21       |
| Seq2seq-English | 97.20  | 97.22        | -1.05      | 88.54       |
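For reference, my reading of the table's metrics (the scoring code in `model/evaluation/` is not shown here): BLEU-4 measures 4-gram overlap with the reference, Exact Match is the fraction of lines reproduced verbatim, and "Inverse Edit" I take to be one minus the normalized edit distance, scaled to percent. A minimal sketch of that assumed last metric:

```python
def inverse_edit(ref, hyp):
    """Assumed definition: 1 - levenshtein(ref, hyp) / max(len(ref), len(hyp))."""
    m, n = len(ref), len(hyp)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        d[i][0] = i
    for j in range(1, n + 1):
        d[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            d[i][j] = min(d[i - 1][j] + 1,      # deletion
                          d[i][j - 1] + 1,      # insertion
                          d[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1]))  # substitution
    return 1.0 - d[m][n] / max(m, n, 1)

assert inverse_edit("x^2", "x^2") == 1.0   # exact match scores 1.0
```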
@@ -0,0 +1,47 @@ build.py
import click

from model.utils.data_generator import DataGenerator
from model.utils.text import build_vocab, write_vocab
from model.utils.image import build_images
from model.utils.general import Config


@click.command()
@click.option('--data', default="configs/data.json",
              help='Path to data json config')
@click.option('--vocab', default="configs/vocab.json",
              help='Path to vocab json config')
def main(data, vocab):
    data_config = Config(data)

    # datasets
    train_set = DataGenerator(
        path_formulas=data_config.path_formulas_train,
        dir_images=data_config.dir_images_train,
        path_matching=data_config.path_matching_train)
    """
    test_set = DataGenerator(
        path_formulas=data_config.path_formulas_test,
        dir_images=data_config.dir_images_test,
        path_matching=data_config.path_matching_test)
    """
    val_set = DataGenerator(
        path_formulas=data_config.path_formulas_val,
        dir_images=data_config.dir_images_val,
        path_matching=data_config.path_matching_val)

    # produce images and matching files
    train_set.build(buckets=data_config.buckets)
    # test_set.build(buckets=data_config.buckets)
    val_set.build(buckets=data_config.buckets)

    # vocab
    vocab_config = Config(vocab)
    vocab = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(vocab, vocab_config.path_vocab)


if __name__ == "__main__":
    main()
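The Makefile below drives this script: `make build` runs `python build.py --data=configs/data.json --vocab=configs/vocab.json`, and `make build-small` does the same with the `*_small` configs.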
@@ -0,0 +1,29 @@ configs/data.json
{
    "export_name": "data.json",

    "dir_images_train": "data2/images_train/",
    "dir_images_test" : "data2/images_test/",
    "dir_images_val"  : "data2/images_val/",

    "path_matching_train": "data2/train.matching.txt",
    "path_matching_val"  : "data2/val.matching.txt",
    "path_matching_test" : "data2/test.matching.txt",

    "path_formulas_train": "data2/train.formulas.norm.txt",
    "path_formulas_test" : "data2/test.formulas.norm.txt",
    "path_formulas_val"  : "data2/val.formulas.norm.txt",

    "bucket_train": true,
    "bucket_val": true,
    "bucket_test": true,

    "max_iter" : null,
    "max_length_formula": 150,

    "buckets": [
        [80, 80], [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
        [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
        [720, 120], [720, 200], [800, 80], [800, 100], [1000, 80],
        [1000, 100], [1200, 100], [1600, 80], [1600, 100]
    ]
}
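The `buckets` list gives the (width, height) templates that generated images are padded to, so batches have uniform shapes. How an image is assigned to a bucket is internal to `DataGenerator`/`pad_image` and not shown in this diff; a plausible minimal rule, offered only as an assumption, is the smallest bucket the image fits in:

```python
def assign_bucket(w, h, buckets):
    """Assumed rule: smallest-area bucket that fits the image, or None."""
    fitting = [(bw, bh) for bw, bh in buckets if bw >= w and bh >= h]
    return min(fitting, key=lambda b: b[0] * b[1]) if fitting else None

buckets = [[240, 100], [320, 80], [400, 80]]
assert assign_bucket(300, 70, buckets) == (320, 80)
```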
@@ -0,0 +1,29 @@ configs/data_small.json
{
    "export_name": "data.json",

    "dir_images_train": "data/small/",
    "dir_images_test" : "data/small/",
    "dir_images_val"  : "data/small/",

    "path_matching_train": "data/small.matching.txt",
    "path_matching_val"  : "data/small.matching.txt",
    "path_matching_test" : "data/small.matching.txt",

    "path_formulas_train": "data/small.formulas.norm.txt",
    "path_formulas_test" : "data/small.formulas.norm.txt",
    "path_formulas_val"  : "data/small.formulas.norm.txt",

    "max_iter" : 20,
    "max_length_formula": 50,

    "bucket_train": true,
    "bucket_val": true,
    "bucket_test": true,

    "buckets": [
        [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
        [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
        [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200],
        [1000, 400], [1200, 200], [1600, 200], [1600, 1600]
    ]
}
@@ -0,0 +1,9 @@
# Minimal TensorFlow 1.x smoke test: one conv layer plus softmax on a dummy image.
import tensorflow as tf
import numpy as np

sess = tf.Session()
inputs = tf.placeholder(dtype=tf.float32, shape=(1, 300, 300, 3))
net = tf.layers.Conv2D(filters=2, kernel_size=3)(inputs)  # VALID padding -> (1, 298, 298, 2)
net = tf.nn.softmax(net, axis=-1)  # per-pixel 2-class probabilities
sess.run(tf.global_variables_initializer())
sess.run(net, feed_dict={inputs: np.zeros(shape=(1, 300, 300, 3), dtype=np.float32)})
@@ -0,0 +1,277 @@ demo2.py
# coding=utf-8
import json
import os
import shutil
import sys

import cv2
import numpy as np
import tensorflow as tf
from scipy.misc import imread
from PIL import Image

import locale
locale.setlocale(locale.LC_ALL, 'C')

sys.path.append(os.getcwd())

from main import preprocess
from nets import model_train as ctpnmodel
from utils.rpn_msr.proposal_layer import proposal_layer
from utils.text_connector.detectors import TextDetector
from model.img2seq import Img2SeqModel
from model.utils.general import Config, run
from model.utils.text import Vocab
from model.utils.image import greyscale, predictsize

tf.app.flags.DEFINE_string('test_data_path', '/app/image/1.png', '')
tf.app.flags.DEFINE_string('output_path', '/app/im2latex_master/results/predict/', '')
tf.app.flags.DEFINE_string('gpu', '0', '')
tf.app.flags.DEFINE_string('checkpoint_path', '/app/im2latex_master/checkpoints_mlt/', '')
tf.app.flags.DEFINE_integer('language', 2, '')
FLAGS = tf.app.flags.FLAGS
def get_images():
    files = []
    exts = ['jpg', 'png', 'jpeg', 'JPG']
    for parent, dirnames, filenames in os.walk(FLAGS.test_data_path):
        for filename in filenames:
            for ext in exts:
                if filename.endswith(ext):
                    files.append(os.path.join(parent, filename))
                    break
    print('Find {} images'.format(len(files)))
    return files


def resize_image(img):
    # Scale the short side to 600 px (long side capped at 1200 px), then round
    # both sides up to a multiple of 16 to match the stride of the VGG feature map.
    img_size = img.shape
    im_size_min = np.min(img_size[0:2])
    im_size_max = np.max(img_size[0:2])

    im_scale = float(600) / float(im_size_min)
    if np.round(im_scale * im_size_max) > 1200:
        im_scale = float(1200) / float(im_size_max)
    new_h = int(img_size[0] * im_scale)
    new_w = int(img_size[1] * im_scale)

    new_h = new_h if new_h % 16 == 0 else (new_h // 16 + 1) * 16
    new_w = new_w if new_w % 16 == 0 else (new_w // 16 + 1) * 16

    re_im = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return re_im, (new_h / img_size[0], new_w / img_size[1])
def get_box():
    if os.path.exists(FLAGS.output_path):
        shutil.rmtree(FLAGS.output_path)
    os.makedirs(FLAGS.output_path)
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    # CTPN detection graph
    input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
    input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')
    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    bbox_pred, cls_pred, cls_prob = ctpnmodel.model(input_image, 2)

    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
    model_path = os.path.join(FLAGS.checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    saver.restore(sess, model_path)

    # recognition model
    dir_output = "/app/im2latex_master/results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    img = cv2.imread(FLAGS.test_data_path)[:, :, ::-1]
    h, w, c = img.shape
    if h > 121:
        # correct the perspective distortion of photographed pages
        approx, image, (rh, rw) = preprocess.draw_rec(img)
        img = preprocess.Perspective(image, approx)
        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
    img, (rh, rw) = resize_image(img)
    h, w, c = img.shape
    im_info = np.array([h, w, c]).reshape([1, 3])
    bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
                                           feed_dict={input_image: [img],
                                                      input_im_info: im_info})

    textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info, img)
    scores = textsegs[:, 0:2]    # modified: two foreground class scores
    textsegs = textsegs[:, 2:6]  # modified

    textdetector = TextDetector(DETECT_MODE='H')
    boxes = textdetector.detect(textsegs, scores, img.shape[:2], img)
    boxes = np.array(boxes, dtype=np.int)

    # sort the boxes top-to-bottom, then left-to-right
    image_box = sorted(boxes, key=(lambda x: (x[1] + x[3], x[0] + x[6])))
    for i, box in enumerate(image_box):
        cv2.polylines(img, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
                      thickness=2)
    img = cv2.resize(img, None, None, fx=1.0 / rh, fy=1.0 / rw, interpolation=cv2.INTER_LINEAR)
    cv2.imshow("ss", img)
    cv2.waitKey(0)
    return 0
def save_to_file():
    if os.path.exists(FLAGS.output_path):
        shutil.rmtree(FLAGS.output_path)
    os.makedirs(FLAGS.output_path)
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    # CTPN detection graph
    input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
    input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')
    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    bbox_pred, cls_pred, cls_prob = ctpnmodel.model(input_image, 2.0)

    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
    model_path = os.path.join(FLAGS.checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    saver.restore(sess, model_path)

    dir_output = "/app/im2latex_master/results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)

    # English recognition model
    config_vocab_en = Config(dir_output + "vocabe.json")
    vocab_en = Vocab(config_vocab_en)
    model_en = Img2SeqModel(config_model, dir_output, vocab_en)
    model_en.build_pred()
    model_en.restore_session(dir_output + "model.weights_en/test-model.ckpt")

    img = imread(FLAGS.test_data_path)
    h, w, c = img.shape
    res = ""
    if h > 40:
        # multi-line image: correct perspective, detect text lines, then recognize each line
        approx, image, (rh, rw) = preprocess.draw_rec(img)
        img = preprocess.Perspective(image, approx)
        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
        img, (rh, rw) = resize_image(img)
        h, w, c = img.shape
        im_info = np.array([h, w, c]).reshape([1, 3])
        bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
                                               feed_dict={input_image: [img],
                                                          input_im_info: im_info})

        textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info, img)
        scores = textsegs[:, 0:2]    # modified: two foreground class scores
        textsegs = textsegs[:, 2:6]  # modified

        textdetector = TextDetector(DETECT_MODE='H')
        boxes = textdetector.detect(textsegs, scores, img.shape[:2], img)
        boxes = np.array(boxes, dtype=np.int)

        # draw the boxes colour-coded by predicted language (box[8])
        img2 = img.copy()
        for i, box in enumerate(boxes):
            if box[8] == 1:
                cv2.polylines(img2, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
                              thickness=2)
            else:
                cv2.polylines(img2, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(255, 0, 0),
                              thickness=2)
        img2 = cv2.resize(img2, None, None, fx=1.0 / rh, fy=1.0 / rw, interpolation=cv2.INTER_LINEAR)

        for i, b in enumerate(boxes):
            lan = b[8]
            box = boxes[i]
            # crop the detected line (1 px margin) and convert BGR -> RGB
            img0 = img[min(box[1], box[3]) - 1:max(box[5], box[7]) + 1,
                       min(box[0], box[2]) - 1:max(box[4], box[6]) + 1, ::-1]
            """
            if lan == 2:
                # LaTeX line: use the formula model
                img0 = predictsize(img0)
                img0 = greyscale(img0)
                hyp = model.predict(img0)
                res = res + hyp[0] + "\n"
                model.logger.info(hyp[0])
            else:
            """
            # language-specific branch disabled: every line goes through the English model
            img0 = predictsize(img0)
            img0 = greyscale(img0)
            hyp = model_en.predict(img0)
            res = res + hyp[0] + "\n"
            model_en.logger.info(hyp[0])
            # hyp = pytesseract.image_to_string(img0)
            # res = res + hyp + "\n"
            # model.logger.info(hyp)
        res = json.dumps({"res": res})
        model_en.logger.info(res)
    else:
        # single-line image: skip detection and recognize directly
        img = predictsize(img)
        img0 = greyscale(img)
        hyps = model_en.predict(img0)
        res = res + hyps[0] + "\n"
        model_en.logger.info(hyps[0])
        res = json.dumps({"res": res})
        model_en.logger.info(res)
    return 0
    # unreachable legacy code from the batch version (im_fn is not defined here):
    '''
    cv2.imwrite(os.path.join(FLAGS.output_path, str(i) +'.png'),img[min(box[1],box[3]):max(box[5],box[7]),min(box[0],box[2]) :max(box[4],box[6]), ::-1])
    cv2.polylines(img, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
                  thickness=2)
    img = cv2.resize(img, None, None, fx=1.0 / rh, fy=1.0 / rw, interpolation=cv2.INTER_LINEAR)
    cv2.imwrite(os.path.join(FLAGS.output_path, os.path.basename(im_fn)), img[:, :, ::-1])
    with open(os.path.join(FLAGS.output_path, os.path.splitext(os.path.basename(im_fn))[0]) + ".txt",
              "w") as f:
        for i, box in enumerate(boxes):
            line = ",".join(str(box[k]) for k in range(8))
            line += "," + str(scores[i]) + "\r\n"
            f.writelines(line)
    '''
def main(argv=None):
    res = save_to_file()
    # res = get_box()
    return res


if __name__ == '__main__':
    tf.app.run()
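Typical invocation, using the container paths baked into the flags above: `python demo2.py --test_data_path=/app/image/1.png --gpu=0`. `save_to_file()` logs the recognized lines as a JSON string; swap in `get_box()` in `main` to only visualize the detected boxes.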
@@ -0,0 +1,53 @@ evaluate_img.py
import click

from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.general import Config
from model.utils.text import Vocab, load_formulas
from model.utils.image import greyscale, build_images
from model.evaluation.text import score_files
from model.evaluation.image import score_dirs


@click.command()
@click.option('--results', default="results/full/", help='Dir to results')
def main(results):
    # restore config and model
    dir_output = results
    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights/")

    # load dataset
    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
                             dir_images=config_data.dir_images_test, img_prepro=greyscale,
                             max_iter=1, bucket=config_data.bucket_test,
                             path_matching=config_data.path_matching_test,
                             max_len=config_data.max_length_formula,
                             form_prepro=vocab.form_prepro, bucket_size=1)

    # render images from the reference and predicted formulas
    formula_ref = dir_output + "formulas_test/ref.txt"
    formula_hyp = dir_output + "formulas_test/hyp_0.txt"
    images_ref = dir_output + "images_test/ref/"
    images_test = dir_output + "images_test/hyp_0/"
    build_images(load_formulas(formula_ref), images_ref)
    build_images(load_formulas(formula_hyp), images_test)

    # score the image directories
    scores = score_dirs(images_ref, images_test, greyscale)
    msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in scores.items()])
    model.logger.info("- Eval Img: {}".format(msg))


if __name__ == "__main__":
    main()
@@ -0,0 +1,51 @@ evaluate_txt.py
import click

from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.general import Config
from model.utils.text import Vocab, load_formulas
from model.utils.image import greyscale
from model.evaluation.text import score_files


@click.command()
@click.option('--results', default="results/full/", help='Dir to results')
def main(results):
    # restore config and model
    dir_output = results
    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    # load dataset
    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
                             dir_images=config_data.dir_images_test,
                             max_iter=3000, bucket=config_data.bucket_test,
                             path_matching=config_data.path_matching_test,
                             max_len=config_data.max_length_formula,
                             form_prepro=vocab.form_prepro)

    # use model to write predictions in files
    config_eval = Config({"dir_answers": dir_output + "formulas_test/",
                          "batch_size": 20})
    files, perplexity = model.write_prediction(config_eval, test_set)
    formula_ref, formula_hyp = files[0], files[1]

    # score the ref and prediction files
    scores = score_files(formula_ref, formula_hyp)
    scores["perplexity"] = perplexity
    msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in scores.items()])
    model.logger.info("- Test Txt: {}".format(msg))


if __name__ == "__main__":
    main()
@@ -0,0 +1,52 @@ Makefile
install-linux:
	sudo pip install -r requirements.txt
	sudo apt-get install texlive-latex-base
	sudo apt-get install texlive-latex-extra
	sudo apt-get install ghostscript
	sudo apt-get install libgs-dev
	wget http://www.imagemagick.org/download/ImageMagick.tar.gz
	tar -xvf ImageMagick.tar.gz
	cd ImageMagick-7.*; \
	./configure --with-gslib=yes; \
	make; \
	sudo make install; \
	sudo ldconfig /usr/local/lib
	rm ImageMagick.tar.gz
	rm -r ImageMagick-7.*

install-mac:
	sudo pip install -r requirements.txt
	wget http://www.imagemagick.org/download/ImageMagick.tar.gz
	tar -xvf ImageMagick.tar.gz
	cd ImageMagick-7.*; \
	./configure --with-gslib=yes; \
	make; \
	sudo make install
	rm ImageMagick.tar.gz
	rm -r ImageMagick-7.*

build-small:
	python build.py --data=configs/data_small.json --vocab=configs/vocab_small.json

train-small:
	python train.py --data=configs/data_small.json --vocab=configs/vocab_small.json --training=configs/training_small.json --model=configs/model.json --output=results/small/

eval-small:
	python evaluate_txt.py --results=results/small/
	python evaluate_img.py --results=results/small/

small: build-small train-small eval-small

build:
	python build.py --data=configs/data.json --vocab=configs/vocab.json

train:
	python train.py --data=configs/data.json --vocab=configs/vocab.json --training=configs/training.json --model=configs/model.json --output=results/full/

eval:
	python evaluate_txt.py --results=results/full/
	python evaluate_img.py --results=results/full/

full: build train eval
@@ -0,0 +1,21 @@ configs/model.json
{
    "export_name": "model.json",

    "encoder_cnn": "vanilla",
    "positional_embeddings": true,

    "attn_cell_config": {
        "cell_type": "lstm",
        "num_units": 512,
        "dim_e" : 512,
        "dim_o" : 512,
        "dim_embeddings": 80
    },

    "decoding": "beam_search",
    "beam_size": 5,
    "div_gamma": 1,
    "div_prob": 0,

    "max_length_formula": 150
}
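`"decoding": "beam_search"` with `"beam_size": 5` keeps the five highest-scoring partial formulas at every decoding step instead of a single greedy choice (`div_gamma`/`div_prob` would add a diversity penalty, effectively disabled with `div_prob: 0`). A minimal sketch of one expansion step, not taken from the repository code:

```python
import numpy as np

def beam_step(beams, log_probs, beam_size=5):
    """beams: list of (token_ids, score); log_probs: (len(beams), vocab) next-token log-probs."""
    candidates = [(ids + [tok], score + log_probs[b, tok])
                  for b, (ids, score) in enumerate(beams)
                  for tok in range(log_probs.shape[1])]
    # keep only the beam_size best extended sequences
    return sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_size]

# toy usage: 2 live beams, vocabulary of 4 tokens, uniform next-token distribution
step = beam_step([([1], -0.1), ([2], -0.5)], np.log(np.full((2, 4), 0.25)))
```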
@@ -0,0 +1,168 @@ nets/model_train.py
import tensorflow as tf
from tensorflow.contrib import slim

from nets import vgg
from utils.rpn_msr.anchor_target_layer import anchor_target_layer as anchor_target_layer_py


def mean_image_subtraction(images, means=[123.68, 116.78, 103.94]):
    num_channels = images.get_shape().as_list()[-1]
    if len(means) != num_channels:
        raise ValueError('len(means) must match the number of channels')
    channels = tf.split(axis=3, num_or_size_splits=num_channels, value=images)
    for i in range(num_channels):
        channels[i] -= means[i]
    return tf.concat(axis=3, values=channels)


def make_var(name, shape, initializer=None):
    return tf.get_variable(name, shape, initializer=initializer)


def Bilstm(net, input_channel, hidden_unit_num, output_channel, scope_name):
    # width ---> time step
    with tf.variable_scope(scope_name) as scope:
        shape = tf.shape(net)
        N, H, W, C = shape[0], shape[1], shape[2], shape[3]
        net = tf.reshape(net, [N * H, W, C])
        net.set_shape([None, None, input_channel])

        lstm_fw_cell = tf.contrib.rnn.LSTMCell(hidden_unit_num, state_is_tuple=True)
        lstm_bw_cell = tf.contrib.rnn.LSTMCell(hidden_unit_num, state_is_tuple=True)
        lstm_out, last_state = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, net, dtype=tf.float32)
        lstm_out = tf.concat(lstm_out, axis=-1)

        lstm_out = tf.reshape(lstm_out, [N * H * W, 2 * hidden_unit_num])
        init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.01, mode='FAN_AVG', uniform=False)
        init_biases = tf.constant_initializer(0.0)
        weights = make_var('weights', [2 * hidden_unit_num, output_channel], init_weights)
        biases = make_var('biases', [output_channel], init_biases)

        outputs = tf.matmul(lstm_out, weights) + biases
        outputs = tf.reshape(outputs, [N, H, W, output_channel])
        return outputs


def lstm_fc(net, input_channel, output_channel, scope_name):
    with tf.variable_scope(scope_name) as scope:
        shape = tf.shape(net)
        N, H, W, C = shape[0], shape[1], shape[2], shape[3]
        net = tf.reshape(net, [N * H * W, C])

        init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.01, mode='FAN_AVG', uniform=False)
        init_biases = tf.constant_initializer(0.0)
        weights = make_var('weights', [input_channel, output_channel], init_weights)
        biases = make_var('biases', [output_channel], init_biases)

        output = tf.matmul(net, weights) + biases
        output = tf.reshape(output, [N, H, W, output_channel])
    return output


def model(image, language):  # modified: extra `language` argument (unused in this function)
    image = mean_image_subtraction(image)
    with slim.arg_scope(vgg.vgg_arg_scope()):
        conv5_3 = vgg.vgg_16(image)

    rpn_conv = slim.conv2d(conv5_3, 512, 3)
    lstm_output = Bilstm(rpn_conv, 512, 128, 512, scope_name='BiLSTM')

    bbox_pred = lstm_fc(lstm_output, 512, 10 * 4, scope_name="bbox_pred")
    cls_pred = lstm_fc(lstm_output, 512, 10 * 3, scope_name="cls_pred")  # modified: 10 anchors x 3 classes

    # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
    cls_pred_shape = tf.shape(cls_pred)
    cls_pred_reshape = tf.reshape(cls_pred, [cls_pred_shape[0], cls_pred_shape[1], -1, 3])  # modified: 3 classes

    cls_pred_reshape_shape = tf.shape(cls_pred_reshape)
    cls_prob = tf.reshape(tf.nn.softmax(tf.reshape(cls_pred_reshape, [-1, cls_pred_reshape_shape[3]])),
                          [-1, cls_pred_reshape_shape[1], cls_pred_reshape_shape[2], cls_pred_reshape_shape[3]],
                          name="cls_prob")

    return bbox_pred, cls_pred, cls_prob
def anchor_target_layer(cls_pred, bbox, im_info, scope_name):
    with tf.variable_scope(scope_name) as scope:
        # 'rpn_cls_score', 'gt_boxes', 'im_info'
        rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = \
            tf.py_func(anchor_target_layer_py,
                       [cls_pred, bbox, im_info, [16, ], [16]],
                       [tf.float32, tf.float32, tf.float32, tf.float32])

        rpn_labels = tf.convert_to_tensor(tf.cast(rpn_labels, tf.int32),
                                          name='rpn_labels')
        rpn_bbox_targets = tf.convert_to_tensor(rpn_bbox_targets,
                                                name='rpn_bbox_targets')
        rpn_bbox_inside_weights = tf.convert_to_tensor(rpn_bbox_inside_weights,
                                                       name='rpn_bbox_inside_weights')
        rpn_bbox_outside_weights = tf.convert_to_tensor(rpn_bbox_outside_weights,
                                                        name='rpn_bbox_outside_weights')

        return [rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights]


def smooth_l1_dist(deltas, sigma2=9.0, name='smooth_l1_dist'):
    # 0.5 * sigma2 * x^2 for |x| < 1/sigma2, and |x| - 0.5/sigma2 otherwise
    with tf.name_scope(name=name) as scope:
        deltas_abs = tf.abs(deltas)
        smoothL1_sign = tf.cast(tf.less(deltas_abs, 1.0 / sigma2), tf.float32)
        return tf.square(deltas) * 0.5 * sigma2 * smoothL1_sign + \
               (deltas_abs - 0.5 / sigma2) * tf.abs(smoothL1_sign - 1)
def loss(bbox_pred, cls_pred, bbox, im_info):
    # rpn_labels: (HxWxA, 1), for each anchor, 0 denotes bg, 1 fg, -1 dontcare
    # rpn_bbox_targets: (HxWxA, 4), distances of the anchors to the gt_boxes
    #     (possibly transformed) that are the regression objectives
    # rpn_bbox_inside_weights: (HxWxA, 4), weights of each box, mainly set from hyper params in cfg
    # rpn_bbox_outside_weights: (HxWxA, 4), used to balance fg/bg,
    #     because the numbers of bgs and fgs may differ significantly
    rpn_data = anchor_target_layer(cls_pred, bbox, im_info, "anchor_target_layer")  # modified

    # classification loss
    # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
    cls_pred_shape = tf.shape(cls_pred)
    cls_pred_reshape = tf.reshape(cls_pred, [cls_pred_shape[0], cls_pred_shape[1], -1, 3])  # modified: 3 classes
    rpn_cls_score = tf.reshape(cls_pred_reshape, [-1, 3])  # modified
    rpn_label = tf.reshape(rpn_data[0], [-1])
    # ignore_label(-1)
    fg_keep = tf.not_equal(rpn_label, -1) & tf.not_equal(rpn_label, 0)  # modified: foreground = any text class
    rpn_keep = tf.where(tf.not_equal(rpn_label, -1))
    rpn_cls_score = tf.gather(rpn_cls_score, rpn_keep)
    rpn_label = tf.gather(rpn_label, rpn_keep)
    rpn_cross_entropy_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=rpn_label, logits=rpn_cls_score)

    # box loss
    rpn_bbox_pred = bbox_pred
    rpn_bbox_targets = rpn_data[1]
    rpn_bbox_inside_weights = rpn_data[2]
    rpn_bbox_outside_weights = rpn_data[3]

    rpn_bbox_pred = tf.gather(tf.reshape(rpn_bbox_pred, [-1, 4]), rpn_keep)  # shape (N, 4)
    rpn_bbox_targets = tf.gather(tf.reshape(rpn_bbox_targets, [-1, 4]), rpn_keep)
    rpn_bbox_inside_weights = tf.gather(tf.reshape(rpn_bbox_inside_weights, [-1, 4]), rpn_keep)
    rpn_bbox_outside_weights = tf.gather(tf.reshape(rpn_bbox_outside_weights, [-1, 4]), rpn_keep)

    rpn_loss_box_n = tf.reduce_sum(rpn_bbox_outside_weights * smooth_l1_dist(
        rpn_bbox_inside_weights * (rpn_bbox_pred - rpn_bbox_targets)), reduction_indices=[1])

    rpn_loss_box = tf.reduce_sum(rpn_loss_box_n) / (tf.reduce_sum(tf.cast(fg_keep, tf.float32)) + 1)
    rpn_cross_entropy = tf.reduce_mean(rpn_cross_entropy_n)

    model_loss = rpn_cross_entropy + rpn_loss_box

    regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = tf.add_n(regularization_losses) + model_loss

    tf.summary.scalar('model_loss', model_loss)
    tf.summary.scalar('total_loss', total_loss)
    tf.summary.scalar('rpn_cross_entropy', rpn_cross_entropy)
    tf.summary.scalar('rpn_loss_box', rpn_loss_box)

    return total_loss, model_loss, rpn_cross_entropy, rpn_loss_box
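A quick numeric check of the `smooth_l1_dist` definition above: with `sigma2 = 9`, the loss is quadratic for |x| < 1/9 and linear beyond that threshold, so the two branches meet continuously.

```python
import numpy as np

def smooth_l1(x, sigma2=9.0):
    if abs(x) < 1.0 / sigma2:
        return 0.5 * sigma2 * x * x      # quadratic near zero
    return abs(x) - 0.5 / sigma2         # linear elsewhere

assert np.isclose(smooth_l1(0.05), 0.5 * 9 * 0.05 ** 2)   # 0.01125
assert np.isclose(smooth_l1(1.0), 1.0 - 0.5 / 9)          # ~0.9444
```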
@@ -0,0 +1,74 @@ predict.py
from scipy.misc import imread, imshow
import os
import PIL
from PIL import Image

from model.img2seq import Img2SeqModel
from model.utils.general import Config, run
from model.utils.text import Vocab
from model.utils.image import greyscale, crop_image, pad_image, predictsize, \
    downsample_image, TIMEOUT


def interactive_shell(model):
    """Creates interactive shell to play with model
    """
    model.logger.info("""
This is an interactive mode.
To exit, enter 'exit'.
Enter a path to a file
input> data/images_test/0.png""")

    while True:
        img_path = input("input> ")

        if img_path == "exit":
            break

        if img_path[-3:] == "png":
            img = imread(img_path)

        elif img_path[-3:] == "pdf":
            # call magick to convert the pdf into a png file
            buckets = [
                [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
                [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
                [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200],
                [1000, 400], [1200, 200], [1600, 200], [1600, 1600]
            ]

            dir_output = "tmp/"
            name = img_path.split('/')[-1].split('.')[0]
            run("magick convert -density {} -quality {} {} {}".format(200, 100,
                img_path, dir_output + "{}.png".format(name)), TIMEOUT)
            img_path = dir_output + "{}.png".format(name)
            crop_image(img_path, img_path)
            pad_image(img_path, img_path, buckets=buckets)
            downsample_image(img_path, img_path, 2)
            img = imread(img_path)

        img = predictsize(img)
        im_converted = PIL.Image.fromarray(img)
        im_converted.show()
        img = greyscale(img)
        hyps = model.predict(img)
        with open("norm_formula_val.txt", "w") as f:
            f.write(hyps[0])
        model.logger.info(hyps[0])


if __name__ == "__main__":
    # restore config and model
    dir_output = "results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)

    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    interactive_shell(model)
@@ -0,0 +1,62 @@ train.py
import click

from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.lr_schedule import LRSchedule
from model.utils.general import Config
from model.utils.text import Vocab
from model.utils.image import greyscale


@click.command()
@click.option('--data', default="configs/data.json",
              help='Path to data json config')
@click.option('--vocab', default="configs/vocab.json",
              help='Path to vocab json config')
@click.option('--training', default="configs/training.json",
              help='Path to training json config')
@click.option('--model', default="configs/model.json",
              help='Path to model json config')
@click.option('--output', default="results/full/",
              help='Dir for results and model weights')
def main(data, vocab, training, model, output):
    # Load configs
    dir_output = output
    config = Config([data, vocab, training, model])
    config.save(dir_output)
    vocab = Vocab(config)

    # Load datasets
    train_set = DataGenerator(path_formulas=config.path_formulas_train,
                              dir_images=config.dir_images_train,
                              max_iter=config.max_iter, bucket=config.bucket_train,
                              path_matching=config.path_matching_train,
                              max_len=config.max_length_formula,
                              form_prepro=vocab.form_prepro)
    val_set = DataGenerator(path_formulas=config.path_formulas_val,
                            dir_images=config.dir_images_val,
                            max_iter=config.max_iter, bucket=config.bucket_val,
                            path_matching=config.path_matching_val,
                            max_len=config.max_length_formula,
                            form_prepro=vocab.form_prepro)

    # Define learning rate schedule
    n_batches_epoch = ((len(train_set) + config.batch_size - 1) //
                       config.batch_size)
    lr_schedule = LRSchedule(lr_init=config.lr_init,
                             start_decay=config.start_decay*n_batches_epoch,
                             end_decay=config.end_decay*n_batches_epoch,
                             end_warm=config.end_warm*n_batches_epoch,
                             lr_warm=config.lr_warm,
                             lr_min=config.lr_min)

    # Build model and train
    model = Img2SeqModel(config, dir_output, vocab)
    model.build_train(config)
    # model.restore_session(dir_output + "model.weights/test-model.ckpt")
    model.train(config, train_set, val_set, lr_schedule)


if __name__ == "__main__":
    main()
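For intuition, the schedule parameters in `configs/training.json` below are given in epochs and converted to batch counts via `n_batches_epoch`. With a hypothetical dataset of 1000 training pairs at `batch_size: 20`:

```python
n_train, batch_size = 1000, 20                                # hypothetical dataset size
n_batches_epoch = (n_train + batch_size - 1) // batch_size    # 50 batches per epoch
end_warm = 2 * n_batches_epoch      # warm up at lr_warm=1e-4 for the first 100 batches
start_decay = 6 * n_batches_epoch   # hold lr_init=1e-3 until batch 300
end_decay = 13 * n_batches_epoch    # then decay toward lr_min=1e-4 until batch 650
```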
@@ -0,0 +1,17 @@ configs/training.json
{
    "export_name": "training.json",

    "lr_method"  : "Adam",
    "n_epochs"   : 50,
    "batch_size" : 20,
    "dropout"    : 1,
    "metric_val" : "perplexity",
    "clip"       : -1,

    "lr_init"     : 1e-3,
    "lr_min"      : 1e-4,
    "start_decay" : 6,
    "end_decay"   : 13,
    "lr_warm"     : 1e-4,
    "end_warm"    : 2
}
@@ -0,0 +1,17 @@ configs/training_small.json
{
    "export_name": "training.json",

    "lr_method"  : "Adam",
    "n_epochs"   : 50,
    "batch_size" : 3,
    "dropout"    : 1,
    "metric_val" : "perplexity",
    "clip"       : -1,

    "lr_init"     : 1e-3,
    "lr_min"      : 1e-3,
    "start_decay" : 6,
    "end_decay"   : 13,
    "lr_warm"     : 1e-3,
    "end_warm"    : 0
}
@@ -0,0 +1,28 @@ nets/vgg.py
import tensorflow as tf

slim = tf.contrib.slim


def vgg_arg_scope(weight_decay=0.0005):
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        biases_initializer=tf.zeros_initializer()):
        with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc:
            return arg_sc


def vgg_16(inputs, scope='vgg_16'):
    # VGG-16 truncated after conv5_3: no pool5 or fc layers, so the output
    # is a feature map with stride 16 relative to the input image.
    with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d]):
            net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
            net = slim.max_pool2d(net, [2, 2], scope='pool3')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
            net = slim.max_pool2d(net, [2, 2], scope='pool4')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
    return net
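The four 2x2 max-pools give conv5_3 a stride of 16, which is why `resize_image` in demo2.py rounds images up to multiples of 16 and `anchor_target_layer` is fed a feature stride of `[16]`. A quick shape check, assuming a TF 1.x environment with this repo on the path:

```python
import numpy as np
import tensorflow as tf
from nets import vgg

slim = tf.contrib.slim
images = tf.placeholder(tf.float32, shape=[1, 320, 480, 3])
with slim.arg_scope(vgg.vgg_arg_scope()):
    feats = vgg.vgg_16(images)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(feats, {images: np.zeros((1, 320, 480, 3), np.float32)})
print(out.shape)  # (1, 20, 30, 512): 320/16 by 480/16
```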
@@ -0,0 +1,9 @@ configs/vocab.json
{
    "export_name": "vocab.json",

    "unk": "_UNK",
    "pad": "_PAD",
    "end": "_END",
    "path_vocab": "/Users/iris/im2latex_master2/data2/vocab.txt",
    "min_count_tok": 10
}
@@ -0,0 +1,9 @@ configs/vocab_small.json
{
    "export_name": "vocab.json",

    "unk": "_UNK",
    "pad": "_PAD",
    "end": "_END",
    "path_vocab": "/Users/iris/im2latex_master/data/small_vocab.txt",
    "min_count_tok": 2
}