Browse Source

上传文件至 'im2latex_master'

master
王子玥 4 years ago
parent
commit
ed42254537
20 changed files with 1011 additions and 0 deletions
  1. +59
    -0
      im2latex_master/README.md
  2. BIN
      im2latex_master/__init__.py
  3. +47
    -0
      im2latex_master/build.py
  4. +29
    -0
      im2latex_master/data.json
  5. +29
    -0
      im2latex_master/data_small.json
  6. +9
    -0
      im2latex_master/debug.py
  7. +277
    -0
      im2latex_master/demo2.py
  8. BIN
      im2latex_master/encoder.pyc
  9. +53
    -0
      im2latex_master/evaluate_img.py
  10. +51
    -0
      im2latex_master/evaluate_txt.py
  11. +52
    -0
      im2latex_master/makefile
  12. +21
    -0
      im2latex_master/model.json
  13. +168
    -0
      im2latex_master/model_train.py
  14. +74
    -0
      im2latex_master/predict.py
  15. +62
    -0
      im2latex_master/train.py
  16. +17
    -0
      im2latex_master/training.json
  17. +17
    -0
      im2latex_master/training_small.json
  18. +28
    -0
      im2latex_master/vgg.py
  19. +9
    -0
      im2latex_master/vocab.json
  20. +9
    -0
      im2latex_master/vocab_small.json

+ 59
- 0
im2latex_master/README.md View File

@ -0,0 +1,59 @@
# 多语言文本识别
## 介绍
本项目采用基于faster rcnn 的CTPN网络进行文本定位,修改原网络以进行多语言语言识别。利用神经网络预测文本行与anchor之间的偏移量。使用VGG16提取特征,在feature map上使用滑动窗口预测和anchor之间的偏移距离,之后将其输入到一个双向LSTM网络,获得序列特征。由于文本行长度差异较大,模型仅预测anchor高度,最后循环连接小尺度的文本框。
文本识别网络采用seq2seq模型以及attention机制。encoder端使用CNN以获得较高的并行速度,同时采用positional embedding表征位置信息; decoder端使用LSTM做解码器。考虑到数学公式的识别存在长距离依赖的问题,故引入attention机制。
训练数据结合了拍摄的图像以及课题组制作的含有混合latex公式及文字的图像,训练过程中进行了图像增强,以提高泛化能力。
## 结构
- 文本行定位
main文件夹,net文件夹。 权重:checkpoint_mlt
- 文本识别
model文件夹 权重:results/full
- 数据生成脚本:generate_data文件夹
train:训练文本识别部分的网络
evaluate_txt:验证文本识别部分的网络
Main/train:训练文本定位部分的网络
predict: 单行预测
demo2:展示两个网络联合起来的效果,输入多行图片预测
Classification:单图像语言分类(最终模型未使用)
- 数据来源:
1. 自生成的行级别Latex与行级别的英文数据,用于训练文本识别网络。
Latex文本来源为arXiv论文数据集:http://www.cs.cornell.edu/projects/kddcup/datasets.html
英文文本来源为美国当代英语语料库(COCA)
处理后的实验文本数据存放在data2和data3中。
2. 自生成的图像级别的数据集,用于训练文本框检测网络
英文文本,Latex文本来源同上。
- 文本识别结果
| | BLEU-4 | Inverse Edit | perplexity | Exact Match |
| --------------- | ------ | ------------ | ---------- | ----------- |
| Seq2seq混合式 | 86.36 | 88.69 | -1.44 | 36.20 |
| Seq2seq-Latex | 90.10 | 84.12 | -1.32 | 37.21 |
| Seq2seq-English | 97.2 | 97.22 | -1.05 | 88.54 |

BIN
im2latex_master/__init__.py View File


+ 47
- 0
im2latex_master/build.py View File

@ -0,0 +1,47 @@
import click
from model.utils.data_generator import DataGenerator
from model.utils.text import build_vocab, write_vocab
from model.utils.image import build_images
from model.utils.general import Config
@click.command()
@click.option('--data', default="configs/data.json",
        help='Path to data json config')
@click.option('--vocab', default="configs/vocab.json",
        help='Path to vocab json config')
def main(data, vocab):
    """Build the train/val image sets and write the token vocabulary.

    Args:
        data: path to the data JSON config (formula, image and matching
            paths plus the bucket sizes).
        vocab: path to the vocab JSON config (minimum token count and the
            output vocabulary path).
    """
    data_config = Config(data)

    # Datasets. The test split is intentionally not built here; its
    # DataGenerator call was disabled in the original script.
    train_set = DataGenerator(
        path_formulas=data_config.path_formulas_train,
        dir_images=data_config.dir_images_train,
        path_matching=data_config.path_matching_train)
    val_set = DataGenerator(
        path_formulas=data_config.path_formulas_val,
        dir_images=data_config.dir_images_val,
        path_matching=data_config.path_matching_val)

    # Produce images and matching files, train split first.
    for dataset in (train_set, val_set):
        dataset.build(buckets=data_config.buckets)

    # Vocabulary is computed from the training split only.
    vocab_config = Config(vocab)
    tokens = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(tokens, vocab_config.path_vocab)


if __name__ == "__main__":
    main()

+ 29
- 0
im2latex_master/data.json View File

@ -0,0 +1,29 @@
{
"export_name": "data.json",
"dir_images_train": "data2/images_train/",
"dir_images_test" : "data2/images_test/",
"dir_images_val" : "data2/images_val/",
"path_matching_train": "data2/train.matching.txt",
"path_matching_val" : "data2/val.matching.txt",
"path_matching_test" : "data2/test.matching.txt",
"path_formulas_train": "data2/train.formulas.norm.txt",
"path_formulas_test" : "data2/test.formulas.norm.txt",
"path_formulas_val" : "data2/val.formulas.norm.txt",
"bucket_train": true,
"bucket_val": true,
"bucket_test": true,
"max_iter" : null,
"max_length_formula": 150,
"buckets": [
[80,80],[240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
[560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
[720, 120], [720, 200], [800, 80], [800, 100], [1000, 80],
[1000, 100], [1200, 100], [1600, 80], [1600, 100]
]
}

+ 29
- 0
im2latex_master/data_small.json View File

@ -0,0 +1,29 @@
{
"export_name": "data.json",
"dir_images_train": "data/small/",
"dir_images_test" : "data/small/",
"dir_images_val" : "data/small/",
"path_matching_train": "data/small.matching.txt",
"path_matching_val" : "data/small.matching.txt",
"path_matching_test" : "data/small.matching.txt",
"path_formulas_train": "data/small.formulas.norm.txt",
"path_formulas_test" : "data/small.formulas.norm.txt",
"path_formulas_val" : "data/small.formulas.norm.txt",
"max_iter" : 20,
"max_length_formula": 50,
"bucket_train": true,
"bucket_val": true,
"bucket_test": true,
"buckets": [
[240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
[560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
[720, 120], [720, 200], [800, 100], [800, 320], [1000, 200],
[1000, 400], [1200, 200], [1600, 200], [1600, 1600]
]
}

+ 9
- 0
im2latex_master/debug.py View File

@ -0,0 +1,9 @@
import tensorflow as tf
import numpy as np
# Smoke test: push an all-zero 300x300 RGB image through one 3x3 convolution
# followed by a channel-wise softmax, just to check that TensorFlow runs.
inputs = tf.placeholder(dtype=tf.float32, shape=(1, 300, 300, 3))
net = tf.layers.Conv2D(filters=2, kernel_size=3)(inputs)
net = tf.nn.softmax(net, axis=-1)

# Use the session as a context manager so its resources are released when the
# script finishes (the original never closed it).
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(net, feed_dict={inputs: np.zeros(shape=(1, 300, 300, 3), dtype=np.float32)})

+ 277
- 0
im2latex_master/demo2.py View File

@ -0,0 +1,277 @@
# coding=utf-8
import os
import shutil
import sys
import time
import cv2
import numpy as np
import tensorflow as tf
from main import preprocess
import json
import locale
locale.setlocale(locale.LC_ALL, 'C')
from scipy.misc import imread
#current_directory = os.path.dirname(os.path.abspath(__file__))
#root_path = os.path.abspath(os.path.dirname(current_directory) + os.path.sep + ".")
#sys.path.append(sys.path.append(os.getcwd()))
sys.path.append(os.getcwd())
from nets import model_train as ctpnmodel
from utils.rpn_msr.proposal_layer import proposal_layer
from utils.text_connector.detectors import TextDetector
from scipy.misc import imread
import os
from PIL import Image
from model.img2seq import Img2SeqModel
from model.utils.general import Config, run
from model.utils.text import Vocab
from model.utils.image import greyscale,predictsize
# Command-line flags for the demo.
# test_data_path: read as a single image file by get_box()/save_to_file(),
# but walked as a directory by get_images().
tf.app.flags.DEFINE_string('test_data_path', '/app/image/1.png', '')
# output_path: directory that get_box()/save_to_file() delete and recreate.
tf.app.flags.DEFINE_string('output_path', '/app/im2latex_master/results/predict/', '')
# gpu: value exported as CUDA_VISIBLE_DEVICES.
tf.app.flags.DEFINE_string('gpu', '0', '')
# checkpoint_path: directory holding the text-detection checkpoint.
tf.app.flags.DEFINE_string('checkpoint_path', '/app/im2latex_master/checkpoints_mlt/', '')
FLAGS = tf.app.flags.FLAGS
# NOTE(review): the default is the string '2' on an integer flag, and
# FLAGS.language is not read by any function visible in this file - confirm
# whether the flag is still needed.
tf.app.flags.DEFINE_integer('language', '2', '')
def get_images():
    """Collect image files found under ``FLAGS.test_data_path``.

    The directory tree is walked recursively; a file is kept when its name
    ends with one of the accepted suffixes (matched without a leading dot,
    exactly as before).

    Returns:
        list[str]: paths of the matching files, in walk order.
    """
    suffixes = ('jpg', 'png', 'jpeg', 'JPG')
    files = []
    for parent, _dirnames, filenames in os.walk(FLAGS.test_data_path):
        files.extend(os.path.join(parent, filename)
                     for filename in filenames
                     if filename.endswith(suffixes))
    print('Find {} images'.format(len(files)))
    return files
def resize_image(img):
    """Rescale an image for the text detector.

    The short side is scaled to 600 px unless that would push the long side
    past 1200 px, in which case the long side is scaled to 1200 px. Both
    output dimensions are then rounded up to the next multiple of 16.

    Args:
        img: image array whose first two dimensions are (height, width).

    Returns:
        tuple: ``(resized_image, (height_ratio, width_ratio))`` where each
        ratio is ``new_size / original_size``.
    """
    img_size = img.shape
    im_size_min = np.min(img_size[0:2])
    im_size_max = np.max(img_size[0:2])

    im_scale = float(600) / float(im_size_min)
    if np.round(im_scale * im_size_max) > 1200:
        im_scale = float(1200) / float(im_size_max)
    new_h = int(img_size[0] * im_scale)
    new_w = int(img_size[1] * im_scale)

    # Round up to a multiple of 16. The original tested ``// 16 == 0``, which
    # padded exact multiples by an extra 16 px and left sizes below 16 px
    # unaligned; the remainder test is what was intended.
    if new_h % 16 != 0:
        new_h = (new_h // 16 + 1) * 16
    if new_w % 16 != 0:
        new_w = (new_w // 16 + 1) * 16

    re_im = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    return re_im, (new_h / img_size[0], new_w / img_size[1])
def get_box():
    """Detect text lines in ``FLAGS.test_data_path`` and display the boxes.

    Side effects: deletes and recreates ``FLAGS.output_path``, sets
    ``CUDA_VISIBLE_DEVICES``, restores the detection checkpoint and the
    recognition model, and opens an OpenCV window that blocks until a key
    is pressed.

    Returns:
        int: always 0.

    Raises:
        FileNotFoundError: if no checkpoint is found in
            ``FLAGS.checkpoint_path`` or the input image cannot be read.
    """
    if os.path.exists(FLAGS.output_path):
        shutil.rmtree(FLAGS.output_path)
    os.makedirs(FLAGS.output_path)
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    # Text-detection graph and its checkpoint.
    input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
    input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')
    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    bbox_pred, cls_pred, cls_prob = ctpnmodel.model(input_image, 2)
    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    # The saver is created before the recognition model is built, so it only
    # covers the variables that exist at this point.
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
    if ckpt_state is None:
        raise FileNotFoundError(
            'No checkpoint found in {}'.format(FLAGS.checkpoint_path))
    model_path = os.path.join(FLAGS.checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    saver.restore(sess, model_path)

    # NOTE(review): the recognition model is restored here but not used by
    # the box-drawing code below; kept to preserve the original behaviour.
    dir_output = "/app/im2latex_master/results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    raw = cv2.imread(FLAGS.test_data_path)
    if raw is None:
        raise FileNotFoundError(
            'Cannot read image {}'.format(FLAGS.test_data_path))
    img = raw[:, :, ::-1]  # reverse the channel order
    h, w, c = img.shape

    # NOTE(review): indentation was lost in the source this was recovered
    # from. The extent of this ``if`` is assumed to match save_to_file(),
    # where the whole detection pipeline sits inside the height check.
    if h > 121:
        approx, image, (rh, rw) = preprocess.draw_rec(img)
        img = preprocess.Perspective(image, approx)
        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)

        img, (rh, rw) = resize_image(img)
        h, w, c = img.shape
        im_info = np.array([h, w, c]).reshape([1, 3])
        bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
                                               feed_dict={input_image: [img],
                                                          input_im_info: im_info})
        textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info, img)
        scores = textsegs[:, 0:2]    # first two columns are passed on as scores
        textsegs = textsegs[:, 2:6]  # next four columns are passed on as segments
        textdetector = TextDetector(DETECT_MODE='H')
        boxes = textdetector.detect(textsegs, scores, img.shape[:2], img)
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # type it used to alias.
        boxes = np.array(boxes, dtype=int)

        image_box = sorted(boxes, key=(lambda x: (x[1] + x[3], x[0] + x[6])))
        for box in image_box:
            cv2.polylines(img, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=(0, 255, 0),
                          thickness=2)
        # rh is the height ratio and rw the width ratio returned by
        # resize_image(), so fx (horizontal) must use rw and fy (vertical)
        # rh; the original had them swapped.
        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
        cv2.imshow("ss", img)
        cv2.waitKey(0)
    return 0
def save_to_file():
    """Detect the text lines of the input image and recognise each one.

    Images taller than 40 px go through perspective correction and text-line
    detection, and every detected box is cropped and fed to the recognition
    model. Shorter images are fed to the recognition model directly. The
    result is logged as a JSON object ``{"res": "<line>\\n<line>\\n..."}``.

    Side effects: deletes and recreates ``FLAGS.output_path``, sets
    ``CUDA_VISIBLE_DEVICES`` and restores both models from disk.

    Returns:
        int: always 0; the recognised text is only written to the logger.

    Raises:
        FileNotFoundError: if no checkpoint is found in
            ``FLAGS.checkpoint_path``.
    """
    if os.path.exists(FLAGS.output_path):
        shutil.rmtree(FLAGS.output_path)
    os.makedirs(FLAGS.output_path)
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    # Text-detection graph and its checkpoint.
    input_image = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_image')
    input_im_info = tf.placeholder(tf.float32, shape=[None, 3], name='input_im_info')
    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    bbox_pred, cls_pred, cls_prob = ctpnmodel.model(input_image, 2.0)
    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    # The saver is created before the recognition model is built, so it only
    # covers the variables that exist at this point.
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
    if ckpt_state is None:
        raise FileNotFoundError(
            'No checkpoint found in {}'.format(FLAGS.checkpoint_path))
    model_path = os.path.join(FLAGS.checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    saver.restore(sess, model_path)

    dir_output = "/app/im2latex_master/results/full/"
    # NOTE(review): this vocabulary was only used by the disabled
    # second-model branch; it is still loaded to keep the original behaviour.
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)

    # Recognition model that uses the "vocabe.json" vocabulary (labelled
    # "English" in the original comment).
    config_vocab_en = Config(dir_output + "vocabe.json")
    vocab_en = Vocab(config_vocab_en)
    model_en = Img2SeqModel(config_model, dir_output, vocab_en)
    model_en.build_pred()
    model_en.restore_session(dir_output + "model.weights_en/test-model.ckpt")

    img = imread(FLAGS.test_data_path)
    h, w, c = img.shape
    lines = []

    if h > 40:
        approx, image, (rh, rw) = preprocess.draw_rec(img)
        img = preprocess.Perspective(image, approx)
        img = cv2.resize(img, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)

        img, (rh, rw) = resize_image(img)
        h, w, c = img.shape
        im_info = np.array([h, w, c]).reshape([1, 3])
        bbox_pred_val, cls_prob_val = sess.run([bbox_pred, cls_prob],
                                               feed_dict={input_image: [img],
                                                          input_im_info: im_info})
        textsegs, _ = proposal_layer(cls_prob_val, bbox_pred_val, im_info, img)
        scores = textsegs[:, 0:2]    # first two columns are passed on as scores
        textsegs = textsegs[:, 2:6]  # next four columns are passed on as segments
        textdetector = TextDetector(DETECT_MODE='H')
        boxes = textdetector.detect(textsegs, scores, img.shape[:2], img)
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # type it used to alias.
        boxes = np.array(boxes, dtype=int)

        # Debug overlay: boxes whose ninth value is 1 are drawn in one
        # colour, all others in another. It is only displayed if the imshow
        # lines below are re-enabled.
        img2 = img.copy()
        for box in boxes:
            colour = (0, 255, 0) if box[8] == 1 else (255, 0, 0)
            cv2.polylines(img2, [box[:8].astype(np.int32).reshape((-1, 1, 2))], True, color=colour,
                          thickness=2)
        # rh is the height ratio and rw the width ratio, so fx must use rw
        # and fy rh; the original had them swapped.
        img2 = cv2.resize(img2, None, None, fx=1.0 / rw, fy=1.0 / rh, interpolation=cv2.INTER_LINEAR)
        # cv2.imshow("ss", img2)
        # cv2.waitKey(0)

        for box in boxes:
            # Crop the box with a 1 px margin. The start indices are clamped
            # at 0: a box touching the border used to produce -1, which
            # wraps around and yields an empty crop.
            top = max(min(box[1], box[3]) - 1, 0)
            bottom = max(box[5], box[7]) + 1
            left = max(min(box[0], box[2]) - 1, 0)
            right = max(box[4], box[6]) + 1
            img0 = img[top:bottom, left:right, ::-1]

            # A disabled branch used to send boxes with box[8] == 2 to a
            # second model; every box now goes through model_en.
            img0 = predictsize(img0)
            img0 = greyscale(img0)
            hyp = model_en.predict(img0)
            lines.append(hyp[0])
            model_en.logger.info(hyp[0])
    else:
        # Short image: skip detection and feed the whole image to the model.
        img = predictsize(img)
        img0 = greyscale(img)
        hyps = model_en.predict(img0)
        lines.append(hyps[0])
        model_en.logger.info(hyps[0])

    # Each recognised line is terminated by a newline, as before.
    res = json.dumps({"res": "".join(line + "\n" for line in lines)})
    model_en.logger.info(res)
    return 0
def main(argv=None):
    """Entry point for ``tf.app.run``: run the detect-and-recognise demo.

    Args:
        argv: command-line arguments passed by ``tf.app.run``; unused.

    Returns:
        The value returned by ``save_to_file()``.
    """
    # Call get_box() here instead to only visualise the detected boxes.
    return save_to_file()


if __name__ == '__main__':
    tf.app.run()

BIN
im2latex_master/encoder.pyc View File


+ 53
- 0
im2latex_master/evaluate_img.py View File

@ -0,0 +1,53 @@
import click
from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.general import Config
from model.utils.text import Vocab, load_formulas
from model.utils.image import greyscale, build_images
from model.evaluation.text import score_files
from model.evaluation.image import score_dirs
@click.command()
@click.option('--results', default="results/full/", help='Dir to results')
def main(results):
    """Score rendered hypothesis images against rendered reference images.

    Args:
        results: results directory (with trailing slash) holding the saved
            configs, the model weights and ``formulas_test/``.
    """
    # Restore config and model.
    dir_output = results
    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")

    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights/")

    # Load dataset (constructed as in the original; not read again below).
    test_set = DataGenerator(
        path_formulas=config_data.path_formulas_test,
        dir_images=config_data.dir_images_test,
        img_prepro=greyscale,
        max_iter=1,
        bucket=config_data.bucket_test,
        path_matching=config_data.path_matching_test,
        max_len=config_data.max_length_formula,
        form_prepro=vocab.form_prepro,
        bucket_size=1)

    # Build images from the reference and hypothesis formula files.
    formula_ref = dir_output + "formulas_test/ref.txt"
    formula_hyp = dir_output + "formulas_test/hyp_0.txt"
    images_ref = dir_output + "images_test/ref/"
    images_test = dir_output + "images_test/hyp_0/"
    build_images(load_formulas(formula_ref), images_ref)
    build_images(load_formulas(formula_hyp), images_test)

    # Score the two image directories and log every metric.
    scores = score_dirs(images_ref, images_test, greyscale)
    parts = []
    for metric, value in scores.items():
        parts.append("{} {:04.2f}".format(metric, value))
    msg = " - ".join(parts)
    model.logger.info("- Eval Img: {}".format(msg))


if __name__ == "__main__":
    main()

+ 51
- 0
im2latex_master/evaluate_txt.py View File

@ -0,0 +1,51 @@
import click
from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.general import Config
from model.utils.text import Vocab
from model.utils.image import greyscale
from model.utils.text import load_formulas
from model.evaluation.text import score_files
@click.command()
@click.option('--results', default="results/full/", help='Dir to results')
def main(results):
    """Write test-set predictions to files and score them against references.

    Args:
        results: results directory (with trailing slash) holding the saved
            configs and the model weights.
    """
    # Restore config and model.
    dir_output = results
    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")

    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    # Load dataset.
    test_set = DataGenerator(
        path_formulas=config_data.path_formulas_test,
        dir_images=config_data.dir_images_test,
        max_iter=3000,
        bucket=config_data.bucket_test,
        path_matching=config_data.path_matching_test,
        max_len=config_data.max_length_formula,
        form_prepro=vocab.form_prepro)

    # Use the model to write predictions in files.
    eval_settings = {"dir_answers": dir_output + "formulas_test/",
                     "batch_size": 20}
    config_eval = Config(eval_settings)
    files, perplexity = model.write_prediction(config_eval, test_set)
    formula_ref = files[0]
    formula_hyp = files[1]

    # Score the reference and prediction files, then log every metric.
    scores = score_files(formula_ref, formula_hyp)
    scores["perplexity"] = perplexity
    parts = []
    for metric, value in scores.items():
        parts.append("{} {:04.2f}".format(metric, value))
    msg = " - ".join(parts)
    model.logger.info("- Test Txt: {}".format(msg))


if __name__ == "__main__":
    main()

+ 52
- 0
im2latex_master/makefile View File

@ -0,0 +1,52 @@
# None of the targets below produce a file named after themselves.
.PHONY: install-linux install-mac build-small train-small eval-small small \
	build train eval full

# Install Python requirements, LaTeX, Ghostscript and ImageMagick (from source).
install-linux:
	sudo pip install -r requirements.txt
	sudo apt-get install texlive-latex-base
	sudo apt-get install texlive-latex-extra

	sudo apt-get install ghostscript
	sudo apt-get install libgs-dev

	wget http://www.imagemagick.org/download/ImageMagick.tar.gz
	tar -xvf ImageMagick.tar.gz
	cd ImageMagick-7.*; \
	./configure --with-gslib=yes; \
	make; \
	sudo make install; \
	sudo ldconfig /usr/local/lib
	rm ImageMagick.tar.gz
	rm -r ImageMagick-7.*

# Install Python requirements and ImageMagick (from source).
install-mac:
	sudo pip install -r requirements.txt
	wget http://www.imagemagick.org/download/ImageMagick.tar.gz
	tar -xvf ImageMagick.tar.gz
	# The continued command must end at "make install": the original had a
	# trailing "; \" here, so the following rm ran inside the source
	# directory, where the tarball does not exist, and the recipe failed.
	cd ImageMagick-7.*; \
	./configure --with-gslib=yes; \
	make; \
	sudo make install
	rm ImageMagick.tar.gz
	rm -r ImageMagick-7.*

build-small:
	python build.py --data=configs/data_small.json --vocab=configs/vocab_small.json

train-small:
	python train.py --data=configs/data_small.json --vocab=configs/vocab_small.json --training=configs/training_small.json --model=configs/model.json --output=results/small/

eval-small:
	python evaluate_txt.py --results=results/small/
	python evaluate_img.py --results=results/small/

small: build-small train-small eval-small

build:
	python build.py --data=configs/data.json --vocab=configs/vocab.json

train:
	python train.py --data=configs/data.json --vocab=configs/vocab.json --training=configs/training.json --model=configs/model.json --output=results/full/

eval:
	python evaluate_txt.py --results=results/full/
	python evaluate_img.py --results=results/full/

full: build train eval

+ 21
- 0
im2latex_master/model.json View File

@ -0,0 +1,21 @@
{
"export_name": "model.json",
"encoder_cnn": "vanilla",
"positional_embeddings": true,
"attn_cell_config": {
"cell_type": "lstm",
"num_units": 512,
"dim_e" : 512,
"dim_o" : 512,
"dim_embeddings": 80
},
"decoding": "beam_search",
"beam_size": 5,
"div_gamma": 1,
"div_prob": 0,
"max_length_formula": 150
}

+ 168
- 0
im2latex_master/model_train.py View File

@ -0,0 +1,168 @@
import tensorflow as tf
from tensorflow.contrib import slim
from nets import vgg
from utils.rpn_msr.anchor_target_layer import anchor_target_layer as anchor_target_layer_py
def mean_image_subtraction(images, means=(123.68, 116.78, 103.94)):
    """Subtract a per-channel mean from a batch of images.

    Args:
        images: tensor whose last (fourth) axis is the channel axis.
        means: one mean value per channel. The default is a tuple rather
            than a list so the shared default cannot be mutated by accident.

    Returns:
        Tensor of the same layout as ``images`` with ``means[i]`` subtracted
        from channel ``i``.

    Raises:
        ValueError: if ``len(means)`` differs from the number of channels.
    """
    num_channels = images.get_shape().as_list()[-1]
    if len(means) != num_channels:
        raise ValueError('len(means) must match the number of channels')
    channels = tf.split(axis=3, num_or_size_splits=num_channels, value=images)
    for i in range(num_channels):
        channels[i] -= means[i]
    return tf.concat(axis=3, values=channels)
def make_var(name, shape, initializer=None):
    """Create or fetch the variable ``name`` in the current variable scope.

    Args:
        name: variable name.
        shape: variable shape.
        initializer: optional initializer forwarded to ``tf.get_variable``.

    Returns:
        The variable returned by ``tf.get_variable``.
    """
    variable = tf.get_variable(name, shape, initializer=initializer)
    return variable
def Bilstm(net, input_channel, hidden_unit_num, output_channel, scope_name):
    """Run a bidirectional LSTM along the width axis, then project linearly.

    The (N, H, W, C) input is reshaped to (N*H, W, C) so that the width axis
    is the time axis. The concatenated forward/backward outputs are mapped
    to ``output_channel`` features by a dense layer and reshaped back to
    (N, H, W, output_channel).

    Args:
        net: 4-D feature tensor, unpacked as (N, H, W, C).
        input_channel: static size of the channel axis of ``net``.
        hidden_unit_num: number of units in each LSTM direction.
        output_channel: number of output features per position.
        scope_name: variable scope under which the layer is built.

    Returns:
        Tensor of shape (N, H, W, output_channel).
    """
    # width ---> time step
    with tf.variable_scope(scope_name):
        dims = tf.shape(net)
        N, H, W, C = dims[0], dims[1], dims[2], dims[3]
        sequences = tf.reshape(net, [N * H, W, C])
        sequences.set_shape([None, None, input_channel])

        fw_cell = tf.contrib.rnn.LSTMCell(hidden_unit_num, state_is_tuple=True)
        bw_cell = tf.contrib.rnn.LSTMCell(hidden_unit_num, state_is_tuple=True)

        rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, sequences, dtype=tf.float32)
        features = tf.concat(rnn_outputs, axis=-1)
        features = tf.reshape(features, [N * H * W, 2 * hidden_unit_num])

        init_weights = tf.contrib.layers.variance_scaling_initializer(
            factor=0.01, mode='FAN_AVG', uniform=False)
        init_biases = tf.constant_initializer(0.0)
        weights = make_var('weights', [2 * hidden_unit_num, output_channel], init_weights)
        biases = make_var('biases', [output_channel], init_biases)

        projected = tf.matmul(features, weights) + biases
        projected = tf.reshape(projected, [N, H, W, output_channel])
        return projected
def lstm_fc(net, input_channel, output_channel, scope_name):
    """Apply a dense layer independently at every spatial position.

    The (N, H, W, C) input is flattened to (N*H*W, C), multiplied by an
    ``input_channel x output_channel`` weight matrix plus bias, and reshaped
    back to (N, H, W, output_channel).

    Args:
        net: 4-D feature tensor, unpacked as (N, H, W, C).
        input_channel: number of input features (rows of the weight matrix).
        output_channel: number of output features per position.
        scope_name: variable scope under which the layer is built.

    Returns:
        Tensor of shape (N, H, W, output_channel).
    """
    with tf.variable_scope(scope_name):
        dims = tf.shape(net)
        N, H, W, C = dims[0], dims[1], dims[2], dims[3]
        flat = tf.reshape(net, [N * H * W, C])

        init_weights = tf.contrib.layers.variance_scaling_initializer(
            factor=0.01, mode='FAN_AVG', uniform=False)
        init_biases = tf.constant_initializer(0.0)
        weights = make_var('weights', [input_channel, output_channel], init_weights)
        biases = make_var('biases', [output_channel], init_biases)

        projected = tf.matmul(flat, weights) + biases
        projected = tf.reshape(projected, [N, H, W, output_channel])
        return projected
def model(image, language):  # modified from the original network
    """Build the text-detection network.

    Pipeline: mean subtraction -> VGG-16 features -> 3x3 conv -> BiLSTM ->
    two dense heads, one for box regression (10 * 4 outputs) and one for
    3-way classification (10 * 3 outputs).

    Args:
        image: input image batch.
        language: accepted for interface compatibility; not used when
            building the graph.

    Returns:
        tuple: ``(bbox_pred, cls_pred, cls_prob)`` where ``cls_prob`` is the
        softmax over the last axis of the reshaped class scores.
    """
    image = mean_image_subtraction(image)
    with slim.arg_scope(vgg.vgg_arg_scope()):
        conv5_3 = vgg.vgg_16(image)

    rpn_conv = slim.conv2d(conv5_3, 512, 3)
    lstm_output = Bilstm(rpn_conv, 512, 128, 512, scope_name='BiLSTM')

    bbox_pred = lstm_fc(lstm_output, 512, 10 * 4, scope_name="bbox_pred")
    # Three scores per anchor (modified from the original network).
    cls_pred = lstm_fc(lstm_output, 512, 10 * 3, scope_name="cls_pred")

    # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
    pred_shape = tf.shape(cls_pred)
    cls_pred_reshape = tf.reshape(cls_pred, [pred_shape[0], pred_shape[1], -1, 3])

    reshaped_shape = tf.shape(cls_pred_reshape)
    flat_scores = tf.reshape(cls_pred_reshape, [-1, reshaped_shape[3]])
    flat_probs = tf.nn.softmax(flat_scores)
    cls_prob = tf.reshape(
        flat_probs,
        [-1, reshaped_shape[1], reshaped_shape[2], reshaped_shape[3]],
        name="cls_prob")

    return bbox_pred, cls_pred, cls_prob
def anchor_target_layer(cls_pred, bbox, im_info, scope_name):
    """Wrap the Python anchor-target layer as TensorFlow ops.

    Args:
        cls_pred: classification scores passed to the Python layer.
        bbox: ground-truth boxes passed to the Python layer.
        im_info: image info passed to the Python layer.
        scope_name: variable scope under which the ops are created.

    Returns:
        list: ``[rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights]``; the labels are cast to int32, the other
        three tensors are float32.
    """
    with tf.variable_scope(scope_name):
        # inputs: 'rpn_cls_score', 'gt_boxes', 'im_info'
        labels, targets, inside_w, outside_w = tf.py_func(
            anchor_target_layer_py,
            [cls_pred, bbox, im_info, [16, ], [16]],
            [tf.float32, tf.float32, tf.float32, tf.float32])

        rpn_labels = tf.convert_to_tensor(tf.cast(labels, tf.int32),
                                          name='rpn_labels')
        rpn_bbox_targets = tf.convert_to_tensor(targets,
                                                name='rpn_bbox_targets')
        rpn_bbox_inside_weights = tf.convert_to_tensor(inside_w,
                                                       name='rpn_bbox_inside_weights')
        rpn_bbox_outside_weights = tf.convert_to_tensor(outside_w,
                                                        name='rpn_bbox_outside_weights')

        return [rpn_labels, rpn_bbox_targets,
                rpn_bbox_inside_weights, rpn_bbox_outside_weights]
def smooth_l1_dist(deltas, sigma2=9.0, name='smooth_l1_dist'):
    """Element-wise smooth-L1 distance.

    For ``|d| < 1 / sigma2`` the result is ``0.5 * sigma2 * d**2``;
    otherwise it is ``|d| - 0.5 / sigma2``.

    Args:
        deltas: tensor of differences.
        sigma2: controls where the quadratic region ends.
        name: name scope for the created ops.

    Returns:
        Tensor with the same shape as ``deltas``.
    """
    with tf.name_scope(name=name):
        abs_deltas = tf.abs(deltas)
        # 1.0 inside the quadratic region, 0.0 outside it.
        in_quadratic = tf.cast(tf.less(abs_deltas, 1.0 / sigma2), tf.float32)
        quadratic_part = tf.square(deltas) * 0.5 * sigma2 * in_quadratic
        linear_part = (abs_deltas - 0.5 / sigma2) * tf.abs(in_quadratic - 1)
        return quadratic_part + linear_part
def loss(bbox_pred, cls_pred, bbox, im_info):
    """Build the detection loss: classification + box regression + L2 terms.

    Args:
        bbox_pred: predicted box regression values.
        cls_pred: predicted class scores (three per anchor).
        bbox: ground-truth boxes.
        im_info: image info forwarded to the anchor-target layer.

    Returns:
        tuple: ``(total_loss, model_loss, rpn_cross_entropy, rpn_loss_box)``.
        ``total_loss`` is ``model_loss`` plus the collected regularization
        losses. Four scalar summaries are registered as a side effect.
    """
    # Anchor targets, in order:
    #   labels:          (HxWxA, 1); -1 is ignored below, 0 is excluded from
    #                    the foreground count
    #   bbox targets:    (HxWxA, 4) regression objectives
    #   inside weights:  (HxWxA, 4) per-box weights
    #   outside weights: (HxWxA, 4) used to balance foreground/background
    rpn_data = anchor_target_layer(cls_pred, bbox, im_info, "anchor_target_layer")

    # ---- classification loss ----
    # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
    pred_shape = tf.shape(cls_pred)
    cls_pred_reshape = tf.reshape(cls_pred, [pred_shape[0], pred_shape[1], -1, 3])
    cls_scores = tf.reshape(cls_pred_reshape, [-1, 3])
    labels = tf.reshape(rpn_data[0], [-1])

    # Foreground anchors: label is neither -1 (ignored) nor 0.
    fg_keep = tf.not_equal(labels, -1) & tf.not_equal(labels, 0)
    # Anchors that take part in the loss: everything except label -1.
    keep_idx = tf.where(tf.not_equal(labels, -1))
    cls_scores = tf.gather(cls_scores, keep_idx)
    labels = tf.gather(labels, keep_idx)
    cross_entropy_n = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=cls_scores)

    # ---- box loss ----
    box_pred = bbox_pred
    box_targets = rpn_data[1]
    inside_weights = rpn_data[2]
    outside_weights = rpn_data[3]

    box_pred = tf.gather(tf.reshape(box_pred, [-1, 4]), keep_idx)  # shape (N, 4)
    box_targets = tf.gather(tf.reshape(box_targets, [-1, 4]), keep_idx)
    inside_weights = tf.gather(tf.reshape(inside_weights, [-1, 4]), keep_idx)
    outside_weights = tf.gather(tf.reshape(outside_weights, [-1, 4]), keep_idx)

    loss_box_n = tf.reduce_sum(
        outside_weights * smooth_l1_dist(inside_weights * (box_pred - box_targets)),
        reduction_indices=[1])

    # Normalise by the foreground count; +1 keeps the divisor non-zero.
    rpn_loss_box = tf.reduce_sum(loss_box_n) / (tf.reduce_sum(tf.cast(fg_keep, tf.float32)) + 1)
    rpn_cross_entropy = tf.reduce_mean(cross_entropy_n)

    model_loss = rpn_cross_entropy + rpn_loss_box

    regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = tf.add_n(regularization_losses) + model_loss

    tf.summary.scalar('model_loss', model_loss)
    tf.summary.scalar('total_loss', total_loss)
    tf.summary.scalar('rpn_cross_entropy', rpn_cross_entropy)
    tf.summary.scalar('rpn_loss_box', rpn_loss_box)

    return total_loss, model_loss, rpn_cross_entropy, rpn_loss_box

+ 74
- 0
im2latex_master/predict.py View File

@ -0,0 +1,74 @@
from scipy.misc import imread,imshow
import os
from PIL import Image
import PIL
from model.img2seq import Img2SeqModel
from model.utils.general import Config, run
from model.utils.text import Vocab
from model.utils.image import greyscale, crop_image, pad_image,predictsize ,\
downsample_image, TIMEOUT
def interactive_shell(model):
    """Creates interactive shell to play with model

    Repeatedly prompts for a path to a ``.png`` or ``.pdf`` file, runs the
    model on it, logs the first hypothesis and writes it to
    ``norm_formula_val.txt`` (overwritten on every prediction). Entering
    ``exit`` leaves the loop.

    Args:
        model: object exposing ``predict(img)`` and a ``logger``.
    """
    model.logger.info("""
This is an interactive mode.
To exit, enter 'exit'.
Enter a path to a file
input> data/images_test/0.png""")

    # Canvas sizes used when padding an image converted from a PDF.
    buckets = [
        [240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100],
        [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100],
        [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200],
        [1000, 400], [1200, 200], [1600, 200], [1600, 1600]
        ]

    while True:
        img_path = input("input> ")

        if img_path == "exit":
            break

        if img_path[-3:] == "png":
            img = imread(img_path)

        elif img_path[-3:] == "pdf":
            # Call magick to convert the pdf into a png file.
            # NOTE(review): the typed path is interpolated into a command
            # string; confirm how ``run`` executes it before exposing this
            # shell to untrusted input.
            dir_output = "tmp/"
            name = img_path.split('/')[-1].split('.')[0]
            run("magick convert -density {} -quality {} {} {}".format(200, 100,
                img_path, dir_output+"{}.png".format(name)), TIMEOUT)
            img_path = dir_output + "{}.png".format(name)
            crop_image(img_path, img_path)
            pad_image(img_path, img_path, buckets=buckets)
            downsample_image(img_path, img_path, 2)

            img = imread(img_path)

        else:
            # Previously this fell through and either raised NameError (first
            # prompt) or silently re-used the image from the previous prompt.
            model.logger.info(
                "Unsupported file type (expected .png or .pdf): {}".format(img_path))
            continue

        img = predictsize(img)
        im_converted = PIL.Image.fromarray(img)
        im_converted.show()
        img = greyscale(img)
        hyps = model.predict(img)
        with open("norm_formula_val.txt", "w") as f:
            f.write(hyps[0])
        model.logger.info(hyps[0])


if __name__ == "__main__":
    # restore config and model
    dir_output = "results/full/"
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)

    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights4/test-model.ckpt")

    interactive_shell(model)

+ 62
- 0
im2latex_master/train.py View File

@ -0,0 +1,62 @@
import click
from model.utils.data_generator import DataGenerator
from model.img2seq import Img2SeqModel
from model.utils.lr_schedule import LRSchedule
from model.utils.general import Config
from model.utils.text import Vocab
from model.utils.image import greyscale
@click.command()
@click.option('--data', default="configs/data.json",
        help='Path to data json config')
@click.option('--vocab', default="configs/vocab.json",
        help='Path to vocab json config')
@click.option('--training', default="configs/training.json",
        help='Path to training json config')
@click.option('--model', default="configs/model.json",
        help='Path to model json config')
@click.option('--output', default="results/full/",
        help='Dir for results and model weights')
def main(data, vocab, training, model, output):
    """Train the image-to-sequence model.

    Args:
        data: path to the data JSON config.
        vocab: path to the vocab JSON config.
        training: path to the training JSON config.
        model: path to the model JSON config.
        output: directory for results and model weights; the merged config
            is saved there as well.
    """
    # Load configs and keep a copy next to the results.
    dir_output = output
    config = Config([data, vocab, training, model])
    config.save(dir_output)
    vocabulary = Vocab(config)

    # Load datasets.
    train_set = DataGenerator(
        path_formulas=config.path_formulas_train,
        dir_images=config.dir_images_train,
        max_iter=config.max_iter,
        bucket=config.bucket_train,
        path_matching=config.path_matching_train,
        max_len=config.max_length_formula,
        form_prepro=vocabulary.form_prepro)
    val_set = DataGenerator(
        path_formulas=config.path_formulas_val,
        dir_images=config.dir_images_val,
        max_iter=config.max_iter,
        bucket=config.bucket_val,
        path_matching=config.path_matching_val,
        max_len=config.max_length_formula,
        form_prepro=vocabulary.form_prepro)

    # Define the learning rate schedule. The config expresses the schedule
    # boundaries in epochs; they are converted to batch counts here
    # (ceil division for the number of batches per epoch).
    n_batches_epoch = ((len(train_set) + config.batch_size - 1) //
                       config.batch_size)
    lr_schedule = LRSchedule(
        lr_init=config.lr_init,
        start_decay=config.start_decay*n_batches_epoch,
        end_decay=config.end_decay*n_batches_epoch,
        end_warm=config.end_warm*n_batches_epoch,
        lr_warm=config.lr_warm,
        lr_min=config.lr_min)

    # Build model and train. To resume from saved weights, call
    # restore_session(dir_output + "model.weights/test-model.ckpt") before
    # training.
    net = Img2SeqModel(config, dir_output, vocabulary)
    net.build_train(config)
    net.train(config, train_set, val_set, lr_schedule)


if __name__ == "__main__":
    main()

+ 17
- 0
im2latex_master/training.json View File

@ -0,0 +1,17 @@
{
"export_name": "training.json",
"lr_method" : "Adam",
"n_epochs" : 50,
"batch_size" : 20,
"dropout" : 1,
"metric_val" : "perplexity",
"clip" : -1,
"lr_init" : 1e-3,
"lr_min" : 1e-4,
"start_decay" : 6,
"end_decay" : 13,
"lr_warm" : 1e-4,
"end_warm" : 2
}

+ 17
- 0
im2latex_master/training_small.json View File

@ -0,0 +1,17 @@
{
"export_name": "training.json",
"lr_method" : "Adam",
"n_epochs" : 50,
"batch_size" : 3,
"dropout" : 1,
"metric_val" : "perplexity",
"clip" : -1,
"lr_init" : 1e-3,
"lr_min" : 1e-3,
"start_decay" : 6,
"end_decay" : 13,
"lr_warm" : 1e-3,
"end_warm" : 0
}

+ 28
- 0
im2latex_master/vgg.py View File

@ -0,0 +1,28 @@
import tensorflow as tf
slim = tf.contrib.slim
def vgg_arg_scope(weight_decay=0.0005):
    """Return the slim argument scope used for the VGG layers.

    Within the scope, conv and fully connected layers use ReLU, an L2 weight
    regularizer and zero-initialized biases; conv layers use SAME padding.

    Args:
        weight_decay: L2 regularization strength.

    Returns:
        The argument scope object, for use with ``slim.arg_scope``.
    """
    regularizer = slim.l2_regularizer(weight_decay)
    dense_like_layers = [slim.conv2d, slim.fully_connected]
    with slim.arg_scope(dense_like_layers,
                        activation_fn=tf.nn.relu,
                        weights_regularizer=regularizer,
                        biases_initializer=tf.zeros_initializer()):
        with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc:
            return arg_sc
def vgg_16(inputs, scope='vgg_16'):
    """Build the VGG-16 convolutional trunk up to the conv5 block.

    Five blocks of 3x3 convolutions (2, 2, 3, 3 and 3 layers with 64, 128,
    256, 512 and 512 filters); each of the first four blocks is followed by
    a 2x2 max pool. There is no pooling after conv5 and no fully connected
    layers.

    Args:
        inputs: input image tensor.
        scope: variable scope name.

    Returns:
        The output tensor of the conv5 block.
    """
    with tf.variable_scope(scope, 'vgg_16', [inputs]):
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d]):
            features = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
            features = slim.max_pool2d(features, [2, 2], scope='pool1')
            features = slim.repeat(features, 2, slim.conv2d, 128, [3, 3], scope='conv2')
            features = slim.max_pool2d(features, [2, 2], scope='pool2')
            features = slim.repeat(features, 3, slim.conv2d, 256, [3, 3], scope='conv3')
            features = slim.max_pool2d(features, [2, 2], scope='pool3')
            features = slim.repeat(features, 3, slim.conv2d, 512, [3, 3], scope='conv4')
            features = slim.max_pool2d(features, [2, 2], scope='pool4')
            features = slim.repeat(features, 3, slim.conv2d, 512, [3, 3], scope='conv5')
    return features

+ 9
- 0
im2latex_master/vocab.json View File

@ -0,0 +1,9 @@
{
"export_name": "vocab.json",
"unk": "_UNK",
"pad": "_PAD",
"end": "_END",
"path_vocab": "/Users/iris/im2latex_master2/data2/vocab.txt",
"min_count_tok": 10
}

+ 9
- 0
im2latex_master/vocab_small.json View File

@ -0,0 +1,9 @@
{
"export_name": "vocab.json",
"unk": "_UNK",
"pad": "_PAD",
"end": "_END",
"path_vocab": "/Users/iris/im2latex_master/data/small_vocab.txt",
"min_count_tok": 2
}

Loading…
Cancel
Save