import click
|
|
|
|
|
|
from model.utils.data_generator import DataGenerator
|
|
from model.utils.text import build_vocab, write_vocab
|
|
from model.utils.image import build_images
|
|
from model.utils.general import Config
|
|
|
|
|
|
@click.command()
@click.option('--data', default="configs/data.json",
              help='Path to data json config')
@click.option('--vocab', default="configs/vocab.json",
              help='Path to vocab json config')
def main(data, vocab):
    """Build the train and validation datasets, then write the vocabulary.

    The datasets are built from the paths listed in the data config, and
    the vocabulary is built from the training set only before being
    written to the path given in the vocab config.

    Args:
        data: Path to the data JSON config (--data).
        vocab: Path to the vocab JSON config (--vocab).
    """
    data_config = Config(data)

    # Datasets. Each split is described by a formulas file, an image
    # directory and a matching file, all read from the data config.
    train_set = DataGenerator(
        path_formulas=data_config.path_formulas_train,
        dir_images=data_config.dir_images_train,
        path_matching=data_config.path_matching_train)
    # NOTE: the test split is currently disabled. To enable it, create a
    # DataGenerator from data_config.path_formulas_test, dir_images_test
    # and path_matching_test, and call build() on it like the other splits.
    val_set = DataGenerator(
        path_formulas=data_config.path_formulas_val,
        dir_images=data_config.dir_images_val,
        path_matching=data_config.path_matching_val)

    # Produce images and matching files for every enabled split.
    train_set.build(buckets=data_config.buckets)
    val_set.build(buckets=data_config.buckets)

    # Vocabulary: built from the training set only, so the validation
    # formulas do not contribute tokens. A distinct local name is used so
    # the `vocab` argument (the config path) is not rebound.
    vocab_config = Config(vocab)
    built_vocab = build_vocab([train_set],
                              min_count=vocab_config.min_count_tok)
    write_vocab(built_vocab, vocab_config.path_vocab)
|
|
|
|
|
|
# Script entry point: when this file is executed directly, hand control to
# the click command, which parses the command-line options and runs main().
if __name__ == "__main__":
    main()
|