File utils

JsonFactory

from nlp_utils.file_utils import JsonFactory

# write and load json
JsonFactory.write_json([1, 2, 3], "a.json")
JsonFactory.load_json("a.json")

# write and load jsonlines
JsonFactory.write_jsonl([1, 2, 3], "a.jsonl")
JsonFactory.load_jsonl("a.jsonl")

# write and load jsonlines with gzip
JsonFactory.write_jsonl([1, 2, 3], "a.jsonl.gz", gzip=True)
JsonFactory.load_jsonl("a.jsonl.gz", gzip=True)

# or call the gzip-specific API directly
JsonFactory.write_jsonl_gzip([1, 2, 3], "a.jsonl.gz")
JsonFactory.load_jsonl_gzip("a.jsonl.gz")
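For reference, here is a minimal standard-library sketch of what the JSON Lines helpers are assumed to do (one JSON object per line, gzip-compressed on request); the actual implementation may differ:

import gzip
import json

def write_jsonl_sketch(records, path, use_gzip=False):
    # write one JSON object per line; gzip-compress when requested
    opener = gzip.open if use_gzip else open
    with opener(path, "wt", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

def load_jsonl_sketch(path, use_gzip=False):
    # read the file back into a list of Python objects
    opener = gzip.open if use_gzip else open
    with opener(path, "rt", encoding="utf-8") as f:
        return [json.loads(line) for line in f]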

Examples

Image-to-Text

An image-to-text (OCR) model built on the Hugging Face VisionEncoderDecoder framework.
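Under the hood it pairs a vision encoder with a text decoder. As a rough sketch of the equivalent setup in plain transformers (using the same checkpoints as the quick start below; the exact wiring inside SimpleOCR is an assumption):

from transformers import AutoTokenizer, VisionEncoderDecoderModel

# pair a Swin image encoder with a Chinese BART text decoder
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "microsoft/swin-base-patch4-window7-224-in22k",
    "fnlp/bart-base-chinese",
)
tokenizer = AutoTokenizer.from_pretrained("fnlp/bart-base-chinese")

# these must be set explicitly when combining arbitrary encoders and decoders
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id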

Quick Start

# import
from nlp_utils.pipelines import SimpleOCR

# instantiate
model = SimpleOCR()

# load encoder and decoder (encoder type, encoder checkpoint, decoder type, decoder checkpoint)
model.from_encoder_decoder_pretrained(
    "swin",
    "microsoft/swin-base-patch4-window7-224-in22k",
    "bert",
    "fnlp/bart-base-chinese",
)

# load train dataset
import pandas as pd
train_df = pd.read_csv("dataset/train.txt", sep="\t", header=None)
train_df.columns = ["image_path", "target_text"]
train_df["target_text"] = train_df["target_text"].astype(str)
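# the eval set is prepared the same way (a sketch; assumes a dataset/eval.txt with the same two-column layout)
eval_df = pd.read_csv("dataset/eval.txt", sep="\t", header=None)
eval_df.columns = ["image_path", "target_text"]
eval_df["target_text"] = eval_df["target_text"].astype(str)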

# train
model.train(
    train_df=train_df, # pd.DataFrame with 2 columns: image_path & target_text
    eval_df=eval_df, # pd.DataFrame with 2 columns: image_path & target_text
    image_dir="dataset/images",
    target_max_token_len = 128,
    batch_size = 16,
    max_epochs = 5,
    use_gpu = True,
    output_dir = "outputs",
    early_stopping_patience_epochs = 0,
    precision = 32,
    accumulate_grad_batches = 1,
    learning_rate = 2e-5,
    dataloader_num_workers = 0,
    use_fgm = False,
    gradient_clip_algorithm = None,
    gradient_clip_val = None,
)

# load trained image-to-text model from a checkpoint directory
model.load_model("other", checkpoint_dir, use_gpu=True)

# predict
model.predict("dataset/images/1.jpg")

# batch predict
model.batch_predict(["dataset/images/1.jpg", "dataset/images/2.jpg"])

Text Generation

A modified version of simpleT5.

Quick Start

# import
from nlp_utils.pipelines import SimpleT5

# instantiate
model = SimpleT5()

# load a pretrained model (supports t5, mt5, and byT5; see Supported Models below)
model.from_pretrained("t5", "t5-base")

# train
model.train(
    train_df=train_df, # pd.DataFrame with 2 columns: source_text & target_text
    eval_df=eval_df, # pd.DataFrame with 2 columns: source_text & target_text
    source_max_token_len = 128,
    target_max_token_len = 128,
    batch_size = 16,
    max_epochs = 5,
    use_gpu = True,
    output_dir = "outputs",
    early_stopping_patience_epochs = 0,
    precision = 32,
    accumulate_grad_batches = 1,
    learning_rate = 2e-5,
    dataloader_num_workers = 0,
    use_fgm = False,
    gradient_clip_algorithm = None,
    gradient_clip_val = None,
)

# load trained T5 model
model.load_model("t5", checkpoint_dir, use_gpu=True)

# predict
model.predict("input text for prediction")

# batch predict
model.batch_predict(["input text1 for prediction", "input text2 for prediction"])

Supported Models

Specified via the model_type argument (see the example after this list):

  • t5
  • mt5
  • byt5
  • bart
  • cpt
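A brief illustration of how model_type maps to the loading call; the checkpoint names here are illustrative, not defaults shipped with the library:

from nlp_utils.pipelines import SimpleT5

model = SimpleT5()

# e.g. multilingual T5
model.from_pretrained("mt5", "google/mt5-base")

# or byte-level T5
model.from_pretrained("byt5", "google/byt5-small")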

Generation Options

Reference: Hugging Face's Utilities for Generation documentation.

Example:

kwargs = dict(
    max_length=100,
    num_beams=10,
    do_sample=False,
    top_k=50,
    top_p=1.0,
    early_stopping=False,
    repetition_penalty=2.5,
)
model.predict(input_text, **kwargs)
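These keyword arguments are presumably forwarded to the underlying Hugging Face generate() call (per the reference above), so other generation arguments documented there, such as num_return_sequences or length_penalty, should work the same way.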

Acknowledgements