Cross Language Prompt Internalization - Recreating Dataset

Using the large Spacy models to preprocess the Tatoeba dataset
prompt internalization
multilingual prompt internalization
cross language word sense induction
Published

July 7, 2022

The quality of the dataset is reflected in the quality of the model. If I can improve the dataset then that should improve the model.

There are two ways that the dataset can be improved. I can use the large Spacy models to detect the nouns, which should be more accurate. The other way is to capitalize the noun that is substituted into the prompt, so that it matches the capitalized examples (Dog, Yellow, Tractor, Banana) already in the prompt.
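
As an aside, the capitalization in the encoding code further down uppercases only the first character rather than using str.capitalize, since capitalize lowercases the rest of the string and would mangle acronyms. A quick illustration:

Code
# str.capitalize lowercases everything after the first character
print("mountains".capitalize())  # Mountains
print("NASA".capitalize())       # Nasa

# uppercasing only the first character leaves acronyms intact
noun = "NASA"
print(noun[0].upper() + noun[1:])  # NASA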

Code
import blog.transformers_logging

Encode with the Large Spacy Models

This should improve the accuracy of the noun detection, which may make the task easier to learn. Once again we will only accept rows where both the English sentence and the translated sentence have a single noun. As I have done this before I’ve smashed all the code into one big function, so the dataset can be encoded for different models.
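
To make the single-noun filter concrete, here is a minimal sketch of the matching that the NounExtractor class below performs, assuming the medium English model (en_core_web_md) is installed:

Code
import spacy
from spacy.matcher import Matcher

# the same pattern as NounExtractor: one or more consecutive NOUN/PROPN tokens
nlp = spacy.load("en_core_web_md")
matcher = Matcher(nlp.vocab)
matcher.add("nouns", [[{"POS": {"IN": ["NOUN", "PROPN"]}, "OP": "+"}]])

def noun_spans(text: str):
    return matcher(nlp(text), as_spans=True)

# expect one span ("mountains"), so a row like this would be kept
print(noun_spans("I was in the mountains."))
# expect two spans ("cat" and "mat"), so a row like this would be dropped
print(noun_spans("The cat sat on the mat."))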

This is all the code that is used. There is quite a lot!

Code
from pathlib import Path

TATOEBA_FOLDER = Path("/data/tatoeba/2022-06-18")
SENTENCES_FILE = TATOEBA_FOLDER / "sentences.tar.bz2"
LINKS_FILE = TATOEBA_FOLDER / "links.tar.bz2"

DATA_FOLDER = Path("/data/tatoeba/2022-06-18/dataset")
DATA_FOLDER.mkdir(exist_ok=True, parents=True)
Code
# from src/main/python/blog/prompt_internalization/tatoeba.py
import tarfile
from pathlib import Path
from typing import List

import pandas as pd


def read_tarfile(path: Path, names: List[str]) -> pd.DataFrame:
    """
    The Tatoeba files are provided as bz2 compressed tarfiles.
    This will correctly load them into pandas.
    """
    # there is one file in each tarfile
    with tarfile.open(path, "r:*") as handle:
        tar_path = handle.getnames()[0]
        return pd.read_csv(
            handle.extractfile(tar_path),
            delimiter="\t",
            names=names,
        )



# from src/main/python/blog/prompt_internalization/multilingual/dataset.py
from typing import Any, Dict, List, Tuple

import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from tqdm.auto import tqdm
from transformers import AutoTokenizer

NLP_NAMES = {
    "eng": "en_core_web_md",
    "deu": "de_core_news_md",
    "fra": "fr_core_news_md",
    "ita": "it_core_news_md",
    "jpn": "ja_core_news_md",
    "por": "pt_core_news_md",
    "rus": "ru_core_news_md",
    "spa": "es_core_news_md",
}


def encode(
    translations_df: pd.DataFrame,
    model_name: str = "xlm-roberta-base",
    # prompt uses str.format to add in the target, so {} will be replaced
    prompt: str = " Pet: Dog, Color: Yellow, Vehicle: Tractor, Fruit: Banana,<mask>: {}",
) -> pd.DataFrame:
    tqdm.pandas()

    encoder = Encoder(model_name=model_name)
    prompt_expander = InputPromptExpander(model_name=model_name, prompt=prompt)

    teacher_df = pd.DataFrame(
        translations_df.progress_apply(
            lambda row: encoder.encode(text=row.text, language=row.language),
            axis="columns",
        ).tolist(),
        index=translations_df.index,
    )
    teacher_df = teacher_df[teacher_df.labels.apply(len) == 1]
    teacher_df = pd.DataFrame(
        teacher_df.progress_apply(
            prompt_expander,
            axis="columns",
        ).tolist(),
        index=teacher_df.index,
    )
    teacher_df = teacher_df[["input_ids"]].rename(
        columns={"input_ids": "teacher_input_ids"}
    )
    df = pd.merge(translations_df, teacher_df, left_index=True, right_index=True)

    student_df = pd.DataFrame(
        df.progress_apply(
            lambda row: encoder.encode(
                text=row.translation_text, language=row.translation_language
            ),
            axis="columns",
        ).tolist(),
        index=df.index,
    )
    student_df = student_df[student_df.labels.apply(len) == 1]
    df = pd.merge(df, student_df, left_index=True, right_index=True)
    df = df.reset_index(drop=True)
    return df


class Encoder:
    def __init__(self, model_name: str) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.extractors = {
            language: NounExtractor(spacy_name)
            for language, spacy_name in NLP_NAMES.items()
        }

    def encode(self, text: str, language: str) -> Dict[str, Any]:
        nouns = self.extractors[language].get_nouns(text)
        tokens = self.tokenizer(text, truncation=True, return_offsets_mapping=True)
        labels = self.find(tokens.offset_mapping, nouns)
        return {
            "input_ids": tokens.input_ids,
            "attention_mask": tokens.attention_mask,
            "labels": labels,
        }

    def find(
        self, offsets: List[Tuple[int, int]], nouns: List[Span]
    ) -> List[Tuple[int, int]]:
        starts = {
            start: index for index, (start, end) in enumerate(offsets) if start != end
        }
        ends = {
            end: index for index, (start, end) in enumerate(offsets) if start != end
        }
        spans = [
            (starts[noun.start_char], 1 + ends[noun.end_char] - starts[noun.start_char])
            for noun in nouns
            if noun.start_char in starts and noun.end_char in ends
        ]
        return [(start, length) for start, length in spans if length > 0]


class NounExtractor:
    def __init__(self, spacy_name: str) -> None:
        self.nlp = spacy.load(spacy_name)
        self.matcher = Matcher(self.nlp.vocab)
        self.matcher.add(
            "nouns",
            [
                [{"POS": {"IN": ["NOUN", "PROPN"]}, "OP": "+"}],
            ],
        )

    def get_nouns(self, text: str) -> List[Span]:
        doc = self.nlp(text)
        return self.matcher(doc, as_spans=True)


class InputPromptExpander:
    def __init__(
        self,
        model_name: str,
        prompt: str,
    ) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.prompt = prompt

    def __call__(self, row: pd.Series) -> Dict[str, Any]:
        return self.add_prompt(input_ids=row.input_ids, noun=row.labels[0])

    def add_prompt(self, input_ids: List[int], noun: Tuple[int, int]) -> Dict[str, Any]:
        input_text = self.tokenizer.decode(input_ids, skip_special_tokens=True)
        input_text = input_text.strip()
        if not input_text[-1] in {".", "?", "!"}:
            input_text = input_text + "."

        noun_start, noun_length = noun
        noun = self.tokenizer.decode(input_ids[noun_start : noun_start + noun_length])
        noun = noun.strip()
        # capitalize is a string function, but that lowercases the rest of the word
        # if we have an acronym or similar then it might all be capital letters already
        noun = noun[0].upper() + noun[1:]

        prompt = self.prompt.format(noun)
        return self.tokenizer(input_text + prompt)
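
The fiddliest part of this is Encoder.find, which converts the character-level noun spans from Spacy into (token start, token count) labels over the tokenizer output. A hand-worked sketch with made-up, word-like offsets (a real subword tokenizer would split words further) shows the idea:

Code
# offsets for "I was in the mountains." with one token per word;
# the (0, 0) entries stand in for the special tokens, which find() skips
offsets = [(0, 0), (0, 1), (2, 5), (6, 8), (9, 12), (13, 22), (22, 23), (0, 0)]

# Spacy reports the noun "mountains" as the character span 13..22
noun_start_char, noun_end_char = 13, 22

starts = {start: index for index, (start, end) in enumerate(offsets) if start != end}
ends = {end: index for index, (start, end) in enumerate(offsets) if start != end}

label = (starts[noun_start_char], 1 + ends[noun_end_char] - starts[noun_start_char])
print(label)  # (5, 1) - the noun starts at token 5 and covers 1 token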

Encoding

This reads the Tatoeba data, joins the sentences to their translations via the links, and then filters the pairs down to English source sentences with translations in the target languages.

Code
sentences_df = read_tarfile(SENTENCES_FILE, names=["id", "language", "text"])
sentences_df = sentences_df.set_index("id", drop=True)

# example is [1, 77]
# sentence 77 is a translation of sentence 1
links_df = read_tarfile(LINKS_FILE, names=["sentence_id", "translation_id"])

print(f"Read {len(sentences_df):,} sentences and {len(links_df):,} links")

# join the sentences together via the links to get the translations
translation_map = links_df.set_index("sentence_id").translation_id.to_dict()

translation_df = sentences_df.copy()
translation_df["translation_id"] = sentences_df.index.map(translation_map)
translation_df = translation_df.dropna()
translation_df["translation_language"] = translation_df.translation_id.map(sentences_df.language)
translation_df["translation_text"] = translation_df.translation_id.map(sentences_df.text)
translation_df = translation_df.drop(columns="translation_id")
translation_df

# restrict the translations to a source of english and a translation of non-english
english_df = translation_df[(translation_df.language == "eng") & (translation_df.translation_language != "eng")]
english_df

# further restrict the translations to the 7 languages we have spacy models for
restricted_df = english_df[
    english_df.translation_language.isin({
        "deu",
        "fra",
        "ita",
        "jpn",
        "por",
        "rus",
        "spa",
    })
].copy()
Read 10,467,414 sentences and 21,259,444 links

This then uses Spacy to find nouns in the English and translated sentences. With these it can filter the rows down to those that only have a single noun on each side.

Then the text is tokenized and the prompt is added to the English text, as that is what will be provided to the teacher.

Code
df = encode(
    translations_df=restricted_df,
    model_name="roberta-base",
    prompt=" Pet: Dog, Color: Yellow, Vehicle: Tractor, Fruit: Banana,<mask>: {}",
)
Code
df.to_parquet(DATA_FOLDER / "roberta.gz.parquet", compression="gzip")
Code
df
language text translation_language translation_text teacher_input_ids input_ids attention_mask labels
0 eng I was in the mountains. fra J'étais à la montagne. [0, 100, 21, 11, 5, 9787, 4, 5106, 35, 8563, 6... [0, 863, 108, 1140, 4349, 354, 6534, 897, 2712... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [(8, 2)]
1 eng I told them to send me another ticket. deu Ich sagte ihnen, sie sollen mir eine neue Fahr... [0, 100, 174, 106, 7, 2142, 162, 277, 3682, 4,... [0, 100, 611, 17929, 859, 939, 10245, 225, 6, ... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [(19, 4)]
2 eng It depends on the context. deu Das hängt vom Kontext ab. [0, 243, 7971, 15, 5, 5377, 4, 5106, 35, 8563,... [0, 495, 281, 1368, 1561, 2590, 90, 23953, 112... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [(8, 2)]
3 eng The Germans are very crafty. ita I tedeschi sono molto furbi. [0, 133, 18415, 32, 182, 6306, 219, 4, 5106, 3... [0, 100, 44817, 293, 7930, 979, 139, 13200, 56... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [(2, 3)]
4 eng But the universe is infinite. rus Однако Вселенная бесконечна. [0, 1708, 5, 9468, 16, 32952, 4, 5106, 35, 856... [0, 25417, 17772, 37947, 36765, 26161, 41171, ... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [(9, 9)]
... ... ... ... ... ... ... ... ...
119449 eng These batteries can be recharged. deu Diese Batterien sind wiederaufladbar. [0, 4528, 10535, 64, 28, 769, 18716, 4, 5106, ... [0, 495, 918, 242, 163, 7933, 2495, 579, 2028,... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [(4, 3)]
119450 eng It still works? "Yup, all it needs is new batt... spa ¿Eso todavía funciona? "Si, solo necesita pila... [0, 243, 202, 1364, 116, 22, 975, 658, 6, 70, ... [0, 4056, 9470, 717, 2527, 7, 417, 1469, 5272,... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [(19, 2)]
119451 eng I can write kanji. jpn 私は漢字が書けます。 [0, 100, 64, 3116, 9864, 5186, 4, 5106, 35, 85... [0, 36714, 6248, 10172, 48549, 37127, 4394, 72... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [(7, 3)]
119452 eng They ran to the scene. jpn 現場へと走った。 [0, 1213, 2075, 7, 5, 1310, 4, 5106, 35, 8563,... [0, 36714, 9357, 4726, 42393, 21402, 20024, 46... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [(3, 4)]
119453 eng Where exactly did you meet Tom? jpn そもそも、どこでトムと会ったの? [0, 13841, 2230, 222, 47, 972, 1560, 116, 5106... [0, 46311, 46, 48885, 46311, 46, 48885, 47341,... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [(12, 2)]

119454 rows × 8 columns

Code
from typing import Tuple
import datasets
import pandas as pd

train_languages = {
    "rus",
    "deu",
    "ita",
    "fra",
    "jpn",
}
test_languages = {
    "por",
    "spa",
}

def to_datasets(df: pd.DataFrame) -> Tuple[datasets.Dataset, datasets.Dataset]:
    train_ds = datasets.Dataset.from_pandas(
        df[df.translation_language.isin(train_languages)]
    )
    test_ds = datasets.Dataset.from_pandas(
        df[df.translation_language.isin(test_languages)]
    )
    
    return train_ds, test_ds
Code
train_ds, test_ds = to_datasets(df)

train_ds.save_to_disk(DATA_FOLDER / "roberta-train.dataset")
test_ds.save_to_disk(DATA_FOLDER / "roberta-test.dataset")

print(f"the train dataset is {len(train_ds):,} rows")
print(f"the test dataset is {len(test_ds):,} rows")
the train dataset is 96,303 rows
the test dataset is 23,151 rows

Previously there were 96,680 training rows and 23,310 test rows.

Now that we have the encoded data we can quickly look at the difference in the prompt:

Code
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokenizer.decode(df.iloc[0].teacher_input_ids)
'<s>I was in the mountains. Pet: Dog, Color: Yellow, Vehicle: Tractor, Fruit: Banana,<mask>: Mountains</s>'

As we can see, the noun in the prompt is now capitalized (Mountains). Hopefully this work makes the model slightly easier to train.

Different Models

Now that we have automated this we can create versions for different models. Both MBART (Liu et al. 2020) and T5 (Raffel et al. 2019) seem to be viable for this task.

Liu, Yinhan, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, and Luke Zettlemoyer. 2020. “Multilingual Denoising Pre-Training for Neural Machine Translation.” arXiv. https://doi.org/10.48550/ARXIV.2001.08210.
Raffel, Colin, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2019. “Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer.” arXiv. https://doi.org/10.48550/ARXIV.1910.10683.
Code
df = encode(
    translations_df=restricted_df,
    model_name="t5-base",
    prompt=" Pet: Dog, Color: Yellow, Vehicle: Tractor, Fruit: Banana,<mask>: {}",
)

df.to_parquet(DATA_FOLDER / "t5.gz.parquet", compression="gzip")

train_ds, test_ds = to_datasets(df)

train_ds.save_to_disk(DATA_FOLDER / "t5-train.dataset")
test_ds.save_to_disk(DATA_FOLDER / "t5-test.dataset")

print(f"the train dataset is {len(train_ds):,} rows")
print(f"the test dataset is {len(test_ds):,} rows")
/home/matthew/.cache/pypoetry/virtualenvs/blog-HrtMnrOS-py3.10/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py:156: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.
  warnings.warn(
the train dataset is 92,292 rows
the test dataset is 23,156 rows

The difference in dataset size is due to the difficulty of mapping the noun character spans onto each tokenizer’s tokens. I’ve also tried encoding with MBART, but its tokenizer requires source and target languages for translation, which is not suitable for this task.
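
For reference, a sketch of the MBART problem, assuming the facebook/mbart-large-cc25 checkpoint:

Code
# the MBART tokenizer is built around translation pairs: every input is tagged
# with a source language code (and labels with a target code), so a dataset
# that mixes eight languages would need per-row tokenizer configuration
from transformers import MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained(
    "facebook/mbart-large-cc25", src_lang="en_XX", tgt_lang="de_DE"
)
tokens = tokenizer("I was in the mountains.")
print(tokenizer.convert_ids_to_tokens(tokens.input_ids)[-1])  # en_XX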