import blog.transformers_logging
July 7, 2022
The quality of the dataset is reflected in the quality of the model. If I can improve the dataset then that should improve the model.
There are two ways that the dataset can be improved. I can use the larger spaCy models to detect the nouns, which should make the noun detection more accurate and may make the task easier to learn. The other way is to capitalize the noun when it is substituted into the prompt, so that it matches the capitalized examples already in the prompt.
Once again we will only accept rows where both the English sentence and the translated sentence have a single noun. As I have done this before I've smashed all the code into one big function, and the data can be encoded for different models.
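One small note on the capitalization: str.capitalize lowercases everything after the first letter, which would mangle acronyms, so the encoding code upper-cases only the first character instead. A tiny illustration, purely for demonstration:
word = "USA"
print(word.capitalize())            # Usa - the rest of the word is lowercased
print(word[0].upper() + word[1:])   # USA - only the first character is changed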
This is all the code that is used. There is quite a lot!
# from src/main/python/blog/prompt_internalization/tatoeba.py
import tarfile
from pathlib import Path
from typing import List
import pandas as pd
def read_tarfile(path: Path, names: List[str]) -> pd.DataFrame:
"""
The Tatoeba files are provided as bz2 compressed tarfiles.
This will correctly load them into pandas.
"""
# there is one file in each tarfile
with tarfile.open(path, "r:*") as handle:
tar_path = handle.getnames()[0]
return pd.read_csv(
handle.extractfile(tar_path),
delimiter="\t",
names=names,
)
# from src/main/python/blog/prompt_internalization/multilingual/dataset.py
from typing import Any, Dict, List, Tuple
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from tqdm.auto import tqdm
from transformers import AutoTokenizer
NLP_NAMES = {
"eng": "en_core_web_md",
"deu": "de_core_news_md",
"fra": "fr_core_news_md",
"ita": "it_core_news_md",
"jpn": "ja_core_news_md",
"por": "pt_core_news_md",
"rus": "ru_core_news_md",
"spa": "es_core_news_md",
}
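# Note: each of these spaCy pipelines needs to be downloaded before use,
# for example with `python -m spacy download en_core_web_md`.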
def encode(
translations_df: pd.DataFrame,
model_name: str = "xlm-roberta-base",
# prompt uses str.format to add in the target, so {} will be replaced
prompt: str = " Pet: Dog, Color: Yellow, Vehicle: Tractor, Fruit: Banana,<mask>: {}",
) -> pd.DataFrame:
tqdm.pandas()
encoder = Encoder(model_name=model_name)
prompt_expander = InputPromptExpander(model_name=model_name, prompt=prompt)
teacher_df = pd.DataFrame(
translations_df.progress_apply(
lambda row: encoder.encode(text=row.text, language=row.language),
axis="columns",
).tolist(),
index=translations_df.index,
)
teacher_df = teacher_df[teacher_df.labels.apply(len) == 1]
teacher_df = pd.DataFrame(
teacher_df.progress_apply(
prompt_expander,
axis="columns",
).tolist(),
index=teacher_df.index,
)
teacher_df = teacher_df[["input_ids"]].rename(
columns={"input_ids": "teacher_input_ids"}
)
df = pd.merge(translations_df, teacher_df, left_index=True, right_index=True)
student_df = pd.DataFrame(
df.progress_apply(
lambda row: encoder.encode(
text=row.translation_text, language=row.translation_language
),
axis="columns",
).tolist(),
index=df.index,
)
student_df = student_df[student_df.labels.apply(len) == 1]
df = pd.merge(df, student_df, left_index=True, right_index=True)
df = df.reset_index(drop=True)
return df
class Encoder:
def __init__(self, model_name: str) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.extractors = {
language: NounExtractor(spacy_name)
for language, spacy_name in NLP_NAMES.items()
}
def encode(self, text: str, language: str) -> Dict[str, Any]:
nouns = self.extractors[language].get_nouns(text)
tokens = self.tokenizer(text, truncation=True, return_offsets_mapping=True)
labels = self.find(tokens.offset_mapping, nouns)
return {
"input_ids": tokens.input_ids,
"attention_mask": tokens.attention_mask,
"labels": labels,
}
def find(
self, offsets: List[Tuple[int, int]], nouns: List[Span]
) -> List[Tuple[int, int]]:
starts = {
start: index for index, (start, end) in enumerate(offsets) if start != end
}
ends = {
end: index for index, (start, end) in enumerate(offsets) if start != end
}
spans = [
(starts[noun.start_char], 1 + ends[noun.end_char] - starts[noun.start_char])
for noun in nouns
if noun.start_char in starts and noun.end_char in ends
]
return [(start, length) for start, length in spans if length > 0]
class NounExtractor:
def __init__(self, spacy_name: str) -> None:
self.nlp = spacy.load(spacy_name)
self.matcher = Matcher(self.nlp.vocab)
self.matcher.add(
"nouns",
[
[{"POS": {"IN": ["NOUN", "PROPN"]}, "OP": "+"}],
],
)
def get_nouns(self, text: str) -> List[Span]:
doc = self.nlp(text)
return self.matcher(doc, as_spans=True)
class InputPromptExpander:
def __init__(
self,
model_name: str,
prompt: str,
) -> None:
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.prompt = prompt
def __call__(self, row: pd.Series) -> List[int]:
return self.add_prompt(input_ids=row.input_ids, noun=row.labels[0])
def add_prompt(self, input_ids: List[int], noun: Tuple[int, int]) -> List[int]:
input_text = self.tokenizer.decode(input_ids, skip_special_tokens=True)
input_text = input_text.strip()
if not input_text[-1] in {".", "?", "!"}:
input_text = input_text + "."
noun_start, noun_length = noun
noun = self.tokenizer.decode(input_ids[noun_start : noun_start + noun_length])
noun = noun.strip()
# capitalize is a string function, but that lowercases the rest of the word
# if we have an acronym or similar then it might all be capital letters already
noun = noun[0].upper() + noun[1:]
prompt = self.prompt.format(noun)
return self.tokenizer(input_text + prompt)
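To make the labels column a little more concrete, here is a minimal sketch of how a single sentence flows through the Encoder. It assumes the spaCy pipelines listed in NLP_NAMES have already been downloaded; the sentence is just an example.
encoder = Encoder(model_name="xlm-roberta-base")
row = encoder.encode(text="I was in the mountains.", language="eng")

# labels holds (token_start, token_length) spans, one for each detected noun
# that lines up with token boundaries; the dataset is later filtered to rows
# where exactly one such span was found
print(row["labels"])
print(encoder.tokenizer.decode(row["input_ids"]))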
This reads the Tatoeba data, joins it to pair each sentence with its translation, and then filters it down to English source sentences with translations in the target languages.
sentences_df = read_tarfile(SENTENCES_FILE, names=["id", "language", "text"])
sentences_df = sentences_df.set_index("id", drop=True)
# example is [1, 77]
# sentence 77 is a translation of sentence 1
links_df = read_tarfile(LINKS_FILE, names=["sentence_id", "translation_id"])
print(f"Read {len(sentences_df):,} sentences and {len(links_df):,} links")
# join the sentences together via the links to get the translations
translation_map = links_df.set_index("sentence_id").translation_id.to_dict()
translation_df = sentences_df.copy()
translation_df["translation_id"] = sentences_df.index.map(translation_map)
translation_df = translation_df.dropna()
translation_df["translation_language"] = translation_df.translation_id.map(sentences_df.language)
translation_df["translation_text"] = translation_df.translation_id.map(sentences_df.text)
translation_df = translation_df.drop(columns="translation_id")
translation_df
# restrict the translations to a source of english and a translation of non-english
english_df = translation_df[(translation_df.language == "eng") & (translation_df.translation_language != "eng")]
english_df
# further restrict the translations to the 7 languages we have spacy models for
restricted_df = english_df[
english_df.translation_language.isin({
"deu",
"fra",
"ita",
"jpn",
"por",
"rus",
"spa",
})
].copy()
Read 10,467,414 sentences and 21,259,444 links
This then uses spaCy to find the nouns in the English and translated sentences. With these, the rows can be filtered down to those that have exactly one noun on each side.
Then the text is tokenized and the prompt is appended to the English text, as that is what will be provided to the teacher.
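The cell that produced the dataframe below is not shown; it would have been a call to encode using the default xlm-roberta-base settings, roughly like this (the exact invocation is an assumption):
df = encode(translations_df=restricted_df)
df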
 | language | text | translation_language | translation_text | teacher_input_ids | input_ids | attention_mask | labels |
---|---|---|---|---|---|---|---|---|
0 | eng | I was in the mountains. | fra | J'étais à la montagne. | [0, 100, 21, 11, 5, 9787, 4, 5106, 35, 8563, 6... | [0, 863, 108, 1140, 4349, 354, 6534, 897, 2712... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] | [(8, 2)] |
1 | eng | I told them to send me another ticket. | deu | Ich sagte ihnen, sie sollen mir eine neue Fahr... | [0, 100, 174, 106, 7, 2142, 162, 277, 3682, 4,... | [0, 100, 611, 17929, 859, 939, 10245, 225, 6, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [(19, 4)] |
2 | eng | It depends on the context. | deu | Das hängt vom Kontext ab. | [0, 243, 7971, 15, 5, 5377, 4, 5106, 35, 8563,... | [0, 495, 281, 1368, 1561, 2590, 90, 23953, 112... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] | [(8, 2)] |
3 | eng | The Germans are very crafty. | ita | I tedeschi sono molto furbi. | [0, 133, 18415, 32, 182, 6306, 219, 4, 5106, 3... | [0, 100, 44817, 293, 7930, 979, 139, 13200, 56... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] | [(2, 3)] |
4 | eng | But the universe is infinite. | rus | Однако Вселенная бесконечна. | [0, 1708, 5, 9468, 16, 32952, 4, 5106, 35, 856... | [0, 25417, 17772, 37947, 36765, 26161, 41171, ... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [(9, 9)] |
... | ... | ... | ... | ... | ... | ... | ... | ... |
119449 | eng | These batteries can be recharged. | deu | Diese Batterien sind wiederaufladbar. | [0, 4528, 10535, 64, 28, 769, 18716, 4, 5106, ... | [0, 495, 918, 242, 163, 7933, 2495, 579, 2028,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [(4, 3)] |
119450 | eng | It still works? "Yup, all it needs is new batt... | spa | ¿Eso todavía funciona? "Si, solo necesita pila... | [0, 243, 202, 1364, 116, 22, 975, 658, 6, 70, ... | [0, 4056, 9470, 717, 2527, 7, 417, 1469, 5272,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [(19, 2)] |
119451 | eng | I can write kanji. | jpn | 私は漢字が書けます。 | [0, 100, 64, 3116, 9864, 5186, 4, 5106, 35, 85... | [0, 36714, 6248, 10172, 48549, 37127, 4394, 72... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [(7, 3)] |
119452 | eng | They ran to the scene. | jpn | 現場へと走った。 | [0, 1213, 2075, 7, 5, 1310, 4, 5106, 35, 8563,... | [0, 36714, 9357, 4726, 42393, 21402, 20024, 46... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [(3, 4)] |
119453 | eng | Where exactly did you meet Tom? | jpn | そもそも、どこでトムと会ったの? | [0, 13841, 2230, 222, 47, 972, 1560, 116, 5106... | [0, 46311, 46, 48885, 46311, 46, 48885, 47341,... | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | [(12, 2)] |
119454 rows × 8 columns
from typing import Tuple
import datasets
import pandas as pd
train_languages = {
"rus",
"deu",
"ita",
"fra",
"jpn",
}
test_languages = {
"por",
"spa",
}
def to_datasets(df: pd.DataFrame) -> Tuple[datasets.Dataset, datasets.Dataset]:
train_ds = datasets.Dataset.from_pandas(
df[df.translation_language.isin(train_languages)]
)
test_ds = datasets.Dataset.from_pandas(
df[df.translation_language.isin(test_languages)]
)
return train_ds, test_ds
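The row counts that follow come from applying this split to the encoded dataframe; the cell itself is not shown above, but it would have been along these lines:
train_ds, test_ds = to_datasets(df)
print(f"the train dataset is {len(train_ds):,} rows")
print(f"the test dataset is {len(test_ds):,} rows")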
the train dataset is 96,303 rows
the test dataset is 23,151 rows
Previously there were 96,680 training rows and 23,310 test rows.
Now that we have the encoded data, we can quickly look at the difference in the prompt:
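A sketch of that inspection, assuming the default xlm-roberta-base tokenizer that encode uses:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# decode the prompted teacher input for the first row, keeping the special tokens
tokenizer.decode(df.iloc[0].teacher_input_ids)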
'<s>I was in the mountains. Pet: Dog, Color: Yellow, Vehicle: Tractor, Fruit: Banana,<mask>: Mountains</s>'
As we can see, the noun in the prompt (Mountains) is capitalized. Hopefully this makes the model slightly easier to train.
Now that we have automated this we can create versions for different models. Both MBART (Liu et al. 2020) and T5 (Raffel et al. 2019) seem to be viable for this task.
df = encode(
translations_df=restricted_df,
model_name="t5-base",
prompt=" Pet: Dog, Color: Yellow, Vehicle: Tractor, Fruit: Banana,<mask>: {}",
)
df.to_parquet(DATA_FOLDER / "t5.gz.parquet", compression="gzip")
train_ds, test_ds = to_datasets(df)
train_ds.save_to_disk(DATA_FOLDER / "t5-train.dataset")
test_ds.save_to_disk(DATA_FOLDER / "t5-test.dataset")
print(f"the train dataset is {len(train_ds):,} rows")
print(f"the test dataset is {len(test_ds):,} rows")
/home/matthew/.cache/pypoetry/virtualenvs/blog-HrtMnrOS-py3.10/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py:156: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.
warnings.warn(
the train dataset is 92,292 rows
the test dataset is 23,156 rows
The difference in dataset size is due to the difficulty of extracting the noun spans from the tokens: the spans are aligned through character offsets, and a different tokenizer produces different offsets, so some rows get dropped. I've tried encoding with MBART, but its tokenizer requires source and target languages for translation, which does not suit this task.
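For reference, this is roughly where MBART gets awkward: its tokenizer is framed around a translation direction, so a source and target language code are expected up front. The model name and language codes here are only illustrative.
from transformers import MBartTokenizerFast

# the MBART tokenizer wants a translation direction, which does not fit
# encoding the English and translated sentences independently
tokenizer = MBartTokenizerFast.from_pretrained(
    "facebook/mbart-large-cc25",
    src_lang="en_XX",
    tgt_lang="de_DE",
)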