Code
import blog.transformers_logging
July 10, 2022
This is a further investigation into model collapse. The investigation of prompt internalization has been hindered by the collapse and that makes me think that the task itself is not stable. If a model that is trained for a long time collapses then I may have to review the structure of the task itself.
Just smashing everything in from the last time.
# from src/main/python/blog/prompt_internalization/multilingual/roberta/trainer.py
from itertools import starmap
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments
from transformers.modeling_outputs import MaskedLMOutput
class MultilingualMaskedPromptInternalizationTrainingArguments(TrainingArguments):
    """
    Training arguments extended with the settings the distillation trainer reads.

    Attributes set in addition to those of ``TrainingArguments``:
        temperature: divisor applied to the logits before the softmax in the loss.
        mean_prediction: selects the mean-over-span student prediction instead
            of the first-token prediction.
        ignore_tokens: token ids whose teacher logits are overwritten with the
            minimum logit; stored as an empty list when not supplied.
    """

    def __init__(
        self,
        *args,
        temperature: float = 2.0,
        mean_prediction: bool = True,
        ignore_tokens: Optional[List[int]] = None,
        **kwargs,
    ) -> None:
        # Everything that is not one of the three extra settings is forwarded
        # untouched to the stock TrainingArguments constructor.
        super().__init__(*args, **kwargs)
        self.temperature = temperature
        self.mean_prediction = mean_prediction
        self.ignore_tokens = [] if ignore_tokens is None else ignore_tokens
class MultilingualMaskedPromptInternalizationTrainer(Trainer):
    """
    Trainer that fits a student masked language model to a frozen teacher.

    The student sees ``input_ids`` / ``attention_mask`` and produces a
    distribution over the vocabulary at the position(s) described by
    ``labels``. The teacher sees ``teacher_input_ids`` /
    ``teacher_attention_mask`` and produces a distribution at its mask
    token(s). The loss is the temperature-scaled KL divergence between the two.
    """

    def __init__(
        self,
        *args,
        teacher_model: AutoModelForMaskedLM = None,
        tokenizer: AutoTokenizer = None,
        **kwargs,
    ) -> None:
        """
        Args:
            teacher_model: model that provides the target distribution; it is
                moved to the student's device and put in eval mode.
            tokenizer: only used to look up ``mask_token_id``; it is not
                forwarded to the base ``Trainer``.
            *args, **kwargs: forwarded to ``Trainer.__init__``.

        Raises:
            ValueError: if ``teacher_model`` or ``tokenizer`` is missing.
        """
        # Both are keyword arguments with a None default only so they can sit
        # after *args; fail early with a clear message instead of an
        # AttributeError further down.
        if teacher_model is None:
            raise ValueError("teacher_model is required")
        if tokenizer is None:
            raise ValueError("tokenizer is required")
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        self._move_model_to_device(self.teacher, self.model.device)
        self.teacher.eval()
        self.mask_token_id = tokenizer.mask_token_id

    def compute_loss(
        self,
        model: AutoModelForMaskedLM,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        return_outputs: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
        """
        Compute the distillation loss for one batch.

        Args:
            model: the student model.
            inputs: batch with the keys ``input_ids``, ``attention_mask``,
                ``labels``, ``teacher_input_ids`` and ``teacher_attention_mask``.
            return_outputs: when true, also return the per-row metrics.

        Returns:
            The loss, or ``(loss, metrics)`` where ``metrics`` has one row per
            example holding ``[kl_div, overlap]``.
        """
        outputs: MaskedLMOutput = model(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
        )
        if self.args.mean_prediction:
            predictions = self._student_predictions_mean(
                outputs=outputs, labels=inputs["labels"]
            )
        else:
            predictions = self._student_predictions_first(
                outputs=outputs, labels=inputs["labels"]
            )
        targets = self._teacher_predictions(
            input_ids=inputs["teacher_input_ids"],
            attention_mask=inputs["teacher_attention_mask"],
        )
        loss = self._loss(predictions=predictions, targets=targets)
        if not return_outputs:
            return loss

        # The kl_div and overlap metrics are calculated directly on the
        # tensors; this is much faster than waiting for cpu numpy.
        with torch.inference_mode():
            kl_div = F.kl_div(
                input=F.log_softmax(predictions.to(torch.float32), dim=-1),
                target=F.softmax(targets.to(torch.float32), dim=-1),
                reduction="none",
                log_target=False,
            ).sum(dim=1)

            # Overlap is the fraction of the student's top 10 tokens that also
            # appear in the teacher's top 10, compared row by row.
            top_predictions = predictions.argsort(descending=True)[:, :10]
            top_targets = targets.argsort(descending=True)[:, :10]
            matches = (top_predictions[:, :, None] == top_targets[:, None, :]).any(
                dim=-1
            )
            overlap = matches.sum(dim=-1) / 10

            # Reshape the metrics to [batch_size, 2] so that they are passed
            # through to the metric calculation as one prediction tensor.
            metric_output = torch.cat([kl_div[:, None], overlap[:, None]], dim=1)
        return loss, metric_output

    @torch.inference_mode()
    def _teacher_predictions(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Return the teacher logits at every mask-token position.

        The result has one row per mask token found in ``input_ids``.
        NOTE(review): the rows only line up with the student predictions if
        each teacher input contains exactly one mask token - confirm against
        the dataset.
        """
        outputs_teacher = self.teacher(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        mask_indices = input_ids == self.mask_token_id
        teacher_predictions = outputs_teacher.logits[mask_indices]
        # Push the ignored tokens down to the lowest logit so they drop out of
        # the target distribution. Nothing to do when no tokens are ignored or
        # no mask token was found.
        if self.args.ignore_tokens and teacher_predictions.numel() > 0:
            teacher_predictions[:, self.args.ignore_tokens] = (
                teacher_predictions.min()
            )
        return teacher_predictions

    def _student_predictions_mean(
        self, outputs: MaskedLMOutput, labels: torch.Tensor
    ) -> torch.Tensor:
        """
        Return, per row, the mean of the student logits over the labelled span.

        Each row of ``labels`` is read as ``(start, length)``.
        Assumes ``outputs.logits`` is [batch, sequence, vocabulary].
        """
        logits = outputs.logits
        # One vocabulary-sized row per example. Assigning each row in place
        # keeps the result attached to the autograd graph of ``logits``.
        predictions = logits.new_zeros((logits.shape[0], logits.shape[-1]))
        for index, (start, length) in enumerate(labels.tolist()):
            predictions[index] = logits[index, start : start + length].mean(dim=0)
        return predictions

    def _student_predictions_first(
        self,
        outputs: MaskedLMOutput,
        labels: torch.Tensor,
    ) -> torch.Tensor:
        """Return, per row, the student logits at the first labelled position."""
        logits = outputs.logits
        rows = torch.arange(logits.shape[0], device=logits.device)
        return logits[rows, labels[:, 0]]

    def _loss(self, predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """
        Temperature-scaled KL divergence between student and teacher logits.

        Both sets of logits are divided by the temperature before the softmax
        and the batch-mean divergence is multiplied by the squared temperature.
        """
        predictions = F.log_softmax(
            predictions.to(torch.float32) / self.args.temperature, dim=-1
        )
        targets = F.softmax(targets.to(torch.float32) / self.args.temperature, dim=-1)
        loss = F.kl_div(
            input=predictions,
            target=targets,
            reduction="batchmean",
            log_target=False,
        )
        return loss * (self.args.temperature**2)
# from src/main/python/blog/prompt_internalization/multilingual/roberta/collator.py
from typing import Any, Dict, List
from transformers import AutoTokenizer
class TeacherStudentCollator:
    """
    Collate rows that carry both a student input and a teacher input.

    The teacher inputs need to be padded separately from the student inputs
    and need their own attention mask, so each side is padded with its own
    call to the tokenizer.
    """

    def __init__(self, tokenizer: AutoTokenizer) -> None:
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Build one batch holding the teacher and student tensors."""
        batch = {}
        batch.update(self._teacher_inputs(features))
        batch.update(self._student_inputs(features))
        # Normalise the alternative label column names to "labels".
        for alias in ("label", "label_ids"):
            if alias in batch:
                batch["labels"] = batch.pop(alias)
        return batch

    def _teacher_inputs(self, features: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
        """Pad the teacher token ids and return them under teacher_* keys."""
        padded = self.tokenizer.pad(
            [{"input_ids": row["teacher_input_ids"]} for row in features],
            padding=True,
            return_tensors="pt",
        )
        return {
            "teacher_input_ids": padded["input_ids"],
            "teacher_attention_mask": padded["attention_mask"],
        }

    def _student_inputs(self, features: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
        """Pad the student token ids, carrying the first label entry of each row."""
        rows = []
        for row in features:
            # labels is known to have a single entry
            rows.append({"input_ids": row["input_ids"], "labels": row["labels"][0]})
        return self.tokenizer.pad(rows, padding=True, return_tensors="pt")
# from src/main/python/blog/prompt_internalization/multilingual/roberta/metrics.py
from typing import Dict
from transformers import EvalPrediction
def compute_metrics(model_output: EvalPrediction) -> Dict[str, float]:
    """
    Average the per-example metrics produced by the trainer.

    Args:
        model_output: evaluation output whose ``predictions`` is indexed as a
            two-column array: column 0 is the KL divergence and column 1 is
            the top-10 overlap.

    Returns:
        The mean of each column as built-in floats, keyed ``kl_div`` and
        ``overlap``.
    """
    predictions = model_output.predictions
    # .mean() returns a NumPy scalar; convert it so the result matches the
    # annotated return type.
    return {
        "kl_div": float(predictions[:, 0].mean()),
        "overlap": float(predictions[:, 1].mean()),
    }
# from src/main/python/blog/prompt_internalization/multilingual/roberta/train.py
from pathlib import Path
from typing import List, Optional
import datasets
from transformers import AutoModelForMaskedLM, AutoTokenizer
from .collator import TeacherStudentCollator
from .metrics import compute_metrics
from .trainer import (
MultilingualMaskedPromptInternalizationTrainer,
MultilingualMaskedPromptInternalizationTrainingArguments,
)
# Where the prepared train / test datasets are read from.
DATASET_FOLDER = Path("/data/tatoeba/2022-06-18/dataset/")
# Where the trained student models are written, one folder per run name.
MODEL_FOLDER = Path("/data/prompt-internalization/multilingual/")
# Scratch location for trainer checkpoints and logs.
RUN_FOLDER = Path("/tmp/runs")

# The output folders are created when the module is imported.
for _folder in (MODEL_FOLDER, RUN_FOLDER):
    _folder.mkdir(parents=True, exist_ok=True)
def train(
    *,
    model_name: str = "xlm-roberta-base",
    dataset_name: str = "xlm-roberta",
    batch_size: int = 64,
    learning_rate: float = 1e-4,
    temperature: float = 2,
    fp16: bool = False,
    mean_prediction: bool = False,
    ignore_tokens: Optional[List[int]] = None,
    epochs: Optional[float] = 2,
    max_steps: int = -1,
    evaluation_steps: int = 500,
) -> Path:
    """
    Train a student copy of ``model_name`` against a teacher copy of the same model.

    The run name is assembled from the hyperparameters and is used as the
    folder name, under ``MODEL_FOLDER``, that the trained student is saved to.

    Args:
        model_name: checkpoint loaded for the tokenizer, the teacher and the student.
        dataset_name: prefix of the ``-train.dataset`` / ``-test.dataset``
            folders under ``DATASET_FOLDER``.
        batch_size: per-device batch size for both training and evaluation.
        learning_rate: learning rate passed to the training arguments.
        temperature: softmax temperature used by the loss.
        fp16: passed to the training arguments as ``fp16``.
        mean_prediction: use the mean-over-span student prediction.
        ignore_tokens: token ids removed from the teacher distribution.
        epochs: number of training epochs; named in the run name only when
            ``max_steps`` is -1.
        max_steps: step limit passed to the training arguments; -1 leaves it unset.
        evaluation_steps: interval used for logging, evaluation and checkpoints.

    Returns:
        The folder the student model was saved to.
    """
    name_parts = [
        f"{model_name}",
        f"ms{max_steps}" if max_steps != -1 else f"e{epochs}",
        f"bs{batch_size}",
        f"lr{learning_rate}",
        f"t{temperature}",
    ]
    if fp16:
        name_parts.append("fp16")
    if mean_prediction:
        name_parts.append("mean")
    if ignore_tokens:
        name_parts.append(f"it{len(ignore_tokens)}")
    run_name = "-".join(name_parts)
    print(f"Starting {run_name}")

    train_ds = datasets.load_from_disk(DATASET_FOLDER / f"{dataset_name}-train.dataset")
    test_ds = datasets.load_from_disk(DATASET_FOLDER / f"{dataset_name}-test.dataset")

    training_args = MultilingualMaskedPromptInternalizationTrainingArguments(
        report_to="none",
        output_dir=RUN_FOLDER,
        num_train_epochs=epochs,
        max_steps=max_steps,
        seed=33,
        # number of steps before moving evaluation results from GPU to CPU see
        # https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941
        eval_accumulation_steps=5,
        #
        # hyperparameters
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        fp16=fp16,
        temperature=temperature,
        mean_prediction=mean_prediction,
        ignore_tokens=ignore_tokens,
        learning_rate=learning_rate,
        #
        # evaluation settings: log, evaluate and checkpoint on the same interval
        evaluation_strategy="steps",
        logging_steps=evaluation_steps,
        eval_steps=evaluation_steps,
        save_steps=evaluation_steps,
        #
        # checkpoint settings: keep the checkpoint with the highest overlap
        logging_dir=RUN_FOLDER / "logs",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="overlap",
        greater_is_better=True,
        # the collator needs the teacher columns, so nothing may be dropped
        remove_unused_columns=False,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    teacher_model = AutoModelForMaskedLM.from_pretrained(model_name)
    student_model = AutoModelForMaskedLM.from_pretrained(model_name)
    data_collator = TeacherStudentCollator(tokenizer=tokenizer)

    trainer = MultilingualMaskedPromptInternalizationTrainer(
        model=student_model,
        args=training_args,
        teacher_model=teacher_model,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    student_model.save_pretrained(MODEL_FOLDER / run_name)
    return MODEL_FOLDER / run_name
# from src/main/python/blog/prompt_internalization/multilingual/roberta/evaluate.py
from pathlib import Path
from typing import List, Optional, Tuple
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
def evaluate(
    model_name: str, model_path: Path, ignore_tokens: Optional[List[int]] = None
) -> None:
    """
    Load a trained model and print every qualitative evaluation.

    Args:
        model_name: checkpoint name the tokenizer is loaded from.
        model_path: folder the trained model is loaded from.
        ignore_tokens: token ids excluded from the printed descriptions;
            ``None`` means no tokens are excluded.
    """
    ignored = [] if ignore_tokens is None else ignore_tokens
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_path)
    model.eval()

    # Run the evaluations in a fixed order so the printed report is stable.
    suites = (
        bass_evaluation,
        friday_evaluation,
        malibu_evaluation,
        football_evaluation,
    )
    for suite in suites:
        suite(model=model, tokenizer=tokenizer, ignore_tokens=ignored)
def bass_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """
    Print the descriptions of "bass" in three sentences that use it differently.

    The predicted words for each sentence are printed, followed by the sorted
    words shared by each pair of sentences.
    """

    def describe(text: str) -> List[str]:
        # All three sentences are probed for the same noun.
        return get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=text,
            noun="bass",
            ignore_tokens=ignore_tokens,
        )

    first_phrase = "We spotted a large bass in the ocean."
    second_phrase = "The bass player did not receive the acknowledgment she deserves."
    third_phrase = "The black sea bass, is a member of the wreckfish family."

    first_words, second_words, third_words = map(
        describe, (first_phrase, second_phrase, third_phrase)
    )
    first_set = set(first_words)
    second_set = set(second_words)
    third_set = set(third_words)

    print("=== BASS EVALUATION ===")
    print(f"First Phrase is: {first_phrase} Target is: bass")
    print(f"Description is: {', '.join(first_words)}")
    print()
    print(f"Second Phrase is: {second_phrase} Target is: bass")
    print(f"Description is: {', '.join(second_words)}")
    print()
    print(f"Third Phrase is: {third_phrase} Target is: bass")
    print(f"Description is: {', '.join(third_words)}")
    print()
    print(f"First & Second: {sorted(first_set & second_set)}")
    print(f"First & Third: {sorted(first_set & third_set)}")
    print(f"Second & Third: {sorted(second_set & third_set)}")
    print()
def friday_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """
    Print the descriptions of "Friday" in a Spanish and an English sentence.

    The two word lists are printed together with the words they share and the
    words that appear in only one of them.
    """

    def describe(text: str) -> List[str]:
        return get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=text,
            noun="Friday",
            ignore_tokens=ignore_tokens,
        )

    spanish_text = "Friday es mi canción favorita."
    english_text = "Friday is my favourite song."
    spanish_words = describe(spanish_text)
    english_words = describe(english_text)

    spanish_set, english_set = set(spanish_words), set(english_words)
    shared = spanish_set & english_set
    unshared = spanish_set ^ english_set

    print("=== FRIDAY EVALUATION ===")
    print(f"Spanish Phrase is: {spanish_text}")
    print(f"Spanish Description is: {', '.join(spanish_words)}")
    print(f"English Phrase is: {english_text}")
    print(f"English Description is: {', '.join(english_words)}")
    print()
    print(f"Description Overlap is: {', '.join(sorted(shared))}")
    print(f"Description Difference is: {', '.join(sorted(unshared))}")
    print()
def malibu_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """
    Print the descriptions of the two uses of "Malibu" within one sentence.

    The first occurrence is the car and the second the drink; the shared and
    unshared words of the two descriptions are printed after them.
    """
    text = "I like to drive my Malibu while drinking Malibu."

    def describe(occurrence: int) -> List[str]:
        # occurrence selects which "Malibu" in the sentence is described.
        return get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=text,
            noun="Malibu",
            index=occurrence,
            ignore_tokens=ignore_tokens,
        )

    car_words = describe(0)
    drink_words = describe(1)
    car_set, drink_set = set(car_words), set(drink_words)

    print("=== MALIBU EVALUATION ===")
    print(f"Phrase is: {text}")
    print(f"First Malibu (car) Description is: {', '.join(car_words)}")
    print(f"Second Malibu (drink) Description is: {', '.join(drink_words)}")
    print()
    print(f"First & Second: {sorted(car_set & drink_set)}")
    print(f"First ^ Second: {sorted(car_set ^ drink_set)}")
    print()
def football_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """
    Compare descriptions of matching words in a Spanish passage and its translation.

    For each (Spanish word, English word) pair the two descriptions are
    printed along with their shared and unshared words and the size of each set.
    """
    spanish_phrase = (
        "Retiremos el equipo de la cancha, "
        "Boca no merece jugar esta copa que "
        "hace tiempo viene siendo desprestigiada.\n"
        "Ya no se juega al futbol."
    )
    english_phrase = (
        "Let's remove the team from the field, "
        "Boca does not deserve to play this cup that "
        "has long been discredited. "
        "Football is no longer played."
    )
    word_pairs = (
        ("equipo", "team"),
        ("Boca", "Boca"),
        ("copa", "cup"),
        ("tiempo", "long"),
        ("futbol", "Football"),
    )

    def describe(text: str, noun: str) -> List[str]:
        return get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=text,
            noun=noun,
            ignore_tokens=ignore_tokens,
        )

    print("=== FOOTBALL EVALUATION ===")
    print(f"Spanish Phrase is: {spanish_phrase}")
    print(f"English Phrase is: {english_phrase}")
    print()

    for spanish_noun, english_noun in word_pairs:
        spanish_description = describe(spanish_phrase, spanish_noun)
        english_description = describe(english_phrase, english_noun)
        spanish_set = set(spanish_description)
        english_set = set(english_description)
        shared = spanish_set & english_set
        unshared = spanish_set ^ english_set

        print(f"Spanish word is: {spanish_noun}, English word is: {english_noun}")
        print(f"Spanish Description is: {', '.join(spanish_description)}")
        print(f"English Description is: {', '.join(english_description)}")
        print(f"Overlap is: {', '.join(sorted(shared))} ({len(shared)})")
        print(f"Difference is: {', '.join(sorted(unshared))} ({len(unshared)})")
        print()
@torch.inference_mode()
def get_predictions(
    *,
    model: AutoModelForMaskedLM,
    tokenizer: AutoTokenizer,
    text: str,
    noun: str,
    index: int = 0,
    ignore_tokens: Optional[List[int]] = None,
) -> List[str]:
    """
    Return the ten highest scoring words at the first token of a noun.

    Args:
        model: masked language model used to score the text.
        tokenizer: tokenizer matching the model.
        text: sentence that contains the noun.
        noun: word to describe; located in the tokenized text with ``get_noun``.
        index: which occurrence of the noun to use, counted from zero.
        ignore_tokens: token ids that are pushed down to the lowest logit
            before ranking; ``None`` means no tokens are ignored.

    Returns:
        The decoded, whitespace-stripped words for the ten largest logits, in
        descending order of logit.
    """
    ignored = [] if ignore_tokens is None else ignore_tokens
    encoded = tokenizer(text, return_tensors="pt")
    # Only the start of the noun span is used; the end is not needed here.
    noun_start, _ = get_noun(
        tokenizer=tokenizer, tokens=encoded.input_ids[0], noun=noun, index=index
    )
    scores = model(**encoded).logits[0, noun_start]
    scores[ignored] = scores.min()
    best_tokens = scores.argsort(descending=True)[:10]
    return [word.strip() for word in tokenizer.batch_decode(best_tokens)]
def get_noun(
    tokenizer: AutoTokenizer, tokens: torch.Tensor, noun: str, index: int
) -> Tuple[int, int]:
    """
    Locate the token span that decodes to ``noun``.

    Every start position whose single token decodes to a prefix of the noun
    is tried, and the span is grown one token at a time until the decoded,
    stripped text equals the noun.

    Args:
        tokenizer: tokenizer used to decode candidate spans.
        tokens: one-dimensional tensor of token ids.
        noun: exact text to find.
        index: which occurrence to return, counted from zero.

    Returns:
        ``(start, end)`` with ``end`` exclusive, so ``tokens[start:end]``
        decodes to the noun.

    Raises:
        AssertionError: if the requested occurrence is not present.
    """
    length = tokens.shape[0]
    remaining = index
    # Occurrences are not allowed to overlap: once a span has been counted the
    # scan resumes after it, so one occurrence is never counted twice.
    resume_at = 0
    for start_index in range(length):
        if start_index < resume_at:
            continue
        word = tokenizer.decode(tokens[start_index]).strip()
        if not noun.startswith(word):
            continue
        # The end is exclusive, so it has to reach ``length`` for a noun that
        # finishes on the final token to be found.
        for end_index in range(start_index + 1, length + 1):
            word = tokenizer.decode(tokens[start_index:end_index]).strip()
            if word != noun:
                continue
            if remaining <= 0:
                return start_index, end_index
            remaining -= 1
            resume_at = end_index
            break
    raise AssertionError(f"Did not find {noun}[{index}] in {tokenizer.decode(tokens)}")
20 epochs is about 10x longer than I have been training for so far.
Starting xlm-roberta-base-e20-bs32-lr0.0001-t2
/home/matthew/.cache/pypoetry/virtualenvs/blog-HrtMnrOS-py3.10/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
Step | Training Loss | Validation Loss | Kl Div | Overlap |
---|---|---|---|---|
1000 | 0.403300 | 0.280854 | 0.256531 | 0.683585 |
2000 | 0.288300 | 0.259887 | 0.240529 | 0.694900 |
3000 | 0.262300 | 0.261534 | 0.245546 | 0.698782 |
4000 | 0.225200 | 0.249906 | 0.228982 | 0.707599 |
5000 | 0.217900 | 0.240995 | 0.226439 | 0.712368 |
6000 | 0.211700 | 0.236759 | 0.219825 | 0.718522 |
7000 | 0.186700 | 0.240164 | 0.225746 | 0.713896 |
8000 | 0.181800 | 0.234820 | 0.216973 | 0.721330 |
9000 | 0.180100 | 0.228655 | 0.213052 | 0.722672 |
10000 | 0.160700 | 0.231911 | 0.216320 | 0.723374 |
11000 | 0.157300 | 0.227415 | 0.213988 | 0.726265 |
12000 | 0.159000 | 0.228098 | 0.211121 | 0.724159 |
13000 | 0.141000 | 0.226735 | 0.210241 | 0.729279 |
14000 | 0.138900 | 0.224571 | 0.207329 | 0.728123 |
15000 | 0.139900 | 0.225365 | 0.210097 | 0.728846 |
16000 | 0.126000 | 0.228829 | 0.214268 | 0.727834 |
17000 | 0.124400 | 0.226964 | 0.211337 | 0.729919 |
18000 | 0.123600 | 0.227304 | 0.211372 | 0.726987 |
19000 | 0.113800 | 0.226793 | 0.211822 | 0.729589 |
20000 | 0.109600 | 0.228013 | 0.211201 | 0.730952 |
21000 | 0.113300 | 0.224208 | 0.209901 | 0.730745 |
22000 | 0.102500 | 0.225952 | 0.210528 | 0.731695 |
23000 | 0.100200 | 0.222585 | 0.209529 | 0.730849 |
24000 | 0.101700 | 0.222588 | 0.207854 | 0.733223 |
25000 | 0.096400 | 0.223735 | 0.209063 | 0.734669 |
26000 | 0.091300 | 0.221564 | 0.204739 | 0.733430 |
27000 | 0.092500 | 0.220193 | 0.204844 | 0.733843 |
28000 | 0.087600 | 0.220678 | 0.203973 | 0.736011 |
29000 | 0.083700 | 0.218368 | 0.203532 | 0.737291 |
30000 | 0.085100 | 0.224064 | 0.207729 | 0.733987 |
31000 | 0.082200 | 0.220344 | 0.206388 | 0.737167 |
32000 | 0.078200 | 0.223062 | 0.207357 | 0.735866 |
33000 | 0.078000 | 0.223453 | 0.208466 | 0.734565 |
34000 | 0.076200 | 0.221971 | 0.206944 | 0.737229 |
35000 | 0.071800 | 0.218891 | 0.203820 | 0.738592 |
36000 | 0.073300 | 0.220247 | 0.205179 | 0.737725 |
37000 | 0.071100 | 0.219950 | 0.205713 | 0.737043 |
38000 | 0.067000 | 0.218195 | 0.203988 | 0.738902 |
39000 | 0.068200 | 0.219006 | 0.203538 | 0.737910 |
40000 | 0.067100 | 0.216856 | 0.202330 | 0.739025 |
41000 | 0.063200 | 0.217489 | 0.202965 | 0.738798 |
42000 | 0.064100 | 0.217092 | 0.202647 | 0.738860 |
43000 | 0.062700 | 0.216701 | 0.203053 | 0.739872 |
44000 | 0.060000 | 0.218087 | 0.203738 | 0.739542 |
45000 | 0.060000 | 0.218724 | 0.204076 | 0.739418 |
46000 | 0.059400 | 0.214085 | 0.200080 | 0.741483 |
47000 | 0.056200 | 0.215620 | 0.201491 | 0.740595 |
48000 | 0.057100 | 0.217038 | 0.202258 | 0.741400 |
49000 | 0.056500 | 0.215010 | 0.200240 | 0.742288 |
50000 | 0.053800 | 0.214259 | 0.199056 | 0.742370 |
51000 | 0.054400 | 0.215045 | 0.200357 | 0.742329 |
52000 | 0.053700 | 0.214869 | 0.199806 | 0.741937 |
53000 | 0.051600 | 0.215995 | 0.201306 | 0.741710 |
54000 | 0.051500 | 0.213915 | 0.199177 | 0.742907 |
55000 | 0.051700 | 0.215324 | 0.200396 | 0.742143 |
56000 | 0.049500 | 0.214356 | 0.199690 | 0.742453 |
57000 | 0.049700 | 0.214086 | 0.199557 | 0.742825 |
58000 | 0.049800 | 0.213373 | 0.198633 | 0.744518 |
59000 | 0.048100 | 0.213276 | 0.198745 | 0.744167 |
60000 | 0.048300 | 0.212866 | 0.198230 | 0.744229 |
61000 | 0.048100 | 0.212841 | 0.198155 | 0.744539 |
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Location, Type, Description, Size, Area, Color, Position, View, Status, Material
Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Description, Status, Type, Name, Title, Owner, Rating, Position, Location, Feature
Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Type, Status, Family, Owner, Description, Name, Country, Age, Location, Animal
First & Second: ['Description', 'Location', 'Position', 'Status', 'Type']
First & Third: ['Description', 'Location', 'Status', 'Type']
Second & Third: ['Description', 'Location', 'Name', 'Owner', 'Status', 'Type']
=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Description, Tags, Tag, Status, Theme, Location, Title, Album, Category, Labels
English Phrase is: Friday is my favourite song.
English Description is: Description, Tags, Tag, Status, Album, Location, Title, Theme, Labels, Comments
Description Overlap is: Album, Description, Labels, Location, Status, Tag, Tags, Theme, Title
Description Difference is: Category, Comments
=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Food, Color, Type, Name, Material, Product, Weight, Plant, Animal, Fruit
Second Malibu (drink) Description is: Food, Color, Drink, Name, Type, Material, Fruit, Weight, Product, Wine
First & Second: ['Color', 'Food', 'Fruit', 'Material', 'Name', 'Product', 'Type', 'Weight']
First ^ Second: ['Animal', 'Drink', 'Plant', 'Wine']
=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.
Spanish word is: equipo, English word is: team
Spanish Description is: Type, Name, Title, Status, Description, Location, Team, Brand, Owner, Game
English Description is: Name, Type, Title, Description, Status, Owner, Brand, Team, Location, Logo
Overlap is: Brand, Description, Location, Name, Owner, Status, Team, Title, Type (9)
Difference is: Game, Logo (2)
Spanish word is: Boca, English word is: Boca
Spanish Description is: Name, Type, Title, Status, Description, Game, Owner, Location, Brand, Tag
English Description is: Name, Title, Type, Status, Description, Owner, Game, Brand, Location, Color
Overlap is: Brand, Description, Game, Location, Name, Owner, Status, Title, Type (9)
Difference is: Color, Tag (2)
Spanish word is: copa, English word is: cup
Spanish Description is: Type, Title, Game, Status, Description, Category, Sports, Theme, Location, Sport
English Description is: Type, Title, Status, Category, Game, Description, Sports, Sport, Location, Application
Overlap is: Category, Description, Game, Location, Sport, Sports, Status, Title, Type (9)
Difference is: Application, Theme (2)
Spanish word is: tiempo, English word is: long
Spanish Description is: Age, Location, Duration, Type, Description, Date, Status, Game, Year, Time
English Description is: Status, Type, Age, Title, Description, Rating, Location, Name, Year, Game
Overlap is: Age, Description, Game, Location, Status, Type, Year (7)
Difference is: Date, Duration, Name, Rating, Time, Title (6)
Spanish word is: futbol, English word is: Football
Spanish Description is: Sport, Type, Sports, Game, Style, Theme, Category, Application, Title, Description
English Description is: Type, Sports, Sport, Game, Title, Category, Style, Football, Theme, Application
Overlap is: Application, Category, Game, Sport, Sports, Style, Theme, Title, Type (9)
Difference is: Description, Football (2)
The metrics improve across the entire training run, yet the output is weak, with generic terms like Description and Title turning up frequently. This does suggest collapse.
We can try training again with an increased temperature.
Starting xlm-roberta-base-e20-bs32-lr0.0001-t5
/home/matthew/.cache/pypoetry/virtualenvs/blog-HrtMnrOS-py3.10/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
Step | Training Loss | Validation Loss | Kl Div | Overlap |
---|---|---|---|---|
1000 | 0.499100 | 0.346719 | 0.272460 | 0.674107 |
2000 | 0.351100 | 0.313100 | 0.255789 | 0.690770 |
3000 | 0.318900 | 0.308473 | 0.246127 | 0.698678 |
4000 | 0.278800 | 0.305208 | 0.235727 | 0.705637 |
5000 | 0.268300 | 0.288019 | 0.229908 | 0.713236 |
6000 | 0.261800 | 0.285359 | 0.226173 | 0.714516 |
7000 | 0.231600 | 0.290514 | 0.225802 | 0.714887 |
8000 | 0.225700 | 0.285120 | 0.224180 | 0.716890 |
9000 | 0.223400 | 0.275393 | 0.217442 | 0.718687 |
10000 | 0.200400 | 0.276047 | 0.215493 | 0.722321 |
11000 | 0.196100 | 0.276407 | 0.215938 | 0.723973 |
12000 | 0.197700 | 0.274994 | 0.216346 | 0.723229 |
13000 | 0.176200 | 0.273291 | 0.215743 | 0.725563 |
14000 | 0.174100 | 0.272258 | 0.212242 | 0.726802 |
15000 | 0.174000 | 0.272064 | 0.217354 | 0.725769 |
16000 | 0.158100 | 0.275212 | 0.216617 | 0.724117 |
17000 | 0.155900 | 0.272179 | 0.212783 | 0.729486 |
18000 | 0.154900 | 0.267659 | 0.210043 | 0.730704 |
19000 | 0.142900 | 0.269398 | 0.214941 | 0.729073 |
20000 | 0.138800 | 0.269864 | 0.210844 | 0.729589 |
21000 | 0.142400 | 0.272709 | 0.212704 | 0.729858 |
22000 | 0.130100 | 0.269864 | 0.211297 | 0.731633 |
23000 | 0.127200 | 0.265007 | 0.211502 | 0.731798 |
24000 | 0.128300 | 0.270539 | 0.214050 | 0.732149 |
25000 | 0.121700 | 0.264980 | 0.211163 | 0.734111 |
26000 | 0.116000 | 0.265250 | 0.210008 | 0.734235 |
27000 | 0.117500 | 0.265351 | 0.208235 | 0.733533 |
28000 | 0.111000 | 0.264313 | 0.207080 | 0.735329 |
29000 | 0.106400 | 0.267540 | 0.208386 | 0.734256 |
30000 | 0.108400 | 0.268394 | 0.207938 | 0.735226 |
31000 | 0.104700 | 0.263199 | 0.208138 | 0.736919 |
32000 | 0.099500 | 0.263530 | 0.207196 | 0.735267 |
33000 | 0.099300 | 0.263550 | 0.205301 | 0.734524 |
34000 | 0.097200 | 0.267153 | 0.208381 | 0.735722 |
35000 | 0.091800 | 0.263137 | 0.207557 | 0.735928 |
36000 | 0.093700 | 0.266297 | 0.209930 | 0.735350 |
37000 | 0.090900 | 0.261557 | 0.206160 | 0.737766 |
38000 | 0.085800 | 0.260821 | 0.206914 | 0.738158 |
39000 | 0.087600 | 0.260837 | 0.206006 | 0.738117 |
40000 | 0.085800 | 0.261977 | 0.205463 | 0.737208 |
41000 | 0.081100 | 0.259678 | 0.205240 | 0.738385 |
42000 | 0.082100 | 0.261295 | 0.204675 | 0.739253 |
43000 | 0.080200 | 0.258047 | 0.203045 | 0.739913 |
44000 | 0.076800 | 0.261033 | 0.205304 | 0.740760 |
45000 | 0.077300 | 0.262062 | 0.207669 | 0.738963 |
46000 | 0.076400 | 0.258826 | 0.203268 | 0.740491 |
47000 | 0.072400 | 0.260828 | 0.204339 | 0.741441 |
48000 | 0.073100 | 0.260119 | 0.205025 | 0.740657 |
49000 | 0.072800 | 0.258310 | 0.202318 | 0.741689 |
50000 | 0.069100 | 0.259067 | 0.202928 | 0.742536 |
51000 | 0.070100 | 0.259791 | 0.204569 | 0.740677 |
52000 | 0.069500 | 0.258562 | 0.203154 | 0.741070 |
53000 | 0.066500 | 0.258652 | 0.203341 | 0.741441 |
54000 | 0.066400 | 0.259573 | 0.203057 | 0.741668 |
55000 | 0.066400 | 0.259439 | 0.203471 | 0.743052 |
56000 | 0.063900 | 0.258449 | 0.202918 | 0.741978 |
57000 | 0.064000 | 0.259101 | 0.203254 | 0.742907 |
58000 | 0.064400 | 0.257644 | 0.202360 | 0.743176 |
59000 | 0.062200 | 0.257244 | 0.202339 | 0.742680 |
60000 | 0.062300 | 0.258215 | 0.202372 | 0.743093 |
61000 | 0.062000 | 0.257468 | 0.201944 | 0.743196 |
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Location, Area, Color, Position, View, Land, Type, Country, Ocean, Views
Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Description, Name, Type, Status, Owner, Title, Location, Rating, Position, Details
Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Status, Type, Family, Description, Owner, Name, Age, Country, Location, Religion
First & Second: ['Location', 'Position', 'Type']
First & Third: ['Country', 'Location', 'Type']
Second & Third: ['Description', 'Location', 'Name', 'Owner', 'Status', 'Type']
=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Description, Tags, Tag, Album, Status, Title, Theme, Location, Comments, Motto
English Phrase is: Friday is my favourite song.
English Description is: Description, Tags, Tag, Album, Title, Status, Location, Theme, Comments, Labels
Description Overlap is: Album, Comments, Description, Location, Status, Tag, Tags, Theme, Title
Description Difference is: Labels, Motto
=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Food, Name, Type, Color, Material, Product, Animal, Owner, Cat, Plant
Second Malibu (drink) Description is: Food, Name, Color, Product, Type, Plant, Material, Language, Owner, Animal
First & Second: ['Animal', 'Color', 'Food', 'Material', 'Name', 'Owner', 'Plant', 'Product', 'Type']
First ^ Second: ['Cat', 'Language']
=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.
Spanish word is: equipo, English word is: team
Spanish Description is: Title, Type, Description, Name, Status, Team, Game, Location, Brand, Category
English Description is: Title, Type, Name, Description, Status, Brand, Location, Owner, Team, Game
Overlap is: Brand, Description, Game, Location, Name, Status, Team, Title, Type (9)
Difference is: Category, Owner (2)
Spanish word is: Boca, English word is: Boca
Spanish Description is: Title, Name, Type, Description, Game, Status, Owner, Brand, Location, Color
English Description is: Title, Type, Name, Game, Description, Brand, Location, Sport, Owner, Status
Overlap is: Brand, Description, Game, Location, Name, Owner, Status, Title, Type (9)
Difference is: Color, Sport (2)
Spanish word is: copa, English word is: cup
Spanish Description is: Type, Title, Game, Description, Status, Category, Sports, Sport, Theme, Name
English Description is: Type, Title, Game, Description, Sports, Sport, Category, Status, Location, Name
Overlap is: Category, Description, Game, Name, Sport, Sports, Status, Title, Type (9)
Difference is: Location, Theme (2)
Spanish word is: tiempo, English word is: long
Spanish Description is: Title, Type, Description, Game, Status, Name, Category, Location, Sports, Country
English Description is: Type, Title, Description, Game, Status, Name, Location, Sport, Sports, Country
Overlap is: Country, Description, Game, Location, Name, Sports, Status, Title, Type (9)
Difference is: Category, Sport (2)
Spanish word is: futbol, English word is: Football
Spanish Description is: Type, Sports, Game, Sport, Title, Style, Category, Theme, Description, Football
English Description is: Type, Sports, Sport, Game, Title, Football, Category, Style, Description, Theme
Overlap is: Category, Description, Football, Game, Sport, Sports, Style, Theme, Title, Type (10)
Difference is: (0)
The collapse is not as bad here. Bass evaluation does perform poorly though.
Broadly, I think that the evaluation itself is prone to gaming: the model can score well by producing fixed output, and in doing so it loses the ability to distinguish between different nouns in the input. This is a dataset problem, and I think that the solution lies in a better dataset.