Code
import blog.transformers_logging
June 22, 2022
The hyperparameters that I used for the first version of multilingual prompt internalization were what seemed sensible at the time. These parameters have a large impact on the performance of the model. Fiddling with them should show how well this technique can do.
This post is mainly going to be different runs of the model.
To evaluate these changes I’m going to take all of the evaluations that were done in the last post and run them over each version of the model.
# from src/main/python/blog/prompt_internalization/multilingual/roberta/trainer.py
from itertools import starmap
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments
from transformers.modeling_outputs import MaskedLMOutput
class MultilingualMaskedPromptInternalizationTrainingArguments(TrainingArguments):
    """
    TrainingArguments extended with the settings that the prompt
    internalization trainer reads from ``self.args``.

    Extra keyword arguments:
        temperature: divisor applied to student and teacher logits before the
            distillation loss is computed.
        mean_prediction: when true the trainer averages the student logits over
            a token span, otherwise it uses a single token position.
        ignore_tokens: vocabulary ids whose teacher logits are overwritten with
            the minimum logit. ``None`` is stored as an empty list.
    """

    def __init__(
        self,
        *args,
        temperature: float = 2.0,
        mean_prediction: bool = True,
        ignore_tokens: Optional[List[int]] = None,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.temperature = temperature
        self.mean_prediction = mean_prediction
        # Always expose a list so that consumers never have to check for None.
        self.ignore_tokens = [] if ignore_tokens is None else ignore_tokens
class MultilingualMaskedPromptInternalizationTrainer(Trainer):
    """
    Trainer that teaches a student masked language model to reproduce, at a
    token position described by ``labels``, the logits that a teacher model
    produces at its mask token.

    The loss is a temperature-scaled KL divergence between the student and
    teacher distributions. When outputs are requested (evaluation) it also
    reports a temperature-free KL divergence and the top-10 overlap per row.
    """

    def __init__(
        self,
        *args,
        teacher_model: AutoModelForMaskedLM = None,
        tokenizer: AutoTokenizer = None,
        **kwargs,
    ) -> None:
        """
        Args:
            teacher_model: model that provides the target logits. Required.
            tokenizer: only used to look up ``mask_token_id``. Required. It is
                not forwarded to the base ``Trainer``.
            *args, **kwargs: passed to ``Trainer.__init__`` unchanged.

        Raises:
            ValueError: if ``teacher_model`` or ``tokenizer`` is missing.
        """
        # Fail with a clear message instead of an AttributeError on None below.
        if teacher_model is None:
            raise ValueError("teacher_model is required")
        if tokenizer is None:
            raise ValueError("tokenizer is required")
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # The teacher has to live on the same device as the student.
        self._move_model_to_device(self.teacher, self.model.device)
        self.teacher.eval()
        self.mask_token_id = tokenizer.mask_token_id

    def compute_loss(
        self,
        model: AutoModelForMaskedLM,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        return_outputs: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
        """
        Compute the distillation loss for one batch.

        Args:
            model: the student model.
            inputs: batch with the keys ``input_ids``, ``attention_mask``,
                ``labels``, ``teacher_input_ids`` and ``teacher_attention_mask``.
            return_outputs: when true, also return the per-row metrics.

        Returns:
            The loss, or ``(loss, metrics)`` where ``metrics`` has one row per
            example and two columns: KL divergence and top-10 overlap.
        """
        outputs: MaskedLMOutput = model(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
        )
        if self.args.mean_prediction:
            predictions = self._student_predictions_mean(
                outputs=outputs, labels=inputs["labels"]
            )
        else:
            predictions = self._student_predictions_first(
                outputs=outputs, labels=inputs["labels"]
            )
        targets = self._teacher_predictions(
            input_ids=inputs["teacher_input_ids"],
            attention_mask=inputs["teacher_attention_mask"],
        )
        loss = self._loss(predictions=predictions, targets=targets)
        if not return_outputs:
            return loss

        # The kl_div and overlap metrics are calculated here with tensor
        # operations so they run on the model's device instead of waiting
        # for cpu numpy.
        with torch.inference_mode():
            # Temperature is deliberately left out so that runs with different
            # temperatures stay comparable.
            kl_div = F.kl_div(
                input=F.log_softmax(predictions.to(torch.float32), dim=-1),
                target=F.softmax(targets.to(torch.float32), dim=-1),
                reduction="none",
                log_target=False,
            ).sum(dim=1)

            # Count how many of the student's top tokens are also in the
            # teacher's top tokens, for the whole batch at once. topk avoids
            # sorting the full vocabulary for every row.
            top_k = min(10, predictions.shape[-1])
            student_top = predictions.topk(top_k, dim=-1).indices
            teacher_top = targets.topk(top_k, dim=-1).indices
            shared = (student_top[:, :, None] == teacher_top[:, None, :]).any(dim=-1)
            overlap = shared.sum(dim=-1) / 10

            # Shape [batch_size, 2], which is what the metric calculation
            # expects to receive as predictions.
            metric_output = torch.stack([kl_div, overlap], dim=1)
        return loss, metric_output

    @torch.inference_mode()
    def _teacher_predictions(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Return the teacher logits at every mask token position.

        The logits of ``self.args.ignore_tokens`` are overwritten with the
        smallest logit in the result.
        """
        outputs_teacher = self.teacher(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        mask_indices = input_ids == self.mask_token_id
        teacher_predictions = outputs_teacher.logits[mask_indices]
        # Nothing to overwrite when the list is empty.
        if self.args.ignore_tokens:
            teacher_predictions[:, self.args.ignore_tokens] = teacher_predictions.min()
        return teacher_predictions

    def _student_predictions_mean(
        self, outputs: MaskedLMOutput, labels: torch.Tensor
    ) -> torch.Tensor:
        """
        Return, for every row, the mean of the student logits over the token
        span described by that row's ``(start, length)`` label.
        """
        logits = outputs.logits
        # One vocabulary-sized row per example. The holder must be
        # [batch, vocab]; a 1-D holder cannot receive a vocabulary-sized
        # vector. Writing rows in place keeps the autograd graph intact.
        predictions = torch.zeros(
            logits.shape[0], logits.shape[-1], device=logits.device
        )
        for index, (start, length) in enumerate(labels.tolist()):
            prediction = logits[index, start : start + length]
            predictions[index] += prediction.mean(dim=0)
        return predictions

    def _student_predictions_first(
        self,
        outputs: MaskedLMOutput,
        labels: torch.Tensor,
    ) -> torch.Tensor:
        """
        Return, for every row, the student logits at the position stored in
        the first column of that row's label.
        """
        return outputs.logits[range(outputs.logits.shape[0]), labels[:, 0]]

    def _loss(self, predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """
        Temperature-scaled KL divergence between student and teacher logits.

        Both sets of logits are divided by the temperature before the softmax
        and the result is multiplied by the squared temperature.
        """
        predictions = F.log_softmax(
            predictions.to(torch.float32) / self.args.temperature, dim=-1
        )
        targets = F.softmax(targets.to(torch.float32) / self.args.temperature, dim=-1)
        loss = F.kl_div(
            input=predictions,
            target=targets,
            reduction="batchmean",
            log_target=False,
        )
        return loss * (self.args.temperature**2)
# from src/main/python/blog/prompt_internalization/multilingual/roberta/collator.py
from typing import Any, Dict, List
from transformers import AutoTokenizer
class TeacherStudentCollator:
    """
    Builds one batch that holds both the teacher and the student inputs.

    The teacher inputs need to be padded and have an associated attention mask,
    so they are padded separately and stored under ``teacher_``-prefixed keys.
    """

    def __init__(self, tokenizer: AutoTokenizer) -> None:
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pad the teacher and student inputs of ``features`` and merge them."""
        batch = {
            **self._teacher_inputs(features),
            **self._student_inputs(features),
        }
        # Normalise the alternative label key names to "labels".
        for alias in ("label", "label_ids"):
            if alias in batch:
                batch["labels"] = batch.pop(alias)
        return batch

    def _teacher_inputs(self, features: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
        """Pad the ``teacher_input_ids`` of every row into tensors."""
        rows = [{"input_ids": feature["teacher_input_ids"]} for feature in features]
        padded = self.tokenizer.pad(rows, padding=True, return_tensors="pt")
        return {
            "teacher_input_ids": padded["input_ids"],
            "teacher_attention_mask": padded["attention_mask"],
        }

    def _student_inputs(self, features: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
        """Pad the student ``input_ids`` and carry the first label entry along."""
        rows = []
        for feature in features:
            rows.append(
                {
                    "input_ids": feature["input_ids"],
                    # known to have a single entry
                    "labels": feature["labels"][0],
                }
            )
        return self.tokenizer.pad(rows, padding=True, return_tensors="pt")
As part of these evaluations I want to try altering the temperature parameter.
The problem with changing the temperature is that the loss values will become incomparable, as the temperature directly relates to the loss calculation. To compare the models I need a proper metric.
Since the evaluation is mainly concerned with the overlap between the predictions for two inputs, using that overlap as the metric seems appropriate. I can also calculate the KL divergence without incorporating temperature, which gives a consistent measure.
# from src/main/python/blog/prompt_internalization/multilingual/roberta/metrics.py
from typing import Dict
from transformers import EvalPrediction
def compute_metrics(model_output: EvalPrediction) -> Dict[str, float]:
    """
    Average the per-row metrics collected during evaluation.

    Column 0 of ``model_output.predictions`` is read as the KL divergence and
    column 1 as the overlap; the mean of each column is reported.
    """
    per_row = model_output.predictions
    kl_column, overlap_column = per_row[:, 0], per_row[:, 1]
    return {"kl_div": kl_column.mean(), "overlap": overlap_column.mean()}
# from src/main/python/blog/prompt_internalization/multilingual/roberta/train.py
from pathlib import Path
from typing import List, Optional
import datasets
from transformers import AutoModelForMaskedLM, AutoTokenizer
from .collator import TeacherStudentCollator
from .metrics import compute_metrics
from .trainer import (
MultilingualMaskedPromptInternalizationTrainer,
MultilingualMaskedPromptInternalizationTrainingArguments,
)
# Folder that train() reads the "<dataset_name>-train.dataset" and
# "<dataset_name>-test.dataset" datasets from.
DATASET_FOLDER = Path("/data/tatoeba/2022-06-18/dataset/")
# train() saves each trained student model to MODEL_FOLDER / <run_name>.
MODEL_FOLDER = Path("/data/prompt-internalization/multilingual/")
# Used as the trainer output_dir; logs are written to RUN_FOLDER / "logs".
RUN_FOLDER = Path("/tmp/runs")
# Create the output folders when the module is imported. Missing parents are
# created and folders that already exist are left alone.
MODEL_FOLDER.mkdir(parents=True, exist_ok=True)
RUN_FOLDER.mkdir(parents=True, exist_ok=True)
def train(
    *,
    model_name: str = "xlm-roberta-base",
    dataset_name: str = "xlm-roberta",
    batch_size: int = 64,
    learning_rate: float = 1e-4,
    temperature: float = 2,
    fp16: bool = False,
    mean_prediction: bool = False,
    ignore_tokens: Optional[List[int]] = None,
    epochs: Optional[float] = 2,
    max_steps: int = -1,
    evaluation_steps: int = 500,
) -> Path:
    """
    Train a student model against a teacher of the same architecture and save it.

    The run name is built from the hyperparameters, so every configuration is
    saved to its own folder under MODEL_FOLDER. Evaluation, logging and
    checkpointing all happen every ``evaluation_steps`` steps, and the
    checkpoint with the best ``overlap`` metric is loaded at the end.

    Returns:
        The folder that the student model was saved to.
    """
    # The duration is described by the step limit when one is set, otherwise
    # by the number of epochs.
    duration_tag = f"e{epochs}" if max_steps == -1 else f"ms{max_steps}"
    name_parts = [
        f"{model_name}",
        duration_tag,
        f"bs{batch_size}",
        f"lr{learning_rate}",
        f"t{temperature}",
    ]
    if fp16:
        name_parts.append("fp16")
    if mean_prediction:
        name_parts.append("mean")
    if ignore_tokens:
        name_parts.append(f"it{len(ignore_tokens)}")
    run_name = "-".join(name_parts)
    model_path = MODEL_FOLDER / run_name
    print(f"Starting {run_name}")

    train_ds = datasets.load_from_disk(DATASET_FOLDER / f"{dataset_name}-train.dataset")
    test_ds = datasets.load_from_disk(DATASET_FOLDER / f"{dataset_name}-test.dataset")

    training_args = MultilingualMaskedPromptInternalizationTrainingArguments(
        report_to="none",
        output_dir=RUN_FOLDER,
        num_train_epochs=epochs,
        max_steps=max_steps,
        seed=33,
        # number of steps before moving evaluation results from GPU to CPU see
        # https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941
        eval_accumulation_steps=5,
        #
        # hyperparameters
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        fp16=fp16,
        temperature=temperature,
        mean_prediction=mean_prediction,
        ignore_tokens=ignore_tokens,
        learning_rate=learning_rate,
        #
        # evaluation settings
        evaluation_strategy="steps",
        logging_steps=evaluation_steps,
        eval_steps=evaluation_steps,
        save_steps=evaluation_steps,
        #
        # checkpoint settings
        logging_dir=RUN_FOLDER / "logs",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="overlap",
        greater_is_better=True,
        # The collator needs the teacher columns, which the model itself
        # does not accept.
        remove_unused_columns=False,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Teacher and student both start from the same pretrained weights.
    teacher_model = AutoModelForMaskedLM.from_pretrained(model_name)
    student_model = AutoModelForMaskedLM.from_pretrained(model_name)
    data_collator = TeacherStudentCollator(tokenizer=tokenizer)

    trainer = MultilingualMaskedPromptInternalizationTrainer(
        model=student_model,
        args=training_args,
        teacher_model=teacher_model,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    student_model.save_pretrained(model_path)
    return model_path
# from src/main/python/blog/prompt_internalization/multilingual/roberta/evaluate.py
from pathlib import Path
from typing import List, Optional, Tuple
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
def evaluate(
    model_name: str, model_path: Path, ignore_tokens: Optional[List[int]] = None
) -> None:
    """
    Load the tokenizer for ``model_name`` and the model saved at ``model_path``,
    then print the bass, friday, malibu and football evaluations in that order.
    """
    tokens_to_ignore = [] if ignore_tokens is None else ignore_tokens

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_path)
    model.eval()

    evaluations = (
        bass_evaluation,
        friday_evaluation,
        malibu_evaluation,
        football_evaluation,
    )
    for run_evaluation in evaluations:
        run_evaluation(
            model=model, tokenizer=tokenizer, ignore_tokens=tokens_to_ignore
        )
def bass_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """
    Print the predicted descriptions of "bass" in three phrases, followed by
    the words shared between each pair of descriptions.
    """
    first_phrase = "We spotted a large bass in the ocean."
    second_phrase = "The bass player did not receive the acknowledgment she deserves."
    third_phrase = "The black sea bass, is a member of the wreckfish family."

    def describe(phrase: str) -> List[str]:
        # Predicted words for the first "bass" in the phrase.
        return get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=phrase,
            noun="bass",
            ignore_tokens=ignore_tokens,
        )

    def shared(left: List[str], right: List[str]) -> List[str]:
        # Alphabetically sorted words that appear in both descriptions.
        return sorted(set(left) & set(right))

    first_words = describe(first_phrase)
    second_words = describe(second_phrase)
    third_words = describe(third_phrase)

    print("=== BASS EVALUATION ===")
    print(f"First Phrase is: {first_phrase} Target is: bass")
    print(f"Description is: {', '.join(first_words)}")
    print()
    print(f"Second Phrase is: {second_phrase} Target is: bass")
    print(f"Description is: {', '.join(second_words)}")
    print()
    print(f"Third Phrase is: {third_phrase} Target is: bass")
    print(f"Description is: {', '.join(third_words)}")
    print()
    print(f"First & Second: {shared(first_words, second_words)}")
    print(f"First & Third: {shared(first_words, third_words)}")
    print(f"Second & Third: {shared(second_words, third_words)}")
    print()
def friday_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """
    Print the predicted descriptions of "Friday" in a Spanish and an English
    sentence, together with the words they share and the words they do not.
    """
    spanish_text = "Friday es mi canción favorita."
    english_text = "Friday is my favourite song."

    # The Spanish sentence is evaluated first, then the English one.
    spanish_words, english_words = (
        get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=sentence,
            noun="Friday",
            ignore_tokens=ignore_tokens,
        )
        for sentence in (spanish_text, english_text)
    )

    spanish_set = set(spanish_words)
    english_set = set(english_words)
    shared = spanish_set.intersection(english_set)
    unshared = spanish_set.symmetric_difference(english_set)

    print("=== FRIDAY EVALUATION ===")
    print(f"Spanish Phrase is: {spanish_text}")
    print(f"Spanish Description is: {', '.join(spanish_words)}")
    print(f"English Phrase is: {english_text}")
    print(f"English Description is: {', '.join(english_words)}")
    print()
    print(f"Description Overlap is: {', '.join(sorted(shared))}")
    print(f"Description Difference is: {', '.join(sorted(unshared))}")
    print()
def malibu_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """
    Print the predicted descriptions of the first and the second "Malibu" in a
    single sentence, together with their shared and unshared words.
    """
    text = "I like to drive my Malibu while drinking Malibu."

    def describe(occurrence: int) -> List[str]:
        # Predicted words for the given occurrence of "Malibu" in the sentence.
        return get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=text,
            noun="Malibu",
            index=occurrence,
            ignore_tokens=ignore_tokens,
        )

    car_words = describe(0)
    drink_words = describe(1)
    car_set, drink_set = set(car_words), set(drink_words)

    print("=== MALIBU EVALUATION ===")
    print(f"Phrase is: {text}")
    print(f"First Malibu (car) Description is: {', '.join(car_words)}")
    print(f"Second Malibu (drink) Description is: {', '.join(drink_words)}")
    print()
    print(f"First & Second: {sorted(car_set & drink_set)}")
    print(f"First ^ Second: {sorted(car_set ^ drink_set)}")
    print()
def football_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """
    Print, for five paired words in a Spanish passage and its English
    translation, the predicted descriptions of both words along with the size
    and content of their overlap and difference.
    """
    spanish_phrase = (
        "Retiremos el equipo de la cancha, "
        "Boca no merece jugar esta copa que "
        "hace tiempo viene siendo desprestigiada.\n"
        "Ya no se juega al futbol."
    )
    english_phrase = (
        "Let's remove the team from the field, "
        "Boca does not deserve to play this cup that "
        "has long been discredited. "
        "Football is no longer played."
    )
    word_pairs = (
        ("equipo", "team"),
        ("Boca", "Boca"),
        ("copa", "cup"),
        ("tiempo", "long"),
        ("futbol", "Football"),
    )

    def describe(phrase: str, word: str) -> List[str]:
        # Predicted words for the first occurrence of the word in the phrase.
        return get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=phrase,
            noun=word,
            ignore_tokens=ignore_tokens,
        )

    print("=== FOOTBALL EVALUATION ===")
    print(f"Spanish Phrase is: {spanish_phrase}")
    print(f"English Phrase is: {english_phrase}")
    print()

    for spanish_noun, english_noun in word_pairs:
        spanish_description = describe(spanish_phrase, spanish_noun)
        english_description = describe(english_phrase, english_noun)

        spanish_set = set(spanish_description)
        english_set = set(english_description)
        shared = spanish_set & english_set
        unshared = spanish_set ^ english_set

        print(f"Spanish word is: {spanish_noun}, English word is: {english_noun}")
        print(f"Spanish Description is: {', '.join(spanish_description)}")
        print(f"English Description is: {', '.join(english_description)}")
        print(f"Overlap is: {', '.join(sorted(shared))} ({len(shared)})")
        print(f"Difference is: {', '.join(sorted(unshared))} ({len(unshared)})")
        print()
@torch.inference_mode()
def get_predictions(
    *,
    model: AutoModelForMaskedLM,
    tokenizer: AutoTokenizer,
    text: str,
    noun: str,
    index: int = 0,
    ignore_tokens: Optional[List[int]] = None,
) -> List[str]:
    """
    Return the ten highest scoring words that the model predicts at the first
    token of ``noun`` in ``text``.

    Args:
        model: masked language model to query.
        tokenizer: tokenizer used to encode ``text`` and decode the result.
        text: sentence that contains the noun.
        noun: word to describe.
        index: which occurrence of the noun to use, counted from zero.
        ignore_tokens: vocabulary ids whose logits are overwritten with the
            minimum logit before ranking.

    Returns:
        The decoded top ten tokens with surrounding whitespace removed, best first.
    """
    masked_ids = [] if ignore_tokens is None else ignore_tokens

    encoded = tokenizer(text, return_tensors="pt")
    noun_start, _ = get_noun(
        tokenizer=tokenizer, tokens=encoded.input_ids[0], noun=noun, index=index
    )

    scores = model(**encoded).logits[0, noun_start]
    # Push the ignored tokens down to the lowest score seen.
    scores[masked_ids] = scores.min()

    top_ids = torch.argsort(scores, descending=True)[:10]
    return [decoded.strip() for decoded in tokenizer.batch_decode(top_ids)]
def get_noun(
    tokenizer: AutoTokenizer, tokens: torch.Tensor, noun: str, index: int
) -> Tuple[int, int]:
    """
    Find the token span of an occurrence of ``noun`` in ``tokens``.

    A span matches when its decoded text, with surrounding whitespace removed,
    equals ``noun``. Occurrences are counted from zero in order of their start
    position and never overlap, so one occurrence of the noun cannot be
    counted twice.

    Args:
        tokenizer: used to decode candidate spans.
        tokens: one-dimensional tensor of token ids.
        noun: exact text to look for.
        index: which occurrence to return. Values of zero or less return the
            first occurrence.

    Returns:
        ``(start, end)`` such that ``tokens[start:end]`` decodes to the noun.

    Raises:
        AssertionError: if the requested occurrence is not present.
    """
    length = tokens.shape[0]
    remaining = index
    # End of the last occurrence that was counted and skipped. Spans that start
    # before it would overlap that occurrence.
    consumed_until = 0
    for start_index in range(length):
        if start_index < consumed_until:
            continue
        word = tokenizer.decode(tokens[start_index]).strip()
        if not noun.startswith(word):
            continue
        for end_index in range(start_index + 1, length):
            word = tokenizer.decode(tokens[start_index:end_index]).strip()
            if word != noun:
                continue
            if remaining <= 0:
                return start_index, end_index
            remaining -= 1
            consumed_until = end_index
            # Stop extending this span. A following token that decodes to
            # whitespace would otherwise match the same occurrence again.
            break
    raise AssertionError(f"Did not find {noun}[{index}] in {tokenizer.decode(tokens)}")
from pathlib import Path
import datasets
# Notebook-level settings for this post.
# NOTE(review): DATASET_FOLDER is also assigned in the train.py listing earlier
# in this document with a different path - confirm which one is meant here.
DATASET_FOLDER = Path("/data/blog/2022-06-18-cross-language-prompt-internalization")
# NOTE(review): "languge" looks like a misspelling of "language", but the string
# is a real path, so correcting it would point at a different folder.
DATA_FOLDER = Path("/data/blog/2022-06-22-cross-languge-first-token")
# Runs for this post are kept in a "runs" folder below DATA_FOLDER. It is
# created here, along with any missing parents, if it does not exist yet.
RUN_DIRECTORY = Path(DATA_FOLDER / "runs")
RUN_DIRECTORY.mkdir(parents=True, exist_ok=True)
MODEL_NAME = "roberta-base"
One of the changes that I’ve thought about for the cross language word sense model is to only use the predictions from the first token in the noun. This is quite a quick change so I can just try it out now.
The 8 epoch train that I did using the mean of the tokens got a lowest validation loss of 0.708059 so if this beats that it will be encouraging.
Step | Training Loss | Validation Loss | Kl Div | Overlap |
---|---|---|---|---|
500 | 1.017000 | 0.773606 | 0.862134 | 0.285226 |
1000 | 0.739700 | 0.741717 | 0.827723 | 0.344493 |
1500 | 0.649900 | 0.730937 | 0.813940 | 0.354007 |
2000 | 0.559300 | 0.698919 | 0.783726 | 0.362598 |
2500 | 0.522200 | 0.685605 | 0.768987 | 0.367191 |
3000 | 0.490500 | 0.674662 | 0.751440 | 0.372155 |
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Size, Species, Shape, Color, Body, Type, Item, Sex, Animal, Weight
Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Position, Artist, Player, Role, Function, Singer, Driver, Name, Music, Owner
Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Species, Size, Race, Color, Type, Game, Name, Breed, Shape, Item
First & Second: []
First & Third: ['Color', 'Item', 'Shape', 'Size', 'Species', 'Type']
Second & Third: ['Name']
=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Date, Day, Time, Days, When, Event, Theme, Week, Weather, Month
English Phrase is: Friday is my favourite song.
English Description is: Day, Date, Time, Theme, Month, Name, Event, Color, Days, Year
Description Overlap is: Date, Day, Days, Event, Month, Theme, Time
Description Difference is: Color, Name, Weather, Week, When, Year
=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Color, Vehicle, Location, Car, Destination, Year, Season, Rider, Driver, Brand
Second Malibu (drink) Description is: Color, Drink, Location, Destination, Season, Course, Year, Flavor, Weather, State
First & Second: ['Color', 'Destination', 'Location', 'Season', 'Year']
First ^ Second: ['Brand', 'Car', 'Course', 'Drink', 'Driver', 'Flavor', 'Rider', 'State', 'Vehicle', 'Weather']
=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.
Spanish word is: equipo, English word is: team
Spanish Description is: Team, Position, Location, Name, Player, Age, Game, Size, Friend, Goal
English Description is: Team, Location, Position, Player, Name, Teams, Goal, Game, Color, Interest
Overlap is: Game, Goal, Location, Name, Player, Position, Team (7)
Difference is: Age, Color, Friend, Interest, Size, Teams (6)
Spanish word is: Boca, English word is: Boca
Spanish Description is: Name, Color, Game, Team, Player, Location, Age, Size, Owner, Friend
English Description is: Name, Color, Team, Location, Game, Owner, Player, Race, Age, Rider
Overlap is: Age, Color, Game, Location, Name, Owner, Player, Team (8)
Difference is: Friend, Race, Rider, Size (4)
Spanish word is: copa, English word is: cup
Spanish Description is: Age, Size, Location, Color, Purpose, Time, Name, Weather, Type, Position
English Description is: Game, Sport, Type, Player, Team, Ball, Color, Function, Item, Purpose
Overlap is: Color, Purpose, Type (3)
Difference is: Age, Ball, Function, Game, Item, Location, Name, Player, Position, Size, Sport, Team, Time, Weather (14)
Spanish word is: tiempo, English word is: long
Spanish Description is: Time, Weather, Age, Size, Distance, Year, Weight, Season, Color, Length
English Description is: Age, Reason, Time, Name, Season, Color, Year, Date, Purpose, Size
Overlap is: Age, Color, Season, Size, Time, Year (6)
Difference is: Date, Distance, Length, Name, Purpose, Reason, Weather, Weight (8)
Spanish word is: futbol, English word is: Football
Spanish Description is: Sport, Interest, Game, Team, Football, Sports, Soccer, Season, Ball, Position
English Description is: Game, Sport, Team, Football, Ball, Interest, Player, Position, Sports, Season
Overlap is: Ball, Football, Game, Interest, Position, Season, Sport, Sports, Team (9)
Difference is: Player, Soccer (2)
The metrics for this show a significant improvement.
The evaluation is more mixed; however, the two senses of Malibu have actually diverged a bit. This seems like an overall improvement to me, and it seems more reasonable to take this approach, as the student is updating a single token to match a single token in the teacher.
The temperature parameter is used in distillation to better show the distribution of classes to the student model. A language model tends to produce a smoother distribution anyway so the temperature parameter may have less of an effect.
PyTorch: setting up devices
Could not locate the tokenizer configuration file, will try to use the model config instead.
Step | Training Loss | Validation Loss | Kl Div | Overlap |
---|---|---|---|---|
500 | 1.103200 | 0.844906 | 0.844220 | 0.080362 |
1000 | 0.792800 | 0.801168 | 0.800937 | 0.085042 |
1500 | 0.682200 | 0.762441 | 0.762480 | 0.083643 |
2000 | 0.579400 | 0.762365 | 0.762209 | 0.072138 |
2500 | 0.540700 | 0.735147 | 0.735035 | 0.076601 |
3000 | 0.500600 | 0.731170 | 0.731033 | 0.075463 |
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Species, Size, Color, Sex, Age, Body, Animal, Number, Gender, Name
Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Position, Role, Player, Artist, Name, User, Function, Job, Driver, Person
Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Species, Size, Color, Name, Race, Sex, Age, Location, Type, Animal
First & Second: ['Name']
First & Third: ['Age', 'Animal', 'Color', 'Name', 'Sex', 'Size', 'Species']
Second & Third: ['Name']
=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Date, Day, Time, Event, Days, Theme, When, Reason, Weather, Subject
English Phrase is: Friday is my favourite song.
English Description is: Date, Theme, Day, Time, Name, Category, Song, Title, Artist, Subject
Description Overlap is: Date, Day, Subject, Theme, Time
Description Difference is: Artist, Category, Days, Event, Name, Reason, Song, Title, Weather, When
=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Color, Location, Drink, Destination, Flavor, Country, Name, Season, Course, State
Second Malibu (drink) Description is: Drink, Color, Location, Destination, Course, Flavor, Beer, Country, Season, State
First & Second: ['Color', 'Country', 'Course', 'Destination', 'Drink', 'Flavor', 'Location', 'Season', 'State']
First ^ Second: ['Beer', 'Name']
=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.
Spanish word is: equipo, English word is: team
Spanish Description is: Team, Position, Location, Name, Player, Organization, Age, Size, Goal, Purpose
English Description is: Team, Position, Location, Name, Teams, Player, Goal, Game, Interest, Color
Overlap is: Goal, Location, Name, Player, Position, Team (6)
Difference is: Age, Color, Game, Interest, Organization, Purpose, Size, Teams (8)
Spanish word is: Boca, English word is: Boca
Spanish Description is: Name, Owner, Color, Age, Location, Size, Game, Body, Food, Race
English Description is: Location, City, Name, Team, Destination, Country, Game, Color, Place, Course
Overlap is: Color, Game, Location, Name (4)
Difference is: Age, Body, City, Country, Course, Destination, Food, Owner, Place, Race, Size, Team (12)
Spanish word is: copa, English word is: cup
Spanish Description is: Age, Size, Location, Name, Purpose, Team, Color, Position, Gender, Type
English Description is: Game, Sport, Team, Type, Goal, Position, Ball, Skill, Course, Football
Overlap is: Position, Team, Type (3)
Difference is: Age, Ball, Color, Course, Football, Game, Gender, Goal, Location, Name, Purpose, Size, Skill, Sport (14)
Spanish word is: tiempo, English word is: long
Spanish Description is: Time, Age, Size, Weather, Season, Weight, Purpose, Location, Year, Distance
English Description is: Age, Reason, Time, Name, Date, Year, Location, Color, Size, Source
Overlap is: Age, Location, Size, Time, Year (5)
Difference is: Color, Date, Distance, Name, Purpose, Reason, Season, Source, Weather, Weight (10)
Spanish word is: futbol, English word is: Football
Spanish Description is: Sport, Game, Team, Football, Interest, Sports, Season, Ball, Field, Position
English Description is: Football, Team, Sport, Game, Position, Season, Ball, Field, Player, Soccer
Overlap is: Ball, Field, Football, Game, Position, Season, Sport, Team (8)
Difference is: Interest, Player, Soccer, Sports (4)
Dropping the temperature has made the model significantly worse, both in metrics and in results. Clearly the temperature parameter has value. Let’s try increasing it instead.
PyTorch: setting up devices
Could not locate the tokenizer configuration file, will try to use the model config instead.
Step | Training Loss | Validation Loss | Kl Div | Overlap |
---|---|---|---|---|
500 | 0.875900 | 0.675583 | 0.902006 | 0.407073 |
1000 | 0.655900 | 0.639347 | 0.838118 | 0.436139 |
1500 | 0.583900 | 0.622461 | 0.813681 | 0.448237 |
2000 | 0.514300 | 0.603257 | 0.784411 | 0.457620 |
2500 | 0.480600 | 0.591916 | 0.773270 | 0.461809 |
3000 | 0.455000 | 0.585542 | 0.761299 | 0.465770 |
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Species, Size, Color, Item, Animal, Type, Shape, Body, Sex, Race
Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Artist, Name, Position, Person, Owner, Singer, Role, Function, Subject, Driver
Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Species, Size, Race, Color, Type, Breed, Game, Animal, Location, Sex
First & Second: []
First & Third: ['Animal', 'Color', 'Race', 'Sex', 'Size', 'Species', 'Type']
Second & Third: []
=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Day, Date, Time, Days, Month, Event, Theme, Week, When, Weather
English Phrase is: Friday is my favourite song.
English Description is: Day, Date, Time, Theme, Month, Days, Event, Name, Reason, Color
Description Overlap is: Date, Day, Days, Event, Month, Theme, Time
Description Difference is: Color, Name, Reason, Weather, Week, When
=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Color, Drink, Destination, Location, Season, Weather, Year, Course, Time, State
Second Malibu (drink) Description is: Color, Destination, Drink, Location, Weather, Season, Year, State, Water, Course
First & Second: ['Color', 'Course', 'Destination', 'Drink', 'Location', 'Season', 'State', 'Weather', 'Year']
First ^ Second: ['Time', 'Water']
=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.
Spanish word is: equipo, English word is: team
Spanish Description is: Team, Position, Name, Location, Player, Age, Color, Game, Size, Equipment
English Description is: Team, Location, Position, Player, Name, Color, Game, Destination, Age, Interest
Overlap is: Age, Color, Game, Location, Name, Player, Position, Team (8)
Difference is: Destination, Equipment, Interest, Size (4)
Spanish word is: Boca, English word is: Boca
Spanish Description is: Name, Color, Owner, Location, Age, Race, Game, Team, Player, Rider
English Description is: Location, Destination, Team, Name, Country, Place, City, Season, Color, Race
Overlap is: Color, Location, Name, Race, Team (5)
Difference is: Age, City, Country, Destination, Game, Owner, Place, Player, Rider, Season (10)
Spanish word is: copa, English word is: cup
Spanish Description is: Age, Size, Color, Name, Location, Race, Person, Type, Body, Purpose
English Description is: Game, Sport, Type, Position, Player, Color, Purpose, Equipment, Weapon, Size
Overlap is: Color, Purpose, Size, Type (4)
Difference is: Age, Body, Equipment, Game, Location, Name, Person, Player, Position, Race, Sport, Weapon (12)
Spanish word is: tiempo, English word is: long
Spanish Description is: Time, Weather, Age, Size, Year, Season, Distance, Color, Date, Location
English Description is: Age, Reason, Name, Purpose, Color, Time, Location, Season, Year, Date
Overlap is: Age, Color, Date, Location, Season, Time, Year (7)
Difference is: Distance, Name, Purpose, Reason, Size, Weather (6)
Spanish word is: futbol, English word is: Football
Spanish Description is: Sport, Game, Interest, Sports, Team, Season, Skill, Exercise, Soccer, Type
English Description is: Sport, Game, Football, Team, Interest, Ball, Sports, Player, Position, Season
Overlap is: Game, Interest, Season, Sport, Sports, Team (6)
Difference is: Ball, Exercise, Football, Player, Position, Skill, Soccer, Type (8)
Changing temperature from 2 to 5 has improved the overlap metric by about \(\frac{1}{4}\) (0.37 -> 0.47), which feels big. Looking at the evaluation results there is a greater overlap which is good, except for the Malibu evaluation where the difference between the senses has largely been lost.
If 5 is good then how about 10?
PyTorch: setting up devices
Could not locate the tokenizer configuration file, will try to use the model config instead.
Step | Training Loss | Validation Loss | Kl Div | Overlap |
---|---|---|---|---|
500 | 0.854400 | 0.664589 | 0.904724 | 0.412325 |
1000 | 0.633700 | 0.630449 | 0.844462 | 0.440205 |
1500 | 0.562500 | 0.596431 | 0.797427 | 0.452643 |
2000 | 0.493800 | 0.594233 | 0.791138 | 0.460484 |
2500 | 0.462400 | 0.575075 | 0.771659 | 0.467274 |
3000 | 0.436500 | 0.569529 | 0.764885 | 0.470625 |
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Size, Species, Color, Shape, Type, Body, Sex, Item, Animal, Location
Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Name, Person, Artist, Owner, Position, Function, Role, Subject, User, Employee
Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Species, Size, Color, Type, Name, Race, Sex, Location, Animal, Game
First & Second: []
First & Third: ['Animal', 'Color', 'Location', 'Sex', 'Size', 'Species', 'Type']
Second & Third: ['Name']
=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Date, Day, Time, Days, Month, When, Event, Year, Reason, Week
English Phrase is: Friday is my favourite song.
English Description is: Day, Date, Time, Month, Name, Theme, Year, Event, Number, Days
Description Overlap is: Date, Day, Days, Event, Month, Time, Year
Description Difference is: Name, Number, Reason, Theme, Week, When
=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Color, Vehicle, Destination, Car, Driver, Drink, Location, Season, Type, Speed
Second Malibu (drink) Description is: Drink, Beer, Color, Flavor, Course, Season, Water, Destination, Type, Reason
First & Second: ['Color', 'Destination', 'Drink', 'Season', 'Type']
First ^ Second: ['Beer', 'Car', 'Course', 'Driver', 'Flavor', 'Location', 'Reason', 'Speed', 'Vehicle', 'Water']
=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.
Spanish word is: equipo, English word is: team
Spanish Description is: Name, Age, Location, Color, Position, Team, Purpose, Type, Size, Race
English Description is: Team, Position, Player, Location, Name, Color, Age, Game, Goal, Interest
Overlap is: Age, Color, Location, Name, Position, Team (6)
Difference is: Game, Goal, Interest, Player, Purpose, Race, Size, Type (8)
Spanish word is: Boca, English word is: Boca
Spanish Description is: Name, Color, Age, Owner, Location, Race, Number, Size, Person, Type
English Description is: Team, Name, Location, Color, Game, Player, Season, Age, Position, Race
Overlap is: Age, Color, Location, Name, Race (5)
Difference is: Game, Number, Owner, Person, Player, Position, Season, Size, Team, Type (10)
Spanish word is: copa, English word is: cup
Spanish Description is: Age, Name, Person, Gender, Color, Sex, Location, Owner, Purpose, Personality
English Description is: Game, Sport, Type, Purpose, Player, Equipment, Color, Function, Size, Position
Overlap is: Color, Purpose (2)
Difference is: Age, Equipment, Function, Game, Gender, Location, Name, Owner, Person, Personality, Player, Position, Sex, Size, Sport, Type (16)
Spanish word is: tiempo, English word is: long
Spanish Description is: Time, Weather, Size, Age, Color, Food, Season, Year, Weight, Location
English Description is: Age, Reason, Name, Season, Purpose, Time, Year, Source, Color, Location
Overlap is: Age, Color, Location, Season, Time, Year (6)
Difference is: Food, Name, Purpose, Reason, Size, Source, Weather, Weight (8)
Spanish word is: futbol, English word is: Football
Spanish Description is: Purpose, Reason, Season, Age, Function, Weather, Type, Color, Time, Location
English Description is: Game, Sport, Football, Team, Position, Type, Interest, Player, Function, Season
Overlap is: Function, Season, Type (3)
Difference is: Age, Color, Football, Game, Interest, Location, Player, Position, Purpose, Reason, Sport, Team, Time, Weather (14)
This is a marginal improvement according to the metrics. It’s interesting that this time the Malibu senses have worked out quite a lot better, with Drink/Beer/Vehicle/Car as distinct (and correctly assigned) terms.
If 10 is a good temperature then how good would 20 be?
PyTorch: setting up devices
Could not locate the tokenizer configuration file, will try to use the model config instead.
Step | Training Loss | Validation Loss | Kl Div | Overlap |
---|---|---|---|---|
500 | 0.844500 | 0.654614 | 0.913253 | 0.413606 |
1000 | 0.625500 | 0.625355 | 0.861498 | 0.439834 |
1500 | 0.559200 | 0.598256 | 0.821264 | 0.451698 |
2000 | 0.491300 | 0.600460 | 0.814797 | 0.458475 |
2500 | 0.458800 | 0.580012 | 0.791404 | 0.464546 |
3000 | 0.434100 | 0.571730 | 0.778867 | 0.469008 |
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Size, Species, Color, Shape, Type, Body, Sex, Name, Location, Item
Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Name, Owner, Artist, Person, Subject, Position, Location, Singer, Function, Personality
Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Species, Size, Race, Color, Type, Game, Breed, Name, Age, Location
First & Second: ['Location', 'Name']
First & Third: ['Color', 'Location', 'Name', 'Size', 'Species', 'Type']
Second & Third: ['Location', 'Name']
=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Day, Date, Time, Days, Event, Reason, Month, Theme, Weather, Week
English Phrase is: Friday is my favourite song.
English Description is: Day, Date, Time, Name, Theme, Event, Color, Reason, Month, Days
Description Overlap is: Date, Day, Days, Event, Month, Reason, Theme, Time
Description Difference is: Color, Name, Weather, Week
=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Drink, Color, Flavor, Beer, Water, Course, Season, Type, Weather, Reason
Second Malibu (drink) Description is: Drink, Color, Flavor, Beer, Water, Course, Season, Type, Weather, Reason
First & Second: ['Beer', 'Color', 'Course', 'Drink', 'Flavor', 'Reason', 'Season', 'Type', 'Water', 'Weather']
First ^ Second: []
=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.
Spanish word is: equipo, English word is: team
Spanish Description is: Team, Name, Position, Player, Age, Location, Color, Game, Type, Season
English Description is: Team, Location, Position, Player, Color, Name, Age, Game, Interest, Destination
Overlap is: Age, Color, Game, Location, Name, Player, Position, Team (8)
Difference is: Destination, Interest, Season, Type (4)
Spanish word is: Boca, English word is: Boca
Spanish Description is: Name, Color, Team, Game, Player, Location, Age, Race, Owner, Type
English Description is: Name, Color, Team, Location, Player, Owner, Game, Age, Race, Friend
Overlap is: Age, Color, Game, Location, Name, Owner, Player, Race, Team (9)
Difference is: Friend, Type (2)
Spanish word is: copa, English word is: cup
Spanish Description is: Game, Age, Type, Color, Position, Name, Location, Size, Purpose, Function
English Description is: Game, Size, Type, Color, Item, Purpose, Weapon, Function, Player, Equipment
Overlap is: Color, Function, Game, Purpose, Size, Type (6)
Difference is: Age, Equipment, Item, Location, Name, Player, Position, Weapon (8)
Spanish word is: tiempo, English word is: long
Spanish Description is: Time, Weather, Age, Size, Year, Distance, Color, Season, Weight, Temperature
English Description is: Age, Name, Reason, Time, Year, Color, Purpose, Location, Date, Season
Overlap is: Age, Color, Season, Time, Year (5)
Difference is: Date, Distance, Location, Name, Purpose, Reason, Size, Temperature, Weather, Weight (10)
Spanish word is: futbol, English word is: Football
Spanish Description is: Sport, Game, Interest, Season, Sports, Type, Skill, Team, Time, Course
English Description is: Game, Sport, Football, Interest, Team, Player, Position, Function, Sports, Type
Overlap is: Game, Interest, Sport, Sports, Team, Type (6)
Difference is: Course, Football, Function, Player, Position, Season, Skill, Time (8)
The metrics are now marginally worse and the Malibu evaluation has degraded completely. It looks like this was too much!
Using a single token as the prediction is a solid improvement, as is increasing the temperature. A proper hyperparameter search could find the precise values for this model. I think the next thing to test is different models; the current model is a small one that is quite old now.