Code
import blog.transformers_logging
September 13, 2022
from pathlib import Path
# Input locations, both under the "20220701" subfolder of PROCESSED_FOLDER.
# NOTE(review): PROCESSED_FOLDER is not defined in the visible part of this
# file — presumably it is set up earlier; confirm before running standalone.
ARTICLE_FILE = PROCESSED_FOLDER / "20220701" / "article-descriptions.gz.parquet"
# train() loads "train.dataset" and "valid.dataset" from this folder.
DATASET_FOLDER = PROCESSED_FOLDER / "20220701" / "student"
# train() saves the finished model to MODEL_FOLDER / <run name>.
MODEL_FOLDER = Path("/data/prompt-internalization/multilingual/models/wikipedia")
# Used by train() as the Trainer output_dir and as the parent of its "logs" folder.
RUN_FOLDER = Path("/tmp/runs")
# Create both output folders at import time; a no-op when they already exist.
MODEL_FOLDER.mkdir(parents=True, exist_ok=True)
RUN_FOLDER.mkdir(parents=True, exist_ok=True)
from itertools import starmap
from typing import Any, Dict, List, Optional, Tuple, Union
from pathlib import Path
import pandas as pd
import datasets
import torch
import torch.nn.functional as F
from transformers import (
AutoModelForMaskedLM,
AutoTokenizer,
DataCollatorWithPadding,
EvalPrediction,
Trainer,
TrainingArguments,
)
from transformers.modeling_outputs import MaskedLMOutput
class ArticleTrainingArguments(TrainingArguments):
    """``TrainingArguments`` extended with a ``temperature`` attribute.

    Every positional and keyword argument other than ``temperature`` is
    forwarded to ``TrainingArguments`` untouched.
    """

    def __init__(self, *args, temperature: float = 2.0, **kwargs) -> None:
        """Initialise the base arguments and remember ``temperature``.

        Args:
            *args: Positional arguments for ``TrainingArguments``.
            temperature: Value stored on the instance as ``self.temperature``.
                Nothing in the visible part of this file reads it.
            **kwargs: Keyword arguments for ``TrainingArguments``.
        """
        super().__init__(*args, **kwargs)
        # Assigned only after the base initialiser has finished.
        self.temperature = temperature
class ArticleMeasure:
    """KL-divergence loss between model logits and per-article target logits.

    The parquet file supplies one row per article with three columns:

    * ``indices``  - vocabulary ids that receive an explicit target value,
    * ``mean``     - the target value for each of those ids (same length),
    * ``baseline`` - the target value used for every other vocabulary id.
    """

    def __init__(self, file: Path) -> None:
        """Load the per-article tables from ``file`` (a parquet file)."""
        df = pd.read_parquet(file)
        self.indices = [
            torch.tensor(values, dtype=torch.long) for values in df["indices"]
        ]
        self.mean = [
            torch.tensor(values, dtype=torch.float) for values in df["mean"]
        ]
        self.baseline = torch.tensor(df["baseline"].tolist())

    def to(self, device) -> None:
        """Move every stored tensor to ``device`` (in place, returns ``None``)."""
        self.indices = [entry.to(device) for entry in self.indices]
        self.mean = [entry.to(device) for entry in self.mean]
        self.baseline = self.baseline.to(device)

    def loss(self, output: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Return the batch-mean KL divergence for the labelled tokens.

        Args:
            output: Logits of shape ``(batch, tokens, vocab)``.
            labels: Integer tensor indexed as ``labels[batch, slot, channel]``.
                Channel 0 is the token position inside the sequence whose
                logits are scored, or ``-1`` for an unused slot. Channel 2 is
                the article row used to look up ``indices`` / ``mean`` /
                ``baseline``. Channel 1 is not read here.

        Returns:
            A scalar tensor. When no slot in the batch is labelled the result
            is a zero that is still connected to ``output`` so ``backward()``
            works, instead of the NaN that ``batchmean`` over zero rows gives.
        """
        batch_size, token_count, vocab_size = output.shape

        start_tokens = labels[:, :, 0]
        token_mask = start_tokens != -1

        # Each sequence starts at row ``batch_index * token_count`` once the
        # logits are flattened to (batch * tokens, vocab). Repeating the
        # offset once per labelled slot lines it up with the masked,
        # row-major list of start tokens below.
        offsets = (
            torch.arange(
                start=0, end=batch_size, dtype=torch.long, device=output.device
            )
            * token_count
        )
        offsets = offsets.repeat_interleave(token_mask.sum(dim=1))

        # Boolean masking a 2-D tensor yields the same row-major order as
        # flattening first and masking afterwards.
        start_tokens = start_tokens[token_mask]
        target_indices = labels[:, :, 2][token_mask]

        predictions = output.reshape(-1, vocab_size)[start_tokens + offsets]
        if predictions.shape[0] == 0:
            # Nothing to score: the sum over zero rows is 0 and keeps the
            # autograd graph, whereas kl_div(batchmean) would divide by zero.
            return predictions.sum()

        # Start every target row at its article's baseline value ...
        targets = torch.ones_like(
            predictions,
            dtype=torch.float,
            device=output.device,
            requires_grad=False,
        ) * self.baseline[target_indices, None]
        # ... then overwrite the vocabulary ids the article has values for.
        # ``tolist()`` converts the article ids once instead of indexing the
        # Python lists with a 0-dim tensor on every iteration.
        for row_index, article in enumerate(target_indices.tolist()):
            targets[row_index, self.indices[article]] = self.mean[article]

        return F.kl_div(
            input=F.log_softmax(predictions, dim=-1),
            target=F.log_softmax(targets, dim=-1),
            reduction="batchmean",
            log_target=True,
        )
class ArticleTrainer(Trainer):
    """``Trainer`` whose loss comes from an ``ArticleMeasure``."""

    def __init__(self, *args, article_file: Path = None, **kwargs) -> None:
        """Set up the base trainer, then build the measure from ``article_file``.

        Args:
            *args: Positional arguments for ``Trainer``.
            article_file: File handed to ``ArticleMeasure``.
            **kwargs: Keyword arguments for ``Trainer``.
        """
        super().__init__(*args, **kwargs)
        self.measure = ArticleMeasure(article_file)
        # Put the measure's tensors on the device the model is on right now.
        self.measure.to(self.model.device)

    def compute_loss(
        self,
        model: AutoModelForMaskedLM,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        return_outputs: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
        """Run the model and score its logits with the article measure.

        Only ``input_ids`` and ``attention_mask`` are passed to the model;
        ``labels`` go to the measure instead.

        Returns:
            The loss, or ``(loss, model_output)`` when ``return_outputs`` is
            true.
        """
        model_output: MaskedLMOutput = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )
        loss: torch.Tensor = self.measure.loss(
            model_output.logits, labels=inputs["labels"]
        )
        if return_outputs:
            return loss, model_output
        return loss
def compute_metrics(model_output: EvalPrediction) -> Dict[str, float]:
    """Average the two prediction columns into named metrics.

    Args:
        model_output: Object whose ``predictions`` attribute is indexed as
            ``predictions[:, 0]`` and ``predictions[:, 1]``.
            NOTE(review): assumes column 0 already holds the KL divergence
            and column 1 the overlap — nothing in the visible code produces
            that layout; confirm against the caller.

    Returns:
        ``{"kl_div": ..., "overlap": ...}`` with built-in ``float`` values, so
        the result matches the declared return type rather than carrying
        array-library scalars such as ``numpy.float32``.
    """
    predictions = model_output.predictions
    # The distance is the loss already, so a plain mean is enough.
    return {
        "kl_div": float(predictions[:, 0].mean()),
        "overlap": float(predictions[:, 1].mean()),
    }
def train(
    *,
    model_name: str = "xlm-roberta-base",
    batch_size: int = 32,
    learning_rate: float = 1e-4,
    fp16: bool = False,
    epochs: Optional[float] = 2,
    max_steps: int = -1,
    evaluation_steps: int = 500,
    article_file: Path = None,
) -> Path:
    """Fine-tune a masked language model with ``ArticleTrainer`` and save it.

    Args:
        model_name: Name passed to ``from_pretrained`` for both the tokenizer
            and the model.
        batch_size: Per-device batch size for training and evaluation.
        learning_rate: Learning rate given to the training arguments.
        fp16: Forwarded to the training arguments; also adds ``fp16`` to the
            run name.
        epochs: Number of training epochs; appears in the run name only when
            ``max_steps`` is ``-1``.
        max_steps: Forwarded as ``max_steps``; any value other than ``-1``
            replaces the epoch tag in the run name.
        evaluation_steps: Interval used for logging, evaluation and saving.
        article_file: File given to ``ArticleTrainer``; must not be ``None``.

    Returns:
        ``MODEL_FOLDER / run_name``, the folder the model was saved to.
    """
    assert article_file is not None

    # The run name spells out the hyperparameters that define this run.
    length_tag = f"ms{max_steps}" if max_steps != -1 else f"e{epochs}"
    name_parts = [
        f"{model_name}",
        length_tag,
        f"bs{batch_size}",
        f"lr{learning_rate}",
    ]
    if fp16:
        name_parts.append("fp16")
    run_name = "-".join(name_parts)
    print(f"Starting {run_name}")

    train_ds = datasets.load_from_disk(DATASET_FOLDER / "train.dataset")
    test_ds = datasets.load_from_disk(DATASET_FOLDER / "valid.dataset")

    training_args = ArticleTrainingArguments(
        report_to="none",
        output_dir=RUN_FOLDER,
        num_train_epochs=epochs,
        max_steps=max_steps,
        seed=33,
        # number of steps before moving evaluation results from GPU to CPU see
        # https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941
        eval_accumulation_steps=5,
        # hyperparameters
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        fp16=fp16,
        learning_rate=learning_rate,
        # evaluation settings: log, evaluate and save on the same schedule
        evaluation_strategy="steps",
        logging_steps=evaluation_steps,
        eval_steps=evaluation_steps,
        save_steps=evaluation_steps,
        # checkpoint settings: keep two checkpoints, restore the lowest loss
        logging_dir=RUN_FOLDER / "logs",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = ArticleTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
        article_file=article_file,
    )
    trainer.train()

    destination = MODEL_FOLDER / run_name
    model.save_pretrained(destination)
    return destination
Starting xlm-roberta-base-ms1000-bs8-lr0.0001
/home/matthew/.cache/pypoetry/virtualenvs/blog-HrtMnrOS-py3.10/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
Step | Training Loss | Validation Loss |
---|---|---|
100 | 1.240700 | 0.566710 |
200 | 0.588300 | 0.485166 |
300 | 0.505400 | 0.442383 |
400 | 0.480900 | 0.434608 |
500 | 0.427500 | 0.405620 |
600 | 0.394600 | 0.382248 |
700 | 0.371100 | 0.375811 |
800 | 0.348000 | 0.351471 |
900 | 0.326100 | 0.329928 |
1000 | 0.323300 | 0.326775 |
# from src/main/python/blog/prompt_internalization/multilingual/roberta/evaluate.py
from pathlib import Path
from typing import List, Optional, Tuple
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
def evaluate(
    model_name: str, model_path: Path, ignore_tokens: Optional[List[int]] = None
) -> None:
    """Load a trained model and print all four qualitative evaluations.

    Args:
        model_name: Name the tokenizer is loaded from.
        model_path: Location the trained model is loaded from.
        ignore_tokens: Token ids passed on to every evaluation; ``None`` is
            treated as an empty list.
    """
    ignored = [] if ignore_tokens is None else ignore_tokens

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_path)
    model.eval()

    # Same order as the report: bass, Friday, Malibu, football.
    for run_evaluation in (
        bass_evaluation,
        friday_evaluation,
        malibu_evaluation,
        football_evaluation,
    ):
        run_evaluation(model=model, tokenizer=tokenizer, ignore_tokens=ignored)
def bass_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """Print the predicted descriptions of "bass" in three sentences.

    All three predictions are computed before anything is printed. The report
    lists each phrase with its description and then the pairwise overlaps.
    """
    phrases = (
        "We spotted a large bass in the ocean.",
        "The bass player did not receive the acknowledgment she deserves.",
        "The black sea bass, is a member of the wreckfish family.",
    )
    descriptions = [
        get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=phrase,
            noun="bass",
            ignore_tokens=ignore_tokens,
        )
        for phrase in phrases
    ]

    print("=== BASS EVALUATION ===")
    for ordinal, phrase, description in zip(
        ("First", "Second", "Third"), phrases, descriptions
    ):
        print(f"{ordinal} Phrase is: {phrase} Target is: bass")
        print(f"Description is: {', '.join(description)}")
        print()

    first, second, third = (set(description) for description in descriptions)
    print(f"First & Second: {sorted(first & second)}")
    print(f"First & Third: {sorted(first & third)}")
    print(f"Second & Third: {sorted(second & third)}")
    print()
def friday_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """Print the descriptions of "Friday" in a Spanish and an English sentence.

    Both predictions are computed first; the report then shows each phrase
    with its description, followed by the words the two descriptions share
    and the words that appear in only one of them.
    """
    spanish_text = "Friday es mi canción favorita."
    english_text = "Friday is my favourite song."

    def describe(text: str) -> List[str]:
        # Both sentences use the same noun and the same ignore list.
        return get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=text,
            noun="Friday",
            ignore_tokens=ignore_tokens,
        )

    spanish_predicted_words = describe(spanish_text)
    english_predicted_words = describe(english_text)

    spanish_words = set(spanish_predicted_words)
    english_words = set(english_predicted_words)
    shared = sorted(spanish_words & english_words)
    unshared = sorted(spanish_words ^ english_words)

    print("=== FRIDAY EVALUATION ===")
    print(f"Spanish Phrase is: {spanish_text}")
    print(f"Spanish Description is: {', '.join(spanish_predicted_words)}")
    print(f"English Phrase is: {english_text}")
    print(f"English Description is: {', '.join(english_predicted_words)}")
    print()
    print(f"Description Overlap is: {', '.join(shared)}")
    print(f"Description Difference is: {', '.join(unshared)}")
    print()
def malibu_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """Print the descriptions of the two "Malibu" mentions in one sentence.

    Occurrence 0 and occurrence 1 of the noun are described separately, then
    the shared words and the words unique to one description are printed.
    """
    text = "I like to drive my Malibu while drinking Malibu."

    # Occurrence 0 is the first "Malibu" in the sentence, occurrence 1 the second.
    first_predicted_words, second_predicted_words = (
        get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=text,
            noun="Malibu",
            index=occurrence,
            ignore_tokens=ignore_tokens,
        )
        for occurrence in (0, 1)
    )

    first_words = set(first_predicted_words)
    second_words = set(second_predicted_words)

    print("=== MALIBU EVALUATION ===")
    print(f"Phrase is: {text}")
    print(f"First Malibu (car) Description is: {', '.join(first_predicted_words)}")
    print(f"Second Malibu (drink) Description is: {', '.join(second_predicted_words)}")
    print()
    print(f"First & Second: {sorted(first_words & second_words)}")
    print(f"First ^ Second: {sorted(first_words ^ second_words)}")
    print()
def football_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    """Compare descriptions of matching nouns in a Spanish/English text pair.

    After printing both phrases, each Spanish noun is described in the
    Spanish phrase and its English counterpart in the English phrase. The
    two descriptions are printed with their overlap and difference, each
    followed by its size in parentheses.
    """
    spanish_phrase = (
        "Retiremos el equipo de la cancha, "
        "Boca no merece jugar esta copa que "
        "hace tiempo viene siendo desprestigiada.\n"
        "Ya no se juega al futbol."
    )
    english_phrase = (
        "Let's remove the team from the field, "
        "Boca does not deserve to play this cup that "
        "has long been discredited. "
        "Football is no longer played."
    )
    noun_pairs = (
        ("equipo", "team"),
        ("Boca", "Boca"),
        ("copa", "cup"),
        ("tiempo", "long"),
        ("futbol", "Football"),
    )

    print("=== FOOTBALL EVALUATION ===")
    print(f"Spanish Phrase is: {spanish_phrase}")
    print(f"English Phrase is: {english_phrase}")
    print()

    for spanish_noun, english_noun in noun_pairs:
        spanish_description = get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=spanish_phrase,
            noun=spanish_noun,
            ignore_tokens=ignore_tokens,
        )
        english_description = get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=english_phrase,
            noun=english_noun,
            ignore_tokens=ignore_tokens,
        )

        spanish_words = set(spanish_description)
        english_words = set(english_description)
        shared = sorted(spanish_words & english_words)
        unshared = sorted(spanish_words ^ english_words)

        print(f"Spanish word is: {spanish_noun}, English word is: {english_noun}")
        print(f"Spanish Description is: {', '.join(spanish_description)}")
        print(f"English Description is: {', '.join(english_description)}")
        print(f"Overlap is: {', '.join(shared)} ({len(shared)})")
        print(f"Difference is: {', '.join(unshared)} ({len(unshared)})")
        print()
@torch.inference_mode()
def get_predictions(
    *,
    model: AutoModelForMaskedLM,
    tokenizer: AutoTokenizer,
    text: str,
    noun: str,
    index: int = 0,
    ignore_tokens: Optional[List[int]] = None,
) -> List[str]:
    """Return the ten top-ranked vocabulary words at a noun's first token.

    Args:
        model: Masked language model whose logits are ranked.
        tokenizer: Tokenizer used to encode ``text`` and decode the result.
        text: Sentence containing ``noun``.
        noun: Word to locate in the tokenized text.
        index: Which occurrence of ``noun`` to use, passed to ``get_noun``.
        ignore_tokens: Token ids to push to the bottom of the ranking;
            ``None`` means none.

    Returns:
        The decoded, whitespace-stripped words for the ten highest logits, in
        descending order of logit.
    """
    ignored = [] if ignore_tokens is None else ignore_tokens

    encoded = tokenizer(text, return_tensors="pt")
    noun_start, _noun_end = get_noun(
        tokenizer=tokenizer, tokens=encoded.input_ids[0], noun=noun, index=index
    )

    # Logits over the vocabulary at the noun's first token.
    logits = model(**encoded).logits[0, noun_start]
    # Ignored ids are lowered to the current minimum logit.
    logits[ignored] = logits.min()

    top_tokens = logits.argsort(descending=True)[:10]
    return [word.strip() for word in tokenizer.batch_decode(top_tokens)]
def get_noun(
    tokenizer: AutoTokenizer, tokens: torch.Tensor, noun: str, index: int
) -> Tuple[int, int]:
    """Locate the token span that decodes to ``noun``.

    Args:
        tokenizer: Object whose ``decode`` turns token ids into text.
        tokens: One-dimensional tensor of token ids.
        noun: Exact (case-sensitive) text to find once the decoded span has
            been stripped of surrounding whitespace.
        index: Zero-based occurrence to return; ``0`` is the first match.

    Returns:
        ``(start, end)`` such that ``tokens[start:end]`` decodes to ``noun``;
        ``end`` is exclusive.

    Raises:
        AssertionError: If the requested occurrence is not present.
    """
    length = tokens.shape[0]
    remaining = index
    for start_index in range(length):
        # Cheap filter: the first token must decode to a prefix of the noun.
        first_piece = tokenizer.decode(tokens[start_index]).strip()
        if not noun.startswith(first_piece):
            continue
        # ``end_index`` is exclusive, so it has to reach ``length`` for a
        # span that ends on the final token to be considered at all.
        for end_index in range(start_index + 1, length + 1):
            span = tokenizer.decode(tokens[start_index:end_index]).strip()
            if span != noun:
                continue
            if remaining <= 0:
                return start_index, end_index
            remaining -= 1
            # One start position is one occurrence. Without this break a
            # following whitespace-only token would make the longer span
            # match again after stripping and be counted a second time.
            break
    raise AssertionError(f"Did not find {noun}[{index}] in {tokenizer.decode(tokens)}")
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Area, Material, Type, Color, Location, Size, Description, View, Surface, Name
Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Instrument, Material, Type, Music, Style, Position, Description, Language, Profession, Color
Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Animal, Type, Color, Name, Plant, Material, Food, Description, Cat, Family
First & Second: ['Color', 'Description', 'Material', 'Type']
First & Third: ['Color', 'Description', 'Material', 'Name', 'Type']
Second & Third: ['Color', 'Description', 'Material', 'Type']
=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Date, Day, Time, Description, Name, Song, Theme, Type, Tag, Color
English Phrase is: Friday is my favourite song.
English Description is: Date, Day, Time, Theme, Song, Description, Name, Color, Type, Tag
Description Overlap is: Color, Date, Day, Description, Name, Song, Tag, Theme, Time, Type
Description Difference is:
=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Country, Location, Land, Language, Region, City, Origin, Type, State, Source
Second Malibu (drink) Description is: Food, Language, Source, Country, Type, Material, Color, Drink, Style, Culture
First & Second: ['Country', 'Language', 'Source', 'Type']
First ^ Second: ['City', 'Color', 'Culture', 'Drink', 'Food', 'Land', 'Location', 'Material', 'Origin', 'Region', 'State', 'Style']
=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.
Spanish word is: equipo, English word is: team
Spanish Description is: Sponsor, Name, Type, Owner, Location, Organization, Title, Country, Company, Sport
English Description is: Type, Name, Organization, Sponsor, Title, Team, Location, Sport, Country, Company
Overlap is: Company, Country, Location, Name, Organization, Sponsor, Sport, Title, Type (9)
Difference is: Owner, Team (2)
Spanish word is: Boca, English word is: Boca
Spanish Description is: Sponsor, Company, Owner, City, Location, Organization, Country, Name, Team, Land
English Description is: Sponsor, Company, Owner, Organization, City, Location, Name, Country, Brand, Team
Overlap is: City, Company, Country, Location, Name, Organization, Owner, Sponsor, Team (9)
Difference is: Brand, Land (2)
Spanish word is: copa, English word is: cup
Spanish Description is: Title, Type, Series, Category, Sports, Game, Sport, Sponsor, Organization, Year
English Description is: Title, Series, Type, Sport, Sports, Organization, Category, Status, Sponsor, Game
Overlap is: Category, Game, Organization, Series, Sponsor, Sport, Sports, Title, Type (9)
Difference is: Status, Year (2)
Spanish word is: tiempo, English word is: long
Spanish Description is: Time, Year, Age, Date, Description, Country, History, Location, Title, Duration
English Description is: Age, Title, Description, Year, Type, Subject, Status, History, Date, Country
Overlap is: Age, Country, Date, Description, History, Title, Year (7)
Difference is: Duration, Location, Status, Subject, Time, Type (6)
Spanish word is: futbol, English word is: Football
Spanish Description is: Sports, Sport, Type, Style, Football, Game, Language, Religion, Category, Culture
English Description is: Sports, Football, Sport, Style, Language, Type, Religion, Game, Culture, Category
Overlap is: Category, Culture, Football, Game, Language, Religion, Sport, Sports, Style, Type (10)
Difference is: (0)
PyTorch: setting up devices
Starting xlm-roberta-base-ms20000-bs8-lr0.0001
Could not locate the tokenizer configuration file, will try to use the model config instead.
/home/matthew/.cache/pypoetry/virtualenvs/blog-HrtMnrOS-py3.10/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
Step | Training Loss | Validation Loss |
---|---|---|
1000 | 0.556900 | 0.404128 |
2000 | 0.362400 | 0.352797 |
3000 | 0.310700 | 0.331835 |
4000 | 0.279500 | 0.308182 |
5000 | 0.254400 | 0.293333 |
6000 | 0.240900 | 0.310928 |
7000 | 0.221900 | 0.281576 |
8000 | 0.207000 | 0.273815 |
9000 | 0.198500 | 0.256036 |
10000 | 0.186200 | 0.254111 |
11000 | 0.171700 | 0.257458 |
12000 | 0.161200 | 0.242577 |
13000 | 0.156700 | 0.235280 |
14000 | 0.147600 | 0.245646 |
15000 | 0.144100 | 0.233584 |
16000 | 0.138200 | 0.227569 |
17000 | 0.131800 | 0.225234 |
18000 | 0.126100 | 0.215953 |
19000 | 0.121000 | 0.217426 |
20000 | 0.120600 | 0.215612 |
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Material, Type, Description, Location, Area, Size, Application, Source, Name, Color
Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Instrument, Type, Material, Game, Music, Style, Player, System, Description, Language
Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Type, Material, Game, Instrument, Fish, Color, Country, Language, Description, Sports
First & Second: ['Description', 'Material', 'Type']
First & Third: ['Color', 'Description', 'Material', 'Type']
Second & Third: ['Description', 'Game', 'Instrument', 'Language', 'Material', 'Type']
=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Tag, Date, Day, Name, Music, Song, Description, Color, Title, Album
English Phrase is: Friday is my favourite song.
English Description is: Date, Day, Time, Tag, Holiday, Birthday, Theme, Year, Name, Description
Description Overlap is: Date, Day, Description, Name, Tag
Description Difference is: Album, Birthday, Color, Holiday, Music, Song, Theme, Time, Title, Year
=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Location, Country, City, Land, Place, Area, Region, State, Village, Local
Second Malibu (drink) Description is: Location, Country, City, Land, Region, Place, Area, State, Local, Language
First & Second: ['Area', 'City', 'Country', 'Land', 'Local', 'Location', 'Place', 'Region', 'State']
First ^ Second: ['Language', 'Village']
=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.
Spanish word is: equipo, English word is: team
Spanish Description is: Sponsor, Organization, Owner, Name, Team, Company, Location, Type, Sports, Title
English Description is: Organization, Team, Sponsor, Name, Owner, Company, Type, Location, Sport, Title
Overlap is: Company, Location, Name, Organization, Owner, Sponsor, Team, Title, Type (9)
Difference is: Sport, Sports (2)
Spanish word is: Boca, English word is: Boca
Spanish Description is: Sponsor, Company, City, Club, Organization, Team, Owner, Land, Location, Country
English Description is: Sponsor, Company, City, Club, Organization, Team, Owner, Land, Location, Country
Overlap is: City, Club, Company, Country, Land, Location, Organization, Owner, Sponsor, Team (10)
Difference is: (0)
Spanish word is: copa, English word is: cup
Spanish Description is: Title, Type, Sport, Series, Sports, Game, Category, Football, Location, Sponsor
English Description is: Title, Type, Series, Sport, Category, Sports, Football, Game, Cup, Organization
Overlap is: Category, Football, Game, Series, Sport, Sports, Title, Type (8)
Difference is: Cup, Location, Organization, Sponsor (4)
Spanish word is: tiempo, English word is: long
Spanish Description is: Year, Age, Time, Type, Date, Country, Duration, Location, Sport, Description
English Description is: Year, Description, Title, Age, Type, Date, Country, Religion, Name, Location
Overlap is: Age, Country, Date, Description, Location, Type, Year (7)
Difference is: Duration, Name, Religion, Sport, Time, Title (6)
Spanish word is: futbol, English word is: Football
Spanish Description is: Sports, Sport, Football, Type, Style, Game, Religion, Language, Theme, Title
English Description is: Sports, Sport, Football, Type, Style, Religion, Game, Language, Series, Title
Overlap is: Football, Game, Language, Religion, Sport, Sports, Style, Title, Type (9)
Difference is: Series, Theme (2)
I really need a more consistent way to measure the performance of this model. It seems good?