import blog.transformers_logging
October 10, 2021
When training the model using mean squared error the loss essentially vanished. The problem with this is that the gradients from such a tiny loss produce very minor updates. If the training is redone with the loss restricted to a small set of tokens then the updates should be more significant.
What is the scale of the difference? The vocabulary size of BART is around 50,000, yet only 50 tokens are being considered, and even across those 50 the target is an uneven distribution that sums to 1. This means that if the model produces random output the mean squared error over the full vocabulary is still very small (at most 0.00004). Restricting the loss to only the 50 relevant tokens vastly increases the potential error value, so the model is more likely to learn to predict the correct tokens.
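As a rough illustration of that scale difference, here is a small sketch that assumes uniform random predictions over the vocabulary and the target mass spread evenly over the 50 relevant tokens (both assumptions, not the real data):

# rough illustration of the scale difference between the two losses
# assumptions: uniform predictions over a 50,000 token vocabulary and the
# target mass spread evenly over the 50 relevant tokens
import torch

vocab_size = 50_000
relevant = 50

predictions = torch.full((vocab_size,), 1 / vocab_size)
targets = torch.zeros(vocab_size)
targets[:relevant] = 1 / relevant

full_mse = torch.nn.functional.mse_loss(predictions, targets)
restricted_mse = torch.nn.functional.mse_loss(predictions[:relevant], targets[:relevant])

print(f"mse over the full vocabulary: {full_mse:.2e}")            # around 4e-07
print(f"mse over the 50 relevant tokens: {restricted_mse:.2e}")   # around 4e-04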
These are the two extreme positions - learning every token, and learning only the relevant tokens. There are also intermediate strategies where some number of other tokens are included in the loss calculation. The additional tokens could be ones that the model is wrongly predicting strongly or they could be randomly selected.
Let’s start with just the 50 PMI tokens. Since the overall output of the model runs through a softmax, training on those tokens should push the non-PMI tokens down towards zero anyway.
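A tiny sketch of that softmax coupling, with illustrative numbers only: boosting the logits of the first 50 tokens necessarily drains probability mass from every other token.

# illustrative only: raising the logits of 50 tokens pulls the softmax mass
# away from the other 49,950 tokens
import torch

logits = torch.zeros(50_000)
print(logits.softmax(dim=-1)[50:].sum())  # ~0.999 of the mass is outside the first 50

logits[:50] += 10.0  # pretend training has boosted the 50 PMI tokens
print(logits.softmax(dim=-1)[50:].sum())  # only ~0.04 of the mass remains outside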
# from src/main/python/blog/wikipedia_link/loss/link_mse_only_pmi.py
import torch


def calculate_loss_link_mse_only_pmi(
    predictions: torch.Tensor,    # [:, n, vocab_size]
    link_labels: torch.Tensor,    # [:, n] for index
    token_indices: torch.Tensor,  # index -> 50 tokens
    token_values: torch.Tensor,   # index -> 50 pmi values
) -> torch.Tensor:
    """Calculate the loss for the link predictions.
    The labels for this are only valid when they are positive.
    The predictions are only the link target predictions."""
    predictions = predictions.softmax(dim=-1)

    mask = link_labels.flatten() >= 0
    link_labels = link_labels.flatten()[mask].long()
    rows = link_labels.shape[0]

    vocab_size = predictions.shape[-1]
    predictions = predictions.view(-1, vocab_size)[mask]
    rows = predictions.shape[0]

    targets = token_values[link_labels]

    offsets = torch.tensor(range(rows), device=predictions.device) * vocab_size
    predictions = predictions.flatten()[token_indices[link_labels] + offsets[:, None]]
    predictions = predictions.reshape(-1, targets.shape[-1])

    return torch.nn.functional.mse_loss(predictions, targets)
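A toy invocation of this loss, with made-up shapes and values (2 sequences of 4 tokens, 3 link targets, a tiny vocabulary, and -1 marking non-link positions):

# toy invocation with made-up shapes and values, just to show how the pieces fit
batch, seq_len, vocab_size, pages = 2, 4, 1_000, 3
predictions = torch.randn(batch, seq_len, vocab_size)
link_labels = torch.tensor([[0, -1, 2, -1], [1, 1, -1, 0]])  # -1 marks non-link tokens
token_indices = torch.randint(vocab_size, (pages, 50))
token_values = torch.rand(pages, 50).softmax(dim=-1)

loss = calculate_loss_link_mse_only_pmi(
    predictions=predictions,
    link_labels=link_labels,
    token_indices=token_indices,
    token_values=token_values,
)
print(loss)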
# from src/main/python/blog/wikipedia_link/metrics/link_jaccard.py
from typing import Dict

import numpy as np


def metric_link_jaccard(
    predictions: np.array,
    labels: np.array,
    token_indices: np.array,
) -> Dict[str, float]:
    # predictions and labels have been flattened
    # predictions have been sliced
    # labels have NOT been sliced
    # this means that labels is either [3, n] ints (begin, within) or [2, n] ints (iob)
    # and predictions is [n, vocab_size] floats

    # limit predictions to just the links
    link_mask = labels[:, :-1].sum(axis=1) > 0
    predictions = predictions[link_mask]
    predictions = predictions.argsort(axis=1)[:, -50:]

    labels = labels[link_mask][:, -1]
    labels = token_indices[labels]

    intersections = sum(
        len(np.intersect1d(preds, labs)) for preds, labs in zip(predictions, labels)
    )
    unions = sum(
        len(np.union1d(preds, labs)) for preds, labs in zip(predictions, labels)
    )
    return {
        "accuracy": intersections / unions,
        "intersection": intersections,
        "union": unions,
    }
# from src/main/python/blog/wikipedia_link/model/bart_only_value.py
import numpy as np
import torch
from transformers import BartConfig, BartForConditionalGeneration


class BartLinksOnlyValue(BartForConditionalGeneration):
    def __init__(self, config: BartConfig) -> None:
        super().__init__(config)
        self.token_indices = None
        self.token_values = None

    @staticmethod
    def link_loss(
        predictions: torch.Tensor,  # [:, n, vocab_size]
        link_labels: torch.Tensor,  # [:, n] for index
        token_indices: np.array,    # index -> 50 tokens
        token_values: np.array,
    ) -> torch.Tensor:
        raise NotImplementedError()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
    ):
        assert self.token_indices is not None, "Model misconfigured, set token_indices"
        assert self.token_values is not None, "Model misconfigured, set token_values"

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
        )
        logits = self.lm_head(outputs[0]) + self.final_logits_bias

        # Including the full outputs means more stuff gets passed to the metrics method.
        # Keeping the output just the logits or loss and logits makes metrics easier.
        # The base model uses this approach:
        # output = (logits,) + outputs[1:]

        if labels is not None:
            loss = self.link_loss(
                predictions=logits,
                link_labels=labels[:, :, -1],
                token_indices=self.token_indices,
                token_values=self.token_values,
            )
            return (loss, logits)
        return (logits,)
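The link_loss method is left abstract here, so the restricted loss has to be plugged in before training. One way to do that is a small subclass (the name is my own; the notebook may wire it up differently):

# hypothetical wiring of the restricted loss into the model class
class BartLinksPmiMse(BartLinksOnlyValue):
    link_loss = staticmethod(calculate_loss_link_mse_only_pmi)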
# from src/main/python/blog/wikipedia_link/data/load.py
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import pandas as pd
import torch
from datasets import Dataset

DEFAULT_DATA_FOLDER = Path("/data/blog/2021-10-03-wikipedia-train-mse")


def load_dataset(
    data_folder: Path = DEFAULT_DATA_FOLDER, test_size: int = 64
) -> Dict[str, Dataset]:
    df = pd.read_parquet(sorted(Path(data_folder / "inputs").glob("*.gz.parquet"))[-1])
    df = df[["input_ids", "label"]]
    return Dataset.from_pandas(df).train_test_split(test_size=test_size)


def load_page_tokens(
    data_folder: Path = DEFAULT_DATA_FOLDER, device: torch.device = torch.device("cuda")
) -> Tuple[torch.Tensor, torch.Tensor]:
    token_df = pd.read_parquet(data_folder / "pmi-filtered.gz.parquet")
    token_df = token_df.set_index("title")

    token_indices = np.concatenate(token_df.tokens.values).reshape(-1, 50)
    token_indices = torch.from_numpy(token_indices).long()
    token_indices = token_indices.detach().to(device)

    token_values = np.concatenate(token_df.scores.values).reshape(-1, 50)
    token_values = torch.from_numpy(token_values).float()
    token_values = token_values.detach().to(device)
    token_values = token_values.softmax(dim=-1)

    return token_indices, token_values


def load_title_to_index(data_folder: Path = DEFAULT_DATA_FOLDER) -> Dict[str, int]:
    df = pd.read_parquet(data_folder / "pmi-filtered.gz.parquet")
    return dict(zip(df.title, df.index))
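The training cell below refers to DATA_FOLDER, PROJECT_NAME, BATCH_SIZE, EPOCHS, model, tokenizer and split, which come from an earlier notebook cell that is not shown. A plausible version of that setup, where the pretrained checkpoint and the constant values are assumptions rather than the notebook's actual values, would be:

# assumed setup for the training cells below; the checkpoint name and the
# batch size are guesses, only the epoch count is taken from the train output
from pathlib import Path
from transformers import AutoTokenizer

DATA_FOLDER = Path("/data/blog/2021-10-10-wikipedia-train-focused-mse")
PROJECT_NAME = "wikipedia-link"
BATCH_SIZE = 16
EPOCHS = 5

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
split = load_dataset()
token_indices, token_values = load_page_tokens()

model = BartLinksPmiMse.from_pretrained("facebook/bart-base")
model.token_indices = token_indices
model.token_values = token_values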
from typing import *
from pathlib import Path

from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction

MODEL_RUN_FOLDER = DATA_FOLDER / "runs"
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)

token_indices_npy = token_indices.cpu().numpy()

def compute_metrics(preds: EvalPrediction) -> Dict[str, float]:
    labels = preds.label_ids.reshape(-1, 2)
    predictions = preds.predictions.reshape(-1, tokenizer.vocab_size)
    return {
        f"link_{key}": value
        for key, value in metric_link_jaccard(
            predictions=predictions,
            labels=labels,
            token_indices=token_indices_npy,
        ).items()
    }

training_args = TrainingArguments(
    report_to=[],
    output_dir=MODEL_RUN_FOLDER / "output",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    warmup_ratio=0.06,
    evaluation_strategy="steps",
    logging_dir=MODEL_RUN_FOLDER / "output",
    num_train_epochs=EPOCHS,
    logging_steps=1_000,
    # not really training properly here
    # load_best_model_at_end=True,
    # metric_for_best_model="quality",
    # greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
Step | Training Loss | Validation Loss | Link Accuracy | Link Intersection | Link Union |
---|---|---|---|---|---|
1000 | 0.001400 | 0.001019 | 0.016341 | 2005 | 122695 |
2000 | 0.001100 | 0.001021 | 0.017619 | 2159 | 122541 |
3000 | 0.001100 | 0.000977 | 0.018466 | 2261 | 122439 |
4000 | 0.001100 | 0.000995 | 0.020684 | 2527 | 122173 |
5000 | 0.001100 | 0.000973 | 0.021185 | 2587 | 122113 |
6000 | 0.001100 | 0.000960 | 0.021369 | 2609 | 122091 |
7000 | 0.001100 | 0.000964 | 0.020099 | 2457 | 122243 |
8000 | 0.001000 | 0.000972 | 0.023952 | 2917 | 121783 |
9000 | 0.001100 | 0.001007 | 0.023574 | 2872 | 121828 |
10000 | 0.001000 | 0.000960 | 0.025266 | 3073 | 121627 |
11000 | 0.001000 | 0.000947 | 0.025005 | 3042 | 121658 |
12000 | 0.001000 | 0.000937 | 0.027183 | 3300 | 121400 |
13000 | 0.001000 | 0.000941 | 0.027648 | 3355 | 121345 |
14000 | 0.001000 | 0.000942 | 0.027640 | 3354 | 121346 |
15000 | 0.001000 | 0.000923 | 0.028047 | 3402 | 121298 |
16000 | 0.001000 | 0.000943 | 0.028309 | 3433 | 121267 |
17000 | 0.001000 | 0.000930 | 0.028632 | 3471 | 121229 |
18000 | 0.001000 | 0.000936 | 0.028946 | 3508 | 121192 |
19000 | 0.001000 | 0.000945 | 0.029770 | 3605 | 121095 |
20000 | 0.001000 | 0.000960 | 0.029847 | 3614 | 121086 |
21000 | 0.001000 | 0.000926 | 0.030136 | 3648 | 121052 |
22000 | 0.001000 | 0.000935 | 0.029855 | 3615 | 121085 |
23000 | 0.001000 | 0.000920 | 0.030749 | 3720 | 120980 |
24000 | 0.001000 | 0.000950 | 0.030877 | 3735 | 120965 |
25000 | 0.001000 | 0.000932 | 0.031431 | 3800 | 120900 |
26000 | 0.001000 | 0.000903 | 0.031576 | 3817 | 120883 |
27000 | 0.001000 | 0.000907 | 0.031866 | 3851 | 120849 |
28000 | 0.001000 | 0.000926 | 0.031294 | 3784 | 120916 |
29000 | 0.001000 | 0.000900 | 0.032567 | 3933 | 120767 |
30000 | 0.001000 | 0.000907 | 0.032422 | 3916 | 120784 |
31000 | 0.001000 | 0.000931 | 0.032345 | 3907 | 120793 |
32000 | 0.000900 | 0.000906 | 0.032858 | 3967 | 120733 |
33000 | 0.000900 | 0.000901 | 0.033054 | 3990 | 120710 |
34000 | 0.000900 | 0.000897 | 0.033123 | 3998 | 120702 |
35000 | 0.000900 | 0.000895 | 0.033080 | 3993 | 120707 |
36000 | 0.000900 | 0.000902 | 0.033183 | 4005 | 120695 |
37000 | 0.000900 | 0.000910 | 0.033500 | 4042 | 120658 |
38000 | 0.000900 | 0.000908 | 0.033405 | 4031 | 120669 |
Configuration saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/runs/output/checkpoint-500/config.json
Model weights saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/runs/output/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/runs/output/checkpoint-500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/runs/output/checkpoint-500/special_tokens_map.json
... (the same four lines repeat for every 500-step checkpoint) ...
Configuration saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/runs/output/checkpoint-38500/config.json
Model weights saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/runs/output/checkpoint-38500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/runs/output/checkpoint-38500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/runs/output/checkpoint-38500/special_tokens_map.json
TrainOutput(global_step=38555, training_loss=0.0010075891841912668, metrics={'train_runtime': 16947.989, 'train_samples_per_second': 36.395, 'train_steps_per_second': 2.275, 'total_flos': 9.40236483723264e+16, 'train_loss': 0.0010075891841912668, 'epoch': 5.0})
This training run finished with a Jaccard accuracy of 0.033405, which beats the previous MSE approach that peaked at 0.031482. The run feels like it was slower to train, and it did not reach a peak as it was still improving at the end. I am resorting to words like "feel" because, while I can state facts about this run, the comparison between the two approaches is mostly an impression rather than a measurement.
Given that, I think it is time to start using wandb again.
Configuration saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/model-pmi-tokens/config.json
Model weights saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/model-pmi-tokens/pytorch_model.bin
Let’s see what the inference looks like.
# from src/main/python/blog/wikipedia_link/evaluation/text_no_ee.py
from typing import Any, Dict, Iterator, List, Optional, Tuple

import regex as re
import torch
import wikitextparser as wtp
from transformers import AutoModel, AutoTokenizer

CATEGORY_LINK_PATTERN = re.compile(r"\[\[[^]]+:[^]]+\]\]")
TITLE_PATTERN = re.compile(r"(=+)[^=]+\1")
WHITESPACE_PATTERN = re.compile(r"\s+")


@torch.no_grad()
def infer_no_ee(
    text: str, *, tokenizer: AutoTokenizer, model: AutoModel
) -> List[Tuple[str, Optional[List[str]]]]:
    tokens, boundaries = _encode(text, tokenizer=tokenizer, max_length=256)
    tokens = torch.tensor(tokens, device=model.device)[None, :]
    output = model(tokens)[0]
    lm_output = output[0, :, : model.config.vocab_size]

    results = []
    word_tokens = None
    word_output = None
    for token, link, (is_start, is_within) in zip(
        tokens[0, :, None], lm_output, boundaries
    ):
        if is_start:
            if word_tokens:
                results.append(
                    _to_result(
                        word_tokens=word_tokens, output=word_output, tokenizer=tokenizer
                    )
                )
            word_tokens = [token.item()]
            word_output = link
        elif is_within:
            word_tokens.append(token.item())
            word_output += link
        else:
            if word_tokens:
                results.append(
                    _to_result(
                        word_tokens=word_tokens, output=word_output, tokenizer=tokenizer
                    )
                )
            word_tokens = None
            word_output = None
    if word_tokens:
        results.append(
            _to_result(word_tokens=word_tokens, output=word_output, tokenizer=tokenizer)
        )
    return results


def _encode(
    text: str, tokenizer: AutoTokenizer, max_length: int
) -> Tuple[List[int], List[Tuple[bool, bool]]]:
    """
    Tokenize the text and extract the wiki marked up entities.
    This returns the following values:
        input_ids: List[int] - the tokenized text
        entities: List[Tuple[bool, bool]] - token boundaries, [start, within]
    """
    linked_text = _extract_links(text)
    tokenized_text = tokenizer(
        linked_text["text"],
        return_offsets_mapping=True,
        return_attention_mask=False,
        max_length=max_length,
        truncation=True,
    )
    entities = _to_boundaries(
        token_offsets=tokenized_text["offset_mapping"],
        link_starts=linked_text["start"],
        link_ends=linked_text["end"],
    )
    return tokenized_text["input_ids"], entities


def _extract_links(text: str) -> Dict[str, Any]:
    text = text.strip()
    text = re.sub(CATEGORY_LINK_PATTERN, "", text)
    text = re.sub(TITLE_PATTERN, "", text)
    text = re.sub(WHITESPACE_PATTERN, " ", text)

    parsed = wtp.parse(text)
    links = []
    starts = []
    ends = []
    offset = 0
    for link in parsed.wikilinks:
        target = link.target
        start, end = link.span
        length = end - start
        start -= offset
        offset += length - len(link.plain_text())
        end -= offset

        links.append(target)
        starts.append(start)
        ends.append(end)

    return {
        "text": parsed.plain_text(),
        "link": links,
        "start": starts,
        "end": ends,
    }


def _to_boundaries(
    token_offsets: List[Tuple[int, int]],
    link_starts: List[int],
    link_ends: List[int],
) -> List[Tuple[bool, bool]]:
    boundaries = [(False, False)] * len(token_offsets)
    link_iter = zip(link_starts, link_ends)
    try:
        link_start, link_end = _next_link(
            token_start=0,
            links=link_iter,
        )
        within = False
        for index, (token_start, token_end) in enumerate(token_offsets):
            if token_start >= link_end:
                link_start, link_end = _next_link(
                    token_start=token_start,
                    links=link_iter,
                )
                within = False
            if token_start < link_end and token_end > link_start:
                boundaries[index] = (not within, True)
                within = True
    except StopIteration:
        pass
    return boundaries


def _next_link(
    token_start: int,
    links: Iterator[Tuple[int, int]],
) -> Tuple[int, int]:
    link_start, link_end = next(links)
    while token_start >= link_end:
        link_start, link_end = next(links)
    return link_start, link_end


def _to_result(
    word_tokens: List[int], output: torch.Tensor, tokenizer: AutoTokenizer
) -> Tuple[str, Optional[List[str]]]:
    link_tokens = output.argsort(dim=0, descending=True)[:10]
    return (
        tokenizer.decode(word_tokens),
        tokenizer.batch_decode(link_tokens[:, None]),
    )
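The calls that produced the outputs below are not shown in the notebook; they would look something like this, with placeholder sentences rather than the ones actually used:

# placeholder invocations; the real input sentences are not shown
infer_no_ee("I want to visit [[Malibu]] this summer.", tokenizer=tokenizer, model=model)
infer_no_ee("[[Apple]] and [[Samsung]] both make a [[phone]].", tokenizer=tokenizer, model=model)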
[(' Malibu',
['ibu',
' tsun',
' condos',
' Godzilla',
' taxis',
' renters',
' Lyft',
'iami',
' condo',
' Cruiser'])]
[('Apple',
[' iPhones',
' iCloud',
' iPhone',
'Apple',
' Apple',
' MacBook',
' iPad',
' iPads',
' iPod',
'iPhone']),
(' Samsung',
[' Samsung',
'Samsung',
' Lenovo',
' iPhones',
' handset',
' OLED',
' Panasonic',
' iCloud',
' Chromebook',
' chipset']),
(' phone',
[' handset',
' iPhones',
' iPhone',
' iCloud',
' Samsung',
' smartphones',
' BlackBerry',
' iPod',
' MacBook',
'Samsung'])]
[(' apple',
[' apples',
' apple',
' oranges',
' beet',
' broccoli',
' avocado',
' bananas',
' almonds',
' cabbage',
' strawberries'])]
[(' Apple',
[' MacBook',
' iPhones',
' iPhone',
' iPad',
' iPod',
' iPads',
' iCloud',
' Apple',
'Apple',
'iPhone'])]
The Malibu inference is considerably worse than the previous version. The different uses of the word apple show that it handles correctly cased terms well, although that is itself a concern: the model should not depend so dramatically on case to correctly categorize words. It almost feels like it is just memorizing some words.
I think the problem with the current training approach is that it has not had the negative feedback that would make it focus more clearly on the correct tokens. The case problem is a separate issue that I will look at another time.
I need to restore the wandb usage to allow better comparison of these runs. Since my employer has actually paid for a Weights & Biases account I might try to use that. Let’s have a short test run to get everything working again.
# from src/main/python/blog/wandb.py
"""
I wrote this code to easily allow me to switch between my work and home wandb profiles.
It can be stored in git without worrying about committing the api keys.

The format of the ~/.config/wandb/authentication.yaml file is:

    [section-name]
    email = your-email-here
    key = your-api-key-here

I have home and work as my two section names.
The section names are used as the first parameter to the methods this provides.

You can get your api key by going to https://wandb.ai/authorize.
"""
import configparser
from pathlib import Path
from typing import Any, Dict

import wandb

USER_FILE = Path.home() / ".config" / "wandb" / "authentication.yaml"


def wandb_login(config_key: str) -> bool:
    configuration = _wandb_config(config_key)
    if wandb.setup().settings.email == configuration["email"]:
        return True
    key = configuration["key"]
    return wandb.login(key=key, relogin=True)


def wandb_init(config_key: str, **kwargs: Any) -> wandb.sdk.wandb_run.Run:
    assert wandb_login(config_key)
    return wandb.init(**kwargs)


def _wandb_config(config_key: str) -> Dict[str, str]:
    configuration = configparser.ConfigParser()
    configuration.read_string(USER_FILE.read_text())
    return dict(configuration[config_key])
wandb: WARNING If you're specifying your api key in code, ensure this code is not shared publically.
wandb: WARNING Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.
wandb: Appending key for api.wandb.ai to your netrc file: /home/matthew/.netrc
Could not locate the tokenizer configuration file, will try to use the model config instead.
from typing import *
from pathlib import Path

import wandb
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction

MODEL_RUN_FOLDER = DATA_FOLDER / "runs"
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)

token_indices_npy = token_indices.cpu().numpy()

def compute_metrics(preds: EvalPrediction) -> Dict[str, float]:
    labels = preds.label_ids.reshape(-1, 2)
    predictions = preds.predictions.reshape(-1, tokenizer.vocab_size)
    return {
        f"link_{key}": value
        for key, value in metric_link_jaccard(
            predictions=predictions,
            labels=labels,
            token_indices=token_indices_npy,
        ).items()
    }

with wandb_init(
    "work",
    project=PROJECT_NAME,
    name="test-run",
    mode="online",
) as run:
    training_args = TrainingArguments(
        report_to=["wandb"],
        output_dir=MODEL_RUN_FOLDER / "output",
        overwrite_output_dir=True,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=5e-5,
        warmup_ratio=0.06,
        evaluation_strategy="steps",
        logging_dir=MODEL_RUN_FOLDER / "output",
        # num_train_epochs=EPOCHS,
        max_steps=100,
        logging_steps=10,
        # not really training properly here
        # load_best_model_at_end=True,
        # metric_for_best_model="quality",
        # greater_is_better=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split["train"],
        eval_dataset=split["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
/home/matthew/Programming/Blog/blog/_notebooks/wandb/run-20211011_220839-gyz80ptb
using `logging_steps` to initialize `eval_steps` to 10
PyTorch: setting up devices
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Step | Training Loss | Validation Loss | Link Accuracy | Link Intersection | Link Union |
---|---|---|---|---|---|
10 | 0.003300 | 0.001046 | 0.000409 | 51 | 124649 |
20 | 0.001200 | 0.001046 | 0.000024 | 3 | 124697 |
30 | 0.001100 | 0.001046 | 0.000032 | 4 | 124696 |
40 | 0.001200 | 0.001046 | 0.000032 | 4 | 124696 |
50 | 0.001200 | 0.001046 | 0.000032 | 4 | 124696 |
60 | 0.001100 | 0.001046 | 0.000040 | 5 | 124695 |
70 | 0.001100 | 0.001046 | 0.000040 | 5 | 124695 |
80 | 0.001200 | 0.001046 | 0.000048 | 6 | 124694 |
90 | 0.001200 | 0.001046 | 0.000048 | 6 | 124694 |
100 | 0.001100 | 0.001046 | 0.000048 | 6 | 124694 |
train/loss | 0.0011 |
train/learning_rate | 0.0 |
train/epoch | 0.01 |
train/global_step | 100 |
_runtime | 81 |
_timestamp | 1633986600 |
_step | 20 |
eval/loss | 0.00105 |
eval/link_accuracy | 5e-05 |
eval/link_intersection | 6 |
eval/link_union | 124694 |
eval/runtime | 3.9759 |
eval/samples_per_second | 8.048 |
eval/steps_per_second | 0.503 |
train/train_runtime | 79.9487 |
train/train_samples_per_second | 20.013 |
train/train_steps_per_second | 1.251 |
train/total_flos | 243894583296000.0 |
train/train_loss | 0.00137 |
That was fine. It took more effort to get the different profiles working than to actually do the training. Now I can log each run and see more precisely how things change.
I’ve tried training with just the 50 tokens of interest, and while that achieves a higher peak than training over the full vocabulary, it trains slowly. If I incorporate a limited number of negative tokens into the mix then it should be possible to give the model more feedback without swamping it with negatives.
This is based on the idea of negative sampling, which I first saw explained in this article on word2vec. The number of negative samples is a hyperparameter: word2vec uses 5 negative samples when predicting a single word, so I am going to use 100 negative samples since I am predicting 50 tokens. That is aiming low, at only 2 negatives per positive token, and it can be turned up later.
# from src/main/python/blog/wikipedia_link/loss/link_mse_negative_sampling.py
import torch
def calculate_loss_link_mse_negative_sampling(
predictions: torch.Tensor, # [:, n, vocab_size]
link_labels: torch.Tensor, # [:, n] for index
token_indices: torch.Tensor, # index -> 50 tokens
token_values: torch.Tensor, # index -> 50 pmi values
negative_samples: int,
) -> torch.Tensor:
"""Calculate the loss for the link predictions.
The labels for this are only valid when they are positive.
The predictions are only the link target predictions."""
predictions = predictions.softmax(dim=-1)
mask = link_labels.flatten() >= 0
link_labels = link_labels.flatten()[mask].long()
rows = link_labels.shape[0]
vocab_size = predictions.shape[-1]
predictions = predictions.view(-1, vocab_size)[mask]
rows = predictions.shape[0]
target_width = 50 + negative_samples
targets = torch.zeros(rows, target_width, device=predictions.device)
targets[:, :50] = token_values[link_labels]
offsets = torch.tensor(range(rows), device=predictions.device) * vocab_size
prediction_indices = torch.cat(
[
token_indices[link_labels],
torch.randint(
low=0,
high=vocab_size,
size=(rows, negative_samples),
dtype=torch.long,
device=predictions.device,
),
],
dim=-1,
)
predictions = predictions.flatten()[prediction_indices + offsets[:, None]]
predictions = predictions.reshape(-1, targets.shape[-1])
return torch.nn.functional.mse_loss(predictions, targets)
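As a quick sanity check, which is not in the original notebook, the new loss can be called with random tensors to confirm that the shapes line up. Every value below is made up; the only thing that matters is that each masked row compares 150 prediction slots (50 PMI targets plus 100 negative samples) against the targets.
# hypothetical smoke test for calculate_loss_link_mse_negative_sampling
import torch
batch_size, sequence_length, vocab_size = 2, 8, 50265  # 50265 matches bart-base but any size works here
predictions = torch.rand(batch_size, sequence_length, vocab_size)
link_labels = torch.full((batch_size, sequence_length), -1, dtype=torch.long)
link_labels[0, 2] = 0  # only two tokens are links, pointing at targets 0 and 2
link_labels[1, 5] = 2
token_indices = torch.randint(0, vocab_size, (3, 50))  # 3 link targets, 50 PMI tokens each
token_values = torch.rand(3, 50).softmax(dim=-1)       # each PMI distribution sums to 1
loss = calculate_loss_link_mse_negative_sampling(
    predictions=predictions,
    link_labels=link_labels,
    token_indices=token_indices,
    token_values=token_values,
    negative_samples=100,
)
print(loss)  # a scalar tensor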
#hide_output
from transformers import AutoTokenizer
from functools import partial
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BartLinksOnlyValue.from_pretrained(MODEL_NAME)
model.token_indices = token_indices
model.token_values = token_values
model.link_loss = partial(calculate_loss_link_mse_negative_sampling, negative_samples=100)
from typing import *
from pathlib import Path
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction
MODEL_RUN_FOLDER = DATA_FOLDER / "runs"
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)
token_indices_npy = token_indices.cpu().numpy()
def compute_metrics(preds: EvalPrediction) -> Dict[str, float]:
labels = preds.label_ids.reshape(-1, 2)
predictions = preds.predictions.reshape(-1, tokenizer.vocab_size)
return {
f"link_{key}": value
for key, value in metric_link_jaccard(
predictions=predictions,
labels=labels,
token_indices=token_indices_npy
).items()
}
with wandb_init(
"work",
project=PROJECT_NAME,
name="mse-pmi-with-negative-sampling",
mode="online",
) as run:
training_args = TrainingArguments(
report_to=["wandb"],
output_dir=MODEL_RUN_FOLDER / "output",
overwrite_output_dir=True,
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
learning_rate=5e-5,
warmup_ratio=0.06,
evaluation_strategy="steps",
logging_dir=MODEL_RUN_FOLDER / "output",
num_train_epochs=EPOCHS,
logging_steps=1_000,
# not really training properly here
# load_best_model_at_end=True,
# metric_for_best_model="quality",
# greater_is_better=True,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=split["train"],
eval_dataset=split["test"],
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
trainer.train()
/home/matthew/Programming/Blog/blog/_notebooks/wandb/run-20211012_212846-1abrcbpd
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Step | Training Loss | Validation Loss | Link Accuracy | Link Intersection | Link Union |
---|---|---|---|---|---|
1000 | 0.000500 | 0.000342 | 0.014390 | 1769 | 122931 |
2000 | 0.000400 | 0.000339 | 0.015348 | 1885 | 122815 |
3000 | 0.000400 | 0.000341 | 0.018026 | 2208 | 122492 |
4000 | 0.000400 | 0.000331 | 0.021219 | 2591 | 122109 |
5000 | 0.000400 | 0.000328 | 0.021545 | 2630 | 122070 |
6000 | 0.000400 | 0.000325 | 0.021420 | 2615 | 122085 |
7000 | 0.000400 | 0.000324 | 0.021327 | 2604 | 122096 |
8000 | 0.000400 | 0.000331 | 0.025865 | 3144 | 121556 |
9000 | 0.000400 | 0.000326 | 0.024432 | 2974 | 121726 |
10000 | 0.000300 | 0.000318 | 0.026963 | 3274 | 121426 |
11000 | 0.000300 | 0.000323 | 0.026692 | 3242 | 121458 |
12000 | 0.000300 | 0.000317 | 0.027394 | 3325 | 121375 |
13000 | 0.000300 | 0.000334 | 0.028157 | 3415 | 121285 |
14000 | 0.000300 | 0.000315 | 0.027454 | 3332 | 121368 |
15000 | 0.000300 | 0.000307 | 0.027835 | 3377 | 121323 |
16000 | 0.000300 | 0.000315 | 0.028386 | 3442 | 121258 |
17000 | 0.000300 | 0.000312 | 0.030127 | 3647 | 121053 |
18000 | 0.000300 | 0.000312 | 0.030527 | 3694 | 121006 |
19000 | 0.000300 | 0.000309 | 0.030630 | 3706 | 120994 |
20000 | 0.000300 | 0.000321 | 0.029370 | 3558 | 121142 |
21000 | 0.000300 | 0.000314 | 0.030902 | 3738 | 120962 |
22000 | 0.000300 | 0.000322 | 0.031952 | 3861 | 120839 |
23000 | 0.000300 | 0.000321 | 0.032567 | 3933 | 120767 |
24000 | 0.000300 | 0.000320 | 0.031218 | 3775 | 120925 |
25000 | 0.000300 | 0.000318 | 0.032174 | 3887 | 120813 |
26000 | 0.000300 | 0.000311 | 0.032011 | 3868 | 120832 |
27000 | 0.000300 | 0.000317 | 0.032140 | 3883 | 120817 |
28000 | 0.000300 | 0.000315 | 0.032225 | 3893 | 120807 |
29000 | 0.000300 | 0.000317 | 0.032233 | 3894 | 120806 |
30000 | 0.000300 | 0.000313 | 0.032755 | 3955 | 120745 |
31000 | 0.000300 | 0.000324 | 0.033037 | 3988 | 120712 |
32000 | 0.000300 | 0.000316 | 0.033072 | 3992 | 120708 |
33000 | 0.000300 | 0.000316 | 0.033106 | 3996 | 120704 |
34000 | 0.000300 | 0.000319 | 0.033491 | 4041 | 120659 |
35000 | 0.000300 | 0.000312 | 0.033423 | 4033 | 120667 |
36000 | 0.000300 | 0.000311 | 0.033680 | 4063 | 120637 |
37000 | 0.000300 | 0.000312 | 0.033945 | 4094 | 120606 |
38000 | 0.000300 | 0.000314 | 0.033800 | 4077 | 120623 |
train/loss | 0.0003 |
train/learning_rate | 0.0 |
train/epoch | 5.0 |
train/global_step | 38555 |
_runtime | 17742 |
_timestamp | 1634088268 |
_step | 76 |
eval/loss | 0.00031 |
eval/link_accuracy | 0.0338 |
eval/link_intersection | 4077 |
eval/link_union | 120623 |
eval/runtime | 4.1699 |
eval/samples_per_second | 7.674 |
eval/steps_per_second | 0.48 |
train/train_runtime | 17741.1925 |
train/train_samples_per_second | 34.767 |
train/train_steps_per_second | 2.173 |
train/total_flos | 9.40236483723264e+16 |
train/train_loss | 0.00034 |
Let’s see what the inference looks like.
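The inference helper is defined earlier in the notebook and is not shown in this excerpt. A minimal sketch of what it might look like, assuming the custom model exposes the link predictions as .logits and that model and tokenizer are the ones trained above:
# hypothetical reconstruction of the inference helper - the original may differ
import torch
@torch.no_grad()
def top_link_tokens(text: str, position: int, k: int = 10):
    tokens = tokenizer(text, return_tensors="pt").to(model.device)
    logits = model(**tokens).logits[0]  # assumed shape [sequence_length, vocab_size]
    word = tokenizer.decode(tokens.input_ids[0, position : position + 1])
    top = logits[position].argsort(descending=True)[:k]
    return word, [tokenizer.decode(token_id) for token_id in top.tolist()]
# usage: [top_link_tokens(text, position)] where position is the index of ' Malibu' in the encoding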
[(' Malibu',
['ibu',
' condos',
' tsun',
' taxis',
' headlights',
' renters',
' ancest',
' Cruiser',
'iami',
' sedan'])]
That’s not great. For comparison, the output from the PMI-tokens-only run was:
[(' Malibu',
['ibu',
' tsun',
' condos',
' Godzilla',
' taxis',
' renters',
' Lyft',
'iami',
' condo',
' Cruiser'])]
So while it is roughly equivalent to the PMI-only run, it still lacks the quality of the binary cross entropy model.