Wikipedia Focused Mean Squared Error Train

Train the Wikipedia link recognition model using mean squared error and reduce the number of tokens considered
Published

October 10, 2021

When the model was trained using mean squared error the loss essentially vanished. The problem with this is that the gradients from such a tiny loss may result in very minor updates. If the training is redone with the loss restricted to a small set of tokens then the updates should be more significant.

What is the scale of the difference? The vocabulary size of BART is around 50,000, yet only 50 tokens are actually being targeted, and even across those 50 the target is an uneven distribution that sums to 1. When the squared errors are averaged over the full vocabulary, no matter how wrong the prediction is the loss stays tiny (at most about 0.00004, which is 2 divided by the vocabulary size). Restricting the loss to only the 50 relevant tokens divides by 50 instead of ~50,000, so the same mistakes on those tokens produce a loss roughly a thousand times larger and the model is more likely to learn to predict the correct tokens.
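
As a rough illustration of this (a minimal sketch assuming a uniform prediction and a one-hot target, rather than the real model output and PMI distribution):

Code
import torch

vocab_size = 50_265  # BART vocabulary size
n_pmi = 50           # number of tokens the loss will focus on

# worst-ish case: the target is one-hot and the prediction is uniform
target = torch.zeros(vocab_size)
target[0] = 1.0
prediction = torch.full((vocab_size,), 1 / vocab_size)

full_vocab_mse = torch.nn.functional.mse_loss(prediction, target)
focused_mse = torch.nn.functional.mse_loss(prediction[:n_pmi], target[:n_pmi])

print(full_vocab_mse.item())  # ~0.00002 - vanishingly small
print(focused_mse.item())     # ~0.02 - roughly a thousand times larger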

These are the two extreme positions - learning every token, and learning only the relevant tokens. There are also intermediate strategies where some number of other tokens are included in the loss calculation. The additional tokens could be ones that the model is wrongly predicting strongly or they could be randomly selected.
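
As a sketch of how those additional tokens could be selected (this is not the loss used in this post; the helper name, the number of negatives and how they would be merged into the loss are all assumptions):

Code
import torch


def select_loss_tokens(
    predictions: torch.Tensor,  # [rows, vocab_size] softmaxed predictions
    pmi_indices: torch.Tensor,  # [rows, 50] the relevant tokens for each row
    n_negatives: int = 50,
) -> torch.Tensor:
    """Return the PMI tokens plus the most strongly (and wrongly) predicted
    other tokens, which would get a target value of zero in the loss."""
    # mask out the PMI tokens so they cannot be picked again as negatives
    masked = predictions.scatter(1, pmi_indices, float("-inf"))
    hard_negatives = masked.topk(n_negatives, dim=1).indices
    return torch.cat([pmi_indices, hard_negatives], dim=1)  # [rows, 100]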

Code
import blog.transformers_logging
Code
from pathlib import Path

DATA_FOLDER = Path("/data/blog/2021-10-10-wikipedia-train-focused-mse")
DATA_FOLDER.mkdir(parents=True, exist_ok=True)

MODEL_NAME = "facebook/bart-base"
BATCH_SIZE = 16
EPOCHS = 5

PROJECT_NAME = "wikipedia-link-recognition"

PMI Token Loss

Let’s start with just the 50 PMI tokens. Since the overall output of the model runs through a softmax over the full vocabulary, increasing the probability of the PMI tokens should push the non-PMI tokens down towards zero anyway.
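
For reference, the loss below relies on two lookup tensors built from the PMI data: token_indices maps a page index to the vocabulary ids of its 50 PMI tokens, and token_values maps it to the softmaxed PMI scores for those tokens. A minimal sketch of the lookup for a single link label (the shapes match the real data, the values are made up):

Code
import torch

n_pages, n_pmi, vocab_size = 3, 50, 50_265  # made up page count, real PMI and vocab sizes

token_indices = torch.randint(vocab_size, (n_pages, n_pmi))  # page index -> 50 token ids
token_values = torch.rand(n_pages, n_pmi).softmax(dim=-1)    # page index -> 50 PMI scores

link_label = 2                        # this token links to page 2
target = token_values[link_label]     # [50] distribution the prediction should match
relevant = token_indices[link_label]  # [50] vocabulary positions to read the prediction at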

Code
# from src/main/python/blog/wikipedia_link/loss/link_mse_only_pmi.py
import torch


def calculate_loss_link_mse_only_pmi(
    predictions: torch.Tensor,  # [:, n, vocab_size]
    link_labels: torch.Tensor,  # [:, n] for index
    token_indices: torch.Tensor,  # index -> 50 tokens
    token_values: torch.Tensor,  # index -> 50 pmi values
) -> torch.Tensor:
    """Calculate the loss for the link predictions.
    The labels for this are only valid when they are positive.
    The predictions are only the link target predictions."""

    predictions = predictions.softmax(dim=-1)

    # keep only the tokens that have a valid (non-negative) link label
    mask = link_labels.flatten() >= 0
    link_labels = link_labels.flatten()[mask].long()

    vocab_size = predictions.shape[-1]
    predictions = predictions.view(-1, vocab_size)[mask]
    rows = predictions.shape[0]

    # the target is the softmaxed PMI distribution over the 50 tokens for the linked page
    targets = token_values[link_labels]
    # gather the predicted probability of each of those 50 tokens for every row
    offsets = torch.tensor(range(rows), device=predictions.device) * vocab_size
    predictions = predictions.flatten()[token_indices[link_labels] + offsets[:, None]]
    predictions = predictions.reshape(-1, targets.shape[-1])

    return torch.nn.functional.mse_loss(predictions, targets)



# from src/main/python/blog/wikipedia_link/metrics/link_jaccard.py
from typing import Dict

import numpy as np


def metric_link_jaccard(
    predictions: np.array,
    labels: np.array,
    token_indices: np.array,
) -> Dict[str, float]:
    # predictions and labels have been flattened
    # predictions have been sliced
    # labels have NOT been sliced
    # this means that labels is either [n, 3] ints (begin, within, link) or [n, 2] ints (iob, link)
    # and predictions is [n, vocab_size] floats

    # limit predictions to just the links
    link_mask = labels[:, :-1].sum(axis=1) > 0
    predictions = predictions[link_mask]
    predictions = predictions.argsort(axis=1)[:, -50:]
    labels = labels[link_mask][:, -1]
    labels = token_indices[labels]

    intersections = sum(
        len(np.intersect1d(preds, labs)) for preds, labs in zip(predictions, labels)
    )
    unions = sum(
        len(np.union1d(preds, labs)) for preds, labs in zip(predictions, labels)
    )

    return {
        "accuracy": intersections / unions,
        "intersection": intersections,
        "union": unions,
    }



# from src/main/python/blog/wikipedia_link/model/bart_only_value.py
import numpy as np
import torch
from transformers import BartConfig, BartForConditionalGeneration


class BartLinksOnlyValue(BartForConditionalGeneration):
    def __init__(self, config: BartConfig) -> None:
        super().__init__(config)
        self.token_indices = None
        self.token_values = None

    @staticmethod
    def link_loss(
        predictions: torch.Tensor,  # [:, n, vocab_size]
        link_labels: torch.Tensor,  # [:, n] for index
        token_indices: np.array,  # index -> 50 tokens
        token_values: np.array,
    ) -> torch.Tensor:
        raise NotImplementedError()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
    ):
        assert self.token_indices is not None, "Model misconfigured, set token_indices"
        assert self.token_values is not None, "Model misconfigured, set token_values"

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
        )
        logits = self.lm_head(outputs[0]) + self.final_logits_bias

        # Including the full outputs means more stuff gets passed to the metrics method.
        # Keeping the output just the logits or loss and logits makes metrics easier.
        # The base model uses this approach:
        # output = (logits,) + outputs[1:]

        if labels is not None:
            loss = self.link_loss(
                predictions=logits,
                link_labels=labels[:, :, -1],
                token_indices=self.token_indices,
                token_values=self.token_values,
            )
            return (loss, logits)
        return (logits,)



# from src/main/python/blog/wikipedia_link/data/load.py
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import pandas as pd
import torch
from datasets import Dataset

DEFAULT_DATA_FOLDER = Path("/data/blog/2021-10-03-wikipedia-train-mse")


def load_dataset(
    data_folder: Path = DEFAULT_DATA_FOLDER, test_size: int = 64
) -> Dict[str, Dataset]:
    df = pd.read_parquet(sorted(Path(data_folder / "inputs").glob("*.gz.parquet"))[-1])
    df = df[["input_ids", "label"]]

    return Dataset.from_pandas(df).train_test_split(test_size=test_size)


def load_page_tokens(
    data_folder: Path = DEFAULT_DATA_FOLDER, device: torch.device = torch.device("cuda")
) -> Tuple[torch.Tensor, torch.Tensor]:
    token_df = pd.read_parquet(data_folder / "pmi-filtered.gz.parquet")
    token_df = token_df.set_index("title")

    token_indices = np.concatenate(token_df.tokens.values).reshape(-1, 50)
    token_indices = torch.from_numpy(token_indices).long()
    token_indices = token_indices.detach().to(device)

    token_values = np.concatenate(token_df.scores.values).reshape(-1, 50)
    token_values = torch.from_numpy(token_values).float()
    token_values = token_values.detach().to(device)
    token_values = token_values.softmax(dim=-1)

    return token_indices, token_values


def load_title_to_index(data_folder: Path = DEFAULT_DATA_FOLDER) -> Dict[str, int]:
    df = pd.read_parquet(data_folder / "pmi-filtered.gz.parquet")
    return dict(zip(df.title, df.index))
Code
token_indices, token_values = load_page_tokens()
title_to_index = load_title_to_index()
split = load_dataset(test_size=BATCH_SIZE*2)
Code
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BartLinksOnlyValue.from_pretrained(MODEL_NAME)

model.token_indices = token_indices
model.token_values = token_values
model.link_loss = calculate_loss_link_mse_only_pmi
Code
from typing import *
from pathlib import Path
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction

MODEL_RUN_FOLDER = DATA_FOLDER / "runs"
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)

token_indices_npy = token_indices.cpu().numpy()
def compute_metrics(preds: EvalPrediction) -> Dict[str, float]:
    labels = preds.label_ids.reshape(-1, 2)
    predictions = preds.predictions.reshape(-1, tokenizer.vocab_size)
    return {
        f"link_{key}": value
        for key, value in metric_link_jaccard(
            predictions=predictions,
            labels=labels,
            token_indices=token_indices_npy
        ).items()
    }

training_args = TrainingArguments(
    report_to=[],            
    output_dir=MODEL_RUN_FOLDER / "output",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    warmup_ratio=0.06,
    evaluation_strategy="steps",
    logging_dir=MODEL_RUN_FOLDER / "output",

    num_train_epochs=EPOCHS,
    logging_steps=1_000,

    # not really training properly here
    # load_best_model_at_end=True,
    # metric_for_best_model="quality",
    # greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
[38555/38555 4:42:27, Epoch 5/5]
Step Training Loss Validation Loss Link Accuracy Link Intersection Link Union
1000 0.001400 0.001019 0.016341 2005 122695
2000 0.001100 0.001021 0.017619 2159 122541
3000 0.001100 0.000977 0.018466 2261 122439
4000 0.001100 0.000995 0.020684 2527 122173
5000 0.001100 0.000973 0.021185 2587 122113
6000 0.001100 0.000960 0.021369 2609 122091
7000 0.001100 0.000964 0.020099 2457 122243
8000 0.001000 0.000972 0.023952 2917 121783
9000 0.001100 0.001007 0.023574 2872 121828
10000 0.001000 0.000960 0.025266 3073 121627
11000 0.001000 0.000947 0.025005 3042 121658
12000 0.001000 0.000937 0.027183 3300 121400
13000 0.001000 0.000941 0.027648 3355 121345
14000 0.001000 0.000942 0.027640 3354 121346
15000 0.001000 0.000923 0.028047 3402 121298
16000 0.001000 0.000943 0.028309 3433 121267
17000 0.001000 0.000930 0.028632 3471 121229
18000 0.001000 0.000936 0.028946 3508 121192
19000 0.001000 0.000945 0.029770 3605 121095
20000 0.001000 0.000960 0.029847 3614 121086
21000 0.001000 0.000926 0.030136 3648 121052
22000 0.001000 0.000935 0.029855 3615 121085
23000 0.001000 0.000920 0.030749 3720 120980
24000 0.001000 0.000950 0.030877 3735 120965
25000 0.001000 0.000932 0.031431 3800 120900
26000 0.001000 0.000903 0.031576 3817 120883
27000 0.001000 0.000907 0.031866 3851 120849
28000 0.001000 0.000926 0.031294 3784 120916
29000 0.001000 0.000900 0.032567 3933 120767
30000 0.001000 0.000907 0.032422 3916 120784
31000 0.001000 0.000931 0.032345 3907 120793
32000 0.000900 0.000906 0.032858 3967 120733
33000 0.000900 0.000901 0.033054 3990 120710
34000 0.000900 0.000897 0.033123 3998 120702
35000 0.000900 0.000895 0.033080 3993 120707
36000 0.000900 0.000902 0.033183 4005 120695
37000 0.000900 0.000910 0.033500 4042 120658
38000 0.000900 0.000908 0.033405 4031 120669

Configuration, model weights, tokenizer config and special tokens files saved for each checkpoint (checkpoint-500 through checkpoint-38500) in /data/blog/2021-10-10-wikipedia-train-focused-mse/runs/output/
TrainOutput(global_step=38555, training_loss=0.0010075891841912668, metrics={'train_runtime': 16947.989, 'train_samples_per_second': 36.395, 'train_steps_per_second': 2.275, 'total_flos': 9.40236483723264e+16, 'train_loss': 0.0010075891841912668, 'epoch': 5.0})

This training run finished with a Jaccard accuracy of 0.033405, which beats the previous MSE approach that peaked at 0.031482. The run feels like it was slower to train, and it did not reach a peak as it was still improving at the end. I am resorting to words like feel because, even though I can state facts about this run, the comparison with previous runs is primarily a feeling.

Given that, I think it is time to start using wandb again.

Code
model.save_pretrained(DATA_FOLDER / "model-pmi-tokens")
Configuration saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/model-pmi-tokens/config.json
Model weights saved in /data/blog/2021-10-10-wikipedia-train-focused-mse/model-pmi-tokens/pytorch_model.bin

PMI Token Loss Evaluation

Let’s see what the inference looks like.

Code
model = model.eval()
Code
# from src/main/python/blog/wikipedia_link/evaluation/text_no_ee.py
from typing import Any, Dict, Iterator, List, Optional, Tuple

import regex as re
import torch
import wikitextparser as wtp
from transformers import AutoModel, AutoTokenizer

CATEGORY_LINK_PATTERN = re.compile(r"\[\[[^]]+:[^]]+\]\]")
TITLE_PATTERN = re.compile(r"(=+)[^=]+\1")
WHITESPACE_PATTERN = re.compile(r"\s+")


@torch.no_grad()
def infer_no_ee(
    text: str, *, tokenizer: AutoTokenizer, model: AutoModel
) -> List[Tuple[str, Optional[List[str]]]]:
    tokens, boundaries = _encode(text, tokenizer=tokenizer, max_length=256)
    tokens = torch.tensor(tokens, device=model.device)[None, :]

    output = model(tokens)[0]
    lm_output = output[0, :, : model.config.vocab_size]

    results = []
    word_tokens = None
    word_output = None
    for token, link, (is_start, is_within) in zip(
        tokens[0, :, None], lm_output, boundaries
    ):
        if is_start:
            if word_tokens:
                results.append(
                    _to_result(
                        word_tokens=word_tokens, output=word_output, tokenizer=tokenizer
                    )
                )
            word_tokens = [token.item()]
            word_output = link
        elif is_within:
            word_tokens.append(token.item())
            word_output += link
        else:
            if word_tokens:
                results.append(
                    _to_result(
                        word_tokens=word_tokens, output=word_output, tokenizer=tokenizer
                    )
                )
                word_tokens = None
                word_output = None
    if word_tokens:
        results.append(
            _to_result(word_tokens=word_tokens, output=word_output, tokenizer=tokenizer)
        )

    return results


def _encode(
    text: str, tokenizer: AutoTokenizer, max_length: int
) -> Tuple[List[int], List[Tuple[bool, bool]]]:
    """
    Tokenize the text and extract the wiki marked up entities.
    This returns the following values:

    input_ids: List[int] - the tokenized text
    entities: List[Tuple[bool, bool]] - token boundaries, [start, within]
    """

    linked_text = _extract_links(text)
    tokenized_text = tokenizer(
        linked_text["text"],
        return_offsets_mapping=True,
        return_attention_mask=False,
        max_length=max_length,
        truncation=True,
    )
    entities = _to_boundaries(
        token_offsets=tokenized_text["offset_mapping"],
        link_starts=linked_text["start"],
        link_ends=linked_text["end"],
    )
    return tokenized_text["input_ids"], entities


def _extract_links(text: str) -> Dict[str, Any]:
    text = text.strip()
    text = re.sub(CATEGORY_LINK_PATTERN, "", text)
    text = re.sub(TITLE_PATTERN, "", text)
    text = re.sub(WHITESPACE_PATTERN, " ", text)

    parsed = wtp.parse(text)

    links = []
    starts = []
    ends = []
    offset = 0
    for link in parsed.wikilinks:
        target = link.target

        start, end = link.span
        length = end - start
        start -= offset
        offset += length - len(link.plain_text())
        end -= offset

        links.append(target)
        starts.append(start)
        ends.append(end)

    return {
        "text": parsed.plain_text(),
        "link": links,
        "start": starts,
        "end": ends,
    }


def _to_boundaries(
    token_offsets: List[Tuple[int, int]],
    link_starts: List[int],
    link_ends: List[int],
) -> List[Tuple[bool, bool]]:
    boundaries = [(False, False)] * len(token_offsets)

    link_iter = zip(link_starts, link_ends)
    try:
        link_start, link_end = _next_link(
            token_start=0,
            links=link_iter,
        )

        within = False
        for index, (token_start, token_end) in enumerate(token_offsets):
            if token_start >= link_end:
                link_start, link_end = _next_link(
                    token_start=token_start,
                    links=link_iter,
                )
                within = False

            if token_start < link_end and token_end > link_start:
                boundaries[index] = (not within, True)
                within = True
    except StopIteration:
        pass

    return boundaries


def _next_link(
    token_start: int,
    links: Iterator[Tuple[int, int]],
) -> Tuple[int, int]:
    link_start, link_end = next(links)
    while token_start >= link_end:
        link_start, link_end = next(links)
    return link_start, link_end


def _to_result(
    word_tokens: List[int], output: torch.Tensor, tokenizer: AutoTokenizer
) -> Tuple[str, Optional[List[str]]]:
    link_tokens = output.argsort(dim=0, descending=True)[:10]
    return (
        tokenizer.decode(word_tokens),
        tokenizer.batch_decode(link_tokens[:, None]),
    )
Code
infer_no_ee(
    "I like to drive my [[Malibu]]",
    tokenizer=tokenizer,
    model=model,
)
[(' Malibu',
  ['ibu',
   ' tsun',
   ' condos',
   ' Godzilla',
   ' taxis',
   ' renters',
   ' Lyft',
   'iami',
   ' condo',
   ' Cruiser'])]
Code
infer_no_ee(
    "[[Apple]] have overtaken [[Samsung]] in [[phone]] sales",
    tokenizer=tokenizer,
    model=model,
)
[('Apple',
  [' iPhones',
   ' iCloud',
   ' iPhone',
   'Apple',
   ' Apple',
   ' MacBook',
   ' iPad',
   ' iPads',
   ' iPod',
   'iPhone']),
 (' Samsung',
  [' Samsung',
   'Samsung',
   ' Lenovo',
   ' iPhones',
   ' handset',
   ' OLED',
   ' Panasonic',
   ' iCloud',
   ' Chromebook',
   ' chipset']),
 (' phone',
  [' handset',
   ' iPhones',
   ' iPhone',
   ' iCloud',
   ' Samsung',
   ' smartphones',
   ' BlackBerry',
   ' iPod',
   ' MacBook',
   'Samsung'])]
Code
infer_no_ee(
    "An [[apple]] a day keeps the doctor away",
    tokenizer=tokenizer,
    model=model,
)
[(' apple',
  [' apples',
   ' apple',
   ' oranges',
   ' beet',
   ' broccoli',
   ' avocado',
   ' bananas',
   ' almonds',
   ' cabbage',
   ' strawberries'])]
Code
infer_no_ee(
    "An [[Apple]] a day keeps the doctor away",
    tokenizer=tokenizer,
    model=model,
)
[(' Apple',
  [' MacBook',
   ' iPhones',
   ' iPhone',
   ' iPad',
   ' iPod',
   ' iPads',
   ' iCloud',
   ' Apple',
   'Apple',
   'iPhone'])]

The Malibu inference is considerably worse than the previous version. The different uses of the word apple show that it can do well with correctly cased terms. This is more of a problem than it might appear though, as the model should not be so dramatically dependent on case to correctly categorize words. It almost feels like it is just memorizing some words.

I think the problem with the current training approach is that it has not had the negative feedback that would make it focus more clearly on the correct tokens. The case problem is a separate issue that I will look at another time.
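
To illustrate the missing negative feedback (a toy sketch with made up numbers, where the first 5 entries of a 10 token vocabulary stand in for the PMI tokens): the loss only reads the predicted probabilities at the PMI positions, so probability mass can pile up on a confidently wrong token without changing the loss at all.

Code
import torch

# target distribution over the 5 "PMI" tokens
targets = torch.tensor([0.4, 0.3, 0.1, 0.1, 0.1])

# both predictions put the same mass on the PMI tokens,
# but the second concentrates the rest on a single irrelevant token
probs_a = torch.tensor([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
probs_b = torch.tensor([0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.0, 0.0, 0.0, 0.0])

loss_a = torch.nn.functional.mse_loss(probs_a[:5], targets)
loss_b = torch.nn.functional.mse_loss(probs_b[:5], targets)
assert torch.isclose(loss_a, loss_b)  # identical, the wrong token is never penalized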


WandB Test Training

I need to restore the wandb logging to allow better comparison of these runs. Since my employer has paid for a Weights & Biases account I might try to use that. Let's do a short test run to get everything working again.

Code
# from src/main/python/blog/wandb.py
"""
I wrote this code to easily allow me to switch between my work and home wandb profiles.
It can be stored in git without worrying about committing the api keys.
The format of the ~/.config/wandb/authentication.yaml file is:

    [section-name]
    email = your-email-here
    key = your-api-key-here

I have home and work as my two section names.
The section names are used as the first parameter to the methods this provides.

You can get your api key by going to https://wandb.ai/authorize.
"""

import configparser
from pathlib import Path
from typing import Any, Dict

import wandb

USER_FILE = Path.home() / ".config" / "wandb" / "authentication.yaml"


def wandb_login(config_key: str) -> bool:
    configuration = _wandb_config(config_key)
    if wandb.setup().settings.email == configuration["email"]:
        return True
    key = configuration["key"]
    return wandb.login(key=key, relogin=True)


def wandb_init(config_key: str, **kwargs: Any) -> wandb.sdk.wandb_run.Run:
    assert wandb_login(config_key)
    return wandb.init(**kwargs)


def _wandb_config(config_key: str) -> Dict[str, str]:
    configuration = configparser.ConfigParser()
    configuration.read_string(USER_FILE.read_text())
    return dict(configuration[config_key])
Code
wandb_login("work")
wandb: WARNING If you're specifying your api key in code, ensure this code is not shared publically.
wandb: WARNING Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.
wandb: Appending key for api.wandb.ai to your netrc file: /home/matthew/.netrc
Code
#hide_output
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BartLinksOnlyValue.from_pretrained(MODEL_NAME)

model.token_indices = token_indices
model.token_values = token_values
model.link_loss = calculate_loss_link_mse_only_pmi
Could not locate the tokenizer configuration file, will try to use the model config instead.
Code
from typing import *
from pathlib import Path

import wandb
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction

MODEL_RUN_FOLDER = DATA_FOLDER / "runs"
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)

token_indices_npy = token_indices.cpu().numpy()
def compute_metrics(preds: EvalPrediction) -> Dict[str, float]:
    labels = preds.label_ids.reshape(-1, 2)
    predictions = preds.predictions.reshape(-1, tokenizer.vocab_size)
    return {
        f"link_{key}": value
        for key, value in metric_link_jaccard(
            predictions=predictions,
            labels=labels,
            token_indices=token_indices_npy
        ).items()
    }

with wandb_init(
    "work",
    project=PROJECT_NAME,
    name="test-run",
    mode="online",
) as run:
    training_args = TrainingArguments(
        report_to=["wandb"],
        output_dir=MODEL_RUN_FOLDER / "output",
        overwrite_output_dir=True,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=5e-5,
        warmup_ratio=0.06,
        evaluation_strategy="steps",
        logging_dir=MODEL_RUN_FOLDER / "output",

        # num_train_epochs=EPOCHS,
        max_steps=100,
        logging_steps=10,

        # not really training properly here
        # load_best_model_at_end=True,
        # metric_for_best_model="quality",
        # greater_is_better=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split["train"],
        eval_dataset=split["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
wandb: wandb version 0.12.4 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade
Tracking run with wandb version 0.10.33
Syncing run test-run to Weights & Biases (Documentation).
Project page: https://wandb.ai/bw-matthew/wikipedia-link-recognition
Run page: https://wandb.ai/bw-matthew/wikipedia-link-recognition/runs/gyz80ptb
Run data is saved locally in /home/matthew/Programming/Blog/blog/_notebooks/wandb/run-20211011_220839-gyz80ptb

using `logging_steps` to initialize `eval_steps` to 10
PyTorch: setting up devices
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[100/100 01:19, Epoch 0/1]
Step Training Loss Validation Loss Link Accuracy Link Intersection Link Union
10 0.003300 0.001046 0.000409 51 124649
20 0.001200 0.001046 0.000024 3 124697
30 0.001100 0.001046 0.000032 4 124696
40 0.001200 0.001046 0.000032 4 124696
50 0.001200 0.001046 0.000032 4 124696
60 0.001100 0.001046 0.000040 5 124695
70 0.001100 0.001046 0.000040 5 124695
80 0.001200 0.001046 0.000048 6 124694
90 0.001200 0.001046 0.000048 6 124694
100 0.001100 0.001046 0.000048 6 124694


Waiting for W&B process to finish, PID 397922
Program ended successfully.
Find user logs for this run at: /home/matthew/Programming/Blog/blog/_notebooks/wandb/run-20211011_220839-gyz80ptb/logs/debug.log
Find internal logs for this run at: /home/matthew/Programming/Blog/blog/_notebooks/wandb/run-20211011_220839-gyz80ptb/logs/debug-internal.log

Run summary:


train/loss 0.0011
train/learning_rate 0.0
train/epoch 0.01
train/global_step 100
_runtime 81
_timestamp 1633986600
_step 20
eval/loss 0.00105
eval/link_accuracy 5e-05
eval/link_intersection 6
eval/link_union 124694
eval/runtime 3.9759
eval/samples_per_second 8.048
eval/steps_per_second 0.503
train/train_runtime 79.9487
train/train_samples_per_second 20.013
train/train_steps_per_second 1.251
train/total_flos 243894583296000.0
train/train_loss 0.00137

Run history:


train/loss█▁▁▁▁▁▁▁▁▁
train/learning_rate█▇▆▆▅▄▃▃▂▁
train/epoch▁▁▁▁▁▁███████████████
train/global_step▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
_runtime▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
_timestamp▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
_step▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇██
eval/loss▁███▇▇▇▇▆▆
eval/link_accuracy█▁▁▁▁▁▁▁▁▁
eval/link_intersection█▁▁▁▁▁▁▁▁▁
eval/link_union▁█████████
eval/runtime█▂▁▁▂▂▂▁▂▃
eval/samples_per_second▁▇██▇▇▇▇▇▆
eval/steps_per_second▁▇█▇▇▇▇▇▇▆
train/train_runtime
train/train_samples_per_second
train/train_steps_per_second
train/total_flos
train/train_loss

Synced 6 W&B file(s), 1 media file(s), 0 artifact file(s) and 2 other file(s)

That was fine. It took more effort to get the different profiles working than to actually do the training. Now I can log each run to see more precisely how things change.


PMI Token Loss with Negative Sampling

I’ve tried training with just the 50 tokens of interest, and while that achieves a higher peak than the full-vocabulary training it appears to train slowly. If I incorporate a limited number of negative tokens into the loss then it should be possible to give the model stronger feedback without swamping it with negative signal.

This is based on the idea of negative sampling that I first saw explained in this article on word2vec. The number of negative samples is a hyperparameter. Since word2vec uses 5 negative samples when predicting a single word, I am going to use 100 negative samples for the 50 tokens being predicted here. I’m aiming low and it can be turned up later.
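
As a rough sketch of the scale difference this makes (assuming, purely for illustration, that the 50 PMI targets are uniform and that an untrained model spreads its softmax output almost evenly over the vocabulary), restricting the loss to 150 columns instead of the full vocabulary multiplies the error per element by a factor of a few hundred:

Code
# Illustrative only: uniform targets and a near-uniform softmax are assumptions,
# not properties of the real data.
vocab_size = 50_265           # approximate BART vocabulary size
pmi_width = 50                # PMI target columns
negative_samples = 100        # randomly sampled negative columns

target = 1 / pmi_width        # assumed uniform PMI target value
prediction = 1 / vocab_size   # assumed near-uniform softmax output

squared_error = pmi_width * (target - prediction) ** 2

mse_restricted = squared_error / (pmi_width + negative_samples)  # ~1.3e-4
mse_full_vocab = squared_error / vocab_size                      # ~4.0e-7
print(mse_restricted / mse_full_vocab)                           # ~335x stronger signal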

Code
# from src/main/python/blog/wikipedia_link/loss/link_mse_negative_sampling.py
import torch


def calculate_loss_link_mse_negative_sampling(
    predictions: torch.Tensor,  # [:, n, vocab_size]
    link_labels: torch.Tensor,  # [:, n] for index
    token_indices: torch.Tensor,  # index -> 50 tokens
    token_values: torch.Tensor,  # index -> 50 pmi values
    negative_samples: int,
) -> torch.Tensor:
    """Calculate the loss for the link predictions.
    The labels for this are only valid when they are positive.
    The predictions are only the link target predictions."""

    predictions = predictions.softmax(dim=-1)

    mask = link_labels.flatten() >= 0
    link_labels = link_labels.flatten()[mask].long()
    rows = link_labels.shape[0]

    vocab_size = predictions.shape[-1]
    predictions = predictions.view(-1, vocab_size)[mask]
    rows = predictions.shape[0]
    target_width = 50 + negative_samples

    targets = torch.zeros(rows, target_width, device=predictions.device)
    targets[:, :50] = token_values[link_labels]

    offsets = torch.tensor(range(rows), device=predictions.device) * vocab_size
    prediction_indices = torch.cat(
        [
            token_indices[link_labels],
            torch.randint(
                low=0,
                high=vocab_size,
                size=(rows, negative_samples),
                dtype=torch.long,
                device=predictions.device,
            ),
        ],
        dim=-1,
    )
    predictions = predictions.flatten()[prediction_indices + offsets[:, None]]
    predictions = predictions.reshape(-1, targets.shape[-1])

    return torch.nn.functional.mse_loss(predictions, targets)
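
As a quick shape check, the loss can be called with small dummy tensors (the sizes below are made up; only the 50-column PMI width matters for the indexing to line up):

Code
# Dummy data purely to confirm the function runs and the shapes line up.
import torch

vocab_size = 1_000
rows, seq_len, pmi_width, neg = 2, 8, 50, 100

predictions = torch.randn(rows, seq_len, vocab_size)
link_labels = torch.full((rows, seq_len), -100, dtype=torch.long)
link_labels[0, 3] = 0  # a single linked token pointing at PMI entry 0
token_indices = torch.randint(0, vocab_size, (1, pmi_width))
token_values = torch.rand(1, pmi_width).softmax(dim=-1)

loss = calculate_loss_link_mse_negative_sampling(
    predictions=predictions,
    link_labels=link_labels,
    token_indices=token_indices,
    token_values=token_values,
    negative_samples=neg,
)
print(loss)  # a small scalar tensor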
Code
#hide_output
from transformers import AutoTokenizer
from functools import partial

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BartLinksOnlyValue.from_pretrained(MODEL_NAME)

model.token_indices = token_indices
model.token_values = token_values
model.link_loss = partial(calculate_loss_link_mse_negative_sampling, negative_samples=100)
Code
from typing import *
from pathlib import Path
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction

MODEL_RUN_FOLDER = DATA_FOLDER / "runs"
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)

token_indices_npy = token_indices.cpu().numpy()
def compute_metrics(preds: EvalPrediction) -> Dict[str, float]:
    labels = preds.label_ids.reshape(-1, 2)
    predictions = preds.predictions.reshape(-1, tokenizer.vocab_size)
    return {
        f"link_{key}": value
        for key, value in metric_link_jaccard(
            predictions=predictions,
            labels=labels,
            token_indices=token_indices_npy
        ).items()
    }

with wandb_init(
    "work",
    project=PROJECT_NAME,
    name="mse-pmi-with-negative-sampling",
    mode="online",
) as run:
    training_args = TrainingArguments(
        report_to=["wandb"],
        output_dir=MODEL_RUN_FOLDER / "output",
        overwrite_output_dir=True,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=5e-5,
        warmup_ratio=0.06,
        evaluation_strategy="steps",
        logging_dir=MODEL_RUN_FOLDER / "output",

        num_train_epochs=EPOCHS,
        logging_steps=1_000,

        # not really training properly here
        # load_best_model_at_end=True,
        # metric_for_best_model="quality",
        # greater_is_better=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split["train"],
        eval_dataset=split["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
wandb: WARNING If you're specifying your api key in code, ensure this code is not shared publically.
wandb: WARNING Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.
wandb: Appending key for api.wandb.ai to your netrc file: /home/matthew/.netrc
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:  
wandb: wandb version 0.12.4 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade
Tracking run with wandb version 0.10.33
Syncing run mse-pmi-with-negative-sampling to Weights & Biases (Documentation).
Project page: https://wandb.ai/brandwatch-ml/wikipedia-link-recognition
Run page: https://wandb.ai/brandwatch-ml/wikipedia-link-recognition/runs/1abrcbpd
Run data is saved locally in /home/matthew/Programming/Blog/blog/_notebooks/wandb/run-20211012_212846-1abrcbpd

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[38555/38555 4:55:40, Epoch 5/5]
Step Training Loss Validation Loss Link Accuracy Link Intersection Link Union
1000 0.000500 0.000342 0.014390 1769 122931
2000 0.000400 0.000339 0.015348 1885 122815
3000 0.000400 0.000341 0.018026 2208 122492
4000 0.000400 0.000331 0.021219 2591 122109
5000 0.000400 0.000328 0.021545 2630 122070
6000 0.000400 0.000325 0.021420 2615 122085
7000 0.000400 0.000324 0.021327 2604 122096
8000 0.000400 0.000331 0.025865 3144 121556
9000 0.000400 0.000326 0.024432 2974 121726
10000 0.000300 0.000318 0.026963 3274 121426
11000 0.000300 0.000323 0.026692 3242 121458
12000 0.000300 0.000317 0.027394 3325 121375
13000 0.000300 0.000334 0.028157 3415 121285
14000 0.000300 0.000315 0.027454 3332 121368
15000 0.000300 0.000307 0.027835 3377 121323
16000 0.000300 0.000315 0.028386 3442 121258
17000 0.000300 0.000312 0.030127 3647 121053
18000 0.000300 0.000312 0.030527 3694 121006
19000 0.000300 0.000309 0.030630 3706 120994
20000 0.000300 0.000321 0.029370 3558 121142
21000 0.000300 0.000314 0.030902 3738 120962
22000 0.000300 0.000322 0.031952 3861 120839
23000 0.000300 0.000321 0.032567 3933 120767
24000 0.000300 0.000320 0.031218 3775 120925
25000 0.000300 0.000318 0.032174 3887 120813
26000 0.000300 0.000311 0.032011 3868 120832
27000 0.000300 0.000317 0.032140 3883 120817
28000 0.000300 0.000315 0.032225 3893 120807
29000 0.000300 0.000317 0.032233 3894 120806
30000 0.000300 0.000313 0.032755 3955 120745
31000 0.000300 0.000324 0.033037 3988 120712
32000 0.000300 0.000316 0.033072 3992 120708
33000 0.000300 0.000316 0.033106 3996 120704
34000 0.000300 0.000319 0.033491 4041 120659
35000 0.000300 0.000312 0.033423 4033 120667
36000 0.000300 0.000311 0.033680 4063 120637
37000 0.000300 0.000312 0.033945 4094 120606
38000 0.000300 0.000314 0.033800 4077 120623


Waiting for W&B process to finish, PID 419867
Program ended successfully.
Find user logs for this run at: /home/matthew/Programming/Blog/blog/_notebooks/wandb/run-20211012_212846-1abrcbpd/logs/debug.log
Find internal logs for this run at: /home/matthew/Programming/Blog/blog/_notebooks/wandb/run-20211012_212846-1abrcbpd/logs/debug-internal.log

Run summary:


train/loss 0.0003
train/learning_rate 0.0
train/epoch 5.0
train/global_step 38555
_runtime 17742
_timestamp 1634088268
_step 76
eval/loss 0.00031
eval/link_accuracy 0.0338
eval/link_intersection 4077
eval/link_union 120623
eval/runtime 4.1699
eval/samples_per_second 7.674
eval/steps_per_second 0.48
train/train_runtime 17741.1925
train/train_samples_per_second 34.767
train/train_steps_per_second 2.173
train/total_flos 9.40236483723264e+16
train/train_loss 0.00034

Run history:


train/loss█▄▄▄▄▄▄▄▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate▄▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/epoch▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_runtime▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_timestamp▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_step▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval/loss█▇█▆▅▅▄▆▅▃▄▃▇▃▁▃▂▂▁▄▂▄▄▄▃▂▃▃▃▂▄▃▃▃▂▂▂▂
eval/link_accuracy▁▁▂▃▄▄▃▅▅▆▅▆▆▆▆▆▇▇▇▆▇▇█▇▇▇▇▇▇█████████
eval/link_intersection▁▁▂▃▄▄▄▅▅▆▅▆▆▆▆▆▇▇▇▆▇▇█▇▇▇▇▇▇█████████
eval/link_union██▇▆▅▅▅▄▄▃▄▃▃▃▃▃▂▂▂▃▂▂▁▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
eval/runtime▃▂▁▂▁▁▁██▇▇▆▇▇▇▃▇▇▆▆▇▅▆▆█▆██▆▇█▇█▆▇▇▇▆
eval/samples_per_second▆▇█▇▇▇█▁▁▁▂▃▂▂▂▆▂▂▃▃▂▄▃▃▁▃▁▁▃▂▁▂▁▃▂▂▂▃
eval/steps_per_second▆▇▇▇▇▇█▁▁▁▂▃▂▂▂▅▂▂▃▃▂▄▃▃▁▃▁▁▃▂▁▂▁▃▂▂▂▃
train/train_runtime
train/train_samples_per_second
train/train_steps_per_second
train/total_flos
train/train_loss

Synced 6 W&B file(s), 1 media file(s), 0 artifact file(s) and 2 other file(s)
Code
model.save_pretrained(DATA_FOLDER / "model-pmi-with-negative-sampling")
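
If the model is needed again later it should be possible to reload it from that folder (a hedged sketch; the custom attributes still have to be reattached by hand, as they were when the model was first created):

Code
# Sketch: reload the saved weights and reattach the custom attributes.
model = BartLinksOnlyValue.from_pretrained(
    DATA_FOLDER / "model-pmi-with-negative-sampling"
)
model.token_indices = token_indices
model.token_values = token_values
model.link_loss = partial(
    calculate_loss_link_mse_negative_sampling, negative_samples=100
)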

PMI Token Loss with Negative Sampling Evaluation

Let’s see what the inference looks like.

Code
model = model.eval()
Code
infer_no_ee(
    "I like to drive my [[Malibu]]",
    tokenizer=tokenizer,
    model=model,
)
[(' Malibu',
  ['ibu',
   ' condos',
   ' tsun',
   ' taxis',
   ' headlights',
   ' renters',
   ' ancest',
   ' Cruiser',
   'iami',
   ' sedan'])]

That’s not great. The output from the PMI-tokens-only training was:

[(' Malibu',
  ['ibu',
   ' tsun',
   ' condos',
   ' Godzilla',
   ' taxis',
   ' renters',
   ' Lyft',
   'iami',
   ' condo',
   ' Cruiser'])]

So while this is roughly equivalent to that output, it still does not have the quality that the binary cross entropy model has.