Wikipedia Link Metrics

Training is slow and performance varies wildly. A way to track the quality of the model early on could make progress more systematic.
Published

August 31, 2021

Understanding the performance of the Wikipedia model is not just about how well it spots links; the model also describes those links. Tracking how well it can describe them is even more important - if the descriptions are good enough then the extraction step could be dropped entirely, as anything with an interesting description is probably a link.

Since the model is trained to predict a fixed set of tokens for a given link, we can start by measuring how accurately it predicts those tokens.

Jaccard Similarity

The first measure of accuracy is to take the top 50 predicted token indices and compare them to the top 50 for the actual link. This can be a simple intersection over union of the two token sets - \(accuracy = \frac{|T_{prediction} \cap T_{actual}|}{|T_{prediction} \cup T_{actual}|}\) - otherwise known as the Jaccard Similarity Coefficient.
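
As a rough sketch of the idea for a single link (this is not the metric_link_jaccard implementation used below, which operates on the flattened evaluation predictions; the helper name and shapes here are just for illustration):

Code
import numpy as np

def jaccard_top_k(predicted_logits: np.ndarray, page_tokens: np.ndarray, k: int = 50) -> float:
    """Intersection over union between the top-k predicted token indices
    and the stored token indices for the linked page."""
    predicted = set(np.argsort(predicted_logits)[-k:].tolist())
    actual = set(page_tokens.tolist())
    return len(predicted & actual) / len(predicted | actual)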

Let’s get to it.

Code
MODEL_NAME = "facebook/bart-base"
BATCH_SIZE = 32
Code
import blog.transformers_logging

from blog.wikipedia_link.metrics.link_jaccard import metric_link_jaccard
from blog.wikipedia_link.loss.boundary_bce import calculate_loss_boundary_bce
from blog.wikipedia_link.loss.link_bce import calculate_loss_link_bce
from blog.wikipedia_link.model.bart_boundary_bce import BartLinksBoundaryBCE

from blog.wikipedia_link.data.boundary_bce import load_dataset_bce
from blog.wikipedia_link.data.page_tokens import load_page_tokens
from blog.wikipedia_link.data.title_to_index import load_title_to_index
Code
#hide_output
from transformers import AutoTokenizer

token_indices = load_page_tokens()
title_to_index = load_title_to_index()
split = load_dataset_bce(test_size=BATCH_SIZE*2)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BartLinksBoundaryBCE.from_pretrained(MODEL_NAME)

model.token_indices = token_indices
model.boundary_loss = calculate_loss_boundary_bce
model.link_loss = calculate_loss_link_bce
Some weights of BartLinksBoundaryBCE were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['link_head.weight', 'link_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Code
from typing import *
from pathlib import Path
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction

MODEL_RUN_FOLDER = Path("/data/blog/2021-08-30-wikipedia-metrics/runs")
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)
token_indices_npy = token_indices.cpu().numpy()

def compute_metrics(preds: EvalPrediction) -> Dict[str, float]:
    labels = preds.label_ids.reshape(-1, 3)
    link_predictions = preds.predictions.reshape(-1, tokenizer.vocab_size + 2)[:, :tokenizer.vocab_size]
    return metric_link_jaccard(predictions=link_predictions, labels=labels, token_indices=token_indices_npy)

training_args = TrainingArguments(
    report_to=[],            
    output_dir=MODEL_RUN_FOLDER / "output",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    warmup_ratio=0.06,
    evaluation_strategy="steps",
    logging_dir=MODEL_RUN_FOLDER / "output",

    max_steps=100,
    logging_steps=10,

    # not really training properly here
    # load_best_model_at_end=True,
    # metric_for_best_model="quality",
    # greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
[100/100 02:12, Epoch 0/1]
Step Training Loss Validation Loss Accuracy Intersection Union
10 1.271300 0.814785 0.001701 273 160527
20 0.803000 0.745464 0.001538 247 160553
30 0.736000 0.709939 0.001464 235 160565
40 0.716200 0.684700 0.001445 232 160568
50 0.681200 0.647955 0.001470 236 160564
60 0.657300 0.631995 0.001470 236 160564
70 0.612400 0.616565 0.001501 241 160559
80 0.630400 0.610896 0.001520 244 160556
90 0.625600 0.605611 0.001557 250 160550
100 0.607700 0.604615 0.001563 251 160549

TrainOutput(global_step=100, training_loss=0.7341130256652832, metrics={'train_runtime': 133.2628, 'train_samples_per_second': 24.013, 'train_steps_per_second': 0.75, 'total_flos': 487796726169600.0, 'train_loss': 0.7341130256652832, 'epoch': 0.02})
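
As a quick sanity check on the columns, the accuracy is just the intersection divided by the union (taking the final evaluation row above):

Code
# accuracy column = intersection / union, using the step 100 row above
print(251 / 160549)  # ≈ 0.001563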

So this seems to be working. The stats are very poor though. What happens if I load up one of the trained models and try that?

Code
#hide_output
model = BartLinksBoundaryBCE.from_pretrained("/data/blog/2021-08-07-wikipedia-preprocessing-performance/model")

model.token_indices = token_indices
model.boundary_loss = calculate_loss_boundary_bce
model.link_loss = calculate_loss_link_bce
loading configuration file /data/blog/2021-08-07-wikipedia-preprocessing-performance/model/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForLinks"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 1024,
  "model_type": "bart",
  "no_repeat_ngram_size": 3,
  "normalize_before": false,
  "normalize_embedding": true,
  "num_beams": 4,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "scale_embedding": false,
  "task_specific_params": {
    "summarization": {
      "length_penalty": 1.0,
      "max_length": 128,
      "min_length": 12,
      "num_beams": 4
    },
    "summarization_cnn": {
      "length_penalty": 2.0,
      "max_length": 142,
      "min_length": 56,
      "num_beams": 4
    },
    "summarization_xsum": {
      "length_penalty": 1.0,
      "max_length": 62,
      "min_length": 11,
      "num_beams": 6
    }
  },
  "transformers_version": "4.9.2",
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file /data/blog/2021-08-07-wikipedia-preprocessing-performance/model/pytorch_model.bin
All model checkpoint weights were used when initializing BartLinksBoundaryBCE.

All the weights of BartLinksBoundaryBCE were initialized from the model checkpoint at /data/blog/2021-08-07-wikipedia-preprocessing-performance/model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BartLinksBoundaryBCE for predictions without further training.
Code
from typing import *
from pathlib import Path
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction
import pandas as pd

MODEL_RUN_FOLDER = Path("/data/blog/2021-08-30-wikipedia-metrics/runs")
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)
token_indices_npy = token_indices.cpu().numpy()

def compute_metrics(preds: EvalPrediction) -> Dict[str, float]:
    labels = preds.label_ids.reshape(-1, 3)
    link_predictions = preds.predictions.reshape(-1, tokenizer.vocab_size + 2)[:, :tokenizer.vocab_size]
    return metric_link_jaccard(predictions=link_predictions, labels=labels, token_indices=token_indices_npy)

training_args = TrainingArguments(
    report_to=[],            
    output_dir=MODEL_RUN_FOLDER / "output",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    warmup_ratio=0.06,
    evaluation_strategy="steps",
    logging_dir=MODEL_RUN_FOLDER / "output",

    max_steps=100,
    logging_steps=10,

    # not really training properly here
    # load_best_model_at_end=True,
    # metric_for_best_model="quality",
    # greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=split["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

pd.DataFrame([trainer.evaluate()])
using `logging_steps` to initialize `eval_steps` to 10
PyTorch: setting up devices
[2/2 00:00]
eval_loss eval_accuracy eval_intersection eval_union eval_runtime eval_samples_per_second eval_steps_per_second
0 1.582133 0.057213 8702 152098 5.5835 11.462 0.358

This is the checkpoint that I have been showing to others, where the example evaluations can look strong. The fact that it only gets about 5.7% of the tokens correct (an intersection of 8,702 against a union of 152,098) is kind of amazing.

To me this suggests that either the model is wildly underperforming or this metric is a poor way to measure performance. Given that it is the page predictions that matter, I would like to try something that focuses more on them.


Perplexity

The next metric I am considering treats this as a 6 million class classification problem, one class per page. Selecting and summing each page's 50 tokens from the model output is really a form of linear layer, a rather simple and fixed one at that. If I take this per-page output then I can treat it as a single-class prediction and run it through cross entropy loss. Perplexity is then the exponential of that loss.
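
To be explicit about the relationship: if \(p_i\) is the probability the model assigns to the correct page for link \(i\), then the mean cross entropy over \(N\) links is \(\mathcal{L} = -\frac{1}{N}\sum_{i=1}^{N}\log p_i\) and the perplexity is \(e^{\mathcal{L}}\). A perplexity of \(k\) roughly means the model is as uncertain as a uniform choice between \(k\) pages.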

Code
# from src/main/python/blog/wikipedia_link/metrics/link_perplexity.py
from typing import Dict

import numpy as np
import torch


@torch.no_grad()
def metric_link_perplexity(
    predictions: np.ndarray,
    labels: np.ndarray,
    token_indices: torch.Tensor,
) -> Dict[str, float]:
    # predictions and labels have been flattened
    # predictions have already been sliced down to the vocabulary logits
    # labels have NOT been sliced
    # this means that labels is either [n, 3] ints (begin, within, page index)
    # or [n, 2] ints (iob, page index)
    # and predictions is [n, vocab_size] floats

    # limit predictions to just the links
    link_mask = labels[:, :-1].sum(axis=1) > 0
    predictions = predictions[link_mask]

    labels = labels[link_mask][:, -1]
    labels = torch.from_numpy(labels).to(token_indices.device)

    predictions = torch.from_numpy(predictions).to(token_indices.device)

    # got limited memory so just run this through step by step
    # makes this very slow bu hu hu
    loss = torch.tensor(
        [
            torch.nn.functional.cross_entropy(
                predictions[idx, token_indices].sum(dim=-1)[None, :], labels[None, idx]
            )
            for idx in range(predictions.shape[0])
        ]
    ).mean()
    perplexity = torch.exp(loss).item()

    return {"cross_entropy": loss.item(), "perplexity": perplexity}
Code
#hide_output
from transformers import AutoTokenizer

token_indices = load_page_tokens()
title_to_index = load_title_to_index()
split = load_dataset_bce(test_size=BATCH_SIZE*2)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BartLinksBoundaryBCE.from_pretrained(MODEL_NAME)

model.token_indices = token_indices
model.boundary_loss = calculate_loss_boundary_bce
model.link_loss = calculate_loss_link_bce
Some weights of BartLinksBoundaryBCE were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['link_head.bias', 'link_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Code
from typing import *
from pathlib import Path
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction

MODEL_RUN_FOLDER = Path("/data/blog/2021-08-30-wikipedia-metrics/runs")
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)

def compute_metrics(preds: EvalPrediction) -> Dict[str, float]:
    labels = preds.label_ids.reshape(-1, 3)
    link_predictions = preds.predictions.reshape(-1, tokenizer.vocab_size + 2)[:, :tokenizer.vocab_size]
    return metric_link_perplexity(predictions=link_predictions, labels=labels, token_indices=token_indices)

training_args = TrainingArguments(
    report_to=[],            
    output_dir=MODEL_RUN_FOLDER / "output",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    warmup_ratio=0.06,
    evaluation_strategy="steps",
    logging_dir=MODEL_RUN_FOLDER / "output",

    max_steps=100,
    logging_steps=10,

    # not really training properly here
    # load_best_model_at_end=True,
    # metric_for_best_model="quality",
    # greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
[100/100 05:36, Epoch 0/1]
Step Training Loss Validation Loss Cross Entropy Perplexity
10 1.243800 0.796465 175.470093 inf
20 0.798800 0.737848 132.800415 inf
30 0.732700 0.709505 140.187775 inf
40 0.717400 0.685683 124.354576 inf
50 0.684600 0.652236 114.922829 inf
60 0.664200 0.634367 118.784439 inf
70 0.617300 0.621118 110.832184 inf
80 0.636800 0.614504 112.220947 inf
90 0.631700 0.609446 110.314163 inf
100 0.612400 0.608113 109.717552 inf

TrainOutput(global_step=100, training_loss=0.7339662551879883, metrics={'train_runtime': 337.046, 'train_samples_per_second': 9.494, 'train_steps_per_second': 0.297, 'total_flos': 487796726169600.0, 'train_loss': 0.7339662551879883, 'epoch': 0.02})

This again suggests that the model is performing poorly. The Jaccard metric suggested that fewer than one token per link is predicted correctly, so it is no surprise that the perplexity is enormous: a cross entropy of around 110 corresponds to a perplexity of \(e^{110}\), which overflows 32-bit floats to infinity.
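
A quick check of why the column reads inf (the 110 is just the rough cross entropy from the table above):

Code
import math
import torch

loss = torch.tensor(110.0)  # roughly the cross entropy reported above
print(torch.exp(loss))      # tensor(inf): float32 tops out around 3.4e38
print(math.exp(110.0))      # ≈ 5.99e47, still representable as a 64-bit float

So reporting the cross entropy directly, or taking the exponential in 64-bit precision, would avoid the uninformative inf column.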

I’m hopeful that the perplexity approach could also be used for training, and that it might address both of these problems.