Wikipedia Mean Squared Error Train

Train the wikipedia link recognition model using mean squared error
Published

October 3, 2021

The problem with using perplexity or binary cross entropy is that the output of the model is based on the relative information each token contributes to a specific page. Reducing the tokens to yes/no choices loses a lot of the information that the PMI provides. There is also the problem that the majority of the language model output is negative, so binary cross entropy takes extra effort to bring the values into range.

How can I address these issues? I want to take the PMI output and train the model against that. If I can get the softmax of the PMI output then I can compare that to the softmax of the model output. Using softmax will naturally scale the different outputs.

To do all of this I need to have the original PMI output of the model for the top 50 tokens. I have been calculating this already, it’s just been discarded. Let’s start by calculating that.

Code
import blog.transformers_logging
Code
from pathlib import Path

# Root folder for all artifacts of this post (articles, PMI files, encoded inputs).
DATA_FOLDER = Path("/data/blog/2021-10-03-wikipedia-train-mse")

# Checkpoint used for both tokenization and the link recognition model.
MODEL_NAME = "facebook/bart-base"
BATCH_SIZE = 16  # training batch size
EPOCHS = 5  # number of training epochs

Generating an Updated Dataset

To get the PMI values associated with the pages I have to reprocess the data.

Generating PMI Values

As I’ve mentioned before, the process of calculating the PMI score for all of the wikipedia pages is quite straightforward.

Since I was focused on the top 50 tokens without being concerned about the actual values I did not save them. Now it’s desirable to have them so recalculating everything is appropriate at this point. Quite a bit of code follows, and it is broadly the same as the previous processing that was done in the Wikipedia Page Mutual Information post.

Code
# from src/main/python/blog/wikipedia_link/data/pmi/px.py
"""
Calculate the p(x) value across all of the rows
"""

from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
import ray
from tqdm.auto import tqdm
from transformers import AutoTokenizer


def calculate_px(  # pylint: disable=invalid-name
    source_folder: Path,
    destination: Path,
    model_name: str,
) -> np.ndarray:
    """Calculate p(x), the mean per-document token distribution of the corpus.

    Every ``*.gz.parquet`` file in ``source_folder`` is processed in parallel
    with ray.  The result is cached at ``destination``; if that file already
    exists it is loaded and returned instead of being recalculated.

    Raises:
        ValueError: if ``source_folder`` contains no ``*.gz.parquet`` files.
    """
    if destination.exists():
        print(f"Skipping px calculation, already exists at {destination}")
        return np.load(destination, allow_pickle=True)

    paths = sorted(source_folder.glob("*.gz.parquet"))
    if not paths:
        # Without this the row_count division below would raise ZeroDivisionError.
        raise ValueError(f"No *.gz.parquet files found in {source_folder}")

    # ignore_reinit_error: per_file_pmi may run in the same session and also
    # calls ray.init(); a plain ray.init() raises if ray is already running.
    ray.init(ignore_reinit_error=True)
    tasks = [
        calculate_px_for_file.remote(path, model_name=model_name) for path in paths
    ]
    all_px = []  # per-file sums of normalized per-document token counts
    row_count = 0  # total number of documents seen across all files

    with tqdm(total=len(paths)) as pbar:
        while tasks:
            # Collect results as soon as any worker finishes.
            completed, tasks = ray.wait(tasks)
            for file_px, count in ray.get(completed):
                all_px.append(file_px)
                row_count += count
            pbar.update(len(completed))
    # Average over documents to get the corpus-wide p(x) distribution.
    total_px = sum(all_px) / row_count
    np.save(destination, total_px)

    return total_px


@ray.remote
def calculate_px_for_file(path: Path, model_name: str) -> Tuple[np.ndarray, int]:
    """Sum the normalized token distributions of every document in one file.

    Returns the ``(vocab_size,)`` sum of per-document token probability
    distributions and the document count, so the caller can average across
    files.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # suppress "Token indices sequence length is longer than the specified maximum" errors
    tokenizer.deprecation_warnings[
        "sequence-length-is-longer-than-the-specified-maximum"
    ] = True

    vocab_size = tokenizer.vocab_size

    def _block(text: List[str]) -> np.ndarray:
        # One (vocab_size,) count row per document, normalized so each
        # document contributes an equal probability weight regardless of
        # its length.
        tokens = tokenizer(text, return_attention_mask=False)["input_ids"]
        counts = np.concatenate(
            [np.bincount(ids, minlength=vocab_size)[None, :] for ids in tokens]
        )
        counts = counts / counts.sum(axis=1)[:, None]
        return counts.sum(axis=0)

    try:
        text = pd.read_parquet(path).text.tolist()
    except Exception:
        # Narrowed from a bare except: report the offending file, then re-raise.
        print(path)
        raise
    # Process in chunks of 1,000 documents to bound peak memory.
    token_counts = sum(
        _block(text[idx : idx + 1_000]) for idx in range(0, len(text), 1_000)
    )
    return token_counts, len(text)



# from src/main/python/blog/wikipedia_link/data/pmi/pmi.py
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
import ray
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer


def combine_pmi_files(source_folder: Path, destination: Path) -> pd.DataFrame:
    """Concatenate the per-file PMI parquet files into a single dataframe.

    The combined frame is sorted by title, written to ``destination`` with
    gzip compression and returned.  If ``destination`` already exists it is
    simply loaded and returned instead.
    """
    if destination.exists():
        print(f"Skipping pmi aggregation, already exists at {destination}")
        return pd.read_parquet(destination)

    part_files = sorted(source_folder.glob("*.gz.parquet"))
    parts = [pd.read_parquet(part) for part in tqdm(part_files)]

    combined = (
        pd.concat(parts)
        .sort_values(by="title")
        .reset_index(drop=True)
    )
    combined.to_parquet(destination, compression="gzip")
    return combined


def per_file_pmi(
    source_folder: Path, destination_folder: Path, p_x: np.ndarray, model_name: str
) -> None:
    """Calculate the top-50 PMI tokens/scores for every article parquet file.

    Each source file produces a matching ``<stem>-pmi.gz.parquet`` file in
    ``destination_folder``; files that already exist are skipped inside the
    workers.  Results are written to disk only, nothing is returned.
    """
    paths = sorted(source_folder.glob("*.gz.parquet"))
    destination_folder.mkdir(exist_ok=True, parents=True)

    # ignore_reinit_error: calculate_px may already have started ray in this
    # session; a plain ray.init() would raise RuntimeError in that case.
    ray.init(ignore_reinit_error=True)
    # Put the (large) p(x) array in the object store once instead of
    # serializing a copy of it into every task.
    p_x_obj = ray.put(p_x)

    tasks = [
        calculate_pmi.remote(
            path,
            destination_folder=destination_folder,
            model_name=model_name,
            p_x=p_x_obj,
        )
        for path in paths
    ]

    with tqdm(total=len(paths)) as pbar:
        while tasks:
            completed, tasks = ray.wait(tasks)
            # ray.get re-raises any worker exception rather than losing it.
            completed = ray.get(completed)
            pbar.update(len(completed))


# Observed >4.25 GB memory in oom exception, limit likely same as original article ingestion
# BUG FIX: ray's memory= resource is specified in *bytes*; 4.5 * 1024 * 1024
# was only 4.5 MiB, far below the observed >4.25 GB usage.  Reserve 4.5 GiB.
@ray.remote(memory=4.5 * 1024 * 1024 * 1024)
def calculate_pmi(
    path: Path, destination_folder: Path, model_name: str, p_x: np.ndarray
) -> Path:
    """Calculate per-page top-50 PMI tokens for one article parquet file.

    Writes a dataframe of (title, tokens, scores) rows to a ``-pmi`` parquet
    file alongside the input name and returns that destination path.  Skips
    the work entirely when the destination already exists.
    """
    destination = destination_folder / f"{path.stem}-pmi.gz.parquet"
    if destination.exists():
        print(f"Skipping {path}, already exists at {destination}")
        return destination

    text_df = pd.read_parquet(path)
    token_df = text_to_pmi(
        model_name=model_name,
        text=text_df.text.tolist(),
        p_x=p_x,
        index=text_df.index,
    )
    # Re-attach the page titles to the token/score columns by row index.
    df = pd.merge(
        text_df[["title"]],
        token_df,
        left_index=True,
        right_index=True,
    )
    df.to_parquet(destination, compression="gzip")

    return destination


def text_to_pmi(
    model_name: str, text: List[str], p_x: np.ndarray, index: pd.Series
) -> pd.DataFrame:
    """Calculate the top 50 PMI tokens and their scores for each text.

    Returns a dataframe aligned to *index* with a "tokens" column (the 50
    highest-PMI token ids per document) and a "scores" column (the matching
    PMI values), both in descending score order.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # suppress "Token indices sequence length is longer than the specified maximum" errors
    tokenizer.deprecation_warnings[
        "sequence-length-is-longer-than-the-specified-maximum"
    ] = True

    vocab_size = tokenizer.vocab_size

    def _block(text: List[str]) -> Tuple[np.ndarray, np.ndarray]:
        tokens = tokenizer(text, return_attention_mask=False)["input_ids"]
        # One (vocab_size,) count row per document.
        counts = np.concatenate(
            [np.bincount(ids, minlength=vocab_size)[None, :] for ids in tokens]
        )
        # Normalize each row to the per-document token distribution p(x|page).
        counts = counts / counts.sum(axis=1)[:, None]
        # PMI = log(p(x|page) / p(x)).  Zero counts give log(0) = -inf and,
        # where p_x is also zero, 0/0 -> nan.
        pmi = np.log(counts / p_x)
        # Replace nan with the -1e300 sentinel so affected pages can be
        # filtered downstream (the filter tests `<= -1e300`); nan_to_num also
        # clips +/-inf to the finite float extremes by default.
        pmi = np.nan_to_num(
            pmi,
            copy=False,
            nan=-1e300,
        )
        # Sort the full vocab per document and keep the 50 best.
        pmi = torch.from_numpy(pmi)
        scores, tokens = pmi.sort(dim=-1, descending=True)
        scores = scores[:, :50].numpy().copy()
        tokens = tokens[:, :50].numpy().copy()
        return tokens, scores

    # Process in chunks of 1,000 documents to bound peak memory.
    all_blocks = [_block(text[idx : idx + 1_000]) for idx in range(0, len(text), 1_000)]
    all_tokens = np.concatenate([tokens for tokens, _ in all_blocks])
    all_scores = np.concatenate([scores for _, scores in all_blocks])
    return pd.DataFrame(
        {
            "tokens": pd.Series(all_tokens.tolist(), index=index),
            "scores": pd.Series(all_scores.tolist(), index=index),
        }
    )
Code
#collapse_input
#hide_output
# Step 1: corpus-wide token distribution p(x), cached as px.npy.
p_x = calculate_px(
    source_folder=DATA_FOLDER / "articles",
    destination=DATA_FOLDER / "px.npy",
    model_name=MODEL_NAME,
)
# Step 2: per-article top-50 PMI tokens/scores, one output file per input file.
per_file_pmi(
    source_folder=DATA_FOLDER / "articles",
    destination_folder=DATA_FOLDER / "pmi",
    p_x=p_x,
    model_name=MODEL_NAME,
)
# Step 3: combine the per-file PMI outputs into a single title-sorted dataframe.
pmi = combine_pmi_files(
    source_folder=DATA_FOLDER / "pmi",
    destination=DATA_FOLDER / "pmi.gz.parquet",
)
Skipping px calculation, already exists at /data/blog/2021-10-03-wikipedia-train-mse/px.npy
2021-10-09 11:58:45,430 INFO services.py:1263 -- View the Ray dashboard at http://127.0.0.1:8266
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles12.xml-p7054860p8554859.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles12.xml-p7054860p8554859.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles18.xml-p25216198p26716197.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles18.xml-p25216198p26716197.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles18.xml-p26716198p27121850.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles18.xml-p26716198p27121850.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles19.xml-p27121851p28621850.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles19.xml-p27121851p28621850.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles19.xml-p28621851p30121850.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles19.xml-p28621851p30121850.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles19.xml-p30121851p31308442.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles19.xml-p30121851p31308442.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles2.xml-p41243p151573.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles2.xml-p41243p151573.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles20.xml-p34308443p35522432.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles20.xml-p34308443p35522432.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles22.xml-p39996246p41496245.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles22.xml-p39996246p41496245.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles23.xml-p49288942p50564553.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles23.xml-p49288942p50564553.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles25.xml-p58525656p60025655.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles25.xml-p58525656p60025655.gz-pmi.gz.parquet
(pid=310242) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles27.xml-p65475910p66975909.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles27.xml-p65475910p66975909.gz-pmi.gz.parquet
(pid=310248) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles18.xml-p23716198p25216197.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles18.xml-p23716198p25216197.gz-pmi.gz.parquet
(pid=310253) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles14.xml-p11659683p13159682.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles14.xml-p11659683p13159682.gz-pmi.gz.parquet
(pid=310253) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles24.xml-p55064554p56564553.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles24.xml-p55064554p56564553.gz-pmi.gz.parquet
(pid=310253) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles25.xml-p60025656p61525655.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles25.xml-p60025656p61525655.gz-pmi.gz.parquet
(pid=310253) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles26.xml-p62585851p63975909.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles26.xml-p62585851p63975909.gz-pmi.gz.parquet
(pid=310253) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles27.xml-p66975910p68108549.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles27.xml-p66975910p68108549.gz-pmi.gz.parquet
(pid=310253) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles5.xml-p558392p958045.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles5.xml-p558392p958045.gz-pmi.gz.parquet
(pid=310253) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles8.xml-p2134112p2936260.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles8.xml-p2134112p2936260.gz-pmi.gz.parquet
(pid=310251) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles17.xml-p20570393p22070392.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles17.xml-p20570393p22070392.gz-pmi.gz.parquet
(pid=310255) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles15.xml-p17324603p17460152.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles15.xml-p17324603p17460152.gz-pmi.gz.parquet
(pid=310256) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles16.xml-p17460153p18960152.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles16.xml-p17460153p18960152.gz-pmi.gz.parquet
(pid=310240) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles12.xml-p8554860p9172788.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles12.xml-p8554860p9172788.gz-pmi.gz.parquet
(pid=310240) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles20.xml-p31308443p32808442.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles20.xml-p31308443p32808442.gz-pmi.gz.parquet
(pid=310240) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles20.xml-p32808443p34308442.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles20.xml-p32808443p34308442.gz-pmi.gz.parquet
(pid=310240) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles21.xml-p35522433p37022432.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles21.xml-p35522433p37022432.gz-pmi.gz.parquet
(pid=310240) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles21.xml-p37022433p38522432.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles21.xml-p37022433p38522432.gz-pmi.gz.parquet
(pid=310240) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles21.xml-p38522433p39996245.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles21.xml-p38522433p39996245.gz-pmi.gz.parquet
(pid=310240) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles22.xml-p44496246p44788941.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles22.xml-p44496246p44788941.gz-pmi.gz.parquet
(pid=310240) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles24.xml-p53564554p55064553.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles24.xml-p53564554p55064553.gz-pmi.gz.parquet
(pid=310240) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles6.xml-p958046p1483661.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles6.xml-p958046p1483661.gz-pmi.gz.parquet
(pid=310257) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles14.xml-p13159683p14324602.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles14.xml-p13159683p14324602.gz-pmi.gz.parquet
(pid=310246) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles17.xml-p22070393p23570392.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles17.xml-p22070393p23570392.gz-pmi.gz.parquet
(pid=310246) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles7.xml-p1483662p2134111.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles7.xml-p1483662p2134111.gz-pmi.gz.parquet
(pid=310244) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles15.xml-p15824603p17324602.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles15.xml-p15824603p17324602.gz-pmi.gz.parquet
(pid=310247) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles16.xml-p20460153p20570392.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles16.xml-p20460153p20570392.gz-pmi.gz.parquet
(pid=310241) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles17.xml-p23570393p23716197.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles17.xml-p23570393p23716197.gz-pmi.gz.parquet
(pid=310241) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles22.xml-p42996246p44496245.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles22.xml-p42996246p44496245.gz-pmi.gz.parquet
(pid=310241) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles23.xml-p46288942p47788941.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles23.xml-p46288942p47788941.gz-pmi.gz.parquet
(pid=310241) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles24.xml-p50564554p52064553.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles24.xml-p50564554p52064553.gz-pmi.gz.parquet
(pid=310241) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles25.xml-p57025656p58525655.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles25.xml-p57025656p58525655.gz-pmi.gz.parquet
(pid=310241) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles27.xml-p63975910p65475909.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles27.xml-p63975910p65475909.gz-pmi.gz.parquet
(pid=310241) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles4.xml-p311330p558391.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles4.xml-p311330p558391.gz-pmi.gz.parquet
(pid=310250) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles11.xml-p6899367p7054859.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles11.xml-p6899367p7054859.gz-pmi.gz.parquet
(pid=310243) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles16.xml-p18960153p20460152.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles16.xml-p18960153p20460152.gz-pmi.gz.parquet
(pid=310243) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles22.xml-p41496246p42996245.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles22.xml-p41496246p42996245.gz-pmi.gz.parquet
(pid=310243) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles23.xml-p44788942p46288941.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles23.xml-p44788942p46288941.gz-pmi.gz.parquet
(pid=310243) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles23.xml-p47788942p49288941.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles23.xml-p47788942p49288941.gz-pmi.gz.parquet
(pid=310243) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles24.xml-p52064554p53564553.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles24.xml-p52064554p53564553.gz-pmi.gz.parquet
(pid=310243) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles24.xml-p56564554p57025655.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles24.xml-p56564554p57025655.gz-pmi.gz.parquet
(pid=310243) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles25.xml-p61525656p62585850.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles25.xml-p61525656p62585850.gz-pmi.gz.parquet
(pid=310243) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles3.xml-p151574p311329.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles3.xml-p151574p311329.gz-pmi.gz.parquet
(pid=310243) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles9.xml-p2936261p4045402.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles9.xml-p2936261p4045402.gz-pmi.gz.parquet
(pid=310254) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles13.xml-p9172789p10672788.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles13.xml-p9172789p10672788.gz-pmi.gz.parquet
(pid=310258) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles11.xml-p5399367p6899366.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles11.xml-p5399367p6899366.gz-pmi.gz.parquet
(pid=310252) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles15.xml-p14324603p15824602.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles15.xml-p14324603p15824602.gz-pmi.gz.parquet
(pid=310245) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles13.xml-p10672789p11659682.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles13.xml-p10672789p11659682.gz-pmi.gz.parquet
(pid=310259) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles1.xml-p1p41242.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles1.xml-p1p41242.gz-pmi.gz.parquet

Skipping pmi aggregation, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi.gz.parquet
(pid=310249) Skipping /data/blog/2021-10-03-wikipedia-train-mse/articles/enwiki-20210701-pages-articles10.xml-p4045403p5399366.gz.parquet, already exists at /data/blog/2021-10-03-wikipedia-train-mse/pmi/enwiki-20210701-pages-articles10.xml-p4045403p5399366.gz-pmi.gz.parquet
Code
pmi
title tokens scores
0 ! (Cláudia Pascoal album) [35134, 26404, 20155, 9973, 8062, 13053, 14062... [9.290038528698204, 7.986949862985146, 7.76676...
1 ! (The Dismemberment Plan album) [39562, 42305, 29042, 15674, 39211, 21062, 465... [9.262877524452705, 8.896630429014293, 8.86250...
2 ! (The Song Formerly Known As) [44926, 32589, 43877, 44562, 32276, 23940, 113... [9.927762220193253, 9.268947739832392, 8.99197...
3 ! (Trippie Redd album) [46046, 44261, 33539, 45170, 37186, 31834, 265... [9.582111458155616, 8.997840204998903, 8.60897...
4 ! (disambiguation) [49005, 46741, 25121, 31317, 29256, 37724, 487... [10.846365638801405, 9.619055047169276, 9.0023...
... ... ... ...
6328473 𝕊 [23567, 8576, 23944, 40051, 45429, 7457, 14141... [9.569169164058135, 8.343952594921872, 8.26162...
6328474 🍜 [0, 2, 50114, 50130, 50129, 50128, 50127, 5012... [4.282792861024332, 4.282791475871699, -1e+300...
6328475 🍲 [0, 2, 50114, 50130, 50129, 50128, 50127, 5012... [4.282792861024332, 4.282791475871699, -1e+300...
6328476 🥣 [0, 2, 50114, 50130, 50129, 50128, 50127, 5012... [4.282792861024332, 4.282791475871699, -1e+300...
6328477 🥤 [0, 2, 50114, 50130, 50129, 50128, 50127, 5012... [4.282792861024332, 4.282791475871699, -1e+300...

6328478 rows × 3 columns

Now that we have it, just glancing at it I can see some problems. The scores for the emoji at the bottom hit the sentinel value (-1e+300) which indicates that the PMI values could not be calculated (it’s the number that is substituted for nan values).

I need to exclude any row that has such a value.

Code
# A page is invalid if any of its top-50 scores reached the -1e300 nan
# sentinel written by text_to_pmi.
invalid_page_mask = pmi.scores.apply(lambda values: min(values) <= -1e300)
print(f"There are {invalid_page_mask.sum():,} invalid pages")
There are 875,981 invalid pages

That seems like a lot! I’m going to drop these for now.

Code
# Drop the invalid pages and renumber the rows so the row index can serve as
# the page index for the link targets.
pmi = pmi[~invalid_page_mask]
pmi = pmi.reset_index(drop=True)
pmi
title tokens scores
0 ! (Cláudia Pascoal album) [35134, 26404, 20155, 9973, 8062, 13053, 14062... [9.290038528698204, 7.986949862985146, 7.76676...
1 ! (The Dismemberment Plan album) [39562, 42305, 29042, 15674, 39211, 21062, 465... [9.262877524452705, 8.896630429014293, 8.86250...
2 ! (The Song Formerly Known As) [44926, 32589, 43877, 44562, 32276, 23940, 113... [9.927762220193253, 9.268947739832392, 8.99197...
3 ! (Trippie Redd album) [46046, 44261, 33539, 45170, 37186, 31834, 265... [9.582111458155616, 8.997840204998903, 8.60897...
4 ! (disambiguation) [49005, 46741, 25121, 31317, 29256, 37724, 487... [10.846365638801405, 9.619055047169276, 9.0023...
... ... ... ...
5452492 [15023, 19991, 34849, 46873, 44947, 46969, 363... [8.597022608881108, 7.832322879728436, 7.72508...
5452493 [44763, 46483, 43195, 48456, 39609, 15023, 384... [10.969752362154262, 8.554106174148323, 8.2454...
5452494 [45415, 45299, 46483, 27003, 38494, 28243, 117... [11.01923101017471, 10.358655093166249, 8.8245...
5452495 이상화 [48328, 49550, 48414, 48280, 46747, 47649, 224... [9.341999317797894, 9.235801267109444, 8.43977...
5452496 𓈖 [31228, 23077, 29588, 49585, 37552, 39218, 342... [10.64922465565504, 9.69788970886562, 9.312148...

5452497 rows × 3 columns

Code
# Persist the filtered PMI dataframe for reuse in later processing.
pmi.to_parquet(DATA_FOLDER / "pmi-filtered.gz.parquet", compression="gzip")

Reprocessing the Data

Since I’ve changed the indexes of the pages I now need to reprocess the text and the associated page indexes for each link. The boundaries can remain the same as the links to the now missing pages are still links even if we don’t subsequently attempt to resolve them.

This change means that the model code will have to change as well, as it should only look at the token link labels to determine if the link needs to be evaluated.

Code
# from src/main/python/blog/wikipedia_link/data/labels/generate_iob.py
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterator, List, Tuple, Union

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer


@dataclass
class IOBEncoder:
    """Encodes the row using IOB boundary labels.

    Each token receives a ``[boundary, page_index]`` pair where boundary is
    outside_label (0), beginning_label (2) or inside_label (1), and
    page_index is the linked page's index in title_to_index (-1 when the
    token is outside a link or the target page is unknown).
    """

    tokenizer: AutoTokenizer
    title_to_index: Dict[str, int]  # page title -> row index in the pmi dataframe
    max_length: int = 256
    inside_label: int = 1
    beginning_label: int = 2
    outside_label: int = 0

    def encode(self, paths: List[Path], destination_folder: Path) -> None:
        """Encode every parquet file in *paths*, writing one output file each.

        Rows shorter than max_length tokens are dropped; already encoded
        files are skipped.
        """
        destination_folder.mkdir(exist_ok=True, parents=True)
        for path in tqdm(paths):
            destination = destination_folder / path.name
            if destination.exists():
                print(f"Skipping {path}, already encoded at {destination}")
                continue
            df = pd.read_parquet(path)
            encoded_df = pd.DataFrame(df.apply(self, axis="columns").tolist())
            # Keep only full-length rows so every entry has a uniform shape.
            encoded_df = encoded_df[encoded_df.input_ids.apply(len) == self.max_length]
            encoded_df.to_parquet(destination, compression="gzip")
            print(f"Written {len(encoded_df):,} rows to {destination}")

    def __call__(self, row: Union[pd.Series, Dict[str, Any]]) -> Dict[str, List[int]]:
        """Tokenize one article row and attach per-token IOB labels."""
        tokenized_text = self.tokenizer(
            row["text"],
            return_offsets_mapping=True,
            return_attention_mask=False,
            truncation=True,
            max_length=self.max_length,
        )
        labels = self._to_boundaries(
            token_offsets=tokenized_text["offset_mapping"],
            link_starts=row["start"],
            link_ends=row["end"],
            link_targets=row["link"],
        )
        return {
            "input_ids": tokenized_text["input_ids"],
            "label": labels,
        }

    def _to_boundaries(
        self,
        *,
        token_offsets: List[Tuple[int, int]],
        link_starts: List[int],
        link_ends: List[int],
        link_targets: List[str],
    ) -> List[List[int]]:
        """Produce one [boundary, page_index] pair per token.

        Links and tokens are both in document order, so a single pass with a
        shared iterator over the links suffices.
        """
        title_to_index = self.title_to_index
        inside_label = self.inside_label
        beginning_label = self.beginning_label

        # Default label: outside any link, no target page.  The inner lists
        # are shared references, which is safe because they are only ever
        # replaced wholesale below, never mutated in place.
        boundaries = [[self.outside_label, -1]] * len(token_offsets)

        link_iter = zip(link_starts, link_ends, link_targets)
        try:
            link_start, link_end, link_target = _next_link(
                token_start=0,
                links=link_iter,
                title_to_index=title_to_index,
            )

            within = False
            for index, (token_start, token_end) in enumerate(token_offsets):
                if token_start >= link_end:
                    # BUG FIX: pass the current token start (was hard-coded 0)
                    # so every link that ended before this token is skipped in
                    # one call.  Passing 0 advanced only one link per token,
                    # which could leave the iterator on a stale link and miss
                    # a later link that overlaps the current token.
                    link_start, link_end, link_target = _next_link(
                        token_start=token_start,
                        links=link_iter,
                        title_to_index=title_to_index,
                    )
                    within = False

                # Token overlaps the current link span.
                if token_start < link_end and token_end > link_start:
                    boundaries[index] = [
                        inside_label if within else beginning_label,
                        link_target,
                    ]
                    within = True
        except StopIteration:
            # Ran out of links: remaining tokens keep the outside label.
            pass

        return boundaries


def _next_link(
    token_start: int,
    links: Iterator[Tuple[int, int, str]],
    title_to_index: Dict[str, int],
) -> Tuple[int, int, int]:
    link_start, link_end, link_target = next(links)
    while token_start >= link_end:
        link_start, link_end, link_target = next(links)
    return link_start, link_end, title_to_index.get(link_target, -1)
Code
from transformers import AutoTokenizer

# The tokenizer must match the model so the character offsets line up.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): `pmi.index` is the dataframe row index (row position), not a
# column named "index" — confirm that matches the token-table ordering.
title_to_index = dict(zip(pmi.title, pmi.index))
encoder = IOBEncoder(tokenizer=tokenizer, title_to_index=title_to_index)
Code
#hide_output
# Re-encode every article shard: tokenize each page and attach the per-token
# IOB boundary label and link-target index, writing one parquet per shard.
encoder.encode(
    paths=sorted((DATA_FOLDER / "articles").glob("*.gz.parquet")),
    destination_folder=DATA_FOLDER / "inputs"
)
Written 19,583 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles1.xml-p1p41242.gz.parquet
Written 122,033 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles10.xml-p4045403p5399366.gz.parquet
Written 121,778 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles11.xml-p5399367p6899366.gz.parquet
Written 11,222 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles11.xml-p6899367p7054859.gz.parquet
Written 97,264 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles12.xml-p7054860p8554859.gz.parquet
Written 39,082 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles12.xml-p8554860p9172788.gz.parquet
Written 53,787 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles13.xml-p10672789p11659682.gz.parquet
Written 78,068 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles13.xml-p9172789p10672788.gz.parquet
Written 89,777 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles14.xml-p11659683p13159682.gz.parquet
Written 58,055 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles14.xml-p13159683p14324602.gz.parquet
Written 71,875 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles15.xml-p14324603p15824602.gz.parquet
Written 62,280 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles15.xml-p15824603p17324602.gz.parquet
Written 5,952 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles15.xml-p17324603p17460152.gz.parquet
Written 70,992 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles16.xml-p17460153p18960152.gz.parquet
Written 64,310 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles16.xml-p18960153p20460152.gz.parquet
Written 4,842 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles16.xml-p20460153p20570392.gz.parquet
Written 70,944 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles17.xml-p20570393p22070392.gz.parquet
Written 70,849 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles17.xml-p22070393p23570392.gz.parquet
Written 8,093 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles17.xml-p23570393p23716197.gz.parquet
Written 74,902 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles18.xml-p23716198p25216197.gz.parquet
Written 68,427 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles18.xml-p25216198p26716197.gz.parquet
Written 19,316 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles18.xml-p26716198p27121850.gz.parquet
Written 65,065 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles19.xml-p27121851p28621850.gz.parquet
Written 54,886 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles19.xml-p28621851p30121850.gz.parquet
Written 53,582 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles19.xml-p30121851p31308442.gz.parquet
Written 61,949 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles2.xml-p41243p151573.gz.parquet
Written 69,437 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles20.xml-p31308443p32808442.gz.parquet
Written 64,973 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles20.xml-p32808443p34308442.gz.parquet
Written 45,131 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles20.xml-p34308443p35522432.gz.parquet
Written 64,647 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles21.xml-p35522433p37022432.gz.parquet
Written 59,601 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles21.xml-p37022433p38522432.gz.parquet
Written 62,193 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles21.xml-p38522433p39996245.gz.parquet
Written 62,525 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles22.xml-p39996246p41496245.gz.parquet
Written 63,987 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles22.xml-p41496246p42996245.gz.parquet
Written 66,754 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles22.xml-p42996246p44496245.gz.parquet
Written 10,474 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles22.xml-p44496246p44788941.gz.parquet
Written 41,988 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles23.xml-p44788942p46288941.gz.parquet
Written 67,108 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles23.xml-p46288942p47788941.gz.parquet
Written 54,676 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles23.xml-p47788942p49288941.gz.parquet
Written 43,947 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles23.xml-p49288942p50564553.gz.parquet
Written 58,439 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles24.xml-p50564554p52064553.gz.parquet
Written 57,342 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles24.xml-p52064554p53564553.gz.parquet
Written 54,967 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles24.xml-p53564554p55064553.gz.parquet
Written 56,523 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles24.xml-p55064554p56564553.gz.parquet
Written 17,664 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles24.xml-p56564554p57025655.gz.parquet
Written 53,712 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles25.xml-p57025656p58525655.gz.parquet
Written 48,340 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles25.xml-p58525656p60025655.gz.parquet
Written 53,588 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles25.xml-p60025656p61525655.gz.parquet
Written 37,039 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles25.xml-p61525656p62585850.gz.parquet
Written 50,168 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles26.xml-p62585851p63975909.gz.parquet
Written 47,942 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles27.xml-p63975910p65475909.gz.parquet
Written 49,408 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles27.xml-p65475910p66975909.gz.parquet
Written 36,729 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles27.xml-p66975910p68108549.gz.parquet
Written 48,533 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles3.xml-p151574p311329.gz.parquet
Written 64,665 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles4.xml-p311330p558391.gz.parquet
Written 78,151 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles5.xml-p558392p958045.gz.parquet
Written 93,925 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles6.xml-p958046p1483661.gz.parquet
Written 101,368 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles7.xml-p1483662p2134111.gz.parquet
Written 110,227 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles8.xml-p2134112p2936260.gz.parquet
Written 123,395 rows to /data/blog/2021-10-03-wikipedia-train-mse/inputs/enwiki-20210701-pages-articles9.xml-p2936261p4045402.gz.parquet

This conversion took about 3 hours. Using ray for each file could’ve sped it up quite substantially. Something to consider next time.


Training

The next thing is to define a mean squared error loss that can work with the PMI values we now have.

Composite Model

Ideally this new technique would work with boundary detection. We can start by training the link resolution alongside the boundary detection.

Code
# from src/main/python/blog/wikipedia_link/loss/boundary_iob.py
import torch


def calculate_loss_boundary_iob(
    predictions: torch.Tensor,  # [:, n, 3] inside, outside, begin
    labels: torch.Tensor,  # [:, n, 1] class
) -> torch.Tensor:
    """Cross entropy over the three IOB boundary classes.

    ``predictions`` holds only the boundary logits; ``labels`` holds the
    boundary class per token (the link-target index has its own loss).
    """
    flat_logits = predictions.reshape(-1, 3)
    flat_classes = labels.reshape(-1)
    return torch.nn.functional.cross_entropy(flat_logits, flat_classes)



# from src/main/python/blog/wikipedia_link/loss/link_mse.py
import torch


def calculate_loss_link_mse(
    predictions: torch.Tensor,  # [:, n, vocab_size]
    link_labels: torch.Tensor,  # [:, n] for index
    token_indices: torch.Tensor,  # index -> 50 tokens
    token_values: torch.Tensor,  # index -> 50 pmi values
) -> torch.Tensor:
    """Mean squared error between the softmaxed link logits and the softmaxed
    PMI distribution of the linked page.

    Only tokens with a non-negative link label contribute; a negative label
    marks a token that is not part of a link.  Returns a zero, grad-connected
    loss when the batch contains no links — mse_loss over empty tensors would
    otherwise produce NaN.
    """
    predictions = predictions.softmax(dim=-1)

    mask = link_labels.flatten() >= 0
    link_labels = link_labels.flatten()[mask].long()
    rows = link_labels.shape[0]

    vocab_size = predictions.shape[-1]
    predictions = predictions.view(-1, vocab_size)[mask]

    if rows == 0:
        # No linked tokens in this batch.
        return predictions.sum() * 0.0

    # Dense targets: zero everywhere except the top PMI tokens of each page.
    # 2-D advanced indexing replaces the previous flat-offset scatter built
    # with torch.tensor(range(rows)).
    targets = torch.zeros(rows, vocab_size, device=predictions.device)
    row_index = torch.arange(rows, device=predictions.device).unsqueeze(-1)
    targets[row_index, token_indices[link_labels]] = token_values[link_labels]

    return torch.nn.functional.mse_loss(predictions, targets)



# from src/main/python/blog/wikipedia_link/metrics/boundary_iob.py
from typing import Dict

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def metric_boundary_iob(
    predictions: np.array,
    labels: np.array,
    inside_label: int = 1,
    beginning_label: int = 2,
    outside_label: int = 0,
) -> Dict[str, float]:
    """Accuracy plus per-class precision/recall/fscore for the boundary head.

    ``predictions`` is [n, 3] floats (argmaxed here) and ``labels`` is [n]
    ints; both are assumed already flattened and sliced to the boundary part.
    """
    predicted_classes = predictions.argmax(axis=1)

    precision, recall, fscore, _support = precision_recall_fscore_support(
        labels, predicted_classes, zero_division=0
    )

    results = {"accuracy": accuracy_score(labels, predicted_classes)}
    for name, label in (
        ("beginning", beginning_label),
        ("inside", inside_label),
        ("outside", outside_label),
    ):
        results[f"{name}_precision"] = precision[label]
        results[f"{name}_recall"] = recall[label]
        results[f"{name}_fscore"] = fscore[label]
    return results



# from src/main/python/blog/wikipedia_link/metrics/link_jaccard.py
from typing import Dict

import numpy as np


def metric_link_jaccard(
    predictions: np.array,
    labels: np.array,
    token_indices: np.array,
) -> Dict[str, float]:
    """Jaccard overlap between the model's top-50 tokens and the page's
    top-50 PMI tokens, summed over all linked tokens.

    ``labels`` is [n, cols] where every column but the last flags link
    membership and the last column is the link-target index;
    ``predictions`` is [n, vocab_size] floats, already flattened and sliced.
    """
    # Only tokens that are part of a link count towards the metric.
    is_link = labels[:, :-1].sum(axis=1) > 0
    top_predicted = predictions[is_link].argsort(axis=1)[:, -50:]
    expected_tokens = token_indices[labels[is_link][:, -1]]

    intersection_total = 0
    union_total = 0
    for predicted, expected in zip(top_predicted, expected_tokens):
        intersection_total += len(np.intersect1d(predicted, expected))
        union_total += len(np.union1d(predicted, expected))

    return {
        "accuracy": intersection_total / union_total,
        "intersection": intersection_total,
        "union": union_total,
    }



# from src/main/python/blog/wikipedia_link/model/bart_boundary_iob_value.py
import numpy as np
import torch
from transformers import BartConfig, BartForConditionalGeneration


class BartLinksBoundaryIOBLinkValue(BartForConditionalGeneration):
    """BART with two token-level heads: the pretrained LM head scores link
    targets over the vocabulary and a small linear head predicts the IOB
    boundary class.

    The loss functions and the per-page PMI lookup tables are injected after
    construction (the static placeholders below are overwritten, and
    ``forward`` asserts the tables are set).
    """

    def __init__(self, config: BartConfig) -> None:
        super().__init__(config)
        # iob for the link detection
        self.link_head = torch.nn.Linear(
            in_features=config.d_model, out_features=3, bias=True
        )
        # Set externally before training: per-page top token ids and their
        # softmaxed PMI values, consumed by link_loss.
        self.token_indices = None
        self.token_values = None

    @staticmethod
    def boundary_loss(
        predictions: torch.Tensor,  # [:, n, 3] outside, inside, begin
        labels: torch.Tensor,  # [:, n, 1] class: outside, inside, begin
    ) -> torch.Tensor:
        """Placeholder — replaced with a real loss function after loading."""
        raise NotImplementedError()

    @staticmethod
    def link_loss(
        predictions: torch.Tensor,  # [:, n, vocab_size]
        link_labels: torch.Tensor,  # [:, n] for index
        token_indices: np.array,  # index -> 50 tokens
        token_values: np.array,
    ) -> torch.Tensor:
        """Placeholder — replaced with a real loss function after loading."""
        raise NotImplementedError()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
    ):
        """Run the encoder-decoder and both heads.

        Returns (loss, logits) when labels are provided, otherwise (logits,).
        ``logits`` concatenates vocab_size link logits with 3 boundary logits
        per token; ``labels[:, :, 0]`` is the boundary class and
        ``labels[:, :, 1]`` the link-target index.
        """
        assert self.token_indices is not None, "Model misconfigured, set token_indices"
        assert self.token_values is not None, "Model misconfigured, set token_values"

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
        )
        # Reuse the pretrained LM head for link-target scoring.
        link_logits = self.lm_head(outputs[0]) + self.final_logits_bias
        boundary_logits = self.link_head(outputs[0])
        logits = torch.cat(
            [
                link_logits,
                boundary_logits,
            ],
            dim=-1,
        )

        # Including the full outputs means more stuff gets passed to the metrics method.
        # Keeping the output just the logits or loss and logits makes metrics easier.
        # The base model uses this approach:
        # output = (logits,) + outputs[1:]

        if labels is not None:
            boundary_loss = self.boundary_loss(
                predictions=boundary_logits, labels=labels[:, :, :1]
            )
            link_loss = self.link_loss(
                predictions=link_logits,
                link_labels=labels[:, :, 1],
                token_indices=self.token_indices,
                token_values=self.token_values,
            )
            loss = boundary_loss + link_loss
            return (loss, logits)
        return (logits,)



# from src/main/python/blog/wikipedia_link/data/load.py
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import pandas as pd
import torch
from datasets import Dataset

DEFAULT_DATA_FOLDER = Path("/data/blog/2021-10-03-wikipedia-train-mse")


def load_dataset(
    data_folder: Path = DEFAULT_DATA_FOLDER, test_size: int = 64
) -> Dict[str, Dataset]:
    """Load the last encoded shard (sorted by filename) and split train/test.

    Only a single shard is read; ``test_size`` rows go to the test split.
    """
    # data_folder / "inputs" is already a Path — no need to re-wrap it.
    shards = sorted((data_folder / "inputs").glob("*.gz.parquet"))
    df = pd.read_parquet(shards[-1])
    df = df[["input_ids", "label"]]

    return Dataset.from_pandas(df).train_test_split(test_size=test_size)


def load_page_tokens(
    data_folder: Path = DEFAULT_DATA_FOLDER,
    device: torch.device = torch.device("cuda"),
    tokens_per_page: int = 50,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Load the per-page top-token ids and their softmaxed PMI scores.

    Returns (token_indices, token_values), each shaped
    [pages, tokens_per_page] and moved to ``device``.  The raw PMI scores
    are softmaxed so they can be compared against the softmax of the model
    output.  ``tokens_per_page`` defaults to the 50 tokens stored per page.
    """
    token_df = pd.read_parquet(data_folder / "pmi-filtered.gz.parquet")
    token_df = token_df.set_index("title")

    # The redundant .detach() calls were dropped: tensors built from numpy
    # arrays carry no autograd history.
    token_indices = np.concatenate(token_df.tokens.values).reshape(-1, tokens_per_page)
    token_indices = torch.from_numpy(token_indices).long().to(device)

    token_values = np.concatenate(token_df.scores.values).reshape(-1, tokens_per_page)
    token_values = torch.from_numpy(token_values).float().to(device)
    token_values = token_values.softmax(dim=-1)

    return token_indices, token_values


def load_title_to_index(data_folder: Path = DEFAULT_DATA_FOLDER) -> Dict[str, int]:
    """Map each page title to its row position in the filtered PMI table.

    NOTE(review): ``df.index`` is the dataframe's row index, not a column
    named "index" — confirm the ordering matches load_page_tokens.
    """
    pmi_df = pd.read_parquet(data_folder / "pmi-filtered.gz.parquet")
    return {title: row_id for title, row_id in zip(pmi_df.title, pmi_df.index)}
Code
# Load the per-page token tables, the title -> index map, and the
# train/test split (two eval batches held out).
token_indices, token_values = load_page_tokens(DATA_FOLDER)
title_to_index = load_title_to_index(DATA_FOLDER)
split = load_dataset(DATA_FOLDER, test_size=BATCH_SIZE*2)
Code
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BartLinksBoundaryIOBLinkValue.from_pretrained(MODEL_NAME)

# Inject the lookup tables and loss functions that forward() asserts on.
model.token_indices = token_indices
model.token_values = token_values
model.boundary_loss = calculate_loss_boundary_iob
model.link_loss = calculate_loss_link_mse
Some weights of BartLinksBoundaryIOBLinkValue were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['link_head.weight', 'link_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Code
from typing import *
from pathlib import Path
from transformers import Trainer, TrainingArguments
from transformers import EvalPrediction

# Folder for Trainer checkpoints and logs.
MODEL_RUN_FOLDER = DATA_FOLDER / "runs"
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)

# numpy copy of the token table for the metric functions.
token_indices_npy = token_indices.cpu().numpy()
def compute_metrics(preds: EvalPrediction) -> Dict[str, float]:
    """Boundary IOB metrics plus link jaccard metrics for evaluation.

    The model output concatenates vocab_size link logits with 3 boundary
    logits per token; labels pair a boundary class with a link index.
    """
    flat_labels = preds.label_ids.reshape(-1, 2)
    flat_predictions = preds.predictions.reshape(-1, tokenizer.vocab_size + 3)

    boundary_metrics = metric_boundary_iob(
        predictions=flat_predictions[:, -3:],
        labels=flat_labels[:, 0],
    )
    link_metrics = metric_link_jaccard(
        predictions=flat_predictions[:, :tokenizer.vocab_size],
        labels=flat_labels,
        token_indices=token_indices_npy,
    )

    metrics = {f"boundary_{key}": value for key, value in boundary_metrics.items()}
    metrics.update({f"link_{key}": value for key, value in link_metrics.items()})
    return metrics

# Training configuration: evaluation runs on a "steps" schedule (Trainer's
# default interval) and checkpoints land under MODEL_RUN_FOLDER / "output".
training_args = TrainingArguments(
    report_to=[],
    output_dir=MODEL_RUN_FOLDER / "output",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    warmup_ratio=0.06,
    evaluation_strategy="steps",
    logging_dir=MODEL_RUN_FOLDER / "output",

    num_train_epochs=EPOCHS,
    logging_steps=1_000,

    # not really training properly here
    # load_best_model_at_end=True,
    # metric_for_best_model="quality",
    # greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
The following columns in the training set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running training *****
  Num examples = 123363
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 38555
[38555/38555 4:52:33, Epoch 5/5]
Step Training Loss Validation Loss Boundary Accuracy Boundary Beginning Precision Boundary Beginning Recall Boundary Beginning Fscore Boundary Inside Precision Boundary Inside Recall Boundary Inside Fscore Boundary Outside Precision Boundary Outside Recall Boundary Outside Fscore Link Accuracy Link Intersection Link Union
1000 0.286900 0.238589 0.895752 0.665072 0.663484 0.664277 0.624224 0.728261 0.672241 0.948443 0.929734 0.938995 0.005613 696 124004
2000 0.185200 0.216498 0.906250 0.733516 0.637232 0.681992 0.681544 0.682367 0.681955 0.941849 0.949172 0.945496 0.003727 463 124237
3000 0.175000 0.222351 0.901245 0.691932 0.675418 0.683575 0.634000 0.765700 0.693654 0.953266 0.931030 0.942016 0.002122 264 124436
4000 0.171300 0.218993 0.906982 0.707809 0.670644 0.688725 0.673767 0.725845 0.698837 0.948573 0.942837 0.945696 0.001413 176 124524
5000 0.167600 0.200478 0.916870 0.732648 0.680191 0.705446 0.691511 0.806763 0.744705 0.959193 0.944276 0.951676 0.000939 117 124583
6000 0.164600 0.189287 0.918091 0.747340 0.670644 0.706918 0.736386 0.718599 0.727384 0.948202 0.956803 0.952483 0.000947 118 124582
7000 0.162900 0.218404 0.911865 0.723618 0.687351 0.705018 0.673618 0.780193 0.722999 0.956255 0.941109 0.948621 0.000939 117 124583
8000 0.157600 0.203956 0.911377 0.708642 0.684964 0.696602 0.690315 0.740338 0.714452 0.951732 0.945428 0.948570 0.001124 140 124560
9000 0.149800 0.201089 0.911621 0.739583 0.677804 0.707347 0.686788 0.728261 0.706917 0.949639 0.947588 0.948613 0.000875 109 124591
10000 0.148400 0.192989 0.918945 0.777143 0.649165 0.707412 0.750988 0.688406 0.718336 0.943950 0.962707 0.953236 0.000891 111 124589
11000 0.147700 0.184169 0.921021 0.768571 0.642005 0.699610 0.758044 0.711353 0.733956 0.946497 0.962851 0.954604 0.000730 91 124609
12000 0.147300 0.203711 0.917236 0.740458 0.694511 0.716749 0.711982 0.746377 0.728774 0.952965 0.951044 0.952003 0.000650 81 124619
13000 0.147000 0.192629 0.920044 0.767760 0.670644 0.715924 0.743396 0.713768 0.728281 0.947945 0.959683 0.953778 0.000554 69 124631
14000 0.146400 0.197082 0.917969 0.776836 0.656325 0.711514 0.729763 0.707729 0.718578 0.946553 0.958819 0.952647 0.000618 77 124623
15000 0.146100 0.192789 0.916016 0.764045 0.649165 0.701935 0.718788 0.716184 0.717483 0.946941 0.955940 0.951419 0.000578 72 124628
16000 0.137100 0.205322 0.914673 0.741688 0.692124 0.716049 0.704358 0.722222 0.713178 0.950086 0.951044 0.950565 0.000562 70 124630
17000 0.130800 0.222428 0.917114 0.731343 0.701671 0.716200 0.705556 0.766908 0.734954 0.955588 0.948020 0.951789 0.000570 71 124629
18000 0.132000 0.216368 0.915649 0.732010 0.704057 0.717762 0.702273 0.746377 0.723653 0.953539 0.948596 0.951061 0.000594 74 124626
19000 0.132500 0.202685 0.919189 0.754569 0.689737 0.720698 0.716078 0.758454 0.736657 0.953982 0.952196 0.953088 0.000554 69 124631
20000 0.131200 0.208159 0.919067 0.745407 0.677804 0.710000 0.722543 0.754831 0.738334 0.953067 0.953204 0.953135 0.000682 85 124615
21000 0.131400 0.202520 0.917236 0.754617 0.682578 0.716792 0.713287 0.739130 0.725979 0.951258 0.952628 0.951942 0.000562 70 124630
22000 0.131800 0.194960 0.918335 0.765668 0.670644 0.715013 0.718640 0.740338 0.729328 0.950803 0.954500 0.952648 0.000602 75 124625
23000 0.130900 0.215586 0.911987 0.716749 0.694511 0.705455 0.691874 0.740338 0.715286 0.951739 0.945572 0.948646 0.000674 84 124616
24000 0.119700 0.226422 0.914062 0.716667 0.718377 0.717521 0.698754 0.745169 0.721216 0.953694 0.946004 0.949834 0.000594 74 124626
25000 0.118300 0.213425 0.915649 0.725926 0.701671 0.713592 0.701559 0.760870 0.730012 0.954710 0.947012 0.950846 0.000674 84 124616
26000 0.117400 0.213604 0.916504 0.738272 0.713604 0.725728 0.700559 0.757246 0.727800 0.955020 0.947732 0.951362 0.000618 77 124623
27000 0.117300 0.224768 0.916992 0.732187 0.711217 0.721550 0.706412 0.758454 0.731508 0.955046 0.948308 0.951665 0.000626 78 124622
28000 0.118100 0.222146 0.915405 0.736842 0.701671 0.718826 0.705471 0.731884 0.718435 0.951687 0.950180 0.950933 0.000530 66 124634
29000 0.117500 0.226192 0.916626 0.754617 0.682578 0.716792 0.704545 0.748792 0.725995 0.952402 0.950756 0.951578 0.000642 80 124620
30000 0.118000 0.230557 0.917114 0.744304 0.701671 0.722359 0.707622 0.751208 0.728764 0.953599 0.949892 0.951742 0.000602 75 124625
31000 0.114500 0.230756 0.916504 0.749354 0.692124 0.719603 0.702703 0.753623 0.727273 0.953303 0.949460 0.951378 0.000658 82 124618
32000 0.107500 0.237001 0.915771 0.741117 0.696897 0.718327 0.699105 0.754831 0.725900 0.953795 0.948164 0.950971 0.000666 83 124617
33000 0.108300 0.234847 0.915894 0.729529 0.701671 0.715328 0.708525 0.742754 0.725236 0.952752 0.949460 0.951103 0.000666 83 124617
34000 0.108100 0.239588 0.915039 0.730000 0.696897 0.713065 0.697442 0.757246 0.726115 0.954156 0.947012 0.950571 0.000602 75 124625
35000 0.106200 0.235279 0.914551 0.733503 0.689737 0.710947 0.701373 0.740338 0.720329 0.951762 0.948884 0.950321 0.000586 73 124627
36000 0.106500 0.233483 0.915039 0.731156 0.694511 0.712362 0.701476 0.746377 0.723230 0.952842 0.948452 0.950642 0.000602 75 124625
37000 0.106400 0.234265 0.913818 0.726601 0.704057 0.715152 0.692308 0.750000 0.720000 0.953694 0.946004 0.949834 0.000626 78 124622
38000 0.106300 0.235026 0.914062 0.725490 0.706444 0.715840 0.694631 0.750000 0.721254 0.953701 0.946148 0.949910 0.000610 76 124624

Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-1000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-1000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-1500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-1500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-1500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-1500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-2000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-2000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-2000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-2000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-2500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-2500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-2500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-2500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-3000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-3000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-3000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-3500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-3500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-3500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-3500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-4000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-4000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-4000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-4000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-4500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-4500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-4500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-4500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-5000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-5000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-5000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-5500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-5500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-5500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-5500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-6000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-6000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-6000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-6000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-6500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-6500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-6500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-6500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-6500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-7000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-7000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-7000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-7000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-7500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-7500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-7500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-7500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-7500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-8000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-8000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-8000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-8000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-8000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-8500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-8500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-8500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-8500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-8500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-9000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-9000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-9000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-9000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-9000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-9500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-9500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-9500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-9500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-9500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-10000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-10000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-10000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-10000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-10500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-10500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-10500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-10500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-10500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-11000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-11000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-11000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-11000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-11000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-11500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-11500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-11500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-11500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-11500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-12000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-12000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-12000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-12000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-12000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-12500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-12500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-12500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-12500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-12500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-13000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-13000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-13000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-13000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-13000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-13500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-13500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-13500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-13500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-13500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-14000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-14000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-14000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-14000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-14000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-14500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-14500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-14500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-14500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-14500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-15000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-15000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-15000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-15000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-15000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-15500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-15500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-15500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-15500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-15500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-16000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-16000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-16000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-16000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-16000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-16500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-16500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-16500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-16500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-16500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-17000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-17000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-17000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-17000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-17000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-17500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-17500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-17500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-17500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-17500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-18000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-18000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-18000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-18000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-18000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-18500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-18500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-18500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-18500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-18500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-19000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-19000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-19000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-19000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-19000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-19500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-19500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-19500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-19500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-19500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-20000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-20000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-20000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-20000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-20000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-20500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-20500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-20500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-20500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-20500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-21000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-21000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-21000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-21000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-21000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-21500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-21500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-21500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-21500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-21500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-22000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-22000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-22000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-22000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-22000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-22500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-22500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-22500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-22500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-22500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-23000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-23000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-23000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-23000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-23000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-23500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-23500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-23500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-23500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-23500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-24000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-24000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-24000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-24000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-24000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-24500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-24500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-24500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-24500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-24500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-25000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-25000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-25000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-25000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-25000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-25500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-25500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-25500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-25500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-25500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-26000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-26000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-26000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-26000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-26000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-26500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-26500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-26500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-26500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-26500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-27000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-27000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-27000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-27000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-27000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-27500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-27500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-27500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-27500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-27500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-28000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-28000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-28000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-28000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-28000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-28500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-28500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-28500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-28500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-28500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-29000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-29000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-29000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-29000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-29000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-29500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-29500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-29500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-29500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-29500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-30000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-30000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-30000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-30000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-30000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-30500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-30500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-30500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-30500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-30500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-31000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-31000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-31000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-31000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-31000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-31500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-31500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-31500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-31500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-31500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-32000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-32000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-32000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-32000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-32000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-32500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-32500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-32500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-32500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-32500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-33000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-33000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-33000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-33000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-33000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-33500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-33500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-33500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-33500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-33500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-34000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-34000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-34000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-34000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-34000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-34500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-34500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-34500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-34500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-34500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-35000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-35000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-35000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-35000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-35000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-35500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-35500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-35500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-35500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-35500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-36000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-36000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-36000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-36000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-36000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-36500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-36500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-36500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-36500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-36500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-37000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-37000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-37000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-37000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-37000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-37500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-37500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-37500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-37500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-37500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartLinksBoundaryIOBLinkValue.forward` and have been ignored: __index_level_0__.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 16
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-38000
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-38000/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-38000/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-38000/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-38000/special_tokens_map.json
Saving model checkpoint to /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-38500
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-38500/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-38500/pytorch_model.bin
tokenizer config file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-38500/tokenizer_config.json
Special tokens file saved in /data/blog/2021-10-03-wikipedia-train-mse/runs/output/checkpoint-38500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)

TrainOutput(global_step=38555, training_loss=0.13774871648076814, metrics={'train_runtime': 17554.2424, 'train_samples_per_second': 35.138, 'train_steps_per_second': 2.196, 'total_flos': 9.402583408835328e+16, 'train_loss': 0.13774871648076814, 'epoch': 5.0})
Code
# Persist the fine-tuned model (config + weights) under DATA_FOLDER for later evaluation
model.save_pretrained(DATA_FOLDER / "model-with-links")
Configuration saved in /data/blog/2021-10-03-wikipedia-train-mse/model-with-links/config.json
Model weights saved in /data/blog/2021-10-03-wikipedia-train-mse/model-with-links/pytorch_model.bin

This doesn’t look great. The Jaccard similarity dropped during training, which suggests that the link resolution performance may be very poor.

The mean squared error loss metric is currently quite small - it is comparing the softmax output over 50k tokens, so many of them will be effectively zero both in prediction and target. These multitudes of tiny differences will cause the mean value to be quite small. As a consequence the link recognition loss function may be overpowered by the boundary detection loss function.

Composite Evaluation

Let’s see how the model performs when we evaluate it on some text. This will give a better idea of how it has balanced the boundary detection with link resolution.

Code
# Switch the model to evaluation mode (disables dropout etc.) before running inference
model = model.eval()
Code
# from src/main/python/blog/wikipedia_link/evaluation/text.py
from typing import List, Optional, Tuple

import torch
from transformers import AutoModel, AutoTokenizer


@torch.no_grad()
def infer(
    text: str, *, tokenizer: AutoTokenizer, model: AutoModel
) -> List[Tuple[str, Optional[List[str]]]]:
    """Tokenize *text* and decode the top link predictions for each token.

    Returns one (token, decoded_links) pair per input token, where
    decoded_links holds the decoded top-10 predicted link tokens, or is
    None for tokens the model does not mark as part of a link.
    """
    results = []
    for token, link in _simple_infer(text, tokenizer=tokenizer, model=model):
        decoded = tokenizer.batch_decode(link[:10]) if link else None
        results.append((token, decoded))
    return results


@torch.no_grad()
def _simple_infer(
    text: str, tokenizer: AutoTokenizer, model: AutoModel
) -> List[Tuple[str, Optional[List[str]]]]:
    """Run the model over *text* and pair each token with its link candidates.

    For every input token this yields (decoded_token, top_10_link_token_ids)
    when the boundary head marks the token as part of a link, and
    (decoded_token, None) otherwise.
    """
    encoded = tokenizer(
        text,
        return_attention_mask=False,
        return_tensors="pt",
        max_length=256,
        truncation=True,
    )["input_ids"]
    encoded = encoded.to(model.device)

    predictions = model(encoded)[0]
    vocab_size = model.config.vocab_size
    # The first vocab_size entries of the output are the link-token logits.
    link_logits = predictions[0, :, :vocab_size]
    top_links = link_logits.argsort(dim=-1, descending=True)[:, :50]
    # The remaining entries are the boundary (IOB) logits; class 0 is "outside",
    # so anything above 0 marks the token as part of a link.
    boundary_flags = (predictions[0, :, vocab_size:].argmax(dim=-1) > 0).tolist()

    decoded_tokens = tokenizer.batch_decode(encoded[0, :, None])

    results = []
    for token, candidates, within_link in zip(
        decoded_tokens, top_links[:, :, None].tolist(), boundary_flags
    ):
        results.append((token, candidates[:10] if within_link else None))
    return results
Code
# Spot-check the trained model on a sentence whose "Malibu" should resolve to a car link
infer("I like to drive my Malibu", tokenizer=tokenizer, model=model)
[('<s>', None),
 ('I', None),
 (' like', None),
 (' to', None),
 (' drive', None),
 (' my', None),
 (' Mal',
  ['…..',
   ' \xad',
   ' Playstation',
   ' [',
   ' Cap',
   ' �',
   ' American',
   ' French',
   ' Greek',
   'HHHH']),
 ('ibu',
  [' HOUSE',
   ' cricket',
   ' \xad',
   'HHHH',
   'agame',
   ' Playstation',
   'se',
   ' Offline',
   ' Dome',
   ' \u200b']),
 ('</s>', None)]

This doesn’t look good at all. These tokens are totally unrelated to driving or to a car. I should remind you that the very first attempt at training a model for this task produced the following tokens for this input:

 (' Mal',
  [' Mal',
   ' Honda',
   ' Ferrari',
   ' Bentley',
   ' Chevy',
   '.',
   'ibu',
   ':',
   ' Jaguar',
   ' in']),
 ('ibu',
  ['ibu',
   ' convertible',
   ' Chevy',
   ' sedan',
   ' Jaguar',
   ' Bentley',
   ' Corvette',
   ' Model',
   ' Honda',
   ' Ferrari']),

There is clearly a problem with the new training approach. I think that the two loss functions may be out of scale with each other, leading to the boundary detection dominating the model updates.