Using the Wikipedia Dataset for Cross Language WSI

Training a model with the Wikipedia Dataset

Categories: prompt internalization, multilingual prompt internalization, cross language word sense induction

Published: September 13, 2022

Code
import blog.transformers_logging
Code
from pathlib import Path

INTERIM_FOLDER = Path("/data/prompt-internalization/multilingual/wikipedia/interim")
PROCESSED_FOLDER = Path("/data/prompt-internalization/multilingual/wikipedia/processed")
Code
from pathlib import Path

ARTICLE_FILE = PROCESSED_FOLDER / "20220701" / "article-descriptions.gz.parquet"
DATASET_FOLDER = PROCESSED_FOLDER / "20220701" / "student"
MODEL_FOLDER = Path("/data/prompt-internalization/multilingual/models/wikipedia")
RUN_FOLDER = Path("/tmp/runs")

MODEL_FOLDER.mkdir(parents=True, exist_ok=True)
RUN_FOLDER.mkdir(parents=True, exist_ok=True)
Code
from typing import Any, Dict, List, Optional, Tuple, Union
from pathlib import Path

import pandas as pd
import datasets
import torch
import torch.nn.functional as F
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import MaskedLMOutput

class ArticleTrainingArguments(TrainingArguments):
    def __init__(
        self,
        *args,
        temperature: float = 2.0,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.temperature = temperature


class ArticleMeasure:
    """Holds the precomputed teacher description distribution for each article
    and scores the student predictions against it with a KL divergence loss."""

    def __init__(self, file: Path) -> None:
        df = pd.read_parquet(file)
        self.indices = [
            torch.tensor(values, dtype=torch.long)
            for values in df["indices"]
        ]
        self.mean = [
            torch.tensor(values, dtype=torch.float)
            for values in df["mean"]
        ]
        self.baseline = torch.tensor(df["baseline"].tolist())

    def to(self, device) -> None:
        self.indices = [entry.to(device) for entry in self.indices]
        self.mean = [entry.to(device) for entry in self.mean]
        self.baseline = self.baseline.to(device)

    def loss(self, output: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        batch_size, token_count, vocab_size = output.shape

        # labels holds, for each noun, the start token index at [..., 0] and the
        # teacher target row index at [..., 2], padded with -1
        start_tokens = labels[:, :, 0]
        offsets = torch.arange(start=0, end=batch_size, dtype=torch.long, device=output.device) * token_count
        offsets = offsets.repeat_interleave((start_tokens != -1).sum(axis=1))

        start_tokens = start_tokens.flatten()
        target_indices = labels[:, :, 2].flatten()
        token_mask = start_tokens != -1
        start_tokens = start_tokens[token_mask]
        target_indices = target_indices[token_mask]
        # offsets = offsets[token_mask] # repeat_interleave has already established this

        # flatten the batch so the noun logits can be gathered with a single index
        output = output.reshape(-1, vocab_size)
        predictions = output[start_tokens + offsets]
        # rebuild the dense teacher distribution: every token starts at the
        # baseline value, then the stored indices are overwritten with their means
        targets = torch.ones_like(
            predictions,
            dtype=torch.float,
            device=output.device,
            requires_grad=False,
        ) * self.baseline[target_indices, None]
        for row_index, index in enumerate(target_indices):
            targets[row_index, self.indices[index]] = self.mean[index]

        return F.kl_div(
            input=F.log_softmax(predictions, dim=-1),
            target=F.log_softmax(targets, dim=-1),
            reduction="batchmean",
            log_target=True,
        )

class ArticleTrainer(Trainer):
    def __init__(
        self,
        *args,
        article_file: Optional[Path] = None,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.measure = ArticleMeasure(article_file)
        self.measure.to(self.model.device)

    def compute_loss(
        self,
        model: AutoModelForMaskedLM,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        return_outputs: bool = False,
    ) -> Union[Tuple[torch.Tensor, MaskedLMOutput], torch.Tensor]:
        outputs: MaskedLMOutput = model(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
        )
        loss: torch.Tensor = self.measure.loss(outputs.logits, labels=inputs["labels"])

        if not return_outputs:
            return loss
        return loss, outputs



def compute_metrics(model_output: EvalPrediction) -> Dict[str, float]:
    # distance is just loss already
    kl_div = model_output.predictions[:, 0].mean()
    overlap = model_output.predictions[:, 1].mean()
    return {
        "kl_div": kl_div,
        "overlap": overlap,
    }



def train(
    *,
    model_name: str = "xlm-roberta-base",
    # dataset_name: str = "xlm-roberta",
    batch_size: int = 32,
    learning_rate: float = 1e-4,
    # temperature: float = 2,
    fp16: bool = False,
    # mean_prediction: bool = False,
    # ignore_tokens: Optional[List[int]] = None,
    epochs: Optional[float] = 2,
    max_steps: int = -1,
    evaluation_steps: int = 500,
    article_file: Optional[Path] = None,
) -> Path:
    assert article_file is not None
    run_name = "-".join(
        [
            f"{model_name}",
            f"e{epochs}" if max_steps == -1 else f"ms{max_steps}",
            f"bs{batch_size}",
            f"lr{learning_rate}",
            # f"t{temperature}",
        ]
        + (["fp16"] if fp16 else [])
        # + (["mean"] if mean_prediction else [])
        # + ([f"it{len(ignore_tokens)}"] if ignore_tokens else [])
    )
    print(f"Starting {run_name}")
    train_ds = datasets.load_from_disk(DATASET_FOLDER / "train.dataset")
    test_ds = datasets.load_from_disk(DATASET_FOLDER / "valid.dataset")

    training_args = ArticleTrainingArguments(
        report_to="none",
        output_dir=RUN_FOLDER,
        num_train_epochs=epochs,
        max_steps=max_steps,
        seed=33,
        # number of steps before moving evaluation results from GPU to CPU see
        # https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941
        eval_accumulation_steps=5,
        #
        # hyperparameters
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        fp16=fp16,
        # temperature=temperature,
        # mean_prediction=mean_prediction,
        # ignore_tokens=ignore_tokens,
        learning_rate=learning_rate,
        #
        # evaluation settings
        evaluation_strategy="steps",
        logging_steps=evaluation_steps,
        eval_steps=evaluation_steps,
        save_steps=evaluation_steps,
        #
        # checkpoint settings
        logging_dir=RUN_FOLDER / "logs",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        # remove_unused_columns=False,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = ArticleTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
        # compute_metrics=compute_metrics,
        article_file=article_file,
    )

    trainer.train()
    model.save_pretrained(MODEL_FOLDER / run_name)

    return MODEL_FOLDER / run_name
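
The target reconstruction inside ArticleMeasure.loss is easier to see on a toy example. The teacher distribution for each article is stored sparsely as a baseline value plus the (index, mean) pairs of its strongest description tokens; the dense target is rebuilt from those before taking the KL divergence against the student logits. The sketch below replays that for a single row with made-up shapes and values, it is purely illustrative and not part of the training pipeline.

Code
import torch
import torch.nn.functional as F

vocab_size = 10

# sparse teacher description for one article (made-up values):
# most tokens sit at the baseline, a few tokens have stored means
baseline = -5.0
indices = torch.tensor([2, 7])   # strongest description tokens
mean = torch.tensor([3.0, 1.5])  # their stored teacher values

# rebuild the dense target distribution for this one row
target = torch.full((1, vocab_size), baseline)
target[0, indices] = mean

# student logits for the noun token (made-up values)
prediction = torch.randn(1, vocab_size)

# same KL divergence call as ArticleMeasure.loss, for a single row
loss = F.kl_div(
    input=F.log_softmax(prediction, dim=-1),
    target=F.log_softmax(target, dim=-1),
    reduction="batchmean",
    log_target=True,
)
print(loss)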
Code
model_path = train(
    model_name="xlm-roberta-base",
    batch_size=8,
    learning_rate=1e-4,
    # epochs=20,
    max_steps=1_000,
    evaluation_steps=100,
    article_file=ARTICLE_FILE,
)
Starting xlm-roberta-base-ms1000-bs8-lr0.0001
/home/matthew/.cache/pypoetry/virtualenvs/blog-HrtMnrOS-py3.10/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
[1000/1000 17:14, Epoch 0/1]
| Step | Training Loss | Validation Loss |
|------|---------------|-----------------|
| 100  | 1.240700      | 0.566710        |
| 200  | 0.588300      | 0.485166        |
| 300  | 0.505400      | 0.442383        |
| 400  | 0.480900      | 0.434608        |
| 500  | 0.427500      | 0.405620        |
| 600  | 0.394600      | 0.382248        |
| 700  | 0.371100      | 0.375811        |
| 800  | 0.348000      | 0.351471        |
| 900  | 0.326100      | 0.329928        |
| 1000 | 0.323300      | 0.326775        |

Code
# from src/main/python/blog/prompt_internalization/multilingual/roberta/evaluate.py
from pathlib import Path
from typing import List, Optional, Tuple

import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer


def evaluate(
    model_name: str, model_path: Path, ignore_tokens: Optional[List[int]] = None
) -> None:
    if ignore_tokens is None:
        ignore_tokens = []

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_path)
    model.eval()

    bass_evaluation(model=model, tokenizer=tokenizer, ignore_tokens=ignore_tokens)
    friday_evaluation(model=model, tokenizer=tokenizer, ignore_tokens=ignore_tokens)
    malibu_evaluation(model=model, tokenizer=tokenizer, ignore_tokens=ignore_tokens)
    football_evaluation(model=model, tokenizer=tokenizer, ignore_tokens=ignore_tokens)


def bass_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    first_phrase = "We spotted a large bass in the ocean."
    second_phrase = "The bass player did not receive the acknowledgment she deserves."
    third_phrase = "The black sea bass, is a member of the wreckfish family."

    first_predicted_words = get_predictions(
        model=model,
        tokenizer=tokenizer,
        text=first_phrase,
        noun="bass",
        ignore_tokens=ignore_tokens,
    )
    second_predicted_words = get_predictions(
        model=model,
        tokenizer=tokenizer,
        text=second_phrase,
        noun="bass",
        ignore_tokens=ignore_tokens,
    )
    third_predicted_words = get_predictions(
        model=model,
        tokenizer=tokenizer,
        text=third_phrase,
        noun="bass",
        ignore_tokens=ignore_tokens,
    )

    print("=== BASS EVALUATION ===")
    print(f"First Phrase is: {first_phrase} Target is: bass")
    print(f"Description is: {', '.join(first_predicted_words)}")
    print()

    print(f"Second Phrase is: {second_phrase} Target is: bass")
    print(f"Description is: {', '.join(second_predicted_words)}")
    print()

    print(f"Third Phrase is: {third_phrase} Target is: bass")
    print(f"Description is: {', '.join(third_predicted_words)}")
    print()

    print(
        f"First & Second: {sorted(set(first_predicted_words) & set(second_predicted_words))}"
    )
    print(
        f"First & Third: {sorted(set(first_predicted_words) & set(third_predicted_words))}"
    )
    print(
        f"Second & Third: {sorted(set(second_predicted_words) & set(third_predicted_words))}"
    )
    print()


def friday_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    spanish_text = "Friday es mi canción favorita."
    english_text = "Friday is my favourite song."

    spanish_predicted_words = get_predictions(
        model=model,
        tokenizer=tokenizer,
        text=spanish_text,
        noun="Friday",
        ignore_tokens=ignore_tokens,
    )
    english_predicted_words = get_predictions(
        model=model,
        tokenizer=tokenizer,
        text=english_text,
        noun="Friday",
        ignore_tokens=ignore_tokens,
    )

    overlap = set(spanish_predicted_words) & set(english_predicted_words)
    difference = set(spanish_predicted_words) ^ set(english_predicted_words)

    print("=== FRIDAY EVALUATION ===")
    print(f"Spanish Phrase is: {spanish_text}")
    print(f"Spanish Description is: {', '.join(spanish_predicted_words)}")

    print(f"English Phrase is: {english_text}")
    print(f"English Description is: {', '.join(english_predicted_words)}")
    print()

    print(f"Description Overlap is: {', '.join(sorted(overlap))}")
    print(f"Description Difference is: {', '.join(sorted(difference))}")
    print()


def malibu_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    text = "I like to drive my Malibu while drinking Malibu."

    first_predicted_words = get_predictions(
        model=model,
        tokenizer=tokenizer,
        text=text,
        noun="Malibu",
        ignore_tokens=ignore_tokens,
    )
    second_predicted_words = get_predictions(
        model=model,
        tokenizer=tokenizer,
        text=text,
        noun="Malibu",
        index=1,
        ignore_tokens=ignore_tokens,
    )

    print("=== MALIBU EVALUATION ===")
    print(f"Phrase is: {text}")
    print(f"First Malibu (car) Description is: {', '.join(first_predicted_words)}")
    print(f"Second Malibu (drink) Description is: {', '.join(second_predicted_words)}")
    print()

    print(
        f"First & Second: {sorted(set(first_predicted_words) & set(second_predicted_words))}"
    )
    print(
        f"First ^ Second: {sorted(set(first_predicted_words) ^ set(second_predicted_words))}"
    )
    print()


def football_evaluation(
    model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, ignore_tokens: List[int]
) -> None:
    spanish_phrase = (
        "Retiremos el equipo de la cancha, "
        "Boca no merece jugar esta copa que "
        "hace tiempo viene siendo desprestigiada.\n"
        "Ya no se juega al futbol."
    )

    english_phrase = (
        "Let's remove the team from the field, "
        "Boca does not deserve to play this cup that "
        "has long been discredited. "
        "Football is no longer played."
    )

    print("=== FOOTBALL EVALUATION ===")
    print(f"Spanish Phrase is: {spanish_phrase}")
    print(f"English Phrase is: {english_phrase}")
    print()

    for spanish_noun, english_noun in [
        ["equipo", "team"],
        ["Boca", "Boca"],
        ["copa", "cup"],
        ["tiempo", "long"],
        ["futbol", "Football"],
    ]:
        spanish_description = get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=spanish_phrase,
            noun=spanish_noun,
            ignore_tokens=ignore_tokens,
        )
        english_description = get_predictions(
            model=model,
            tokenizer=tokenizer,
            text=english_phrase,
            noun=english_noun,
            ignore_tokens=ignore_tokens,
        )
        overlap = set(spanish_description) & set(english_description)
        difference = set(spanish_description) ^ set(english_description)

        print(f"Spanish word is: {spanish_noun}, English word is: {english_noun}")
        print(f"Spanish Description is: {', '.join(spanish_description)}")
        print(f"English Description is: {', '.join(english_description)}")
        print(f"Overlap is: {', '.join(sorted(overlap))} ({len(overlap)})")
        print(f"Difference is: {', '.join(sorted(difference))} ({len(difference)})")
        print()


@torch.inference_mode()
def get_predictions(
    *,
    model: AutoModelForMaskedLM,
    tokenizer: AutoTokenizer,
    text: str,
    noun: str,
    index: int = 0,
    ignore_tokens: Optional[List[int]] = None,
) -> List[str]:
    if ignore_tokens is None:
        ignore_tokens = []

    tokens = tokenizer(text, return_tensors="pt")
    start, _end = get_noun(
        tokenizer=tokenizer, tokens=tokens.input_ids[0], noun=noun, index=index
    )

    output = model(**tokens)
    predictions = output.logits[0, start]
    predictions[ignore_tokens] = predictions.min()
    predicted_tokens = predictions.argsort(descending=True)[:10]
    predicted_words = [
        word.strip() for word in tokenizer.batch_decode(predicted_tokens)
    ]

    return predicted_words


def get_noun(
    tokenizer: AutoTokenizer, tokens: torch.Tensor, noun: str, index: int
) -> Tuple[int, int]:
    length = tokens.shape[0]
    current_index = index
    for start_index in range(length):
        word = tokenizer.decode(tokens[start_index]).strip()
        if not noun.startswith(word):
            continue
        for end_index in range(start_index + 1, length):
            word = tokenizer.decode(tokens[start_index:end_index]).strip()
            if noun != word:
                continue
            if current_index > 0:
                current_index -= 1
            else:
                return start_index, end_index
    raise AssertionError(f"Did not find {noun}[{index}] in {tokenizer.decode(tokens)}")
Code
evaluate("xlm-roberta-base", model_path)
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Area, Material, Type, Color, Location, Size, Description, View, Surface, Name

Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Instrument, Material, Type, Music, Style, Position, Description, Language, Profession, Color

Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Animal, Type, Color, Name, Plant, Material, Food, Description, Cat, Family

First & Second: ['Color', 'Description', 'Material', 'Type']
First & Third: ['Color', 'Description', 'Material', 'Name', 'Type']
Second & Third: ['Color', 'Description', 'Material', 'Type']

=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Date, Day, Time, Description, Name, Song, Theme, Type, Tag, Color
English Phrase is: Friday is my favourite song.
English Description is: Date, Day, Time, Theme, Song, Description, Name, Color, Type, Tag

Description Overlap is: Color, Date, Day, Description, Name, Song, Tag, Theme, Time, Type
Description Difference is: 

=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Country, Location, Land, Language, Region, City, Origin, Type, State, Source
Second Malibu (drink) Description is: Food, Language, Source, Country, Type, Material, Color, Drink, Style, Culture

First & Second: ['Country', 'Language', 'Source', 'Type']
First ^ Second: ['City', 'Color', 'Culture', 'Drink', 'Food', 'Land', 'Location', 'Material', 'Origin', 'Region', 'State', 'Style']

=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.

Spanish word is: equipo, English word is: team
Spanish Description is: Sponsor, Name, Type, Owner, Location, Organization, Title, Country, Company, Sport
English Description is: Type, Name, Organization, Sponsor, Title, Team, Location, Sport, Country, Company
Overlap is: Company, Country, Location, Name, Organization, Sponsor, Sport, Title, Type (9)
Difference is: Owner, Team (2)

Spanish word is: Boca, English word is: Boca
Spanish Description is: Sponsor, Company, Owner, City, Location, Organization, Country, Name, Team, Land
English Description is: Sponsor, Company, Owner, Organization, City, Location, Name, Country, Brand, Team
Overlap is: City, Company, Country, Location, Name, Organization, Owner, Sponsor, Team (9)
Difference is: Brand, Land (2)

Spanish word is: copa, English word is: cup
Spanish Description is: Title, Type, Series, Category, Sports, Game, Sport, Sponsor, Organization, Year
English Description is: Title, Series, Type, Sport, Sports, Organization, Category, Status, Sponsor, Game
Overlap is: Category, Game, Organization, Series, Sponsor, Sport, Sports, Title, Type (9)
Difference is: Status, Year (2)

Spanish word is: tiempo, English word is: long
Spanish Description is: Time, Year, Age, Date, Description, Country, History, Location, Title, Duration
English Description is: Age, Title, Description, Year, Type, Subject, Status, History, Date, Country
Overlap is: Age, Country, Date, Description, History, Title, Year (7)
Difference is: Duration, Location, Status, Subject, Time, Type (6)

Spanish word is: futbol, English word is: Football
Spanish Description is: Sports, Sport, Type, Style, Football, Game, Language, Religion, Category, Culture
English Description is: Sports, Football, Sport, Style, Language, Type, Religion, Game, Culture, Category
Overlap is: Category, Culture, Football, Game, Language, Religion, Sport, Sports, Style, Type (10)
Difference is:  (0)
Code
model_path = train(
    model_name="xlm-roberta-base",
    batch_size=8,
    learning_rate=1e-4,
    # epochs=20,
    max_steps=20_000,
    evaluation_steps=1_000,
    article_file=ARTICLE_FILE,
)
PyTorch: setting up devices
Starting xlm-roberta-base-ms20000-bs8-lr0.0001
Could not locate the tokenizer configuration file, will try to use the model config instead.
/home/matthew/.cache/pypoetry/virtualenvs/blog-HrtMnrOS-py3.10/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
[20000/20000 4:23:07, Epoch 0/1]
| Step  | Training Loss | Validation Loss |
|-------|---------------|-----------------|
| 1000  | 0.556900      | 0.404128        |
| 2000  | 0.362400      | 0.352797        |
| 3000  | 0.310700      | 0.331835        |
| 4000  | 0.279500      | 0.308182        |
| 5000  | 0.254400      | 0.293333        |
| 6000  | 0.240900      | 0.310928        |
| 7000  | 0.221900      | 0.281576        |
| 8000  | 0.207000      | 0.273815        |
| 9000  | 0.198500      | 0.256036        |
| 10000 | 0.186200      | 0.254111        |
| 11000 | 0.171700      | 0.257458        |
| 12000 | 0.161200      | 0.242577        |
| 13000 | 0.156700      | 0.235280        |
| 14000 | 0.147600      | 0.245646        |
| 15000 | 0.144100      | 0.233584        |
| 16000 | 0.138200      | 0.227569        |
| 17000 | 0.131800      | 0.225234        |
| 18000 | 0.126100      | 0.215953        |
| 19000 | 0.121000      | 0.217426        |
| 20000 | 0.120600      | 0.215612        |

Code
evaluate("xlm-roberta-base", model_path)
Could not locate the tokenizer configuration file, will try to use the model config instead.
=== BASS EVALUATION ===
First Phrase is: We spotted a large bass in the ocean. Target is: bass
Description is: Material, Type, Description, Location, Area, Size, Application, Source, Name, Color

Second Phrase is: The bass player did not receive the acknowledgment she deserves. Target is: bass
Description is: Instrument, Type, Material, Game, Music, Style, Player, System, Description, Language

Third Phrase is: The black sea bass, is a member of the wreckfish family. Target is: bass
Description is: Type, Material, Game, Instrument, Fish, Color, Country, Language, Description, Sports

First & Second: ['Description', 'Material', 'Type']
First & Third: ['Color', 'Description', 'Material', 'Type']
Second & Third: ['Description', 'Game', 'Instrument', 'Language', 'Material', 'Type']

=== FRIDAY EVALUATION ===
Spanish Phrase is: Friday es mi canción favorita.
Spanish Description is: Tag, Date, Day, Name, Music, Song, Description, Color, Title, Album
English Phrase is: Friday is my favourite song.
English Description is: Date, Day, Time, Tag, Holiday, Birthday, Theme, Year, Name, Description

Description Overlap is: Date, Day, Description, Name, Tag
Description Difference is: Album, Birthday, Color, Holiday, Music, Song, Theme, Time, Title, Year

=== MALIBU EVALUATION ===
Phrase is: I like to drive my Malibu while drinking Malibu.
First Malibu (car) Description is: Location, Country, City, Land, Place, Area, Region, State, Village, Local
Second Malibu (drink) Description is: Location, Country, City, Land, Region, Place, Area, State, Local, Language

First & Second: ['Area', 'City', 'Country', 'Land', 'Local', 'Location', 'Place', 'Region', 'State']
First ^ Second: ['Language', 'Village']

=== FOOTBALL EVALUATION ===
Spanish Phrase is: Retiremos el equipo de la cancha, Boca no merece jugar esta copa que hace tiempo viene siendo desprestigiada.
Ya no se juega al futbol.
English Phrase is: Let's remove the team from the field, Boca does not deserve to play this cup that has long been discredited. Football is no longer played.

Spanish word is: equipo, English word is: team
Spanish Description is: Sponsor, Organization, Owner, Name, Team, Company, Location, Type, Sports, Title
English Description is: Organization, Team, Sponsor, Name, Owner, Company, Type, Location, Sport, Title
Overlap is: Company, Location, Name, Organization, Owner, Sponsor, Team, Title, Type (9)
Difference is: Sport, Sports (2)

Spanish word is: Boca, English word is: Boca
Spanish Description is: Sponsor, Company, City, Club, Organization, Team, Owner, Land, Location, Country
English Description is: Sponsor, Company, City, Club, Organization, Team, Owner, Land, Location, Country
Overlap is: City, Club, Company, Country, Land, Location, Organization, Owner, Sponsor, Team (10)
Difference is:  (0)

Spanish word is: copa, English word is: cup
Spanish Description is: Title, Type, Sport, Series, Sports, Game, Category, Football, Location, Sponsor
English Description is: Title, Type, Series, Sport, Category, Sports, Football, Game, Cup, Organization
Overlap is: Category, Football, Game, Series, Sport, Sports, Title, Type (8)
Difference is: Cup, Location, Organization, Sponsor (4)

Spanish word is: tiempo, English word is: long
Spanish Description is: Year, Age, Time, Type, Date, Country, Duration, Location, Sport, Description
English Description is: Year, Description, Title, Age, Type, Date, Country, Religion, Name, Location
Overlap is: Age, Country, Date, Description, Location, Type, Year (7)
Difference is: Duration, Name, Religion, Sport, Time, Title (6)

Spanish word is: futbol, English word is: Football
Spanish Description is: Sports, Sport, Football, Type, Style, Game, Religion, Language, Theme, Title
English Description is: Sports, Sport, Football, Type, Style, Religion, Game, Language, Series, Title
Overlap is: Football, Game, Language, Religion, Sport, Sports, Style, Title, Type (9)
Difference is: Series, Theme (2)

I really need a more consistent way to measure the performance of this model. These qualitative checks look good, but they are hard to compare between runs.
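
One repeatable option would be to average, over a fixed list of aligned sentence pairs, how much the top 10 description tokens for the same noun overlap across languages. This is only a sketch: the ALIGNED_PAIRS list and the overlap_at_10 helper below are illustrative assumptions built on the get_predictions function above, not something the pipeline currently produces.

Code
from typing import List, Tuple

# a small hand-made list of (source text, source noun, target text, target noun)
# pairs, reusing phrases from the evaluations above (illustrative data only)
ALIGNED_PAIRS: List[Tuple[str, str, str, str]] = [
    ("Friday es mi canción favorita.", "Friday", "Friday is my favourite song.", "Friday"),
    ("Ya no se juega al futbol.", "futbol", "Football is no longer played.", "Football"),
]

def overlap_at_10(model, tokenizer) -> float:
    """Average fraction of the top 10 description tokens shared across languages."""
    scores = []
    for source_text, source_noun, target_text, target_noun in ALIGNED_PAIRS:
        source_words = get_predictions(
            model=model, tokenizer=tokenizer, text=source_text, noun=source_noun
        )
        target_words = get_predictions(
            model=model, tokenizer=tokenizer, text=target_text, noun=target_noun
        )
        scores.append(len(set(source_words) & set(target_words)) / 10)
    return sum(scores) / len(scores)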