Different Clustering Techniques

Clustering the trained prompts to try to extract the different classes
Published

May 7, 2021

I’ve been trying to visualize the separation of the different classes but it has been going very badly. The classes look completely intermingled, even for the prompt that classifies them with 92% accuracy. I feel like the problem lies with the techniques I have been using rather than with the data not actually being separable.

The main approaches that I want to take are PCA, t-SNE, UMAP, and KMeans. I’ll see if there is anything else.


Dataset

To start with I need the dataset. This is really made of two parts - the prompt and then the converted validation rows. Since I want to use this many times and do quite a bit of conversion over it, I am going to create a full dataframe from it.

So to do this I need quite a bit of code from the previous posts. First, the PyTorch dataloader

Code
#collapse

from typing import Dict, Iterator, Optional, Tuple, Union

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

Past = Tuple[Tuple[torch.Tensor, ...], ...]
TextBatch = Dict[str, torch.Tensor]
PastBatch = Dict[str, Union[torch.Tensor, Past]]


class TextDataloader:
    """Provides a dataloader over a text dataframe"""

    def __init__(
        self,
        df: pd.DataFrame,
        *,
        tokenizer: AutoTokenizer,
        batch_size: int,
        max_length: int,
        device: torch.device = torch.device("cuda"),
    ) -> None:
        self.tokenizer = tokenizer
        self.df = df
        self.batch_size = batch_size
        self.max_length = max_length
        self.device = device

    def __iter__(self) -> Iterator[TextBatch]:
        """Returns an iterator that returns batches.
        The final batch can be a partial batch."""
        df = self.df
        batch_size = self.batch_size

        for i in range(len(self)):
            start = i * batch_size
            end = start + batch_size
            yield self.to_batch(df[start:end])

    def __len__(self) -> int:
        """Returns the total number of batches that can be returned."""
        full_batches = len(self.df) // self.batch_size
        if len(self.df) % self.batch_size:
            return full_batches + 1
        return full_batches

    def to_batch(self, rows: pd.DataFrame) -> TextBatch:
        """Converts the rows into a batch"""
        tokens = self.tokenizer(
            rows.text.tolist(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        ).to(self.device)
        return {
            "input_ids": tokens["input_ids"],
            "attention_mask": tokens["attention_mask"],
        }


class PastDataloader(TextDataloader):  # pylint: disable=too-few-public-methods
    """Provides a dataloader which converts the text into past tensors"""

    def __init__(
        self,
        df: pd.DataFrame,
        *,
        model: AutoModelForCausalLM,
        tokenizer: AutoTokenizer,
        batch_size: int,
        max_length: int,
        device: torch.device = torch.device("cuda"),
    ) -> None:
        super().__init__(
            df=df,
            tokenizer=tokenizer,
            batch_size=batch_size,
            max_length=max_length,
            device=device,
        )
        model.to(device)
        self.model = model

    @torch.no_grad()
    def to_batch(self, rows: pd.DataFrame) -> PastBatch:
        batch = super().to_batch(rows)
        past_key_values = self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch.get("attention_mask", None),
        ).past_key_values
        return {
            "past_key_values": past_key_values,
            "attention_mask": batch["attention_mask"],
        }

Then the IMDB dataset loader

Code
#collapse

from pathlib import Path
import pandas as pd

def load(path: Path) -> pd.DataFrame:
    positive_files = sorted(path.glob("pos/*.txt"))
    negative_files = sorted(path.glob("neg/*.txt"))
    
    return pd.DataFrame(
        [
            {"label": "good", "text": file.read_text()}
            for file in positive_files
        ] +
        [
            {"label": "bad", "text": file.read_text()}
            for file in negative_files
        ]
    )

Then we need the IMDB data and the model that will convert it

Code
#collapse

from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.to("cuda")
model.eval()

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token # needed to enable padding

validation_df = load(Path("/data/sentiment/imdb-movie-reviews/test"))

BATCH_SIZE = 32
MAX_LENGTH = 1_000
GOOD_TOKEN = 11274
BAD_TOKEN = 14774

validation_dataloader = PastDataloader(
    model=model,
    tokenizer=tokenizer,
    df=validation_df,
    batch_size=BATCH_SIZE,
    max_length=MAX_LENGTH,
)

Now we need the code that passes the text and the prompt through the model

Code
#collapse

from tqdm.auto import tqdm

@torch.no_grad()
def generate_outputs(
    dl: PastDataloader,
    model: AutoModelForCausalLM,
    prompt: torch.Tensor,
    use_lm_head: bool = True,
) -> Iterator[Tuple[torch.Tensor, torch.Tensor]]:
    prompt_attention = torch.ones((1, prompt.shape[1]), device=dl.device)

    for batch in tqdm(dl):
        output = _get_output_with_past(
            model=model,
            prompt=prompt,
            attention_mask=prompt_attention,
            past=batch["past_key_values"],
            past_attention_mask=batch["attention_mask"],
        )
        if use_lm_head:
            output = model.lm_head(output)
        output = output[:, -1]
        for row in output.cpu().numpy():
            yield row[None, :]

def _get_output_with_past(
    *,
    model: AutoModelForCausalLM,
    prompt: torch.nn.Parameter,
    attention_mask: torch.Tensor,
    past: Past,
    past_attention_mask: torch.Tensor,
) -> torch.Tensor:
    """Get the predictions for the next token after the prompt"""
    # concatenate the past attention with the prompt attention
    batch_size = past_attention_mask.shape[0]
    attention_mask = attention_mask.repeat_interleave(batch_size, dim=0)
    attention_mask = torch.cat([past_attention_mask, attention_mask], dim=-1)

    # expand the prompt to match the batch size
    input_ids = prompt.repeat_interleave(batch_size, dim=0)

    return model.transformer(
        inputs_embeds=input_ids,
        attention_mask=attention_mask,
        past_key_values=past,
    ).last_hidden_state

Now we can generate the two datasets - one for the raw output of the underlying model, and one that translates that through the language model. These datasets are not shuffled so they are aligned with the original dataframe labels.

Code
from pathlib import Path
import numpy as np

DATA_FOLDER = Path("/data/blog/2021-05-07-different-clustering-approaches")

original_prompt = torch.load(
    "/data/blog/2021-04-28-imbd-prompt-training/trained-prompt-1-20-768.pt"
)

# The underlying model outputs
outputs = np.concatenate(
    list(generate_outputs(
        dl=validation_dataloader,
        model=model,
        prompt=original_prompt,
        use_lm_head=False,
    ))
)

np.save(
    DATA_FOLDER / "original-prompt-model-good.npy",
    outputs[validation_df.label == "good"]
)
np.save(
    DATA_FOLDER / "original-prompt-model-bad.npy",
    outputs[validation_df.label == "bad"]
)
Code
from pathlib import Path
import numpy as np

DATA_FOLDER = Path("/data/blog/2021-05-07-different-clustering-approaches")

original_prompt = torch.load(
    "/data/blog/2021-04-28-imbd-prompt-training/trained-prompt-1-20-768.pt"
)

# The output associated with explicit tokens
outputs = np.concatenate(
    list(generate_outputs(
        dl=validation_dataloader,
        model=model,
        prompt=original_prompt,
        use_lm_head=True,
    ))
)

np.save(
    DATA_FOLDER / "original-prompt-lm-good.npy",
    outputs[validation_df.label == "good"]
)
np.save(
    DATA_FOLDER / "original-prompt-lm-bad.npy",
    outputs[validation_df.label == "bad"]
)
Code
validation_df.to_parquet(DATA_FOLDER / "validation.gz.parquet", compression="gzip")

Visualization

So these datasets are large and loading the model has used quite a bit of memory. For each of these I will be loading the data from disk to allow me to restart this notebook as required. I’m going to be using the sklearn versions of the different dimensionality reduction techniques. The aim will be to produce a visualization that separates the two classes.

Ideally this would be achieved without guidance, as the data really is separable - the prompt classifies this dataset with 92% accuracy. We can actually demonstrate this.

Data Separation

This data isn’t difficult to separate. We know that the model produces the “good” or “bad” output for the different sentiment classes. So the outputs are separated. To demonstrate this we can just check the number of correctly classified rows in each of the datasets.

Code
from pathlib import Path
import numpy as np

DATA_FOLDER = Path("/data/blog/2021-05-07-different-clustering-approaches")
GOOD_TOKEN = 11274
BAD_TOKEN = 14774

good_output = np.load(DATA_FOLDER / "original-prompt-lm-good.npy")
bad_output = np.load(DATA_FOLDER / "original-prompt-lm-bad.npy")
Code
(good_output[:, GOOD_TOKEN] > good_output[:, BAD_TOKEN]).sum() / len(good_output)
0.92168
Code
(bad_output[:, BAD_TOKEN] > bad_output[:, GOOD_TOKEN]).sum() / len(bad_output)
0.92352

So as we can see it is 92% accurate for both good and bad labels.

Principal Component Analysis (PCA)

So to start with we can try PCA again. This did not separate the classes well before. I think it’s good to include this in the different visualization techniques so we can compare them easily.

Code
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

lm_output = np.concatenate([
    np.load(DATA_FOLDER / "original-prompt-lm-good.npy"),
    np.load(DATA_FOLDER / "original-prompt-lm-bad.npy"),
])
validation_df = pd.read_parquet(DATA_FOLDER / "validation.gz.parquet")

pca_output = PCA(n_components=2).fit_transform(lm_output)
pca_df = pd.DataFrame(pca_output)
pca_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

pca_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Code
model_output = np.concatenate([
    np.load(DATA_FOLDER / "original-prompt-model-good.npy"),
    np.load(DATA_FOLDER / "original-prompt-model-bad.npy"),
])

pca_output = PCA(n_components=2).fit_transform(model_output)
pca_df = pd.DataFrame(pca_output)
pca_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

pca_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

OH MY.

This appears to have worked? Maybe I made a mistake before - or was the PyTorch version of PCA separating the data differently?
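
One quick way to check would be to run a minimal PCA by hand in torch - center the data, take the SVD, and project onto the top right singular vectors - and compare it with the sklearn output above. The components should agree up to a sign flip. This is just a sketch for that comparison, not the code from the earlier post:

Code
import numpy as np
import torch

def torch_pca(data: np.ndarray, n_components: int = 2) -> np.ndarray:
    """Minimal PCA by hand: center the data and project it onto the
    top right singular vectors of the centered matrix."""
    x = torch.from_numpy(data).float()
    x = x - x.mean(dim=0, keepdim=True)
    # rows of v_h are the principal directions, ordered by singular value
    _, _, v_h = torch.linalg.svd(x, full_matrices=False)
    return (x @ v_h[:n_components].T).numpy()

# the columns should match the sklearn PCA output up to a sign flip
torch_pca_output = torch_pca(model_output)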

T-Distributed Stochastic Neighbor Embedding (t-SNE)

This technique does more to separate the different clusters according to their proximity in the original space. From Wikipedia:

It models each high-dimensional object by a two- or three-dimensional point in such a way that similar objects are modeled by nearby points and dissimilar objects are modeled by distant points with high probability.

So this should produce visually separate clusters. It will be interesting to see if these clusters correspond to the dataset classes.

Code
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd

lm_output = np.concatenate([
    np.load(DATA_FOLDER / "original-prompt-lm-good.npy"),
    np.load(DATA_FOLDER / "original-prompt-lm-bad.npy"),
])
validation_df = pd.read_parquet(DATA_FOLDER / "validation.gz.parquet")

tsne_output = TSNE(n_components=2).fit_transform(lm_output)
tsne_df = pd.DataFrame(tsne_output)
tsne_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

tsne_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Code
model_output = np.concatenate([
    np.load(DATA_FOLDER / "original-prompt-model-good.npy"),
    np.load(DATA_FOLDER / "original-prompt-model-bad.npy"),
])
validation_df = pd.read_parquet(DATA_FOLDER / "validation.gz.parquet")

tsne_output = TSNE(n_components=2).fit_transform(model_output)
tsne_df = pd.DataFrame(tsne_output)
tsne_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

tsne_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

These are very pretty and they also separate the data really well. I especially like the second one, as it has naturally separated the outputs well. There is a small amount of mixing but it’s really very limited.

KMeans

This is both a way to visualize the classes and classify them. If I can use this to reconstruct the classes then this would be a way to classify the output of the model.

Code
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

lm_output = np.concatenate([
    np.load(DATA_FOLDER / "original-prompt-lm-good.npy"),
    np.load(DATA_FOLDER / "original-prompt-lm-bad.npy"),
])
validation_df = pd.read_parquet(DATA_FOLDER / "validation.gz.parquet")

kmeans_output = KMeans(n_clusters=2).fit_transform(lm_output)
kmeans_df = pd.DataFrame(kmeans_output)
kmeans_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

kmeans_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Code
model_output = np.concatenate([
    np.load(DATA_FOLDER / "original-prompt-model-good.npy"),
    np.load(DATA_FOLDER / "original-prompt-model-bad.npy"),
])
validation_df = pd.read_parquet(DATA_FOLDER / "validation.gz.parquet")

kmeans_output = KMeans(n_clusters=2).fit_transform(model_output)
kmeans_df = pd.DataFrame(kmeans_output)
kmeans_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

kmeans_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

That looks really odd. I wonder why it is shaped like that. Since it doesn’t appear to cleanly separate the classes I am not confident about the predictions that this will make.

Anyway, it should be possible to predict the clusters over this. I don’t know which cluster index will be assigned to good or bad, so I’ll just take the majority label for each cluster.

Code
kmeans_predictions = KMeans(n_clusters=2).fit_predict(lm_output)
kmeans_predictions_df = pd.DataFrame(kmeans_predictions, columns=["cluster"])
kmeans_predictions_df["label"] = validation_df.label
kmeans_predictions_df = kmeans_predictions_df.value_counts().unstack()

# normalize by cluster
kmeans_predictions_df.divide(kmeans_predictions_df.sum(axis=1), axis=0)
label         bad      good
cluster
0        0.670294  0.329706
1        0.339523  0.660477
Code
kmeans_predictions = KMeans(n_clusters=2).fit_predict(model_output)
kmeans_predictions_df = pd.DataFrame(kmeans_predictions, columns=["cluster"])
kmeans_predictions_df["label"] = validation_df.label
kmeans_predictions_df = kmeans_predictions_df.value_counts().unstack()

# normalize by cluster
kmeans_predictions_df.divide(kmeans_predictions_df.sum(axis=1), axis=0)
label         bad      good
cluster
0        0.682953  0.317047
1        0.332951  0.667049

These numbers are very similar, and indicate a roughly \(\frac{2}{3}\) correct classification. This is very poor compared to the simple split between the good and bad tokens. So KMeans is not demonstrating a strong ability to separate the classes. The default cluster count for sklearn's KMeans is 8, so let's see if using more clusters helps.

Code
kmeans_predictions = KMeans(n_clusters=8).fit_predict(model_output)
kmeans_predictions_df = pd.DataFrame(kmeans_predictions, columns=["cluster"])
kmeans_predictions_df["label"] = validation_df.label
kmeans_predictions_df = kmeans_predictions_df.value_counts().unstack()

# normalize by cluster
kmeans_predictions_df.divide(kmeans_predictions_df.sum(axis=1), axis=0)
label         bad      good
cluster
0        0.893684  0.106316
1        0.927250  0.072750
2        0.797464  0.202536
3        0.124364  0.875636
4        0.818837  0.181163
5        0.264790  0.735210
6        0.138849  0.861151
7        0.237556  0.762444

While it’s slightly better with more clusters, it’s still not 92% accurate. Using KMeans loses accuracy.
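
To put a single number on that, the tables above can be collapsed into an overall accuracy by assigning each cluster its majority label and counting how many rows end up with the right one. A small sketch (the helper is mine, not from the earlier posts):

Code
import numpy as np
import pandas as pd

def cluster_accuracy(clusters: np.ndarray, labels: pd.Series) -> float:
    """Assign each cluster its majority label and measure how many rows
    that assignment classifies correctly."""
    pairs = pd.DataFrame({"cluster": clusters, "label": labels.to_numpy()})
    majority = pairs.groupby("cluster").label.agg(lambda s: s.mode()[0])
    return (pairs.cluster.map(majority) == pairs.label).mean()

# accuracy of the 8 cluster assignment over the raw model output
cluster_accuracy(kmeans_predictions, validation_df.label)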

This isn’t a huge surprise given the very strange visualization. It suggests to me that KMeans is the wrong technique for separating this data.

Normalized KMeans

One thing about KMeans is that it relies on a measure of distance. If the different dimensions vary heavily in magnitude then the largest ones can dominate that distance and lead to bad clustering. Maybe some kind of normalization of the data could help?

Code
from sklearn.preprocessing import StandardScaler

scaled_lm_output = StandardScaler().fit_transform(lm_output)
scaled_model_output = StandardScaler().fit_transform(model_output)
Code
kmeans_output = KMeans(n_clusters=2).fit_transform(scaled_lm_output)
kmeans_df = pd.DataFrame(kmeans_output)
kmeans_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

kmeans_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Code
kmeans_output = KMeans(n_clusters=2).fit_transform(scaled_model_output)
kmeans_df = pd.DataFrame(kmeans_output)
kmeans_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

kmeans_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

The KMeans results after scaling the model output look promising. What concerns me is that the dividing line does not appear to lie along the axis; to my eye it looks slightly offset. I feel that this could account for quite a lot of the misclassifications.

Anyway, let's see how well it predicts the scaled model output.

Code
kmeans_predictions = KMeans(n_clusters=2).fit_predict(scaled_model_output)
kmeans_predictions_df = pd.DataFrame(kmeans_predictions, columns=["cluster"])
kmeans_predictions_df["label"] = validation_df.label
kmeans_predictions_df = kmeans_predictions_df.value_counts().unstack()

# normalize by cluster
kmeans_predictions_df.divide(kmeans_predictions_df.sum(axis=1), axis=0)
label         bad      good
cluster
0        0.929535  0.070465
1        0.093083  0.906917
Code
kmeans_predictions_df.divide(kmeans_predictions_df.sum(axis=1), axis=0).max()
label
bad     0.929535
good    0.906917
dtype: float64

This incurs about a 2% accuracy loss for good classifications. That’s better than I was expecting. So normalizing the data improves the accuracy of the cluster classification, but it’s not as good as the direct token comparison.

Uniform Manifold Approximation and Projection for Dimension Reduction (UMAP)

This is a dimensionality reduction technique that is not part of sklearn (documentation). Someone at work suggested this as a way to do dimensionality reduction over this dataset (thank you!). Apparently it has a way to guide the separation based on the classes of the different entries.
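
I believe that guidance refers to the supervised form of UMAP, where the labels are passed as y to fit_transform. I'm only using the unsupervised form in this post, but a rough sketch of how the guided call would look is:

Code
from umap import UMAP
import numpy as np
import pandas as pd

lm_output = np.concatenate([
    np.load(DATA_FOLDER / "original-prompt-lm-good.npy"),
    np.load(DATA_FOLDER / "original-prompt-lm-bad.npy"),
])
validation_df = pd.read_parquet(DATA_FOLDER / "validation.gz.parquet")

# passing the labels as y lets UMAP use the classes to guide the embedding
label_codes = validation_df.label.map({"good": 1, "bad": 0}).to_numpy()
supervised_output = UMAP(n_components=2).fit_transform(lm_output, y=label_codes)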

Code
from umap import UMAP
import numpy as np
import pandas as pd

lm_output = np.concatenate([
    np.load(DATA_FOLDER / "original-prompt-lm-good.npy"),
    np.load(DATA_FOLDER / "original-prompt-lm-bad.npy"),
])
validation_df = pd.read_parquet(DATA_FOLDER / "validation.gz.parquet")

umap_output = UMAP(n_components=2).fit_transform(lm_output)
umap_df = pd.DataFrame(umap_output)
umap_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

umap_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Code
model_output = np.concatenate([
    np.load(DATA_FOLDER / "original-prompt-model-good.npy"),
    np.load(DATA_FOLDER / "original-prompt-model-bad.npy"),
])
validation_df = pd.read_parquet(DATA_FOLDER / "validation.gz.parquet")

umap_output = UMAP(n_components=2).fit_transform(model_output)
umap_df = pd.DataFrame(umap_output)
umap_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

umap_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

These are very pretty. The documentation suggests running StandardScaler over the data before running UMAP.

Code
umap_output = UMAP(n_components=2).fit_transform(scaled_lm_output)
umap_df = pd.DataFrame(umap_output)
umap_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

umap_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Code
umap_output = UMAP(n_components=2).fit_transform(scaled_model_output)
umap_df = pd.DataFrame(umap_output)
umap_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

umap_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

After scaling, the model output separates quite nicely with UMAP. It appears that the data separates at around 5 on axis 0. I'm quite entertained by the streamer-style projections, as a similar shape was also produced by t-SNE. I don't think such streamers are easy to separate.
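
To put a rough number on that impression, axis 0 of the scaled model output embedding can be treated as a one-dimensional classifier with a cut at 5. The threshold is just read off the plot, so this is only a sanity check:

Code
# treat axis 0 as a classifier with a threshold of 5 (read off the plot)
predicted_high = umap_df[0] > 5
is_good = validation_df.label == "good"

# I don't know which side of the cut corresponds to which class,
# so take whichever orientation scores better
max(
    (predicted_high == is_good).mean(),
    (predicted_high != is_good).mean(),
)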


Retrained Prompt Visualization

So these visualization techniques have done really well with the token-trained prompt. The underlying reason that I wanted to visualize the output of the prompt was that I had trained two prompts in more of a semantic way. I thought that the visualization and clustering of the outputs of the semantic training would show if it had worked without tying the prompt to a specific token.

So I am going to visualize the cosine similarity trained prompt as well as the distance trained prompt. I have trained these on both the raw model output and the token output, so we can even evaluate those against each other. Let’s see if they separate.

The first thing to do is to generate the datasets for the clustering.

Code
from pathlib import Path
import numpy as np

DATA_FOLDER = Path("/data/blog/2021-05-07-different-clustering-approaches")

cosine_model_prompt = torch.load(
    "/data/blog/2021-05-04-prompt-training-clustering/trained-prompt-1-20-768-cosine.pt"
)

# The underlying model outputs
outputs = np.concatenate(
    list(generate_outputs(
        dl=validation_dataloader,
        model=model,
        prompt=cosine_model_prompt,
        use_lm_head=False,
    ))
)

np.save(
    DATA_FOLDER / "cosine-model-prompt-good.npy",
    outputs[validation_df.label == "good"]
)
np.save(
    DATA_FOLDER / "cosine-model-prompt-bad.npy",
    outputs[validation_df.label == "bad"]
)
Code
from pathlib import Path
import numpy as np

DATA_FOLDER = Path("/data/blog/2021-05-07-different-clustering-approaches")

distance_model_prompt = torch.load(
    "/data/blog/2021-05-04-prompt-training-clustering/trained-prompt-1-20-768-distance.pt"
)

# The underlying model outputs
outputs = np.concatenate(
    list(generate_outputs(
        dl=validation_dataloader,
        model=model,
        prompt=distance_model_prompt,
        use_lm_head=False,
    ))
)

np.save(
    DATA_FOLDER / "distance-model-prompt-good.npy",
    outputs[validation_df.label == "good"]
)
np.save(
    DATA_FOLDER / "distance-model-prompt-bad.npy",
    outputs[validation_df.label == "bad"]
)

Cosine Similarity - PCA

The first bit of training was with cosine similarity loss and the raw model output. This seemed to train quite poorly as the loss barely changed during training. Let’s see how that fares with PCA.

Code
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

model_output = np.concatenate([
    np.load(DATA_FOLDER / "cosine-model-prompt-good.npy"),
    np.load(DATA_FOLDER / "cosine-model-prompt-bad.npy"),
])
validation_df = pd.read_parquet(DATA_FOLDER / "validation.gz.parquet")

pca_output = PCA(n_components=2).fit_transform(model_output)
pca_df = pd.DataFrame(pca_output)
pca_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

pca_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Distance Similarity

Code
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

model_output = np.concatenate([
    np.load(DATA_FOLDER / "distance-model-prompt-good.npy"),
    np.load(DATA_FOLDER / "distance-model-prompt-bad.npy"),
])
validation_df = pd.read_parquet(DATA_FOLDER / "validation.gz.parquet")

scaled_model_output = StandardScaler().fit_transform(model_output)

Distance Similarity - PCA

The second loss was the distance between the points. This time the loss changed quite dramatically. My concern with this training is the relative weight between the loss for different classes and the loss for the same class.
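
To make that concern concrete, the kind of loss I mean looks roughly like the sketch below - pull same-class outputs together, push different-class outputs apart up to a margin, with a factor balancing the two terms. This is illustrative only, not the exact loss used in the training post:

Code
import torch

def distance_loss(
    outputs: torch.Tensor,     # (batch_size, hidden_size) prompt outputs
    same_class: torch.Tensor,  # (batch_size, batch_size) bool, True where labels match
    margin: float = 1.0,
    push_weight: float = 1.0,  # relative weight of the different-class term
) -> torch.Tensor:
    """Pull same-class outputs together and push different-class outputs
    further apart than the margin."""
    distances = torch.cdist(outputs, outputs)
    pull = distances[same_class].mean()
    push = torch.relu(margin - distances[~same_class]).mean()
    return pull + push_weight * push

If that weight is off then one term can dominate, and the prompt can do well on the loss while barely separating the classes.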

Code
from sklearn.decomposition import PCA

pca_output = PCA(n_components=2).fit_transform(model_output)
pca_df = pd.DataFrame(pca_output)
pca_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

pca_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Code
pca_output = PCA(n_components=2).fit_transform(scaled_model_output)
pca_df = pd.DataFrame(pca_output)
pca_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

pca_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

These both look terrible. The clustering looks extremely similar to the original clustering that was observed in the training run. This leads me to believe that I was using PCA correctly, and that the results were bad.

I’m going to run the other clustering methods over the distance trained dataset for completeness.

Distance Similarity - t-SNE

Code
from sklearn.manifold import TSNE

tsne_output = TSNE(n_components=2).fit_transform(model_output)
tsne_df = pd.DataFrame(tsne_output)
tsne_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

tsne_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Code
tsne_output = TSNE(n_components=2).fit_transform(scaled_model_output)
tsne_df = pd.DataFrame(tsne_output)
tsne_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

tsne_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Distance Similarity - KMeans

Code
from sklearn.cluster import KMeans

kmeans_output = KMeans(n_clusters=2).fit_transform(model_output)
kmeans_df = pd.DataFrame(kmeans_output)
kmeans_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

kmeans_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Code
kmeans_output = KMeans(n_clusters=2).fit_transform(scaled_model_output)
kmeans_df = pd.DataFrame(kmeans_output)
kmeans_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

kmeans_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Distance Similarity - UMap

Code
from umap import UMAP

umap_output = UMAP(n_components=2).fit_transform(model_output)
umap_df = pd.DataFrame(umap_output)
umap_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

umap_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

Code
from umap import UMAP

umap_output = UMAP(n_components=2).fit_transform(scaled_model_output)
umap_df = pd.DataFrame(umap_output)
umap_df["color"] = validation_df.label.map({"good": "blue", "bad": "red"})

umap_df.plot.scatter(x=0, y=1, c="color", s=0.1) ; None

I don’t think it’s a wild surprise that all of these visualizations are poor. I feel like the core problem is the loss function. That needs to be revisited - first by checking that it works at all, and then by retraining. It’s great to see that the visualization techniques work though.