= "facebook/bart-base"
MODEL_NAME = 128
MAXIMUM_TOKEN_LENGTH = 64
BATCH_SIZE = 5 EPOCHS
July 25, 2021
After creating and evaluating the previous model I came up with a few improvements that could be made. One is to stop asking the model to do two things at once and instead concentrate solely on token sentiment. All of the datasets already mark up the entities, so it should be fine to evaluate like this.
Given that the model will move towards token sentiment classification, it should not be penalized for predicting sentiment for tokens other than the end of the entity. The entire token sequence for the entity can be marked up with the desired sentiment and the model can train against that. Two approaches spring to mind for this: a straight per-token cross entropy loss, and averaging the sentiment across the tokens of each entity.
Since the direct per-token approach will be easier to implement I'll start with that. The metric to beat is the 0.83 to 0.85 that the current SOTA achieves. My previous approach got 0.82 so it may be possible.
The labels need to indicate whether a token has a known sentiment, so the sentinel value -1 will be used to mark tokens that do not. Those tokens can then be skipped when calculating the loss.
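As a minimal sketch of how this works (the tensors here are made up purely for illustration), only the positions labelled 0, 1 or 2 contribute to the cross entropy loss:

import torch

# six tokens, three sentiment classes; -1 marks tokens without a known sentiment
logits = torch.randn(6, 3)
labels = torch.tensor([-1, 2, 2, -1, 0, -1])

mask = labels >= 0  # keep only the labelled tokens
loss = torch.nn.functional.cross_entropy(logits[mask], labels[mask])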
The code here encodes the text and then transforms the datasets. It is quite similar to the previous posts. If you want to review how the parquet files were produced see this blog post.
#collapse
from typing import *

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

sentiment_index = {
    "negative": 0,
    "neutral": 1,
    "positive": 2,
}

def encode(row: Dict[str, Any]) -> Dict[str, Any]:
    text = row["text"]
    entities = row["entities"]
    tokenized_text = tokenizer(
        text,
        return_offsets_mapping=True,
        max_length=MAXIMUM_TOKEN_LENGTH,
        truncation=True,
        padding="max_length",
    )
    offset_mapping = tokenized_text["offset_mapping"]

    labels = []
    try:
        entity_iter = iter(entities)
        entity = next(entity_iter)
        for span_start, span_end in offset_mapping:
            if span_start == span_end:  # zero width token, skip
                labels.append(-1)
                continue
            while span_start > entity["end"]:
                entity = next(entity_iter)
            # if the entity overlaps the current span at all then it has that sentiment
            if entity["start"] <= span_start < entity["end"]:
                labels.append(sentiment_index[entity["sentiment"]])
            elif entity["start"] < span_end <= entity["end"]:
                labels.append(sentiment_index[entity["sentiment"]])
            else:
                labels.append(-1)
    except StopIteration:
        labels += [-1] * (len(offset_mapping) - len(labels))

    return {
        "input_ids": tokenized_text["input_ids"],
        "attention_mask": tokenized_text["attention_mask"],
        "label": labels,
    }
#hide_output
import pandas as pd
from datasets import Dataset
train_df = pd.read_parquet("/data/blog/2021-07-18-aspect-sentiment-dataset/train.gz.parquet")
validation_df = pd.read_parquet("/data/blog/2021-07-18-aspect-sentiment-dataset/validation.gz.parquet")
test_df = pd.read_parquet("/data/blog/2021-07-18-aspect-sentiment-dataset/test.gz.parquet")
train_ds = Dataset.from_pandas(train_df)
train_ds = train_ds.map(encode)
validation_ds = Dataset.from_pandas(validation_df)
validation_ds = validation_ds.map(encode)
test_ds = Dataset.from_pandas(test_df)
test_ds = test_ds.map(encode)
The model copies the original BartForSequenceClassification quite heavily, as that provides good support for the different functionality that huggingface models offer. Since this leads to quite verbose code, it is collapsed. The part of interest is the loss calculation:
logits = self.score(hidden_states)

loss = None
if labels is not None:
    sentiment_labels = labels.flatten()
    sentiment_predictions = logits.reshape(-1, 3)

    sentiment_mask = sentiment_labels >= 0
    sentiment_labels = sentiment_labels[sentiment_mask]
    sentiment_predictions = sentiment_predictions[sentiment_mask]

    loss = torch.nn.functional.cross_entropy(
        sentiment_predictions,
        sentiment_labels,
    )
You can see that the labels and predictions are flattened out so that the mask can select the rows of interest. After that it is a normal cross entropy loss calculation.
#collapse
from typing import *

from transformers import BartModel, AutoConfig, BartPretrainedModel
from transformers.modeling_outputs import Seq2SeqSequenceClassifierOutput
import torch

class TokenSentimentSequenceClassifier(BartPretrainedModel):
    def __init__(self, config: AutoConfig) -> None:
        config.num_labels = 3  # negative, neutral, positive
        super().__init__(config)
        self.model = BartModel(config)
        # bart model for sequence classification actually has a more complex classification head
        self.score = torch.nn.Linear(
            in_features=config.d_model,
            out_features=config.num_labels,
            bias=False,
        )

    def forward(
        self,
        *,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        # heavily copied from the original bart sequence classification code
        # https://huggingface.co/transformers/_modules/transformers/models/bart/modeling_bart.html#BartForSequenceClassification
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            # cross_attn_head_mask=cross_attn_head_mask, # need to update my transformers?
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]  # last hidden state

        logits = self.score(hidden_states)

        loss = None
        if labels is not None:
            sentiment_labels = labels.flatten()
            sentiment_predictions = logits.reshape(-1, 3)

            sentiment_mask = sentiment_labels >= 0
            sentiment_labels = sentiment_labels[sentiment_mask]
            sentiment_predictions = sentiment_predictions[sentiment_mask]

            loss = torch.nn.functional.cross_entropy(
                sentiment_predictions,
                sentiment_labels,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )
#collapse
from typing import *

from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support
)
from transformers import EvalPrediction

def compute_metrics(pred: EvalPrediction) -> Dict[str, float]:
    labels = pred.label_ids.flatten()
    predictions = pred.predictions[0].reshape(-1, 3)

    sentiment_mask = labels >= 0
    sentiment_labels = labels[sentiment_mask]
    sentiment_predictions = predictions[sentiment_mask].argmax(axis=1)

    accuracy = accuracy_score(sentiment_labels, sentiment_predictions)
    (
        (negative_precision, neutral_precision, positive_precision),
        (negative_recall, neutral_recall, positive_recall),
        (negative_fscore, neutral_fscore, positive_fscore),
        _,
    ) = precision_recall_fscore_support(sentiment_labels, sentiment_predictions)

    return {
        "accuracy": accuracy,
        "negative_precision": negative_precision,
        "negative_recall": negative_recall,
        "negative_f1_score": negative_fscore,
        "neutral_precision": neutral_precision,
        "neutral_recall": neutral_recall,
        "neutral_f1_score": neutral_fscore,
        "positive_precision": positive_precision,
        "positive_recall": positive_recall,
        "positive_f1_score": positive_fscore,
    }
Now we can try training this. My computer is actually busy doing other things so I need to wait for a while before running this. Such a shame.
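The model is loaded from the pretrained BART checkpoint; something like the following produces the warnings below:

model = TokenSentimentSequenceClassifier.from_pretrained(MODEL_NAME)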
Some weights of the model checkpoint at facebook/bart-base were not used when initializing TokenSentimentSequenceClassifier: ['final_logits_bias']
- This IS expected if you are initializing TokenSentimentSequenceClassifier from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TokenSentimentSequenceClassifier from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TokenSentimentSequenceClassifier were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
from pathlib import Path
from transformers import Trainer, TrainingArguments

MODEL_RUN_FOLDER = Path("/data/blog/2021-07-25-sentiment-only/runs")
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)

training_args = TrainingArguments(
    report_to=[],
    output_dir=MODEL_RUN_FOLDER / "output",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    warmup_ratio=0.06,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    logging_dir=MODEL_RUN_FOLDER / "output",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    no_cuda=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=validation_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
Epoch | Training Loss | Validation Loss | Accuracy | Negative Precision | Negative Recall | Negative F1 Score | Neutral Precision | Neutral Recall | Neutral F1 Score | Positive Precision | Positive Recall | Positive F1 Score | Runtime (s) | Samples Per Second |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | No log | 0.616170 | 0.726969 | 0.781818 | 0.708235 | 0.743210 | 0.820375 | 0.648305 | 0.724260 | 0.632780 | 0.840220 | 0.721893 | 33.245500 | 15.040000 |
2 | 0.702600 | 0.534138 | 0.787589 | 0.819549 | 0.769412 | 0.793689 | 0.824541 | 0.761653 | 0.791850 | 0.733010 | 0.831956 | 0.779355 | 33.015500 | 15.144000 |
3 | 0.410000 | 0.588257 | 0.778998 | 0.734694 | 0.847059 | 0.786885 | 0.866580 | 0.708686 | 0.779720 | 0.723890 | 0.830579 | 0.773573 | 33.002100 | 15.151000 |
4 | 0.410000 | 0.614729 | 0.788544 | 0.804196 | 0.811765 | 0.807963 | 0.839243 | 0.752119 | 0.793296 | 0.728049 | 0.822314 | 0.772316 | 33.516800 | 14.918000 |
5 | 0.273100 | 0.631105 | 0.790453 | 0.780761 | 0.821176 | 0.800459 | 0.843675 | 0.748941 | 0.793490 | 0.740741 | 0.826446 | 0.781250 | 33.067000 | 15.121000 |
TrainOutput(global_step=340, training_loss=0.43597483354456285, metrics={'train_runtime': 4978.6882, 'train_samples_per_second': 0.068, 'total_flos': 2300541802905600.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -8753152, 'train_mem_gpu_alloc_delta': 0, 'train_mem_cpu_peaked_delta': 18440192, 'train_mem_gpu_peaked_delta': 0})
So this isn’t better than the combined model. I wonder if that is the luck of the draw or if classifying the entities actually materially improves the output. Still, I can evaluate the sentiment it comes up with.
Now that the model explicitly produces a sentiment for every token, I can review the sentiment word by word. Previously I was viewing this as a “graph” of tokens, however I am reminded that it is possible to render HTML in Jupyter, so now I can just color the text directly.
#collapse
from IPython.display import HTML

sentiment_names = ["negative", "neutral", "positive"]

def show_word_sentiment(text: str) -> HTML:
    sentiment = word_sentiment(text)
    word_and_color = [
        (
            entry["token"],
            f"rgb({int(entry['negative'] * 255)}, {int(entry['positive'] * 255)}, {int(entry['neutral'] * 255)})"
        )
        for entry in sentiment
    ]
    html_words = " ".join(
        f"""<span style="color: {color}">{word}</span>"""
        for word, color in word_and_color
    )
    return HTML(f"""<div style="font-size: large">{html_words}</div>""")

def word_sentiment(text: str) -> List[Dict[str, Any]]:
    tokenized_text = tokenizer(
        text,
        return_tensors="pt",
        add_special_tokens=False,
    )
    with torch.no_grad():
        input_ids = tokenized_text["input_ids"].to(model.device)
        output = model(input_ids=input_ids)[0]
    token_sentiment = (
        output.reshape(-1, 3)
            .softmax(dim=-1)
    )
    return [
        {
            "token": tokenizer.decode(token),
            "negative": round(sentiment[0], 3),
            "neutral": round(sentiment[1], 3),
            "positive": round(sentiment[2], 3),
        }
        for token, sentiment in zip(input_ids[0].tolist(), token_sentiment.tolist())
    ]
The main difference here seems to be that the 80 epoch model from the last post is more extreme. The broad thrust of the sentiment allocation appears to be the same. This is not very surprising.