import blog.transformers_logging
June 1, 2022
I want to try to update the model used by the conversation bot. The original blog post was written in 2019 and the transformers library has progressed significantly since then. How hard would it be to take the original code and get it working again?
Once again we will use the PERSONA-CHAT dataset (Zhang et al. 2018) and train GPT-2. It may be possible to produce a version of the model that loses all punctuation and case information, as that would be more consistent with its use as a chatbot.
To help the model we will mark the different sections of the input with special tokens, so that it can distinguish the persona it is playing from the utterances of the two participants.
from transformers import AutoModel, AutoTokenizer
SPECIAL_TOKENS = [
"<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"
]
ATTR_TO_SPECIAL_TOKEN = {
"bos_token": "<bos>",
"eos_token": "<eos>",
"pad_token": "<pad>",
"additional_special_tokens": ["<speaker1>", "<speaker2>"]
}
def add_special_tokens_(model: AutoModel, tokenizer: AutoTokenizer) -> None:
""" Add special tokens to the tokenizer and the model if they have not already been added. """
orig_num_tokens = tokenizer.vocab_size
num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there
if num_added_tokens > 0:
model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)
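As a quick check (not part of the original notebook), after calling add_special_tokens_ each marker should map to a single id just past the original GPT-2 vocabulary of 50,257 tokens. A minimal sketch, assuming the plain gpt2 checkpoint:
from transformers import AutoTokenizer, GPT2DoubleHeadsModel
model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
add_special_tokens_(model=model, tokenizer=tokenizer)
# each special token is a single id rather than a run of sub-word tokens
tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
# the embedding matrix has grown to hold the five new tokens (50,257 original ids plus the new ones)
model.get_input_embeddings().num_embeddings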
The encoding needs to produce the token_type_ids and labels that GPT-2 expects. The transformers documentation describes them as follows:

token_type_ids (torch.LongTensor of shape (batch_size, input_ids_length), optional):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]:
    - 0 corresponds to a sentence A token,
    - 1 corresponds to a sentence B token.

labels (torch.LongTensor of shape (batch_size, sequence_length), optional):
    Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set labels = input_ids. Indices are selected in [-100, 0, ..., config.vocab_size - 1]. All labels set to -100 are ignored (masked); the loss is only computed for labels in [0, ..., config.vocab_size - 1].
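The cell that loads the dataset is not shown; judging by the cache path in the output below it was presumably something along these lines (the bavard/personachat_truecased hub id is an assumption):
import datasets
ds = datasets.load_dataset("bavard/personachat_truecased")
ds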
No config specified, defaulting to: personachat_truecased/full
Reusing dataset personachat_truecased (/home/matthew/.cache/huggingface/datasets/personachat_truecased/full/1.0.0/73ee8f1a0d9e42255af5a8301877a2f3ac638e55b1cd9cbccca5ab7e23d2b638)
DatasetDict({
train: Dataset({
features: ['personality', 'candidates', 'history', 'conv_id', 'utterance_idx'],
num_rows: 131438
})
validation: Dataset({
features: ['personality', 'candidates', 'history', 'conv_id', 'utterance_idx'],
num_rows: 7801
})
})
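Each row pairs the persona of the chatbot with the conversation so far and a set of candidate replies. Inspecting a row (illustrative, not from the original post) shows the structure that the encoding below relies on:
row = ds["train"][0]
row["personality"]     # a few persona sentences for the chatbot
row["history"]         # the utterances so far, the other person speaking first
row["candidates"][-1]  # the last candidate is the true reply, the rest are distractors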
from typing import Any, Dict, List
from itertools import chain, cycle
def tokenize_row(row: Dict[str, Any]) -> Dict[str, Any]:
personality = tokenizer(
row["personality"],
return_attention_mask=False,
add_special_tokens=False,
).input_ids
history = tokenizer(
row["history"],
return_attention_mask=False,
add_special_tokens=False,
).input_ids
candidates = tokenizer(
row["candidates"],
return_attention_mask=False,
add_special_tokens=False,
).input_ids
return {
"personality_ids": personality,
"history_ids": history,
"candidates_ids": candidates,
}
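The mapping step itself is not shown in the post; the tokenized columns would be produced with something like this (a sketch, assuming the dataset from above and a tokenizer with the special tokens added):
tokenized_ds = ds.map(tokenize_row)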
def build_inputs(row: Dict[str, Any]) -> List[Dict[str, Any]]:
personality = list(chain(*row["personality_ids"]))
history = row["history_ids"]
candidates = row["candidates_ids"]
gold_reply_index = len(candidates) - 1  # in PERSONA-CHAT the true reply is always the last candidate
return [
_encode(personality=personality, history=history, reply=reply, is_gold=index==gold_reply_index)
for index, reply in enumerate(candidates)
]
def _encode(
personality: List[int],
history: List[List[int]],
reply: List[int],
is_gold: bool,
) -> Dict[str, Any]:
# speaker 1 is the person, 2 is the chatbot
bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
personality = [bos] + personality
reply = reply + [eos]
utterances = history + [reply]
# the person speaks first so we start with speaker1
sequence = [personality] + [
[speaker] + utterance
for speaker, utterance in zip(cycle([speaker1, speaker2]), utterances)
]
input_ids = list(chain(*sequence))
# the chatbot personality is first so we start with speaker2
# the speaker1 value is 0 and speaker2 is 1
token_type_ids = [
speaker
for speaker, row in zip(cycle([1, 0]), sequence)
for _ in row
]
mc_token_ids = len(input_ids) - 1
if is_gold:
lm_labels = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
else:
lm_labels = [-100] * len(input_ids)
return {
"input_ids": input_ids,
"token_type_ids": token_type_ids,
"mc_token_ids": mc_token_ids,
"labels": lm_labels,
}
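To make the layout concrete, here is a small sketch (not from the original post) of what a single encoded candidate looks like; decoding the ids shows how the persona, the utterances and the reply are stitched together with the special tokens:
# assumes the tokenizer already has the special tokens added
toy_row = tokenize_row({
    "personality": ["I work at a museum."],
    "history": ["hello, how are you?"],
    "candidates": ["I am great, thanks for asking."],  # a single candidate, so it is the gold reply
})
encoded = build_inputs(toy_row)[0]
tokenizer.decode(encoded["input_ids"])
# something like: <bos>I work at a museum.<speaker1>hello, how are you?<speaker2>I am great, thanks for asking.<eos>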
Some weights of GPT2DoubleHeadsModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['multiple_choice_head.summary.weight', 'multiple_choice_head.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Dataset({
features: ['input_ids', 'token_type_ids', 'mc_token_ids', 'labels'],
num_rows: 156020
})
import datasets
from transformers import GPT2DoubleHeadsModel, AutoTokenizer
encoded_ds = datasets.load_from_disk("/data/blog/2022-06-01-training-conversational-model/encoded-persona")
model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
add_special_tokens_(model=model, tokenizer=tokenizer)
from typing import Any, Dict, Tuple, Union
import torch
from transformers import GPT2DoubleHeadsModel, Trainer, TrainingArguments
from transformers.models.gpt2.modeling_gpt2 import GPT2DoubleHeadsModelOutput
class ConversationTrainingArguments(TrainingArguments):
def __init__(self, *args, lm_coefficient: float = 1.0, mc_coefficient: float = 1.0, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.lm_coefficient = lm_coefficient
self.mc_coefficient = mc_coefficient
class ConversationTrainer(Trainer):
def compute_loss(
self,
model: GPT2DoubleHeadsModel,
inputs: Dict[str, Union[torch.Tensor, Any]],
return_outputs: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
output: GPT2DoubleHeadsModelOutput = model(**inputs)
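# output.loss is the language modelling loss (computed when labels are in the batch);
# output.mc_loss is the reply classification loss (needs mc_labels, which the encoded dataset does not provide)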
if output.loss is not None and output.mc_loss is not None:
loss = output.loss * self.args.lm_coefficient + output.mc_loss * self.args.mc_coefficient
elif output.loss is not None:
loss = output.loss * self.args.lm_coefficient
elif output.mc_loss is not None:
loss = output.mc_loss * self.args.mc_coefficient
else:
# neither loss was computed - the labels must be missing from the batch
print("no loss available")
print(inputs.keys())
print(output)
loss = torch.tensor(0., device=model.device)
return (loss, output) if return_outputs else loss
from typing import Any, Dict, List
from pathlib import Path
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import EvalPrediction
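DATA_FOLDER, BATCH_SIZE and MAX_STEPS come from cells that are not shown here. Judging by the paths used elsewhere and the training output further down (50,000 steps at roughly 8 samples per step), they were presumably along these lines:
DATA_FOLDER = Path("/data/blog/2022-06-01-training-conversational-model")
BATCH_SIZE = 8        # inferred from train_samples_per_second / train_steps_per_second in the output
MAX_STEPS = 50_000    # matches global_step in the training output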
MODEL_RUN_FOLDER = DATA_FOLDER / "runs"
MODEL_RUN_FOLDER.mkdir(parents=True, exist_ok=True)
training_args = ConversationTrainingArguments(
report_to=[],
output_dir=MODEL_RUN_FOLDER / "output",
overwrite_output_dir=True,
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
learning_rate=5e-5,
warmup_ratio=0.06,
evaluation_strategy="steps",
logging_dir=MODEL_RUN_FOLDER / "output",
# num_train_epochs=EPOCHS,
max_steps=MAX_STEPS,
logging_steps=1_000,
save_strategy="steps",
save_steps=1_000,
save_total_limit=3,
# not really training properly here
load_best_model_at_end=True,
metric_for_best_model="loss",
greater_is_better=False,
)
# tokenizer.pad leaves the labels unpadded, so they are padded by hand with -100 (the ignore index)
class CustomCollator:
def __init__(self, tokenizer):
self.tokenizer = tokenizer
def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
batch = self.tokenizer.pad(
features,
padding=True,
# max_length=self.tokenizer.model_max_length,
# pad_to_multiple_of=self.pad_to_multiple_of,
# return_tensors=self.return_tensors,
)
if "label" in batch:
batch["labels"] = batch["label"]
del batch["label"]
if "label_ids" in batch:
batch["labels"] = batch["label_ids"]
del batch["label_ids"]
# pad the labels with -100 to indicate that the index is ignored
max_length = max(map(len, batch["labels"]))
batch["labels"] = [
row + [-100]*(max_length-len(row))
for row in batch["labels"]
]
batch = {
key: torch.tensor(value, dtype=torch.long)
for key, value in batch.items()
}
return batch
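A quick way to sanity check the collator (illustrative only) is to batch a couple of rows from the encoded dataset and look at the tensor shapes:
collator = CustomCollator(tokenizer=tokenizer)
example_batch = collator([encoded_ds["train"][0], encoded_ds["train"][1]])
{key: value.shape for key, value in example_batch.items()}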
data_collator = CustomCollator(tokenizer=tokenizer)
trainer = ConversationTrainer(
model=model,
args=training_args,
train_dataset=encoded_ds["train"],
eval_dataset=encoded_ds["test"].train_test_split(test_size=100)["test"],
data_collator=data_collator,
tokenizer=tokenizer,
# compute_metrics=compute_metrics,
)
trainer.train()
Loading cached split indices for dataset at /data/blog/2022-06-01-training-conversational-model/encoded-persona/test/cache-2b493335bed4e91d.arrow and /data/blog/2022-06-01-training-conversational-model/encoded-persona/test/cache-0448a2cfaf5e4803.arrow
Step | Training Loss | Validation Loss |
---|---|---|
1000 | 2.759200 | 1.130620 |
2000 | 1.259800 | 1.009202 |
3000 | 1.153900 | 0.832176 |
4000 | 1.074400 | 0.916434 |
5000 | 1.095900 | 0.992374 |
6000 | 1.053400 | 0.854033 |
7000 | 1.078800 | 0.883357 |
8000 | 1.037700 | 0.886619 |
9000 | 1.098200 | 0.902168 |
10000 | 1.062200 | 0.891645 |
11000 | 1.017900 | 0.794563 |
12000 | 0.970700 | 0.780857 |
13000 | 1.023800 | 0.780511 |
14000 | 0.966700 | 0.795552 |
15000 | 1.137700 | 0.807680 |
16000 | 1.078700 | 0.788897 |
17000 | 0.974400 | 0.808052 |
18000 | 1.049300 | 0.800286 |
19000 | 0.999900 | 0.816804 |
20000 | 1.020100 | 0.740952 |
21000 | 0.975200 | 0.727781 |
22000 | 1.044900 | 0.735384 |
23000 | 0.990200 | 0.722581 |
24000 | 1.081300 | 0.704887 |
25000 | 0.977400 | 0.712826 |
26000 | 0.898100 | 0.752505 |
27000 | 1.017400 | 0.701749 |
28000 | 1.052800 | 0.699840 |
29000 | 0.913800 | 0.684769 |
30000 | 0.955000 | 0.669663 |
31000 | 1.010200 | 0.704554 |
32000 | 0.892700 | 0.706193 |
33000 | 0.942500 | 0.689601 |
34000 | 0.947700 | 0.638923 |
35000 | 0.869300 | 0.647789 |
36000 | 0.968700 | 0.662381 |
37000 | 0.942800 | 0.658328 |
38000 | 0.893200 | 0.667278 |
39000 | 0.952300 | 0.667240 |
40000 | 0.996300 | 0.629918 |
41000 | 1.010000 | 0.644377 |
42000 | 0.882400 | 0.653525 |
43000 | 1.043400 | 0.642774 |
44000 | 0.987000 | 0.641157 |
45000 | 0.841000 | 0.650937 |
46000 | 1.001300 | 0.645718 |
47000 | 1.018500 | 0.638493 |
48000 | 0.967500 | 0.638008 |
49000 | 0.950300 | 0.633663 |
50000 | 0.940500 | 0.632136 |
TrainOutput(global_step=50000, training_loss=1.0375269262695312, metrics={'train_runtime': 10829.9223, 'train_samples_per_second': 36.935, 'train_steps_per_second': 4.617, 'total_flos': 4.848046394564083e+16, 'train_loss': 1.0375269262695312, 'epoch': 0.15})
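The step that saves the best model is not shown; since load_best_model_at_end restores the best checkpoint once training finishes, it was presumably written out with something like:
trainer.save_model(DATA_FOLDER / "best-model")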
from pathlib import Path
from transformers import AutoTokenizer, GPT2LMHeadModel
DATA_FOLDER = Path("/data/blog/2022-06-01-training-conversational-model")
MODEL_NAME = "gpt2"
model = GPT2LMHeadModel.from_pretrained(DATA_FOLDER / "best-model")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model.eval()
model.cuda()
add_special_tokens_(model=model, tokenizer=tokenizer)
Some weights of the model checkpoint at /data/blog/2022-06-01-training-conversational-model/best-model were not used when initializing GPT2LMHeadModel: ['multiple_choice_head.summary.weight', 'multiple_choice_head.summary.bias']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
from typing import Any, Dict, List
from itertools import chain, cycle
import torch
from transformers import AutoModel, AutoTokenizer
@torch.no_grad()
def sample(personality: str, history: List[str], model: AutoModel, tokenizer: AutoTokenizer, **kwargs) -> str:
inputs = _tokenize(
personality=personality,
history=history,
model=model,
tokenizer=tokenizer
)
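# GenerationMixin.sample is the low-level sampling loop; model.generate(do_sample=True, ...) would also work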
output = model.sample(**inputs, **kwargs)
return tokenizer.decode(output[0])
def _tokenize(personality: str, history: List[str], model: AutoModel, tokenizer: AutoTokenizer) -> Dict[str, torch.Tensor]:
bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
personality_tokens = tokenizer(
personality,
return_attention_mask=False,
add_special_tokens=False,
).input_ids
history_tokens = tokenizer(
history,
return_attention_mask=False,
add_special_tokens=False,
).input_ids
personality_tokens = [bos] + personality_tokens
# the person speaks first so we start with speaker1
sequence = [personality_tokens] + [
[speaker] + utterance
for speaker, utterance in zip(cycle([speaker1, speaker2]), history_tokens)
] + [[speaker2]]
input_ids = list(chain(*sequence))
# the chatbot personality is first so we start with speaker2
# the speaker1 value is 0 and speaker2 is 1
token_type_ids = [
speaker
for speaker, row in zip(cycle([1, 0]), sequence)
for _ in row
]
input_ids = torch.tensor(input_ids, dtype=torch.long, device=model.device)
token_type_ids = torch.tensor(token_type_ids, dtype=torch.long, device=model.device)
return {
"input_ids": input_ids[None, :],
"token_type_ids": token_type_ids[None, :],
}
from transformers import (
LogitsProcessorList,
MinLengthLogitsProcessor,
TopKLogitsWarper,
TemperatureLogitsWarper,
StoppingCriteriaList,
MaxLengthCriteria,
)
# instantiate logits processors
logits_processor = LogitsProcessorList(
[
MinLengthLogitsProcessor(15, eos_token_id=tokenizer.eos_token_id),
]
)
# instantiate logits processors
logits_warper = LogitsProcessorList(
[
TopKLogitsWarper(50),
TemperatureLogitsWarper(0.7),
]
)
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=100)])
sample(
personality="\n".join([
"I work at a museum.",
"I like to go to the park.",
"I am stuck in a wheelchair.",
]),
history=[
"hello, how are you?",
],
model=model,
tokenizer=tokenizer,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
logits_processor=logits_processor,
logits_warper=logits_warper,
stopping_criteria=stopping_criteria,
)
"<bos>I work at a museum.\nI like to go to the park.\nI am stuck in a wheelchair.<speaker1>hello, how are you?<speaker2>I'm okay. How are you this evening?<eos>"
from dataclasses import dataclass, field
from transformers import (
AutoModel,
AutoTokenizer,
LogitsProcessorList,
MinLengthLogitsProcessor,
TopKLogitsWarper,
TemperatureLogitsWarper,
StoppingCriteriaList,
MaxLengthCriteria,
)
@dataclass
class Conversationalist:
personality: List[str]
tokenizer: AutoTokenizer
model: AutoModel
history: List[str] = field(default_factory=list)
def converse(self, utterance: str) -> str:
response = self.sample(history=self.history + [utterance])
self.history = self.history + [utterance, response]
return response
@torch.no_grad()
def sample(self, history: List[str]) -> str:
inputs = _tokenize(
personality="\n".join(self.personality),
history=history,
model=self.model,
tokenizer=self.tokenizer
)
# instantiate logits processors
logits_processor = LogitsProcessorList(
[
MinLengthLogitsProcessor(15, eos_token_id=self.tokenizer.eos_token_id),
]
)
# instantiate logits processors
logits_warper = LogitsProcessorList(
[
TopKLogitsWarper(50),
TemperatureLogitsWarper(0.7),
]
)
input_length = inputs["input_ids"].shape[-1]
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + 50)])
output = self.model.sample(
**inputs,
pad_token_id=self.tokenizer.pad_token_id,
eos_token_id=self.tokenizer.eos_token_id,
logits_processor=logits_processor,
logits_warper=logits_warper,
stopping_criteria=stopping_criteria,
)
return self.tokenizer.decode(output[0, input_length:], skip_special_tokens=True)
def _tokenize(personality: str, history: List[str], model: AutoModel, tokenizer: AutoTokenizer) -> Dict[str, torch.Tensor]:
bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
personality_tokens = tokenizer(
personality,
return_attention_mask=False,
add_special_tokens=False,
).input_ids
history_tokens = tokenizer(
history,
return_attention_mask=False,
add_special_tokens=False,
).input_ids
personality_tokens = [bos] + personality_tokens
# the person speaks first so we start with speaker1
sequence = [personality_tokens] + [
[speaker] + utterance
for speaker, utterance in zip(cycle([speaker1, speaker2]), history_tokens)
] + [[speaker2]]
input_ids = list(chain(*sequence))
# the chatbot personality is first so we start with speaker2
# the speaker1 value is 0 and speaker2 is 1
token_type_ids = [
speaker
for speaker, row in zip(cycle([1, 0]), sequence)
for _ in row
]
input_ids = torch.tensor(input_ids, dtype=torch.long, device=model.device)
token_type_ids = torch.tensor(token_type_ids, dtype=torch.long, device=model.device)
return {
"input_ids": input_ids[None, :],
"token_type_ids": token_type_ids[None, :],
}
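The prompts that produced the replies below are not shown; driving the bot looks something like this (the questions after the first are illustrative):
bot = Conversationalist(
    personality=[
        "I work at a museum.",
        "I like to go to the park.",
        "I am stuck in a wheelchair.",
    ],
    tokenizer=tokenizer,
    model=model,
)
bot.converse("hello, how are you?")
bot.converse("do you like the park?")
bot.converse("tell me about your family")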
"Hi, I'm ok, I am going to go to the park, just got done working."
'I do, it is nice. I go to the park every day.'
"I don't know what I am talking about. I've gone to college but I love my son."