Notebook Fourteen | Repository
Transformers
Andrea Leone
University of Trento
January 2022
import project
import pandas as pd
import numpy as np
import torch
import transformers
project.notebook()
# df = pd.read_csv('./data/talks.csv')
# records = df[['transcript','category']][~df['transcript'].isnull()].to_numpy()
records = project.sql_query("""
    SELECT transcript, category FROM talks
    WHERE transcript IS NOT NULL
    ORDER BY slug ASC;
""")
(x, y), (z, t) \
    = train_set, test_set \
    = splits \
    = project.split_in_sets( records )
project.describe_sets(splits)
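project.split_in_sets and project.describe_sets are helpers from the accompanying project module and are not defined in this notebook. A minimal sketch of the behaviour they are assumed to have here; the split ratio, seed, and (texts, labels) structure are assumptions:

# hypothetical sketch of the project split helpers (assumed behaviour, not the project's actual code)
def split_in_sets(records, ratio=0.8, seed=42):
    rng = np.random.default_rng(seed)
    order = rng.permutation(len(records))       # shuffle indices reproducibly
    texts = [records[i][0] for i in order]
    labels = [records[i][1] for i in order]
    cut = int(len(records) * ratio)             # e.g. 80% train, 20% test
    train_set = (texts[:cut], labels[:cut])
    test_set = (texts[cut:], labels[cut:])
    return train_set, test_set

def describe_sets(splits):
    (x, y), (z, t) = splits
    print('train: {} talks | test: {} talks'.format(len(x), len(z)))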
device = torch.device('cpu')
# BERT: fine-tune bert-base-cased as a 3-way sequence classifier over the talk categories
model_type = 'bert-base-cased'
model = transformers.BertForSequenceClassification.from_pretrained (
    model_type, num_labels = 3,
    output_attentions = False,
    output_hidden_states = False
).to(device)
tokenizer = transformers.BertTokenizer.from_pretrained(model_type)
optimizer = transformers.AdamW ( model.parameters(), lr=2e-5, eps=1e-8 )
epochs = 5
model_name = './models/BERT.v{}.{}.net'.format(1, epochs)
performance = project.train_trf (model, x, y, tokenizer, optimizer, device, epochs, li=500)
results = project.test_trf (model, z, t, tokenizer, device)
project.export(model, model_name)
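project.train_trf, project.test_trf, and project.export are likewise defined in the project module. The sketch below shows the behaviour assumed in this notebook: a per-example fine-tuning loop with 512-token truncation, an evaluation pass scored with scikit-learn accuracy/precision/recall (matching the scoreboard at the end), and a plain state_dict export consistent with the load_state_dict call further down. Batch size, truncation length, metric averaging, and reading li as a logging interval are all assumptions.

from sklearn.metrics import accuracy_score, precision_score, recall_score

# hypothetical sketch of the training helper (assumed behaviour)
def train_trf(model, x, y, tokenizer, optimizer, device, epochs, li=500):
    model.train()
    losses = []
    for epoch in range(epochs):
        for i, (text, label) in enumerate(zip(x, y)):
            inputs = tokenizer(text, truncation=True, max_length=512, return_tensors='pt').to(device)
            labels = torch.tensor([label]).to(device)
            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)    # loss computed against the gold label
            outputs.loss.backward()
            optimizer.step()
            losses.append(outputs.loss.item())
            if i % li == 0:                             # log every li examples
                print('epoch {} | step {} | loss {:.4f}'.format(epoch + 1, i, outputs.loss.item()))
    return losses

# hypothetical sketch of the evaluation helper (assumed behaviour)
def test_trf(model, z, t, tokenizer, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for text in z:
            inputs = tokenizer(text, truncation=True, max_length=512, return_tensors='pt').to(device)
            predictions.append(model(**inputs).logits.argmax(dim=-1).item())
    return (
        accuracy_score(t, predictions),
        precision_score(t, predictions, average='macro'),
        recall_score(t, predictions, average='macro'),
    )

# hypothetical sketch of the export helper, consistent with the later load_state_dict call
def export(model, path):
    torch.save(model.state_dict(), path)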
# RoBERTa: same setup with roberta-base
model_type = 'roberta-base'
model = transformers.RobertaForSequenceClassification.from_pretrained (
    model_type, num_labels = 3,
    output_attentions = False,
    output_hidden_states = False
).to(device)
tokenizer = transformers.RobertaTokenizerFast.from_pretrained(model_type)
optimizer = transformers.AdamW ( model.parameters(), lr=2e-5, eps=1e-8 )
epochs = 5
model_name = './models/RoBERTa.v{}.{}.net'.format(1, epochs)
performance = project.train_trf (model, x, y, tokenizer, optimizer, device, epochs, li=500)
results = project.test_trf (model, z, t, tokenizer, device)
project.export(model, model_name)
# DistilBERT: same setup with distilbert-base-cased
model_type = 'distilbert-base-cased'
model = transformers.DistilBertForSequenceClassification.from_pretrained (
    model_type, num_labels = 3,
    output_attentions = False,
    output_hidden_states = False
).to(device)
tokenizer = transformers.DistilBertTokenizer.from_pretrained(model_type)
optimizer = transformers.AdamW ( model.parameters(), lr=2e-5, eps=1e-8 )
epochs = 5
model_name = './models/DistilBERT.v{}.{}.net'.format(1, epochs)
performance = project.train_trf (model, x, y, tokenizer, optimizer, device, epochs, li=500)
results = project.test_trf (model, z, t, tokenizer, device)
project.export(model, model_name)
# SqueezeBERT: same setup with squeezebert/squeezebert-uncased, resuming from a saved checkpoint
model_type = 'squeezebert/squeezebert-uncased'
model = transformers.SqueezeBertForSequenceClassification.from_pretrained (
    model_type, num_labels = 3,
    output_attentions = False,
    output_hidden_states = False
).to(device)
tokenizer = transformers.SqueezeBertTokenizer.from_pretrained(model_type)
optimizer = transformers.AdamW ( model.parameters(), lr=2e-5, eps=1e-8 )
# version and epoch count of the checkpoint saved in a previous training session;
# training resumes from that state
version = 1
from_epoch = 15
model.load_state_dict (
    torch.load('./models/SqueezeBERT.v{}.{}.net'.format(version, from_epoch), map_location=device),
)
epochs = 5
model_name = './models/SqueezeBERT.v{}.{}.net'.format(1, epochs)
performance = project.train_trf (model, x, y, tokenizer, optimizer, device, epochs, li=500)
results = project.test_trf (model, z, t, tokenizer, device)
project.export(model, model_name)
# Nyströmformer: self-attention approximated with the Nyström method, scaling linearly with sequence length
from transformers import AutoTokenizer, NystromformerForSequenceClassification

model_name = "uw-madison/nystromformer-512"
model = NystromformerForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer(x[0][:2300], return_tensors="pt")  # first 2300 characters of the first training transcript
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits
# number of tokens produced by the truncated transcript
len(tokenizer(x[0][:2300], return_tensors="pt")['input_ids'][0])
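The classification head of this checkpoint is freshly initialised when the backbone is loaded, so its output is only meaningful after fine-tuning; for illustration, the predicted class index would be read off the logits as:

predicted_class = logits.argmax(dim=-1).item()  # index of the highest-scoring label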
Transformers scoreboard
model         accuracy     precision    recall       epochs
BERT          .34837799    .11612599    .3            5
BERT          .87447108    .87162724    .87438515    10
BERT          .92806770    .92677159    .92359018    15
BERT          .93229901    .93357080    .92739973    20
RoBERTa       .34837799    .11612599    .3            5
RoBERTa       .68928067    —            —            10
RoBERTa       .80253878    .80112364    .79699835    15
RoBERTa       .85190409    .85917919    .83747284    20
DistilBERT    .92242595    .92319025    .91704217     5
DistilBERT    .94781382    .94554739    .94635791    10
DistilBERT    .92806770    .92449340    .92972718    15
DistilBERT    .92383638    .92122829    .92547669    20
SqueezeBERT   .90409026    .90895179    .89725193     5
SqueezeBERT   .93229901    .93061917    .93270826    10
SqueezeBERT   .95345557    .95080893    .95318207    15
SqueezeBERT   .94499294    .94611277    .94207616    20