Notebook One | Repository
Transcript vectorisation
Andrea Leone
University of Trento
January 2022
import project
import numpy as np
import spacy
from tqdm.notebook import tqdm
Load the pre-trained pipelines for word embeddings:

- en_core_web_lg: English tok2vec pipeline optimized for CPU.
- en_core_web_trf: English transformer pipeline based on RoBERTa.

Both pipelines are trained on the WordNet 3.0 lexical database of English, the ClearNLP Constituent-to-Dependency Conversion, and the OntoNotes 5 corpus.
nlp = spacy.load('en_core_web_lg')
trf = spacy.load('en_core_web_trf')
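As a quick sanity check (a minimal sketch; the example sentence is arbitrary): en_core_web_lg exposes 300-dimensional static vectors stored as float32, which is why the loops below cast to float64 before writing to the database.

doc = nlp("Ideas worth spreading.")
doc.vector.shape   # (300,) — static word vectors from en_core_web_lg
doc.vector.dtype   # dtype('float32') — hence the .astype(np.float64) below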
Query the records that still have no transcript vector:
records = project.sql_query("""
SELECT * FROM talks
WHERE
transcript IS NOT NULL AND
vector IS NULL
ORDER BY slug ASC;
""")
For each record retrieved, take the transcript, feed it to the nlp pipeline to vectorise the entire document token by token, and extract the document vector, converting the numerical values to float64.
for record in tqdm( records ):
    slug       = record[0]
    transcript = record[4]

    # Average the token vectors into a single document vector
    vector = nlp( transcript ).vector.astype( np.float64 )

    # Serialise the array and store it in the talks table
    vector = project.sqlize_array( vector )
    project.sql_commit(
        "UPDATE talks SET vector='{0}' WHERE slug='{1}'".format(vector, slug)
    )
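A side note on the UPDATE above: interpolating values via str.format works for this controlled batch, but with a standard DB-API driver the statement would normally use bound parameters instead. A sketch, assuming a psycopg2-style cursor named cur (not part of the project helper module):

cur.execute(
    "UPDATE talks SET vector = %s WHERE slug = %s",
    (vector, slug)
)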
Query the records that still have no transformer-based transcript vector:
records = project.sql_query("""
SELECT * FROM talks
WHERE
transcript IS NOT NULL AND
vector_trf IS NULL
ORDER BY slug ASC;
""")
As transformer-based pretrained models work at the tensor level, their outputs need to be aligned back to the tokens in order to extract word/span/document vectors.
from spacy.language import Language

@Language.factory('tensor2attr')
class Tensor2Attr:

    def __init__(self, name, nlp):
        pass

    def __call__(self, doc):
        self.add_attributes(doc)
        return doc

    def add_attributes(self, doc):
        # Route the .vector attribute of Doc, Span and Token objects to the
        # transformer tensors instead of the (absent) static word vectors
        doc.user_hooks['vector']       = self.doc_tensor
        doc.user_span_hooks['vector']  = self.span_tensor
        doc.user_token_hooks['vector'] = self.token_tensor

        doc.user_hooks['similarity']       = self.get_similarity
        doc.user_span_hooks['similarity']  = self.get_similarity
        doc.user_token_hooks['similarity'] = self.get_similarity

    def doc_tensor(self, doc):
        # Pooled transformer output, averaged over the batch dimension
        return doc._.trf_data.tensors[-1].mean(axis=0)

    def span_tensor(self, span):
        # Indices of the wordpieces aligned with the tokens in the span
        tensor_ix = span.doc._.trf_data.align[span.start: span.end].data.flatten()
        # Average the aligned wordpiece rows of the last hidden state
        out_dim = span.doc._.trf_data.tensors[0].shape[-1]
        tensor  = span.doc._.trf_data.tensors[0].reshape(-1, out_dim)[tensor_ix]
        return tensor.mean(axis=0)

    def token_tensor(self, token):
        # Indices of the wordpieces aligned with this single token
        tensor_ix = token.doc._.trf_data.align[token.i].data.flatten()
        out_dim = token.doc._.trf_data.tensors[0].shape[-1]
        tensor  = token.doc._.trf_data.tensors[0].reshape(-1, out_dim)[tensor_ix]
        return tensor.mean(axis=0)

    def get_similarity(self, doc1, doc2):
        # Cosine similarity between the two (hooked) vectors
        return np.dot(doc1.vector, doc2.vector) / (doc1.vector_norm * doc2.vector_norm)
trf.add_pipe('tensor2attr')
trf.pipeline
[('transformer',     <spacy_transformers.pipeline_component.Transformer at 0x31121bc40>),
 ('tagger',          <spacy.pipeline.tagger.Tagger at 0x2cdc33760>),
 ('parser',          <spacy.pipeline.dep_parser.DependencyParser at 0x2cd300c10>),
 ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler at 0x3113672c0>),
 ('lemmatizer',      <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x3113e9e80>),
 ('ner',             <spacy.pipeline.ner.EntityRecognizer at 0x311372ba0>),
 ('tensor2attr',     <__main__.Tensor2Attr at 0x174b28040>)]
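With the component in place, .vector and .similarity on trf documents now read from the transformer tensors. A minimal check (the sentences are arbitrary, and the similarity value will vary with the model version):

doc1 = trf("The concert was fantastic.")
doc2 = trf("The performance was amazing.")

doc1.vector.shape      # (768,) — hidden size of the underlying RoBERTa base model
doc1.similarity(doc2)  # contextual cosine similarity computed by the custom hook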
For each record retrieved, take the transcript, feed it to the trf pipeline to vectorise the entire document with the transformer, align the tensors with the tokens through the custom component, and extract the document vector, converting the numerical values to float64.
for record in tqdm( records ):
    slug       = record[0]
    transcript = record[4]

    # tensor2attr hooks .vector to the transformer tensors, so this
    # returns the contextual document embedding
    vector = trf( transcript ).vector.astype( np.float64 )

    # Serialise the array and store it in the talks table
    vector = project.sqlize_array( vector )
    project.sql_commit(
        "UPDATE talks SET vector_trf='{0}' WHERE slug='{1}'".format(vector, slug)
    )