Notebook Four
Repository
Linguistic features
Andrea Leone
University of Trento
February 2022
import project
import spacy
import numpy as np
import pandas as pd
import pickle
import os
from tqdm.notebook import tqdm
records = project.sql_query("""
SELECT * FROM talks
WHERE transcript IS NOT NULL
ORDER BY slug ASC;
""")
df = project.create_dataframe_from(records)
Load the "classic" English pre-trained language-processing pipeline, optimised for CPU. As in the first notebook, the pipeline ingests a raw string (the talk transcript) and returns a spacy.Doc
object that comprises the result of several different processing steps.
Features
Statistical models
While some of spaCy’s features work independently, others require trained pipelines to be loaded, enabling spaCy to predict linguistic annotations. A trained pipeline can consist of multiple components that use a statistical model trained on labelled data, as in the case of en_core_web_lg
. It includes the following components: tok2vec, tagger, parser, senter, attribute_ruler, lemmatizer, and ner.
Load the pipeline.
nlp = spacy.load("en_core_web_lg")
Process all transcripts.
docs_pkl = "data/docs.v4.pkl"
if not os.path.exists(docs_pkl):
    docs = []
    for _, record in tqdm(list(df.iterrows())):
        docs.append(nlp(record["transcript"]))
    with open(docs_pkl, "wb") as file:
        pickle.dump(docs, file)
else:
    with open(docs_pkl, "rb") as file:
        docs = pickle.load(file)
Sample the pipeline on an excerpt of Dave Isay's talk.
doc = nlp("""
I have learned about the poetry and the wisdom and the grace
that can be found in the words of people all around us
when we simply take the time to listen.
""")
Tokenisation
The first step in the process is tokenising the text: spaCy segments it into words, punctuation and so on. This is done by applying rules specific to each language. First, the raw text is split on whitespace characters, similar to text.split(' ')
. Then, the tokenizer processes the text from left to right. On each substring, it performs two checks: does the substring match a tokenizer exception rule (a special case like "don't", which should be split into "do" and "n't"), and can a prefix, suffix or infix (such as punctuation) be split off?
If there is a match, the rule is applied and the tokenizer continues its loop, starting with the newly split substrings. This way, spaCy can split complex, nested tokens like combinations of abbreviations and multiple punctuation marks.
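The loop described above can be sketched in plain Python. This is an illustrative simplification, not spaCy's actual implementation; the exception table and affix sets below are hypothetical stand-ins for the language-specific rules:

```python
# Hypothetical, minimal stand-ins for spaCy's language-specific rules.
EXCEPTIONS = {"don't": ["do", "n't"]}     # special-case rules
PREFIXES = ("(", '"')                     # characters split off the front
SUFFIXES = (")", '"', ",", ".", "!")      # characters split off the end

def tokenize_substring(s, tokens):
    """Apply the two checks; recurse on the newly split substrings."""
    if not s:
        return
    if s in EXCEPTIONS:                   # check 1: exception rule match?
        tokens.extend(EXCEPTIONS[s])
    elif s[0] in PREFIXES:                # check 2a: split off a prefix
        tokens.append(s[0])
        tokenize_substring(s[1:], tokens)
    elif s[-1] in SUFFIXES:               # check 2b: split off a suffix
        tail = s[-1]
        tokenize_substring(s[:-1], tokens)
        tokens.append(tail)
    else:                                 # no rule applies: keep as a token
        tokens.append(s)

def tokenize(text):
    tokens = []
    for substring in text.split():        # first, split on whitespace
        tokenize_substring(substring, tokens)
    return tokens
```

For instance, tokenize("(don't listen!)") splits the nested punctuation and the contraction into six tokens, restarting the loop on each newly split substring.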
Linguistic annotations
spaCy provides a variety of linguistic annotations to give insights into a text’s grammatical structure. This includes the word types, like the parts of speech, and how the words are related to each other. For example, if we are analyzing text, it makes a huge difference whether a noun is the subject of a sentence or the object, or whether “google” is used as a verb or refers to the website or company in a specific context.
Extract and show annotations for each token in the sample
ladf = pd.DataFrame([
    [
        token.text, token.pos_, token.tag_, token.dep_,
        token.shape_, token.lemma_, token.is_stop
    ] for token in doc if token.pos_ not in ["SPACE", "PUNCT"]
], columns=[
    "", "Part-of-Speech", "Tag", "Dependency", "Shape", "Lemma", "Stop"
])
ladf = ladf.set_index("")
print(ladf)
        Part-of-Speech  Tag  Dependency  Shape  Lemma   Stop
I       PRON            PRP  nsubj       X      I       True
have    AUX             VBP  aux         xxxx   have    True
learned VERB            VBN  ROOT        xxxx   learn   False
about   ADP             IN   prep        xxxx   about   True
the     DET             DT   det         xxx    the     True
poetry  NOUN            NN   pobj        xxxx   poetry  False
and     CCONJ           CC   cc          xxx    and     True
the     DET             DT   det         xxx    the     True
wisdom  NOUN            NN   conj        xxxx   wisdom  False
and     CCONJ           CC   cc          xxx    and     True
the     DET             DT   det         xxx    the     True
grace   NOUN            NN   conj        xxxx   grace   False
that    PRON            WDT  nsubjpass   xxxx   that    True
can     AUX             MD   aux         xxx    can     True
be      AUX             VB   auxpass     xx     be      True
found   VERB            VBN  relcl       xxxx   find    False
in      ADP             IN   prep        xx     in      True
the     DET             DT   det         xxx    the     True
words   NOUN            NNS  pobj        xxxx   word    False
of      ADP             IN   prep        xx     of      True
people  NOUN            NNS  pobj        xxxx   people  False
all     ADV             RB   advmod      xxx    all     True
around  ADP             IN   prep        xxxx   around  True
us      PRON            PRP  compound    xx     we      True
when    SCONJ           WRB  advmod      xxxx   when    True
we      PRON            PRP  nsubj       xx     we      True
simply  ADV             RB   advmod      xxxx   simply  False
take    VERB            VBP  advcl       xxxx   take    True
the     DET             DT   det         xxx    the     True
time    NOUN            NN   dobj        xxxx   time    False
to      PART            TO   aux         xx     to      True
listen  VERB            VB   relcl       xxxx   listen  False
Morphological Features
Inflectional morphology is the process by which a root form of a word is modified by adding prefixes or suffixes that specify its grammatical function but do not change its part-of-speech.
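spaCy serialises token.morph to the Universal Dependencies Feature=Value format, with features joined by "|" (which is why the cell below replaces "|" for readability). As a sketch, a small plain-Python helper can parse such a string into a dictionary, mirroring what token.morph.to_dict() returns:

```python
def parse_morph(morph_string):
    """Parse a UD-style feature string such as "Case=Nom|Number=Sing"."""
    if not morph_string:
        return {}
    # Split into Feature=Value pairs, then into (feature, value) tuples.
    return dict(feature.split("=", 1) for feature in morph_string.split("|"))
```

For example, parse_morph("Case=Nom|Number=Sing") yields {"Case": "Nom", "Number": "Sing"}.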
mfdf = pd.DataFrame([
    [token.text, str(token.morph).replace("|", ", ")]
    for token in doc
    if token.pos_ not in ["SPACE", "PUNCT"] and token.morph.to_dict() != {}
], columns=["", "Morphological features"])
mfdf = mfdf.set_index("")
print(mfdf)
        Morphological features
I       Case=Nom, Number=Sing, Person=1, PronType=Prs
have    Mood=Ind, Tense=Pres, VerbForm=Fin
learned Aspect=Perf, Tense=Past, VerbForm=Part
the     Definite=Def, PronType=Art
poetry  Number=Sing
and     ConjType=Cmp
the     Definite=Def, PronType=Art
wisdom  Number=Sing
and     ConjType=Cmp
the     Definite=Def, PronType=Art
grace   Number=Sing
that    PronType=Rel
can     VerbForm=Fin
be      VerbForm=Inf
found   Aspect=Perf, Tense=Past, VerbForm=Part
the     Definite=Def, PronType=Art
words   Number=Plur
people  Number=Plur
us      Case=Acc, Number=Plur, Person=1, PronType=Prs
we      Case=Nom, Number=Plur, Person=1, PronType=Prs
take    Tense=Pres, VerbForm=Fin
the     Definite=Def, PronType=Art
time    Number=Sing
listen  VerbForm=Inf
Linguistic dependencies
Visualizing a dependency parse or named entities in a text is not only a fun NLP demo, it can also be incredibly helpful in speeding up development and debugging your code and training process. With jupyter=True, displaCy renders the result straight in the notebook instead of spinning up its own web server.
spacy.displacy.render(
doc[1:-2], style="dep", jupyter=True,
options={"bg":"transparent", "arrow_width": 5, "arrow_spacing": 30}
)
Vectors and Semantic Similarity
Similarity is determined by comparing word vectors or “word embeddings”, multi-dimensional meaning representations of a word. Word vectors can be generated using an algorithm like word2vec.
compute_similarity = lambda w1, w2: nlp(w1).similarity(nlp(w2))
Get similarity scores for some word pairs.
compute_similarity("creativity", "innovation")
0.6643053677545637
compute_similarity("creativity", "kindness")
0.4634827277856967
compute_similarity("creativity", "intelligence")
0.4278424226247513
compute_similarity("fairness", "justice")
0.5730289299368784
Under the hood, we compute the cosine similarity between the two word vectors.
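The cosine computation behind Token and Doc similarity can be reproduced directly with NumPy; the sketch below implements the formula rather than calling spaCy's internals:

```python
import numpy as np

def cosine_similarity(u, v):
    # cos(theta) = (u . v) / (||u|| * ||v||)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))
```

Applied to nlp("creativity").vector and nlp("innovation").vector, this should reproduce the score that compute_similarity returns above.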
nlp("justice").vector
array([-3.4381e-01,  3.4376e-01,  4.9534e-01, -3.5717e-02, -8.8070e-02,
       -4.7633e-01, -3.4325e-02,  5.9654e-01, -1.9067e-01,  3.3019e+00,
       ...,
        8.5272e-02,  6.3913e-01, -2.6494e-01, -1.8196e-01,  1.5598e-01],
      dtype=float32)
Computing similarity scores can be helpful in many situations, but it’s also important to maintain realistic expectations about what information it can provide. Words can be related to each other in many ways, so a single “similarity” score will always be a mix of different signals, and vectors trained on different data can produce very different results that may not be useful for your purpose. Here are some important considerations to keep in mind:
There’s no objective definition of similarity. Whether “I like burgers” and “I like pasta” is similar depends on your application. Both talk about food preferences, which makes them very similar, but if you’re analyzing mentions of food, those sentences are pretty dissimilar, because they talk about very different foods.

The similarity of Doc and Span objects defaults to the average of the token vectors. This means that the vector for “fast food” is the average of the vectors for “fast” and “food”, which isn’t necessarily representative of the phrase “fast food”.

Vector averaging means that the vector of multiple tokens is insensitive to the order of the words. Two documents expressing the same meaning with dissimilar wording will return a lower similarity score than two documents that happen to contain the same words while expressing different meanings.
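The order-insensitivity of vector averaging is easy to verify with NumPy. The vectors below are made-up toy embeddings, purely for illustration:

```python
import numpy as np

fast = np.array([0.2, 0.9, -0.3])        # toy vector standing in for "fast"
food = np.array([0.7, -0.1, 0.5])        # toy vector standing in for "food"

doc_a = np.mean([fast, food], axis=0)    # "fast food"
doc_b = np.mean([food, fast], axis=0)    # "food fast"

assert np.allclose(doc_a, doc_b)         # averaging ignores word order
```

Both orderings produce the identical document vector, so the two "documents" would score a similarity of 1.0 despite their different word order.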