Notebook Six | Repository
Nearest Neighbors and Outlier Detection
Andrea Leone
University of Trento
January 2022
import project
import sklearn
import sklearn.neighbors
import sklearn.ensemble
project.notebook()
records = project.sql_query("""
SELECT vector, category FROM talks
WHERE vector IS NOT NULL
ORDER BY slug ASC;
""")
(x, y), (z, t) \
= train_set, test_set \
= splits \
= project.split_in_sets( records )
project.describe_sets(splits)
train_set => (0, 1376) (1, 1572) (2, 1052)
test_set  => (0, 243) (1, 275) (2, 192)
A nearest centroid classifier assigns to each observation the label of the class whose training-sample mean (centroid) is closest to it. Euclidean distance is the default metric used to measure the distance between instances.
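As a minimal sketch of that rule (on hypothetical 2-D toy vectors, not the talk embeddings): each class centroid is the mean of its training vectors, and a new point takes the label of the nearest centroid.

import numpy as np

# hypothetical toy data: two classes in 2-D, for illustration only
X_toy = np.array([[0.0, 0.1], [0.2, 0.0], [1.0, 1.1], [0.9, 1.0]])
y_toy = np.array([0, 0, 1, 1])

# one centroid per class: the mean of that class's training vectors
classes   = np.unique(y_toy)
centroids = np.array([X_toy[y_toy == c].mean(axis=0) for c in classes])

# a new point is labelled with the class of the closest centroid (Euclidean distance)
point     = np.array([0.8, 0.9])
distances = np.linalg.norm(centroids - point, axis=1)
label     = classes[distances.argmin()]   # expected: 1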
nc = sklearn.neighbors.NearestCentroid().fit(x,y)
p = nc.predict(z)
confusion_matrix = project.confusion_matrix (t,p)
accuracy,precision,recall = project.present_metrics (t,p)
accuracy   0.6366197183098592
precision  0.6310483706263429
recall     0.6231729330340441
score board — NearestCentroid
pipeline         accuracy   precision  recall     cm_d
en_core_web_lg   .63661971  .63104837  .62317293  150 204  98
en_core_web_lg   .64603174  .63519903  .63491569  152 167  88   without outliers (pm=LOF)
en_core_web_lg   .67982456  .66279152  .65754219  105 148  57   without outliers (pm=IF)
en_core_web_trf  .41043723  .41442327  .40269239  135  90  66
en_core_web_trf  .40771812  .40518234  .40022292   88 106  49   without outliers (pm=LOF)
en_core_web_trf  .43167701  .42130935  .42467788   54  58  27   without outliers (pm=IF)
The KNN classifier is a non-parametric method: an input point is assigned the class that wins a plurality vote among its k nearest training samples, and class membership probabilities can be derived from that vote. The neighbor weighting scheme and the search algorithm can be tuned.
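Since k, the weighting scheme and the search algorithm are hyperparameters, one way to pick them is a small grid search over the training split defined above; this is only a hedged sketch, and the grid values are assumptions rather than part of the notebook's pipeline.

from sklearn.model_selection import GridSearchCV

# hypothetical search space; the scoring metric and number of cv folds are assumptions
param_grid = {
    'n_neighbors' : [3, 5, 7, 11],
    'weights'     : ['uniform', 'distance'],
    'algorithm'   : ['ball_tree', 'kd_tree'],
}

search = GridSearchCV(
    sklearn.neighbors.KNeighborsClassifier(),
    param_grid, cv=5, scoring='accuracy'
).fit(x, y)

search.best_params_   # e.g. {'algorithm': 'ball_tree', 'n_neighbors': 7, 'weights': 'distance'}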
knc = sklearn.neighbors.KNeighborsClassifier(
n_neighbors=7, weights='distance',
algorithm='ball_tree', leaf_size=50
).fit(x,y)
p = knc.predict(z)
confusion_matrix = project.confusion_matrix (t,p)
accuracy,precision,recall = project.present_metrics (t,p)
accuracy   0.6788732394366197
precision  0.6697487757776398
recall     0.667469135802469
score board — KNeighborsClassifier
pipeline         accuracy   precision  recall     cm_d
en_core_web_lg   .67887323  .66974877  .66746913  180 198 104
en_core_web_lg   .71587301  .70719417  .70272845  181 174  96   without outliers (pm=LOF)
en_core_web_lg   .72807017  .70811300  .70502846  118 154  60   without outliers (pm=IF)
en_core_web_trf  .53878702  .51926733  .51939140  153 173  56
en_core_web_trf  .50167785  .47172080  .47153926  101 157  41   without outliers (pm=LOF)
en_core_web_trf  .51863354  .49376640  .49577381   65  77  25   without outliers (pm=IF)
The LOF algorithm is used as an unsupervised outlier detector.
The anomaly score of each sample is called the Local Outlier Factor: it measures the local deviation of the density of a given sample with respect to its neighbors. It is local in that the anomaly score depends on how isolated the object is with respect to the surrounding neighborhood. More precisely, locality is given by the k-nearest neighbors, whose distance is used to estimate the local density. By comparing the local density of a sample to the local densities of its neighbors, one can identify samples that have a substantially lower density than their neighbors; these are considered outliers.
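A hedged sketch of that criterion on hypothetical 2-D points (not the talk vectors): a tight cluster plus one isolated point; the isolated point has a much lower local density than its neighbors, so it receives label -1 and a strongly negative factor.

import numpy as np

# hypothetical toy data: a dense cluster plus one isolated point
X_toy  = np.array([[0.0, 0.0], [0.1, 0.1], [0.0, 0.2], [0.2, 0.0], [5.0, 5.0]])

lof    = sklearn.neighbors.LocalOutlierFactor(n_neighbors=3)
labels = lof.fit_predict(X_toy)            # -1 marks outliers, 1 marks inliers
scores = lof.negative_outlier_factor_      # the lower, the more anomalous

labels   # expected: array([ 1,  1,  1,  1, -1])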
outliers = sklearn.neighbors.LocalOutlierFactor().fit_predict([x for x,y in records])
records2 = project.remove_outliers (records, outliers)
splits2 = project.split_in_sets (records2, splitting_value=3900)
project.describe_sets(splits2)
Data reduced from 4710 to 4630 (-1.70%).
train_set => (0, 1332) (1, 1541) (2, 1027)
test_set  => (0, 268) (1, 272) (2, 190)
The IsolationForest algorithm returns the anomaly score for each sample.
It isolates observations by randomly selecting a feature and then randomly selecting a split value between the minimum and maximum values of that feature. Since recursive partitioning can be represented by a tree structure, the number of splits required to isolate a sample is equivalent to the path length from the root node to the terminating node. This path length, averaged over a forest of such random trees, is a measure of normality and serves as the decision function.
Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees collectively produces shorter path lengths for particular samples, those samples are highly likely to be anomalies.
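The same toy setting makes the point concrete (again a hedged sketch on hypothetical 2-D points, not the talk vectors): the isolated sample is separated after very few random splits, so it receives the lowest score and the -1 label.

import numpy as np

# hypothetical toy data: a dense cluster plus one isolated point
X_toy  = np.array([[0.0, 0.0], [0.1, 0.1], [0.0, 0.2], [0.2, 0.0], [5.0, 5.0]])

forest = sklearn.ensemble.IsolationForest(random_state=42).fit(X_toy)
forest.predict(X_toy)        # -1 marks anomalies, 1 marks inliers
forest.score_samples(X_toy)  # lower scores correspond to shorter average isolation paths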
outliers = sklearn.ensemble.IsolationForest(random_state=42).fit_predict([x for x,y in records])
records3 = project.remove_outliers (records, outliers)
splits3 = project.split_in_sets (records3, splitting_value=3900)
project.describe_sets(splits3)
Data reduced from 4710 to 4456 (-5.39%).
train_set => (0, 1358) (1, 1519) (2, 1023)
test_set  => (0, 181) (1, 230) (2, 145)