In [11]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [2]:
# Load the MFCC vowel feature table plus its labels; the helper returns the
# feature data, the label values and an encoded form of those labels.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/mfcc_ideology_features_vowel.csv'
)
In [ ]:
 
In [3]:
# Saved clustering metrics for the untransformed MFCC features
# (one row per clustering algorithm).
results_df = pd.read_csv(
    filepath_or_buffer='audio-results4/mfcc_ideology_features_vowel.csv'
)
results_df
Out[3]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.176755 0.119352 0.171616 0.091898 0.119699 0.743962 0.752967 7.673659e-01 0.615477 0.118797 281.626641 2.814327
1 Agglomerative clustering 3 0.079903 0.076257 0.146073 0.052101 0.076807 0.632785 0.523739 2.416853e-01 0.323306 0.058355 175.357120 3.188299
2 Birch 2222 0.000121 0.010006 0.969963 0.042058 0.080621 0.018950 0.000371 1.858922e-07 0.000450 0.096299 21.342075 0.408094
3 DBSCAN 2 0.180958 0.063216 0.078464 0.054194 0.064109 0.804238 0.768175 2.851439e-01 0.311101 0.094855 25.691531 4.815952
4 Mean-shift 5 0.057251 0.012966 0.010897 0.037353 0.016872 0.896843 0.890950 2.029430e-01 0.258124 0.142557 14.654892 1.837654
5 Optics 2 -0.048997 0.014243 0.012230 0.022057 0.015735 0.864426 0.020030 7.434944e-03 0.333333 -0.059086 40.819947 1.409906
6 Gaussian-mixture 2 0.334060 0.154053 0.167596 0.143292 0.154494 0.863835 0.876113 7.215050e-01 0.680041 0.167050 217.067298 2.371370
In [4]:
# First report the ground-truth label counts, then show the stored cluster
# assignments of every model on the raw feature space.
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/mfcc_ideology_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/mfcc_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1873, 1: 823})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 1481, 1: 952, 2: 263})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 2219, -1: 472, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2597, 0: 54, 1: 45})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2656, 1: 18, 2: 10, 3: 6, 4: 6})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2347, 1: 349})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [5]:
# Saved clustering metrics for the PCA-reduced features.
results_df = pd.read_csv(
    filepath_or_buffer='audio-results4/mfcc_ideology_features_vowel.csv-pca.csv'
)
results_df
Out[5]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.155123 0.104483 0.151801 0.080060 0.104832 0.732521 0.261869 0.247342 0.393134 0.242027 788.392316 1.596074
1 Agglomerative clustering 9 0.011751 0.052617 0.202452 0.030994 0.053758 0.327215 0.084941 0.022687 0.074246 0.182967 636.860795 1.160072
2 Birch 5 0.047063 0.067574 0.192135 0.041550 0.068325 0.470438 0.267804 0.073183 0.198157 0.243191 858.235353 1.210683
3 DBSCAN 4 0.100190 0.020548 0.022896 0.022982 0.022939 0.845591 0.830490 0.184965 0.181885 0.067034 29.831831 2.264263
4 Mean-shift 1 0.000000 0.000000 0.000000 1.000000 0.000000 0.903734 0.897997 0.500000 0.448999 -1.000000 -1.000000 -1.000000
5 Optics 2 -0.027636 0.004977 0.004633 0.012545 0.006767 0.880434 0.012982 0.004819 0.324074 -0.131291 22.262735 1.189385
6 Gaussian-mixture 2 0.343192 0.164539 0.183791 0.149634 0.164963 0.860668 0.126113 0.262006 0.318761 0.297817 654.289180 1.271646
In [6]:
# Reduce the features to their first three principal components and wrap the
# result in a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/mfcc_ideology_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/mfcc_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1841, 0: 855})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({6: 415, 1: 387, 4: 386, 0: 336, 2: 323, 7: 258, 3: 237, 5: 212, 8: 142})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({1: 791, 0: 727, 3: 555, 2: 418, 4: 205})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2462, -1: 214, 2: 8, 1: 8, 3: 4})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2636, 0: 36, 1: 24})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2696})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2319, 0: 377})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [7]:
# Saved clustering metrics for the t-SNE-embedded features.
results_df = pd.read_csv(
    filepath_or_buffer='audio-results4/mfcc_ideology_features_vowel.csv-tsne.csv'
)
results_df
Out[7]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.002231 0.027569 0.043287 0.020608 0.027923 0.640509 0.530786 6.355984e-01 0.549795 0.276062 1190.732170 1.424224
1 Agglomerative clustering 946 0.000166 0.020742 0.689402 0.033488 0.063874 0.028741 0.001855 2.183153e-06 0.001057 0.338952 509.090126 0.710083
2 Birch 2516 0.000015 0.003010 0.982832 0.041476 0.079594 0.007313 0.000371 1.641703e-07 0.000397 0.062620 698.040122 0.152386
3 DBSCAN 2 -0.017998 0.003884 0.003655 0.017502 0.006048 0.891716 0.005193 1.927578e-03 0.333333 0.151832 35.647993 0.698666
4 Mean-shift 1 0.000000 0.000000 0.000000 1.000000 0.000000 0.903734 0.897997 5.000000e-01 0.448999 -1.000000 -1.000000 -1.000000
5 Optics 2 -0.041887 0.008292 0.007939 0.012488 0.009707 0.858891 0.019288 1.038289e-02 0.347826 0.021934 143.828081 0.768521
6 Gaussian-mixture 2 -0.015230 0.024822 0.038591 0.018687 0.025181 0.641208 0.482196 6.246600e-01 0.546867 0.274317 1170.246510 1.406985
In [8]:
# Embed the features into three dimensions with t-SNE. The seed is fixed so the
# embedding (and therefore the plots) are reproducible across kernel restarts,
# mirroring the seeded UMAP projection used later in this notebook.
# NOTE(review): 42 is chosen to match the UMAP cell; confirm it matches the seed
# used when the '-tsne' cluster labels were generated.
tsne_transformed = pd.DataFrame(
    TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the stored '-tsne' cluster labels on the t-SNE embedding itself, the same
# way the PCA and UMAP sections pass their transformed frames. Passing the raw
# train_data here would draw the t-SNE results on untransformed features.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/mfcc_ideology_features_vowel.csv-tsne',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/mfcc_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1412, 0: 1284})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2666, 1: 16, 0: 14})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2578, 1: 69, 0: 49})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2696})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1563, 0: 1133})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [9]:
# Saved clustering metrics for the UMAP-embedded features.
results_df = pd.read_csv(
    filepath_or_buffer='audio-results4/mfcc_ideology_features_vowel.csv-umap.csv'
)
results_df
Out[9]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.008242 0.041304 0.064609 0.030732 0.041652 0.642465 0.452522 0.335768 0.439765 0.249237 971.999817 1.548162
1 Agglomerative clustering 2 -0.023517 0.004101 0.003227 0.011161 0.005006 0.883719 0.879451 0.491285 0.457800 0.483113 312.827278 0.425791
2 Birch 16 0.003879 0.048154 0.225895 0.028095 0.049974 0.254170 0.052300 0.005856 0.054223 0.328580 1177.586031 0.938904
3 DBSCAN 3 -0.030324 0.006861 0.006611 0.018130 0.009689 0.882122 0.007047 0.001962 0.250000 0.236802 79.744396 0.486467
4 Mean-shift 2 -0.022407 0.002177 0.002072 0.005545 0.003016 0.877168 0.873516 0.491204 0.469004 0.411536 319.466691 0.700326
5 Optics 4 0.009383 0.011823 0.013908 0.014588 0.014240 0.850294 0.012240 0.007239 0.172359 0.028590 185.346559 0.641321
6 Gaussian-mixture 2 -0.010271 0.034116 0.053078 0.025522 0.034470 0.640034 0.503338 0.646102 0.554396 0.247036 962.625306 1.534664
In [10]:
# Build a seeded three-component UMAP embedding and wrap it in a DataFrame
# before handing it to the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/mfcc_ideology_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/mfcc_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1391, 1: 1305})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2644, 1: 52})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({4: 309, 5: 269, 12: 253, 6: 252, 2: 208, 7: 187, 1: 183, 8: 183, 0: 161, 11: 144, 15: 144, 14: 141, 13: 117, 3: 78, 10: 52, 9: 15})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({-1: 2642, 0: 19, 2: 18, 1: 17})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2531, 3: 52, 1: 44, 0: 37, 2: 32})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2624, 1: 72})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1518, 0: 1178})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]: