In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [3]:
# Read the feature CSV through the project helper; it hands back the feature
# table plus the labels in two forms (as given, and integer-encoded).
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/formantsLpc_ideology_features_vowel.csv'
)
In [6]:
# Clustering scores that were saved earlier for the untransformed features;
# the bare name on the last line renders the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_ideology_features_vowel.csv'
)
results_df
Out[6]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.012170 0.002790 0.004765 0.002368 0.003164 0.653348 0.581231 0.455782 0.482766 0.481895 3114.307010 0.815215
1 Agglomerative clustering 2 -0.005666 0.004185 0.007017 0.003365 0.004549 0.640625 0.476261 0.555278 0.520504 0.435485 2723.768152 0.893687
2 Birch 2 -0.012142 0.002707 0.004637 0.002306 0.003081 0.653703 0.582344 0.456402 0.482988 0.481956 3114.126505 0.814417
3 DBSCAN 2 0.134967 0.031527 0.039376 0.027595 0.032449 0.810217 0.791543 0.312083 0.424601 0.113396 56.519707 6.896549
4 Mean-shift 2 -0.000656 -0.000495 0.000121 0.012092 0.000240 0.903332 0.897626 0.499793 0.448980 0.728738 21.539908 0.195975
5 Optics 3 0.013505 0.009407 0.008770 0.018743 0.011949 0.881034 0.008531 0.002375 0.230000 -0.309523 29.370155 1.031359
6 Gaussian-mixture 2 -0.012478 0.002556 0.004398 0.002198 0.002931 0.655473 0.412463 0.542318 0.516619 0.483368 3107.601233 0.812483
In [7]:
# First report how the true labels are distributed, then plot every clustering
# model's predicted labels over the untransformed features.
# NOTE(review): the helper appears to locate the saved label files from this
# path prefix (it prints "<prefix>_kmeans_labels.npy") -- confirm in
# audio_results_util.
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/formantsLpc_ideology_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formantsLpc_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1678, 1: 1018})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 1497, 0: 1199})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1681, 1: 1015})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2302, -1: 346, 1: 48})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2623, 0: 25, 1: 25, 2: 23})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2695, 1: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1697, 0: 999})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [8]:
# Clustering scores that were saved earlier for the PCA-transformed features;
# the bare name on the last line renders the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_ideology_features_vowel.csv-pca.csv'
)
results_df
Out[8]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.012170 0.002790 0.004765 0.002368 0.003164 0.653348 0.581231 0.455782 0.482766 0.481895 3114.307010 0.815215
1 Agglomerative clustering 2 -0.005666 0.004185 0.007017 0.003365 0.004549 0.640625 0.476261 0.555278 0.520504 0.435485 2723.768152 0.893687
2 Birch 2 -0.012142 0.002707 0.004637 0.002306 0.003081 0.653703 0.582344 0.456402 0.482988 0.481956 3114.126505 0.814417
3 DBSCAN 2 0.134967 0.031527 0.039376 0.027595 0.032449 0.810217 0.791543 0.312083 0.424601 0.113396 56.519707 6.896549
4 Mean-shift 2 -0.000656 -0.000495 0.000121 0.012092 0.000240 0.903332 0.897626 0.499793 0.448980 0.728738 21.539908 0.195975
5 Optics 3 0.013505 0.009407 0.008770 0.018743 0.011949 0.881034 0.008531 0.002375 0.230000 -0.309523 29.370155 1.031359
6 Gaussian-mixture 2 -0.012478 0.002556 0.004398 0.002198 0.002931 0.655473 0.587537 0.457682 0.483381 0.483368 3107.601233 0.812483
In [9]:
# Reduce the features to three principal components and wrap the result in a
# DataFrame, which is the form handed to the plotting helpers below.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# Report the true label distribution, then plot each model's predicted labels
# over the PCA projection.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formantsLpc_ideology_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formantsLpc_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1678, 1: 1018})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 1497, 0: 1199})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1681, 1: 1015})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2302, -1: 346, 1: 48})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2623, 0: 25, 1: 25, 2: 23})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2695, 1: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1697, 1: 999})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [10]:
# Clustering scores that were saved earlier for the t-SNE-transformed features;
# the bare name on the last line renders the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_ideology_features_vowel.csv-tsne.csv'
)
results_df
Out[10]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.008622 0.003445 0.005836 0.002830 0.003812 0.644415 0.547849 0.450088 0.481193 0.444881 2944.528386 0.885166
1 Agglomerative clustering 352 0.000298 0.023302 0.430500 0.024542 0.046437 0.051580 0.002967 0.000009 0.002841 0.460163 1937.635915 0.692352
2 Birch 394 0.000273 0.022433 0.441190 0.024903 0.047145 0.050769 0.001113 0.000003 0.002538 0.410229 1665.387597 0.761314
3 DBSCAN 147 0.000622 0.029299 0.308044 0.022254 0.041509 0.133925 0.001855 0.000014 0.005631 0.220414 128.340510 1.301686
4 Mean-shift 2 -0.006305 0.002953 0.005106 0.002458 0.003319 0.642111 0.535608 0.452943 0.482449 0.442392 2933.913082 0.894441
5 Optics 95 0.004613 0.028196 0.218664 0.021311 0.038837 0.367271 0.004451 0.000052 0.010417 -0.007130 37.335084 1.402520
6 Gaussian-mixture 2 -0.010619 0.004212 0.006986 0.003407 0.004580 0.646000 0.554525 0.445747 0.479402 0.445169 2929.943343 0.881765
In [11]:
# Embed the features in three dimensions with t-SNE.
# t-SNE is stochastic, so the seed is fixed (same value the UMAP cell uses) to
# make the embedding -- and therefore the plots -- repeatable across runs.
# NOTE(review): the saved cluster labels were produced elsewhere, presumably on
# a different t-SNE run; labels still align row-for-row, but the geometry drawn
# here may not be the exact embedding that was clustered -- confirm.
tsne_transformed = TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot over the t-SNE embedding, not the raw features: the original call passed
# `train_data`, which drew the t-SNE cluster labels on untransformed data and
# was inconsistent with the PCA and UMAP cells.
plotAllClusterModels(tsne_transformed, 'audio-results4/formantsLpc_ideology_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formantsLpc_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1572, 1: 1124})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1527, 1: 1169})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1600, 1: 1096})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [12]:
# Clustering scores that were saved earlier for the UMAP-transformed features;
# the bare name on the last line renders the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_ideology_features_vowel.csv-umap.csv'
)
results_df
Out[12]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.010862 0.005557 0.009056 0.004401 0.005924 0.644454 0.546736 0.438187 0.476642 0.566670 5545.817410 0.639787
1 Agglomerative clustering 2 -0.015646 0.004293 0.006995 0.003503 0.004668 0.655181 0.585682 0.446979 0.479117 0.552626 5101.374115 0.632492
2 Birch 195 0.000265 0.027975 0.353159 0.022605 0.042490 0.072028 0.004822 0.000044 0.004353 0.451341 4397.159607 0.756060
3 DBSCAN 104 -0.002623 0.033612 0.289709 0.023227 0.043006 0.141642 0.001855 0.000020 0.007937 0.249264 348.515469 1.293268
4 Mean-shift 2 -0.011332 0.004830 0.007925 0.003867 0.005198 0.645999 0.554154 0.442318 0.478081 0.566754 5535.568682 0.636081
5 Optics 103 -0.010207 0.026647 0.221455 0.020643 0.037766 0.332017 0.007789 0.000145 0.010385 0.056348 42.582956 1.336667
6 Gaussian-mixture 2 -0.012031 0.004469 0.007354 0.003605 0.004839 0.647692 0.438798 0.555370 0.521175 0.566125 5503.929018 0.632788
In [13]:
# Embed the features in three dimensions with UMAP (seeded for repeatability)
# and wrap the result in a DataFrame for the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)

# Report the true label distribution, then plot each model's predicted labels
# over the UMAP embedding.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formantsLpc_ideology_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formantsLpc_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1583, 1: 1113})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 1704, 1: 992})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1603, 1: 1093})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1624, 0: 1072})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: