In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [5]:
# Load the vowel formant features together with their ground-truth labels
# (both the original string labels and their integer-encoded form).
features_csv = 'Audio_features_vowel/formants20_ideologyFive_features_vowel.csv'
train_data, true_labels, true_encoded_labels = load_dataset(features_csv)
In [6]:
# Sanity-check the dimensions of the loaded feature table (rows, columns).
train_data.shape
Out[6]:
(2728, 3)
In [ ]:
 
In [7]:
# Per-algorithm clustering metrics that were saved for the untransformed features.
raw_results_csv = 'audio-results4/formants20_ideologyFive_features_vowel.csv'
results_df = pd.read_csv(raw_results_csv)
results_df
Out[7]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.300761 0.140981 0.163759 0.124379 0.141378 0.833877 0.849707 0.732422 0.661128 0.387974 880.436183 1.340712
1 Agglomerative clustering 3 0.156916 0.118266 0.203682 0.083871 0.118817 0.703897 0.648827 0.278166 0.333924 0.279226 835.139038 1.215189
2 Birch 2 -0.005223 0.001068 0.002124 0.001073 0.001425 0.639930 0.547654 0.469842 0.488021 0.236334 791.082485 1.588534
3 DBSCAN 2 0.340101 0.185181 0.242504 0.150660 0.185854 0.835735 0.809751 0.376784 0.595054 0.335520 216.939505 2.752643
4 Mean-shift 3 0.017095 0.004512 0.003706 0.029524 0.006586 0.893740 0.888196 0.337162 0.416465 0.493237 99.323554 0.683790
5 Optics 2 -0.026989 0.006172 0.005699 0.011684 0.007661 0.864712 0.017962 0.010660 0.362963 -0.147118 26.793214 1.299365
6 Gaussian-mixture 2 0.471211 0.267887 0.245859 0.295286 0.268315 0.908335 0.082845 0.265201 0.201394 0.455396 766.398381 0.973004
In [8]:
# Print the ground-truth label counts, then render every clustering model
# stored under this results prefix against the untransformed features.
raw_results_prefix = 'audio-results4/formants20_ideologyFive_features_vowel.csv'
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(train_data, raw_results_prefix)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants20_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2269, 1: 459})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 1842, 1: 596, 2: 290})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1571, 1: 1157})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2242, -1: 396, 1: 90})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2638, 0: 45, 1: 45})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2710, 1: 14, 2: 4})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2501, 0: 227})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [9]:
# Per-algorithm clustering metrics that were saved for the PCA-transformed features.
pca_results_csv = 'audio-results4/formants20_ideologyFive_features_vowel.csv-pca.csv'
results_df = pd.read_csv(pca_results_csv)
results_df
Out[9]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.300761 0.140981 0.163759 0.124379 0.141378 0.833877 0.849707 0.732422 0.661128 0.387974 880.436183 1.340712
1 Agglomerative clustering 3 0.156916 0.118266 0.203682 0.083871 0.118817 0.703897 0.648827 0.278166 0.333924 0.279226 835.139038 1.215189
2 Birch 2 -0.005223 0.001068 0.002124 0.001073 0.001425 0.639930 0.547654 0.469842 0.488021 0.236334 791.082485 1.588534
3 DBSCAN 2 0.111437 0.027330 0.025366 0.032794 0.028606 0.856268 0.840909 0.314548 0.300931 0.321374 74.295522 2.695492
4 Mean-shift 3 0.017095 0.004512 0.003706 0.029524 0.006586 0.893740 0.888196 0.337162 0.416465 0.493237 99.323554 0.683790
5 Optics 2 -0.026989 0.006172 0.005699 0.011684 0.007661 0.864712 0.017962 0.010660 0.362963 -0.147118 26.793214 1.299365
6 Gaussian-mixture 2 0.471211 0.267887 0.245859 0.295286 0.268315 0.908335 0.082845 0.265201 0.201394 0.455396 766.398381 0.973004
In [10]:
# Project the features onto three principal components and wrap the result
# in a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# Ground-truth label counts, followed by the plots of every saved clustering
# model in PCA space.
pca_results_prefix = 'audio-results4/formants20_ideologyFive_features_vowel.csv-pca'
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(pca_transformed, pca_results_prefix)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants20_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2269, 1: 459})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 1842, 1: 596, 2: 290})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1571, 1: 1157})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2541, -1: 175, 1: 12})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2638, 0: 45, 1: 45})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2710, 1: 14, 2: 4})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2501, 0: 227})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [11]:
# Per-algorithm clustering metrics that were saved for the t-SNE-transformed features.
tsne_results_csv = 'audio-results4/formants20_ideologyFive_features_vowel.csv-tsne.csv'
results_df = pd.read_csv(tsne_results_csv)
results_df
Out[11]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.012586 2.284908e-02 3.494622e-02 0.017358 2.319508e-02 0.639806 0.442815 0.377028 0.452250 0.309422 1425.379819 1.308461
1 Agglomerative clustering 360 0.000559 3.545468e-02 5.237562e-01 0.031060 5.864210e-02 0.052862 0.004399 0.000014 0.002778 0.446850 1426.563015 0.704475
2 Birch 406 0.000568 3.586133e-02 5.471541e-01 0.032155 6.073958e-02 0.052789 0.002199 0.000006 0.002463 0.376367 1159.536412 0.805643
3 DBSCAN 144 0.001981 4.676292e-02 4.079127e-01 0.031591 5.864086e-02 0.158161 0.013930 0.000149 0.007315 0.225238 105.618549 1.291047
4 Mean-shift 1 0.000000 1.769098e-15 3.226002e-16 1.000000 6.452004e-16 0.897715 0.891129 0.500000 0.445565 -1.000000 -1.000000 -1.000000
5 Optics 92 -0.013331 3.350913e-02 2.323712e-01 0.024127 4.371487e-02 0.357460 0.009897 0.000119 0.010753 -0.010777 36.523732 1.391599
6 Gaussian-mixture 2 -0.028934 1.500635e-02 2.219430e-02 0.011756 1.537034e-02 0.650912 0.579179 0.409205 0.461468 0.290825 1291.710026 1.262055
In [12]:
# Embed the features into three dimensions with t-SNE.
# random_state is fixed so the embedding (and therefore the figures) is
# reproducible on Restart & Run All, matching the seeded UMAP cell.
# NOTE(review): the saved "-tsne" cluster labels were presumably produced on a
# separate t-SNE run; confirm that run used the same seed if exact point
# positions must match.
tsne_transformed = TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the saved clusterings in t-SNE space. The previous version passed
# `train_data` here, so the "t-SNE" figures were actually drawn on the raw
# features, unlike the PCA and UMAP sections.
plotAllClusterModels(tsne_transformed, 'audio-results4/formants20_ideologyFive_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants20_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1397, 0: 1331})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1763, 1: 965})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [13]:
# Per-algorithm clustering metrics that were saved for the UMAP-transformed features.
umap_results_csv = 'audio-results4/formants20_ideologyFive_features_vowel.csv-umap.csv'
results_df = pd.read_csv(umap_results_csv)
results_df
Out[13]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.010745 2.244230e-02 3.434028e-02 0.017052 2.278838e-02 0.638952 0.552419 0.621776 0.547266 0.315351 1413.659984 1.316540
1 Agglomerative clustering 109 0.001602 4.744199e-02 4.028458e-01 0.030031 5.589539e-02 0.097724 0.005865 0.000115 0.005357 0.460941 1970.343583 0.729527
2 Birch 133 0.001501 4.572877e-02 4.133311e-01 0.029970 5.588820e-02 0.093786 0.010630 0.000090 0.007034 0.439560 1814.399950 0.786641
3 DBSCAN 129 0.004443 4.736903e-02 3.922633e-01 0.031534 5.837439e-02 0.173874 0.030425 0.000331 0.007995 0.299660 147.631575 1.164871
4 Mean-shift 1 0.000000 1.769098e-15 3.226002e-16 1.000000 6.452004e-16 0.897715 0.891129 0.500000 0.445565 -1.000000 -1.000000 -1.000000
5 Optics 98 -0.001702 3.402930e-02 2.533286e-01 0.024120 4.404638e-02 0.310941 0.009897 0.000112 0.010101 0.115397 51.602661 1.190559
6 Gaussian-mixture 2 -0.000658 1.493802e-02 2.300673e-02 0.011447 1.528721e-02 0.635315 0.517595 0.599281 0.538639 0.305477 1327.132406 1.353205
In [14]:
# Reduce the features to three dimensions with a seeded UMAP model and wrap
# the embedding in a DataFrame for the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)

# Ground-truth label counts, followed by the plots of every saved clustering
# model in UMAP space.
umap_results_prefix = 'audio-results4/formants20_ideologyFive_features_vowel.csv-umap'
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(umap_transformed, umap_results_prefix)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants20_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1382, 1: 1346})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1437, 0: 1291})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: