import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the vowel formant features along with the ground-truth labels and
# their encoded form (three values, as returned by load_dataset).
features_csv = 'Audio_features_vowel/formants50_ideologyFive_features_vowel.csv'
train_data, true_labels, true_encoded_labels = load_dataset(features_csv)

# Notebook display: dimensions of the loaded feature table.
train_data.shape

(2728, 3)

# Read the results table stored for the untransformed features; the bare
# name on the last line makes the notebook render it.
results_df = pd.read_csv(
    'audio-results4/formants50_ideologyFive_features_vowel.csv'
)
results_df

# Report the ground-truth label distribution, then plot every clustering
# model for the untransformed features. The second argument is the path
# prefix handed to plotAllClusterModels for this feature set.
raw_results_prefix = 'audio-results4/formants50_ideologyFive_features_vowel.csv'
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(train_data, raw_results_prefix)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/formants50_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1970, 0: 758})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({1: 1818, 0: 910})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 1716, 2: 512, 1: 383, 5: 65, 4: 26, 6: 7, 9: 7, 3: 6, 7: 5, 8: 1})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2638, -1: 84, 1: 6})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2626, 1: 50, 2: 27, 0: 25})
 2D representation

PCA transformed data

# Read the results table stored for the PCA-transformed features; the bare
# name on the last line makes the notebook render it.
results_df = pd.read_csv(
    'audio-results4/formants50_ideologyFive_features_vowel.csv-pca.csv'
)
results_df

# Project the features onto three principal components and wrap the result
# in a DataFrame before handing it to the plotting helpers.
pca_model = PCA(n_components=3)
pca_transformed = pd.DataFrame(pca_model.fit_transform(train_data))

# Ground-truth distribution and per-model cluster plots in PCA space.
pca_results_prefix = 'audio-results4/formants50_ideologyFive_features_vowel.csv-pca'
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(pca_transformed, pca_results_prefix)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/formants50_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1970, 1: 758})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({1: 1818, 0: 910})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 1716, 2: 512, 1: 383, 5: 65, 4: 26, 6: 7, 9: 7, 3: 6, 7: 5, 8: 1})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2638, -1: 84, 1: 6})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2661, 1: 41, 0: 26})
 2D representation

t-SNE transformed data

# Read the results table stored for the t-SNE-transformed features; the
# bare name on the last line makes the notebook render it.
results_df = pd.read_csv(
    'audio-results4/formants50_ideologyFive_features_vowel.csv-tsne.csv'
)
results_df

# Embed the features into three dimensions with t-SNE and wrap the result
# in a DataFrame before handing it to the plotting helpers.
# NOTE(review): no random_state is passed to TSNE, so the embedding may
# differ from run to run -- confirm whether a fixed seed is wanted here
# (the UMAP section of this notebook uses random_state=42).
tsne_transformed = TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the cluster models on the t-SNE embedding. The original passed the
# raw `train_data` here, so the embedding computed above was never plotted;
# the PCA and UMAP sections both pass their transformed frame.
plotAllClusterModels(tsne_transformed, 'audio-results4/formants50_ideologyFive_features_vowel.csv-tsne')

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/formants50_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1382, 1: 1346})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({1: 1366, 0: 1362})
 2D representation

 3D representation

UMAP transformed data

# Read the results table stored for the UMAP-transformed features; the
# bare name on the last line makes the notebook render it.
results_df = pd.read_csv(
    'audio-results4/formants50_ideologyFive_features_vowel.csv-umap.csv'
)
results_df

# Embed the features into three dimensions with UMAP (seeded with 42) and
# wrap the result in a DataFrame before handing it to the plotting helpers.
umap_model = umap.UMAP(random_state=42, n_components=3)
umap_transformed = pd.DataFrame(umap_model.fit_transform(train_data))

# Ground-truth distribution and per-model cluster plots in UMAP space.
umap_results_prefix = 'audio-results4/formants50_ideologyFive_features_vowel.csv-umap'
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(umap_transformed, umap_results_prefix)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/formants50_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1477, 1: 1251})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1412, 2: 678, 1: 638})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({0: 1553, 1: 1175})
 2D representation

 3D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.162921	0.083682	0.114173	0.066494	0.084042	0.746098	0.243035	0.285624	0.396347	0.367415	1298.362170	1.208018
1	Agglomerative clustering	2	0.094663	0.049846	0.071538	0.038669	0.050202	0.701909	0.302419	0.326333	0.424208	0.322612	1086.645930	1.381837
2	Birch	10	0.154210	0.112260	0.235570	0.075545	0.114402	0.670706	0.652859	0.115823	0.132994	0.279821	377.914211	0.926925
3	DBSCAN	2	0.076064	0.018255	0.014309	0.032170	0.019807	0.878743	0.869135	0.329047	0.521312	0.453904	66.295195	2.345823
4	Mean-shift	3	0.037534	0.013681	0.008824	0.063861	0.015506	0.895111	0.890029	0.341789	0.455764	0.655355	138.324327	1.130572
5	Optics	3	-0.041314	0.007208	0.007396	0.012813	0.009379	0.858674	0.009531	0.003413	0.255000	-0.289585	14.144455	1.768938
6	Gaussian-mixture	2	0.439454	0.236987	0.246897	0.228568	0.237379	0.885347	0.898460	0.765689	0.739907	0.398010	849.129885	1.020409

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.162921	0.083682	0.114173	0.066494	0.084042	0.746098	0.756965	0.714376	0.603653	0.367415	1298.362170	1.208018
1	Agglomerative clustering	2	0.094663	0.049846	0.071538	0.038669	0.050202	0.701909	0.302419	0.326333	0.424208	0.322612	1086.645930	1.381837
2	Birch	10	0.154210	0.112260	0.235570	0.075545	0.114402	0.670706	0.652859	0.115823	0.132994	0.279821	377.914211	0.926925
3	DBSCAN	2	0.076064	0.018255	0.014309	0.032170	0.019807	0.878743	0.869135	0.329047	0.521312	0.453904	66.295195	2.345823
4	Mean-shift	3	0.037534	0.013681	0.008824	0.063861	0.015506	0.895111	0.890029	0.341789	0.455764	0.655355	138.324327	1.130572
5	Optics	2	-0.035864	0.010416	0.008335	0.021781	0.012056	0.871227	0.009531	0.003565	0.333333	-0.269624	13.069045	1.890882
6	Gaussian-mixture	2	0.439454	0.236987	0.246897	0.228568	0.237379	0.885347	0.101540	0.234311	0.260093	0.398010	849.129885	1.020409

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.014388	4.549567e-02	6.906758e-02	0.034296	4.583358e-02	0.640387	0.439516	0.330842	0.434343	0.354279	1912.690310	1.136426
1	Agglomerative clustering	358	0.000592	3.695327e-02	5.347511e-01	0.031767	5.997205e-02	0.053418	0.004032	0.000013	0.002793	0.454193	1533.236588	0.693419
2	Birch	594	0.000420	3.026813e-02	5.931972e-01	0.032925	6.238728e-02	0.044466	0.001466	0.000003	0.001684	0.396891	1430.322224	0.706424
3	DBSCAN	153	0.003122	4.581603e-02	4.056078e-01	0.031539	5.852795e-02	0.172748	0.008798	0.000064	0.006494	0.190485	80.010843	1.370363
4	Mean-shift	1	0.000000	1.769098e-15	3.226002e-16	1.000000	6.452004e-16	0.897715	0.891129	0.500000	0.445565	-1.000000	-1.000000	-1.000000
5	Optics	84	0.001440	4.571749e-02	2.750131e-01	0.030932	5.560957e-02	0.414886	0.008798	0.000116	0.011765	-0.050016	40.184781	1.213255
6	Gaussian-mixture	2	0.017468	4.538564e-02	6.890751e-02	0.034213	4.572357e-02	0.641547	0.433651	0.330507	0.434224	0.353929	1910.211254	1.137232

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.044875	0.064390	0.097216	0.048508	0.064722	0.654386	0.610704	0.701769	0.578842	0.383672	2225.008496	1.052620
1	Agglomerative clustering	111	0.001750	0.047170	0.403826	0.029939	0.055745	0.096983	0.017962	0.000182	0.008829	0.462216	2516.507760	0.727847
2	Birch	149	0.001229	0.044461	0.417877	0.029800	0.055633	0.089712	0.009531	0.000111	0.007056	0.440378	2281.101949	0.803075
3	DBSCAN	145	0.004898	0.043978	0.384154	0.030303	0.056174	0.185189	0.009164	0.000111	0.007376	0.297034	109.185261	1.283596
4	Mean-shift	3	-0.037575	0.031890	0.064547	0.021636	0.032409	0.537121	0.439150	0.189883	0.290260	0.352382	1867.449109	0.933422
5	Optics	99	-0.002192	0.042330	0.310749	0.028332	0.051929	0.275384	0.008798	0.000099	0.010000	0.149084	65.994400	1.200915
6	Gaussian-mixture	2	0.060041	0.065465	0.098242	0.049464	0.065799	0.663889	0.633431	0.704176	0.580786	0.383712	2209.417548	1.044670