import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the vowel formant feature table along with its labels and their
# encoded counterparts (as returned by the project helper load_dataset).
features_csv = 'Audio_features_vowel/formants20_ideologyFive_features_vowel.csv'
train_data, true_labels, true_encoded_labels = load_dataset(features_csv)

# Display the dimensions of the loaded feature table.
train_data.shape

(2728, 3)

# The same path serves as the metrics CSV and as the prefix handed to the
# plotting helper for the untransformed features.
raw_results_path = 'audio-results4/formants20_ideologyFive_features_vowel.csv'

# Metrics table for the clusterings of the untransformed features.
results_df = pd.read_csv(raw_results_path)
results_df

# Print the true label distribution, then plot every clustering model
# (presumably from label files stored under the given prefix -- verify in
# audio_results_util).
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(train_data, raw_results_path)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/formants20_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2269, 1: 459})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 1842, 1: 596, 2: 290})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 1571, 1: 1157})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2242, -1: 396, 1: 90})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2638, 0: 45, 1: 45})
 2D representation

PCA transformed data

# Metrics table for the clusterings associated with the PCA results file.
results_df = pd.read_csv(
    'audio-results4/formants20_ideologyFive_features_vowel.csv-pca.csv'
)
results_df

# Project the features onto 3 principal components and wrap the result in a
# DataFrame before handing it to the plotting helpers.
pca_model = PCA(n_components=3)
pca_transformed = pd.DataFrame(pca_model.fit_transform(train_data))
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formants20_ideologyFive_features_vowel.csv-pca',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/formants20_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2269, 1: 459})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 1842, 1: 596, 2: 290})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 1571, 1: 1157})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2541, -1: 175, 1: 12})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2638, 0: 45, 1: 45})
 2D representation

t-SNE transformed data

# Metrics table for the clusterings associated with the t-SNE results file.
results_df=pd.read_csv('audio-results4/formants20_ideologyFive_features_vowel.csv-tsne.csv')
results_df

# Embed the features into 3 t-SNE dimensions, using all available cores (n_jobs=-1).
# NOTE(review): no random_state is set here, so the embedding differs between
# runs; consider fixing a seed if the plots need to be reproducible.
tsne_transformed=TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed=pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed,true_labels,true_encoded_labels)

# Plot the cluster models on the t-SNE embedding. The original passed the raw
# train_data here, which drew the t-SNE labels on the untransformed features,
# unlike the PCA and UMAP sections, which plot their transformed frames.
plotAllClusterModels(tsne_transformed,'audio-results4/formants20_ideologyFive_features_vowel.csv-tsne')

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/formants20_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1397, 0: 1331})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({0: 1763, 1: 965})
 2D representation

 3D representation

UMAP transformed data

# Metrics table for the clusterings associated with the UMAP results file.
results_df = pd.read_csv(
    'audio-results4/formants20_ideologyFive_features_vowel.csv-umap.csv'
)
results_df

# Reduce the features to 3 UMAP dimensions with a fixed seed, then wrap the
# embedding in a DataFrame for the plotting helpers.
umap_reducer = umap.UMAP(n_components=3, random_state=42)
umap_transformed = pd.DataFrame(umap_reducer.fit_transform(train_data))
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formants20_ideologyFive_features_vowel.csv-umap',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/formants20_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1382, 1: 1346})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({1: 1437, 0: 1291})
 2D representation

 3D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.300761	0.140981	0.163759	0.124379	0.141378	0.833877	0.849707	0.732422	0.661128	0.387974	880.436183	1.340712
1	Agglomerative clustering	3	0.156916	0.118266	0.203682	0.083871	0.118817	0.703897	0.648827	0.278166	0.333924	0.279226	835.139038	1.215189
2	Birch	2	-0.005223	0.001068	0.002124	0.001073	0.001425	0.639930	0.547654	0.469842	0.488021	0.236334	791.082485	1.588534
3	DBSCAN	2	0.340101	0.185181	0.242504	0.150660	0.185854	0.835735	0.809751	0.376784	0.595054	0.335520	216.939505	2.752643
4	Mean-shift	3	0.017095	0.004512	0.003706	0.029524	0.006586	0.893740	0.888196	0.337162	0.416465	0.493237	99.323554	0.683790
5	Optics	2	-0.026989	0.006172	0.005699	0.011684	0.007661	0.864712	0.017962	0.010660	0.362963	-0.147118	26.793214	1.299365
6	Gaussian-mixture	2	0.471211	0.267887	0.245859	0.295286	0.268315	0.908335	0.082845	0.265201	0.201394	0.455396	766.398381	0.973004

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.300761	0.140981	0.163759	0.124379	0.141378	0.833877	0.849707	0.732422	0.661128	0.387974	880.436183	1.340712
1	Agglomerative clustering	3	0.156916	0.118266	0.203682	0.083871	0.118817	0.703897	0.648827	0.278166	0.333924	0.279226	835.139038	1.215189
2	Birch	2	-0.005223	0.001068	0.002124	0.001073	0.001425	0.639930	0.547654	0.469842	0.488021	0.236334	791.082485	1.588534
3	DBSCAN	2	0.111437	0.027330	0.025366	0.032794	0.028606	0.856268	0.840909	0.314548	0.300931	0.321374	74.295522	2.695492
4	Mean-shift	3	0.017095	0.004512	0.003706	0.029524	0.006586	0.893740	0.888196	0.337162	0.416465	0.493237	99.323554	0.683790
5	Optics	2	-0.026989	0.006172	0.005699	0.011684	0.007661	0.864712	0.017962	0.010660	0.362963	-0.147118	26.793214	1.299365
6	Gaussian-mixture	2	0.471211	0.267887	0.245859	0.295286	0.268315	0.908335	0.082845	0.265201	0.201394	0.455396	766.398381	0.973004

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.012586	2.284908e-02	3.494622e-02	0.017358	2.319508e-02	0.639806	0.442815	0.377028	0.452250	0.309422	1425.379819	1.308461
1	Agglomerative clustering	360	0.000559	3.545468e-02	5.237562e-01	0.031060	5.864210e-02	0.052862	0.004399	0.000014	0.002778	0.446850	1426.563015	0.704475
2	Birch	406	0.000568	3.586133e-02	5.471541e-01	0.032155	6.073958e-02	0.052789	0.002199	0.000006	0.002463	0.376367	1159.536412	0.805643
3	DBSCAN	144	0.001981	4.676292e-02	4.079127e-01	0.031591	5.864086e-02	0.158161	0.013930	0.000149	0.007315	0.225238	105.618549	1.291047
4	Mean-shift	1	0.000000	1.769098e-15	3.226002e-16	1.000000	6.452004e-16	0.897715	0.891129	0.500000	0.445565	-1.000000	-1.000000	-1.000000
5	Optics	92	-0.013331	3.350913e-02	2.323712e-01	0.024127	4.371487e-02	0.357460	0.009897	0.000119	0.010753	-0.010777	36.523732	1.391599
6	Gaussian-mixture	2	-0.028934	1.500635e-02	2.219430e-02	0.011756	1.537034e-02	0.650912	0.579179	0.409205	0.461468	0.290825	1291.710026	1.262055

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.010745	2.244230e-02	3.434028e-02	0.017052	2.278838e-02	0.638952	0.552419	0.621776	0.547266	0.315351	1413.659984	1.316540
1	Agglomerative clustering	109	0.001602	4.744199e-02	4.028458e-01	0.030031	5.589539e-02	0.097724	0.005865	0.000115	0.005357	0.460941	1970.343583	0.729527
2	Birch	133	0.001501	4.572877e-02	4.133311e-01	0.029970	5.588820e-02	0.093786	0.010630	0.000090	0.007034	0.439560	1814.399950	0.786641
3	DBSCAN	129	0.004443	4.736903e-02	3.922633e-01	0.031534	5.837439e-02	0.173874	0.030425	0.000331	0.007995	0.299660	147.631575	1.164871
4	Mean-shift	1	0.000000	1.769098e-15	3.226002e-16	1.000000	6.452004e-16	0.897715	0.891129	0.500000	0.445565	-1.000000	-1.000000	-1.000000
5	Optics	98	-0.001702	3.402930e-02	2.533286e-01	0.024120	4.404638e-02	0.310941	0.009897	0.000112	0.010101	0.115397	51.602661	1.190559
6	Gaussian-mixture	2	-0.000658	1.493802e-02	2.300673e-02	0.011447	1.528721e-02	0.635315	0.517595	0.599281	0.538639	0.305477	1327.132406	1.353205