import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
# Import the required libraries
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the feature table plus its labels; judging by the names, these are the
# original string labels and an integer-encoded copy (confirm in
# audio_results_util.load_dataset).
(
    train_data,
    true_labels,
    true_encoded_labels,
) = load_dataset(
    'Audio_features_vowel/formantsLpc_ideology_features_vowel.csv'
)

# Previously saved per-model results table for the untransformed features.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_ideology_features_vowel.csv'
)
results_df

# Show the label distribution first, then the plots for every clustering
# model whose results are stored under the given path prefix.
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/formantsLpc_ideology_features_vowel.csv',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})

audio-results4/formantsLpc_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1678, 1: 1018})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({1: 1497, 0: 1199})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 1681, 1: 1015})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2302, -1: 346, 1: 48})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2623, 0: 25, 1: 25, 2: 23})
 2D representation

PCA transformed data

# Previously saved per-model results table for the PCA-reduced features.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_ideology_features_vowel.csv-pca.csv'
)
results_df

# Project the features onto their first three principal components and wrap
# the result in a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formantsLpc_ideology_features_vowel.csv-pca',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})

audio-results4/formantsLpc_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1678, 1: 1018})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({1: 1497, 0: 1199})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 1681, 1: 1015})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2302, -1: 346, 1: 48})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2623, 0: 25, 1: 25, 2: 23})
 2D representation

t-SNE transformed data

# Previously saved per-model results table for the t-SNE-reduced features.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_ideology_features_vowel.csv-tsne.csv'
)
results_df

# Embed the features into three dimensions with t-SNE. t-SNE is stochastic,
# so the seed is fixed (same value as the UMAP section) to make the plots
# reproducible between runs.
tsne_transformed = TSNE(
    n_components=3,
    n_jobs=-1,
    random_state=42,
).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the clusters on the t-SNE embedding rather than on the raw features,
# so this section is consistent with the PCA and UMAP sections, which both
# pass their transformed data to plotAllClusterModels.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/formantsLpc_ideology_features_vowel.csv-tsne',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})

audio-results4/formantsLpc_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1572, 1: 1124})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1527, 1: 1169})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({0: 1600, 1: 1096})
 2D representation

 3D representation

UMAP transformed data

# Previously saved per-model results table for the UMAP-reduced features.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_ideology_features_vowel.csv-umap.csv'
)
results_df

# Reduce the features to three UMAP components (seeded with 42) and wrap the
# result in a DataFrame before handing it to the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formantsLpc_ideology_features_vowel.csv-umap',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})

audio-results4/formantsLpc_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1583, 1: 1113})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 1704, 1: 992})
 2D representation

 3D representation

Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1603, 1: 1093})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({1: 1624, 0: 1072})
 2D representation

 3D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	-0.012170	0.002790	0.004765	0.002368	0.003164	0.653348	0.581231	0.455782	0.482766	0.481895	3114.307010	0.815215
1	Agglomerative clustering	2	-0.005666	0.004185	0.007017	0.003365	0.004549	0.640625	0.476261	0.555278	0.520504	0.435485	2723.768152	0.893687
2	Birch	2	-0.012142	0.002707	0.004637	0.002306	0.003081	0.653703	0.582344	0.456402	0.482988	0.481956	3114.126505	0.814417
3	DBSCAN	2	0.134967	0.031527	0.039376	0.027595	0.032449	0.810217	0.791543	0.312083	0.424601	0.113396	56.519707	6.896549
4	Mean-shift	2	-0.000656	-0.000495	0.000121	0.012092	0.000240	0.903332	0.897626	0.499793	0.448980	0.728738	21.539908	0.195975
5	Optics	3	0.013505	0.009407	0.008770	0.018743	0.011949	0.881034	0.008531	0.002375	0.230000	-0.309523	29.370155	1.031359
6	Gaussian-mixture	2	-0.012478	0.002556	0.004398	0.002198	0.002931	0.655473	0.412463	0.542318	0.516619	0.483368	3107.601233	0.812483

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	-0.012170	0.002790	0.004765	0.002368	0.003164	0.653348	0.581231	0.455782	0.482766	0.481895	3114.307010	0.815215
1	Agglomerative clustering	2	-0.005666	0.004185	0.007017	0.003365	0.004549	0.640625	0.476261	0.555278	0.520504	0.435485	2723.768152	0.893687
2	Birch	2	-0.012142	0.002707	0.004637	0.002306	0.003081	0.653703	0.582344	0.456402	0.482988	0.481956	3114.126505	0.814417
3	DBSCAN	2	0.134967	0.031527	0.039376	0.027595	0.032449	0.810217	0.791543	0.312083	0.424601	0.113396	56.519707	6.896549
4	Mean-shift	2	-0.000656	-0.000495	0.000121	0.012092	0.000240	0.903332	0.897626	0.499793	0.448980	0.728738	21.539908	0.195975
5	Optics	3	0.013505	0.009407	0.008770	0.018743	0.011949	0.881034	0.008531	0.002375	0.230000	-0.309523	29.370155	1.031359
6	Gaussian-mixture	2	-0.012478	0.002556	0.004398	0.002198	0.002931	0.655473	0.587537	0.457682	0.483381	0.483368	3107.601233	0.812483

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	-0.008622	0.003445	0.005836	0.002830	0.003812	0.644415	0.547849	0.450088	0.481193	0.444881	2944.528386	0.885166
1	Agglomerative clustering	352	0.000298	0.023302	0.430500	0.024542	0.046437	0.051580	0.002967	0.000009	0.002841	0.460163	1937.635915	0.692352
2	Birch	394	0.000273	0.022433	0.441190	0.024903	0.047145	0.050769	0.001113	0.000003	0.002538	0.410229	1665.387597	0.761314
3	DBSCAN	147	0.000622	0.029299	0.308044	0.022254	0.041509	0.133925	0.001855	0.000014	0.005631	0.220414	128.340510	1.301686
4	Mean-shift	2	-0.006305	0.002953	0.005106	0.002458	0.003319	0.642111	0.535608	0.452943	0.482449	0.442392	2933.913082	0.894441
5	Optics	95	0.004613	0.028196	0.218664	0.021311	0.038837	0.367271	0.004451	0.000052	0.010417	-0.007130	37.335084	1.402520
6	Gaussian-mixture	2	-0.010619	0.004212	0.006986	0.003407	0.004580	0.646000	0.554525	0.445747	0.479402	0.445169	2929.943343	0.881765

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	-0.010862	0.005557	0.009056	0.004401	0.005924	0.644454	0.546736	0.438187	0.476642	0.566670	5545.817410	0.639787
1	Agglomerative clustering	2	-0.015646	0.004293	0.006995	0.003503	0.004668	0.655181	0.585682	0.446979	0.479117	0.552626	5101.374115	0.632492
2	Birch	195	0.000265	0.027975	0.353159	0.022605	0.042490	0.072028	0.004822	0.000044	0.004353	0.451341	4397.159607	0.756060
3	DBSCAN	104	-0.002623	0.033612	0.289709	0.023227	0.043006	0.141642	0.001855	0.000020	0.007937	0.249264	348.515469	1.293268
4	Mean-shift	2	-0.011332	0.004830	0.007925	0.003867	0.005198	0.645999	0.554154	0.442318	0.478081	0.566754	5535.568682	0.636081
5	Optics	103	-0.010207	0.026647	0.221455	0.020643	0.037766	0.332017	0.007789	0.000145	0.010385	0.056348	42.582956	1.336667
6	Gaussian-mixture	2	-0.012031	0.004469	0.007354	0.003605	0.004839	0.647692	0.438798	0.555370	0.521175	0.566125	5503.929018	0.632788