import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the chroma vowel features; the helper hands back the feature table,
# the raw labels and their encoded counterparts in that order.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/chroma_ideology_features_vowel.csv'
)

# Display the dimensions of the loaded feature table.
train_data.shape

(2696, 12)

# Read the stored clustering metrics for the untransformed features and
# leave the frame as the last expression so the notebook renders it.
results_df = pd.read_csv(
    'audio-results4/chroma_ideology_features_vowel.csv'
)
results_df

# Report the ground-truth label distribution for the raw features.
actualDistribution(train_data, true_labels, true_encoded_labels)

# Plot each stored clustering result for the raw features; the second
# argument is the path prefix shared by the saved result files
# (the recorded output shows "<prefix>_kmeans_labels.npy" being read).
plotAllClusterModels(
    train_data,
    'audio-results4/chroma_ideology_features_vowel.csv',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})

audio-results4/chroma_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1369, 1: 1327})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 982, 1: 461, 3: 388, 2: 348, 4: 289, 5: 228})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({5: 588, 1: 509, 0: 438, 4: 437, 3: 410, 2: 314})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2295, -1: 396, 1: 5})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2654, 1: 21, 0: 21})
 2D representation

PCA transformed data

# Read the stored clustering metrics for the PCA-transformed features and
# leave the frame as the last expression so the notebook renders it.
results_df = pd.read_csv(
    'audio-results4/chroma_ideology_features_vowel.csv-pca.csv'
)
results_df

# Project the features onto their first three principal components and wrap
# the resulting array in a DataFrame for the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# Report the ground-truth label distribution, then plot each stored "-pca"
# clustering result on the PCA projection.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/chroma_ideology_features_vowel.csv-pca',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})

audio-results4/chroma_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1385, 0: 1311})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({3: 910, 2: 637, 1: 594, 0: 555})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 837, 3: 646, 1: 621, 2: 592})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2487, -1: 202, 1: 7})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2539, 1: 22, 7: 17, 2: 15, 0: 15, 6: 14, 4: 13, 10: 11, 8: 10, 11: 10, 9: 10, 5: 10, 3: 10})
 2D representation

t-SNE transformed data

# Read the stored clustering metrics for the t-SNE-transformed features and
# leave the frame as the last expression so the notebook renders it.
results_df = pd.read_csv(
    'audio-results4/chroma_ideology_features_vowel.csv-tsne.csv'
)
results_df

# Embed the features into three dimensions with t-SNE. The seed is fixed
# (the UMAP cell likewise uses random_state=42) so that re-running the
# notebook reproduces the same embedding and therefore the same plots.
tsne_transformed = TSNE(
    n_components=3, n_jobs=-1, random_state=42
).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)

# Report the ground-truth label distribution for the embedded data.
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot each stored "-tsne" clustering result on the t-SNE embedding itself.
# Passing the untransformed train_data here would draw those results on the
# wrong coordinates; the PCA and UMAP cells pass their own embeddings.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/chroma_ideology_features_vowel.csv-tsne',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})

audio-results4/chroma_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1360, 1: 1336})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
predicted_labels--> Counter({-1: 2553, 0: 113, 1: 30})
 2D representation

 3D representation

MEAN-SHIFT
predicted_labels--> Counter({0: 1531, 1: 1165})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({1: 1669, 0: 1027})
 2D representation

 3D representation

UMAP transformed data

# Read the stored clustering metrics for the UMAP-transformed features and
# leave the frame as the last expression so the notebook renders it.
results_df = pd.read_csv(
    'audio-results4/chroma_ideology_features_vowel.csv-umap.csv'
)
results_df

# Embed the features into three dimensions with UMAP (seeded for
# reproducibility) and wrap the resulting array in a DataFrame for the
# plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(n_components=3, random_state=42).fit_transform(train_data)
)

# Report the ground-truth label distribution, then plot each stored "-umap"
# clustering result on the UMAP embedding.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/chroma_ideology_features_vowel.csv-umap',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})

audio-results4/chroma_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1398, 0: 1298})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 1118, 2: 1017, 1: 561})
 2D representation

 3D representation

Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({1: 1063, 2: 867, 0: 766})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({1: 1356, 0: 1340})
 2D representation

 3D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.000142	-0.000176	0.000290	0.000138	0.000187	0.639051	0.510386	0.511424	0.504187	0.155323	504.527175	2.214680
1	Agglomerative clustering	6	0.005948	-0.000137	0.002417	0.000479	0.000800	0.426397	0.348665	0.088886	0.168170	0.139662	376.863499	1.673787
2	Birch	6	0.000864	-0.000151	0.002345	0.000436	0.000735	0.375909	0.159496	0.052165	0.161393	0.179767	473.940438	1.463033
3	DBSCAN	2	0.156648	0.045228	0.053277	0.040774	0.046194	0.815158	0.786721	0.292028	0.308061	0.014644	14.996554	5.651054
4	Mean-shift	1	0.000000	0.000000	0.000000	1.000000	0.000000	0.903734	0.897997	0.500000	0.448999	-1.000000	-1.000000	-1.000000
5	Optics	2	-0.024323	0.006050	0.005130	0.018553	0.008037	0.886934	0.007789	0.002891	0.333333	-0.129722	14.610190	1.540118
6	Gaussian-mixture	2	-0.000271	0.000163	0.000817	0.000388	0.000527	0.638882	0.501113	0.519154	0.507019	0.154512	501.821210	2.219037

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.000358	-0.000132	0.000359	0.000171	0.000231	0.639299	0.484421	0.487297	0.495342	0.253513	953.338119	1.548498
1	Agglomerative clustering	4	0.002452	0.001188	0.004750	0.001146	0.001846	0.462908	0.205119	0.111901	0.247088	0.239854	916.806973	1.173909
2	Birch	4	0.000652	-0.000515	0.000362	0.000087	0.000140	0.456651	0.305638	0.139079	0.253078	0.278242	1101.011947	1.059872
3	DBSCAN	2	0.143651	0.035098	0.033844	0.039255	0.036349	0.859940	0.843472	0.316317	0.447240	0.180720	23.066877	4.676062
4	Mean-shift	1	0.000000	0.000000	0.000000	1.000000	0.000000	0.903734	0.897997	0.500000	0.448999	-1.000000	-1.000000	-1.000000
5	Optics	12	-0.034692	-0.000180	0.007691	0.006947	0.007300	0.846041	0.005564	0.000725	0.075291	-0.502093	6.382254	1.745241
6	Gaussian-mixture	2	0.001155	0.000097	0.000714	0.000340	0.000461	0.640280	0.472181	0.482093	0.493420	0.253548	952.396265	1.546556

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.000215	0.000084	0.000694	0.000330	0.000447	0.639027	0.510015	0.517664	0.506472	0.328508	1471.347474	1.264673
1	Agglomerative clustering	492	0.000145	0.010386	0.389489	0.020995	0.039843	0.042177	0.002226	0.000005	0.001742	0.434271	1183.442345	0.708632
2	Birch	868	0.000152	0.011475	0.550623	0.027491	0.052368	0.033349	0.001484	0.000006	0.001382	0.384014	1153.879463	0.622018
3	DBSCAN	171	-0.011097	0.008321	0.162108	0.013213	0.024435	0.268020	0.013724	0.000145	0.005783	0.030732	39.756221	1.217995
4	Mean-shift	2	0.003027	0.000138	0.000777	0.000374	0.000505	0.645943	0.560831	0.518560	0.506928	0.297481	1215.800397	1.376404
5	Optics	2	-0.003004	0.003232	0.003948	0.005544	0.004612	0.856244	0.036721	0.013631	0.292035	-0.112749	96.652005	1.139493
6	Gaussian-mixture	2	0.001783	-0.000307	0.000101	0.000050	0.000067	0.657451	0.402819	0.493434	0.497450	0.295419	1184.536532	1.346589

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.000539	-0.000107	0.000397	0.000189	0.000256	0.639568	0.480341	0.486637	0.495097	0.355309	1654.374786	1.174635
1	Agglomerative clustering	3	0.000694	-0.000491	0.000095	0.000030	0.000045	0.540603	0.395401	0.210163	0.335295	0.370240	1969.869039	0.894925
2	Birch	23	0.000040	0.000705	0.016266	0.001739	0.003142	0.197105	0.042656	0.003747	0.040669	0.359962	2117.346914	0.844302
3	DBSCAN	59	-0.003787	0.003110	0.054640	0.005221	0.009531	0.222023	0.041914	0.001154	0.016673	0.087576	217.909835	1.266741
4	Mean-shift	3	-0.000663	-0.000489	0.000076	0.000023	0.000035	0.526090	0.297107	0.229548	0.335068	0.374204	2046.782239	0.878006
5	Optics	69	-0.013573	0.006666	0.075213	0.009060	0.016172	0.447196	0.006677	0.000244	0.014391	-0.129214	53.031266	1.157567
6	Gaussian-mixture	2	-0.000153	-0.000235	0.000200	0.000095	0.000129	0.638869	0.501113	0.509484	0.503475	0.341254	1559.961677	1.223740