import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
# Import the required libraries
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the feature table together with its labels. load_dataset is a project
# helper from audio_results_util; judging by the names it returns the features,
# the original labels and an integer-encoded copy of them -- TODO confirm.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/chroma_ideologyFive_features_vowel.csv'
)

# Show the (rows, columns) dimensions of the loaded feature table.
train_data.shape

(2728, 12)

# Read the stored results table for the untransformed features and display it.
results_df = pd.read_csv(
    'audio-results4/chroma_ideologyFive_features_vowel.csv'
)
results_df

# Report the true label distribution, then plot every clustering model whose
# results are stored under the given path prefix (both are project helpers
# from audio_results_util; exact behaviour is defined there).
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/chroma_ideologyFive_features_vowel.csv',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/chroma_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1591, 1: 1137})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 1371, 1: 895, 2: 462})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({2: 691, 0: 528, 3: 451, 1: 426, 5: 392, 4: 239, 6: 1})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 1525, -1: 1151, 2: 29, 1: 14, 3: 9})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2696, 1: 22, 0: 10})
 2D representation

PCA transformed data

# Read the stored results table for the PCA-transformed features and display it.
results_df = pd.read_csv(
    'audio-results4/chroma_ideologyFive_features_vowel.csv-pca.csv'
)
results_df

# Project the features onto three principal components and wrap the result in
# a DataFrame before handing it to the plotting helpers.
pca_model = PCA(n_components=3)
pca_transformed = pd.DataFrame(pca_model.fit_transform(train_data))

actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/chroma_ideologyFive_features_vowel.csv-pca',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/chroma_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1523, 0: 1205})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 1257, 1: 666, 2: 469, 3: 336})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({1: 1215, 2: 841, 0: 672})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2487, -1: 234, 1: 7})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2541, 6: 23, 13: 15, 4: 14, 1: 14, 5: 14, 9: 14, 3: 14, 8: 13, 12: 12, 0: 12, 11: 11, 7: 11, 10: 10, 2: 10})
 2D representation

t-SNE transformed data

# Read the stored results table for the t-SNE-transformed features and display it.
results_df = pd.read_csv(
    'audio-results4/chroma_ideologyFive_features_vowel.csv-tsne.csv'
)
results_df

# Embed the features into three dimensions with t-SNE. random_state is pinned
# (same seed as the UMAP cell in this notebook) so the embedding is reproducible
# across runs; without it every execution produces a different layout.
# NOTE(review): if the stored "-tsne" cluster labels were computed on an
# embedding built with a different seed, use that seed here instead.
tsne_transformed = TSNE(
    n_components=3, n_jobs=-1, random_state=42
).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Fix: plot the t-SNE embedding, not the untransformed train_data. The PCA and
# UMAP sections both pass their transformed frame alongside the matching
# "-pca" / "-umap" results prefix; this cell previously passed train_data with
# the "-tsne" prefix, leaving tsne_transformed unused for the cluster plots.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/chroma_ideologyFive_features_vowel.csv-tsne',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/chroma_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1386, 1: 1342})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2706, 0: 14, 1: 8})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2644, 2: 33, 1: 29, 0: 22})
 2D representation

 3D representation

MEAN-SHIFT
predicted_labels--> Counter({2: 710, 3: 708, 0: 693, 1: 617})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({0: 1368, 1: 1360})
 2D representation

UMAP transformed data

# Read the stored results table for the UMAP-transformed features and display it.
results_df = pd.read_csv(
    'audio-results4/chroma_ideologyFive_features_vowel.csv-umap.csv'
)
results_df

# Reduce the features to three UMAP components (seeded with 42) and wrap the
# result in a DataFrame before handing it to the plotting helpers.
umap_reducer = umap.UMAP(random_state=42, n_components=3)
umap_transformed = pd.DataFrame(umap_reducer.fit_transform(train_data))

actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/chroma_ideologyFive_features_vowel.csv-umap',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/chroma_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1440, 1: 1288})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1175, 1: 844, 2: 709})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({1: 1419, 0: 1309})
 2D representation

 3D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.001130	-0.000302	0.000085	0.000043	0.000057	0.643831	0.567449	0.506071	0.502423	0.144406	442.147902	2.376112
1	Agglomerative clustering	3	0.002279	0.000818	0.002679	0.000911	0.001359	0.560820	0.476906	0.260163	0.327047	0.126483	360.457106	2.209342
2	Birch	7	0.003702	0.001026	0.006158	0.001212	0.002026	0.385672	0.195015	0.052375	0.147179	0.173180	355.155899	1.399540
3	DBSCAN	4	0.043094	0.045123	0.076172	0.033462	0.046497	0.646045	0.531891	0.119966	0.204450	-0.115111	24.458226	3.885707
4	Mean-shift	2	0.015011	0.008295	0.004826	0.150551	0.009353	0.898040	0.891862	0.504845	0.821035	0.362924	10.623167	1.266938
5	Optics	2	-0.018801	0.004536	0.003953	0.019138	0.006553	0.885009	0.003666	0.001371	0.333333	-0.093451	17.232206	1.253964
6	Gaussian-mixture	2	0.011044	0.002213	0.003802	0.001945	0.002573	0.651603	0.405425	0.459497	0.483616	0.148431	433.021504	2.400182

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.000078	-0.000353	0.000005	0.000003	0.000004	0.638995	0.453812	0.498469	0.499398	0.256808	963.575471	1.547358
1	Agglomerative clustering	4	0.012463	0.002213	0.006763	0.001844	0.002898	0.512512	0.441349	0.171847	0.250931	0.252480	987.588709	1.125539
2	Birch	3	0.007488	0.003802	0.008866	0.002856	0.004321	0.538139	0.253299	0.213960	0.315934	0.275034	1077.177227	1.229789
3	DBSCAN	2	0.232180	0.084098	0.081013	0.089851	0.085204	0.863371	0.836877	0.313040	0.305991	0.131967	14.916960	6.424408
4	Mean-shift	2	0.025713	0.004318	0.003113	0.017695	0.005294	0.890898	0.885997	0.508943	0.579775	0.360258	84.845726	1.055357
5	Optics	14	-0.016796	0.007674	0.017256	0.013840	0.015360	0.833565	0.004399	0.000329	0.066667	-0.508346	8.667157	1.788002
6	Gaussian-mixture	2	0.015873	0.002508	0.004167	0.002195	0.002875	0.664834	0.374267	0.458270	0.482428	0.263930	915.540192	1.552229

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.002258	0.003240	0.005414	0.002689	0.003593	0.635636	0.525293	5.489206e-01	0.518990	0.353538	1750.296605	1.170702
1	Agglomerative clustering	526	0.000184	0.013379	0.419462	0.023346	0.044230	0.040524	0.003666	1.343951e-05	0.002112	0.422495	1296.962940	0.710854
2	Birch	915	0.000126	0.013444	0.573703	0.029624	0.056339	0.031211	0.000367	4.495665e-07	0.001093	0.368276	1264.124755	0.616304
3	DBSCAN	2	-0.013298	0.002561	0.002713	0.017887	0.004711	0.888966	0.005132	1.919649e-03	0.333333	0.006527	14.950167	0.848068
4	Mean-shift	4	-0.000292	0.001591	0.005597	0.001391	0.002228	0.449099	0.253299	1.139182e-01	0.251856	0.343918	2103.198358	0.914998
5	Optics	3	-0.029175	0.002175	0.003370	0.006786	0.004503	0.866340	0.008065	3.001356e-03	0.247257	-0.187414	28.741283	1.119563
6	Gaussian-mixture	2	0.001362	0.002981	0.005025	0.002495	0.003334	0.635204	0.519428	5.471076e-01	0.518281	0.353295	1749.248469	1.170892

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.003448	0.001958	0.003479	0.001731	0.002312	0.637003	0.537023	0.539246	0.515278	0.348331	1639.180156	1.205799
1	Agglomerative clustering	84	0.000431	0.007591	0.099446	0.007875	0.014594	0.106331	0.014663	0.000407	0.012725	0.340882	1939.037983	0.904654
2	Birch	26	0.000340	0.003137	0.029787	0.003180	0.005746	0.181642	0.031891	0.002968	0.037049	0.362170	2243.283643	0.877944
3	DBSCAN	75	-0.002391	0.008160	0.089744	0.008808	0.016041	0.235477	0.034457	0.000509	0.011347	0.039222	164.098648	1.343574
4	Mean-shift	3	0.002192	0.000175	0.001428	0.000457	0.000692	0.531216	0.425587	0.261656	0.340932	0.355717	1749.229890	0.941264
5	Optics	69	-0.015321	0.007149	0.071149	0.009916	0.017406	0.505933	0.008798	0.000563	0.015675	-0.220891	35.531110	1.230087
6	Gaussian-mixture	2	0.002136	0.001282	0.002464	0.001225	0.001636	0.636020	0.471408	0.466962	0.487158	0.348068	1635.492585	1.207489