import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
# Import the required libraries
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the log-mel-spectrogram vowel features together with their labels.
# load_dataset returns three values: the feature table, the original labels
# and an encoded version of those labels (the recorded output shows string
# labels such as 'a'/'u' and integer codes 0-4).
(train_data, true_labels, true_encoded_labels) = load_dataset(
    'Audio_features_vowel/logMelSpec_muslim_features_vowel.csv'
)

# Sanity check of the feature table size; the recorded run shows (240, 128).
train_data.shape

(240, 128)

# Clustering scores stored for the untransformed features; one row per
# clustering model (the recorded tables list columns such as n_clusters,
# ARI, AMI and silhouette).
results_df = pd.read_csv(
    'audio-results4/logMelSpec_muslim_features_vowel.csv'
)
results_df

# Show the true label distribution, then every clustering model's result on
# the raw features. Judging by the recorded output, plotAllClusterModels uses
# the second argument as a path prefix for saved label files
# (e.g. "<prefix>_kmeans_labels.npy") -- confirm in audio_results_util.
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/logMelSpec_muslim_features_vowel.csv',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/logMelSpec_muslim_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 66, 4: 55, 2: 52, 0: 43, 3: 24})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 205, 1: 32, 2: 3})
 2D representation

 3D representation

Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 215, -1: 22, 1: 3})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({0: 240})
 2D representation

 3D representation

MEAN-SHIFT
predicted_labels--> Counter({0: 235, 1: 5})
 2D representation

PCA transformed data

# Clustering scores stored for the PCA-reduced features.
results_df = pd.read_csv(
    'audio-results4/logMelSpec_muslim_features_vowel.csv-pca.csv'
)
results_df

# Project the features onto their first three principal components and wrap
# the resulting array in a DataFrame before handing it to the plot helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/logMelSpec_muslim_features_vowel.csv-pca',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/logMelSpec_muslim_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 65, 2: 51, 0: 51, 3: 49, 4: 24})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 229, 1: 11})
 2D representation

 3D representation

Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 122, 1: 111, 0: 7})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 213, 1: 16, 0: 11})
 2D representation

 3D representation

MEAN-SHIFT
predicted_labels--> Counter({0: 203, 1: 30, 2: 7})
 2D representation

t-SNE transformed data

# Clustering scores stored for the t-SNE-reduced features.
results_df = pd.read_csv(
    'audio-results4/logMelSpec_muslim_features_vowel.csv-tsne.csv'
)
results_df

# Embed the features into three dimensions with t-SNE (all CPU cores) and
# wrap the resulting array in a DataFrame for the plot helpers.
# NOTE(review): no random_state is set, so the embedding differs between
# runs; the UMAP cell pins random_state=42 -- consider doing the same here.
tsne_transformed = pd.DataFrame(
    TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Fix: plot the t-SNE embedding, not the raw feature table. The original
# passed train_data here, which paired the "-tsne" results with untransformed
# data, unlike the PCA and UMAP cells that pass their transformed frames.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/logMelSpec_muslim_features_vowel.csv-tsne',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/logMelSpec_muslim_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 54, 3: 48, 1: 48, 2: 46, 4: 44})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 209, -1: 30, 1: 1})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({0: 240})
 2D representation

 3D representation

MEAN-SHIFT
predicted_labels--> Counter({0: 239, 1: 1})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({3: 57, 2: 55, 1: 48, 0: 41, 4: 39})
 2D representation

UMAP transformed data

# Clustering scores stored for the UMAP-reduced features.
results_df = pd.read_csv(
    'audio-results4/logMelSpec_muslim_features_vowel.csv-umap.csv'
)
results_df

# Embed the features into three dimensions with UMAP (seeded with 42) and
# wrap the resulting array in a DataFrame before handing it to the plot
# helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(n_components=3, random_state=42).fit_transform(train_data)
)
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/logMelSpec_muslim_features_vowel.csv-umap',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/logMelSpec_muslim_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({3: 56, 4: 55, 0: 46, 2: 45, 1: 38})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 159, 1: 81})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({9: 22, 1: 18, 4: 18, 2: 17, 6: 17, 11: 17, 5: 16, 10: 15, 0: 14, 8: 14, 3: 13, 13: 13, 7: 12, 14: 12, 15: 10, 16: 9, 12: 3})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({-1: 118, 1: 23, 5: 13, 8: 12, 2: 11, 6: 11, 0: 8, 3: 8, 10: 7, 4: 7, 12: 6, 9: 6, 7: 5, 11: 5})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 186, 1: 29, 0: 25})
 2D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.002200	2.547598e-02	0.067865	0.044364	0.053654	0.322804	0.212500	0.285014	0.223919	0.099611	41.453568	2.207506
1	Agglomerative clustering	3	-0.009430	2.239266e-03	0.017141	0.038219	0.023668	0.595885	0.566667	0.185196	0.198374	0.259431	39.873813	1.148859
2	Birch	158	0.002524	3.430180e-02	0.823178	0.169803	0.281533	0.057021	0.004167	0.000040	0.003165	0.032680	5.587613	0.782031
3	DBSCAN	2	0.029059	3.489222e-02	0.038867	0.106604	0.056965	0.636879	0.600000	0.150943	0.111628	0.176915	10.795369	2.746961
4	Mean-shift	2	0.010322	1.530699e-02	0.015666	0.158000	0.028505	0.683867	0.650000	0.196226	0.132766	0.174987	5.762039	1.657554
5	Optics	1	0.000000	-5.797594e-17	0.000000	1.000000	0.000000	0.694318	0.662500	0.200000	0.132500	-1.000000	-1.000000	-1.000000
6	Gaussian-mixture	5	0.026097	1.307371e-02	0.051033	0.037006	0.042902	0.382041	0.137500	0.097363	0.147301	0.104725	39.759095	2.159768

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.004415	0.018267	0.059036	0.038486	0.046596	0.323117	0.175000	0.243956	0.177445	0.260406	132.380528	1.119692
1	Agglomerative clustering	2	0.005602	-0.007056	0.004647	0.025510	0.007862	0.665301	0.633333	0.191195	0.132751	0.507387	69.518393	0.684968
2	Birch	46	0.012161	0.046571	0.386636	0.113330	0.175282	0.154923	0.033333	0.001094	0.017391	0.282222	85.887384	0.738876
3	DBSCAN	2	-0.019487	0.011426	0.026591	0.033791	0.029762	0.466281	0.066667	0.122722	0.180180	-0.035591	9.451787	2.743165
4	Mean-shift	3	0.000566	0.017329	0.029958	0.060631	0.040102	0.593788	0.570833	0.214707	0.217734	0.342121	90.849797	1.047782
5	Optics	2	0.002223	-0.007566	0.012680	0.030276	0.017874	0.619345	0.041667	0.022254	0.146780	-0.294473	3.118736	2.328787
6	Gaussian-mixture	5	-0.024460	0.020666	0.059427	0.045061	0.051257	0.373637	0.370833	0.204426	0.216209	0.218962	117.703745	1.145961

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.002649	8.649036e-03	0.047400	0.030124	0.036837	0.310762	0.212500	0.203251	0.198384	0.242287	85.409443	1.125963
1	Agglomerative clustering	240	0.000000	-6.363593e-14	1.000000	0.186350	0.314157	0.000000	0.004167	0.000026	0.004167	-1.000000	-1.000000	-1.000000
2	Birch	240	0.000000	-6.363593e-14	1.000000	0.186350	0.314157	0.000000	0.008333	0.000115	0.008333	-1.000000	-1.000000	-1.000000
3	DBSCAN	2	-0.023686	6.663225e-03	0.018043	0.045703	0.025872	0.600577	0.566667	0.142558	0.108453	-0.062689	0.949784	7.920837
4	Mean-shift	2	-0.005764	-4.945772e-03	0.001684	0.063718	0.003281	0.689262	0.658333	0.198742	0.132218	0.534289	8.247693	0.332222
5	Optics	1	0.000000	-5.797594e-17	0.000000	1.000000	0.000000	0.694318	0.662500	0.200000	0.132500	-1.000000	-1.000000	-1.000000
6	Gaussian-mixture	5	0.009037	1.668214e-02	0.057391	0.036678	0.044754	0.318527	0.183333	0.235313	0.207199	0.223369	78.861885	1.185786

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.018758	0.018493	0.059656	0.038091	0.046495	0.325725	0.154167	0.155904	0.157838	0.413348	275.045396	0.882485
1	Agglomerative clustering	2	0.035808	0.010670	0.017217	0.027502	0.021177	0.532795	0.470833	0.170392	0.144561	0.422720	225.045855	0.891764
2	Birch	17	0.008194	0.027060	0.188880	0.069359	0.101461	0.181885	0.091667	0.042251	0.086190	0.408973	264.023639	0.819391
3	DBSCAN	13	-0.052288	0.016954	0.130299	0.068498	0.089792	0.320376	0.025000	0.002695	0.053571	0.009459	39.929345	0.952941
4	Mean-shift	3	-0.000997	-0.007293	0.010235	0.009733	0.009978	0.407904	0.312500	0.235607	0.162040	0.358024	220.848124	1.066792
5	Optics	2	-0.064487	0.007065	0.024402	0.036197	0.029151	0.519048	0.087500	0.022013	0.140000	0.137265	78.253097	1.077156
6	Gaussian-mixture	5	0.023662	0.025133	0.067936	0.043367	0.052940	0.329456	0.270833	0.221099	0.245030	0.413091	273.197966	0.896744