import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the zero-crossing vowel features through the project helper, which
# returns the feature table plus two label sequences (presumably the raw
# labels and their integer encoding — confirm in audio_results_util).
features_csv = 'Audio_features_vowel/zeroCrossings_muslim_features_vowel.csv'
(
    train_data,
    true_labels,
    true_encoded_labels,
) = load_dataset(features_csv)

# Echo the dimensions of the loaded feature table as the cell output.
train_data.shape

(240, 11)

# Read the stored results table for the untransformed features (presumably
# the per-model clustering scores — confirm against the script that wrote it)
# and echo it as the cell output.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_muslim_features_vowel.csv'
)
results_df

# Report the ground-truth label distribution, then draw every clustering
# model's result on the raw feature table. The second argument is a path
# prefix handed to the project helper (it appears to locate saved label
# files from it — confirm in audio_results_util).
results_prefix = 'audio-results4/zeroCrossings_muslim_features_vowel.csv'
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(train_data, results_prefix)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/zeroCrossings_muslim_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 97, 3: 93, 2: 46, 0: 3, 4: 1})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 130, 1: 101, 2: 8, 3: 1})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 239, 1: 1})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({1: 155, -1: 68, 0: 17})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 137, 1: 52, 0: 51})
 2D representation

PCA transformed data

# Read the stored results table for the PCA-reduced features and echo it as
# the cell output.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_muslim_features_vowel.csv-pca.csv'
)
results_df

# Reduce the features to three principal components, wrap the result in a
# DataFrame, and repeat the label-distribution report and cluster plots in
# that reduced space.
pca_model = PCA(n_components=3)
pca_transformed = pd.DataFrame(pca_model.fit_transform(train_data))
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/zeroCrossings_muslim_features_vowel.csv-pca',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/zeroCrossings_muslim_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 87, 2: 84, 3: 59, 1: 9, 4: 1})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 133, 1: 97, 2: 9, 3: 1})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 239, 1: 1})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({1: 95, 0: 79, -1: 66})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 163, 1: 43, 0: 34})
 2D representation

t-SNE transformed data

# Read the stored results table for the t-SNE-reduced features and echo it
# as the cell output.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_muslim_features_vowel.csv-tsne.csv'
)
results_df

# Embed the features into three t-SNE dimensions and wrap the result in a
# DataFrame so it can be passed to the same helpers as the other sections.
# NOTE(review): TSNE is not seeded here, so the embedding differs between
# runs; consider passing random_state if reproducible plots are needed.
tsne_transformed = TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the cluster models on the t-SNE embedding. The original passed the
# raw `train_data` here, so this section's plots did not show the t-SNE
# space at all; the PCA and UMAP sections both pass their transformed frame.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/zeroCrossings_muslim_features_vowel.csv-tsne',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/zeroCrossings_muslim_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 55, 4: 49, 0: 46, 2: 46, 3: 44})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 141, -1: 59, 1: 40})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 181, 2: 24, 0: 20, 1: 15})
 2D representation

 3D representation

MEAN-SHIFT
predicted_labels--> Counter({0: 240})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({0: 58, 2: 49, 1: 47, 4: 44, 3: 42})
 2D representation

UMAP transformed data

# Read the stored results table for the UMAP-reduced features and echo it as
# the cell output.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_muslim_features_vowel.csv-umap.csv'
)
results_df

# Embed the features into three UMAP dimensions (seeded with 42), wrap the
# result in a DataFrame, and repeat the label-distribution report and
# cluster plots in that embedding.
umap_reducer = umap.UMAP(n_components=3, random_state=42)
umap_transformed = pd.DataFrame(umap_reducer.fit_transform(train_data))
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/zeroCrossings_muslim_features_vowel.csv-umap',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/zeroCrossings_muslim_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({4: 70, 0: 62, 1: 49, 3: 35, 2: 24})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({1: 119, 0: 97, 2: 24})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({2: 24, 1: 22, 0: 21, 8: 21, 5: 19, 3: 17, 4: 16, 13: 16, 6: 14, 7: 12, 14: 12, 9: 11, 11: 11, 12: 10, 15: 8, 10: 6})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 119, 2: 97, 1: 24})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({2: 31, 10: 28, 7: 25, 8: 21, 0: 20, 3: 19, -1: 16, 11: 16, 1: 16, 5: 14, 4: 13, 6: 11, 9: 10})
 2D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.034789	0.095055	0.125536	0.113688	0.119318	0.430423	0.058333	0.187131	0.164934	0.427100	209.117285	0.675069
1	Agglomerative clustering	4	0.133037	0.102654	0.112038	0.137441	0.123446	0.545469	0.491667	0.289695	0.208976	0.469997	181.838929	0.587481
2	Birch	2	-0.005764	-0.004946	0.001684	0.063718	0.003281	0.689262	0.658333	0.198742	0.132218	0.825172	41.719463	0.123482
3	DBSCAN	2	-0.071036	0.054416	0.066216	0.081753	0.073168	0.454950	0.104167	0.132156	0.166540	0.151756	72.141037	1.798670
4	Mean-shift	3	-0.019714	-0.001400	0.011153	0.079215	0.019553	0.666670	0.637500	0.192453	0.131330	0.671245	70.381289	0.464503
5	Optics	2	0.011517	0.047884	0.063987	0.066648	0.065290	0.453955	0.195833	0.096355	0.153343	0.109595	44.933214	2.327164
6	Gaussian-mixture	5	0.002426	0.068018	0.106263	0.083746	0.093671	0.377460	0.200000	0.208945	0.248951	0.387896	171.361565	0.849198

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.018071	0.076355	0.111980	0.093272	0.101774	0.399861	0.279167	0.154910	0.154433	0.459821	271.366623	0.640353
1	Agglomerative clustering	4	0.131724	0.098001	0.108420	0.131944	0.119031	0.544888	0.495833	0.290953	0.204689	0.510783	226.572955	0.534320
2	Birch	2	-0.005764	-0.004946	0.001684	0.063718	0.003281	0.689262	0.658333	0.198742	0.132218	0.833935	44.393129	0.117538
3	DBSCAN	2	0.036996	0.088986	0.107842	0.101267	0.104451	0.426252	0.308333	0.183519	0.152920	0.368736	113.428182	1.163037
4	Mean-shift	3	-0.029894	0.001517	0.013335	0.078728	0.022807	0.657059	0.629167	0.189937	0.130736	0.666933	85.541210	0.472544
5	Optics	2	0.207133	0.075399	0.085931	0.103533	0.093914	0.600901	0.062500	0.062812	0.069425	0.198834	58.161844	0.772040
6	Gaussian-mixture	5	0.012386	0.075180	0.113922	0.091583	0.101538	0.389336	0.083333	0.118910	0.207130	0.448787	150.415479	1.002415

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.025410	4.262315e-02	0.089860	0.057135	0.069855	0.329026	0.208333	0.209472	0.213633	0.285363	107.854450	1.047740
1	Agglomerative clustering	240	0.000000	-6.353634e-14	1.000000	0.186350	0.314157	0.000000	0.004167	0.000026	0.004167	-1.000000	-1.000000	-1.000000
2	Birch	240	0.000000	-6.373552e-14	1.000000	0.186350	0.314157	0.000000	0.008333	0.000115	0.008333	-1.000000	-1.000000	-1.000000
3	DBSCAN	2	0.027336	1.640557e-02	0.033582	0.035875	0.034691	0.470741	0.412500	0.150863	0.128960	0.098930	24.111838	5.051985
4	Mean-shift	1	0.000000	-5.797594e-17	0.000000	1.000000	0.000000	0.694318	0.662500	0.200000	0.132500	-1.000000	-1.000000	-1.000000
5	Optics	3	0.041119	5.772493e-02	0.078244	0.097051	0.086638	0.551798	0.033333	0.031930	0.063889	-0.076721	21.296228	1.561104
6	Gaussian-mixture	5	0.029073	3.562961e-02	0.081071	0.051655	0.063103	0.332935	0.283333	0.268468	0.255657	0.285062	107.112824	1.040879

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.081453	0.109963	0.170720	0.112895	0.135913	0.387853	0.162500	0.130789	0.136009	0.602613	1037.388616	0.574975
1	Agglomerative clustering	3	0.125115	0.101912	0.114159	0.123477	0.118635	0.517602	0.191667	0.128495	0.099592	0.709724	681.718448	0.385095
2	Birch	16	0.033750	0.071076	0.254827	0.096003	0.139465	0.224279	0.104167	0.031891	0.078223	0.660897	3128.183619	0.433536
3	DBSCAN	3	0.125115	0.101912	0.114159	0.123477	0.118635	0.517602	0.450000	0.234736	0.184181	0.709724	681.718448	0.385095
4	Mean-shift	3	0.125115	0.101912	0.114159	0.123477	0.118635	0.517602	0.462500	0.280890	0.196638	0.709724	681.718448	0.385095
5	Optics	12	0.034291	0.091438	0.258225	0.105045	0.149339	0.240354	0.104167	0.037287	0.101560	0.548071	257.152757	1.039962
6	Gaussian-mixture	5	-0.035227	0.095513	0.150388	0.103650	0.122720	0.325645	0.179167	0.201351	0.206579	0.612678	552.207635	0.565666