In [2]:
# Standard-library helpers
import os
import time
from collections import Counter

# Data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# scikit-learn: preprocessing, dimensionality reduction, model selection, clustering
import sklearn
from sklearn import mixture
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, normalize
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV

# UMAP and the shared loading/plotting helpers for these experiments
import umap
from audio_results_util import (load_dataset, plotClusters, encodedLabels, plotData3d,
                                plotData2d, plotAllClusterModels, actualDistribution)

%matplotlib inline
In [3]:
# Load the formant features and the ground-truth vowel labels (raw and label-encoded)
train_data, true_labels, true_encoded_labels = load_dataset('Audio_features_vowel/formants50_muslim_features_vowel.csv')
In [4]:
train_data.shape
Out[4]:
(240, 3)
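The loaded feature matrix has 240 samples and 3 columns. The actual load_dataset implementation lives in audio_results_util and is not shown in this notebook; the cell below is only a hedged sketch of what such a loader typically does (read the CSV, split off an assumed label column, and label-encode it) and may well differ from the real helper.
In [ ]:
# Hypothetical sketch only; the real load_dataset in audio_results_util may differ.
def load_dataset_sketch(path, label_column='label'):
    df = pd.read_csv(path)
    labels = df[label_column]                       # assumed name of the ground-truth column
    features = df.drop(columns=[label_column])      # remaining columns are the formant features
    encoded = LabelEncoder().fit_transform(labels)  # alphabetical encoding, e.g. 'a' -> 0, 'u' -> 4
    return features, labels, encoded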
In [5]:
# Clustering results computed on the raw features
results_df = pd.read_csv('audio-results4/formants50_muslim_features_vowel.csv')
results_df
Out[5]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.048513 0.041893 0.083258 0.064126 0.072450 0.426571 0.204167 0.124137 0.200395 0.319838 149.881262 1.029625
1 Agglomerative clustering 2 0.022210 -0.011492 0.001947 0.006490 0.002995 0.642583 0.616667 0.214417 0.152127 0.571739 165.248700 0.622850
2 Birch 2 0.043043 0.007998 0.014791 0.035142 0.020820 0.614860 0.587500 0.233866 0.156597 0.557571 219.089888 0.772156
3 DBSCAN 2 0.026844 -0.004256 0.013767 0.039550 0.020424 0.643344 0.612500 0.165860 0.144954 0.437144 37.114055 1.517398
4 Mean-shift 7 0.078358 0.015147 0.055454 0.069608 0.061730 0.584137 0.554167 0.151910 0.161474 0.331027 68.581185 0.936835
5 Optics 1 -0.056810 0.008882 0.014366 0.065985 0.023595 0.632608 0.054167 0.013627 0.154762 -0.255248 3.920693 1.866694
6 Gaussian-mixture 5 0.050896 0.057632 0.098588 0.080278 0.088496 0.449828 0.291667 0.345358 0.302577 0.291894 125.351081 1.160166
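Each row above compares one model's predicted clustering of the 240 samples against the ground-truth vowel labels, alongside internal indices (silhouette, Calinski-Harabasz, Davies-Bouldin) that need only the features and the partition. The CSV itself was written by a separate pipeline; as a minimal sketch, assuming a feature matrix X, the encoded true labels, and one model's predicted labels, most of these columns are consistent with standard scikit-learn metrics (H, C, V, FM reading as homogeneity, completeness, V-measure and Fowlkes-Mallows):
In [ ]:
# Sketch only: how metric columns like these are commonly computed with scikit-learn.
from sklearn import metrics

def clustering_metrics(X, y_true, y_pred):
    h, c, v = metrics.homogeneity_completeness_v_measure(y_true, y_pred)
    return {
        'ARI': metrics.adjusted_rand_score(y_true, y_pred),
        'AMI': metrics.adjusted_mutual_info_score(y_true, y_pred),
        'H': h, 'C': c, 'V': v,
        'FM': metrics.fowlkes_mallows_score(y_true, y_pred),
        # A/R/P presumably require a cluster-to-class assignment first; not reproduced here.
        'silhouette': metrics.silhouette_score(X, y_pred),
        'calinski': metrics.calinski_harabasz_score(X, y_pred),
        'davies': metrics.davies_bouldin_score(X, y_pred),
    }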
In [6]:
# Show the true label distribution, then plot each model's saved clustering on the raw features
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(train_data, 'audio-results4/formants50_muslim_features_vowel.csv')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants50_muslim_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({2: 120, 0: 55, 4: 29, 1: 21, 3: 15})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 218, 1: 22})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 203, 1: 37})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 218, -1: 17, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 226, 0: 14})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 187, 1: 28, 2: 14, 6: 4, 4: 3, 3: 2, 5: 2})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({4: 133, 2: 46, 0: 30, 3: 19, 1: 12})
 2D representation
 3D representation
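Each model above is visualised twice, once in 2D and once in 3D, by plotAllClusterModels from audio_results_util. As a stand-alone sketch (not the helper's actual implementation), an equivalent 3D view of one model's saved labels can be drawn with the Plotly imports already loaded above:
In [ ]:
# Sketch of a single 3D cluster plot; plotAllClusterModels in audio_results_util may differ.
def plot_clusters_3d(features, labels, title='Predicted clusters'):
    df = pd.DataFrame(np.asarray(features)[:, :3], columns=['dim_1', 'dim_2', 'dim_3'])
    df['cluster'] = pd.Series(labels).astype(str)  # categorical colours, including -1 for noise points
    px.scatter_3d(df, x='dim_1', y='dim_2', z='dim_3', color='cluster', title=title).show()

# Example using the K-Means labels saved by the pipeline (path printed above)
kmeans_labels = np.load('audio-results4/formants50_muslim_features_vowel.csv_kmeans_labels.npy')
plot_clusters_3d(train_data, kmeans_labels, 'K-Means clusters on the raw features')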

PCA transformed data

In [7]:
# Clustering results computed on the PCA-transformed features
results_df = pd.read_csv('audio-results4/formants50_muslim_features_vowel.csv-pca.csv')
results_df
Out[7]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.048513 0.041893 0.083258 0.064126 0.072450 0.426571 0.212500 0.151879 0.218022 0.319838 149.881262 1.029625
1 Agglomerative clustering 2 0.022210 -0.011492 0.001947 0.006490 0.002995 0.642583 0.616667 0.214417 0.152127 0.571739 165.248700 0.622850
2 Birch 2 0.043043 0.007998 0.014791 0.035142 0.020820 0.614860 0.587500 0.233866 0.156597 0.557571 219.089888 0.772156
3 DBSCAN 2 0.026844 -0.004256 0.013767 0.039550 0.020424 0.643344 0.612500 0.165860 0.144954 0.437144 37.114055 1.517398
4 Mean-shift 7 0.078358 0.015147 0.055454 0.069608 0.061730 0.584137 0.550000 0.148870 0.113855 0.331027 68.581185 0.936835
5 Optics 1 -0.056810 0.008882 0.014366 0.065985 0.023595 0.632608 0.054167 0.013627 0.154762 -0.255248 3.920693 1.866694
6 Gaussian-mixture 5 0.018016 0.038901 0.083018 0.057383 0.067860 0.363016 0.191667 0.250025 0.207448 0.202305 126.850313 1.269497
In [8]:
# Project the raw features onto 3 principal components, then re-plot the saved clusterings
pca_transformed = PCA(n_components=3).fit_transform(train_data)
pca_transformed = pd.DataFrame(pca_transformed)
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(pca_transformed, 'audio-results4/formants50_muslim_features_vowel.csv-pca')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants50_muslim_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({3: 120, 0: 55, 1: 29, 4: 21, 2: 15})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 218, 1: 22})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 203, 1: 37})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 218, -1: 17, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 226, 0: 14})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 187, 1: 28, 2: 14, 6: 4, 5: 3, 3: 2, 4: 2})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({4: 94, 1: 59, 3: 31, 2: 29, 0: 27})
 2D representation
 3D representation
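Since train_data has only three feature columns (its shape is (240, 3)), a 3-component PCA is just an orthogonal rotation and discards nothing, which is why the metric table above is largely unchanged from the raw-feature one. A quick check of the explained variance makes this explicit (sketch, reusing the data already in memory):
In [ ]:
# With as many components as input features, PCA keeps all of the variance.
pca = PCA(n_components=3).fit(train_data)
print(pca.explained_variance_ratio_)        # per-component share of the total variance
print(pca.explained_variance_ratio_.sum())  # ~1.0 because no component is discarded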

t-SNE transformed data

In [9]:
# Clustering results computed on the t-SNE-transformed features
results_df = pd.read_csv('audio-results4/formants50_muslim_features_vowel.csv-tsne.csv')
results_df
Out[9]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.011528 2.884120e-02 0.072625 0.046198 0.056473 0.318220 0.229167 0.227751 0.233241 0.266684 97.786660 1.094468
1 Agglomerative clustering 240 0.000000 -6.363593e-14 1.000000 0.186350 0.314157 0.000000 0.004167 0.000026 0.004167 -1.000000 -1.000000 -1.000000
2 Birch 240 0.000000 -6.363593e-14 1.000000 0.186350 0.314157 0.000000 0.004167 0.000026 0.004167 -1.000000 -1.000000 -1.000000
3 DBSCAN 4 -0.010740 -7.105174e-03 0.027172 0.022980 0.024901 0.408204 0.345833 0.130332 0.138530 -0.016419 18.672404 2.901425
4 Mean-shift 1 0.000000 -5.797594e-17 0.000000 1.000000 0.000000 0.694318 0.662500 0.200000 0.132500 -1.000000 -1.000000 -1.000000
5 Optics 1 0.025424 -3.246306e-03 0.006752 0.039813 0.011546 0.675729 0.020833 0.005241 0.083333 0.047206 11.269141 1.159755
6 Gaussian-mixture 5 0.012665 4.588086e-02 0.093089 0.060748 0.073519 0.331495 0.225000 0.300540 0.211597 0.241247 92.003950 1.087626
In [10]:
# Embed the raw features into 3 t-SNE dimensions, then re-plot the saved clusterings
tsne_transformed = TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(tsne_transformed, 'audio-results4/formants50_muslim_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants50_muslim_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 53, 3: 51, 4: 49, 0: 46, 2: 41})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 120, -1: 72, 2: 31, 3: 10, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 230, 0: 10})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 240})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({3: 72, 0: 55, 2: 44, 1: 39, 4: 30})
 2D representation
 3D representation
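Two things stand out on the t-SNE embedding: Agglomerative and Birch degenerate into 240 singleton clusters and Mean-shift collapses to a single cluster, so the -1.0 silhouette/calinski/davies entries in the table are presumably sentinel values written by the pipeline (those indices are undefined for such partitions). The embedding is also stochastic, since the cell above does not fix random_state, so re-running it changes the coordinates. A reproducible variant would pin the seed (sketch; the perplexity value is illustrative):
In [ ]:
# Sketch: reproducible t-SNE embedding of the formant features; perplexity is worth tuning.
tsne_transformed = TSNE(n_components=3, perplexity=30, random_state=42, n_jobs=-1).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)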

UMAP transformed data

In [11]:
# Clustering results computed on the UMAP-transformed features
results_df = pd.read_csv('audio-results4/formants50_muslim_features_vowel.csv-umap.csv')
results_df
Out[11]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.018047 0.033397 0.078020 0.050239 0.061121 0.328839 0.204167 0.119003 0.178831 0.449631 304.470679 0.826642
1 Agglomerative clustering 3 0.029040 0.047016 0.064075 0.063687 0.063880 0.445476 0.370833 0.196710 0.154805 0.477222 268.427938 0.737895
2 Birch 10 0.007070 0.023808 0.119353 0.054676 0.074996 0.239191 0.125000 0.064661 0.101938 0.461275 327.313938 0.778165
3 DBSCAN 2 0.037417 -0.009348 0.010741 0.036113 0.016558 0.660756 0.029167 0.019110 0.123810 0.306742 34.236158 0.522905
4 Mean-shift 4 0.039161 0.038332 0.069746 0.054815 0.061385 0.402115 0.337500 0.276512 0.188277 0.450701 267.691515 0.781745
5 Optics 4 0.041507 0.048792 0.094504 0.065878 0.077637 0.384547 0.079167 0.031688 0.108499 0.368523 208.218761 0.811797
6 Gaussian-mixture 5 0.020272 0.036581 0.081682 0.053183 0.064421 0.335761 0.216667 0.153136 0.198687 0.443141 294.907584 0.829147
In [12]:
# Embed the raw features into 3 UMAP dimensions, then re-plot the saved clusterings
umap_transformed = umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
umap_transformed = pd.DataFrame(umap_transformed)
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(umap_transformed, 'audio-results4/formants50_muslim_features_vowel.csv-umap')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants50_muslim_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 61, 2: 55, 1: 47, 3: 46, 4: 31})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 124, 2: 59, 1: 57})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({4: 40, 5: 40, 1: 29, 0: 27, 6: 20, 8: 20, 3: 19, 2: 18, 9: 16, 7: 11})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({-1: 223, 0: 10, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 100, 3: 52, 1: 33, 0: 29, 2: 26})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 103, 1: 58, 2: 42, 3: 37})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({3: 69, 0: 55, 2: 45, 4: 43, 1: 28})
 2D representation
 3D representation
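Finally, the raw, PCA, t-SNE and UMAP runs can be compared side by side by concatenating the four result files read above into one frame (sketch, using this notebook's file names):
In [ ]:
# Sketch: gather the four metric tables written by the pipeline into a single comparison frame.
base = 'audio-results4/formants50_muslim_features_vowel.csv'
result_files = {'raw': base, 'pca': base + '-pca.csv', 'tsne': base + '-tsne.csv', 'umap': base + '-umap.csv'}

comparison = pd.concat(
    {name: pd.read_csv(path, index_col=0) for name, path in result_files.items()},
    names=['transform', 'model'],
)
comparison.sort_values('ARI', ascending=False).head(10)  # best external agreement across all runs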