import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the MFCC vowel features. load_dataset (from audio_results_util) returns
# three values, unpacked here as the feature table, the original labels and
# their encoded counterparts.
train_data,true_labels,true_encoded_labels=load_dataset('Audio_features_vowel/mfcc_muslim_features_vowel.csv')

# Bare expression: displayed by the notebook as the cell result.
train_data.shape

(240, 13)

# Read the saved results CSV for the untransformed features and display it
# (bare expression, rendered by the notebook).
results_df=pd.read_csv('audio-results4/mfcc_muslim_features_vowel.csv')
results_df

# Report the true label distribution, then plot every clustering model on the
# raw features. NOTE(review): the second argument looks like a path prefix for
# per-model label files (the recorded output prints
# '<prefix>_kmeans_labels.npy') - confirm in audio_results_util.
actualDistribution(train_data,true_labels,true_encoded_labels)
plotAllClusterModels(train_data,'audio-results4/mfcc_muslim_features_vowel.csv')

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/mfcc_muslim_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({3: 58, 4: 52, 2: 51, 0: 48, 1: 31})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 192, 1: 48})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 231, 1: 9})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({-1: 152, 0: 60, 2: 13, 1: 11, 3: 4})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({0: 240})
 2D representation

PCA transformed data

# Read the saved results CSV for the PCA run and display it
# (bare expression, rendered by the notebook).
results_df = pd.read_csv(
    'audio-results4/mfcc_muslim_features_vowel.csv-pca.csv'
)
results_df

# Reduce the features to 3 principal components and wrap the resulting array
# in a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/mfcc_muslim_features_vowel.csv-pca',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/mfcc_muslim_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 56, 4: 49, 2: 47, 3: 45, 1: 43})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 79, 1: 61, 2: 56, 3: 44})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({2: 75, 3: 57, 1: 55, 0: 53})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 171, -1: 60, 1: 9})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 229, 0: 11})
 2D representation

t-SNE transformed data

# Read the saved results CSV for the t-SNE run and display it
# (bare expression, rendered by the notebook).
results_df = pd.read_csv('audio-results4/mfcc_muslim_features_vowel.csv-tsne.csv')
results_df

# Embed the features into 3 t-SNE components and wrap the array in a DataFrame.
# NOTE(review): no random_state is passed, so the embedding may differ from
# run to run (the UMAP section pins random_state=42) - confirm whether
# reproducibility is wanted here.
tsne_transformed = TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the clusterings on the t-SNE embedding. The original passed the raw
# train_data here, unlike the PCA and UMAP sections, so the plots did not show
# the embedding that this section computes.
plotAllClusterModels(tsne_transformed, 'audio-results4/mfcc_muslim_features_vowel.csv-tsne')

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/mfcc_muslim_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({2: 52, 3: 49, 4: 49, 0: 47, 1: 43})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 155, -1: 80, 1: 5})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({0: 240})
 2D representation

 3D representation

MEAN-SHIFT
predicted_labels--> Counter({0: 233, 1: 7})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({0: 67, 2: 58, 4: 45, 1: 36, 3: 34})
 2D representation

UMAP transformed data

# Read the saved results CSV for the UMAP run and display it
# (bare expression, rendered by the notebook).
results_df = pd.read_csv(
    'audio-results4/mfcc_muslim_features_vowel.csv-umap.csv'
)
results_df

# Embed the features into 3 UMAP components (seeded with random_state=42) and
# wrap the resulting array in a DataFrame for the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/mfcc_muslim_features_vowel.csv-umap',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})

audio-results4/mfcc_muslim_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 57, 0: 52, 2: 47, 4: 45, 3: 39})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({3: 55, 4: 50, 1: 49, 0: 47, 2: 39})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({1: 31, 4: 27, 0: 25, 2: 24, 3: 23, 7: 23, 5: 20, 9: 19, 6: 19, 8: 16, 10: 13})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({-1: 52, 6: 33, 11: 20, 2: 19, 0: 17, 1: 16, 5: 12, 13: 12, 3: 10, 8: 9, 10: 8, 4: 7, 12: 6, 7: 6, 9: 5, 15: 5, 14: 3})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 153, 1: 39, 0: 27, 2: 21})
 2D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.043502	9.051057e-02	0.148992	0.095706	0.116548	0.347453	0.195833	0.143850	0.192945	0.111764	25.306967	2.165053
1	Agglomerative clustering	2	-0.082096	2.692399e-02	0.028813	0.058807	0.038676	0.535644	0.491667	0.148428	0.122917	0.091499	23.309823	2.359739
2	Birch	2	0.065935	6.978901e-02	0.048229	0.308017	0.083399	0.694086	0.650000	0.196226	0.135065	0.175546	8.455691	1.938029
3	DBSCAN	4	-0.035317	5.199972e-03	0.038441	0.039130	0.038782	0.455758	0.175000	0.044025	0.116667	-0.094465	4.567208	3.598599
4	Mean-shift	1	0.000000	-5.797594e-17	0.000000	1.000000	0.000000	0.694318	0.662500	0.200000	0.132500	-1.000000	-1.000000	-1.000000
5	Optics	1	0.000000	-5.797594e-17	0.000000	1.000000	0.000000	0.694318	0.662500	0.200000	0.132500	-1.000000	-1.000000	-1.000000
6	Gaussian-mixture	5	-0.012643	7.748358e-02	0.124180	0.092258	0.105865	0.367038	0.045833	0.045087	0.158319	0.104357	20.759759	2.128010

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.069847	1.123442e-01	0.176939	0.112579	0.137606	0.364503	0.283333	0.186113	0.251920	0.291529	104.014995	1.039882
1	Agglomerative clustering	4	0.036628	8.460775e-02	0.123520	0.092454	0.105753	0.378131	0.162500	0.138921	0.105588	0.268633	92.939300	1.069304
2	Birch	4	0.035732	9.620200e-02	0.137252	0.101868	0.116942	0.373868	0.183333	0.069473	0.165901	0.295274	104.835271	1.021278
3	DBSCAN	2	-0.031591	-7.034795e-04	0.016707	0.023991	0.019697	0.508949	0.458333	0.115304	0.107212	0.092514	10.799876	3.914580
4	Mean-shift	1	0.000000	-5.797594e-17	0.000000	1.000000	0.000000	0.694318	0.662500	0.200000	0.132500	-1.000000	-1.000000	-1.000000
5	Optics	1	0.034321	4.918684e-04	0.009043	0.049637	0.015298	0.676448	0.020833	0.005241	0.075758	-0.070527	5.408569	1.704391
6	Gaussian-mixture	5	0.072831	1.073271e-01	0.170433	0.108763	0.132787	0.368108	0.158333	0.186008	0.166298	0.285470	102.003188	1.059069

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.029578	7.315166e-02	0.128054	0.081358	0.099500	0.331949	0.266667	0.313387	0.268207	0.239667	84.677258	1.118155
1	Agglomerative clustering	240	0.000000	-6.363593e-14	1.000000	0.186350	0.314157	0.000000	0.008333	0.000347	0.008333	-1.000000	-1.000000	-1.000000
2	Birch	240	0.000000	-6.363593e-14	1.000000	0.186350	0.314157	0.000000	0.008333	0.000409	0.008333	-1.000000	-1.000000	-1.000000
3	DBSCAN	2	0.038171	3.479782e-03	0.018938	0.026523	0.022098	0.522857	0.450000	0.113208	0.116129	-0.008944	4.914734	5.616258
4	Mean-shift	2	0.038426	3.452892e-02	0.027243	0.211054	0.048257	0.688987	0.650000	0.196226	0.133906	0.170785	12.665392	1.093261
5	Optics	1	0.000000	-5.797594e-17	0.000000	1.000000	0.000000	0.694318	0.662500	0.200000	0.132500	-1.000000	-1.000000	-1.000000
6	Gaussian-mixture	5	0.041849	6.003447e-02	0.110762	0.071837	0.087151	0.350904	0.300000	0.138525	0.238275	0.224458	79.367738	1.109322

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.047288	0.086824	0.144898	0.092420	0.112857	0.347715	0.195833	0.227957	0.189806	0.398901	165.997345	0.857070
1	Agglomerative clustering	5	0.056063	0.101537	0.163350	0.104050	0.127124	0.354022	0.166667	0.213425	0.175005	0.394911	161.757610	0.902572
2	Birch	11	0.033347	0.093476	0.239972	0.103299	0.144428	0.248680	0.137500	0.046010	0.117084	0.374132	162.586606	0.917346
3	DBSCAN	16	0.002862	0.053740	0.221093	0.088475	0.126377	0.219612	0.070833	0.010444	0.059040	0.165878	48.466406	1.069505
4	Mean-shift	3	0.010153	0.079324	0.097528	0.092807	0.095109	0.415867	0.241667	0.185970	0.118136	0.300556	104.579573	1.152263
5	Optics	3	-0.094513	0.053072	0.079895	0.078367	0.079124	0.415691	0.108333	0.062571	0.165785	0.149905	51.665732	1.157557
6	Gaussian-mixture	5	0.056762	0.090600	0.149421	0.095575	0.116581	0.356505	0.220833	0.283642	0.225959	0.396812	164.830310	0.835609