import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the vowel formant feature CSV via the project helper. The helper
# returns three objects that are unpacked here; judging by their names they
# are the feature table, the string labels and the integer-encoded labels
# (NOTE(review): confirm against audio_results_util.load_dataset).
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/formants80_muslim_features_vowel.csv'
)

# Notebook display of the feature table's dimensions.
train_data.shape

(240, 3)

# Read the saved results CSV for the untransformed features and display it.
results_df = pd.read_csv(
    'audio-results4/formants80_muslim_features_vowel.csv'
)
results_df

# Show the ground-truth label distribution, then the per-model cluster plots
# for the raw features. The second argument is the path prefix that
# identifies this variant's saved results.
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/formants80_muslim_features_vowel.csv',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})

audio-results4/formants80_muslim_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 97, 0: 59, 4: 40, 2: 31, 3: 13})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 192, 1: 48})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 207, 1: 33})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 188, -1: 45, 1: 7})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({0: 194, -1: 46})
 2D representation

PCA transformed data

# Read the saved results CSV for the PCA variant and display it.
results_df = pd.read_csv(
    'audio-results4/formants80_muslim_features_vowel.csv-pca.csv'
)
results_df

# Project the features onto three principal components and wrap the
# resulting array in a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formants80_muslim_features_vowel.csv-pca',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})

audio-results4/formants80_muslim_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 101, 4: 55, 3: 40, 1: 31, 2: 13})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 192, 1: 48})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 207, 1: 33})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 188, -1: 45, 1: 7})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({0: 194, -1: 46})
 2D representation

t-SNE transformed data

# Read the saved results CSV for the t-SNE variant and display it.
results_df = pd.read_csv('audio-results4/formants80_muslim_features_vowel.csv-tsne.csv')
results_df

# Embed the features in three dimensions with t-SNE. The fixed random_state
# makes the embedding reproducible from run to run, in line with the seeded
# UMAP embedding used elsewhere in this notebook.
tsne_transformed = TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the "-tsne" cluster results over the t-SNE embedding itself. The
# original passed train_data here, which drew the t-SNE-space cluster labels
# on raw feature coordinates and was inconsistent with the PCA and UMAP
# sections, both of which pass their transformed frame.
# NOTE(review): the cluster labels appear to be loaded from previously saved
# files under this prefix; the embedding they were computed on may not be
# identical to the one generated here -- confirm against the clustering run.
plotAllClusterModels(tsne_transformed, 'audio-results4/formants80_muslim_features_vowel.csv-tsne')

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})

audio-results4/formants80_muslim_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({2: 53, 0: 52, 1: 50, 4: 46, 3: 39})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 183, -1: 50, 1: 7})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 207, 0: 20, 1: 13})
 2D representation

 3D representation

MEAN-SHIFT
predicted_labels--> Counter({0: 240})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({2: 74, 3: 52, 1: 40, 0: 38, 4: 36})
 2D representation

UMAP transformed data

# Read the saved results CSV for the UMAP variant and display it.
results_df = pd.read_csv(
    'audio-results4/formants80_muslim_features_vowel.csv-umap.csv'
)
results_df

# Build a seeded three-component UMAP embedding of the features and wrap the
# resulting array in a DataFrame before handing it to the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formants80_muslim_features_vowel.csv-umap',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})

audio-results4/formants80_muslim_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 64, 1: 47, 2: 47, 3: 42, 4: 40})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 192, 1: 48})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({7: 46, 1: 35, 5: 33, 0: 23, 3: 21, 2: 20, 6: 20, 4: 17, 9: 15, 8: 10})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({1: 48, 9: 31, 0: 25, 4: 24, 3: 21, 6: 19, -1: 18, 7: 15, 8: 9, 10: 9, 5: 8, 2: 7, 11: 6})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({0: 88, 1: 67, -1: 47, 2: 38})
 2D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.059826	0.065694	0.112848	0.080485	0.093958	0.400436	0.216667	0.198545	0.185099	0.302860	181.966261	1.056784
1	Agglomerative clustering	2	0.072273	0.006873	0.014056	0.028689	0.018868	0.603979	0.570833	0.228834	0.155208	0.550014	260.301893	0.789638
2	Birch	2	0.036015	0.003566	0.011694	0.029829	0.016802	0.621153	0.591667	0.206870	0.147387	0.579029	251.048818	0.666809
3	DBSCAN	2	0.101247	0.018323	0.031381	0.052691	0.039335	0.604744	0.558333	0.164006	0.164640	0.385162	119.695486	1.271244
4	Mean-shift	4	0.067873	0.027279	0.044300	0.067481	0.053486	0.584220	0.558333	0.239187	0.163609	0.462290	133.046046	0.776184
5	Optics	1	0.082848	0.009862	0.016232	0.033927	0.021958	0.612689	0.562500	0.141509	0.115979	0.546460	231.737633	0.829549
6	Gaussian-mixture	5	0.077262	0.042138	0.078316	0.075182	0.076717	0.524005	0.091667	0.186065	0.192215	0.321862	151.203624	1.000535

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.063629	0.065921	0.112852	0.081009	0.094315	0.406494	0.416667	0.380837	0.270615	0.305898	181.980961	1.056147
1	Agglomerative clustering	2	0.072273	0.006873	0.014056	0.028689	0.018868	0.603979	0.570833	0.228834	0.155208	0.550014	260.301893	0.789638
2	Birch	2	0.036015	0.003566	0.011694	0.029829	0.016802	0.621153	0.591667	0.206870	0.147387	0.579029	251.048818	0.666809
3	DBSCAN	2	0.101247	0.018323	0.031381	0.052691	0.039335	0.604744	0.558333	0.164006	0.164640	0.385162	119.695486	1.271244
4	Mean-shift	4	0.067873	0.027279	0.044300	0.067481	0.053486	0.584220	0.558333	0.239187	0.163609	0.462290	133.046046	0.776184
5	Optics	1	0.082848	0.009862	0.016232	0.033927	0.021958	0.612689	0.562500	0.141509	0.115979	0.546460	231.737633	0.829549
6	Gaussian-mixture	5	0.036087	0.006630	0.039309	0.045323	0.042102	0.536128	0.116667	0.168356	0.196482	0.337532	131.335963	0.813020

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.002387	3.271196e-02	0.077433	0.049316	0.060256	0.311554	0.229167	0.294496	0.224738	0.273674	106.049626	1.059468
1	Agglomerative clustering	240	0.000000	-6.363593e-14	1.000000	0.186350	0.314157	0.000000	0.004167	0.000026	0.004167	-1.000000	-1.000000	-1.000000
2	Birch	240	0.000000	-6.353634e-14	1.000000	0.186350	0.314157	0.000000	0.000000	0.000000	0.000000	-1.000000	-1.000000	-1.000000
3	DBSCAN	2	-0.017103	-3.997084e-03	0.013819	0.022169	0.017025	0.540644	0.504167	0.138607	0.133099	-0.023227	5.519560	6.227439
4	Mean-shift	1	0.000000	-5.797594e-17	0.000000	1.000000	0.000000	0.694318	0.662500	0.200000	0.132500	-1.000000	-1.000000	-1.000000
5	Optics	2	0.040605	3.591035e-02	0.044205	0.091652	0.059643	0.619557	0.058333	0.026447	0.121154	-0.004790	21.457305	1.287757
6	Gaussian-mixture	5	0.013533	3.776722e-02	0.083214	0.054162	0.065616	0.331573	0.141667	0.146105	0.176188	0.250544	94.235123	1.101692

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	5	0.009803	0.049750	0.098521	0.063096	0.076926	0.320416	0.287500	0.408619	0.252880	0.500047	418.396268	0.709299
1	Agglomerative clustering	2	0.072273	0.006873	0.014056	0.028689	0.018868	0.603979	0.570833	0.228834	0.155208	0.528574	255.159879	0.593442
2	Birch	10	0.004649	0.058970	0.171740	0.079170	0.108379	0.239862	0.083333	0.015576	0.090026	0.483613	442.082082	0.692031
3	DBSCAN	12	0.008579	0.048946	0.182864	0.078191	0.109542	0.231820	0.100000	0.022809	0.073013	0.333634	181.968722	1.344627
4	Mean-shift	3	0.025176	0.008644	0.026304	0.025466	0.025878	0.430048	0.320833	0.223996	0.172917	0.494167	365.263785	0.764438
5	Optics	3	0.040365	0.058506	0.093007	0.071143	0.080619	0.389999	0.295833	0.133285	0.145031	0.477824	359.889853	0.739270
6	Gaussian-mixture	5	0.012505	0.055137	0.104751	0.067887	0.082383	0.328316	0.304167	0.396526	0.254884	0.493614	407.480931	0.715743