In [2]:
# Standard-library helpers
import os
import time
from collections import Counter

# Data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# scikit-learn: preprocessing, dimensionality reduction, model selection, clustering
import sklearn
from sklearn import mixture
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, normalize
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV

# UMAP and the shared loading/plotting helpers for these experiments
import umap
from audio_results_util import (load_dataset, plotClusters, encodedLabels, plotData3d,
                                plotData2d, plotAllClusterModels, actualDistribution)

%matplotlib inline
In [3]:
# Load the formant features and the ground-truth vowel labels (raw and label-encoded)
train_data, true_labels, true_encoded_labels = load_dataset('Audio_features_vowel/formants50_muslim_features_vowel.csv')
In [4]:
train_data.shape
Out[4]:
(240, 3)
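The loaded feature matrix has 240 samples and 3 columns. The actual load_dataset implementation lives in audio_results_util and is not shown in this notebook; the cell below is only a hedged sketch of what such a loader typically does (read the CSV, split off an assumed label column, and label-encode it) and may well differ from the real helper.
In [ ]:
# Hypothetical sketch only; the real load_dataset in audio_results_util may differ.
def load_dataset_sketch(path, label_column='label'):
    df = pd.read_csv(path)
    labels = df[label_column]                       # assumed name of the ground-truth column
    features = df.drop(columns=[label_column])      # remaining columns are the formant features
    encoded = LabelEncoder().fit_transform(labels)  # alphabetical encoding, e.g. 'a' -> 0, 'u' -> 4
    return features, labels, encoded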
In [5]:
# Clustering results computed on the raw features
results_df = pd.read_csv('audio-results4/formants50_muslim_features_vowel.csv')
results_df
Out[5]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.048513 0.041893 0.083258 0.064126 0.072450 0.426571 0.204167 0.124137 0.200395 0.319838 149.881262 1.029625
1 Agglomerative clustering 2 0.022210 -0.011492 0.001947 0.006490 0.002995 0.642583 0.616667 0.214417 0.152127 0.571739 165.248700 0.622850
2 Birch 2 0.043043 0.007998 0.014791 0.035142 0.020820 0.614860 0.587500 0.233866 0.156597 0.557571 219.089888 0.772156
3 DBSCAN 2 0.026844 -0.004256 0.013767 0.039550 0.020424 0.643344 0.612500 0.165860 0.144954 0.437144 37.114055 1.517398
4 Mean-shift 7 0.078358 0.015147 0.055454 0.069608 0.061730 0.584137 0.554167 0.151910 0.161474 0.331027 68.581185 0.936835
5 Optics 1 -0.056810 0.008882 0.014366 0.065985 0.023595 0.632608 0.054167 0.013627 0.154762 -0.255248 3.920693 1.866694
6 Gaussian-mixture 5 0.050896 0.057632 0.098588 0.080278 0.088496 0.449828 0.291667 0.345358 0.302577 0.291894 125.351081 1.160166
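Each row above compares one model's predicted clustering of the 240 samples against the ground-truth vowel labels, alongside internal indices (silhouette, Calinski-Harabasz, Davies-Bouldin) that need only the features and the partition. The CSV itself was written by a separate pipeline; as a minimal sketch, assuming a feature matrix X, the encoded true labels, and one model's predicted labels, most of these columns are consistent with standard scikit-learn metrics (H, C, V, FM reading as homogeneity, completeness, V-measure and Fowlkes-Mallows):
In [ ]:
# Sketch only: how metric columns like these are commonly computed with scikit-learn.
from sklearn import metrics

def clustering_metrics(X, y_true, y_pred):
    h, c, v = metrics.homogeneity_completeness_v_measure(y_true, y_pred)
    return {
        'ARI': metrics.adjusted_rand_score(y_true, y_pred),
        'AMI': metrics.adjusted_mutual_info_score(y_true, y_pred),
        'H': h, 'C': c, 'V': v,
        'FM': metrics.fowlkes_mallows_score(y_true, y_pred),
        # A/R/P presumably require a cluster-to-class assignment first; not reproduced here.
        'silhouette': metrics.silhouette_score(X, y_pred),
        'calinski': metrics.calinski_harabasz_score(X, y_pred),
        'davies': metrics.davies_bouldin_score(X, y_pred),
    }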
In [6]:
# Show the true label distribution, then plot each model's saved clustering on the raw features
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(train_data, 'audio-results4/formants50_muslim_features_vowel.csv')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants50_muslim_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({2: 120, 0: 55, 4: 29, 1: 21, 3: 15})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 218, 1: 22})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 203, 1: 37})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 218, -1: 17, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 226, 0: 14})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 187, 1: 28, 2: 14, 6: 4, 4: 3, 3: 2, 5: 2})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({4: 133, 2: 46, 0: 30, 3: 19, 1: 12})
 2D representation
 3D representation
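Each model above is visualised twice, once in 2D and once in 3D, by plotAllClusterModels from audio_results_util. As a stand-alone sketch (not the helper's actual implementation), an equivalent 3D view of one model's saved labels can be drawn with the Plotly imports already loaded above:
In [ ]:
# Sketch of a single 3D cluster plot; plotAllClusterModels in audio_results_util may differ.
def plot_clusters_3d(features, labels, title='Predicted clusters'):
    df = pd.DataFrame(np.asarray(features)[:, :3], columns=['dim_1', 'dim_2', 'dim_3'])
    df['cluster'] = pd.Series(labels).astype(str)  # categorical colours, including -1 for noise points
    px.scatter_3d(df, x='dim_1', y='dim_2', z='dim_3', color='cluster', title=title).show()

# Example using the K-Means labels saved by the pipeline (path printed above)
kmeans_labels = np.load('audio-results4/formants50_muslim_features_vowel.csv_kmeans_labels.npy')
plot_clusters_3d(train_data, kmeans_labels, 'K-Means clusters on the raw features')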

PCA transformed data

In [7]:
# Clustering results computed on the PCA-transformed features
results_df = pd.read_csv('audio-results4/formants50_muslim_features_vowel.csv-pca.csv')
results_df
Out[7]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.048513 0.041893 0.083258 0.064126 0.072450 0.426571 0.212500 0.151879 0.218022 0.319838 149.881262 1.029625
1 Agglomerative clustering 2 0.022210 -0.011492 0.001947 0.006490 0.002995 0.642583 0.616667 0.214417 0.152127 0.571739 165.248700 0.622850
2 Birch 2 0.043043 0.007998 0.014791 0.035142 0.020820 0.614860 0.587500 0.233866 0.156597 0.557571 219.089888 0.772156
3 DBSCAN 2 0.026844 -0.004256 0.013767 0.039550 0.020424 0.643344 0.612500 0.165860 0.144954 0.437144 37.114055 1.517398
4 Mean-shift 7 0.078358 0.015147 0.055454 0.069608 0.061730 0.584137 0.550000 0.148870 0.113855 0.331027 68.581185 0.936835
5 Optics 1 -0.056810 0.008882 0.014366 0.065985 0.023595 0.632608 0.054167 0.013627 0.154762 -0.255248 3.920693 1.866694
6 Gaussian-mixture 5 0.018016 0.038901 0.083018 0.057383 0.067860 0.363016 0.191667 0.250025 0.207448 0.202305 126.850313 1.269497
In [8]:
# Project the raw features onto 3 principal components, then re-plot the saved clusterings
pca_transformed = PCA(n_components=3).fit_transform(train_data)
pca_transformed = pd.DataFrame(pca_transformed)
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(pca_transformed, 'audio-results4/formants50_muslim_features_vowel.csv-pca')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants50_muslim_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({3: 120, 0: 55, 1: 29, 4: 21, 2: 15})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 218, 1: 22})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 203, 1: 37})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 218, -1: 17, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 226, 0: 14})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 187, 1: 28, 2: 14, 6: 4, 5: 3, 3: 2, 4: 2})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({4: 94, 1: 59, 3: 31, 2: 29, 0: 27})
 2D representation
 3D representation
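Since train_data has only three feature columns (its shape is (240, 3)), a 3-component PCA is just an orthogonal rotation and discards nothing, which is why the metric table above is largely unchanged from the raw-feature one. A quick check of the explained variance makes this explicit (sketch, reusing the data already in memory):
In [ ]:
# With as many components as input features, PCA keeps all of the variance.
pca = PCA(n_components=3).fit(train_data)
print(pca.explained_variance_ratio_)        # per-component share of the total variance
print(pca.explained_variance_ratio_.sum())  # ~1.0 because no component is discarded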

t-SNE transformed data

In [9]:
# Clustering results computed on the t-SNE-transformed features
results_df = pd.read_csv('audio-results4/formants50_muslim_features_vowel.csv-tsne.csv')
results_df
Out[9]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.011528 2.884120e-02 0.072625 0.046198 0.056473 0.318220 0.229167 0.227751 0.233241 0.266684 97.786660 1.094468
1 Agglomerative clustering 240 0.000000 -6.363593e-14 1.000000 0.186350 0.314157 0.000000 0.004167 0.000026 0.004167 -1.000000 -1.000000 -1.000000
2 Birch 240 0.000000 -6.363593e-14 1.000000 0.186350 0.314157 0.000000 0.004167 0.000026 0.004167 -1.000000 -1.000000 -1.000000
3 DBSCAN 4 -0.010740 -7.105174e-03 0.027172 0.022980 0.024901 0.408204 0.345833 0.130332 0.138530 -0.016419 18.672404 2.901425
4 Mean-shift 1 0.000000 -5.797594e-17 0.000000 1.000000 0.000000 0.694318 0.662500 0.200000 0.132500 -1.000000 -1.000000 -1.000000
5 Optics 1 0.025424 -3.246306e-03 0.006752 0.039813 0.011546 0.675729 0.020833 0.005241 0.083333 0.047206 11.269141 1.159755
6 Gaussian-mixture 5 0.012665 4.588086e-02 0.093089 0.060748 0.073519 0.331495 0.225000 0.300540 0.211597 0.241247 92.003950 1.087626
In [10]:
# Embed the raw features into 3 t-SNE dimensions, then re-plot the saved clusterings
tsne_transformed = TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(tsne_transformed, 'audio-results4/formants50_muslim_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants50_muslim_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 53, 3: 51, 4: 49, 0: 46, 2: 41})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 120, -1: 72, 2: 31, 3: 10, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 230, 0: 10})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 240})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({3: 72, 0: 55, 2: 44, 1: 39, 4: 30})
 2D representation
 3D representation
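Two things stand out on the t-SNE embedding: Agglomerative and Birch degenerate into 240 singleton clusters and Mean-shift collapses to a single cluster, so the -1.0 silhouette/calinski/davies entries in the table are presumably sentinel values written by the pipeline (those indices are undefined for such partitions). The embedding is also stochastic, since the cell above does not fix random_state, so re-running it changes the coordinates. A reproducible variant would pin the seed (sketch; the perplexity value is illustrative):
In [ ]:
# Sketch: reproducible t-SNE embedding of the formant features; perplexity is worth tuning.
tsne_transformed = TSNE(n_components=3, perplexity=30, random_state=42, n_jobs=-1).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)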

UMAP transformed data

In [11]:
# Clustering results computed on the UMAP-transformed features
results_df = pd.read_csv('audio-results4/formants50_muslim_features_vowel.csv-umap.csv')
results_df
Out[11]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.018047 0.033397 0.078020 0.050239 0.061121 0.328839 0.204167 0.119003 0.178831 0.449631 304.470679 0.826642
1 Agglomerative clustering 3 0.029040 0.047016 0.064075 0.063687 0.063880 0.445476 0.370833 0.196710 0.154805 0.477222 268.427938 0.737895
2 Birch 10 0.007070 0.023808 0.119353 0.054676 0.074996 0.239191 0.125000 0.064661 0.101938 0.461275 327.313938 0.778165
3 DBSCAN 2 0.037417 -0.009348 0.010741 0.036113 0.016558 0.660756 0.029167 0.019110 0.123810 0.306742 34.236158 0.522905
4 Mean-shift 4 0.039161 0.038332 0.069746 0.054815 0.061385 0.402115 0.337500 0.276512 0.188277 0.450701 267.691515 0.781745
5 Optics 4 0.041507 0.048792 0.094504 0.065878 0.077637 0.384547 0.079167 0.031688 0.108499 0.368523 208.218761 0.811797
6 Gaussian-mixture 5 0.020272 0.036581 0.081682 0.053183 0.064421 0.335761 0.216667 0.153136 0.198687 0.443141 294.907584 0.829147
In [12]:
# Embed the raw features into 3 UMAP dimensions, then re-plot the saved clusterings
umap_transformed = umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
umap_transformed = pd.DataFrame(umap_transformed)
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(umap_transformed, 'audio-results4/formants50_muslim_features_vowel.csv-umap')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants50_muslim_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 61, 2: 55, 1: 47, 3: 46, 4: 31})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 124, 2: 59, 1: 57})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({4: 40, 5: 40, 1: 29, 0: 27, 6: 20, 8: 20, 3: 19, 2: 18, 9: 16, 7: 11})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({-1: 223, 0: 10, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 100, 3: 52, 1: 33, 0: 29, 2: 26})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 103, 1: 58, 2: 42, 3: 37})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({3: 69, 0: 55, 2: 45, 4: 43, 1: 28})
 2D representation
 3D representation
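Finally, the raw, PCA, t-SNE and UMAP runs can be compared side by side by concatenating the four result files read above into one frame (sketch, using this notebook's file names):
In [ ]:
# Sketch: gather the four metric tables written by the pipeline into a single comparison frame.
base = 'audio-results4/formants50_muslim_features_vowel.csv'
result_files = {'raw': base, 'pca': base + '-pca.csv', 'tsne': base + '-tsne.csv', 'umap': base + '-umap.csv'}

comparison = pd.concat(
    {name: pd.read_csv(path, index_col=0) for name, path in result_files.items()},
    names=['transform', 'model'],
)
comparison.sort_values('ARI', ascending=False).head(10)  # best external agreement across all runs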