In [16]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [6]:
# Load the vowel formant features together with the ground-truth labels
# (both the raw label values and their encoded form), as returned by the
# project helper `load_dataset`.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/formants80_muslim_features_vowel.csv'
)
In [7]:
# Sanity check: show the dimensions of the loaded feature table.
train_data.shape
Out[7]:
(240, 3)
In [8]:
# Previously computed clustering metrics for the raw (untransformed) features;
# one row per clustering algorithm.
results_df = pd.read_csv(
    'audio-results4/formants80_muslim_features_vowel.csv'
)
results_df
Out[8]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.059826 0.065694 0.112848 0.080485 0.093958 0.400436 0.216667 0.198545 0.185099 0.302860 181.966261 1.056784
1 Agglomerative clustering 2 0.072273 0.006873 0.014056 0.028689 0.018868 0.603979 0.570833 0.228834 0.155208 0.550014 260.301893 0.789638
2 Birch 2 0.036015 0.003566 0.011694 0.029829 0.016802 0.621153 0.591667 0.206870 0.147387 0.579029 251.048818 0.666809
3 DBSCAN 2 0.101247 0.018323 0.031381 0.052691 0.039335 0.604744 0.558333 0.164006 0.164640 0.385162 119.695486 1.271244
4 Mean-shift 4 0.067873 0.027279 0.044300 0.067481 0.053486 0.584220 0.558333 0.239187 0.163609 0.462290 133.046046 0.776184
5 Optics 1 0.082848 0.009862 0.016232 0.033927 0.021958 0.612689 0.562500 0.141509 0.115979 0.546460 231.737633 0.829549
6 Gaussian-mixture 5 0.077262 0.042138 0.078316 0.075182 0.076717 0.524005 0.091667 0.186065 0.192215 0.321862 151.203624 1.000535
In [9]:
# Show the ground-truth label distribution on the raw features first ...
actualDistribution(train_data, true_labels, true_encoded_labels)

# ... then draw every clustering model's predicted labels on the same data.
# NOTE(review): the second argument looks like a prefix for stored label files
# (the output prints `<prefix>_kmeans_labels.npy`) -- confirm in audio_results_util.
plotAllClusterModels(
    train_data,
    'audio-results4/formants80_muslim_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants80_muslim_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 97, 0: 59, 4: 40, 2: 31, 3: 13})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 192, 1: 48})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 207, 1: 33})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 188, -1: 45, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({0: 194, -1: 46})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 187, 1: 39, 2: 13, 3: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 163, 2: 26, 4: 20, 3: 17, 0: 14})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [10]:
# Previously computed clustering metrics for the PCA-transformed features;
# one row per clustering algorithm.
results_df = pd.read_csv(
    'audio-results4/formants80_muslim_features_vowel.csv-pca.csv'
)
results_df
Out[10]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.063629 0.065921 0.112852 0.081009 0.094315 0.406494 0.416667 0.380837 0.270615 0.305898 181.980961 1.056147
1 Agglomerative clustering 2 0.072273 0.006873 0.014056 0.028689 0.018868 0.603979 0.570833 0.228834 0.155208 0.550014 260.301893 0.789638
2 Birch 2 0.036015 0.003566 0.011694 0.029829 0.016802 0.621153 0.591667 0.206870 0.147387 0.579029 251.048818 0.666809
3 DBSCAN 2 0.101247 0.018323 0.031381 0.052691 0.039335 0.604744 0.558333 0.164006 0.164640 0.385162 119.695486 1.271244
4 Mean-shift 4 0.067873 0.027279 0.044300 0.067481 0.053486 0.584220 0.558333 0.239187 0.163609 0.462290 133.046046 0.776184
5 Optics 1 0.082848 0.009862 0.016232 0.033927 0.021958 0.612689 0.562500 0.141509 0.115979 0.546460 231.737633 0.829549
6 Gaussian-mixture 5 0.036087 0.006630 0.039309 0.045323 0.042102 0.536128 0.116667 0.168356 0.196482 0.337532 131.335963 0.813020
In [11]:
# Project the features onto 3 principal components and wrap the result in a
# DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# Ground-truth labels on the PCA projection, followed by every clustering
# model's predicted labels on the same projection.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formants80_muslim_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants80_muslim_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 101, 4: 55, 3: 40, 1: 31, 2: 13})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 192, 1: 48})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 207, 1: 33})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 188, -1: 45, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({0: 194, -1: 46})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 187, 1: 39, 2: 13, 3: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 175, 0: 35, 4: 17, 3: 10, 2: 3})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [12]:
# Previously computed clustering metrics for the t-SNE-transformed features;
# one row per clustering algorithm.
results_df = pd.read_csv(
    'audio-results4/formants80_muslim_features_vowel.csv-tsne.csv'
)
results_df
Out[12]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.002387 3.271196e-02 0.077433 0.049316 0.060256 0.311554 0.229167 0.294496 0.224738 0.273674 106.049626 1.059468
1 Agglomerative clustering 240 0.000000 -6.363593e-14 1.000000 0.186350 0.314157 0.000000 0.004167 0.000026 0.004167 -1.000000 -1.000000 -1.000000
2 Birch 240 0.000000 -6.353634e-14 1.000000 0.186350 0.314157 0.000000 0.000000 0.000000 0.000000 -1.000000 -1.000000 -1.000000
3 DBSCAN 2 -0.017103 -3.997084e-03 0.013819 0.022169 0.017025 0.540644 0.504167 0.138607 0.133099 -0.023227 5.519560 6.227439
4 Mean-shift 1 0.000000 -5.797594e-17 0.000000 1.000000 0.000000 0.694318 0.662500 0.200000 0.132500 -1.000000 -1.000000 -1.000000
5 Optics 2 0.040605 3.591035e-02 0.044205 0.091652 0.059643 0.619557 0.058333 0.026447 0.121154 -0.004790 21.457305 1.287757
6 Gaussian-mixture 5 0.013533 3.776722e-02 0.083214 0.054162 0.065616 0.331573 0.141667 0.146105 0.176188 0.250544 94.235123 1.101692
In [13]:
# Embed the features into 3 dimensions with t-SNE and wrap the result in a
# DataFrame before handing it to the plotting helpers.
# NOTE(review): no random_state is set, so the embedding differs between runs
# and may not match the embedding the stored t-SNE labels were computed on.
tsne_transformed=TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed=pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed,true_labels,true_encoded_labels)

# Fix: plot the t-SNE cluster labels on the t-SNE embedding. The original
# passed `train_data`, so the '-tsne' labels were drawn on the raw features
# and the embedding computed above was never visualised. The PCA and UMAP
# cells pass their own embeddings in the same position.
plotAllClusterModels(tsne_transformed,'audio-results4/formants80_muslim_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants80_muslim_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({2: 53, 0: 52, 1: 50, 4: 46, 3: 39})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 183, -1: 50, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 207, 0: 20, 1: 13})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 240})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({2: 74, 3: 52, 1: 40, 0: 38, 4: 36})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [14]:
# Previously computed clustering metrics for the UMAP-transformed features;
# one row per clustering algorithm.
results_df = pd.read_csv(
    'audio-results4/formants80_muslim_features_vowel.csv-umap.csv'
)
results_df
Out[14]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.009803 0.049750 0.098521 0.063096 0.076926 0.320416 0.287500 0.408619 0.252880 0.500047 418.396268 0.709299
1 Agglomerative clustering 2 0.072273 0.006873 0.014056 0.028689 0.018868 0.603979 0.570833 0.228834 0.155208 0.528574 255.159879 0.593442
2 Birch 10 0.004649 0.058970 0.171740 0.079170 0.108379 0.239862 0.083333 0.015576 0.090026 0.483613 442.082082 0.692031
3 DBSCAN 12 0.008579 0.048946 0.182864 0.078191 0.109542 0.231820 0.100000 0.022809 0.073013 0.333634 181.968722 1.344627
4 Mean-shift 3 0.025176 0.008644 0.026304 0.025466 0.025878 0.430048 0.320833 0.223996 0.172917 0.494167 365.263785 0.764438
5 Optics 3 0.040365 0.058506 0.093007 0.071143 0.080619 0.389999 0.295833 0.133285 0.145031 0.477824 359.889853 0.739270
6 Gaussian-mixture 5 0.012505 0.055137 0.104751 0.067887 0.082383 0.328316 0.304167 0.396526 0.254884 0.493614 407.480931 0.715743
In [15]:
# Embed the features into 3 dimensions with UMAP (seeded for repeatability)
# and wrap the result in a DataFrame before handing it to the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(n_components=3, random_state=42).fit_transform(train_data)
)

# Ground-truth labels on the UMAP embedding, followed by every clustering
# model's predicted labels on the same embedding.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formants80_muslim_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants80_muslim_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 64, 1: 47, 2: 47, 3: 42, 4: 40})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 192, 1: 48})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({7: 46, 1: 35, 5: 33, 0: 23, 3: 21, 2: 20, 6: 20, 4: 17, 9: 15, 8: 10})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({1: 48, 9: 31, 0: 25, 4: 24, 3: 21, 6: 19, -1: 18, 7: 15, 8: 9, 10: 9, 5: 8, 2: 7, 11: 6})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({0: 88, 1: 67, -1: 47, 2: 38})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 96, 1: 96, 2: 48})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 71, 1: 47, 2: 46, 3: 44, 4: 32})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: