In [12]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [2]:
# Load the vowel formant feature table along with the raw labels and their
# encoded counterparts (both label sets are summarised by actualDistribution below).
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/formants20_muslim_features_vowel.csv'
)
In [3]:
# Quick sanity check of the loaded feature table: (rows, columns).
train_data.shape
Out[3]:
(240, 3)
In [4]:
# Saved clustering metrics (one row per algorithm) for the untransformed features.
results_df = pd.read_csv(
    'audio-results4/formants20_muslim_features_vowel.csv'
)
results_df
Out[4]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.017895 0.026241 0.066172 0.048101 0.055708 0.376684 0.250000 0.259089 0.261745 0.266682 148.746166 1.114827
1 Agglomerative clustering 2 0.012197 -0.008123 0.002728 0.005128 0.003561 0.561311 0.191667 0.199129 0.139441 0.508585 230.897279 0.887124
2 Birch 2 0.039956 -0.005412 0.005178 0.011249 0.007091 0.600220 0.570833 0.214707 0.149994 0.556635 244.747876 0.786932
3 DBSCAN 3 0.034700 -0.004843 0.020000 0.031363 0.024425 0.574914 0.537500 0.146992 0.146208 0.397362 67.164817 1.412177
4 Mean-shift 2 0.002222 -0.003092 0.007280 0.021677 0.010900 0.624251 0.595833 0.194001 0.140403 0.548099 166.874264 0.716054
5 Optics 1 -0.036974 -0.001750 0.008086 0.033718 0.013044 0.634874 0.054167 0.013627 0.135417 -0.223862 6.109066 1.574688
6 Gaussian-mixture 5 0.054295 0.007453 0.039349 0.042325 0.040782 0.523176 0.070833 0.216100 0.289949 0.285643 106.891797 1.040694
In [5]:
# Show how the ground-truth labels are distributed over the original features.
actualDistribution(
    train_data,
    true_labels,
    true_encoded_labels,
)
# Then render every clustering model's result, keyed by the saved-results prefix.
plotAllClusterModels(
    train_data,
    'audio-results4/formants20_muslim_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants20_muslim_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({3: 98, 0: 61, 2: 43, 1: 28, 4: 10})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 184, 0: 56})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 197, 1: 43})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 189, -1: 41, 2: 5, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 224, 0: 16})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 214, 1: 26})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({2: 165, 3: 40, 4: 24, 1: 9, 0: 2})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [6]:
# Saved clustering metrics (one row per algorithm) for the PCA-transformed features.
results_df = pd.read_csv(
    'audio-results4/formants20_muslim_features_vowel.csv-pca.csv'
)
results_df
Out[6]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.019168 0.026256 0.066157 0.048211 0.055776 0.378788 0.320833 0.256731 0.240191 0.266489 148.769420 1.117080
1 Agglomerative clustering 2 0.012197 -0.008123 0.002728 0.005128 0.003561 0.561311 0.191667 0.199129 0.139441 0.508585 230.897279 0.887124
2 Birch 2 0.039956 -0.005412 0.005178 0.011249 0.007091 0.600220 0.570833 0.214707 0.149994 0.556635 244.747876 0.786932
3 DBSCAN 3 0.034700 -0.004843 0.020000 0.031363 0.024425 0.574914 0.537500 0.146992 0.146208 0.397362 67.164817 1.412177
4 Mean-shift 2 0.002222 -0.003092 0.007280 0.021677 0.010900 0.624251 0.595833 0.194001 0.140403 0.548099 166.874264 0.716054
5 Optics 1 -0.032442 -0.003105 0.007223 0.031552 0.011755 0.639410 0.050000 0.012579 0.133333 -0.225391 5.933781 1.536939
6 Gaussian-mixture 5 0.023260 0.011573 0.049123 0.035165 0.040988 0.370522 0.250000 0.167711 0.198232 0.218873 134.761986 1.247747
In [7]:
# Project the features onto 3 principal components and wrap the result in a
# DataFrame, which is the form the plotting helpers are given elsewhere in this notebook.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)
# Ground-truth label distribution in PCA space.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
# Clustering results in PCA space, keyed by the "-pca" results prefix.
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formants20_muslim_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants20_muslim_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 99, 3: 61, 2: 42, 1: 28, 4: 10})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 184, 0: 56})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 197, 1: 43})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 189, -1: 41, 2: 5, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 225, 0: 15})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 214, 1: 26})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 82, 1: 73, 4: 47, 3: 29, 2: 9})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [8]:
# Saved clustering metrics (one row per algorithm) for the t-SNE-transformed features.
results_df = pd.read_csv(
    'audio-results4/formants20_muslim_features_vowel.csv-tsne.csv'
)
results_df
Out[8]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.005263 2.094116e-02 0.062772 0.039875 0.048770 0.312591 0.195833 0.187951 0.194391 0.271981 99.446961 1.099325
1 Agglomerative clustering 240 0.000000 -6.363593e-14 1.000000 0.186350 0.314157 0.000000 0.004167 0.000026 0.004167 -1.000000 -1.000000 -1.000000
2 Birch 240 0.000000 -6.363593e-14 1.000000 0.186350 0.314157 0.000000 0.004167 0.000089 0.004167 -1.000000 -1.000000 -1.000000
3 DBSCAN 4 0.016521 1.481703e-02 0.051435 0.041284 0.045804 0.410123 0.333333 0.127187 0.141687 -0.027360 20.155396 3.042213
4 Mean-shift 1 0.000000 -5.797594e-17 0.000000 1.000000 0.000000 0.694318 0.662500 0.200000 0.132500 -1.000000 -1.000000 -1.000000
5 Optics 3 0.010476 -6.291522e-03 0.022195 0.032476 0.026369 0.570627 0.050000 0.024351 0.127404 -0.106643 16.268277 1.419943
6 Gaussian-mixture 5 0.001694 2.374260e-02 0.066278 0.042097 0.051490 0.309707 0.195833 0.169343 0.202166 0.263772 95.132948 1.096758
In [9]:
# Embed the features into 3 t-SNE dimensions. t-SNE is stochastic, so the seed
# is fixed (42, matching the UMAP cell) to make Restart & Run All reproducible.
# NOTE(review): the saved "-tsne" results were presumably produced from an
# embedding computed elsewhere -- confirm it used the same seed/settings.
tsne_transformed = pd.DataFrame(
    TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
)
# Ground-truth label distribution in t-SNE space.
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the clustering results on the t-SNE embedding. The original cell passed
# the raw `train_data` here, so the "-tsne" results were drawn on the wrong
# coordinates; the PCA and UMAP cells both pass their transformed frame.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/formants20_muslim_features_vowel.csv-tsne',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants20_muslim_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({2: 51, 4: 50, 0: 48, 1: 48, 3: 43})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 115, -1: 67, 1: 36, 2: 14, 3: 8})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 194, 2: 17, 0: 16, 1: 13})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 240})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({2: 53, 4: 48, 1: 47, 0: 46, 3: 46})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [10]:
# Saved clustering metrics (one row per algorithm) for the UMAP-transformed features.
results_df = pd.read_csv(
    'audio-results4/formants20_muslim_features_vowel.csv-umap.csv'
)
results_df
Out[10]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.013798 0.031614 0.075956 0.048650 0.059311 0.323594 0.150000 0.146514 0.168669 0.405894 279.138335 0.906312
1 Agglomerative clustering 2 0.032913 0.026302 0.029821 0.047431 0.036619 0.530164 0.466667 0.197388 0.146465 0.469214 285.582464 0.811533
2 Birch 24 0.004837 0.034505 0.253841 0.084278 0.126542 0.156488 0.016667 0.001673 0.025000 0.447317 342.477857 0.686399
3 DBSCAN 16 -0.005733 0.030729 0.174548 0.081375 0.111001 0.317179 0.045833 0.008224 0.054367 0.068717 23.497719 1.079798
4 Mean-shift 4 0.023422 0.020317 0.050046 0.037948 0.043165 0.374042 0.262500 0.258974 0.178383 0.385815 258.018882 0.930333
5 Optics 2 0.000886 0.033350 0.051013 0.049213 0.050097 0.415709 0.183333 0.057894 0.098586 0.401588 246.169706 0.928654
6 Gaussian-mixture 5 0.016942 0.021593 0.063380 0.040957 0.049759 0.330506 0.170833 0.152804 0.178710 0.362955 238.552847 1.003716
In [11]:
# Reduce the features to a 3-component UMAP embedding (seeded with 42) and
# wrap it in a DataFrame before handing it to the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)
# Ground-truth label distribution in UMAP space.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
# Clustering results in UMAP space, keyed by the "-umap" results prefix.
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formants20_muslim_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formants20_muslim_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({2: 64, 4: 49, 3: 44, 1: 43, 0: 40})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 158, 1: 82})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 106, 2: 16, 6: 14, 0: 12, 1: 11, 10: 10, 4: 9, 13: 9, 11: 8, 7: 8, 3: 7, 12: 5, 9: 5, 5: 5, 14: 5, 15: 5, 8: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 110, 0: 75, 1: 55})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 78, 1: 73, 2: 52, 3: 37})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({2: 70, 4: 49, 0: 46, 1: 38, 3: 37})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: