In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [3]:
# Load the zero-crossing vowel features together with their ground-truth
# labels (raw and integer-encoded) via the project helper.
features_csv = 'Audio_features_vowel/zeroCrossings_muslim_features_vowel.csv'
train_data, true_labels, true_encoded_labels = load_dataset(features_csv)
In [4]:
# Sanity check: display the dimensions of the loaded feature data.
train_data.shape
Out[4]:
(240, 11)
In [6]:
# Clustering metrics previously saved for the raw (untransformed) features.
results_csv = 'audio-results4/zeroCrossings_muslim_features_vowel.csv'
results_df = pd.read_csv(results_csv)
results_df
Out[6]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.034789 0.095055 0.125536 0.113688 0.119318 0.430423 0.058333 0.187131 0.164934 0.427100 209.117285 0.675069
1 Agglomerative clustering 4 0.133037 0.102654 0.112038 0.137441 0.123446 0.545469 0.491667 0.289695 0.208976 0.469997 181.838929 0.587481
2 Birch 2 -0.005764 -0.004946 0.001684 0.063718 0.003281 0.689262 0.658333 0.198742 0.132218 0.825172 41.719463 0.123482
3 DBSCAN 2 -0.071036 0.054416 0.066216 0.081753 0.073168 0.454950 0.104167 0.132156 0.166540 0.151756 72.141037 1.798670
4 Mean-shift 3 -0.019714 -0.001400 0.011153 0.079215 0.019553 0.666670 0.637500 0.192453 0.131330 0.671245 70.381289 0.464503
5 Optics 2 0.011517 0.047884 0.063987 0.066648 0.065290 0.453955 0.195833 0.096355 0.153343 0.109595 44.933214 2.327164
6 Gaussian-mixture 5 0.002426 0.068018 0.106263 0.083746 0.093671 0.377460 0.200000 0.208945 0.248951 0.387896 171.361565 0.849198
In [13]:
# Show the ground-truth label distribution, then every saved clustering,
# on the raw feature space.
actualDistribution(train_data, true_labels, true_encoded_labels)

# Second argument is the results path that plotAllClusterModels works from.
raw_results_path = 'audio-results4/zeroCrossings_muslim_features_vowel.csv'
plotAllClusterModels(train_data, raw_results_path)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})
audio-results4/zeroCrossings_muslim_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 97, 3: 93, 2: 46, 0: 3, 4: 1})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 130, 1: 101, 2: 8, 3: 1})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 239, 1: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({1: 155, -1: 68, 0: 17})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 137, 1: 52, 0: 51})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 233, 1: 6, 2: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 84, 4: 83, 2: 52, 0: 20, 3: 1})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [7]:
# Clustering metrics previously saved for the PCA-transformed features.
results_csv = 'audio-results4/zeroCrossings_muslim_features_vowel.csv-pca.csv'
results_df = pd.read_csv(results_csv)
results_df
Out[7]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.018071 0.076355 0.111980 0.093272 0.101774 0.399861 0.279167 0.154910 0.154433 0.459821 271.366623 0.640353
1 Agglomerative clustering 4 0.131724 0.098001 0.108420 0.131944 0.119031 0.544888 0.495833 0.290953 0.204689 0.510783 226.572955 0.534320
2 Birch 2 -0.005764 -0.004946 0.001684 0.063718 0.003281 0.689262 0.658333 0.198742 0.132218 0.833935 44.393129 0.117538
3 DBSCAN 2 0.036996 0.088986 0.107842 0.101267 0.104451 0.426252 0.308333 0.183519 0.152920 0.368736 113.428182 1.163037
4 Mean-shift 3 -0.029894 0.001517 0.013335 0.078728 0.022807 0.657059 0.629167 0.189937 0.130736 0.666933 85.541210 0.472544
5 Optics 2 0.207133 0.075399 0.085931 0.103533 0.093914 0.600901 0.062500 0.062812 0.069425 0.198834 58.161844 0.772040
6 Gaussian-mixture 5 0.012386 0.075180 0.113922 0.091583 0.101538 0.389336 0.083333 0.118910 0.207130 0.448787 150.415479 1.002415
In [8]:
# Reduce the features to their first three principal components and wrap
# the result in a DataFrame before handing it to the plotting helpers.
pca_model = PCA(n_components=3)
pca_transformed = pd.DataFrame(pca_model.fit_transform(train_data))

actualDistribution(pca_transformed, true_labels, true_encoded_labels)

pca_results_path = 'audio-results4/zeroCrossings_muslim_features_vowel.csv-pca'
plotAllClusterModels(pca_transformed, pca_results_path)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})
audio-results4/zeroCrossings_muslim_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 87, 2: 84, 3: 59, 1: 9, 4: 1})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 133, 1: 97, 2: 9, 3: 1})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 239, 1: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({1: 95, 0: 79, -1: 66})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 163, 1: 43, 0: 34})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 231, 1: 8, 2: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({2: 88, 1: 77, 4: 61, 0: 12, 3: 2})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [9]:
# Clustering metrics previously saved for the t-SNE-transformed features.
results_csv = 'audio-results4/zeroCrossings_muslim_features_vowel.csv-tsne.csv'
results_df = pd.read_csv(results_csv)
results_df
Out[9]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.025410 4.262315e-02 0.089860 0.057135 0.069855 0.329026 0.208333 0.209472 0.213633 0.285363 107.854450 1.047740
1 Agglomerative clustering 240 0.000000 -6.353634e-14 1.000000 0.186350 0.314157 0.000000 0.004167 0.000026 0.004167 -1.000000 -1.000000 -1.000000
2 Birch 240 0.000000 -6.373552e-14 1.000000 0.186350 0.314157 0.000000 0.008333 0.000115 0.008333 -1.000000 -1.000000 -1.000000
3 DBSCAN 2 0.027336 1.640557e-02 0.033582 0.035875 0.034691 0.470741 0.412500 0.150863 0.128960 0.098930 24.111838 5.051985
4 Mean-shift 1 0.000000 -5.797594e-17 0.000000 1.000000 0.000000 0.694318 0.662500 0.200000 0.132500 -1.000000 -1.000000 -1.000000
5 Optics 3 0.041119 5.772493e-02 0.078244 0.097051 0.086638 0.551798 0.033333 0.031930 0.063889 -0.076721 21.296228 1.561104
6 Gaussian-mixture 5 0.029073 3.562961e-02 0.081071 0.051655 0.063103 0.332935 0.283333 0.268468 0.255657 0.285062 107.112824 1.040879
In [10]:
# Embed the features into three t-SNE dimensions. t-SNE is stochastic, so the
# seed is pinned to keep the embedding repeatable across Restart & Run All.
tsne_transformed = TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the saved '-tsne' clusterings on the t-SNE embedding itself. The
# original passed the raw `train_data` here, which drew the t-SNE cluster
# labels over the untransformed feature space.
plotAllClusterModels(tsne_transformed, 'audio-results4/zeroCrossings_muslim_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})
audio-results4/zeroCrossings_muslim_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 55, 4: 49, 0: 46, 2: 46, 3: 44})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 141, -1: 59, 1: 40})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 181, 2: 24, 0: 20, 1: 15})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 240})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 58, 2: 49, 1: 47, 4: 44, 3: 42})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [11]:
# Clustering metrics previously saved for the UMAP-transformed features.
results_csv = 'audio-results4/zeroCrossings_muslim_features_vowel.csv-umap.csv'
results_df = pd.read_csv(results_csv)
results_df
Out[11]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.081453 0.109963 0.170720 0.112895 0.135913 0.387853 0.162500 0.130789 0.136009 0.602613 1037.388616 0.574975
1 Agglomerative clustering 3 0.125115 0.101912 0.114159 0.123477 0.118635 0.517602 0.191667 0.128495 0.099592 0.709724 681.718448 0.385095
2 Birch 16 0.033750 0.071076 0.254827 0.096003 0.139465 0.224279 0.104167 0.031891 0.078223 0.660897 3128.183619 0.433536
3 DBSCAN 3 0.125115 0.101912 0.114159 0.123477 0.118635 0.517602 0.450000 0.234736 0.184181 0.709724 681.718448 0.385095
4 Mean-shift 3 0.125115 0.101912 0.114159 0.123477 0.118635 0.517602 0.462500 0.280890 0.196638 0.709724 681.718448 0.385095
5 Optics 12 0.034291 0.091438 0.258225 0.105045 0.149339 0.240354 0.104167 0.037287 0.101560 0.548071 257.152757 1.039962
6 Gaussian-mixture 5 -0.035227 0.095513 0.150388 0.103650 0.122720 0.325645 0.179167 0.201351 0.206579 0.612678 552.207635 0.565666
In [12]:
# Embed the features into three UMAP dimensions (seeded for repeatability)
# and wrap the result in a DataFrame before handing it to the plotting helpers.
umap_reducer = umap.UMAP(random_state=42, n_components=3)
umap_transformed = pd.DataFrame(umap_reducer.fit_transform(train_data))

actualDistribution(umap_transformed, true_labels, true_encoded_labels)

umap_results_path = 'audio-results4/zeroCrossings_muslim_features_vowel.csv-umap'
plotAllClusterModels(umap_transformed, umap_results_path)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'cantdecide': 13, 'misaligned': 13, 'other': 8})
Counter({0: 159, 4: 47, 1: 13, 2: 13, 3: 8})
audio-results4/zeroCrossings_muslim_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({4: 70, 0: 62, 1: 49, 3: 35, 2: 24})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 119, 0: 97, 2: 24})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({2: 24, 1: 22, 0: 21, 8: 21, 5: 19, 3: 17, 4: 16, 13: 16, 6: 14, 7: 12, 14: 12, 9: 11, 11: 11, 12: 10, 15: 8, 10: 6})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 119, 2: 97, 1: 24})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({2: 31, 10: 28, 7: 25, 8: 21, 0: 20, 3: 19, -1: 16, 11: 16, 1: 16, 5: 14, 4: 13, 6: 11, 9: 10})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 119, 1: 97, 2: 24})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 97, 3: 49, 0: 37, 4: 33, 2: 24})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: