In [2]:
# import the libraries used throughout the notebook
import os
import time
from collections import Counter

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import sklearn
from sklearn import mixture
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, normalize
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

from audio_results_util import load_dataset, plotClusters, encodedLabels, plotData3d, plotData2d, plotAllClusterModels, actualDistribution
%matplotlib inline
In [6]:
train_data,true_labels,true_encoded_labels=load_dataset('Audio_features_vowel/zeroCrossings_ideology_features_vowel.csv')
In [7]:
train_data.shape
Out[7]:
(2696, 27)
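For context, load_dataset lives in audio_results_util and is not shown here; below is a hypothetical sketch of a loader with the same return signature. The name load_dataset_sketch and the 'label' column name are assumptions, not the actual implementation.

# Hypothetical sketch only -- the real load_dataset is in audio_results_util and
# may differ; assumes the CSV holds numeric feature columns plus a string
# label column named 'label' (values such as 'ai' / 'ee').
def load_dataset_sketch(path, label_col='label'):
    df = pd.read_csv(path)
    true_labels = df[label_col]
    train_data = df.drop(columns=[label_col])
    true_encoded_labels = LabelEncoder().fit_transform(true_labels)  # 'ai'/'ee' -> 0/1
    return train_data, true_labels, true_encoded_labels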
In [8]:
results_df=pd.read_csv('audio-results4/zeroCrossings_ideology_features_vowel.csv')
results_df
Out[8]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.087866 0.053130 0.069122 0.043697 0.053545 0.711414 0.683976 0.384058 0.437182 0.457585 1426.486979 1.129534
1 Agglomerative clustering 2 -0.000656 -0.000495 0.000121 0.012092 0.000240 0.903332 0.897626 0.499793 0.448980 0.814159 40.897066 0.132077
2 Birch 11 -0.096522 0.038108 0.058607 0.032551 0.041855 0.728084 0.716246 0.072509 0.079649 0.430165 284.776141 1.038104
3 DBSCAN 2 -0.038316 0.007783 0.006918 0.014979 0.009465 0.870730 0.866840 0.321768 0.298353 0.623808 305.108284 1.337551
4 Mean-shift 8 -0.034165 0.005626 0.007978 0.015925 0.010630 0.874800 0.871291 0.121685 0.125826 0.560120 149.286203 0.840411
5 Optics 6 0.203122 0.052002 0.060116 0.051057 0.055217 0.870859 0.011869 0.011558 0.126190 -0.036987 32.583290 0.879089
6 Gaussian-mixture 2 -0.080888 0.028416 0.028544 0.029418 0.028974 0.805755 0.198813 0.552292 0.554408 0.500673 962.586876 1.090444
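In this table H, C and V are presumably homogeneity, completeness and V-measure, FM the Fowlkes-Mallows score, and A, R, P accuracy, recall and precision; silhouette, calinski and davies are the internal silhouette, Calinski-Harabasz and Davies-Bouldin scores. The script that wrote this CSV is not shown, but a hedged sketch of how such scores could be computed for one model with scikit-learn:

# Hedged sketch (the pipeline that produced the audio-results4 CSVs is not shown):
# external scores compare predicted labels with the true encoded labels,
# internal scores use only the feature matrix and the predicted labels.
from sklearn import metrics
from sklearn.cluster import KMeans

pred = KMeans(n_clusters=2, random_state=42).fit_predict(train_data)

ari = metrics.adjusted_rand_score(true_encoded_labels, pred)
ami = metrics.adjusted_mutual_info_score(true_encoded_labels, pred)
h, c, v = metrics.homogeneity_completeness_v_measure(true_encoded_labels, pred)
fm = metrics.fowlkes_mallows_score(true_encoded_labels, pred)

sil = metrics.silhouette_score(train_data, pred)
ch = metrics.calinski_harabasz_score(train_data, pred)
db = metrics.davies_bouldin_score(train_data, pred)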
In [9]:
actualDistribution(train_data,true_labels,true_encoded_labels)
plotAllClusterModels(train_data,'audio-results4/zeroCrossings_ideology_features_vowel.csv')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/zeroCrossings_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2115, 1: 581})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2695, 1: 1})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2204, 5: 423, 3: 28, 6: 14, 2: 10, 8: 6, 10: 3, 7: 3, 1: 2, 9: 2, 4: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2611, -1: 74, 1: 11})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2498, 1: 63, 2: 35, 4: 31, 3: 27, 5: 22, 0: 20})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2622, 4: 35, 6: 15, 2: 11, 1: 9, 5: 2, 3: 1, 7: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2433, 0: 263})
 2D representation
 3D representation
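In the DBSCAN and OPTICS tallies above, the key -1 is scikit-learn's noise label rather than a cluster. A small, illustrative check on the raw features (eps and min_samples below are the scikit-learn defaults, not necessarily the values behind the saved results):

from sklearn.cluster import DBSCAN

# Illustrative only: refit DBSCAN with default parameters and separate noise
# points (label -1) from clustered points; the saved results above may have
# used different eps / min_samples.
db_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(train_data)
n_noise = int(np.sum(db_labels == -1))
n_clusters = len(set(db_labels)) - (1 if -1 in db_labels else 0)
print(Counter(db_labels), '| noise points:', n_noise, '| clusters:', n_clusters)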
PCA transformed data

In [10]:
results_df=pd.read_csv('audio-results4/zeroCrossings_ideology_features_vowel.csv-pca.csv')
results_df
Out[10]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.088380 0.052080 0.067308 0.043030 0.052498 0.714391 0.688427 0.386536 0.437583 0.501752 1919.594521 0.948633
1 Agglomerative clustering 2 -0.084910 0.030914 0.033375 0.029700 0.031430 0.784437 0.777819 0.436309 0.445405 0.544210 1682.935844 0.874977
2 Birch 7 -0.098538 0.045066 0.077507 0.033847 0.047118 0.678157 0.660237 0.105033 0.123921 0.421883 717.476759 0.895170
3 DBSCAN 2 -0.022742 0.004286 0.004074 0.012807 0.006182 0.884403 0.879822 0.326587 0.298816 0.687122 424.240946 0.732047
4 Mean-shift 2 -0.021551 0.003522 0.002826 0.010417 0.004446 0.885290 0.880935 0.492111 0.458680 0.702601 739.295353 0.583547
5 Optics 4 0.160569 0.036486 0.039380 0.038096 0.038727 0.868507 0.018546 0.014445 0.228964 -0.112342 48.312597 0.866169
6 Gaussian-mixture 2 -0.037332 0.011902 0.008722 0.023599 0.012737 0.875672 0.871662 0.485337 0.447619 0.652847 817.102329 0.633294
In [11]:
pca_transformed=PCA(n_components=3).fit_transform(train_data)
pca_transformed=pd.DataFrame(pca_transformed)
actualDistribution(pca_transformed,true_labels,true_encoded_labels)
plotAllClusterModels(pca_transformed,'audio-results4/zeroCrossings_ideology_features_vowel.csv-pca')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/zeroCrossings_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2127, 1: 569})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2368, 1: 328})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2052, 3: 494, 6: 59, 4: 44, 1: 30, 5: 14, 2: 3})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2646, 1: 30, -1: 20})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2514, 3: 51, 0: 44, 2: 44, 1: 43})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2648, 1: 48})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2625, 1: 71})
 2D representation
 3D representation
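Before clustering in the 3-component PCA space it is worth checking how much of the original variance those components retain; a quick check (not part of the original pipeline):

# Quick sanity check (not in the original pipeline): variance retained by the
# three principal components used above.
pca = PCA(n_components=3).fit(train_data)
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum())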
t-SNE transformed data

In [12]:
results_df=pd.read_csv('audio-results4/zeroCrossings_ideology_features_vowel.csv-tsne.csv')
results_df
Out[12]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.002430 0.000406 0.001191 0.000570 0.000771 0.640898 0.470326 5.229635e-01 0.508491 0.362526 1756.137341 1.184806
1 Agglomerative clustering 362 0.000171 0.021628 0.420727 0.023902 0.045234 0.050183 0.003709 1.141029e-05 0.002762 0.458357 3360.063330 0.692749
2 Birch 825 -0.000070 0.009810 0.512669 0.025926 0.049357 0.032849 0.000371 5.006696e-07 0.001212 0.412155 3883.765179 0.582445
3 DBSCAN 69 -0.000384 0.036445 0.258398 0.023757 0.043514 0.214221 0.040059 6.372809e-04 0.013534 0.119838 146.769505 1.280336
4 Mean-shift 4 -0.006244 0.000251 0.002023 0.000637 0.000969 0.547575 0.460682 1.814374e-01 0.243916 0.325934 1023.694548 0.977184
5 Optics 81 0.004748 0.028695 0.195025 0.021564 0.038833 0.431493 0.004080 1.340268e-04 0.012852 -0.087714 38.726297 1.118296
6 Gaussian-mixture 2 -0.001546 0.000525 0.001377 0.000656 0.000889 0.639468 0.514837 4.752221e-01 0.490889 0.359058 1743.108443 1.189005
In [13]:
tsne_transformed=TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed=pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed,true_labels,true_encoded_labels)

plotAllClusterModels(tsne_transformed,'audio-results4/zeroCrossings_ideology_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/zeroCrossings_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1477, 0: 1219})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1325, 1: 749, 2: 621, 3: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1429, 1: 1267})
 2D representation
 3D representation
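Only n_components and n_jobs are set in the t-SNE call above, so the embedding uses scikit-learn's default perplexity and initialization and is not seeded. If the embedding (and the many tiny clusters found on it) looks unstable, those settings are worth varying; the values below are illustrative, not the ones behind the saved results:

# Illustrative only: explicit perplexity, PCA initialization and a fixed seed
# make the 3-D t-SNE embedding reproducible; not the settings used for the
# saved results above.
tsne_alt = TSNE(n_components=3, perplexity=30, init='pca',
                random_state=42, n_jobs=-1).fit_transform(train_data)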
UMAP transformed data

In [14]:
results_df=pd.read_csv('audio-results4/zeroCrossings_ideology_features_vowel.csv-umap.csv')
results_df
Out[14]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.028382 0.073114 0.111511 0.054769 0.073459 0.642391 0.487018 0.693423 0.574161 0.466055 2590.346826 0.829096
1 Agglomerative clustering 8 0.021587 0.055022 0.183809 0.033162 0.056188 0.400293 0.081231 0.046764 0.106091 0.570543 6351.737760 0.489970
2 Birch 81 0.002606 0.035508 0.296291 0.022758 0.042270 0.119111 0.013353 0.000741 0.013433 0.526307 21337.079013 0.618640
3 DBSCAN 32 0.013210 0.043813 0.249044 0.026125 0.047289 0.237126 0.054525 0.002621 0.025990 0.463507 3239.947420 1.331013
4 Mean-shift 5 0.065216 0.073069 0.183362 0.046290 0.073919 0.548573 0.422478 0.096027 0.187010 0.578662 4087.374049 0.513710
5 Optics 108 0.001817 0.026521 0.237720 0.020150 0.037151 0.272475 0.004080 0.000101 0.009276 0.204709 66.290296 1.288578
6 Gaussian-mixture 2 -0.028382 0.073114 0.111511 0.054769 0.073459 0.642391 0.487018 0.693423 0.574161 0.466055 2590.346826 0.829096
In [15]:
umap_transformed= umap.UMAP(random_state=42,n_components=3).fit_transform(train_data)
umap_transformed=pd.DataFrame(umap_transformed)
actualDistribution(umap_transformed,true_labels,true_encoded_labels)
plotAllClusterModels(umap_transformed,'audio-results4/zeroCrossings_ideology_features_vowel.csv-umap')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/zeroCrossings_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1632, 0: 1064})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({5: 685, 2: 543, 4: 449, 1: 426, 6: 312, 0: 204, 3: 49, 7: 28})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1220, 1: 769, 2: 426, 3: 204, 4: 77})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1632, 0: 1064})
 2D representation
 3D representation
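On the UMAP embedding the K-Means and Gaussian-mixture rows of the table are identical and their label tallies above match exactly. A quick agreement check (refitting both models, so the numbers may differ slightly from the saved runs):

# Refit K-Means and a 2-component Gaussian mixture on the UMAP embedding and
# measure how closely their labelings agree (ARI of 1.0 means identical
# partitions up to label permutation).
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score

km_labels = KMeans(n_clusters=2, random_state=42).fit_predict(umap_transformed)
gm_labels = GaussianMixture(n_components=2, random_state=42).fit_predict(umap_transformed)
print('ARI between K-Means and Gaussian-mixture labels:', adjusted_rand_score(km_labels, gm_labels))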