In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [7]:
# Load the vowel-level log-Mel-spectrogram features; load_dataset returns the
# feature table plus two label collections (presumably raw and encoded labels,
# going by the variable names).
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/logMelSpec_ideology_features_vowel.csv'
)
In [8]:
# Sanity check: display the dimensions of the loaded feature table.
train_data.shape
Out[8]:
(2696, 128)
In [9]:
# Read the stored clustering metrics for the untransformed features and
# display the table (one row per clustering model).
results_df = pd.read_csv(
    'audio-results4/logMelSpec_ideology_features_vowel.csv'
)
results_df
Out[9]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.042665 0.019960 0.030561 0.015228 0.020328 0.674371 0.642433 0.615568 0.545202 0.181783 661.977328 1.897175
1 Agglomerative clustering 2 0.015824 0.006879 0.004178 0.101822 0.008027 0.903687 0.898368 0.505041 0.749461 0.607071 57.835137 0.494631
2 Birch 2453 0.000099 0.006598 1.000000 0.042632 0.081778 0.016471 0.000742 0.000002 0.000815 0.088637 47.824370 0.275283
3 DBSCAN 2 0.124325 0.072499 0.107030 0.055665 0.073239 0.722821 0.655415 0.248660 0.650811 0.069685 49.960852 4.050736
4 Mean-shift 2 0.024200 0.008186 0.005079 0.063094 0.009400 0.902586 0.897626 0.507852 0.676993 0.545370 90.157163 0.658748
5 Optics 3 -0.043101 0.011035 0.010221 0.019678 0.013453 0.870601 0.008902 0.002478 0.250000 -0.151842 20.724668 1.551607
6 Gaussian-mixture 2 0.017591 0.000222 0.000743 0.001354 0.000960 0.867157 0.134273 0.492181 0.483025 0.053394 10.936117 6.254728
In [10]:
# First report the ground-truth label distribution, then visualise every
# clustering model's result for the untransformed features.
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/logMelSpec_ideology_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/logMelSpec_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1687, 1: 1009})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2691, 1: 5})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 1850, -1: 841, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2613, 2: 31, 1: 28, 0: 24})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2685, 1: 11})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2577, 0: 119})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [11]:
# Read the stored clustering metrics for the PCA-transformed features and
# display the table (one row per clustering model).
results_df = pd.read_csv(
    'audio-results4/logMelSpec_ideology_features_vowel.csv-pca.csv'
)
results_df
Out[11]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.052094 0.022167 0.033450 0.016995 0.022539 0.684929 0.337908 0.379932 0.451760 0.315176 1356.883915 1.234670
1 Agglomerative clustering 2 0.015824 0.006879 0.004178 0.101822 0.008027 0.903687 0.898368 0.505041 0.749461 0.702568 96.675969 0.350196
2 Birch 6 0.039131 0.039280 0.101759 0.025195 0.040390 0.515518 0.397255 0.096830 0.188656 0.237530 709.465829 1.088791
3 DBSCAN 2 0.073338 0.009688 0.010311 0.011693 0.010959 0.846424 0.833828 0.310588 0.349635 0.306665 97.146681 2.524525
4 Mean-shift 3 0.059596 0.015996 0.011013 0.042627 0.017504 0.895920 0.892804 0.347522 0.416933 0.533469 176.439269 0.521451
5 Optics 2 -0.030685 0.008187 0.006735 0.019510 0.010013 0.881770 0.010015 0.003717 0.333333 -0.311905 4.034551 2.811664
6 Gaussian-mixture 2 0.182332 0.053981 0.053033 0.056120 0.054533 0.855502 0.862018 0.605676 0.613831 0.414255 792.109150 1.058268
In [12]:
# Reduce the features to their first 3 principal components and wrap the
# resulting array in a DataFrame so the plotting helpers receive tabular input.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# Ground-truth distribution followed by every clustering model's result,
# both drawn on the PCA projection.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/logMelSpec_ideology_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/logMelSpec_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1748, 0: 948})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2691, 1: 5})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1086, 3: 731, 2: 611, 1: 232, 4: 31, 5: 5})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2480, -1: 209, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2641, 1: 28, 0: 27})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2652, 1: 43, 2: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2443, 1: 253})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [13]:
# Read the stored clustering metrics for the t-SNE-transformed features and
# display the table (one row per clustering model).
results_df = pd.read_csv(
    'audio-results4/logMelSpec_ideology_features_vowel.csv-tsne.csv'
)
results_df
Out[13]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.012537 0.022488 0.035436 0.016854 0.022843 0.644101 0.557493 6.246818e-01 0.545723 0.312379 1460.538704 1.286468
1 Agglomerative clustering 939 0.000138 0.017909 0.658079 0.031991 0.061016 0.028323 0.000742 8.797709e-07 0.000710 0.376934 592.805094 0.674272
2 Birch 1899 0.000058 0.010773 0.889333 0.039220 0.075128 0.015877 0.001113 4.047277e-06 0.001053 0.215151 708.327585 0.364762
3 DBSCAN 2 -0.016891 0.003524 0.003410 0.017410 0.005703 0.892517 0.003709 1.376842e-03 0.333333 0.102359 25.755276 0.783978
4 Mean-shift 2 0.012338 0.021804 0.034375 0.016350 0.022160 0.644038 0.557122 6.228636e-01 0.545058 0.308986 1426.035053 1.297966
5 Optics 2 -0.002489 -0.001481 0.000016 0.000028 0.000021 0.869222 0.015950 1.236704e-02 0.333071 0.005303 104.332611 0.860136
6 Gaussian-mixture 2 0.022263 0.026815 0.042041 0.020071 0.027170 0.649508 0.419139 3.639187e-01 0.449836 0.312149 1455.166706 1.284049
In [14]:
# Embed the features into 3 dimensions with t-SNE. The seed is pinned so the
# embedding (and therefore the plots) is reproducible across kernel restarts,
# matching the fixed random_state used for UMAP in this notebook.
# NOTE(review): the saved "-tsne" cluster labels were presumably computed on a
# separate t-SNE run; confirm that run used the same seed if point-for-point
# agreement between labels and this embedding matters.
tsne_transformed = TSNE(
    n_components=3, n_jobs=-1, random_state=42
).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot on the t-SNE embedding rather than the raw features: the original cell
# passed train_data here, unlike the PCA and UMAP cells, which pass their
# transformed frames.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/logMelSpec_ideology_features_vowel.csv-tsne',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/logMelSpec_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1388, 1: 1308})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2668, 1: 18, 0: 10})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2593, 1: 62, 0: 41})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 1389, 1: 1307})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1453, 0: 1243})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [15]:
# Read the stored clustering metrics for the UMAP-transformed features and
# display the table (one row per clustering model).
results_df = pd.read_csv(
    'audio-results4/logMelSpec_ideology_features_vowel.csv-umap.csv'
)
results_df
Out[15]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.011993 0.015036 0.023857 0.011363 0.015394 0.644508 0.441024 0.397055 0.462174 0.320724 1510.508518 1.250920
1 Agglomerative clustering 81 0.001109 0.030847 0.265889 0.020208 0.037561 0.109263 0.022997 0.000356 0.012270 0.324067 1061.616939 0.959172
2 Birch 32 0.003542 0.035952 0.219984 0.021416 0.039033 0.179180 0.021142 0.001038 0.020630 0.332525 1214.301318 0.895165
3 DBSCAN 41 0.001167 0.036632 0.210674 0.022946 0.041384 0.275322 0.076039 0.002016 0.019446 0.089159 243.944123 1.116230
4 Mean-shift 1 0.000000 0.000000 0.000000 1.000000 0.000000 0.903734 0.897997 0.500000 0.448999 -1.000000 -1.000000 -1.000000
5 Optics 5 -0.029855 0.010478 0.014330 0.012244 0.013205 0.831674 0.017433 0.008071 0.193939 -0.061994 95.341171 0.776439
6 Gaussian-mixture 2 0.010083 0.008554 0.013799 0.006585 0.008916 0.644632 0.441395 0.421437 0.471052 0.317561 1486.434715 1.262842
In [16]:
# Embed the features into 3 dimensions with UMAP (seeded for reproducibility)
# and wrap the resulting array in a DataFrame for the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(n_components=3, random_state=42).fit_transform(train_data)
)

# Ground-truth distribution followed by every clustering model's result,
# both drawn on the UMAP embedding.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/logMelSpec_ideology_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/logMelSpec_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1420, 0: 1276})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
predicted_labels--> Counter({-1: 2494, 1: 55, 4: 39, 0: 38, 3: 35, 2: 35})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2696})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1449, 0: 1247})
 2D representation
 3D representation
In [ ]:
 
In [ ]: