In [9]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [10]:
# Load the chroma features of the "ideologyFive" vowel recordings.
# load_dataset (from audio_results_util) hands back three objects: the feature
# table, and -- presumably -- the vowel labels in raw and integer-encoded form
# (verify against audio_results_util).
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/chroma_ideologyFive_features_vowel.csv'
)
In [11]:
# Sanity check: display the (rows, columns) dimensions of the loaded feature table.
train_data.shape
Out[11]:
(2728, 12)
In [14]:
# Saved evaluation table for the clustering models run on the raw features
# (one row per model). Column abbreviations are not defined in this notebook;
# presumably ARI/AMI/H/C/V/FM are the usual external clustering scores and
# silhouette/calinski/davies the internal ones -- TODO confirm with the script
# that wrote the file.
results_df = pd.read_csv(
    'audio-results4/chroma_ideologyFive_features_vowel.csv'
)
results_df
Out[14]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.001130 -0.000302 0.000085 0.000043 0.000057 0.643831 0.567449 0.506071 0.502423 0.144406 442.147902 2.376112
1 Agglomerative clustering 3 0.002279 0.000818 0.002679 0.000911 0.001359 0.560820 0.476906 0.260163 0.327047 0.126483 360.457106 2.209342
2 Birch 7 0.003702 0.001026 0.006158 0.001212 0.002026 0.385672 0.195015 0.052375 0.147179 0.173180 355.155899 1.399540
3 DBSCAN 4 0.043094 0.045123 0.076172 0.033462 0.046497 0.646045 0.531891 0.119966 0.204450 -0.115111 24.458226 3.885707
4 Mean-shift 2 0.015011 0.008295 0.004826 0.150551 0.009353 0.898040 0.891862 0.504845 0.821035 0.362924 10.623167 1.266938
5 Optics 2 -0.018801 0.004536 0.003953 0.019138 0.006553 0.885009 0.003666 0.001371 0.333333 -0.093451 17.232206 1.253964
6 Gaussian-mixture 2 0.011044 0.002213 0.003802 0.001945 0.002573 0.651603 0.405425 0.459497 0.483616 0.148431 433.021504 2.400182
In [15]:
# First show how the true labels are distributed over the raw features ...
actualDistribution(train_data, true_labels, true_encoded_labels)

# ... then draw every clustering model's result for the same data. The second
# argument is a path prefix; judging by the printed output the helper resolves
# files such as '<prefix>_kmeans_labels.npy' from it (confirm in audio_results_util).
plotAllClusterModels(
    train_data,
    'audio-results4/chroma_ideologyFive_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/chroma_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1591, 1: 1137})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 1371, 1: 895, 2: 462})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({2: 691, 0: 528, 3: 451, 1: 426, 5: 392, 4: 239, 6: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 1525, -1: 1151, 2: 29, 1: 14, 3: 9})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2696, 1: 22, 0: 10})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2724, 1: 4})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1639, 0: 1089})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [16]:
# Saved evaluation table for the clustering models run on the PCA-reduced
# features (one row per model); displayed as the cell output.
results_df = pd.read_csv(
    'audio-results4/chroma_ideologyFive_features_vowel.csv-pca.csv'
)
results_df
Out[16]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.000078 -0.000353 0.000005 0.000003 0.000004 0.638995 0.453812 0.498469 0.499398 0.256808 963.575471 1.547358
1 Agglomerative clustering 4 0.012463 0.002213 0.006763 0.001844 0.002898 0.512512 0.441349 0.171847 0.250931 0.252480 987.588709 1.125539
2 Birch 3 0.007488 0.003802 0.008866 0.002856 0.004321 0.538139 0.253299 0.213960 0.315934 0.275034 1077.177227 1.229789
3 DBSCAN 2 0.232180 0.084098 0.081013 0.089851 0.085204 0.863371 0.836877 0.313040 0.305991 0.131967 14.916960 6.424408
4 Mean-shift 2 0.025713 0.004318 0.003113 0.017695 0.005294 0.890898 0.885997 0.508943 0.579775 0.360258 84.845726 1.055357
5 Optics 14 -0.016796 0.007674 0.017256 0.013840 0.015360 0.833565 0.004399 0.000329 0.066667 -0.508346 8.667157 1.788002
6 Gaussian-mixture 2 0.015873 0.002508 0.004167 0.002195 0.002875 0.664834 0.374267 0.458270 0.482428 0.263930 915.540192 1.552229
In [17]:
# Project the features onto three principal components and wrap the resulting
# array in a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# True label distribution in PCA space, followed by every clustering model's
# result; the second argument is the path prefix of the saved "-pca" results.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/chroma_ideologyFive_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/chroma_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1523, 0: 1205})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 1257, 1: 666, 2: 469, 3: 336})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({1: 1215, 2: 841, 0: 672})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2487, -1: 234, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2541, 6: 23, 13: 15, 4: 14, 1: 14, 5: 14, 9: 14, 3: 14, 8: 13, 12: 12, 0: 12, 11: 11, 7: 11, 10: 10, 2: 10})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2698, 1: 30})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1746, 0: 982})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [18]:
# Saved evaluation table for the clustering models run on the t-SNE-reduced
# features (one row per model); displayed as the cell output.
results_df = pd.read_csv(
    'audio-results4/chroma_ideologyFive_features_vowel.csv-tsne.csv'
)
results_df
Out[18]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.002258 0.003240 0.005414 0.002689 0.003593 0.635636 0.525293 5.489206e-01 0.518990 0.353538 1750.296605 1.170702
1 Agglomerative clustering 526 0.000184 0.013379 0.419462 0.023346 0.044230 0.040524 0.003666 1.343951e-05 0.002112 0.422495 1296.962940 0.710854
2 Birch 915 0.000126 0.013444 0.573703 0.029624 0.056339 0.031211 0.000367 4.495665e-07 0.001093 0.368276 1264.124755 0.616304
3 DBSCAN 2 -0.013298 0.002561 0.002713 0.017887 0.004711 0.888966 0.005132 1.919649e-03 0.333333 0.006527 14.950167 0.848068
4 Mean-shift 4 -0.000292 0.001591 0.005597 0.001391 0.002228 0.449099 0.253299 1.139182e-01 0.251856 0.343918 2103.198358 0.914998
5 Optics 3 -0.029175 0.002175 0.003370 0.006786 0.004503 0.866340 0.008065 3.001356e-03 0.247257 -0.187414 28.741283 1.119563
6 Gaussian-mixture 2 0.001362 0.002981 0.005025 0.002495 0.003334 0.635204 0.519428 5.471076e-01 0.518281 0.353295 1749.248469 1.170892
In [19]:
# Embed the features in three dimensions with t-SNE. t-SNE is stochastic, so a
# fixed random_state is passed to make the embedding (and hence the figures
# below) reproducible when the notebook is re-run.
tsne_transformed = TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)

# True label distribution in t-SNE space.
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot every clustering model on the t-SNE embedding itself. Previously the raw
# `train_data` was passed here, so the "-tsne" cluster labels were drawn on the
# untransformed features and the embedding computed above was never plotted.
plotAllClusterModels(tsne_transformed, 'audio-results4/chroma_ideologyFive_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/chroma_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1386, 1: 1342})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2706, 0: 14, 1: 8})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2644, 2: 33, 1: 29, 0: 22})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({2: 710, 3: 708, 0: 693, 1: 617})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1368, 1: 1360})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [20]:
# Saved evaluation table for the clustering models run on the UMAP-reduced
# features (one row per model); displayed as the cell output.
results_df = pd.read_csv(
    'audio-results4/chroma_ideologyFive_features_vowel.csv-umap.csv'
)
results_df
Out[20]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.003448 0.001958 0.003479 0.001731 0.002312 0.637003 0.537023 0.539246 0.515278 0.348331 1639.180156 1.205799
1 Agglomerative clustering 84 0.000431 0.007591 0.099446 0.007875 0.014594 0.106331 0.014663 0.000407 0.012725 0.340882 1939.037983 0.904654
2 Birch 26 0.000340 0.003137 0.029787 0.003180 0.005746 0.181642 0.031891 0.002968 0.037049 0.362170 2243.283643 0.877944
3 DBSCAN 75 -0.002391 0.008160 0.089744 0.008808 0.016041 0.235477 0.034457 0.000509 0.011347 0.039222 164.098648 1.343574
4 Mean-shift 3 0.002192 0.000175 0.001428 0.000457 0.000692 0.531216 0.425587 0.261656 0.340932 0.355717 1749.229890 0.941264
5 Optics 69 -0.015321 0.007149 0.071149 0.009916 0.017406 0.505933 0.008798 0.000563 0.015675 -0.220891 35.531110 1.230087
6 Gaussian-mixture 2 0.002136 0.001282 0.002464 0.001225 0.001636 0.636020 0.471408 0.466962 0.487158 0.348068 1635.492585 1.207489
In [21]:
# Reduce the features to three UMAP components (seeded with random_state=42)
# and wrap the resulting array in a DataFrame for the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(n_components=3, random_state=42).fit_transform(train_data)
)

# True label distribution in UMAP space, followed by every clustering model's
# result; the second argument is the path prefix of the saved "-umap" results.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/chroma_ideologyFive_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/chroma_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1440, 1: 1288})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1175, 1: 844, 2: 709})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1419, 0: 1309})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]: