In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [5]:
train_data,true_labels,true_encoded_labels=load_dataset('Audio_features_vowel/chroma_ideology_features_vowel.csv')
In [6]:
train_data.shape
Out[6]:
(2696, 12)
In [7]:
results_df=pd.read_csv('audio-results4/chroma_ideology_features_vowel.csv')
results_df
Out[7]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.000142 -0.000176 0.000290 0.000138 0.000187 0.639051 0.510386 0.511424 0.504187 0.155323 504.527175 2.214680
1 Agglomerative clustering 6 0.005948 -0.000137 0.002417 0.000479 0.000800 0.426397 0.348665 0.088886 0.168170 0.139662 376.863499 1.673787
2 Birch 6 0.000864 -0.000151 0.002345 0.000436 0.000735 0.375909 0.159496 0.052165 0.161393 0.179767 473.940438 1.463033
3 DBSCAN 2 0.156648 0.045228 0.053277 0.040774 0.046194 0.815158 0.786721 0.292028 0.308061 0.014644 14.996554 5.651054
4 Mean-shift 1 0.000000 0.000000 0.000000 1.000000 0.000000 0.903734 0.897997 0.500000 0.448999 -1.000000 -1.000000 -1.000000
5 Optics 2 -0.024323 0.006050 0.005130 0.018553 0.008037 0.886934 0.007789 0.002891 0.333333 -0.129722 14.610190 1.540118
6 Gaussian-mixture 2 -0.000271 0.000163 0.000817 0.000388 0.000527 0.638882 0.501113 0.519154 0.507019 0.154512 501.821210 2.219037
In [8]:
actualDistribution(train_data,true_labels,true_encoded_labels)
plotAllClusterModels(train_data,'audio-results4/chroma_ideology_features_vowel.csv')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/chroma_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1369, 1: 1327})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 982, 1: 461, 3: 388, 2: 348, 4: 289, 5: 228})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({5: 588, 1: 509, 0: 438, 4: 437, 3: 410, 2: 314})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2295, -1: 396, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2654, 1: 21, 0: 21})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2696})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1368, 0: 1328})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [9]:
results_df=pd.read_csv('audio-results4/chroma_ideology_features_vowel.csv-pca.csv')
results_df
Out[9]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.000358 -0.000132 0.000359 0.000171 0.000231 0.639299 0.484421 0.487297 0.495342 0.253513 953.338119 1.548498
1 Agglomerative clustering 4 0.002452 0.001188 0.004750 0.001146 0.001846 0.462908 0.205119 0.111901 0.247088 0.239854 916.806973 1.173909
2 Birch 4 0.000652 -0.000515 0.000362 0.000087 0.000140 0.456651 0.305638 0.139079 0.253078 0.278242 1101.011947 1.059872
3 DBSCAN 2 0.143651 0.035098 0.033844 0.039255 0.036349 0.859940 0.843472 0.316317 0.447240 0.180720 23.066877 4.676062
4 Mean-shift 1 0.000000 0.000000 0.000000 1.000000 0.000000 0.903734 0.897997 0.500000 0.448999 -1.000000 -1.000000 -1.000000
5 Optics 12 -0.034692 -0.000180 0.007691 0.006947 0.007300 0.846041 0.005564 0.000725 0.075291 -0.502093 6.382254 1.745241
6 Gaussian-mixture 2 0.001155 0.000097 0.000714 0.000340 0.000461 0.640280 0.472181 0.482093 0.493420 0.253548 952.396265 1.546556
In [10]:
pca_transformed=PCA(n_components=3).fit_transform(train_data)
pca_transformed=pd.DataFrame(pca_transformed)
actualDistribution(pca_transformed,true_labels,true_encoded_labels)
plotAllClusterModels(pca_transformed,'audio-results4/chroma_ideology_features_vowel.csv-pca')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/chroma_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1385, 0: 1311})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({3: 910, 2: 637, 1: 594, 0: 555})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 837, 3: 646, 1: 621, 2: 592})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2487, -1: 202, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2539, 1: 22, 7: 17, 2: 15, 0: 15, 6: 14, 4: 13, 10: 11, 8: 10, 11: 10, 9: 10, 5: 10, 3: 10})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2696})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1420, 0: 1276})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [11]:
results_df=pd.read_csv('audio-results4/chroma_ideology_features_vowel.csv-tsne.csv')
results_df
Out[11]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.000215 0.000084 0.000694 0.000330 0.000447 0.639027 0.510015 0.517664 0.506472 0.328508 1471.347474 1.264673
1 Agglomerative clustering 492 0.000145 0.010386 0.389489 0.020995 0.039843 0.042177 0.002226 0.000005 0.001742 0.434271 1183.442345 0.708632
2 Birch 868 0.000152 0.011475 0.550623 0.027491 0.052368 0.033349 0.001484 0.000006 0.001382 0.384014 1153.879463 0.622018
3 DBSCAN 171 -0.011097 0.008321 0.162108 0.013213 0.024435 0.268020 0.013724 0.000145 0.005783 0.030732 39.756221 1.217995
4 Mean-shift 2 0.003027 0.000138 0.000777 0.000374 0.000505 0.645943 0.560831 0.518560 0.506928 0.297481 1215.800397 1.376404
5 Optics 2 -0.003004 0.003232 0.003948 0.005544 0.004612 0.856244 0.036721 0.013631 0.292035 -0.112749 96.652005 1.139493
6 Gaussian-mixture 2 0.001783 -0.000307 0.000101 0.000050 0.000067 0.657451 0.402819 0.493434 0.497450 0.295419 1184.536532 1.346589
In [12]:
tsne_transformed=TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed=pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed,true_labels,true_encoded_labels)

plotAllClusterModels(train_data,'audio-results4/chroma_ideology_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/chroma_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1360, 1: 1336})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
predicted_labels--> Counter({-1: 2553, 0: 113, 1: 30})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 1531, 1: 1165})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1669, 0: 1027})
 2D representation
 3D representation
In [ ]:
 

Umap transformed data

In [13]:
results_df=pd.read_csv('audio-results4/chroma_ideology_features_vowel.csv-umap.csv')
results_df
Out[13]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.000539 -0.000107 0.000397 0.000189 0.000256 0.639568 0.480341 0.486637 0.495097 0.355309 1654.374786 1.174635
1 Agglomerative clustering 3 0.000694 -0.000491 0.000095 0.000030 0.000045 0.540603 0.395401 0.210163 0.335295 0.370240 1969.869039 0.894925
2 Birch 23 0.000040 0.000705 0.016266 0.001739 0.003142 0.197105 0.042656 0.003747 0.040669 0.359962 2117.346914 0.844302
3 DBSCAN 59 -0.003787 0.003110 0.054640 0.005221 0.009531 0.222023 0.041914 0.001154 0.016673 0.087576 217.909835 1.266741
4 Mean-shift 3 -0.000663 -0.000489 0.000076 0.000023 0.000035 0.526090 0.297107 0.229548 0.335068 0.374204 2046.782239 0.878006
5 Optics 69 -0.013573 0.006666 0.075213 0.009060 0.016172 0.447196 0.006677 0.000244 0.014391 -0.129214 53.031266 1.157567
6 Gaussian-mixture 2 -0.000153 -0.000235 0.000200 0.000095 0.000129 0.638869 0.501113 0.509484 0.503475 0.341254 1559.961677 1.223740
In [14]:
umap_transformed= umap.UMAP(random_state=42,n_components=3).fit_transform(train_data)
umap_transformed=pd.DataFrame(umap_transformed)
actualDistribution(umap_transformed,true_labels,true_encoded_labels)
plotAllClusterModels(umap_transformed,'audio-results4/chroma_ideology_features_vowel.csv-umap')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/chroma_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1398, 0: 1298})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 1118, 2: 1017, 1: 561})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({1: 1063, 2: 867, 0: 766})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1356, 0: 1340})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: