In [12]:
# Imports used throughout the notebook
import os
import time
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, normalize
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import mixture
import umap

from audio_results_util import load_dataset, plotClusters, encodedLabels, plotData3d, plotData2d, plotAllClusterModels, actualDistribution
%matplotlib inline
In [2]:
train_data,true_labels,true_encoded_labels=load_dataset('Audio_features_vowel/logMelSpec_ideologyFive_features_vowel.csv')
In [3]:
train_data.shape
Out[3]:
(2728, 128)
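
load_dataset comes from the local audio_results_util module. Judging from how it is used here (it returns the 2728x128 feature matrix together with the string labels and their integer encodings), a minimal sketch of its assumed behaviour is shown below; the 'label' column name is hypothetical.

def load_dataset(path):
    # Assumed behaviour: read the feature CSV, split off the label column,
    # and return (features, string labels, integer-encoded labels).
    df = pd.read_csv(path)
    labels = df['label']                      # hypothetical column name
    features = df.drop(columns=['label'])
    encoded = LabelEncoder().fit_transform(labels)
    return features, labels, encoded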
In [ ]:
 
In [4]:
results_df=pd.read_csv('audio-results4/logMelSpec_ideologyFive_features_vowel.csv')
results_df
Out[4]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.155946 0.070294 0.094197 0.056542 0.070667 0.751501 0.237903 3.078675e-01 0.403181 0.192509 524.840464 2.004538
1 Agglomerative clustering 3 0.118085 0.051254 0.034157 0.118118 0.052990 0.895850 0.891129 3.550081e-01 0.474454 0.390113 126.568159 1.007376
2 Birch 2285 0.000190 0.012468 1.000000 0.045168 0.086432 0.022107 0.000367 1.800233e-07 0.000438 0.107881 18.054870 0.428688
3 DBSCAN 2 0.069686 0.026534 0.017133 0.078986 0.028158 0.893892 0.883798 3.305910e-01 0.298650 0.277389 26.088032 2.043369
4 Mean-shift 2 0.109926 0.038910 0.026346 0.080537 0.039703 0.890535 0.888563 5.414172e-01 0.672758 0.418717 248.667181 1.054088
5 Optics 2 -0.034993 0.010110 0.008083 0.021571 0.011759 0.872003 0.013196 4.936240e-03 0.333333 -0.015600 28.332337 1.236410
6 Gaussian-mixture 2 0.591591 0.380272 0.376741 0.384554 0.380607 0.921785 0.067815 1.813992e-01 0.172655 0.245832 272.701190 2.046779
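
The columns are consistent with sklearn's standard clustering metrics: ARI and AMI are the adjusted Rand and adjusted mutual information scores, H/C/V are homogeneity, completeness and V-measure, FM is the Fowlkes-Mallows score, and A/R/P appear to be accuracy, recall and precision against the true labels; silhouette, calinski and davies are the internal indices. A hedged sketch of how one such row could be recomputed (the exact model settings behind the CSV are not shown in this notebook):

from sklearn import metrics
from sklearn.cluster import KMeans

# Refit an illustrative K-Means model and score it against the true labels.
pred = KMeans(n_clusters=2, random_state=0).fit_predict(train_data)
print(metrics.adjusted_rand_score(true_encoded_labels, pred),
      metrics.adjusted_mutual_info_score(true_encoded_labels, pred),
      metrics.homogeneity_score(true_encoded_labels, pred),
      metrics.completeness_score(true_encoded_labels, pred),
      metrics.v_measure_score(true_encoded_labels, pred),
      metrics.fowlkes_mallows_score(true_encoded_labels, pred))
# Internal indices use only the data and the predicted labels.
print(metrics.silhouette_score(train_data, pred),
      metrics.calinski_harabasz_score(train_data, pred),
      metrics.davies_bouldin_score(train_data, pred))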
In [5]:
actualDistribution(train_data,true_labels,true_encoded_labels)
plotAllClusterModels(train_data,'audio-results4/logMelSpec_ideologyFive_features_vowel.csv')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/logMelSpec_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 2018, 0: 710})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2678, 1: 42, 2: 8})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 2691, -1: 35, 1: 2})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2663, 0: 36, 1: 29})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2663, 1: 65})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2440, 0: 288})
 2D representation
 3D representation
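
plotAllClusterModels also comes from audio_results_util. From its printed output it appears to load the label array saved for each clustering model, print the label distribution, and draw 2D/3D scatter plots of the data coloured by those labels. A minimal sketch of that assumed loading step, based on the .npy path printed above:

# Assumed behaviour, inferred from the printed file path.
kmeans_labels = np.load('audio-results4/logMelSpec_ideologyFive_features_vowel.csv_kmeans_labels.npy')
print('predicted_labels-->', Counter(kmeans_labels))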
In [ ]:
 

PCA transformed data

In [ ]:
 
In [6]:
results_df=pd.read_csv('audio-results4/logMelSpec_ideologyFive_features_vowel.csv-pca.csv')
results_df
Out[6]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.158103 0.072373 0.097087 0.058161 0.072744 0.751670 0.237537 0.304706 0.401856 0.337605 1170.595602 1.287640
1 Agglomerative clustering 2 0.101732 0.043696 0.027623 0.115140 0.044557 0.895645 0.107405 0.463709 0.278130 0.592831 426.190299 0.616241
2 Birch 7 0.113558 0.126711 0.303958 0.081027 0.127947 0.579094 0.435117 0.077354 0.141430 0.238779 615.983487 1.096640
3 DBSCAN 2 0.130555 0.032360 0.028098 0.042104 0.033704 0.868079 0.853372 0.322166 0.426205 0.426844 157.231922 1.873393
4 Mean-shift 2 0.092320 0.042297 0.026006 0.127456 0.043199 0.896887 0.893328 0.532268 0.740404 0.617092 406.287885 0.556429
5 Optics 2 -0.037636 0.011003 0.008840 0.021955 0.012604 0.869651 0.014296 0.005348 0.333333 -0.219786 14.555914 1.578511
6 Gaussian-mixture 2 0.360665 0.172589 0.159029 0.189836 0.173072 0.888719 0.897361 0.685269 0.733745 0.452082 840.315839 1.023097
In [7]:
pca_transformed=PCA(n_components=3).fit_transform(train_data)
pca_transformed=pd.DataFrame(pca_transformed)
actualDistribution(pca_transformed,true_labels,true_encoded_labels)
plotAllClusterModels(pca_transformed,'audio-results4/logMelSpec_ideologyFive_features_vowel.csv-pca')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/logMelSpec_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 2015, 0: 713})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 2684, 0: 44})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1206, 1: 870, 2: 425, 4: 160, 6: 48, 5: 14, 3: 5})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2573, -1: 147, 1: 8})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2657, 0: 39, 1: 32})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2692, 1: 36})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2499, 1: 229})
 2D representation
 3D representation
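
The PCA cell above keeps only the first three components so that the clusters can be drawn in 2D/3D. A quick hedged check of how much variance those three components retain (not part of the original run):

pca = PCA(n_components=3).fit(train_data)
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum())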
In [ ]:
 

t-SNE transformed data

In [8]:
results_df=pd.read_csv('audio-results4/logMelSpec_ideologyFive_features_vowel.csv-tsne.csv')
results_df
Out[8]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.075675 1.102741e-01 1.656396e-01 0.083005 1.105910e-01 0.668096 0.644062 7.574310e-01 0.601192 0.295375 1370.411702 1.313963
1 Agglomerative clustering 1051 0.000234 3.273396e-02 8.281813e-01 0.041383 7.882779e-02 0.028040 0.001833 1.320587e-05 0.001189 0.372316 653.223466 0.647908
2 Birch 2068 0.000073 1.413727e-02 9.610508e-01 0.043705 8.360735e-02 0.014737 0.000733 3.978272e-07 0.000484 0.182438 799.780071 0.320199
3 DBSCAN 2 -0.010999 1.804317e-03 2.217721e-03 0.017503 3.936659e-03 0.890555 0.004765 1.782531e-03 0.333333 0.264296 31.927923 1.008056
4 Mean-shift 1 0.000000 1.769098e-15 3.226002e-16 1.000000 6.452004e-16 0.897715 0.891129 5.000000e-01 0.445565 -1.000000 -1.000000 -1.000000
5 Optics 4 -0.041103 1.254510e-02 1.305056e-02 0.018197 1.519991e-02 0.851200 0.014663 3.290827e-03 0.200000 -0.135417 69.014862 0.902364
6 Gaussian-mixture 2 0.255538 1.847795e-01 2.533069e-01 0.145829 1.850975e-01 0.772850 0.791422 8.179457e-01 0.651314 0.278723 1180.286541 1.207213
In [9]:
tsne_transformed=TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed=pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed,true_labels,true_encoded_labels)

plotAllClusterModels(tsne_transformed,'audio-results4/logMelSpec_ideologyFive_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/logMelSpec_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1518, 1: 1210})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2710, 0: 13, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2604, 0: 40, 3: 35, 1: 26, 2: 23})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1950, 1: 778})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [10]:
results_df=pd.read_csv('audio-results4/logMelSpec_ideologyFive_features_vowel.csv-umap.csv')
results_df
Out[10]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.168164 0.116982 0.165646 0.090818 0.117316 0.731584 0.741569 0.761895 0.616553 0.315466 993.040042 1.487733
1 Agglomerative clustering 2 -0.021299 0.006643 0.004576 0.021933 0.007572 0.883088 0.877566 0.492390 0.444816 0.669634 563.762820 0.251018
2 Birch 235 0.001351 0.056953 0.595469 0.038931 0.073084 0.075976 0.001466 0.000007 0.000516 0.341035 1222.320404 0.905804
3 DBSCAN 2 -0.029903 0.008388 0.006700 0.020909 0.010149 0.876331 0.012463 0.004662 0.333333 0.345880 301.439971 0.414710
4 Mean-shift 2 -0.021299 0.006643 0.004576 0.021933 0.007572 0.883088 0.877566 0.492390 0.444816 0.669634 563.762820 0.251018
5 Optics 6 0.363338 0.251871 0.379069 0.190463 0.253536 0.846124 0.013930 0.002233 0.142857 -0.048695 270.355413 0.779449
6 Gaussian-mixture 2 0.201264 0.149247 0.209907 0.116176 0.149570 0.745405 0.759531 0.792663 0.631949 0.295388 910.952502 1.433939
In [11]:
umap_transformed= umap.UMAP(random_state=42,n_components=3).fit_transform(train_data)
umap_transformed=pd.DataFrame(umap_transformed)
actualDistribution(umap_transformed,true_labels,true_encoded_labels)
plotAllClusterModels(umap_transformed,'audio-results4/logMelSpec_ideologyFive_features_vowel.csv-umap')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/logMelSpec_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1852, 1: 876})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2691, 1: 37})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2674, 0: 34, 1: 20})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2289, 4: 213, 2: 55, 3: 52, 1: 44, 0: 38, 5: 37})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2691, 1: 37})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1873, 1: 855})
 2D representation
 3D representation