In [9]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
# Import the required libraries
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [10]:
# Load the MFCC vowel feature table; load_dataset returns three objects that
# are unpacked as the features, the labels, and the encoded labels.
(
    train_data,
    true_labels,
    true_encoded_labels,
) = load_dataset(
    'Audio_features_vowel/mfcc_ideologyFive_features_vowel.csv'
)
In [11]:
# Display the dimensions of the loaded feature data as the cell output
# (presumably (n_samples, n_features) -- confirm against load_dataset).
train_data.shape
Out[11]:
(2728, 13)
In [12]:
# Read the stored metrics table for the untransformed features and show it
# as the cell output.
results_df = pd.read_csv(
    'audio-results4/mfcc_ideologyFive_features_vowel.csv'
)
results_df
Out[12]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.016274 0.056348 0.085407 0.042417 0.056682 0.641197 0.435484 0.313802 0.427713 0.110739 350.287552 2.686954
1 Agglomerative clustering 2 0.649229 0.447850 0.466572 0.431097 0.448133 0.928191 0.939150 0.871277 0.834389 0.179656 230.709885 2.263013
2 Birch 2 -0.000642 -0.000484 0.000123 0.012938 0.000243 0.897317 0.890762 0.499794 0.445545 0.663595 15.903743 0.242808
3 DBSCAN 2 0.184688 0.064796 0.054477 0.083985 0.066086 0.877428 0.858138 0.320993 0.302807 0.177009 20.365912 3.946749
4 Mean-shift 2 -0.000642 -0.000484 0.000123 0.012938 0.000243 0.897317 0.890762 0.499794 0.445545 0.663595 15.903743 0.242808
5 Optics 2 -0.036324 0.010561 0.008461 0.021772 0.012186 0.870827 0.010997 0.004114 0.333333 -0.034120 31.917927 1.200985
6 Gaussian-mixture 2 0.337928 0.246440 0.326392 0.198344 0.246745 0.809248 0.169721 0.145474 0.318321 0.129879 176.396969 3.509053
In [13]:
# Report the ground-truth label distribution, then plot every clustering model
# for the untransformed features using the given results path prefix.
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/mfcc_ideologyFive_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/mfcc_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1391, 1: 1337})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2393, 1: 335})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2577, -1: 145, 1: 6})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2660, 1: 38, 0: 30})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2036, 0: 692})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [14]:
# Read the stored metrics table for the PCA-transformed features and show it
# as the cell output.
results_df = pd.read_csv(
    'audio-results4/mfcc_ideologyFive_features_vowel.csv-pca.csv'
)
results_df
Out[14]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.012085 0.052973 0.080294 0.039899 0.053308 0.639781 0.556818 0.680401 0.570088 0.250685 947.704145 1.528516
1 Agglomerative clustering 4 0.105769 0.165126 0.392742 0.104992 0.165690 0.558086 0.416789 0.148700 0.257501 0.240259 935.071262 1.145706
2 Birch 5 0.094052 0.163848 0.399935 0.103565 0.164525 0.535144 0.329545 0.095833 0.202785 0.274818 821.463444 0.950683
3 DBSCAN 3 0.266904 0.103731 0.112080 0.099176 0.105234 0.853376 0.820015 0.230049 0.231765 0.135008 58.416733 2.307579
4 Mean-shift 2 0.034826 0.020979 0.011776 0.183000 0.022127 0.898613 0.892962 0.511373 0.835560 0.427101 38.367833 0.816753
5 Optics 2 -0.040609 0.012008 0.009725 0.022401 0.013562 0.866914 0.016129 0.006033 0.333333 -0.017876 46.667537 0.944523
6 Gaussian-mixture 2 0.392557 0.245753 0.302921 0.207201 0.246080 0.842888 0.137830 0.167482 0.299854 0.275362 640.386893 1.581857
In [15]:
# Project the features onto their first three principal components and wrap
# the result in a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# Report the label distribution, then plot every clustering model on the
# PCA projection using the '-pca' results path prefix.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/mfcc_ideologyFive_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/mfcc_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1410, 0: 1318})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 1121, 1: 795, 3: 476, 2: 336})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 898, 4: 823, 1: 685, 2: 321, 3: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2413, -1: 296, 2: 10, 1: 9})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2650, 0: 44, 1: 34})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2719, 1: 9})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2177, 0: 551})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [16]:
# Read the stored metrics table for the t-SNE-transformed features and show it
# as the cell output.
results_df = pd.read_csv(
    'audio-results4/mfcc_ideologyFive_features_vowel.csv-tsne.csv'
)
results_df
Out[16]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.028180 9.255041e-02 1.399489e-01 0.069495 9.287167e-02 0.645826 0.584311 0.731295 0.589777 0.324522 1710.617078 1.203074
1 Agglomerative clustering 843 0.000317 3.797370e-02 7.975324e-01 0.041182 7.831905e-02 0.032847 0.002933 0.000014 0.001631 0.355804 578.516816 0.720103
2 Birch 2351 0.000039 8.073284e-03 9.793275e-01 0.043659 8.359088e-02 0.010798 0.000733 0.000002 0.000851 0.109184 754.384654 0.233542
3 DBSCAN 2 -0.030399 8.547338e-03 6.825843e-03 0.020860 1.028595e-02 0.875929 0.010630 0.003976 0.333333 0.178124 79.024040 0.633752
4 Mean-shift 1 0.000000 1.769098e-15 3.226002e-16 1.000000 6.452004e-16 0.897715 0.891129 0.500000 0.445565 -1.000000 -1.000000 -1.000000
5 Optics 2 -0.034574 6.761861e-03 6.078194e-03 0.012993 8.282011e-03 0.865040 0.012830 0.006770 0.345912 0.194352 125.648547 0.661727
6 Gaussian-mixture 2 0.024291 9.200448e-02 1.390929e-01 0.069095 9.232601e-02 0.644448 0.578812 0.729688 0.589198 0.324440 1709.560442 1.202672
In [17]:
# Embed the features into three t-SNE dimensions. t-SNE is stochastic, so the
# seed is fixed (same value the UMAP section uses) to make the figures
# reproducible across kernel restarts.
# NOTE(review): the saved '-tsne' cluster labels were presumably computed on an
# embedding produced elsewhere; confirm that it used the same seed/settings.
tsne_transformed = pd.DataFrame(
    TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
)

# Report the label distribution for the embedded data.
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the clustering models on the t-SNE embedding. The original cell passed
# the raw `train_data` here, so this section never visualised the t-SNE
# projection it had just computed (the PCA and UMAP sections both pass their
# transformed frame).
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/mfcc_ideologyFive_features_vowel.csv-tsne',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/mfcc_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1383, 0: 1345})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2673, 0: 29, 1: 26})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2642, 1: 53, 0: 33})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1400, 0: 1328})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [18]:
# Read the stored metrics table for the UMAP-transformed features and show it
# as the cell output.
results_df = pd.read_csv(
    'audio-results4/mfcc_ideologyFive_features_vowel.csv-umap.csv'
)
results_df
Out[18]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.047580 0.099927 0.150952 0.075039 0.100246 0.653910 0.389663 0.257058 0.405562 0.309735 1452.207514 1.274124
1 Agglomerative clustering 83 0.003776 0.080276 0.592263 0.046742 0.086646 0.117665 0.001833 0.000060 0.000932 0.319799 1171.259733 0.983480
2 Birch 42 0.006147 0.091886 0.553184 0.052221 0.095433 0.168780 0.022727 0.001100 0.028711 0.340894 1288.165522 0.919936
3 DBSCAN 2 -0.009840 0.001319 0.001971 0.016951 0.003531 0.891347 0.003299 0.001234 0.333333 0.424794 43.337125 0.404926
4 Mean-shift 4 0.124901 0.174721 0.379102 0.114040 0.175337 0.600055 0.446848 0.138660 0.249470 0.338498 1351.812755 0.951689
5 Optics 4 0.582486 0.383562 0.482197 0.319935 0.384654 0.910631 0.076246 0.129427 0.245939 0.119601 349.690499 0.639533
6 Gaussian-mixture 2 0.012810 0.088890 0.134191 0.066818 0.089213 0.640950 0.438050 0.276818 0.413044 0.309241 1445.973678 1.269568
In [19]:
# Embed the features into three UMAP dimensions (seeded with 42) and wrap the
# result in a DataFrame before handing it to the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(n_components=3, random_state=42).fit_transform(train_data)
)

# Report the label distribution, then plot every clustering model on the
# UMAP embedding using the '-umap' results path prefix.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/mfcc_ideologyFive_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/mfcc_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1420, 0: 1308})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2712, 0: 9, 1: 7})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2375, 1: 229, 0: 45, 2: 45, 3: 34})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 1228, 1: 906, 2: 513, 3: 81})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1450, 1: 1278})
 2D representation
 3D representation
In [ ]:
 
In [ ]: