In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [2]:
# Load the vowel formant feature table along with the string labels and
# their integer-encoded counterparts.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/formants50_ideologyFive_features_vowel.csv'
)
In [3]:
# Sanity check: (number of samples, number of feature columns) of the loaded data.
train_data.shape
Out[3]:
(2728, 3)
In [ ]:
 
In [4]:
# Saved clustering scores for the untransformed features (one row per algorithm).
results_df = pd.read_csv(
    'audio-results4/formants50_ideologyFive_features_vowel.csv'
)
results_df
Out[4]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.162921 0.083682 0.114173 0.066494 0.084042 0.746098 0.243035 0.285624 0.396347 0.367415 1298.362170 1.208018
1 Agglomerative clustering 2 0.094663 0.049846 0.071538 0.038669 0.050202 0.701909 0.302419 0.326333 0.424208 0.322612 1086.645930 1.381837
2 Birch 10 0.154210 0.112260 0.235570 0.075545 0.114402 0.670706 0.652859 0.115823 0.132994 0.279821 377.914211 0.926925
3 DBSCAN 2 0.076064 0.018255 0.014309 0.032170 0.019807 0.878743 0.869135 0.329047 0.521312 0.453904 66.295195 2.345823
4 Mean-shift 3 0.037534 0.013681 0.008824 0.063861 0.015506 0.895111 0.890029 0.341789 0.455764 0.655355 138.324327 1.130572
5 Optics 3 -0.041314 0.007208 0.007396 0.012813 0.009379 0.858674 0.009531 0.003413 0.255000 -0.289585 14.144455 1.768938
6 Gaussian-mixture 2 0.439454 0.236987 0.246897 0.228568 0.237379 0.885347 0.898460 0.765689 0.739907 0.398010 849.129885 1.020409
In [5]:
# Show the true label counts for the raw features.
actualDistribution(train_data, true_labels, true_encoded_labels)

# Plot every clustering model's saved labels on the raw features; the second
# argument is the path prefix of the stored label files
# (e.g. "<prefix>_kmeans_labels.npy").
plotAllClusterModels(
    train_data,
    'audio-results4/formants50_ideologyFive_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants50_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1970, 0: 758})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 1818, 0: 910})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1716, 2: 512, 1: 383, 5: 65, 4: 26, 6: 7, 9: 7, 3: 6, 7: 5, 8: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2638, -1: 84, 1: 6})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2626, 1: 50, 2: 27, 0: 25})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2707, 1: 19, 2: 2})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2394, 1: 334})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [7]:
# Saved clustering scores for the PCA-transformed features (one row per algorithm).
results_df = pd.read_csv(
    'audio-results4/formants50_ideologyFive_features_vowel.csv-pca.csv'
)
results_df
Out[7]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.162921 0.083682 0.114173 0.066494 0.084042 0.746098 0.756965 0.714376 0.603653 0.367415 1298.362170 1.208018
1 Agglomerative clustering 2 0.094663 0.049846 0.071538 0.038669 0.050202 0.701909 0.302419 0.326333 0.424208 0.322612 1086.645930 1.381837
2 Birch 10 0.154210 0.112260 0.235570 0.075545 0.114402 0.670706 0.652859 0.115823 0.132994 0.279821 377.914211 0.926925
3 DBSCAN 2 0.076064 0.018255 0.014309 0.032170 0.019807 0.878743 0.869135 0.329047 0.521312 0.453904 66.295195 2.345823
4 Mean-shift 3 0.037534 0.013681 0.008824 0.063861 0.015506 0.895111 0.890029 0.341789 0.455764 0.655355 138.324327 1.130572
5 Optics 2 -0.035864 0.010416 0.008335 0.021781 0.012056 0.871227 0.009531 0.003565 0.333333 -0.269624 13.069045 1.890882
6 Gaussian-mixture 2 0.439454 0.236987 0.246897 0.228568 0.237379 0.885347 0.101540 0.234311 0.260093 0.398010 849.129885 1.020409
In [12]:
# Project the features onto 3 principal components and wrap the result in a
# DataFrame, which is the form passed to the plotting helpers below.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# True label counts, then each model's saved labels drawn on the PCA projection.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formants50_ideologyFive_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants50_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1970, 1: 758})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 1818, 0: 910})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1716, 2: 512, 1: 383, 5: 65, 4: 26, 6: 7, 9: 7, 3: 6, 7: 5, 8: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2638, -1: 84, 1: 6})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2661, 1: 41, 0: 26})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2707, 1: 19, 2: 2})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2394, 0: 334})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [8]:
# Saved clustering scores for the t-SNE-transformed features (one row per algorithm).
results_df = pd.read_csv(
    'audio-results4/formants50_ideologyFive_features_vowel.csv-tsne.csv'
)
results_df
Out[8]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.014388 4.549567e-02 6.906758e-02 0.034296 4.583358e-02 0.640387 0.439516 0.330842 0.434343 0.354279 1912.690310 1.136426
1 Agglomerative clustering 358 0.000592 3.695327e-02 5.347511e-01 0.031767 5.997205e-02 0.053418 0.004032 0.000013 0.002793 0.454193 1533.236588 0.693419
2 Birch 594 0.000420 3.026813e-02 5.931972e-01 0.032925 6.238728e-02 0.044466 0.001466 0.000003 0.001684 0.396891 1430.322224 0.706424
3 DBSCAN 153 0.003122 4.581603e-02 4.056078e-01 0.031539 5.852795e-02 0.172748 0.008798 0.000064 0.006494 0.190485 80.010843 1.370363
4 Mean-shift 1 0.000000 1.769098e-15 3.226002e-16 1.000000 6.452004e-16 0.897715 0.891129 0.500000 0.445565 -1.000000 -1.000000 -1.000000
5 Optics 84 0.001440 4.571749e-02 2.750131e-01 0.030932 5.560957e-02 0.414886 0.008798 0.000116 0.011765 -0.050016 40.184781 1.213255
6 Gaussian-mixture 2 0.017468 4.538564e-02 6.890751e-02 0.034213 4.572357e-02 0.641547 0.433651 0.330507 0.434224 0.353929 1910.211254 1.137232
In [11]:
# Embed the features in 3 dimensions with t-SNE. t-SNE is stochastic, so the
# seed is fixed to make the embedding reproducible across kernel restarts.
tsne_transformed = TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)

# Show the true label counts for the t-SNE embedding.
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot each model's saved labels on the t-SNE embedding. The original cell
# passed `train_data` here, which drew the t-SNE cluster labels on the raw
# features instead of the embedding they belong to.
# NOTE(review): the saved "-tsne" labels were presumably computed on a separate
# t-SNE run; confirm that run used the same settings/seed so the clusters line
# up with this embedding.
plotAllClusterModels(tsne_transformed, 'audio-results4/formants50_ideologyFive_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants50_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1382, 1: 1346})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1366, 0: 1362})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [9]:
# Saved clustering scores for the UMAP-transformed features (one row per algorithm).
results_df = pd.read_csv(
    'audio-results4/formants50_ideologyFive_features_vowel.csv-umap.csv'
)
results_df
Out[9]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.044875 0.064390 0.097216 0.048508 0.064722 0.654386 0.610704 0.701769 0.578842 0.383672 2225.008496 1.052620
1 Agglomerative clustering 111 0.001750 0.047170 0.403826 0.029939 0.055745 0.096983 0.017962 0.000182 0.008829 0.462216 2516.507760 0.727847
2 Birch 149 0.001229 0.044461 0.417877 0.029800 0.055633 0.089712 0.009531 0.000111 0.007056 0.440378 2281.101949 0.803075
3 DBSCAN 145 0.004898 0.043978 0.384154 0.030303 0.056174 0.185189 0.009164 0.000111 0.007376 0.297034 109.185261 1.283596
4 Mean-shift 3 -0.037575 0.031890 0.064547 0.021636 0.032409 0.537121 0.439150 0.189883 0.290260 0.352382 1867.449109 0.933422
5 Optics 99 -0.002192 0.042330 0.310749 0.028332 0.051929 0.275384 0.008798 0.000099 0.010000 0.149084 65.994400 1.200915
6 Gaussian-mixture 2 0.060041 0.065465 0.098242 0.049464 0.065799 0.663889 0.633431 0.704176 0.580786 0.383712 2209.417548 1.044670
In [10]:
# Reduce the features to a 3-component UMAP embedding (seeded for
# reproducibility) and wrap it in a DataFrame for the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)

# True label counts, then each model's saved labels drawn on the UMAP embedding.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formants50_ideologyFive_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants50_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1477, 1: 1251})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1412, 2: 678, 1: 638})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1553, 1: 1175})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: