In [5]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
# Import the libraries used throughout this notebook
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [2]:
# Load the vowel formant feature file. The helper returns three objects that
# the rest of the notebook relies on: the feature table, the ground-truth
# labels, and an encoded copy of those labels.
# NOTE(review): exact return types live in audio_results_util - confirm there.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/formants80_ideologyFive_features_vowel.csv'
)
In [3]:
# Sanity check: display the dimensions of the loaded feature table.
train_data.shape
Out[3]:
(2728, 3)
In [ ]:
 
In [4]:
# Read the previously saved clustering-evaluation table for this feature file
# (one row per clustering model, one column per metric) and display it.
# NOTE(review): presumably these metrics were computed on the untransformed
# features, since the file name carries no -pca/-tsne/-umap suffix - confirm.
results_df = pd.read_csv(
    'audio-results4/formants80_ideologyFive_features_vowel.csv'
)
results_df
Out[4]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.013152 0.008362 0.013063 0.006538 0.008715 0.643312 0.429252 0.424098 0.470212 0.304503 1285.260106 1.216036
1 Agglomerative clustering 2 -0.024739 0.003038 0.004600 0.002741 0.003435 0.694078 0.669355 0.461279 0.480680 0.267731 941.348654 1.234579
2 Birch 2 0.032301 0.009172 0.013764 0.007298 0.009538 0.673231 0.644795 0.576070 0.532329 0.321666 1264.492109 1.194497
3 DBSCAN 2 0.052515 0.010029 0.008187 0.020418 0.011688 0.880105 0.870968 0.326777 0.326182 0.576391 159.626970 1.975479
4 Mean-shift 4 0.032771 0.014075 0.010255 0.043688 0.016611 0.889788 0.881965 0.248168 0.237236 0.627774 168.014612 0.562821
5 Optics 2 -0.036253 0.008623 0.007318 0.016716 0.010179 0.867153 0.011364 0.004251 0.322917 -0.332237 7.059677 5.399184
6 Gaussian-mixture 2 0.060905 0.007348 0.008088 0.007653 0.007864 0.810767 0.814516 0.541250 0.538339 0.438907 165.057045 3.255383
In [9]:
# First report how the ground-truth labels are distributed ...
actualDistribution(
    train_data,
    true_labels,
    true_encoded_labels,
)
# ... then draw every clustering model's result over the same features.
# NOTE(review): the second argument looks like a prefix for saved label files
# (the cell output prints "<prefix>_kmeans_labels.npy") - confirm in
# audio_results_util.
plotAllClusterModels(
    train_data,
    'audio-results4/formants80_ideologyFive_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants80_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1508, 0: 1220})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2007, 1: 721})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1766, 1: 962})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2653, -1: 63, 1: 12})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2649, 1: 47, 0: 32})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2692, 1: 18, 2: 17, 3: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2405, 1: 323})
 2D representation
 3D representation
In [ ]:
 

PCA-transformed data

In [ ]:
 
In [10]:
# Read the saved clustering-evaluation table stored under the "-pca" suffix
# and display it for comparison with the other sections.
results_df = pd.read_csv(
    'audio-results4/formants80_ideologyFive_features_vowel.csv-pca.csv'
)
results_df
Out[10]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.013152 0.008362 0.013063 0.006538 0.008715 0.643312 0.429252 0.424098 0.470212 0.304503 1285.260106 1.216036
1 Agglomerative clustering 2 -0.024739 0.003038 0.004600 0.002741 0.003435 0.694078 0.669355 0.461279 0.480680 0.267731 941.348654 1.234579
2 Birch 2 0.032301 0.009172 0.013764 0.007298 0.009538 0.673231 0.644795 0.576070 0.532329 0.321666 1264.492109 1.194497
3 DBSCAN 2 0.052515 0.010029 0.008187 0.020418 0.011688 0.880105 0.870968 0.326777 0.326182 0.576391 159.626970 1.975479
4 Mean-shift 4 0.032771 0.014075 0.010255 0.043688 0.016611 0.889788 0.881965 0.248168 0.237236 0.627774 168.014612 0.562821
5 Optics 2 -0.036253 0.008623 0.007318 0.016716 0.010179 0.867153 0.011364 0.004251 0.322917 -0.332237 7.059677 5.399184
6 Gaussian-mixture 2 0.071534 0.009941 0.010210 0.010777 0.010486 0.825947 0.829545 0.543771 0.547001 0.436691 191.221230 2.773435
In [11]:
# Project the features onto three principal components and wrap the resulting
# array in a DataFrame in a single step.
# NOTE(review): train_data.shape was reported as (2728, 3), so n_components=3
# keeps every component and performs no dimensionality reduction - confirm
# this is intended.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# Ground-truth label distribution, followed by the plots of every clustering
# model drawn over the PCA projection.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formants80_ideologyFive_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants80_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1508, 0: 1220})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2007, 1: 721})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1766, 1: 962})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2653, -1: 63, 1: 12})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2649, 1: 47, 0: 32})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2692, 1: 18, 2: 17, 3: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2454, 1: 274})
 2D representation
 3D representation
In [ ]:
 

t-SNE-transformed data

In [12]:
# Read the saved clustering-evaluation table stored under the "-tsne" suffix
# and display it for comparison with the other sections.
results_df = pd.read_csv(
    'audio-results4/formants80_ideologyFive_features_vowel.csv-tsne.csv'
)
results_df
Out[12]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.005783 0.015126 0.023321 0.011579 0.015474 0.636943 0.538490 0.600660 0.539063 0.391627 2419.296869 1.008633
1 Agglomerative clustering 372 0.000307 0.020535 0.401280 0.023651 0.044669 0.049786 0.003666 0.000019 0.002846 0.449019 1564.882113 0.705538
2 Birch 529 0.000171 0.016645 0.440537 0.024809 0.046972 0.043325 0.004399 0.000048 0.002993 0.385300 1363.640872 0.730558
3 DBSCAN 149 0.000623 0.024039 0.256219 0.019603 0.036420 0.138052 0.002199 0.000036 0.006776 0.175314 113.499198 1.350308
4 Mean-shift 2 0.015557 0.014919 0.022942 0.011442 0.015269 0.642703 0.430352 0.399592 0.460793 0.388475 2380.436167 1.008865
5 Optics 90 0.002323 0.023600 0.169232 0.019301 0.034650 0.432535 0.006598 0.000081 0.010411 -0.089367 31.889907 1.474229
6 Gaussian-mixture 2 0.001088 0.017633 0.027074 0.013461 0.017982 0.635706 0.475440 0.392378 0.458154 0.391991 2409.936870 1.010189
In [13]:
# Embed the features in three dimensions with t-SNE and wrap the result in a
# DataFrame. t-SNE is stochastic, so the seed is fixed (same value as the UMAP
# cell) to make the embedding repeatable across kernel restarts.
# NOTE(review): the cluster labels plotted below were saved by an earlier run;
# confirm that run used a comparable embedding/seed.
tsne_transformed = TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the saved clusterings over the t-SNE embedding. The original call passed
# the raw train_data here, so the "t-SNE" figures were actually drawn on the
# untransformed features, unlike the PCA and UMAP sections.
plotAllClusterModels(tsne_transformed, 'audio-results4/formants80_ideologyFive_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants80_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1366, 0: 1362})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({1: 1471, 0: 1257})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1424, 1: 1304})
 2D representation
 3D representation
In [ ]:
 

UMAP-transformed data

In [14]:
# Read the saved clustering-evaluation table stored under the "-umap" suffix
# and display it for comparison with the other sections.
results_df = pd.read_csv(
    'audio-results4/formants80_ideologyFive_features_vowel.csv-umap.csv'
)
results_df
Out[14]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.006469 0.015030 0.023176 0.011507 0.015378 0.637219 0.540689 0.600416 0.538969 0.447803 3330.929884 0.859430
1 Agglomerative clustering 2 -0.002367 -0.001000 0.000081 0.000917 0.000148 0.893181 0.887097 0.499215 0.483950 0.456454 85.353954 0.368103
2 Birch 268 0.000277 0.020431 0.324837 0.020729 0.038971 0.065348 0.002933 0.000023 0.003435 0.417261 2435.813387 0.771516
3 DBSCAN 121 0.000558 0.022196 0.222718 0.017568 0.032568 0.140795 0.025293 0.000330 0.009513 0.263993 204.278456 1.259022
4 Mean-shift 4 0.010641 0.011422 0.026080 0.007940 0.012174 0.531347 0.452713 0.235625 0.274174 0.254940 1760.201080 1.122241
5 Optics 92 -0.004956 0.018328 0.154362 0.015684 0.028474 0.348650 0.006598 0.000111 0.010872 0.037909 47.129276 1.353530
6 Gaussian-mixture 2 0.012586 0.015101 0.023252 0.011569 0.015450 0.640516 0.560117 0.600972 0.539297 0.447483 3316.887129 0.856481
In [15]:
# Build a seeded three-component UMAP embedding of the features and wrap the
# resulting array in a DataFrame in a single step.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)

# Ground-truth label distribution, followed by the plots of every clustering
# model drawn over the UMAP embedding.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formants80_ideologyFive_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formants80_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1370, 1: 1358})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2715, 1: 13})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1173, 1: 869, 2: 646, 3: 40})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1437, 1: 1291})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: