In [8]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [9]:
# Load the vowel feature CSV through the project helper; it hands back the
# feature table plus the labels in raw and encoded form.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/formantslpc_ideologyFive_features_vowel.csv'
)
In [10]:
# Sanity check: display the dimensions of the loaded feature table.
train_data.shape
Out[10]:
(2728, 3)
In [ ]:
 
In [11]:
# Saved per-model clustering scores for the untransformed features.
# NOTE(review): the CSV appears to have been written with its index, which is
# why the model names show up under an "Unnamed: 0" column.
results_df = pd.read_csv(
    'audio-results4/formantslpc_ideologyFive_features_vowel.csv'
)
results_df
Out[11]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.009030 0.000253 0.000889 0.000493 0.000634 0.675056 0.359971 0.518229 0.508240 0.496138 2761.916962 0.856377
1 Agglomerative clustering 2 -0.004908 -0.000223 0.000224 0.000126 0.000161 0.680827 0.348974 0.509103 0.504186 0.492485 2678.907149 0.858811
2 Birch 2 -0.009183 0.000281 0.000928 0.000514 0.000662 0.674674 0.639296 0.481360 0.491585 0.495958 2761.882870 0.857006
3 DBSCAN 2 0.104183 0.026702 0.021574 0.040490 0.028150 0.875355 0.863636 0.326990 0.522260 0.233516 20.170216 3.626100
4 Mean-shift 5 0.016379 0.024246 0.039667 0.018921 0.025621 0.663765 0.614370 0.186358 0.196762 0.335124 739.106074 0.932296
5 Optics 2 -0.044002 0.010438 0.009029 0.017401 0.011890 0.859753 0.021994 0.008227 0.327869 -0.297751 38.807978 1.141131
6 Gaussian-mixture 2 -0.006212 -0.000128 0.000356 0.000200 0.000256 0.681609 0.652126 0.488558 0.494713 0.499000 2754.090209 0.846311
In [12]:
# First show how the ground-truth labels are distributed over the raw features,
# then render every clustering model's result for the same data.
# presumably the second argument is the path prefix of the saved label files
# (the helper prints "<prefix>_kmeans_labels.npy") -- verify in audio_results_util.
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/formantslpc_ideologyFive_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formantslpc_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1877, 0: 851})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 1903, 0: 825})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1875, 1: 853})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2613, -1: 109, 1: 6})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2630, 0: 61, 1: 37})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 1786, 1: 898, 4: 31, 2: 11, 3: 2})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1910, 1: 818})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [13]:
# Saved per-model clustering scores for the PCA variant of the features.
results_df = pd.read_csv(
    'audio-results4/formantslpc_ideologyFive_features_vowel.csv-pca.csv'
)
results_df
Out[13]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.009030 0.000253 0.000889 0.000493 0.000634 0.675056 0.359971 0.518229 0.508240 0.496138 2761.916962 0.856377
1 Agglomerative clustering 2 -0.004908 -0.000223 0.000224 0.000126 0.000161 0.680827 0.348974 0.509103 0.504186 0.492485 2678.907149 0.858811
2 Birch 2 -0.009183 0.000281 0.000928 0.000514 0.000662 0.674674 0.639296 0.481360 0.491585 0.495958 2761.882870 0.857006
3 DBSCAN 2 0.104183 0.026702 0.021574 0.040490 0.028150 0.875355 0.863636 0.326990 0.522260 0.233516 20.170216 3.626100
4 Mean-shift 5 0.016379 0.024246 0.039667 0.018921 0.025621 0.663765 0.614370 0.186358 0.196762 0.335124 739.106074 0.932296
5 Optics 2 -0.038306 0.007849 0.007020 0.013851 0.009317 0.861531 0.020528 0.007679 0.321839 -0.305164 36.881424 1.148656
6 Gaussian-mixture 2 -0.006212 -0.000128 0.000356 0.000200 0.000256 0.681609 0.347874 0.511442 0.505287 0.499000 2754.090209 0.846311
In [14]:
# Project the features onto three principal components and wrap the resulting
# array in a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# Ground-truth label distribution, then every clustering model, in PCA space.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formantslpc_ideologyFive_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formantslpc_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1877, 0: 851})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 1903, 0: 825})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1875, 1: 853})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2613, -1: 109, 1: 6})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2633, 0: 58, 1: 37})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 1786, 1: 898, 4: 31, 3: 11, 2: 2})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1910, 0: 818})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [15]:
# Saved per-model clustering scores for the t-SNE variant of the features.
results_df = pd.read_csv(
    'audio-results4/formantslpc_ideologyFive_features_vowel.csv-tsne.csv'
)
results_df
Out[15]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.012252 0.001993 0.003425 0.001800 0.002360 0.653916 0.593475 0.463039 0.484477 0.453469 2808.753658 0.875823
1 Agglomerative clustering 359 0.000552 0.035847 0.525936 0.031235 0.058967 0.053231 0.004765 0.000031 0.003292 0.445990 2158.117418 0.715202
2 Birch 493 0.000420 0.031250 0.554325 0.031581 0.059758 0.047317 0.001100 0.000003 0.002028 0.399360 1965.372688 0.726528
3 DBSCAN 137 0.002066 0.048490 0.413461 0.032245 0.059824 0.155556 0.002199 0.000018 0.007246 0.194101 106.652267 1.370998
4 Mean-shift 2 -0.013393 0.002509 0.004174 0.002194 0.002876 0.653503 0.592009 0.459261 0.482890 0.453839 2805.570138 0.876913
5 Optics 85 0.004270 0.038145 0.238298 0.026846 0.048256 0.419372 0.005499 0.000072 0.011628 -0.083568 36.726517 1.431627
6 Gaussian-mixture 2 -0.012177 0.001322 0.002426 0.001301 0.001694 0.661750 0.387463 0.530699 0.513255 0.453929 2771.733359 0.858105
In [16]:
# Embed the features into three t-SNE dimensions. t-SNE is stochastic, so the
# seed is fixed (same value as the UMAP cell) to make the embedding repeatable
# across "Restart & Run All".
tsne_transformed = TSNE(
    n_components=3, n_jobs=-1, random_state=42
).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)

# Ground-truth label distribution in t-SNE space.
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot every clustering model on the t-SNE embedding. The original cell passed
# the raw `train_data` here, so the "-tsne" results were drawn on untransformed
# features; the PCA and UMAP cells both pass their transformed frame.
# NOTE(review): the saved "-tsne" labels were presumably computed on a separate
# t-SNE run, so point positions may not match that run exactly -- confirm.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/formantslpc_ideologyFive_features_vowel.csv-tsne',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formantslpc_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1740, 1: 988})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1740, 1: 988})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1798, 0: 930})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [17]:
# Saved per-model clustering scores for the UMAP variant of the features.
results_df = pd.read_csv(
    'audio-results4/formantslpc_ideologyFive_features_vowel.csv-umap.csv'
)
results_df
Out[17]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.014560 0.003170 0.005134 0.002697 0.003536 0.652703 0.589443 0.454865 0.481069 0.616908 6779.719456 0.563802
1 Agglomerative clustering 2 -0.014560 0.003170 0.005134 0.002697 0.003536 0.652703 0.410557 0.545135 0.518931 0.616908 6779.719456 0.563802
2 Birch 104 0.001520 0.049312 0.406027 0.030908 0.057443 0.103015 0.003666 0.000068 0.009907 0.448752 6620.617433 0.754083
3 DBSCAN 96 0.004205 0.051690 0.386637 0.032605 0.060139 0.164671 0.020894 0.000638 0.011260 0.285987 224.356538 1.568553
4 Mean-shift 2 -0.014265 0.003543 0.005700 0.002973 0.003908 0.650133 0.582111 0.452230 0.480147 0.613907 6729.435949 0.571220
5 Optics 97 0.004166 0.047129 0.326277 0.031189 0.056935 0.319132 0.009531 0.000109 0.010204 0.086061 46.979556 1.389718
6 Gaussian-mixture 2 -0.015030 0.003291 0.005305 0.002791 0.003658 0.653290 0.590909 0.454210 0.480744 0.616484 6759.971290 0.562864
In [18]:
# Reduce the features to three UMAP dimensions with a fixed seed, and wrap the
# resulting array in a DataFrame before handing it to the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)

# Ground-truth label distribution, then every clustering model, in UMAP space.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formantslpc_ideologyFive_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/formantslpc_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1737, 1: 991})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 1737, 0: 991})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1715, 1: 1013})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1743, 1: 985})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: