In [11]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [2]:
# Load the vowel formant feature table together with its labels.
# load_dataset comes from audio_results_util; it returns the feature data,
# the original labels, and (judging by the names) an integer-encoded copy.
(
    train_data,
    true_labels,
    true_encoded_labels,
) = load_dataset(
    'Audio_features_vowel/formants80_ideology_features_vowel.csv'
)
In [ ]:
 
In [3]:
# Saved clustering metrics for the untransformed features (one row per model).
results_df = pd.read_csv(
    'audio-results4/formants80_ideology_features_vowel.csv'
)
# Bare last expression so the notebook renders the table.
results_df
Out[3]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.088407 0.050350 0.074850 0.038348 0.050713 0.700612 0.309347 0.320515 0.427055 0.334631 1285.257300 1.210937
1 Agglomerative clustering 2 0.170175 0.052926 0.061747 0.047024 0.053389 0.814158 0.823813 0.635977 0.594705 0.394625 1016.572969 1.126116
2 Birch 8 0.066294 0.049318 0.117557 0.032454 0.050865 0.597731 0.539318 0.077893 0.120818 0.253598 485.674459 0.924959
3 DBSCAN 4 0.084714 0.021019 0.028657 0.019182 0.022981 0.787458 0.763353 0.171302 0.249341 0.185657 35.403920 2.800571
4 Mean-shift 2 0.008781 0.001347 0.001329 0.027690 0.002537 0.902630 0.897255 0.502810 0.615923 0.746532 147.529019 0.425298
5 Optics 2 -0.034799 0.009558 0.007851 0.020162 0.011301 0.878211 0.009644 0.003580 0.333333 -0.182472 23.773244 1.116218
6 Gaussian-mixture 2 0.154321 0.056102 0.072793 0.046188 0.056516 0.777668 0.212908 0.337731 0.411640 0.369538 683.413095 1.649992
In [4]:
# First show how the ground-truth labels are distributed over the raw features.
actualDistribution(
    train_data,
    true_labels,
    true_encoded_labels,
)
# Then plot every clustering model's result for the raw features.
# The second argument is the path prefix of the stored results; the printed
# output suggests per-model label files (e.g. "<prefix>_kmeans_labels.npy")
# are read from it -- confirm in audio_results_util.
plotAllClusterModels(
    train_data,
    'audio-results4/formants80_ideology_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants80_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1771, 0: 925})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2276, 1: 420})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1529, 2: 588, 1: 347, 6: 168, 3: 53, 5: 8, 7: 2, 4: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2251, -1: 421, 2: 10, 3: 8, 1: 6})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2632, 1: 38, 0: 26})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2690, 1: 6})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2119, 0: 577})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [5]:
# Saved clustering metrics for the PCA-transformed features (one row per model).
results_df = pd.read_csv(
    'audio-results4/formants80_ideology_features_vowel.csv-pca.csv'
)
# Bare last expression so the notebook renders the table.
results_df
Out[5]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.088407 0.050350 0.074850 0.038348 0.050713 0.700612 0.309347 0.320515 0.427055 0.334631 1285.257300 1.210937
1 Agglomerative clustering 2 0.170175 0.052926 0.061747 0.047024 0.053389 0.814158 0.823813 0.635977 0.594705 0.394625 1016.572969 1.126116
2 Birch 8 0.066294 0.049318 0.117557 0.032454 0.050865 0.597731 0.539318 0.077893 0.120818 0.253598 485.674459 0.924959
3 DBSCAN 4 0.084714 0.021019 0.028657 0.019182 0.022981 0.787458 0.763353 0.171302 0.249341 0.185657 35.403920 2.800571
4 Mean-shift 2 0.008781 0.001347 0.001329 0.027690 0.002537 0.902630 0.897255 0.502810 0.615923 0.746532 147.529019 0.425298
5 Optics 2 -0.034799 0.009558 0.007851 0.020162 0.011301 0.878211 0.009644 0.003580 0.333333 -0.182472 23.773244 1.116218
6 Gaussian-mixture 2 0.150731 0.059942 0.080160 0.048380 0.060341 0.765748 0.774481 0.674587 0.588815 0.367170 829.498755 1.488280
In [6]:
# Reduce the features to their first three principal components and wrap the
# resulting array in a DataFrame for the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# Ground-truth label distribution drawn over the PCA projection.
actualDistribution(
    pca_transformed,
    true_labels,
    true_encoded_labels,
)
# Each clustering model's result drawn over the same PCA projection.
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formants80_ideology_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants80_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1771, 0: 925})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2276, 1: 420})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 1529, 2: 588, 1: 347, 6: 168, 3: 53, 5: 8, 7: 2, 4: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2251, -1: 421, 2: 10, 3: 8, 1: 6})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2632, 1: 38, 0: 26})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2690, 1: 6})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2061, 1: 635})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [7]:
# Saved clustering metrics for the t-SNE-transformed features (one row per model).
results_df = pd.read_csv(
    'audio-results4/formants80_ideology_features_vowel.csv-tsne.csv'
)
# Bare last expression so the notebook renders the table.
results_df
Out[7]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.065237 0.061607 0.094649 0.046047 0.061953 0.673605 0.354970 0.297568 0.423402 0.317042 1515.514755 1.234337
1 Agglomerative clustering 350 0.000322 0.025433 0.449115 0.025613 0.048462 0.051861 0.005193 0.000026 0.003175 0.445011 1372.903659 0.716233
2 Birch 570 0.000211 0.017687 0.483879 0.025870 0.049114 0.043423 0.001113 0.000002 0.001754 0.377190 1216.392759 0.741782
3 DBSCAN 111 0.001379 0.030092 0.276405 0.021536 0.039959 0.161053 0.031157 0.001288 0.012228 0.179385 133.441044 1.458129
4 Mean-shift 2 0.049554 0.059277 0.091866 0.044132 0.059621 0.662651 0.620920 0.698677 0.573874 0.309239 1434.888043 1.284348
5 Optics 88 -0.007491 0.026977 0.198212 0.020693 0.037474 0.395307 0.005193 0.000174 0.012289 -0.048138 34.301360 1.398593
6 Gaussian-mixture 2 0.071423 0.039484 0.059288 0.030009 0.039849 0.690448 0.326409 0.339685 0.435926 0.308958 1438.131089 1.211598
In [8]:
# Embed the features into three t-SNE dimensions and wrap the array in a
# DataFrame for the plotting helpers.
# t-SNE is stochastic, so the seed is fixed (same value the UMAP cell uses)
# to keep the embedding repeatable across kernel restarts.
# NOTE(review): the stored "-tsne" cluster labels were produced from an
# embedding not visible in this notebook -- confirm it used the same settings.
tsne_transformed = pd.DataFrame(
    TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
)

# Ground-truth label distribution drawn over the t-SNE embedding.
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the t-SNE clustering results over the t-SNE embedding itself.
# The original passed the raw `train_data` here, unlike the PCA and UMAP
# cells, so the "-tsne" labels were drawn over the untransformed features.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/formants80_ideology_features_vowel.csv-tsne',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants80_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1588, 0: 1108})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1511, 1: 1185})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1737, 0: 959})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [9]:
# Saved clustering metrics for the UMAP-transformed features (one row per model).
results_df = pd.read_csv(
    'audio-results4/formants80_ideology_features_vowel.csv-umap.csv'
)
# Bare last expression so the notebook renders the table.
results_df
Out[9]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.086855 0.075809 0.115318 0.056846 0.076153 0.686609 0.669881 0.722716 0.585824 0.403346 2306.925104 0.998638
1 Agglomerative clustering 112 0.000715 0.034007 0.324638 0.023002 0.042960 0.092656 0.008531 0.000085 0.005704 0.443503 2141.418808 0.774429
2 Birch 113 0.000996 0.033764 0.320475 0.022986 0.042896 0.099011 0.013724 0.000135 0.008850 0.427438 1933.416493 0.809188
3 DBSCAN 89 0.000330 0.033080 0.275399 0.022287 0.041237 0.155715 0.012611 0.000156 0.011111 0.311815 236.948586 1.490555
4 Mean-shift 2 0.072424 0.073585 0.112988 0.054936 0.073927 0.676095 0.650593 0.720035 0.583191 0.395568 2251.912583 1.027831
5 Optics 103 -0.000019 0.027976 0.234679 0.021068 0.038665 0.312149 0.005193 0.000118 0.011218 0.115778 56.822564 1.263737
6 Gaussian-mixture 2 0.093187 0.070748 0.106702 0.053309 0.071097 0.693702 0.318620 0.285387 0.415752 0.403454 2293.528203 0.986037
In [10]:
# Embed the features into three UMAP dimensions (seeded with 42) and wrap the
# resulting array in a DataFrame for the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(n_components=3, random_state=42).fit_transform(train_data)
)

# Ground-truth label distribution drawn over the UMAP embedding.
actualDistribution(
    umap_transformed,
    true_labels,
    true_encoded_labels,
)
# Each clustering model's result drawn over the same UMAP embedding.
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formants80_ideology_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants80_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1647, 1: 1049})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1585, 1: 1111})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1696, 0: 1000})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: