In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [7]:
# Load the feature table together with the raw and encoded labels that
# load_dataset returns for this CSV.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/formants20_ideology_features_vowel.csv'
)
In [8]:
# Sanity check: show the dimensions of the loaded feature table.
train_data.shape
Out[8]:
(2696, 3)
In [9]:
# Read the saved clustering metrics for the untransformed features and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formants20_ideology_features_vowel.csv'
)
results_df
Out[9]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.136931 0.033141 0.035215 0.032243 0.033664 0.832014 0.838650 0.591053 0.581050 0.618810 2724.476350 0.331998
1 Agglomerative clustering 2 0.142513 0.035424 0.037236 0.034752 0.035952 0.835329 0.841988 0.592912 0.584824 0.622670 2709.888445 0.300310
2 Birch 2 0.136931 0.033141 0.035215 0.032243 0.033664 0.832014 0.838650 0.591053 0.581050 0.618810 2724.476350 0.331998
3 DBSCAN 2 0.147480 0.036127 0.039801 0.034970 0.037229 0.833742 0.839763 0.394449 0.390198 0.573840 1431.242022 0.652571
4 Mean-shift 2 0.137542 0.033387 0.035435 0.032512 0.033911 0.832381 0.839021 0.591260 0.581459 0.619255 2723.939296 0.328186
5 Optics 2 0.145430 0.037612 0.040044 0.037031 0.038478 0.835606 0.841988 0.395275 0.390011 0.612370 1379.192636 0.263886
6 Gaussian-mixture 2 0.142513 0.035424 0.037236 0.034752 0.035952 0.835329 0.841988 0.592912 0.584824 0.622670 2709.888445 0.300310
In [10]:
# First show the ground-truth label distribution, then the per-model cluster
# plots for the untransformed features. The second argument is the path
# prefix the helper uses to locate saved results (see the printed
# "*_kmeans_labels.npy" path in the output).
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/formants20_ideology_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants20_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2382, 1: 314})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2382, 1: 314})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2382, 1: 305, -1: 9})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({0: 2390, 1: 305, -1: 1})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2383, 1: 313})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [11]:
# Read the saved clustering metrics for the PCA-transformed features and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formants20_ideology_features_vowel.csv-pca.csv'
)
results_df
Out[11]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.136931 0.033141 0.035215 0.032243 0.033664 0.832014 0.838650 0.591053 0.581050 0.618810 2724.476350 0.331998
1 Agglomerative clustering 2 0.142513 0.035424 0.037236 0.034752 0.035952 0.835329 0.841988 0.592912 0.584824 0.622670 2709.888445 0.300310
2 Birch 2 0.136931 0.033141 0.035215 0.032243 0.033664 0.832014 0.838650 0.591053 0.581050 0.618810 2724.476350 0.331998
3 DBSCAN 2 0.147480 0.036127 0.039801 0.034970 0.037229 0.833742 0.839763 0.394449 0.390198 0.573840 1431.242022 0.652571
4 Mean-shift 2 0.137542 0.033387 0.035435 0.032512 0.033911 0.832381 0.839021 0.591260 0.581459 0.619255 2723.939296 0.328186
5 Optics 2 0.145430 0.037612 0.040044 0.037031 0.038478 0.835606 0.841988 0.395275 0.390011 0.612370 1379.192636 0.263886
6 Gaussian-mixture 2 0.142513 0.035424 0.037236 0.034752 0.035952 0.835329 0.841988 0.592912 0.584824 0.622670 2709.888445 0.300310
In [12]:
# Project train_data onto three principal components and wrap the result in
# a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formants20_ideology_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants20_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2382, 1: 314})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2382, 1: 314})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2382, 1: 305, -1: 9})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({0: 2390, 1: 305, -1: 1})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2383, 1: 313})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [13]:
# Read the saved clustering metrics for the t-SNE-transformed features and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formants20_ideology_features_vowel.csv-tsne.csv'
)
results_df
Out[13]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.142513 0.035424 0.037236 0.034752 0.035952 0.835329 0.841988 0.592912 0.584824 0.781500 8766.610009 0.198965
1 Agglomerative clustering 2 0.142513 0.035424 0.037236 0.034752 0.035952 0.835329 0.841988 0.592912 0.584824 0.781500 8766.610009 0.198965
2 Birch 355 -0.002545 0.012184 0.318227 0.018946 0.035763 0.074122 0.003709 0.000012 0.002561 0.421419 4805.913985 0.819870
3 DBSCAN 2 0.142513 0.035424 0.037236 0.034752 0.035952 0.835329 0.841988 0.592912 0.584824 0.781500 8766.610009 0.198965
4 Mean-shift 2 0.142513 0.035424 0.037236 0.034752 0.035952 0.835329 0.841988 0.592912 0.584824 0.781500 8766.610009 0.198965
5 Optics 76 0.006907 0.017453 0.132906 0.015328 0.027486 0.429193 0.003709 0.000179 0.010035 -0.033383 251.664246 1.305298
6 Gaussian-mixture 2 0.142513 0.035424 0.037236 0.034752 0.035952 0.835329 0.841988 0.592912 0.584824 0.781500 8766.610009 0.198965
In [14]:
# Embed train_data into three dimensions with t-SNE. random_state is fixed so
# the embedding (and therefore the plots) is the same on every run, in line
# with the UMAP cell, which also pins random_state=42.
# NOTE(review): the saved "-tsne" cluster labels were presumably computed on a
# separately generated embedding; confirm that its seed matches if the plots
# must line up exactly with the stored metrics.
tsne_transformed=TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
tsne_transformed=pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed,true_labels,true_encoded_labels)

# Plot the cluster models on the t-SNE embedding itself. The original passed
# train_data here, so the "-tsne" results were drawn over the untransformed
# features, unlike the PCA and UMAP cells, which pass their transformed data.
plotAllClusterModels(tsne_transformed,'audio-results4/formants20_ideology_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants20_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [9]:
# Read the saved clustering metrics for the UMAP-transformed features and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formants20_ideology_features_vowel.csv-umap.csv'
)
results_df
Out[9]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.008242 0.041304 0.064609 0.030732 0.041652 0.642465 0.452522 0.335768 0.439765 0.249237 971.999817 1.548162
1 Agglomerative clustering 2 -0.023517 0.004101 0.003227 0.011161 0.005006 0.883719 0.879451 0.491285 0.457800 0.483113 312.827278 0.425791
2 Birch 16 0.003879 0.048154 0.225895 0.028095 0.049974 0.254170 0.052300 0.005856 0.054223 0.328580 1177.586031 0.938904
3 DBSCAN 3 -0.030324 0.006861 0.006611 0.018130 0.009689 0.882122 0.007047 0.001962 0.250000 0.236802 79.744396 0.486467
4 Mean-shift 2 -0.022407 0.002177 0.002072 0.005545 0.003016 0.877168 0.873516 0.491204 0.469004 0.411536 319.466691 0.700326
5 Optics 4 0.009383 0.011823 0.013908 0.014588 0.014240 0.850294 0.012240 0.007239 0.172359 0.028590 185.346559 0.641321
6 Gaussian-mixture 2 -0.010271 0.034116 0.053078 0.025522 0.034470 0.640034 0.503338 0.646102 0.554396 0.247036 962.625306 1.534664
In [15]:
# Embed train_data into three dimensions with UMAP (seeded for repeatable
# output) and wrap the result in a DataFrame before handing it to the
# plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formants20_ideology_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants20_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2578, 6: 33, 2: 15, 4: 14, 5: 13, 7: 13, 0: 10, 1: 10, 3: 10})
 2D representation
 3D representation
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2391, 1: 305})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2391, 0: 305})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: