In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [3]:
# Load the vowel zero-crossing feature file; the helper returns the data
# together with the labels and their encoded counterparts.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/zeroCrossings_ideologyFive_features_vowel.csv'
)
In [4]:
# Sanity check: display the dimensions of the loaded data.
train_data.shape
Out[4]:
(2728, 530)
In [ ]:
 
In [5]:
# Read the stored score table for the untransformed features
# (one row per clustering algorithm, one column per metric) and display it.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv'
)
results_df
Out[5]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.026934 0.048032 0.071295 0.036609 0.048377 0.638868 0.521261 0.339763 0.434851 0.378865 1028.065327 1.125223
1 Agglomerative clustering 2 -0.000642 -0.000484 0.000123 0.012938 0.000243 0.897317 0.890762 0.499794 0.445545 0.954799 651.097631 0.031278
2 Birch 2 -0.000642 -0.000484 0.000123 0.012938 0.000243 0.897317 0.890762 0.499794 0.445545 0.954799 651.097631 0.031278
3 DBSCAN 2 -0.005447 -0.000856 0.000629 0.002805 0.001028 0.884952 0.879032 0.329794 0.363582 0.643148 157.814694 2.025316
4 Mean-shift 6 -0.003176 -0.002336 0.000614 0.012949 0.001173 0.895722 0.889296 0.166324 0.148488 0.820177 215.972385 0.089023
5 Optics 2 0.174168 0.113629 0.085756 0.174290 0.114952 0.891485 0.030059 0.052623 0.600903 -0.098252 50.140990 0.861420
6 Gaussian-mixture 2 0.009444 -0.000412 0.000242 0.000529 0.000332 0.866717 0.864003 0.503992 0.510967 0.612404 392.451525 1.206402
In [9]:
# First show how the true labels are distributed over the untransformed data ...
actualDistribution(train_data, true_labels, true_encoded_labels)

# ... then draw every clustering model's result for the same data.
# NOTE(review): the helper presumably loads saved label files using this path
# as a prefix (it prints a `*_kmeans_labels.npy` name) -- confirm in audio_results_util.
plotAllClusterModels(
    train_data,
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/zeroCrossings_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1655, 1: 1073})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2691, -1: 32, 1: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2637, 1: 47, 0: 44})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2723, 1: 1, 5: 1, 3: 1, 4: 1, 2: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2628, 1: 100})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [6]:
# Read the stored score table for the PCA-transformed features and display it.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-pca.csv'
)
results_df
Out[6]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.027619 0.047550 0.070528 0.036259 0.047895 0.639119 0.523094 0.340791 0.435162 0.434889 1375.838494 0.936426
1 Agglomerative clustering 2 -0.000642 -0.000484 0.000123 0.012938 0.000243 0.897317 0.890762 0.499794 0.445545 0.959757 823.662711 0.028059
2 Birch 2 -0.000642 -0.000484 0.000123 0.012938 0.000243 0.897317 0.890762 0.499794 0.445545 0.959757 823.662711 0.028059
3 DBSCAN 2 -0.017480 -0.000221 0.000956 0.001403 0.001138 0.846783 0.840909 0.315533 0.313150 0.447148 323.119574 1.285344
4 Mean-shift 3 -0.002546 -0.000742 0.000491 0.014265 0.000950 0.896121 0.889663 0.332785 0.296990 0.835570 503.246762 0.340622
5 Optics 3 0.150267 0.097336 0.090278 0.109540 0.098981 0.869539 0.017595 0.010848 0.273753 -0.104876 55.899549 0.840575
6 Gaussian-mixture 2 -0.010291 -0.000285 0.000263 0.000288 0.000275 0.814731 0.814150 0.493754 0.492972 0.520866 733.888701 1.041487
In [10]:
# Reduce the features to three principal components and wrap the result in a
# DataFrame, which is the form handed to the plotting helpers below.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# True label distribution on the PCA projection, followed by every
# clustering model's result for the same projection.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1660, 1: 1068})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2578, -1: 130, 1: 20})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2570, 1: 67, 2: 50, 0: 41})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2724, 2: 3, 1: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2468, 1: 260})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [7]:
# Read the stored score table for the t-SNE-transformed features and display it.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-tsne.csv'
)
results_df
Out[7]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.033649 7.483463e-02 1.132411e-01 0.056248 7.516223e-02 0.648099 0.407625 0.286348 0.417039 0.308246 1204.575611 1.448123
1 Agglomerative clustering 338 0.000638 4.518202e-02 5.932241e-01 0.035548 6.707700e-02 0.054838 0.005499 0.000018 0.002959 0.434740 4133.412487 0.705597
2 Birch 580 0.000353 3.559516e-02 6.372685e-01 0.035418 6.710623e-02 0.043463 0.001833 0.000004 0.001232 0.383644 3843.072813 0.704161
3 DBSCAN 141 0.005557 5.842079e-02 4.666477e-01 0.038095 7.044044e-02 0.197416 0.008431 0.000087 0.007313 0.169170 88.863604 1.218553
4 Mean-shift 1 0.000000 1.769098e-15 3.226002e-16 1.000000 6.452004e-16 0.897715 0.891129 0.500000 0.445565 -1.000000 -1.000000 -1.000000
5 Optics 87 0.007562 5.436222e-02 3.268927e-01 0.035613 6.422828e-02 0.402573 0.008431 0.000108 0.011364 -0.060594 40.634738 1.276768
6 Gaussian-mixture 2 0.033399 7.470108e-02 1.130418e-01 0.056148 7.502872e-02 0.647990 0.592009 0.713447 0.582878 0.308057 1204.103375 1.448260
In [11]:
# Embed the features into three t-SNE dimensions and wrap the result in a
# DataFrame. t-SNE is stochastic, so a fixed seed is supplied to make the
# embedding reproducible on Restart & Run All; this matches the fixed seed
# used for the UMAP projection elsewhere in this notebook.
# NOTE(review): the saved cluster labels were presumably computed on a separate
# t-SNE run -- confirm which embedding they correspond to if exact point
# positions matter.
tsne_transformed = pd.DataFrame(
    TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
)

# True label distribution on the t-SNE embedding.
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot every clustering model's result on the t-SNE embedding itself.
# The previous version passed `train_data` here, so the "t-SNE" figures were
# drawn from the untransformed features instead of `tsne_transformed`
# (the PCA and UMAP sections both pass their transformed frame).
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-tsne',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1397, 0: 1331})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1396, 1: 1332})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [8]:
# Read the stored score table for the UMAP-transformed features and display it.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-umap.csv'
)
results_df
Out[8]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.010320 0.000315 0.000972 0.000546 0.000699 0.685873 0.662023 0.519235 0.508846 0.486012 2194.604159 0.823348
1 Agglomerative clustering 8 0.072187 0.120391 0.345307 0.073775 0.121576 0.489081 0.253299 0.038856 0.117025 0.631658 3384.161704 0.468086
2 Birch 46 0.007059 0.074318 0.464746 0.042703 0.078219 0.161521 0.022727 0.000683 0.021758 0.514259 17835.943980 0.615782
3 DBSCAN 3 0.052039 0.089596 0.138411 0.066938 0.090236 0.690497 0.332111 0.318316 0.353015 0.514445 1728.680714 0.590009
4 Mean-shift 5 0.075392 0.134943 0.331647 0.085305 0.135704 0.532563 0.263196 0.064391 0.193521 0.578058 3109.287003 0.636512
5 Optics 113 0.007979 0.055510 0.418362 0.035664 0.065725 0.253564 0.005865 0.000058 0.008256 0.213715 72.847674 1.156582
6 Gaussian-mixture 2 0.029984 0.020611 0.031242 0.015772 0.020962 0.653385 0.604472 0.616992 0.546461 0.419297 1962.424341 1.064706
In [12]:
# Embed the features into three UMAP dimensions (seeded, so the embedding is
# repeatable) and wrap the result in a DataFrame for the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)

# True label distribution on the UMAP embedding, followed by every
# clustering model's result for the same embedding.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})
audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1903, 1: 825})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 745, 4: 742, 0: 738, 2: 276, 3: 83, 5: 70, 6: 41, 7: 33})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({1: 1903, 0: 742, 2: 83})
 2D representation
 3D representation
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({2: 1014, 1: 745, 0: 742, 3: 144, 4: 83})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1570, 1: 1158})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: