In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
# import the required libraries
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
c:\users\negih\appdata\local\programs\python\python38\lib\site-packages\librosa\util\decorators.py:9: NumbaDeprecationWarning: An import was requested from a module that has moved location.
Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
c:\users\negih\appdata\local\programs\python\python38\lib\site-packages\librosa\util\decorators.py:9: NumbaDeprecationWarning: An import was requested from a module that has moved location.
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
In [2]:
# Read the vowel formant feature file via the project helper.
# NOTE(review): load_dataset presumably returns the feature table, the string
# labels, and their integer encoding, in that order — confirm in audio_results_util.
(
    train_data,
    true_labels,
    true_encoded_labels,
) = load_dataset('Audio_features_vowel/formants50_ideology_features_vowel.csv')
In [ ]:
 
In [3]:
# Load the saved per-algorithm clustering scores (results file with no
# transform suffix) and display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formants50_ideology_features_vowel.csv'
)
results_df
Out[3]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.092432 0.040228 0.058237 0.031170 0.040606 0.716015 0.288205 0.344200 0.432755 0.311596 1065.784191 1.263674
1 Agglomerative clustering 3 0.106340 0.056043 0.091357 0.041120 0.056713 0.698659 0.281899 0.193818 0.278935 0.312206 966.787979 0.983968
2 Birch 2 0.026183 0.001446 0.001651 0.003354 0.002213 0.873339 0.871662 0.511123 0.527728 0.559811 715.568981 0.635304
3 DBSCAN 2 0.054904 0.004672 0.006504 0.005020 0.005666 0.811604 0.801929 0.310566 0.361583 0.432796 513.432609 1.672417
4 Mean-shift 4 0.024207 0.000773 0.002171 0.003710 0.002739 0.871513 0.869065 0.252421 0.268324 0.582072 412.381624 0.470183
5 Optics 2 0.021764 0.002741 0.003180 0.007556 0.004476 0.881242 0.878709 0.339067 0.359574 0.615105 464.256177 0.406591
6 Gaussian-mixture 2 -0.021932 0.000917 0.001611 0.001257 0.001412 0.775257 0.771884 0.481354 0.486593 0.363152 88.446150 4.499355
In [4]:
# First report the ground-truth label distribution, then draw the plots for
# every clustering model whose results are stored under the given path prefix.
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/formants50_ideology_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants50_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1872, 0: 824})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 1772, 0: 857, 2: 67})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2593, 1: 103})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2374, -1: 255, 1: 67})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({0: 2620, 1: 67, -1: 9})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2589, 1: 75, 2: 31, 3: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2292, 1: 404})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [5]:
# Load the saved clustering scores for the PCA-suffixed results file and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formants50_ideology_features_vowel.csv-pca.csv'
)
results_df
Out[5]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.092104 0.040094 0.058063 0.031061 0.040472 0.715755 0.288576 0.344406 0.432889 0.311601 1065.776518 1.264055
1 Agglomerative clustering 3 0.106340 0.056043 0.091357 0.041120 0.056713 0.698659 0.281899 0.193818 0.278935 0.312206 966.787979 0.983968
2 Birch 2 0.026183 0.001446 0.001651 0.003354 0.002213 0.873339 0.871662 0.511123 0.527728 0.559811 715.568981 0.635304
3 DBSCAN 2 0.054904 0.004672 0.006504 0.005020 0.005666 0.811604 0.801929 0.310566 0.361583 0.432796 513.432609 1.672417
4 Mean-shift 4 0.024207 0.000773 0.002171 0.003710 0.002739 0.871513 0.869065 0.252421 0.268324 0.582072 412.381624 0.470183
5 Optics 2 0.021764 0.002741 0.003180 0.007556 0.004476 0.881242 0.878709 0.339067 0.359574 0.615105 464.256177 0.406591
6 Gaussian-mixture 2 0.027211 0.002264 0.002110 0.005973 0.003118 0.884855 0.882047 0.510459 0.539533 0.625071 682.362391 0.268972
In [6]:
# Project the features onto their first three principal components and wrap
# the resulting array in a DataFrame for the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# Label distribution first, then the per-model cluster plots on the PCA data.
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formants50_ideology_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants50_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1871, 0: 825})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({1: 1772, 0: 857, 2: 67})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2593, 1: 103})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2374, -1: 255, 1: 67})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({0: 2620, 1: 67, -1: 9})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2589, 1: 75, 2: 31, 3: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2629, 1: 67})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [7]:
# Load the saved clustering scores for the t-SNE-suffixed results file and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formants50_ideology_features_vowel.csv-tsne.csv'
)
results_df
Out[7]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.018197 0.027858 0.043730 0.020822 0.028212 0.646809 0.429525 0.361643 0.449181 0.306823 1345.653901 1.320267
1 Agglomerative clustering 347 0.000229 0.017014 0.370677 0.021188 0.040085 0.051578 0.004822 0.000015 0.002882 0.436314 1477.605332 0.727516
2 Birch 387 0.000062 0.016926 0.382438 0.021882 0.041396 0.054796 0.001855 0.000022 0.001895 0.386497 1165.804424 0.795906
3 DBSCAN 185 0.001693 0.018317 0.242379 0.018846 0.034973 0.257989 0.002596 0.000016 0.005376 0.169661 46.382770 1.156250
4 Mean-shift 2 0.027211 0.002264 0.002110 0.005973 0.003118 0.884855 0.882047 0.510459 0.539533 0.422452 302.242513 0.495401
5 Optics 81 0.006552 0.021224 0.159565 0.017326 0.031259 0.420538 0.013353 0.000732 0.019268 -0.037886 51.513661 1.137796
6 Gaussian-mixture 2 0.021638 0.025343 0.039759 0.018984 0.025698 0.649337 0.580119 0.632445 0.548835 0.306801 1339.807026 1.321981
In [8]:
# Embed the features in three dimensions with t-SNE and wrap the result in a
# DataFrame for the plotting helpers. t-SNE is stochastic, so the seed is fixed
# (same value the UMAP cell uses) to make the embedding repeatable across runs.
tsne_transformed = pd.DataFrame(
    TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the clusters on the t-SNE embedding. The original call passed the raw
# `train_data`, so the "t-SNE" figures were not drawn in t-SNE space, unlike
# the PCA and UMAP cells, which pass their transformed data.
# NOTE(review): the saved cluster labels were presumably produced from a
# separate t-SNE run; confirm that run used the same seed if the embedding
# must match exactly.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/formants50_ideology_features_vowel.csv-tsne',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants50_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1415, 0: 1281})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2629, 1: 67})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1455, 1: 1241})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [9]:
# Load the saved clustering scores for the UMAP-suffixed results file and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formants50_ideology_features_vowel.csv-umap.csv'
)
results_df
Out[9]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 0.027211 0.002264 0.002110 0.005973 0.003118 0.884855 0.882047 0.510459 0.539533 0.823262 3523.770567 0.133402
1 Agglomerative clustering 2 0.027211 0.002264 0.002110 0.005973 0.003118 0.884855 0.882047 0.510459 0.539533 0.823262 3523.770567 0.133402
2 Birch 142 0.000125 0.022119 0.260059 0.017816 0.033348 0.085888 0.006306 0.000049 0.005701 0.436495 4558.172265 0.825683
3 DBSCAN 2 -0.000027 -0.000689 0.000780 0.002242 0.001157 0.884110 0.017804 0.008758 0.380741 0.247163 995.024240 0.424755
4 Mean-shift 2 0.027211 0.002264 0.002110 0.005973 0.003118 0.884855 0.882047 0.510459 0.539533 0.823262 3523.770567 0.133402
5 Optics 97 0.004448 0.023276 0.194246 0.018652 0.034036 0.359516 0.013724 0.000649 0.014258 0.028656 18.414464 1.476107
6 Gaussian-mixture 2 0.027211 0.002264 0.002110 0.005973 0.003118 0.884855 0.882047 0.510459 0.539533 0.823262 3523.770567 0.133402
In [10]:
# Reduce the features to three UMAP components (seeded with 42) and wrap the
# resulting array in a DataFrame for the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)

# Label distribution first, then the per-model cluster plots on the UMAP data.
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formants50_ideology_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/formants50_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2629, 1: 67})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2629, 1: 67})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2637, 0: 50, 1: 9})
 2D representation
 3D representation
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2629, 1: 67})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2629, 1: 67})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: