import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
# Import the remaining analysis, plotting and project helper modules
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the vowel feature table together with its ground-truth labels.
# load_dataset hands back three values: the feature matrix, the original
# labels, and the same labels in encoded form.
features_csv = 'Audio_features_vowel/mfcc_ideologyFive_features_vowel.csv'
train_data, true_labels, true_encoded_labels = load_dataset(features_csv)

# Inspect the dimensions of the loaded feature matrix.
train_data.shape

(2728, 13)

# Raw (untransformed) features. The same path is used both as the metrics CSV
# to read and as the prefix handed to plotAllClusterModels, which looks up
# saved label files such as "<prefix>_kmeans_labels.npy".
raw_results_path = 'audio-results4/mfcc_ideologyFive_features_vowel.csv'
results_df = pd.read_csv(raw_results_path)
results_df

# Print the true label distribution, then plot every clustering model's
# predicted labels over the original feature matrix.
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(train_data, raw_results_path)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/mfcc_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1391, 1: 1337})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 2393, 1: 335})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2577, -1: 145, 1: 6})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2660, 1: 38, 0: 30})
 2D representation

PCA transformed data

# Metrics table saved for the clustering run on PCA-reduced features.
pca_results_csv = 'audio-results4/mfcc_ideologyFive_features_vowel.csv-pca.csv'
results_df = pd.read_csv(pca_results_csv)
results_df

# Reduce the features to three principal components and wrap the result in a
# DataFrame before handing it to the plotting helpers.
pca_model = PCA(n_components=3)
pca_transformed = pd.DataFrame(pca_model.fit_transform(train_data))

# Prefix under which the label files of the PCA run are looked up.
pca_labels_prefix = 'audio-results4/mfcc_ideologyFive_features_vowel.csv-pca'
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(pca_transformed, pca_labels_prefix)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/mfcc_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1410, 0: 1318})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 1121, 1: 795, 3: 476, 2: 336})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 898, 4: 823, 1: 685, 2: 321, 3: 1})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2413, -1: 296, 2: 10, 1: 9})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2650, 0: 44, 1: 34})
 2D representation

t-SNE transformed data

# Metrics table saved for the clustering run on t-SNE-embedded features.
results_df = pd.read_csv('audio-results4/mfcc_ideologyFive_features_vowel.csv-tsne.csv')
results_df

# Embed the features into three t-SNE dimensions, using all available cores.
# NOTE(review): no random_state is set, so the embedding may differ between
# runs and need not match the one the saved labels were computed on -- confirm
# whether a fixed seed was used when the results were generated.
tsne_transformed = TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the t-SNE run's labels on the t-SNE embedding itself, the same way the
# PCA and UMAP sections pass their own transformed data. Passing train_data
# here would draw the clusters on the untransformed feature coordinates.
plotAllClusterModels(tsne_transformed, 'audio-results4/mfcc_ideologyFive_features_vowel.csv-tsne')

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/mfcc_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1383, 0: 1345})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2673, 0: 29, 1: 26})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2642, 1: 53, 0: 33})
 2D representation

 3D representation

MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({1: 1400, 0: 1328})
 2D representation

UMAP transformed data

# Metrics table saved for the clustering run on UMAP-embedded features.
umap_results_csv = 'audio-results4/mfcc_ideologyFive_features_vowel.csv-umap.csv'
results_df = pd.read_csv(umap_results_csv)
results_df

# Embed the features into three UMAP dimensions with a fixed seed, then wrap
# the result in a DataFrame before handing it to the plotting helpers.
umap_model = umap.UMAP(n_components=3, random_state=42)
umap_transformed = pd.DataFrame(umap_model.fit_transform(train_data))

# Prefix under which the label files of the UMAP run are looked up.
umap_labels_prefix = 'audio-results4/mfcc_ideologyFive_features_vowel.csv-umap'
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(umap_transformed, umap_labels_prefix)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/mfcc_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1420, 0: 1308})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({-1: 2712, 0: 9, 1: 7})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2375, 1: 229, 0: 45, 2: 45, 3: 34})
 2D representation

 3D representation

MEAN-SHIFT
predicted_labels--> Counter({0: 1228, 1: 906, 2: 513, 3: 81})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({0: 1450, 1: 1278})
 2D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.016274	0.056348	0.085407	0.042417	0.056682	0.641197	0.435484	0.313802	0.427713	0.110739	350.287552	2.686954
1	Agglomerative clustering	2	0.649229	0.447850	0.466572	0.431097	0.448133	0.928191	0.939150	0.871277	0.834389	0.179656	230.709885	2.263013
2	Birch	2	-0.000642	-0.000484	0.000123	0.012938	0.000243	0.897317	0.890762	0.499794	0.445545	0.663595	15.903743	0.242808
3	DBSCAN	2	0.184688	0.064796	0.054477	0.083985	0.066086	0.877428	0.858138	0.320993	0.302807	0.177009	20.365912	3.946749
4	Mean-shift	2	-0.000642	-0.000484	0.000123	0.012938	0.000243	0.897317	0.890762	0.499794	0.445545	0.663595	15.903743	0.242808
5	Optics	2	-0.036324	0.010561	0.008461	0.021772	0.012186	0.870827	0.010997	0.004114	0.333333	-0.034120	31.917927	1.200985
6	Gaussian-mixture	2	0.337928	0.246440	0.326392	0.198344	0.246745	0.809248	0.169721	0.145474	0.318321	0.129879	176.396969	3.509053

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.012085	0.052973	0.080294	0.039899	0.053308	0.639781	0.556818	0.680401	0.570088	0.250685	947.704145	1.528516
1	Agglomerative clustering	4	0.105769	0.165126	0.392742	0.104992	0.165690	0.558086	0.416789	0.148700	0.257501	0.240259	935.071262	1.145706
2	Birch	5	0.094052	0.163848	0.399935	0.103565	0.164525	0.535144	0.329545	0.095833	0.202785	0.274818	821.463444	0.950683
3	DBSCAN	3	0.266904	0.103731	0.112080	0.099176	0.105234	0.853376	0.820015	0.230049	0.231765	0.135008	58.416733	2.307579
4	Mean-shift	2	0.034826	0.020979	0.011776	0.183000	0.022127	0.898613	0.892962	0.511373	0.835560	0.427101	38.367833	0.816753
5	Optics	2	-0.040609	0.012008	0.009725	0.022401	0.013562	0.866914	0.016129	0.006033	0.333333	-0.017876	46.667537	0.944523
6	Gaussian-mixture	2	0.392557	0.245753	0.302921	0.207201	0.246080	0.842888	0.137830	0.167482	0.299854	0.275362	640.386893	1.581857

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.028180	9.255041e-02	1.399489e-01	0.069495	9.287167e-02	0.645826	0.584311	0.731295	0.589777	0.324522	1710.617078	1.203074
1	Agglomerative clustering	843	0.000317	3.797370e-02	7.975324e-01	0.041182	7.831905e-02	0.032847	0.002933	0.000014	0.001631	0.355804	578.516816	0.720103
2	Birch	2351	0.000039	8.073284e-03	9.793275e-01	0.043659	8.359088e-02	0.010798	0.000733	0.000002	0.000851	0.109184	754.384654	0.233542
3	DBSCAN	2	-0.030399	8.547338e-03	6.825843e-03	0.020860	1.028595e-02	0.875929	0.010630	0.003976	0.333333	0.178124	79.024040	0.633752
4	Mean-shift	1	0.000000	1.769098e-15	3.226002e-16	1.000000	6.452004e-16	0.897715	0.891129	0.500000	0.445565	-1.000000	-1.000000	-1.000000
5	Optics	2	-0.034574	6.761861e-03	6.078194e-03	0.012993	8.282011e-03	0.865040	0.012830	0.006770	0.345912	0.194352	125.648547	0.661727
6	Gaussian-mixture	2	0.024291	9.200448e-02	1.390929e-01	0.069095	9.232601e-02	0.644448	0.578812	0.729688	0.589198	0.324440	1709.560442	1.202672

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.047580	0.099927	0.150952	0.075039	0.100246	0.653910	0.389663	0.257058	0.405562	0.309735	1452.207514	1.274124
1	Agglomerative clustering	83	0.003776	0.080276	0.592263	0.046742	0.086646	0.117665	0.001833	0.000060	0.000932	0.319799	1171.259733	0.983480
2	Birch	42	0.006147	0.091886	0.553184	0.052221	0.095433	0.168780	0.022727	0.001100	0.028711	0.340894	1288.165522	0.919936
3	DBSCAN	2	-0.009840	0.001319	0.001971	0.016951	0.003531	0.891347	0.003299	0.001234	0.333333	0.424794	43.337125	0.404926
4	Mean-shift	4	0.124901	0.174721	0.379102	0.114040	0.175337	0.600055	0.446848	0.138660	0.249470	0.338498	1351.812755	0.951689
5	Optics	4	0.582486	0.383562	0.482197	0.319935	0.384654	0.910631	0.076246	0.129427	0.245939	0.119601	349.690499	0.639533
6	Gaussian-mixture	2	0.012810	0.088890	0.134191	0.066818	0.089213	0.640950	0.438050	0.276818	0.413044	0.309241	1445.973678	1.269568