import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline

# Load the vowel zero-crossing feature set. load_dataset (from
# audio_results_util) returns three values: the feature table, the original
# labels, and an encoded version of those labels.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/zeroCrossings_ideologyFive_features_vowel.csv'
)

# Notebook cell: display the dimensions of the loaded feature table.
train_data.shape

(2728, 530)

# Read the stored clustering-evaluation table for the untransformed features
# and display it as the cell result.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv'
)
results_df

# Print the ground-truth label distribution, then plot every clustering model
# on the raw feature table. The second argument is the path prefix of the
# stored results (presumably where the per-model label files live; verify
# against audio_results_util).
actualDistribution(train_data, true_labels, true_encoded_labels)
plotAllClusterModels(
    train_data,
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/zeroCrossings_ideologyFive_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1655, 1: 1073})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2691, -1: 32, 1: 5})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2637, 1: 47, 0: 44})
 2D representation

PCA transformed data

# Read the stored clustering-evaluation table for the PCA run and display it.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-pca.csv'
)
results_df

# Project the features onto the first three principal components and wrap the
# resulting array in a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)
actualDistribution(pca_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-pca',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1660, 1: 1068})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation

 3D representation

Birch
predicted_labels--> Counter({0: 2727, 1: 1})
 2D representation

 3D representation

DBSCAN
predicted_labels--> Counter({0: 2578, -1: 130, 1: 20})
 2D representation

 3D representation

OPTICS
predicted_labels--> Counter({-1: 2570, 1: 67, 2: 50, 0: 41})
 2D representation

t-SNE transformed data

# Read the stored clustering-evaluation table for the t-SNE run and display it.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-tsne.csv'
)
results_df

# Embed the features into three t-SNE dimensions (using all available cores)
# and wrap the resulting array in a DataFrame for the plotting helpers.
# NOTE(review): no random_state is passed, so the embedding is not guaranteed
# to be reproducible between runs -- confirm whether that is intended.
tsne_transformed = TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Plot the clustering models on the t-SNE embedding. The original cell passed
# the raw `train_data` here, leaving `tsne_transformed` unused for the cluster
# plots and making this section inconsistent with the PCA and UMAP sections,
# which both plot on their transformed data.
plotAllClusterModels(
    tsne_transformed,
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-tsne',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1397, 0: 1331})
 2D representation

 3D representation

Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 2728})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({0: 1396, 1: 1332})
 2D representation

 3D representation

UMAP transformed data

# Read the stored clustering-evaluation table for the UMAP run and display it.
results_df = pd.read_csv(
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-umap.csv'
)
results_df

# Reduce the features to three UMAP components with a fixed seed
# (random_state=42) and wrap the resulting array in a DataFrame before
# handing it to the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)
actualDistribution(umap_transformed, true_labels, true_encoded_labels)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-umap',
)

Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2431, 'ee': 297})
Counter({0: 2431, 1: 297})

audio-results4/zeroCrossings_ideologyFive_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 1903, 1: 825})
 2D representation

 3D representation

Agglomerative
predicted_labels--> Counter({1: 745, 4: 742, 0: 738, 2: 276, 3: 83, 5: 70, 6: 41, 7: 33})
 2D representation

 3D representation

Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({1: 1903, 0: 742, 2: 83})
 2D representation

 3D representation

OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({2: 1014, 1: 745, 0: 742, 3: 144, 4: 83})
 2D representation

 3D representation

Gaussian-Mixture
predicted_labels--> Counter({0: 1570, 1: 1158})
 2D representation

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	-0.026934	0.048032	0.071295	0.036609	0.048377	0.638868	0.521261	0.339763	0.434851	0.378865	1028.065327	1.125223
1	Agglomerative clustering	2	-0.000642	-0.000484	0.000123	0.012938	0.000243	0.897317	0.890762	0.499794	0.445545	0.954799	651.097631	0.031278
2	Birch	2	-0.000642	-0.000484	0.000123	0.012938	0.000243	0.897317	0.890762	0.499794	0.445545	0.954799	651.097631	0.031278
3	DBSCAN	2	-0.005447	-0.000856	0.000629	0.002805	0.001028	0.884952	0.879032	0.329794	0.363582	0.643148	157.814694	2.025316
4	Mean-shift	6	-0.003176	-0.002336	0.000614	0.012949	0.001173	0.895722	0.889296	0.166324	0.148488	0.820177	215.972385	0.089023
5	Optics	2	0.174168	0.113629	0.085756	0.174290	0.114952	0.891485	0.030059	0.052623	0.600903	-0.098252	50.140990	0.861420
6	Gaussian-mixture	2	0.009444	-0.000412	0.000242	0.000529	0.000332	0.866717	0.864003	0.503992	0.510967	0.612404	392.451525	1.206402

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	-0.027619	0.047550	0.070528	0.036259	0.047895	0.639119	0.523094	0.340791	0.435162	0.434889	1375.838494	0.936426
1	Agglomerative clustering	2	-0.000642	-0.000484	0.000123	0.012938	0.000243	0.897317	0.890762	0.499794	0.445545	0.959757	823.662711	0.028059
2	Birch	2	-0.000642	-0.000484	0.000123	0.012938	0.000243	0.897317	0.890762	0.499794	0.445545	0.959757	823.662711	0.028059
3	DBSCAN	2	-0.017480	-0.000221	0.000956	0.001403	0.001138	0.846783	0.840909	0.315533	0.313150	0.447148	323.119574	1.285344
4	Mean-shift	3	-0.002546	-0.000742	0.000491	0.014265	0.000950	0.896121	0.889663	0.332785	0.296990	0.835570	503.246762	0.340622
5	Optics	3	0.150267	0.097336	0.090278	0.109540	0.098981	0.869539	0.017595	0.010848	0.273753	-0.104876	55.899549	0.840575
6	Gaussian-mixture	2	-0.010291	-0.000285	0.000263	0.000288	0.000275	0.814731	0.814150	0.493754	0.492972	0.520866	733.888701	1.041487

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.033649	7.483463e-02	1.132411e-01	0.056248	7.516223e-02	0.648099	0.407625	0.286348	0.417039	0.308246	1204.575611	1.448123
1	Agglomerative clustering	338	0.000638	4.518202e-02	5.932241e-01	0.035548	6.707700e-02	0.054838	0.005499	0.000018	0.002959	0.434740	4133.412487	0.705597
2	Birch	580	0.000353	3.559516e-02	6.372685e-01	0.035418	6.710623e-02	0.043463	0.001833	0.000004	0.001232	0.383644	3843.072813	0.704161
3	DBSCAN	141	0.005557	5.842079e-02	4.666477e-01	0.038095	7.044044e-02	0.197416	0.008431	0.000087	0.007313	0.169170	88.863604	1.218553
4	Mean-shift	1	0.000000	1.769098e-15	3.226002e-16	1.000000	6.452004e-16	0.897715	0.891129	0.500000	0.445565	-1.000000	-1.000000	-1.000000
5	Optics	87	0.007562	5.436222e-02	3.268927e-01	0.035613	6.422828e-02	0.402573	0.008431	0.000108	0.011364	-0.060594	40.634738	1.276768
6	Gaussian-mixture	2	0.033399	7.470108e-02	1.130418e-01	0.056148	7.502872e-02	0.647990	0.592009	0.713447	0.582878	0.308057	1204.103375	1.448260

	Unnamed: 0	n_clusters	ARI	AMI	H	C	V	FM	A	R	P	silhouette	calinski	davies
0	K-Means	2	0.010320	0.000315	0.000972	0.000546	0.000699	0.685873	0.662023	0.519235	0.508846	0.486012	2194.604159	0.823348
1	Agglomerative clustering	8	0.072187	0.120391	0.345307	0.073775	0.121576	0.489081	0.253299	0.038856	0.117025	0.631658	3384.161704	0.468086
2	Birch	46	0.007059	0.074318	0.464746	0.042703	0.078219	0.161521	0.022727	0.000683	0.021758	0.514259	17835.943980	0.615782
3	DBSCAN	3	0.052039	0.089596	0.138411	0.066938	0.090236	0.690497	0.332111	0.318316	0.353015	0.514445	1728.680714	0.590009
4	Mean-shift	5	0.075392	0.134943	0.331647	0.085305	0.135704	0.532563	0.263196	0.064391	0.193521	0.578058	3109.287003	0.636512
5	Optics	113	0.007979	0.055510	0.418362	0.035664	0.065725	0.253564	0.005865	0.000058	0.008256	0.213715	72.847674	1.156582
6	Gaussian-mixture	2	0.029984	0.020611	0.031242	0.015772	0.020962	0.653385	0.604472	0.616992	0.546461	0.419297	1962.424341	1.064706