In [2]:
# import the libraries used throughout the notebook
import os
import time
from collections import Counter

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import sklearn
from sklearn import mixture
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, normalize
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

from audio_results_util import load_dataset, plotClusters, encodedLabels, plotData3d, plotData2d, plotAllClusterModels, actualDistribution
%matplotlib inline
In [6]:
train_data,true_labels,true_encoded_labels=load_dataset('Audio_features_vowel/zeroCrossings_ideology_features_vowel.csv')
In [7]:
train_data.shape
Out[7]:
(2696, 27)
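For context, load_dataset lives in audio_results_util and is not shown here; below is a hypothetical sketch of a loader with the same return signature. The name load_dataset_sketch and the 'label' column name are assumptions, not the actual implementation.

# Hypothetical sketch only -- the real load_dataset is in audio_results_util and
# may differ; assumes the CSV holds numeric feature columns plus a string
# label column named 'label' (values such as 'ai' / 'ee').
def load_dataset_sketch(path, label_col='label'):
    df = pd.read_csv(path)
    true_labels = df[label_col]
    train_data = df.drop(columns=[label_col])
    true_encoded_labels = LabelEncoder().fit_transform(true_labels)  # 'ai'/'ee' -> 0/1
    return train_data, true_labels, true_encoded_labels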
In [8]:
results_df=pd.read_csv('audio-results4/zeroCrossings_ideology_features_vowel.csv')
results_df
Out[8]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.087866 0.053130 0.069122 0.043697 0.053545 0.711414 0.683976 0.384058 0.437182 0.457585 1426.486979 1.129534
1 Agglomerative clustering 2 -0.000656 -0.000495 0.000121 0.012092 0.000240 0.903332 0.897626 0.499793 0.448980 0.814159 40.897066 0.132077
2 Birch 11 -0.096522 0.038108 0.058607 0.032551 0.041855 0.728084 0.716246 0.072509 0.079649 0.430165 284.776141 1.038104
3 DBSCAN 2 -0.038316 0.007783 0.006918 0.014979 0.009465 0.870730 0.866840 0.321768 0.298353 0.623808 305.108284 1.337551
4 Mean-shift 8 -0.034165 0.005626 0.007978 0.015925 0.010630 0.874800 0.871291 0.121685 0.125826 0.560120 149.286203 0.840411
5 Optics 6 0.203122 0.052002 0.060116 0.051057 0.055217 0.870859 0.011869 0.011558 0.126190 -0.036987 32.583290 0.879089
6 Gaussian-mixture 2 -0.080888 0.028416 0.028544 0.029418 0.028974 0.805755 0.198813 0.552292 0.554408 0.500673 962.586876 1.090444
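In this table H, C and V are presumably homogeneity, completeness and V-measure, FM the Fowlkes-Mallows score, and A, R, P accuracy, recall and precision; silhouette, calinski and davies are the internal silhouette, Calinski-Harabasz and Davies-Bouldin scores. The script that wrote this CSV is not shown, but a hedged sketch of how such scores could be computed for one model with scikit-learn:

# Hedged sketch (the pipeline that produced the audio-results4 CSVs is not shown):
# external scores compare predicted labels with the true encoded labels,
# internal scores use only the feature matrix and the predicted labels.
from sklearn import metrics
from sklearn.cluster import KMeans

pred = KMeans(n_clusters=2, random_state=42).fit_predict(train_data)

ari = metrics.adjusted_rand_score(true_encoded_labels, pred)
ami = metrics.adjusted_mutual_info_score(true_encoded_labels, pred)
h, c, v = metrics.homogeneity_completeness_v_measure(true_encoded_labels, pred)
fm = metrics.fowlkes_mallows_score(true_encoded_labels, pred)

sil = metrics.silhouette_score(train_data, pred)
ch = metrics.calinski_harabasz_score(train_data, pred)
db = metrics.davies_bouldin_score(train_data, pred)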
In [9]:
actualDistribution(train_data,true_labels,true_encoded_labels)
plotAllClusterModels(train_data,'audio-results4/zeroCrossings_ideology_features_vowel.csv')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/zeroCrossings_ideology_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2115, 1: 581})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2695, 1: 1})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2204, 5: 423, 3: 28, 6: 14, 2: 10, 8: 6, 10: 3, 7: 3, 1: 2, 9: 2, 4: 1})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2611, -1: 74, 1: 11})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2498, 1: 63, 2: 35, 4: 31, 3: 27, 5: 22, 0: 20})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2622, 4: 35, 6: 15, 2: 11, 1: 9, 5: 2, 3: 1, 7: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 2433, 0: 263})
 2D representation
 3D representation
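In the DBSCAN and OPTICS tallies above, the key -1 is scikit-learn's noise label rather than a cluster. A small, illustrative check on the raw features (eps and min_samples below are the scikit-learn defaults, not necessarily the values behind the saved results):

from sklearn.cluster import DBSCAN

# Illustrative only: refit DBSCAN with default parameters and separate noise
# points (label -1) from clustered points; the saved results above may have
# used different eps / min_samples.
db_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(train_data)
n_noise = int(np.sum(db_labels == -1))
n_clusters = len(set(db_labels)) - (1 if -1 in db_labels else 0)
print(Counter(db_labels), '| noise points:', n_noise, '| clusters:', n_clusters)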
PCA transformed data

In [10]:
results_df=pd.read_csv('audio-results4/zeroCrossings_ideology_features_vowel.csv-pca.csv')
results_df
Out[10]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.088380 0.052080 0.067308 0.043030 0.052498 0.714391 0.688427 0.386536 0.437583 0.501752 1919.594521 0.948633
1 Agglomerative clustering 2 -0.084910 0.030914 0.033375 0.029700 0.031430 0.784437 0.777819 0.436309 0.445405 0.544210 1682.935844 0.874977
2 Birch 7 -0.098538 0.045066 0.077507 0.033847 0.047118 0.678157 0.660237 0.105033 0.123921 0.421883 717.476759 0.895170
3 DBSCAN 2 -0.022742 0.004286 0.004074 0.012807 0.006182 0.884403 0.879822 0.326587 0.298816 0.687122 424.240946 0.732047
4 Mean-shift 2 -0.021551 0.003522 0.002826 0.010417 0.004446 0.885290 0.880935 0.492111 0.458680 0.702601 739.295353 0.583547
5 Optics 4 0.160569 0.036486 0.039380 0.038096 0.038727 0.868507 0.018546 0.014445 0.228964 -0.112342 48.312597 0.866169
6 Gaussian-mixture 2 -0.037332 0.011902 0.008722 0.023599 0.012737 0.875672 0.871662 0.485337 0.447619 0.652847 817.102329 0.633294
In [11]:
pca_transformed=PCA(n_components=3).fit_transform(train_data)
pca_transformed=pd.DataFrame(pca_transformed)
actualDistribution(pca_transformed,true_labels,true_encoded_labels)
plotAllClusterModels(pca_transformed,'audio-results4/zeroCrossings_ideology_features_vowel.csv-pca')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/zeroCrossings_ideology_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 2127, 1: 569})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 2368, 1: 328})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 2052, 3: 494, 6: 59, 4: 44, 1: 30, 5: 14, 2: 3})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 2646, 1: 30, -1: 20})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 2514, 3: 51, 0: 44, 2: 44, 1: 43})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 2648, 1: 48})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 2625, 1: 71})
 2D representation
 3D representation
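Before clustering in the 3-component PCA space it is worth checking how much of the original variance those components retain; a quick check (not part of the original pipeline):

# Quick sanity check (not in the original pipeline): variance retained by the
# three principal components used above.
pca = PCA(n_components=3).fit(train_data)
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum())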
t-SNE transformed data

In [12]:
results_df=pd.read_csv('audio-results4/zeroCrossings_ideology_features_vowel.csv-tsne.csv')
results_df
Out[12]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.002430 0.000406 0.001191 0.000570 0.000771 0.640898 0.470326 5.229635e-01 0.508491 0.362526 1756.137341 1.184806
1 Agglomerative clustering 362 0.000171 0.021628 0.420727 0.023902 0.045234 0.050183 0.003709 1.141029e-05 0.002762 0.458357 3360.063330 0.692749
2 Birch 825 -0.000070 0.009810 0.512669 0.025926 0.049357 0.032849 0.000371 5.006696e-07 0.001212 0.412155 3883.765179 0.582445
3 DBSCAN 69 -0.000384 0.036445 0.258398 0.023757 0.043514 0.214221 0.040059 6.372809e-04 0.013534 0.119838 146.769505 1.280336
4 Mean-shift 4 -0.006244 0.000251 0.002023 0.000637 0.000969 0.547575 0.460682 1.814374e-01 0.243916 0.325934 1023.694548 0.977184
5 Optics 81 0.004748 0.028695 0.195025 0.021564 0.038833 0.431493 0.004080 1.340268e-04 0.012852 -0.087714 38.726297 1.118296
6 Gaussian-mixture 2 -0.001546 0.000525 0.001377 0.000656 0.000889 0.639468 0.514837 4.752221e-01 0.490889 0.359058 1743.108443 1.189005
In [13]:
tsne_transformed=TSNE(n_components=3, n_jobs=-1).fit_transform(train_data)
tsne_transformed=pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed,true_labels,true_encoded_labels)

plotAllClusterModels(tsne_transformed,'audio-results4/zeroCrossings_ideology_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/zeroCrossings_ideology_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1477, 0: 1219})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1325, 1: 749, 2: 621, 3: 1})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 1429, 1: 1267})
 2D representation
 3D representation
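Only n_components and n_jobs are set in the t-SNE call above, so the embedding uses scikit-learn's default perplexity and initialization and is not seeded. If the embedding (and the many tiny clusters found on it) looks unstable, those settings are worth varying; the values below are illustrative, not the ones behind the saved results:

# Illustrative only: explicit perplexity, PCA initialization and a fixed seed
# make the 3-D t-SNE embedding reproducible; not the settings used for the
# saved results above.
tsne_alt = TSNE(n_components=3, perplexity=30, init='pca',
                random_state=42, n_jobs=-1).fit_transform(train_data)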
UMAP transformed data

In [14]:
results_df=pd.read_csv('audio-results4/zeroCrossings_ideology_features_vowel.csv-umap.csv')
results_df
Out[14]:
Model n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 2 -0.028382 0.073114 0.111511 0.054769 0.073459 0.642391 0.487018 0.693423 0.574161 0.466055 2590.346826 0.829096
1 Agglomerative clustering 8 0.021587 0.055022 0.183809 0.033162 0.056188 0.400293 0.081231 0.046764 0.106091 0.570543 6351.737760 0.489970
2 Birch 81 0.002606 0.035508 0.296291 0.022758 0.042270 0.119111 0.013353 0.000741 0.013433 0.526307 21337.079013 0.618640
3 DBSCAN 32 0.013210 0.043813 0.249044 0.026125 0.047289 0.237126 0.054525 0.002621 0.025990 0.463507 3239.947420 1.331013
4 Mean-shift 5 0.065216 0.073069 0.183362 0.046290 0.073919 0.548573 0.422478 0.096027 0.187010 0.578662 4087.374049 0.513710
5 Optics 108 0.001817 0.026521 0.237720 0.020150 0.037151 0.272475 0.004080 0.000101 0.009276 0.204709 66.290296 1.288578
6 Gaussian-mixture 2 -0.028382 0.073114 0.111511 0.054769 0.073459 0.642391 0.487018 0.693423 0.574161 0.466055 2590.346826 0.829096
In [15]:
umap_transformed= umap.UMAP(random_state=42,n_components=3).fit_transform(train_data)
umap_transformed=pd.DataFrame(umap_transformed)
actualDistribution(umap_transformed,true_labels,true_encoded_labels)
plotAllClusterModels(umap_transformed,'audio-results4/zeroCrossings_ideology_features_vowel.csv-umap')
Actual Distribution of the labels in the dataset: --> 
Counter({'ai': 2421, 'ee': 275})
Counter({0: 2421, 1: 275})
audio-results4/zeroCrossings_ideology_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 1632, 0: 1064})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({5: 685, 2: 543, 4: 449, 1: 426, 6: 312, 0: 204, 3: 49, 7: 28})
 2D representation
 3D representation
Birch
Too many labels to show
DBSCAN
Too many labels to show
OPTICS
Too many labels to show
MEAN-SHIFT
predicted_labels--> Counter({0: 1220, 1: 769, 2: 426, 3: 204, 4: 77})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 1632, 0: 1064})
 2D representation
 3D representation
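On the UMAP embedding the K-Means and Gaussian-mixture rows of the table are identical and their label tallies above match exactly. A quick agreement check (refitting both models, so the numbers may differ slightly from the saved runs):

# Refit K-Means and a 2-component Gaussian mixture on the UMAP embedding and
# measure how closely their labelings agree (ARI of 1.0 means identical
# partitions up to label permutation).
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score

km_labels = KMeans(n_clusters=2, random_state=42).fit_predict(umap_transformed)
gm_labels = GaussianMixture(n_components=2, random_state=42).fit_predict(umap_transformed)
print('ARI between K-Means and Gaussian-mixture labels:', adjusted_rand_score(km_labels, gm_labels))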