Loading all the necessary libraries
from imutils import paths
import face_recognition
import argparse
import pickle
import cv2
import os
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.pyplot as mpld3
import pandas as pd
from collections import Counter
from sklearn import metrics
from pylab import *
from sklearn.cluster import DBSCAN
from sklearn.model_selection import GridSearchCV
from kneed import DataGenerator, KneeLocator
import sklearn
import hdbscan
from scipy.spatial import distance
from scipy.cluster import hierarchy
from s_dbw import S_Dbw
%matplotlib inline
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
Loading the filepaths for a dataset
def loadFilePaths(dirname):
    """Return the joined path of every entry found directly inside ``dirname``.

    Entries are returned in the order ``os.listdir`` yields them. Nothing is
    filtered out, so sub-directories and non-image files are included as well.
    """
    entry_paths = []
    for entry_name in os.listdir(dirname):
        entry_paths.append(os.path.join(dirname, entry_name))
    return entry_paths
# Collect the path of every entry inside the 'ideology_image_dataset' directory.
ideology_files_path=loadFilePaths('ideology_image_dataset')
Using the face_recognition module to extract the face embeddings and saving them to a pickle file
# Detect every face in every dataset image and compute its embedding.
# `data` ends up with one list per input file (in `files_path` order); each
# list holds one record per detected face with the image path, the face box
# and the face encoding.
files_path = ideology_files_path
data = []
total_files = len(files_path)
for (i, imagePath) in enumerate(files_path):
    print(" Status: %s / %s" % (i, total_files), end="\r")
    image = cv2.imread(imagePath)
    if image is None:
        # cv2.imread returns None for files it cannot decode (non-images,
        # corrupt files, directories). Record "no faces" for this entry so
        # that data[k] still corresponds to files_path[k], then move on
        # instead of crashing inside cvtColor.
        data.append([])
        continue
    # OpenCV loads images as BGR; the face detector is fed the RGB version.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    boxes = face_recognition.face_locations(rgb)
    encodings = face_recognition.face_encodings(rgb, boxes)
    d = [
        {"image-path": imagePath, "face-location": box, "face-encodings": enc}
        for (box, enc) in zip(boxes, encodings)
    ]
    data.append(d)
Saving the face encodings in the pickle file
# Persist the per-image face records. The context manager guarantees the file
# is closed even if pickling fails part-way through.
with open("ideology_data_face_Embeddings.pickle", "wb") as f:
    pickle.dump(data, f)
Loading the encodings from pickle file
# NOTE: unpickling can execute arbitrary code; only load a pickle file that
# this notebook (or another trusted source) produced.
with open('ideology_data_face_Embeddings.pickle','rb') as f:
    ideology_face_data = pickle.load(f)

# The pickle stores one list of face records per image; flatten it into a
# single list with one record per detected face.
ideology_face_data_updated = [
    face_record
    for faces_in_image in ideology_face_data
    for face_record in faces_in_image
]
train_data = np.array(ideology_face_data_updated)
Extracting the encodings and making the data ready for clustering
def getEncodings(data):
    """Return the ``"face-encodings"`` value of every face record in ``data``.

    Args:
        data: Iterable of dict-like face records, each holding a
            ``"face-encodings"`` key.

    Returns:
        A list with one encoding per record, in input order.

    Raises:
        KeyError: If a record has no ``"face-encodings"`` entry.
    """
    # Iterate the records directly: wrapping them in a NumPy object array
    # first only made an unused copy and broke on lazy iterables.
    return [record["face-encodings"] for record in data]
# Pull the face-encoding value out of every flattened face record.
ideology_encodings=getEncodings(ideology_face_data_updated)
# Notebook cell expression: number of encodings that will be clustered.
len(ideology_encodings)
3532
Visualizing some detected faces from a image
def showImage(file_path,coordinates):
    """Display an image with a single green rectangle drawn on it.

    Args:
        file_path: Path of the image file to read with OpenCV.
        coordinates: Indexable box whose first four items are read as
            ``(top, right, bottom, left)`` pixel positions.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``file_path``.
    """
    top = coordinates[0]
    right = coordinates[1]
    bottom = coordinates[2]
    left = coordinates[3]

    image = cv2.imread(file_path)
    if image is None:
        # cv2.imread does not raise on a missing or undecodable file; it returns
        # None, which would otherwise surface as an obscure cvtColor error.
        raise FileNotFoundError("Could not read image: %s" % file_path)

    # OpenCV loads BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    cv2.rectangle(image, (left, top), (right, bottom), (0, 255, 0), 2)
    plt.imshow(image)
    plt.axis("off")
    plt.show()
# Draw the first three face boxes stored under index 9 of the per-image face
# data on clip_2805.jpg (presumably the image those boxes were detected in).
for face_idx in range(3):
    showImage(
        'ideology_image_dataset/clip_2805.jpg',
        ideology_face_data[9][face_idx]['face-location'],
    )
Implementing the clustering models
def cv_silhouette_scorer(estimator, X):
    """Fit ``estimator`` on ``X`` and score the resulting clustering.

    The labels are taken from ``estimator.labels_`` when that attribute is
    available and from ``estimator.predict(X)`` otherwise.

    Returns:
        -1 when the clustering has at most two distinct labels or as many
        labels as samples; otherwise ``metrics.silhouette_score`` of the
        labelling.
    """
    estimator.fit(X)
    try:
        assigned = estimator.labels_
    except Exception:
        assigned = estimator.predict(X)

    distinct_count = len(set(assigned))
    sample_count = len(X)
    degenerate = distinct_count <= 2 or distinct_count == sample_count
    return -1 if degenerate else metrics.silhouette_score(X, assigned)
def evaluation_Score(features,y_pred,output_df,model):
# Compute internal clustering-quality metrics for one clustering result and
# store them in the row of `output_df` labelled `model`.
#   features  : the samples that were clustered
#   y_pred    : predicted cluster label for each sample
#   output_df : DataFrame written to through .loc; the same object is returned
#   model     : row label inside output_df
# Any exception raised while scoring is printed and swallowed, so metrics that
# were not written before the failure are left untouched.
# NOTE(review): indentation was lost in this export. Whether the S_Dbw section
# below belongs to the else branch or runs after the if/else cannot be
# determined from this text - confirm against the original notebook.
try:
num_labels=len(set(y_pred))
total_samples=len(y_pred)
print("labels",num_labels)
# Degenerate clustering (one label overall, or one label per sample):
# store -1 for the three sklearn metrics instead of calling them.
if(num_labels==1 or num_labels==total_samples):
output_df.loc[model,'silhouette'] =-1
output_df.loc[model,'calinski'] =-1
output_df.loc[model,'davies'] =-1
else:
output_df.loc[model,'silhouette'] =metrics.silhouette_score(features,y_pred)
output_df.loc[model,'calinski'] =metrics.calinski_harabasz_score(features, y_pred)
output_df.loc[model,'davies'] =metrics.davies_bouldin_score(features,y_pred)
# S_Dbw is given the features as a NumPy array.
features = np.array(features)
score = S_Dbw(features,y_pred, centers_id=None, method='Tong', alg_noise='bind',
centr='mean', nearest_centr=True, metric='euclidean')
print("Score",score)
output_df.loc[model,'S_Dbw']=score
except Exception as e:
print(e)
pass
return output_df
def getEpsilon(train_data):
# Build a list of candidate DBSCAN `eps` values: knee points of the sorted
# nearest-neighbour distance curve (one attempt per sensitivity S), followed
# by the fixed values 0.6, 0.5 and 0.8. The list is printed and returned.
neigh = sklearn.neighbors.NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(train_data)
distances, indices = nbrs.kneighbors(train_data)
# Sort every column ascending, then keep the second column.
# presumably column 0 is each point's zero distance to itself and column 1 the
# distance to its nearest other point - TODO confirm.
distances = np.sort(distances, axis=0)
distances = distances[:,1]
y=distances
x=list(np.arange(0,len(distances)))
epsilons=[]
# Try knee detection with sensitivities S = 10, 35, 60, 85, 110.
for s in range(10,120,25):
try:
kneedle = KneeLocator(x,y,S=s, curve='convex', direction='increasing')
epsilon=kneedle.all_elbows_y[0]
# NOTE(review): the comparison uses a signed difference, so a knee larger than
# the previously stored one is also skipped; abs() may have been intended - confirm.
if(len(epsilons)>=1 and epsilons[-1]-epsilon<=0.001):
print(" ")
else:
epsilons.append(epsilon)
except Exception as e:
print(e)
# NOTE(review): indentation was lost in this export; the fallback below
# (previous value + s/10, or s/10 when the list is empty) presumably belongs
# to the except handler - confirm against the original notebook.
if(len(epsilons)>=1):
epsilons.append(epsilons[-1]+s/10)
else:
epsilons.append(s/10)
# Fixed candidates appended in addition to whatever was collected above.
epsilons.append(0.6)
epsilons.append(0.5)
epsilons.append(0.8)
print(epsilons)
return epsilons
def runGridSearch(estimator,params_dict,train_data):
    """Grid-search ``estimator`` over ``params_dict`` and return the best model's labels.

    Args:
        estimator: Clustering estimator to tune.
        params_dict: Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
        train_data: Samples to cluster.

    Returns:
        ``labels_`` of the best estimator when it exposes that attribute,
        otherwise the result of ``predict(train_data)`` on the fitted search.
    """
    # A single "split" whose train and test selections are both slice(None),
    # i.e. the whole dataset: every candidate is fitted and scored on all samples.
    cv = [(slice(None), slice(None))]
    gs = GridSearchCV(
        estimator=estimator,
        param_grid=params_dict,
        scoring=cv_silhouette_scorer,
        cv=cv,
        n_jobs=-1,
    )
    gs.fit(train_data)
    try:
        predicted_labels = gs.best_estimator_.labels_
    except AttributeError:
        # Only a missing attribute should trigger the fallback; a bare except
        # would also swallow KeyboardInterrupt and unrelated failures.
        predicted_labels = gs.predict(train_data)
    return predicted_labels
Running for DBSCAN with grid search
# Tune DBSCAN over a fixed eps / metric grid and record the quality metrics.
# Knee-based eps candidates could be used instead of the fixed list:
#epsilons=getEpsilon(ideology_encodings)
output_df = pd.DataFrame(
    index=['DBSCAN-GridSearch'],
    columns=['n_clusters', 'silhouette', 'calinski', 'davies', 'S_Dbw'],
)
params_dict = {
    'eps': [0.3, 0.4, 0.5, 0.6, 0.7],
    'min_samples': [5],
    'metric': ['euclidean', 'manhattan', 'mahalanobis', 'minkowski'],
}
predicted_labels2 = runGridSearch(sklearn.cluster.DBSCAN(), params_dict, ideology_encodings)
# evaluation_Score writes its metrics into output_df in place.
evaluation_Score(ideology_encodings, predicted_labels2, output_df, 'DBSCAN-GridSearch')

# The label -1 (DBSCAN's noise label) is not counted as a cluster.
distinct_labels = set(predicted_labels2)
n_clustersLen = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
output_df.loc['DBSCAN-GridSearch', 'n_clusters'] = n_clustersLen
output_df
labels 116 Score 0.6942229654459516
n_clusters | silhouette | calinski | davies | S_Dbw | |
---|---|---|---|---|---|
DBSCAN-GridSearch | 115 | 0.0246545 | 15.8596 | 1.52634 | 0.694223 |
Saving the corresponding labels
# Store the grid-search DBSCAN labels on disk; the visualization code further
# down reloads this file with np.load.
np.save('bio-metric-predicted-dbscanwithGrid-labels.npy',np.array(predicted_labels2))
Visualizing the results
def showClustering(predicted_labels,label):
    """Plot the face crop of every sample assigned to ``label`` in a 4-column grid.

    Reads the module-level ``train_data`` records: the entry at each matching
    index supplies the ``"image-path"`` to load and the ``"face-location"``
    box ``(top, right, bottom, left)`` to crop.

    Args:
        predicted_labels: Cluster label per sample (array or list), indexed
            consistently with ``train_data``.
        label: The cluster label to display.
    """
    # np.asarray makes the element-wise comparison work for plain lists too.
    member_indices = np.where(np.asarray(predicted_labels) == label)[0]
    member_count = len(member_indices)
    print("CLUSTER--> ",label,"TOTAL IMAGES--> ",member_count)

    # Taller figures for bigger clusters. The ranges are contiguous, so a
    # cluster of exactly 100 members also gets a figure (it previously matched
    # no branch and left `fig` undefined).
    if member_count >= 500:
        fig_height = 300
    elif member_count >= 100:
        fig_height = 70
    elif member_count >= 50:
        fig_height = 30
    elif member_count >= 20:
        fig_height = 20
    else:
        fig_height = 8
    fig = plt.figure(figsize=(10, fig_height))

    # Grid geometry is the same for every face, so compute it once.
    # add_subplot needs integer row/column counts, hence the int().
    columns = 4
    rows = int(np.ceil(member_count / float(columns)))

    for i, index in enumerate(member_indices):
        record = train_data[index]
        image = cv2.imread(record["image-path"])
        (top, right, bottom, left) = record["face-location"]
        face = image[top:bottom, left:right]
        # OpenCV loads BGR; matplotlib expects RGB.
        face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
        fig.add_subplot(rows, columns, i + 1)
        plt.imshow(face)
    plt.show()
# Load the two saved label arrays from disk.
predicted_labels_withoutGrid = np.load('bio-metric-predicted-dbscanwithoutGrid-labels.npy')
predicted_labels_withGrid = np.load('bio-metric-predicted-dbscanwithGrid-labels.npy')

unique_labels = set(predicted_labels_withGrid)
len(unique_labels)

# Show the faces of every cluster from the grid-search run, skipping label -1.
for label in unique_labels:
    if label == -1:
        continue
    showClustering(predicted_labels_withGrid, label)
CLUSTER--> 0 TOTAL IMAGES--> 27
CLUSTER--> 1 TOTAL IMAGES--> 26
CLUSTER--> 2 TOTAL IMAGES--> 9
CLUSTER--> 3 TOTAL IMAGES--> 19
CLUSTER--> 4 TOTAL IMAGES--> 13
CLUSTER--> 5 TOTAL IMAGES--> 67
CLUSTER--> 6 TOTAL IMAGES--> 12
CLUSTER--> 7 TOTAL IMAGES--> 9
CLUSTER--> 8 TOTAL IMAGES--> 12
CLUSTER--> 9 TOTAL IMAGES--> 14
CLUSTER--> 10 TOTAL IMAGES--> 135
CLUSTER--> 11 TOTAL IMAGES--> 7
CLUSTER--> 12 TOTAL IMAGES--> 5
CLUSTER--> 13 TOTAL IMAGES--> 16
CLUSTER--> 14 TOTAL IMAGES--> 13
CLUSTER--> 15 TOTAL IMAGES--> 7
CLUSTER--> 16 TOTAL IMAGES--> 25
CLUSTER--> 17 TOTAL IMAGES--> 7
CLUSTER--> 18 TOTAL IMAGES--> 6
CLUSTER--> 19 TOTAL IMAGES--> 10
CLUSTER--> 20 TOTAL IMAGES--> 7
CLUSTER--> 21 TOTAL IMAGES--> 19
CLUSTER--> 22 TOTAL IMAGES--> 71
CLUSTER--> 23 TOTAL IMAGES--> 34
CLUSTER--> 24 TOTAL IMAGES--> 20
CLUSTER--> 25 TOTAL IMAGES--> 10
CLUSTER--> 26 TOTAL IMAGES--> 31
CLUSTER--> 27 TOTAL IMAGES--> 17
CLUSTER--> 28 TOTAL IMAGES--> 23
CLUSTER--> 29 TOTAL IMAGES--> 13
CLUSTER--> 30 TOTAL IMAGES--> 7
CLUSTER--> 31 TOTAL IMAGES--> 17
CLUSTER--> 32 TOTAL IMAGES--> 6
CLUSTER--> 33 TOTAL IMAGES--> 18
CLUSTER--> 34 TOTAL IMAGES--> 5
CLUSTER--> 35 TOTAL IMAGES--> 9
CLUSTER--> 36 TOTAL IMAGES--> 12
CLUSTER--> 37 TOTAL IMAGES--> 12
CLUSTER--> 38 TOTAL IMAGES--> 15
CLUSTER--> 39 TOTAL IMAGES--> 16
CLUSTER--> 40 TOTAL IMAGES--> 10
CLUSTER--> 41 TOTAL IMAGES--> 11
CLUSTER--> 42 TOTAL IMAGES--> 11
CLUSTER--> 43 TOTAL IMAGES--> 5
CLUSTER--> 44 TOTAL IMAGES--> 82
CLUSTER--> 45 TOTAL IMAGES--> 7
CLUSTER--> 46 TOTAL IMAGES--> 7
CLUSTER--> 47 TOTAL IMAGES--> 6
CLUSTER--> 48 TOTAL IMAGES--> 5
CLUSTER--> 49 TOTAL IMAGES--> 6
CLUSTER--> 50 TOTAL IMAGES--> 7
CLUSTER--> 51 TOTAL IMAGES--> 33
CLUSTER--> 52 TOTAL IMAGES--> 12
CLUSTER--> 53 TOTAL IMAGES--> 18
CLUSTER--> 54 TOTAL IMAGES--> 17
CLUSTER--> 55 TOTAL IMAGES--> 13
CLUSTER--> 56 TOTAL IMAGES--> 6
CLUSTER--> 57 TOTAL IMAGES--> 14
CLUSTER--> 58 TOTAL IMAGES--> 15
CLUSTER--> 59 TOTAL IMAGES--> 15
CLUSTER--> 60 TOTAL IMAGES--> 5
CLUSTER--> 61 TOTAL IMAGES--> 6
CLUSTER--> 62 TOTAL IMAGES--> 14
CLUSTER--> 63 TOTAL IMAGES--> 8
CLUSTER--> 64 TOTAL IMAGES--> 6
CLUSTER--> 65 TOTAL IMAGES--> 6
CLUSTER--> 66 TOTAL IMAGES--> 5
CLUSTER--> 67 TOTAL IMAGES--> 5
CLUSTER--> 68 TOTAL IMAGES--> 13
CLUSTER--> 69 TOTAL IMAGES--> 7
CLUSTER--> 70 TOTAL IMAGES--> 13
CLUSTER--> 71 TOTAL IMAGES--> 6
CLUSTER--> 72 TOTAL IMAGES--> 7
CLUSTER--> 73 TOTAL IMAGES--> 12
CLUSTER--> 74 TOTAL IMAGES--> 6
CLUSTER--> 75 TOTAL IMAGES--> 6
CLUSTER--> 76 TOTAL IMAGES--> 6
CLUSTER--> 77 TOTAL IMAGES--> 5
CLUSTER--> 78 TOTAL IMAGES--> 6
CLUSTER--> 79 TOTAL IMAGES--> 19
CLUSTER--> 80 TOTAL IMAGES--> 6
CLUSTER--> 81 TOTAL IMAGES--> 7
CLUSTER--> 82 TOTAL IMAGES--> 5
CLUSTER--> 83 TOTAL IMAGES--> 5
CLUSTER--> 84 TOTAL IMAGES--> 6
CLUSTER--> 85 TOTAL IMAGES--> 6
CLUSTER--> 86 TOTAL IMAGES--> 5
CLUSTER--> 87 TOTAL IMAGES--> 6
CLUSTER--> 88 TOTAL IMAGES--> 9
CLUSTER--> 89 TOTAL IMAGES--> 7
CLUSTER--> 90 TOTAL IMAGES--> 13
CLUSTER--> 91 TOTAL IMAGES--> 7
CLUSTER--> 92 TOTAL IMAGES--> 5
CLUSTER--> 93 TOTAL IMAGES--> 5
CLUSTER--> 94 TOTAL IMAGES--> 5
CLUSTER--> 95 TOTAL IMAGES--> 8
CLUSTER--> 96 TOTAL IMAGES--> 5
CLUSTER--> 97 TOTAL IMAGES--> 9
CLUSTER--> 98 TOTAL IMAGES--> 9
CLUSTER--> 99 TOTAL IMAGES--> 8
CLUSTER--> 100 TOTAL IMAGES--> 7
CLUSTER--> 101 TOTAL IMAGES--> 7
CLUSTER--> 102 TOTAL IMAGES--> 10
CLUSTER--> 103 TOTAL IMAGES--> 5
CLUSTER--> 104 TOTAL IMAGES--> 6
CLUSTER--> 105 TOTAL IMAGES--> 8
CLUSTER--> 106 TOTAL IMAGES--> 5
CLUSTER--> 107 TOTAL IMAGES--> 7
CLUSTER--> 108 TOTAL IMAGES--> 5
CLUSTER--> 109 TOTAL IMAGES--> 8
CLUSTER--> 110 TOTAL IMAGES--> 13
CLUSTER--> 111 TOTAL IMAGES--> 6
CLUSTER--> 112 TOTAL IMAGES--> 9
CLUSTER--> 113 TOTAL IMAGES--> 3
CLUSTER--> 114 TOTAL IMAGES--> 8