Loading all the necessary libraries

In [12]:
from imutils import paths
import face_recognition
import argparse
import pickle
import cv2
import os

import numpy as np

from matplotlib import pyplot as plt
import matplotlib.pyplot as mpld3
import pandas as pd
from collections import Counter
from sklearn import metrics
from pylab import *
from sklearn.cluster import DBSCAN
from sklearn.model_selection import GridSearchCV
from kneed import DataGenerator, KneeLocator
import sklearn
import hdbscan
from scipy.spatial import distance
from scipy.cluster import hierarchy
from s_dbw import S_Dbw
%matplotlib inline
In [35]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 9999
// (presumably so long cell outputs are not collapsed into a scroll box).
IPython.OutputArea.auto_scroll_threshold = 9999;

Loading the filepaths for a dataset

In [3]:
def loadFilePaths(dirname):
    """Return the path of every entry directly inside ``dirname``.

    Parameters
    ----------
    dirname : str
        Directory to list.

    Returns
    -------
    list of str
        ``os.path.join(dirname, name)`` for each entry, ordered by entry name.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and any positional indexing done on data derived from it) reproducible.
    return [os.path.join(dirname, name) for name in sorted(os.listdir(dirname))]
    
    
In [4]:
# Paths of all entries inside the 'ideology_image_dataset' directory.
ideology_files_path=loadFilePaths('ideology_image_dataset')

Using the face_recognition module to extract the face embeddings, which are then saved to a pickle file

In [8]:
# Detect the faces in every dataset image and record, per face, the source
# image path, the bounding box and the encoding.  `data` ends up with exactly
# one list per input path (empty when no face was found or the file could not
# be read), so data[i] always corresponds to files_path[i].
files_path=ideology_files_path
data=[]
for (i, imagePath) in enumerate(files_path):
    print(" Status: %s / %s" %(i, len(files_path)), end="\r")
    image = cv2.imread(imagePath)
    if image is None:
        # cv2.imread signals an unreadable / non-image file by returning None
        # rather than raising; skip it instead of crashing the whole run.
        print("Skipping unreadable image: %s" % imagePath)
        data.append([])
        continue
    # OpenCV loads images as BGR; convert to RGB before face detection.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    boxes = face_recognition.face_locations(rgb)
    encodings = face_recognition.face_encodings(rgb, boxes)

    d=[{"image-path":imagePath,"face-location":box,"face-encodings":enc} for (box,enc) in zip(boxes,encodings)]
    data.append(d)

Saving the face encodings in the pickle file

In [9]:
# Persist the per-image face records.  The context manager guarantees the
# file is closed even if serialisation fails part-way through.
with open("ideology_data_face_Embeddings.pickle", "wb") as f:
    pickle.dump(data, f)

Loading the encodings from pickle file

In [13]:
# Read the per-image face records back from disk.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('ideology_data_face_Embeddings.pickle', mode='rb') as embeddings_file:
    ideology_face_data = pickle.load(embeddings_file)
In [33]:
# Flatten the per-image lists into a single list holding one record per face.
ideology_face_data_updated = [
    face_record
    for image_faces in ideology_face_data
    for face_record in image_faces
]
# Array form of the same records; showClustering reads this module-level name.
train_data = np.array(ideology_face_data_updated)

Extracting the encodings and preparing the data for clustering

In [15]:
def getEncodings(data):
    """Collect the ``"face-encodings"`` value of every record in ``data``.

    Parameters
    ----------
    data : sequence of dict
        Records that each carry a ``"face-encodings"`` key.

    Returns
    -------
    list
        The ``"face-encodings"`` values, in the same order as ``data``.
    """
    records = np.array(data)
    collected = []
    for record in records:
        collected.append(record["face-encodings"])
    return collected
In [17]:
# One encoding per face record; the displayed length is the number of face records.
ideology_encodings=getEncodings(ideology_face_data_updated)
len(ideology_encodings)
Out[17]:
3532

Visualizing some detected faces from an image

In [18]:
def showImage(file_path,coordinates):
    """Display an image with a single green rectangle drawn on it.

    Parameters
    ----------
    file_path : str
        Path of the image to read.
    coordinates : sequence
        Box edges in the order ``(top, right, bottom, left)``.

    Raises
    ------
    FileNotFoundError
        If the image cannot be read from ``file_path``.
    """
    top, right, bottom, left = coordinates[:4]
    image = cv2.imread(file_path)
    if image is None:
        # cv2.imread returns None instead of raising; fail with a clear message
        # rather than with an opaque error inside cvtColor.
        raise FileNotFoundError("Could not read image: %s" % file_path)
    # OpenCV loads BGR; convert so matplotlib shows the right colours.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    cv2.rectangle(image, (left, top), (right, bottom), (0, 255, 0), 2)
    plt.imshow(image)
    plt.axis("off")
    plt.show()
    
In [19]:
# Show up to three faces detected in one sample image.  The records are found
# by file name instead of a hard-coded list position, because the position of
# an image in ideology_face_data follows the directory listing order, and the
# box is drawn on the very path stored with it so the two always match.
sample_image_name = 'clip_2805.jpg'
sample_faces = [
    face
    for faces_in_image in ideology_face_data
    for face in faces_in_image
    if os.path.basename(face['image-path']) == sample_image_name
]
for face in sample_faces[:3]:
    showImage(face['image-path'], face['face-location'])

Implementing the clustering models

In [21]:
def cv_silhouette_scorer(estimator, X):
    """Score a clustering estimator by the silhouette of its labelling of ``X``.

    The estimator is fitted on ``X``; labels are taken from ``labels_`` when
    the estimator exposes it and from ``predict(X)`` otherwise.

    Parameters
    ----------
    estimator : object
        Clustering estimator with ``fit`` and either ``labels_`` or ``predict``.
    X : sequence
        Samples to cluster.

    Returns
    -------
    float or int
        ``-1`` when the labelling has two or fewer distinct values or gives
        every sample its own label; otherwise ``metrics.silhouette_score``.
        Every distinct value counts as a label, including any ``-1``.
    """
    estimator.fit(X)
    if hasattr(estimator, "labels_"):
        cluster_labels = estimator.labels_
    else:
        cluster_labels = estimator.predict(X)
    num_labels = len(set(cluster_labels))
    # "<= 2" already covers the single-label case.
    if num_labels <= 2 or num_labels == len(X):
        return -1
    return metrics.silhouette_score(X, cluster_labels)
In [22]:
def evaluation_Score(features,y_pred,output_df,model):
    """Write clustering-quality scores for ``model`` into ``output_df``.

    Parameters
    ----------
    features : sequence
        Samples that were clustered.
    y_pred : sequence
        Cluster label of each sample.
    output_df : pandas.DataFrame
        Table updated in place; row ``model`` receives the scores.
    model : hashable
        Row label to write to.

    Returns
    -------
    pandas.DataFrame
        The same ``output_df`` object.

    Notes
    -----
    When there is one distinct label, or one label per sample, the
    'silhouette', 'calinski' and 'davies' cells are set to -1 and 'S_Dbw' is
    left untouched.  Any exception is printed and swallowed, so the row may
    be only partially filled.
    """
    try:
        distinct_count = len(set(y_pred))
        sample_count = len(y_pred)
        print("labels", distinct_count)

        degenerate = distinct_count == 1 or distinct_count == sample_count
        if degenerate:
            for column in ('silhouette', 'calinski', 'davies'):
                output_df.loc[model, column] = -1
        else:
            # Scorers are looked up one at a time so an earlier score is
            # already stored if a later one fails.
            scorer_names = (
                ('silhouette', 'silhouette_score'),
                ('calinski', 'calinski_harabasz_score'),
                ('davies', 'davies_bouldin_score'),
            )
            for column, scorer_name in scorer_names:
                output_df.loc[model, column] = getattr(metrics, scorer_name)(features, y_pred)

            features = np.array(features)
            score = S_Dbw(
                features,
                y_pred,
                centers_id=None,
                method='Tong',
                alg_noise='bind',
                centr='mean',
                nearest_centr=True,
                metric='euclidean',
            )
            print("Score", score)
            output_df.loc[model, 'S_Dbw'] = score
    except Exception as err:
        print(err)

    return output_df
       
In [23]:
def getEpsilon(train_data):
    """Propose candidate ``eps`` values from the nearest-neighbour distance curve.

    The sorted second-column neighbour distances form a curve; ``KneeLocator``
    is run on it with sensitivities ``S = 10, 35, 60, 85, 110`` and the first
    elbow's y-value of each run is kept as a candidate.  The fixed values
    0.6, 0.5 and 0.8 are always appended at the end.

    Parameters
    ----------
    train_data : array-like
        Samples passed to ``NearestNeighbors``.

    Returns
    -------
    list of float
        Candidate epsilon values (also printed).
    """
    neigh = sklearn.neighbors.NearestNeighbors(n_neighbors=2)
    nbrs = neigh.fit(train_data)
    distances, _ = nbrs.kneighbors(train_data)
    # Column 1 is taken after sorting each column; presumably column 0 is the
    # zero distance of every point to itself -- TODO confirm.
    distances = np.sort(distances, axis=0)[:, 1]
    x = list(np.arange(0, len(distances)))

    epsilons = []
    for s in range(10, 120, 25):
        try:
            kneedle = KneeLocator(x, distances, S=s, curve='convex', direction='increasing')
            epsilon = kneedle.all_elbows_y[0]
        except Exception as e:
            # Best-effort fallback when no elbow is found for this sensitivity.
            print(e)
            if epsilons:
                epsilons.append(epsilons[-1] + s / 10)
            else:
                epsilons.append(s / 10)
            continue

        # Skip near-duplicates of the previous candidate.  The absolute
        # difference is required: without it every candidate larger than
        # the previous one was wrongly discarded as a "duplicate".
        if epsilons and abs(epsilons[-1] - epsilon) <= 0.001:
            continue
        epsilons.append(epsilon)

    epsilons.extend([0.6, 0.5, 0.8])

    print(epsilons)

    return epsilons
In [24]:
def runGridSearch(estimator,params_dict,train_data):
    """Grid-search ``estimator`` over ``params_dict`` and return the best labelling.

    Parameters
    ----------
    estimator : object
        Clustering estimator handed to ``GridSearchCV``.
    params_dict : dict
        Parameter grid.
    train_data : array-like
        Samples to cluster.

    Returns
    -------
    array-like
        ``labels_`` of the best estimator when it has that attribute,
        otherwise ``gs.predict(train_data)``.
    """
    # A single "split" whose train and test parts are both the whole data set.
    cv = [(slice(None), slice(None))]
    gs = GridSearchCV(estimator=estimator, param_grid=params_dict, scoring=cv_silhouette_scorer, cv=cv, n_jobs=-1)
    gs.fit(train_data)
    try:
        predicted_labels = gs.best_estimator_.labels_
    except AttributeError:
        # Only a missing attribute triggers the fallback; a bare except would
        # also swallow KeyboardInterrupt and real errors.
        predicted_labels = gs.predict(train_data)

    return predicted_labels

Running for DBSCAN with grid search

In [27]:
# Grid-search DBSCAN over eps and distance metric, then record the quality
# scores and the number of clusters of the best configuration.
# (getEpsilon(ideology_encodings) can supply data-driven eps candidates
# instead of the fixed list below.)
output_df = pd.DataFrame(
    index=['DBSCAN-GridSearch'],
    columns=['n_clusters','silhouette','calinski','davies','S_Dbw'],
)
params_dict = {'eps':[0.3,0.4,0.5,0.6,0.7],'min_samples':[5],'metric':['euclidean','manhattan','mahalanobis', 'minkowski']}
predicted_labels2=runGridSearch(sklearn.cluster.DBSCAN(),params_dict,ideology_encodings)
evaluation_Score(ideology_encodings,predicted_labels2,output_df,'DBSCAN-GridSearch')
# The label -1 (DBSCAN's noise label) is not counted as a cluster.
distinct_labels = set(predicted_labels2)
n_clustersLen = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
output_df.loc['DBSCAN-GridSearch','n_clusters']=n_clustersLen
output_df
labels 116
Score 0.6942229654459516
Out[27]:
n_clusters silhouette calinski davies S_Dbw
DBSCAN-GridSearch 115 0.0246545 15.8596 1.52634 0.694223

Saving the corresponding labels

In [28]:
# Store the grid-search DBSCAN labels so they can be reloaded without re-running the search.
np.save('bio-metric-predicted-dbscanwithGrid-labels.npy',np.array(predicted_labels2))

Visualizing the results

In [30]:
def showClustering(predicted_labels,label,face_data=None):
    """Plot the cropped face of every sample assigned to one cluster.

    Parameters
    ----------
    predicted_labels : array-like
        Cluster label per face record, aligned with ``face_data``.
    label : int
        Cluster to display.
    face_data : sequence of dict, optional
        Face records with ``"image-path"`` and ``"face-location"`` keys.
        Defaults to the module-level ``train_data``.

    Returns
    -------
    None
        Prints the cluster size and shows a 4-column grid of face crops.
    """
    if face_data is None:
        face_data = train_data

    # np.asarray makes the comparison element-wise even for plain lists.
    member_indices = np.where(np.asarray(predicted_labels) == label)[0]
    member_count = len(member_indices)
    print("CLUSTER--> ",label,"TOTAL IMAGES--> ",member_count)

    # Figure height grows with the cluster size.  The thresholds are
    # contiguous, so every size gets a figure (previously a cluster of
    # exactly 100 images matched no branch and `fig` was left undefined).
    if member_count >= 500:
        height = 300
    elif member_count >= 100:
        height = 70
    elif member_count >= 50:
        height = 30
    elif member_count >= 20:
        height = 20
    else:
        height = 8
    fig = plt.figure(figsize=(10, height))

    columns = 4
    # Integer ceiling division: add_subplot expects integer grid dimensions.
    rows = -(-member_count // columns)

    for position, index in enumerate(member_indices, start=1):
        record = face_data[index]
        image = cv2.imread(record["image-path"])
        if image is None:
            # cv2.imread returns None for unreadable files; leave the slot empty.
            print("Skipping unreadable image: %s" % record["image-path"])
            continue
        (top, right, bottom, left) = record["face-location"]
        face = cv2.cvtColor(image[top:bottom, left:right], cv2.COLOR_BGR2RGB)

        fig.add_subplot(rows, columns, position)
        plt.imshow(face)

    plt.show()
    
    
In [36]:
# NOTE(review): 'bio-metric-predicted-dbscanwithoutGrid-labels.npy' is not
# written by any visible cell of this notebook, and the loaded array is not
# used in this cell -- confirm the file exists before running.
predicted_labels_withoutGrid=np.load('bio-metric-predicted-dbscanwithoutGrid-labels.npy')
predicted_labels_withGrid=np.load('bio-metric-predicted-dbscanwithGrid-labels.npy')
unique_labels=set(predicted_labels_withGrid)
# Show every cluster except label -1, in ascending label order so the output
# is deterministic.
for label in sorted(unique_labels):
    if(label!=-1):
        showClustering(predicted_labels_withGrid,label)
CLUSTER-->  0 TOTAL IMAGES-->  27
CLUSTER-->  1 TOTAL IMAGES-->  26
CLUSTER-->  2 TOTAL IMAGES-->  9
CLUSTER-->  3 TOTAL IMAGES-->  19
CLUSTER-->  4 TOTAL IMAGES-->  13
CLUSTER-->  5 TOTAL IMAGES-->  67
CLUSTER-->  6 TOTAL IMAGES-->  12
CLUSTER-->  7 TOTAL IMAGES-->  9
CLUSTER-->  8 TOTAL IMAGES-->  12
CLUSTER-->  9 TOTAL IMAGES-->  14
CLUSTER-->  10 TOTAL IMAGES-->  135
CLUSTER-->  11 TOTAL IMAGES-->  7
CLUSTER-->  12 TOTAL IMAGES-->  5
CLUSTER-->  13 TOTAL IMAGES-->  16
CLUSTER-->  14 TOTAL IMAGES-->  13
CLUSTER-->  15 TOTAL IMAGES-->  7
CLUSTER-->  16 TOTAL IMAGES-->  25
CLUSTER-->  17 TOTAL IMAGES-->  7
CLUSTER-->  18 TOTAL IMAGES-->  6
CLUSTER-->  19 TOTAL IMAGES-->  10
CLUSTER-->  20 TOTAL IMAGES-->  7
CLUSTER-->  21 TOTAL IMAGES-->  19
CLUSTER-->  22 TOTAL IMAGES-->  71
CLUSTER-->  23 TOTAL IMAGES-->  34
CLUSTER-->  24 TOTAL IMAGES-->  20
CLUSTER-->  25 TOTAL IMAGES-->  10
CLUSTER-->  26 TOTAL IMAGES-->  31
CLUSTER-->  27 TOTAL IMAGES-->  17
CLUSTER-->  28 TOTAL IMAGES-->  23
CLUSTER-->  29 TOTAL IMAGES-->  13
CLUSTER-->  30 TOTAL IMAGES-->  7
CLUSTER-->  31 TOTAL IMAGES-->  17
CLUSTER-->  32 TOTAL IMAGES-->  6
CLUSTER-->  33 TOTAL IMAGES-->  18
CLUSTER-->  34 TOTAL IMAGES-->  5
CLUSTER-->  35 TOTAL IMAGES-->  9
CLUSTER-->  36 TOTAL IMAGES-->  12
CLUSTER-->  37 TOTAL IMAGES-->  12
CLUSTER-->  38 TOTAL IMAGES-->  15
CLUSTER-->  39 TOTAL IMAGES-->  16
CLUSTER-->  40 TOTAL IMAGES-->  10
CLUSTER-->  41 TOTAL IMAGES-->  11
CLUSTER-->  42 TOTAL IMAGES-->  11
CLUSTER-->  43 TOTAL IMAGES-->  5
CLUSTER-->  44 TOTAL IMAGES-->  82
CLUSTER-->  45 TOTAL IMAGES-->  7
CLUSTER-->  46 TOTAL IMAGES-->  7
CLUSTER-->  47 TOTAL IMAGES-->  6
CLUSTER-->  48 TOTAL IMAGES-->  5
CLUSTER-->  49 TOTAL IMAGES-->  6
CLUSTER-->  50 TOTAL IMAGES-->  7
CLUSTER-->  51 TOTAL IMAGES-->  33
CLUSTER-->  52 TOTAL IMAGES-->  12
CLUSTER-->  53 TOTAL IMAGES-->  18
CLUSTER-->  54 TOTAL IMAGES-->  17
CLUSTER-->  55 TOTAL IMAGES-->  13
CLUSTER-->  56 TOTAL IMAGES-->  6
CLUSTER-->  57 TOTAL IMAGES-->  14
CLUSTER-->  58 TOTAL IMAGES-->  15
CLUSTER-->  59 TOTAL IMAGES-->  15
CLUSTER-->  60 TOTAL IMAGES-->  5
CLUSTER-->  61 TOTAL IMAGES-->  6
CLUSTER-->  62 TOTAL IMAGES-->  14
CLUSTER-->  63 TOTAL IMAGES-->  8
CLUSTER-->  64 TOTAL IMAGES-->  6
CLUSTER-->  65 TOTAL IMAGES-->  6
CLUSTER-->  66 TOTAL IMAGES-->  5
CLUSTER-->  67 TOTAL IMAGES-->  5
CLUSTER-->  68 TOTAL IMAGES-->  13
CLUSTER-->  69 TOTAL IMAGES-->  7
CLUSTER-->  70 TOTAL IMAGES-->  13
CLUSTER-->  71 TOTAL IMAGES-->  6
CLUSTER-->  72 TOTAL IMAGES-->  7
CLUSTER-->  73 TOTAL IMAGES-->  12
CLUSTER-->  74 TOTAL IMAGES-->  6
CLUSTER-->  75 TOTAL IMAGES-->  6
CLUSTER-->  76 TOTAL IMAGES-->  6
CLUSTER-->  77 TOTAL IMAGES-->  5
CLUSTER-->  78 TOTAL IMAGES-->  6
CLUSTER-->  79 TOTAL IMAGES-->  19
CLUSTER-->  80 TOTAL IMAGES-->  6
CLUSTER-->  81 TOTAL IMAGES-->  7
CLUSTER-->  82 TOTAL IMAGES-->  5
CLUSTER-->  83 TOTAL IMAGES-->  5
CLUSTER-->  84 TOTAL IMAGES-->  6
CLUSTER-->  85 TOTAL IMAGES-->  6
CLUSTER-->  86 TOTAL IMAGES-->  5
CLUSTER-->  87 TOTAL IMAGES-->  6
CLUSTER-->  88 TOTAL IMAGES-->  9
CLUSTER-->  89 TOTAL IMAGES-->  7
CLUSTER-->  90 TOTAL IMAGES-->  13
CLUSTER-->  91 TOTAL IMAGES-->  7
CLUSTER-->  92 TOTAL IMAGES-->  5
CLUSTER-->  93 TOTAL IMAGES-->  5
CLUSTER-->  94 TOTAL IMAGES-->  5
CLUSTER-->  95 TOTAL IMAGES-->  8
CLUSTER-->  96 TOTAL IMAGES-->  5
CLUSTER-->  97 TOTAL IMAGES-->  9
CLUSTER-->  98 TOTAL IMAGES-->  9
CLUSTER-->  99 TOTAL IMAGES-->  8
CLUSTER-->  100 TOTAL IMAGES-->  7
CLUSTER-->  101 TOTAL IMAGES-->  7
CLUSTER-->  102 TOTAL IMAGES-->  10
CLUSTER-->  103 TOTAL IMAGES-->  5
CLUSTER-->  104 TOTAL IMAGES-->  6
CLUSTER-->  105 TOTAL IMAGES-->  8
CLUSTER-->  106 TOTAL IMAGES-->  5
CLUSTER-->  107 TOTAL IMAGES-->  7
CLUSTER-->  108 TOTAL IMAGES-->  5
CLUSTER-->  109 TOTAL IMAGES-->  8
CLUSTER-->  110 TOTAL IMAGES-->  13
CLUSTER-->  111 TOTAL IMAGES-->  6
CLUSTER-->  112 TOTAL IMAGES-->  9
CLUSTER-->  113 TOTAL IMAGES-->  3
CLUSTER-->  114 TOTAL IMAGES-->  8