Source code for unsupervised

## MyDataPy
## Unsupervised methods shelf
## Simon Lebastard - Nov 2018

## REQUIREMENTS ##################################

### External requirements ########################
import importlib

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.spatial import distance_matrix
from scipy.spatial.distance import cdist
from scipy.stats import multivariate_normal
from matplotlib.patches import Ellipse
import math

import time
from cvxopt import matrix
from cvxopt import solvers
solvers.options['show_progress'] = False

## Importing self-made fcts
from metrics import *
from kernels import *
from kernelCore import *

## Debugging
import pdb


### Internal requirements ########################
importlib.import_module('dataTools')

#################################################

## K-MEANS ######################################

[docs]class Kmeans:
  """
  Standard K-means method

  Attributes
  ----------
  ind : int
    instance ID

  data: np.array
    the data bound to this instance.

  N: int
    number of records

  d: int
    dimension

  K: int
    number of clusters

  centroids: np.array
    current centroids. Only initialized at run

  assignment: np.array
    maps every data point to a cluster ID

  ToDo
  ----
  1, Low: Generalize this class to a kernel. Have the kernel bound to the instance from the initialization
  
  2, Low: Allow for the data to be reset
  """
  def __init__(self, data, nClass, ind=0, init='def'):
    # Create ID for instance + load data
    self.ind = ind
    self.data = data
    self.frame = np.zeros((2,2))
    self.frame[:,0] = self.data.min()
    self.frame[:,1] = self.data.max()
    
    # Check if data is non-empty, store data size
    [self.N,self.d] = np.shape(self.data)
    self.K = nClass
    
    # Initiate centroids
    self.centroids = np.zeros((self.K,self.d))
    initCentroids = np.zeros(self.K)
    if init == 'stoch':
      # Kmeans++ - computing probs for choosing initial centroids
      initCentroids[0] = np.random.randint(0,self.N-1)
      self.centroids[0,:] = self.data[np.array(initCentroids[0]).astype(int)]
      for index in range(self.K-1):
        distances = distance_matrix(self.data,self.centroids[:index+1,:])
        probs = np.amin(distances*distances,axis=1)
        probs[np.array(initCentroids).astype(int)] = 0
        Reg = np.sum(probs,axis=0)
        probs = probs/Reg
        initCentroids[index+1] = np.random.choice(self.N,1,p=probs)
        self.centroids[index+1,:] = self.data[np.array(initCentroids[index+1]).astype(int)]
      
    else:
      # Standard Kmeans - choose random data points as initial centroids
      initCentroids = np.random.choice(self.N, self.K, replace=False)
      self.centroids = self.data[np.array(initCentroids)]
    
    # Initiate assignment vector
    self.assignment = np.zeros(self.N)
  

  def assignCentroids(self):
    oldAssignment = self.assignment
    distances = distance_matrix(self.data,self.centroids)
    self.assignment = np.argmin(distances,axis=1)
    changeFlag = np.array_equal(oldAssignment,self.assignment)
    return changeFlag
  
    
  def computeCentroids(self):
    # Create indicator matrix - gives cluster belonging for each data point
    W = (self.assignment[:,None]==range(self.K)).astype(int)
    
    # Compute centroid
    for k in range(self.K):
      self.centroids[k,:] = np.average(self.data,axis=0,weights=W[:,k].reshape(self.N))

      
[docs]  def run(self):
    """
    Computes centroids position and cluster assignments until convergence
    """
    stop = False
    itCount = 0
    while not stop:
      stop = self.assignCentroids()
      self.computeCentroids()
      itCount += 1
    print('Convergence in {0:2d} iterations \n \n'.format(itCount))
 


[docs]  def draw(self,size=40,scale=0.8):
    """
    Draws the current clustering in a new figure

    Parameters
    ----------
    size: int
      marker size. Defaults to 40

    scale: float
      needs rework. Do not use right now

    ToDo
    ----
    Change scale integration
    """
    listMarkers = ["s","d","*","+","h","x",'h','H','p','P','X']
    listColors = ['xkcd:purple','xkcd:green','xkcd:blue','xkcd:pink',
                  'xkcd:brown','xkcd:red','xkcd:light blue','xkcd:teal'
                  ,'xkcd:orange','xkcd:light green','xkcd:magenta',
                  'xkcd:yellow','xkcd:sky blue','xkcd:grey','xkcd:lime green',
                  'xkcd:light purple','xkcd:violet','xkcd:dark green',
                  'xkcd:turquoise','xkcd:lavender','xkcd:dark blue','xkcd:tan',
                  'xkcd:cyan','xkcd:aqua']
    plt.figure(self.ind, figsize=(15,15))
    for k in range(self.K):
      mark = np.random.choice(listMarkers,1)[0]
      color = np.random.choice(listColors,1)[0]
      plt.scatter(self.data[self.assignment==k][:,0], self.data[self.assignment==k][:,1], marker=mark, c=color, s=size, alpha=0.5)
      plt.scatter(self.centroids[k,0],self.centroids[k,1], marker=mark, c=color, s=3*size, alpha=1)
      axes = plt.gca()
      axes.set_xlim([scale*self.frame[0,0],(2-scale)*self.frame[0,1]])
      axes.set_ylim([scale*self.frame[1,0],(2-scale)*self.frame[1,1]])

    
    
[docs]  def stats(self):
    """
    Computes intra-cluster distortion, inter-cluster distortion, quality estimation for clustering
    """

    W = (self.assignment[:,None]==range(self.K)).astype(int)
    distrt = np.zeros(self.K)
    relDistrt = np.zeros(self.K)
    effectives = np.zeros(self.K)

    interDist = distance_matrix(self.centroids,self.centroids)
    interDist[interDist==0] = np.nan
    globDistrt = np.nanmean(interDist*interDist)

    # Print general statistics
    print('Average inter-cluster distortion: {0:.2f} \n \n'.format(np.sqrt(globDistrt)))
    
    # Print statistics per cluster
    for k in range(self.K):
      effectives[k] = np.shape(self.data[W[:,k]==1])[0]
      distrt[k] = np.mean(np.power(np.linalg.norm(self.data[W[:,k]==1] - self.centroids[k,:],axis=1),2),axis=0)
      relDistrt[k] = distrt[k]/globDistrt
      
      effectives = effectives.astype(int)
      print('Cluster {0:3d}: {1:3d} entities'.format(k,effectives[k]))
      print('intra-cluster avg distortion {0:.2f}. Relative distortion {1:.2f}% \n'.format(np.sqrt(distrt[k]),100*np.sqrt(relDistrt[k])))
    
    print('Average relative distortion: {0:.2f}%'.format(np.average(100*np.sqrt(relDistrt),weights=effectives)))
    
    return [np.average(100*np.sqrt(relDistrt),weights=effectives),distrt,relDistrt]


################################################

## GAUSSIAN MIXTURE ###########################

[docs]class GaussianMixture:
  """
  Instance for fitting gaussian mixtures to a dataset

  Attributes
  ----------
  K : int
    number of clusters

  d : int
    dimension

  isotropic : boolean
    true forces clusters to be spherical. Defaults to true

  pi : np.array, Kx1
    current estimate of the class variable probabilities. Initialized at train()

  mu : np.array, Kxd
    current estimate of clusters first order momentum. Initialized at train()

  sigma : np.array, KxK
    current estimate of covaraince matrix. Initialized at train()

  n : int
    number of data points for linked dataset
  """
  def __init__(self, K, d, pi, mu, sigma, isotropic = True):
    self.K, self.d = K, d
    self.isotropic = isotropic
    self.pi, self.mu, self.sigma = pi, mu, sigma
  
[docs]  def train(self, X, eps = 1e-4, max_iter = 10000, verbose = True):
    """
    EM algorithm for estimating gaussian mixture parameters

    Parameters
    ----------
    X : np.array
      nxd dataset

    eps : float
      stop threshold on change in log-likelihood

    max_iter : int
      maximum number of iterations

    verbose : boolean
      defaults to True
    """
    log_likelihoods = []
    n, d = X.shape
    print("BEGIN EM ALGORITHM")
    
    while(len(log_likelihoods) < max_iter):
      #Expectation Step
      if self.isotropic:
        P = []
        for k in range(self.K):
          P.append(self.pi[k]*multivariate_normal.pdf(X, self.mu[k], sigma[k]).reshape(n, 1))
        P = np.hstack(P) #pi*Normal
      else:
        P = np.hstack([self.pi[k]*multivariate_normal.pdf(X, self.mu[k], self.sigma[k]).reshape(n, 1) for k in range(self.K)]) #pi*Normal
      Q = P/P.sum(axis=1)[:, np.newaxis]

      #Maximization step
      self.pi = Q.sum(axis=0)/Q.sum()
      self.mu = [X.T.dot(Q[:,k])/sum(Q[:, k]) for k in range(self.K)]
      if self.isotropic:
        for k in range(self.K):
          temp = (X - self.mu[k]).dot((X - self.mu[k]).T)
          Q_k = np.tile(Q[:, k], (n, 1)).T
          self.sigma[k] = np.trace(temp*Q_k)/(d*sum(Q[:, k]))*np.eye(d)
          
      else:
        self.sigma = [((X-self.mu[k]).T*Q[:,k]).dot(X-self.mu[k])/sum(Q[:, k]) for k in range(self.K)]

      #Log likelihood computation for exit condition
      log_likelihood = np.mean(np.log(np.sum(P, axis=1)))
      log_likelihoods.append(log_likelihood)
      if len(log_likelihoods) < 2:
        continue
      if np.abs(log_likelihoods[-2] - log_likelihood) < eps:
        print("Convergence Threshold reached")
        print("Last value of Average (Partial) Log_likelihood %0.2f" % log_likelihood)
        break
      
      
    if verbose:
      self.printResults(log_likelihoods)
      
      
  
[docs]  def predict(self, X):
    """
    Predicts cluster assignments from a dataset

    Parameters
    ----------
    X : np.array
      nxd input dataset

    Returns
    -------
    predictions : np.array
      cluster assignment, one per data point
    """
    n = X.shape[0]
    
    P = np.hstack([self.pi[k]*multivariate_normal.pdf(X, self.mu[k], self.sigma[k]).reshape(n, 1) for k in range(0, self.K)])
    predictions = np.argmax(P, axis = 1)
    
    return predictions
  
  
  
[docs]  def printResults(self, log_likelihoods):
    """
    Print the learnt parameters after training and the evolution of the partial
    log likelihood through time

    ToDo
    ----
    This does not fit well into the package philosophy. To be refactorized
    """
    plt.plot(log_likelihoods)
    plt.ylabel('Average (Partial) Log Likelihood')
    plt.xlabel('Iteration')
    plt.title('Evolution of Average (Partial) Log Likelihood')
    plt.show()
    
    print("MU")
    for mu in self.mu: print(mu)
    print("\n")
    
    print("SIGMA - ISOTROPY = ", self.isotropic)
    for sigma in self.sigma: print(sigma)
    print("\n")
    
    print("PI")
    for pi in self.pi: print(pi)

      
      
[docs]  def print_log_likelihood(self, X):
    """
    Print the average value of (partial) log_likelihood

    ToDo
    ----
    This does not fit well into the package philosophy. To be refactorized
    """
    n = X.shape[0]
    
    if self.isotropic:
      P = []
      for k in range(self.K):
        P.append(self.pi[k]*multivariate_normal.pdf(X, self.mu[k], sigma[k]).reshape(n, 1))
      P = np.hstack(P)
    else:
      P = np.hstack([self.pi[k]*multivariate_normal.pdf(X, self.mu[k], self.sigma[k]).reshape(n, 1) for k in range(self.K)])
    
    print(np.mean(np.log(np.sum(P, axis=1))))
      
      
[docs]  def draw(self, data, predictions, size=40, scale=0.8, eps=0.1):
    """
    Prints data points, centroids and alineates the covariances matrices

    Parameters
    ----------
    data : np.array
      input dataset. This is meant to be the dataset bound to this instance

    predictions : np.array
      array of cluster assignments

    size : int
      marker size

    scale : float
      to be refactorized. Do not use at the moment

    eps : float

    ToDo
    ----
    1, high : Refactorize scale parameters  
    """
    listMarkers = ["s","d","*","+","h","x",'h','H','p','P','X']
    listColors = ['xkcd:purple','xkcd:green','xkcd:blue','xkcd:pink',
                  'xkcd:brown','xkcd:red','xkcd:light blue','xkcd:teal'
                  ,'xkcd:orange','xkcd:light green','xkcd:magenta',
                  'xkcd:yellow','xkcd:sky blue','xkcd:grey','xkcd:lime green',
                  'xkcd:light purple','xkcd:violet','xkcd:dark green',
                  'xkcd:turquoise','xkcd:lavender','xkcd:dark blue','xkcd:tan',
                  'xkcd:cyan','xkcd:aqua']
    fig, ax = plt.subplots(figsize=(15,15))
    frame = np.zeros((2,2))
    frame[:,0] = data.min()
    frame[:,1] = data.max()
    for k in range(self.K):
      # Pick style
      mark = np.random.choice(listMarkers,1)[0]
      color = np.random.choice(listColors,1)[0]
      # Data points
      ax.scatter(data[predictions==k][:,0], data[predictions==k][:,1], marker=mark, c=color, s=size, alpha=0.5)
      # Centroid
      ax.scatter(self.mu[k][0],self.mu[k][1], marker=mark, c=color, s=3*size, alpha=1)
      # Covariance matrix delineation
      S = np.zeros((self.d,self.d))
      if self.isotropic:
        S = self.sigma[k]*np.identity(self.d)
      else:
        S = self.sigma[k]
      [E,V] = np.linalg.eig(S)
      #ang = np.angle(V[0,0]+V[0,1]*1j, deg=True)
      ang = np.degrees(np.arctan2(*V[:,0][::-1]))
      ell = Ellipse(self.mu[k],E[0]*np.sqrt(2*np.log(1/eps)),E[1]*np.sqrt(2*np.log(1/eps)),angle=ang,facecolor='none')
      ax.add_artist(ell)
      ell.set_clip_box(ax.bbox)
      ell.set_alpha(0.2)
      ell.set_facecolor(color)
    # Scaling
    axes = fig.gca()
    axes.set_xlim([scale*frame[0,0],(2-scale)*frame[0,1]])
    axes.set_ylim([scale*frame[1,0],(2-scale)*frame[1,1]])


###################################################