Source code for evclust.bpec

# -*- coding: utf-8 -*-
# This file as well as the whole evclust package are licenced under the MIT licence (see the LICENCE.txt)
# Armel SOUBEIGA (armelsoubeiga.github.io), France, 2024

"""
This module contains the main function for Belief Peak Evidential Clustering (BPEC).

    Z.-G. Su and T. Denoeux. BPEC: Belief-Peaks Evidential Clustering. IEEE Transactions on Fuzzy Systems, 27(1):111-123, 2019.

"""

#---------------------- Packges------------------------------------------------
from evclust.utils import makeF, extractMass
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt




#---------------------- bpec------------------------------------------------
[docs] def bpec(x, g, type='full', pairs=None, Omega=True, alpha=1, beta=2, delta=10, epsi=1e-3, disp=True, m0=None): """ Belief Peak Evidential Clustering (BPEC) computes a credal partition from a matrix of attribute data. BPEC is identical to ECM, except that the prototypes are computed from delta-Bel graph using function delta_Bel. The ECM algorithm is then run keeping the prototypes fixed. The distance to the prototypes can be the Euclidean disatnce. Parameters: ------------ x (numpy.ndarray): Input matrix of size n x d, where n is the number of objects and d the number of attributes. g (numpy.ndarray, or None): Matrix of size c x d of prototypes (the belief peaks). type (str): Type of focal sets ("simple", "full", or "pairs"). Omega (bool): Whether to include the whole frame in focal sets (default: True). pairs (list or None): Pairs to include in focal sets, used only if type="pairs". alpha (float): Exponent of the cardinality in the cost function. beta (float): Exponent of masses in the cost function. delta (float): Distance to the empty set. epsi (float): Minimum amount of improvement. disp (bool): Whether to display intermediate results (default: True). m0 (numpy.ndarray or None): Initial credal partition. Should be a matrix with n rows and a number of columns equal to the number of focal sets specified by 'type' and 'pairs'. Returns: -------- The credal partition (an object of class "credpart"). Example: -------- .. highlight:: python .. code-block:: python # Clustering from evclust.datasets import load_fourclass from evclust.bpec import bpec, delta_Bel df = load_fourclass() x = df.iloc[:, 0:2] DB = delta_Bel(x,100,0.9) clus = bpec(x, DB['g0'], type='pairs', delta=3, distance=1) References: ------------ Z.-G. Su and T. Denoeux. BPEC: Belief-Peaks Evidential Clustering. IEEE Transactions on Fuzzy Systems, 27(1):111-123, 2019. .. seealso:: :func:`~extractMass`, :func:`~makeF`, :func:`~delta_Bel`, :func:`~setDistances` .. note:: Keywords : Dempster-Shafer theory, belief functions, unsupervised learning, soft clustering, density peaks clustering. BPEC can find the true number of clusters and create a credal partition for some datasets with good performances. When the number of clusters is small (usually less than ten), the performances of BPEC and its informative variant with a limited number of composite clusters are approximately equal. In contrast, BPEC can be enhanced if less informative composite clusters (i.e., focal sets) when the number of clusters is large. Furthermore, BPEC can provide hard, fuzzy, possibilistic and even rough partitions. """ # ---------------------- Delta-Bel -------------------------------------- if g is None: DB = delta_Bel(x,100,0.9) g = DB['g0'] else: g = g # ---------------------- Initializations -------------------------------------- x = np.array(x) n, d = x.shape delta2 = delta ** 2 c = g.shape[0] F = makeF(c, type, pairs, Omega) f = F.shape[0] card = np.sum(F[1:], axis=1) # ---------------------- Iterations ------------------------------------------- pasfini = True gplus = np.zeros((f - 1, d)) iter_ = 0 # Compute prototypes for focal sets for i in range(1, f): fi = F[i, :] truc = np.tile(fi, (c, 1)) gplus[i - 1, :] = np.sum(g * truc, axis=0) / np.sum(fi) # Compute initial Euclidean distances to prototypes D = np.zeros((n, f - 1)) for j in range(f - 1): D[:, j] = np.sum((x - np.tile(gplus[j, :], (n, 1))) ** 2, axis=1) # Compute initial masses if m0 is None: m = np.zeros((n, f - 1)) for i in range(n): vect0 = D[i, :] for j in range(f - 1): vect1 = (np.tile(D[i, j], f - 1) / vect0) ** (1 / (beta - 1)) vect2 = (np.tile(card[j] ** (alpha / (beta - 1)), f - 1) / (card ** (alpha / (beta - 1)))) vect3 = vect1 * vect2 m[i, j] = 1 / (np.sum(vect3) + (card[j] ** alpha * D[i, j] / delta2) ** (1 / (beta - 1))) if np.isnan(m[i, j]): m[i, j] = 1 else: m = m0[:, 1:f] mvide = 1 - np.sum(m, axis=1) Jold = np.sum((m ** beta) * D * np.tile(card ** alpha, (n, 1))) + delta2 * np.sum(mvide ** beta) if disp: print(iter_, Jold) # Main iteration loop while pasfini: iter_ += 1 dist = setDistances(x, F, g, m, alpha, distance=0) D = dist['D'] Smeans = dist['Smean'] # Update masses for i in range(n): vect0 = D[i, :] for j in range(f - 1): vect1 = (np.tile(D[i, j], f - 1) / vect0) ** (1 / (beta - 1)) vect2 = (np.tile(card[j] ** (alpha / (beta - 1)), f - 1) / (card ** (alpha / (beta - 1)))) vect3 = vect1 * vect2 m[i, j] = 1 / (np.sum(vect3) + (card[j] ** alpha * D[i, j] / delta2) ** (1 / (beta - 1))) if np.isnan(m[i, j]): m[i, j] = 1 mvide = 1 - np.sum(m, axis=1) J = np.sum((m ** beta) * D * np.tile(card ** alpha, (n, 1))) + delta2 * np.sum(mvide ** beta) if disp: print(iter_, J) pasfini = np.abs(J - Jold) > epsi Jold = J # Finalize masses and return results m = np.hstack((mvide[:, np.newaxis], m)) clus = extractMass(m, F, g=g, method="bpec", crit=J, param={'alpha': alpha, 'beta': beta, 'delta': delta, 'S': Smeans}) return clus
#---------------------- Utils for BPEC------------------------------------------------
[docs] def delta_Bel(x, K, q=0.9): """ Delta-Bel graph for Belief Peak Evidential Clustering (BPEC). This function computes the delta-Bel graph used to determine the prototypes in the Belief Peak Evidential Clustering (BPEC) algorithm. The user must manually specify the rectangles containing the prototypes (typically in the upper-right corner of the graph if the clusters are well-separated). These prototypes are then used by the bpec function to compute a credal partition. Parameters: ----------- x (numpy.ndarray): Input matrix of size n x d, where n is the number of objects and d is the number of attributes. K (int): Number of neighbors used to determine belief values. q (float): Parameter of the algorithm, between 0 and 1 (default: 0.9). Returns: -------- dict: A dictionary containing: BelC (numpy.ndarray): The belief values. delta (numpy.ndarray): The delta values. g0 (numpy.ndarray): A c x d matrix containing the prototypes. ii (list): List of indices of the belief peaks. """ n = x.shape[0] # Compute K-nearest neighbors nbrs = NearestNeighbors(n_neighbors=K).fit(x) distances, indices = nbrs.kneighbors(x) # Calculate beliefs that possibly support itself as a center alpha = 1 / K if q is None: g = np.ones(n) else: g = 1 / np.apply_along_axis(lambda row: np.quantile(row, q), axis=1, arr=distances) BelC = np.zeros(n) for i in range(n): BelC[i] = 1 - np.prod(1 - alpha * np.exp(-g[indices[i]] ** 2 * distances[i] ** 2)) # Calculate delta values Dist = distance.cdist(x, x, metric='euclidean') maxdist = np.max(Dist) bel_sorted_indices = np.argsort(-BelC) #bel_sorted = BelC[bel_sorted_indices] delta = np.zeros(n) delta[bel_sorted_indices[0]] = -1 for i in range(1, n): delta[bel_sorted_indices[i]] = maxdist for j in range(i): dist_ij = Dist[bel_sorted_indices[i], bel_sorted_indices[j]] if dist_ij < delta[bel_sorted_indices[i]]: delta[bel_sorted_indices[i]] = dist_ij delta[bel_sorted_indices[0]] = np.max(delta) # Plot the delta-Bel graph plt.scatter(delta, BelC) plt.ylim([np.min(BelC) - 0.1, np.max(BelC) + 0.1]) plt.xlabel("Delta") plt.ylabel("Belief") plt.title("Delta-Bel Graph") plt.show() print("With the mouse, select the lower-left corner of a rectangle containing the prototypes.") # Wait for the user to click on the plot to select the rectangle selected_points = plt.ginput(1) if len(selected_points) == 0: print("No points were selected. Automatically selecting prototypes...") # Automatically select prototypes in the upper-right corner of the Delta-Bel graph delta_threshold = np.percentile(delta, 95) # Top 5% delta values belief_threshold = np.percentile(BelC, 95) # Top 5% belief values ii = np.where((delta > delta_threshold) & (BelC > belief_threshold))[0] print(f"Automatically selected prototype indices: {ii}") else: t = selected_points[0] # Extract the selected point (x, y) plt.axvline(x=t[0], linestyle='--') plt.axhline(y=t[1], linestyle='--') # Find the indices of the prototypes based on the user's selection ii = np.where((delta > t[0]) & (BelC > t[1]))[0] print(f"Manually selected prototype indices: {ii}") # Highlight selected prototypes on the plot plt.scatter(delta[ii], BelC[ii], c='red', marker='x') plt.show() # Extract prototypes if isinstance(x, np.ndarray): g0 = x[ii, :] elif isinstance(x, pd.DataFrame): g0 = x.iloc[ii, :].values # Convertir en tableau NumPy si nécessaire else: raise TypeError("Input matrix x must be a numpy array or pandas DataFrame.") return { 'BelC': BelC, 'delta': delta, 'g0': g0, 'ii': ii.tolist() }
#---------------------- setDistances------------------------------------------
[docs] def setDistances(x, F, g, m, alpha, distance): """ Computation of distances to centers and variance matrices in each cluster. Function called by cecm. """ nbFoc = F.shape[0] K = F.shape[1] n = x.shape[0] nbAtt = x.shape[1] beta = 2 gplus = np.zeros((nbFoc-1, nbAtt)) for i in range(1, nbFoc): fi = F[i, :] truc = np.tile(fi, (nbAtt, 1)).T gplus[i-1, :] = np.sum(g * truc, axis=0) / np.sum(fi) if distance == 0: S = [np.eye(nbAtt)] * K else: ind = np.where(np.sum(F, axis=1) == 1)[0] S = [] for i in ind: Sigmai = np.zeros((nbAtt, nbAtt)) for k in range(n): omegai = np.tile(F[i, :], (K, 1)) indAj = np.where(np.sum(np.minimum(omegai, F), axis=1) > 0)[0] for j in indAj: aux = x[k, :] - gplus[j-1, :] Sigmai += np.sum(F[j, :]) ** (alpha - 1) * m[k, j-1] ** beta * np.outer(aux, aux) Si = np.linalg.det(Sigmai) ** (1/nbAtt) * np.linalg.inv(Sigmai) S.append(Si) Smean = [] for i in range(nbFoc-1): aux = np.zeros((nbAtt, nbAtt)) for j in range(K): aux += F[i+1, j] * S[j] Smean.append(aux / max(np.sum(F[i+1, :]), 1)) D = np.zeros((n, nbFoc-1)) for j in range(nbFoc-1): aux = x - np.tile(gplus[j, :], (n, 1)) if distance == 0: D[:, j] = np.diag(np.dot(aux, aux.T)) else: D[:, j] = np.diag(np.dot(np.dot(aux, Smean[j]), aux.T)) return {'D': D, 'Smean': Smean}