Source code for evclust.bpec

# -*- coding: utf-8 -*-
# This file as well as the whole evclust package are licenced under the MIT licence (see the LICENCE.txt)
# Armel SOUBEIGA (armelsoubeiga.github.io), France, 2024

"""
This module contains the main function for Belief Peak Evidential Clustering (BPEC).

    Z.-G. Su and T. Denoeux. BPEC: Belief-Peaks Evidential Clustering. IEEE Transactions on Fuzzy Systems, 27(1):111-123, 2019.

"""

#---------------------- Packges------------------------------------------------
from evclust.utils import makeF, extractMass
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt




#---------------------- bpec------------------------------------------------

[docs]
def bpec(x, g, type='full', pairs=None, Omega=True, alpha=1, beta=2, delta=10, epsi=1e-3, disp=True, m0=None):

    """
    Belief Peak Evidential Clustering (BPEC) computes a credal partition from a matrix of attribute data.
    BPEC is identical to ECM, except that the prototypes are computed from delta-Bel graph using function delta_Bel. 
    The ECM algorithm is then run keeping the prototypes fixed. The distance to the prototypes can be the Euclidean disatnce.

    Parameters:
    ------------
    x (numpy.ndarray): 
        Input matrix of size n x d, where n is the number of objects and d the number of attributes.
    g (numpy.ndarray, or None): 
        Matrix of size c x d of prototypes (the belief peaks).
    type (str): 
        Type of focal sets ("simple", "full", or "pairs").
    Omega (bool): 
        Whether to include the whole frame in focal sets (default: True).
    pairs (list or None): 
        Pairs to include in focal sets, used only if type="pairs".
    alpha (float): 
        Exponent of the cardinality in the cost function.
    beta (float): 
        Exponent of masses in the cost function.
    delta (float): 
        Distance to the empty set.
    epsi (float): 
        Minimum amount of improvement.
    disp (bool): 
        Whether to display intermediate results (default: True).
    m0 (numpy.ndarray or None): 
        Initial credal partition. Should be a matrix with n rows and a number of columns equal to the number 
        of focal sets specified by 'type' and 'pairs'.

    Returns:
    --------
    The credal partition (an object of class "credpart").

    Example:
    --------
    .. highlight:: python
    .. code-block:: python

        # Clustering
        from evclust.datasets import load_fourclass
        from evclust.bpec import bpec, delta_Bel

        df = load_fourclass()
        x = df.iloc[:, 0:2]
        DB = delta_Bel(x,100,0.9)
        clus = bpec(x, DB['g0'], type='pairs', delta=3, distance=1)
    
    References:
    ------------
        Z.-G. Su and T. Denoeux. BPEC: Belief-Peaks Evidential Clustering. IEEE Transactions on Fuzzy Systems, 27(1):111-123, 2019.


    .. seealso::
        :func:`~extractMass`, :func:`~makeF`, 
        :func:`~delta_Bel`,  :func:`~setDistances`
    
    .. note::
        Keywords : Dempster-Shafer theory, belief functions, unsupervised learning, soft clustering, density peaks clustering.
        BPEC can find the true number of clusters and create a credal partition for some datasets with good performances.
        When the number of clusters is small (usually less than ten), the performances of BPEC and its informative variant with a limited number of composite clusters are approximately equal. 
        In contrast, BPEC can be enhanced if less informative composite clusters (i.e., focal sets) when the number of clusters is large. 
        Furthermore, BPEC can provide hard, fuzzy, possibilistic and even rough partitions. 
    """
    
    # ---------------------- Delta-Bel --------------------------------------
    if g is None:
        DB = delta_Bel(x,100,0.9)
        g = DB['g0']
    else:
        g = g
    
    # ---------------------- Initializations --------------------------------------
    x = np.array(x)
    n, d = x.shape  
    delta2 = delta ** 2
    c = g.shape[0]  

    F = makeF(c, type, pairs, Omega)  
    f = F.shape[0]
    card = np.sum(F[1:], axis=1)  

    # ---------------------- Iterations -------------------------------------------
    pasfini = True  
    gplus = np.zeros((f - 1, d))
    iter_ = 0

    # Compute prototypes for focal sets
    for i in range(1, f):
        fi = F[i, :]
        truc = np.tile(fi, (c, 1))
        gplus[i - 1, :] = np.sum(g * truc, axis=0) / np.sum(fi)

    # Compute initial Euclidean distances to prototypes
    D = np.zeros((n, f - 1))
    for j in range(f - 1):
        D[:, j] = np.sum((x - np.tile(gplus[j, :], (n, 1))) ** 2, axis=1)

    # Compute initial masses
    if m0 is None:
        m = np.zeros((n, f - 1))
        for i in range(n):
            vect0 = D[i, :]
            for j in range(f - 1):
                vect1 = (np.tile(D[i, j], f - 1) / vect0) ** (1 / (beta - 1))
                vect2 = (np.tile(card[j] ** (alpha / (beta - 1)), f - 1) / 
                         (card ** (alpha / (beta - 1))))
                vect3 = vect1 * vect2
                m[i, j] = 1 / (np.sum(vect3) + (card[j] ** alpha * D[i, j] / delta2) ** (1 / (beta - 1)))
                if np.isnan(m[i, j]):
                    m[i, j] = 1
    else:
        m = m0[:, 1:f]

    mvide = 1 - np.sum(m, axis=1)  
    Jold = np.sum((m ** beta) * D * np.tile(card ** alpha, (n, 1))) + delta2 * np.sum(mvide ** beta)

    if disp:
        print(iter_, Jold)

    # Main iteration loop
    while pasfini:
        iter_ += 1
        dist = setDistances(x, F, g, m, alpha, distance=0)
        D = dist['D']
        Smeans = dist['Smean']

        # Update masses
        for i in range(n):
            vect0 = D[i, :]
            for j in range(f - 1):
                vect1 = (np.tile(D[i, j], f - 1) / vect0) ** (1 / (beta - 1))
                vect2 = (np.tile(card[j] ** (alpha / (beta - 1)), f - 1) / 
                         (card ** (alpha / (beta - 1))))
                vect3 = vect1 * vect2
                m[i, j] = 1 / (np.sum(vect3) + (card[j] ** alpha * D[i, j] / delta2) ** (1 / (beta - 1)))
                if np.isnan(m[i, j]):
                    m[i, j] = 1

        mvide = 1 - np.sum(m, axis=1)
        J = np.sum((m ** beta) * D * np.tile(card ** alpha, (n, 1))) + delta2 * np.sum(mvide ** beta)

        if disp:
            print(iter_, J)

        pasfini = np.abs(J - Jold) > epsi
        Jold = J

    # Finalize masses and return results
    m = np.hstack((mvide[:, np.newaxis], m))
    clus = extractMass(m, F, g=g, method="bpec", crit=J, param={'alpha': alpha,
                                        'beta': beta, 'delta': delta, 'S': Smeans})
    return clus




#---------------------- Utils for BPEC------------------------------------------------  

[docs]
def delta_Bel(x, K, q=0.9):
    """
    Delta-Bel graph for Belief Peak Evidential Clustering (BPEC).
    This function computes the delta-Bel graph used to determine the prototypes in the Belief Peak Evidential Clustering (BPEC) algorithm. The user must manually specify 
    the rectangles containing the prototypes (typically in the upper-right corner of the graph if the clusters are well-separated). These prototypes are then used 
    by the bpec function to compute a credal partition.

    Parameters:
    -----------
    x (numpy.ndarray): 
        Input matrix of size n x d, where n is the number of objects and d is the number of attributes.
    K (int): 
        Number of neighbors used to determine belief values.
    q (float): 
        Parameter of the algorithm, between 0 and 1 (default: 0.9).

    Returns:
    --------
    dict: A dictionary containing:
        BelC (numpy.ndarray): 
            The belief values.
        delta (numpy.ndarray): 
            The delta values.
        g0 (numpy.ndarray): 
            A c x d matrix containing the prototypes.
        ii (list):
            List of indices of the belief peaks.
    """

    n = x.shape[0]
    # Compute K-nearest neighbors
    nbrs = NearestNeighbors(n_neighbors=K).fit(x)
    distances, indices = nbrs.kneighbors(x)

    # Calculate beliefs that possibly support itself as a center
    alpha = 1 / K
    if q is None:
        g = np.ones(n)
    else:
        g = 1 / np.apply_along_axis(lambda row: np.quantile(row, q), axis=1, arr=distances)

    BelC = np.zeros(n)
    for i in range(n):
        BelC[i] = 1 - np.prod(1 - alpha * np.exp(-g[indices[i]] ** 2 * distances[i] ** 2))

    # Calculate delta values
    Dist = distance.cdist(x, x, metric='euclidean')
    maxdist = np.max(Dist)
    bel_sorted_indices = np.argsort(-BelC)  
    #bel_sorted  = BelC[bel_sorted_indices]

    delta = np.zeros(n)
    delta[bel_sorted_indices[0]] = -1
    for i in range(1, n):
        delta[bel_sorted_indices[i]] = maxdist
        for j in range(i):
            dist_ij = Dist[bel_sorted_indices[i], bel_sorted_indices[j]]
            if dist_ij < delta[bel_sorted_indices[i]]:
                delta[bel_sorted_indices[i]] = dist_ij
    delta[bel_sorted_indices[0]] = np.max(delta)

    # Plot the delta-Bel graph
    plt.scatter(delta, BelC)
    plt.ylim([np.min(BelC) - 0.1, np.max(BelC) + 0.1])
    plt.xlabel("Delta")
    plt.ylabel("Belief")
    plt.title("Delta-Bel Graph")
    plt.show()
    print("With the mouse, select the lower-left corner of a rectangle containing the prototypes.")
    
    # Wait for the user to click on the plot to select the rectangle
    selected_points = plt.ginput(1)
    if len(selected_points) == 0:
        print("No points were selected. Automatically selecting prototypes...")

        # Automatically select prototypes in the upper-right corner of the Delta-Bel graph
        delta_threshold = np.percentile(delta, 95)  # Top 5% delta values
        belief_threshold = np.percentile(BelC, 95)  # Top 5% belief values
        ii = np.where((delta > delta_threshold) & (BelC > belief_threshold))[0]

        print(f"Automatically selected prototype indices: {ii}")
    else:
        t = selected_points[0]  # Extract the selected point (x, y)
        plt.axvline(x=t[0], linestyle='--')
        plt.axhline(y=t[1], linestyle='--')

        # Find the indices of the prototypes based on the user's selection
        ii = np.where((delta > t[0]) & (BelC > t[1]))[0]
        print(f"Manually selected prototype indices: {ii}")

    # Highlight selected prototypes on the plot
    plt.scatter(delta[ii], BelC[ii], c='red', marker='x')
    plt.show()


    # Extract prototypes
    if isinstance(x, np.ndarray):
        g0 = x[ii, :]
    elif isinstance(x, pd.DataFrame):
        g0 = x.iloc[ii, :].values  # Convertir en tableau NumPy si nécessaire
    else:
        raise TypeError("Input matrix x must be a numpy array or pandas DataFrame.")
    return { 'BelC': BelC, 'delta': delta, 'g0': g0, 'ii': ii.tolist() }




#---------------------- setDistances------------------------------------------

[docs]
def setDistances(x, F, g, m, alpha, distance):
    """
    Computation of distances to centers and variance matrices in each cluster.
    Function called by cecm.

    """

    nbFoc = F.shape[0]
    K = F.shape[1]
    n = x.shape[0]
    nbAtt = x.shape[1]
    beta = 2

    gplus = np.zeros((nbFoc-1, nbAtt))
    for i in range(1, nbFoc):
        fi = F[i, :]
        truc = np.tile(fi, (nbAtt, 1)).T
        gplus[i-1, :] = np.sum(g * truc, axis=0) / np.sum(fi)

    if distance == 0:
        S = [np.eye(nbAtt)] * K 
    else:
        ind = np.where(np.sum(F, axis=1) == 1)[0]
        S = []
        for i in ind:
            Sigmai = np.zeros((nbAtt, nbAtt))
            for k in range(n):
                omegai = np.tile(F[i, :], (K, 1))
                indAj = np.where(np.sum(np.minimum(omegai, F), axis=1) > 0)[0]
                for j in indAj:
                    aux = x[k, :] - gplus[j-1, :]
                    Sigmai += np.sum(F[j, :]) ** (alpha - 1) * m[k, j-1] ** beta * np.outer(aux, aux)
            Si = np.linalg.det(Sigmai) ** (1/nbAtt) * np.linalg.inv(Sigmai)
            S.append(Si)

    Smean = []
    for i in range(nbFoc-1):
        aux = np.zeros((nbAtt, nbAtt))
        for j in range(K):
            aux += F[i+1, j] * S[j]
        Smean.append(aux / max(np.sum(F[i+1, :]), 1))

    D = np.zeros((n, nbFoc-1))
    for j in range(nbFoc-1):
        aux = x - np.tile(gplus[j, :], (n, 1))
        if distance == 0:
            D[:, j] = np.diag(np.dot(aux, aux.T))
        else:
            D[:, j] = np.diag(np.dot(np.dot(aux, Smean[j]), aux.T))

    return {'D': D, 'Smean': Smean}