Source code for evclust.utils

# -*- coding: utf-8 -*-
# This file as well as the whole evclust package are licenced under the MIT licence (see the LICENCE.txt)
# Armel SOUBEIGA (armelsoubeiga.github.io), France, 2023

"""
This module contains the utility functions of the package.
"""

#---------------------- Packages-----------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from scipy.spatial import ConvexHull
import seaborn as sns
from sklearn.decomposition import PCA




#---------------------- makeF--------------------------------------------------

def makeF(c, type='simple', pairs=None, Omega=True):
    """
    Create a matrix of focal sets.

    Each row of the result is the indicator vector of one focal set over the
    ``c`` clusters (1 = the cluster belongs to the set).

    Parameters:
    -----------
    c (int):
        Number of clusters (must be >= 1).
    type (str):
        Type of focal sets:
        "simple": the empty set, the singletons and Ω;
        "full": all 2^c subsets of Ω;
        "pairs": the empty set, the singletons, Ω and all or selected pairs.
        A list/tuple of choices is also accepted (R ``match.arg`` style); its
        first element is used.
    pairs (array-like or None):
        0-based cluster indices, one focal set per row, to be added when
        type="pairs". If None, all pairs are included.
    Omega (bool):
        If True (default), Ω is a focal set (for types "simple" and "pairs").
        It is never added twice when it is already among the rows.

    Returns:
    --------
    ndarray:
        A float matrix (f, c) of focal sets. The first row is the empty set.

    Raises:
    -------
    ValueError:
        If ``type`` is not one of the supported names or ``c`` < 1.
    """
    # `type` shadows the builtin (kept for API compatibility): work on an alias.
    kind = type
    if isinstance(kind, (list, tuple)):
        kind = kind[0] if len(kind) > 0 else 'simple'
    if kind not in ('simple', 'full', 'pairs'):
        raise ValueError(
            "type must be 'simple', 'full' or 'pairs', got {!r}".format(kind)
        )
    c = int(c)
    if c < 1:
        raise ValueError("c must be a positive integer")

    if kind == 'full':
        # Row i holds the binary digits of i, least significant bit first.
        codes = np.arange(2 ** c)[:, np.newaxis]
        return ((codes >> np.arange(c)) & 1).astype(float)

    # The empty set followed by the singletons.
    parts = [np.zeros((1, c)), np.eye(c)]

    if kind == 'pairs':
        if pairs is None:
            # All pairs (i, j), i < j, in the same order as a nested loop.
            members = np.column_stack(np.triu_indices(c, k=1))
        else:
            members = np.asarray(pairs, dtype=int)
            if members.ndim == 1:
                members = (members.reshape(1, -1) if members.size
                           else members.reshape(0, 2))
        extra = np.zeros((members.shape[0], c))
        extra[np.arange(members.shape[0])[:, np.newaxis], members] = 1
        parts.append(extra)

    F = np.vstack(parts)

    # Add the whole frame unless it is already one of the focal sets
    # (e.g. the singleton when c == 1, or the pair {0, 1} when c == 2).
    if Omega and not np.any(np.all(F == 1, axis=1)):
        F = np.vstack((F, np.ones(c)))
    return F
#---------------------- get_ensembles------------------------------------------
def get_ensembles(table):
    """
    Build a readable name for every focal set.

    Parameters:
    ------------
    table (array-like):
        Matrix of focal sets, one row per focal set and one column per
        cluster (as produced by `makeF`). A cell equal to 1 means that the
        cluster belongs to the focal set.

    Returns:
    --------
    list of str:
        One name per row, in row order:
        'Cl_atypique' for the empty set (all zeros),
        'Cl_incertains' for the whole frame (all ones),
        'Cl_i_j_...' otherwise, where i, j, ... are the 1-based indices of
        the clusters in the set.
        An empty table gives an empty list.
    """
    names = []
    for row in table:
        cells = list(row)
        # Name the special sets from their content, not from their position:
        # the last row is not Ω when the focal sets were built without it.
        if all(cell == 0 for cell in cells):
            names.append('Cl_atypique')
        elif all(cell == 1 for cell in cells):
            names.append('Cl_incertains')
        else:
            members = [
                str(pos + 1) if cell == 1 else str(int(cell))
                for pos, cell in enumerate(cells)
                if cell != 0
            ]
            names.append('Cl_' + '_'.join(members))
    return names
#---------------------- extractMass--------------------------------------------
def extractMass(mass, F, g=None, S=None, method=None, crit=None, Kmat=None,
                trace=None, D=None, W=None, J=None, param=None):
    """
    Build the description of a credal partition from its mass functions.

    extractMass computes different outputs (hard, fuzzy, rough partitions,
    etc.) from a credal partition and returns them in a dictionary.

    Parameters:
    ------------
    mass (array-like):
        Matrix (n, f) of mass functions, one row per object. The first column
        corresponds to the degree of conflict (mass of the empty set).
    F (array-like):
        Matrix (f, c) of focal sets. The first row should be the empty set;
        if the first row contains a 1, an empty-set row is prepended to F and
        a zero column is prepended to mass.
    g (ndarray, optional): The prototypes (if defined).
    S (ndarray, optional): The matrices S_j defining the metrics (if defined).
    method (str, optional): The method used to construct the credal partition.
    crit (float, optional): The value of the optimized criterion.
    Kmat (ndarray, optional): The matrix of degrees of conflict.
    trace (ndarray, optional): The trace of criterion values.
    D (ndarray, optional): The normalized dissimilarity matrix.
    W (ndarray, optional): The weight matrix.
    J (ndarray, optional): The matrix of indices.
    param (list, optional): A method-dependent list of parameters.

    Returns:
    ---------
    dict with the keys:
        conf: degrees of conflict (first column of mass).
        F: matrix of focal sets (float, empty set in the first row).
        mass: mass functions; mass_n: normalized masses of the non-empty sets.
        pl / pl_n: unnormalized / normalized plausibilities of the singletons.
        bel / bel_n: unnormalized / normalized beliefs of the singletons.
        y_pl / y_bel: maximum plausibility / maximum belief clusters.
        Y: sets of clusters with maximum mass.
        betp / betp_n: unnormalized / normalized pignistic probabilities.
        p: probabilities derived from pl by the plausibility transformation.
        upper_approx / lower_approx: lists of length c of object indices
            (maximum mass rule).
        Ynd: sets of clusters selected by the interval dominance rule.
        upper_approx_nd / lower_approx_nd: same lists for the interval
            dominance rule.
        N: average nonspecificity (0.0 when there is a single cluster).
        outlier: indices of the objects whose maximum-mass set is empty.
        g, S, crit, Kmat, trace, D, method, W, J, param: passed through.

    References:
    ------------
    T. Denoeux and O. Kanjanatarakul. Beyond Fuzzy, Possibilistic and Rough:
    An Investigation of Belief Functions in Clustering. 8th International
    conference on soft methods in probability and statistics, Rome,
    12-14 September, 2016.

    M.-H. Masson and T. Denoeux. ECM: An evidential version of the fuzzy
    c-means algorithm. Pattern Recognition, Vol. 41, Issue 4,
    pages 1384-1397, 2008.
    """
    # Work in floating point: with an integer F the pignistic weights 1/|A|
    # would be truncated to 0 when written back into an integer array.
    mass = np.asarray(mass, dtype=float)
    F = np.asarray(F, dtype=float)
    n = mass.shape[0]
    c = F.shape[1]

    if np.any(F[0, :] == 1):
        # The first focal set is not empty: add the empty set.
        F = np.vstack((np.zeros(c), F))
        mass = np.hstack((np.zeros((n, 1)), mass))

    f = F.shape[0]
    card = F.sum(axis=1)

    conf = mass[:, 0]                        # degree of conflict
    scale = (1 / (1 - conf))[:, np.newaxis]  # normalization factor per object
    mass_n = scale * mass[:, 1:f]            # normalized mass function
    pl = mass @ F                            # unnormalized plausibility
    pl_n = scale * pl                        # normalized plausibility
    p = pl / pl.sum(axis=1, keepdims=True)   # plausibility-derived probability
    bel = mass[:, card == 1]                 # unnormalized belief
    bel_n = scale * bel                      # normalized belief
    y_pl = np.argmax(pl, axis=1)             # maximum plausibility cluster
    y_bel = np.argmax(bel, axis=1)           # maximum belief cluster
    Y = F[np.argmax(mass, axis=1), :]        # maximum mass set of clusters

    # Interval dominance: keep the clusters whose plausibility is at least
    # the largest singleton belief of the object.
    Ynd = (pl >= bel.max(axis=1, keepdims=True)).astype(float)

    # Pignistic weights: every non-empty focal set shares its mass equally
    # among its elements.
    P = np.zeros_like(F)
    nonempty = card != 0
    P[nonempty] = F[nonempty] / card[nonempty, np.newaxis]
    P[0, :] = 0
    betp = mass @ P      # unnormalized pignistic probability
    betp_n = scale * betp  # normalized pignistic probability

    nclus = Y.sum(axis=1)
    outlier = np.where(nclus == 0)[0]  # objects assigned to the empty set
    nclus_nd = Ynd.sum(axis=1)

    lower_approx, upper_approx = [], []
    lower_approx_nd, upper_approx_nd = [], []
    for k in range(c):
        in_k = Y[:, k] == 1
        in_k_nd = Ynd[:, k] == 1
        upper_approx.append(np.where(in_k)[0])
        lower_approx.append(np.where(in_k & (nclus == 1))[0])
        upper_approx_nd.append(np.where(in_k_nd)[0])
        lower_approx_nd.append(np.where(in_k_nd & (nclus_nd == 1))[0])

    # Nonspecificity: the mass of the empty set is counted as total ignorance
    # (cardinality c). With a single cluster log(c) is 0, so report 0.
    if c > 1:
        card_ns = np.concatenate(([c], card[1:f]))
        N = np.sum(np.log(card_ns) * mass) / np.log(c) / n
    else:
        N = 0.0

    clus = {'conf': conf, 'F': F, 'mass': mass, 'mass_n': mass_n,
            'pl': pl, 'pl_n': pl_n, 'bel': bel, 'bel_n': bel_n,
            'y_pl': y_pl, 'y_bel': y_bel, 'Y': Y,
            'betp': betp, 'betp_n': betp_n, 'p': p,
            'upper_approx': upper_approx, 'lower_approx': lower_approx,
            'Ynd': Ynd, 'upper_approx_nd': upper_approx_nd,
            'lower_approx_nd': lower_approx_nd,
            'N': N, 'outlier': outlier, 'g': g, 'S': S, 'crit': crit,
            'Kmat': Kmat, 'trace': trace, 'D': D, 'method': method,
            'W': W, 'J': J, 'param': param}
    return clus
#---------------------- summary------------------------------------------------
def ev_summary(clus):
    """
    Print a summary of a credal partition.

    Parameters:
    -----------
    clus : dict
        A credal partition as returned by `extractMass`. The keys 'F',
        'mass', 'method', 'crit', 'N', 'g' and 'outlier' are read.

    Returns:
    --------
    None
        Prints the number of classes and objects, the generating method, the
        focal sets, the criterion value (when available), the
        nonspecificity, the prototypes (when available) and the number of
        outliers.
    """
    c = clus['F'].shape[1]
    n = clus['mass'].shape[0]

    print("------ Credal partition ------")
    print(f"{c} classes,")
    print(f"{n} objects")
    print(f"Generated by {clus['method']}")
    print("Focal sets:")
    print(clus['F'])

    # The criterion is optional: formatting None with ':.2f' would raise.
    crit = clus['crit']
    if crit is not None:
        print(f"Value of the criterion = {crit:.2f}")

    print(f"Nonspecificity = {clus['N']:.2f}")

    prototypes = clus['g']
    if prototypes is not None:
        print("Prototypes:")
        print(prototypes)

    # A count is an integer: do not print it with decimals.
    print(f"Number of outliers = {len(clus['outlier'])}")
#---------------------- plot------------------------------------------------
def ev_plot(x, X=None, ytrue=None, Outliers=True, Approx=1, cex=1, cexvar='pl',
            cex_outliers=5, cex_protos=5, lwd=1, ask=False, plot_Shepard=False,
            plot_approx=True, plot_protos=True, xlab='$x_1$', ylab='$x_2$'):
    """
    Plot a credal partition in a two-dimensional attribute space.

    Parameters:
    -----------
    x : dict
        A credal partition as returned by `extractMass`.
    X : array-like or DataFrame
        The data matrix. If it has more than two columns (attributes), only
        the first two columns are used. Required.
    ytrue : array-like, optional
        Integer class labels used to colour the points (label i is drawn with
        the matplotlib colour 'C<i>'). If omitted, the maximum-plausibility
        clusters are used instead.
    Outliers : bool, optional
        If True, the outliers are plotted, and they are not included in the
        lower and upper approximations of the clusters.
    Approx : int, optional
        If Approx==1 (default), the lower and upper cluster approximations
        computed with the interval dominance rule are used. Otherwise, the
        maximum mass rule is used.
    cex : float, optional
        Maximum size of data points.
    cexvar : str, optional
        Quantity the point sizes are proportional to: the plausibilities
        ('pl', the default), the normalized plausibilities ('pl_n' or
        'pl.n'), the degrees of belief ('bel') or the normalized degrees of
        belief ('bel_n' or 'bel.n'). Any other value (e.g. 'cst') gives a
        constant size.
    cex_outliers : float, optional
        Size of data points for outliers.
    cex_protos : float, optional
        Size of data points for prototypes (if applicable).
    lwd : int, optional
        Line width for drawing the lower and upper approximations.
    ask : bool, optional
        Value written to matplotlib's 'interactive' rcParam.
    plot_Shepard : bool, optional
        Currently unused.
    plot_approx : bool, optional
        If True (default) the convex hulls of the lower (solid) and upper
        (dashed) approximations are plotted.
    plot_protos : bool, optional
        If True (default) the prototypes are plotted (for methods generating
        prototypes, like ECM).
    xlab : str, optional
        Label of horizontal axis.
    ylab : str, optional
        Label of vertical axis.

    Returns:
    ----------
    None

    Raises:
    -------
    ValueError
        If X is not supplied.
    """
    clus = x
    if X is None:
        raise ValueError("ev_plot needs the data matrix X to draw the partition")

    # Accept DataFrames as well as plain arrays; only two attributes are drawn.
    first_two = X.iloc[:, :2] if hasattr(X, 'iloc') else np.asarray(X)[:, :2]
    pts = np.asarray(first_two, dtype=float)

    labels = clus['y_pl'] if ytrue is None else ytrue
    plt.rcParams['interactive'] = ask

    if Approx == 1:
        lower_src, upper_src = clus['lower_approx_nd'], clus['upper_approx_nd']
    else:
        lower_src, upper_src = clus['lower_approx'], clus['upper_approx']

    outliers = np.asarray(clus['outlier'], dtype=int)
    # Build new lists: the approximations stored in `clus` must not be
    # modified by a plotting call.
    if Outliers:
        lower_approx = [np.setdiff1d(idx, outliers) for idx in lower_src]
        upper_approx = [np.setdiff1d(idx, outliers) for idx in upper_src]
    else:
        lower_approx = list(lower_src)
        upper_approx = list(upper_src)

    size_key = {'pl': 'pl', 'pl_n': 'pl_n', 'pl.n': 'pl_n',
                'bel': 'bel', 'bel_n': 'bel_n', 'bel.n': 'bel_n'}.get(cexvar)
    if size_key is not None:
        cex = cex * np.max(clus[size_key], axis=1)

    point_colors = [mcolors.to_rgba('C{}'.format(label)) for label in labels]
    plt.scatter(pts[:, 0], pts[:, 1], c=point_colors, s=cex)

    if Outliers:
        plt.scatter(pts[outliers, 0], pts[outliers, 1],
                    c='black', marker='x', s=cex_outliers)

    if 'g' in clus and plot_protos and clus['g'] is not None:
        protos = np.asarray(clus['g'])
        # One colour per prototype, indexed like the hulls below.
        proto_colors = [mcolors.to_rgba('C{}'.format(k))
                        for k in range(protos.shape[0])]
        plt.scatter(protos[:, 0], protos[:, 1],
                    c=proto_colors, marker='s', s=cex_protos)

    if plot_approx:
        # Loop over every cluster that has an approximation, including the
        # clusters that are nobody's maximum-plausibility cluster.
        for k, (low, up) in enumerate(zip(lower_approx, upper_approx)):
            for members, style in ((low, 'solid'), (up, 'dashed')):
                region = pts[np.asarray(members, dtype=int)]
                if region.shape[0] < 3:
                    continue  # a hull needs at least three points
                hull = ConvexHull(region)
                for edge in hull.simplices:
                    plt.plot(region[edge, 0], region[edge, 1],
                             linestyle=style, linewidth=lwd,
                             color='C{}'.format(k))

    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.tight_layout()
    plt.show()
#---------------------- plot with pca------------------------------------------------
def ev_pcaplot(data, x, normalize=False, splite=False, cex=8, cex_protos=5):
    """
    Plot the first two principal components coloured by focal set.

    A two-component PCA is fitted on `data`; every object is drawn at its PCA
    scores and coloured by the focal set that carries its largest mass.

    Parameters:
    -----------
    data : DataFrame
        The input data containing the attributes (columns) and samples (rows).
    x : dict
        A credal partition as returned by `extractMass` (keys 'mass', 'F'
        and 'y_pl' are read).
    normalize : bool, optional
        If True, the data are centred and scaled by column before the PCA.
        Default is False.
    splite : bool, optional
        If True, one panel per focal set is drawn (seaborn `relplot`).
        Otherwise a single scatter plot is drawn, with the mean of every
        group added as a larger marker.
    cex : float, optional
        Marker size of the objects. The group means use `cex + 25`.
    cex_protos : float, optional
        Currently unused.

    Returns:
    ---------
    None
    """
    if normalize:
        data = (data - data.mean()) / data.std()  # Normalize the data

    mass = pd.DataFrame(x["mass"])
    mass.columns = get_ensembles(x['F'])
    dominant = mass.idxmax(axis=1)  # name of the maximum-mass focal set
    n_clusters = len(np.unique(x['y_pl']))

    pca = PCA(n_components=2)
    scores = pca.fit_transform(data)
    variance_percent = np.round(pca.explained_variance_ratio_ * 100, 1)
    x_label = f"Dim 1 ({variance_percent[0]}%)"
    y_label = f"Dim 2 ({variance_percent[1]}%)"

    ind_coord = pd.DataFrame(scores, columns=["Dim.1", "Dim.2"])
    ind_coord["Cluster"] = pd.Categorical(dominant)
    # Keep "Cluster" as a regular column so that it can be used as hue.
    mean_coords = (ind_coord.groupby('Cluster', observed=True)
                   .mean().reset_index())

    pcolor = sns.color_palette("Dark2",
                               n_colors=ind_coord["Cluster"].nunique())

    if splite:
        # relplot is figure-level and creates its own figure: opening one
        # beforehand would leave an empty window behind.
        grid = sns.relplot(data=ind_coord, x="Dim.1", y="Dim.2",
                           hue="Cluster", col="Cluster", style="Cluster",
                           palette=pcolor, s=cex,
                           col_wrap=max(1, int((n_clusters ** 2) / 2)))
        grid.set_axis_labels(x_label, y_label)
        for ax in grid.axes.flat:
            ax.tick_params(axis='both', labelsize=7)
    else:
        plt.figure(figsize=(8, 6))
        sns.scatterplot(data=ind_coord, x="Dim.1", y="Dim.2", hue="Cluster",
                        palette=pcolor, style="Cluster", s=cex)
        sns.scatterplot(data=mean_coords, x="Dim.1", y="Dim.2", s=(cex + 25),
                        hue="Cluster", palette=pcolor, style="Cluster",
                        legend=False)
        sns.despine()
        legend = plt.legend(title="Cluster", loc='lower right',
                            markerscale=0.3)
        plt.setp(legend.get_title(), fontsize=7)
        plt.setp(legend.get_texts(), fontsize=7)
        plt.tick_params(axis='both', labelsize=7)
        plt.xlabel(x_label, fontsize=7)
        plt.ylabel(y_label, fontsize=7)

    plt.show()
#---------------------- plot for time series------------------------------------------------
def ev_tsplot(X, V, clus, plot_centers=True):
    """
    Plot the results of an evidential clustering of time series.

    One panel is drawn per focal set that is the maximum-mass set of at least
    one series.

    Parameters
    -----------
    X : array-like
        The time series data, one series per entry (X[i] is series i).
    V : array-like
        The medoid (center) time series of each cluster (V[k] is cluster k).
    clus : dict
        The clustering results, with 'mass' and 'F' keys.
    plot_centers : bool, optional
        If True, plot the cluster centers in color. If False, plot the
        individual series in color and the cluster centers in black.
        Focal sets that are not a single cluster (empty set, pairs, Ω) have
        no center: their member series are always drawn. Default is True.

    Returns
    --------
    fig : matplotlib Figure
    axes : 2-D ndarray of Axes, shape (rows, cols)

    Raises
    -------
    ValueError
        If clus['mass'] has no rows.
    """
    mass = np.asarray(clus['mass'], dtype=float)
    F = np.asarray(clus['F'], dtype=float)
    if mass.shape[0] == 0:
        raise ValueError("clus['mass'] is empty: there is nothing to plot")

    names = get_ensembles(F)
    # Focal set with the largest mass for every series.
    assigned = np.argmax(mass, axis=1)
    shown = np.unique(assigned)
    k = len(shown)

    # Number grid
    grid_cols = int(np.ceil(np.sqrt(k)))
    grid_rows = int(np.ceil(k / grid_cols))

    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column (k = 1 or 2).
    fig, axes = plt.subplots(nrows=grid_rows, ncols=grid_cols,
                             figsize=(10, 6), squeeze=False)
    plt.rcParams["figure.dpi"] = 100
    colors = plt.cm.viridis(np.linspace(0, 1, k))

    for pos, ax in enumerate(axes.flat):
        if pos >= k:
            ax.axis('off')  # unused cell of the grid
            continue

        focal = shown[pos]
        members = np.flatnonzero(assigned == focal)

        # Only singletons have a medoid; look it up by cluster index, not by
        # the rank of the panel.
        center = None
        if F[focal].sum() == 1:
            cluster_id = int(np.argmax(F[focal]))
            if cluster_id < len(V):
                center = V[cluster_id]

        if not plot_centers or center is None:
            for i in members:
                ax.plot(X[i], color=colors[pos], alpha=0.5)
        if center is not None:
            ax.plot(center, color=colors[pos] if plot_centers else 'black',
                    linewidth=2)

        ax.set_title(f'Cluster {names[focal]}')
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

    plt.tight_layout()
    plt.show()
    return fig, axes