# -*- coding: utf-8 -*-
# This file as well as the whole evclust package are licenced under the MIT licence (see the LICENCE.txt)
# Armel SOUBEIGA (armelsoubeiga.github.io), France, 2023
"""
This module contains utility functions for the evclust package.
"""
#---------------------- Packages-----------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from scipy.spatial import ConvexHull
import seaborn as sns
from sklearn.decomposition import PCA
#---------------------- makeF--------------------------------------------------
def makeF(c, type=['simple', 'full', 'pairs'], pairs=None, Omega=True):
    """
    Create the matrix of focal sets for a frame of ``c`` clusters.

    Each row of the result is the indicator vector of one focal set: entry
    ``j`` is 1 when cluster ``j`` belongs to the set and 0 otherwise.

    Parameters:
    -----------
    c (int):
        Number of clusters.
    type (str):
        Type of focal sets ("simple": {}, singletons, and Ω; "full": all 2^c
        subsets of Ω; "pairs": {}, singletons, Ω, and all or selected pairs).
        When left to its default (a sequence of the allowed choices), the
        first choice, "simple", is used.
    pairs (array-like or None):
        Pairs to include in the focal sets, one pair of 0-based cluster
        indices per row; if None, all pairs are included. Used only if
        type="pairs".
    Omega (bool):
        If True (default), Ω is a focal set (for types 'simple' and 'pairs').
        Ω is not appended when it would duplicate an existing row
        ("pairs" with c == 2, "simple" with c == 1).

    Returns:
    --------
    ndarray: A float matrix (f, c) of focal sets.

    Raises:
    -------
    ValueError: If ``type`` is not one of "simple", "full" or "pairs".
    """
    # The default is a sequence of choices (R ``match.arg`` style); it is never
    # mutated, and a sequence means "use the first choice".
    if not isinstance(type, str):
        type = type[0]
    if type not in ('simple', 'full', 'pairs'):
        raise ValueError(
            "type must be 'simple', 'full' or 'pairs', got {!r}".format(type)
        )

    if type == 'full':
        # Row i is the binary expansion of i, least-significant bit first,
        # so row 0 is the empty set and the last row is Ω.
        codes = np.arange(2 ** c)[:, None]
        return ((codes >> np.arange(c)) & 1).astype(float)

    # type is 'simple' or 'pairs': start with the empty set and the singletons.
    blocks = [np.zeros((1, c)), np.eye(c)]

    if type == 'pairs':
        if pairs is None:
            # Pairs not specified: take them all, in lexicographic order.
            members = [(i, j) for i in range(c - 1) for j in range(i + 1, c)]
        else:
            idx = np.asarray(pairs, dtype=int)
            # Accept a single pair given as a flat sequence; no rows if empty.
            members = idx.reshape(-1, idx.shape[-1]) if idx.size else []
        pair_rows = np.zeros((len(members), c))
        for r, subset in enumerate(members):
            pair_rows[r, list(subset)] = 1
        blocks.append(pair_rows)

    # Ω would duplicate the single pair (c == 2) or the single singleton (c == 1).
    omega_redundant = (type == 'pairs' and c == 2) or (type == 'simple' and c == 1)
    if Omega and not omega_redundant:
        blocks.append(np.ones((1, c)))  # the whole frame

    return np.vstack(blocks)
#---------------------- get_ensembles------------------------------------------
def get_ensembles(table):
    """
    Build a readable label for every row of a focal-set matrix.

    Each row is turned into ``'Cl_'`` followed by the 1-based positions of its
    entries equal to 1, joined with ``'_'`` (e.g. ``[1, 1, 0]`` -> ``'Cl_1_2'``).
    A non-zero entry different from 1 contributes its integer value instead of
    its position.

    Parameters:
    ------------
    table (array-like):
        Matrix with one focal set per row, as produced by ``makeF``.

    Returns:
    --------
    list of str: One label per row, in row order. The first row is labelled
    ``'Cl_atypique'`` when it is the empty set (all zeros) and the last row is
    labelled ``'Cl_incertains'`` when it is the whole frame (all ones). Rows in
    those positions that are not the empty set / the whole frame keep their
    regular label. An empty table yields an empty list.
    """
    rows = np.asarray(table)

    labels = []
    for row in rows:
        parts = [
            str(pos + 1) if value == 1 else str(int(value))
            for pos, value in enumerate(row)
            if value != 0
        ]
        labels.append('Cl_' + '_'.join(parts))

    if labels:
        # Only rename the boundary rows when they really are {} and Ω; with
        # e.g. Omega=False the last row is an ordinary singleton.
        if not np.any(rows[0]):
            labels[0] = 'Cl_atypique'
        if np.all(rows[-1] == 1):
            labels[-1] = 'Cl_incertains'
    return labels
#---------------------- extractMass--------------------------------------------
#---------------------- summary------------------------------------------------
def ev_summary(clus):
    """
    Print a short summary of a credal partition.

    Parameters:
    -----------
    clus : object
        A credal partition, accessed like a mapping. The keys read here are
        'F' (focal-set matrix), 'mass' (mass matrix), 'method', 'crit', 'N',
        'outlier' and, optionally, 'g' (prototypes).

    Returns:
    --------
    None
        The summary is written to standard output: number of classes (columns
        of 'F'), number of objects (rows of 'mass'), generating method, focal
        sets, criterion value, nonspecificity, prototypes (when present and
        not None) and the number of outliers.
    """
    n_classes = clus['F'].shape[1]
    n_objects = clus['mass'].shape[0]

    print("------ Credal partition ------")
    print(f"{n_classes} classes,")
    print(f"{n_objects} objects")
    print(f"Generated by {clus['method']}")
    print("Focal sets:")
    print(clus['F'])
    print(f"Value of the criterion = {clus['crit']:.2f}")
    print(f"Nonspecificity = {clus['N']:.2f}")

    # Prototypes are optional: tolerate both a missing key and a None value.
    prototypes = clus['g'] if 'g' in clus else None
    if prototypes is not None:
        print("Prototypes:")
        print(prototypes)

    # The outlier count is an integer, so print it without decimals.
    outliers = clus['outlier']
    n_outliers = 0 if outliers is None else len(outliers)
    print(f"Number of outliers = {n_outliers}")
#---------------------- plot------------------------------------------------
def ev_plot(x, X=None, ytrue=None, Outliers=True, Approx=1, cex=1,
            cexvar='pl', cex_outliers=5, cex_protos=5, lwd=1,
            ask=False, plot_Shepard=False, plot_approx=True,
            plot_protos=True, xlab='$x_1$', ylab='$x_2$'):
    """
    Plot a credal partition in a two-dimensional attribute space.

    The objects are drawn as a scatter plot of the first two columns of ``X``,
    optionally with the outliers, the prototypes and the convex hulls of the
    lower (solid) and upper (dashed) approximations of each cluster.

    Parameters:
    -----------
    x : object
        A credal partition, accessed like a mapping. Keys read here: 'y_pl',
        'lower_approx_nd'/'upper_approx_nd' or 'lower_approx'/'upper_approx',
        'outlier', the key selected by ``cexvar`` and, optionally, 'g'.
    X : DataFrame or array-like
        The data matrix (required). If it has more than two columns, only the
        first two are used. Non-DataFrame input is wrapped in a DataFrame.
    ytrue : array-like, optional
        True class labels used to colour the points. If None, ``x['y_pl']`` is
        used instead. Labels are formatted into matplotlib ``'C<label>'``
        colour names, so integer labels are expected.
    Outliers : bool, optional
        If True, the outliers are plotted and removed from the lower and upper
        approximations before these are drawn.
    Approx : int, optional
        If 1 (default), the '*_approx_nd' approximations are used; otherwise
        the 'lower_approx'/'upper_approx' ones are used.
    cex : float, optional
        Maximum size of data points.
    cexvar : str, optional
        Quantity the point size is proportional to: 'pl' (default), 'pl_n',
        'bel' or 'bel_n' (row-wise maximum of the corresponding matrix; the
        dotted spellings 'pl.n' and 'bel.n' are accepted too). Any other
        value, e.g. 'cst', keeps the size constant.
    cex_outliers : float, optional
        Size of data points for outliers.
    cex_protos : float, optional
        Size of data points for prototypes (if applicable).
    lwd : int, optional
        Line width for drawing the lower and upper approximations.
    ask : bool, optional
        Value assigned to matplotlib's global ``rcParams['interactive']``.
    plot_Shepard : bool, optional
        Currently unused.
    plot_approx : bool, optional
        If True (default), the convex hulls of the lower and upper
        approximations are plotted (for sets of at least three points).
    plot_protos : bool, optional
        If True (default), the prototypes ``x['g']`` are plotted when present.
    xlab : str, optional
        Label of horizontal axis.
    ylab : str, optional
        Label of vertical axis.

    Returns:
    ----------
    None

    Raises:
    -------
    ValueError: If ``X`` is None.
    """
    clus = x
    if X is None:
        raise ValueError("ev_plot requires the data matrix X to be provided")
    data = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    plt.rcParams['interactive'] = ask

    y = clus['y_pl'] if ytrue is None else ytrue
    c = len(np.unique(clus['y_pl']))

    if Approx == 1:
        lower_approx = clus['lower_approx_nd']
        upper_approx = clus['upper_approx_nd']
    else:
        lower_approx = clus['lower_approx']
        upper_approx = clus['upper_approx']

    if Outliers:
        # Build new lists rather than assigning into the caller's ones, so the
        # credal partition is left untouched by plotting.
        outliers = clus['outlier']
        lower_approx = [np.setdiff1d(lower_approx[i], outliers) for i in range(c)]
        upper_approx = [np.setdiff1d(upper_approx[i], outliers) for i in range(c)]

    # Scale the point sizes by the row-wise maximum of the selected matrix.
    size_key = cexvar.replace('.', '_') if isinstance(cexvar, str) else cexvar
    if size_key in ('pl', 'pl_n', 'bel', 'bel_n'):
        cex = cex * np.max(np.asarray(clus[size_key]), axis=1)

    point_colors = [mcolors.to_rgba('C{}'.format(label)) for label in y]
    class_colors = [mcolors.to_rgba('C{}'.format(label)) for label in np.unique(y)]

    plt.scatter(data.iloc[:, 0], data.iloc[:, 1], c=point_colors, s=cex)

    if Outliers:
        plt.scatter(data.iloc[clus['outlier'], 0], data.iloc[clus['outlier'], 1],
                    c='black', marker='x', s=cex_outliers)

    if 'g' in clus and plot_protos and clus['g'] is not None:
        plt.scatter(clus['g'][:, 0], clus['g'][:, 1], c=class_colors,
                    marker='s', s=cex_protos)

    if plot_approx:
        for k in range(c):
            hull_color = 'C{}'.format(k)
            _ev_plot_hull(data.iloc[lower_approx[k]],
                          linewidth=lwd, color=hull_color)
            _ev_plot_hull(data.iloc[upper_approx[k]],
                          linestyle='dashed', linewidth=lwd, color=hull_color)

    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.tight_layout()
    plt.show()


def _ev_plot_hull(points, **line_kwargs):
    """Draw the convex hull of the first two columns of ``points`` (a DataFrame)."""
    # A hull is only drawn for sets of at least three points.
    if points.shape[0] < 3:
        return
    hull = ConvexHull(points.iloc[:, :2])
    for simplex in hull.simplices:
        plt.plot(points.iloc[simplex, 0], points.iloc[simplex, 1], **line_kwargs)
#---------------------- plot with pca------------------------------------------------
def ev_pcaplot(data, x, normalize=False, splite=False, cex=8, cex_protos=5):
    """
    Plot the first two principal components of the data, coloured by cluster.

    Each object is assigned to the focal set carrying its largest mass; the
    focal sets are named with ``get_ensembles``. The objects are then projected
    on the first two principal components and drawn with one colour and marker
    style per assigned focal set, together with the mean position of each group.

    Parameters:
    -----------
    data : DataFrame
        The input data containing the attributes (columns) and samples (rows).
    x : object
        A credal partition, accessed like a mapping. Keys read here: 'mass',
        'F' and 'y_pl'.
    normalize : bool, optional
        If True, each column is centred and divided by its standard deviation
        before performing PCA. Default is False.
    splite : bool, optional
        If True, the groups are drawn in separate facets (``seaborn.relplot``)
        instead of a single scatter plot.
    cex : float, optional
        Marker size of the data points; group means use ``cex + 25``.
    cex_protos : float, optional
        Currently unused.

    Returns:
    ---------
    None
    """
    if normalize:
        data = (data - data.mean()) / data.std()  # Normalize the data

    masses = pd.DataFrame(x["mass"])
    masses.columns = get_ensembles(x['F'])
    n_clusters = len(np.unique(x['y_pl']))
    # Label of the focal set with the largest mass, for every object.
    assigned = masses.idxmax(axis=1)

    pca = PCA(n_components=2)
    scores = pca.fit_transform(data)
    variance_percent = np.round(pca.explained_variance_ratio_ * 100, 1)

    ind_coord = pd.DataFrame(scores, columns=["Dim.1", "Dim.2"])
    ind_coord["Cluster"] = pd.Categorical(assigned)
    # observed=True: only groups that actually occur (and no pandas warning
    # about the changing default for categorical groupers).
    mean_coords = ind_coord.groupby('Cluster', observed=True).mean()

    pcolor = sns.color_palette("Dark2", n_colors=len(ind_coord["Cluster"].unique()))

    if splite:
        # relplot creates its own figure, so none is opened beforehand;
        # col_wrap must be at least 1 (a single cluster would give 0).
        sns.relplot(data=ind_coord, x="Dim.1", y="Dim.2", hue="Cluster", col="Cluster",
                    style="Cluster", palette=pcolor, s=cex,
                    col_wrap=max(1, int((n_clusters ** 2) / 2)))
    else:
        plt.figure(figsize=(8, 6))
        sns.scatterplot(data=ind_coord, x="Dim.1", y="Dim.2", hue="Cluster", palette=pcolor,
                        style="Cluster", s=cex)

    # Group means, drawn on the current axes with larger markers.
    sns.scatterplot(data=mean_coords, x="Dim.1", y="Dim.2", s=(cex + 25), hue="Cluster",
                    palette=pcolor, style="Cluster", legend=False)

    sns.despine()
    legend = plt.legend(title="Cluster", loc='lower right', markerscale=0.3)
    plt.setp(legend.get_title(), fontsize=7)
    plt.setp(legend.get_texts(), fontsize=7)
    plt.tick_params(axis='both', labelsize=7)
    plt.xlabel(f"Dim 1 ({variance_percent[0]}%)", fontsize=7)
    plt.ylabel(f"Dim 2 ({variance_percent[1]}%)", fontsize=7)
    plt.show()
#---------------------- plot for time series------------------------------------------------
def ev_tsplot(X, V, clus, plot_centers=True):
    """
    Plot the results of an evidential clustering algorithm for time series.

    Each object is assigned to the focal set carrying its largest mass (focal
    sets are named with ``get_ensembles``); one subplot is drawn per distinct
    assigned focal set, laid out on a near-square grid.

    Parameters
    -----------
    X : array-like
        The time series data, one series per row. It is indexed with a boolean
        mask, so a NumPy array is expected.
    V : array-like
        The medoid (center) time series. ``V[idx]`` is drawn in the idx-th
        subplot, where subplots follow the sorted distinct labels.
        NOTE(review): assumes V holds one series per plotted group in that
        order - confirm against the caller.
    clus : dict
        The clustering results, with 'mass' and 'F' keys.
    plot_centers : bool, optional
        If True, plot only the cluster centers, in color. If False, plot the
        individual series in color and the cluster centers in black.
        Default is True.

    Returns
    --------
    fig : matplotlib Figure
        The created figure.
    axes : ndarray of Axes
        Always a 2-D array of shape (grid_rows, grid_cols), also when the grid
        has a single row or a single cell.
    """
    # Get the cluster labels from 'clus'
    masses = pd.DataFrame(clus['mass'])
    masses.columns = get_ensembles(clus['F'])
    cluster = pd.Categorical(masses.idxmax(axis=1))

    # Number of clusters
    unique_clusters = np.unique(cluster)
    k = len(unique_clusters)

    # Near-square grid with at least k cells
    grid_cols = int(np.ceil(np.sqrt(k)))
    grid_rows = int(np.ceil(k / grid_cols))

    # squeeze=False keeps `axes` two-dimensional even for 1 x 1 and 1 x n
    # grids, which would otherwise come back as a bare Axes / 1-D array.
    fig, axes = plt.subplots(nrows=grid_rows, ncols=grid_cols, figsize=(10, 6),
                             squeeze=False)
    plt.rcParams["figure.dpi"] = 100

    colors = plt.cm.viridis(np.linspace(0, 1, k))

    # Row-major walk over the grid: cell number idx hosts group number idx.
    for idx, ax in enumerate(axes.ravel()):
        if idx >= k:
            ax.axis('off')  # unused cell
            continue
        if plot_centers:
            ax.plot(V[idx], color=colors[idx], linewidth=2)
        else:
            cluster_series = X[cluster == unique_clusters[idx]]
            for series in cluster_series:
                ax.plot(series, color=colors[idx], alpha=0.5)
            ax.plot(V[idx], color='black', linewidth=2)
        ax.set_title(f'Cluster {unique_clusters[idx]}')
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

    plt.tight_layout()
    plt.show()
    return fig, axes