Source code for symdet.utils.data_clustering

"""
This program and the accompanying materials are made available under the terms of the
Eclipse Public License v2.0 which accompanies this distribution, and is available at
https://www.eclipse.org/legal/epl-v20.html
SPDX-License-Identifier: EPL-2.0

Copyright Contributors to the Zincware Project.

Description: Methods to help with clustering data.
"""
import tensorflow as tf
import numpy as np
from typing import Tuple
import sys


def _build_condlist(data: np.array, bin_values: dict) -> Tuple:
    """
    Build the condition list for the piecewise implementation.

    Parameters
    ----------
    data : np.array
            Data on which to apply conditions.
    bin_values : np.array
            Bin numbers for the constraint.

    Returns
    -------
    conditions : list
            These are conditions applied to the data.
    classes : list
            Class keys.
    """

    conditions = []
    classes = []
    for key in bin_values:
        conditions.append(
            np.logical_and(
                data >= (bin_values[key][0]), data <= (bin_values[key][1])
            )
        )
        classes.append(key)

    return conditions, classes


def _function_to_bins(function_values: tf.Tensor, bin_values: dict) -> tf.Tensor:
    """
    Sort function values into bins.

    Parameters
    ----------
    function_values : np.array
            Function values corresponding to radii values.
    bin_values : dict
            bin dictionary where keys are the class numbers.

    Returns
    -------
    conditions : tf.Tensor
            Conditions from the cond list build.
    """

    conditions, functions = _build_condlist(function_values, bin_values)

    return tf.convert_to_tensor(conditions)


[docs]def range_binning( image: tf.Tensor, domain: tf.Tensor, value_range: list, bin_operation: list, representatives: int = 100) -> dict: """ A method to apply simple range binning to some data. Parameters ---------- image : tf.Tensor data to cluster. domain : tf.Tensor data pool to return clustered. representatives : int Number of class representatives to have for each bin. value_range : list The parameters within which to bin e.g. k in [-5, 5] bin_operation : list Operation to apply to the bins e.g [1/5, 1e-3] will lead to bins of the form [k/5 - 1e-3, k/5 + 1e-3] Returns ------- classes : dict Data class numbers and their data representatives as a dictionary. """ # Construct the classes and their range. bin_values = {} n_classes = (value_range[1] - value_range[0]) + 1 for k in np.linspace(value_range[0], value_range[1], n_classes, dtype=int): bin_values[f"{k + abs(value_range[0])}"] = [ bin_operation[0] * k - bin_operation[1], bin_operation[0] * k + bin_operation[1], ] # Collect the bin masks bin_masks = _function_to_bins(image, bin_values) # Check that there is enough data in each class. bin_count = tf.reduce_sum(tf.cast(bin_masks, tf.int8), 1) if any(bin_count) < representatives: print("WARNING: Not enough data! Some classes will be under-represented.") class_keys = list(bin_values.keys()) clustered_data = {} for i in range(len(class_keys)): clustered_data[class_keys[i]] = {} filtered_domain = tf.boolean_mask(domain, bin_masks[i]) filtered_image = tf.boolean_mask(image, bin_masks[i]) clustered_data[class_keys[i]]['domain'] = filtered_domain[0:representatives] clustered_data[class_keys[i]]['image'] = filtered_image[0:representatives] return clustered_data
[docs]def compute_com(data: np.ndarray): """ Compute the center of mass of some data. Parameters ---------- data : np.ndarray Data on which to compute the center of mass. Returns ------- """ return tf.reduce_mean(data, axis=0)
[docs]def compute_radius_of_gyration(data: np.ndarray, com: np.ndarray): """ Compute the radius of gyration of some data. Parameters ---------- data : np.ndarray com : np.ndarray Returns ------- """ rg_primitive = tf.reduce_sum((data - com)**2, axis=1) return tf.reduce_mean(rg_primitive, axis=0)