Source code for preprocess

import warnings
from typing import Optional, Union, Callable, List

import numpy
import pandas
import seaborn
from matplotlib import axes, pyplot, dates, ticker
from scipy.cluster import hierarchy


[docs]def visualize_feature(series: pandas.Series, remove_na: bool = False, *, ax: Optional[axes.Axes] = None,
                      **kwargs) -> axes.Axes:
    """
    Visualize a feature series:

    * If the feature is float then the method plots the distribution plot.
    * If the feature is datetime then the method plots a line plot of progression of amount thought time.
    * If the feature is object, categorical, boolean or integer then the method plots count plot (histogram).

    :param series: the data series.
    :param remove_na: True to ignore NA values when plotting; False otherwise.
    :param ax: Axes in which to draw the plot, otherwise use the currently-active Axes.
    :param kwargs: other keyword arguments

                   All other keyword arguments are passed to ``matplotlib.axes.Axes.pcolormesh()``.
    :return: Returns the Axes object with the plot drawn onto it.
    """
    if ax is None:
        pyplot.figure()
        ax = pyplot.gca()

    if remove_na:
        feature_series = series.dropna()
    else:
        feature_series = series

    if str(feature_series.dtype).startswith("float"):
        seaborn.histplot(feature_series, ax=ax, kde=True, **kwargs)
        labels = ax.get_xticks()
    elif str(feature_series.dtype).startswith("datetime"):
        feature_series.value_counts().plot(kind="line", ax=ax, **kwargs)
        labels = ax.get_xticks()
    else:
        seaborn.countplot(x=_copy_series_or_keep_top_10(feature_series), ax=ax, **kwargs)
        labels = ax.get_xticklabels()

    if not ax.get_title():
        ax.set_title(f"{feature_series.name} ({feature_series.dtype})")
        ax.set_xlabel("")

    ticks_loc = ax.get_xticks().tolist()
    ax.xaxis.set_major_locator(ticker.FixedLocator(ticks_loc))
    ax.set_xticklabels(labels, rotation=45, horizontalalignment='right')

    if str(feature_series.dtype).startswith("datetime"):
        ax.xaxis.set_major_formatter(_convert_numbers_to_dates)

    return ax


[docs]def get_correlated_features(data_frame: pandas.DataFrame, features: List[str], target_feature: str,
                            threshold: float = 0.95, method: Union[str, Callable] = 'pearson',
                            min_periods: Optional[int] = 1) -> pandas.DataFrame:
    """
    Calculate which features correlated above a threshold and extract a data frame with the correlations and correlation
    to the target feature.

    :param data_frame: the data frame.
    :param features: list of features names.
    :param target_feature: name of target feature.
    :param threshold: the threshold (default 0.95).
    :param method: {‘pearson’, ‘kendall’, ‘spearman’} or callable

                   Method of correlation:

                   * pearson : standard correlation coefficient
                   * kendall : Kendall Tau correlation coefficient
                   * spearman : Spearman rank correlation
                   * callable: callable with input two 1d ndarrays and returning a float. Note that the returned matrix from corr will have 1 along the diagonals and will be symmetric regardless of the callable’s behavior.

    :param min_periods: Minimum number of observations required per pair of columns to have a valid result. Currently only available for Pearson and Spearman correlation.
    :return: data frame with the correlations and correlation to the target feature.
    """
    correlations = _calc_corrections(data_frame[features + [target_feature]], method, min_periods)
    target_corr = correlations[target_feature].transpose()
    features_corr = correlations.loc[features, features]
    corr_matrix = features_corr.where(numpy.triu(numpy.ones(features_corr.shape), k=1).astype(numpy.bool_))
    corr_matrix = corr_matrix[(~numpy.isnan(corr_matrix))].stack().reset_index()
    corr_matrix = corr_matrix[corr_matrix[0].abs() >= threshold]
    if corr_matrix.shape[0] > 0:
        corr_matrix["level_0_target_corr"] = target_corr[corr_matrix["level_0"]].values.tolist()[0]
        corr_matrix["level_1_target_corr"] = target_corr[corr_matrix["level_1"]].values.tolist()[0]
        corr_matrix = corr_matrix.rename({0: "level_0_level_1_corr"}, axis=1).reset_index(drop=True)
        return corr_matrix
    else:
        warnings.warn(f"Correlation threshold {threshold} was too high. An empty frame was returned", UserWarning)
        return pandas.DataFrame(
            columns=['level_0', 'level_1', 'level_0_level_1_corr', 'level_0_target_corr', 'level_1_target_corr'])


[docs]def visualize_correlations(data: pandas.DataFrame, method: Union[str, Callable] = 'pearson',
                           min_periods: Optional[int] = 1, *, ax: Optional[axes.Axes] = None,
                           **kwargs) -> axes.Axes:
    """
    Compute pairwise correlation of columns, excluding NA/null values, and visualize it with heat map.
    `Original code <https://seaborn.pydata.org/examples/many_pairwise_correlations.html>`_

    :param data: the data frame, were each feature is a column.
    :param method: {‘pearson’, ‘kendall’, ‘spearman’} or callable

                   Method of correlation:

                   * pearson : standard correlation coefficient
                   * kendall : Kendall Tau correlation coefficient
                   * spearman : Spearman rank correlation
                   * callable: callable with input two 1d ndarrays and returning a float. Note that the returned matrix from corr will have 1 along the diagonals and will be symmetric regardless of the callable’s behavior.

    :param min_periods: Minimum number of observations required per pair of columns to have a valid result. Currently only available for Pearson and Spearman correlation.
    :param ax: Axes in which to draw the plot, otherwise use the currently-active Axes.
    :param kwargs: other keyword arguments

                   All other keyword arguments are passed to ``matplotlib.axes.Axes.pcolormesh()``.
    :return: Returns the Axes object with the plot drawn onto it.
    """
    if ax is None:
        pyplot.figure()
        ax = pyplot.gca()

    corr = _calc_corrections(data, method, min_periods)
    mask = numpy.triu(numpy.ones_like(corr, dtype=numpy.bool_))
    seaborn.heatmap(corr, mask=mask, annot=True, fmt=".3f", ax=ax, **kwargs)
    return ax


[docs]def plot_correlation_dendrogram(data: pandas.DataFrame, correlation_method: Union[str, Callable] = 'pearson',
                                min_periods: Optional[int] = 1,
                                cluster_distance_method: Union[str, Callable] = "average", *,
                                ax: Optional[axes.Axes] = None,
                                **kwargs) -> axes.Axes:
    """
    Plot dendrogram of a correlation matrix. This consists of a chart that that shows hierarchically the variables that
    are most correlated by the connecting trees. The closer to the right that the connection is, the more correlated the features are.

    :param data: the data frame, were each feature is a column.
    :param correlation_method: {‘pearson’, ‘kendall’, ‘spearman’} or callable

                   Method of correlation:

                   * pearson : standard correlation coefficient
                   * kendall : Kendall Tau correlation coefficient
                   * spearman : Spearman rank correlation
                   * callable: callable with input two 1d ndarrays and returning a float. Note that the returned matrix from corr will have 1 along the diagonals and will be symmetric regardless of the callable’s behavior.

    :param min_periods: Minimum number of observations required per pair of columns to have a valid result. Currently only available for Pearson and Spearman correlation.
    :param cluster_distance_method: The following are methods for calculating the distance between the newly formed cluster.

            Methods of linkage:

            * single: This is also known as the Nearest Point Algorithm.
            * complete: This is also known by the Farthest Point Algorithm or Voor Hees Algorithm.
            * average:
            .. math:: d(u,v) = \\sum_{ij} \\frac{d(u[i], v[j])}{(|u|*|v|)}

            This is also called the UPGMA algorithm.

            * weighted:
            .. math:: d(u,v) = (dist(s,v) + dist(t,v))/2

            where cluster u was formed with cluster s and t and v
            is a remaining cluster in the forest. (also called WPGMA)

            * centroid: Euclidean distance between the centroids
            * median: This is also known as the WPGMC algorithm.
            * ward: uses the Ward variance minimization algorithm.

            see `scipy.cluster.hierarchy.linkage <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html>`_ for more information.

    :param ax: Axes in which to draw the plot, otherwise use the currently-active Axes.
    :param kwargs: other keyword arguments

                   All other keyword arguments are passed to ``matplotlib.axes.Axes.pcolormesh()``.
    :return: Returns the Axes object with the plot drawn onto it.
    """

    if ax is None:
        pyplot.figure()
        ax = pyplot.gca()

    corr = _calc_corrections(data, correlation_method, min_periods)
    # reverse the distance
    corr_condensed = hierarchy.distance.squareform(1 - corr)
    z = hierarchy.linkage(corr_condensed, method=cluster_distance_method)
    ax.set(**kwargs)
    hierarchy.dendrogram(z, labels=numpy.asarray(data.columns.tolist()), orientation="left", ax=ax)
    return ax


def _calc_corrections(data, method, min_periods):
    return data.apply(lambda x: x.factorize()[0]).corr(method=method, min_periods=min_periods)


[docs]def plot_features_interaction(feature_1: str, feature_2: str, data: pandas.DataFrame, *,
                              ax: Optional[axes.Axes] = None, **kwargs) -> axes.Axes:
    """
    Plots the joint distribution between two features:

    * If both features are either categorical, boolean or object then the method plots the shared histogram.
    * If one feature is either categorical, boolean or object and the other is numeric then the method plots a boxplot chart.
    * If one feature is datetime and the other is numeric or datetime then the method plots a line plot graph.
    * If one feature is datetime and the other is either categorical, boolean or object the method plots a violin plot (combination of boxplot and kernel density estimate).
    * If both features are numeric then the method plots scatter graph.

    :param feature_1: the name of the first feature.
    :param feature_2: the name of the second feature.
    :param data: the data frame, were each feature is a column.
    :param ax: Axes in which to draw the plot, otherwise use the currently-active Axes.
    :param kwargs: other keyword arguments

                   All other keyword arguments are passed to ``matplotlib.axes.Axes.pcolormesh()``.
    :return: Returns the Axes object with the plot drawn onto it.
    """
    if ax is None:
        pyplot.figure()
        ax = pyplot.gca()

    dup_df = pandas.DataFrame()
    if str(data[feature_1].dtype) in ["object", "category", "bool"]:
        dup_df[feature_1] = _copy_series_or_keep_top_10(data[feature_1])
        if str(data[feature_2].dtype) in ["object", "category", "bool"]:
            # both features are categorical
            dup_df[feature_2] = _copy_series_or_keep_top_10(data[feature_2])
            group_feature_1 = dup_df[feature_1].unique().tolist()
            ax.hist([dup_df.loc[dup_df[feature_1] == value, feature_2] for value in group_feature_1],
                    label=group_feature_1, **kwargs)
            ax.set_xlabel(feature_1)
            ax.legend(title=feature_2)
        elif str(data[feature_2].dtype).startswith("datetime"):
            # first feature is categorical and the second is datetime
            dup_df[feature_2] = data[feature_2].apply(dates.date2num)
            chart = seaborn.violinplot(x=feature_2, y=feature_1, data=dup_df, ax=ax)
            ticks_loc = chart.get_xticks().tolist()
            chart.xaxis.set_major_locator(ticker.FixedLocator(ticks_loc))
            chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
            ax.xaxis.set_major_formatter(_convert_numbers_to_dates)
        else:
            # first feature is categorical and the second is numeric
            dup_df[feature_2] = data[feature_2]
            chart = seaborn.boxplot(x=feature_1, y=feature_2, data=dup_df, ax=ax, **kwargs)
            chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
    elif str(data[feature_1].dtype).startswith("datetime"):
        if str(data[feature_2].dtype) in ["object", "category", "bool"]:
            # first feature is datetime and the second is categorical
            dup_df[feature_1] = data[feature_1].apply(dates.date2num)
            dup_df[feature_2] = _copy_series_or_keep_top_10(data[feature_2])
            chart = seaborn.violinplot(x=feature_1, y=feature_2, data=dup_df, ax=ax)
            ticks_loc = chart.get_xticks().tolist()
            chart.xaxis.set_major_locator(ticker.FixedLocator(ticks_loc))
            chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
            ax.xaxis.set_major_formatter(_convert_numbers_to_dates)
        else:
            # first feature is datetime and the second is numeric or datetime
            ax.plot(data[feature_1], data[feature_2], **kwargs)
            ax.set_xlabel(feature_1)
            ax.set_ylabel(feature_2)
    elif str(data[feature_2].dtype) in ["object", "category", "bool"]:
        # first feature is numeric and the second is categorical
        dup_df[feature_2] = _copy_series_or_keep_top_10(data[feature_2])
        dup_df[feature_1] = data[feature_1]
        chart = seaborn.boxplot(x=feature_2, y=feature_1, data=dup_df, ax=ax, **kwargs)
        chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
    elif str(data[feature_2].dtype).startswith("datetime"):
        # first feature is numeric and the second is datetime
        ax.plot(data[feature_2], data[feature_1], **kwargs)
        ax.set_xlabel(feature_2)
        ax.set_ylabel(feature_1)
    else:
        # both features are numeric
        ax.scatter(data[feature_1], data[feature_2], **kwargs)
        ax.set_xlabel(feature_1)
        ax.set_ylabel(feature_2)

    return ax


def _copy_series_or_keep_top_10(series: pandas.Series) -> pandas.Series:
    if str(series.dtype) == "bool":
        # avoiding RuntimeWarning from numpy (Converting input from bool to <class 'numpy.uint8'> for compatibility.)
        return series.apply(lambda val: "True" if val else "False")
    if len(series.unique().tolist()) > 10:
        top10 = series.value_counts()[:10].index.tolist()
        return series.apply(lambda val: val if val in top10 else "Other values")
    return series


@pyplot.FuncFormatter
def _convert_numbers_to_dates(x, pos):
    return dates.num2date(x).strftime('%Y-%m-%d %H:%M')