Source code for strings

"""String manipulation utilities for data science tasks."""

import re
from typing import List, Tuple, Optional, Callable, Union

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


def _tokenize(text_tags: str) -> List[str]:
    """Split a comma-separated tag string into a list of cleaned tag names.

    Each comma-delimited piece keeps only ASCII letters, digits, "_", "$" and "-";
    every other character (whitespace included) is removed. Pieces that end up
    empty are discarded.

    :param text_tags: Raw tag string, with tags separated by ",".
    :return: The cleaned, non-empty tags in their original order.
    """
    # Lazily clean each piece; the filtering comprehension below consumes it once.
    cleaned_pieces = (re.sub(r"[^a-zA-Z0-9_$-]", "", piece).strip() for piece in text_tags.split(","))
    return [tag for tag in cleaned_pieces if tag]



[docs]
def append_tags_to_frame(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    field_name: str,
    prefix: str = "",
    max_features: Optional[int] = 500,
    min_df: Union[int, float] = 1,
    lowercase: bool = False,
    tokenizer: Optional[Callable[[str], List[str]]] = _tokenize,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Extract tags from a given field and append them to the dataframe.

    The tag vocabulary is learned from ``X_train`` only and then applied to ``X_test``, so both
    returned frames get the same tag columns. Each tag column is a binary indicator
    (the vectorizer is created with ``binary=True``). The original ``field_name`` column is
    dropped from both results. Missing values in ``field_name`` are treated as an empty string.

    Rows are matched to their tag indicators by position, so frames whose index contains
    repeated labels keep exactly one output row per input row. The original index of each
    frame is preserved on the result.

    :param X_train: Pandas DataFrame with the train features.
    :param X_test: Pandas DataFrame with the test features.
    :param field_name: The feature to parse.
    :param prefix: The prefix for new tag features.
    :param max_features: Maximum number of tag names to consider. Default is 500. This helps limit the number of
                         new columns created, especially useful for datasets with a large number of unique tags.
    :param min_df: When building the tag name set, ignore tags with a document frequency strictly
                   lower than the given threshold. If min_df is a float, the parameter represents a proportion
                   of documents. If integer, it represents absolute counts. Default is 1. This helps filter out
                   rare tags.
    :param lowercase: Convert all characters to lowercase before tokenizing the tag names. Default is False. Set to
                      True if you want case-insensitive tag matching.
    :param tokenizer: Callable to override the string tokenization step while preserving the
                      preprocessing and n-grams generation steps. Default splits by ",", and
                      retains alphanumeric characters with special characters "_", "$", and "-".
    :return: The train and test DataFrames with tags appended. If ``X_train`` is empty, two empty
             DataFrames are returned.
    :raise KeyError: if one of the frames is missing columns.
    """
    # Nothing to learn a vocabulary from: bail out before building the vectorizer.
    if X_train.empty:
        return pd.DataFrame(), pd.DataFrame()

    vectorizer = CountVectorizer(
        binary=True,
        tokenizer=tokenizer,
        encoding="utf-8",
        lowercase=lowercase,
        min_df=min_df,
        max_features=max_features,
        token_pattern=None,  # a custom tokenizer is supplied, so the regex pattern is unused
    )

    # Fit on train only; test is transformed with the train vocabulary.
    train_counts = vectorizer.fit_transform(X_train[field_name].fillna(""))
    test_counts = vectorizer.transform(X_test[field_name].fillna(""))

    feature_names = [prefix + tag_name for tag_name in vectorizer.get_feature_names_out()]

    def _with_tags(frame: pd.DataFrame, count_matrix) -> pd.DataFrame:
        """Return ``frame`` without ``field_name``, with the tag columns attached row by row."""
        tags = pd.DataFrame(count_matrix.toarray(), columns=feature_names)
        reduced = frame.drop(columns=[field_name]).reset_index(drop=True)
        # Both sides now carry the same 0..n-1 positional index, so the join is strictly
        # one-to-one. Joining on the caller's index instead would be many-to-many (and
        # multiply rows) whenever that index contains repeated labels.
        combined = pd.merge(reduced, tags, left_index=True, right_index=True, how="left")
        combined.index = frame.index
        return combined

    return _with_tags(X_train, train_counts), _with_tags(X_test, test_counts)




[docs]
def extract_significant_terms_from_subset(
    data_frame: pd.DataFrame,
    subset_data_frame: pd.DataFrame,
    field_name: str,
    vectorizer: Optional[CountVectorizer] = None,
) -> pd.Series:
    """Return interesting or unusual occurrences of terms in a subset.

    Based on the elasticsearch significant_text aggregation:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-significantterms-aggregation.html#_scripted

    For every term in the vocabulary learned from ``data_frame``, the score is
    ``subset_freq / (superset_freq - subset_freq + 1)``, where ``subset_freq`` is the term count
    over ``subset_data_frame`` and ``superset_freq`` is the term count over ``data_frame``.
    Missing values in ``field_name`` are dropped before counting.

    :param data_frame: The full dataset.
    :param subset_data_frame: The subset partition data over which the scoring will be calculated.
                              It can be filtered by feature or other boolean criteria.
    :param field_name: The feature to parse.
    :param vectorizer: Text count vectorizer which converts a collection of text to a matrix of token counts.
                       When omitted, a new ``CountVectorizer(encoding="utf-8", lowercase=True,
                       max_features=500)`` is created for this call. A vectorizer passed in by the
                       caller is (re)fitted on ``data_frame``.
                       See more info here:
                       https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    :return: Series of terms with scoring over the subset, sorted from highest to lowest score.
             An empty Series is returned when ``data_frame`` is empty.

    :author: Eran Hirsch (https://github.com/eranhirs)
    """
    if data_frame.empty:
        return pd.Series()

    # Build the default per call: a vectorizer instance used as a default argument would be
    # created once at import time and then fitted (mutated) by every call that relies on it.
    if vectorizer is None:
        vectorizer = CountVectorizer(encoding="utf-8", lowercase=True, max_features=500)

    # The vocabulary comes from the full dataset; the subset is only projected onto it.
    count_matrix = vectorizer.fit_transform(data_frame[field_name].dropna())
    subset_x = vectorizer.transform(subset_data_frame[field_name].dropna())

    terms = vectorizer.get_feature_names_out()
    matrix_df = pd.DataFrame(count_matrix.toarray(), columns=terms)
    subset_matrix_df = pd.DataFrame(subset_x.toarray(), columns=terms)

    subset_freq = subset_matrix_df.sum()
    superset_freq = matrix_df.sum()

    # "+ 1" keeps the denominator non-zero for terms that occur only inside the subset.
    return (subset_freq / (superset_freq - subset_freq + 1)).sort_values(ascending=False)