# Source code for strings

"""String manipulation utilities for data science tasks."""

import re
from typing import List, Tuple, Optional, Callable, Union

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


def _tokenize(text_tags: str) -> List[str]:
    """Split a comma-separated tag string into cleaned, non-empty tags.

    Each comma-delimited piece has every character removed that is not an
    ASCII letter, a digit, ``_``, ``$`` or ``-`` (this includes whitespace).
    Pieces that end up empty are dropped.

    :param text_tags: Raw tag string, with tags separated by ``","``.
    :return: The cleaned tags, in their original order.
    """
    cleaned_tags = (
        re.sub(r"[^a-zA-Z0-9_$-]", "", raw_tag).strip()
        for raw_tag in text_tags.split(",")
    )
    return [tag for tag in cleaned_tags if tag]


def _append_columns_by_position(base: pd.DataFrame, extra: pd.DataFrame) -> pd.DataFrame:
    """Append the columns of ``extra`` to ``base`` row by row, keeping ``base``'s index."""
    # Joining directly on the frame index pairs every row with every other row
    # that shares its label, so a non-unique index would multiply rows. Both
    # frames hold the same rows in the same order, so join on a temporary
    # positional index and restore the original index afterwards.
    combined = pd.merge(
        base.reset_index(drop=True),
        extra.reset_index(drop=True),
        left_index=True,
        right_index=True,
        how="left",
    )
    combined.index = base.index
    return combined


def append_tags_to_frame(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    field_name: str,
    prefix: str = "",
    max_features: Optional[int] = 500,
    min_df: Union[int, float] = 1,
    lowercase: bool = False,
    tokenizer: Optional[Callable[[str], List[str]]] = _tokenize,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Extract tags from a given field and append them to the dataframe.

    The tag vocabulary is learned from ``X_train`` only and then applied to
    ``X_test``. Each tag becomes a binary (0/1) column, and the original
    ``field_name`` column is dropped from both returned frames. Missing values
    in the field are treated as "no tags".

    :param X_train: Pandas DataFrame with the train features.
    :param X_test: Pandas DataFrame with the test features.
    :param field_name: The feature to parse.
    :param prefix: The prefix for new tag features.
    :param max_features: Maximum number of tag names to consider. Default is
        500. This helps limit the number of new columns created, especially
        useful for datasets with a large number of unique tags.
    :param min_df: When building the tag name set, ignore tags with a document
        frequency strictly lower than the given threshold. If min_df is a
        float, the parameter represents a proportion of documents. If integer,
        it represents absolute counts. Default is 1. This helps filter out
        rare tags.
    :param lowercase: Convert all characters to lowercase before tokenizing
        the tag names. Default is False. Set to True if you want
        case-insensitive tag matching.
    :param tokenizer: Callable to override the string tokenization step while
        preserving the preprocessing and n-grams generation steps. Default
        splits by ",", and retains alphanumeric characters with special
        characters "_", "$", and "-". Pass ``None`` to fall back to
        ``CountVectorizer``'s own default tokenization.
    :return: The train and test DataFrames with tags appended. If ``X_train``
        is empty, two empty DataFrames are returned.
    :raise KeyError: if ``field_name`` is missing from one of the frames.
    """
    if X_train.empty:
        return pd.DataFrame(), pd.DataFrame()

    # ``token_pattern=None`` only makes sense together with a custom tokenizer
    # (it silences the "token_pattern will not be used" warning). Without a
    # tokenizer the vectorizer needs its default pattern, so leave it alone.
    tokenizer_options = (
        {} if tokenizer is None else {"tokenizer": tokenizer, "token_pattern": None}
    )
    vectorizer = CountVectorizer(
        binary=True,
        encoding="utf-8",
        lowercase=lowercase,
        min_df=min_df,
        max_features=max_features,
        **tokenizer_options,
    )

    train_text = X_train[field_name].fillna("")
    test_text = X_test[field_name].fillna("")

    # Fit on train only; test is encoded with the vocabulary learned from train.
    train_counts = vectorizer.fit_transform(train_text)
    test_counts = vectorizer.transform(test_text)

    feature_names = [prefix + tag_name for tag_name in vectorizer.get_feature_names_out()]
    train_tags = pd.DataFrame(train_counts.toarray(), columns=feature_names)
    test_tags = pd.DataFrame(test_counts.toarray(), columns=feature_names)

    return (
        _append_columns_by_position(X_train.drop(columns=[field_name]), train_tags),
        _append_columns_by_position(X_test.drop(columns=[field_name]), test_tags),
    )
def extract_significant_terms_from_subset(
    data_frame: pd.DataFrame,
    subset_data_frame: pd.DataFrame,
    field_name: str,
    vectorizer: Optional[CountVectorizer] = None,
) -> pd.Series:
    """Return interesting or unusual occurrences of terms in a subset.

    Based on the elasticsearch significant_text aggregation:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-significantterms-aggregation.html#_scripted

    Each term is scored as ``subset_freq / (superset_freq - subset_freq + 1)``,
    where the frequencies are the term's total counts in the subset and in the
    full dataset. Rows whose field value is missing are ignored.

    :param data_frame: The full dataset.
    :param subset_data_frame: The subset partition data over which the scoring
        will be calculated. It can be filtered by feature or other boolean
        criteria.
    :param field_name: The feature to parse.
    :param vectorizer: Text count vectorizer which converts a collection of
        text to a matrix of token counts. It is fitted on ``data_frame`` by
        this call. When omitted, a new
        ``CountVectorizer(encoding="utf-8", lowercase=True, max_features=500)``
        is created. See more info here:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    :return: Series of terms with scoring over the subset, sorted from the
        highest score to the lowest. An empty float Series is returned when
        there is no text to analyse.

    :author: Eran Hirsch (https://github.com/eranhirs)
    """
    if data_frame.empty:
        # Explicit dtype keeps the empty result consistent with the float
        # scores returned for non-empty input.
        return pd.Series(dtype="float64")

    superset_text = data_frame[field_name].dropna()
    if superset_text.empty:
        # Every value is missing: there is no vocabulary to fit.
        return pd.Series(dtype="float64")

    if vectorizer is None:
        # Created per call: an instance in the signature would be built once
        # at import time and then shared (and re-fitted) by every caller.
        vectorizer = CountVectorizer(encoding="utf-8", lowercase=True, max_features=500)

    superset_counts = vectorizer.fit_transform(superset_text)
    terms = vectorizer.get_feature_names_out()
    superset_freq = pd.DataFrame(superset_counts.toarray(), columns=terms).sum()

    # The subset is encoded with the vocabulary learned from the full dataset.
    subset_counts = vectorizer.transform(subset_data_frame[field_name].dropna())
    subset_freq = pd.DataFrame(subset_counts.toarray(), columns=terms).sum()

    # +1 keeps the denominator non-zero for terms that only occur in the subset.
    scores = subset_freq / (superset_freq - subset_freq + 1)
    return scores.sort_values(ascending=False)