Source code for strings

import re
from typing import List, Tuple, Optional, Callable, Union

import pandas
from sklearn.feature_extraction.text import CountVectorizer


def _tokenize(text_tags: str) -> List[str]:
    """Split a comma-separated tag string into clean tokens, keeping only
    alphanumeric characters and the special characters "_", "$" and "-"."""
    tags = text_tags.split(",")
    tags = [re.sub(r"[^a-zA-Z0-9_$-]", "", x) for x in tags]
    tags = [x.strip() for x in tags]
    tags = [x for x in tags if len(x) > 0]
    return tags
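
# A quick illustration of _tokenize's behavior (doctest-style, illustrative only):
# characters outside [a-zA-Z0-9_$-] are stripped from each comma-separated token,
# and empty tokens are discarded.
#
#   >>> _tokenize("python, data-science , c++ , ")
#   ['python', 'data-science', 'c']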


def append_tags_to_frame(X_train: pandas.DataFrame, X_test: pandas.DataFrame, field_name: str,
                         prefix: Optional[str] = "", max_features: Optional[int] = 500,
                         min_df: Union[int, float] = 1, lowercase: bool = False,
                         tokenizer: Optional[Callable[[str], List[str]]] = _tokenize) \
        -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """
    Extracts tags from a given field and appends them as dataframe columns.

    :param X_train: Pandas' dataframe with the train features.
    :param X_test: Pandas' dataframe with the test features.
    :param field_name: the feature to parse.
    :param prefix: the prefix for each new tag feature.
    :param max_features: int or None, default=500. Maximum number of tag names to consider.
    :param min_df: float in range [0.0, 1.0] or int, default=1. When building the tag name set,
           ignore tags that have a document frequency strictly lower than the given threshold.
           If float, the parameter represents a proportion of documents; if integer, absolute
           counts.
    :param lowercase: boolean, default=False. Convert all characters to lowercase before
           tokenizing the tag names.
    :param tokenizer: callable or None. Override the string tokenization step while preserving
           the preprocessing and n-grams generation steps. The default splits by "," and retains
           alphanumeric characters along with the special characters "_", "$" and "-".
    :return: the train and test dataframes with the tags appended.
    """
    vectorizer = CountVectorizer(binary=True, tokenizer=tokenizer, encoding="latin1",
                                 lowercase=lowercase, min_df=min_df, max_features=max_features)
    x_train_count_matrix = vectorizer.fit_transform(X_train[field_name].dropna())
    x_train_tags = pandas.DataFrame(x_train_count_matrix.toarray(),
                                    columns=[prefix + tag_name for tag_name in
                                             vectorizer.get_feature_names_out()])
    # Align the tag rows with the rows that actually had tags; rows dropped by
    # dropna() end up with NaN tag columns after the left merge below.
    x_train_tags.index = X_train[field_name].dropna().index

    x_test_count_matrix = vectorizer.transform(X_test[field_name].dropna())
    x_test_tags = pandas.DataFrame(x_test_count_matrix.toarray(),
                                   columns=[prefix + tag_name for tag_name in
                                            vectorizer.get_feature_names_out()])
    x_test_tags.index = X_test[field_name].dropna().index

    x_train_reduced = X_train.drop(columns=[field_name])
    x_test_reduced = X_test.drop(columns=[field_name])
    return (pandas.merge(x_train_reduced, x_train_tags, left_index=True, right_index=True, how="left"),
            pandas.merge(x_test_reduced, x_test_tags, left_index=True, right_index=True, how="left"))
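
# Minimal usage sketch for append_tags_to_frame (illustrative only; the frames and
# the "tags" column below are made up). Each comma-separated tag becomes a binary
# indicator column carrying the given prefix:
#
#   train = pandas.DataFrame({"id": [1, 2], "tags": ["python,pandas", "python,numpy"]})
#   test = pandas.DataFrame({"id": [3], "tags": ["pandas"]})
#   train_out, test_out = append_tags_to_frame(train, test, "tags", prefix="tag_")
#   # train_out columns: id, tag_numpy, tag_pandas, tag_python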


def extract_significant_terms_from_subset(data_frame: pandas.DataFrame,
                                          subset_data_frame: pandas.DataFrame, field_name: str,
                                          vectorizer: CountVectorizer = CountVectorizer(
                                              encoding="latin1", lowercase=True,
                                              max_features=500)) -> pandas.Series:
    """
    Returns interesting or unusual occurrences of terms in a subset.

    Based on the `elasticsearch significant_text aggregation
    <https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-significantterms-aggregation.html#_scripted>`_.

    :param data_frame: the full data set.
    :param subset_data_frame: the subset of the data over which the scoring will be calculated.
           Can be a filter by feature value or any other boolean criteria.
    :param field_name: the feature to parse.
    :param vectorizer: text count vectorizer which converts a collection of text to a matrix of
           token counts. See more info `here
           <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_ .
    :return: Series of terms with their scoring over the subset.

    :author: `Eran Hirsch <https://github.com/eranhirs>`_
    """
    count_matrix = vectorizer.fit_transform(data_frame[field_name].dropna())
    matrix_df = pandas.DataFrame(count_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    subset_x = vectorizer.transform(subset_data_frame[field_name].dropna())
    subset_matrix_df = pandas.DataFrame(subset_x.toarray(), columns=vectorizer.get_feature_names_out())
    subset_freq = subset_matrix_df.sum()
    superset_freq = matrix_df.sum()
    # Score each term by its frequency in the subset relative to its frequency in
    # the rest of the corpus; the +1 smoothing avoids division by zero.
    return (subset_freq / (superset_freq - subset_freq + 1)).sort_values(ascending=False)
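
# Minimal usage sketch for extract_significant_terms_from_subset (illustrative only;
# the frame and the filter below are made up). Terms frequent in the subset but rare
# in the rest of the corpus score highest:
#
#   df = pandas.DataFrame({"group": ["a", "a", "b"],
#                          "text": ["spark spark hadoop", "spark kafka", "excel word"]})
#   subset = df[df["group"] == "a"]
#   scores = extract_significant_terms_from_subset(df, subset, "text")
#   # "spark" scores 3.0 (3 subset hits, 0 elsewhere); "excel" scores 0.0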