idanmoradarthas / DataScienceUtils / build 21105664715

18 Jan 2026 03:58AM UTC. Coverage: 86.972% (-9.8%) from 96.765%.

Build 21105664715, a push via github, committed by idanmoradarthas on branch "linting".

2 of 2 new or added lines in 1 file covered (100.0%).
102 existing lines in 2 files are now uncovered.
721 of 829 relevant lines covered (86.97%).
10.44 hits per line.

Source File: /ds_utils/strings.py (96.67% covered)

"""String manipulation utilities for data science tasks."""

import re
from collections import Counter
from typing import List, Tuple, Optional, Callable, Union

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer


def _tokenize(text_tags: str) -> List[str]:
    """Split a comma-separated tag string into a list of cleaned tags."""
    tags = text_tags.split(",")
    tags = [re.sub(r"[^a-zA-Z0-9_$-]", "", x) for x in tags]  # keep letters, digits, "_", "$", "-"
    tags = [x.strip() for x in tags]
    tags = [x for x in tags if x]  # drop tokens left empty after cleaning
    return tags
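
# A quick illustration of the default tokenizer (values are illustrative):
#   _tokenize("python, data science, C++")  ->  ["python", "datascience", "C"]
# Spaces and "+" fall outside the allowed character class and are removed;
# tokens that end up empty (e.g., after a trailing comma) are dropped.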


def _normalize_tags(
    value: Union[str, List[str]],
    tokenizer: Callable[[str], List[str]],
    lowercase: bool,
) -> List[str]:
    """Normalize tag input to a list of strings.

    Handles both string inputs (which need tokenization) and list inputs
    (which are already tokenized).

    :param value: Either a string to tokenize or a list of tags.
    :param tokenizer: Tokenizer function to use for string inputs.
    :param lowercase: Whether to convert tags to lowercase.
    :return: List of normalized tag strings.
    """
    tags = []
    if isinstance(value, str):
        if value:  # non-empty string
            tags = tokenizer(value)
    elif isinstance(value, list):
        tags = value

    # Apply lowercase if requested; non-string tags are coerced to str first
    if lowercase:
        tags = [tag.lower() if isinstance(tag, str) else str(tag).lower() for tag in tags]

    return tags
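
# Both input shapes normalize to the same list (illustrative values):
#   _normalize_tags("Python, ML", _tokenize, lowercase=True)     -> ["python", "ml"]
#   _normalize_tags(["Python", "ML"], _tokenize, lowercase=True) -> ["python", "ml"]
# NaN is not handled here; callers fill missing values (e.g., with "") first.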


def append_tags_to_frame(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    field_name: str,
    prefix: str = "",
    max_features: Optional[int] = 500,
    min_df: Union[int, float] = 1,
    lowercase: bool = False,
    sparse: bool = False,
    tokenizer: Optional[Callable[[str], List[str]]] = _tokenize,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Extract tags from a column and append them as binarized features to the dataframe.

    This function processes a specified column in the train and test dataframes that contains tags.
    It supports columns with either string-based tags (e.g., "tag1,tag2") or list-based tags
    (e.g., ["tag1", "tag2"]). The function identifies a vocabulary of tags from the training data,
    filters them based on frequency, and then creates new binary columns for each tag.

    Supported Input Types for the Tags Column:
    - str: Comma-separated tags. The default tokenizer splits by comma, trims whitespace, and removes
           non-alphanumeric characters (except "_", "$", "-"). Empty strings are treated as having no tags.
    - List[str]: A pre-tokenized list of tags. Empty lists are treated as having no tags.
    - NaN/None: Handled as empty.

    Tokenization Rules (for string inputs):
    - The default tokenizer splits the input string by commas (",").
    - Whitespace around tags is automatically trimmed.
    - Duplicate tags within the same string (e.g., "tag1,tag1") are treated as a single occurrence for that row.
    - Casing is preserved unless `lowercase=True`.

    `min_df` Behavior:
    - This parameter filters out tags that are not frequent enough in the training data.
    - If `int`: The absolute minimum number of rows a tag must appear in to be included.
    - If `float` (between 0.0 and 1.0): The minimum fraction of rows a tag must appear in.
    - This filtering is applied *before* the final vocabulary is selected and binarized.

    Column Naming Logic:
    - The `prefix` argument is prepended to each tag to form the new column names.
    - Example: With `prefix="tag_"` and a tag "python", the resulting column will be "tag_python".

    Column Ordering:
    - The generated tag columns are always sorted alphabetically, ensuring a deterministic and stable
      order that can be relied upon for feature alignment in downstream modeling.

    :param X_train: Pandas DataFrame with the train features.
    :param X_test: Pandas DataFrame with the test features.
    :param field_name: The name of the column to parse for tags.
    :param prefix: A string prefix for the new binarized tag columns.
    :param max_features: The maximum number of tags to include, based on frequency. Default is 500.
    :param min_df: The minimum document frequency for a tag to be included. Can be an int or a float. Default is 1.
    :param lowercase: If True, all tags are converted to lowercase. Default is False.
    :param sparse: If True, returns a DataFrame with sparse columns. Default is False.
    :param tokenizer: A custom function to tokenize string inputs. Defaults to an internal tokenizer.
    :return: A tuple containing the transformed train and test DataFrames.
    :raises KeyError: If `field_name` is not in the input dataframes.
    """
    if X_train.empty:
        # No training data to learn a vocabulary from
        return pd.DataFrame(), pd.DataFrame()

    x_train_filled = X_train[field_name].fillna("")

    # Tokenize the training data (handles both strings and lists)
    train_tags = x_train_filled.apply(lambda x: _normalize_tags(x, tokenizer, lowercase))

    # Calculate document frequency (each tag counted once per row)
    doc_freq = Counter(tag for tags_list in train_tags for tag in set(tags_list))

    # Filter by min_df
    if isinstance(min_df, int):
        tags_to_keep = {tag for tag, freq in doc_freq.items() if freq >= min_df}
    else:  # float: treat min_df as a fraction of the training rows
        min_doc_count = min_df * len(X_train)
        tags_to_keep = {tag for tag, freq in doc_freq.items() if freq >= min_doc_count}
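        # For example (illustrative numbers): with min_df=0.05 and 200 training
        # rows, a tag must appear in at least 0.05 * 200 == 10 rows to be kept.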

    # Select top max_features by frequency
    if max_features is not None:
        # Sort by frequency (descending), then alphabetically for deterministic ordering
        top_tags = sorted(tags_to_keep, key=lambda tag: (-doc_freq[tag], tag))[:max_features]
        tags_to_keep = set(top_tags)

    # Filter the tokenized tags to only include those in tags_to_keep
    train_tags_filtered = train_tags.apply(lambda tags: [tag for tag in tags if tag in tags_to_keep])

    # Use MultiLabelBinarizer to create the binary matrix
    mlb = MultiLabelBinarizer(classes=sorted(tags_to_keep), sparse_output=sparse)
    x_train_binarized = mlb.fit_transform(train_tags_filtered)

    # Prepare test data (handles both strings and lists); tags outside the
    # training vocabulary are filtered out before binarizing
    test_tags = X_test[field_name].fillna("").apply(lambda x: _normalize_tags(x, tokenizer, lowercase))
    test_tags_filtered = test_tags.apply(lambda tags: [tag for tag in tags if tag in tags_to_keep])
    x_test_binarized = mlb.transform(test_tags_filtered)

    # Create DataFrames for the binarized tags
    feature_names = [prefix + tag_name for tag_name in mlb.classes_]
    if sparse:
        x_train_tags = pd.DataFrame.sparse.from_spmatrix(x_train_binarized, index=X_train.index, columns=feature_names)
        x_test_tags = pd.DataFrame.sparse.from_spmatrix(x_test_binarized, index=X_test.index, columns=feature_names)
    else:
        x_train_tags = pd.DataFrame(x_train_binarized, columns=feature_names, index=X_train.index)
        x_test_tags = pd.DataFrame(x_test_binarized, columns=feature_names, index=X_test.index)

    x_train_reduced = X_train.drop(columns=[field_name])
    x_test_reduced = X_test.drop(columns=[field_name])

    return (
        pd.merge(x_train_reduced, x_train_tags, left_index=True, right_index=True, how="left"),
        pd.merge(x_test_reduced, x_test_tags, left_index=True, right_index=True, how="left"),
    )
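

# A minimal usage sketch for append_tags_to_frame; the frames, tags, and
# column names below are illustrative:
#
#   train = pd.DataFrame({"id": [1, 2], "tags": ["python,pandas", "python"]})
#   test = pd.DataFrame({"id": [3], "tags": ["pandas,numpy"]})
#   x_train, x_test = append_tags_to_frame(train, test, field_name="tags", prefix="tag_")
#
# x_train keeps "id" and gains tag_pandas and tag_python (sorted alphabetically);
# "numpy" is absent from the training vocabulary, so the test row binarizes to
# tag_pandas=1, tag_python=0 and the unseen tag is silently dropped.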


def extract_significant_terms_from_subset(
    data_frame: pd.DataFrame,
    subset_data_frame: pd.DataFrame,
    field_name: str,
    vectorizer: Optional[CountVectorizer] = None,
) -> pd.Series:
    """Return interesting or unusual occurrences of terms in a subset.

    Based on the elasticsearch significant_text aggregation:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-significantterms-aggregation.html#_scripted

    :param data_frame: The full dataset.
    :param subset_data_frame: The subset partition data over which the scoring will be calculated.
                              It can be filtered by feature or other boolean criteria.
    :param field_name: The feature to parse.
    :param vectorizer: Text count vectorizer which converts a collection of text to a matrix of token counts.
                       Defaults to CountVectorizer(encoding="utf-8", lowercase=True, max_features=500).
                       See more info here:
                       https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    :return: Series of terms with scoring over the subset.

    :author: Eran Hirsch (https://github.com/eranhirs)
    """
    if vectorizer is None:
        # Avoid a mutable default argument: a CountVectorizer is mutated by
        # fit_transform, so a shared default instance would leak state
        # between calls.
        vectorizer = CountVectorizer(encoding="utf-8", lowercase=True, max_features=500)

    if data_frame.empty:
        return pd.Series(dtype=float)

    count_matrix = vectorizer.fit_transform(data_frame[field_name].dropna())
    matrix_df = pd.DataFrame(count_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    subset_x = vectorizer.transform(subset_data_frame[field_name].dropna())
    subset_matrix_df = pd.DataFrame(subset_x.toarray(), columns=vectorizer.get_feature_names_out())

    # Score each term by how over-represented it is in the subset relative to
    # the rest of the data (the "+ 1" avoids division by zero)
    subset_freq = subset_matrix_df.sum()
    superset_freq = matrix_df.sum()

    return (subset_freq / (superset_freq - subset_freq + 1)).sort_values(ascending=False)
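

# A minimal usage sketch for extract_significant_terms_from_subset; the data
# below is illustrative and assumes the default CountVectorizer settings:
#
#   df = pd.DataFrame({"text": ["python pandas", "python", "golang"],
#                      "label": [1, 1, 0]})
#   subset = df[df["label"] == 1]
#   scores = extract_significant_terms_from_subset(df, subset, "text")
#
# Terms frequent in the subset but rare elsewhere score highest: here
# "python" (2 / (2 - 2 + 1) = 2.0) outranks "pandas" (1.0), while "golang",
# which never occurs in the subset, scores 0.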