idanmoradarthas / DataScienceUtils / build 21105664715

18 Jan 2026 03:58AM UTC. Coverage: 86.972% (-9.8%) from 96.765%.

Build 21105664715, a push via github, committed by idanmoradarthas on branch "linting".

2 of 2 new or added lines in 1 file covered (100.0%).
102 existing lines in 2 files are now uncovered.
721 of 829 relevant lines covered (86.97%).
10.44 hits per line.

Source File: /ds_utils/strings.py (96.67% covered)

"""String manipulation utilities for data science tasks."""

import re
from collections import Counter
from typing import List, Tuple, Optional, Callable, Union

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer


def _tokenize(text_tags: str) -> List[str]:
    """Split a comma-separated tag string into a list of cleaned tags."""
    tags = text_tags.split(",")
    tags = [re.sub(r"[^a-zA-Z0-9_$-]", "", x) for x in tags]  # keep letters, digits, "_", "$", "-"
    tags = [x.strip() for x in tags]
    tags = [x for x in tags if x]  # drop tokens left empty after cleaning
    return tags
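
# A quick illustration of the default tokenizer (values are illustrative):
#   _tokenize("python, data science, C++")  ->  ["python", "datascience", "C"]
# Spaces and "+" fall outside the allowed character class and are removed;
# tokens that end up empty (e.g., after a trailing comma) are dropped.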


def _normalize_tags(
    value: Union[str, List[str]],
    tokenizer: Callable[[str], List[str]],
    lowercase: bool,
) -> List[str]:
    """Normalize tag input to a list of strings.

    Handles both string inputs (which need tokenization) and list inputs
    (which are already tokenized).

    :param value: Either a string to tokenize or a list of tags.
    :param tokenizer: Tokenizer function to use for string inputs.
    :param lowercase: Whether to convert tags to lowercase.
    :return: List of normalized tag strings.
    """
    tags = []
    if isinstance(value, str):
        if value:  # non-empty string
            tags = tokenizer(value)
    elif isinstance(value, list):
        tags = value

    # Apply lowercase if requested; non-string tags are coerced to str first
    if lowercase:
        tags = [tag.lower() if isinstance(tag, str) else str(tag).lower() for tag in tags]

    return tags
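
# Both input shapes normalize to the same list (illustrative values):
#   _normalize_tags("Python, ML", _tokenize, lowercase=True)     -> ["python", "ml"]
#   _normalize_tags(["Python", "ML"], _tokenize, lowercase=True) -> ["python", "ml"]
# NaN is not handled here; callers fill missing values (e.g., with "") first.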


def append_tags_to_frame(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    field_name: str,
    prefix: str = "",
    max_features: Optional[int] = 500,
    min_df: Union[int, float] = 1,
    lowercase: bool = False,
    sparse: bool = False,
    tokenizer: Optional[Callable[[str], List[str]]] = _tokenize,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Extract tags from a column and append them as binarized features to the dataframe.

    This function processes a specified column in the train and test dataframes that contains tags.
    It supports columns with either string-based tags (e.g., "tag1,tag2") or list-based tags
    (e.g., ["tag1", "tag2"]). The function identifies a vocabulary of tags from the training data,
    filters them based on frequency, and then creates new binary columns for each tag.

    Supported Input Types for the Tags Column:
    - str: Comma-separated tags. The default tokenizer splits by comma, trims whitespace, and removes
           non-alphanumeric characters (except "_", "$", "-"). Empty strings are treated as having no tags.
    - List[str]: A pre-tokenized list of tags. Empty lists are treated as having no tags.
    - NaN/None: Handled as empty.

    Tokenization Rules (for string inputs):
    - The default tokenizer splits the input string by commas (",").
    - Whitespace around tags is automatically trimmed.
    - Duplicate tags within the same string (e.g., "tag1,tag1") are treated as a single occurrence for that row.
    - Casing is preserved unless `lowercase=True`.

    `min_df` Behavior:
    - This parameter filters out tags that are not frequent enough in the training data.
    - If `int`: The absolute minimum number of rows a tag must appear in to be included.
    - If `float` (between 0.0 and 1.0): The minimum fraction of rows a tag must appear in.
    - This filtering is applied *before* the final vocabulary is selected and binarized.

    Column Naming Logic:
    - The `prefix` argument is prepended to each tag to form the new column names.
    - Example: With `prefix="tag_"` and a tag "python", the resulting column will be "tag_python".

    Column Ordering:
    - The generated tag columns are always sorted alphabetically, ensuring a deterministic and stable
      order that can be relied upon for feature alignment in downstream modeling.

    :param X_train: Pandas DataFrame with the train features.
    :param X_test: Pandas DataFrame with the test features.
    :param field_name: The name of the column to parse for tags.
    :param prefix: A string prefix for the new binarized tag columns.
    :param max_features: The maximum number of tags to include, based on frequency. Default is 500.
    :param min_df: The minimum document frequency for a tag to be included. Can be an int or a float. Default is 1.
    :param lowercase: If True, all tags are converted to lowercase. Default is False.
    :param sparse: If True, returns a DataFrame with sparse columns. Default is False.
    :param tokenizer: A custom function to tokenize string inputs. Defaults to an internal tokenizer.
    :return: A tuple containing the transformed train and test DataFrames.
    :raises KeyError: If `field_name` is not in the input dataframes.
    """
    if X_train.empty:
        # No training data to learn a vocabulary from
        return pd.DataFrame(), pd.DataFrame()

    x_train_filled = X_train[field_name].fillna("")

    # Tokenize the training data (handles both strings and lists)
    train_tags = x_train_filled.apply(lambda x: _normalize_tags(x, tokenizer, lowercase))

    # Calculate document frequency (each tag counted once per row)
    doc_freq = Counter(tag for tags_list in train_tags for tag in set(tags_list))

    # Filter by min_df
    if isinstance(min_df, int):
        tags_to_keep = {tag for tag, freq in doc_freq.items() if freq >= min_df}
    else:  # float: treat min_df as a fraction of the training rows
        min_doc_count = min_df * len(X_train)
        tags_to_keep = {tag for tag, freq in doc_freq.items() if freq >= min_doc_count}
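        # For example (illustrative numbers): with min_df=0.05 and 200 training
        # rows, a tag must appear in at least 0.05 * 200 == 10 rows to be kept.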

    # Select top max_features by frequency
    if max_features is not None:
        # Sort by frequency (descending), then alphabetically for deterministic ordering
        top_tags = sorted(tags_to_keep, key=lambda tag: (-doc_freq[tag], tag))[:max_features]
        tags_to_keep = set(top_tags)

    # Filter the tokenized tags to only include those in tags_to_keep
    train_tags_filtered = train_tags.apply(lambda tags: [tag for tag in tags if tag in tags_to_keep])

    # Use MultiLabelBinarizer to create the binary matrix
    mlb = MultiLabelBinarizer(classes=sorted(tags_to_keep), sparse_output=sparse)
    x_train_binarized = mlb.fit_transform(train_tags_filtered)

    # Prepare test data (handles both strings and lists); tags outside the
    # training vocabulary are filtered out before binarizing
    test_tags = X_test[field_name].fillna("").apply(lambda x: _normalize_tags(x, tokenizer, lowercase))
    test_tags_filtered = test_tags.apply(lambda tags: [tag for tag in tags if tag in tags_to_keep])
    x_test_binarized = mlb.transform(test_tags_filtered)

    # Create DataFrames for the binarized tags
    feature_names = [prefix + tag_name for tag_name in mlb.classes_]
    if sparse:
        x_train_tags = pd.DataFrame.sparse.from_spmatrix(x_train_binarized, index=X_train.index, columns=feature_names)
        x_test_tags = pd.DataFrame.sparse.from_spmatrix(x_test_binarized, index=X_test.index, columns=feature_names)
    else:
        x_train_tags = pd.DataFrame(x_train_binarized, columns=feature_names, index=X_train.index)
        x_test_tags = pd.DataFrame(x_test_binarized, columns=feature_names, index=X_test.index)

    x_train_reduced = X_train.drop(columns=[field_name])
    x_test_reduced = X_test.drop(columns=[field_name])

    return (
        pd.merge(x_train_reduced, x_train_tags, left_index=True, right_index=True, how="left"),
        pd.merge(x_test_reduced, x_test_tags, left_index=True, right_index=True, how="left"),
    )
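

# A minimal usage sketch for append_tags_to_frame; the frames, tags, and
# column names below are illustrative:
#
#   train = pd.DataFrame({"id": [1, 2], "tags": ["python,pandas", "python"]})
#   test = pd.DataFrame({"id": [3], "tags": ["pandas,numpy"]})
#   x_train, x_test = append_tags_to_frame(train, test, field_name="tags", prefix="tag_")
#
# x_train keeps "id" and gains tag_pandas and tag_python (sorted alphabetically);
# "numpy" is absent from the training vocabulary, so the test row binarizes to
# tag_pandas=1, tag_python=0 and the unseen tag is silently dropped.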


def extract_significant_terms_from_subset(
    data_frame: pd.DataFrame,
    subset_data_frame: pd.DataFrame,
    field_name: str,
    vectorizer: Optional[CountVectorizer] = None,
) -> pd.Series:
    """Return interesting or unusual occurrences of terms in a subset.

    Based on the elasticsearch significant_text aggregation:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-significantterms-aggregation.html#_scripted

    :param data_frame: The full dataset.
    :param subset_data_frame: The subset partition data over which the scoring will be calculated.
                              It can be filtered by feature or other boolean criteria.
    :param field_name: The feature to parse.
    :param vectorizer: Text count vectorizer which converts a collection of text to a matrix of token counts.
                       Defaults to CountVectorizer(encoding="utf-8", lowercase=True, max_features=500).
                       See more info here:
                       https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    :return: Series of terms with scoring over the subset.

    :author: Eran Hirsch (https://github.com/eranhirs)
    """
    if vectorizer is None:
        # Avoid a mutable default argument: a CountVectorizer is mutated by
        # fit_transform, so a shared default instance would leak state
        # between calls.
        vectorizer = CountVectorizer(encoding="utf-8", lowercase=True, max_features=500)

    if data_frame.empty:
        return pd.Series(dtype=float)

    count_matrix = vectorizer.fit_transform(data_frame[field_name].dropna())
    matrix_df = pd.DataFrame(count_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    subset_x = vectorizer.transform(subset_data_frame[field_name].dropna())
    subset_matrix_df = pd.DataFrame(subset_x.toarray(), columns=vectorizer.get_feature_names_out())

    # Score each term by how over-represented it is in the subset relative to
    # the rest of the data (the "+ 1" avoids division by zero)
    subset_freq = subset_matrix_df.sum()
    superset_freq = matrix_df.sum()

    return (subset_freq / (superset_freq - subset_freq + 1)).sort_values(ascending=False)
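

# A minimal usage sketch for extract_significant_terms_from_subset; the data
# below is illustrative and assumes the default CountVectorizer settings:
#
#   df = pd.DataFrame({"text": ["python pandas", "python", "golang"],
#                      "label": [1, 1, 0]})
#   subset = df[df["label"] == 1]
#   scores = extract_significant_terms_from_subset(df, subset, "text")
#
# Terms frequent in the subset but rare elsewhere score highest: here
# "python" (2 / (2 - 2 + 1) = 2.0) outranks "pandas" (1.0), while "golang",
# which never occurs in the subset, scores 0.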