• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

SamhammerAG / ai-data-preprocessing-queue / 18784596270

24 Oct 2025 03:35PM UTC coverage: 91.457% (-0.06%) from 91.515%
18784596270

Pull #14

github

cwehmeier
KIT-4469 fixed added more unit tests
Pull Request #14: KIT-4467 added signature removal as step

31 of 34 new or added lines in 1 file covered. (91.18%)

182 of 199 relevant lines covered (91.46%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.18
/ai_data_preprocessing_queue/Steps/remove_signature.py
1
import re
1✔
2
from typing import Any
1✔
3

4

5
def remove_newline(text: str) -> str:
    """Collapse every run of whitespace into a single space and trim the ends.

    All whitespace matched by ``\\s`` (spaces, tabs, newlines, carriage
    returns, ...) is treated alike, so single line breaks are replaced by a
    space as well.

    Args:
        text: The text to normalize.

    Returns:
        The text with each whitespace run replaced by one space and without
        leading or trailing whitespace.
    """
    # A single ``\s+`` pass is sufficient: it already covers runs of two or
    # more whitespace characters and runs of consecutive line breaks.
    return re.sub(r"\s+", " ", text).strip()
1✔
12

13

14
# Closing formulas (English and German, the latter in both "ß" and "ss"
# spellings) that are treated as the beginning of a signature.
GreetingExpressions = [
    "sincerely", "best regards", "happy holidays", "kind regards", "warm regards", "cheers",
    "regards", "mit freundlichen grüßen", "freundliche grüße", "beste grüße", "viele grüße",
    "herzliche grüße", "liebe grüße", "mit freundlichen grüssen", "freundliche grüsse",
    "beste grüsse", "viele grüsse", "herzliche grüsse", "liebe grüsse",
]

# One alternation over all greetings, followed by an optional comma and
# surrounding whitespace. The leading ``\b`` makes sure a greeting only matches
# at the start of a word; without it "regards" would match inside "disregards"
# and the remainder of the message would be cut off.
greetings_regex = r"\b(" + "|".join(GreetingExpressions) + r")\s*,?\s*"
1✔
19

20

21
def remove_greetings_and_following_text(text: str) -> str:
    """Cut the text off at the first greeting expression.

    The first match of ``greetings_regex`` and everything after it (line
    breaks included, because of ``re.DOTALL``) is removed. Matching is
    case-insensitive.

    Args:
        text: The message text.

    Returns:
        The text before the first greeting, stripped of surrounding
        whitespace; the stripped input if no greeting is found.
    """
    signature_tail = re.compile(
        greetings_regex + ".*",
        re.IGNORECASE | re.UNICODE | re.DOTALL,
    )
    return signature_tail.sub("", text).strip()
1✔
24

25

26
# Thank-you expressions are meant to be removed after the greeting and the
# signature text following it, as they often appear at the beginning of a message.
THANK_EXPRESSIONS = [
    r"thank you(?: very much)?",   # thank you, thank you very much
    r"thankyou(?: very much)?",   # thankyou, thankyou very much
    r"thanks(?: a lot| again)?",   # thanks, thanks a lot, thanks again
    r"many thanks",                # many thanks
    r"a thousand thanks",          # a thousand thanks
    r"danke(?: schön)?",           # danke, danke schön
    r"vielen dank",                # vielen dank
    r"dankeschön",                 # dankeschön
    r"besten dank"                 # besten dank
]

# Suffixes which could follow thank you expressions
THANK_SUFFIXES = [
    r"(?:in advance(?: for (?:your|the) (?:help|support|understanding|assistance))?)",
    r"(?:for (?:your|the) (?:help|support|understanding|assistance))",
    r"(?:schon mal\s+)?(?:im voraus\s+)?für\s+(?:ihre|ihr|eure|die|den)\s+(?:hilfe|support|verständnis)",
    r"vorab",
    r"kindly?"
]

# Combine them into a final regex pattern and compile.
thank_expressions = r"|".join(THANK_EXPRESSIONS)
# The optional suffix must end on a word boundary so that e.g. "kind" is not
# taken out of "kindness".
suffixes = r"(?:\s+(?:" + r"|".join(THANK_SUFFIXES) + r")\b)?"
# The ``\b`` after the expression group keeps a short alternative from matching
# a prefix of a longer word: without it "danke" matches inside "danken" and
# "dankeschön" (leaving "n" / "schön" behind) and "thanks" inside "thanksgiving".
# A single trailing punctuation mark and surrounding whitespace are consumed too.
final_pattern = (
    r"\b(?:" + thank_expressions + r")\b" + suffixes + r"\s*(?:,|\.|!|;)?\s*"
)
thanking_regex = re.compile(final_pattern, flags=re.IGNORECASE | re.UNICODE)
1✔
56

57

58
def remove_thanking_expressions(text: str) -> str:
    """Delete every match of the module-level ``thanking_regex`` from the text.

    Args:
        text: The message text.

    Returns:
        The text with all matches replaced by the empty string.
    """
    return re.sub(thanking_regex, "", text)
1✔
60

61

62
# Final pass: stand-alone greeting words that the preceding expressions
# could not reliably remove are matched as whole words by this pattern.
single_greeting_words = [
    "liebe grüße",
    "liebe grüsse",
    "grüße",
    "grüsse",
    "gruß",
    "gruss",
]
single_greetings_pattern = r"\b(?:" + "|".join(single_greeting_words) + r")\b"
1✔
66

67

68
def remove_single_greeting_words(text: str, pattern: str) -> str:
    """Replace every match of ``pattern`` in ``text`` with a single space.

    Matching is case-insensitive.

    Args:
        text: The text to clean.
        pattern: Regular expression describing the words to remove.

    Returns:
        The text with each match substituted by one space.
    """
    matcher = re.compile(pattern, re.IGNORECASE | re.UNICODE)
    return matcher.sub(" ", text)
1✔
70

71

72
def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
    """Strip greeting, signature and thank-you phrases from ``item``.

    The cleaning runs in a fixed order: the first greeting and everything
    after it, then thank-you expressions, then stand-alone greeting words,
    and finally whitespace normalization.

    Args:
        item: The text to clean; falsy values are returned unchanged.
        item_state: Not used by this step.
        global_state: Not used by this step.
        preprocessor_data: Not used by this step.

    Returns:
        The cleaned text, or ``item`` itself when it is falsy.

    Raises:
        ValueError: If any of the cleaning stages raises an exception; the
            original exception is attached as the cause.
    """
    if not item:
        return item

    try:
        cleaned = remove_greetings_and_following_text(item)
        cleaned = remove_thanking_expressions(cleaned)
        cleaned = remove_single_greeting_words(cleaned, single_greetings_pattern)
        return remove_newline(cleaned)
    except Exception as exc:
        raise ValueError(f"An error occurred while removing signature: {exc}") from exc
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc