• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

SamhammerAG / ai-data-preprocessing-queue / 18751031261

23 Oct 2025 02:03PM UTC coverage: 76.263% (-15.3%) from 91.515%
18751031261

Pull #14

github

cwehmeier
KIT-4467 added signature removal as step
Pull Request #14: KIT-4467 added signature removal as step

0 of 33 new or added lines in 1 file covered. (0.0%)

151 of 198 relevant lines covered (76.26%)

0.76 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/ai_data_preprocessing_queue/Steps/remove_signature.py
NEW
1
import re
×
2

3

NEW
4
def remove_newline(text: str) -> str:
    """Collapse every run of whitespace into one space and trim the ends.

    Newlines, carriage returns, tabs and any other characters matched by
    ``\\s`` are all treated alike.

    Args:
        text: The text to normalize.

    Returns:
        ``text`` with each whitespace run replaced by a single space and with
        leading/trailing whitespace removed.
    """
    # A single ``\s+`` pass is sufficient: it already covers runs of two or
    # more whitespace characters as well as runs of consecutive line breaks.
    return re.sub(r"\s+", " ", text).strip()
×
11

12

NEW
13
# Closing formulas (English and German) that introduce a signature.
# Longer phrases are listed before the bare "regards" so they are tried first.
GreetingExpressions = [
    "sincerely", "best regards", "happy holidays", "kind regards", "warm regards", "cheers",
    "regards", "mit freundlichen grüßen", "freundliche grüße", "beste grüße", "viele grüße",
    "herzliche grüße", "liebe grüße", "mit freundlichen grüssen", "freundliche grüsse",
    "beste grüsse", "viele grüsse", "herzliche grüsse", "liebe grüsse",
]
# A closing formula followed by optional whitespace, an optional comma and more whitespace.
greetings_regex = r"(" + "|".join(GreetingExpressions) + r")\s*,?\s*"

# The leading ``\b`` keeps a formula from matching in the middle of a longer
# word (e.g. "regards" inside "disregards"). ``.`` does not match line breaks,
# so the trailing ``.*`` stops at the end of the line it starts on.
_greetings_and_rest = re.compile(r"\b" + greetings_regex + r".*", flags=re.IGNORECASE | re.UNICODE)


def remove_greetings_and_following_text(text: str) -> str:
    """Remove closing formulas together with the text that follows them.

    For every closing formula found (case-insensitively, starting at a word
    boundary) the formula itself, an optional comma, the surrounding
    whitespace (which may include line breaks) and the remainder of the line
    reached after that whitespace are deleted.

    Args:
        text: The message text to clean.

    Returns:
        ``text`` with all such spans removed; unchanged if no formula occurs.
    """
    return _greetings_and_rest.sub("", text)
×
23

24

25
# Thank-you expressions are removed only after the greeting and the signature
# text following it, because they often appear at the beginning of a message.
THANK_EXPRESSIONS = [
    r"thank you(?: very much)?",   # thank you, thank you very much
    r"thankyou(?: very much)?",    # thankyou, thankyou very much
    r"thanks(?: a lot| again)?",   # thanks, thanks a lot, thanks again
    r"many thanks",                # many thanks
    r"a thousand thanks",          # a thousand thanks
    r"danke(?: schön)?",           # danke, danke schön
    r"vielen dank",                # vielen dank
    r"dankeschön",                 # dankeschön
    r"besten dank",                # besten dank
]

# German "for your help/support/understanding" tail, shared by two suffixes below.
_GERMAN_FOR_SUFFIX = r"für (?:ihre|ihr|eure|die|den) (?:hilfe|support|verständnis)"

# Suffixes which may follow a thank-you expression. No alternative may match
# the empty string: an empty match would win immediately and make every
# alternative listed after it unreachable.
THANK_SUFFIXES = [
    r"(?:in advance(?: for (?:your|the) (?:help|support|understanding|assistance))?)",
    r"(?:for (?:your|the) (?:help|support|understanding|assistance))",
    r"(?:schon mal\s+)?im voraus(?:\s+" + _GERMAN_FOR_SUFFIX + r")?",
    r"(?:schon mal\s+)?" + _GERMAN_FOR_SUFFIX,
    r"schon mal",
    r"vorab",
    r"kindly?",
]

# Combine them into a final regex pattern and compile.
thank_expressions = r"|".join(THANK_EXPRESSIONS)
# The ``\b`` after the suffix rejects partial-word hits such as "for the help"
# inside "for the helpful hint".
suffixes = r"(?:\s+(?:" + r"|".join(THANK_SUFFIXES) + r")\b)?"
final_pattern = (
    # The ``\b`` after the expression keeps "thanks" from matching inside
    # "thanksgiving" and lets "dankeschön" win over the shorter "danke".
    r"\b(?:" + thank_expressions + r")\b" + suffixes + r"\s*[,.!;]?\s*"
)
thanking_regex = re.compile(final_pattern, flags=re.IGNORECASE | re.UNICODE)


def remove_thanking_expressions(text: str) -> str:
    """Remove thank-you phrases from ``text``.

    Each match covers a thank-you expression, an optional suffix (such as
    "in advance" or "für Ihre Hilfe"), one optional punctuation mark out of
    ``, . ! ;`` and the whitespace around it. Matching is case-insensitive and
    restricted to whole words.

    Args:
        text: The message text to clean.

    Returns:
        ``text`` with every matched phrase deleted.
    """
    return thanking_regex.sub("", text)
×
59

60

61
# Last clean-up pass: stand-alone greeting words that the preceding
# expressions could not reliably remove are stripped once more.
single_greeting_words = [
    "liebe grüße",
    "liebe grüsse",
    "grüße",
    "grüsse",
    "gruß",
    "gruss",
]
# Whole-word alternation over the entries above.
single_greetings_pattern = rf"\b(?:{'|'.join(single_greeting_words)})\b"
×
65

66

NEW
67
def remove_single_greeting_words(text: str, pattern: str) -> str:
    """Replace every match of ``pattern`` in ``text`` with a single space.

    The pattern is applied case-insensitively with Unicode matching.

    Args:
        text: The text to clean.
        pattern: Regular expression describing the words to blank out.

    Returns:
        ``text`` with each match substituted by ``" "``.
    """
    matcher = re.compile(pattern, flags=re.IGNORECASE | re.UNICODE)
    return matcher.sub(" ", text)
×
69

70

NEW
71
def step(text: str) -> str:
    """Run the signature-removal pipeline on ``text``.

    The stages run in this order: closing formulas plus the text following
    them, thank-you phrases, leftover single greeting words, and finally
    whitespace normalization.

    Args:
        text: The message text; a falsy value is returned unchanged.

    Returns:
        The cleaned text.

    Raises:
        ValueError: If any stage raises; the original exception is chained.
    """
    if not text:
        return text
    try:
        without_greetings = remove_single_greeting_words(
            remove_thanking_expressions(remove_greetings_and_following_text(text)),
            single_greetings_pattern,
        )
        return remove_newline(without_greetings)
    except Exception as e:
        raise ValueError(f"An error occurred while removing signature: {e}") from e
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc