deepset-ai / haystack / build 12744218044

13 Jan 2025 09:26AM UTC coverage: 91.352% (+0.3%) from 91.099%

Pull Request #8693: feat: Add `ComponentTool` to Haystack tools
Merge 4a3ad897d into db76ae284

8968 of 9817 relevant lines covered (91.35%)
0.91 hits per line

Source File: haystack/components/preprocessors/sentence_tokenizer.py (94.12% of lines covered)

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import re
from pathlib import Path
from typing import Any, Dict, List, Literal, Tuple

from haystack import logging
from haystack.lazy_imports import LazyImport

with LazyImport("Run 'pip install nltk'") as nltk_imports:
    import nltk

logger = logging.getLogger(__name__)

Language = Literal[
    "ru", "sl", "es", "sv", "tr", "cs", "da", "nl", "en", "et", "fi", "fr", "de", "el", "it", "no", "pl", "pt", "ml"
]

ISO639_TO_NLTK = {
    "ru": "russian",
    "sl": "slovene",
    "es": "spanish",
    "sv": "swedish",
    "tr": "turkish",
    "cs": "czech",
    "da": "danish",
    "nl": "dutch",
    "en": "english",
    "et": "estonian",
    "fi": "finnish",
    "fr": "french",
    "de": "german",
    "el": "greek",
    "it": "italian",
    "no": "norwegian",
    "pl": "polish",
    "pt": "portuguese",
    "ml": "malayalam",
}

QUOTE_SPANS_RE = re.compile(r"\W(\"+|\'+).*?\1")

if nltk_imports.is_successful():

    def load_sentence_tokenizer(
        language: Language, keep_white_spaces: bool = False
    ) -> nltk.tokenize.punkt.PunktSentenceTokenizer:
        """
        Utility function to load the nltk sentence tokenizer.

        :param language: The language for the tokenizer.
        :param keep_white_spaces: If True, the tokenizer will keep white spaces between sentences.
        :returns: nltk sentence tokenizer.
        """
        try:
            nltk.data.find("tokenizers/punkt_tab")
        except LookupError:
            try:
                nltk.download("punkt_tab")
            except FileExistsError as error:
                logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: {error}", error=error)

        language_name = ISO639_TO_NLTK.get(language)

        if language_name is not None:
            sentence_tokenizer = nltk.data.load(f"tokenizers/punkt_tab/{language_name}.pickle")
        else:
            logger.warning(
                "PreProcessor couldn't find the default sentence tokenizer model for {language}. "
                " Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter.",
                language=language,
            )
            sentence_tokenizer = nltk.data.load("tokenizers/punkt_tab/english.pickle")

        if keep_white_spaces:
            sentence_tokenizer._lang_vars = CustomPunktLanguageVars()

        return sentence_tokenizer

    class CustomPunktLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):
        # The following adjustment of PunktSentenceTokenizer is inspired by:
        # https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
        # It is needed for preserving whitespace while splitting text into sentences.
        _period_context_fmt = r"""
                %(SentEndChars)s             # a potential sentence ending
                \s*                          # match potential whitespace [ \t\n\x0B\f\r]
                (?=(?P<after_tok>
                    %(NonWord)s              # either other punctuation
                    |
                    (?P<next_tok>\S+)        # or some other token - original version: \s+(?P<next_tok>\S+)
                ))"""

        def period_context_re(self) -> re.Pattern:
            """
            Compiles and returns a regular expression to find contexts including possible sentence boundaries.

            :returns: A compiled regular expression pattern.
            """
            try:
                return self._re_period_context  # type: ignore
            except:  # noqa: E722
                self._re_period_context = re.compile(
                    self._period_context_fmt
                    % {
                        "NonWord": self._re_non_word_chars,
                        # SentEndChars might be followed by closing brackets, so we match them here.
                        "SentEndChars": self._re_sent_end_chars + r"[\)\]}]*",
                    },
                    re.UNICODE | re.VERBOSE,
                )
                return self._re_period_context

class SentenceSplitter:  # pylint: disable=too-few-public-methods
    """
    SentenceSplitter splits a text into sentences using the nltk sentence tokenizer.
    """

    def __init__(
        self,
        language: Language = "en",
        use_split_rules: bool = True,
        extend_abbreviations: bool = True,
        keep_white_spaces: bool = False,
    ) -> None:
        """
        Initializes the SentenceSplitter with the specified language, split rules, and abbreviation handling.

        :param language: The language for the tokenizer. Default is "en".
        :param use_split_rules: If True, the additional split rules are used. If False, the rules are not used.
        :param extend_abbreviations: If True, the abbreviations used by NLTK's PunktTokenizer are extended by a list
            of curated abbreviations if available. If False, the default abbreviations are used.
            Currently supported languages are: en, de.
        :param keep_white_spaces: If True, the tokenizer will keep white spaces between sentences.
        """
        nltk_imports.check()
        self.language = language
        self.sentence_tokenizer = load_sentence_tokenizer(language, keep_white_spaces=keep_white_spaces)
        self.use_split_rules = use_split_rules
        if extend_abbreviations:
            abbreviations = SentenceSplitter._read_abbreviations(language)
            self.sentence_tokenizer._params.abbrev_types.update(abbreviations)
        self.keep_white_spaces = keep_white_spaces

    def split_sentences(self, text: str) -> List[Dict[str, Any]]:
        """
        Splits a text into sentences including references to original char positions for each split.

        :param text: The text to split.
        :returns: list of sentences with positions.
        """
        sentence_spans = list(self.sentence_tokenizer.span_tokenize(text))
        if self.use_split_rules:
            sentence_spans = SentenceSplitter._apply_split_rules(text, sentence_spans)

        sentences = [{"sentence": text[start:end], "start": start, "end": end} for start, end in sentence_spans]
        return sentences

    @staticmethod
    def _apply_split_rules(text: str, sentence_spans: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
        """
        Applies additional split rules to the sentence spans.

        :param text: The text to split.
        :param sentence_spans: The list of sentence spans to split.
        :returns: The list of sentence spans after applying the split rules.
        """
        new_sentence_spans = []
        quote_spans = [match.span() for match in QUOTE_SPANS_RE.finditer(text)]
        while sentence_spans:
            span = sentence_spans.pop(0)
            next_span = sentence_spans[0] if len(sentence_spans) > 0 else None
            while next_span and SentenceSplitter._needs_join(text, span, next_span, quote_spans):
                sentence_spans.pop(0)
                span = (span[0], next_span[1])
                next_span = sentence_spans[0] if len(sentence_spans) > 0 else None
            start, end = span
            new_sentence_spans.append((start, end))
        return new_sentence_spans

    @staticmethod
    def _needs_join(
        text: str, span: Tuple[int, int], next_span: Tuple[int, int], quote_spans: List[Tuple[int, int]]
    ) -> bool:
        """
        Checks if the spans need to be joined as parts of one sentence.

        This method determines whether two adjacent sentence spans should be joined back together as a single sentence.
        It's used to prevent incorrect sentence splitting in specific cases like quotations, numbered lists,
        and parenthetical expressions.

        :param text: The text containing the spans.
        :param span: Tuple of (start, end) positions for the current sentence span.
        :param next_span: Tuple of (start, end) positions for the next sentence span.
        :param quote_spans: All quoted spans within text.
        :returns:
            True if the spans need to be joined.
        """
        start, end = span
        next_start, next_end = next_span

        # sentence. sentence"\nsentence -> no split (end << quote_end)
        # sentence.", sentence -> no split (end < quote_end)
        # sentence?", sentence -> no split (end < quote_end)
        if any(quote_start < end < quote_end for quote_start, quote_end in quote_spans):
            # sentence boundary is inside a quote
            return True

        # sentence." sentence -> split (end == quote_end)
        # sentence?" sentence -> no split (end == quote_end)
        if any(quote_start < end == quote_end and text[quote_end - 2] == "?" for quote_start, quote_end in quote_spans):
            # question is cited
            return True

        if re.search(r"(^|\n)\s*\d{1,2}\.$", text[start:end]) is not None:
            # sentence ends with a numeration
            return True

        # next sentence starts with a bracket or we return False
        return re.search(r"^\s*[\(\[]", text[next_start:next_end]) is not None

    @staticmethod
    def _read_abbreviations(lang: Language) -> List[str]:
        """
        Reads the abbreviations for a given language from the abbreviations file.

        :param lang: The language to read the abbreviations for.
        :returns: List of abbreviations.
        """
        abbreviations_file = Path(__file__).parent.parent.parent / f"data/abbreviations/{lang}.txt"
        if not abbreviations_file.exists():
            logger.warning("No abbreviations file found for {language}. Using default abbreviations.", language=lang)
            return []

        abbreviations = abbreviations_file.read_text().split("\n")
        return abbreviations
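
For reference, a minimal usage sketch of the SentenceSplitter class above. This is illustrative only, not part of the source file; it assumes nltk is installed and that the punkt_tab tokenizer data is already present or can be downloaded on first use.

from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter

# Illustrative sketch, not part of the file shown above.
# Assumes nltk is installed and the punkt_tab tokenizer data is available.
splitter = SentenceSplitter(language="en", use_split_rules=True, keep_white_spaces=False)

text = 'He said: "Is it done?" Then he left. 1. First item follows.'
for unit in splitter.split_sentences(text):
    # Each entry is a dict with the sentence text and its character offsets in `text`.
    print(unit["start"], unit["end"], repr(unit["sentence"]))

With use_split_rules=True, spans that Punkt would split too eagerly, such as a quoted question followed by more text or a span ending in a bare numeration like "1.", are joined back into a single span by _needs_join before the result dictionaries are built.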