neuml / txtai / 11062719915

27 Sep 2024 01:06AM UTC coverage: 99.946%. Remained the same

push · github · davidmezzetti · Update documentation

7406 of 7410 relevant lines covered (99.95%)

1.0 hits per line

Source File: /src/python/txtai/data/questions.py (96.15% covered)

"""
Questions module
"""

from .base import Data


class Questions(Data):
    """
    Tokenizes question-answering datasets as input for training question-answering models.
    """

    def __init__(self, tokenizer, columns, maxlength, stride):
        """
        Creates a new instance for tokenizing Questions training data.

        Args:
            tokenizer: model tokenizer
            columns: tuple of columns to use for question/context/answer
            maxlength: maximum sequence length
            stride: chunk size for splitting data for QA tasks
        """

        super().__init__(tokenizer, columns, maxlength)

        if not self.columns:
            self.columns = ("question", "context", "answers")

        self.question, self.context, self.answer = self.columns
        self.stride = stride
        self.rpad = tokenizer.padding_side == "right"

    def process(self, data):
        # Tokenize data
        tokenized = self.tokenize(data)

        # Get mapping of overflowing tokens and answer offsets
        samples = tokenized.pop("overflow_to_sample_mapping")
        offsets = tokenized.pop("offset_mapping")

        # Start/end positions
        tokenized["start_positions"] = []
        tokenized["end_positions"] = []

        for x, offset in enumerate(offsets):
            # Label NO ANSWER with CLS token
            inputids = tokenized["input_ids"][x]
            clstoken = inputids.index(self.tokenizer.cls_token_id)

            # Sequence ids
            sequences = tokenized.sequence_ids(x)

            # Get and format answer
            answers = self.answers(data, samples[x])

            # If no answers are given, set cls token as answer.
            if len(answers["answer_start"]) == 0:
                tokenized["start_positions"].append(clstoken)
                tokenized["end_positions"].append(clstoken)
            else:
                # Start/end character index of the answer in the text.
                startchar = answers["answer_start"][0]
                endchar = startchar + len(answers["text"][0])

                # Start token index of the current span in the text.
                start = 0
                while sequences[start] != (1 if self.rpad else 0):
                    start += 1

                # End token index of the current span in the text.
                end = len(inputids) - 1
                while sequences[end] != (1 if self.rpad else 0):
                    end -= 1

                # Label with CLS token if out of span
                if not (offset[start][0] <= startchar and offset[end][1] >= endchar):
                    tokenized["start_positions"].append(clstoken)
                    tokenized["end_positions"].append(clstoken)
                else:
                    # Map start character and end character to matching token index
                    while start < len(offset) and offset[start][0] <= startchar:
                        start += 1
                    tokenized["start_positions"].append(start - 1)

                    while offset[end][1] >= endchar:
                        end -= 1
                    tokenized["end_positions"].append(end + 1)

        return tokenized

    def tokenize(self, data):
        """
        Tokenizes batch of data

        Args:
            data: input data batch

        Returns:
            tokenized data
        """

        # Trim question whitespace
        data[self.question] = [x.lstrip() for x in data[self.question]]

        # Tokenize records
        return self.tokenizer(
            data[self.question if self.rpad else self.context],
            data[self.context if self.rpad else self.question],
            truncation="only_second" if self.rpad else "only_first",
            max_length=self.maxlength,
            stride=self.stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding=True,
        )

    def answers(self, data, index):
        """
        Gets and formats an answer.

        Args:
            data: input examples
            index: answer index to retrieve

        Returns:
            answers dict
        """

        # Answer mappings
        answers = data[self.answer][index]
        context = data[self.context][index]

        # Handle mapping string answers to dict
        if not isinstance(answers, dict):
            if not answers:
                answers = {"text": [], "answer_start": []}
            else:
                answers = {"text": [answers], "answer_start": [context.index(answers)]}

        return answers