neuml / txtai / 11062719915

27 Sep 2024 01:06AM UTC coverage: 99.946%. Remained the same

push · github · davidmezzetti · Update documentation

7406 of 7410 relevant lines covered (99.95%)

1.0 hits per line

Source File: /src/python/txtai/data/questions.py (96.15% covered)

"""
Questions module
"""

from .base import Data


class Questions(Data):
    """
    Tokenizes question-answering datasets as input for training question-answering models.
    """

    def __init__(self, tokenizer, columns, maxlength, stride):
        """
        Creates a new instance for tokenizing Questions training data.

        Args:
            tokenizer: model tokenizer
            columns: tuple of columns to use for question/context/answer
            maxlength: maximum sequence length
            stride: chunk size for splitting data for QA tasks
        """

        super().__init__(tokenizer, columns, maxlength)

        if not self.columns:
            self.columns = ("question", "context", "answers")

        self.question, self.context, self.answer = self.columns
        self.stride = stride
        self.rpad = tokenizer.padding_side == "right"

    def process(self, data):
        # Tokenize data
        tokenized = self.tokenize(data)

        # Get mapping of overflowing tokens and answer offsets
        samples = tokenized.pop("overflow_to_sample_mapping")
        offsets = tokenized.pop("offset_mapping")

        # Start/end positions
        tokenized["start_positions"] = []
        tokenized["end_positions"] = []

        for x, offset in enumerate(offsets):
            # Label NO ANSWER with CLS token
            inputids = tokenized["input_ids"][x]
            clstoken = inputids.index(self.tokenizer.cls_token_id)

            # Sequence ids
            sequences = tokenized.sequence_ids(x)

            # Get and format answer
            answers = self.answers(data, samples[x])

            # If no answers are given, set cls token as answer.
            if len(answers["answer_start"]) == 0:
                tokenized["start_positions"].append(clstoken)
                tokenized["end_positions"].append(clstoken)
            else:
                # Start/end character index of the answer in the text.
                startchar = answers["answer_start"][0]
                endchar = startchar + len(answers["text"][0])

                # Start token index of the current span in the text.
                start = 0
                while sequences[start] != (1 if self.rpad else 0):
                    start += 1

                # End token index of the current span in the text.
                end = len(inputids) - 1
                while sequences[end] != (1 if self.rpad else 0):
                    end -= 1

                # Label with CLS token if out of span
                if not (offset[start][0] <= startchar and offset[end][1] >= endchar):
                    tokenized["start_positions"].append(clstoken)
                    tokenized["end_positions"].append(clstoken)
                else:
                    # Map start character and end character to matching token index
                    while start < len(offset) and offset[start][0] <= startchar:
                        start += 1
                    tokenized["start_positions"].append(start - 1)

                    while offset[end][1] >= endchar:
                        end -= 1
                    tokenized["end_positions"].append(end + 1)

        return tokenized

    def tokenize(self, data):
        """
        Tokenizes batch of data

        Args:
            data: input data batch

        Returns:
            tokenized data
        """

        # Trim question whitespace
        data[self.question] = [x.lstrip() for x in data[self.question]]

        # Tokenize records
        return self.tokenizer(
            data[self.question if self.rpad else self.context],
            data[self.context if self.rpad else self.question],
            truncation="only_second" if self.rpad else "only_first",
            max_length=self.maxlength,
            stride=self.stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding=True,
        )

    def answers(self, data, index):
        """
        Gets and formats an answer.

        Args:
            data: input examples
            index: answer index to retrieve

        Returns:
            answers dict
        """

        # Answer mappings
        answers = data[self.answer][index]
        context = data[self.context][index]

        # Handle mapping string answers to dict
        if not isinstance(answers, dict):
            if not answers:
                answers = {"text": [], "answer_start": []}
            else:
                answers = {"text": [answers], "answer_start": [context.index(answers)]}

        return answers