neuml / txtai · build 11062719915 (push via github)

27 Sep 2024 01:06AM UTC · coverage: 99.946% · remained the same

davidmezzetti · Update documentation

7406 of 7410 relevant lines covered (99.95%) · 1.0 hits per line

Source file: /src/python/txtai/vectors/words.py · file coverage: 97.59% (the only uncovered lines are the bodies of the multiprocessing helpers create and transform, which run in worker subprocesses)
"""
Word Vectors module
"""

import logging
import os
import tempfile

from errno import ENOENT
from multiprocessing import Pool

import numpy as np

# Conditionally import Word Vector libraries as they aren't installed by default
try:
    import fasttext
    from pymagnitude import converter, Magnitude

    WORDS = True
except ImportError:
    WORDS = False

from ..pipeline import Tokenizer

from .base import Vectors

# Logging configuration
logger = logging.getLogger(__name__)

# Multiprocessing helper methods
# pylint: disable=W0603
VECTORS = None


def create(config, scoring):
    """
    Multiprocessing helper method. Creates a global embeddings object to be accessed in a new subprocess.

    Args:
        config: vector configuration
        scoring: scoring instance
    """

    global VECTORS

    # Create a global embeddings object using configuration and saved scoring instance
    VECTORS = WordVectors(config, scoring, None)


def transform(document):
    """
    Multiprocessing helper method. Transforms document into an embeddings vector.

    Args:
        document: (id, data, tags)

    Returns:
        (id, embedding)
    """

    return (document[0], VECTORS.transform(document))
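
The create/transform pair is the standard multiprocessing initializer pattern: each worker process builds its heavyweight object once, stores it in a module-level global, and every subsequent task reuses it instead of pickling the model for each call. That is also why these two lines show as uncovered in this report: they only execute inside worker subprocesses. A minimal, self-contained sketch of the same pattern (not txtai code; the names are illustrative):

import os
from multiprocessing import Pool

STATE = None

def init(prefix):
    # Runs once per worker: build the expensive object and store it globally
    global STATE
    STATE = f"{prefix}-{os.getpid()}"

def work(item):
    # Each task reuses the per-worker global instead of receiving it as an argument
    return (item, STATE)

if __name__ == "__main__":
    with Pool(2, initializer=init, initargs=("worker",)) as pool:
        print(pool.map(work, range(4)))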

class WordVectors(Vectors):
    """
    Builds vectors using weighted word embeddings.
    """

    def loadmodel(self, path):
        # Ensure that vector path exists
        if not path or not os.path.isfile(path):
            raise IOError(ENOENT, "Vector model file not found", path)

        # Load magnitude model. If this is a training run (uninitialized config), block until vectors are fully loaded
        return Magnitude(path, case_insensitive=True, blocking=not self.initialized)
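
Magnitude stores vectors in a memory-mapped SQLite file, so queries can begin before the full model is read; blocking loading is only forced above for training runs. A hedged usage sketch of querying a loaded model (the path is a placeholder and pymagnitude must be installed):

from pymagnitude import Magnitude

model = Magnitude("/path/to/vectors.magnitude", case_insensitive=True)

print(model.dim)                               # vector dimensions
print(model.query("txtai").shape)              # single token -> (dim,)
print(model.query(["word", "vectors"]).shape)  # token list -> (2, dim)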

    def encode(self, data):
        # Iterate over each data element, tokenize (if necessary) and build an aggregated embeddings vector
        embeddings = []
        for tokens in data:
            # Convert to tokens if necessary
            if isinstance(tokens, str):
                tokens = Tokenizer.tokenize(tokens)

            # Generate weights for each vector using a scoring method
            weights = self.scoring.weights(tokens) if self.scoring else None

            # pylint: disable=E1133
            if weights and [x for x in weights if x > 0]:
                # Build weighted average embeddings vector. Create weights array as float32 to match embeddings precision.
                embedding = np.average(self.lookup(tokens), weights=np.array(weights, dtype=np.float32), axis=0)
            else:
                # If no weights, use mean
                embedding = np.mean(self.lookup(tokens), axis=0)

            embeddings.append(embedding)

        return np.array(embeddings, dtype=np.float32)
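
The weighted branch computes a term-weighted average: given token vectors v_i and weights w_i, the sentence embedding is sum(w_i * v_i) / sum(w_i), falling back to a plain mean when no positive weights are available. A small numpy check of both branches (values are illustrative only):

import numpy as np

vectors = np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32)
weights = np.array([3.0, 1.0], dtype=np.float32)

print(np.average(vectors, weights=weights, axis=0))  # [0.75 0.25]
print(np.mean(vectors, axis=0))                      # [0.5  0.5 ]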

    def index(self, documents, batchsize=1):
        # Use default single process indexing logic
        if "parallel" in self.config and not self.config["parallel"]:
            return super().index(documents, batchsize)

        # Customize indexing logic with multiprocessing pool to efficiently build vectors
        ids, dimensions, batches, stream = [], None, 0, None

        # Shared objects with Pool
        args = (self.config, self.scoring)

        # Convert all documents to embedding arrays, stream embeddings to disk to control memory usage
        with Pool(os.cpu_count(), initializer=create, initargs=args) as pool:
            with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy", delete=False) as output:
                stream = output.name
                embeddings = []
                for uid, embedding in pool.imap(transform, documents):
                    if not dimensions:
                        # Set number of dimensions for embeddings
                        dimensions = embedding.shape[0]

                    ids.append(uid)
                    embeddings.append(embedding)

                    if len(embeddings) == batchsize:
                        np.save(output, np.array(embeddings, dtype=np.float32))
                        batches += 1

                        embeddings = []

                # Final embeddings batch
                if embeddings:
                    np.save(output, np.array(embeddings, dtype=np.float32))
                    batches += 1

        return (ids, dimensions, batches, stream)
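
Note how index streams its output: each np.save call appends one complete array to the same open .npy file, so a consumer reads the batches back with repeated np.load calls on a single handle. A minimal round-trip sketch of that format (independent of txtai):

import tempfile

import numpy as np

with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy", delete=False) as output:
    stream = output.name
    for batch in (np.ones((2, 4), dtype=np.float32), np.zeros((1, 4), dtype=np.float32)):
        np.save(output, batch)

with open(stream, "rb") as f:
    print(np.load(f).shape)  # (2, 4)
    print(np.load(f).shape)  # (1, 4)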

    def lookup(self, tokens):
        """
        Queries word vectors for given list of input tokens.

        Args:
            tokens: list of tokens to query

        Returns:
            word vectors array
        """

        return self.model.query(tokens)

    @staticmethod
    def isdatabase(path):
        """
        Checks if this is a SQLite database file which is the file format used for word vectors databases.

        Args:
            path: path to check

        Returns:
            True if this is a SQLite database
        """

        if isinstance(path, str) and os.path.isfile(path) and os.path.getsize(path) >= 100:
            # Read 100 byte SQLite header
            with open(path, "rb") as f:
                header = f.read(100)

            # Check for SQLite header
            return header.startswith(b"SQLite format 3\000")

        return False
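
Every SQLite database starts with the fixed 16-byte magic string "SQLite format 3\000" at the head of a 100-byte header, which is exactly what isdatabase tests for. A quick self-contained demonstration (writes a throwaway database to a temporary directory):

import os
import sqlite3
import tempfile

path = os.path.join(tempfile.mkdtemp(), "test.db")

connection = sqlite3.connect(path)
connection.execute("CREATE TABLE demo (x INTEGER)")
connection.commit()
connection.close()

with open(path, "rb") as f:
    print(f.read(100).startswith(b"SQLite format 3\000"))  # True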

    @staticmethod
    def build(data, size, mincount, path):
        """
        Builds fastText vectors from a file.

        Args:
            data: path to input data file
            size: number of vector dimensions
            mincount: minimum number of occurrences required to register a token
            path: path to output file
        """

        # Train on data file using largest dimension size
        model = fasttext.train_unsupervised(data, dim=size, minCount=mincount)

        # Log model build
        logger.info("Building %d dimension model", size)

        # Output vectors in vec/txt format
        with open(path + ".txt", "w", encoding="utf-8") as output:
            words = model.get_words()
            output.write(f"{len(words)} {model.get_dimension()}\n")

            for word in words:
                # Skip end of line token
                if word != "</s>":
                    vector = model.get_word_vector(word)
                    data = ""
                    for v in vector:
                        data += " " + str(v)

                    output.write(word + data + "\n")

        # Build magnitude vectors database
        logger.info("Converting vectors to magnitude format")
        converter.convert(path + ".txt", path + ".magnitude", subword=True)
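
Putting build to work end to end: it trains fastText vectors on a tokenized text file, writes them in vec/txt format, then converts them to a .magnitude database. A hedged usage sketch (paths and parameters are illustrative, fasttext and pymagnitude must be installed, and the import assumes WordVectors is re-exported from txtai.vectors as in recent txtai releases):

from txtai.vectors import WordVectors

# Hypothetical corpus path; writes vectors.txt and vectors.magnitude
WordVectors.build("corpus.txt", 300, 3, "vectors")

# The converted file passes the SQLite format check above
print(WordVectors.isdatabase("vectors.magnitude"))  # True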