neuml / txtai · build 11062719915 (push via github)

27 Sep 2024 01:06AM UTC · coverage: 99.946% · remained the same

davidmezzetti · Update documentation

7406 of 7410 relevant lines covered (99.95%) · 1.0 hits per line

Source file: /src/python/txtai/vectors/words.py · file coverage: 97.59% (the only uncovered lines are the bodies of the multiprocessing helpers create and transform, which run in worker subprocesses)
"""
Word Vectors module
"""

import logging
import os
import tempfile

from errno import ENOENT
from multiprocessing import Pool

import numpy as np

# Conditionally import Word Vector libraries as they aren't installed by default
try:
    import fasttext
    from pymagnitude import converter, Magnitude

    WORDS = True
except ImportError:
    WORDS = False

from ..pipeline import Tokenizer

from .base import Vectors

# Logging configuration
logger = logging.getLogger(__name__)

# Multiprocessing helper methods
# pylint: disable=W0603
VECTORS = None


def create(config, scoring):
    """
    Multiprocessing helper method. Creates a global embeddings object to be accessed in a new subprocess.

    Args:
        config: vector configuration
        scoring: scoring instance
    """

    global VECTORS

    # Create a global embeddings object using configuration and saved scoring instance
    VECTORS = WordVectors(config, scoring, None)


def transform(document):
    """
    Multiprocessing helper method. Transforms document into an embeddings vector.

    Args:
        document: (id, data, tags)

    Returns:
        (id, embedding)
    """

    return (document[0], VECTORS.transform(document))
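
The create/transform pair is the standard multiprocessing initializer pattern: each worker process builds its heavyweight object once, stores it in a module-level global, and every subsequent task reuses it instead of pickling the model for each call. That is also why these two lines show as uncovered in this report: they only execute inside worker subprocesses. A minimal, self-contained sketch of the same pattern (not txtai code; the names are illustrative):

import os
from multiprocessing import Pool

STATE = None

def init(prefix):
    # Runs once per worker: build the expensive object and store it globally
    global STATE
    STATE = f"{prefix}-{os.getpid()}"

def work(item):
    # Each task reuses the per-worker global instead of receiving it as an argument
    return (item, STATE)

if __name__ == "__main__":
    with Pool(2, initializer=init, initargs=("worker",)) as pool:
        print(pool.map(work, range(4)))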

class WordVectors(Vectors):
    """
    Builds vectors using weighted word embeddings.
    """

    def loadmodel(self, path):
        # Ensure that vector path exists
        if not path or not os.path.isfile(path):
            raise IOError(ENOENT, "Vector model file not found", path)

        # Load magnitude model. If this is a training run (uninitialized config), block until vectors are fully loaded
        return Magnitude(path, case_insensitive=True, blocking=not self.initialized)
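
Magnitude stores vectors in a memory-mapped SQLite file, so queries can begin before the full model is read; blocking loading is only forced above for training runs. A hedged usage sketch of querying a loaded model (the path is a placeholder and pymagnitude must be installed):

from pymagnitude import Magnitude

model = Magnitude("/path/to/vectors.magnitude", case_insensitive=True)

print(model.dim)                               # vector dimensions
print(model.query("txtai").shape)              # single token -> (dim,)
print(model.query(["word", "vectors"]).shape)  # token list -> (2, dim)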

    def encode(self, data):
        # Iterate over each data element, tokenize (if necessary) and build an aggregated embeddings vector
        embeddings = []
        for tokens in data:
            # Convert to tokens if necessary
            if isinstance(tokens, str):
                tokens = Tokenizer.tokenize(tokens)

            # Generate weights for each vector using a scoring method
            weights = self.scoring.weights(tokens) if self.scoring else None

            # pylint: disable=E1133
            if weights and [x for x in weights if x > 0]:
                # Build weighted average embeddings vector. Create weights array as float32 to match embeddings precision.
                embedding = np.average(self.lookup(tokens), weights=np.array(weights, dtype=np.float32), axis=0)
            else:
                # If no weights, use mean
                embedding = np.mean(self.lookup(tokens), axis=0)

            embeddings.append(embedding)

        return np.array(embeddings, dtype=np.float32)
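
The weighted branch computes a term-weighted average: given token vectors v_i and weights w_i, the sentence embedding is sum(w_i * v_i) / sum(w_i), falling back to a plain mean when no positive weights are available. A small numpy check of both branches (values are illustrative only):

import numpy as np

vectors = np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float32)
weights = np.array([3.0, 1.0], dtype=np.float32)

print(np.average(vectors, weights=weights, axis=0))  # [0.75 0.25]
print(np.mean(vectors, axis=0))                      # [0.5  0.5 ]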

    def index(self, documents, batchsize=1):
        # Use default single process indexing logic
        if "parallel" in self.config and not self.config["parallel"]:
            return super().index(documents, batchsize)

        # Customize indexing logic with multiprocessing pool to efficiently build vectors
        ids, dimensions, batches, stream = [], None, 0, None

        # Shared objects with Pool
        args = (self.config, self.scoring)

        # Convert all documents to embedding arrays, stream embeddings to disk to control memory usage
        with Pool(os.cpu_count(), initializer=create, initargs=args) as pool:
            with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy", delete=False) as output:
                stream = output.name
                embeddings = []
                for uid, embedding in pool.imap(transform, documents):
                    if not dimensions:
                        # Set number of dimensions for embeddings
                        dimensions = embedding.shape[0]

                    ids.append(uid)
                    embeddings.append(embedding)

                    if len(embeddings) == batchsize:
                        np.save(output, np.array(embeddings, dtype=np.float32))
                        batches += 1

                        embeddings = []

                # Final embeddings batch
                if embeddings:
                    np.save(output, np.array(embeddings, dtype=np.float32))
                    batches += 1

        return (ids, dimensions, batches, stream)
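
Note how index streams its output: each np.save call appends one complete array to the same open .npy file, so a consumer reads the batches back with repeated np.load calls on a single handle. A minimal round-trip sketch of that format (independent of txtai):

import tempfile

import numpy as np

with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy", delete=False) as output:
    stream = output.name
    for batch in (np.ones((2, 4), dtype=np.float32), np.zeros((1, 4), dtype=np.float32)):
        np.save(output, batch)

with open(stream, "rb") as f:
    print(np.load(f).shape)  # (2, 4)
    print(np.load(f).shape)  # (1, 4)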

    def lookup(self, tokens):
        """
        Queries word vectors for given list of input tokens.

        Args:
            tokens: list of tokens to query

        Returns:
            word vectors array
        """

        return self.model.query(tokens)

    @staticmethod
    def isdatabase(path):
        """
        Checks if this is a SQLite database file which is the file format used for word vectors databases.

        Args:
            path: path to check

        Returns:
            True if this is a SQLite database
        """

        if isinstance(path, str) and os.path.isfile(path) and os.path.getsize(path) >= 100:
            # Read 100 byte SQLite header
            with open(path, "rb") as f:
                header = f.read(100)

            # Check for SQLite header
            return header.startswith(b"SQLite format 3\000")

        return False
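
Every SQLite database starts with the fixed 16-byte magic string "SQLite format 3\000" at the head of a 100-byte header, which is exactly what isdatabase tests for. A quick self-contained demonstration (writes a throwaway database to a temporary directory):

import os
import sqlite3
import tempfile

path = os.path.join(tempfile.mkdtemp(), "test.db")

connection = sqlite3.connect(path)
connection.execute("CREATE TABLE demo (x INTEGER)")
connection.commit()
connection.close()

with open(path, "rb") as f:
    print(f.read(100).startswith(b"SQLite format 3\000"))  # True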

    @staticmethod
    def build(data, size, mincount, path):
        """
        Builds fastText vectors from a file.

        Args:
            data: path to input data file
            size: number of vector dimensions
            mincount: minimum number of occurrences required to register a token
            path: path to output file
        """

        # Train on data file using largest dimension size
        model = fasttext.train_unsupervised(data, dim=size, minCount=mincount)

        # Log model build
        logger.info("Building %d dimension model", size)

        # Output vectors in vec/txt format
        with open(path + ".txt", "w", encoding="utf-8") as output:
            words = model.get_words()
            output.write(f"{len(words)} {model.get_dimension()}\n")

            for word in words:
                # Skip end of line token
                if word != "</s>":
                    vector = model.get_word_vector(word)
                    data = ""
                    for v in vector:
                        data += " " + str(v)

                    output.write(word + data + "\n")

        # Build magnitude vectors database
        logger.info("Converting vectors to magnitude format")
        converter.convert(path + ".txt", path + ".magnitude", subword=True)
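
Putting build to work end to end: it trains fastText vectors on a tokenized text file, writes them in vec/txt format, then converts them to a .magnitude database. A hedged usage sketch (paths and parameters are illustrative, fasttext and pymagnitude must be installed, and the import assumes WordVectors is re-exported from txtai.vectors as in recent txtai releases):

from txtai.vectors import WordVectors

# Hypothetical corpus path; writes vectors.txt and vectors.magnitude
WordVectors.build("corpus.txt", 300, 3, "vectors")

# The converted file passes the SQLite format check above
print(WordVectors.isdatabase("vectors.magnitude"))  # True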