neuml / txtai · build 11062719915 (push, github)

27 Sep 2024 01:06AM UTC · coverage: 99.946% (remained the same)

davidmezzetti: Update documentation

7406 of 7410 relevant lines covered (99.95%) · 1.0 hits per line

Source File

/src/python/txtai/ann/faiss.py (100.0% covered)
"""
Faiss module
"""

import math

import numpy as np

from faiss import index_factory, IO_FLAG_MMAP, METRIC_INNER_PRODUCT, read_index, write_index
from faiss import index_binary_factory, read_index_binary, write_index_binary, IndexBinaryIDMap

from .base import ANN


class Faiss(ANN):
    """
    Builds an ANN index using the Faiss library.
    """

    def __init__(self, config):
        super().__init__(config)

        # Scalar quantization
        quantize = self.config.get("quantize")
        self.qbits = quantize if quantize and isinstance(quantize, int) and not isinstance(quantize, bool) else None

    def load(self, path):
        # Get read function
        readindex = read_index_binary if self.qbits else read_index

        # Load index
        self.backend = readindex(path, IO_FLAG_MMAP if self.setting("mmap") is True else 0)

    def index(self, embeddings):
        # Compute model training size
        train, sample = embeddings, self.setting("sample")
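        # When set, sample is a fraction (0 - 1) of the input rows to use for model training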
        if sample:
            # Get sample for training
            rng = np.random.default_rng(0)
            indices = sorted(rng.choice(train.shape[0], int(sample * train.shape[0]), replace=False, shuffle=False))
            train = train[indices]

        # Configure embeddings index. Inner product is equal to cosine similarity on normalized vectors.
        params = self.configure(embeddings.shape[0], train.shape[0])

        # Create index
        self.backend = self.create(embeddings, params)

        # Train model
        self.backend.train(train)

        # Add embeddings - position in embeddings is used as the id
        self.backend.add_with_ids(embeddings, np.arange(embeddings.shape[0], dtype=np.int64))

        # Add id offset and index build metadata
        self.config["offset"] = embeddings.shape[0]
        self.metadata({"components": params})

    def append(self, embeddings):
        new = embeddings.shape[0]

        # Append new ids - position in embeddings + existing offset is used as the id
        self.backend.add_with_ids(embeddings, np.arange(self.config["offset"], self.config["offset"] + new, dtype=np.int64))

        # Update id offset and index metadata
        self.config["offset"] += new
        self.metadata()

    def delete(self, ids):
        # Remove specified ids
        self.backend.remove_ids(np.array(ids, dtype=np.int64))

    def search(self, queries, limit):
        # Set nprobe and nflip search parameters
        self.backend.nprobe = self.nprobe()
        self.backend.nflip = self.setting("nflip", self.backend.nprobe)

        # Run the query
        scores, ids = self.backend.search(queries, limit)

        # Map results to [(id, score)]
        results = []
        for x, score in enumerate(scores):
            # Transform scores
            score = [1.0 - (x / (self.config["dimensions"] * 8)) for x in score.tolist()] if self.qbits else score.tolist()
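            # Note: binary indexes return Hamming distances, so the value is divided by the
            # total number of bits (dimensions * 8) and inverted to produce a similarity in [0, 1]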

            # Add results
            results.append(list(zip(ids[x].tolist(), score)))

        return results

    def count(self):
        return self.backend.ntotal

    def save(self, path):
        # Get write function
        writeindex = write_index_binary if self.qbits else write_index

        # Write index
        writeindex(self.backend, path)

    def configure(self, count, train):
        """
        Configures settings for a new index.

        Args:
            count: initial number of embeddings rows
            train: number of rows selected for model training

        Returns:
            user-specified or generated components setting
        """

        # Lookup components setting
        components = self.setting("components")

        if components:
            # Format and return components string
            return self.components(components, train)

        # Derive quantization. Prefer backend-specific setting. Fallback to root-level parameter.
        quantize = self.setting("quantize", self.config.get("quantize"))
        quantize = 8 if isinstance(quantize, bool) else quantize

        # Get storage setting
        storage = f"SQ{quantize}" if quantize else "Flat"

        # Small index, use storage directly with IDMap
        if count <= 5000:
            return "BFlat" if self.qbits else f"IDMap,{storage}"

        x = self.cells(train)
        components = f"BIVF{x}" if self.qbits else f"IVF{x},{storage}"
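        # Examples of derived settings: "IDMap,Flat" or "IDMap,SQ8" for small float indexes,
        # "IVF1024,Flat" / "IVF1024,SQ8" for larger ones, "BFlat" / "BIVF1024" for binary indexes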

        return components

    def create(self, embeddings, params):
        """
        Creates a new index.

        Args:
            embeddings: embeddings to index
            params: index parameters

        Returns:
            new index
        """

        # Create binary index
        if self.qbits:
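            # Binary embeddings are packed into uint8 arrays, so shape[1] is a byte count and
            # the binary factory expects the dimension in bits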
            index = index_binary_factory(embeddings.shape[1] * 8, params)

            # Wrap with BinaryIDMap, if necessary
            if any(x in params for x in ["BFlat", "BHNSW"]):
                index = IndexBinaryIDMap(index)

            return index

        # Create standard float index
        return index_factory(embeddings.shape[1], params, METRIC_INNER_PRODUCT)

    def cells(self, count):
        """
        Calculates the number of IVF cells for an IVF index.

        Args:
            count: number of embeddings rows

        Returns:
            number of IVF cells
        """

        # Calculate number of IVF cells where x = min(4 * sqrt(embeddings count), embeddings count / 39)
        # Faiss requires at least 39 * x data points
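        # For example, with 100,000 training rows: min(round(4 * sqrt(100000)), int(100000 / 39)) = min(1265, 2564) = 1265 cells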
        return max(min(round(4 * math.sqrt(count)), int(count / 39)), 1)

    def components(self, components, train):
        """
        Formats a components string. This method automatically calculates the optimal number of IVF cells, if omitted.

        Args:
            components: input components string
            train: number of rows selected for model training

        Returns:
            formatted components string
        """

        # Optimal number of IVF cells
        x = self.cells(train)

        # Add number of IVF cells, if missing
        components = [f"IVF{x}" if component == "IVF" else component for component in components.split(",")]

        # Return components string
        return ",".join(components)

    def nprobe(self):
        """
        Gets or derives the nprobe search parameter.

        Returns:
            nprobe setting
        """

        # Get size of embeddings index
        count = self.count()

        default = 6 if count <= 5000 else round(self.cells(count) / 16)
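        # For example, an index with 100,000 rows has 1265 IVF cells, giving a default nprobe of round(1265 / 16) = 79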
        return self.setting("nprobe", default)
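
Usage sketch (not part of the source file above). The import path follows the file location in the header; the config keys ("backend", "dimensions") and the direct construction of the class are assumptions based on how this module reads its configuration, since txtai normally builds this backend through its Embeddings pipeline rather than directly.

import numpy as np

from txtai.ann.faiss import Faiss

# Normalize vectors so that inner product equals cosine similarity
data = np.random.rand(1000, 64)
data = (data / np.linalg.norm(data, axis=1, keepdims=True)).astype(np.float32)

# Small row counts (<= 5000) produce an "IDMap,Flat" index
ann = Faiss({"backend": "faiss", "dimensions": 64})
ann.index(data)

# Top 3 matches for the first vector, returned as [[(id, score), ...]]
print(ann.search(data[:1], 3))

# Persist and reload the index (hypothetical path)
ann.save("index.faiss")
ann.load("index.faiss")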