neuml / txtai · build 11062719915 (push, github)

27 Sep 2024 01:06AM UTC · coverage: 99.946% (remained the same)

davidmezzetti: Update documentation

7406 of 7410 relevant lines covered (99.95%) · 1.0 hits per line

Source File

/src/python/txtai/ann/faiss.py (100.0% covered)
"""
Faiss module
"""

import math

import numpy as np

from faiss import index_factory, IO_FLAG_MMAP, METRIC_INNER_PRODUCT, read_index, write_index
from faiss import index_binary_factory, read_index_binary, write_index_binary, IndexBinaryIDMap

from .base import ANN


class Faiss(ANN):
    """
    Builds an ANN index using the Faiss library.
    """

    def __init__(self, config):
        super().__init__(config)

        # Scalar quantization
        quantize = self.config.get("quantize")
        self.qbits = quantize if quantize and isinstance(quantize, int) and not isinstance(quantize, bool) else None

    def load(self, path):
        # Get read function
        readindex = read_index_binary if self.qbits else read_index

        # Load index
        self.backend = readindex(path, IO_FLAG_MMAP if self.setting("mmap") is True else 0)

    def index(self, embeddings):
        # Compute model training size
        train, sample = embeddings, self.setting("sample")
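        # When set, sample is a fraction (0 - 1) of the input rows to use for model training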
        if sample:
            # Get sample for training
            rng = np.random.default_rng(0)
            indices = sorted(rng.choice(train.shape[0], int(sample * train.shape[0]), replace=False, shuffle=False))
            train = train[indices]

        # Configure embeddings index. Inner product is equal to cosine similarity on normalized vectors.
        params = self.configure(embeddings.shape[0], train.shape[0])

        # Create index
        self.backend = self.create(embeddings, params)

        # Train model
        self.backend.train(train)

        # Add embeddings - position in embeddings is used as the id
        self.backend.add_with_ids(embeddings, np.arange(embeddings.shape[0], dtype=np.int64))

        # Add id offset and index build metadata
        self.config["offset"] = embeddings.shape[0]
        self.metadata({"components": params})

    def append(self, embeddings):
        new = embeddings.shape[0]

        # Append new ids - position in embeddings + existing offset is used as the id
        self.backend.add_with_ids(embeddings, np.arange(self.config["offset"], self.config["offset"] + new, dtype=np.int64))

        # Update id offset and index metadata
        self.config["offset"] += new
        self.metadata()

    def delete(self, ids):
        # Remove specified ids
        self.backend.remove_ids(np.array(ids, dtype=np.int64))

    def search(self, queries, limit):
        # Set nprobe and nflip search parameters
        self.backend.nprobe = self.nprobe()
        self.backend.nflip = self.setting("nflip", self.backend.nprobe)

        # Run the query
        scores, ids = self.backend.search(queries, limit)

        # Map results to [(id, score)]
        results = []
        for x, score in enumerate(scores):
            # Transform scores
            score = [1.0 - (x / (self.config["dimensions"] * 8)) for x in score.tolist()] if self.qbits else score.tolist()
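            # Note: binary indexes return Hamming distances, so the value is divided by the
            # total number of bits (dimensions * 8) and inverted to produce a similarity in [0, 1]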

            # Add results
            results.append(list(zip(ids[x].tolist(), score)))

        return results

    def count(self):
        return self.backend.ntotal

    def save(self, path):
        # Get write function
        writeindex = write_index_binary if self.qbits else write_index

        # Write index
        writeindex(self.backend, path)

    def configure(self, count, train):
        """
        Configures settings for a new index.

        Args:
            count: initial number of embeddings rows
            train: number of rows selected for model training

        Returns:
            user-specified or generated components setting
        """

        # Lookup components setting
        components = self.setting("components")

        if components:
            # Format and return components string
            return self.components(components, train)

        # Derive quantization. Prefer backend-specific setting. Fallback to root-level parameter.
        quantize = self.setting("quantize", self.config.get("quantize"))
        quantize = 8 if isinstance(quantize, bool) else quantize

        # Get storage setting
        storage = f"SQ{quantize}" if quantize else "Flat"

        # Small index, use storage directly with IDMap
        if count <= 5000:
            return "BFlat" if self.qbits else f"IDMap,{storage}"

        x = self.cells(train)
        components = f"BIVF{x}" if self.qbits else f"IVF{x},{storage}"
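        # Examples of derived settings: "IDMap,Flat" or "IDMap,SQ8" for small float indexes,
        # "IVF1024,Flat" / "IVF1024,SQ8" for larger ones, "BFlat" / "BIVF1024" for binary indexes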

        return components

    def create(self, embeddings, params):
        """
        Creates a new index.

        Args:
            embeddings: embeddings to index
            params: index parameters

        Returns:
            new index
        """

        # Create binary index
        if self.qbits:
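            # Binary embeddings are packed into uint8 arrays, so shape[1] is a byte count and
            # the binary factory expects the dimension in bits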
            index = index_binary_factory(embeddings.shape[1] * 8, params)

            # Wrap with BinaryIDMap, if necessary
            if any(x in params for x in ["BFlat", "BHNSW"]):
                index = IndexBinaryIDMap(index)

            return index

        # Create standard float index
        return index_factory(embeddings.shape[1], params, METRIC_INNER_PRODUCT)

    def cells(self, count):
        """
        Calculates the number of IVF cells for an IVF index.

        Args:
            count: number of embeddings rows

        Returns:
            number of IVF cells
        """

        # Calculate number of IVF cells where x = min(4 * sqrt(embeddings count), embeddings count / 39)
        # Faiss requires at least 39 * x data points
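        # For example, with 100,000 training rows: min(round(4 * sqrt(100000)), int(100000 / 39)) = min(1265, 2564) = 1265 cells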
        return max(min(round(4 * math.sqrt(count)), int(count / 39)), 1)

    def components(self, components, train):
        """
        Formats a components string. This method automatically calculates the optimal number of IVF cells, if omitted.

        Args:
            components: input components string
            train: number of rows selected for model training

        Returns:
            formatted components string
        """

        # Optimal number of IVF cells
        x = self.cells(train)

        # Add number of IVF cells, if missing
        components = [f"IVF{x}" if component == "IVF" else component for component in components.split(",")]

        # Return components string
        return ",".join(components)

    def nprobe(self):
        """
        Gets or derives the nprobe search parameter.

        Returns:
            nprobe setting
        """

        # Get size of embeddings index
        count = self.count()

        default = 6 if count <= 5000 else round(self.cells(count) / 16)
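        # For example, an index with 100,000 rows has 1265 IVF cells, giving a default nprobe of round(1265 / 16) = 79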
        return self.setting("nprobe", default)
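
Usage sketch (not part of the source file above). The import path follows the file location in the header; the config keys ("backend", "dimensions") and the direct construction of the class are assumptions based on how this module reads its configuration, since txtai normally builds this backend through its Embeddings pipeline rather than directly.

import numpy as np

from txtai.ann.faiss import Faiss

# Normalize vectors so that inner product equals cosine similarity
data = np.random.rand(1000, 64)
data = (data / np.linalg.norm(data, axis=1, keepdims=True)).astype(np.float32)

# Small row counts (<= 5000) produce an "IDMap,Flat" index
ann = Faiss({"backend": "faiss", "dimensions": 64})
ann.index(data)

# Top 3 matches for the first vector, returned as [[(id, score), ...]]
print(ann.search(data[:1], 3))

# Persist and reload the index (hypothetical path)
ann.save("index.faiss")
ann.load("index.faiss")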