• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

anthonypdawson / vector-inspector / 24101498785

07 Apr 2026 07:56PM UTC coverage: 80.257% (-0.6%) from 80.844%
24101498785

push

github

web-flow
0.7.0 - Image and text file Ingestion, Document preview (#29)

* feat: Add ingestion dialog and functionality for images and documents

- Implemented IngestionDialog for configuring image and document ingestion pipelines.
- Added signals in MetadataActionButtons for requesting image and document ingestion.
- Enhanced MetadataView to handle image and document ingestion, including validation and background processing.
- Introduced utility functions for file type detection and preview loading.
- Developed tests for FileIngestionService covering image and document ingestion scenarios, including edge cases.
- Added support for re-ingesting files from the metadata table context menu.

* feat: Update dependencies to include torch, transformers, Pillow, pypdf, and python-docx

* feat: Enhance file preview functionality in UI components and metadata table

* feat: Add comprehensive tests for file preview functionality across various components

* feat: Implement JSON-safe conversion for metadata and enhance item detail displays

* feat: Enhance file ingestion with telemetry events and progress updates; add import actions to the connection menu

* feat: Move import actions to Tools menu and update related tests

* feat: Add telemetry events for app launch, CLI first use, and update actions for full transparency

* Update src/vector_inspector/ui/views/metadata/metadata_table.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update src/vector_inspector/services/file_ingestion_service.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/utils/test_json_safe.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* chore: Bump version to 0.7.0 in pyproject.toml and __init__.py

* feat: Enhance file preview functionality and logging in metadata views

* feat: Enhance DimensionAwareEmbeddingFunction to support explicit model selection

* feat: Enhance error logging in SearchThread for imp... (continued)

753 of 1090 new or added lines in 18 files covered. (69.08%)

1 existing line in 1 file now uncovered.

14435 of 17986 relevant lines covered (80.26%)

0.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.12
/src/vector_inspector/core/embedding_utils.py
1
"""Utilities for managing embedding models and vector dimensions."""
2

3
from __future__ import annotations  # Allows us to use class names in typehints while lazyloading
1✔
4

5
from typing import Any
1✔
6

7
# Lazy import: see below
8
from vector_inspector.core.logging import log_info
1✔
9
from vector_inspector.core.model_registry import get_model_registry
1✔
10

11
# Default model to use when dimension is unknown or not mapped
12
# Layout is (model_name, model_type); get_model_for_dimension returns this pair when no registry model matches.
DEFAULT_MODEL: tuple[str, str] = ("all-MiniLM-L6-v2", "sentence-transformer")
1✔
13

14

15
def _get_dimension_to_model_dict():
    """Snapshot the model registry as a plain dimension -> models mapping.

    Returns:
        Dict keyed by every dimension the registry reports; each value is a
        list of (name, type, description) tuples, one per model registered
        for that dimension.
    """
    registry = get_model_registry()
    return {
        dim: [(entry.name, entry.type, entry.description) for entry in registry.get_models_by_dimension(dim)]
        for dim in registry.get_all_dimensions()
    }
1✔
29

30

31
# For backward compatibility - built from the registry once, when this module is imported
32
# Maps each registry dimension to a list of (name, type, description) tuples.
DIMENSION_TO_MODEL: dict = _get_dimension_to_model_dict()
1✔
33

34

35
def get_model_for_dimension(dimension: int, prefer_multimodal: bool = True) -> tuple[str, str]:
    """
    Pick an embedding model (name and type) suited to a vector dimension.

    The registry is asked for models matching ``dimension`` exactly; when it
    has none, the registry's closest known dimension is tried instead. If that
    also yields nothing, ``DEFAULT_MODEL`` is returned.

    Args:
        dimension: The vector dimension size
        prefer_multimodal: When several models share the dimension, return the
                          first one whose modality is "multimodal" or whose
                          type is "clip"; otherwise (or if none qualifies) the
                          first registered model wins

    Returns:
        Tuple of (model_name, model_type)
    """
    registry = get_model_registry()
    candidates = registry.get_models_by_dimension(dimension)

    if not candidates:
        # No exact match: fall back to the nearest dimension the registry knows.
        nearest = registry.find_closest_dimension(dimension)
        if nearest:
            candidates = registry.get_models_by_dimension(nearest)

    if not candidates:
        return DEFAULT_MODEL

    chosen = candidates[0]
    # The preference only matters when there is an actual choice to make.
    if prefer_multimodal and len(candidates) > 1:
        for candidate in candidates:
            if candidate.modality == "multimodal" or candidate.type == "clip":
                chosen = candidate
                break

    return (chosen.name, chosen.type)
1✔
71

72

73
def get_available_models_for_dimension(dimension: int) -> list:
    """
    List every model option known for a given dimension.

    Registry models come first, followed by any custom models stored in the
    settings service; custom entries get " (custom)" appended to their
    description.

    Args:
        dimension: The vector dimension size

    Returns:
        List of tuples: [(model_name, model_type, description), ...]
    """
    options = [
        (entry.name, entry.type, entry.description)
        for entry in get_model_registry().get_models_by_dimension(dimension)
    ]

    # Custom models are best-effort: any failure is logged and whatever was
    # collected up to that point is still returned.
    try:
        from vector_inspector.services.settings_service import SettingsService

        for custom in SettingsService().get_custom_embedding_models(dimension):
            options.append((custom["name"], custom["type"], f"{custom['description']} (custom)"))
    except Exception as e:
        log_info("Warning: Could not load custom models: %s", e)

    return options
1✔
103

104

105
def load_embedding_model(model_name: str, model_type: str) -> SentenceTransformer | Any:
    """
    Load an embedding model of the requested type.

    For ``model_type == "clip"`` the call is forwarded to
    ``get_clip_model_and_processor`` and its result is returned unchanged.
    Any other type is loaded as a SentenceTransformer: first from the cached
    path if one exists, otherwise by model name, after which the model is
    saved to the cache when caching is enabled.

    Args:
        model_name: Name of the model to load
        model_type: Type of model ("sentence-transformer" or "clip")

    Returns:
        A SentenceTransformer instance, or for "clip" whatever
        ``get_clip_model_and_processor`` returns (presumably a
        (model, processor) pair, since encode_text unpacks it that way)
    """
    from vector_inspector.core.model_cache import (
        is_cache_enabled,
        load_cached_path,
        save_model_to_cache,
    )

    # Cache lookup happens up front, before the model type is inspected.
    cache_location = load_cached_path(model_name)

    if model_type == "clip":
        # CLIP loading goes through the shared in-memory cache in lazy_imports
        # so ingestion and search receive the same model object and loads do
        # not race across QThreads (racing loads can corrupt torch_cpu.dll
        # native state and crash silently with an access violation).  No disk
        # cache save here: the in-memory cache already covers in-process reuse.
        from vector_inspector.utils.lazy_imports import get_clip_model_and_processor

        return get_clip_model_and_processor(model_name)

    from sentence_transformers import SentenceTransformer

    if cache_location:
        try:
            cached_model = SentenceTransformer(str(cache_location))
            log_info(f"Loaded sentence-transformer from cache: {model_name}")
            return cached_model
        except Exception as e:
            # A broken cache entry is not fatal; fall through to a fresh load.
            log_info(f"Failed to load from cache, downloading: {e}")

    # Load by name (from HuggingFace), then store for later runs.
    fresh_model = SentenceTransformer(model_name)
    if is_cache_enabled():
        save_model_to_cache(fresh_model, model_name, model_type)

    return fresh_model
1✔
157

158

159
def encode_text(text: str, model: SentenceTransformer | tuple, model_type: str) -> list:
    """
    Turn a piece of text into an embedding vector.

    Args:
        text: Text to encode
        model: The loaded model; a (clip_model, processor) pair when
               ``model_type`` is "clip", otherwise an object exposing
               ``encode(text)``
        model_type: Type of model ("sentence-transformer" or "clip")

    Returns:
        Embedding vector as a list; CLIP vectors are L2-normalized

    Raises:
        TypeError: If the CLIP model returns something that is neither a
            tensor nor an object carrying ``pooler_output`` /
            ``last_hidden_state``
    """
    if model_type != "clip":
        # Anything that is not CLIP is handled as a sentence-transformer.
        return model.encode(text).tolist()

    import torch

    clip_model, processor = model
    encoded = processor(text=[text], return_tensors="pt", padding=True)
    with torch.no_grad():
        features = clip_model.get_text_features(**encoded)

    # Some HuggingFace CLIP variants hand back a BaseModelOutputWithPooling
    # rather than a bare tensor; pull the pooled tensor out before normalizing.
    if not isinstance(features, torch.Tensor):
        pooled = getattr(features, "pooler_output", None)
        if pooled is not None:
            features = pooled
        elif hasattr(features, "last_hidden_state"):
            features = features.last_hidden_state[:, 0]
        else:
            raise TypeError(
                f"CLIP get_text_features returned unexpected type {type(features).__name__}; "
                "expected a Tensor or BaseModelOutputWithPooling"
            )

    # Scale to unit length along the last axis.
    unit = features / features.norm(dim=-1, keepdim=True)
    return unit[0].cpu().numpy().tolist()
1✔
198

199

200
def get_embedding_model_for_dimension(
    dimension: int,
) -> tuple[SentenceTransformer | tuple, str, str]:
    """
    Resolve and load the embedding model for a vector dimension.

    Selection is delegated to get_model_for_dimension (with its default
    preference) and loading to load_embedding_model.

    Args:
        dimension: The vector dimension size

    Returns:
        Tuple of (loaded_model, model_name, model_type)
    """
    name, kind = get_model_for_dimension(dimension)
    return (load_embedding_model(name, kind), name, kind)
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc