• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

anthonypdawson / vector-inspector / 22355054572

24 Feb 2026 02:25PM UTC coverage: 59.91% (+2.7%) from 57.258%
22355054572

Pull #20

github

anthonypdawson
feat: update release notes for version 0.5.0 with highlights, improved LanceDB reliability, new vector distributions view, better sample data controls, connection manager enhancements, and increased test coverage
Pull Request #20: Version 0.5.0 - Add Histogram visualization, update sample data generation, connection info panel, tests

321 of 545 new or added lines in 13 files covered. (58.9%)

12 existing lines in 5 files now uncovered.

8294 of 13844 relevant lines covered (59.91%)

0.6 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

46.48
/src/vector_inspector/core/sample_data/text_generator.py
1
"""Text-based sample data generator for testing vector databases."""
2

3
import random
1✔
4
from enum import Enum
1✔
5
from typing import Any
1✔
6

7

8
class SampleDataType(Enum):
    """Enumerates the sample-data formats that can be generated.

    Each member's value is the lowercase form of its name, so a member can
    be looked up from that string, e.g. ``SampleDataType("json")``.
    """

    # Plain sentence text.
    TEXT = "text"
    # Markdown-formatted text.
    MARKDOWN = "markdown"
    # Text rendering of structured (JSON-like) fields.
    JSON = "json"
14

15

16
# Sample text corpora for generating realistic-looking data.
#
# TOPICS: subject phrases. The generators call ``.capitalize()`` on a topic
# when it opens a sentence and store the phrase unchanged in the sample's
# metadata under "topic".
TOPICS = [
    "artificial intelligence",
    "machine learning",
    "natural language processing",
    "computer vision",
    "robotics",
    "data science",
    "cybersecurity",
    "cloud computing",
    "quantum computing",
    "blockchain",
    "internet of things",
    "augmented reality",
    "virtual reality",
    "edge computing",
    "5G networks",
]
34

35
# SENTENCES: predicate fragments that complete "<Topic> ..." (or "It ..."
# for a follow-up sentence). Entries carry no leading capital and no
# trailing period; the generators add the subject and the final ".".
SENTENCES = [
    # Mixed verb forms.
    "is transforming the way we work and live",
    "has seen rapid advancement in recent years",
    "continues to evolve at an unprecedented pace",
    "offers new opportunities for innovation",
    "presents both challenges and opportunities",
    "is reshaping multiple industries simultaneously",
    "requires careful consideration of ethical implications",
    "has become increasingly accessible to developers",
    "demonstrates remarkable potential for growth",
    "remains an active area of research and development",
    "integrates seamlessly with existing technologies",
    "enables new forms of human-computer interaction",
    "provides solutions to complex real-world problems",
    "has attracted significant investment and attention",
    "will likely define the future of technology",
    # "is <verb>ing ..." forms.
    "is driving productivity gains across sectors",
    "is unlocking new abilities for small teams",
    "is improving decision-making through better insights",
    "is increasingly being adopted in production systems",
    "is enabling developers to build smarter applications",
    "is creating new roles and job functions",
    "is challenging existing regulatory frameworks",
    "is lowering the barrier to entry for innovation",
    "is powering breakthroughs in data-driven research",
    "is helping automate repetitive tasks effectively",
    "is evolving alongside improvements in hardware",
    "is prompting a rethinking of traditional workflows",
    "is bridging gaps between disciplines and teams",
    "is influencing curriculum and education trends",
    "is fostering research collaborations worldwide",
]
67

68

69
# MARKDOWN_SECTIONS: (heading, body sentence) pairs. The markdown generator
# renders the heading as "## <heading>" and follows it with the body.
MARKDOWN_SECTIONS = [
    (
        "Introduction",
        "This section provides an overview of the topic and its significance in modern technology.",
    ),
    (
        "Key Concepts",
        "Understanding the fundamental principles is essential for grasping the broader implications.",
    ),
    (
        "Applications",
        "Practical applications span numerous industries including healthcare, finance, and education.",
    ),
    (
        "Challenges",
        "Despite significant progress, several obstacles remain to be addressed.",
    ),
    (
        "Future Directions",
        "Ongoing research continues to push the boundaries of what's possible.",
    ),
    (
        "Best Practices",
        "Following established guidelines helps ensure successful implementation.",
    ),
    (
        "Case Studies",
        "Real-world examples demonstrate the practical value of these technologies.",
    ),
    (
        "Tools and Frameworks",
        "A variety of platforms and libraries facilitate development and deployment.",
    ),
    (
        "Performance Metrics",
        "Measuring success requires appropriate benchmarks and evaluation criteria.",
    ),
    (
        "Conclusion",
        "The field continues to evolve with promising developments on the horizon.",
    ),
]
96

97
# JSON_TITLES: document titles for the JSON-style samples; the generator
# cycles through them by item index.
JSON_TITLES = [
    "Getting Started Guide",
    "Advanced Techniques",
    "Performance Optimization",
    "Security Best Practices",
    "Architecture Overview",
    "API Reference",
    "Troubleshooting Common Issues",
    "Integration Patterns",
    "Design Principles",
    "Deployment Strategies",
    "Monitoring and Observability",
    "Scaling Considerations",
    "Data Management",
    "Testing Approaches",
    "Version Migration Guide",
]
114

115
# JSON_DESCRIPTIONS: one-sentence summaries for the JSON-style samples; the
# generator cycles through them by item index.
JSON_DESCRIPTIONS = [
    "A comprehensive introduction to fundamental concepts and techniques.",
    "Deep dive into advanced methodologies and optimization strategies.",
    "Practical guide for improving system performance and efficiency.",
    "Essential practices for maintaining security and data protection.",
    "Detailed overview of system architecture and component interactions.",
    "Complete reference for available APIs and integration methods.",
    "Solutions to frequently encountered problems and error messages.",
    "Common patterns for integrating with external systems and services.",
    "Core principles guiding system design and implementation decisions.",
    "Strategies for deploying applications across different environments.",
    "Guidelines for effective monitoring, logging, and observability.",
    "Approaches to scaling systems to handle increased load and data.",
    "Best practices for data modeling, storage, and retrieval.",
    "Comprehensive testing strategies including unit, integration, and E2E tests.",
    "Step-by-step instructions for migrating between major versions.",
]
132

133

134
def generate_sample_data(
    count: int, data_type: SampleDataType = SampleDataType.TEXT, randomize: bool = True
) -> list[dict[str, Any]]:
    """Build a list of sample records in the requested format.

    Args:
        count: How many records to produce.
        data_type: Which format to produce. Either a ``SampleDataType``
            member or its string value ("text", "markdown", "json").
        randomize: Forwarded unchanged to the selected generator, which uses
            it to choose between random and index-based content selection.

    Returns:
        A list of dictionaries, each with 'text' and 'metadata' keys.

    Raises:
        ValueError: If ``data_type`` is a string that is not a valid
            ``SampleDataType`` value, or is not one of the known members.
    """
    # Accept the plain string form by converting it to the enum first.
    kind = SampleDataType(data_type) if isinstance(data_type, str) else data_type

    # Checked in order; the first format that matches handles the request.
    builders = (
        (SampleDataType.TEXT, _generate_text_samples),
        (SampleDataType.MARKDOWN, _generate_markdown_samples),
        (SampleDataType.JSON, _generate_json_samples),
    )
    for member, builder in builders:
        if kind == member:
            return builder(count, randomize=randomize)

    raise ValueError(f"Unknown data type: {kind}")
156

157

158
def _generate_text_samples(count: int, randomize: bool = True) -> list[dict[str, Any]]:
    """Produce ``count`` plain-text records of one or two sentences each.

    With ``randomize`` true, the topic, the sentence and the decision to add
    a second sentence come from the ``random`` module. Otherwise they are
    derived from the item index.

    Returns:
        A list of dictionaries with 'text' and 'metadata' keys.
    """
    results = []
    topic_total = len(TOPICS)
    sentence_total = len(SENTENCES)

    for idx in range(count):
        if not randomize:
            subject = TOPICS[idx % topic_total]
            predicate = SENTENCES[idx % sentence_total]
            # Three items out of every ten get a follow-up sentence.
            wants_follow_up = idx % 10 < 3
        else:
            subject = random.choice(TOPICS)
            predicate = random.choice(SENTENCES)
            # Roughly 30% of items get a follow-up sentence.
            wants_follow_up = random.random() < 0.3

        pieces = [f"{subject.capitalize()} {predicate}."]

        if wants_follow_up:
            if randomize:
                follow_up = random.choice(SENTENCES)
            else:
                # Offset by one so the follow-up differs from the first sentence.
                follow_up = SENTENCES[(idx + 1) % sentence_total]
            pieces.append(f"It {follow_up}.")

        record = {
            "text": " ".join(pieces),
            "metadata": {"source": "sample", "type": "text", "index": idx, "topic": subject},
        }
        results.append(record)

    return results
185

186

187
def _generate_markdown_samples(count: int, randomize: bool = True) -> list[dict[str, Any]]:
    """Produce ``count`` markdown records, each under a level-two heading.

    Headings and their body sentence always cycle through
    ``MARKDOWN_SECTIONS`` by index. The appended topic sentence and the
    optional bullet list are random when ``randomize`` is true and
    index-derived otherwise.

    Returns:
        A list of dictionaries with 'text' and 'metadata' keys.
    """
    bullet_list = "\n\n- Key point one\n- Key point two\n- Key point three"
    results = []

    for idx in range(count):
        # The section heading doubles as the sample's title.
        heading, blurb = MARKDOWN_SECTIONS[idx % len(MARKDOWN_SECTIONS)]

        # Pick the topic-specific sentence and decide on the bullet list.
        if not randomize:
            subject = TOPICS[idx % len(TOPICS)]
            predicate = SENTENCES[idx % len(SENTENCES)]
            with_bullets = idx % 10 < 3
        else:
            subject = random.choice(TOPICS)
            predicate = random.choice(SENTENCES)
            with_bullets = random.random() < 0.3

        topic_sentence = f"{subject.capitalize()} {predicate}."
        document = f"## {heading}\n\n{blurb} {topic_sentence}"
        if with_bullets:
            document = document + bullet_list

        details = {
            "source": "sample",
            "type": "markdown",
            "index": idx,
            "section": heading,
            "topic": subject,
        }
        results.append({"text": document, "metadata": details})

    return results
227

228

229
def _generate_json_samples(count: int, randomize: bool = True) -> list[dict[str, Any]]:
    """Generate JSON-like structured samples.

    Each sample's ``text`` is a plain-text rendering of structured fields
    ("Title", "Description", "Topic" and, for some items, "Tags"); it is not
    serialized JSON. Titles and descriptions always cycle by item index.

    Args:
        count: Number of samples to generate.
        randomize: If True, the topic, the presence and choice of tags, and
            the category are drawn with the ``random`` module. If False,
            every field is derived from the item index, so repeated calls
            with the same ``count`` return identical output.

    Returns:
        List of dictionaries with 'text' and 'metadata' keys.
    """
    categories = ["tutorial", "reference", "guide", "documentation"]
    # Never ask for more tags than there are topics to choose from.
    tag_count = min(3, len(TOPICS))
    samples = []

    for i in range(count):
        title = JSON_TITLES[i % len(JSON_TITLES)]
        description = JSON_DESCRIPTIONS[i % len(JSON_DESCRIPTIONS)]
        topic = random.choice(TOPICS) if randomize else TOPICS[i % len(TOPICS)]

        # Create a text representation of structured data
        text = f"Title: {title}\n\nDescription: {description}\n\nTopic: {topic.capitalize()}"

        # Tags appear on about half the items (every even index when not randomized).
        add_tags = random.random() < 0.5 if randomize else i % 2 == 0
        if add_tags:
            if randomize:
                tags = random.sample(TOPICS, k=tag_count)
            else:
                # Deterministic tag selection: consecutive topics starting at i.
                tags = [TOPICS[(i + j) % len(TOPICS)] for j in range(tag_count)]
            text += f"\n\nTags: {', '.join(tags)}"

        # The category must honor ``randomize`` as well; an unconditional
        # random draw here would make randomize=False output unrepeatable.
        if randomize:
            category = random.choice(categories)
        else:
            category = categories[i % len(categories)]

        samples.append(
            {
                "text": text,
                "metadata": {
                    "source": "sample",
                    "type": "json",
                    "index": i,
                    "title": title,
                    "topic": topic,
                    "category": category,
                },
            }
        )

    return samples
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc