22355054572

Committed 24 Feb 2026 02:25PM UTC coverage: 59.91% (+2.7%) from 57.258%

Build # 22355054572

Build Type

Pull #20

github

Committed by

anthonypdawson

Commit Message

feat: update release notes for version 0.5.0 with highlights, improved LanceDB reliability, new vector distributions view, better sample data controls, connection manager enhancements, and increased test coverage

Pull Request #20: Version 0.5.0 - Add Histogram visualization, update sample data generation, connection info panel, tests

Coverage Stats

321 of 545 new or added lines in 13 files covered. (58.9%)

12 existing lines in 5 files now uncovered.

8294 of 13844 relevant lines covered (59.91%)

0.6 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

46.48

/src/vector_inspector/core/sample_data/text_generator.py

"""Text-based sample data generator for testing vector databases."""

import random
from enum import Enum
from typing import Any


class SampleDataType(Enum):
    """Kinds of sample data that ``generate_sample_data`` can produce.

    Each member's value is the lower-case string form of its name.
    ``generate_sample_data`` accepts either a member or that string value.
    """

    TEXT = "text"  # plain text, one or two sentences per sample
    MARKDOWN = "markdown"  # a "## heading" followed by a paragraph
    JSON = "json"  # a "Title / Description / Topic" text block


# Sample text corpora for generating realistic-looking data.
#
# TOPICS holds the subject phrases the generators pick from. The chosen phrase
# is also stored unchanged in each sample's metadata under "topic", and the
# JSON generator draws its "Tags" line from this same list.
TOPICS = [
    "artificial intelligence",
    "machine learning",
    "natural language processing",
    "computer vision",
    "robotics",
    "data science",
    "cybersecurity",
    "cloud computing",
    "quantum computing",
    "blockchain",
    "internet of things",
    "augmented reality",
    "virtual reality",
    "edge computing",
    "5G networks",
]

# Predicate fragments that are appended after a topic to form a sentence.
# Entries carry no leading capital and no trailing period: the generators add
# the period themselves and prefix "It " when a fragment is used as a second
# sentence.
SENTENCES = [
    "is transforming the way we work and live",
    "has seen rapid advancement in recent years",
    "continues to evolve at an unprecedented pace",
    "offers new opportunities for innovation",
    "presents both challenges and opportunities",
    "is reshaping multiple industries simultaneously",
    "requires careful consideration of ethical implications",
    "has become increasingly accessible to developers",
    "demonstrates remarkable potential for growth",
    "remains an active area of research and development",
    "integrates seamlessly with existing technologies",
    "enables new forms of human-computer interaction",
    "provides solutions to complex real-world problems",
    "has attracted significant investment and attention",
    "will likely define the future of technology",
    "is driving productivity gains across sectors",
    "is unlocking new abilities for small teams",
    "is improving decision-making through better insights",
    "is increasingly being adopted in production systems",
    "is enabling developers to build smarter applications",
    "is creating new roles and job functions",
    "is challenging existing regulatory frameworks",
    "is lowering the barrier to entry for innovation",
    "is powering breakthroughs in data-driven research",
    "is helping automate repetitive tasks effectively",
    "is evolving alongside improvements in hardware",
    "is prompting a rethinking of traditional workflows",
    "is bridging gaps between disciplines and teams",
    "is influencing curriculum and education trends",
    "is fostering research collaborations worldwide",
]


# (heading, body) pairs used by the markdown generator. The heading is rendered
# as a level-2 markdown heading ("## ...") and the body as the paragraph under
# it. Samples walk through this list in order, wrapping around at the end.
MARKDOWN_SECTIONS = [
    (
        "Introduction",
        "This section provides an overview of the topic and its significance in modern technology.",
    ),
    (
        "Key Concepts",
        "Understanding the fundamental principles is essential for grasping the broader implications.",
    ),
    (
        "Applications",
        "Practical applications span numerous industries including healthcare, finance, and education.",
    ),
    ("Challenges", "Despite significant progress, several obstacles remain to be addressed."),
    ("Future Directions", "Ongoing research continues to push the boundaries of what's possible."),
    ("Best Practices", "Following established guidelines helps ensure successful implementation."),
    ("Case Studies", "Real-world examples demonstrate the practical value of these technologies."),
    (
        "Tools and Frameworks",
        "A variety of platforms and libraries facilitate development and deployment.",
    ),
    (
        "Performance Metrics",
        "Measuring success requires appropriate benchmarks and evaluation criteria.",
    ),
    ("Conclusion", "The field continues to evolve with promising developments on the horizon."),
]

# Titles for the JSON-style samples. The generator indexes this list and
# JSON_DESCRIPTIONS with the same sample index, and both lists have the same
# number of entries, so title k is always paired with description k.
JSON_TITLES = [
    "Getting Started Guide",
    "Advanced Techniques",
    "Performance Optimization",
    "Security Best Practices",
    "Architecture Overview",
    "API Reference",
    "Troubleshooting Common Issues",
    "Integration Patterns",
    "Design Principles",
    "Deployment Strategies",
    "Monitoring and Observability",
    "Scaling Considerations",
    "Data Management",
    "Testing Approaches",
    "Version Migration Guide",
]

# One-sentence descriptions for the JSON-style samples, listed in the same
# order as JSON_TITLES (entry k describes title k). Keep the two lists the same
# length, otherwise the title/description pairing drifts once the index wraps.
JSON_DESCRIPTIONS = [
    "A comprehensive introduction to fundamental concepts and techniques.",
    "Deep dive into advanced methodologies and optimization strategies.",
    "Practical guide for improving system performance and efficiency.",
    "Essential practices for maintaining security and data protection.",
    "Detailed overview of system architecture and component interactions.",
    "Complete reference for available APIs and integration methods.",
    "Solutions to frequently encountered problems and error messages.",
    "Common patterns for integrating with external systems and services.",
    "Core principles guiding system design and implementation decisions.",
    "Strategies for deploying applications across different environments.",
    "Guidelines for effective monitoring, logging, and observability.",
    "Approaches to scaling systems to handle increased load and data.",
    "Best practices for data modeling, storage, and retrieval.",
    "Comprehensive testing strategies including unit, integration, and E2E tests.",
    "Step-by-step instructions for migrating between major versions.",
]


def generate_sample_data(
    count: int, data_type: SampleDataType = SampleDataType.TEXT, randomize: bool = True
) -> list[dict[str, Any]]:
    """Build ``count`` sample records of the requested kind.

    Args:
        count: Number of items to generate.
        data_type: Kind of data to generate. Either a ``SampleDataType`` member
            or its string value ("text", "markdown" or "json").
        randomize: Forwarded to the kind-specific generator; True asks for
            randomly drawn content, False for index-derived content.

    Returns:
        List of dictionaries with 'text' and 'metadata' keys.

    Raises:
        ValueError: If ``data_type`` is a string that is not a valid
            ``SampleDataType`` value, or is not a known member at all.
    """
    # Strings are converted up front so the comparisons below only ever see
    # enum members (or an unsupported object that falls through to the raise).
    requested = SampleDataType(data_type) if isinstance(data_type, str) else data_type

    builders = (
        (SampleDataType.TEXT, _generate_text_samples),
        (SampleDataType.MARKDOWN, _generate_markdown_samples),
        (SampleDataType.JSON, _generate_json_samples),
    )
    for kind, build in builders:
        if requested == kind:
            return build(count, randomize=randomize)

    raise ValueError(f"Unknown data type: {requested}")


def _generate_text_samples(count: int, randomize: bool = True) -> list[dict[str, Any]]:
    """Generate simple text samples."""
    samples = []

    for i in range(count):
        if randomize:
            topic = random.choice(TOPICS)
            sentence = random.choice(SENTENCES)
        else:
            topic = TOPICS[i % len(TOPICS)]
            sentence = SENTENCES[i % len(SENTENCES)]
        text = f"{topic.capitalize()} {sentence}."

        # Add some variety with occasional two-sentence entries
        add_second = random.random() < 0.3 if randomize else i % 10 < 3
        if add_second:
            second_sentence = random.choice(SENTENCES) if randomize else SENTENCES[(i + 1) % len(SENTENCES)]
            text += f" It {second_sentence}."

        samples.append(
            {
                "text": text,
                "metadata": {"source": "sample", "type": "text", "index": i, "topic": topic},
            }
        )

    return samples


def _generate_markdown_samples(count: int, randomize: bool = True) -> list[dict[str, Any]]:
    """Generate markdown formatted samples."""
    samples = []

    for i in range(count):
        # Use section headers as titles

        section_idx = i % len(MARKDOWN_SECTIONS)
        title, content = MARKDOWN_SECTIONS[section_idx]

        # Add a topic-specific sentence
        if randomize:
            topic = random.choice(TOPICS)
            sentence = random.choice(SENTENCES)
        else:
            topic = TOPICS[i % len(TOPICS)]
            sentence = SENTENCES[i % len(SENTENCES)]
        additional_content = f"{topic.capitalize()} {sentence}."

        markdown_text = f"## {title}\n\n{content} {additional_content}"

        # Occasionally add a list
        add_list = random.random() < 0.3 if randomize else i % 10 < 3
        if add_list:
            markdown_text += "\n\n- Key point one\n- Key point two\n- Key point three"

        samples.append(
            {
                "text": markdown_text,
                "metadata": {
                    "source": "sample",
                    "type": "markdown",
                    "index": i,
                    "section": title,
                    "topic": topic,
                },
            }
        )

    return samples


def _generate_json_samples(count: int, randomize: bool = True) -> list[dict[str, Any]]:
    """Generate JSON-like structured samples."""
    samples = []

    for i in range(count):
        title_idx = i % len(JSON_TITLES)
        desc_idx = i % len(JSON_DESCRIPTIONS)

        title = JSON_TITLES[title_idx]
        description = JSON_DESCRIPTIONS[desc_idx]
        topic = random.choice(TOPICS) if randomize else TOPICS[i % len(TOPICS)]

        # Create a text representation of structured data
        text = f"Title: {title}\n\nDescription: {description}\n\nTopic: {topic.capitalize()}"

        # Occasionally add tags
        add_tags = random.random() < 0.5 if randomize else i % 2 == 0
        if add_tags:
            if randomize:
                tags = random.sample(TOPICS, k=min(3, len(TOPICS)))
            else:
                # deterministic tag selection
                tags = [TOPICS[(i + j) % len(TOPICS)] for j in range(min(3, len(TOPICS)))]
            text += f"\n\nTags: {', '.join(tags)}"

        samples.append(
            {
                "text": text,
                "metadata": {
                    "source": "sample",
                    "type": "json",
                    "index": i,
                    "title": title,
                    "topic": topic,
                    "category": random.choice(["tutorial", "reference", "guide", "documentation"]),
                },
            }
        )

    return samples

1	"""Text-based sample data generator for testing vector databases."""
2
3	import random	1✔
4	from enum import Enum	1✔
5	from typing import Any	1✔
6
7
8	class SampleDataType(Enum):	1✔
9	"""Types of sample data that can be generated."""
10
11	TEXT = "text"	1✔
12	MARKDOWN = "markdown"	1✔
13	JSON = "json"	1✔
14
15
16	# Sample text corpora for generating realistic-looking data
17	TOPICS = [	1✔
18	"artificial intelligence",
19	"machine learning",
20	"natural language processing",
21	"computer vision",
22	"robotics",
23	"data science",
24	"cybersecurity",
25	"cloud computing",
26	"quantum computing",
27	"blockchain",
28	"internet of things",
29	"augmented reality",
30	"virtual reality",
31	"edge computing",
32	"5G networks",
33	]
34
35	SENTENCES = [	1✔
36	"is transforming the way we work and live",
37	"has seen rapid advancement in recent years",
38	"continues to evolve at an unprecedented pace",
39	"offers new opportunities for innovation",
40	"presents both challenges and opportunities",
41	"is reshaping multiple industries simultaneously",
42	"requires careful consideration of ethical implications",
43	"has become increasingly accessible to developers",
44	"demonstrates remarkable potential for growth",
45	"remains an active area of research and development",
46	"integrates seamlessly with existing technologies",
47	"enables new forms of human-computer interaction",
48	"provides solutions to complex real-world problems",
49	"has attracted significant investment and attention",
50	"will likely define the future of technology",
51	"is driving productivity gains across sectors",
52	"is unlocking new abilities for small teams",
53	"is improving decision-making through better insights",
54	"is increasingly being adopted in production systems",
55	"is enabling developers to build smarter applications",
56	"is creating new roles and job functions",
57	"is challenging existing regulatory frameworks",
58	"is lowering the barrier to entry for innovation",
59	"is powering breakthroughs in data-driven research",
60	"is helping automate repetitive tasks effectively",
61	"is evolving alongside improvements in hardware",
62	"is prompting a rethinking of traditional workflows",
63	"is bridging gaps between disciplines and teams",
64	"is influencing curriculum and education trends",
65	"is fostering research collaborations worldwide",
66	]
67
68
69	MARKDOWN_SECTIONS = [	1✔
70	(
71	"Introduction",
72	"This section provides an overview of the topic and its significance in modern technology.",
73	),
74	(
75	"Key Concepts",
76	"Understanding the fundamental principles is essential for grasping the broader implications.",
77	),
78	(
79	"Applications",
80	"Practical applications span numerous industries including healthcare, finance, and education.",
81	),
82	("Challenges", "Despite significant progress, several obstacles remain to be addressed."),
83	("Future Directions", "Ongoing research continues to push the boundaries of what's possible."),
84	("Best Practices", "Following established guidelines helps ensure successful implementation."),
85	("Case Studies", "Real-world examples demonstrate the practical value of these technologies."),
86	(
87	"Tools and Frameworks",
88	"A variety of platforms and libraries facilitate development and deployment.",
89	),
90	(
91	"Performance Metrics",
92	"Measuring success requires appropriate benchmarks and evaluation criteria.",
93	),
94	("Conclusion", "The field continues to evolve with promising developments on the horizon."),
95	]
96
97	JSON_TITLES = [	1✔
98	"Getting Started Guide",
99	"Advanced Techniques",
100	"Performance Optimization",
101	"Security Best Practices",
102	"Architecture Overview",
103	"API Reference",
104	"Troubleshooting Common Issues",
105	"Integration Patterns",
106	"Design Principles",
107	"Deployment Strategies",
108	"Monitoring and Observability",
109	"Scaling Considerations",
110	"Data Management",
111	"Testing Approaches",
112	"Version Migration Guide",
113	]
114
115	JSON_DESCRIPTIONS = [	1✔
116	"A comprehensive introduction to fundamental concepts and techniques.",
117	"Deep dive into advanced methodologies and optimization strategies.",
118	"Practical guide for improving system performance and efficiency.",
119	"Essential practices for maintaining security and data protection.",
120	"Detailed overview of system architecture and component interactions.",
121	"Complete reference for available APIs and integration methods.",
122	"Solutions to frequently encountered problems and error messages.",
123	"Common patterns for integrating with external systems and services.",
124	"Core principles guiding system design and implementation decisions.",
125	"Strategies for deploying applications across different environments.",
126	"Guidelines for effective monitoring, logging, and observability.",
127	"Approaches to scaling systems to handle increased load and data.",
128	"Best practices for data modeling, storage, and retrieval.",
129	"Comprehensive testing strategies including unit, integration, and E2E tests.",
130	"Step-by-step instructions for migrating between major versions.",
131	]
132
133
134	def generate_sample_data(	1✔
135	count: int, data_type: SampleDataType = SampleDataType.TEXT, randomize: bool = True
136	) -> list[dict[str, Any]]:
137	"""Generate sample data for testing vector databases.
138
139	Args:
140	count: Number of items to generate
141	data_type: Type of data to generate (text, markdown, or json)
142
143	Returns:
144	List of dictionaries with 'text' and 'metadata' keys
145	"""
146	if isinstance(data_type, str):	1✔
147	data_type = SampleDataType(data_type)	×
148
149	if data_type == SampleDataType.TEXT:	1✔
150	return _generate_text_samples(count, randomize=randomize)	1✔
151	if data_type == SampleDataType.MARKDOWN:	×
NEW 152	return _generate_markdown_samples(count, randomize=randomize)	×
153	if data_type == SampleDataType.JSON:	×
NEW 154	return _generate_json_samples(count, randomize=randomize)	×
155	raise ValueError(f"Unknown data type: {data_type}")	×
156
157
158	def _generate_text_samples(count: int, randomize: bool = True) -> list[dict[str, Any]]:	1✔
159	"""Generate simple text samples."""
160	samples = []	1✔
161
162	for i in range(count):	1✔
163	if randomize:	1✔
164	topic = random.choice(TOPICS)	1✔
165	sentence = random.choice(SENTENCES)	1✔
166	else:
167	topic = TOPICS[i % len(TOPICS)]	1✔
168	sentence = SENTENCES[i % len(SENTENCES)]	1✔
169	text = f"{topic.capitalize()} {sentence}."	1✔
170
171	# Add some variety with occasional two-sentence entries
172	add_second = random.random() < 0.3 if randomize else i % 10 < 3	1✔
173	if add_second:	1✔
174	second_sentence = random.choice(SENTENCES) if randomize else SENTENCES[(i + 1) % len(SENTENCES)]	1✔
175	text += f" It {second_sentence}."	1✔
176
177	samples.append(	1✔
178	{
179	"text": text,
180	"metadata": {"source": "sample", "type": "text", "index": i, "topic": topic},
181	}
182	)
183
184	return samples	1✔
185
186
187	def _generate_markdown_samples(count: int, randomize: bool = True) -> list[dict[str, Any]]:	1✔
188	"""Generate markdown formatted samples."""
189	samples = []	×
190
191	for i in range(count):	×
192	# Use section headers as titles
193
194	section_idx = i % len(MARKDOWN_SECTIONS)	×
195	title, content = MARKDOWN_SECTIONS[section_idx]	×
196
197	# Add a topic-specific sentence
NEW 198	if randomize:	×
NEW 199	topic = random.choice(TOPICS)	×
NEW 200	sentence = random.choice(SENTENCES)	×
201	else:
NEW 202	topic = TOPICS[i % len(TOPICS)]	×
NEW 203	sentence = SENTENCES[i % len(SENTENCES)]	×
UNCOV 204	additional_content = f"{topic.capitalize()} {sentence}."	×
205
206	markdown_text = f"## {title}\n\n{content} {additional_content}"	×
207
208	# Occasionally add a list
NEW 209	add_list = random.random() < 0.3 if randomize else i % 10 < 3	×
NEW 210	if add_list:	×
UNCOV 211	markdown_text += "\n\n- Key point one\n- Key point two\n- Key point three"	×
212
213	samples.append(	×
214	{
215	"text": markdown_text,
216	"metadata": {
217	"source": "sample",
218	"type": "markdown",
219	"index": i,
220	"section": title,
221	"topic": topic,
222	},
223	}
224	)
225
226	return samples	×
227
228
229	def _generate_json_samples(count: int, randomize: bool = True) -> list[dict[str, Any]]:	1✔
230	"""Generate JSON-like structured samples."""
231	samples = []	×
232
233	for i in range(count):	×
234	title_idx = i % len(JSON_TITLES)	×
235	desc_idx = i % len(JSON_DESCRIPTIONS)	×
236
237	title = JSON_TITLES[title_idx]	×
238	description = JSON_DESCRIPTIONS[desc_idx]	×
NEW 239	topic = random.choice(TOPICS) if randomize else TOPICS[i % len(TOPICS)]	×
240
241	# Create a text representation of structured data
242	text = f"Title: {title}\n\nDescription: {description}\n\nTopic: {topic.capitalize()}"	×
243
244	# Occasionally add tags
NEW 245	add_tags = random.random() < 0.5 if randomize else i % 2 == 0	×
NEW 246	if add_tags:	×
NEW 247	if randomize:	×
NEW 248	tags = random.sample(TOPICS, k=min(3, len(TOPICS)))	×
249	else:
250	# deterministic tag selection
NEW 251	tags = [TOPICS[(i + j) % len(TOPICS)] for j in range(min(3, len(TOPICS)))]	×
UNCOV 252	text += f"\n\nTags: {', '.join(tags)}"	×
253
254	samples.append(	×
255	{
256	"text": text,
257	"metadata": {
258	"source": "sample",
259	"type": "json",
260	"index": i,
261	"title": title,
262	"topic": topic,
263	"category": random.choice(["tutorial", "reference", "guide", "documentation"]),
264	},
265	}
266	)
267
268	return samples	×

anthonypdawson / vector-inspector / 22355054572

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous