17193676866

Committed 24 Aug 2025 08:56PM UTC coverage: 91.351% (-8.0%) from 99.312%

Build # 17193676866

Build Type

Pull #7

github

Committed by

web-flow

Commit Message

Merge 8ac64f79f into e594bd672

Pull Request #7: WIP: Python SDK

Run Details

99 of 109 branches covered (90.83%)

Branch coverage included in aggregate %.

384 of 436 new or added lines in 13 files covered. (88.07%)

8 existing lines in 2 files now uncovered.

746 of 816 relevant lines covered (91.42%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.94

/src/md_server/core/validation.py

import codecs
from typing import Optional
from urllib.parse import urlparse


class ValidationError(Exception):
    """Raised when an input fails validation.

    Attributes:
        details: Extra context about the failure. Always a dict; it is a new
            empty dict when the caller passes nothing (or any falsy value).
    """

    def __init__(self, message: str, details: Optional[dict] = None):
        """Store *message* as the exception text and keep *details* alongside it."""
        super().__init__(message)
        # A falsy ``details`` (None or an empty mapping) is replaced by a fresh dict.
        if details:
            self.details = details
        else:
            self.details = {}


class URLValidator:
    """Validate that a string is an absolute HTTP or HTTPS URL."""

    @classmethod
    def validate_url(cls, url: str) -> str:
        """Return *url* with surrounding whitespace removed if it is acceptable.

        Args:
            url: The candidate URL.

        Returns:
            The stripped URL string (otherwise unmodified).

        Raises:
            ValidationError: If the URL is empty or whitespace-only, cannot be
                parsed, has no scheme, uses a scheme other than http/https, or
                has no network location.
        """
        if not url or not url.strip():
            raise ValidationError("URL cannot be empty")

        url = url.strip()

        # urlparse raises ValueError on malformed input such as unmatched IPv6
        # brackets ("http://[::1"); report that as a validation failure rather
        # than letting an unrelated exception type escape from the validator.
        try:
            parsed = urlparse(url)
        except ValueError as exc:
            raise ValidationError("Invalid URL format") from exc

        if not parsed.scheme:
            raise ValidationError("Invalid URL format")

        if parsed.scheme.lower() not in ["http", "https"]:
            raise ValidationError("Only HTTP/HTTPS URLs allowed")

        if not parsed.netloc:
            raise ValidationError("Invalid URL format")

        return url


class FileSizeValidator:
    DEFAULT_MAX_SIZE = 50 * 1024 * 1024  # 50MB default

    FORMAT_LIMITS = {
        "application/pdf": 50 * 1024 * 1024,  # 50MB for PDFs
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": 25
        * 1024
        * 1024,  # 25MB for DOCX
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": 25
        * 1024
        * 1024,  # 25MB for PPTX
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": 25
        * 1024
        * 1024,  # 25MB for XLSX
        "text/plain": 10 * 1024 * 1024,  # 10MB for text
        "text/html": 10 * 1024 * 1024,  # 10MB for HTML
        "text/markdown": 10 * 1024 * 1024,  # 10MB for markdown
        "application/json": 5 * 1024 * 1024,  # 5MB for JSON
        "image/png": 20 * 1024 * 1024,  # 20MB for images
        "image/jpeg": 20 * 1024 * 1024,  # 20MB for images
        "image/jpg": 20 * 1024 * 1024,  # 20MB for images
    }

    @classmethod
    def validate_size(
        cls,
        content_size: int,
        content_type: Optional[str] = None,
        max_size_mb: Optional[int] = None,
    ) -> None:
        if content_size <= 0:
            return

        # Use custom limit if provided, otherwise use format-specific limit
        if max_size_mb:
            limit = max_size_mb * 1024 * 1024
        else:
            limit = cls.FORMAT_LIMITS.get(content_type or "", cls.DEFAULT_MAX_SIZE)

        if content_size > limit:
            limit_mb = limit / (1024 * 1024)
            actual_mb = content_size / (1024 * 1024)
            raise ValidationError(
                f"File size {actual_mb:.1f}MB exceeds limit of {limit_mb:.0f}MB for {content_type or 'this format'}",
                {
                    "file_size": content_size,
                    "limit": limit,
                    "content_type": content_type,
                },
            )


class MimeTypeValidator:
    """Sanity-check a MIME type string and return it in normalised form."""

    @classmethod
    def validate_mime_type(cls, mime_type: str) -> str:
        """Validate *mime_type* and return it stripped and lower-cased.

        All checks run against the value exactly as passed in; stripping and
        lower-casing happen only on the returned string.

        Raises:
            ValidationError: If the value is empty, longer than 100 characters,
                lacks a '/', contains '..' or a backslash, or contains more
                than one '/'.
        """
        if not mime_type:
            raise ValidationError("MIME type cannot be empty")

        # Ordered (predicate, message) pairs; the first violated rule wins.
        # Predicates are lambdas so each one is evaluated only when reached.
        rules = (
            (
                lambda value: len(value) > 100,
                "MIME type too long (max 100 characters)",
            ),
            (
                lambda value: "/" not in value,
                "MIME type must contain '/' separator",
            ),
            (
                lambda value: ".." in value or "\\" in value,
                "Invalid characters in MIME type",
            ),
            (
                lambda value: value.count("/") != 1,
                "MIME type must contain exactly one '/' separator",
            ),
        )
        for is_violated, message in rules:
            if is_violated(mime_type):
                raise ValidationError(message)

        normalised = mime_type.strip()
        return normalised.lower()


class ContentValidator:
    """Detect a content type from leading bytes and reconcile it with a declared type."""

    # Number of leading bytes sampled when probing whether content is UTF-8 text.
    _TEXT_SAMPLE_SIZE = 1024

    # Magic byte signatures for file type detection
    MAGIC_BYTES = {
        b"\x25\x50\x44\x46": "application/pdf",  # PDF
        b"\x50\x4b\x03\x04": "application/zip",  # ZIP (includes DOCX, XLSX, PPTX)
        b"\x50\x4b\x05\x06": "application/zip",  # Empty ZIP
        b"\x50\x4b\x07\x08": "application/zip",  # ZIP
        b"\x89\x50\x4e\x47": "image/png",  # PNG
        b"\xff\xd8\xff": "image/jpeg",  # JPEG
        b"\x47\x49\x46\x38": "image/gif",  # GIF
        b"\x52\x49\x46\x46": "audio/wav",  # WAV (RIFF)
        b"\x49\x44\x33": "audio/mp3",  # MP3 with ID3
        b"\xff\xfb": "audio/mp3",  # MP3
        b"\x3c\x3f\x78\x6d\x6c": "application/xml",  # XML <?xml
        b"\x3c\x68\x74\x6d\x6c": "text/html",  # HTML <html
        b"\x3c\x21\x44\x4f\x43\x54\x59\x50\x45": "text/html",  # HTML <!DOCTYPE
    }

    @classmethod
    def detect_content_type(cls, content: bytes) -> str:
        """Guess the MIME type of *content* from its leading bytes.

        Args:
            content: Raw bytes to inspect.

        Returns:
            The type mapped to the first matching MAGIC_BYTES prefix;
            otherwise "text/plain" if the leading sample is valid UTF-8;
            otherwise "application/octet-stream" (also used for empty input).
        """
        if not content:
            return "application/octet-stream"

        for magic, content_type in cls.MAGIC_BYTES.items():
            if content.startswith(magic):
                return content_type

        sample = content[: cls._TEXT_SAMPLE_SIZE]
        truncated = len(content) > len(sample)

        # Cutting at a fixed byte offset can split a multi-byte UTF-8 character.
        # An incremental decoder with final=False accepts such an incomplete
        # trailing sequence, so long valid text is not misreported as binary.
        # When the sample is the whole content, final=True keeps the check strict.
        decoder = codecs.getincrementaldecoder("utf-8")()
        try:
            decoder.decode(sample, final=not truncated)
        except UnicodeDecodeError:
            return "application/octet-stream"

        return "text/plain"

    @classmethod
    def validate_content_type(
        cls, content: bytes, declared_type: Optional[str] = None
    ) -> str:
        """Reconcile the caller's *declared_type* with what the bytes look like.

        Args:
            content: Raw bytes to inspect.
            declared_type: Content type claimed by the caller, if any.

        Returns:
            The detected type when nothing was declared; otherwise the
            declared type, provided it passes the checks below.

        Raises:
            ValidationError: If a PDF, PNG or JPEG type is declared but a
                different, recognisable type is detected. ``details`` carries
                ``declared`` and ``detected``.
        """
        detected_type = cls.detect_content_type(content)

        if not declared_type:
            return detected_type

        # Handle Office documents (ZIP-based formats)
        if detected_type == "application/zip" and declared_type in [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ]:
            return declared_type

        # Detection was inconclusive, so the declaration is taken at face value.
        if detected_type == "application/octet-stream":
            return declared_type

        # For text types, be more permissive as detection can be inaccurate
        if declared_type.startswith("text/") and detected_type == "text/plain":
            return declared_type

        # Strict matching for security-sensitive binary types only
        security_sensitive = ["application/pdf", "image/png", "image/jpeg"]
        if declared_type in security_sensitive and detected_type != declared_type:
            raise ValidationError(
                f"Content type mismatch: declared {declared_type} but detected {detected_type}",
                {"declared": declared_type, "detected": detected_type},
            )

        return declared_type

1	from urllib.parse import urlparse	1✔
2	from typing import Optional	1✔
3
4
5	class ValidationError(Exception):	1✔
6	def __init__(self, message: str, details: Optional[dict] = None):	1✔
7	super().__init__(message)	1✔
8	self.details = details or {}	1✔
9
10
11	class URLValidator:	1✔
12	@classmethod	1✔
13	def validate_url(cls, url: str) -> str:	1✔
14	if not url or not url.strip():	1✔
15	raise ValidationError("URL cannot be empty")	1✔
16
17	url = url.strip()	1✔
18	parsed = urlparse(url)	1✔
19
20	if not parsed.scheme:	1✔
21	raise ValidationError("Invalid URL format")	1✔
22
23	if parsed.scheme.lower() not in ["http", "https"]:	1✔
24	raise ValidationError("Only HTTP/HTTPS URLs allowed")	1✔
25
26	if not parsed.netloc:	1✔
27	raise ValidationError("Invalid URL format")	1✔
28
29	return url	1✔
30
31
32	class FileSizeValidator:	1✔
33	DEFAULT_MAX_SIZE = 50 * 1024 * 1024 # 50MB default	1✔
34
35	FORMAT_LIMITS = {	1✔
36	"application/pdf": 50 * 1024 * 1024, # 50MB for PDFs
37	"application/vnd.openxmlformats-officedocument.wordprocessingml.document": 25
38	* 1024
39	* 1024, # 25MB for DOCX
40	"application/vnd.openxmlformats-officedocument.presentationml.presentation": 25
41	* 1024
42	* 1024, # 25MB for PPTX
43	"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": 25
44	* 1024
45	* 1024, # 25MB for XLSX
46	"text/plain": 10 * 1024 * 1024, # 10MB for text
47	"text/html": 10 * 1024 * 1024, # 10MB for HTML
48	"text/markdown": 10 * 1024 * 1024, # 10MB for markdown
49	"application/json": 5 * 1024 * 1024, # 5MB for JSON
50	"image/png": 20 * 1024 * 1024, # 20MB for images
51	"image/jpeg": 20 * 1024 * 1024, # 20MB for images
52	"image/jpg": 20 * 1024 * 1024, # 20MB for images
53	}
54
55	@classmethod	1✔
56	def validate_size(	1✔
57	cls,
58	content_size: int,
59	content_type: Optional[str] = None,
60	max_size_mb: Optional[int] = None,
61	) -> None:
62	if content_size <= 0:	1✔
63	return	1✔
64
65	# Use custom limit if provided, otherwise use format-specific limit
66	if max_size_mb:	1✔
67	limit = max_size_mb * 1024 * 1024	1✔
68	else:
69	limit = cls.FORMAT_LIMITS.get(content_type or "", cls.DEFAULT_MAX_SIZE)	1✔
70
71	if content_size > limit:	1✔
72	limit_mb = limit / (1024 * 1024)	1✔
73	actual_mb = content_size / (1024 * 1024)	1✔
74	raise ValidationError(	1✔
75	f"File size {actual_mb:.1f}MB exceeds limit of {limit_mb:.0f}MB for {content_type or 'this format'}",
76	{
77	"file_size": content_size,
78	"limit": limit,
79	"content_type": content_type,
80	},
81	)
82
83
84	class MimeTypeValidator:	1✔
85	@classmethod	1✔
86	def validate_mime_type(cls, mime_type: str) -> str:	1✔
87	if not mime_type:	1✔
88	raise ValidationError("MIME type cannot be empty")	1✔
89
90	if len(mime_type) > 100:	1✔
91	raise ValidationError("MIME type too long (max 100 characters)")	1✔
92
93	if "/" not in mime_type:	1✔
94	raise ValidationError("MIME type must contain '/' separator")	1✔
95
96	if ".." in mime_type or "\\" in mime_type:	1✔
97	raise ValidationError("Invalid characters in MIME type")	1✔
98
99	if mime_type.count("/") != 1:	1✔
100	raise ValidationError("MIME type must contain exactly one '/' separator")	1✔
101
102	return mime_type.strip().lower()	1✔
103
104
105	class ContentValidator:	1✔
106	# Magic byte signatures for file type detection
107	MAGIC_BYTES = {	1✔
108	b"\x25\x50\x44\x46": "application/pdf", # PDF
109	b"\x50\x4b\x03\x04": "application/zip", # ZIP (includes DOCX, XLSX, PPTX)
110	b"\x50\x4b\x05\x06": "application/zip", # Empty ZIP
111	b"\x50\x4b\x07\x08": "application/zip", # ZIP
112	b"\x89\x50\x4e\x47": "image/png", # PNG
113	b"\xff\xd8\xff": "image/jpeg", # JPEG
114	b"\x47\x49\x46\x38": "image/gif", # GIF
115	b"\x52\x49\x46\x46": "audio/wav", # WAV (RIFF)
116	b"\x49\x44\x33": "audio/mp3", # MP3 with ID3
117	b"\xff\xfb": "audio/mp3", # MP3
118	b"\x3c\x3f\x78\x6d\x6c": "application/xml", # XML <?xml
119	b"\x3c\x68\x74\x6d\x6c": "text/html", # HTML <html
120	b"\x3c\x21\x44\x4f\x43\x54\x59\x50\x45": "text/html", # HTML <!DOCTYPE
121	}
122
123	@classmethod	1✔
124	def detect_content_type(cls, content: bytes) -> str:	1✔
125	if not content:	1✔
126	return "application/octet-stream"	1✔
127
128	for magic, content_type in cls.MAGIC_BYTES.items():	1✔
129	if content.startswith(magic):	1✔
130	return content_type	1✔
131
132	try:	1✔
133	content[:1024].decode("utf-8")	1✔
134	return "text/plain"	1✔
135	except UnicodeDecodeError:	1✔
136	pass	1✔
137
138	return "application/octet-stream"	1✔
139
140	@classmethod	1✔
141	def validate_content_type(	1✔
142	cls, content: bytes, declared_type: Optional[str] = None
143	) -> str:
144	detected_type = cls.detect_content_type(content)	1✔
145
146	if not declared_type:	1✔
147	return detected_type	×
148
149	# Handle Office documents (ZIP-based formats)
150	if detected_type == "application/zip" and declared_type in [	1✔
151	"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
152	"application/vnd.openxmlformats-officedocument.presentationml.presentation",
153	"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
154	]:
155	return declared_type	1✔
156
157	if detected_type == "application/octet-stream":	1✔
158	return declared_type	×
159
160	# For text types, be more permissive as detection can be inaccurate
161	if declared_type.startswith("text/") and detected_type == "text/plain":	1✔
NEW 162	return declared_type	×
163
164	# Strict matching for security-sensitive binary types only
165	security_sensitive = ["application/pdf", "image/png", "image/jpeg"]	1✔
166	if declared_type in security_sensitive and detected_type != declared_type:	1✔
167	raise ValidationError(	1✔
168	f"Content type mismatch: declared {declared_type} but detected {detected_type}",
169	{"declared": declared_type, "detected": detected_type},
170	)
171
172	return declared_type	1✔

peteretelej / md-server / 17193676866

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous