• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

thanos / petrify / 27216719496

09 Jun 2026 03:23PM UTC coverage: 85.15%. First build
27216719496

Pull #6

github

web-flow
Merge 26e3ccd83 into e9de08346
Pull Request #6: New flow

537 of 631 new or added lines in 5 files covered. (85.1%)

539 of 633 relevant lines covered (85.15%)

2.29 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.06
/src/html_parser.rs
1
use crate::types::{Resource, ResourceType};
2
use anyhow::{anyhow, Result};
3
use html5ever::parse_document;
4
use html5ever::tendril::TendrilSink;
5
use markup5ever_rcdom::{Handle, NodeData, RcDom};
6
use regex::Regex;
7
use std::collections::{HashMap, HashSet};
8
use url::Url;
9

10
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
11
/// A URL found while walking the document, kept both as written and resolved.
struct UrlReference {
12
    /// The URL text as it was taken from an attribute value or text node.
    original: String,
13
    /// The same reference after `HtmlParser::resolve_url` succeeded on it.
    resolved: Url,
14
}
15

16
/// Finds URL references in an HTML document and maps them to local file paths.
pub struct HtmlParser {
17
    /// URL that relative references are resolved against.
    base_url: Url,
18
    /// Prefix placed in front of every generated local path.
    output_dir: String,
19
}
20

21
impl HtmlParser {
22
    pub fn new(base_url: Url, output_dir: String) -> Self {
3✔
23
        Self {
24
            base_url,
25
            output_dir,
26
        }
27
    }
28

29
    pub fn parse_html(&self, html_content: &str) -> Result<(String, Vec<Resource>)> {
3✔
30
        let dom = parse_document(RcDom::default(), Default::default())
6✔
31
            .from_utf8()
32
            .one(html_content.as_bytes());
6✔
33

34
        let mut resources = Vec::new();
3✔
35
        let mut modified_html = html_content.to_string();
6✔
36

37
        let references = self.extract_url_references(&dom.document)?;
6✔
38
        let mut local_paths = HashMap::new();
3✔
39

40
        for reference in &references {
6✔
41
            if local_paths.contains_key(reference.resolved.as_str()) {
6✔
42
                continue;
43
            }
44
            match self.create_resource(&reference.resolved) {
3✔
45
                Ok(resource) => {
3✔
46
                    local_paths.insert(reference.resolved.to_string(), resource.local_path.clone());
6✔
47
                    resources.push(resource);
3✔
48
                }
NEW
49
                Err(e) => {
×
NEW
50
                    log::warn!(
×
51
                        "Failed to create resource for {}: {}",
52
                        reference.resolved.as_str(),
53
                        e
54
                    );
55
                }
56
            }
57
        }
58

59
        for reference in &references {
3✔
60
            if let Some(local_path) = local_paths.get(reference.resolved.as_str()) {
9✔
61
                modified_html =
6✔
62
                    self.replace_url_in_html(&modified_html, &reference.original, local_path);
3✔
63
            }
64
        }
65

66
        Ok((modified_html, resources))
3✔
67
    }
68

69
    fn extract_url_references(&self, handle: &Handle) -> Result<Vec<UrlReference>> {
3✔
70
        let mut references = Vec::new();
3✔
71
        let mut seen = HashSet::new();
3✔
72
        self.walk_dom(handle, &mut references, &mut seen)?;
6✔
73
        Ok(references)
3✔
74
    }
75

76
    fn walk_dom(
3✔
77
        &self,
78
        handle: &Handle,
79
        references: &mut Vec<UrlReference>,
80
        seen: &mut HashSet<(String, String)>,
81
    ) -> Result<()> {
82
        let node = handle;
3✔
83

84
        match &node.data {
3✔
85
            NodeData::Element { ref attrs, .. } => {
3✔
86
                for attr in attrs.borrow().iter() {
6✔
87
                    if self.is_url_attribute(&attr.name.local) {
4✔
88
                        let url_str = attr.value.to_string();
2✔
89
                        if let Ok(url) = self.resolve_url(&url_str) {
8✔
90
                            let key = (url_str.clone(), url.to_string());
4✔
91
                            if seen.insert(key) {
2✔
92
                                references.push(UrlReference {
2✔
93
                                    original: url_str,
2✔
94
                                    resolved: url,
2✔
95
                                });
96
                            }
97
                        }
98
                    }
99
                }
100
            }
101
            NodeData::Text { ref contents } => {
3✔
102
                if let Some(urls_in_text) = self.extract_urls_from_text(&contents.borrow()) {
6✔
103
                    for url_str in urls_in_text {
4✔
104
                        if let Ok(url) = self.resolve_url(&url_str) {
4✔
105
                            let key = (url_str.clone(), url.to_string());
2✔
106
                            if seen.insert(key) {
1✔
107
                                references.push(UrlReference {
1✔
108
                                    original: url_str,
1✔
109
                                    resolved: url,
1✔
110
                                });
111
                            }
112
                        }
113
                    }
114
                }
115
            }
116
            _ => {}
117
        }
118

119
        for child in node.children.borrow().iter() {
6✔
120
            self.walk_dom(child, references, seen)?;
6✔
121
        }
122

123
        Ok(())
3✔
124
    }
125

126
    fn is_url_attribute(&self, attr_name: &str) -> bool {
2✔
127
        matches!(
4✔
128
            attr_name,
129
            "src" | "href" | "data-src" | "data-original" | "poster" | "background" | "data-srcset" | "data-lazy-src"
130
        )
131
    }
132

133
    fn extract_urls_from_text(&self, text: &str) -> Option<Vec<String>> {
3✔
134
        let mut urls = Vec::new();
3✔
135
        
136
        // Extract URLs from CSS @import statements
137
        let import_regex = Regex::new(r#"@import\s+["']([^"']+)["']"#).ok()?;
6✔
138
        for cap in import_regex.captures_iter(text) {
9✔
139
            if let Some(url) = cap.get(1) {
2✔
140
                urls.push(url.as_str().to_string());
2✔
141
            }
142
        }
143

144
        // Extract URLs from CSS url() functions
145
        let url_regex = Regex::new(r#"url\(["']?([^"')]+)["']?\)"#).ok()?;
3✔
146
        for cap in url_regex.captures_iter(text) {
9✔
147
            if let Some(url) = cap.get(1) {
2✔
148
                urls.push(url.as_str().to_string());
2✔
149
            }
150
        }
151

152
        if urls.is_empty() {
4✔
153
            None
2✔
154
        } else {
155
            Some(urls)
1✔
156
        }
157
    }
158

159
    fn resolve_url(&self, url_str: &str) -> Result<Url> {
3✔
160
        if url_str.starts_with("data:") || url_str.starts_with("#") {
3✔
161
            return Err(anyhow!("Skipping data URL or fragment"));
1✔
162
        }
163

164
        if url_str.starts_with("//") {
3✔
165
            let scheme = self.base_url.scheme();
1✔
166
            Ok(Url::parse(&format!("{scheme}:{url_str}"))?)
2✔
167
        } else if url_str.starts_with('/') {
6✔
168
            // Absolute path
169
            let mut url = self.base_url.clone();
2✔
170
            url.set_path(url_str);
2✔
171
            Ok(url)
2✔
172
        } else if url_str.starts_with("http://") || url_str.starts_with("https://") {
7✔
173
            // Absolute URL
174
            Ok(Url::parse(url_str)?)
1✔
175
        } else {
176
            // Relative URL
177
            Ok(self.base_url.join(url_str)?)
3✔
178
        }
179
    }
180

181
    fn create_resource(&self, url: &Url) -> Result<Resource> {
3✔
182
        let resource_type = self.determine_resource_type(url);
3✔
183
        let local_path = self.generate_local_path(url, &resource_type)?;
6✔
184
        
185
        Ok(Resource {
3✔
186
            url: url.clone(),
3✔
187
            local_path,
3✔
188
            resource_type: resource_type.clone(),
3✔
189
            mime_type: self.guess_mime_type(url, &resource_type),
3✔
190
            size: None,
191
            downloaded: false,
192
        })
193
    }
194

195
    fn determine_resource_type(&self, url: &Url) -> ResourceType {
3✔
196
        let path = url.path();
3✔
197
        let extension = path.split('.').next_back().unwrap_or("").to_lowercase();
3✔
198

199
        // Check for explicit file extensions first
200
        match extension.as_str() {
6✔
201
            "html" | "htm" => ResourceType::HTML,
3✔
202
            "css" => ResourceType::CSS,
6✔
203
            "js" | "javascript" => ResourceType::JavaScript,
6✔
204
            "jpg" | "jpeg" | "png" | "gif" | "webp" | "svg" | "ico" | "bmp" | "tiff" | "tif" => ResourceType::Image,
3✔
205
            "mp4" | "webm" | "ogg" | "avi" | "mov" | "m4v" => ResourceType::Video,
2✔
206
            "pdf" => ResourceType::PDF,
2✔
207
            "woff" | "woff2" | "ttf" | "otf" | "eot" => ResourceType::Font,
4✔
208
            _ => {
209
                // For URLs without extensions, check if they look like HTML pages
210
                if self.looks_like_html_page(path) {
2✔
211
                    ResourceType::HTML
2✔
212
                } else {
NEW
213
                    ResourceType::Other
×
214
                }
215
            }
216
        }
217
    }
218

219
    fn looks_like_html_page(&self, path: &str) -> bool {
2✔
220
        // Skip empty paths and root
221
        if path.is_empty() || path == "/" {
2✔
NEW
222
            return false;
×
223
        }
224
        
225
        // Check if the path ends with a slash (directory-like)
226
        if path.ends_with('/') {
2✔
NEW
227
            return true;
×
228
        }
229
        
230
        // Check if the path looks like a content page (not a file)
231
        let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
6✔
232
        
233
        // If it's a single segment without extension, likely a page
234
        if segments.len() == 1 && !segments[0].contains('.') {
6✔
235
            return true;
2✔
236
        }
237
        
238
        // If it's multiple segments and the last one doesn't have an extension, likely a page
NEW
239
        if segments.len() > 1 {
×
NEW
240
            let last_segment = segments.last().unwrap_or(&"");
×
NEW
241
            if !last_segment.contains('.') && !last_segment.is_empty() {
×
NEW
242
                return true;
×
243
            }
244
        }
245
        
NEW
246
        false
×
247
    }
248

249
    fn generate_local_path(&self, url: &Url, resource_type: &ResourceType) -> Result<String> {
3✔
250
        let path = url.path();
3✔
251
        
252
        let subdirectory = match resource_type {
3✔
253
            ResourceType::HTML => "",  // HTML files maintain original structure
2✔
254
            ResourceType::CSS => "static/css",
3✔
255
            ResourceType::JavaScript => "static/js",
2✔
256
            ResourceType::Image => "static/images",
3✔
NEW
257
            ResourceType::Video => "static/video",
×
NEW
258
            ResourceType::PDF => "static/pdf",
×
NEW
259
            ResourceType::Font => "static/fonts",
×
NEW
260
            ResourceType::Other => "static/other",
×
261
        };
262

263
        // For HTML files, preserve the original directory structure
264
        if *resource_type == ResourceType::HTML {
3✔
265
            let path_segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
6✔
266
            
267
            if path_segments.is_empty() || path == "/" {
6✔
268
                // Root page
NEW
269
                return Ok(format!("{}/index.html", self.output_dir));
×
270
            } else {
271
                // Create directory structure matching the original URL
272
                let dir_path = path_segments[..path_segments.len()-1].join("/");
2✔
273
                let filename = path_segments.last().unwrap_or(&"index");
4✔
274
                
275
                // Handle trailing slash (directory-like URLs)
276
                if path.ends_with('/') {
2✔
NEW
277
                    if dir_path.is_empty() {
×
NEW
278
                        return Ok(format!("{}/{}/index.html", self.output_dir, filename));
×
279
                    } else {
NEW
280
                        return Ok(format!("{}/{}/{}/index.html", self.output_dir, dir_path, filename));
×
281
                    }
282
                } else {
283
                    // Regular file path
284
                    if dir_path.is_empty() {
4✔
285
                        return Ok(format!("{}/{}.html", self.output_dir, filename));
4✔
286
                    } else {
NEW
287
                        return Ok(format!("{}/{}/{}.html", self.output_dir, dir_path, filename));
×
288
                    }
289
                }
290
            }
291
        }
292

293
        // For non-HTML resources, use the existing logic
294
        let filename = path.split('/').next_back().unwrap_or("unknown");
3✔
295
        let mut unique_filename = filename.to_string();
3✔
296
        if filename == "index.html" || filename.is_empty() {
9✔
NEW
297
            let host = url.host_str().unwrap_or("unknown");
×
NEW
298
            let path_segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
×
NEW
299
            if path_segments.is_empty() {
×
NEW
300
                unique_filename = format!("{}.html", host);
×
301
            } else {
NEW
302
                unique_filename = format!("{}.html", path_segments.join("_"));
×
303
            }
304
        }
305

306
        Ok(format!("{}/{}/{}", self.output_dir, subdirectory, unique_filename))
6✔
307
    }
308

309
    fn guess_mime_type(&self, url: &Url, resource_type: &ResourceType) -> String {
3✔
310
        let path = url.path();
3✔
311
        let extension = path.split('.').next_back().unwrap_or("").to_lowercase();
3✔
312

313
        match extension.as_str() {
6✔
314
            "html" | "htm" => "text/html".to_string(),
5✔
315
            "css" => "text/css".to_string(),
9✔
316
            "js" => "application/javascript".to_string(),
10✔
317
            "jpg" | "jpeg" => "image/jpeg".to_string(),
7✔
318
            "png" => "image/png".to_string(),
9✔
319
            "gif" => "image/gif".to_string(),
4✔
320
            "webp" => "image/webp".to_string(),
4✔
321
            "svg" => "image/svg+xml".to_string(),
4✔
322
            "ico" => "image/x-icon".to_string(),
4✔
323
            "mp4" => "video/mp4".to_string(),
4✔
324
            "webm" => "video/webm".to_string(),
4✔
325
            "ogg" => "video/ogg".to_string(),
4✔
326
            "pdf" => "application/pdf".to_string(),
4✔
327
            "woff" => "font/woff".to_string(),
4✔
328
            "woff2" => "font/woff2".to_string(),
4✔
329
            "ttf" => "font/ttf".to_string(),
4✔
330
            _ => match resource_type {
2✔
331
                ResourceType::HTML => "text/html".to_string(),
2✔
NEW
332
                ResourceType::CSS => "text/css".to_string(),
×
333
                ResourceType::JavaScript => "application/javascript".to_string(),
2✔
NEW
334
                ResourceType::Image => "image/jpeg".to_string(),
×
NEW
335
                ResourceType::Video => "video/mp4".to_string(),
×
NEW
336
                ResourceType::PDF => "application/pdf".to_string(),
×
NEW
337
                ResourceType::Font => "font/woff".to_string(),
×
NEW
338
                ResourceType::Other => "application/octet-stream".to_string(),
×
339
            },
340
        }
341
    }
342

343
    fn replace_url_in_html(&self, html: &str, original_url: &str, local_path: &str) -> String {
3✔
344
        let relative_path = self.make_relative_path(local_path);
3✔
345
        html.replace(original_url, &relative_path)
6✔
346
    }
347

348
    fn make_relative_path(&self, local_path: &str) -> String {
3✔
349
        // Convert absolute path to relative path from the HTML file location
350
        if let Some(relative) = local_path.strip_prefix(&self.output_dir) {
3✔
351
            relative
352
                .strip_prefix('/')
353
                .unwrap_or(relative)
3✔
354
                .to_string()
355
        } else {
NEW
356
            local_path.to_string()
×
357
        }
358
    }
359
}
360

361
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a parser rooted at `base` whose local paths live under `output_dir`.
    fn parser_at(base: &str, output_dir: &str) -> HtmlParser {
        HtmlParser::new(Url::parse(base).unwrap(), output_dir.to_string())
    }

    #[test]
    fn test_url_resolution() {
        let parser = parser_at("https://example.com/page/", "./output");

        let resolved = parser.resolve_url("image.jpg").unwrap();
        assert_eq!(resolved.as_str(), "https://example.com/page/image.jpg");
    }

    #[test]
    fn test_resource_type_detection() {
        let parser = parser_at("https://example.com/", "./output");

        let image_url = Url::parse("https://example.com/image.png").unwrap();
        assert!(matches!(
            parser.determine_resource_type(&image_url),
            ResourceType::Image
        ));
    }

    #[test]
    fn test_protocol_relative_url_resolution() {
        let parser = parser_at("http://example.com/page/", "./output");

        let resolved = parser.resolve_url("//cdn.example.com/lib.js").unwrap();
        assert_eq!(resolved.as_str(), "http://cdn.example.com/lib.js");
    }

    #[test]
    fn test_extensionless_path_is_html() {
        let parser = parser_at("https://example.com/", "./output");

        let about = Url::parse("https://example.com/about").unwrap();
        assert_eq!(parser.determine_resource_type(&about), ResourceType::HTML);
    }

    #[test]
    fn test_absolute_path_resolution() {
        let parser = parser_at("https://example.com/page/", "./output");

        let resolved = parser.resolve_url("/assets/app.css").unwrap();
        assert_eq!(resolved.as_str(), "https://example.com/assets/app.css");
    }

    #[test]
    fn test_absolute_https_url_resolution() {
        let parser = parser_at("https://example.com/", "./output");

        let resolved = parser
            .resolve_url("https://cdn.example.com/lib.js")
            .unwrap();
        assert_eq!(resolved.as_str(), "https://cdn.example.com/lib.js");
    }

    #[test]
    fn test_skips_data_and_fragment_urls() {
        let parser = parser_at("https://example.com/", "./output");

        for skipped in ["#section", "data:image/png;base64,abc"] {
            assert!(parser.resolve_url(skipped).is_err());
        }
    }

    #[test]
    fn extracts_urls_from_inline_css() {
        let html = r#"<html><head><style>
            @import "imported.css";
            body { background: url(bg.png); }
        </style></head></html>"#;
        let parser = parser_at("https://example.com/", "./output");

        let (_, resources) = parser.parse_html(html).unwrap();
        let has_resource_ending_in =
            |suffix: &str| resources.iter().any(|r| r.url.path().ends_with(suffix));

        assert!(has_resource_ending_in("imported.css"));
        assert!(has_resource_ending_in("bg.png"));
    }

    #[test]
    fn guess_mime_type_falls_back_to_resource_type() {
        let parser = parser_at("https://example.com/", "./output");

        let url = Url::parse("https://example.com/no-extension").unwrap();
        assert_eq!(
            parser.guess_mime_type(&url, &ResourceType::JavaScript),
            "application/javascript"
        );
    }

    #[test]
    fn make_relative_path_strips_output_prefix() {
        let parser = parser_at("https://example.com/", "/output");

        let relative = parser.make_relative_path("/output/static/css/app.css");
        assert_eq!(relative, "static/css/app.css");
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc