diff --git a/crates/oxide/src/extractor/pre_processors/ruby.rs b/crates/oxide/src/extractor/pre_processors/ruby.rs index 89ac1ee7b1c6..6add86a4527e 100644 --- a/crates/oxide/src/extractor/pre_processors/ruby.rs +++ b/crates/oxide/src/extractor/pre_processors/ruby.rs @@ -34,44 +34,46 @@ impl PreProcessor for Ruby { // Extract embedded template languages // https://viewcomponent.org/guide/templates.html#interpolations - let content_as_str = std::str::from_utf8(content).unwrap(); - - let starts = TEMPLATE_START_REGEX - .captures_iter(content_as_str) - .collect::>(); - let ends = TEMPLATE_END_REGEX - .captures_iter(content_as_str) - .collect::>(); - - for start in starts.iter() { - // The language for this block - let lang = start.get(1).unwrap().as_str(); - - // The HEREDOC delimiter - let delimiter_start = start.get(2).unwrap().as_str(); - - // Where the "body" starts for the HEREDOC block - let body_start = start.get(0).unwrap().end(); - - // Look through all of the ends to find a matching language - for end in ends.iter() { - // 1. This must appear after the start - let body_end = end.get(0).unwrap().start(); - if body_end < body_start { - continue; - } + // Only process if content is valid UTF-8, otherwise skip HEREDOC extraction + // but still perform the byte-level Ruby processing below + if let Ok(content_as_str) = std::str::from_utf8(content) { + let starts = TEMPLATE_START_REGEX + .captures_iter(content_as_str) + .collect::>(); + let ends = TEMPLATE_END_REGEX + .captures_iter(content_as_str) + .collect::>(); + + for start in starts.iter() { + // The language for this block + let lang = start.get(1).unwrap().as_str(); + + // The HEREDOC delimiter + let delimiter_start = start.get(2).unwrap().as_str(); + + // Where the "body" starts for the HEREDOC block + let body_start = start.get(0).unwrap().end(); + + // Look through all of the ends to find a matching language + for end in ends.iter() { + // 1. This must appear after the start + let body_end = end.get(0).unwrap().start(); + if body_end < body_start { + continue; + } - // The languages must match otherwise we haven't found the end - let delimiter_end = end.get(1).unwrap().as_str(); - if delimiter_end != delimiter_start { - continue; - } + // The languages must match otherwise we haven't found the end + let delimiter_end = end.get(1).unwrap().as_str(); + if delimiter_end != delimiter_start { + continue; + } - let body = &content_as_str[body_start..body_end]; - let replaced = pre_process_input(body.as_bytes(), &lang.to_ascii_lowercase()); + let body = &content_as_str[body_start..body_end]; + let replaced = pre_process_input(body.as_bytes(), &lang.to_ascii_lowercase()); - result.replace_range(body_start..body_end, replaced); - break; + result.replace_range(body_start..body_end, replaced); + break; + } } } @@ -427,4 +429,26 @@ mod tests { vec!["text-amber-600", "text-sky-500", "text-green-500"], ); } + + #[test] + fn test_invalid_utf8_does_not_panic() { + use crate::extractor::pre_processors::pre_processor::PreProcessor; + + // Invalid UTF-8 sequence: 0x80 is a continuation byte without a leading byte + let invalid_utf8: &[u8] = &[0x80, 0x81, 0x82]; + + let processor = Ruby::default(); + + // Should not panic, just return the input unchanged + let result = processor.process(invalid_utf8); + assert_eq!(result, invalid_utf8); + } + + #[test] + fn test_valid_utf8_with_multibyte_chars() { + // Test that valid UTF-8 with multi-byte characters (like em-dashes) works + let input = "# Comment with em—dash\n%w[flex px-2.5]"; + + Ruby::test_extract_contains(input, vec!["flex", "px-2.5"]); + } } diff --git a/crates/oxide/src/extractor/pre_processors/vue.rs b/crates/oxide/src/extractor/pre_processors/vue.rs index 119e2a3d2079..d71e32a6e672 100644 --- a/crates/oxide/src/extractor/pre_processors/vue.rs +++ b/crates/oxide/src/extractor/pre_processors/vue.rs @@ -15,13 +15,15 @@ impl PreProcessor for Vue { fn process(&self, content: &[u8]) -> Vec { let mut result = content.to_vec(); - let content_as_str = std::str::from_utf8(content).unwrap(); - for (_, [lang, body]) in TEMPLATE_REGEX - .captures_iter(content_as_str) - .map(|c| c.extract()) - { - let replaced = pre_process_input(body.as_bytes(), lang); - result = result.replace(body, replaced); + // Only process template tags if content is valid UTF-8 + if let Ok(content_as_str) = std::str::from_utf8(content) { + for (_, [lang, body]) in TEMPLATE_REGEX + .captures_iter(content_as_str) + .map(|c| c.extract()) + { + let replaced = pre_process_input(body.as_bytes(), lang); + result = result.replace(body, replaced); + } } result @@ -43,4 +45,16 @@ mod tests { Vue::test_extract_contains(input, vec!["bg-neutral-900", "text-red-500"]); } + + #[test] + fn test_invalid_utf8_does_not_panic() { + // Invalid UTF-8 sequence: 0x80 is a continuation byte without a leading byte + let invalid_utf8: &[u8] = &[0x80, 0x81, 0x82]; + + let processor = Vue::default(); + + // Should not panic, just return the input unchanged + let result = processor.process(invalid_utf8); + assert_eq!(result, invalid_utf8); + } }