@@ -15,6 +15,8 @@ use regex::Regex;
1515use reqwest:: Url ;
1616use serde:: { Deserialize , Serialize } ;
1717
18+ use super :: parse_operator:: convert_html_to_text;
19+
1820#[ derive( Debug , Serialize , Deserialize , Clone ) ]
1921pub struct IngestResult {
2022 pub status : Status ,
@@ -414,6 +416,7 @@ pub async fn get_crawl_from_firecrawl(scrape_id: uuid::Uuid) -> Result<IngestRes
414416 log:: error!( "Error parsing response from firecrawl: {:?}" , e) ;
415417 ServiceError :: InternalServerError ( "Error parsing response from firecrawl" . to_string ( ) )
416418 } ) ?;
419+
417420 if ingest_result. status != Status :: Completed {
418421 log:: info!( "Crawl status: {:?}" , ingest_result. status) ;
419422 return Ok ( ingest_result) ;
@@ -511,34 +514,42 @@ pub fn get_tags(url: String) -> Vec<String> {
511514 Vec :: new ( )
512515}
513516
514- pub fn get_images ( markdown_content : & str ) -> Vec < String > {
515- let image_pattern = Regex :: new ( r"\((https?://.*?\.(?:png|jpg|jpeg|gif|bmp|webp))\)" ) . unwrap ( ) ;
516- image_pattern
517- . captures_iter ( markdown_content)
518- . filter_map ( |cap| cap. get ( 1 ) )
519- . map ( |m| m. as_str ( ) . to_string ( ) )
520- . collect ( )
521- }
522-
523517pub fn chunk_html ( html : & str ) -> Vec < ( String , String ) > {
524518 let re = Regex :: new ( r"(?i)<h[1-6].*?>" ) . unwrap ( ) ;
525519 let mut chunks = Vec :: new ( ) ;
526520 let mut current_chunk = ( String :: new ( ) , String :: new ( ) ) ;
527521 let mut last_end = 0 ;
522+ let mut short_chunk: Option < ( String , String ) > = None ;
528523
529524 for cap in re. find_iter ( html) {
530525 if last_end != cap. start ( ) {
531526 current_chunk. 1 . push_str ( & html[ last_end..cap. start ( ) ] ) ;
532527 }
533528
534- if !current_chunk. 1 . is_empty ( ) {
529+ if !current_chunk. 1 . is_empty ( ) && current_chunk . 0 != current_chunk . 1 {
535530 current_chunk. 1 = current_chunk. 1 . trim ( ) . to_string ( ) ;
536- chunks. push ( current_chunk) ;
537531
538- current_chunk = ( String :: new ( ) , String :: new ( ) ) ;
532+ if let Some ( prev_short_chunk) = short_chunk. take ( ) {
533+ current_chunk. 1 = format ! ( "{} {}" , prev_short_chunk. 1 , current_chunk. 1 ) ;
534+ current_chunk. 0 = prev_short_chunk. 0 ;
535+ }
536+
537+ if convert_html_to_text ( & current_chunk. 1 )
538+ . split_whitespace ( )
539+ . count ( )
540+ > 5
541+ {
542+ chunks. push ( current_chunk) ;
543+ current_chunk = ( String :: new ( ) , String :: new ( ) ) ;
544+ } else {
545+ short_chunk = Some ( current_chunk) ;
546+ current_chunk = ( String :: new ( ) , String :: new ( ) ) ;
547+ }
539548 }
540549
541- current_chunk. 0 = cap. as_str ( ) . to_string ( ) ;
550+ if current_chunk. 0 . is_empty ( ) {
551+ current_chunk. 0 = cap. as_str ( ) . to_string ( ) ;
552+ }
542553 current_chunk. 1 = cap. as_str ( ) . to_string ( ) ;
543554 last_end = cap. end ( ) ;
544555 }
@@ -549,7 +560,15 @@ pub fn chunk_html(html: &str) -> Vec<(String, String)> {
549560
550561 if !current_chunk. 1 . is_empty ( ) {
551562 current_chunk. 1 = current_chunk. 1 . trim ( ) . to_string ( ) ;
563+
564+ if let Some ( prev_short_chunk) = short_chunk. take ( ) {
565+ current_chunk. 1 = format ! ( "{} {}" , prev_short_chunk. 1 , current_chunk. 1 ) ;
566+ current_chunk. 0 = prev_short_chunk. 0 ;
567+ }
568+
552569 chunks. push ( current_chunk) ;
570+ } else if let Some ( last_short_chunk) = short_chunk {
571+ chunks. push ( last_short_chunk) ;
553572 }
554573
555574 chunks
0 commit comments