Skip to content

Commit d4dd68d

Browse files
densumesh authored and skeptrunedev committed
bugfix: concate short chunks to next one
1 parent 51de3e1 commit d4dd68d

File tree

2 files changed

+33
-15
lines changed

2 files changed

+33
-15
lines changed

server/src/bin/crawl-worker.rs

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ use trieve_server::{
1818
data::models::{CrawlStatus, Pool},
1919
errors::ServiceError,
2020
establish_connection, get_env,
21-
operators::crawl_operator::{get_images, get_tags, update_crawl_status},
21+
operators::crawl_operator::{get_tags, update_crawl_status},
2222
};
2323
use trieve_server::{
2424
handlers::chunk_handler::ChunkReqPayload, operators::crawl_operator::chunk_html,
@@ -128,7 +128,6 @@ async fn crawl(
128128
chunk_html: Some(chunk_html.clone()),
129129
link: Some(page_link.clone()),
130130
tag_set: Some(page_tags.clone()),
131-
image_urls: Some(get_images(&chunk_html.clone())),
132131
metadata: Some(json!({
133132
"title": page_title.clone(),
134133
"description": page_description.clone(),

server/src/operators/crawl_operator.rs

Lines changed: 32 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,8 @@ use regex::Regex;
1515
use reqwest::Url;
1616
use serde::{Deserialize, Serialize};
1717

18+
use super::parse_operator::convert_html_to_text;
19+
1820
#[derive(Debug, Serialize, Deserialize, Clone)]
1921
pub struct IngestResult {
2022
pub status: Status,
@@ -414,6 +416,7 @@ pub async fn get_crawl_from_firecrawl(scrape_id: uuid::Uuid) -> Result<IngestRes
414416
log::error!("Error parsing response from firecrawl: {:?}", e);
415417
ServiceError::InternalServerError("Error parsing response from firecrawl".to_string())
416418
})?;
419+
417420
if ingest_result.status != Status::Completed {
418421
log::info!("Crawl status: {:?}", ingest_result.status);
419422
return Ok(ingest_result);
@@ -511,34 +514,42 @@ pub fn get_tags(url: String) -> Vec<String> {
511514
Vec::new()
512515
}
513516

514-
pub fn get_images(markdown_content: &str) -> Vec<String> {
515-
let image_pattern = Regex::new(r"\((https?://.*?\.(?:png|jpg|jpeg|gif|bmp|webp))\)").unwrap();
516-
image_pattern
517-
.captures_iter(markdown_content)
518-
.filter_map(|cap| cap.get(1))
519-
.map(|m| m.as_str().to_string())
520-
.collect()
521-
}
522-
523517
pub fn chunk_html(html: &str) -> Vec<(String, String)> {
524518
let re = Regex::new(r"(?i)<h[1-6].*?>").unwrap();
525519
let mut chunks = Vec::new();
526520
let mut current_chunk = (String::new(), String::new());
527521
let mut last_end = 0;
522+
let mut short_chunk: Option<(String, String)> = None;
528523

529524
for cap in re.find_iter(html) {
530525
if last_end != cap.start() {
531526
current_chunk.1.push_str(&html[last_end..cap.start()]);
532527
}
533528

534-
if !current_chunk.1.is_empty() {
529+
if !current_chunk.1.is_empty() && current_chunk.0 != current_chunk.1 {
535530
current_chunk.1 = current_chunk.1.trim().to_string();
536-
chunks.push(current_chunk);
537531

538-
current_chunk = (String::new(), String::new());
532+
if let Some(prev_short_chunk) = short_chunk.take() {
533+
current_chunk.1 = format!("{} {}", prev_short_chunk.1, current_chunk.1);
534+
current_chunk.0 = prev_short_chunk.0;
535+
}
536+
537+
if convert_html_to_text(&current_chunk.1)
538+
.split_whitespace()
539+
.count()
540+
> 5
541+
{
542+
chunks.push(current_chunk);
543+
current_chunk = (String::new(), String::new());
544+
} else {
545+
short_chunk = Some(current_chunk);
546+
current_chunk = (String::new(), String::new());
547+
}
539548
}
540549

541-
current_chunk.0 = cap.as_str().to_string();
550+
if current_chunk.0.is_empty() {
551+
current_chunk.0 = cap.as_str().to_string();
552+
}
542553
current_chunk.1 = cap.as_str().to_string();
543554
last_end = cap.end();
544555
}
@@ -549,7 +560,15 @@ pub fn chunk_html(html: &str) -> Vec<(String, String)> {
549560

550561
if !current_chunk.1.is_empty() {
551562
current_chunk.1 = current_chunk.1.trim().to_string();
563+
564+
if let Some(prev_short_chunk) = short_chunk.take() {
565+
current_chunk.1 = format!("{} {}", prev_short_chunk.1, current_chunk.1);
566+
current_chunk.0 = prev_short_chunk.0;
567+
}
568+
552569
chunks.push(current_chunk);
570+
} else if let Some(last_short_chunk) = short_chunk {
571+
chunks.push(last_short_chunk);
553572
}
554573

555574
chunks

0 commit comments

Comments
 (0)