
Commit 1fde18a

densumesh authored and skeptrunedev committed
feature: chunk on headings
1 parent aa46afb · commit 1fde18a

File tree

2 files changed: +55 -21 lines


server/src/bin/scrape-worker.rs

Lines changed: 28 additions & 20 deletions
@@ -8,7 +8,6 @@ use std::sync::{
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer};

 use actix_web::web;
-use trieve_server::handlers::chunk_handler::ChunkReqPayload;
 use trieve_server::operators::chunk_operator::create_chunk_metadata;
 use trieve_server::operators::dataset_operator::get_dataset_by_id_query;
 use trieve_server::{

@@ -21,6 +20,9 @@ use trieve_server::{
     establish_connection, get_env,
     operators::crawl_operator::{get_chunk_html, get_images, get_tags, update_crawl_status},
 };
+use trieve_server::{
+    handlers::chunk_handler::ChunkReqPayload, operators::crawl_operator::chunk_markdown,
+};
 use ureq::json;

 async fn crawl(

@@ -92,25 +94,31 @@ async fn crawl(
         let page_markdown = page.markdown.clone().unwrap_or_default();
         let page_tags = get_tags(page_link.clone());

-        let chunk = ChunkReqPayload {
-            chunk_html: Some(get_chunk_html(
-                page_markdown.clone(),
-                page_title.clone(),
-                "".to_string(),
-                0,
-                None,
-            )),
-            link: Some(page_link.clone()),
-            tag_set: Some(page_tags),
-            image_urls: Some(get_images(&page_markdown.clone())),
-            metadata: Some(json!({
-                "title": page_title.clone(),
-                "description": page_description.clone(),
-                "url": page_link.clone(),
-            })),
-            ..Default::default()
-        };
-        chunks.push(chunk);
+        let chunk_html = get_chunk_html(
+            page_markdown.clone(),
+            page_title.clone(),
+            "".to_string(),
+            0,
+            None,
+        );
+
+        let chunked_markdown = chunk_markdown(&chunk_html.clone());
+
+        for chunk in chunked_markdown {
+            let chunk = ChunkReqPayload {
+                chunk_html: Some(chunk.clone()),
+                link: Some(page_link.clone()),
+                tag_set: Some(page_tags.clone()),
+                image_urls: Some(get_images(&chunk.clone())),
+                metadata: Some(json!({
+                    "title": page_title.clone(),
+                    "description": page_description.clone(),
+                    "url": page_link.clone(),
+                })),
+                ..Default::default()
+            };
+            chunks.push(chunk);
+        }
     }

     let dataset = get_dataset_by_id_query(
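
The net effect in the worker: instead of one page-sized payload, the page's HTML is built once by get_chunk_html, split on headings by the new chunk_markdown, and each resulting chunk gets its own ChunkReqPayload. Shared page metadata (link, tags, title, description, URL) is cloned into every payload, while image_urls is now extracted per chunk rather than per page. A minimal, self-contained sketch of that fan-out; ChunkPayload and chunk_markdown_stub are hypothetical stand-ins for trieve_server's ChunkReqPayload and the committed chunk_markdown:

// Hedged sketch of the worker-side fan-out above. `ChunkPayload` and
// `chunk_markdown_stub` are hypothetical stand-ins for trieve_server's
// ChunkReqPayload and the chunk_markdown added by this commit.

#[derive(Debug, Default)]
struct ChunkPayload {
    chunk_html: Option<String>,
    link: Option<String>,
    tag_set: Option<Vec<String>>,
}

// Stand-in splitter: a new chunk begins at every line starting with '#'.
fn chunk_markdown_stub(markdown: &str) -> Vec<String> {
    let mut chunks: Vec<String> = Vec::new();
    for line in markdown.lines() {
        if line.starts_with('#') || chunks.is_empty() {
            chunks.push(String::new());
        }
        let current = chunks.last_mut().unwrap();
        current.push_str(line);
        current.push('\n');
    }
    chunks
}

fn main() {
    // One crawled page's worth of shared metadata ...
    let page_link = "https://example.com/guide".to_string();
    let page_tags = vec!["docs".to_string()];
    // ... and its markdown-ish HTML, already built by get_chunk_html.
    let page_html = "# Setup\ninstall it\n# Usage\nrun it\n";

    let mut chunks = Vec::new();
    for chunk in chunk_markdown_stub(page_html) {
        chunks.push(ChunkPayload {
            chunk_html: Some(chunk),
            // Shared fields are cloned into every per-heading payload,
            // which is why tag_set gains a .clone() in the real loop.
            link: Some(page_link.clone()),
            tag_set: Some(page_tags.clone()),
        });
    }

    assert_eq!(chunks.len(), 2); // one payload per heading section
    println!("{chunks:#?}");
}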

server/src/operators/crawl_operator.rs

Lines changed: 27 additions & 1 deletion
@@ -402,10 +402,36 @@ impl Cleaners {
 }

 pub fn get_images(markdown_content: &str) -> Vec<String> {
-    let image_pattern = Regex::new(r"!\[.*?\]\((.*?\.(?:png|webp))\)").unwrap();
+    let image_pattern = Regex::new(r"!\[.*?\]\((.*?\.(?:png|webp|jpeg|jpg))\)").unwrap();
     image_pattern
         .captures_iter(markdown_content)
         .filter_map(|cap| cap.get(1))
         .map(|m| m.as_str().to_string())
         .collect()
 }
+
+pub fn chunk_markdown(markdown: &str) -> Vec<String> {
+    let re = Regex::new(r"(?m)^(#{1,6}\s.+)$").unwrap();
+    let mut chunks = Vec::new();
+    let mut current_chunk = String::new();
+
+    for line in markdown.lines() {
+        if re.is_match(line) {
+            if !current_chunk.is_empty() {
+                chunks.push(current_chunk.trim().to_string());
+                current_chunk = String::new();
+            }
+            current_chunk.push_str(line);
+            current_chunk.push('\n');
+        } else {
+            current_chunk.push_str(line);
+            current_chunk.push('\n');
+        }
+    }
+
+    if !current_chunk.is_empty() {
+        chunks.push(current_chunk.trim().to_string());
+    }
+
+    chunks
+}
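
Two things change in this file. get_images merely widens the extension set its markdown image regex accepts, adding jpeg and jpg alongside png and webp. chunk_markdown is the new splitter: it starts a fresh chunk at every ATX heading (# through ######, matched by (?m)^(#{1,6}\s.+)$) and accumulates lines until the next heading, so any text before the first heading ends up as its own leading chunk. A small usage sketch, assuming it compiles alongside the function above (the file already depends on the regex crate):

// Usage sketch for chunk_markdown, assuming it is in scope as defined
// above (crawl_operator.rs already depends on the regex crate).
fn main() {
    let markdown = "\
intro text before any heading
# Getting started
first section body
## Install
second section body";

    let chunks = chunk_markdown(markdown);

    // One chunk for the preamble, then one per heading section.
    assert_eq!(chunks.len(), 3);
    assert!(chunks[0].starts_with("intro text"));
    assert!(chunks[1].starts_with("# Getting started"));
    assert!(chunks[2].starts_with("## Install"));
}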
