@@ -8,7 +8,6 @@ use std::sync::{
88use tracing_subscriber:: { layer:: SubscriberExt , util:: SubscriberInitExt , EnvFilter , Layer } ;
99
1010use actix_web:: web;
11- use trieve_server:: handlers:: chunk_handler:: ChunkReqPayload ;
1211use trieve_server:: operators:: chunk_operator:: create_chunk_metadata;
1312use trieve_server:: operators:: dataset_operator:: get_dataset_by_id_query;
1413use trieve_server:: {
@@ -21,6 +20,9 @@ use trieve_server::{
2120 establish_connection, get_env,
2221 operators:: crawl_operator:: { get_chunk_html, get_images, get_tags, update_crawl_status} ,
2322} ;
23+ use trieve_server:: {
24+ handlers:: chunk_handler:: ChunkReqPayload , operators:: crawl_operator:: chunk_markdown,
25+ } ;
2426use ureq:: json;
2527
2628async fn crawl (
@@ -92,25 +94,31 @@ async fn crawl(
9294 let page_markdown = page. markdown . clone ( ) . unwrap_or_default ( ) ;
9395 let page_tags = get_tags ( page_link. clone ( ) ) ;
9496
95- let chunk = ChunkReqPayload {
96- chunk_html : Some ( get_chunk_html (
97- page_markdown. clone ( ) ,
98- page_title. clone ( ) ,
99- "" . to_string ( ) ,
100- 0 ,
101- None ,
102- ) ) ,
103- link : Some ( page_link. clone ( ) ) ,
104- tag_set : Some ( page_tags) ,
105- image_urls : Some ( get_images ( & page_markdown. clone ( ) ) ) ,
106- metadata : Some ( json ! ( {
107- "title" : page_title. clone( ) ,
108- "description" : page_description. clone( ) ,
109- "url" : page_link. clone( ) ,
110- } ) ) ,
111- ..Default :: default ( )
112- } ;
113- chunks. push ( chunk) ;
97+ let chunk_html = get_chunk_html (
98+ page_markdown. clone ( ) ,
99+ page_title. clone ( ) ,
100+ "" . to_string ( ) ,
101+ 0 ,
102+ None ,
103+ ) ;
104+
105+ let chunked_markdown = chunk_markdown ( & chunk_html. clone ( ) ) ;
106+
107+ for chunk in chunked_markdown {
108+ let chunk = ChunkReqPayload {
109+ chunk_html : Some ( chunk. clone ( ) ) ,
110+ link : Some ( page_link. clone ( ) ) ,
111+ tag_set : Some ( page_tags. clone ( ) ) ,
112+ image_urls : Some ( get_images ( & chunk. clone ( ) ) ) ,
113+ metadata : Some ( json ! ( {
114+ "title" : page_title. clone( ) ,
115+ "description" : page_description. clone( ) ,
116+ "url" : page_link. clone( ) ,
117+ } ) ) ,
118+ ..Default :: default ( )
119+ } ;
120+ chunks. push ( chunk) ;
121+ }
114122 }
115123
116124 let dataset = get_dataset_by_id_query (
0 commit comments