@@ -51,42 +51,83 @@ pub fn inflate_raw_batch(
5151
5252 let header_size = 4 + ( num_blocks + 1 ) * 4 ;
5353
54+ // Estimate output size based on compressed input size * typical ratio
55+ // Cap at reasonable size to avoid over-allocation for large max_block_size
5456 let mut total_input_size = 0usize ;
5557 for i in 0 ..num_blocks {
5658 total_input_size += input_lengths[ i] as usize ;
5759 }
58- let estimated_output = total_input_size * 4 ;
60+ // Estimate 4x compression ratio, but cap per-block estimate at max_block_size
61+ let estimated_per_block = ( total_input_size * 4 / num_blocks. max ( 1 ) ) . min ( max_out) ;
62+ let estimated_output = estimated_per_block * num_blocks;
5963
60- let mut result = Vec :: with_capacity ( header_size + estimated_output) ;
61- result. resize ( header_size, 0 ) ;
64+ // For small estimates, decompress directly into pre-allocated buffer
65+ // For large estimates (>32MB), use temp buffer to avoid over-allocation
66+ let use_direct = estimated_output < 32 * 1024 * 1024 ;
6267
63- result[ 0 ..4 ] . copy_from_slice ( & ( num_blocks as u32 ) . to_le_bytes ( ) ) ;
68+ if use_direct {
69+ // Pre-allocate and decompress directly - eliminates temp buffer copy
70+ let mut result = vec ! [ 0u8 ; header_size + num_blocks * max_out] ;
71+ result[ 0 ..4 ] . copy_from_slice ( & ( num_blocks as u32 ) . to_le_bytes ( ) ) ;
6472
65- let offsets_start = 4 ;
66- let mut data_offset = 0u32 ;
73+ let offsets_start = 4 ;
74+ let mut data_offset = 0u32 ;
75+ let data_start = header_size;
6776
68- let mut temp_buf = vec ! [ 0u8 ; max_out] ;
77+ for i in 0 ..num_blocks {
78+ let start = input_offsets[ i] as usize + ZLIB_HEADER_SIZE ;
79+ let len = input_lengths[ i] as usize - ZLIB_HEADER_SIZE ;
80+ let input = & inputs[ start..start + len] ;
6981
70- for i in 0 ..num_blocks {
71- let start = input_offsets[ i] as usize + ZLIB_HEADER_SIZE ;
72- let len = input_lengths[ i] as usize - ZLIB_HEADER_SIZE ;
73- let input = & inputs[ start..start + len] ;
82+ let offset_pos = offsets_start + i * 4 ;
83+ result[ offset_pos..offset_pos + 4 ] . copy_from_slice ( & data_offset. to_le_bytes ( ) ) ;
7484
75- let offset_pos = offsets_start + i * 4 ;
76- result[ offset_pos..offset_pos + 4 ] . copy_from_slice ( & data_offset . to_le_bytes ( ) ) ;
85+ let output_start = data_start + data_offset as usize ;
86+ let output_slice = & mut result[ output_start..output_start + max_out ] ;
7787
78- let actual_size = decompressor
79- . deflate_decompress ( input, & mut temp_buf )
80- . map_err ( |e| JsError :: new ( & format ! ( "decompression failed: {:?}" , e) ) ) ?;
88+ let actual_size = decompressor
89+ . deflate_decompress ( input, output_slice )
90+ . map_err ( |e| JsError :: new ( & format ! ( "decompression failed: {:?}" , e) ) ) ?;
8191
82- result. extend_from_slice ( & temp_buf[ ..actual_size] ) ;
83- data_offset += actual_size as u32 ;
84- }
92+ data_offset += actual_size as u32 ;
93+ }
8594
86- let final_offset_pos = offsets_start + num_blocks * 4 ;
87- result[ final_offset_pos..final_offset_pos + 4 ] . copy_from_slice ( & data_offset. to_le_bytes ( ) ) ;
95+ let final_offset_pos = offsets_start + num_blocks * 4 ;
96+ result[ final_offset_pos..final_offset_pos + 4 ] . copy_from_slice ( & data_offset. to_le_bytes ( ) ) ;
8897
89- Ok ( result. into_boxed_slice ( ) )
98+ result. truncate ( header_size + data_offset as usize ) ;
99+ Ok ( result. into_boxed_slice ( ) )
100+ } else {
101+ // Use temp buffer approach for large blocks to avoid over-allocation
102+ let mut result = Vec :: with_capacity ( header_size + estimated_output) ;
103+ result. resize ( header_size, 0 ) ;
104+ result[ 0 ..4 ] . copy_from_slice ( & ( num_blocks as u32 ) . to_le_bytes ( ) ) ;
105+
106+ let offsets_start = 4 ;
107+ let mut data_offset = 0u32 ;
108+ let mut temp_buf = vec ! [ 0u8 ; max_out] ;
109+
110+ for i in 0 ..num_blocks {
111+ let start = input_offsets[ i] as usize + ZLIB_HEADER_SIZE ;
112+ let len = input_lengths[ i] as usize - ZLIB_HEADER_SIZE ;
113+ let input = & inputs[ start..start + len] ;
114+
115+ let offset_pos = offsets_start + i * 4 ;
116+ result[ offset_pos..offset_pos + 4 ] . copy_from_slice ( & data_offset. to_le_bytes ( ) ) ;
117+
118+ let actual_size = decompressor
119+ . deflate_decompress ( input, & mut temp_buf)
120+ . map_err ( |e| JsError :: new ( & format ! ( "decompression failed: {:?}" , e) ) ) ?;
121+
122+ result. extend_from_slice ( & temp_buf[ ..actual_size] ) ;
123+ data_offset += actual_size as u32 ;
124+ }
125+
126+ let final_offset_pos = offsets_start + num_blocks * 4 ;
127+ result[ final_offset_pos..final_offset_pos + 4 ] . copy_from_slice ( & data_offset. to_le_bytes ( ) ) ;
128+
129+ Ok ( result. into_boxed_slice ( ) )
130+ }
90131}
91132
92133// BigWig block parsing functions
@@ -525,6 +566,194 @@ pub fn parse_summary_blocks(
525566 result. into_boxed_slice ( )
526567}
527568
569+ /// Parse a single BigBed block
570+ /// BigBed format: chromId(u32), start(i32), end(i32), rest(null-terminated string)
571+ ///
572+ /// Returns: [count: u32][starts: i32*n][ends: i32*n][string_offsets: u32*(n+1)][string_data: bytes]
573+ fn parse_bigbed_block_into (
574+ data : & [ u8 ] ,
575+ base_offset : u32 ,
576+ req_chr_id : u32 ,
577+ req_start : i32 ,
578+ req_end : i32 ,
579+ starts : & mut Vec < i32 > ,
580+ ends : & mut Vec < i32 > ,
581+ unique_id_offsets : & mut Vec < u32 > ,
582+ string_offsets : & mut Vec < u32 > ,
583+ string_data : & mut Vec < u8 > ,
584+ ) {
585+ let filter = req_start != 0 || req_end != 0 ;
586+ let mut offset = 0usize ;
587+
588+ while offset + 12 <= data. len ( ) {
589+ let record_start_offset = offset;
590+ let chrom_id = read_u32_le ( data, offset) ;
591+ offset += 4 ;
592+ let start = read_i32_le ( data, offset) ;
593+ offset += 4 ;
594+ let end = read_i32_le ( data, offset) ;
595+ offset += 4 ;
596+
597+ // Find null terminator for rest string
598+ let string_start = offset;
599+ while offset < data. len ( ) && data[ offset] != 0 {
600+ offset += 1 ;
601+ }
602+ let string_end = offset;
603+ offset += 1 ; // skip null terminator
604+
605+ let passes = !filter || ( chrom_id == req_chr_id && start < req_end && end > req_start) ;
606+ if passes {
607+ starts. push ( start) ;
608+ ends. push ( end) ;
609+ unique_id_offsets. push ( base_offset + record_start_offset as u32 ) ;
610+ string_offsets. push ( string_data. len ( ) as u32 ) ;
611+ string_data. extend_from_slice ( & data[ string_start..string_end] ) ;
612+ }
613+ }
614+ }
615+
616+ /// Parse multiple uncompressed BigBed blocks
617+ /// Returns: [count: u32][starts: i32*n][ends: i32*n][uid_offsets: u32*n][string_offsets: u32*(n+1)][string_data: bytes]
618+ #[ wasm_bindgen]
619+ pub fn parse_bigbed_blocks (
620+ inputs : & [ u8 ] ,
621+ input_offsets : & [ u32 ] ,
622+ input_lengths : & [ u32 ] ,
623+ block_file_offsets : & [ u32 ] ,
624+ req_chr_id : u32 ,
625+ req_start : i32 ,
626+ req_end : i32 ,
627+ ) -> Box < [ u8 ] > {
628+ let num_blocks = input_offsets. len ( ) ;
629+
630+ let mut starts: Vec < i32 > = Vec :: new ( ) ;
631+ let mut ends: Vec < i32 > = Vec :: new ( ) ;
632+ let mut unique_id_offsets: Vec < u32 > = Vec :: new ( ) ;
633+ let mut string_offsets: Vec < u32 > = Vec :: new ( ) ;
634+ let mut string_data: Vec < u8 > = Vec :: new ( ) ;
635+
636+ for i in 0 ..num_blocks {
637+ let offset = input_offsets[ i] as usize ;
638+ let length = input_lengths[ i] as usize ;
639+ let data = & inputs[ offset..offset + length] ;
640+ let base_offset = ( block_file_offsets[ i] as u32 ) << 8 ;
641+
642+ parse_bigbed_block_into (
643+ data,
644+ base_offset,
645+ req_chr_id,
646+ req_start,
647+ req_end,
648+ & mut starts,
649+ & mut ends,
650+ & mut unique_id_offsets,
651+ & mut string_offsets,
652+ & mut string_data,
653+ ) ;
654+ }
655+
656+ // Add final string offset
657+ string_offsets. push ( string_data. len ( ) as u32 ) ;
658+
659+ let count = starts. len ( ) as u32 ;
660+ // Layout: count(4) + starts(4*n) + ends(4*n) + uid_offsets(4*n) + string_offsets(4*(n+1)) + string_data
661+ let result_size = 4 + count as usize * 16 + 4 + string_data. len ( ) ;
662+ let mut result = Vec :: with_capacity ( result_size) ;
663+
664+ result. extend_from_slice ( & count. to_le_bytes ( ) ) ;
665+ for & s in & starts {
666+ result. extend_from_slice ( & s. to_le_bytes ( ) ) ;
667+ }
668+ for & e in & ends {
669+ result. extend_from_slice ( & e. to_le_bytes ( ) ) ;
670+ }
671+ for & u in & unique_id_offsets {
672+ result. extend_from_slice ( & u. to_le_bytes ( ) ) ;
673+ }
674+ for & so in & string_offsets {
675+ result. extend_from_slice ( & so. to_le_bytes ( ) ) ;
676+ }
677+ result. extend_from_slice ( & string_data) ;
678+
679+ result. into_boxed_slice ( )
680+ }
681+
682+ /// Combined decompress + parse for BigBed blocks
683+ /// Returns: [count: u32][starts: i32*n][ends: i32*n][uid_offsets: u32*n][string_offsets: u32*(n+1)][string_data: bytes]
684+ #[ wasm_bindgen]
685+ pub fn decompress_and_parse_bigbed (
686+ inputs : & [ u8 ] ,
687+ input_offsets : & [ u32 ] ,
688+ input_lengths : & [ u32 ] ,
689+ block_file_offsets : & [ u32 ] ,
690+ max_block_size : u32 ,
691+ req_chr_id : u32 ,
692+ req_start : i32 ,
693+ req_end : i32 ,
694+ ) -> Result < Box < [ u8 ] > , JsError > {
695+ let mut decompressor = Decompressor :: new ( ) ;
696+ let num_blocks = input_offsets. len ( ) ;
697+ let max_out = max_block_size as usize ;
698+
699+ let mut temp_buf = vec ! [ 0u8 ; max_out] ;
700+
701+ let mut starts: Vec < i32 > = Vec :: new ( ) ;
702+ let mut ends: Vec < i32 > = Vec :: new ( ) ;
703+ let mut unique_id_offsets: Vec < u32 > = Vec :: new ( ) ;
704+ let mut string_offsets: Vec < u32 > = Vec :: new ( ) ;
705+ let mut string_data: Vec < u8 > = Vec :: new ( ) ;
706+
707+ for i in 0 ..num_blocks {
708+ let start = input_offsets[ i] as usize + ZLIB_HEADER_SIZE ;
709+ let len = input_lengths[ i] as usize - ZLIB_HEADER_SIZE ;
710+ let input = & inputs[ start..start + len] ;
711+ let base_offset = ( block_file_offsets[ i] as u32 ) << 8 ;
712+
713+ let actual_size = decompressor
714+ . deflate_decompress ( input, & mut temp_buf)
715+ . map_err ( |e| JsError :: new ( & format ! ( "decompression failed: {:?}" , e) ) ) ?;
716+
717+ let data = & temp_buf[ ..actual_size] ;
718+ parse_bigbed_block_into (
719+ data,
720+ base_offset,
721+ req_chr_id,
722+ req_start,
723+ req_end,
724+ & mut starts,
725+ & mut ends,
726+ & mut unique_id_offsets,
727+ & mut string_offsets,
728+ & mut string_data,
729+ ) ;
730+ }
731+
732+ // Add final string offset
733+ string_offsets. push ( string_data. len ( ) as u32 ) ;
734+
735+ let count = starts. len ( ) as u32 ;
736+ let result_size = 4 + count as usize * 16 + 4 + string_data. len ( ) ;
737+ let mut result = Vec :: with_capacity ( result_size) ;
738+
739+ result. extend_from_slice ( & count. to_le_bytes ( ) ) ;
740+ for & s in & starts {
741+ result. extend_from_slice ( & s. to_le_bytes ( ) ) ;
742+ }
743+ for & e in & ends {
744+ result. extend_from_slice ( & e. to_le_bytes ( ) ) ;
745+ }
746+ for & u in & unique_id_offsets {
747+ result. extend_from_slice ( & u. to_le_bytes ( ) ) ;
748+ }
749+ for & so in & string_offsets {
750+ result. extend_from_slice ( & so. to_le_bytes ( ) ) ;
751+ }
752+ result. extend_from_slice ( & string_data) ;
753+
754+ Ok ( result. into_boxed_slice ( ) )
755+ }
756+
528757/// Combined decompress + parse for summary blocks
529758#[ wasm_bindgen]
530759pub fn decompress_and_parse_summary (
0 commit comments