Skip to content

Commit 2c55e4a

Browse files
committed
cut: fix -s flag for newline delimiter and improve performance
- Fixed the -s flag incorrectly suppressing output when the delimiter is a newline.
- Improved performance in cut_fields_newline_char_delim.
- Updated tests to match GNU cut behavior for newline delimiters.
1 parent bed3108 commit 2c55e4a

3 files changed

Lines changed: 317 additions & 16 deletions

File tree

src/uu/cut/benches/cut_bench.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,24 @@ fn cut_fields_custom_delim(bencher: Bencher) {
7171
});
7272
}
7373

74+
/// Benchmark cutting fields with newline delimiter
75+
#[divan::bench]
76+
fn cut_fields_newline_delim(bencher: Bencher) {
77+
let mut data = Vec::new();
78+
for i in 0..100_000 {
79+
let line = format!("field_content_number_{i}\n");
80+
data.extend_from_slice(line.as_bytes());
81+
}
82+
let file_path = setup_test_file(&data);
83+
84+
bencher.bench(|| {
85+
black_box(run_util_function(
86+
uumain,
87+
&["-d", "\n", "-f", "1,3,5", file_path.to_str().unwrap()],
88+
));
89+
});
90+
}
91+
7492
fn main() {
7593
divan::main();
7694
}

src/uu/cut/src/cut.rs

Lines changed: 100 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55

6-
// spell-checker:ignore (ToDO) delim sourcefiles
6+
// spell-checker:ignore (ToDO) delim sourcefiles undelimited
77

88
use bstr::io::BufReadExt;
99
use clap::{Arg, ArgAction, ArgMatches, Command, builder::ValueParser};
@@ -254,35 +254,112 @@ fn cut_fields_implicit_out_delim<R: Read, W: Write, M: Matcher>(
254254
Ok(())
255255
}
256256

257-
/// The input delimiter is identical to `newline_char`
257+
/// Streams and filters fields where the record terminator and
258+
/// field delimiter are the same character (specified by `newline_char`)
258259
fn cut_fields_newline_char_delim<R: Read, W: Write>(
259260
reader: R,
260261
out: &mut W,
261262
ranges: &[Range],
263+
only_delimited: bool,
262264
newline_char: u8,
263265
out_delim: &[u8],
264266
) -> UResult<()> {
265-
let buf_in = BufReader::new(reader);
267+
let mut reader = BufReader::new(reader);
268+
let mut line = Vec::new();
266269

267-
let segments: Vec<_> = buf_in.split(newline_char).filter_map(Result::ok).collect();
268-
let mut print_delim = false;
270+
// We start at 1 because 'cut' field indexing is 1-based
271+
let mut current_field_idx = 1;
272+
let mut first_field_printed = false;
273+
let mut has_data = false;
274+
let mut suppressed = false;
269275

270-
for &Range { low, high } in ranges {
271-
for i in low..=high {
272-
// "- 1" is necessary because fields start from 1 whereas a Vec starts from 0
273-
if let Some(segment) = segments.get(i - 1) {
274-
if print_delim {
275-
out.write_all(out_delim)?;
276+
let mut range_idx = 0;
277+
278+
loop {
279+
line.clear();
280+
281+
let is_selected = range_idx < ranges.len() && current_field_idx >= ranges[range_idx].low;
282+
let needs_data = is_selected || current_field_idx == 1;
283+
284+
let mut bytes_processed = 0;
285+
286+
if needs_data {
287+
// Standard read: copies bytes into `line`
288+
bytes_processed = reader.read_until(newline_char, &mut line)?;
289+
} else {
290+
// Zero-allocation skip: scans the buffer and advances the cursor without copying
291+
loop {
292+
let buf = reader.fill_buf()?;
293+
if buf.is_empty() {
294+
break; // EOF
295+
}
296+
297+
if let Some(pos) = buf.iter().position(|&b| b == newline_char) {
298+
reader.consume(pos + 1);
299+
bytes_processed += pos + 1;
300+
break;
301+
}
302+
let len = buf.len();
303+
reader.consume(len);
304+
bytes_processed += len;
305+
}
306+
}
307+
308+
if bytes_processed == 0 {
309+
break;
310+
}
311+
has_data = true;
312+
313+
// To comply with -s when the stream consists of only a single field.
314+
if current_field_idx == 1 {
315+
let is_eof_next = reader.fill_buf()?.is_empty();
316+
317+
if is_eof_next && line.last() != Some(&newline_char) {
318+
if only_delimited {
319+
suppressed = true;
276320
} else {
277-
print_delim = true;
321+
// GNU cut prints the whole line if no delimiter is found.
322+
out.write_all(&line)?;
278323
}
279-
out.write_all(segment.as_slice())?;
280-
} else {
281324
break;
282325
}
283326
}
327+
328+
if range_idx < ranges.len() && current_field_idx > ranges[range_idx].high {
329+
range_idx += 1;
330+
331+
// EARLY EXIT: If we've exhausted all ranges, stop reading the stream entirely.
332+
if range_idx >= ranges.len() {
333+
break;
334+
}
335+
}
336+
337+
// Check if the current field falls inside the current active range
338+
let is_selected = range_idx < ranges.len() && current_field_idx >= ranges[range_idx].low;
339+
340+
if is_selected {
341+
if first_field_printed {
342+
out.write_all(out_delim)?;
343+
}
344+
345+
let has_newline = line.last() == Some(&newline_char);
346+
let content = if has_newline {
347+
&line[..line.len() - 1]
348+
} else {
349+
&line[..]
350+
};
351+
352+
out.write_all(content)?;
353+
first_field_printed = true;
354+
}
355+
356+
current_field_idx += 1;
357+
}
358+
359+
if has_data && !suppressed {
360+
out.write_all(&[newline_char])?;
284361
}
285-
out.write_all(&[newline_char])?;
362+
286363
Ok(())
287364
}
288365

@@ -297,7 +374,14 @@ fn cut_fields<R: Read, W: Write>(
297374
match field_opts.delimiter {
298375
Delimiter::Slice(delim) if delim == [newline_char] => {
299376
let out_delim = opts.out_delimiter.unwrap_or(delim);
300-
cut_fields_newline_char_delim(reader, out, ranges, newline_char, out_delim)
377+
cut_fields_newline_char_delim(
378+
reader,
379+
out,
380+
ranges,
381+
field_opts.only_delimited,
382+
newline_char,
383+
out_delim,
384+
)
301385
}
302386
Delimiter::Slice(delim) => {
303387
let matcher = ExactMatcher::new(delim);

tests/by-util/test_cut.rs

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,205 @@ fn test_newline_as_delimiter_with_output_delimiter() {
301301
.stdout_only_bytes("a:b\n");
302302
}
303303

304+
#[test]
305+
fn test_newline_as_delimiter_no_delimiter_suppressed() {
306+
for param in ["-s", "--only-delimited", "--only-del"] {
307+
new_ucmd!()
308+
.args(&["-d", "\n", param, "-f", "1"])
309+
.pipe_in("abc")
310+
.succeeds()
311+
.no_output();
312+
}
313+
}
314+
315+
#[test]
316+
fn test_newline_as_delimiter_found_not_suppressed() {
317+
// Has an internal \n delimiter, so -s shouldn't suppress it
318+
for param in ["-s", "--only-delimited", "--only-del"] {
319+
new_ucmd!()
320+
.args(&["-d", "\n", param, "-f", "1"])
321+
.pipe_in("abc\ndef\n")
322+
.succeeds()
323+
.stdout_only("abc\n");
324+
}
325+
}
326+
327+
#[test]
328+
fn test_newline_as_delimiter_multiple_fields() {
329+
// Check field selection when \n is the delimiter
330+
new_ucmd!()
331+
.args(&["-d", "\n", "-f", "2"])
332+
.pipe_in("abc\ndef\n")
333+
.succeeds()
334+
.stdout_only("def\n");
335+
}
336+
337+
#[test]
338+
fn test_newline_as_delimiter_double_newline() {
339+
// Field 2 is the empty space between newlines
340+
new_ucmd!()
341+
.args(&["-d", "\n", "-s", "-f", "2"])
342+
.pipe_in("abc\n\n")
343+
.succeeds()
344+
.stdout_only("\n");
345+
346+
// Requesting both fields
347+
new_ucmd!()
348+
.args(&["-d", "\n", "-s", "-f", "1,2"])
349+
.pipe_in("abc\n\n")
350+
.succeeds()
351+
.stdout_only("abc\n\n");
352+
}
353+
354+
#[test]
355+
fn test_newline_as_delimiter_only_newlines() {
356+
// Extracting empty fields from a string of just newlines
357+
new_ucmd!()
358+
.args(&["-d", "\n", "-s", "-f", "1"])
359+
.pipe_in("\n\n")
360+
.succeeds()
361+
.stdout_only("\n");
362+
363+
new_ucmd!()
364+
.args(&["-d", "\n", "-s", "-f", "2"])
365+
.pipe_in("\n\n")
366+
.succeeds()
367+
.stdout_only("\n");
368+
369+
new_ucmd!()
370+
.args(&["-d", "\n", "-s", "-f", "1,2"])
371+
.pipe_in("\n\n")
372+
.succeeds()
373+
.stdout_only("\n\n");
374+
}
375+
376+
#[test]
377+
fn test_newline_as_delimiter_last_field_no_newline() {
378+
// The last chunk is Field 2 even without a final newline
379+
new_ucmd!()
380+
.args(&["-d", "\n", "-f", "2"])
381+
.pipe_in("abc\ndef")
382+
.succeeds()
383+
.stdout_only("def\n");
384+
}
385+
386+
#[test]
387+
fn test_newline_as_delimiter_complement() {
388+
// Select everything except the second line
389+
new_ucmd!()
390+
.args(&["-d", "\n", "-f", "2", "--complement"])
391+
.pipe_in("line1\nline2\nline3\n")
392+
.succeeds()
393+
.stdout_only("line1\nline3\n");
394+
}
395+
396+
#[test]
397+
fn test_newline_as_delimiter_out_of_bounds() {
398+
// GNU cut: print an empty string + terminator for missing fields
399+
new_ucmd!()
400+
.args(&["-d", "\n", "-f", "3"])
401+
.pipe_in("a\nb\n")
402+
.succeeds()
403+
.stdout_only("\n");
404+
405+
// GNU cut avoids trailing delimiters for out-of-bounds fields when delimiter is \n
406+
new_ucmd!()
407+
.args(&["-d", "\n", "-f", "1,3"])
408+
.pipe_in("a\nb\n")
409+
.succeeds()
410+
.stdout_only("a\n");
411+
}
412+
413+
#[test]
414+
fn test_newline_as_delimiter_no_delimiter_prints_all() {
415+
// GNU cut: If no delimiter is found, the entire line (the whole file)
416+
// is printed regardless of the field requested, unless -s is used.
417+
new_ucmd!()
418+
.args(&["-d", "\n", "-f", "2"])
419+
.pipe_in("a")
420+
.succeeds()
421+
.stdout_only("a\n");
422+
}
423+
424+
#[test]
425+
fn test_newline_as_delimiter_empty_input() {
426+
new_ucmd!()
427+
.args(&["-d", "\n", "-f", "1"])
428+
.pipe_in("")
429+
.succeeds()
430+
.no_output();
431+
}
432+
433+
#[test]
434+
fn test_newline_as_delimiter_s_flag_no_newline_at_all() {
435+
new_ucmd!()
436+
.args(&["-d", "\n", "-s", "-f", "1"])
437+
.pipe_in("abc")
438+
.succeeds()
439+
.no_output();
440+
}
441+
442+
#[test]
443+
fn test_newline_as_delimiter_single_field_included() {
444+
for param in ["-s", "--only-delimited", "--only-del"] {
445+
new_ucmd!()
446+
.args(&["-d", "\n", param, "-f", "1"])
447+
.pipe_in("abc\n")
448+
.succeeds()
449+
.stdout_only("abc\n"); // GNU cut outputs the field + terminator
450+
}
451+
}
452+
453+
#[test]
454+
fn test_newline_as_delimiter_intervening_skipped_fields() {
455+
// Selecting non-adjacent lines (Fields 1 and 3)
456+
new_ucmd!()
457+
.args(&["-d", "\n", "-f", "1,3"])
458+
.pipe_in("line1\nline2\nline3\n")
459+
.succeeds()
460+
.stdout_only("line1\nline3\n");
461+
}
462+
463+
#[test]
464+
fn test_newline_as_delimiter_multibyte_normalization() {
465+
// Ensure multibyte records at EOF still get a normalized newline
466+
new_ucmd!()
467+
.args(&["-d", "\n", "-f", "2"])
468+
.pipe_in("\n😼")
469+
.succeeds()
470+
.stdout_only("😼\n");
471+
}
472+
473+
#[test]
474+
fn test_newline_as_delimiter_empty_first_record() {
475+
// Select Field 2 when Field 1 is empty
476+
new_ucmd!()
477+
.args(&["-d", "\n", "-f", "2"])
478+
.pipe_in("\nb")
479+
.succeeds()
480+
.stdout_only("b\n");
481+
}
482+
483+
#[test]
484+
fn test_newline_as_delimiter_overlapping_unordered_ranges() {
485+
// Request fields out of order and with overlapping ranges
486+
new_ucmd!()
487+
.args(&["-d", "\n", "-f", "2-3,1,2"])
488+
.pipe_in("a\nb\nc\n")
489+
.succeeds()
490+
.stdout_only("a\nb\nc\n");
491+
}
492+
493+
#[test]
494+
fn test_newline_as_delimiter_complement_last_record() {
495+
// Test --complement on the final record
496+
new_ucmd!()
497+
.args(&["-d", "\n", "-f", "1", "--complement"])
498+
.pipe_in("a\nb")
499+
.succeeds()
500+
.stdout_only("b\n");
501+
}
502+
304503
#[test]
305504
fn test_multiple_delimiters() {
306505
new_ucmd!()

0 commit comments

Comments
 (0)