Skip to content

Commit 69c5cfe

Browse files
committed
cut: fix -s flag for newline delimiter and improve performance
1 parent 2fba533 commit 69c5cfe

File tree

3 files changed

+296
-19
lines changed

3 files changed

+296
-19
lines changed

src/uu/cut/benches/cut_bench.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,24 @@ fn cut_fields_custom_delim(bencher: Bencher) {
7171
});
7272
}
7373

74+
/// Benchmark cutting fields with newline delimiter
75+
#[divan::bench]
76+
fn cut_fields_newline_delim(bencher: Bencher) {
77+
let mut data = Vec::new();
78+
for i in 0..100_000 {
79+
let line = format!("field_content_number_{i}\n");
80+
data.extend_from_slice(line.as_bytes());
81+
}
82+
let file_path = setup_test_file(&data);
83+
84+
bencher.bench(|| {
85+
black_box(run_util_function(
86+
uumain,
87+
&["-d", "\n", "-f", "1,3,5", file_path.to_str().unwrap()],
88+
));
89+
});
90+
}
91+
7492
fn main() {
7593
divan::main();
7694
}

src/uu/cut/src/cut.rs

Lines changed: 58 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55

6-
// spell-checker:ignore (ToDO) delim sourcefiles
6+
// spell-checker:ignore (ToDO) delim sourcefiles undelimited
77

88
use bstr::io::BufReadExt;
99
use clap::{Arg, ArgAction, ArgMatches, Command, builder::ValueParser};
@@ -254,35 +254,67 @@ fn cut_fields_implicit_out_delim<R: Read, W: Write, M: Matcher>(
254254
Ok(())
255255
}
256256

257-
/// The input delimiter is identical to `newline_char`
257+
/// Streams and filters fields using `\n` as the delimiter
258258
fn cut_fields_newline_char_delim<R: Read, W: Write>(
259259
reader: R,
260260
out: &mut W,
261261
ranges: &[Range],
262+
only_delimited: bool,
262263
newline_char: u8,
263264
out_delim: &[u8],
264265
) -> UResult<()> {
265-
let buf_in = BufReader::new(reader);
266-
267-
let segments: Vec<_> = buf_in.split(newline_char).filter_map(Result::ok).collect();
268-
let mut print_delim = false;
266+
let mut reader = BufReader::new(reader);
267+
let mut line = Vec::new();
268+
269+
// We start at 1 because 'cut' field indexing is 1-based
270+
let mut current_field_idx = 1;
271+
let mut first_field_printed = false;
272+
let mut has_data = false;
273+
let mut suppressed = false;
274+
275+
loop {
276+
line.clear();
277+
let bytes_read = reader.read_until(newline_char, &mut line)?;
278+
if bytes_read == 0 {
279+
break;
280+
}
281+
has_data = true;
269282

270-
for &Range { low, high } in ranges {
271-
for i in low..=high {
272-
// "- 1" is necessary because fields start from 1 whereas a Vec starts from 0
273-
if let Some(segment) = segments.get(i - 1) {
274-
if print_delim {
275-
out.write_all(out_delim)?;
276-
} else {
277-
print_delim = true;
278-
}
279-
out.write_all(segment.as_slice())?;
280-
} else {
283+
// To comply with -s when the stream consists of only a single field.
284+
if current_field_idx == 1 {
285+
let is_eof_next = reader.fill_buf()?.is_empty();
286+
if only_delimited && is_eof_next {
287+
suppressed = true;
281288
break;
282289
}
283290
}
291+
292+
if ranges
293+
.iter()
294+
.any(|r| current_field_idx >= r.low && current_field_idx <= r.high)
295+
{
296+
if first_field_printed {
297+
out.write_all(out_delim)?;
298+
}
299+
300+
let has_newline = line.last() == Some(&newline_char);
301+
let content = if has_newline {
302+
&line[..line.len() - 1]
303+
} else {
304+
&line[..]
305+
};
306+
307+
out.write_all(content)?;
308+
first_field_printed = true;
309+
}
310+
311+
current_field_idx += 1;
284312
}
285-
out.write_all(&[newline_char])?;
313+
314+
if has_data && !suppressed {
315+
out.write_all(&[newline_char])?;
316+
}
317+
286318
Ok(())
287319
}
288320

@@ -297,7 +329,14 @@ fn cut_fields<R: Read, W: Write>(
297329
match field_opts.delimiter {
298330
Delimiter::Slice(delim) if delim == [newline_char] => {
299331
let out_delim = opts.out_delimiter.unwrap_or(delim);
300-
cut_fields_newline_char_delim(reader, out, ranges, newline_char, out_delim)
332+
cut_fields_newline_char_delim(
333+
reader,
334+
out,
335+
ranges,
336+
field_opts.only_delimited,
337+
newline_char,
338+
out_delim,
339+
)
301340
}
302341
Delimiter::Slice(delim) => {
303342
let matcher = ExactMatcher::new(delim);

tests/by-util/test_cut.rs

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,226 @@ fn test_newline_as_delimiter_with_output_delimiter() {
301301
.stdout_only_bytes("a:b\n");
302302
}
303303

304+
#[test]
305+
fn test_newline_as_delimiter_no_delimiter_suppressed() {
306+
for param in ["-s", "--only-delimited", "--only-del"] {
307+
new_ucmd!()
308+
.args(&["-d", "\n", param, "-f", "1"])
309+
.pipe_in("abc")
310+
.succeeds()
311+
.no_output();
312+
}
313+
}
314+
315+
#[test]
316+
fn test_newline_as_delimiter_found_not_suppressed() {
317+
// Has an internal \n delimiter, so -s shouldn't suppress it
318+
for param in ["-s", "--only-delimited", "--only-del"] {
319+
new_ucmd!()
320+
.args(&["-d", "\n", param, "-f", "1"])
321+
.pipe_in("abc\ndef\n")
322+
.succeeds()
323+
.stdout_only("abc\n");
324+
}
325+
}
326+
327+
#[test]
328+
fn test_newline_as_delimiter_multiple_fields() {
329+
// Check field selection when \n is the delimiter
330+
new_ucmd!()
331+
.args(&["-d", "\n", "-f", "2"])
332+
.pipe_in("abc\ndef\n")
333+
.succeeds()
334+
.stdout_only("def\n");
335+
}
336+
337+
#[test]
338+
fn test_newline_as_delimiter_double_newline() {
339+
// Field 2 is the empty space between newlines
340+
new_ucmd!()
341+
.args(&["-d", "\n", "-s", "-f", "2"])
342+
.pipe_in("abc\n\n")
343+
.succeeds()
344+
.stdout_only("\n");
345+
346+
// Requesting both fields
347+
new_ucmd!()
348+
.args(&["-d", "\n", "-s", "-f", "1,2"])
349+
.pipe_in("abc\n\n")
350+
.succeeds()
351+
.stdout_only("abc\n\n");
352+
}
353+
354+
#[test]
355+
fn test_newline_as_delimiter_only_newlines() {
356+
// Extracting empty fields from a string of just newlines
357+
new_ucmd!()
358+
.args(&["-d", "\n", "-s", "-f", "1"])
359+
.pipe_in("\n\n")
360+
.succeeds()
361+
.stdout_only("\n");
362+
363+
new_ucmd!()
364+
.args(&["-d", "\n", "-s", "-f", "2"])
365+
.pipe_in("\n\n")
366+
.succeeds()
367+
.stdout_only("\n");
368+
369+
new_ucmd!()
370+
.args(&["-d", "\n", "-s", "-f", "1,2"])
371+
.pipe_in("\n\n")
372+
.succeeds()
373+
.stdout_only("\n\n");
374+
}
375+
376+
#[test]
377+
fn test_newline_as_delimiter_last_field_no_newline() {
378+
// The last chunk is Field 2 even without a final newline
379+
new_ucmd!()
380+
.args(&["-d", "\n", "-f", "2"])
381+
.pipe_in("abc\ndef")
382+
.succeeds()
383+
.stdout_only("def\n");
384+
}
385+
386+
#[test]
387+
fn test_newline_as_delimiter_complement() {
388+
// Select everything except the second line
389+
new_ucmd!()
390+
.args(&["-d", "\n", "-f", "2", "--complement"])
391+
.pipe_in("line1\nline2\nline3\n")
392+
.succeeds()
393+
.stdout_only("line1\nline3\n");
394+
}
395+
396+
#[test]
397+
fn test_newline_as_delimiter_out_of_bounds() {
398+
// GNU cut: print an empty string + terminator for missing fields
399+
new_ucmd!()
400+
.args(&["-d", "\n", "-f", "3"])
401+
.pipe_in("a\nb\n")
402+
.succeeds()
403+
.stdout_only("\n");
404+
405+
// GNU cut avoids trailing delimiters for out-of-bounds fields when delimiter is \n
406+
new_ucmd!()
407+
.args(&["-d", "\n", "-f", "1,3"])
408+
.pipe_in("a\nb\n")
409+
.succeeds()
410+
.stdout_only("a\n");
411+
}
412+
413+
#[test]
414+
fn test_newline_as_delimiter_out_of_bounds_standard_behavior() {
415+
// Expect a newline for out-of-bounds fields if input isn't empty and -s is off
416+
new_ucmd!()
417+
.args(&["-d", "\n", "-f", "2"])
418+
.pipe_in("a")
419+
.succeeds()
420+
.stdout_only("\n");
421+
}
422+
423+
#[test]
424+
fn test_newline_as_delimiter_empty_input() {
425+
new_ucmd!()
426+
.args(&["-d", "\n", "-f", "1"])
427+
.pipe_in("")
428+
.succeeds()
429+
.no_output();
430+
}
431+
432+
#[test]
433+
fn test_newline_as_delimiter_s_flag_no_newline_at_all() {
434+
new_ucmd!()
435+
.args(&["-d", "\n", "-s", "-f", "1"])
436+
.pipe_in("abc")
437+
.succeeds()
438+
.no_output();
439+
}
440+
441+
#[test]
442+
fn test_newline_as_delimiter_single_field_suppressed() {
443+
// GNU cut treats the final '\n' as a record terminator, not a delimiter.
444+
// Therefore, "abc\n" contains 0 delimiters and must be suppressed by `-s`.
445+
for param in ["-s", "--only-delimited", "--only-del"] {
446+
new_ucmd!()
447+
.args(&["-d", "\n", param, "-f", "1"])
448+
.pipe_in("abc\n")
449+
.succeeds()
450+
.no_output(); // Corrected from stdout_only("abc\n")
451+
}
452+
}
453+
454+
#[test]
455+
fn test_newline_as_delimiter_intervening_skipped_fields() {
456+
// Selecting non-adjacent lines (Fields 1 and 3)
457+
new_ucmd!()
458+
.args(&["-d", "\n", "-f", "1,3"])
459+
.pipe_in("line1\nline2\nline3\n")
460+
.succeeds()
461+
.stdout_only("line1\nline3\n");
462+
}
463+
464+
#[test]
465+
fn test_newline_as_delimiter_multibyte_normalization() {
466+
// Ensure multibyte records at EOF still get a normalized newline
467+
new_ucmd!()
468+
.args(&["-d", "\n", "-f", "2"])
469+
.pipe_in("\n😼")
470+
.succeeds()
471+
.stdout_only("😼\n");
472+
}
473+
474+
#[test]
475+
fn test_newline_as_delimiter_empty_first_record() {
476+
// Select Field 2 when Field 1 is empty
477+
new_ucmd!()
478+
.args(&["-d", "\n", "-f", "2"])
479+
.pipe_in("\nb")
480+
.succeeds()
481+
.stdout_only("b\n");
482+
}
483+
484+
#[test]
485+
fn test_newline_as_delimiter_overlapping_unordered_ranges() {
486+
// Request fields out of order and with overlapping ranges
487+
new_ucmd!()
488+
.args(&["-d", "\n", "-f", "2-3,1,2"])
489+
.pipe_in("a\nb\nc\n")
490+
.succeeds()
491+
.stdout_only("a\nb\nc\n");
492+
}
493+
494+
#[test]
495+
fn test_newline_as_delimiter_empty_first_record_select_second() {
496+
// Input: \nb (Field 1: empty, Field 2: 'b')
497+
new_ucmd!()
498+
.args(&["-d", "\n", "-f", "2"])
499+
.pipe_in("\nb")
500+
.succeeds()
501+
.stdout_only("b\n");
502+
}
503+
504+
#[test]
505+
fn test_newline_as_delimiter_complement_last_record() {
506+
// Test --complement on the final record
507+
new_ucmd!()
508+
.args(&["-d", "\n", "-f", "1", "--complement"])
509+
.pipe_in("a\nb")
510+
.succeeds()
511+
.stdout_only("b\n");
512+
}
513+
514+
#[test]
515+
fn test_newline_as_delimiter_s_suppress_entire_file() {
516+
// No newlines means no delimiters (-s should suppress everything)
517+
new_ucmd!()
518+
.args(&["-d", "\n", "-s", "-f", "1"])
519+
.pipe_in("no_newline_here")
520+
.succeeds()
521+
.no_output();
522+
}
523+
304524
#[test]
305525
fn test_multiple_delimiters() {
306526
new_ucmd!()

0 commit comments

Comments
 (0)