-
Notifications
You must be signed in to change notification settings - Fork 1.6k
[ty] Fix empty spans following a line terminator and unprintable character spans in diagnostics #19535
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ty] Fix empty spans following a line terminator and unprintable character spans in diagnostics #19535
Changes from 11 commits
366841e
7c31a78
02560f7
f7d28b1
0c4652a
c22449c
d746145
fffbb7d
fe82d7b
665909e
6180806
9c82529
3031673
a4f7434
cdef1a1
f606453
a1b3dcf
f5410bf
5ae6367
2e2ef66
c42ea1d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,3 +1,4 @@ | ||||||||||||
| use std::borrow::Cow; | ||||||||||||
| use std::collections::BTreeMap; | ||||||||||||
| use std::path::Path; | ||||||||||||
|
|
||||||||||||
|
|
@@ -7,7 +8,7 @@ use ruff_annotate_snippets::{ | |||||||||||
| }; | ||||||||||||
| use ruff_notebook::{Notebook, NotebookIndex}; | ||||||||||||
| use ruff_source_file::{LineIndex, OneIndexed, SourceCode}; | ||||||||||||
| use ruff_text_size::{TextRange, TextSize}; | ||||||||||||
| use ruff_text_size::{TextLen, TextRange, TextSize}; | ||||||||||||
|
|
||||||||||||
| use crate::diagnostic::stylesheet::DiagnosticStylesheet; | ||||||||||||
| use crate::{ | ||||||||||||
|
|
@@ -520,7 +521,7 @@ impl<'r> RenderableSnippets<'r> { | |||||||||||
| #[derive(Debug)] | ||||||||||||
| struct RenderableSnippet<'r> { | ||||||||||||
| /// The actual snippet text. | ||||||||||||
| snippet: &'r str, | ||||||||||||
| snippet: Cow<'r, str>, | ||||||||||||
| /// The absolute line number corresponding to where this | ||||||||||||
| /// snippet begins. | ||||||||||||
| line_start: OneIndexed, | ||||||||||||
|
|
@@ -580,6 +581,13 @@ impl<'r> RenderableSnippet<'r> { | |||||||||||
| .iter() | ||||||||||||
| .map(|ann| RenderableAnnotation::new(snippet_start, ann)) | ||||||||||||
| .collect(); | ||||||||||||
|
|
||||||||||||
| let EscapedSourceCode { | ||||||||||||
| text: snippet, | ||||||||||||
| annotations, | ||||||||||||
| } = replace_whitespace_and_unprintable(snippet, annotations) | ||||||||||||
| .fix_up_empty_spans_after_line_terminator(); | ||||||||||||
|
|
||||||||||||
| RenderableSnippet { | ||||||||||||
| snippet, | ||||||||||||
| line_start, | ||||||||||||
|
|
@@ -590,7 +598,7 @@ impl<'r> RenderableSnippet<'r> { | |||||||||||
|
|
||||||||||||
| /// Convert this to an "annotate" snippet. | ||||||||||||
| fn to_annotate<'a>(&'a self, path: &'a str) -> AnnotateSnippet<'a> { | ||||||||||||
| AnnotateSnippet::source(self.snippet) | ||||||||||||
| AnnotateSnippet::source(&self.snippet) | ||||||||||||
| .origin(path) | ||||||||||||
| .line_start(self.line_start.get()) | ||||||||||||
| .annotations( | ||||||||||||
|
|
@@ -820,6 +828,248 @@ fn relativize_path<'p>(cwd: &SystemPath, path: &'p str) -> &'p str { | |||||||||||
| path | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /// A measure of the width of a line of text. | ||||||||||||
| #[derive(Clone, Copy, Default)] | ||||||||||||
| struct LineWidthBuilder { | ||||||||||||
| /// The width of the line. | ||||||||||||
| width: usize, | ||||||||||||
| /// The column of the line. | ||||||||||||
| /// This is used to calculate the width of tabs. | ||||||||||||
| column: usize, | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| impl LineWidthBuilder { | ||||||||||||
| fn get(&self) -> usize { | ||||||||||||
| self.width | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /// Adds the given character to the line width. | ||||||||||||
| #[must_use] | ||||||||||||
| fn add_char(mut self, c: char) -> Self { | ||||||||||||
| const TAB_SIZE: usize = 4; | ||||||||||||
| match c { | ||||||||||||
| '\t' => { | ||||||||||||
| let tab_offset = TAB_SIZE - (self.column % TAB_SIZE); | ||||||||||||
| self.width += tab_offset; | ||||||||||||
| self.column += tab_offset; | ||||||||||||
| } | ||||||||||||
| '\n' | '\r' => { | ||||||||||||
| self.width = 0; | ||||||||||||
| self.column = 0; | ||||||||||||
| } | ||||||||||||
| _ => { | ||||||||||||
| self.width += unicode_width::UnicodeWidthChar::width(c).unwrap_or(0); | ||||||||||||
| self.column += 1; | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
| self | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /// Given some source code and annotation ranges, this routine replaces tabs | ||||||||||||
| /// with ASCII whitespace, and unprintable characters with printable | ||||||||||||
| /// representations of them. | ||||||||||||
| /// | ||||||||||||
| /// The source code and annotations returned are updated to reflect changes made | ||||||||||||
| /// to the source code (if any). | ||||||||||||
| fn replace_whitespace_and_unprintable<'r>( | ||||||||||||
| source: &'r str, | ||||||||||||
| mut annotations: Vec<RenderableAnnotation<'r>>, | ||||||||||||
| ) -> EscapedSourceCode<'r> { | ||||||||||||
| let mut result = String::new(); | ||||||||||||
| let mut last_end = 0; | ||||||||||||
| let original_ranges: Vec<TextRange> = annotations.iter().map(|ann| ann.range).collect(); | ||||||||||||
| let mut line_width = LineWidthBuilder::default(); | ||||||||||||
|
||||||||||||
|
|
||||||||||||
| // Updates the annotation ranges given by the caller whenever a single byte (at `index` in | ||||||||||||
| // `source`) is replaced with `len` bytes. | ||||||||||||
| // | ||||||||||||
| // When the index occurs before the start of the range, the range is | ||||||||||||
| // offset by `len`. When the range occurs after or at the start but before | ||||||||||||
| // the end, then the end of the range only is offset by `len`. | ||||||||||||
| let mut update_ranges = |index: usize, len: u32| { | ||||||||||||
| for (ann, &original_range) in annotations.iter_mut().zip(&original_ranges) { | ||||||||||||
| if index < usize::from(original_range.start()) { | ||||||||||||
|
||||||||||||
| fn replace_whitespace_and_unprintable(source: &str, annotation_range: TextRange) -> SourceCode { | |
| let mut result = String::new(); | |
| let mut last_end = 0; | |
| let mut range = annotation_range; | |
| let mut line_width = LineWidthBuilder::new(IndentWidth::default()); |
though it pained me to collect here too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I realized that we didn't have a test for this in the stripped-down version, so I just added another test case. Without using original_ranges, this input:
��� # rendered as ^H^Z^[ in my editorcauses
thread 'diagnostic::render::full::tests::multiple_unprintable_characters' panicked at crates/ruff_annotate_snippets/src/renderer/display_list.rs:1450:29:
byte index 5 is not a char boundary; it is inside '␛' (bytes 4..7) of `␈␛`
From my earlier debugging, the issue was some kind of double counting in replace_whitespace_and_unprintable. The input annotation range here is 1..1, and the output in this panicking version is 5..5, instead of the correct 3..3, so it gets shifted twice if you compare against the current range.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ohh, I see now. This is a callback function. That's why it matters.
It seems unfortunate that we have to collect all original ranges even if the source code doesn't contain a single tab character (which should be the most common case).
Would it make sense to change the for (index, c) in source.char_indices() { to track the relative offset compared to the source offset. This way, you could call update_ranges with index + relative_offset and remove the original ranges thing entirely
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thinking about this: I think the relative offset is already known. It's result.text_len() - source.text_len(). That's how many characters where inserted by this function
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
result is likely empty until the very end of the function because we try to return a Cow::Borrowed(source) if nothing is modified, so I don't think we can get the offset from there.
It seems we update result everytime before we call update_ranges. Which makes sense to me. We update the source text and that, in return, requires updating the annotations.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alternatively, we could just do a regex search for tab and unprintable characters (looks like 5 distinct bytes?). If none are found, then you can just quit early with the Cow::Borrowed case. This search won't need to do UTF-8 decoding and regex might even vectorize it, so it could save you from allocating and UTF-8 decoding.
With all that said, even without that search to bail early, the up-front alloc here is almost certainly marginal. We are in the rendering code here, which does all sorts of allocs (including in annotate-snippets).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah maybe I'm misunderstanding the suggestion. In my naive attempt I tried result.text_len() - source.text_len() and kept getting underflow errors because we're building up the result over time, but source is always quite long from the beginning. Did you mean some sub-slice of source? Or maybe I injected this at the wrong point in the function.
What I meant to say above is "result doesn't have a comparable length to source until the very end of the function," not that it's totally empty.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh I see. In that case, isn't the relative offset result.text_len() - i?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, you're right, thanks! I had just missed accounting for the width of the character being inserted when I tried this initially.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given that we have to match on tabs here anyway, do you think the LineWidthBuilder is worth it or would it simplify the code if it were inlined instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it does look nicer inlined now that I tried it, thanks! You can see the whole looping logic on one screen now.
I don't see a great way to combine the two matches, but I still think it's an improvement.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I see how to combine them. Obviously \n and \r won't hit the unprintable branch 🤦
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is probably a pre-existing issue but we should also handle \r here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought this would be okay in this context because checking \n as the preceding character should cover both \n and \r\n line endings, but it's easy enough to add \r either way.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's true. But Ruff still supports \r (because python does)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added!
Uh oh!
There was an error while loading. Please reload this page.