-
Notifications
You must be signed in to change notification settings - Fork 1k
feat: parse DataType List, ListView, LargeList, LargeListView, FixedSizeList
#8649
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 10 commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
79b6f7e
support parse list
dqkqd 28c6397
docs
dqkqd e25e4a1
refactor field name out parse list
dqkqd 08a6577
make `parse_list` easy to read
dqkqd a8822e5
support `ListView`
dqkqd 776288e
support `LargeList`, `LargeListView`
dqkqd 1c2edb4
return default list field name instead of None
dqkqd 2bc42d9
remove unneeded context name
dqkqd b00d849
support `FixedSizeList`
dqkqd 5b6cbd6
rename `Token::Count` to `Token::X`
dqkqd da856f9
docs and example
dqkqd 82197b5
typo
dqkqd 49495a6
rename `nullable` to `parse_opt_nullable`
dqkqd b6d502c
Merge remote-tracking branch 'public/main' into support-parse-list
dqkqd fca8cb3
merge main@public
dqkqd File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -83,7 +83,9 @@ impl<'a> Parser<'a> { | |
| Token::Decimal256 => self.parse_decimal_256(), | ||
| Token::Dictionary => self.parse_dictionary(), | ||
| Token::List => self.parse_list(), | ||
| Token::ListView => self.parse_list_view(), | ||
| Token::LargeList => self.parse_large_list(), | ||
| Token::LargeListView => self.parse_large_list_view(), | ||
| Token::FixedSizeList => self.parse_fixed_size_list(), | ||
| Token::Struct => self.parse_struct(), | ||
| tok => Err(make_error( | ||
|
|
@@ -93,35 +95,83 @@ impl<'a> Parser<'a> { | |
| } | ||
| } | ||
|
|
||
| /// Parses list field name | ||
| fn parse_list_field_name(&mut self, context: &str) -> ArrowResult<String> { | ||
| // field must be after a comma | ||
| if self | ||
| .tokenizer | ||
| .next_if(|next| matches!(next, Ok(Token::Comma))) | ||
| .is_none() | ||
| { | ||
| return Ok(Field::LIST_FIELD_DEFAULT_NAME.into()); | ||
| } | ||
|
|
||
| // expects: `field: 'field_name'`. | ||
| self.expect_token(Token::Field)?; | ||
| self.expect_token(Token::Colon)?; | ||
| self.parse_single_quoted_string(context) | ||
| } | ||
|
|
||
| /// Parses the List type | ||
| fn parse_list(&mut self) -> ArrowResult<DataType> { | ||
| self.expect_token(Token::LParen)?; | ||
| let nullable = self.nullable(); | ||
| let data_type = self.parse_next_type()?; | ||
| let field = self.parse_list_field_name("List")?; | ||
| self.expect_token(Token::RParen)?; | ||
| Ok(DataType::List(Arc::new(Field::new( | ||
| field, data_type, nullable, | ||
| )))) | ||
| } | ||
|
|
||
| /// Parses the ListView type | ||
| fn parse_list_view(&mut self) -> ArrowResult<DataType> { | ||
| self.expect_token(Token::LParen)?; | ||
| let nullable = self.nullable(); | ||
| let data_type = self.parse_next_type()?; | ||
| let field = self.parse_list_field_name("ListView")?; | ||
| self.expect_token(Token::RParen)?; | ||
| Ok(DataType::List(Arc::new(Field::new_list_field( | ||
| data_type, true, | ||
| Ok(DataType::ListView(Arc::new(Field::new( | ||
| field, data_type, nullable, | ||
| )))) | ||
| } | ||
|
|
||
| /// Parses the LargeList type | ||
| fn parse_large_list(&mut self) -> ArrowResult<DataType> { | ||
| self.expect_token(Token::LParen)?; | ||
| let nullable = self.nullable(); | ||
| let data_type = self.parse_next_type()?; | ||
| let field = self.parse_list_field_name("LargeList")?; | ||
| self.expect_token(Token::RParen)?; | ||
| Ok(DataType::LargeList(Arc::new(Field::new_list_field( | ||
| data_type, true, | ||
| Ok(DataType::LargeList(Arc::new(Field::new( | ||
| field, data_type, nullable, | ||
| )))) | ||
| } | ||
|
|
||
| /// Parses the LargeListView type | ||
| fn parse_large_list_view(&mut self) -> ArrowResult<DataType> { | ||
| self.expect_token(Token::LParen)?; | ||
| let nullable = self.nullable(); | ||
| let data_type = self.parse_next_type()?; | ||
| let field = self.parse_list_field_name("LargeListView")?; | ||
| self.expect_token(Token::RParen)?; | ||
| Ok(DataType::LargeListView(Arc::new(Field::new( | ||
| field, data_type, nullable, | ||
| )))) | ||
| } | ||
|
|
||
| /// Parses the FixedSizeList type | ||
| fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> { | ||
| self.expect_token(Token::LParen)?; | ||
| // expects: `length x #data_type [field]` | ||
| let length = self.parse_i32("FixedSizeList")?; | ||
| self.expect_token(Token::Comma)?; | ||
| self.expect_token(Token::X)?; | ||
| let nullable = self.nullable(); | ||
| let data_type = self.parse_next_type()?; | ||
| let field = self.parse_list_field_name("FixedSizeList")?; | ||
| self.expect_token(Token::RParen)?; | ||
| Ok(DataType::FixedSizeList( | ||
| Arc::new(Field::new_list_field(data_type, true)), | ||
| Arc::new(Field::new(field, data_type, nullable)), | ||
| length, | ||
| )) | ||
| } | ||
|
|
@@ -150,6 +200,19 @@ impl<'a> Parser<'a> { | |
| } | ||
| } | ||
|
|
||
| /// Parses the next single quoted string | ||
| fn parse_single_quoted_string(&mut self, context: &str) -> ArrowResult<String> { | ||
| let token = self.next_token()?; | ||
| if let Token::SingleQuotedString(string) = token { | ||
| Ok(string) | ||
| } else { | ||
| Err(make_error( | ||
| self.val, | ||
| &format!("expected single quoted string for {context}, got '{token}'"), | ||
| )) | ||
| } | ||
| } | ||
|
|
||
| /// Parses the next integer value | ||
| fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> { | ||
| match self.next_token()? { | ||
|
|
@@ -340,6 +403,8 @@ impl<'a> Parser<'a> { | |
| Box::new(value_type), | ||
| )) | ||
| } | ||
|
|
||
| /// Parses the next Struct (called after `Struct` has been consumed) | ||
| fn parse_struct(&mut self) -> ArrowResult<DataType> { | ||
| self.expect_token(Token::LParen)?; | ||
| let mut fields = Vec::new(); | ||
|
|
@@ -354,16 +419,13 @@ impl<'a> Parser<'a> { | |
| tok => { | ||
| return Err(make_error( | ||
| self.val, | ||
| &format!("Expected a quoted string for a field name; got {tok:?}"), | ||
| &format!("Expected a double quoted string for a field name; got {tok:?}"), | ||
| )); | ||
| } | ||
| }; | ||
| self.expect_token(Token::Colon)?; | ||
|
|
||
| let nullable = self | ||
| .tokenizer | ||
| .next_if(|next| matches!(next, Ok(Token::Nullable))) | ||
| .is_some(); | ||
| let nullable = self.nullable(); | ||
| let field_type = self.parse_next_type()?; | ||
| fields.push(Arc::new(Field::new(field_name, field_type, nullable))); | ||
| match self.next_token()? { | ||
|
|
@@ -382,6 +444,13 @@ impl<'a> Parser<'a> { | |
| Ok(DataType::Struct(Fields::from(fields))) | ||
| } | ||
|
|
||
| /// return and consume if the next token is `Token::Nullable` | ||
| fn nullable(&mut self) -> bool { | ||
| self.tokenizer | ||
| .next_if(|next| matches!(next, Ok(Token::Nullable))) | ||
| .is_some() | ||
| } | ||
|
|
||
| /// return the next token, or an error if there are none left | ||
| fn next_token(&mut self) -> ArrowResult<Token> { | ||
| match self.tokenizer.next() { | ||
|
|
@@ -406,6 +475,11 @@ fn is_separator(c: char) -> bool { | |
| c == '(' || c == ')' || c == ',' || c == ':' || c == ' ' | ||
| } | ||
|
|
||
| enum QuoteType { | ||
| Double, | ||
| Single, | ||
| } | ||
|
|
||
| #[derive(Debug)] | ||
| /// Splits a string like Dictionary(Int32, Int64) into tokens suitable for parsing | ||
| /// | ||
|
|
@@ -497,7 +571,9 @@ impl<'a> Tokenizer<'a> { | |
| "Date64" => Token::SimpleType(DataType::Date64), | ||
|
|
||
| "List" => Token::List, | ||
| "ListView" => Token::ListView, | ||
| "LargeList" => Token::LargeList, | ||
| "LargeListView" => Token::LargeListView, | ||
| "FixedSizeList" => Token::FixedSizeList, | ||
|
|
||
| "s" | "Second" => Token::TimeUnit(TimeUnit::Second), | ||
|
|
@@ -527,6 +603,8 @@ impl<'a> Tokenizer<'a> { | |
| "None" => Token::None, | ||
|
|
||
| "nullable" => Token::Nullable, | ||
| "field" => Token::Field, | ||
| "x" => Token::X, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I cannot find a better name for |
||
|
|
||
| "Struct" => Token::Struct, | ||
|
|
||
|
|
@@ -537,9 +615,14 @@ impl<'a> Tokenizer<'a> { | |
| Ok(token) | ||
| } | ||
|
|
||
| /// Parses e.g. `"foo bar"` | ||
| fn parse_quoted_string(&mut self) -> ArrowResult<Token> { | ||
| if self.next_char() != Some('\"') { | ||
| /// Parses e.g. `"foo bar"`, `'foo bar'` | ||
| fn parse_quoted_string(&mut self, quote_type: QuoteType) -> ArrowResult<Token> { | ||
| let quote = match quote_type { | ||
| QuoteType::Double => '\"', | ||
| QuoteType::Single => '\'', | ||
| }; | ||
|
|
||
| if self.next_char() != Some(quote) { | ||
| return Err(make_error(self.val, "Expected \"")); | ||
| } | ||
|
|
||
|
|
@@ -561,7 +644,7 @@ impl<'a> Tokenizer<'a> { | |
| is_escaped = true; | ||
| self.word.push(c); | ||
| } | ||
| '"' => { | ||
| c if c == quote => { | ||
| if is_escaped { | ||
| self.word.push(c); | ||
| is_escaped = false; | ||
|
|
@@ -585,7 +668,10 @@ impl<'a> Tokenizer<'a> { | |
| return Err(make_error(self.val, "empty strings aren't allowed")); | ||
| } | ||
|
|
||
| Ok(Token::DoubleQuotedString(val)) | ||
| match quote_type { | ||
| QuoteType::Double => Ok(Token::DoubleQuotedString(val)), | ||
| QuoteType::Single => Ok(Token::SingleQuotedString(val)), | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -601,7 +687,10 @@ impl Iterator for Tokenizer<'_> { | |
| continue; | ||
| } | ||
| '"' => { | ||
| return Some(self.parse_quoted_string()); | ||
| return Some(self.parse_quoted_string(QuoteType::Double)); | ||
| } | ||
| '\'' => { | ||
| return Some(self.parse_quoted_string(QuoteType::Single)); | ||
| } | ||
| '(' => { | ||
| self.next_char(); | ||
|
|
@@ -652,19 +741,26 @@ enum Token { | |
| None, | ||
| Integer(i64), | ||
| DoubleQuotedString(String), | ||
| SingleQuotedString(String), | ||
| List, | ||
| ListView, | ||
| LargeList, | ||
| LargeListView, | ||
| FixedSizeList, | ||
| Struct, | ||
| Nullable, | ||
| Field, | ||
| X, | ||
| } | ||
|
|
||
| impl Display for Token { | ||
| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||
| match self { | ||
| Token::SimpleType(t) => write!(f, "{t}"), | ||
| Token::List => write!(f, "List"), | ||
| Token::ListView => write!(f, "ListView"), | ||
| Token::LargeList => write!(f, "LargeList"), | ||
| Token::LargeListView => write!(f, "LargeListView"), | ||
| Token::FixedSizeList => write!(f, "FixedSizeList"), | ||
| Token::Timestamp => write!(f, "Timestamp"), | ||
| Token::Time32 => write!(f, "Time32"), | ||
|
|
@@ -687,8 +783,11 @@ impl Display for Token { | |
| Token::Dictionary => write!(f, "Dictionary"), | ||
| Token::Integer(v) => write!(f, "Integer({v})"), | ||
| Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"), | ||
| Token::SingleQuotedString(s) => write!(f, "SingleQuotedString({s})"), | ||
| Token::Struct => write!(f, "Struct"), | ||
| Token::Nullable => write!(f, "nullable"), | ||
| Token::Field => write!(f, "field"), | ||
| Token::X => write!(f, "x"), | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -828,7 +927,58 @@ mod test { | |
| ), | ||
| ])), | ||
| DataType::Struct(Fields::empty()), | ||
| // TODO support more structured types (List, LargeList, Union, Map, RunEndEncoded, etc) | ||
| DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))), | ||
| DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))), | ||
| DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))), | ||
| DataType::List(Arc::new(Field::new("Int64", DataType::Int64, false))), | ||
| DataType::List(Arc::new(Field::new( | ||
| "nested_list", | ||
| DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))), | ||
| true, | ||
| ))), | ||
| DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, true))), | ||
| DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, false))), | ||
| DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))), | ||
| DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, false))), | ||
| DataType::ListView(Arc::new(Field::new( | ||
| "nested_list_view", | ||
| DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))), | ||
| true, | ||
| ))), | ||
| DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, true))), | ||
| DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, false))), | ||
| DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))), | ||
| DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, false))), | ||
| DataType::LargeList(Arc::new(Field::new( | ||
| "nested_large_list", | ||
| DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))), | ||
| true, | ||
| ))), | ||
| DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, true))), | ||
| DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, false))), | ||
| DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))), | ||
| DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, false))), | ||
| DataType::LargeListView(Arc::new(Field::new( | ||
| "nested_large_list_view", | ||
| DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))), | ||
| true, | ||
| ))), | ||
| DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 2), | ||
| DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, false)), 2), | ||
| DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, true)), 2), | ||
| DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, false)), 2), | ||
| DataType::FixedSizeList( | ||
| Arc::new(Field::new( | ||
| "nested_large_list_view", | ||
| DataType::FixedSizeList( | ||
| Arc::new(Field::new("Int64", DataType::Int64, true)), | ||
| 2, | ||
| ), | ||
| true, | ||
| )), | ||
| 2, | ||
| ), | ||
| // TODO support more structured types (Union, Map, RunEndEncoded, etc) | ||
| ] | ||
| } | ||
|
|
||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.