Skip to content

Commit cd61ead

Browse files
dqkqd and alamb authored
feat: parse DataType List, ListView, LargeList, LargeListView, FixedSizeList (#8649)
# Which issue does this PR close?

- Part of #8648. This PR only implements parsing for the list types, to make review easier.

# Rationale for this change

The format for `DataType::List` includes:

- [x] `List(Int64)`: list not nullable.
- [x] `List(nullable Int64)`: list nullable.
- [x] `List(nullable Int64, field: 'foo')`: list nullable with field.

~`List(nullable Int64, metadata: {"foo1": "value1"})`: list with metadata.~

(... The list goes on for `ListView`, `LargeList`, `LargeListView`, `FixedSizeList`)

`parse_data_type` either fails on, or incorrectly parses, the data types listed above.

# What changes are included in this PR?

- Add `Token::...` variants to support the new `Display` format for list types introduced in #8351 (e.g. `FixedSizeList(5 x nullable Int64, field: 'foo')`).
- Add `fn parse_opt_nullable` to check whether the nested data type is nullable.
- Add `parse_single_quoted_string` and `parse_list_field_name` to handle `field: 'foo'`.

# Are these changes tested?

Yes. Added round trip tests.

# Are there any user-facing changes?

Yes. This is related to #8351.

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 2eabb59 commit cd61ead

File tree

1 file changed

+175
-21
lines changed

1 file changed

+175
-21
lines changed

arrow-schema/src/datatype_parse.rs

Lines changed: 175 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,9 @@ impl<'a> Parser<'a> {
8383
Token::Decimal256 => self.parse_decimal_256(),
8484
Token::Dictionary => self.parse_dictionary(),
8585
Token::List => self.parse_list(),
86+
Token::ListView => self.parse_list_view(),
8687
Token::LargeList => self.parse_large_list(),
88+
Token::LargeListView => self.parse_large_list_view(),
8789
Token::FixedSizeList => self.parse_fixed_size_list(),
8890
Token::Struct => self.parse_struct(),
8991
tok => Err(make_error(
@@ -93,35 +95,87 @@ impl<'a> Parser<'a> {
9395
}
9496
}
9597

96-
/// Parses the List type
98+
/// Parses the optional list field name (`, field: 'field_name'`). Returns the default list field name if the next token is not a comma.
99+
fn parse_list_field_name(&mut self, context: &str) -> ArrowResult<String> {
100+
// field must be after a comma
101+
if self
102+
.tokenizer
103+
.next_if(|next| matches!(next, Ok(Token::Comma)))
104+
.is_none()
105+
{
106+
return Ok(Field::LIST_FIELD_DEFAULT_NAME.into());
107+
}
108+
109+
// expects: `field: 'field_name'`.
110+
self.expect_token(Token::Field)?;
111+
self.expect_token(Token::Colon)?;
112+
self.parse_single_quoted_string(context)
113+
}
114+
115+
/// Parses the List type (called after `List` has been consumed)
116+
/// E.g: List(nullable Int64, field: 'foo')
97117
fn parse_list(&mut self) -> ArrowResult<DataType> {
98118
self.expect_token(Token::LParen)?;
119+
let nullable = self.parse_opt_nullable();
120+
let data_type = self.parse_next_type()?;
121+
let field = self.parse_list_field_name("List")?;
122+
self.expect_token(Token::RParen)?;
123+
Ok(DataType::List(Arc::new(Field::new(
124+
field, data_type, nullable,
125+
))))
126+
}
127+
128+
/// Parses the ListView type (called after `ListView` has been consumed)
129+
/// E.g: ListView(nullable Int64, field: 'foo')
130+
fn parse_list_view(&mut self) -> ArrowResult<DataType> {
131+
self.expect_token(Token::LParen)?;
132+
let nullable = self.parse_opt_nullable();
99133
let data_type = self.parse_next_type()?;
134+
let field = self.parse_list_field_name("ListView")?;
100135
self.expect_token(Token::RParen)?;
101-
Ok(DataType::List(Arc::new(Field::new_list_field(
102-
data_type, true,
136+
Ok(DataType::ListView(Arc::new(Field::new(
137+
field, data_type, nullable,
103138
))))
104139
}
105140

106-
/// Parses the LargeList type
141+
/// Parses the LargeList type (called after `LargeList` has been consumed)
142+
/// E.g: LargeList(nullable Int64, field: 'foo')
107143
fn parse_large_list(&mut self) -> ArrowResult<DataType> {
108144
self.expect_token(Token::LParen)?;
145+
let nullable = self.parse_opt_nullable();
109146
let data_type = self.parse_next_type()?;
147+
let field = self.parse_list_field_name("LargeList")?;
110148
self.expect_token(Token::RParen)?;
111-
Ok(DataType::LargeList(Arc::new(Field::new_list_field(
112-
data_type, true,
149+
Ok(DataType::LargeList(Arc::new(Field::new(
150+
field, data_type, nullable,
113151
))))
114152
}
115153

116-
/// Parses the FixedSizeList type
154+
/// Parses the LargeListView type (called after `LargeListView` has been consumed)
155+
/// E.g: LargeListView(nullable Int64, field: 'foo')
156+
fn parse_large_list_view(&mut self) -> ArrowResult<DataType> {
157+
self.expect_token(Token::LParen)?;
158+
let nullable = self.parse_opt_nullable();
159+
let data_type = self.parse_next_type()?;
160+
let field = self.parse_list_field_name("LargeListView")?;
161+
self.expect_token(Token::RParen)?;
162+
Ok(DataType::LargeListView(Arc::new(Field::new(
163+
field, data_type, nullable,
164+
))))
165+
}
166+
167+
/// Parses the FixedSizeList type (called after `FixedSizeList` has been consumed)
168+
/// E.g: FixedSizeList(5 x nullable Int64, field: 'foo')
117169
fn parse_fixed_size_list(&mut self) -> ArrowResult<DataType> {
118170
self.expect_token(Token::LParen)?;
119171
let length = self.parse_i32("FixedSizeList")?;
120-
self.expect_token(Token::Comma)?;
172+
self.expect_token(Token::X)?;
173+
let nullable = self.parse_opt_nullable();
121174
let data_type = self.parse_next_type()?;
175+
let field = self.parse_list_field_name("FixedSizeList")?;
122176
self.expect_token(Token::RParen)?;
123177
Ok(DataType::FixedSizeList(
124-
Arc::new(Field::new_list_field(data_type, true)),
178+
Arc::new(Field::new(field, data_type, nullable)),
125179
length,
126180
))
127181
}
@@ -150,6 +204,19 @@ impl<'a> Parser<'a> {
150204
}
151205
}
152206

207+
/// Parses the next single quoted string
208+
fn parse_single_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
209+
let token = self.next_token()?;
210+
if let Token::SingleQuotedString(string) = token {
211+
Ok(string)
212+
} else {
213+
Err(make_error(
214+
self.val,
215+
&format!("expected single quoted string for {context}, got '{token}'"),
216+
))
217+
}
218+
}
219+
153220
/// Parses the next integer value
154221
fn parse_i64(&mut self, context: &str) -> ArrowResult<i64> {
155222
match self.next_token()? {
@@ -340,6 +407,8 @@ impl<'a> Parser<'a> {
340407
Box::new(value_type),
341408
))
342409
}
410+
411+
/// Parses the next Struct (called after `Struct` has been consumed)
343412
fn parse_struct(&mut self) -> ArrowResult<DataType> {
344413
self.expect_token(Token::LParen)?;
345414
let mut fields = Vec::new();
@@ -354,16 +423,13 @@ impl<'a> Parser<'a> {
354423
tok => {
355424
return Err(make_error(
356425
self.val,
357-
&format!("Expected a quoted string for a field name; got {tok:?}"),
426+
&format!("Expected a double quoted string for a field name; got {tok:?}"),
358427
));
359428
}
360429
};
361430
self.expect_token(Token::Colon)?;
362431

363-
let nullable = self
364-
.tokenizer
365-
.next_if(|next| matches!(next, Ok(Token::Nullable)))
366-
.is_some();
432+
let nullable = self.parse_opt_nullable();
367433
let field_type = self.parse_next_type()?;
368434
fields.push(Arc::new(Field::new(field_name, field_type, nullable)));
369435
match self.next_token()? {
@@ -382,6 +448,13 @@ impl<'a> Parser<'a> {
382448
Ok(DataType::Struct(Fields::from(fields)))
383449
}
384450

451+
/// Consumes the next token if it is `Token::Nullable`; returns `true` if it was consumed, `false` otherwise
452+
fn parse_opt_nullable(&mut self) -> bool {
453+
self.tokenizer
454+
.next_if(|next| matches!(next, Ok(Token::Nullable)))
455+
.is_some()
456+
}
457+
385458
/// return the next token, or an error if there are none left
386459
fn next_token(&mut self) -> ArrowResult<Token> {
387460
match self.tokenizer.next() {
@@ -406,6 +479,11 @@ fn is_separator(c: char) -> bool {
406479
c == '(' || c == ')' || c == ',' || c == ':' || c == ' '
407480
}
408481

482+
enum QuoteType {
483+
Double,
484+
Single,
485+
}
486+
409487
#[derive(Debug)]
410488
/// Splits a string like `Dictionary(Int32, Int64)` into tokens suitable for parsing
411489
///
@@ -497,7 +575,9 @@ impl<'a> Tokenizer<'a> {
497575
"Date64" => Token::SimpleType(DataType::Date64),
498576

499577
"List" => Token::List,
578+
"ListView" => Token::ListView,
500579
"LargeList" => Token::LargeList,
580+
"LargeListView" => Token::LargeListView,
501581
"FixedSizeList" => Token::FixedSizeList,
502582

503583
"s" | "Second" => Token::TimeUnit(TimeUnit::Second),
@@ -527,6 +607,8 @@ impl<'a> Tokenizer<'a> {
527607
"None" => Token::None,
528608

529609
"nullable" => Token::Nullable,
610+
"field" => Token::Field,
611+
"x" => Token::X,
530612

531613
"Struct" => Token::Struct,
532614

@@ -537,9 +619,14 @@ impl<'a> Tokenizer<'a> {
537619
Ok(token)
538620
}
539621

540-
/// Parses e.g. `"foo bar"`
541-
fn parse_quoted_string(&mut self) -> ArrowResult<Token> {
542-
if self.next_char() != Some('\"') {
622+
/// Parses e.g. `"foo bar"`, `'foo bar'`
623+
fn parse_quoted_string(&mut self, quote_type: QuoteType) -> ArrowResult<Token> {
624+
let quote = match quote_type {
625+
QuoteType::Double => '\"',
626+
QuoteType::Single => '\'',
627+
};
628+
629+
if self.next_char() != Some(quote) {
543630
return Err(make_error(self.val, "Expected \""));
544631
}
545632

@@ -561,7 +648,7 @@ impl<'a> Tokenizer<'a> {
561648
is_escaped = true;
562649
self.word.push(c);
563650
}
564-
'"' => {
651+
c if c == quote => {
565652
if is_escaped {
566653
self.word.push(c);
567654
is_escaped = false;
@@ -585,7 +672,10 @@ impl<'a> Tokenizer<'a> {
585672
return Err(make_error(self.val, "empty strings aren't allowed"));
586673
}
587674

588-
Ok(Token::DoubleQuotedString(val))
675+
match quote_type {
676+
QuoteType::Double => Ok(Token::DoubleQuotedString(val)),
677+
QuoteType::Single => Ok(Token::SingleQuotedString(val)),
678+
}
589679
}
590680
}
591681

@@ -601,7 +691,10 @@ impl Iterator for Tokenizer<'_> {
601691
continue;
602692
}
603693
'"' => {
604-
return Some(self.parse_quoted_string());
694+
return Some(self.parse_quoted_string(QuoteType::Double));
695+
}
696+
'\'' => {
697+
return Some(self.parse_quoted_string(QuoteType::Single));
605698
}
606699
'(' => {
607700
self.next_char();
@@ -652,19 +745,26 @@ enum Token {
652745
None,
653746
Integer(i64),
654747
DoubleQuotedString(String),
748+
SingleQuotedString(String),
655749
List,
750+
ListView,
656751
LargeList,
752+
LargeListView,
657753
FixedSizeList,
658754
Struct,
659755
Nullable,
756+
Field,
757+
X,
660758
}
661759

662760
impl Display for Token {
663761
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
664762
match self {
665763
Token::SimpleType(t) => write!(f, "{t}"),
666764
Token::List => write!(f, "List"),
765+
Token::ListView => write!(f, "ListView"),
667766
Token::LargeList => write!(f, "LargeList"),
767+
Token::LargeListView => write!(f, "LargeListView"),
668768
Token::FixedSizeList => write!(f, "FixedSizeList"),
669769
Token::Timestamp => write!(f, "Timestamp"),
670770
Token::Time32 => write!(f, "Time32"),
@@ -687,8 +787,11 @@ impl Display for Token {
687787
Token::Dictionary => write!(f, "Dictionary"),
688788
Token::Integer(v) => write!(f, "Integer({v})"),
689789
Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
790+
Token::SingleQuotedString(s) => write!(f, "SingleQuotedString({s})"),
690791
Token::Struct => write!(f, "Struct"),
691792
Token::Nullable => write!(f, "nullable"),
793+
Token::Field => write!(f, "field"),
794+
Token::X => write!(f, "x"),
692795
}
693796
}
694797
}
@@ -828,7 +931,58 @@ mod test {
828931
),
829932
])),
830933
DataType::Struct(Fields::empty()),
831-
// TODO support more structured types (List, LargeList, Union, Map, RunEndEncoded, etc)
934+
DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))),
935+
DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
936+
DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
937+
DataType::List(Arc::new(Field::new("Int64", DataType::Int64, false))),
938+
DataType::List(Arc::new(Field::new(
939+
"nested_list",
940+
DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))),
941+
true,
942+
))),
943+
DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
944+
DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
945+
DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
946+
DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
947+
DataType::ListView(Arc::new(Field::new(
948+
"nested_list_view",
949+
DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
950+
true,
951+
))),
952+
DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, true))),
953+
DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, false))),
954+
DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
955+
DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, false))),
956+
DataType::LargeList(Arc::new(Field::new(
957+
"nested_large_list",
958+
DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))),
959+
true,
960+
))),
961+
DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, true))),
962+
DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, false))),
963+
DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
964+
DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, false))),
965+
DataType::LargeListView(Arc::new(Field::new(
966+
"nested_large_list_view",
967+
DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))),
968+
true,
969+
))),
970+
DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 2),
971+
DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, false)), 2),
972+
DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, true)), 2),
973+
DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, false)), 2),
974+
DataType::FixedSizeList(
975+
Arc::new(Field::new(
976+
"nested_fixed_size_list",
977+
DataType::FixedSizeList(
978+
Arc::new(Field::new("Int64", DataType::Int64, true)),
979+
2,
980+
),
981+
true,
982+
)),
983+
2,
984+
),
985+
// TODO support more structured types (Union, Map, RunEndEncoded, etc)
832986
]
833987
}
834988

0 commit comments

Comments
 (0)