Skip to content

Commit 876f397

Browse files
authored
Fix parquet string statistics generation (#643) (#676)
* Fix string statistics generation, add tests * fix Int96 stats test * Add notes for additional tickets
1 parent ead64b7 commit 876f397

File tree

2 files changed

+134
-17
lines changed

2 files changed

+134
-17
lines changed

parquet/src/column/writer.rs

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1687,6 +1687,128 @@ mod tests {
16871687
);
16881688
}
16891689

1690+
#[test]
1691+
fn test_bool_statistics() {
1692+
let stats = statistics_roundtrip::<BoolType>(&[true, false, false, true]);
1693+
assert!(stats.has_min_max_set());
1694+
// should it be BooleanStatistics??
1695+
// https://github.com/apache/arrow-rs/issues/659
1696+
if let Statistics::Int32(stats) = stats {
1697+
assert_eq!(stats.min(), &0);
1698+
assert_eq!(stats.max(), &1);
1699+
} else {
1700+
panic!("expecting Statistics::Int32, got {:?}", stats);
1701+
}
1702+
}
1703+
1704+
#[test]
1705+
fn test_int32_statistics() {
1706+
let stats = statistics_roundtrip::<Int32Type>(&[-1, 3, -2, 2]);
1707+
assert!(stats.has_min_max_set());
1708+
if let Statistics::Int32(stats) = stats {
1709+
assert_eq!(stats.min(), &-2);
1710+
assert_eq!(stats.max(), &3);
1711+
} else {
1712+
panic!("expecting Statistics::Int32, got {:?}", stats);
1713+
}
1714+
}
1715+
1716+
#[test]
1717+
fn test_int64_statistics() {
1718+
let stats = statistics_roundtrip::<Int64Type>(&[-1, 3, -2, 2]);
1719+
assert!(stats.has_min_max_set());
1720+
if let Statistics::Int64(stats) = stats {
1721+
assert_eq!(stats.min(), &-2);
1722+
assert_eq!(stats.max(), &3);
1723+
} else {
1724+
panic!("expecting Statistics::Int64, got {:?}", stats);
1725+
}
1726+
}
1727+
1728+
#[test]
1729+
fn test_int96_statistics() {
1730+
let input = vec![
1731+
Int96::from(vec![1, 20, 30]),
1732+
Int96::from(vec![3, 20, 10]),
1733+
Int96::from(vec![0, 20, 30]),
1734+
Int96::from(vec![2, 20, 30]),
1735+
]
1736+
.into_iter()
1737+
.collect::<Vec<Int96>>();
1738+
1739+
let stats = statistics_roundtrip::<Int96Type>(&input);
1740+
assert!(stats.has_min_max_set());
1741+
if let Statistics::Int96(stats) = stats {
1742+
assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30]));
1743+
assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10]));
1744+
} else {
1745+
panic!("expecting Statistics::Int96, got {:?}", stats);
1746+
}
1747+
}
1748+
1749+
#[test]
1750+
fn test_float_statistics() {
1751+
let stats = statistics_roundtrip::<FloatType>(&[-1.0, 3.0, -2.0, 2.0]);
1752+
assert!(stats.has_min_max_set());
1753+
if let Statistics::Float(stats) = stats {
1754+
assert_eq!(stats.min(), &-2.0);
1755+
assert_eq!(stats.max(), &3.0);
1756+
} else {
1757+
panic!("expecting Statistics::Float, got {:?}", stats);
1758+
}
1759+
}
1760+
1761+
#[test]
1762+
fn test_double_statistics() {
1763+
let stats = statistics_roundtrip::<DoubleType>(&[-1.0, 3.0, -2.0, 2.0]);
1764+
assert!(stats.has_min_max_set());
1765+
if let Statistics::Double(stats) = stats {
1766+
assert_eq!(stats.min(), &-2.0);
1767+
assert_eq!(stats.max(), &3.0);
1768+
} else {
1769+
panic!("expecting Statistics::Double, got {:?}", stats);
1770+
}
1771+
}
1772+
1773+
#[test]
1774+
fn test_byte_array_statistics() {
1775+
let input = vec!["aawaa", "zz", "aaw", "m", "qrs"]
1776+
.iter()
1777+
.map(|&s| s.into())
1778+
.collect::<Vec<ByteArray>>();
1779+
1780+
let stats = statistics_roundtrip::<ByteArrayType>(&input);
1781+
assert!(stats.has_min_max_set());
1782+
if let Statistics::ByteArray(stats) = stats {
1783+
assert_eq!(stats.min(), &ByteArray::from("aaw"));
1784+
assert_eq!(stats.max(), &ByteArray::from("zz"));
1785+
} else {
1786+
panic!("expecting Statistics::ByteArray, got {:?}", stats);
1787+
}
1788+
}
1789+
1790+
#[test]
1791+
fn test_fixed_len_byte_array_statistics() {
1792+
let input = vec!["aawaa", "zz ", "aaw ", "m ", "qrs "]
1793+
.iter()
1794+
.map(|&s| {
1795+
let b: ByteArray = s.into();
1796+
b.into()
1797+
})
1798+
.collect::<Vec<FixedLenByteArray>>();
1799+
1800+
let stats = statistics_roundtrip::<FixedLenByteArrayType>(&input);
1801+
assert!(stats.has_min_max_set());
1802+
// should it be FixedLenByteArray?
1803+
// https://github.com/apache/arrow-rs/issues/660
1804+
if let Statistics::ByteArray(stats) = stats {
1805+
assert_eq!(stats.min(), &ByteArray::from("aaw "));
1806+
assert_eq!(stats.max(), &ByteArray::from("zz "));
1807+
} else {
1808+
panic!("expecting Statistics::ByteArray, got {:?}", stats);
1809+
}
1810+
}
1811+
16901812
#[test]
16911813
fn test_float_statistics_nan_middle() {
16921814
let stats = statistics_roundtrip::<FloatType>(&[1.0, f32::NAN, 2.0]);

parquet/src/data_type.rs

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -128,23 +128,18 @@ impl std::fmt::Debug for ByteArray {
128128

129129
impl PartialOrd for ByteArray {
130130
fn partial_cmp(&self, other: &ByteArray) -> Option<Ordering> {
131-
if self.data.is_some() && other.data.is_some() {
132-
match self.len().cmp(&other.len()) {
133-
Ordering::Greater => Some(Ordering::Greater),
134-
Ordering::Less => Some(Ordering::Less),
135-
Ordering::Equal => {
136-
for (v1, v2) in self.data().iter().zip(other.data().iter()) {
137-
match v1.cmp(v2) {
138-
Ordering::Greater => return Some(Ordering::Greater),
139-
Ordering::Less => return Some(Ordering::Less),
140-
_ => {}
141-
}
142-
}
143-
Some(Ordering::Equal)
144-
}
131+
// sort nulls first (consistent with PartialCmp on Option)
132+
//
133+
// Since ByteBuffer doesn't implement PartialOrd, so can't
134+
// derive an implementation
135+
match (&self.data, &other.data) {
136+
(None, None) => Some(Ordering::Equal),
137+
(None, Some(_)) => Some(Ordering::Less),
138+
(Some(_), None) => Some(Ordering::Greater),
139+
(Some(self_data), Some(other_data)) => {
140+
// compare slices directly
141+
self_data.data().partial_cmp(other_data.data())
145142
}
146-
} else {
147-
None
148143
}
149144
}
150145
}
@@ -1368,7 +1363,7 @@ mod tests {
13681363
let ba4 = ByteArray::from(vec![]);
13691364
let ba5 = ByteArray::from(vec![2, 2, 3]);
13701365

1371-
assert!(ba1 > ba2);
1366+
assert!(ba1 < ba2);
13721367
assert!(ba3 > ba1);
13731368
assert!(ba1 > ba4);
13741369
assert_eq!(ba1, ba11);

0 commit comments

Comments
 (0)