diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt index fb152e20c944..25c43d8273df 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt @@ -2,269 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11834 11929 134 1.3 752.4 1.0X -SQL Json 8574 8597 32 1.8 545.1 1.4X -SQL Parquet Vectorized 116 136 17 135.5 7.4 102.0X -SQL Parquet MR 1703 1715 17 9.2 108.2 7.0X -SQL ORC Vectorized 172 215 48 91.2 11.0 68.6X -SQL ORC MR 1819 1825 8 8.6 115.7 6.5X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 9636 9771 191 1.6 612.6 1.0X +SQL Json 7960 8227 378 2.0 506.1 1.2X +SQL Parquet Vectorized: DataPageV1 113 129 12 139.7 7.2 85.6X +SQL Parquet Vectorized: DataPageV2 84 93 12 186.6 5.4 114.3X +SQL Parquet MR: DataPageV1 1466 1470 6 10.7 93.2 6.6X +SQL Parquet MR: DataPageV2 1334 1347 18 11.8 84.8 7.2X +SQL ORC Vectorized 163 197 27 96.3 10.4 59.0X +SQL ORC MR 1554 1558 6 10.1 98.8 6.2X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 117 126 17 134.9 7.4 1.0X -ParquetReader Vectorized -> Row 47 49 3 336.5 3.0 2.5X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 94 103 13 167.1 6.0 1.0X +ParquetReader Vectorized: DataPageV2 77 86 11 204.3 4.9 1.2X +ParquetReader Vectorized -> Row: DataPageV1 44 47 4 357.0 2.8 2.1X +ParquetReader Vectorized -> Row: DataPageV2 35 37 3 445.2 2.2 2.7X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13434 13590 220 1.2 854.1 1.0X -SQL Json 10056 10073 24 1.6 639.3 1.3X -SQL Parquet Vectorized 212 229 12 74.3 13.5 63.4X -SQL Parquet MR 1883 1916 47 8.4 119.7 7.1X -SQL ORC Vectorized 200 241 30 78.8 12.7 67.3X -SQL ORC MR 1529 1549 28 10.3 97.2 8.8X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 11479 11919 622 1.4 729.8 1.0X +SQL Json 9894 9922 39 1.6 629.1 1.2X +SQL Parquet Vectorized: DataPageV1 123 156 30 128.3 7.8 93.6X +SQL Parquet Vectorized: DataPageV2 126 138 19 125.2 8.0 91.4X +SQL Parquet MR: DataPageV1 1986 2500 726 7.9 126.3 5.8X +SQL Parquet MR: DataPageV2 1810 1898 126 8.7 115.1 6.3X +SQL ORC Vectorized 174 210 30 90.5 11.0 66.1X +SQL ORC MR 1645 1652 9 9.6 104.6 7.0X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 229 254 13 68.6 14.6 1.0X -ParquetReader Vectorized -> Row 162 171 14 96.9 10.3 1.4X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 166 177 14 94.9 10.5 1.0X +ParquetReader Vectorized: DataPageV2 165 172 11 95.3 10.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 95 100 5 165.7 6.0 1.7X +ParquetReader Vectorized -> Row: DataPageV2 85 89 6 186.0 5.4 2.0X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14320 14476 221 1.1 910.4 1.0X -SQL Json 9769 10067 423 1.6 621.1 1.5X -SQL Parquet Vectorized 187 228 28 84.3 11.9 76.8X -SQL Parquet MR 2230 2240 14 7.1 141.8 6.4X -SQL ORC Vectorized 221 265 36 71.1 14.1 64.8X -SQL ORC MR 1763 1779 23 8.9 112.1 8.1X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 12176 12646 664 1.3 774.1 1.0X +SQL Json 9696 9729 46 1.6 616.5 1.3X +SQL Parquet Vectorized: DataPageV1 151 201 33 103.9 9.6 80.4X +SQL Parquet Vectorized: DataPageV2 216 235 15 72.7 13.8 56.3X +SQL Parquet MR: DataPageV1 1915 2017 145 8.2 121.8 6.4X +SQL Parquet MR: DataPageV2 1954 1978 33 8.0 124.3 6.2X +SQL ORC Vectorized 197 235 25 79.7 12.6 61.7X +SQL ORC MR 1769 1829 85 8.9 112.5 6.9X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 246 255 12 64.1 15.6 1.0X -ParquetReader Vectorized -> Row 249 294 21 63.1 15.8 1.0X +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 230 237 12 68.5 14.6 1.0X +ParquetReader Vectorized: DataPageV2 293 298 9 53.6 18.7 0.8X +ParquetReader Vectorized -> Row: DataPageV1 215 265 23 73.2 13.7 1.1X +ParquetReader Vectorized -> Row: DataPageV2 279 301 32 56.3 17.8 0.8X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15460 15543 116 1.0 982.9 1.0X -SQL Json 10199 10393 274 1.5 648.4 1.5X -SQL Parquet Vectorized 163 203 30 96.5 10.4 94.8X -SQL Parquet MR 1914 2025 157 8.2 121.7 8.1X -SQL ORC Vectorized 324 355 23 48.5 20.6 47.7X -SQL ORC MR 1673 1701 39 9.4 106.4 9.2X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 13069 13409 482 1.2 830.9 1.0X +SQL Json 10599 10621 32 1.5 673.9 1.2X +SQL Parquet Vectorized: DataPageV1 142 177 34 110.6 9.0 91.9X +SQL Parquet Vectorized: DataPageV2 313 359 28 50.2 19.9 41.7X +SQL Parquet MR: DataPageV1 1979 2044 92 7.9 125.8 6.6X +SQL Parquet MR: DataPageV2 1958 2030 101 8.0 124.5 6.7X +SQL ORC Vectorized 277 303 21 56.7 17.6 47.1X +SQL ORC MR 1692 1782 128 9.3 107.6 7.7X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 209 223 17 75.2 13.3 1.0X -ParquetReader Vectorized -> Row 303 307 6 51.9 19.3 0.7X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 253 269 18 62.1 16.1 1.0X +ParquetReader Vectorized: DataPageV2 1197 1199 3 13.1 76.1 0.2X +ParquetReader Vectorized -> Row: DataPageV1 273 361 110 57.7 17.3 0.9X +ParquetReader Vectorized -> Row: DataPageV2 379 438 37 41.5 24.1 0.7X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 19075 19147 101 0.8 1212.8 1.0X -SQL Json 12181 12369 265 1.3 774.5 1.6X -SQL Parquet Vectorized 230 268 25 68.5 14.6 83.1X -SQL Parquet MR 2160 2244 120 7.3 137.3 8.8X -SQL ORC Vectorized 396 444 41 39.7 25.2 48.2X -SQL ORC MR 1924 1939 21 8.2 122.3 9.9X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 17143 17467 458 0.9 1089.9 1.0X +SQL Json 11507 12198 977 1.4 731.6 1.5X +SQL Parquet Vectorized: DataPageV1 238 253 19 66.0 15.2 71.9X +SQL Parquet Vectorized: DataPageV2 502 567 48 31.3 31.9 34.1X +SQL Parquet MR: DataPageV1 2333 2335 3 6.7 148.4 7.3X +SQL Parquet MR: DataPageV2 1948 1972 34 8.1 123.8 8.8X +SQL ORC Vectorized 389 408 20 40.5 24.7 44.1X +SQL ORC MR 1726 1817 128 9.1 109.7 9.9X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 273 311 43 57.5 17.4 1.0X -ParquetReader Vectorized -> Row 316 322 8 49.8 20.1 0.9X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 289 340 43 54.4 18.4 1.0X +ParquetReader Vectorized: DataPageV2 572 609 27 27.5 36.4 0.5X +ParquetReader Vectorized -> Row: DataPageV1 329 353 48 47.8 20.9 0.9X +ParquetReader Vectorized -> Row: DataPageV2 639 654 18 24.6 40.6 0.5X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15439 15605 235 1.0 981.6 1.0X -SQL Json 11709 11852 201 1.3 744.5 1.3X -SQL Parquet Vectorized 157 199 33 99.9 10.0 98.0X -SQL Parquet MR 1996 2120 176 7.9 126.9 7.7X -SQL ORC Vectorized 439 466 28 35.8 27.9 35.1X -SQL ORC MR 1965 1991 36 8.0 124.9 7.9X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 13721 13812 129 1.1 872.4 1.0X +SQL Json 12147 17632 2196 1.3 772.3 1.1X +SQL Parquet Vectorized: DataPageV1 138 164 25 113.9 8.8 99.4X +SQL Parquet Vectorized: DataPageV2 151 180 26 104.4 9.6 91.1X +SQL Parquet MR: DataPageV1 2006 2078 101 7.8 127.6 6.8X +SQL Parquet MR: DataPageV2 2038 2040 2 7.7 129.6 6.7X +SQL ORC Vectorized 465 475 10 33.8 29.6 29.5X +SQL ORC MR 1814 1860 64 8.7 115.4 7.6X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 206 212 8 76.4 13.1 1.0X -ParquetReader Vectorized -> Row 220 266 29 71.4 14.0 0.9X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 275 404 187 57.2 17.5 1.0X +ParquetReader Vectorized: DataPageV2 275 287 12 57.2 17.5 1.0X +ParquetReader Vectorized -> Row: DataPageV1 227 265 24 69.2 14.4 1.2X +ParquetReader Vectorized -> Row: DataPageV2 228 259 28 69.1 14.5 1.2X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 20048 20816 1086 0.8 1274.6 1.0X -SQL Json 16265 16314 69 1.0 1034.1 1.2X -SQL Parquet Vectorized 238 296 29 66.1 15.1 84.3X -SQL Parquet MR 2414 2418 7 6.5 153.5 8.3X -SQL ORC Vectorized 555 604 38 28.4 35.3 36.2X -SQL ORC MR 2225 2242 24 7.1 141.5 9.0X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 17269 17620 496 0.9 1097.9 1.0X +SQL Json 15636 15952 447 1.0 994.1 1.1X +SQL Parquet Vectorized: DataPageV1 238 267 18 66.0 15.1 72.5X +SQL Parquet Vectorized: DataPageV2 222 260 21 70.9 14.1 77.9X +SQL Parquet MR: DataPageV1 2418 2457 56 6.5 153.7 7.1X +SQL Parquet MR: DataPageV2 2194 2207 18 7.2 139.5 7.9X +SQL ORC Vectorized 519 528 14 30.3 33.0 33.3X +SQL ORC MR 1760 1770 14 8.9 111.9 9.8X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 317 352 35 49.6 20.2 1.0X -ParquetReader Vectorized -> Row 346 356 9 45.4 22.0 0.9X +Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 284 305 30 55.3 18.1 1.0X +ParquetReader Vectorized: DataPageV2 286 286 1 55.1 18.2 1.0X +ParquetReader Vectorized -> Row: DataPageV1 325 337 16 48.4 20.6 0.9X +ParquetReader Vectorized -> Row: DataPageV2 346 361 16 45.5 22.0 0.8X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13981 14223 342 0.7 1333.4 1.0X -SQL Json 11241 11293 74 0.9 1072.0 1.2X -SQL Parquet Vectorized 2060 2076 23 5.1 196.4 6.8X -SQL Parquet MR 3779 3931 216 2.8 360.4 3.7X -SQL ORC Vectorized 2085 2088 4 5.0 198.8 6.7X -SQL ORC MR 3739 3767 39 2.8 356.6 3.7X +SQL CSV 12428 12714 405 0.8 1185.2 1.0X +SQL Json 11088 11251 231 0.9 1057.4 1.1X +SQL Parquet Vectorized: DataPageV1 1990 1997 10 5.3 189.8 6.2X +SQL Parquet Vectorized: DataPageV2 2551 2618 95 4.1 243.3 4.9X +SQL Parquet MR: DataPageV1 3903 3913 15 2.7 372.2 3.2X +SQL Parquet MR: DataPageV2 3734 3920 263 2.8 356.1 3.3X +SQL ORC Vectorized 2153 2155 3 4.9 205.3 5.8X +SQL ORC MR 3485 3549 91 3.0 332.4 3.6X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8544 8579 50 1.2 814.8 1.0X -SQL Json 6705 6952 348 1.6 639.5 1.3X -SQL Parquet Vectorized 603 615 9 17.4 57.5 14.2X -SQL Parquet MR 1722 1725 4 6.1 164.2 5.0X -SQL ORC Vectorized 515 547 24 20.4 49.1 16.6X -SQL ORC MR 1827 1845 25 5.7 174.2 4.7X +SQL CSV 7116 7167 72 1.5 678.7 1.0X +SQL Json 6700 6741 58 1.6 639.0 1.1X +SQL Parquet Vectorized: DataPageV1 526 556 36 19.9 50.1 13.5X +SQL Parquet Vectorized: DataPageV2 518 533 15 20.2 49.4 13.7X +SQL Parquet MR: DataPageV1 1504 1656 216 7.0 143.4 4.7X +SQL Parquet MR: DataPageV2 1676 1676 1 6.3 159.8 4.2X +SQL ORC Vectorized 497 518 20 21.1 47.4 14.3X +SQL ORC MR 1657 1787 183 6.3 158.1 4.3X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 18854 19521 943 0.8 1198.7 1.0X -Data column - Json 12579 12688 154 1.3 799.8 1.5X -Data column - Parquet Vectorized 246 298 28 63.9 15.7 76.5X -Data column - Parquet MR 2693 2699 9 5.8 171.2 7.0X -Data column - ORC Vectorized 434 463 25 36.2 27.6 43.4X -Data column - ORC MR 2249 2303 77 7.0 143.0 8.4X -Partition column - CSV 6045 6199 217 2.6 384.3 3.1X -Partition column - Json 9463 9679 305 1.7 601.7 2.0X -Partition column - Parquet Vectorized 64 92 36 244.3 4.1 292.9X -Partition column - Parquet MR 1238 1252 20 12.7 78.7 15.2X -Partition column - ORC Vectorized 60 85 25 263.7 3.8 316.1X -Partition column - ORC MR 1440 1458 26 10.9 91.5 13.1X -Both columns - CSV 19647 20381 1038 0.8 1249.1 1.0X -Both columns - Json 12615 12654 55 1.2 802.0 1.5X -Both columns - Parquet Vectorized 337 345 9 46.7 21.4 56.0X -Both columns - Parquet MR 2461 2573 158 6.4 156.5 7.7X -Both columns - ORC Vectorized 432 470 54 36.4 27.5 43.6X -Both columns - ORC MR 2507 2536 40 6.3 159.4 7.5X +Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------- +Data column - CSV 18247 18411 232 0.9 1160.1 1.0X +Data column - Json 10860 11264 571 1.4 690.5 1.7X +Data column - Parquet Vectorized: DataPageV1 223 274 26 70.6 14.2 81.9X +Data column - Parquet Vectorized: DataPageV2 537 559 23 29.3 34.1 34.0X +Data column - Parquet MR: DataPageV1 2411 2517 150 6.5 153.3 7.6X +Data column - Parquet MR: DataPageV2 2299 2356 81 6.8 146.2 7.9X +Data column - ORC Vectorized 417 433 11 37.7 26.5 43.8X +Data column - ORC MR 2107 2178 101 7.5 134.0 8.7X +Partition column - CSV 6090 6186 136 2.6 387.2 3.0X +Partition column - Json 9479 9603 176 1.7 602.7 1.9X +Partition column - Parquet Vectorized: DataPageV1 49 69 28 322.0 3.1 373.6X +Partition column - Parquet Vectorized: DataPageV2 49 63 23 322.1 3.1 373.7X +Partition column - Parquet MR: DataPageV1 1200 1225 36 13.1 76.3 15.2X +Partition column - Parquet MR: DataPageV2 1199 1240 57 13.1 76.3 15.2X +Partition column - ORC Vectorized 53 77 26 295.0 3.4 342.2X +Partition column - ORC MR 1287 1346 83 12.2 81.8 14.2X +Both columns - CSV 17671 18140 663 0.9 1123.5 1.0X +Both columns - Json 11675 12167 696 1.3 742.3 1.6X +Both columns - Parquet Vectorized: DataPageV1 298 303 9 52.9 18.9 61.3X +Both columns - Parquet Vectorized: DataPageV2 541 580 36 29.1 34.4 33.7X +Both columns - Parquet MR: DataPageV1 2448 2491 60 6.4 155.6 7.5X +Both columns - Parquet MR: DataPageV2 2303 2352 69 6.8 146.4 7.9X +Both columns - ORC Vectorized 385 406 25 40.9 24.5 47.4X +Both columns - ORC MR 2118 2202 120 7.4 134.6 8.6X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10199 10226 38 1.0 972.6 1.0X -SQL Json 10744 10925 256 1.0 1024.6 0.9X -SQL Parquet Vectorized 1251 1261 15 8.4 119.3 8.2X -SQL Parquet MR 3306 3315 13 3.2 315.3 3.1X -ParquetReader Vectorized 849 904 48 12.4 80.9 12.0X -SQL ORC Vectorized 1184 1204 28 8.9 112.9 8.6X -SQL ORC MR 2895 2945 71 3.6 276.1 3.5X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 7966 12723 2892 1.3 759.7 1.0X +SQL Json 9897 10008 157 1.1 943.9 0.8X +SQL Parquet Vectorized: DataPageV1 1176 1264 125 8.9 112.1 6.8X +SQL Parquet Vectorized: DataPageV2 2224 2326 144 4.7 212.1 3.6X +SQL Parquet MR: DataPageV1 3431 3483 73 3.1 327.2 2.3X +SQL Parquet MR: DataPageV2 3845 4043 280 2.7 366.7 2.1X +ParquetReader Vectorized: DataPageV1 1055 1056 2 9.9 100.6 7.6X +ParquetReader Vectorized: DataPageV2 2093 2119 37 5.0 199.6 3.8X +SQL ORC Vectorized 1129 1217 125 9.3 107.7 7.1X +SQL ORC MR 2931 2982 72 3.6 279.5 2.7X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7949 8052 145 1.3 758.1 1.0X -SQL Json 7750 7868 167 1.4 739.1 1.0X -SQL Parquet Vectorized 949 976 24 11.0 90.5 8.4X -SQL Parquet MR 2700 2722 31 3.9 257.5 2.9X -ParquetReader Vectorized 916 940 31 11.4 87.3 8.7X -SQL ORC Vectorized 1240 1249 13 8.5 118.2 6.4X -SQL ORC MR 2856 2929 103 3.7 272.4 2.8X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 6338 6508 240 1.7 604.4 1.0X +SQL Json 7149 7247 138 1.5 681.8 0.9X +SQL Parquet Vectorized: DataPageV1 937 984 45 11.2 89.3 6.8X +SQL Parquet Vectorized: DataPageV2 1582 1608 37 6.6 150.9 4.0X +SQL Parquet MR: DataPageV1 2525 2721 277 4.2 240.8 2.5X +SQL Parquet MR: DataPageV2 2969 2974 7 3.5 283.1 2.1X +ParquetReader Vectorized: DataPageV1 933 940 12 11.2 88.9 6.8X +ParquetReader Vectorized: DataPageV2 1535 1549 20 6.8 146.4 4.1X +SQL ORC Vectorized 1144 1204 86 9.2 109.1 5.5X +SQL ORC MR 2816 2822 8 3.7 268.6 2.3X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5416 5542 179 1.9 516.5 1.0X -SQL Json 4760 4980 311 2.2 454.0 1.1X -SQL Parquet Vectorized 222 236 8 47.2 21.2 24.4X -SQL Parquet MR 1669 1685 22 6.3 159.2 3.2X -ParquetReader Vectorized 248 252 3 42.3 23.6 21.9X -SQL ORC Vectorized 409 472 81 25.6 39.0 13.2X -SQL ORC MR 1686 1687 0 6.2 160.8 3.2X +SQL CSV 4443 4504 86 2.4 423.7 1.0X +SQL Json 4528 4563 49 2.3 431.8 1.0X +SQL Parquet Vectorized: DataPageV1 213 233 15 49.2 20.3 20.8X +SQL Parquet Vectorized: DataPageV2 267 294 22 39.3 25.4 16.7X +SQL Parquet MR: DataPageV1 1691 1700 13 6.2 161.2 2.6X +SQL Parquet MR: DataPageV2 1515 1565 70 6.9 144.5 2.9X +ParquetReader Vectorized: DataPageV1 228 231 2 46.0 21.7 19.5X +ParquetReader Vectorized: DataPageV2 285 296 9 36.8 27.1 15.6X +SQL ORC Vectorized 369 425 82 28.4 35.2 12.1X +SQL ORC MR 1457 1463 9 7.2 138.9 3.0X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2244 2282 53 0.5 2140.4 1.0X -SQL Json 3015 3099 119 0.3 2875.6 0.7X -SQL Parquet Vectorized 50 77 29 20.9 47.9 44.7X -SQL Parquet MR 190 209 27 5.5 180.8 11.8X -SQL ORC Vectorized 57 76 20 18.5 54.0 39.6X -SQL ORC MR 158 195 40 6.6 151.0 14.2X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 2374 2377 5 0.4 2264.2 1.0X +SQL Json 2693 2726 46 0.4 2568.5 0.9X +SQL Parquet Vectorized: DataPageV1 44 62 16 23.8 42.0 54.0X +SQL Parquet Vectorized: DataPageV2 63 81 21 16.5 60.5 37.5X +SQL Parquet MR: DataPageV1 173 198 27 6.1 164.6 13.8X +SQL Parquet MR: DataPageV2 161 193 30 6.5 153.5 14.8X +SQL ORC Vectorized 53 71 18 19.9 50.2 45.1X +SQL ORC MR 149 182 34 7.0 142.3 15.9X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5114 5296 257 0.2 4876.7 1.0X -SQL Json 11564 11828 373 0.1 11028.4 0.4X -SQL Parquet Vectorized 60 93 26 17.3 57.6 84.6X -SQL Parquet MR 198 232 31 5.3 188.9 25.8X -SQL ORC Vectorized 69 103 35 15.2 65.9 74.0X -SQL ORC MR 175 212 36 6.0 166.9 29.2X - -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +SQL CSV 5149 5193 62 0.2 4910.9 1.0X +SQL Json 10556 10891 475 0.1 10066.5 0.5X +SQL Parquet Vectorized: DataPageV1 64 96 28 16.3 61.3 80.1X +SQL Parquet Vectorized: DataPageV2 83 106 22 12.6 79.1 62.0X +SQL Parquet MR: DataPageV1 196 232 25 5.3 187.4 26.2X +SQL Parquet MR: DataPageV2 184 221 28 5.7 175.1 28.0X +SQL ORC Vectorized 74 98 31 14.1 70.8 69.3X +SQL ORC MR 182 214 38 5.8 173.9 28.2X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1025-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9072 9324 357 0.1 8651.4 1.0X -SQL Json 23444 23735 411 0.0 22358.1 0.4X -SQL Parquet Vectorized 91 129 28 11.5 86.7 99.8X -SQL Parquet MR 220 270 56 4.8 209.6 41.3X -SQL ORC Vectorized 96 110 20 10.9 91.8 94.2X -SQL ORC MR 216 240 33 4.8 206.2 41.9X +SQL CSV 9077 9107 43 0.1 8656.2 1.0X +SQL Json 20131 20886 1067 0.1 19198.5 0.5X +SQL Parquet Vectorized: DataPageV1 93 124 26 11.3 88.8 97.5X +SQL Parquet Vectorized: DataPageV2 103 128 29 10.2 98.5 87.9X +SQL Parquet MR: DataPageV1 218 257 35 4.8 207.6 41.7X +SQL Parquet MR: DataPageV2 213 255 29 4.9 202.7 42.7X +SQL ORC Vectorized 80 95 20 13.0 76.6 112.9X +SQL ORC MR 187 207 20 5.6 178.0 48.6X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt index 85d506ec3454..ecba57c0c3cc 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk17-results.txt @@ -2,269 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11737 11812 106 1.3 746.2 1.0X -SQL Json 7827 7904 109 2.0 497.6 1.5X -SQL Parquet Vectorized 98 116 12 160.6 6.2 119.8X -SQL Parquet MR 1529 1541 18 10.3 97.2 7.7X -SQL ORC Vectorized 165 185 14 95.5 10.5 71.2X -SQL ORC MR 1433 1440 9 11.0 91.1 8.2X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 38 40 3 416.2 2.4 1.0X -ParquetReader Vectorized -> Row 38 39 3 419.1 2.4 1.0X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 15972 16369 561 1.0 1015.5 1.0X +SQL Json 9543 9580 54 1.6 606.7 1.7X +SQL Parquet Vectorized: DataPageV1 115 144 19 136.3 7.3 138.4X +SQL Parquet Vectorized: DataPageV2 95 109 15 165.1 6.1 167.6X +SQL Parquet MR: DataPageV1 2098 2119 30 7.5 133.4 7.6X +SQL Parquet MR: DataPageV2 2007 2012 6 7.8 127.6 8.0X +SQL ORC Vectorized 211 225 16 74.5 13.4 75.7X +SQL ORC MR 2077 2103 36 7.6 132.1 7.7X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 43 47 2 369.4 2.7 1.0X +ParquetReader Vectorized: DataPageV2 30 34 2 518.5 1.9 1.4X +ParquetReader Vectorized -> Row: DataPageV1 47 50 2 333.6 3.0 0.9X +ParquetReader Vectorized -> Row: DataPageV2 31 35 2 504.8 2.0 1.4X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13156 13192 51 1.2 836.4 1.0X -SQL Json 8690 8784 133 1.8 552.5 1.5X -SQL Parquet Vectorized 196 207 8 80.4 12.4 67.2X -SQL Parquet MR 1831 1834 4 8.6 116.4 7.2X -SQL ORC Vectorized 157 167 7 100.2 10.0 83.8X -SQL ORC MR 1381 1387 8 11.4 87.8 9.5X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 147 153 6 107.0 9.3 1.0X -ParquetReader Vectorized -> Row 149 162 24 105.7 9.5 1.0X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 17468 17543 105 0.9 1110.6 1.0X +SQL Json 11059 11065 8 1.4 703.1 1.6X +SQL Parquet Vectorized: DataPageV1 128 142 15 123.1 8.1 136.7X +SQL Parquet Vectorized: DataPageV2 126 141 8 125.2 8.0 139.1X +SQL Parquet MR: DataPageV1 2305 2331 36 6.8 146.5 7.6X +SQL Parquet MR: DataPageV2 2075 2095 28 7.6 131.9 8.4X +SQL ORC Vectorized 172 191 16 91.5 10.9 101.6X +SQL ORC MR 1777 1796 26 8.8 113.0 9.8X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 72 77 5 219.4 4.6 1.0X +ParquetReader Vectorized: DataPageV2 72 77 3 217.9 4.6 1.0X +ParquetReader Vectorized -> Row: DataPageV1 76 83 6 206.6 4.8 0.9X +ParquetReader Vectorized -> Row: DataPageV2 75 80 3 210.3 4.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14024 14291 378 1.1 891.6 1.0X -SQL Json 9777 9849 102 1.6 621.6 1.4X -SQL Parquet Vectorized 153 175 18 102.9 9.7 91.8X -SQL Parquet MR 1971 1979 11 8.0 125.3 7.1X -SQL ORC Vectorized 193 211 15 81.4 12.3 72.5X -SQL ORC MR 1665 1693 39 9.4 105.9 8.4X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 18330 18332 3 0.9 1165.4 1.0X +SQL Json 11383 11429 66 1.4 723.7 1.6X +SQL Parquet Vectorized: DataPageV1 179 197 13 88.0 11.4 102.5X +SQL Parquet Vectorized: DataPageV2 239 263 18 65.7 15.2 76.6X +SQL Parquet MR: DataPageV1 2552 2567 21 6.2 162.3 7.2X +SQL Parquet MR: DataPageV2 2389 2436 67 6.6 151.9 7.7X +SQL ORC Vectorized 246 263 14 64.0 15.6 74.6X +SQL ORC MR 1965 2002 52 8.0 124.9 9.3X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 217 227 7 72.6 13.8 1.0X -ParquetReader Vectorized -> Row 214 216 2 73.5 13.6 1.0X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 253 263 11 62.2 16.1 1.0X +ParquetReader Vectorized: DataPageV2 306 317 7 51.4 19.4 0.8X +ParquetReader Vectorized -> Row: DataPageV1 246 250 4 64.0 15.6 1.0X +ParquetReader Vectorized -> Row: DataPageV2 316 321 4 49.8 20.1 0.8X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15107 15205 139 1.0 960.5 1.0X -SQL Json 9699 9773 104 1.6 616.7 1.6X -SQL Parquet Vectorized 144 160 24 109.6 9.1 105.2X -SQL Parquet MR 1903 1906 4 8.3 121.0 7.9X -SQL ORC Vectorized 227 234 6 69.4 14.4 66.6X -SQL ORC MR 1566 1578 17 10.0 99.5 9.6X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 209 214 4 75.2 13.3 1.0X -ParquetReader Vectorized -> Row 192 194 2 81.9 12.2 1.1X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 19573 19822 352 0.8 1244.4 1.0X +SQL Json 12141 12217 107 1.3 771.9 1.6X +SQL Parquet Vectorized: DataPageV1 192 222 28 81.8 12.2 101.8X +SQL Parquet Vectorized: DataPageV2 345 373 24 45.6 21.9 56.7X +SQL Parquet MR: DataPageV1 2736 2741 7 5.7 173.9 7.2X +SQL Parquet MR: DataPageV2 2467 2536 97 6.4 156.9 7.9X +SQL ORC Vectorized 332 356 20 47.4 21.1 59.0X +SQL ORC MR 2188 2193 7 7.2 139.1 8.9X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 291 295 4 54.1 18.5 1.0X +ParquetReader Vectorized: DataPageV2 493 518 39 31.9 31.3 0.6X +ParquetReader Vectorized -> Row: DataPageV1 300 306 8 52.5 19.1 1.0X +ParquetReader Vectorized -> Row: DataPageV2 471 483 11 33.4 30.0 0.6X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 19711 19743 44 0.8 1253.2 1.0X -SQL Json 11459 11500 59 1.4 728.5 1.7X -SQL Parquet Vectorized 202 210 7 77.9 12.8 97.6X -SQL Parquet MR 2093 2120 37 7.5 133.1 9.4X -SQL ORC Vectorized 356 384 22 44.2 22.6 55.4X -SQL ORC MR 1832 1844 17 8.6 116.4 10.8X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 290 290 0 54.3 18.4 1.0X -ParquetReader Vectorized -> Row 308 314 8 51.1 19.6 0.9X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 24692 24718 37 0.6 1569.9 1.0X +SQL Json 14839 14875 50 1.1 943.5 1.7X +SQL Parquet Vectorized: DataPageV1 295 316 29 53.3 18.7 83.7X +SQL Parquet Vectorized: DataPageV2 477 505 24 32.9 30.4 51.7X +SQL Parquet MR: DataPageV1 2841 2981 197 5.5 180.6 8.7X +SQL Parquet MR: DataPageV2 2616 2632 23 6.0 166.3 9.4X +SQL ORC Vectorized 388 403 11 40.5 24.7 63.6X +SQL ORC MR 2274 2372 138 6.9 144.6 10.9X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 376 387 9 41.9 23.9 1.0X +ParquetReader Vectorized: DataPageV2 585 591 6 26.9 37.2 0.6X +ParquetReader Vectorized -> Row: DataPageV1 377 387 9 41.8 23.9 1.0X +ParquetReader Vectorized -> Row: DataPageV2 576 586 10 27.3 36.6 0.7X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 16396 16602 292 1.0 1042.4 1.0X -SQL Json 11284 11591 433 1.4 717.4 1.5X -SQL Parquet Vectorized 137 168 14 114.7 8.7 119.6X -SQL Parquet MR 1901 1907 8 8.3 120.9 8.6X -SQL ORC Vectorized 429 447 12 36.6 27.3 38.2X -SQL ORC MR 1769 1841 102 8.9 112.4 9.3X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 234 253 10 67.2 14.9 1.0X -ParquetReader Vectorized -> Row 214 238 15 73.5 13.6 1.1X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 20566 20651 119 0.8 1307.6 1.0X +SQL Json 14337 14409 101 1.1 911.5 1.4X +SQL Parquet Vectorized: DataPageV1 154 167 8 101.9 9.8 133.2X +SQL Parquet Vectorized: DataPageV2 157 178 14 99.9 10.0 130.6X +SQL Parquet MR: DataPageV1 2730 2730 1 5.8 173.5 7.5X +SQL Parquet MR: DataPageV2 2459 2491 45 6.4 156.3 8.4X +SQL ORC Vectorized 479 501 15 32.9 30.4 43.0X +SQL ORC MR 2293 2343 71 6.9 145.8 9.0X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 272 283 9 57.9 17.3 1.0X +ParquetReader Vectorized: DataPageV2 250 288 27 62.9 15.9 1.1X +ParquetReader Vectorized -> Row: DataPageV1 291 301 6 54.1 18.5 0.9X +ParquetReader Vectorized -> Row: DataPageV2 293 305 14 53.6 18.6 0.9X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 20303 20621 449 0.8 1290.9 1.0X -SQL Json 14630 14734 147 1.1 930.1 1.4X -SQL Parquet Vectorized 212 246 23 74.0 13.5 95.6X -SQL Parquet MR 2073 2212 198 7.6 131.8 9.8X -SQL ORC Vectorized 445 455 9 35.4 28.3 45.6X -SQL ORC MR 1835 1902 95 8.6 116.7 11.1X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 279 297 12 56.3 17.8 1.0X -ParquetReader Vectorized -> Row 280 292 12 56.1 17.8 1.0X +SQL CSV 25753 25874 171 0.6 1637.3 1.0X +SQL Json 19097 19391 416 0.8 1214.2 1.3X +SQL Parquet Vectorized: DataPageV1 273 288 11 57.6 17.4 94.3X +SQL Parquet Vectorized: DataPageV2 240 277 25 65.5 15.3 107.3X +SQL Parquet MR: DataPageV1 2969 3042 103 5.3 188.8 8.7X +SQL Parquet MR: DataPageV2 2692 2747 78 5.8 171.1 9.6X +SQL ORC Vectorized 601 626 20 26.2 38.2 42.8X +SQL ORC MR 2458 2467 13 6.4 156.3 10.5X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 354 363 7 44.4 22.5 1.0X +ParquetReader Vectorized: DataPageV2 345 359 12 45.5 22.0 1.0X +ParquetReader Vectorized -> Row: DataPageV1 337 345 8 46.7 21.4 1.1X +ParquetReader Vectorized -> Row: DataPageV2 335 364 21 46.9 21.3 1.1X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14027 14143 164 0.7 1337.7 1.0X -SQL Json 10476 10606 183 1.0 999.1 1.3X -SQL Parquet Vectorized 1969 2040 100 5.3 187.8 7.1X -SQL Parquet MR 3743 3834 128 2.8 357.0 3.7X -SQL ORC Vectorized 1926 1936 14 5.4 183.6 7.3X -SQL ORC MR 3383 3403 28 3.1 322.6 4.1X +SQL CSV 18074 18101 37 0.6 1723.7 1.0X +SQL Json 13211 13214 5 0.8 1259.9 1.4X +SQL Parquet Vectorized: DataPageV1 2249 2286 53 4.7 214.5 8.0X +SQL Parquet Vectorized: DataPageV2 2804 2818 20 3.7 267.4 6.4X +SQL Parquet MR: DataPageV1 4708 4779 100 2.2 449.0 3.8X +SQL Parquet MR: DataPageV2 4868 5046 251 2.2 464.3 3.7X +SQL ORC Vectorized 2145 2160 20 4.9 204.6 8.4X +SQL ORC MR 4180 4308 182 2.5 398.6 4.3X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8672 8905 330 1.2 827.0 1.0X -SQL Json 6369 6374 7 1.6 607.4 1.4X -SQL Parquet Vectorized 556 579 25 18.9 53.0 15.6X -SQL Parquet MR 1574 1585 14 6.7 150.2 5.5X -SQL ORC Vectorized 420 427 4 25.0 40.1 20.6X -SQL ORC MR 1711 1733 31 6.1 163.2 5.1X +SQL CSV 11320 11376 78 0.9 1079.6 1.0X +SQL Json 7593 7664 101 1.4 724.1 1.5X +SQL Parquet Vectorized: DataPageV1 633 639 9 16.6 60.3 17.9X +SQL Parquet Vectorized: DataPageV2 621 644 20 16.9 59.2 18.2X +SQL Parquet MR: DataPageV1 2111 2157 65 5.0 201.3 5.4X +SQL Parquet MR: DataPageV2 2018 2064 65 5.2 192.4 5.6X +SQL ORC Vectorized 505 540 36 20.8 48.2 22.4X +SQL ORC MR 2302 2360 82 4.6 219.5 4.9X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz -Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 21008 21367 508 0.7 1335.7 1.0X -Data column - Json 12091 12412 455 1.3 768.7 1.7X -Data column - Parquet Vectorized 210 217 6 75.0 13.3 100.1X -Data column - Parquet MR 2434 2450 22 6.5 154.8 8.6X -Data column - ORC Vectorized 323 347 26 48.7 20.5 65.1X -Data column - ORC MR 2223 2231 11 7.1 141.3 9.5X -Partition column - CSV 5889 5992 146 2.7 374.4 3.6X -Partition column - Json 9706 9870 233 1.6 617.1 2.2X -Partition column - Parquet Vectorized 51 58 8 306.3 3.3 409.2X -Partition column - Parquet MR 1237 1241 5 12.7 78.7 17.0X -Partition column - ORC Vectorized 53 61 8 294.1 3.4 392.9X -Partition column - ORC MR 1322 1336 20 11.9 84.1 15.9X -Both columns - CSV 20362 20389 39 0.8 1294.6 1.0X -Both columns - Json 12267 12512 346 1.3 779.9 1.7X -Both columns - Parquet Vectorized 254 262 9 61.9 16.2 82.6X -Both columns - Parquet MR 2649 2745 136 5.9 168.4 7.9X -Both columns - ORC Vectorized 348 379 32 45.2 22.1 60.4X -Both columns - ORC MR 2339 2343 6 6.7 148.7 9.0X +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------- +Data column - CSV 24867 25261 556 0.6 1581.0 1.0X +Data column - Json 13937 13987 70 1.1 886.1 1.8X +Data column - Parquet Vectorized: DataPageV1 252 264 8 62.3 16.0 98.5X +Data column - Parquet Vectorized: DataPageV2 547 560 13 28.8 34.7 45.5X +Data column - Parquet MR: DataPageV1 3492 3509 25 4.5 222.0 7.1X +Data column - Parquet MR: DataPageV2 3148 3208 84 5.0 200.2 7.9X +Data column - ORC Vectorized 493 512 21 31.9 31.3 50.5X +Data column - ORC MR 2925 2943 26 5.4 185.9 8.5X +Partition column - CSV 7847 7851 5 2.0 498.9 3.2X +Partition column - Json 11759 11908 210 1.3 747.6 2.1X +Partition column - Parquet Vectorized: DataPageV1 60 67 7 262.3 3.8 414.7X +Partition column - Parquet Vectorized: DataPageV2 57 65 9 274.2 3.6 433.5X +Partition column - Parquet MR: DataPageV1 1762 1768 8 8.9 112.1 14.1X +Partition column - Parquet MR: DataPageV2 1742 1783 59 9.0 110.7 14.3X +Partition column - ORC Vectorized 59 71 7 265.6 3.8 419.9X +Partition column - ORC MR 1743 1764 29 9.0 110.8 14.3X +Both columns - CSV 25859 25924 92 0.6 1644.1 1.0X +Both columns - Json 14693 14764 101 1.1 934.2 1.7X +Both columns - Parquet Vectorized: DataPageV1 341 395 66 46.2 21.7 73.0X +Both columns - Parquet Vectorized: DataPageV2 624 643 13 25.2 39.7 39.9X +Both columns - Parquet MR: DataPageV1 3541 3611 99 4.4 225.2 7.0X +Both columns - Parquet MR: DataPageV2 3279 3301 32 4.8 208.4 7.6X +Both columns - ORC Vectorized 434 483 40 36.2 27.6 57.3X +Both columns - ORC MR 2946 2964 26 5.3 187.3 8.4X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9872 9917 64 1.1 941.4 1.0X -SQL Json 8698 8793 134 1.2 829.5 1.1X -SQL Parquet Vectorized 1277 1281 6 8.2 121.8 7.7X -SQL Parquet MR 3649 3679 42 2.9 348.0 2.7X -ParquetReader Vectorized 969 1015 66 10.8 92.4 10.2X -SQL ORC Vectorized 1022 1038 23 10.3 97.4 9.7X -SQL ORC MR 3103 3122 27 3.4 295.9 3.2X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 13698 13783 121 0.8 1306.3 1.0X +SQL Json 11030 11144 161 1.0 1051.9 1.2X +SQL Parquet Vectorized: DataPageV1 1695 1699 7 6.2 161.6 8.1X +SQL Parquet Vectorized: DataPageV2 2740 2744 5 3.8 261.3 5.0X +SQL Parquet MR: DataPageV1 4547 4594 66 2.3 433.7 3.0X +SQL Parquet MR: DataPageV2 5382 5455 103 1.9 513.3 2.5X +ParquetReader Vectorized: DataPageV1 1238 1238 0 8.5 118.0 11.1X +ParquetReader Vectorized: DataPageV2 2312 2325 19 4.5 220.5 5.9X +SQL ORC Vectorized 1134 1147 18 9.2 108.2 12.1X +SQL ORC MR 3966 4015 69 2.6 378.2 3.5X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7321 7550 324 1.4 698.2 1.0X -SQL Json 6939 6962 32 1.5 661.8 1.1X -SQL Parquet Vectorized 906 917 17 11.6 86.4 8.1X -SQL Parquet MR 2617 2655 54 4.0 249.6 2.8X -ParquetReader Vectorized 832 837 5 12.6 79.4 8.8X -SQL ORC Vectorized 1101 1109 11 9.5 105.0 6.6X -SQL ORC MR 2777 2778 2 3.8 264.8 2.6X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 10613 10658 64 1.0 1012.1 1.0X +SQL Json 8973 8996 33 1.2 855.7 1.2X +SQL Parquet Vectorized: DataPageV1 1208 1221 18 8.7 115.2 8.8X +SQL Parquet Vectorized: DataPageV2 1949 1950 1 5.4 185.9 5.4X +SQL Parquet MR: DataPageV1 3701 3716 21 2.8 353.0 2.9X +SQL Parquet MR: DataPageV2 4150 4192 60 2.5 395.8 2.6X +ParquetReader Vectorized: DataPageV1 1191 1192 1 8.8 113.6 8.9X +ParquetReader Vectorized: DataPageV2 1874 1917 61 5.6 178.7 5.7X +SQL ORC Vectorized 1338 1365 38 7.8 127.6 7.9X +SQL ORC MR 3659 3674 21 2.9 349.0 2.9X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5670 5691 30 1.8 540.7 1.0X -SQL Json 4309 4327 27 2.4 410.9 1.3X -SQL Parquet Vectorized 212 217 5 49.5 20.2 26.8X -SQL Parquet MR 1634 1672 53 6.4 155.9 3.5X -ParquetReader Vectorized 212 214 3 49.5 20.2 26.8X -SQL ORC Vectorized 356 359 4 29.5 33.9 15.9X -SQL ORC MR 1519 1561 59 6.9 144.9 3.7X +SQL CSV 8714 8809 134 1.2 831.0 1.0X +SQL Json 5801 5819 25 1.8 553.2 1.5X +SQL Parquet Vectorized: DataPageV1 297 316 11 35.3 28.3 29.3X +SQL Parquet Vectorized: DataPageV2 363 382 12 28.9 34.6 24.0X +SQL Parquet MR: DataPageV1 2350 2366 22 4.5 224.1 3.7X +SQL Parquet MR: DataPageV2 2132 2183 73 4.9 203.3 4.1X +ParquetReader Vectorized: DataPageV1 296 310 13 35.4 28.2 29.4X +ParquetReader Vectorized: DataPageV2 368 372 3 28.5 35.1 23.7X +SQL ORC Vectorized 474 487 10 22.1 45.2 18.4X +SQL ORC MR 2025 2031 9 5.2 193.1 4.3X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2172 2213 58 0.5 2071.4 1.0X -SQL Json 2916 2934 26 0.4 2780.7 0.7X -SQL Parquet Vectorized 43 48 6 24.5 40.7 50.8X -SQL Parquet MR 175 182 9 6.0 167.1 12.4X -SQL ORC Vectorized 51 56 6 20.5 48.9 42.4X -SQL ORC MR 152 157 5 6.9 144.9 14.3X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 2677 2687 14 0.4 2553.2 1.0X +SQL Json 3581 3588 10 0.3 3414.8 0.7X +SQL Parquet Vectorized: DataPageV1 52 59 7 20.2 49.6 51.5X +SQL Parquet Vectorized: DataPageV2 68 75 7 15.4 65.0 39.3X +SQL Parquet MR: DataPageV1 245 257 9 4.3 233.6 10.9X +SQL Parquet MR: DataPageV2 224 237 8 4.7 213.7 11.9X +SQL ORC Vectorized 64 70 5 16.3 61.3 41.7X +SQL ORC MR 208 216 8 5.0 198.2 12.9X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 4658 4737 112 0.2 4442.6 1.0X -SQL Json 12114 12242 181 0.1 11552.8 0.4X -SQL Parquet Vectorized 59 66 9 17.8 56.3 78.9X -SQL Parquet MR 196 206 10 5.3 187.3 23.7X -SQL ORC Vectorized 68 77 6 15.3 65.2 68.1X -SQL ORC MR 171 183 9 6.1 163.4 27.2X - -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL CSV 5753 5771 25 0.2 5486.7 1.0X +SQL Json 13801 13851 71 0.1 13161.9 0.4X +SQL Parquet Vectorized: DataPageV1 75 83 9 14.1 71.1 77.2X +SQL Parquet Vectorized: DataPageV2 84 93 7 12.4 80.6 68.1X +SQL Parquet MR: DataPageV1 269 280 7 3.9 256.5 21.4X +SQL Parquet MR: DataPageV2 251 258 8 4.2 238.9 23.0X +SQL ORC Vectorized 82 88 6 12.8 78.3 70.1X +SQL ORC MR 223 239 8 4.7 213.0 25.8X + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8008 8070 88 0.1 7636.6 1.0X -SQL Json 22795 23224 607 0.0 21739.5 0.4X -SQL Parquet Vectorized 81 88 7 13.0 77.2 99.0X -SQL Parquet MR 225 244 16 4.7 214.9 35.5X -SQL ORC Vectorized 77 82 5 13.6 73.3 104.2X -SQL ORC MR 185 190 6 5.7 176.2 43.3X +SQL CSV 9487 9503 24 0.1 9047.1 1.0X +SQL Json 26109 26240 186 0.0 24899.2 0.4X +SQL Parquet Vectorized: DataPageV1 100 110 10 10.4 95.8 94.5X +SQL Parquet Vectorized: DataPageV2 113 119 6 9.3 107.3 84.3X +SQL Parquet MR: DataPageV1 280 296 11 3.7 267.2 33.9X +SQL Parquet MR: DataPageV2 281 321 68 3.7 268.0 33.8X +SQL ORC Vectorized 92 101 8 11.4 87.5 103.4X +SQL ORC MR 228 245 10 4.6 217.7 41.6X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt index 1dd99011ba27..6a2b6bfb4a0a 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt @@ -2,269 +2,322 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13046 13274 322 1.2 829.5 1.0X -SQL Json 10585 10610 37 1.5 672.9 1.2X -SQL Parquet Vectorized 147 168 27 106.7 9.4 88.5X -SQL Parquet MR 1891 1897 7 8.3 120.3 6.9X -SQL ORC Vectorized 200 213 15 78.8 12.7 65.4X -SQL ORC MR 1939 1944 7 8.1 123.3 6.7X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 164 165 3 96.2 10.4 1.0X -ParquetReader Vectorized -> Row 71 72 2 220.6 4.5 2.3X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 11570 12144 812 1.4 735.6 1.0X +SQL Json 7542 7568 37 2.1 479.5 1.5X +SQL Parquet Vectorized: DataPageV1 129 144 16 121.9 8.2 89.7X +SQL Parquet Vectorized: DataPageV2 92 106 20 170.3 5.9 125.2X +SQL Parquet MR: DataPageV1 1416 1419 3 11.1 90.0 8.2X +SQL Parquet MR: DataPageV2 1281 1359 110 12.3 81.4 9.0X +SQL ORC Vectorized 161 176 10 97.4 10.3 71.6X +SQL ORC MR 1525 1545 29 10.3 96.9 7.6X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 111 118 6 142.3 7.0 1.0X +ParquetReader Vectorized: DataPageV2 116 117 2 135.7 7.4 1.0X +ParquetReader Vectorized -> Row: DataPageV1 48 49 1 324.9 3.1 2.3X +ParquetReader Vectorized -> Row: DataPageV2 39 39 1 405.8 2.5 2.9X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 16466 16494 40 1.0 1046.9 1.0X -SQL Json 12509 12528 28 1.3 795.3 1.3X -SQL Parquet Vectorized 170 179 11 92.7 10.8 97.1X -SQL Parquet MR 2154 2167 19 7.3 136.9 7.6X -SQL ORC Vectorized 203 213 9 77.4 12.9 81.1X -SQL ORC MR 1977 1980 4 8.0 125.7 8.3X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 216 218 3 72.8 13.7 1.0X -ParquetReader Vectorized -> Row 123 124 2 127.6 7.8 1.8X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 13807 14535 1030 1.1 877.8 1.0X +SQL Json 8079 8094 21 1.9 513.6 1.7X +SQL Parquet Vectorized: DataPageV1 139 152 12 113.0 8.9 99.2X +SQL Parquet Vectorized: DataPageV2 140 147 5 112.5 8.9 98.7X +SQL Parquet MR: DataPageV1 1637 1741 148 9.6 104.1 8.4X +SQL Parquet MR: DataPageV2 1522 1636 161 10.3 96.8 9.1X +SQL ORC Vectorized 147 160 10 106.9 9.4 93.8X +SQL ORC MR 1542 1545 4 10.2 98.1 9.0X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 166 171 8 94.7 10.6 1.0X +ParquetReader Vectorized: DataPageV2 166 169 4 94.7 10.6 1.0X +ParquetReader Vectorized -> Row: DataPageV1 156 157 2 100.7 9.9 1.1X +ParquetReader Vectorized -> Row: DataPageV2 156 157 2 100.7 9.9 1.1X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17321 17358 53 0.9 1101.2 1.0X -SQL Json 12964 13001 52 1.2 824.2 1.3X -SQL Parquet Vectorized 243 251 7 64.8 15.4 71.3X -SQL Parquet MR 2491 2499 12 6.3 158.4 7.0X -SQL ORC Vectorized 214 217 3 73.4 13.6 80.9X -SQL ORC MR 1960 1963 3 8.0 124.6 8.8X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 15327 15421 133 1.0 974.5 1.0X +SQL Json 8564 8799 332 1.8 544.5 1.8X +SQL Parquet Vectorized: DataPageV1 202 219 11 77.8 12.8 75.8X +SQL Parquet Vectorized: DataPageV2 203 210 8 77.7 12.9 75.7X +SQL Parquet MR: DataPageV1 1874 2004 183 8.4 119.2 8.2X +SQL Parquet MR: DataPageV2 1606 1709 146 9.8 102.1 9.5X +SQL ORC Vectorized 167 179 10 94.1 10.6 91.7X +SQL ORC MR 1404 1408 6 11.2 89.3 10.9X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 361 365 6 43.6 22.9 1.0X -ParquetReader Vectorized -> Row 323 329 10 48.7 20.5 1.1X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 222 236 13 70.7 14.1 1.0X +ParquetReader Vectorized: DataPageV2 259 268 14 60.8 16.5 0.9X +ParquetReader Vectorized -> Row: DataPageV1 228 248 11 68.9 14.5 1.0X +ParquetReader Vectorized -> Row: DataPageV2 264 293 13 59.5 16.8 0.8X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 19098 19123 36 0.8 1214.2 1.0X -SQL Json 13719 13736 23 1.1 872.3 1.4X -SQL Parquet Vectorized 188 192 5 83.5 12.0 101.4X -SQL Parquet MR 2515 2536 30 6.3 159.9 7.6X -SQL ORC Vectorized 287 295 5 54.8 18.3 66.5X -SQL ORC MR 2034 2036 2 7.7 129.3 9.4X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 309 311 3 50.9 19.7 1.0X -ParquetReader Vectorized -> Row 270 272 5 58.4 17.1 1.1X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 17479 17651 243 0.9 1111.3 1.0X +SQL Json 9565 9582 25 1.6 608.1 1.8X +SQL Parquet Vectorized: DataPageV1 152 159 8 103.2 9.7 114.7X +SQL Parquet Vectorized: DataPageV2 290 308 18 54.2 18.4 60.3X +SQL Parquet MR: DataPageV1 1861 1980 169 8.5 118.3 9.4X +SQL Parquet MR: DataPageV2 1647 1748 142 9.5 104.7 10.6X +SQL ORC Vectorized 230 251 12 68.3 14.6 75.9X +SQL ORC MR 1645 1648 3 9.6 104.6 10.6X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 208 213 9 75.7 13.2 1.0X +ParquetReader Vectorized: DataPageV2 355 382 14 44.3 22.6 0.6X +ParquetReader Vectorized -> Row: DataPageV1 212 233 8 74.1 13.5 1.0X +ParquetReader Vectorized -> Row: DataPageV2 350 353 7 45.0 22.2 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 25565 25574 13 0.6 1625.4 1.0X -SQL Json 17510 17518 11 0.9 1113.3 1.5X -SQL Parquet Vectorized 259 266 9 60.7 16.5 98.6X -SQL Parquet MR 2628 2647 28 6.0 167.1 9.7X -SQL ORC Vectorized 357 365 6 44.1 22.7 71.6X -SQL ORC MR 2144 2151 10 7.3 136.3 11.9X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 385 390 8 40.8 24.5 1.0X -ParquetReader Vectorized -> Row 345 350 6 45.6 21.9 1.1X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 21825 21944 169 0.7 1387.6 1.0X +SQL Json 11877 11927 71 1.3 755.1 1.8X +SQL Parquet Vectorized: DataPageV1 229 242 18 68.8 14.5 95.5X +SQL Parquet Vectorized: DataPageV2 435 452 23 36.1 27.7 50.1X +SQL Parquet MR: DataPageV1 2050 2184 190 7.7 130.3 10.6X +SQL Parquet MR: DataPageV2 1829 1927 138 8.6 116.3 11.9X +SQL ORC Vectorized 287 308 14 54.8 18.3 76.0X +SQL ORC MR 1579 1603 34 10.0 100.4 13.8X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 299 341 86 52.6 19.0 1.0X +ParquetReader Vectorized: DataPageV2 551 607 110 28.5 35.1 0.5X +ParquetReader Vectorized -> Row: DataPageV1 341 344 4 46.2 21.7 0.9X +ParquetReader Vectorized -> Row: DataPageV2 508 557 33 31.0 32.3 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 19931 19941 13 0.8 1267.2 1.0X -SQL Json 17274 17302 40 0.9 1098.2 1.2X -SQL Parquet Vectorized 175 182 10 90.0 11.1 114.1X -SQL Parquet MR 2496 2502 9 6.3 158.7 8.0X -SQL ORC Vectorized 432 436 4 36.4 27.5 46.1X -SQL ORC MR 2184 2187 5 7.2 138.8 9.1X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 287 289 5 54.9 18.2 1.0X -ParquetReader Vectorized -> Row 281 283 3 55.9 17.9 1.0X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 17585 17926 482 0.9 1118.0 1.0X +SQL Json 11927 12180 357 1.3 758.3 1.5X +SQL Parquet Vectorized: DataPageV1 150 161 11 104.6 9.6 116.9X +SQL Parquet Vectorized: DataPageV2 150 160 8 104.7 9.5 117.1X +SQL Parquet MR: DataPageV1 1830 1867 52 8.6 116.4 9.6X +SQL Parquet MR: DataPageV2 1715 1828 160 9.2 109.1 10.3X +SQL ORC Vectorized 328 358 15 48.0 20.8 53.6X +SQL ORC MR 1584 1687 145 9.9 100.7 11.1X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 207 211 8 76.0 13.2 1.0X +ParquetReader Vectorized: DataPageV2 207 220 11 75.8 13.2 1.0X +ParquetReader Vectorized -> Row: DataPageV1 208 214 9 75.7 13.2 1.0X +ParquetReader Vectorized -> Row: DataPageV2 208 213 9 75.6 13.2 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 26664 26695 44 0.6 1695.3 1.0X -SQL Json 22655 22657 3 0.7 1440.4 1.2X -SQL Parquet Vectorized 249 254 8 63.2 15.8 107.1X -SQL Parquet MR 2689 2750 86 5.8 171.0 9.9X -SQL ORC Vectorized 517 523 7 30.4 32.9 51.6X -SQL ORC MR 2269 2270 1 6.9 144.3 11.8X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 359 404 100 43.8 22.8 1.0X -ParquetReader Vectorized -> Row 325 329 5 48.4 20.7 1.1X +SQL CSV 22569 22614 63 0.7 1434.9 1.0X +SQL Json 15590 15600 15 1.0 991.2 1.4X +SQL Parquet Vectorized: DataPageV1 225 241 17 69.9 14.3 100.3X +SQL Parquet Vectorized: DataPageV2 219 236 13 72.0 13.9 103.3X +SQL Parquet MR: DataPageV1 2013 2109 136 7.8 128.0 11.2X +SQL Parquet MR: DataPageV2 1850 1967 165 8.5 117.6 12.2X +SQL ORC Vectorized 396 416 25 39.7 25.2 56.9X +SQL ORC MR 1707 1763 79 9.2 108.5 13.2X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized: DataPageV1 280 298 13 56.2 17.8 1.0X +ParquetReader Vectorized: DataPageV2 278 300 21 56.6 17.7 1.0X +ParquetReader Vectorized -> Row: DataPageV1 280 299 13 56.2 17.8 1.0X +ParquetReader Vectorized -> Row: DataPageV2 304 307 4 51.8 19.3 0.9X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18336 18703 519 0.6 1748.7 1.0X -SQL Json 15924 16092 238 0.7 1518.6 1.2X -SQL Parquet Vectorized 2534 2540 9 4.1 241.6 7.2X -SQL Parquet MR 4768 4772 5 2.2 454.7 3.8X -SQL ORC Vectorized 2477 2513 51 4.2 236.3 7.4X -SQL ORC MR 4451 4470 27 2.4 424.5 4.1X +SQL CSV 15548 16002 641 0.7 1482.8 1.0X +SQL Json 10801 11108 434 1.0 1030.1 1.4X +SQL Parquet Vectorized: DataPageV1 1858 1966 152 5.6 177.2 8.4X +SQL Parquet Vectorized: DataPageV2 2342 2466 175 4.5 223.4 6.6X +SQL Parquet MR: DataPageV1 3873 3908 49 2.7 369.4 4.0X +SQL Parquet MR: DataPageV2 3764 3869 148 2.8 358.9 4.1X +SQL ORC Vectorized 2018 2020 3 5.2 192.5 7.7X +SQL ORC MR 3247 3450 287 3.2 309.7 4.8X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9701 9753 74 1.1 925.1 1.0X -SQL Json 9562 9566 6 1.1 911.9 1.0X -SQL Parquet Vectorized 907 916 8 11.6 86.5 10.7X -SQL Parquet MR 2020 2021 2 5.2 192.6 4.8X -SQL ORC Vectorized 536 539 3 19.6 51.1 18.1X -SQL ORC MR 2211 2218 9 4.7 210.9 4.4X +SQL CSV 8028 8337 436 1.3 765.6 1.0X +SQL Json 6362 6488 178 1.6 606.7 1.3X +SQL Parquet Vectorized: DataPageV1 642 673 51 16.3 61.3 12.5X +SQL Parquet Vectorized: DataPageV2 646 678 40 16.2 61.6 12.4X +SQL Parquet MR: DataPageV1 1504 1604 141 7.0 143.5 5.3X +SQL Parquet MR: DataPageV2 1645 1646 1 6.4 156.9 4.9X +SQL ORC Vectorized 386 415 25 27.2 36.8 20.8X +SQL ORC MR 1704 1730 37 6.2 162.5 4.7X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Data column - CSV 25664 25733 97 0.6 1631.7 1.0X -Data column - Json 17014 17023 13 0.9 1081.7 1.5X -Data column - Parquet Vectorized 261 268 8 60.2 16.6 98.2X -Data column - Parquet MR 3173 3182 14 5.0 201.7 8.1X -Data column - ORC Vectorized 363 365 1 43.3 23.1 70.7X -Data column - ORC MR 2672 2675 4 5.9 169.9 9.6X -Partition column - CSV 8197 8202 7 1.9 521.2 3.1X -Partition column - Json 12495 12501 9 1.3 794.4 2.1X -Partition column - Parquet Vectorized 67 69 2 236.1 4.2 385.3X -Partition column - Parquet MR 1465 1466 1 10.7 93.2 17.5X -Partition column - ORC Vectorized 68 71 4 232.7 4.3 379.7X -Partition column - ORC MR 1625 1625 0 9.7 103.3 15.8X -Both columns - CSV 26284 26309 36 0.6 1671.1 1.0X -Both columns - Json 19343 19369 37 0.8 1229.8 1.3X -Both columns - Parquet Vectorized 311 321 10 50.5 19.8 82.5X -Both columns - Parquet MR 3355 3356 2 4.7 213.3 7.6X -Both columns - ORC Vectorized 415 418 5 37.9 26.4 61.9X -Both columns - ORC MR 2739 2743 6 5.7 174.1 9.4X +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------- +Data column - CSV 21472 21514 59 0.7 1365.2 1.0X +Data column - Json 11537 11606 97 1.4 733.5 1.9X +Data column - Parquet Vectorized: DataPageV1 238 256 11 66.1 15.1 90.2X +Data column - Parquet Vectorized: DataPageV2 482 507 17 32.6 30.6 44.6X +Data column - Parquet MR: DataPageV1 2213 2355 200 7.1 140.7 9.7X +Data column - Parquet MR: DataPageV2 2036 2163 179 7.7 129.4 10.5X +Data column - ORC Vectorized 289 310 20 54.4 18.4 74.3X +Data column - ORC MR 1898 1936 54 8.3 120.7 11.3X +Partition column - CSV 6307 6364 80 2.5 401.0 3.4X +Partition column - Json 9167 9253 121 1.7 582.8 2.3X +Partition column - Parquet Vectorized: DataPageV1 62 66 3 253.5 3.9 346.1X +Partition column - Parquet Vectorized: DataPageV2 61 65 2 259.2 3.9 353.8X +Partition column - Parquet MR: DataPageV1 1086 1088 3 14.5 69.0 19.8X +Partition column - Parquet MR: DataPageV2 1091 1146 78 14.4 69.4 19.7X +Partition column - ORC Vectorized 63 67 2 251.1 4.0 342.9X +Partition column - ORC MR 1173 1175 3 13.4 74.6 18.3X +Both columns - CSV 21458 22038 820 0.7 1364.3 1.0X +Both columns - Json 12697 12712 22 1.2 807.2 1.7X +Both columns - Parquet Vectorized: DataPageV1 275 288 10 57.2 17.5 78.0X +Both columns - Parquet Vectorized: DataPageV2 505 525 24 31.2 32.1 42.5X +Both columns - Parquet MR: DataPageV1 2541 2547 9 6.2 161.5 8.5X +Both columns - Parquet MR: DataPageV2 2059 2060 2 7.6 130.9 10.4X +Both columns - ORC Vectorized 326 349 16 48.3 20.7 66.0X +Both columns - ORC MR 2116 2151 50 7.4 134.5 10.1X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 12006 12014 11 0.9 1145.0 1.0X -SQL Json 19062 19074 16 0.6 1817.9 0.6X -SQL Parquet Vectorized 1608 1612 6 6.5 153.3 7.5X -SQL Parquet MR 3986 4005 27 2.6 380.1 3.0X -ParquetReader Vectorized 1199 1203 7 8.7 114.3 10.0X -SQL ORC Vectorized 1114 1114 0 9.4 106.2 10.8X -SQL ORC MR 3806 3806 1 2.8 362.9 3.2X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 10074 10372 422 1.0 960.7 1.0X +SQL Json 10037 10147 156 1.0 957.2 1.0X +SQL Parquet Vectorized: DataPageV1 1192 1226 47 8.8 113.7 8.4X +SQL Parquet Vectorized: DataPageV2 2349 2423 105 4.5 224.0 4.3X +SQL Parquet MR: DataPageV1 2995 3114 168 3.5 285.6 3.4X +SQL Parquet MR: DataPageV2 3847 3900 75 2.7 366.9 2.6X +ParquetReader Vectorized: DataPageV1 888 918 51 11.8 84.7 11.3X +ParquetReader Vectorized: DataPageV2 2128 2159 43 4.9 203.0 4.7X +SQL ORC Vectorized 837 908 61 12.5 79.8 12.0X +SQL ORC MR 2792 2882 127 3.8 266.3 3.6X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8707 8791 118 1.2 830.4 1.0X -SQL Json 14505 14532 39 0.7 1383.3 0.6X -SQL Parquet Vectorized 1245 1265 27 8.4 118.8 7.0X -SQL Parquet MR 3019 3028 12 3.5 287.9 2.9X -ParquetReader Vectorized 1143 1156 20 9.2 109.0 7.6X -SQL ORC Vectorized 1543 1549 8 6.8 147.1 5.6X -SQL ORC MR 3672 3685 18 2.9 350.2 2.4X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 7808 7810 3 1.3 744.6 1.0X +SQL Json 7434 7491 82 1.4 708.9 1.1X +SQL Parquet Vectorized: DataPageV1 1037 1044 10 10.1 98.9 7.5X +SQL Parquet Vectorized: DataPageV2 1528 1529 3 6.9 145.7 5.1X +SQL Parquet MR: DataPageV1 2300 2411 156 4.6 219.4 3.4X +SQL Parquet MR: DataPageV2 2637 2639 4 4.0 251.5 3.0X +ParquetReader Vectorized: DataPageV1 843 907 56 12.4 80.4 9.3X +ParquetReader Vectorized: DataPageV2 1424 1446 30 7.4 135.8 5.5X +SQL ORC Vectorized 1131 1132 1 9.3 107.8 6.9X +SQL ORC MR 2781 2856 106 3.8 265.3 2.8X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 5845 5848 4 1.8 557.4 1.0X -SQL Json 8854 8858 5 1.2 844.4 0.7X -SQL Parquet Vectorized 272 278 8 38.6 25.9 21.5X -SQL Parquet MR 1916 1936 27 5.5 182.7 3.1X -ParquetReader Vectorized 283 285 3 37.0 27.0 20.6X -SQL ORC Vectorized 548 551 3 19.1 52.3 10.7X -SQL ORC MR 1942 1944 2 5.4 185.2 3.0X +SQL CSV 5357 5538 255 2.0 510.9 1.0X +SQL Json 4354 4387 47 2.4 415.2 1.2X +SQL Parquet Vectorized: DataPageV1 212 226 15 49.5 20.2 25.3X +SQL Parquet Vectorized: DataPageV2 265 276 16 39.6 25.2 20.2X +SQL Parquet MR: DataPageV1 1575 1578 4 6.7 150.2 3.4X +SQL Parquet MR: DataPageV2 1624 1638 21 6.5 154.8 3.3X +ParquetReader Vectorized: DataPageV1 219 234 14 47.8 20.9 24.4X +ParquetReader Vectorized: DataPageV2 274 294 17 38.2 26.2 19.5X +SQL ORC Vectorized 370 393 12 28.4 35.3 14.5X +SQL ORC MR 1540 1545 7 6.8 146.9 3.5X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 3388 3395 10 0.3 3231.0 1.0X -SQL Json 4079 4087 11 0.3 3889.6 0.8X -SQL Parquet Vectorized 55 59 7 19.2 52.1 62.0X -SQL Parquet MR 226 229 2 4.6 215.2 15.0X -SQL ORC Vectorized 62 67 13 17.0 58.7 55.0X -SQL ORC MR 194 198 5 5.4 185.0 17.5X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 2159 2212 74 0.5 2059.3 1.0X +SQL Json 2836 2896 84 0.4 2704.5 0.8X +SQL Parquet Vectorized: DataPageV1 54 59 9 19.5 51.4 40.1X +SQL Parquet Vectorized: DataPageV2 66 72 8 15.9 63.1 32.7X +SQL Parquet MR: DataPageV1 173 186 10 6.1 164.5 12.5X +SQL Parquet MR: DataPageV2 159 172 8 6.6 151.8 13.6X +SQL ORC Vectorized 54 60 10 19.2 52.0 39.6X +SQL ORC MR 150 161 7 7.0 143.3 14.4X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8141 8142 1 0.1 7764.3 1.0X -SQL Json 15614 15694 113 0.1 14890.4 0.5X -SQL Parquet Vectorized 70 78 12 14.9 67.0 115.8X -SQL Parquet MR 245 250 4 4.3 234.0 33.2X -SQL ORC Vectorized 77 83 9 13.5 73.8 105.2X -SQL ORC MR 212 215 2 4.9 202.1 38.4X - -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 5877 5883 8 0.2 5605.0 1.0X +SQL Json 11474 11587 159 0.1 10942.9 0.5X +SQL Parquet Vectorized: DataPageV1 66 72 7 15.9 63.1 88.9X +SQL Parquet Vectorized: DataPageV2 83 90 8 12.6 79.4 70.6X +SQL Parquet MR: DataPageV1 191 201 9 5.5 182.6 30.7X +SQL Parquet MR: DataPageV2 179 187 9 5.9 170.3 32.9X +SQL ORC Vectorized 70 76 12 14.9 67.1 83.5X +SQL ORC MR 167 175 7 6.3 159.2 35.2X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1025-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14087 14102 20 0.1 13434.7 1.0X -SQL Json 30069 30223 218 0.0 28676.2 0.5X -SQL Parquet Vectorized 107 113 8 9.8 101.9 131.9X -SQL Parquet MR 289 295 4 3.6 275.9 48.7X -SQL ORC Vectorized 99 105 14 10.6 94.4 142.3X -SQL ORC MR 236 239 3 4.4 225.5 59.6X +SQL CSV 9695 9965 382 0.1 9245.8 1.0X +SQL Json 22119 23566 2045 0.0 21094.6 0.4X +SQL Parquet Vectorized: DataPageV1 96 104 7 10.9 91.6 100.9X +SQL Parquet Vectorized: DataPageV2 113 121 8 9.3 107.8 85.8X +SQL Parquet MR: DataPageV1 227 243 9 4.6 216.2 42.8X +SQL Parquet MR: DataPageV2 210 225 12 5.0 200.2 46.2X +SQL ORC Vectorized 90 96 10 11.7 85.7 107.9X +SQL ORC MR 188 199 9 5.6 178.9 51.7X diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala index 5094cdf2296e..7c9fa58d77f4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala @@ -24,8 +24,7 @@ import scala.util.Random import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.ParquetOutputFormat -import org.apache.spark.SparkConf -import org.apache.spark.TestUtils +import org.apache.spark.{SparkConf, TestUtils} import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow @@ -79,7 +78,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { saveAsCsvTable(testDf, dir.getCanonicalPath + "/csv") saveAsJsonTable(testDf, dir.getCanonicalPath + "/json") - saveAsParquetTable(testDf, dir.getCanonicalPath + "/parquet") + saveAsParquetV1Table(testDf, dir.getCanonicalPath + "/parquetV1") saveAsParquetV2Table(testDf, dir.getCanonicalPath + "/parquetV2") saveAsOrcTable(testDf, dir.getCanonicalPath + "/orc") } @@ -94,9 +93,9 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { spark.read.json(dir).createOrReplaceTempView("jsonTable") } - private def saveAsParquetTable(df: DataFrameWriter[Row], dir: String): Unit = { + private def saveAsParquetV1Table(df: DataFrameWriter[Row], dir: String): Unit = { df.mode("overwrite").option("compression", "snappy").parquet(dir) - spark.read.parquet(dir).createOrReplaceTempView("parquetTable") + spark.read.parquet(dir).createOrReplaceTempView("parquetV1Table") } private def saveAsParquetV2Table(df: DataFrameWriter[Row], dir: String): Unit = { @@ -112,6 +111,8 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { spark.read.orc(dir).createOrReplaceTempView("orcTable") } + private def withParquetVersions(f: String => Unit): Unit = Seq("V1", "V2").foreach(f) + def numericScanBenchmark(values: Int, dataType: DataType): Unit = { // Benchmarks running through spark sql. val sqlBenchmark = new Benchmark( @@ -126,7 +127,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { output = output) withTempPath { dir => - withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { + withTempTable("t1", "csvTable", "jsonTable", "parquetV1Table", "parquetV2Table", "orcTable") { import spark.implicits._ spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") @@ -145,13 +146,17 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { spark.sql(s"select $query from jsonTable").noop() } - sqlBenchmark.addCase("SQL Parquet Vectorized") { _ => - spark.sql(s"select $query from parquetTable").noop() + withParquetVersions { version => + sqlBenchmark.addCase(s"SQL Parquet Vectorized: DataPage$version") { _ => + spark.sql(s"select $query from parquet${version}Table").noop() + } } - sqlBenchmark.addCase("SQL Parquet MR") { _ => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql(s"select $query from parquetTable").noop() + withParquetVersions { version => + sqlBenchmark.addCase(s"SQL Parquet MR: DataPage$version") { _ => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"select $query from parquet${version}Table").noop() + } } } @@ -167,79 +172,93 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { sqlBenchmark.run() - // Driving the parquet reader in batch mode directly. - val files = TestUtils.listDirectory(new File(dir, "parquet")) val enableOffHeapColumnVector = spark.sessionState.conf.offHeapColumnVectorEnabled val vectorizedReaderBatchSize = spark.sessionState.conf.parquetVectorizedReaderBatchSize - parquetReaderBenchmark.addCase("ParquetReader Vectorized") { _ => - var longSum = 0L - var doubleSum = 0.0 - val aggregateValue: (ColumnVector, Int) => Unit = dataType match { - case BooleanType => (col: ColumnVector, i: Int) => if (col.getBoolean(i)) longSum += 1L - case ByteType => (col: ColumnVector, i: Int) => longSum += col.getByte(i) - case ShortType => (col: ColumnVector, i: Int) => longSum += col.getShort(i) - case IntegerType => (col: ColumnVector, i: Int) => longSum += col.getInt(i) - case LongType => (col: ColumnVector, i: Int) => longSum += col.getLong(i) - case FloatType => (col: ColumnVector, i: Int) => doubleSum += col.getFloat(i) - case DoubleType => (col: ColumnVector, i: Int) => doubleSum += col.getDouble(i) - } + withParquetVersions { version => + // Driving the parquet reader in batch mode directly. + val files = TestUtils.listDirectory(new File(dir, s"parquet$version")) + parquetReaderBenchmark.addCase(s"ParquetReader Vectorized: DataPage$version") { _ => + var longSum = 0L + var doubleSum = 0.0 + val aggregateValue: (ColumnVector, Int) => Unit = dataType match { + case BooleanType => + (col: ColumnVector, i: Int) => if (col.getBoolean(i)) longSum += 1L + case ByteType => + (col: ColumnVector, i: Int) => longSum += col.getByte(i) + case ShortType => + (col: ColumnVector, i: Int) => longSum += col.getShort(i) + case IntegerType => + (col: ColumnVector, i: Int) => longSum += col.getInt(i) + case LongType => + (col: ColumnVector, i: Int) => longSum += col.getLong(i) + case FloatType => + (col: ColumnVector, i: Int) => doubleSum += col.getFloat(i) + case DoubleType => + (col: ColumnVector, i: Int) => doubleSum += col.getDouble(i) + } - files.foreach { p => - val reader = new VectorizedParquetRecordReader( - enableOffHeapColumnVector, vectorizedReaderBatchSize) - try { - reader.initialize(p, ("id" :: Nil).asJava) - val batch = reader.resultBatch() - val col = batch.column(0) - while (reader.nextBatch()) { - val numRows = batch.numRows() - var i = 0 - while (i < numRows) { - if (!col.isNullAt(i)) aggregateValue(col, i) - i += 1 + files.foreach { p => + val reader = new VectorizedParquetRecordReader( + enableOffHeapColumnVector, vectorizedReaderBatchSize) + try { + reader.initialize(p, ("id" :: Nil).asJava) + val batch = reader.resultBatch() + val col = batch.column(0) + while (reader.nextBatch()) { + val numRows = batch.numRows() + var i = 0 + while (i < numRows) { + if (!col.isNullAt(i)) aggregateValue(col, i) + i += 1 + } } + } finally { + reader.close() } - } finally { - reader.close() } } } - // Decoding in vectorized but having the reader return rows. - parquetReaderBenchmark.addCase("ParquetReader Vectorized -> Row") { num => - var longSum = 0L - var doubleSum = 0.0 - val aggregateValue: (InternalRow) => Unit = dataType match { - case BooleanType => (col: InternalRow) => if (col.getBoolean(0)) longSum += 1L - case ByteType => (col: InternalRow) => longSum += col.getByte(0) - case ShortType => (col: InternalRow) => longSum += col.getShort(0) - case IntegerType => (col: InternalRow) => longSum += col.getInt(0) - case LongType => (col: InternalRow) => longSum += col.getLong(0) - case FloatType => (col: InternalRow) => doubleSum += col.getFloat(0) - case DoubleType => (col: InternalRow) => doubleSum += col.getDouble(0) - } + withParquetVersions { version => + // Driving the parquet reader in batch mode directly. + val files = TestUtils.listDirectory(new File(dir, s"parquet$version")) + // Decoding in vectorized but having the reader return rows. + parquetReaderBenchmark + .addCase(s"ParquetReader Vectorized -> Row: DataPage$version") { _ => + var longSum = 0L + var doubleSum = 0.0 + val aggregateValue: (InternalRow) => Unit = dataType match { + case BooleanType => (col: InternalRow) => if (col.getBoolean(0)) longSum += 1L + case ByteType => (col: InternalRow) => longSum += col.getByte(0) + case ShortType => (col: InternalRow) => longSum += col.getShort(0) + case IntegerType => (col: InternalRow) => longSum += col.getInt(0) + case LongType => (col: InternalRow) => longSum += col.getLong(0) + case FloatType => (col: InternalRow) => doubleSum += col.getFloat(0) + case DoubleType => (col: InternalRow) => doubleSum += col.getDouble(0) + } - files.map(_.asInstanceOf[String]).foreach { p => - val reader = new VectorizedParquetRecordReader( - enableOffHeapColumnVector, vectorizedReaderBatchSize) - try { - reader.initialize(p, ("id" :: Nil).asJava) - val batch = reader.resultBatch() - while (reader.nextBatch()) { - val it = batch.rowIterator() - while (it.hasNext) { - val record = it.next() - if (!record.isNullAt(0)) aggregateValue(record) + files.foreach { p => + val reader = new VectorizedParquetRecordReader( + enableOffHeapColumnVector, vectorizedReaderBatchSize) + try { + reader.initialize(p, ("id" :: Nil).asJava) + val batch = reader.resultBatch() + while (reader.nextBatch()) { + val it = batch.rowIterator() + while (it.hasNext) { + val record = it.next() + if (!record.isNullAt(0)) aggregateValue(record) + } + } + } finally { + reader.close() } } - } finally { - reader.close() } - } } - - parquetReaderBenchmark.run() } + + parquetReaderBenchmark.run() } } @@ -247,7 +266,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { val benchmark = new Benchmark("Int and String Scan", values, output = output) withTempPath { dir => - withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { + withTempTable("t1", "csvTable", "jsonTable", "parquetV1Table", "parquetV2Table", "orcTable") { import spark.implicits._ spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") @@ -263,13 +282,17 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { spark.sql("select sum(c1), sum(length(c2)) from jsonTable").noop() } - benchmark.addCase("SQL Parquet Vectorized") { _ => - spark.sql("select sum(c1), sum(length(c2)) from parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"SQL Parquet Vectorized: DataPage$version") { _ => + spark.sql(s"select sum(c1), sum(length(c2)) from parquet${version}Table").noop() + } } - benchmark.addCase("SQL Parquet MR") { _ => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(c1), sum(length(c2)) from parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"SQL Parquet MR: DataPage$version") { _ => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"select sum(c1), sum(length(c2)) from parquet${version}Table").noop() + } } } @@ -292,7 +315,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { val benchmark = new Benchmark("Repeated String", values, output = output) withTempPath { dir => - withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { + withTempTable("t1", "csvTable", "jsonTable", "parquetV1Table", "parquetV2Table", "orcTable") { import spark.implicits._ spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") @@ -308,13 +331,17 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { spark.sql("select sum(length(c1)) from jsonTable").noop() } - benchmark.addCase("SQL Parquet Vectorized") { _ => - spark.sql("select sum(length(c1)) from parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"SQL Parquet Vectorized: DataPage$version") { _ => + spark.sql(s"select sum(length(c1)) from parquet${version}Table").noop() + } } - benchmark.addCase("SQL Parquet MR") { _ => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(length(c1)) from parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"SQL Parquet MR: DataPage$version") { _ => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"select sum(length(c1)) from parquet${version}Table").noop() + } } } @@ -337,7 +364,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { val benchmark = new Benchmark("Partitioned Table", values, output = output) withTempPath { dir => - withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { + withTempTable("t1", "csvTable", "jsonTable", "parquetV1Table", "parquetV2Table", "orcTable") { import spark.implicits._ spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") @@ -351,13 +378,17 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { spark.sql("select sum(id) from jsonTable").noop() } - benchmark.addCase("Data column - Parquet Vectorized") { _ => - spark.sql("select sum(id) from parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"Data column - Parquet Vectorized: DataPage$version") { _ => + spark.sql(s"select sum(id) from parquet${version}Table").noop() + } } - benchmark.addCase("Data column - Parquet MR") { _ => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(id) from parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"Data column - Parquet MR: DataPage$version") { _ => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"select sum(id) from parquet${version}Table").noop() + } } } @@ -379,13 +410,17 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { spark.sql("select sum(p) from jsonTable").noop() } - benchmark.addCase("Partition column - Parquet Vectorized") { _ => - spark.sql("select sum(p) from parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"Partition column - Parquet Vectorized: DataPage$version") { _ => + spark.sql(s"select sum(p) from parquet${version}Table").noop() + } } - benchmark.addCase("Partition column - Parquet MR") { _ => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(p) from parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"Partition column - Parquet MR: DataPage$version") { _ => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"select sum(p) from parquet${version}Table").noop() + } } } @@ -407,13 +442,17 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { spark.sql("select sum(p), sum(id) from jsonTable").noop() } - benchmark.addCase("Both columns - Parquet Vectorized") { _ => - spark.sql("select sum(p), sum(id) from parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"Both columns - Parquet Vectorized: DataPage$version") { _ => + spark.sql(s"select sum(p), sum(id) from parquet${version}Table").noop() + } } - benchmark.addCase("Both columns - Parquet MR") { _ => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(p), sum(id) from parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"Both columns - Parquet MR: DataPage$version") { _ => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"select sum(p), sum(id) from parquet${version}Table").noop() + } } } @@ -438,7 +477,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { new Benchmark(s"String with Nulls Scan ($percentageOfNulls%)", values, output = output) withTempPath { dir => - withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { + withTempTable("t1", "csvTable", "jsonTable", "parquetV1Table", "parquetV2Table", "orcTable") { spark.range(values).createOrReplaceTempView("t1") prepareTable( @@ -457,39 +496,45 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { "not NULL and c2 is not NULL").noop() } - benchmark.addCase("SQL Parquet Vectorized") { _ => - spark.sql("select sum(length(c2)) from parquetTable where c1 is " + - "not NULL and c2 is not NULL").noop() + withParquetVersions { version => + benchmark.addCase(s"SQL Parquet Vectorized: DataPage$version") { _ => + spark.sql(s"select sum(length(c2)) from parquet${version}Table where c1 is " + + "not NULL and c2 is not NULL").noop() + } } - benchmark.addCase("SQL Parquet MR") { _ => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(length(c2)) from parquetTable where c1 is " + - "not NULL and c2 is not NULL").noop() + withParquetVersions { version => + benchmark.addCase(s"SQL Parquet MR: DataPage$version") { _ => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"select sum(length(c2)) from parquet${version}Table where c1 is " + + "not NULL and c2 is not NULL").noop() + } } } - val files = TestUtils.listDirectory(new File(dir, "parquet")) - val enableOffHeapColumnVector = spark.sessionState.conf.offHeapColumnVectorEnabled - val vectorizedReaderBatchSize = spark.sessionState.conf.parquetVectorizedReaderBatchSize - benchmark.addCase("ParquetReader Vectorized") { num => - var sum = 0 - files.foreach { p => - val reader = new VectorizedParquetRecordReader( - enableOffHeapColumnVector, vectorizedReaderBatchSize) - try { - reader.initialize(p, ("c1" :: "c2" :: Nil).asJava) - val batch = reader.resultBatch() - while (reader.nextBatch()) { - val rowIterator = batch.rowIterator() - while (rowIterator.hasNext) { - val row = rowIterator.next() - val value = row.getUTF8String(0) - if (!row.isNullAt(0) && !row.isNullAt(1)) sum += value.numBytes() + withParquetVersions { version => + val files = TestUtils.listDirectory(new File(dir, s"parquet$version")) + val enableOffHeapColumnVector = spark.sessionState.conf.offHeapColumnVectorEnabled + val vectorizedReaderBatchSize = spark.sessionState.conf.parquetVectorizedReaderBatchSize + benchmark.addCase(s"ParquetReader Vectorized: DataPage$version") { _ => + var sum = 0 + files.foreach { p => + val reader = new VectorizedParquetRecordReader( + enableOffHeapColumnVector, vectorizedReaderBatchSize) + try { + reader.initialize(p, ("c1" :: "c2" :: Nil).asJava) + val batch = reader.resultBatch() + while (reader.nextBatch()) { + val rowIterator = batch.rowIterator() + while (rowIterator.hasNext) { + val row = rowIterator.next() + val value = row.getUTF8String(0) + if (!row.isNullAt(0) && !row.isNullAt(1)) sum += value.numBytes() + } } + } finally { + reader.close() } - } finally { - reader.close() } } } @@ -518,7 +563,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { output = output) withTempPath { dir => - withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") { + withTempTable("t1", "csvTable", "jsonTable", "parquetV1Table", "parquetV2Table", "orcTable") { import spark.implicits._ val middle = width / 2 val selectExpr = (1 to width).map(i => s"value as c$i") @@ -535,13 +580,17 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { spark.sql(s"SELECT sum(c$middle) FROM jsonTable").noop() } - benchmark.addCase("SQL Parquet Vectorized") { _ => - spark.sql(s"SELECT sum(c$middle) FROM parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"SQL Parquet Vectorized: DataPage$version") { _ => + spark.sql(s"SELECT sum(c$middle) FROM parquet${version}Table").noop() + } } - benchmark.addCase("SQL Parquet MR") { _ => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql(s"SELECT sum(c$middle) FROM parquetTable").noop() + withParquetVersions { version => + benchmark.addCase(s"SQL Parquet MR: DataPage$version") { _ => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"SELECT sum(c$middle) FROM parquet${version}Table").noop() + } } }