Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,19 @@ OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure
AMD EPYC 7763 64-Core Processor
Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Legacy 7128 7146 8 0.1 7127.9 1.0X
New 790 795 5 1.3 789.7 9.0X
Legacy 6996 7009 9 0.1 6996.5 1.0X
New 771 776 3 1.3 770.7 9.1X


================================================================================================
Unescape
================================================================================================

OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure
AMD EPYC 7763 64-Core Processor
Unescape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Legacy 5127 5137 6 0.2 5127.3 1.0X
New 579 583 4 1.7 579.3 8.9X


16 changes: 14 additions & 2 deletions sql/catalyst/benchmarks/EscapePathBenchmark-results.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,19 @@ OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure
AMD EPYC 7763 64-Core Processor
Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Legacy 6719 6726 6 0.1 6719.3 1.0X
New 735 744 21 1.4 735.3 9.1X
Legacy 6966 6978 12 0.1 6965.9 1.0X
New 725 730 4 1.4 725.4 9.6X


================================================================================================
Unescape
================================================================================================

OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure
AMD EPYC 7763 64-Core Processor
Unescape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
Legacy 6665 6677 11 0.2 6664.6 1.0X
New 602 606 2 1.7 602.1 11.1X


Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import org.apache.hadoop.util.Shell
import org.apache.spark.sql.catalyst.analysis.Resolver
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, BasePredicate, BoundReference, Expression, Predicate}
import org.apache.spark.sql.catalyst.expressions.Hex.unhexDigits
import org.apache.spark.sql.catalyst.util.CharVarcharUtils
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.internal.SQLConf
Expand Down Expand Up @@ -97,31 +98,40 @@ object ExternalCatalogUtils {
}

def unescapePathName(path: String): String = {
val sb = new StringBuilder
var i = 0

while (i < path.length) {
val c = path.charAt(i)
if (c == '%' && i + 2 < path.length) {
val code: Int = try {
Integer.parseInt(path.substring(i + 1, i + 3), 16)
} catch {
case _: Exception => -1
}
if (code >= 0) {
sb.append(code.asInstanceOf[Char])
i += 3
if (path == null || path.isEmpty) {
return path
}
var plaintextEndIdx = path.indexOf('%')
val length = path.length
if (plaintextEndIdx == -1 || plaintextEndIdx + 2 > length) {
// fast path, no %xx encoding found then return the string identity
path
} else {
val sb = new java.lang.StringBuilder(length)
var plaintextStartIdx = 0
while(plaintextEndIdx != -1 && plaintextEndIdx + 2 < length) {
if (plaintextEndIdx > plaintextStartIdx) sb.append(path, plaintextStartIdx, plaintextEndIdx)
val high = path.charAt(plaintextEndIdx + 1)
if ((high >>> 8) == 0 && unhexDigits(high) != -1) {
val low = path.charAt(plaintextEndIdx + 2)
if ((low >>> 8) == 0 && unhexDigits(low) != -1) {
sb.append((unhexDigits(high) << 4 | unhexDigits(low)).asInstanceOf[Char])
plaintextStartIdx = plaintextEndIdx + 3
} else {
sb.append('%')
plaintextStartIdx = plaintextEndIdx + 1
}
} else {
sb.append(c)
i += 1
sb.append('%')
plaintextStartIdx = plaintextEndIdx + 1
}
} else {
sb.append(c)
i += 1
plaintextEndIdx = path.indexOf('%', plaintextStartIdx)
}
if (plaintextStartIdx < length) {
sb.append(path, plaintextStartIdx, length)
}
sb.toString
}

sb.toString()
}

def generatePartitionPath(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils

/**
* Benchmark for path escaping
* Benchmark for path escaping/unescaping
* To run this benchmark:
* {{{
* 1. without sbt:
Expand Down Expand Up @@ -53,6 +53,30 @@ object EscapePathBenchmark extends BenchmarkBase {
}
benchmark.run()
}

runBenchmark("Unescape") {
val benchmark = new Benchmark("Unescape Tests", N, 10, output = output)
val paths = Seq(
"https%3A%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
"https:%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
"https:/%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
"https://issues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
"https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551",
"https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551",
"https://issues.apache.org/jira/browse%2FSPARK-48551",
"https://issues.apache.org/jira/browse%2SPARK-48551",
"https://issues.apache.org/jira/browse/SPARK-48551")
benchmark.addCase("Legacy") { _ =>
(1 to N).foreach(_ => paths.foreach(unescapePathNameLegacy))
}

benchmark.addCase("New") { _ =>
(1 to N).foreach(_ => {
paths.foreach(ExternalCatalogUtils.unescapePathName)
})
}
benchmark.run()
}
}

/**
Expand All @@ -71,4 +95,30 @@ object EscapePathBenchmark extends BenchmarkBase {

builder.toString()
}

def unescapePathNameLegacy(path: String): String = {
val sb = new StringBuilder
var i = 0
while (i < path.length) {
val c = path.charAt(i)
if (c == '%' && i + 2 < path.length) {
val code: Int = try {
Integer.parseInt(path.substring(i + 1, i + 3), 16)
} catch {
case _: Exception => -1
}
if (code >= 0) {
sb.append(code.asInstanceOf[Char])
i += 3
} else {
sb.append(c)
i += 1
}
} else {
sb.append(c)
i += 1
}
}
sb.toString()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
package org.apache.spark.sql.catalyst.catalog

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName
import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.{escapePathName, unescapePathName}

class ExternalCatalogUtilsSuite extends SparkFunSuite {

Expand All @@ -39,4 +39,28 @@ class ExternalCatalogUtilsSuite extends SparkFunSuite {
assert(escapePathName("a,b") === "a,b")
assert(escapePathName("a/b") === "a%2Fb")
}

test("SPARK-48551: unescapePathName") {
ExternalCatalogUtils.charToEscape.stream().toArray.map(_.asInstanceOf[Char]).foreach { c =>
// Check parity with old conversion technique:
assert(unescapePathName("%" + f"$c%02X") === c.toString,
s"wrong unescaping for $c")
}
assert(unescapePathName(null) === null)
assert(unescapePathName("") === "")
assert(unescapePathName(" ") === " ")
assert(unescapePathName("%0A") === "\n")
assert(unescapePathName("a b") === "a b")
assert(unescapePathName("a%3Ab") === "a:b")
assert(unescapePathName("%3Aab") === ":ab")
assert(unescapePathName("ab%3A") === "ab:")
assert(unescapePathName("a%25b") === "a%b")
assert(unescapePathName("a,b") === "a,b")
assert(unescapePathName("a%2Fb") === "a/b")
assert(unescapePathName("a%2") === "a%2")
assert(unescapePathName("a%F ") === "a%F ")
// scalastyle:off nonascii
assert(unescapePathName("a\u00FF") === "a\u00FF")
// scalastyle:on nonascii
}
}