Skip to content

Commit 146c786

Browse files
authored
Port the code for supporting md5/sha1/sha2 from apache apache#10464 (apache#135)
* Port the code and resolve conflicts * Remove excessively ported code * Fix issue in casting float to string * Support sha2 * Fix compile error * Add a ut * Follow the naming convention
1 parent d7aa47e commit 146c786

10 files changed

Lines changed: 941 additions & 92 deletions

cpp/src/gandiva/function_registry_common.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,17 @@ typedef std::unordered_map<const FunctionSignature*, const NativeFunction*, KeyH
228228
utf8(), kResultNullNever, ARROW_STRINGIFY(gdv_fn_sha1_##TYPE), \
229229
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
230230

231+
// HashSHA2 functions that :
232+
// - Take a second int32 argument that specifies bits_length
233+
// - NULL handling is of type NULL_NEVER
234+
// - can return errors
235+
//
236+
// The function name includes the base name, the input type name and the int32 suffix, e.g. gdv_fn_sha2_float64_int32
237+
#define HASH_SHA2_NULL_NEVER(NAME, ALIASES, TYPE) \
238+
NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE(), int32()}, \
239+
utf8(), kResultNullNever, ARROW_STRINGIFY(gdv_fn_sha2_##TYPE##_int32), \
240+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
241+
231242
// HashSHA256 functions that :
232243
// - NULL handling is of type NULL_NEVER
233244
// - can return errors
@@ -238,6 +249,16 @@ typedef std::unordered_map<const FunctionSignature*, const NativeFunction*, KeyH
238249
utf8(), kResultNullNever, ARROW_STRINGIFY(gdv_fn_sha256_##TYPE), \
239250
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
240251

252+
// HashMD5 functions that :
253+
// - NULL handling is of type NULL_NEVER
254+
// - can return errors
255+
//
256+
// The function name includes the base name & input type name, e.g. gdv_fn_md5_float64
257+
#define HASH_MD5_NULL_NEVER(NAME, ALIASES, TYPE) \
258+
NativeFunction(#NAME, std::vector<std::string> ALIASES, DataTypeVector{TYPE()}, utf8(), \
259+
kResultNullNever, ARROW_STRINGIFY(gdv_fn_md5_##TYPE), \
260+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
261+
241262
// Iterate the inner macro over all numeric types
242263
#define NUMERIC_TYPES(INNER, NAME, ALIASES) \
243264
INNER(NAME, ALIASES, int8), INNER(NAME, ALIASES, int16), INNER(NAME, ALIASES, int32), \

cpp/src/gandiva/function_registry_hash.cc

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,15 @@ namespace gandiva {
3535
#define HASH_SHA1_NULL_NEVER_FN(name, ALIASES) \
3636
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH_SHA1_NULL_NEVER, name, ALIASES)
3737

38+
#define HASH_SHA2_NULL_NEVER_FN(name, ALIASES) \
39+
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH_SHA2_NULL_NEVER, name, ALIASES)
40+
3841
#define HASH_SHA256_NULL_NEVER_FN(name, ALIASES) \
3942
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH_SHA256_NULL_NEVER, name, ALIASES)
4043

44+
#define HASH_MD5_NULL_NEVER_FN(name, ALIASES) \
45+
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH_MD5_NULL_NEVER, name, ALIASES)
46+
4147
std::vector<NativeFunction> GetHashFunctionRegistry() {
4248
static std::vector<NativeFunction> hash_fn_registry_ = {
4349
NativeFunction("hash32_spark", {}, DataTypeVector{boolean(), int32()}, int32(),
@@ -72,9 +78,13 @@ std::vector<NativeFunction> GetHashFunctionRegistry() {
7278
HASH64_SEED_SAFE_NULL_NEVER_FN(hash64, {}),
7379
HASH64_SEED_SAFE_NULL_NEVER_FN(hash64AsDouble, {}),
7480

75-
HASH_SHA1_NULL_NEVER_FN(hashSHA1, {}),
81+
HASH_SHA1_NULL_NEVER_FN(hashSHA1, {"sha1"}),
82+
83+
HASH_SHA2_NULL_NEVER_FN(hashSHA2, {"sha2"}),
84+
85+
HASH_SHA256_NULL_NEVER_FN(hashSHA256, {"sha256"}),
7686

77-
HASH_SHA256_NULL_NEVER_FN(hashSHA256, {})};
87+
HASH_MD5_NULL_NEVER_FN(hashMD5, {"md5"})};
7888

7989
return hash_fn_registry_;
8090
}

cpp/src/gandiva/function_registry_string.cc

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -112,21 +112,25 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
112112
kResultNullIfNull, "castVARCHAR_int16_int64",
113113
NativeFunction::kNeedsContext),
114114

115-
NativeFunction("castVARCHAR", {}, DataTypeVector{int32(), int64()}, utf8(),
116-
kResultNullIfNull, "castVARCHAR_int32_int64",
117-
NativeFunction::kNeedsContext),
118-
119-
NativeFunction("castVARCHAR", {}, DataTypeVector{int64(), int64()}, utf8(),
120-
kResultNullIfNull, "castVARCHAR_int64_int64",
121-
NativeFunction::kNeedsContext),
122-
123-
NativeFunction("castVARCHAR", {}, DataTypeVector{float32(), int64()}, utf8(),
124-
kResultNullIfNull, "castVARCHAR_float32_int64",
125-
NativeFunction::kNeedsContext),
126-
127-
NativeFunction("castVARCHAR", {}, DataTypeVector{float64(), int64()}, utf8(),
128-
kResultNullIfNull, "castVARCHAR_float64_int64",
129-
NativeFunction::kNeedsContext),
115+
// The registrations below are commented out: other implementations, e.g.
116+
// gdv_fn_castVARCHAR_int32_int64, can replace them and produce the same result
117+
// as Spark for cases like cast(2.0 as string), i.e. no extra zeros are appended
118+
// by the cast.
119+
// NativeFunction("castVARCHAR", {}, DataTypeVector{int32(), int64()}, utf8(),
120+
// kResultNullIfNull, "castVARCHAR_int32_int64",
121+
// NativeFunction::kNeedsContext),
122+
123+
// NativeFunction("castVARCHAR", {}, DataTypeVector{int64(), int64()}, utf8(),
124+
// kResultNullIfNull, "castVARCHAR_int64_int64",
125+
// NativeFunction::kNeedsContext),
126+
127+
// NativeFunction("castVARCHAR", {}, DataTypeVector{float32(), int64()}, utf8(),
128+
// kResultNullIfNull, "castVARCHAR_float32_int64",
129+
// NativeFunction::kNeedsContext),
130+
131+
// NativeFunction("castVARCHAR", {}, DataTypeVector{float64(), int64()}, utf8(),
132+
// kResultNullIfNull, "castVARCHAR_float64_int64",
133+
// NativeFunction::kNeedsContext),
130134

131135
NativeFunction("castVARCHAR", {}, DataTypeVector{boolean(), int64()}, utf8(),
132136
kResultNullIfNull, "castVARCHAR_bool_int64",

0 commit comments

Comments
 (0)