-
Notifications
You must be signed in to change notification settings - Fork 12
Antalya 25.3: Support different warehouses behind Iceberg REST catalog #860
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6eceb35
9a7b6b5
daad95d
85ea912
95f4f0f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -67,13 +67,19 @@ void TableMetadata::setLocation(const std::string & location_) | |
| auto pos_to_path = location_.substr(pos_to_bucket).find('/'); | ||
|
|
||
| if (pos_to_path == std::string::npos) | ||
| throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Unexpected location format: {}", location_); | ||
|
|
||
| pos_to_path = pos_to_bucket + pos_to_path; | ||
| { // empty path | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you want to have this comment, I suggest moving it into a variable instead:
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this better? It's an additional variable filled in at runtime, and it makes no change to the logic.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is just a very opinionated comment. I usually prefer named variables over comments because variables compile and we tend to forget to update comments. In any case, I should have deleted it or mentioned that it is not a requirement. Don't worry about it. |
||
| location_without_path = location_; | ||
| path.clear(); | ||
| bucket = location_.substr(pos_to_bucket); | ||
| } | ||
| else | ||
| { | ||
| pos_to_path = pos_to_bucket + pos_to_path; | ||
|
|
||
| location_without_path = location_.substr(0, pos_to_path); | ||
| path = location_.substr(pos_to_path + 1); | ||
| bucket = location_.substr(pos_to_bucket, pos_to_path - pos_to_bucket); | ||
| location_without_path = location_.substr(0, pos_to_path); | ||
| path = location_.substr(pos_to_path + 1); | ||
| bucket = location_.substr(pos_to_bucket, pos_to_path - pos_to_bucket); | ||
| } | ||
|
|
||
| LOG_TEST(getLogger("TableMetadata"), | ||
| "Parsed location without path: {}, path: {}", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -373,7 +373,7 @@ Model::HeadObjectOutcome Client::HeadObject(HeadObjectRequest & request) const | |
| auto bucket_uri = getURIForBucket(bucket); | ||
| if (!bucket_uri) | ||
| { | ||
| if (auto maybe_error = updateURIForBucketForHead(bucket); maybe_error.has_value()) | ||
| if (auto maybe_error = updateURIForBucketForHead(bucket, request.GetKey()); maybe_error.has_value()) | ||
| return *maybe_error; | ||
|
|
||
| if (auto region = getRegionForBucket(bucket); !region.empty()) | ||
|
|
@@ -578,7 +578,6 @@ Client::doRequest(RequestType & request, RequestFn request_fn) const | |
| if (auto uri = getURIForBucket(bucket); uri.has_value()) | ||
| request.overrideURI(std::move(*uri)); | ||
|
|
||
|
|
||
| bool found_new_endpoint = false; | ||
| // if we found correct endpoint after 301 responses, update the cache for future requests | ||
| SCOPE_EXIT( | ||
|
|
@@ -813,12 +812,15 @@ std::optional<S3::URI> Client::getURIFromError(const Aws::S3::S3Error & error) c | |
| } | ||
|
|
||
| // Do a list request because head requests don't have body in response | ||
| std::optional<Aws::S3::S3Error> Client::updateURIForBucketForHead(const std::string & bucket) const | ||
| // S3 Tables don't support ListObjects, so made dirty workaroung - changed on GetObject | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using |
||
| std::optional<Aws::S3::S3Error> Client::updateURIForBucketForHead(const std::string & bucket, const std::string & key) const | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the purpose of this method? The name is
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Magic inside the AWS SDK. To get the proper URI we must make some request that results in a 301 redirect. AWS sends the proper endpoint in the response body, and it is extracted somewhere inside the SDK. After that, ClickHouse extracts it in the method Client::getURIFromError.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please add a short comment about this and even add a link to this discussion? I find it surprising they did not do it. |
||
| { | ||
| ListObjectsV2Request req; | ||
| GetObjectRequest req; | ||
| req.SetBucket(bucket); | ||
| req.SetMaxKeys(1); | ||
| auto result = ListObjectsV2(req); | ||
| req.SetKey(key); | ||
| req.SetRange("bytes=0-1"); | ||
| auto result = GetObject(req); | ||
|
|
||
| if (result.IsSuccess()) | ||
| return std::nullopt; | ||
| return result.GetError(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -157,10 +157,72 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax) | |
| } | ||
| } | ||
|
|
||
| bool URI::isAWSRegion(std::string_view region) | ||
| { | ||
| /// List from https://docs.aws.amazon.com/general/latest/gr/s3.html | ||
| static const std::unordered_set<std::string_view> regions = { | ||
| "us-east-2", | ||
| "us-east-1", | ||
| "us-west-1", | ||
| "us-west-2", | ||
| "af-south-1", | ||
| "ap-east-1", | ||
| "ap-south-2", | ||
| "ap-southeast-3", | ||
| "ap-southeast-5", | ||
| "ap-southeast-4", | ||
| "ap-south-1", | ||
| "ap-northeast-3", | ||
| "ap-northeast-2", | ||
| "ap-southeast-1", | ||
| "ap-southeast-2", | ||
| "ap-east-2", | ||
| "ap-southeast-7", | ||
| "ap-northeast-1", | ||
| "ca-central-1", | ||
| "ca-west-1", | ||
| "eu-central-1", | ||
| "eu-west-1", | ||
| "eu-west-2", | ||
| "eu-south-1", | ||
| "eu-west-3", | ||
| "eu-south-2", | ||
| "eu-north-1", | ||
| "eu-central-2", | ||
| "il-central-1", | ||
| "mx-central-1", | ||
| "me-south-1", | ||
| "me-central-1", | ||
| "sa-east-1", | ||
| "us-gov-east-1", | ||
| "us-gov-west-1" | ||
| }; | ||
|
|
||
| /// 's3-us-west-2' is a legacy region format for S3 storage, equals to 'us-west-2' | ||
| /// See https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#VirtualHostingBackwardsCompatibility | ||
| if (region.substr(0, 3) == "s3-") | ||
| region = region.substr(3); | ||
|
|
||
| return regions.contains(region); | ||
| } | ||
|
|
||
| void URI::addRegionToURI(const std::string &region) | |
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed during our call, the current use of this method is already "checking" if region is in the endpoint, but it's probably doing it wrong. Please correct the call site, add docs and examples. It might be a good idea to rename this method to be
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand the code in Client.cpp. In the response, the URL comes with the region.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Initial error: |
||
| { | ||
| if (auto pos = endpoint.find(".amazonaws.com"); pos != std::string::npos) | ||
| { | ||
| if (pos > 0) | ||
| { /// Check if region is already in endpoint to avoid add it second time | ||
| auto prev_pos = endpoint.find_last_of("/.", pos - 1); | ||
| if (prev_pos == std::string::npos) | ||
| prev_pos = 0; | ||
| else | ||
| ++prev_pos; | ||
| std::string_view endpoint_region = std::string_view(endpoint).substr(prev_pos, pos - prev_pos); | ||
| if (isAWSRegion(endpoint_region)) | ||
| return; | ||
| } | ||
| endpoint = endpoint.substr(0, pos) + "." + region + endpoint.substr(pos); | ||
| } | ||
| } | ||
|
|
||
| void URI::validateBucket(const String & bucket, const Poco::URI & uri) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,11 @@ using namespace DB; | |
| // This function is used to get the file path inside the directory which corresponds to iceberg table from the full blob path which is written in manifest and metadata files. | ||
| // For example, if the full blob path is s3://bucket/table_name/data/00000-1-1234567890.avro, the function will return table_name/data/00000-1-1234567890.avro | ||
| // Common path should end with "<table_name>" or "<table_name>/". | ||
| std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::string_view common_path, std::string_view table_location) | ||
| std::string getProperFilePathFromMetadataInfo( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We've spent quite some time reviewing this function together and haven't understood it fully. Please document the possible values and examples for all the arguments here, where they come from and the scenarios. That'll make reviewing this much easier.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| std::string_view data_path, | ||
| std::string_view common_path, | ||
| std::string_view table_location, | ||
| std::string_view common_namespace) | ||
| { | ||
| auto trim_backward_slash = [](std::string_view str) -> std::string_view | ||
| { | ||
|
|
@@ -84,7 +88,20 @@ std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::s | |
| } | ||
| else | ||
| { | ||
| throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Expected to find '{}' in data path: '{}'", common_path, data_path); | ||
| /// Data files can have different path | ||
| pos = data_path.find("://"); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The method docs say:
Which means the As far as I understand your exception handling, you skip the initial
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Try to remove bucket
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same |
||
| if (pos == std::string::npos) | ||
| throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unexpected data path: '{}'", data_path); | ||
| pos = data_path.find("/", pos + 3); | ||
| if (pos == std::string::npos) | ||
| throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unexpected data path: '{}'", data_path); | ||
| if (data_path.substr(pos + 1).starts_with(common_namespace)) | ||
| { | ||
| auto new_pos = data_path.find("/", pos + 1); | ||
| if (new_pos - pos == common_namespace.length() + 1) /// bucket in the path | ||
| pos = new_pos; | ||
| } | ||
| return std::string(data_path.substr(pos)); | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wouldn't it be easier to just instantiate
S3::URI? AFAIK, it should support all types of S3 endpoints and its constructor takes a URI string. https://github.com/ClickHouse/ClickHouse/blob/fbd99df81d18fac4c1e26f665d3bba316775bfd4/src/IO/S3/URI.cpp#L114
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure. Here the location is just cut into two pieces, whereas Poco::URI does a full analysis, splitting it into all of its parts, which later need to be concatenated back.