From 17eb4e2826aa683407df3d673e5099ee1ff52055 Mon Sep 17 00:00:00 2001
From: Your Name <146047128+strtgbb@users.noreply.github.com>
Date: Tue, 18 Mar 2025 10:11:44 -0400
Subject: [PATCH 01/67] Fix combined report for release

---
 .github/workflows/release_branches.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml
index e70fd154996a..a850de2b2fa3 100644
--- a/.github/workflows/release_branches.yml
+++ b/.github/workflows/release_branches.yml
@@ -633,7 +633,7 @@ jobs:
           CHECKS_DATABASE_USER: ${{ secrets.CHECKS_DATABASE_USER }}
           CHECKS_DATABASE_PASSWORD: ${{ secrets.CHECKS_DATABASE_PASSWORD }}
           COMMIT_SHA: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          PR_NUMBER: ${{ github.event.number }}
+          PR_NUMBER: ${{ github.event.pull_request.number || 0 }}
           ACTIONS_RUN_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }}
         shell: bash
         run: |

From f4681e89f5a08ccb265e11393064a06b96c987c2 Mon Sep 17 00:00:00 2001
From: Your Name <146047128+strtgbb@users.noreply.github.com>
Date: Tue, 18 Mar 2025 11:13:15 -0400
Subject: [PATCH 02/67] Update regression hash with fixes for tiered storage cleanup

---
 .github/workflows/release_branches.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml
index a850de2b2fa3..169a6972f48c 100644
--- a/.github/workflows/release_branches.yml
+++ b/.github/workflows/release_branches.yml
@@ -539,7 +539,7 @@ jobs:
     secrets: inherit
     with:
       runner_type: altinity-on-demand, altinity-type-cpx51, altinity-image-x86-app-docker-ce, altinity-setup-regression
-      commit: 11dcb1ad771e6afebcb06bcc7bf1c1d8b184d838
+      commit: bd31e738c0cedaca253d15a05ed245c41b6e0b6a
       arch: release
       build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout_minutes: 300
@@ -550,7 +550,7 @@ jobs:
     secrets: inherit
     with:
       runner_type: altinity-on-demand, altinity-type-cax41, altinity-image-arm-app-docker-ce, altinity-setup-regression
-      commit: 11dcb1ad771e6afebcb06bcc7bf1c1d8b184d838
+      commit: bd31e738c0cedaca253d15a05ed245c41b6e0b6a
       arch: aarch64
       build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout_minutes: 300

From 02eaeff8a557d01b138372a60a761e512482e833 Mon Sep 17 00:00:00 2001
From: Your Name <146047128+strtgbb@users.noreply.github.com>
Date: Tue, 18 Mar 2025 13:19:26 -0400
Subject: [PATCH 03/67] support new regression job.retry attr

---
 .github/create_combined_ci_report.py |  1 +
 .github/workflows/regression.yml     | 18 +++++++++---------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/.github/create_combined_ci_report.py b/.github/create_combined_ci_report.py
index 2607760e4abb..8a58f7de2cb6 100755
--- a/.github/create_combined_ci_report.py
+++ b/.github/create_combined_ci_report.py
@@ -117,6 +117,7 @@ def get_regression_fails(client: Client, job_url: str):
         WHERE job_url='{job_url}'
         AND status IN ('Fail', 'Error')
     """
+    print(query)
     df = client.query_dataframe(query)
     df = drop_prefix_rows(df, "test_name")
     df["job_name"] = df["job_name"].str.title()
diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml
index 576f68300b29..4a2d519fe7c9 100644
--- a/.github/workflows/regression.yml
+++ b/.github/workflows/regression.yml
@@ -179,7 +179,7 @@ jobs:
           python3
           -u ${{ env.SUITE }}/regression.py
--clickhouse-binary-path ${{ env.clickhouse_path }} - --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" + --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.retry=$GITHUB_RUN_ATTEMPT job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" ${{ env.args }} || EXITCODE=$?; .github/add_link_to_logs.sh; exit $EXITCODE @@ -243,7 +243,7 @@ jobs: -u alter/regression.py --clickhouse-binary-path ${{ env.clickhouse_path }} --only "/alter/${{ matrix.ONLY }} partition/*" - --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" + --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.retry=$GITHUB_RUN_ATTEMPT job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" ${{ env.args }} || EXITCODE=$?; .github/add_link_to_logs.sh; exit $EXITCODE @@ -314,7 +314,7 @@ jobs: --aws-s3-region ${{ secrets.REGRESSION_AWS_S3_REGION }} --aws-s3-key-id ${{ secrets.REGRESSION_AWS_S3_KEY_ID }} --aws-s3-access-key ${{ secrets.REGRESSION_AWS_S3_SECRET_ACCESS_KEY }} - --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" + --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.retry=$GITHUB_RUN_ATTEMPT job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" ${{ env.args }} || EXITCODE=$?; .github/add_link_to_logs.sh; exit $EXITCODE @@ -374,7 +374,7 @@ jobs: -u ${{ env.SUITE }}/regression.py --ssl --clickhouse-binary-path ${{ env.clickhouse_path }} - --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB 
job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" + --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.retry=$GITHUB_RUN_ATTEMPT job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" ${{ env.args }} || EXITCODE=$?; .github/add_link_to_logs.sh; exit $EXITCODE @@ -436,7 +436,7 @@ jobs: python3 -u ${{ env.SUITE }}/regression.py --clickhouse-binary-path ${{ env.clickhouse_path }} - --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" + --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.retry=$GITHUB_RUN_ATTEMPT job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" ${{ env.args }} || EXITCODE=$?; .github/add_link_to_logs.sh; exit $EXITCODE @@ -494,7 +494,7 @@ jobs: python3 -u ${{ env.SUITE }}/regression.py --clickhouse-binary-path ${{ env.clickhouse_path }} - --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" + --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.retry=$GITHUB_RUN_ATTEMPT job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" ${{ env.args }} || EXITCODE=$?; .github/add_link_to_logs.sh; exit $EXITCODE @@ -562,7 +562,7 @@ jobs: --aws-s3-region ${{ secrets.REGRESSION_AWS_S3_REGION }} --aws-s3-key-id ${{ secrets.REGRESSION_AWS_S3_KEY_ID }} --aws-s3-access-key ${{ secrets.REGRESSION_AWS_S3_SECRET_ACCESS_KEY }} - --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" + --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB 
job.retry=$GITHUB_RUN_ATTEMPT job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" ${{ env.args }} || EXITCODE=$?; .github/add_link_to_logs.sh; exit $EXITCODE @@ -636,7 +636,7 @@ jobs: --azure-account-name ${{ secrets.AZURE_ACCOUNT_NAME }} --azure-storage-key ${{ secrets.AZURE_STORAGE_KEY }} --azure-container ${{ secrets.AZURE_CONTAINER_NAME }} - --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" + --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.retry=$GITHUB_RUN_ATTEMPT job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" ${{ env.args }} || EXITCODE=$?; .github/add_link_to_logs.sh; exit $EXITCODE @@ -706,7 +706,7 @@ jobs: --gcs-key-secret ${{ secrets.REGRESSION_GCS_KEY_SECRET }} --gcs-uri ${{ secrets.REGRESSION_GCS_URI }} --with-${{ matrix.STORAGE }} - --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" + --attr project="$GITHUB_REPOSITORY" project.id="$GITHUB_REPOSITORY_ID" package="${{ env.clickhouse_path }}" version="${{ env.version }}" user.name="$GITHUB_ACTOR" repository="https://github.com/Altinity/clickhouse-regression" commit.hash="$(git rev-parse HEAD)" job.name=$GITHUB_JOB job.retry=$GITHUB_RUN_ATTEMPT job.url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" arch="$(uname -i)" ${{ env.args }} || EXITCODE=$?; .github/add_link_to_logs.sh; exit $EXITCODE From ddb6587efe8e76ed75cb31c5dabc967b4f111500 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Tue, 18 Mar 2025 13:35:02 -0400 Subject: [PATCH 04/67] fix bug in combined report --- .github/create_combined_ci_report.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/create_combined_ci_report.py b/.github/create_combined_ci_report.py index 8a58f7de2cb6..07b6a80763bb 100755 --- a/.github/create_combined_ci_report.py +++ b/.github/create_combined_ci_report.py @@ -111,13 +111,13 @@ def get_regression_fails(client: Client, job_url: str): job_name, report_url as results_link FROM `gh-data`.clickhouse_regression_results - GROUP BY architecture, test_name, job_url, job_name, report_url, start_time - ORDER BY start_time DESC, length(test_name) DESC + GROUP BY architecture, test_name, job_url, job_name, report_url + ORDER BY length(test_name) DESC ) WHERE job_url='{job_url}' AND status IN ('Fail', 'Error') """ - print(query) + df = client.query_dataframe(query) df = drop_prefix_rows(df, "test_name") df["job_name"] = df["job_name"].str.title() From d174862eb20168b69169265a6dafabddb2f7e0cb Mon Sep 17 00:00:00 2001 From: MyroTk 
Date: Wed, 19 Mar 2025 17:27:03 -0400 Subject: [PATCH 05/67] get docker credentials from secrets --- tests/ci/docker_images_helper.py | 5 ++--- tests/ci/env_helper.py | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/ci/docker_images_helper.py b/tests/ci/docker_images_helper.py index 9faefef12c24..219fdc00e20a 100644 --- a/tests/ci/docker_images_helper.py +++ b/tests/ci/docker_images_helper.py @@ -6,8 +6,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional -from env_helper import ROOT_DIR, DOCKER_TAG -from get_robot_token import get_parameter_from_ssm +from env_helper import ROOT_DIR, DOCKER_TAG, DOCKER_PASSWORD from ci_utils import Shell IMAGES_FILE_PATH = Path("docker/images.json") @@ -22,7 +21,7 @@ def docker_login(relogin: bool = True) -> None: Shell.check( # pylint: disable=unexpected-keyword-arg "docker login --username 'altinityinfra' --password-stdin", strict=True, - stdin_str=get_parameter_from_ssm("dockerhub-password"), + stdin_str=DOCKER_PASSWORD, encoding="utf-8", ) diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py index cad781a17445..3240c6a49756 100644 --- a/tests/ci/env_helper.py +++ b/tests/ci/env_helper.py @@ -16,6 +16,7 @@ REPORT_PATH = f"{TEMP_PATH}/reports" # FIXME: latest should not be used in CI, set temporary for transition to "docker with digest as a tag" DOCKER_TAG = os.getenv("DOCKER_TAG", "latest") +DOCKER_PASSWORD = os.getenv("DOCKER_PASSWORD") CACHES_PATH = os.getenv("CACHES_PATH", TEMP_PATH) CLOUDFLARE_TOKEN = os.getenv("CLOUDFLARE_TOKEN") GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH", "") From 581666df4aebca5ec69b7745ccb1952640eab4ff Mon Sep 17 00:00:00 2001 From: MyroTk Date: Wed, 19 Mar 2025 17:29:19 -0400 Subject: [PATCH 06/67] add docker password to env --- .github/workflows/release_branches.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index e70fd154996a..f6df8b0e4123 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -7,6 +7,7 @@ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} on: # yamllint disable-line rule:truthy pull_request: From a31e6b4b23fec95fbd0a05377c024249f05a7ade Mon Sep 17 00:00:00 2001 From: MyroTk Date: Fri, 21 Mar 2025 11:11:40 -0400 Subject: [PATCH 07/67] add secrets to env and prevent aws leaks --- .github/create_combined_ci_report.py | 2 +- .github/workflows/release_branches.yml | 4 ++++ .github/workflows/reusable_sign.yml | 10 +++++---- .github/workflows/reusable_test.yml | 9 ++++++-- docker/packager/packager | 30 ++++++++++++++++++-------- tests/ci/clickhouse_helper.py | 7 +++--- tests/ci/env_helper.py | 7 +++++- tests/ci/get_robot_token.py | 5 +---- 8 files changed, 50 insertions(+), 24 deletions(-) diff --git a/.github/create_combined_ci_report.py b/.github/create_combined_ci_report.py index 2607760e4abb..2a55f20a6dba 100755 --- a/.github/create_combined_ci_report.py +++ b/.github/create_combined_ci_report.py @@ -13,7 +13,7 @@ DATABASE_HOST_VAR = "CHECKS_DATABASE_HOST" DATABASE_USER_VAR = "CHECKS_DATABASE_USER" DATABASE_PASSWORD_VAR = "CHECKS_DATABASE_PASSWORD" -S3_BUCKET = "altinity-build-artifacts" +S3_BUCKET = "altinity-test-new-credentials" def get_checks_fails(client: Client, job_url: str): diff --git a/.github/workflows/release_branches.yml 
b/.github/workflows/release_branches.yml index f6df8b0e4123..c02206a14ca3 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -7,7 +7,11 @@ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + CLICKHOUSE_TEST_STAT_LOGIN: ${{ secrets.CLICKHOUSE_TEST_STAT_LOGIN }} + CLICKHOUSE_TEST_STAT_PASSWORD: ${{ secrets.CLICKHOUSE_TEST_STAT_PASSWORD }} + CLICKHOUSE_TEST_STAT_URL: ${{ secrets.CLICKHOUSE_TEST_STAT_URL }} DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + ROBOT_TOKEN: ${{ secrets.ROBOT_TOKEN }} on: # yamllint disable-line rule:truthy pull_request: diff --git a/.github/workflows/reusable_sign.yml b/.github/workflows/reusable_sign.yml index 2bd8ae430fa5..fee2f9131469 100644 --- a/.github/workflows/reusable_sign.yml +++ b/.github/workflows/reusable_sign.yml @@ -1,7 +1,4 @@ -### For the pure soul wishes to move it to another place -# https://github.com/orgs/community/discussions/9050 - -name: Testing workflow +name: Sigining workflow 'on': workflow_call: inputs: @@ -63,6 +60,11 @@ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + CLICKHOUSE_TEST_STAT_LOGIN: ${{ secrets.CLICKHOUSE_TEST_STAT_LOGIN }} + CLICKHOUSE_TEST_STAT_PASSWORD: ${{ secrets.CLICKHOUSE_TEST_STAT_PASSWORD }} + CLICKHOUSE_TEST_STAT_URL: ${{ secrets.CLICKHOUSE_TEST_STAT_URL }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + ROBOT_TOKEN: ${{ secrets.ROBOT_TOKEN }} jobs: runner_labels_setup: diff --git a/.github/workflows/reusable_test.yml b/.github/workflows/reusable_test.yml index 4867cab03480..2c293a9e1eb7 100644 --- a/.github/workflows/reusable_test.yml +++ b/.github/workflows/reusable_test.yml @@ -44,10 +44,10 @@ name: Testing workflow description: if given, it's passed to the environments required: false AWS_SECRET_ACCESS_KEY: - description: the access key to the aws param store. + description: the access key to the aws s3 bucket. required: true AWS_ACCESS_KEY_ID: - description: the access key id to the aws param store. + description: the access key id to the aws s3 bucket. 
required: true env: @@ -57,6 +57,11 @@ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + CLICKHOUSE_TEST_STAT_LOGIN: ${{ secrets.CLICKHOUSE_TEST_STAT_LOGIN }} + CLICKHOUSE_TEST_STAT_PASSWORD: ${{ secrets.CLICKHOUSE_TEST_STAT_PASSWORD }} + CLICKHOUSE_TEST_STAT_URL: ${{ secrets.CLICKHOUSE_TEST_STAT_URL }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + ROBOT_TOKEN: ${{ secrets.ROBOT_TOKEN }} jobs: runner_labels_setup: diff --git a/docker/packager/packager b/docker/packager/packager index 7975b6a03aae..2c28bb891b64 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -6,7 +6,9 @@ import os import subprocess import sys from pathlib import Path -from typing import List, Optional +from typing import Dict, List, Optional + +from env_helper import TEMP_PATH SCRIPT_PATH = Path(__file__).absolute() IMAGE_TYPE = "binary-builder" @@ -82,9 +84,22 @@ def run_docker_image_with_env( ch_root: Path, cargo_cache_dir: Path, ccache_dir: Optional[Path], + aws_secrets : Optional[Dict[str:str]] ) -> None: output_dir.mkdir(parents=True, exist_ok=True) cargo_cache_dir.mkdir(parents=True, exist_ok=True) + extra_parts = "" + + if aws_secrets: + # Pass AWS credentials via file rather than via env to avoid leaking secrets + env_part["AWS_CONFIG_FILE"] = "/aws_config" + host_aws_config_file_path = Path(TEMP_PATH) / 'aws_config' + with open(host_aws_config_file_path, 'wt') as f: + f.write("[default]") + for key, value in aws_secrets.items(): + f.write(f"{key}={value}") + + extra_parts = f" --volume={host_aws_config_file_path}:{env_part["AWS_CONFIG_FILE"]} " env_part = " -e ".join(env_variables) if env_part: @@ -107,6 +122,7 @@ def run_docker_image_with_env( cmd = ( f"docker run --network=host --user={user} --rm {ccache_mount} " f"--volume={output_dir}:/output --volume={ch_root}:/build {env_part} " + f" {extra_parts} " f"--volume={cargo_cache_dir}:/rust/cargo/registry {interactive} {image_name}" ) @@ -130,11 +146,9 @@ def parse_env_variables( sanitizer: str, package_type: str, cache: str, - s3_access_key_id: str, s3_bucket: str, s3_directory: str, s3_rw_access: bool, - s3_secret_access_key: str, clang_tidy: bool, version: str, official: bool, @@ -323,10 +337,6 @@ def parse_env_variables( result.append(f"SCCACHE_S3_KEY_PREFIX={sccache_dir}") if not s3_rw_access: result.append("SCCACHE_S3_NO_CREDENTIALS=true") - if s3_access_key_id: - result.append(f"AWS_ACCESS_KEY_ID={s3_access_key_id}") - if s3_secret_access_key: - result.append(f"AWS_SECRET_ACCESS_KEY={s3_secret_access_key}") if clang_tidy: # `CTCACHE_DIR` has the same purpose as the `CCACHE_DIR` above. 
@@ -544,11 +554,9 @@ def main() -> None: args.sanitizer, args.package_type, args.cache, - args.s3_access_key_id, args.s3_bucket, args.s3_directory, args.s3_rw_access, - args.s3_secret_access_key, args.clang_tidy, args.version, args.official, @@ -567,6 +575,10 @@ def main() -> None: ch_root, args.cargo_cache_dir, args.ccache_dir, + { + "aws_access_key_id" : args.s3_access_key_id, + "aws_secret_access_key" : args.s3_secret_access_key + } ) logging.info("Output placed into %s", args.output_dir) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 422e1738701a..e49e2f2d0b7e 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -9,6 +9,7 @@ import requests +from env_helper import CLICKHOUSE_TEST_STAT_URL, CLICKHOUSE_TEST_STAT_PASSWORD, CLICKHOUSE_TEST_STAT_LOGIN from get_robot_token import get_parameter_from_ssm from pr_info import PRInfo from report import TestResults @@ -27,12 +28,12 @@ def __init__( self, url: Optional[str] = None, auth: Optional[Dict[str, str]] = None ): if url is None: - url = get_parameter_from_ssm("clickhouse-test-stat-url") + url = CLICKHOUSE_TEST_STAT_URL self.url = url self.auth = auth or { - "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"), - "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"), + "X-ClickHouse-User": CLICKHOUSE_TEST_STAT_LOGIN, + "X-ClickHouse-Key": CLICKHOUSE_TEST_STAT_PASSWORD, } @staticmethod diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py index 3240c6a49756..11eae74689d2 100644 --- a/tests/ci/env_helper.py +++ b/tests/ci/env_helper.py @@ -16,7 +16,6 @@ REPORT_PATH = f"{TEMP_PATH}/reports" # FIXME: latest should not be used in CI, set temporary for transition to "docker with digest as a tag" DOCKER_TAG = os.getenv("DOCKER_TAG", "latest") -DOCKER_PASSWORD = os.getenv("DOCKER_PASSWORD") CACHES_PATH = os.getenv("CACHES_PATH", TEMP_PATH) CLOUDFLARE_TOKEN = os.getenv("CLOUDFLARE_TOKEN") GITHUB_EVENT_PATH = os.getenv("GITHUB_EVENT_PATH", "") @@ -47,6 +46,12 @@ ) CI_CONFIG_PATH = f"{TEMP_PATH}/ci_config.json" +CLICKHOUSE_TEST_STAT_LOGIN = os.getenv("CLICKHOUSE_TEST_STAT_LOGIN") +CLICKHOUSE_TEST_STAT_PASSWORD = os.getenv("CLICKHOUSE_TEST_STAT_PASSWORD") +CLICKHOUSE_TEST_STAT_URL = os.getenv("CLICKHOUSE_TEST_STAT_URL") +DOCKER_PASSWORD = os.getenv("DOCKER_PASSWORD") +ROBOT_TOKEN = os.getenv("ROBOT_TOKEN") + # These parameters are set only on demand, and only once _GITHUB_JOB_ID = "" _GITHUB_JOB_URL = "" diff --git a/tests/ci/get_robot_token.py b/tests/ci/get_robot_token.py index a218202fd76e..68a35259ac2e 100644 --- a/tests/ci/get_robot_token.py +++ b/tests/ci/get_robot_token.py @@ -10,6 +10,7 @@ from github.GithubException import BadCredentialsException from github.NamedUser import NamedUser +from env_helper import ROBOT_TOKEN @dataclass class Token: @@ -66,10 +67,6 @@ def get_parameters_from_ssm( def get_best_robot_token(token_prefix_env_name="github_robot_token"): # Re-use already fetched token (same as in get_best_robot_token_original) # except here we assume it is always a string (since we use only one token and don't do token rotation) - global ROBOT_TOKEN - if ROBOT_TOKEN is not None: - return ROBOT_TOKEN - ROBOT_TOKEN = get_parameter_from_ssm(token_prefix_env_name) return ROBOT_TOKEN def get_best_robot_token_original(tokens_path: str = "/github-tokens") -> str: From a6d14f930864e332a82801bae292e33971ac7509 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Fri, 21 Mar 2025 11:19:07 -0400 Subject: [PATCH 08/67] use temp test bucket --- 
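Note: this only swaps the fallback literals; a minimal sketch of how the bucket name is
actually resolved at runtime (illustrative, not part of the diff below):

    import os
    # The S3_BUILDS_BUCKET value exported by the workflow environment always wins;
    # the hard-coded test bucket is only the fallback default.
    S3_BUILDS_BUCKET = os.getenv("S3_BUILDS_BUCKET", "altinity-test-new-credentials")

so a job can be pointed back at altinity-build-artifacts by exporting the variable,
without editing tests/ci/env_helper.py.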
tests/ci/env_helper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py index 11eae74689d2..eb4103af3107 100644 --- a/tests/ci/env_helper.py +++ b/tests/ci/env_helper.py @@ -33,10 +33,10 @@ RUNNER_TEMP = os.getenv("RUNNER_TEMP", p.abspath(p.join(module_dir, "./tmp"))) S3_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") -S3_BUILDS_BUCKET = os.getenv("S3_BUILDS_BUCKET", "altinity-build-artifacts") -S3_BUILDS_BUCKET_PUBLIC = "altinity-build-artifacts" +S3_BUILDS_BUCKET = os.getenv("S3_BUILDS_BUCKET", "altinity-test-new-credentials") +S3_BUILDS_BUCKET_PUBLIC = "altinity-test-new-credentials" S3_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") -S3_TEST_REPORTS_BUCKET = os.getenv("S3_TEST_REPORTS_BUCKET", "altinity-build-artifacts") +S3_TEST_REPORTS_BUCKET = os.getenv("S3_TEST_REPORTS_BUCKET", "altinity-test-new-credentials") S3_URL = os.getenv("S3_URL", "https://s3.amazonaws.com") S3_DOWNLOAD = os.getenv("S3_DOWNLOAD", S3_URL) From 9a0784c55519043909373c76c27fb914db713ed5 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Fri, 21 Mar 2025 11:31:37 -0400 Subject: [PATCH 09/67] src upload --- .github/workflows/reusable_build.yml | 5 +++++ tests/ci/build_check.py | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/.github/workflows/reusable_build.yml b/.github/workflows/reusable_build.yml index c64d0aaec500..6338328f4be4 100644 --- a/.github/workflows/reusable_build.yml +++ b/.github/workflows/reusable_build.yml @@ -88,6 +88,11 @@ jobs: uses: ./.github/actions/common_setup with: job_type: build_check + - name: Create source tar + run: | + mkdir -p "$TEMP_PATH/build_check/package_release" + cd .. && tar czf $TEMP_PATH/build_source.src.tar.gz ClickHouse/ + cd $TEMP_PATH && tar xvzf $TEMP_PATH/build_source.src.tar.gz - name: Pre run: | python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --pre --job-name '${{inputs.build_name}}' diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 5b59fb92c7aa..4b0eef9c44cc 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -14,6 +14,7 @@ from git_helper import Git from pr_info import PRInfo, EventType from report import FAILURE, SUCCESS, JobReport, StatusType +from s3_helper import S3Helper from stopwatch import Stopwatch from tee_popen import TeePopen from version_helper import ( @@ -223,6 +224,28 @@ def main(): f"sudo chown -R ubuntu:ubuntu {build_output_path}", shell=True ) logging.info("Build finished as %s, log path %s", build_status, log_path) + + s3_helper = S3Helper() + s3_path_prefix = "/".join( + ( + get_release_or_pr(pr_info, get_version_from_repo())[0], + pr_info.sha, + build_name, + ) + ) + src_path = temp_path / "build_source.src.tar.gz" + s3_path = s3_path_prefix + "/clickhouse-" + version.string + ".src.tar.gz" + logging.info("s3_path %s", s3_path) + if src_path.exists(): + src_url = s3_helper.upload_build_file_to_s3( + src_path, s3_path + ) + logging.info("Source tar %s", src_url) + print(f"::notice ::Source tar URL: {src_url}") + else: + logging.info("Source tar doesn't exist") + print("Source tar doesn't exist") + if build_status != SUCCESS: # We check if docker works, because if it's down, it's infrastructure try: From 0377def587785a67a62a9bdf7e1f787da309d85b Mon Sep 17 00:00:00 2001 From: MyroTk Date: Fri, 21 Mar 2025 11:38:25 -0400 Subject: [PATCH 10/67] fix syntax issue --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager 
b/docker/packager/packager index 2c28bb891b64..b6a4fb49e520 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -99,7 +99,7 @@ def run_docker_image_with_env( for key, value in aws_secrets.items(): f.write(f"{key}={value}") - extra_parts = f" --volume={host_aws_config_file_path}:{env_part["AWS_CONFIG_FILE"]} " + extra_parts = f" --volume={host_aws_config_file_path}:{env_part['AWS_CONFIG_FILE']} " env_part = " -e ".join(env_variables) if env_part: From af1aeef8673c55bde35ad342d9b04c5743710e59 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Fri, 21 Mar 2025 11:52:34 -0400 Subject: [PATCH 11/67] avoid using env_helper module in packager --- docker/packager/packager | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/packager/packager b/docker/packager/packager index b6a4fb49e520..2e855ebd6478 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -5,15 +5,16 @@ import logging import os import subprocess import sys +from os import path as p from pathlib import Path from typing import Dict, List, Optional -from env_helper import TEMP_PATH +module_dir = p.abspath(p.dirname(__file__)) SCRIPT_PATH = Path(__file__).absolute() IMAGE_TYPE = "binary-builder" IMAGE_NAME = f"altinityinfra/{IMAGE_TYPE}" - +TEMP_PATH = os.getenv("TEMP_PATH", p.abspath(p.join(module_dir, "./tmp"))) class BuildException(Exception): pass From a1955aebb8c207e295184e98e145185e4d32b8e7 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Fri, 21 Mar 2025 12:37:31 -0400 Subject: [PATCH 12/67] fix dict issue --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index 2e855ebd6478..2ab5cfa9cea8 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -85,7 +85,7 @@ def run_docker_image_with_env( ch_root: Path, cargo_cache_dir: Path, ccache_dir: Optional[Path], - aws_secrets : Optional[Dict[str:str]] + aws_secrets : Optional[Dict[str,str]] ) -> None: output_dir.mkdir(parents=True, exist_ok=True) cargo_cache_dir.mkdir(parents=True, exist_ok=True) From 93a391354805d91b32b919a8696c2fe4a85d2b53 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Fri, 21 Mar 2025 14:31:33 -0400 Subject: [PATCH 13/67] fix dict --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index 2ab5cfa9cea8..57cb7e6e6362 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -93,7 +93,7 @@ def run_docker_image_with_env( if aws_secrets: # Pass AWS credentials via file rather than via env to avoid leaking secrets - env_part["AWS_CONFIG_FILE"] = "/aws_config" + env_part = {"AWS_CONFIG_FILE": "/aws_config"} host_aws_config_file_path = Path(TEMP_PATH) / 'aws_config' with open(host_aws_config_file_path, 'wt') as f: f.write("[default]") From f06682cd33693e7107661a2a10fba535da803dea Mon Sep 17 00:00:00 2001 From: MyroTk Date: Fri, 21 Mar 2025 15:42:26 -0400 Subject: [PATCH 14/67] update envs in build script --- .github/workflows/release_branches.yml | 2 +- .github/workflows/reusable_build.yml | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index c02206a14ca3..1b4ce0995e15 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -21,7 +21,7 @@ on: # yamllint disable-line rule:truthy - opened branches: # Anything/24.8 (e.g customizations/24.8.x) - - '**/24.8*' + - '**24.8*' 
release: types: - published diff --git a/.github/workflows/reusable_build.yml b/.github/workflows/reusable_build.yml index c64d0aaec500..1cd3f410f996 100644 --- a/.github/workflows/reusable_build.yml +++ b/.github/workflows/reusable_build.yml @@ -7,6 +7,11 @@ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + CLICKHOUSE_TEST_STAT_LOGIN: ${{ secrets.CLICKHOUSE_TEST_STAT_LOGIN }} + CLICKHOUSE_TEST_STAT_PASSWORD: ${{ secrets.CLICKHOUSE_TEST_STAT_PASSWORD }} + CLICKHOUSE_TEST_STAT_URL: ${{ secrets.CLICKHOUSE_TEST_STAT_URL }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + ROBOT_TOKEN: ${{ secrets.ROBOT_TOKEN }} name: Build ClickHouse 'on': From 0f5c6c51be000f7408998fb481f98230291d9ebd Mon Sep 17 00:00:00 2001 From: MyroTk Date: Mon, 24 Mar 2025 10:13:25 -0400 Subject: [PATCH 15/67] aws config file fix --- docker/packager/packager | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/packager/packager b/docker/packager/packager index 57cb7e6e6362..57a76106eb25 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -98,9 +98,9 @@ def run_docker_image_with_env( with open(host_aws_config_file_path, 'wt') as f: f.write("[default]") for key, value in aws_secrets.items(): - f.write(f"{key}={value}") + f.write(f"\n{key}={value}") - extra_parts = f" --volume={host_aws_config_file_path}:{env_part['AWS_CONFIG_FILE']} " + extra_parts = f"--volume={host_aws_config_file_path}:{env_part['AWS_CONFIG_FILE']}" env_part = " -e ".join(env_variables) if env_part: From 288a32fce611b5ce202760164a9507b27ed0eaf6 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Mon, 24 Mar 2025 10:44:50 -0400 Subject: [PATCH 16/67] update aws config path inside docker --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index 57a76106eb25..df69ca3058ac 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -93,7 +93,7 @@ def run_docker_image_with_env( if aws_secrets: # Pass AWS credentials via file rather than via env to avoid leaking secrets - env_part = {"AWS_CONFIG_FILE": "/aws_config"} + env_part = {"AWS_CONFIG_FILE": "/root/.aws/config"} host_aws_config_file_path = Path(TEMP_PATH) / 'aws_config' with open(host_aws_config_file_path, 'wt') as f: f.write("[default]") From eff4415937c6ef3728db85cd700c8e5909a9ec50 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Mon, 24 Mar 2025 10:59:55 -0400 Subject: [PATCH 17/67] aws credentials path inside docker --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index df69ca3058ac..6c421ee8f99e 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -93,7 +93,7 @@ def run_docker_image_with_env( if aws_secrets: # Pass AWS credentials via file rather than via env to avoid leaking secrets - env_part = {"AWS_CONFIG_FILE": "/root/.aws/config"} + env_part = {"AWS_CONFIG_FILE": "/root/.aws/credentials"} host_aws_config_file_path = Path(TEMP_PATH) / 'aws_config' with open(host_aws_config_file_path, 'wt') as f: f.write("[default]") From 9ac681fbbbb64a75aeb876859ad9ee41fe2344c6 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Mon, 24 Mar 2025 12:50:10 -0400 Subject: [PATCH 18/67] move aws credentials to the correct user inside container --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/docker/packager/packager b/docker/packager/packager index 6c421ee8f99e..0b6bb4e739f8 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -93,7 +93,7 @@ def run_docker_image_with_env( if aws_secrets: # Pass AWS credentials via file rather than via env to avoid leaking secrets - env_part = {"AWS_CONFIG_FILE": "/root/.aws/credentials"} + env_part = {"AWS_CONFIG_FILE": "/clickhouse/.aws/credentials"} host_aws_config_file_path = Path(TEMP_PATH) / 'aws_config' with open(host_aws_config_file_path, 'wt') as f: f.write("[default]") From abf28f578a7ab11111c414b981e1add69799a45c Mon Sep 17 00:00:00 2001 From: MyroTk Date: Mon, 24 Mar 2025 13:36:14 -0400 Subject: [PATCH 19/67] change aws credentials directory. again. --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index 0b6bb4e739f8..d7cc2ac807bc 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -93,7 +93,7 @@ def run_docker_image_with_env( if aws_secrets: # Pass AWS credentials via file rather than via env to avoid leaking secrets - env_part = {"AWS_CONFIG_FILE": "/clickhouse/.aws/credentials"} + env_part = {"AWS_CONFIG_FILE": "/home/clickhouse/.aws/credentials"} host_aws_config_file_path = Path(TEMP_PATH) / 'aws_config' with open(host_aws_config_file_path, 'wt') as f: f.write("[default]") From 95644cbc9c08cbd5cffa4ce7991886a32cac83a9 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Mon, 24 Mar 2025 19:49:26 -0400 Subject: [PATCH 20/67] switch back to altinitybuild0artifacts bucket --- .github/create_combined_ci_report.py | 2 +- tests/ci/env_helper.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/create_combined_ci_report.py b/.github/create_combined_ci_report.py index 2a55f20a6dba..2607760e4abb 100755 --- a/.github/create_combined_ci_report.py +++ b/.github/create_combined_ci_report.py @@ -13,7 +13,7 @@ DATABASE_HOST_VAR = "CHECKS_DATABASE_HOST" DATABASE_USER_VAR = "CHECKS_DATABASE_USER" DATABASE_PASSWORD_VAR = "CHECKS_DATABASE_PASSWORD" -S3_BUCKET = "altinity-test-new-credentials" +S3_BUCKET = "altinity-build-artifacts" def get_checks_fails(client: Client, job_url: str): diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py index eb4103af3107..11eae74689d2 100644 --- a/tests/ci/env_helper.py +++ b/tests/ci/env_helper.py @@ -33,10 +33,10 @@ RUNNER_TEMP = os.getenv("RUNNER_TEMP", p.abspath(p.join(module_dir, "./tmp"))) S3_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") -S3_BUILDS_BUCKET = os.getenv("S3_BUILDS_BUCKET", "altinity-test-new-credentials") -S3_BUILDS_BUCKET_PUBLIC = "altinity-test-new-credentials" +S3_BUILDS_BUCKET = os.getenv("S3_BUILDS_BUCKET", "altinity-build-artifacts") +S3_BUILDS_BUCKET_PUBLIC = "altinity-build-artifacts" S3_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") -S3_TEST_REPORTS_BUCKET = os.getenv("S3_TEST_REPORTS_BUCKET", "altinity-test-new-credentials") +S3_TEST_REPORTS_BUCKET = os.getenv("S3_TEST_REPORTS_BUCKET", "altinity-build-artifacts") S3_URL = os.getenv("S3_URL", "https://s3.amazonaws.com") S3_DOWNLOAD = os.getenv("S3_DOWNLOAD", S3_URL) From 530a662f43200bed05e46ad70999e32ddd1fece2 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Tue, 25 Mar 2025 13:07:05 -0400 Subject: [PATCH 21/67] Update secrets definition in workflows and temp path in packager --- .github/workflows/reusable_test.yml | 15 +++++++++++++++ docker/packager/packager | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/reusable_test.yml 
b/.github/workflows/reusable_test.yml index 2c293a9e1eb7..4cb00cc9e83f 100644 --- a/.github/workflows/reusable_test.yml +++ b/.github/workflows/reusable_test.yml @@ -49,6 +49,21 @@ name: Testing workflow AWS_ACCESS_KEY_ID: description: the access key id to the aws s3 bucket. required: true + CLICKHOUSE_TEST_STAT_LOGIN: + description: username for ci db. + required: true + CLICKHOUSE_TEST_STAT_PASSWORD: + description: password for ci db. + required: true + CLICKHOUSE_TEST_STAT_URL: + description: url for ci db. + required: true + DOCKER_PASSWORD: + description: token to upload docker images. + required: true + ROBOT_TOKEN: + description: token to update ci status. + required: true env: # Force the stdout and stderr streams to be unbuffered diff --git a/docker/packager/packager b/docker/packager/packager index d7cc2ac807bc..37ecaf6b6b44 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -14,7 +14,8 @@ module_dir = p.abspath(p.dirname(__file__)) SCRIPT_PATH = Path(__file__).absolute() IMAGE_TYPE = "binary-builder" IMAGE_NAME = f"altinityinfra/{IMAGE_TYPE}" -TEMP_PATH = os.getenv("TEMP_PATH", p.abspath(p.join(module_dir, "./tmp"))) +DEFAULT_TMP_PATH = SCRIPT_PATH.parent.absolute() / 'tmp' +TEMP_PATH = Path(os.getenv("TEMP_PATH", DEFAULT_TMP_PATH)) class BuildException(Exception): pass From 781e811c26cce8b95093ef93c106557f6ee9107e Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Tue, 25 Mar 2025 19:28:24 +0100 Subject: [PATCH 22/67] remved unused module_dir --- docker/packager/packager | 3 --- 1 file changed, 3 deletions(-) diff --git a/docker/packager/packager b/docker/packager/packager index 37ecaf6b6b44..c21bf41de4e7 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -5,12 +5,9 @@ import logging import os import subprocess import sys -from os import path as p from pathlib import Path from typing import Dict, List, Optional -module_dir = p.abspath(p.dirname(__file__)) - SCRIPT_PATH = Path(__file__).absolute() IMAGE_TYPE = "binary-builder" IMAGE_NAME = f"altinityinfra/{IMAGE_TYPE}" From 926c396ba3b5ba0e3ccb842768b25089a50474b0 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Tue, 25 Mar 2025 16:48:27 -0400 Subject: [PATCH 23/67] fix regression aws secrets and robot token function --- .github/workflows/regression.yml | 6 +++--- tests/ci/get_robot_token.py | 7 +------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 576f68300b29..ac63da1695d0 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -88,9 +88,9 @@ name: Regression test workflow - Release env: # Force the stdout and stderr streams to be unbuffered PYTHONUNBUFFERED: 1 - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_REPORT_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_REPORT_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.AWS_REPORT_REGION }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} CHECKS_DATABASE_HOST: ${{ secrets.CHECKS_DATABASE_HOST }} diff --git a/tests/ci/get_robot_token.py b/tests/ci/get_robot_token.py index 68a35259ac2e..f9b30beb178f 100644 --- a/tests/ci/get_robot_token.py +++ b/tests/ci/get_robot_token.py @@ -57,16 +57,11 @@ def get_parameters_from_ssm( return results - -ROBOT_TOKEN = None # type: Optional[Token] - # NOTE(Arthur Passos): 
Original CI code uses the "_original" version of this method. Each robot token is rate limited # and the original implementation selects the "best one". To make it simpler and iterate faster, # we are using only one robot and keeping the method signature. In the future we might reconsider # having multiple robot tokens -def get_best_robot_token(token_prefix_env_name="github_robot_token"): - # Re-use already fetched token (same as in get_best_robot_token_original) - # except here we assume it is always a string (since we use only one token and don't do token rotation) +def get_best_robot_token(): return ROBOT_TOKEN def get_best_robot_token_original(tokens_path: str = "/github-tokens") -> str: From 16bff7e8f5a023fa98ac5cb6595250639475ceef Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Tue, 25 Mar 2025 14:21:27 -0400 Subject: [PATCH 24/67] Scan files for secrets in _upload_file_to_s3 --- tests/ci/s3_helper.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 86656e6e7c0c..d83258205599 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -6,6 +6,7 @@ from multiprocessing.dummy import Pool from pathlib import Path from typing import Any, List, Union +import os import boto3 # type: ignore import botocore # type: ignore @@ -19,6 +20,31 @@ S3_URL, ) +sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*") +sensitive_strings = {var: value for var, value in os.environ.items() + if sensitive_var_pattern.match(var)} + +def scan_file_for_sensitive_data(file_content, file_name): + """ + Scan the content of a file for sensitive strings. + Raises ValueError if any sensitive values are found. + """ + matches = [] + for line_number, line in enumerate(file_content.splitlines(), start=1): + for match in sensitive_var_pattern.finditer(line): + matches.append((file_name, line_number, match.group(0))) + for name, value in sensitive_strings.items(): + if value in line: + matches.append((file_name, line_number, f"SECRET[{name}]")) + + if not matches: + return + + logging.error(f"Sensitive values found in {file_name}") + for file_name, line_number, match in matches: + logging.error(f"{file_name}:{line_number}: {match}") + + raise ValueError(f"Sensitive values found in {file_name}") def _flatten_list(lst): result = [] @@ -45,6 +71,8 @@ def __init__(self, client: Any = None, endpoint: str = S3_URL): def _upload_file_to_s3( self, bucket_name: str, file_path: Path, s3_path: str ) -> str: + logging.debug("Checking %s for sensitive values", file_path) + scan_file_for_sensitive_data(file_path.read_text(), file_path.name) logging.debug( "Start uploading %s to bucket=%s path=%s", file_path, bucket_name, s3_path ) From 33cea7b56e7b25fff0773804792c151406e13920 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Wed, 26 Mar 2025 10:11:01 -0400 Subject: [PATCH 25/67] Handle UnicodeDecodeError --- tests/ci/s3_helper.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index d83258205599..5573b2f65ec9 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -72,7 +72,11 @@ def _upload_file_to_s3( self, bucket_name: str, file_path: Path, s3_path: str ) -> str: logging.debug("Checking %s for sensitive values", file_path) - scan_file_for_sensitive_data(file_path.read_text(), file_path.name) + try: + file_content = 
file_path.read_text(encoding="utf-8") + except UnicodeDecodeError: + logging.warning("Failed to read file %s, unknown encoding", file_path) + scan_file_for_sensitive_data(file_content, file_path.name) logging.debug( "Start uploading %s to bucket=%s path=%s", file_path, bucket_name, s3_path ) From 7cc240f44f2076dd5a9811db4f0532feac052f68 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Wed, 26 Mar 2025 15:58:32 -0400 Subject: [PATCH 26/67] fix --- tests/ci/s3_helper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 5573b2f65ec9..97e9007f60e7 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -76,7 +76,9 @@ def _upload_file_to_s3( file_content = file_path.read_text(encoding="utf-8") except UnicodeDecodeError: logging.warning("Failed to read file %s, unknown encoding", file_path) - scan_file_for_sensitive_data(file_content, file_path.name) + else: + scan_file_for_sensitive_data(file_content, file_path.name) + logging.debug( "Start uploading %s to bucket=%s path=%s", file_path, bucket_name, s3_path ) From bcf06ba506196499d9a1907249cd244d898d164e Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Thu, 27 Mar 2025 11:55:57 -0400 Subject: [PATCH 27/67] A variable isn't really secret if the value is 'clickhouse' --- tests/ci/s3_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 97e9007f60e7..f72fb1a7cd20 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -20,7 +20,7 @@ S3_URL, ) -sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*") +sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*(?!=clickhouse\s)") sensitive_strings = {var: value for var, value in os.environ.items() if sensitive_var_pattern.match(var)} From 2332e5a67ba890fefe3a27af6b22ee7ea4acc5dc Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Thu, 27 Mar 2025 14:27:17 -0400 Subject: [PATCH 28/67] fix --- tests/ci/s3_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index f72fb1a7cd20..09302e97337a 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -20,7 +20,7 @@ S3_URL, ) -sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*(?!=clickhouse\s)") +sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*(?!=clickhouse$)") sensitive_strings = {var: value for var, value in os.environ.items() if sensitive_var_pattern.match(var)} From c15db67913f17b6d512226a3d5c5e5b66d468721 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Fri, 28 Mar 2025 10:01:18 -0400 Subject: [PATCH 29/67] regex should also ignore *** --- tests/ci/s3_helper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 09302e97337a..a7492a1c289d 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -20,7 +20,9 @@ S3_URL, ) -sensitive_var_pattern = re.compile(r"[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*(?!=clickhouse$)") +sensitive_var_pattern = re.compile( + r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)" +) sensitive_strings = {var: value for var, value in os.environ.items() if 
sensitive_var_pattern.match(var)} From 09b31e68e963faf639476f1009c423fb2c27ae96 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Fri, 28 Mar 2025 14:27:14 -0400 Subject: [PATCH 30/67] make regression tests skippable --- .github/workflows/release_branches.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 1b4ce0995e15..e9b7e0c81a9b 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -538,8 +538,8 @@ jobs: ##################################### REGRESSION TESTS ###################################### ############################################################################################# RegressionTestsRelease: - needs: [BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} + needs: [RunConfig, BuilderDebRelease] + if: ${{ !failure() && !cancelled() && !contains(fromJson(needs.RunConfig.outputs.data).ci_settings.exclude_keywords, 'regression') }} uses: ./.github/workflows/regression.yml secrets: inherit with: @@ -549,8 +549,8 @@ jobs: build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 RegressionTestsAarch64: - needs: [BuilderDebAarch64] - if: ${{ !failure() && !cancelled() }} + needs: [RunConfig, BuilderDebAarch64] + if: ${{ !failure() && !cancelled() && !contains(fromJson(needs.RunConfig.outputs.data).ci_settings.exclude_keywords, 'regression') && !contains(fromJson(needs.RunConfig.outputs.data).ci_settings.exclude_keywords, 'aarch64')}} uses: ./.github/workflows/regression.yml secrets: inherit with: From 821275b1136036086dc93d0d92d7b37c1aa387cd Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Mon, 31 Mar 2025 09:05:58 -0400 Subject: [PATCH 31/67] print the entire offending log line --- tests/ci/s3_helper.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index a7492a1c289d..39e06d8d635c 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -21,23 +21,31 @@ ) sensitive_var_pattern = re.compile( - r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)" + r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)" ) -sensitive_strings = {var: value for var, value in os.environ.items() - if sensitive_var_pattern.match(var)} +sensitive_strings = { + var: value for var, value in os.environ.items() if sensitive_var_pattern.match(var) +} + def scan_file_for_sensitive_data(file_content, file_name): """ Scan the content of a file for sensitive strings. Raises ValueError if any sensitive values are found. 
""" + + def clean_line(line): + for name, value in sensitive_strings.items(): + line = line.replace(value, f"SECRET[{name}]") + return line + matches = [] for line_number, line in enumerate(file_content.splitlines(), start=1): for match in sensitive_var_pattern.finditer(line): - matches.append((file_name, line_number, match.group(0))) + matches.append((file_name, line_number, clean_line(line))) for name, value in sensitive_strings.items(): if value in line: - matches.append((file_name, line_number, f"SECRET[{name}]")) + matches.append((file_name, line_number, clean_line(line))) if not matches: return @@ -48,6 +56,7 @@ def scan_file_for_sensitive_data(file_content, file_name): raise ValueError(f"Sensitive values found in {file_name}") + def _flatten_list(lst): result = [] for elem in lst: From 62915394df11a7aea61c056168ae981c5a306716 Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Thu, 3 Apr 2025 11:32:19 -0400 Subject: [PATCH 32/67] Fixing more false positives --- tests/ci/s3_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 39e06d8d635c..973507d1d921 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -21,7 +21,7 @@ ) sensitive_var_pattern = re.compile( - r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)" + r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)(?! '\[HIDDEN\]')(?!%)" ) sensitive_strings = { var: value for var, value in os.environ.items() if sensitive_var_pattern.match(var) From 758bda364746706c2b70667699a6ccc03222742f Mon Sep 17 00:00:00 2001 From: Your Name <146047128+strtgbb@users.noreply.github.com> Date: Thu, 3 Apr 2025 20:28:06 -0400 Subject: [PATCH 33/67] Fixing more false positives --- tests/ci/s3_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index 973507d1d921..5d9d0758e5c3 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -21,7 +21,7 @@ ) sensitive_var_pattern = re.compile( - r"\b[A-Z_]*(SECRET|PASSWORD|ACCESS_KEY|TOKEN)[A-Z_]*\b(?!=clickhouse$)(?!: \*{3}$)(?! '\[HIDDEN\]')(?!%)" + r"\b[A-Z_]*(? 
Date: Wed, 14 May 2025 08:07:41 -0400 Subject: [PATCH 34/67] 24.8 Fix report credentials --- .github/workflows/release_branches.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 3873bb580acf..8c60559a4399 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -635,8 +635,8 @@ jobs: if: ${{ !cancelled() }} env: CHECKS_DATABASE_HOST: ${{ secrets.CHECKS_DATABASE_HOST }} - CHECKS_DATABASE_USER: ${{ secrets.CHECKS_DATABASE_USER }} - CHECKS_DATABASE_PASSWORD: ${{ secrets.CHECKS_DATABASE_PASSWORD }} + CHECKS_DATABASE_USER: ${{ secrets.CLICKHOUSE_TEST_STAT_LOGIN }} + CHECKS_DATABASE_PASSWORD: ${{ secrets.CLICKHOUSE_TEST_STAT_PASSWORD }} COMMIT_SHA: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} PR_NUMBER: ${{ github.event.pull_request.number || 0 }} ACTIONS_RUN_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }} From 83952b43a3f4ee784ff148c1a9275913af36bf0c Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Mon, 19 May 2025 13:45:37 -0400 Subject: [PATCH 35/67] 24.8 fix regression db upload --- .github/workflows/regression.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index a916ef5efbb6..3d5d06e5b296 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -94,8 +94,8 @@ env: DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} CHECKS_DATABASE_HOST: ${{ secrets.CHECKS_DATABASE_HOST }} - CHECKS_DATABASE_USER: ${{ secrets.CHECKS_DATABASE_USER }} - CHECKS_DATABASE_PASSWORD: ${{ secrets.CHECKS_DATABASE_PASSWORD }} + CHECKS_DATABASE_USER: ${{ secrets.CLICKHOUSE_TEST_STAT_LOGIN }} + CHECKS_DATABASE_PASSWORD: ${{ secrets.CLICKHOUSE_TEST_STAT_PASSWORD }} args: --test-to-end --no-colors --local From 717ec5dcf59f25e0c7430177c0a96e151ce5181b Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Wed, 21 May 2025 20:29:14 -0400 Subject: [PATCH 36/67] add grype scanning --- .github/grype/parse_vulnerabilities_grype.py | 32 +++++ .github/grype/run_grype_scan.sh | 18 +++ .../grype/transform_and_upload_results_s3.sh | 13 ++ .github/workflows/grype_scan.yml | 132 ++++++++++++++++++ .github/workflows/release_branches.yml | 18 +++ 5 files changed, 213 insertions(+) create mode 100644 .github/grype/parse_vulnerabilities_grype.py create mode 100755 .github/grype/run_grype_scan.sh create mode 100755 .github/grype/transform_and_upload_results_s3.sh create mode 100644 .github/workflows/grype_scan.yml diff --git a/.github/grype/parse_vulnerabilities_grype.py b/.github/grype/parse_vulnerabilities_grype.py new file mode 100644 index 000000000000..fec2ef3bfac7 --- /dev/null +++ b/.github/grype/parse_vulnerabilities_grype.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +import json + +from testflows.core import * + +xfails = {} + + +@Name("docker vulnerabilities") +@XFails(xfails) +@TestModule +def docker_vulnerabilities(self): + with Given("I gather grype scan results"): + with open("./result.json", "r") as f: + results = json.load(f) + + for vulnerability in results["matches"]: + with Test( + f"{vulnerability['vulnerability']['id']}@{vulnerability['vulnerability']['namespace']},{vulnerability['vulnerability']['severity']}", + flags=TE, + ): + note(vulnerability) 
+ critical_levels = set(["HIGH", "CRITICAL"]) + if vulnerability['vulnerability']["severity"].upper() in critical_levels: + with Then( + f"Found vulnerability of {vulnerability['vulnerability']['severity']} severity" + ): + result(Fail) + + +if main(): + docker_vulnerabilities() diff --git a/.github/grype/run_grype_scan.sh b/.github/grype/run_grype_scan.sh new file mode 100755 index 000000000000..c5ce0b1b10d3 --- /dev/null +++ b/.github/grype/run_grype_scan.sh @@ -0,0 +1,18 @@ +set -x +set -e + +IMAGE=$1 + +GRYPE_VERSION="v0.80.1" + +docker pull $IMAGE +docker pull anchore/grype:${GRYPE_VERSION} + +docker run \ + --rm --volume /var/run/docker.sock:/var/run/docker.sock \ + --name Grype anchore/grype:${GRYPE_VERSION} \ + --scope all-layers \ + -o json \ + $IMAGE > result.json + +ls -sh diff --git a/.github/grype/transform_and_upload_results_s3.sh b/.github/grype/transform_and_upload_results_s3.sh new file mode 100755 index 000000000000..7a10b02887ef --- /dev/null +++ b/.github/grype/transform_and_upload_results_s3.sh @@ -0,0 +1,13 @@ +DOCKER_IMAGE=$(echo "$DOCKER_IMAGE" | sed 's/[\/:]/_/g') + +S3_PATH="s3://$S3_BUCKET/$PR_NUMBER/$COMMIT_SHA/grype/$DOCKER_IMAGE" +HTTPS_S3_PATH="https://s3.amazonaws.com/$S3_BUCKET/$PR_NUMBER/$COMMIT_SHA/grype/$DOCKER_IMAGE" +echo "https_s3_path=$HTTPS_S3_PATH" >> $GITHUB_OUTPUT + +tfs --no-colors transform nice raw.log nice.log.txt +tfs --no-colors report results -a $HTTPS_S3_PATH raw.log - --copyright "Altinity LTD" | tfs --no-colors document convert > results.html + +aws s3 cp --no-progress nice.log.txt $S3_PATH/nice.log.txt --content-type "text/plain; charset=utf-8" || echo "nice log file not found". +aws s3 cp --no-progress results.html $S3_PATH/results.html || echo "results file not found". +aws s3 cp --no-progress raw.log $S3_PATH/raw.log || echo "raw.log file not found". +aws s3 cp --no-progress result.json $S3_PATH/result.json --content-type "text/plain; charset=utf-8" || echo "result.json not found". \ No newline at end of file diff --git a/.github/workflows/grype_scan.yml b/.github/workflows/grype_scan.yml new file mode 100644 index 000000000000..e749448b81ba --- /dev/null +++ b/.github/workflows/grype_scan.yml @@ -0,0 +1,132 @@ +name: Grype Scan +run-name: Grype Scan ${{ inputs.docker_image }} + +on: + workflow_dispatch: + # Inputs for manual run + inputs: + docker_image: + description: 'Docker image. If no tag, it will be determined by version_helper.py' + required: true + workflow_call: + # Inputs for workflow call + inputs: + docker_image: + description: 'Docker image. 
If no tag, it will be determined by version_helper.py' + required: true + type: string +env: + PYTHONUNBUFFERED: 1 + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + +jobs: + grype_scan: + name: Grype Scan + runs-on: [self-hosted, altinity-on-demand, altinity-func-tester-aarch64] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker + uses: docker/setup-buildx-action@v3 + + - name: Set up Python + run: | + export TESTFLOWS_VERSION="2.4.19" + sudo apt-get update + sudo apt-get install -y python3-pip python3-venv + python3 -m venv venv + source venv/bin/activate + pip install --upgrade requests chardet urllib3 + pip install testflows==$TESTFLOWS_VERSION awscli==1.33.28 + echo PATH=$PATH >>$GITHUB_ENV + + - name: Set image tag if not given + if: ${{ !contains(inputs.docker_image, ':') }} + id: set_version + run: | + python3 ./tests/ci/version_helper.py | tee /tmp/version_info + source /tmp/version_info + echo "docker_image=${{ inputs.docker_image }}:${{ github.event.pull_request.number || 0 }}-$CLICKHOUSE_VERSION_STRING" >> $GITHUB_OUTPUT + echo "commit_sha=$CLICKHOUSE_VERSION_GITHASH" >> $GITHUB_OUTPUT + + - name: Run Grype Scan + run: | + DOCKER_IMAGE=${{ steps.set_version.outputs.docker_image || inputs.docker_image }} + ./.github/grype/run_grype_scan.sh $DOCKER_IMAGE + + - name: Parse grype results + run: | + python3 -u ./.github/grype/parse_vulnerabilities_grype.py -o nice --no-colors --log raw.log --test-to-end + + - name: Transform and Upload Grype Results + if: always() + id: upload_results + env: + S3_BUCKET: "altinity-build-artifacts" + COMMIT_SHA: ${{ steps.set_version.outputs.commit_sha || github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + PR_NUMBER: ${{ github.event.pull_request.number || 0 }} + DOCKER_IMAGE: ${{ steps.set_version.outputs.docker_image || inputs.docker_image }} + run: | + ./.github/grype/transform_and_upload_results_s3.sh + + - name: Create step summary + if: always() + id: create_summary + run: | + jq -r '"**Image**: \(.source.target.userInput)"' result.json >> $GITHUB_STEP_SUMMARY + jq -r '.distro | "**Distro**: \(.name):\(.version)"' result.json >> $GITHUB_STEP_SUMMARY + if jq -e '.matches | length == 0' result.json > /dev/null; then + echo "No CVEs" >> $GITHUB_STEP_SUMMARY + else + echo "| Severity | Count |" >> $GITHUB_STEP_SUMMARY + echo "|------------|-------|" >> $GITHUB_STEP_SUMMARY + jq -r ' + .matches | + map(.vulnerability.severity) | + group_by(.) | + map({severity: .[0], count: length}) | + sort_by(.severity) | + map("| \(.severity) | \(.count) |") | + .[] + ' result.json >> $GITHUB_STEP_SUMMARY + fi + + HIGH_COUNT=$(jq -r '.matches | map(.vulnerability.severity) | map(select(. == "High")) | length' result.json) + CRITICAL_COUNT=$(jq -r '.matches | map(.vulnerability.severity) | map(select(. 
== "Critical")) | length' result.json) + TOTAL_HIGH_CRITICAL=$((HIGH_COUNT + CRITICAL_COUNT)) + echo "total_high_critical=$TOTAL_HIGH_CRITICAL" >> $GITHUB_OUTPUT + + if [ $TOTAL_HIGH_CRITICAL -gt 0 ]; then + echo '## High and Critical vulnerabilities found' >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + cat raw.log | tfs --no-colors show tests | grep -Pi 'High|Critical' >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + fi + + - name: Set commit status + if: always() + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: '${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}', + state: '${{ steps.create_summary.outputs.total_high_critical > 0 && 'failure' || 'success' }}', + target_url: '${{ steps.upload_results.outputs.https_s3_path }}/results.html', + description: 'Grype Scan Completed with ${{ steps.create_summary.outputs.total_high_critical }} high/critical vulnerabilities', + context: 'Grype Scan ${{ steps.set_version.outputs.docker_image || inputs.docker_image }}' + }) + + - name: Upload artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: grype-results-${{ hashFiles('raw.log') }} + path: | + result.json + nice.log.txt diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 8c60559a4399..10dc41627c07 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -577,6 +577,23 @@ jobs: test_name: Sign aarch64 runner_type: altinity-on-demand, altinity-func-tester data: ${{ needs.RunConfig.outputs.data }} + GrypeScan: + needs: [RunConfig, DockerServerImage, DockerKeeperImage] + if: ${{ !failure() && !cancelled() }} + strategy: + fail-fast: false + matrix: + include: + - image: server + suffix: '' + - image: server + suffix: '-alpine' + - image: keeper + suffix: '' + uses: ./.github/workflows/grype_scan.yml + secrets: inherit + with: + docker_image: altinityinfra/clickhouse-${{ matrix.image }}:${{ github.event.pull_request.number || 0 }}-${{ fromJson(needs.RunConfig.outputs.data).version }}${{ matrix.suffix }} FinishCheck: if: ${{ !cancelled() }} needs: @@ -611,6 +628,7 @@ jobs: - CompatibilityCheckAarch64 - RegressionTestsRelease - RegressionTestsAarch64 + - GrypeScan - SignRelease runs-on: [self-hosted, altinity-on-demand, altinity-style-checker-aarch64] steps: From 4b2d33f2f43ad7c09737cdd9d53062f4c3cca022 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Wed, 21 May 2025 20:38:28 -0400 Subject: [PATCH 37/67] update report to new format --- .github/create_combined_ci_report.py | 295 ----------- .github/create_workflow_report.py | 653 +++++++++++++++++++++++++ .github/workflows/release_branches.yml | 4 +- 3 files changed, 655 insertions(+), 297 deletions(-) delete mode 100755 .github/create_combined_ci_report.py create mode 100755 .github/create_workflow_report.py diff --git a/.github/create_combined_ci_report.py b/.github/create_combined_ci_report.py deleted file mode 100755 index 07b6a80763bb..000000000000 --- a/.github/create_combined_ci_report.py +++ /dev/null @@ -1,295 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import os -from pathlib import Path -from itertools import combinations -import json - -import requests -from clickhouse_driver import Client -import boto3 -from botocore.exceptions import NoCredentialsError - 
-DATABASE_HOST_VAR = "CHECKS_DATABASE_HOST" -DATABASE_USER_VAR = "CHECKS_DATABASE_USER" -DATABASE_PASSWORD_VAR = "CHECKS_DATABASE_PASSWORD" -S3_BUCKET = "altinity-build-artifacts" - - -def get_checks_fails(client: Client, job_url: str): - """ - Get tests that did not succeed for the given job URL. - Exclude checks that have status 'error' as they are counted in get_checks_errors. - """ - columns = ( - "check_status, check_name, test_status, test_name, report_url as results_link" - ) - query = f"""SELECT {columns} FROM `gh-data`.checks - WHERE task_url='{job_url}' - AND test_status IN ('FAIL', 'ERROR') - AND check_status!='error' - ORDER BY check_name, test_name - """ - return client.query_dataframe(query) - - -def get_checks_known_fails(client: Client, job_url: str, known_fails: dict): - """ - Get tests that are known to fail for the given job URL. - """ - assert len(known_fails) > 0, "cannot query the database with empty known fails" - columns = ( - "check_status, check_name, test_status, test_name, report_url as results_link" - ) - query = f"""SELECT {columns} FROM `gh-data`.checks - WHERE task_url='{job_url}' - AND test_status='BROKEN' - AND test_name IN ({','.join(f"'{test}'" for test in known_fails.keys())}) - ORDER BY test_name, check_name - """ - - df = client.query_dataframe(query) - - df.insert( - len(df.columns) - 1, - "reason", - df["test_name"] - .astype(str) - .apply( - lambda test_name: known_fails[test_name].get("reason", "No reason given") - ), - ) - - return df - - -def get_checks_errors(client: Client, job_url: str): - """ - Get checks that have status 'error' for the given job URL. - """ - columns = ( - "check_status, check_name, test_status, test_name, report_url as results_link" - ) - query = f"""SELECT {columns} FROM `gh-data`.checks - WHERE task_url='{job_url}' - AND check_status=='error' - ORDER BY check_name, test_name - """ - return client.query_dataframe(query) - - -def drop_prefix_rows(df, column_to_clean): - """ - Drop rows from the dataframe if: - - the row matches another row completely except for the specified column - - the specified column of that row is a prefix of the same column in another row - """ - to_drop = set() - reference_columns = [col for col in df.columns if col != column_to_clean] - for (i, row_1), (j, row_2) in combinations(df.iterrows(), 2): - if all(row_1[col] == row_2[col] for col in reference_columns): - if row_2[column_to_clean].startswith(row_1[column_to_clean]): - to_drop.add(i) - elif row_1[column_to_clean].startswith(row_2[column_to_clean]): - to_drop.add(j) - return df.drop(to_drop) - - -def get_regression_fails(client: Client, job_url: str): - """ - Get regression tests that did not succeed for the given job URL. 
- """ - # If you rename the alias for report_url, also update the formatters in format_results_as_html_table - # Nested SELECT handles test reruns - query = f"""SELECT arch, job_name, status, test_name, results_link - FROM ( - SELECT - architecture as arch, - test_name, - argMax(result, start_time) AS status, - job_url, - job_name, - report_url as results_link - FROM `gh-data`.clickhouse_regression_results - GROUP BY architecture, test_name, job_url, job_name, report_url - ORDER BY length(test_name) DESC - ) - WHERE job_url='{job_url}' - AND status IN ('Fail', 'Error') - """ - - df = client.query_dataframe(query) - df = drop_prefix_rows(df, "test_name") - df["job_name"] = df["job_name"].str.title() - return df - - -def url_to_html_link(url: str) -> str: - if not url: - return "" - text = url.split("/")[-1] - if not text: - text = "results" - return f'{text}' - - -def format_test_name_for_linewrap(text: str) -> str: - """Tweak the test name to improve line wrapping.""" - return text.replace(".py::", "/") - - -def format_results_as_html_table(results) -> str: - if len(results) == 0: - return "

<p>Nothing to report</p>
" - results.columns = [col.replace("_", " ").title() for col in results.columns] - html = ( - results.to_html( - index=False, - formatters={ - "Results Link": url_to_html_link, - "Test Name": format_test_name_for_linewrap, - }, - escape=False, - ) # tbody/thead tags interfere with the table sorting script - .replace("\n", "") - .replace("\n", "") - .replace("\n", "") - .replace("\n", "") - .replace(' argparse.Namespace: - parser = argparse.ArgumentParser(description="Create a combined CI report.") - parser.add_argument( - "--actions-run-url", required=True, help="URL of the actions run" - ) - parser.add_argument( - "--pr-number", required=True, help="Pull request number for the S3 path" - ) - parser.add_argument( - "--commit-sha", required=True, help="Commit SHA for the S3 path" - ) - parser.add_argument( - "--no-upload", action="store_true", help="Do not upload the report" - ) - parser.add_argument( - "--known-fails", type=str, help="Path to the file with known fails" - ) - parser.add_argument( - "--mark-preview", action="store_true", help="Mark the report as a preview" - ) - return parser.parse_args() - - -def main(): - args = parse_args() - - db_client = Client( - host=os.getenv(DATABASE_HOST_VAR), - user=os.getenv(DATABASE_USER_VAR), - password=os.getenv(DATABASE_PASSWORD_VAR), - port=9440, - secure="y", - verify=False, - settings={"use_numpy": True}, - ) - - s3_path = ( - f"https://s3.amazonaws.com/{S3_BUCKET}/{args.pr_number}/{args.commit_sha}/" - ) - report_destination_url = s3_path + "combined_report.html" - ci_running_report_url = s3_path + "ci_running.html" - - response = requests.get(ci_running_report_url) - if response.status_code == 200: - ci_running_report: str = response.text - else: - print( - f"Failed to download CI running report. Status code: {response.status_code}, Response: {response.text}" - ) - exit(1) - - fail_results = { - "checks_fails": get_checks_fails(db_client, args.actions_run_url), - "checks_known_fails": [], - "checks_errors": get_checks_errors(db_client, args.actions_run_url), - "regression_fails": get_regression_fails(db_client, args.actions_run_url), - } - - if args.known_fails: - if not os.path.exists(args.known_fails): - print(f"Known fails file {args.known_fails} not found.") - exit(1) - - with open(args.known_fails) as f: - known_fails = json.load(f) - - if known_fails: - fail_results["checks_known_fails"] = get_checks_known_fails( - db_client, args.actions_run_url, known_fails - ) - - combined_report = ( - ci_running_report.replace("ClickHouse CI running for", "Combined CI Report for") - .replace( - "
", - f"""

Table of Contents

-{'

<p>This is a preview. FinishCheck has not completed.</p>
' if args.mark_preview else ""} - - -

CI Jobs Status

-
""", - 1, - ) - .replace( - "
", - f""" - -

Checks Errors

-{format_results_as_html_table(fail_results['checks_errors'])} - -

Checks New Fails

-{format_results_as_html_table(fail_results['checks_fails'])} - -

Regression New Fails

-{format_results_as_html_table(fail_results['regression_fails'])} - -

Checks Known Fails

-{format_results_as_html_table(fail_results['checks_known_fails'])} -""", - 1, - ) - ) - report_path = Path("combined_report.html") - report_path.write_text(combined_report, encoding="utf-8") - - if args.no_upload: - print(f"Report saved to {report_path}") - exit(0) - - # Upload the report to S3 - s3_client = boto3.client("s3") - - try: - s3_client.put_object( - Bucket=S3_BUCKET, - Key=f"{args.pr_number}/{args.commit_sha}/combined_report.html", - Body=combined_report, - ContentType="text/html; charset=utf-8", - ) - except NoCredentialsError: - print("Credentials not available for S3 upload.") - - print(report_destination_url) - - -if __name__ == "__main__": - main() diff --git a/.github/create_workflow_report.py b/.github/create_workflow_report.py new file mode 100755 index 000000000000..a7f30f72aedf --- /dev/null +++ b/.github/create_workflow_report.py @@ -0,0 +1,653 @@ +#!/usr/bin/env python3 +import argparse +import os +from pathlib import Path +from itertools import combinations +import json +from datetime import datetime + +import requests +import pandas as pd +from clickhouse_driver import Client +import boto3 +from botocore.exceptions import NoCredentialsError +import pandas as pd + +DATABASE_HOST_VAR = "CHECKS_DATABASE_HOST" +DATABASE_USER_VAR = "CHECKS_DATABASE_USER" +DATABASE_PASSWORD_VAR = "CHECKS_DATABASE_PASSWORD" +S3_BUCKET = "altinity-build-artifacts" +GITHUB_REPO = "Altinity/ClickHouse" + + +css = """ + /* Base colors for Altinity */ + :root { + --altinity-background: #000D45; + --altinity-accent: #189DCF; + --altinity-highlight: #FFC600; + --altinity-gray: #6c757d; + --altinity-light-gray: #f8f9fa; + --altinity-white: #ffffff; + } + + /* Body and heading fonts */ + body { + font-family: Arimo, "Proxima Nova", "Helvetica Neue", Helvetica, Arial, sans-serif; + font-size: 1rem; + background-color: var(--altinity-background); + color: var(--altinity-light-gray); + padding: 2rem; + } + + h1, h2, h3, h4, h5, h6 { + font-family: Figtree, "Proxima Nova", "Helvetica Neue", Helvetica, Arial, sans-serif; + color: var(--altinity-white); + } + + .logo { + width: auto; + height: 5em; + } + + /* General table styling */ + table { + min-width: min(900px, 98vw); + margin: 1rem 0; + border-collapse: collapse; + background-color: var(--altinity-white); + border: 1px solid var(--altinity-accent); + box-shadow: 0 0 8px rgba(0, 0, 0, 0.05); + color: var(--altinity-background); + } + + /* Table header styling */ + th { + background-color: var(--altinity-accent); + color: var(--altinity-white); + padding: 10px 16px; + text-align: left; + border: none; + border-bottom: 2px solid var(--altinity-background); + white-space: nowrap; + } + th.hth { + border-bottom: 1px solid var(--altinity-accent); + border-right: 2px solid var(--altinity-background); + } + + /* Table header sorting styling */ + th { + cursor: pointer; + } + th.no-sort { + pointer-events: none; + } + th::after, + th::before { + transition: color 0.2s ease-in-out; + font-size: 1.2em; + color: transparent; + } + th::after { + margin-left: 3px; + content: '\\025B8'; + } + th:hover::after { + color: inherit; + } + th.dir-d::after { + color: inherit; + content: '\\025BE'; + } + th.dir-u::after { + color: inherit; + content: '\\025B4'; + } + + /* Table body row styling */ + tr:hover { + background-color: var(--altinity-light-gray); + } + + /* Table cell styling */ + td { + padding: 8px 8px; + border: 1px solid var(--altinity-accent); + } + + /* Link styling */ + a { + color: var(--altinity-accent); + text-decoration: none; + } + a:hover { + 
color: var(--altinity-highlight); + text-decoration: underline; + } +""" + +script = """ + +""" + +logo = """ +

+""" + + +def get_commit_statuses(sha: str) -> pd.DataFrame: + """ + Fetch commit statuses for a given SHA and return as a pandas DataFrame. + Handles pagination to get all statuses. + + Args: + sha (str): Commit SHA to fetch statuses for. + + Returns: + pd.DataFrame: DataFrame containing all statuses. + """ + headers = { + "Authorization": f"token {os.getenv('GITHUB_TOKEN')}", + "Accept": "application/vnd.github.v3+json", + } + + url = f"https://api.github.com/repos/{GITHUB_REPO}/commits/{sha}/statuses" + + all_data = [] + + while url: + response = requests.get(url, headers=headers) + + if response.status_code != 200: + raise Exception( + f"Failed to fetch statuses: {response.status_code} {response.text}" + ) + + data = response.json() + all_data.extend(data) + + # Check for pagination links in the response headers + if "Link" in response.headers: + links = response.headers["Link"].split(",") + next_url = None + + for link in links: + parts = link.strip().split(";") + if len(parts) == 2 and 'rel="next"' in parts[1]: + next_url = parts[0].strip("<>") + break + + url = next_url + else: + url = None + + # Parse relevant fields + parsed = [ + { + "job_name": item["context"], + "job_status": item["state"], + "message": item["description"], + "results_link": item["target_url"], + } + for item in all_data + ] + + return ( + pd.DataFrame(parsed) + .sort_values(by=["job_status", "job_name"], ascending=[True, True]) + .reset_index(drop=True) + ) + + +def get_pr_info_from_number(pr_number: str) -> dict: + """ + Fetch pull request information for a given PR number. + + Args: + pr_number (str): Pull request number to fetch information for. + + Returns: + dict: Dictionary containing PR information. + """ + headers = { + "Authorization": f"token {os.getenv('GITHUB_TOKEN')}", + "Accept": "application/vnd.github.v3+json", + } + + url = f"https://api.github.com/repos/{GITHUB_REPO}/pulls/{pr_number}" + response = requests.get(url, headers=headers) + + if response.status_code != 200: + raise Exception( + f"Failed to fetch pull request info: {response.status_code} {response.text}" + ) + + return response.json() + + +def get_checks_fails(client: Client, job_url: str): + """ + Get tests that did not succeed for the given job URL. + Exclude checks that have status 'error' as they are counted in get_checks_errors. + """ + columns = "check_status as job_status, check_name as job_name, test_status, test_name, report_url as results_link" + query = f"""SELECT {columns} FROM `gh-data`.checks + WHERE task_url LIKE '{job_url}%' + AND test_status IN ('FAIL', 'ERROR') + AND check_status!='error' + ORDER BY check_name, test_name + """ + return client.query_dataframe(query) + + +def get_checks_known_fails(client: Client, job_url: str, known_fails: dict): + """ + Get tests that are known to fail for the given job URL. 
+ """ + assert len(known_fails) > 0, "cannot query the database with empty known fails" + columns = "check_status as job_status, check_name as job_name, test_status, test_name, report_url as results_link" + query = f"""SELECT {columns} FROM `gh-data`.checks + WHERE task_url LIKE '{job_url}%' + AND test_status='BROKEN' + AND test_name IN ({','.join(f"'{test}'" for test in known_fails.keys())}) + ORDER BY test_name, check_name + """ + + df = client.query_dataframe(query) + + df.insert( + len(df.columns) - 1, + "reason", + df["test_name"] + .astype(str) + .apply( + lambda test_name: known_fails[test_name].get("reason", "No reason given") + ), + ) + + return df + + +def get_checks_errors(client: Client, job_url: str): + """ + Get checks that have status 'error' for the given job URL. + """ + columns = "check_status as job_status, check_name as job_name, test_status, test_name, report_url as results_link" + query = f"""SELECT {columns} FROM `gh-data`.checks + WHERE task_url LIKE '{job_url}%' + AND check_status=='error' + ORDER BY check_name, test_name + """ + return client.query_dataframe(query) + + +def drop_prefix_rows(df, column_to_clean): + """ + Drop rows from the dataframe if: + - the row matches another row completely except for the specified column + - the specified column of that row is a prefix of the same column in another row + """ + to_drop = set() + reference_columns = [col for col in df.columns if col != column_to_clean] + for (i, row_1), (j, row_2) in combinations(df.iterrows(), 2): + if all(row_1[col] == row_2[col] for col in reference_columns): + if row_2[column_to_clean].startswith(row_1[column_to_clean]): + to_drop.add(i) + elif row_1[column_to_clean].startswith(row_2[column_to_clean]): + to_drop.add(j) + return df.drop(to_drop) + + +def get_regression_fails(client: Client, job_url: str): + """ + Get regression tests that did not succeed for the given job URL. 
+ """ + # If you rename the alias for report_url, also update the formatters in format_results_as_html_table + # Nested SELECT handles test reruns + query = f"""SELECT arch, job_name, status, test_name, results_link + FROM ( + SELECT + architecture as arch, + test_name, + argMax(result, start_time) AS status, + job_url, + job_name, + report_url as results_link + FROM `gh-data`.clickhouse_regression_results + GROUP BY architecture, test_name, job_url, job_name, report_url + ORDER BY length(test_name) DESC + ) + WHERE job_url='{job_url}' + AND status IN ('Fail', 'Error') + """ + df = client.query_dataframe(query) + df = drop_prefix_rows(df, "test_name") + df["job_name"] = df["job_name"].str.title() + return df + + +def get_cves(pr_number, commit_sha): + s3_client = boto3.client("s3", endpoint_url=os.getenv("S3_URL")) + s3_prefix = f"{pr_number}/{commit_sha}/grype/" + + results = [] + + response = s3_client.list_objects_v2( + Bucket=S3_BUCKET, Prefix=s3_prefix, Delimiter="/" + ) + grype_result_dirs = [ + content["Prefix"] for content in response.get("CommonPrefixes", []) + ] + + for path in grype_result_dirs: + file_key = f"{path}result.json" + file_response = s3_client.get_object(Bucket=S3_BUCKET, Key=file_key) + content = file_response["Body"].read().decode("utf-8") + results.append(json.loads(content)) + + rows = [] + for scan_result in results: + for match in scan_result["matches"]: + rows.append( + { + "docker_image": scan_result["source"]["target"]["userInput"], + "severity": match["vulnerability"]["severity"], + "identifier": match["vulnerability"]["id"], + "namespace": match["vulnerability"]["namespace"], + } + ) + + if len(rows) == 0: + return pd.DataFrame() + + df = pd.DataFrame(rows).drop_duplicates() + df = df.sort_values( + by="severity", + key=lambda col: col.str.lower().map( + {"critical": 1, "high": 2, "medium": 3, "low": 4, "negligible": 5} + ), + ) + return df + + +def url_to_html_link(url: str) -> str: + if not url: + return "" + text = url.split("/")[-1].replace("__", "_") + if not text: + text = "results" + return f'{text}' + + +def format_test_name_for_linewrap(text: str) -> str: + """Tweak the test name to improve line wrapping.""" + return f'{text}' + + +def format_test_status(text: str) -> str: + """Format the test status for better readability.""" + color = ( + "red" + if text.lower().startswith("fail") + else "orange" if text.lower() in ("error", "broken") else "green" + ) + return f'{text}' + + +def format_results_as_html_table(results) -> str: + if len(results) == 0: + return "

<p>Nothing to report</p>
" + results.columns = [col.replace("_", " ").title() for col in results.columns] + html = results.to_html( + index=False, + formatters={ + "Results Link": url_to_html_link, + "Test Name": format_test_name_for_linewrap, + "Test Status": format_test_status, + "Job Status": format_test_status, + "Status": format_test_status, + "Message": lambda m: m.replace("\n", " "), + "Identifier": lambda i: url_to_html_link( + "https://nvd.nist.gov/vuln/detail/" + i + ), + }, + escape=False, + ).replace(' border="1"', "") + return html + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Create a combined CI report.") + parser.add_argument( + "--actions-run-url", required=True, help="URL of the actions run" + ) + parser.add_argument( + "--pr-number", required=True, help="Pull request number for the S3 path" + ) + parser.add_argument( + "--commit-sha", required=True, help="Commit SHA for the S3 path" + ) + parser.add_argument( + "--no-upload", action="store_true", help="Do not upload the report" + ) + parser.add_argument( + "--known-fails", type=str, help="Path to the file with known fails" + ) + parser.add_argument( + "--cves", action="store_true", help="Get CVEs from Grype results" + ) + parser.add_argument( + "--mark-preview", action="store_true", help="Mark the report as a preview" + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + db_client = Client( + host=os.getenv(DATABASE_HOST_VAR), + user=os.getenv(DATABASE_USER_VAR), + password=os.getenv(DATABASE_PASSWORD_VAR), + port=9440, + secure="y", + verify=False, + settings={"use_numpy": True}, + ) + + fail_results = { + "job_statuses": get_commit_statuses(args.commit_sha), + "checks_fails": get_checks_fails(db_client, args.actions_run_url), + "checks_known_fails": [], + "checks_errors": get_checks_errors(db_client, args.actions_run_url), + "regression_fails": get_regression_fails(db_client, args.actions_run_url), + "docker_images_cves": ( + [] if not args.cves else get_cves(args.pr_number, args.commit_sha) + ), + } + + if args.known_fails: + if not os.path.exists(args.known_fails): + print(f"Known fails file {args.known_fails} not found.") + exit(1) + + with open(args.known_fails) as f: + known_fails = json.load(f) + + if known_fails: + fail_results["checks_known_fails"] = get_checks_known_fails( + db_client, args.actions_run_url, known_fails + ) + + if args.pr_number == "0": + pr_info_html = "Release" + else: + try: + pr_info = get_pr_info_from_number(args.pr_number) + pr_info_html = f""" + #{pr_info.get("number")} ({pr_info.get("base", {}).get('ref')} <- {pr_info.get("head", {}).get('ref')}) {pr_info.get("title")} + """ + except Exception as e: + pr_info_html = e + + high_cve_count = 0 + if len(fail_results["docker_images_cves"]) > 0: + high_cve_count = ( + fail_results["docker_images_cves"]["severity"] + .str.lower() + .isin(("high", "critical")) + .sum() + ) + + title = "ClickHouse® CI Workflow Run Report" + + html_report = f""" + + + + + + + {title} + + + {logo} +

{title}

+ + + + + + + + + + + + + +
Pull Request{pr_info_html}
Workflow Run{args.actions_run_url.split('/')[-1]}
Commit{args.commit_sha}
Date{datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC
+ +

Table of Contents

+{'

<p>This is a preview. FinishCheck has not completed.</p>
' if args.mark_preview else ""} + + +

CI Jobs Status

+{format_results_as_html_table(fail_results['job_statuses'])} + +
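+<!-- The table above is built from GitHub commit statuses; the sections below are
+ queried from the gh-data ClickHouse database and, for CVEs, from grype results in S3. -->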

Checks Errors

+{format_results_as_html_table(fail_results['checks_errors'])} + +

Checks New Fails

+{format_results_as_html_table(fail_results['checks_fails'])} + +

Regression New Fails

+{format_results_as_html_table(fail_results['regression_fails'])} + +

Docker Images CVEs

+{"

<p>Not Checked</p>
" if not args.cves else format_results_as_html_table(fail_results['docker_images_cves'])} + +

Checks Known Fails

+{"

<p>Not Checked</p>
" if not args.known_fails else format_results_as_html_table(fail_results['checks_known_fails'])} + +{script} + + +""" + report_name = "ci_run_report.html" + report_path = Path(report_name) + report_path.write_text(html_report, encoding="utf-8") + + if args.no_upload: + print(f"Report saved to {report_path}") + exit(0) + + report_destination_key = f"{args.pr_number}/{args.commit_sha}/{report_name}" + + # Upload the report to S3 + s3_client = boto3.client("s3", endpoint_url=os.getenv("S3_URL")) + + try: + s3_client.put_object( + Bucket=S3_BUCKET, + Key=report_destination_key, + Body=html_report, + ContentType="text/html; charset=utf-8", + ) + except NoCredentialsError: + print("Credentials not available for S3 upload.") + + print(f"https://s3.amazonaws.com/{S3_BUCKET}/" + report_destination_key) + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 10dc41627c07..c5ca0080ec4b 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -660,9 +660,9 @@ jobs: ACTIONS_RUN_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }} shell: bash run: | - pip install clickhouse-driver==0.2.8 numpy==1.26.4 pandas==2.2.0 + pip install clickhouse-driver==0.2.8 numpy==1.26.4 pandas==2.0.3 - REPORT_LINK=$(python3 .github/create_combined_ci_report.py --pr-number $PR_NUMBER --commit-sha $COMMIT_SHA --actions-run-url $ACTIONS_RUN_URL --known-fails tests/broken_tests.json) + REPORT_LINK=$(python3 .github/create_workflow_report.py --pr-number $PR_NUMBER --commit-sha $COMMIT_SHA --actions-run-url $ACTIONS_RUN_URL --known-fails tests/broken_tests.json --cves) IS_VALID_URL=$(echo $REPORT_LINK | grep -E '^https?://') if [[ -n $IS_VALID_URL ]]; then From 60b1b522f80233d129c4a571505a6e4546ace0d4 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Wed, 21 May 2025 20:29:14 -0400 Subject: [PATCH 38/67] add grype scanning --- .github/grype/parse_vulnerabilities_grype.py | 32 +++++ .github/grype/run_grype_scan.sh | 18 +++ .../grype/transform_and_upload_results_s3.sh | 13 ++ .github/workflows/grype_scan.yml | 132 ++++++++++++++++++ .github/workflows/release_branches.yml | 18 +++ 5 files changed, 213 insertions(+) create mode 100644 .github/grype/parse_vulnerabilities_grype.py create mode 100755 .github/grype/run_grype_scan.sh create mode 100755 .github/grype/transform_and_upload_results_s3.sh create mode 100644 .github/workflows/grype_scan.yml diff --git a/.github/grype/parse_vulnerabilities_grype.py b/.github/grype/parse_vulnerabilities_grype.py new file mode 100644 index 000000000000..fec2ef3bfac7 --- /dev/null +++ b/.github/grype/parse_vulnerabilities_grype.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +import json + +from testflows.core import * + +xfails = {} + + +@Name("docker vulnerabilities") +@XFails(xfails) +@TestModule +def docker_vulnerabilities(self): + with Given("I gather grype scan results"): + with open("./result.json", "r") as f: + results = json.load(f) + + for vulnerability in results["matches"]: + with Test( + f"{vulnerability['vulnerability']['id']}@{vulnerability['vulnerability']['namespace']},{vulnerability['vulnerability']['severity']}", + flags=TE, + ): + note(vulnerability) + critical_levels = set(["HIGH", "CRITICAL"]) + if vulnerability['vulnerability']["severity"].upper() in critical_levels: + with Then( + f"Found vulnerability of {vulnerability['vulnerability']['severity']} severity" + ): + result(Fail) + + +if 
main(): + docker_vulnerabilities() diff --git a/.github/grype/run_grype_scan.sh b/.github/grype/run_grype_scan.sh new file mode 100755 index 000000000000..c5ce0b1b10d3 --- /dev/null +++ b/.github/grype/run_grype_scan.sh @@ -0,0 +1,18 @@ +set -x +set -e + +IMAGE=$1 + +GRYPE_VERSION="v0.80.1" + +docker pull $IMAGE +docker pull anchore/grype:${GRYPE_VERSION} + +docker run \ + --rm --volume /var/run/docker.sock:/var/run/docker.sock \ + --name Grype anchore/grype:${GRYPE_VERSION} \ + --scope all-layers \ + -o json \ + $IMAGE > result.json + +ls -sh diff --git a/.github/grype/transform_and_upload_results_s3.sh b/.github/grype/transform_and_upload_results_s3.sh new file mode 100755 index 000000000000..7a10b02887ef --- /dev/null +++ b/.github/grype/transform_and_upload_results_s3.sh @@ -0,0 +1,13 @@ +DOCKER_IMAGE=$(echo "$DOCKER_IMAGE" | sed 's/[\/:]/_/g') + +S3_PATH="s3://$S3_BUCKET/$PR_NUMBER/$COMMIT_SHA/grype/$DOCKER_IMAGE" +HTTPS_S3_PATH="https://s3.amazonaws.com/$S3_BUCKET/$PR_NUMBER/$COMMIT_SHA/grype/$DOCKER_IMAGE" +echo "https_s3_path=$HTTPS_S3_PATH" >> $GITHUB_OUTPUT + +tfs --no-colors transform nice raw.log nice.log.txt +tfs --no-colors report results -a $HTTPS_S3_PATH raw.log - --copyright "Altinity LTD" | tfs --no-colors document convert > results.html + +aws s3 cp --no-progress nice.log.txt $S3_PATH/nice.log.txt --content-type "text/plain; charset=utf-8" || echo "nice log file not found". +aws s3 cp --no-progress results.html $S3_PATH/results.html || echo "results file not found". +aws s3 cp --no-progress raw.log $S3_PATH/raw.log || echo "raw.log file not found". +aws s3 cp --no-progress result.json $S3_PATH/result.json --content-type "text/plain; charset=utf-8" || echo "result.json not found". \ No newline at end of file diff --git a/.github/workflows/grype_scan.yml b/.github/workflows/grype_scan.yml new file mode 100644 index 000000000000..e749448b81ba --- /dev/null +++ b/.github/workflows/grype_scan.yml @@ -0,0 +1,132 @@ +name: Grype Scan +run-name: Grype Scan ${{ inputs.docker_image }} + +on: + workflow_dispatch: + # Inputs for manual run + inputs: + docker_image: + description: 'Docker image. If no tag, it will be determined by version_helper.py' + required: true + workflow_call: + # Inputs for workflow call + inputs: + docker_image: + description: 'Docker image. 
If no tag, it will be determined by version_helper.py' + required: true + type: string +env: + PYTHONUNBUFFERED: 1 + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + +jobs: + grype_scan: + name: Grype Scan + runs-on: [self-hosted, altinity-on-demand, altinity-func-tester-aarch64] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker + uses: docker/setup-buildx-action@v3 + + - name: Set up Python + run: | + export TESTFLOWS_VERSION="2.4.19" + sudo apt-get update + sudo apt-get install -y python3-pip python3-venv + python3 -m venv venv + source venv/bin/activate + pip install --upgrade requests chardet urllib3 + pip install testflows==$TESTFLOWS_VERSION awscli==1.33.28 + echo PATH=$PATH >>$GITHUB_ENV + + - name: Set image tag if not given + if: ${{ !contains(inputs.docker_image, ':') }} + id: set_version + run: | + python3 ./tests/ci/version_helper.py | tee /tmp/version_info + source /tmp/version_info + echo "docker_image=${{ inputs.docker_image }}:${{ github.event.pull_request.number || 0 }}-$CLICKHOUSE_VERSION_STRING" >> $GITHUB_OUTPUT + echo "commit_sha=$CLICKHOUSE_VERSION_GITHASH" >> $GITHUB_OUTPUT + + - name: Run Grype Scan + run: | + DOCKER_IMAGE=${{ steps.set_version.outputs.docker_image || inputs.docker_image }} + ./.github/grype/run_grype_scan.sh $DOCKER_IMAGE + + - name: Parse grype results + run: | + python3 -u ./.github/grype/parse_vulnerabilities_grype.py -o nice --no-colors --log raw.log --test-to-end + + - name: Transform and Upload Grype Results + if: always() + id: upload_results + env: + S3_BUCKET: "altinity-build-artifacts" + COMMIT_SHA: ${{ steps.set_version.outputs.commit_sha || github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + PR_NUMBER: ${{ github.event.pull_request.number || 0 }} + DOCKER_IMAGE: ${{ steps.set_version.outputs.docker_image || inputs.docker_image }} + run: | + ./.github/grype/transform_and_upload_results_s3.sh + + - name: Create step summary + if: always() + id: create_summary + run: | + jq -r '"**Image**: \(.source.target.userInput)"' result.json >> $GITHUB_STEP_SUMMARY + jq -r '.distro | "**Distro**: \(.name):\(.version)"' result.json >> $GITHUB_STEP_SUMMARY + if jq -e '.matches | length == 0' result.json > /dev/null; then + echo "No CVEs" >> $GITHUB_STEP_SUMMARY + else + echo "| Severity | Count |" >> $GITHUB_STEP_SUMMARY + echo "|------------|-------|" >> $GITHUB_STEP_SUMMARY + jq -r ' + .matches | + map(.vulnerability.severity) | + group_by(.) | + map({severity: .[0], count: length}) | + sort_by(.severity) | + map("| \(.severity) | \(.count) |") | + .[] + ' result.json >> $GITHUB_STEP_SUMMARY + fi + + HIGH_COUNT=$(jq -r '.matches | map(.vulnerability.severity) | map(select(. == "High")) | length' result.json) + CRITICAL_COUNT=$(jq -r '.matches | map(.vulnerability.severity) | map(select(. 
== "Critical")) | length' result.json) + TOTAL_HIGH_CRITICAL=$((HIGH_COUNT + CRITICAL_COUNT)) + echo "total_high_critical=$TOTAL_HIGH_CRITICAL" >> $GITHUB_OUTPUT + + if [ $TOTAL_HIGH_CRITICAL -gt 0 ]; then + echo '## High and Critical vulnerabilities found' >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + cat raw.log | tfs --no-colors show tests | grep -Pi 'High|Critical' >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + fi + + - name: Set commit status + if: always() + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: '${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}', + state: '${{ steps.create_summary.outputs.total_high_critical > 0 && 'failure' || 'success' }}', + target_url: '${{ steps.upload_results.outputs.https_s3_path }}/results.html', + description: 'Grype Scan Completed with ${{ steps.create_summary.outputs.total_high_critical }} high/critical vulnerabilities', + context: 'Grype Scan ${{ steps.set_version.outputs.docker_image || inputs.docker_image }}' + }) + + - name: Upload artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: grype-results-${{ hashFiles('raw.log') }} + path: | + result.json + nice.log.txt diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 8c60559a4399..10dc41627c07 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -577,6 +577,23 @@ jobs: test_name: Sign aarch64 runner_type: altinity-on-demand, altinity-func-tester data: ${{ needs.RunConfig.outputs.data }} + GrypeScan: + needs: [RunConfig, DockerServerImage, DockerKeeperImage] + if: ${{ !failure() && !cancelled() }} + strategy: + fail-fast: false + matrix: + include: + - image: server + suffix: '' + - image: server + suffix: '-alpine' + - image: keeper + suffix: '' + uses: ./.github/workflows/grype_scan.yml + secrets: inherit + with: + docker_image: altinityinfra/clickhouse-${{ matrix.image }}:${{ github.event.pull_request.number || 0 }}-${{ fromJson(needs.RunConfig.outputs.data).version }}${{ matrix.suffix }} FinishCheck: if: ${{ !cancelled() }} needs: @@ -611,6 +628,7 @@ jobs: - CompatibilityCheckAarch64 - RegressionTestsRelease - RegressionTestsAarch64 + - GrypeScan - SignRelease runs-on: [self-hosted, altinity-on-demand, altinity-style-checker-aarch64] steps: From bf86f0ecaab0066821c4e8e367ef7ebaabfe3a09 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Wed, 21 May 2025 20:38:28 -0400 Subject: [PATCH 39/67] update report to new format --- .github/create_combined_ci_report.py | 295 ----------- .github/create_workflow_report.py | 653 +++++++++++++++++++++++++ .github/workflows/release_branches.yml | 4 +- 3 files changed, 655 insertions(+), 297 deletions(-) delete mode 100755 .github/create_combined_ci_report.py create mode 100755 .github/create_workflow_report.py diff --git a/.github/create_combined_ci_report.py b/.github/create_combined_ci_report.py deleted file mode 100755 index 07b6a80763bb..000000000000 --- a/.github/create_combined_ci_report.py +++ /dev/null @@ -1,295 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import os -from pathlib import Path -from itertools import combinations -import json - -import requests -from clickhouse_driver import Client -import boto3 -from botocore.exceptions import NoCredentialsError - 
-DATABASE_HOST_VAR = "CHECKS_DATABASE_HOST" -DATABASE_USER_VAR = "CHECKS_DATABASE_USER" -DATABASE_PASSWORD_VAR = "CHECKS_DATABASE_PASSWORD" -S3_BUCKET = "altinity-build-artifacts" - - -def get_checks_fails(client: Client, job_url: str): - """ - Get tests that did not succeed for the given job URL. - Exclude checks that have status 'error' as they are counted in get_checks_errors. - """ - columns = ( - "check_status, check_name, test_status, test_name, report_url as results_link" - ) - query = f"""SELECT {columns} FROM `gh-data`.checks - WHERE task_url='{job_url}' - AND test_status IN ('FAIL', 'ERROR') - AND check_status!='error' - ORDER BY check_name, test_name - """ - return client.query_dataframe(query) - - -def get_checks_known_fails(client: Client, job_url: str, known_fails: dict): - """ - Get tests that are known to fail for the given job URL. - """ - assert len(known_fails) > 0, "cannot query the database with empty known fails" - columns = ( - "check_status, check_name, test_status, test_name, report_url as results_link" - ) - query = f"""SELECT {columns} FROM `gh-data`.checks - WHERE task_url='{job_url}' - AND test_status='BROKEN' - AND test_name IN ({','.join(f"'{test}'" for test in known_fails.keys())}) - ORDER BY test_name, check_name - """ - - df = client.query_dataframe(query) - - df.insert( - len(df.columns) - 1, - "reason", - df["test_name"] - .astype(str) - .apply( - lambda test_name: known_fails[test_name].get("reason", "No reason given") - ), - ) - - return df - - -def get_checks_errors(client: Client, job_url: str): - """ - Get checks that have status 'error' for the given job URL. - """ - columns = ( - "check_status, check_name, test_status, test_name, report_url as results_link" - ) - query = f"""SELECT {columns} FROM `gh-data`.checks - WHERE task_url='{job_url}' - AND check_status=='error' - ORDER BY check_name, test_name - """ - return client.query_dataframe(query) - - -def drop_prefix_rows(df, column_to_clean): - """ - Drop rows from the dataframe if: - - the row matches another row completely except for the specified column - - the specified column of that row is a prefix of the same column in another row - """ - to_drop = set() - reference_columns = [col for col in df.columns if col != column_to_clean] - for (i, row_1), (j, row_2) in combinations(df.iterrows(), 2): - if all(row_1[col] == row_2[col] for col in reference_columns): - if row_2[column_to_clean].startswith(row_1[column_to_clean]): - to_drop.add(i) - elif row_1[column_to_clean].startswith(row_2[column_to_clean]): - to_drop.add(j) - return df.drop(to_drop) - - -def get_regression_fails(client: Client, job_url: str): - """ - Get regression tests that did not succeed for the given job URL. 
- """ - # If you rename the alias for report_url, also update the formatters in format_results_as_html_table - # Nested SELECT handles test reruns - query = f"""SELECT arch, job_name, status, test_name, results_link - FROM ( - SELECT - architecture as arch, - test_name, - argMax(result, start_time) AS status, - job_url, - job_name, - report_url as results_link - FROM `gh-data`.clickhouse_regression_results - GROUP BY architecture, test_name, job_url, job_name, report_url - ORDER BY length(test_name) DESC - ) - WHERE job_url='{job_url}' - AND status IN ('Fail', 'Error') - """ - - df = client.query_dataframe(query) - df = drop_prefix_rows(df, "test_name") - df["job_name"] = df["job_name"].str.title() - return df - - -def url_to_html_link(url: str) -> str: - if not url: - return "" - text = url.split("/")[-1] - if not text: - text = "results" - return f'{text}' - - -def format_test_name_for_linewrap(text: str) -> str: - """Tweak the test name to improve line wrapping.""" - return text.replace(".py::", "/") - - -def format_results_as_html_table(results) -> str: - if len(results) == 0: - return "

<p>Nothing to report</p>
" - results.columns = [col.replace("_", " ").title() for col in results.columns] - html = ( - results.to_html( - index=False, - formatters={ - "Results Link": url_to_html_link, - "Test Name": format_test_name_for_linewrap, - }, - escape=False, - ) # tbody/thead tags interfere with the table sorting script - .replace("\n", "") - .replace("\n", "") - .replace("\n", "") - .replace("\n", "") - .replace(' argparse.Namespace: - parser = argparse.ArgumentParser(description="Create a combined CI report.") - parser.add_argument( - "--actions-run-url", required=True, help="URL of the actions run" - ) - parser.add_argument( - "--pr-number", required=True, help="Pull request number for the S3 path" - ) - parser.add_argument( - "--commit-sha", required=True, help="Commit SHA for the S3 path" - ) - parser.add_argument( - "--no-upload", action="store_true", help="Do not upload the report" - ) - parser.add_argument( - "--known-fails", type=str, help="Path to the file with known fails" - ) - parser.add_argument( - "--mark-preview", action="store_true", help="Mark the report as a preview" - ) - return parser.parse_args() - - -def main(): - args = parse_args() - - db_client = Client( - host=os.getenv(DATABASE_HOST_VAR), - user=os.getenv(DATABASE_USER_VAR), - password=os.getenv(DATABASE_PASSWORD_VAR), - port=9440, - secure="y", - verify=False, - settings={"use_numpy": True}, - ) - - s3_path = ( - f"https://s3.amazonaws.com/{S3_BUCKET}/{args.pr_number}/{args.commit_sha}/" - ) - report_destination_url = s3_path + "combined_report.html" - ci_running_report_url = s3_path + "ci_running.html" - - response = requests.get(ci_running_report_url) - if response.status_code == 200: - ci_running_report: str = response.text - else: - print( - f"Failed to download CI running report. Status code: {response.status_code}, Response: {response.text}" - ) - exit(1) - - fail_results = { - "checks_fails": get_checks_fails(db_client, args.actions_run_url), - "checks_known_fails": [], - "checks_errors": get_checks_errors(db_client, args.actions_run_url), - "regression_fails": get_regression_fails(db_client, args.actions_run_url), - } - - if args.known_fails: - if not os.path.exists(args.known_fails): - print(f"Known fails file {args.known_fails} not found.") - exit(1) - - with open(args.known_fails) as f: - known_fails = json.load(f) - - if known_fails: - fail_results["checks_known_fails"] = get_checks_known_fails( - db_client, args.actions_run_url, known_fails - ) - - combined_report = ( - ci_running_report.replace("ClickHouse CI running for", "Combined CI Report for") - .replace( - "
", - f"""

Table of Contents

-{'

This is a preview. FinishCheck has not completed.

' if args.mark_preview else ""} - - -

CI Jobs Status

-
""", - 1, - ) - .replace( - "
", - f""" - -

Checks Errors

-{format_results_as_html_table(fail_results['checks_errors'])} - -

Checks New Fails

-{format_results_as_html_table(fail_results['checks_fails'])} - -

Regression New Fails

-{format_results_as_html_table(fail_results['regression_fails'])} - -

Checks Known Fails

-{format_results_as_html_table(fail_results['checks_known_fails'])} -""", - 1, - ) - ) - report_path = Path("combined_report.html") - report_path.write_text(combined_report, encoding="utf-8") - - if args.no_upload: - print(f"Report saved to {report_path}") - exit(0) - - # Upload the report to S3 - s3_client = boto3.client("s3") - - try: - s3_client.put_object( - Bucket=S3_BUCKET, - Key=f"{args.pr_number}/{args.commit_sha}/combined_report.html", - Body=combined_report, - ContentType="text/html; charset=utf-8", - ) - except NoCredentialsError: - print("Credentials not available for S3 upload.") - - print(report_destination_url) - - -if __name__ == "__main__": - main() diff --git a/.github/create_workflow_report.py b/.github/create_workflow_report.py new file mode 100755 index 000000000000..a7f30f72aedf --- /dev/null +++ b/.github/create_workflow_report.py @@ -0,0 +1,653 @@ +#!/usr/bin/env python3 +import argparse +import os +from pathlib import Path +from itertools import combinations +import json +from datetime import datetime + +import requests +import pandas as pd +from clickhouse_driver import Client +import boto3 +from botocore.exceptions import NoCredentialsError +import pandas as pd + +DATABASE_HOST_VAR = "CHECKS_DATABASE_HOST" +DATABASE_USER_VAR = "CHECKS_DATABASE_USER" +DATABASE_PASSWORD_VAR = "CHECKS_DATABASE_PASSWORD" +S3_BUCKET = "altinity-build-artifacts" +GITHUB_REPO = "Altinity/ClickHouse" + + +css = """ + /* Base colors for Altinity */ + :root { + --altinity-background: #000D45; + --altinity-accent: #189DCF; + --altinity-highlight: #FFC600; + --altinity-gray: #6c757d; + --altinity-light-gray: #f8f9fa; + --altinity-white: #ffffff; + } + + /* Body and heading fonts */ + body { + font-family: Arimo, "Proxima Nova", "Helvetica Neue", Helvetica, Arial, sans-serif; + font-size: 1rem; + background-color: var(--altinity-background); + color: var(--altinity-light-gray); + padding: 2rem; + } + + h1, h2, h3, h4, h5, h6 { + font-family: Figtree, "Proxima Nova", "Helvetica Neue", Helvetica, Arial, sans-serif; + color: var(--altinity-white); + } + + .logo { + width: auto; + height: 5em; + } + + /* General table styling */ + table { + min-width: min(900px, 98vw); + margin: 1rem 0; + border-collapse: collapse; + background-color: var(--altinity-white); + border: 1px solid var(--altinity-accent); + box-shadow: 0 0 8px rgba(0, 0, 0, 0.05); + color: var(--altinity-background); + } + + /* Table header styling */ + th { + background-color: var(--altinity-accent); + color: var(--altinity-white); + padding: 10px 16px; + text-align: left; + border: none; + border-bottom: 2px solid var(--altinity-background); + white-space: nowrap; + } + th.hth { + border-bottom: 1px solid var(--altinity-accent); + border-right: 2px solid var(--altinity-background); + } + + /* Table header sorting styling */ + th { + cursor: pointer; + } + th.no-sort { + pointer-events: none; + } + th::after, + th::before { + transition: color 0.2s ease-in-out; + font-size: 1.2em; + color: transparent; + } + th::after { + margin-left: 3px; + content: '\\025B8'; + } + th:hover::after { + color: inherit; + } + th.dir-d::after { + color: inherit; + content: '\\025BE'; + } + th.dir-u::after { + color: inherit; + content: '\\025B4'; + } + + /* Table body row styling */ + tr:hover { + background-color: var(--altinity-light-gray); + } + + /* Table cell styling */ + td { + padding: 8px 8px; + border: 1px solid var(--altinity-accent); + } + + /* Link styling */ + a { + color: var(--altinity-accent); + text-decoration: none; + } + a:hover { + 
color: var(--altinity-highlight); + text-decoration: underline; + } +""" + +script = """ + +""" + +logo = """ +

+""" + + +def get_commit_statuses(sha: str) -> pd.DataFrame: + """ + Fetch commit statuses for a given SHA and return as a pandas DataFrame. + Handles pagination to get all statuses. + + Args: + sha (str): Commit SHA to fetch statuses for. + + Returns: + pd.DataFrame: DataFrame containing all statuses. + """ + headers = { + "Authorization": f"token {os.getenv('GITHUB_TOKEN')}", + "Accept": "application/vnd.github.v3+json", + } + + url = f"https://api.github.com/repos/{GITHUB_REPO}/commits/{sha}/statuses" + + all_data = [] + + while url: + response = requests.get(url, headers=headers) + + if response.status_code != 200: + raise Exception( + f"Failed to fetch statuses: {response.status_code} {response.text}" + ) + + data = response.json() + all_data.extend(data) + + # Check for pagination links in the response headers + if "Link" in response.headers: + links = response.headers["Link"].split(",") + next_url = None + + for link in links: + parts = link.strip().split(";") + if len(parts) == 2 and 'rel="next"' in parts[1]: + next_url = parts[0].strip("<>") + break + + url = next_url + else: + url = None + + # Parse relevant fields + parsed = [ + { + "job_name": item["context"], + "job_status": item["state"], + "message": item["description"], + "results_link": item["target_url"], + } + for item in all_data + ] + + return ( + pd.DataFrame(parsed) + .sort_values(by=["job_status", "job_name"], ascending=[True, True]) + .reset_index(drop=True) + ) + + +def get_pr_info_from_number(pr_number: str) -> dict: + """ + Fetch pull request information for a given PR number. + + Args: + pr_number (str): Pull request number to fetch information for. + + Returns: + dict: Dictionary containing PR information. + """ + headers = { + "Authorization": f"token {os.getenv('GITHUB_TOKEN')}", + "Accept": "application/vnd.github.v3+json", + } + + url = f"https://api.github.com/repos/{GITHUB_REPO}/pulls/{pr_number}" + response = requests.get(url, headers=headers) + + if response.status_code != 200: + raise Exception( + f"Failed to fetch pull request info: {response.status_code} {response.text}" + ) + + return response.json() + + +def get_checks_fails(client: Client, job_url: str): + """ + Get tests that did not succeed for the given job URL. + Exclude checks that have status 'error' as they are counted in get_checks_errors. + """ + columns = "check_status as job_status, check_name as job_name, test_status, test_name, report_url as results_link" + query = f"""SELECT {columns} FROM `gh-data`.checks + WHERE task_url LIKE '{job_url}%' + AND test_status IN ('FAIL', 'ERROR') + AND check_status!='error' + ORDER BY check_name, test_name + """ + return client.query_dataframe(query) + + +def get_checks_known_fails(client: Client, job_url: str, known_fails: dict): + """ + Get tests that are known to fail for the given job URL. 
+ """ + assert len(known_fails) > 0, "cannot query the database with empty known fails" + columns = "check_status as job_status, check_name as job_name, test_status, test_name, report_url as results_link" + query = f"""SELECT {columns} FROM `gh-data`.checks + WHERE task_url LIKE '{job_url}%' + AND test_status='BROKEN' + AND test_name IN ({','.join(f"'{test}'" for test in known_fails.keys())}) + ORDER BY test_name, check_name + """ + + df = client.query_dataframe(query) + + df.insert( + len(df.columns) - 1, + "reason", + df["test_name"] + .astype(str) + .apply( + lambda test_name: known_fails[test_name].get("reason", "No reason given") + ), + ) + + return df + + +def get_checks_errors(client: Client, job_url: str): + """ + Get checks that have status 'error' for the given job URL. + """ + columns = "check_status as job_status, check_name as job_name, test_status, test_name, report_url as results_link" + query = f"""SELECT {columns} FROM `gh-data`.checks + WHERE task_url LIKE '{job_url}%' + AND check_status=='error' + ORDER BY check_name, test_name + """ + return client.query_dataframe(query) + + +def drop_prefix_rows(df, column_to_clean): + """ + Drop rows from the dataframe if: + - the row matches another row completely except for the specified column + - the specified column of that row is a prefix of the same column in another row + """ + to_drop = set() + reference_columns = [col for col in df.columns if col != column_to_clean] + for (i, row_1), (j, row_2) in combinations(df.iterrows(), 2): + if all(row_1[col] == row_2[col] for col in reference_columns): + if row_2[column_to_clean].startswith(row_1[column_to_clean]): + to_drop.add(i) + elif row_1[column_to_clean].startswith(row_2[column_to_clean]): + to_drop.add(j) + return df.drop(to_drop) + + +def get_regression_fails(client: Client, job_url: str): + """ + Get regression tests that did not succeed for the given job URL. 
+ """ + # If you rename the alias for report_url, also update the formatters in format_results_as_html_table + # Nested SELECT handles test reruns + query = f"""SELECT arch, job_name, status, test_name, results_link + FROM ( + SELECT + architecture as arch, + test_name, + argMax(result, start_time) AS status, + job_url, + job_name, + report_url as results_link + FROM `gh-data`.clickhouse_regression_results + GROUP BY architecture, test_name, job_url, job_name, report_url + ORDER BY length(test_name) DESC + ) + WHERE job_url='{job_url}' + AND status IN ('Fail', 'Error') + """ + df = client.query_dataframe(query) + df = drop_prefix_rows(df, "test_name") + df["job_name"] = df["job_name"].str.title() + return df + + +def get_cves(pr_number, commit_sha): + s3_client = boto3.client("s3", endpoint_url=os.getenv("S3_URL")) + s3_prefix = f"{pr_number}/{commit_sha}/grype/" + + results = [] + + response = s3_client.list_objects_v2( + Bucket=S3_BUCKET, Prefix=s3_prefix, Delimiter="/" + ) + grype_result_dirs = [ + content["Prefix"] for content in response.get("CommonPrefixes", []) + ] + + for path in grype_result_dirs: + file_key = f"{path}result.json" + file_response = s3_client.get_object(Bucket=S3_BUCKET, Key=file_key) + content = file_response["Body"].read().decode("utf-8") + results.append(json.loads(content)) + + rows = [] + for scan_result in results: + for match in scan_result["matches"]: + rows.append( + { + "docker_image": scan_result["source"]["target"]["userInput"], + "severity": match["vulnerability"]["severity"], + "identifier": match["vulnerability"]["id"], + "namespace": match["vulnerability"]["namespace"], + } + ) + + if len(rows) == 0: + return pd.DataFrame() + + df = pd.DataFrame(rows).drop_duplicates() + df = df.sort_values( + by="severity", + key=lambda col: col.str.lower().map( + {"critical": 1, "high": 2, "medium": 3, "low": 4, "negligible": 5} + ), + ) + return df + + +def url_to_html_link(url: str) -> str: + if not url: + return "" + text = url.split("/")[-1].replace("__", "_") + if not text: + text = "results" + return f'{text}' + + +def format_test_name_for_linewrap(text: str) -> str: + """Tweak the test name to improve line wrapping.""" + return f'{text}' + + +def format_test_status(text: str) -> str: + """Format the test status for better readability.""" + color = ( + "red" + if text.lower().startswith("fail") + else "orange" if text.lower() in ("error", "broken") else "green" + ) + return f'{text}' + + +def format_results_as_html_table(results) -> str: + if len(results) == 0: + return "

<p>Nothing to report</p>

" + results.columns = [col.replace("_", " ").title() for col in results.columns] + html = results.to_html( + index=False, + formatters={ + "Results Link": url_to_html_link, + "Test Name": format_test_name_for_linewrap, + "Test Status": format_test_status, + "Job Status": format_test_status, + "Status": format_test_status, + "Message": lambda m: m.replace("\n", " "), + "Identifier": lambda i: url_to_html_link( + "https://nvd.nist.gov/vuln/detail/" + i + ), + }, + escape=False, + ).replace(' border="1"', "") + return html + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Create a combined CI report.") + parser.add_argument( + "--actions-run-url", required=True, help="URL of the actions run" + ) + parser.add_argument( + "--pr-number", required=True, help="Pull request number for the S3 path" + ) + parser.add_argument( + "--commit-sha", required=True, help="Commit SHA for the S3 path" + ) + parser.add_argument( + "--no-upload", action="store_true", help="Do not upload the report" + ) + parser.add_argument( + "--known-fails", type=str, help="Path to the file with known fails" + ) + parser.add_argument( + "--cves", action="store_true", help="Get CVEs from Grype results" + ) + parser.add_argument( + "--mark-preview", action="store_true", help="Mark the report as a preview" + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + db_client = Client( + host=os.getenv(DATABASE_HOST_VAR), + user=os.getenv(DATABASE_USER_VAR), + password=os.getenv(DATABASE_PASSWORD_VAR), + port=9440, + secure="y", + verify=False, + settings={"use_numpy": True}, + ) + + fail_results = { + "job_statuses": get_commit_statuses(args.commit_sha), + "checks_fails": get_checks_fails(db_client, args.actions_run_url), + "checks_known_fails": [], + "checks_errors": get_checks_errors(db_client, args.actions_run_url), + "regression_fails": get_regression_fails(db_client, args.actions_run_url), + "docker_images_cves": ( + [] if not args.cves else get_cves(args.pr_number, args.commit_sha) + ), + } + + if args.known_fails: + if not os.path.exists(args.known_fails): + print(f"Known fails file {args.known_fails} not found.") + exit(1) + + with open(args.known_fails) as f: + known_fails = json.load(f) + + if known_fails: + fail_results["checks_known_fails"] = get_checks_known_fails( + db_client, args.actions_run_url, known_fails + ) + + if args.pr_number == "0": + pr_info_html = "Release" + else: + try: + pr_info = get_pr_info_from_number(args.pr_number) + pr_info_html = f""" + #{pr_info.get("number")} ({pr_info.get("base", {}).get('ref')} <- {pr_info.get("head", {}).get('ref')}) {pr_info.get("title")} + """ + except Exception as e: + pr_info_html = e + + high_cve_count = 0 + if len(fail_results["docker_images_cves"]) > 0: + high_cve_count = ( + fail_results["docker_images_cves"]["severity"] + .str.lower() + .isin(("high", "critical")) + .sum() + ) + + title = "ClickHouse® CI Workflow Run Report" + + html_report = f""" + + + + + + + {title} + + + {logo} +

{title}

+ + + + + + + + + + + + + +
Pull Request{pr_info_html}
Workflow Run{args.actions_run_url.split('/')[-1]}
Commit{args.commit_sha}
Date{datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC
+ +

Table of Contents

+{'

<p>This is a preview. FinishCheck has not completed.</p>

' if args.mark_preview else ""} + + +

CI Jobs Status

+{format_results_as_html_table(fail_results['job_statuses'])} + +

Checks Errors

+{format_results_as_html_table(fail_results['checks_errors'])} + +

Checks New Fails

+{format_results_as_html_table(fail_results['checks_fails'])} + +

Regression New Fails

+{format_results_as_html_table(fail_results['regression_fails'])} + +

Docker Images CVEs

+{"

<p>Not Checked</p>

" if not args.cves else format_results_as_html_table(fail_results['docker_images_cves'])} + +

Checks Known Fails

+{"

<p>Not Checked</p>

" if not args.known_fails else format_results_as_html_table(fail_results['checks_known_fails'])} + +{script} + + +""" + report_name = "ci_run_report.html" + report_path = Path(report_name) + report_path.write_text(html_report, encoding="utf-8") + + if args.no_upload: + print(f"Report saved to {report_path}") + exit(0) + + report_destination_key = f"{args.pr_number}/{args.commit_sha}/{report_name}" + + # Upload the report to S3 + s3_client = boto3.client("s3", endpoint_url=os.getenv("S3_URL")) + + try: + s3_client.put_object( + Bucket=S3_BUCKET, + Key=report_destination_key, + Body=html_report, + ContentType="text/html; charset=utf-8", + ) + except NoCredentialsError: + print("Credentials not available for S3 upload.") + + print(f"https://s3.amazonaws.com/{S3_BUCKET}/" + report_destination_key) + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 10dc41627c07..c5ca0080ec4b 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -660,9 +660,9 @@ jobs: ACTIONS_RUN_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }} shell: bash run: | - pip install clickhouse-driver==0.2.8 numpy==1.26.4 pandas==2.2.0 + pip install clickhouse-driver==0.2.8 numpy==1.26.4 pandas==2.0.3 - REPORT_LINK=$(python3 .github/create_combined_ci_report.py --pr-number $PR_NUMBER --commit-sha $COMMIT_SHA --actions-run-url $ACTIONS_RUN_URL --known-fails tests/broken_tests.json) + REPORT_LINK=$(python3 .github/create_workflow_report.py --pr-number $PR_NUMBER --commit-sha $COMMIT_SHA --actions-run-url $ACTIONS_RUN_URL --known-fails tests/broken_tests.json --cves) IS_VALID_URL=$(echo $REPORT_LINK | grep -E '^https?://') if [[ -n $IS_VALID_URL ]]; then From 7eda4d0be3ec6d2ecb98bd5dfee584ace493c6e7 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> Date: Wed, 28 May 2025 07:05:20 +0000 Subject: [PATCH 40/67] Merge pull request #79975 from zvonand/add-headers-in-handlers Allow to add `http_response_headers` in `http_handlers` of any type --- docs/en/interfaces/http.md | 37 ++++- src/Server/HTTPHandler.cpp | 10 +- src/Server/HTTPHandlerFactory.cpp | 104 +++++++++++-- src/Server/HTTPHandlerFactory.h | 12 +- src/Server/HTTPResponseHeaderWriter.cpp | 21 +++ src/Server/HTTPResponseHeaderWriter.h | 12 ++ src/Server/PrometheusRequestHandler.cpp | 5 +- src/Server/PrometheusRequestHandler.h | 4 +- .../PrometheusRequestHandlerFactory.cpp | 14 +- src/Server/PrometheusRequestHandlerFactory.h | 3 +- src/Server/ReplicasStatusHandler.cpp | 14 +- src/Server/ReplicasStatusHandler.h | 10 ++ src/Server/StaticRequestHandler.cpp | 5 +- src/Server/WebUIRequestHandler.cpp | 22 ++- src/Server/WebUIRequestHandler.h | 47 ++++++ .../test_http_handlers_config/test.py | 28 ++++ .../test_headers_in_response/config.xml | 146 ++++++++++++++++++ 17 files changed, 452 insertions(+), 42 deletions(-) create mode 100644 tests/integration/test_http_handlers_config/test_headers_in_response/config.xml diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 03fdfa048c8d..44e460dadbfb 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -791,7 +791,42 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_relative_path_static_handler' * Connection #0 to host localhost left intact ``` -## Valid JSON/XML response on exception during HTTP streaming {valid-output-on-exception-http-streaming} +## HTTP Response Headers 
{#http-response-headers} + +ClickHouse allows you to configure custom HTTP response headers that can be applied to any configurable handler type. These headers can be set using the `http_response_headers` setting, which accepts key-value pairs representing header names and their values. This feature is particularly useful for implementing custom security headers, CORS policies, or any other HTTP header requirements across your ClickHouse HTTP interface. + +For example, you can configure headers for: +- Regular query endpoints +- Web UI +- Health check endpoints + +It is also possible to specify `common_http_response_headers`. These will be applied to all HTTP handlers defined in the configuration. + +The headers will be included in the HTTP response for every configured handler. + +In the example below, every server response will contain two custom headers: `X-My-Common-Header` and `X-My-Custom-Header`. + +```xml +<clickhouse> + <http_handlers> + <common_http_response_headers> + <X-My-Common-Header>Common header</X-My-Common-Header> + </common_http_response_headers> + <rule> + <methods>GET</methods> + <url>/ping</url> + <handler> + <type>ping</type> + <http_response_headers> + <X-My-Custom-Header>Custom indeed</X-My-Custom-Header> + </http_response_headers> + </handler> + </rule> + </http_handlers> +</clickhouse> +``` + +## Valid JSON/XML response on exception during HTTP streaming {#valid-output-on-exception-http-streaming} While query execution over HTTP an exception can happen when part of the data has already been sent. Usually an exception is sent to the client in plain text even if some specific data format was used to output data and the output may become invalid in terms of specified data format. diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index d2bc22e98cc6..8d8174cb172c 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -895,11 +895,14 @@ std::string PredefinedQueryHandler::getQuery(HTTPServerRequest & request, HTMLFo HTTPRequestHandlerFactoryPtr createDynamicHandlerFactory(IServer & server, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix) + const std::string & config_prefix, + std::unordered_map & common_headers) { auto query_param_name = config.getString(config_prefix + ".handler.query_param_name", "query"); HTTPResponseHeaderSetup http_response_headers_override = parseHTTPResponseHeaders(config, config_prefix); + if (http_response_headers_override.has_value()) + http_response_headers_override.value().insert(common_headers.begin(), common_headers.end()); auto creator = [&server, query_param_name, http_response_headers_override]() -> std::unique_ptr { return std::make_unique(server, query_param_name, http_response_headers_override); }; @@ -932,7 +935,8 @@ static inline CompiledRegexPtr getCompiledRegex(const std::string & expression) HTTPRequestHandlerFactoryPtr createPredefinedHandlerFactory(IServer & server, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix) + const std::string & config_prefix, + std::unordered_map & common_headers) { if (!config.has(config_prefix + ".handler.query")) throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "There is no path '{}.handler.query' in configuration file.", config_prefix); @@ -958,6 +962,8 @@ HTTPRequestHandlerFactoryPtr createPredefinedHandlerFactory(IServer & server, } HTTPResponseHeaderSetup http_response_headers_override = parseHTTPResponseHeaders(config, config_prefix); + if (http_response_headers_override.has_value()) + http_response_headers_override.value().insert(common_headers.begin(), common_headers.end()); std::shared_ptr> factory; diff --git a/src/Server/HTTPHandlerFactory.cpp b/src/Server/HTTPHandlerFactory.cpp index fc31ad2874ef..2aa2a56696ba 100644 --- a/src/Server/HTTPHandlerFactory.cpp +++ 
b/src/Server/HTTPHandlerFactory.cpp @@ -13,6 +13,7 @@ #include "InterserverIOHTTPHandler.h" #include "WebUIRequestHandler.h" +#include namespace DB { @@ -31,27 +32,35 @@ class RedirectRequestHandler : public HTTPRequestHandler { private: std::string url; + std::unordered_map http_response_headers_override; public: - explicit RedirectRequestHandler(std::string url_) - : url(std::move(url_)) + explicit RedirectRequestHandler(std::string url_, std::unordered_map http_response_headers_override_ = {}) + : url(std::move(url_)), http_response_headers_override(http_response_headers_override_) { } void handleRequest(HTTPServerRequest &, HTTPServerResponse & response, const ProfileEvents::Event &) override { + applyHTTPResponseHeaders(response, http_response_headers_override); response.redirect(url); } }; HTTPRequestHandlerFactoryPtr createRedirectHandlerFactory( const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix) + const std::string & config_prefix, + std::unordered_map common_headers) { std::string url = config.getString(config_prefix + ".handler.location"); + auto headers = parseHTTPResponseHeadersWithCommons(config, config_prefix, common_headers); + auto factory = std::make_shared>( - [my_url = std::move(url)]() { return std::make_unique(my_url); }); + [my_url = std::move(url), headers_override = std::move(headers)]() + { + return std::make_unique(my_url, headers_override); + }); factory->addFiltersFromConfig(config, config_prefix); return factory; @@ -78,6 +87,33 @@ static auto createPingHandlerFactory(IServer & server) return std::make_shared>(std::move(creator)); } +static auto createPingHandlerFactory(IServer & server, const Poco::Util::AbstractConfiguration & config, const String & config_prefix, + std::unordered_map common_headers) +{ + auto creator = [&server,&config,config_prefix,common_headers]() -> std::unique_ptr + { + constexpr auto ping_response_expression = "Ok.\n"; + + auto headers = parseHTTPResponseHeadersWithCommons(config, config_prefix, "text/html; charset=UTF-8", common_headers); + + return std::make_unique( + server, ping_response_expression, headers); + }; + return std::make_shared>(std::move(creator)); +} + +template +static auto createWebUIHandlerFactory(IServer & server, const Poco::Util::AbstractConfiguration & config, const String & config_prefix, + std::unordered_map common_headers) +{ + auto creator = [&server,&config,config_prefix,common_headers]() -> std::unique_ptr + { + auto headers = parseHTTPResponseHeadersWithCommons(config, config_prefix, "text/html; charset=UTF-8", common_headers); + return std::make_unique(server, headers); + }; + return std::make_shared>(std::move(creator)); +} + static inline auto createHandlersFactoryFromConfig( IServer & server, const Poco::Util::AbstractConfiguration & config, @@ -90,6 +126,19 @@ static inline auto createHandlersFactoryFromConfig( Poco::Util::AbstractConfiguration::Keys keys; config.keys(prefix, keys); + std::unordered_map common_headers_override; + + if (std::find(keys.begin(), keys.end(), "common_http_response_headers") != keys.end()) + { + auto common_headers_prefix = prefix + ".common_http_response_headers"; + Poco::Util::AbstractConfiguration::Keys headers_keys; + config.keys(common_headers_prefix, headers_keys); + for (const auto & header_key : headers_keys) + { + common_headers_override[header_key] = config.getString(common_headers_prefix + "." 
+ header_key); + } + } + for (const auto & key : keys) { if (key == "defaults") @@ -106,50 +155,73 @@ static inline auto createHandlersFactoryFromConfig( if (handler_type == "static") { - main_handler_factory->addHandler(createStaticHandlerFactory(server, config, prefix + "." + key)); + main_handler_factory->addHandler(createStaticHandlerFactory(server, config, prefix + "." + key, common_headers_override)); } else if (handler_type == "redirect") { - main_handler_factory->addHandler(createRedirectHandlerFactory(config, prefix + "." + key)); + main_handler_factory->addHandler(createRedirectHandlerFactory(config, prefix + "." + key, common_headers_override)); } else if (handler_type == "dynamic_query_handler") { - main_handler_factory->addHandler(createDynamicHandlerFactory(server, config, prefix + "." + key)); + main_handler_factory->addHandler(createDynamicHandlerFactory(server, config, prefix + "." + key, common_headers_override)); } else if (handler_type == "predefined_query_handler") { - main_handler_factory->addHandler(createPredefinedHandlerFactory(server, config, prefix + "." + key)); + main_handler_factory->addHandler(createPredefinedHandlerFactory(server, config, prefix + "." + key, common_headers_override)); } else if (handler_type == "prometheus") { main_handler_factory->addHandler( - createPrometheusHandlerFactoryForHTTPRule(server, config, prefix + "." + key, async_metrics)); + createPrometheusHandlerFactoryForHTTPRule(server, config, prefix + "." + key, async_metrics, common_headers_override)); } else if (handler_type == "replicas_status") { - main_handler_factory->addHandler(createReplicasStatusHandlerFactory(server, config, prefix + "." + key)); + main_handler_factory->addHandler(createReplicasStatusHandlerFactory(server, config, prefix + "." + key, common_headers_override)); } else if (handler_type == "ping") { - auto handler = createPingHandlerFactory(server); - handler->addFiltersFromConfig(config, prefix + "." + key); + const String config_prefix = prefix + "." + key; + auto handler = createPingHandlerFactory(server, config, config_prefix, common_headers_override); + handler->addFiltersFromConfig(config, config_prefix); main_handler_factory->addHandler(std::move(handler)); } else if (handler_type == "play") { - auto handler = std::make_shared>(server); + auto handler = createWebUIHandlerFactory(server, config, prefix + "." + key, common_headers_override); handler->addFiltersFromConfig(config, prefix + "." + key); main_handler_factory->addHandler(std::move(handler)); } else if (handler_type == "dashboard") { - auto handler = std::make_shared>(server); + auto handler = createWebUIHandlerFactory(server, config, prefix + "." + key, common_headers_override); handler->addFiltersFromConfig(config, prefix + "." + key); main_handler_factory->addHandler(std::move(handler)); } else if (handler_type == "binary") { - auto handler = std::make_shared>(server); + auto handler = createWebUIHandlerFactory(server, config, prefix + "." + key, common_headers_override); + handler->addFiltersFromConfig(config, prefix + "." + key); + main_handler_factory->addHandler(std::move(handler)); + } + else if (handler_type == "merges") + { + auto handler = createWebUIHandlerFactory(server, config, prefix + "." + key, common_headers_override); + handler->addFiltersFromConfig(config, prefix + "." 
+ key); + main_handler_factory->addHandler(std::move(handler)); + } + else if (handler_type == "js") + { + // NOTE: JavaScriptWebUIRequestHandler only makes sense for paths other then /js/uplot.js, /js/lz-string.js + // because these paths are hardcoded in dashboard.html + const auto & path = config.getString(prefix + "." + key + ".url", ""); + if (path != "/js/uplot.js" && path != "/js/lz-string.js") + { + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, + "Handler type 'js' is only supported for url '/js/'. " + "Configured path here: {}", path); + } + + auto handler = createWebUIHandlerFactory(server, config, prefix + "." + key, common_headers_override); handler->addFiltersFromConfig(config, prefix + "." + key); main_handler_factory->addHandler(std::move(handler)); } @@ -157,7 +229,7 @@ static inline auto createHandlersFactoryFromConfig( throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Unknown handler type '{}' in config here: {}.{}.handler.type", handler_type, prefix, key); } - else + else if (key != "common_http_response_headers") throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, "Unknown element in config: " "{}.{}, must be 'rule' or 'defaults'", prefix, key); } diff --git a/src/Server/HTTPHandlerFactory.h b/src/Server/HTTPHandlerFactory.h index db4bb73cbc44..f2b7760f8b02 100644 --- a/src/Server/HTTPHandlerFactory.h +++ b/src/Server/HTTPHandlerFactory.h @@ -110,19 +110,23 @@ class HandlingRuleHTTPHandlerFactory : public HTTPRequestHandlerFactory HTTPRequestHandlerFactoryPtr createStaticHandlerFactory(IServer & server, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix); + const std::string & config_prefix, + std::unordered_map & common_headers); HTTPRequestHandlerFactoryPtr createDynamicHandlerFactory(IServer & server, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix); + const std::string & config_prefix, + std::unordered_map & common_headers); HTTPRequestHandlerFactoryPtr createPredefinedHandlerFactory(IServer & server, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix); + const std::string & config_prefix, + std::unordered_map & common_headers); HTTPRequestHandlerFactoryPtr createReplicasStatusHandlerFactory(IServer & server, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix); + const std::string & config_prefix, + std::unordered_map & common_headers); /// @param server - used in handlers to check IServer::isCancelled() /// @param config - not the same as server.config(), since it can be newer diff --git a/src/Server/HTTPResponseHeaderWriter.cpp b/src/Server/HTTPResponseHeaderWriter.cpp index fd29af5bdc77..912b696faffb 100644 --- a/src/Server/HTTPResponseHeaderWriter.cpp +++ b/src/Server/HTTPResponseHeaderWriter.cpp @@ -66,4 +66,25 @@ void applyHTTPResponseHeaders(Poco::Net::HTTPResponse & response, const std::uno response.set(header_name, header_value); } +std::unordered_map parseHTTPResponseHeadersWithCommons( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const std::string & default_content_type, + const std::unordered_map & common_headers) +{ + auto headers = parseHTTPResponseHeaders(config, config_prefix, default_content_type); + headers.insert(common_headers.begin(), common_headers.end()); + return headers; +} + +std::unordered_map parseHTTPResponseHeadersWithCommons( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const 
std::unordered_map & common_headers) +{ + auto headers = parseHTTPResponseHeaders(config, config_prefix).value_or(std::unordered_map{}); + headers.insert(common_headers.begin(), common_headers.end()); + return headers; +} + } diff --git a/src/Server/HTTPResponseHeaderWriter.h b/src/Server/HTTPResponseHeaderWriter.h index 06281abb42dd..7e240b9eddec 100644 --- a/src/Server/HTTPResponseHeaderWriter.h +++ b/src/Server/HTTPResponseHeaderWriter.h @@ -22,4 +22,16 @@ std::unordered_map parseHTTPResponseHeaders(const std::string & void applyHTTPResponseHeaders(Poco::Net::HTTPResponse & response, const HTTPResponseHeaderSetup & setup); void applyHTTPResponseHeaders(Poco::Net::HTTPResponse & response, const std::unordered_map & setup); + +std::unordered_map parseHTTPResponseHeadersWithCommons( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const std::unordered_map & common_headers); + +std::unordered_map parseHTTPResponseHeadersWithCommons( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const std::string & default_content_type, + const std::unordered_map & common_headers); + } diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index ae1fb6d629e0..e9bfb9fe5789 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -303,13 +303,15 @@ PrometheusRequestHandler::PrometheusRequestHandler( IServer & server_, const PrometheusRequestHandlerConfig & config_, const AsynchronousMetrics & async_metrics_, - std::shared_ptr metrics_writer_) + std::shared_ptr metrics_writer_, + std::unordered_map response_headers_) : server(server_) , config(config_) , async_metrics(async_metrics_) , metrics_writer(metrics_writer_) , log(getLogger("PrometheusRequestHandler")) { + response_headers = response_headers_; createImpl(); } @@ -341,6 +343,7 @@ void PrometheusRequestHandler::createImpl() void PrometheusRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event_) { setThreadName("PrometheusHndlr"); + applyHTTPResponseHeaders(response, response_headers); try { diff --git a/src/Server/PrometheusRequestHandler.h b/src/Server/PrometheusRequestHandler.h index 281ecf5260ed..550404f7cf64 100644 --- a/src/Server/PrometheusRequestHandler.h +++ b/src/Server/PrometheusRequestHandler.h @@ -19,7 +19,8 @@ class PrometheusRequestHandler : public HTTPRequestHandler IServer & server_, const PrometheusRequestHandlerConfig & config_, const AsynchronousMetrics & async_metrics_, - std::shared_ptr metrics_writer_); + std::shared_ptr metrics_writer_, + std::unordered_map response_headers_ = {}); ~PrometheusRequestHandler() override; void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event_) override; @@ -59,6 +60,7 @@ class PrometheusRequestHandler : public HTTPRequestHandler std::unique_ptr write_buffer_from_response; bool response_finalized = false; ProfileEvents::Event write_event; + std::unordered_map response_headers; }; } diff --git a/src/Server/PrometheusRequestHandlerFactory.cpp b/src/Server/PrometheusRequestHandlerFactory.cpp index 52f1d3b64c14..72811d88e67d 100644 --- a/src/Server/PrometheusRequestHandlerFactory.cpp +++ b/src/Server/PrometheusRequestHandlerFactory.cpp @@ -131,14 +131,15 @@ namespace IServer & server, const AsynchronousMetrics & async_metrics, const PrometheusRequestHandlerConfig & config, - bool for_keeper) + bool 
for_keeper, + std::unordered_map headers = {}) { if (!canBeHandled(config, for_keeper)) return nullptr; auto metric_writer = createPrometheusMetricWriter(for_keeper); - auto creator = [&server, &async_metrics, config, metric_writer]() -> std::unique_ptr + auto creator = [&server, &async_metrics, config, metric_writer, headers_moved = std::move(headers)]() -> std::unique_ptr { - return std::make_unique(server, config, async_metrics, metric_writer); + return std::make_unique(server, config, async_metrics, metric_writer, headers_moved); }; return std::make_shared>(std::move(creator)); } @@ -200,10 +201,13 @@ HTTPRequestHandlerFactoryPtr createPrometheusHandlerFactoryForHTTPRule( IServer & server, const Poco::Util::AbstractConfiguration & config, const String & config_prefix, - const AsynchronousMetrics & asynchronous_metrics) + const AsynchronousMetrics & asynchronous_metrics, + std::unordered_map & common_headers) { + auto headers = parseHTTPResponseHeadersWithCommons(config, config_prefix, common_headers); + auto parsed_config = parseExposeMetricsConfig(config, config_prefix + ".handler"); - auto handler = createPrometheusHandlerFactoryFromConfig(server, asynchronous_metrics, parsed_config, /* for_keeper= */ false); + auto handler = createPrometheusHandlerFactoryFromConfig(server, asynchronous_metrics, parsed_config, /* for_keeper= */ false, headers); chassert(handler); /// `handler` can't be nullptr here because `for_keeper` is false. handler->addFiltersFromConfig(config, config_prefix); return handler; diff --git a/src/Server/PrometheusRequestHandlerFactory.h b/src/Server/PrometheusRequestHandlerFactory.h index c52395ca93f3..23d00b50095b 100644 --- a/src/Server/PrometheusRequestHandlerFactory.h +++ b/src/Server/PrometheusRequestHandlerFactory.h @@ -93,7 +93,8 @@ HTTPRequestHandlerFactoryPtr createPrometheusHandlerFactoryForHTTPRule( IServer & server, const Poco::Util::AbstractConfiguration & config, const String & config_prefix, /// path to "http_handlers.my_handler_1" - const AsynchronousMetrics & asynchronous_metrics); + const AsynchronousMetrics & asynchronous_metrics, + std::unordered_map & common_headers); /// Makes a HTTP Handler factory to handle requests for prometheus metrics as a part of the default HTTP rule in the section. 
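With every handler factory above now receiving the parsed `common_http_response_headers`, one quick way to sanity-check a running server is to request a few endpoints and inspect the response headers. This is only a sketch under assumptions: the host, port, and header names mirror the documentation example and will differ with other configurations.

```python
# Hypothetical manual check; host, port and header names are assumptions.
import requests

base = "http://localhost:8123"  # assumed default ClickHouse HTTP port

for endpoint in ("/ping", "/replicas_status", "/play", "/metrics"):
    response = requests.get(base + endpoint)
    # Headers from common_http_response_headers should appear on every handler;
    # per-handler http_response_headers only on the handler that declares them.
    print(
        endpoint,
        response.status_code,
        response.headers.get("X-My-Common-Header"),
        response.headers.get("X-My-Custom-Header"),
    )
```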
/// Expects a configuration like this: diff --git a/src/Server/ReplicasStatusHandler.cpp b/src/Server/ReplicasStatusHandler.cpp index 419ad635d0d5..042c9657f6d2 100644 --- a/src/Server/ReplicasStatusHandler.cpp +++ b/src/Server/ReplicasStatusHandler.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,8 @@ void ReplicasStatusHandler::handleRequest(HTTPServerRequest & request, HTTPServe { try { + applyHTTPResponseHeaders(response, http_response_headers_override); + HTMLForm params(getContext()->getSettingsRef(), request); const auto & config = getContext()->getConfigRef(); @@ -129,9 +132,16 @@ void ReplicasStatusHandler::handleRequest(HTTPServerRequest & request, HTTPServe HTTPRequestHandlerFactoryPtr createReplicasStatusHandlerFactory(IServer & server, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix) + const std::string & config_prefix, + std::unordered_map & common_headers) { - auto factory = std::make_shared>(server); + std::unordered_map http_response_headers_override + = parseHTTPResponseHeadersWithCommons(config, config_prefix, "text/plain; charset=UTF-8", common_headers); + + auto creator = [&server, http_response_headers_override]() -> std::unique_ptr + { return std::make_unique(server, http_response_headers_override); }; + + auto factory = std::make_shared>(std::move(creator)); factory->addFiltersFromConfig(config, config_prefix); return factory; } diff --git a/src/Server/ReplicasStatusHandler.h b/src/Server/ReplicasStatusHandler.h index 08fd757b0d61..2d3aaad184b5 100644 --- a/src/Server/ReplicasStatusHandler.h +++ b/src/Server/ReplicasStatusHandler.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB { @@ -13,8 +14,17 @@ class ReplicasStatusHandler : public HTTPRequestHandler, WithContext { public: explicit ReplicasStatusHandler(IServer & server_); + explicit ReplicasStatusHandler(IServer & server_, const std::unordered_map & http_response_headers_override_) + : ReplicasStatusHandler(server_) + { + http_response_headers_override = http_response_headers_override_; + } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; + +private: + /// Overrides for response headers. 
+ std::unordered_map http_response_headers_override; }; diff --git a/src/Server/StaticRequestHandler.cpp b/src/Server/StaticRequestHandler.cpp index d8c0765bca48..34774d6689d8 100644 --- a/src/Server/StaticRequestHandler.cpp +++ b/src/Server/StaticRequestHandler.cpp @@ -163,13 +163,14 @@ StaticRequestHandler::StaticRequestHandler( HTTPRequestHandlerFactoryPtr createStaticHandlerFactory(IServer & server, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix) + const std::string & config_prefix, + std::unordered_map & common_headers) { int status = config.getInt(config_prefix + ".handler.status", 200); std::string response_content = config.getRawString(config_prefix + ".handler.response_content", "Ok.\n"); std::unordered_map http_response_headers_override - = parseHTTPResponseHeaders(config, config_prefix, "text/plain; charset=UTF-8"); + = parseHTTPResponseHeadersWithCommons(config, config_prefix, "text/plain; charset=UTF-8", common_headers); auto creator = [&server, http_response_headers_override, response_content, status]() -> std::unique_ptr { return std::make_unique(server, response_content, http_response_headers_override, status); }; diff --git a/src/Server/WebUIRequestHandler.cpp b/src/Server/WebUIRequestHandler.cpp index c04d7a3f2a03..b382da29c038 100644 --- a/src/Server/WebUIRequestHandler.cpp +++ b/src/Server/WebUIRequestHandler.cpp @@ -1,6 +1,7 @@ #include "WebUIRequestHandler.h" #include "IServer.h" #include +#include #include #include @@ -30,8 +31,10 @@ DashboardWebUIRequestHandler::DashboardWebUIRequestHandler(IServer & server_) : BinaryWebUIRequestHandler::BinaryWebUIRequestHandler(IServer & server_) : server(server_) {} JavaScriptWebUIRequestHandler::JavaScriptWebUIRequestHandler(IServer & server_) : server(server_) {} -static void handle(HTTPServerRequest & request, HTTPServerResponse & response, std::string_view html) +static void handle(HTTPServerRequest & request, HTTPServerResponse & response, std::string_view html, + std::unordered_map http_response_headers_override = {}) { + applyHTTPResponseHeaders(response, http_response_headers_override); response.setContentType("text/html; charset=UTF-8"); if (request.getVersion() == HTTPServerRequest::HTTP_1_1) response.setChunkedTransferEncoding(true); @@ -43,7 +46,7 @@ static void handle(HTTPServerRequest & request, HTTPServerResponse & response, s void PlayWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) { - handle(request, response, {reinterpret_cast(gresource_play_htmlData), gresource_play_htmlSize}); + handle(request, response, {reinterpret_cast(gresource_play_htmlData), gresource_play_htmlSize}, http_response_headers_override); } void DashboardWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) @@ -61,23 +64,28 @@ void DashboardWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HT static re2::RE2 lz_string_url = R"(https://[^\s"'`]+lz-string[^\s"'`]*\.js)"; RE2::Replace(&html, lz_string_url, "/js/lz-string.js"); - handle(request, response, html); + handle(request, response, html, http_response_headers_override); } void BinaryWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) { - handle(request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}); + handle(request, response, {reinterpret_cast(gresource_binary_htmlData), 
gresource_binary_htmlSize}, http_response_headers_override); +} + +void MergesWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) +{ + handle(request, response, {reinterpret_cast(gresource_merges_htmlData), gresource_merges_htmlSize}, http_response_headers_override); } void JavaScriptWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) { if (request.getURI() == "/js/uplot.js") { - handle(request, response, {reinterpret_cast(gresource_uplot_jsData), gresource_uplot_jsSize}); + handle(request, response, {reinterpret_cast(gresource_uplot_jsData), gresource_uplot_jsSize}, http_response_headers_override); } else if (request.getURI() == "/js/lz-string.js") { - handle(request, response, {reinterpret_cast(gresource_lz_string_jsData), gresource_lz_string_jsSize}); + handle(request, response, {reinterpret_cast(gresource_lz_string_jsData), gresource_lz_string_jsSize}, http_response_headers_override); } else { @@ -85,7 +93,7 @@ void JavaScriptWebUIRequestHandler::handleRequest(HTTPServerRequest & request, H *response.send() << "Not found.\n"; } - handle(request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}); + handle(request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}, http_response_headers_override); } } diff --git a/src/Server/WebUIRequestHandler.h b/src/Server/WebUIRequestHandler.h index b84c8f6534d7..91b74bae5863 100644 --- a/src/Server/WebUIRequestHandler.h +++ b/src/Server/WebUIRequestHandler.h @@ -16,7 +16,15 @@ class PlayWebUIRequestHandler : public HTTPRequestHandler IServer & server; public: explicit PlayWebUIRequestHandler(IServer & server_); + explicit PlayWebUIRequestHandler(IServer & server_, const std::unordered_map & http_response_headers_override_) + : PlayWebUIRequestHandler(server_) + { + http_response_headers_override = http_response_headers_override_; + } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; +private: + /// Overrides for response headers. + std::unordered_map http_response_headers_override; }; class DashboardWebUIRequestHandler : public HTTPRequestHandler @@ -25,7 +33,15 @@ class DashboardWebUIRequestHandler : public HTTPRequestHandler IServer & server; public: explicit DashboardWebUIRequestHandler(IServer & server_); + explicit DashboardWebUIRequestHandler(IServer & server_, const std::unordered_map & http_response_headers_override_) + : DashboardWebUIRequestHandler(server_) + { + http_response_headers_override = http_response_headers_override_; + } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; +private: + /// Overrides for response headers. 
+ std::unordered_map http_response_headers_override; }; class BinaryWebUIRequestHandler : public HTTPRequestHandler @@ -34,7 +50,30 @@ class BinaryWebUIRequestHandler : public HTTPRequestHandler IServer & server; public: explicit BinaryWebUIRequestHandler(IServer & server_); + explicit BinaryWebUIRequestHandler(IServer & server_, const std::unordered_map & http_response_headers_override_) + : BinaryWebUIRequestHandler(server_) + { + http_response_headers_override = http_response_headers_override_; + } + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; +private: + /// Overrides for response headers. + std::unordered_map http_response_headers_override; +}; + +class MergesWebUIRequestHandler : public HTTPRequestHandler +{ +public: + explicit MergesWebUIRequestHandler(IServer &) {} + explicit MergesWebUIRequestHandler(IServer & server_, const std::unordered_map & http_response_headers_override_) + : MergesWebUIRequestHandler(server_) + { + http_response_headers_override = http_response_headers_override_; + } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; +private: + /// Overrides for response headers. + std::unordered_map http_response_headers_override; }; class JavaScriptWebUIRequestHandler : public HTTPRequestHandler @@ -43,7 +82,15 @@ class JavaScriptWebUIRequestHandler : public HTTPRequestHandler IServer & server; public: explicit JavaScriptWebUIRequestHandler(IServer & server_); + explicit JavaScriptWebUIRequestHandler(IServer & server_, const std::unordered_map & http_response_headers_override_) + : JavaScriptWebUIRequestHandler(server_) + { + http_response_headers_override = http_response_headers_override_; + } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; +private: + /// Overrides for response headers. 
+ std::unordered_map http_response_headers_override; }; } diff --git a/tests/integration/test_http_handlers_config/test.py b/tests/integration/test_http_handlers_config/test.py index b2efbf4bb657..9d2230b37cd6 100644 --- a/tests/integration/test_http_handlers_config/test.py +++ b/tests/integration/test_http_handlers_config/test.py @@ -601,3 +601,31 @@ def test_replicas_status_handler(): "test_replicas_status", method="GET", headers={"XXX": "xxx"} ).content ) + + +def test_headers_in_response(): + with contextlib.closing( + SimpleCluster( + ClickHouseCluster(__file__), "headers_in_response", "test_headers_in_response" + ) + ) as cluster: + for endpoint in ("static", "ping", "replicas_status", "play", "dashboard", "binary", "merges", "metrics", + "js/lz-string.js", "js/uplot.js", "?query=SELECT%201"): + response = cluster.instance.http_request(endpoint, method="GET") + + assert "X-My-Answer" in response.headers + assert "X-My-Common-Header" in response.headers + + assert response.headers["X-My-Common-Header"] == "Common header present" + + if endpoint == "?query=SELECT%201": + assert response.headers["X-My-Answer"] == "Iam dynamic" + else: + assert response.headers["X-My-Answer"] == f"Iam {endpoint}" + + + # Handle predefined_query_handler separately because we need to pass headers there + response_predefined = cluster.instance.http_request( + "query_param_with_url", method="GET", headers={"PARAMS_XXX": "test_param"}) + assert response_predefined.headers["X-My-Answer"] == f"Iam predefined" + assert response_predefined.headers["X-My-Common-Header"] == "Common header present" diff --git a/tests/integration/test_http_handlers_config/test_headers_in_response/config.xml b/tests/integration/test_http_handlers_config/test_headers_in_response/config.xml new file mode 100644 index 000000000000..6fc72a1af5f5 --- /dev/null +++ b/tests/integration/test_http_handlers_config/test_headers_in_response/config.xml @@ -0,0 +1,146 @@ + + + + Common header present + + + + GET,HEAD + /static + + static + config://http_server_default_response + text/html; charset=UTF-8 + + Iam static + + + + + + GET,HEAD + /ping + + ping + + Iam ping + + + + + + GET,HEAD + /replicas_status + + replicas_status + + Iam replicas_status + + + + + + GET,HEAD + /play + + play + + Iam play + + + + + + GET,HEAD + /dashboard + + dashboard + + Iam dashboard + + + + + + GET,HEAD + /binary + + binary + + Iam binary + + + + + + GET,HEAD + /merges + + merges + + Iam merges + + + + + + GET,HEAD + /metrics + + prometheus + + Iam metrics + + + + + + /js/lz-string.js + + js + + Iam js/lz-string.js + + + + + + GET,HEAD + /js/uplot.js + + js + + Iam js/uplot.js + + + + + + /query_param_with_url + GET,HEAD + + [^/]+)]]> + + + predefined_query_handler + + SELECT {name_1:String} + + + Iam predefined + + + + + + GET,POST,HEAD,OPTIONS + + dynamic_query_handler + query + + Iam dynamic + + + + + \ No newline at end of file From 82e3216f65e83caea3d2ecf18f304d0255dd8b70 Mon Sep 17 00:00:00 2001 From: Tuan Pham Anh Date: Fri, 23 May 2025 23:59:29 +0000 Subject: [PATCH 41/67] Merge pull request #80710 from zvonand/zvonand-ttl32 Add support for Date32, DateTime64 in TTL --- .../mergetree-family/mergetree.md | 2 +- src/Processors/TTL/ITTLAlgorithm.cpp | 22 ++++++++--- src/Processors/TTL/ITTLAlgorithm.h | 2 +- .../TTL/TTLAggregationAlgorithm.cpp | 2 +- src/Processors/TTL/TTLColumnAlgorithm.cpp | 2 +- src/Processors/TTL/TTLDeleteAlgorithm.cpp | 2 +- src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp | 2 +- .../MergeTree/MergeTreeDataWriter.cpp | 23 +++++++++++ 
src/Storages/TTLDescription.cpp | 8 +++- .../03519_ttl_extended_data_types.reference | 8 ++++ .../03519_ttl_extended_data_types.sql | 39 +++++++++++++++++++ 11 files changed, 98 insertions(+), 14 deletions(-) create mode 100644 tests/queries/0_stateless/03519_ttl_extended_data_types.reference create mode 100644 tests/queries/0_stateless/03519_ttl_extended_data_types.sql diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 183b94f46418..7e898d20f22c 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -519,7 +519,7 @@ Determines the lifetime of values. The `TTL` clause can be set for the whole table and for each individual column. Table-level `TTL` can also specify the logic of automatic moving data between disks and volumes, or recompressing parts where all the data has been expired. -Expressions must evaluate to [Date](/docs/en/sql-reference/data-types/date.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md) data type. +Expressions must evaluate to [Date](/docs/en/sql-reference/data-types/date.md), [Date32](/docs/en/sql-reference/data-types/date32.md), [DateTime](/docs/en/sql-reference/data-types/datetime.md) or [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) data type. **Syntax** diff --git a/src/Processors/TTL/ITTLAlgorithm.cpp b/src/Processors/TTL/ITTLAlgorithm.cpp index 761f43e2422e..9dd69a656a30 100644 --- a/src/Processors/TTL/ITTLAlgorithm.cpp +++ b/src/Processors/TTL/ITTLAlgorithm.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace DB { @@ -46,18 +48,26 @@ ColumnPtr ITTLAlgorithm::executeExpressionAndGetColumn( return block_copy.getByName(result_column).column; } -UInt32 ITTLAlgorithm::getTimestampByIndex(const IColumn * column, size_t index) const +Int64 ITTLAlgorithm::getTimestampByIndex(const IColumn * column, size_t index) const { if (const ColumnUInt16 * column_date = typeid_cast(column)) - return static_cast(date_lut.fromDayNum(DayNum(column_date->getData()[index]))); - else if (const ColumnUInt32 * column_date_time = typeid_cast(column)) + return date_lut.fromDayNum(DayNum(column_date->getData()[index])); + if (const ColumnUInt32 * column_date_time = typeid_cast(column)) return column_date_time->getData()[index]; - else if (const ColumnConst * column_const = typeid_cast(column)) + if (const ColumnInt32 * column_date_32 = typeid_cast(column)) + return date_lut.fromDayNum(ExtendedDayNum(column_date_32->getData()[index])); + if (const ColumnDateTime64 * column_date_time_64 = typeid_cast(column)) + return column_date_time_64->getData()[index] / intExp10OfSize(column_date_time_64->getScale()); + if (const ColumnConst * column_const = typeid_cast(column)) { if (typeid_cast(&column_const->getDataColumn())) - return static_cast(date_lut.fromDayNum(DayNum(column_const->getValue()))); - else if (typeid_cast(&column_const->getDataColumn())) + return date_lut.fromDayNum(DayNum(column_const->getValue())); + if (typeid_cast(&column_const->getDataColumn())) return column_const->getValue(); + if (typeid_cast(&column_const->getDataColumn())) + return date_lut.fromDayNum(ExtendedDayNum(column_const->getValue())); + if (const ColumnDateTime64 * column_dt64 = typeid_cast(&column_const->getDataColumn())) + return column_const->getValue() / intExp10OfSize(column_dt64->getScale()); } throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected type of result TTL column"); diff --git 
a/src/Processors/TTL/ITTLAlgorithm.h b/src/Processors/TTL/ITTLAlgorithm.h index d79aa8a8dfcf..542f93d6ffe6 100644 --- a/src/Processors/TTL/ITTLAlgorithm.h +++ b/src/Processors/TTL/ITTLAlgorithm.h @@ -43,7 +43,7 @@ class ITTLAlgorithm protected: bool isTTLExpired(time_t ttl) const; - UInt32 getTimestampByIndex(const IColumn * column, size_t index) const; + Int64 getTimestampByIndex(const IColumn * column, size_t index) const; const TTLExpressions ttl_expressions; const TTLDescription description; diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.cpp b/src/Processors/TTL/TTLAggregationAlgorithm.cpp index 2d7a37d0abea..1a4c52774033 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.cpp +++ b/src/Processors/TTL/TTLAggregationAlgorithm.cpp @@ -86,7 +86,7 @@ void TTLAggregationAlgorithm::execute(Block & block) for (size_t i = 0; i < block.rows(); ++i) { - UInt32 cur_ttl = getTimestampByIndex(ttl_column.get(), i); + Int64 cur_ttl = getTimestampByIndex(ttl_column.get(), i); bool where_filter_passed = !where_column || where_column->getBool(i); bool ttl_expired = isTTLExpired(cur_ttl) && where_filter_passed; diff --git a/src/Processors/TTL/TTLColumnAlgorithm.cpp b/src/Processors/TTL/TTLColumnAlgorithm.cpp index e27050564cee..4b8f6b6f9ba2 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.cpp +++ b/src/Processors/TTL/TTLColumnAlgorithm.cpp @@ -59,7 +59,7 @@ void TTLColumnAlgorithm::execute(Block & block) for (size_t i = 0; i < block.rows(); ++i) { - UInt32 cur_ttl = getTimestampByIndex(ttl_column.get(), i); + Int64 cur_ttl = getTimestampByIndex(ttl_column.get(), i); if (isTTLExpired(cur_ttl)) { if (default_column) diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.cpp b/src/Processors/TTL/TTLDeleteAlgorithm.cpp index 6f9bc315276f..fde1d7c51d54 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.cpp +++ b/src/Processors/TTL/TTLDeleteAlgorithm.cpp @@ -34,7 +34,7 @@ void TTLDeleteAlgorithm::execute(Block & block) for (size_t i = 0; i < block.rows(); ++i) { - UInt32 cur_ttl = getTimestampByIndex(ttl_column.get(), i); + Int64 cur_ttl = getTimestampByIndex(ttl_column.get(), i); bool where_filter_passed = !where_column || where_column->getBool(i); if (!isTTLExpired(cur_ttl) || !where_filter_passed) diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp index 13d3030bbb87..e27698e7cb44 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp @@ -25,7 +25,7 @@ void TTLUpdateInfoAlgorithm::execute(Block & block) auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); for (size_t i = 0; i < block.rows(); ++i) { - UInt32 cur_ttl = ITTLAlgorithm::getTimestampByIndex(ttl_column.get(), i); + Int64 cur_ttl = ITTLAlgorithm::getTimestampByIndex(ttl_column.get(), i); new_ttl_info.update(cur_ttl); } } diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index f29d715e791d..aee7bc404d61 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -1,4 +1,7 @@ #include +#include +#include +#include #include #include #include @@ -159,6 +162,17 @@ void updateTTL( for (const auto & val : column_date_time->getData()) ttl_info.update(val); } + else if (const ColumnInt32 * column_date_32 = typeid_cast(ttl_column.get())) + { + const auto & date_lut = DateLUT::serverTimezoneInstance(); + for (const auto & val : column_date_32->getData()) + 
ttl_info.update(date_lut.fromDayNum(ExtendedDayNum(val))); + } + else if (const ColumnDateTime64 * column_date_time_64 = typeid_cast(ttl_column.get())) + { + for (const auto & val : column_date_time_64->getData()) + ttl_info.update(val / intExp10OfSize(column_date_time_64->getScale())); + } else if (const ColumnConst * column_const = typeid_cast(ttl_column.get())) { if (typeid_cast(&column_const->getDataColumn())) @@ -170,6 +184,15 @@ void updateTTL( { ttl_info.update(column_const->getValue()); } + else if (typeid_cast(&column_const->getDataColumn())) + { + const auto & date_lut = DateLUT::serverTimezoneInstance(); + ttl_info.update(date_lut.fromDayNum(ExtendedDayNum(column_const->getValue()))); + } + else if (const ColumnDateTime64 * column_dt64 = typeid_cast(&column_const->getDataColumn())) + { + ttl_info.update(column_const->getValue() / intExp10OfSize(column_dt64->getScale())); + } else throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected type of result TTL column"); } diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index d674f0546325..e0f16e0a945d 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -15,7 +15,9 @@ #include #include +#include #include +#include #include #include #include @@ -81,10 +83,12 @@ void checkTTLExpression(const ExpressionActionsPtr & ttl_expression, const Strin const auto & result_column = ttl_expression->getSampleBlock().getByName(result_column_name); if (!typeid_cast(result_column.type.get()) - && !typeid_cast(result_column.type.get())) + && !typeid_cast(result_column.type.get()) + && !typeid_cast(result_column.type.get()) + && !typeid_cast(result_column.type.get())) { throw Exception(ErrorCodes::BAD_TTL_EXPRESSION, - "TTL expression result column should have DateTime or Date type, but has {}", + "TTL expression result column should have Date, Date32, DateTime or DateTime64 type, but has {}", result_column.type->getName()); } } diff --git a/tests/queries/0_stateless/03519_ttl_extended_data_types.reference b/tests/queries/0_stateless/03519_ttl_extended_data_types.reference new file mode 100644 index 000000000000..d098f930edc6 --- /dev/null +++ b/tests/queries/0_stateless/03519_ttl_extended_data_types.reference @@ -0,0 +1,8 @@ +"2170-01-01",2170 +"2170-01-01 12:12:12.12345",2170 +"1901-01-01","" +"2010-01-01","" +"2170-01-01","uio" +"1901-01-01 12:12:12.12345","" +"2010-01-01 12:12:12.00000","" +"2170-01-01 12:12:12.12345","uio" diff --git a/tests/queries/0_stateless/03519_ttl_extended_data_types.sql b/tests/queries/0_stateless/03519_ttl_extended_data_types.sql new file mode 100644 index 000000000000..e7c7dcebe48b --- /dev/null +++ b/tests/queries/0_stateless/03519_ttl_extended_data_types.sql @@ -0,0 +1,39 @@ +-- Row TTL with extended data types +DROP TABLE IF EXISTS ttl_03519_1 SYNC; +CREATE TABLE ttl_03519_1 (date Date32, date_key Int) ENGINE=MergeTree TTL date + INTERVAL 1 MONTH ORDER BY date; +INSERT INTO ttl_03519_1 VALUES ('2010-01-01', 2010); +INSERT INTO ttl_03519_1 VALUES ('1901-01-01', 1901); +INSERT INTO ttl_03519_1 VALUES ('2170-01-01', 2170); +OPTIMIZE TABLE ttl_03519_1 FINAL; +SELECT * FROM ttl_03519_1 ORDER BY date FORMAT CSV; +DROP TABLE ttl_03519_1 SYNC; + +DROP TABLE IF EXISTS ttl_03519_2 SYNC; +CREATE TABLE ttl_03519_2 (date DateTime64(5, 'UTC'), date_key Int) ENGINE=MergeTree TTL date + INTERVAL 1 MONTH ORDER BY date; +INSERT INTO ttl_03519_2 VALUES ('2010-01-01 12:12:12.12345', 2010); +INSERT INTO ttl_03519_2 VALUES ('1901-01-01 12:12:12.12345', 1901); +INSERT INTO 
ttl_03519_2 VALUES ('2170-01-01 12:12:12.12345', 2170); +OPTIMIZE TABLE ttl_03519_2 FINAL; +SELECT * FROM ttl_03519_2 ORDER BY date FORMAT CSV; +DROP TABLE ttl_03519_2 SYNC; + +-- Column TTL with extended data types + +DROP TABLE IF EXISTS ttl_03519_3 SYNC; +CREATE TABLE ttl_03519_3 (date Date32, str String TTL date + INTERVAL 1 MONTH) ENGINE=MergeTree ORDER BY date; +INSERT INTO ttl_03519_3 VALUES ('2010-01-01', 'qwe'); +INSERT INTO ttl_03519_3 VALUES ('1901-01-01', 'rty'); +INSERT INTO ttl_03519_3 VALUES ('2170-01-01', 'uio'); +OPTIMIZE TABLE ttl_03519_3 FINAL; +SELECT * FROM ttl_03519_3 ORDER BY date FORMAT CSV; +DROP TABLE ttl_03519_3 SYNC; + +DROP TABLE IF EXISTS ttl_03519_4 SYNC; +CREATE TABLE ttl_03519_4 (date DateTime64(5, 'UTC'), str String TTL date + INTERVAL 1 MONTH) ENGINE=MergeTree ORDER BY date; +INSERT INTO ttl_03519_4 VALUES ('2010-01-01 12:12:12', 'qwe'); +INSERT INTO ttl_03519_4 VALUES ('1901-01-01 12:12:12.12345', 'rty'); +INSERT INTO ttl_03519_4 VALUES ('2170-01-01 12:12:12.12345', 'uio'); +OPTIMIZE TABLE ttl_03519_4 FINAL; +SELECT * FROM ttl_03519_4 ORDER BY date FORMAT CSV; +DROP TABLE ttl_03519_4 SYNC; + From 6c3166412be9b607e7177c685d08d0ab313a7de8 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Tue, 3 Jun 2025 14:45:30 +0300 Subject: [PATCH 42/67] fix build --- src/Server/PrometheusRequestHandlerFactory.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Server/PrometheusRequestHandlerFactory.cpp b/src/Server/PrometheusRequestHandlerFactory.cpp index 72811d88e67d..65061a5cc018 100644 --- a/src/Server/PrometheusRequestHandlerFactory.cpp +++ b/src/Server/PrometheusRequestHandlerFactory.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include From 698ef79bcf991b1f113c75532f7298728965fb48 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Tue, 3 Jun 2025 15:40:17 +0300 Subject: [PATCH 43/67] remove pieces of code from future --- src/Server/HTTPHandlerFactory.cpp | 6 ------ src/Server/WebUIRequestHandler.cpp | 5 ----- src/Server/WebUIRequestHandler.h | 15 --------------- 3 files changed, 26 deletions(-) diff --git a/src/Server/HTTPHandlerFactory.cpp b/src/Server/HTTPHandlerFactory.cpp index 2aa2a56696ba..ff3c5e2d4d60 100644 --- a/src/Server/HTTPHandlerFactory.cpp +++ b/src/Server/HTTPHandlerFactory.cpp @@ -203,12 +203,6 @@ static inline auto createHandlersFactoryFromConfig( handler->addFiltersFromConfig(config, prefix + "." + key); main_handler_factory->addHandler(std::move(handler)); } - else if (handler_type == "merges") - { - auto handler = createWebUIHandlerFactory(server, config, prefix + "." + key, common_headers_override); - handler->addFiltersFromConfig(config, prefix + "." 
+ key); - main_handler_factory->addHandler(std::move(handler)); - } else if (handler_type == "js") { // NOTE: JavaScriptWebUIRequestHandler only makes sense for paths other then /js/uplot.js, /js/lz-string.js diff --git a/src/Server/WebUIRequestHandler.cpp b/src/Server/WebUIRequestHandler.cpp index b382da29c038..7f1a06b27858 100644 --- a/src/Server/WebUIRequestHandler.cpp +++ b/src/Server/WebUIRequestHandler.cpp @@ -72,11 +72,6 @@ void BinaryWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPS handle(request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}, http_response_headers_override); } -void MergesWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) -{ - handle(request, response, {reinterpret_cast(gresource_merges_htmlData), gresource_merges_htmlSize}, http_response_headers_override); -} - void JavaScriptWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) { if (request.getURI() == "/js/uplot.js") diff --git a/src/Server/WebUIRequestHandler.h b/src/Server/WebUIRequestHandler.h index 91b74bae5863..8fa2f10daae0 100644 --- a/src/Server/WebUIRequestHandler.h +++ b/src/Server/WebUIRequestHandler.h @@ -61,21 +61,6 @@ class BinaryWebUIRequestHandler : public HTTPRequestHandler std::unordered_map http_response_headers_override; }; -class MergesWebUIRequestHandler : public HTTPRequestHandler -{ -public: - explicit MergesWebUIRequestHandler(IServer &) {} - explicit MergesWebUIRequestHandler(IServer & server_, const std::unordered_map & http_response_headers_override_) - : MergesWebUIRequestHandler(server_) - { - http_response_headers_override = http_response_headers_override_; - } - void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; -private: - /// Overrides for response headers. 
- std::unordered_map http_response_headers_override; -}; - class JavaScriptWebUIRequestHandler : public HTTPRequestHandler { private: From 429e5c8013d71a1d0e86b4bf31117bcaa7f5ccba Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Tue, 3 Jun 2025 16:40:47 +0300 Subject: [PATCH 44/67] fix build --- src/Server/PrometheusRequestHandler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index e9bfb9fe5789..d99b2c825598 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include From 7d628da15ece099e9805f563f0017afe4e382aeb Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Fri, 30 May 2025 14:14:10 -0400 Subject: [PATCH 45/67] update report to new format --- .github/create_workflow_report.py | 90 ++++++++++++++++++-------- .github/workflows/release_branches.yml | 4 +- 2 files changed, 67 insertions(+), 27 deletions(-) diff --git a/.github/create_workflow_report.py b/.github/create_workflow_report.py index a7f30f72aedf..8c074abd76c9 100755 --- a/.github/create_workflow_report.py +++ b/.github/create_workflow_report.py @@ -253,11 +253,17 @@ def get_commit_statuses(sha: str) -> pd.DataFrame: for item in all_data ] - return ( - pd.DataFrame(parsed) - .sort_values(by=["job_status", "job_name"], ascending=[True, True]) - .reset_index(drop=True) - ) + # Create DataFrame + df = pd.DataFrame(parsed) + + # Drop duplicates keeping the first occurrence (newest status for each context) + # GitHub returns statuses in reverse chronological order + df = df.drop_duplicates(subset=["job_name"], keep="first") + + # Sort by status and job name + return df.sort_values( + by=["job_status", "job_name"], ascending=[True, True] + ).reset_index(drop=True) def get_pr_info_from_number(pr_number: str) -> dict: @@ -291,13 +297,23 @@ def get_checks_fails(client: Client, job_url: str): Get tests that did not succeed for the given job URL. Exclude checks that have status 'error' as they are counted in get_checks_errors. """ - columns = "check_status as job_status, check_name as job_name, test_status, test_name, report_url as results_link" - query = f"""SELECT {columns} FROM `gh-data`.checks - WHERE task_url LIKE '{job_url}%' - AND test_status IN ('FAIL', 'ERROR') - AND check_status!='error' - ORDER BY check_name, test_name - """ + query = f"""SELECT job_status, job_name, status as test_status, test_name, results_link + FROM ( + SELECT + argMax(check_status, check_start_time) as job_status, + check_name as job_name, + argMax(test_status, check_start_time) as status, + test_name, + report_url as results_link, + task_url + FROM `gh-data`.checks + GROUP BY check_name, test_name, report_url, task_url + ) + WHERE task_url LIKE '{job_url}%' + AND test_status IN ('FAIL', 'ERROR') + AND job_status!='error' + ORDER BY job_name, test_name + """ return client.query_dataframe(query) @@ -305,14 +321,26 @@ def get_checks_known_fails(client: Client, job_url: str, known_fails: dict): """ Get tests that are known to fail for the given job URL. 
""" - assert len(known_fails) > 0, "cannot query the database with empty known fails" - columns = "check_status as job_status, check_name as job_name, test_status, test_name, report_url as results_link" - query = f"""SELECT {columns} FROM `gh-data`.checks - WHERE task_url LIKE '{job_url}%' - AND test_status='BROKEN' - AND test_name IN ({','.join(f"'{test}'" for test in known_fails.keys())}) - ORDER BY test_name, check_name - """ + if len(known_fails) == 0: + return pd.DataFrame() + + query = f"""SELECT job_status, job_name, status as test_status, test_name, results_link + FROM ( + SELECT + argMax(check_status, check_start_time) as job_status, + check_name as job_name, + argMax(test_status, check_start_time) as status, + test_name, + report_url as results_link, + task_url + FROM `gh-data`.checks + GROUP BY check_name, test_name, report_url, task_url + ) + WHERE task_url LIKE '{job_url}%' + AND test_status='BROKEN' + AND test_name IN ({','.join(f"'{test}'" for test in known_fails.keys())}) + ORDER BY job_name, test_name + """ df = client.query_dataframe(query) @@ -333,12 +361,22 @@ def get_checks_errors(client: Client, job_url: str): """ Get checks that have status 'error' for the given job URL. """ - columns = "check_status as job_status, check_name as job_name, test_status, test_name, report_url as results_link" - query = f"""SELECT {columns} FROM `gh-data`.checks - WHERE task_url LIKE '{job_url}%' - AND check_status=='error' - ORDER BY check_name, test_name - """ + query = f"""SELECT job_status, job_name, status as test_status, test_name, results_link + FROM ( + SELECT + argMax(check_status, check_start_time) as job_status, + check_name as job_name, + argMax(test_status, check_start_time) as status, + test_name, + report_url as results_link, + task_url + FROM `gh-data`.checks + GROUP BY check_name, test_name, report_url, task_url + ) + WHERE task_url LIKE '{job_url}%' + AND job_status=='error' + ORDER BY job_name, test_name + """ return client.query_dataframe(query) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index c5ca0080ec4b..7fb9f79da516 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -649,12 +649,13 @@ jobs: ${{ toJson(needs) }} EOF python3 ./tests/ci/ci_buddy.py --check-wf-status - - name: Create and upload combined report + - name: Create and upload report if: ${{ !cancelled() }} env: CHECKS_DATABASE_HOST: ${{ secrets.CHECKS_DATABASE_HOST }} CHECKS_DATABASE_USER: ${{ secrets.CLICKHOUSE_TEST_STAT_LOGIN }} CHECKS_DATABASE_PASSWORD: ${{ secrets.CLICKHOUSE_TEST_STAT_PASSWORD }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMMIT_SHA: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} PR_NUMBER: ${{ github.event.pull_request.number || 0 }} ACTIONS_RUN_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }} @@ -663,6 +664,7 @@ jobs: pip install clickhouse-driver==0.2.8 numpy==1.26.4 pandas==2.0.3 REPORT_LINK=$(python3 .github/create_workflow_report.py --pr-number $PR_NUMBER --commit-sha $COMMIT_SHA --actions-run-url $ACTIONS_RUN_URL --known-fails tests/broken_tests.json --cves) + echo $REPORT_LINK IS_VALID_URL=$(echo $REPORT_LINK | grep -E '^https?://') if [[ -n $IS_VALID_URL ]]; then From 81c082fb9db07cba7675efab941bc0f11adebb1f Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Wed, 4 Jun 2025 10:37:05 +0300 Subject: [PATCH 46/67] remove 'merges' from test as it only appears in later versions --- 
tests/integration/test_http_handlers_config/test.py | 2 +- .../test_headers_in_response/config.xml | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/tests/integration/test_http_handlers_config/test.py b/tests/integration/test_http_handlers_config/test.py index 9d2230b37cd6..091690cf6b31 100644 --- a/tests/integration/test_http_handlers_config/test.py +++ b/tests/integration/test_http_handlers_config/test.py @@ -609,7 +609,7 @@ def test_headers_in_response(): ClickHouseCluster(__file__), "headers_in_response", "test_headers_in_response" ) ) as cluster: - for endpoint in ("static", "ping", "replicas_status", "play", "dashboard", "binary", "merges", "metrics", + for endpoint in ("static", "ping", "replicas_status", "play", "dashboard", "binary", "metrics", "js/lz-string.js", "js/uplot.js", "?query=SELECT%201"): response = cluster.instance.http_request(endpoint, method="GET") diff --git a/tests/integration/test_http_handlers_config/test_headers_in_response/config.xml b/tests/integration/test_http_handlers_config/test_headers_in_response/config.xml index 6fc72a1af5f5..4ee24ae123f8 100644 --- a/tests/integration/test_http_handlers_config/test_headers_in_response/config.xml +++ b/tests/integration/test_http_handlers_config/test_headers_in_response/config.xml @@ -72,17 +72,6 @@ - - GET,HEAD - /merges - - merges - - Iam merges - - - - GET,HEAD /metrics From 7e3ec24b0ba01b9d4c67c276af44e0914a74e187 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Wed, 4 Jun 2025 08:59:25 -0400 Subject: [PATCH 47/67] Update regression hash --- .github/workflows/release_branches.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index c5ca0080ec4b..229bfc622631 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -544,7 +544,7 @@ jobs: secrets: inherit with: runner_type: altinity-on-demand, altinity-type-cpx51, altinity-image-x86-app-docker-ce, altinity-setup-regression - commit: bd31e738c0cedaca253d15a05ed245c41b6e0b6a + commit: aad61f1c34cd8c709bba5bc45d59a774f36cfbae arch: release build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 @@ -555,7 +555,7 @@ jobs: secrets: inherit with: runner_type: altinity-on-demand, altinity-type-cax41, altinity-image-arm-app-docker-ce, altinity-setup-regression - commit: bd31e738c0cedaca253d15a05ed245c41b6e0b6a + commit: aad61f1c34cd8c709bba5bc45d59a774f36cfbae arch: aarch64 build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 From 888bb525f93bef0eab1d007cdf687cfd5b82467c Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Wed, 4 Jun 2025 16:11:31 -0400 Subject: [PATCH 48/67] Update regression hash --- .github/workflows/release_branches.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 229bfc622631..7e625505fd7a 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -544,7 +544,7 @@ jobs: secrets: inherit with: runner_type: altinity-on-demand, altinity-type-cpx51, altinity-image-x86-app-docker-ce, altinity-setup-regression - commit: aad61f1c34cd8c709bba5bc45d59a774f36cfbae + commit: f9e29772f4d261b82c23189d89038eb7ba027865 arch: 
release build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 @@ -555,7 +555,7 @@ jobs: secrets: inherit with: runner_type: altinity-on-demand, altinity-type-cax41, altinity-image-arm-app-docker-ce, altinity-setup-regression - commit: aad61f1c34cd8c709bba5bc45d59a774f36cfbae + commit: f9e29772f4d261b82c23189d89038eb7ba027865 arch: aarch64 build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 From 7a49bd3cfa10011d0430e1f9de4ce6ea37d19571 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 1 Jun 2025 06:41:27 +0000 Subject: [PATCH 49/67] Merge pull request #79369 from ilejn/ignore_error_distributed_ddl_queue Ignore parse error in system.distributed_ddl_queue conflict resolved --- .../System/StorageSystemDDLWorkerQueue.cpp | 30 +++- .../configs/remote_servers.xml | 1 + .../test_system_ddl_worker_queue/test.py | 128 +++++++++++++++--- 3 files changed, 133 insertions(+), 26 deletions(-) diff --git a/src/Storages/System/StorageSystemDDLWorkerQueue.cpp b/src/Storages/System/StorageSystemDDLWorkerQueue.cpp index ae1a233ea81e..23819d0f8cd2 100644 --- a/src/Storages/System/StorageSystemDDLWorkerQueue.cpp +++ b/src/Storages/System/StorageSystemDDLWorkerQueue.cpp @@ -21,6 +21,11 @@ namespace fs = std::filesystem; namespace DB { +namespace ErrorCodes +{ + extern const int SYNTAX_ERROR; +} + enum class Status : uint8_t { INACTIVE, @@ -54,7 +59,7 @@ ColumnsDescription StorageSystemDDLWorkerQueue::getColumnsDescription() {"entry_version", std::make_shared(std::make_shared()), "Version of the entry."}, {"initiator_host", std::make_shared(std::make_shared()), "Host that initiated the DDL operation."}, {"initiator_port", std::make_shared(std::make_shared()), "Port used by the initiator."}, - {"cluster", std::make_shared(), "Cluster name."}, + {"cluster", std::make_shared(), "Cluster name, empty if not determined."}, {"query", std::make_shared(), "Query executed."}, {"settings", std::make_shared(std::make_shared(), std::make_shared()), "Settings used in the DDL operation."}, {"query_create_time", std::make_shared(), "Query created time."}, @@ -77,10 +82,25 @@ static String clusterNameFromDDLQuery(ContextPtr context, const DDLTask & task) String description = fmt::format("from {}", task.entry_path); ParserQuery parser_query(end, settings.allow_settings_after_format_in_insert); - ASTPtr query = parseQuery(parser_query, begin, end, description, - settings.max_query_size, - settings.max_parser_depth, - settings.max_parser_backtracks); + ASTPtr query; + + try + { + query = parseQuery(parser_query, begin, end, description, + settings.max_query_size, + settings.max_parser_depth, + settings.max_parser_backtracks); + } + catch (const Exception & e) + { + LOG_INFO(getLogger("StorageSystemDDLWorkerQueue"), "Failed to determine cluster"); + if (e.code() == ErrorCodes::SYNTAX_ERROR) + { + /// ignore parse error and present available information + return ""; + } + throw; + } String cluster_name; if (const auto * query_on_cluster = dynamic_cast(query.get())) diff --git a/tests/integration/test_system_ddl_worker_queue/configs/remote_servers.xml b/tests/integration/test_system_ddl_worker_queue/configs/remote_servers.xml index 791af83a2d6d..f6392caf5e51 100644 --- a/tests/integration/test_system_ddl_worker_queue/configs/remote_servers.xml +++ b/tests/integration/test_system_ddl_worker_queue/configs/remote_servers.xml @@ -25,4 +25,5 @@ + 1 diff --git 
a/tests/integration/test_system_ddl_worker_queue/test.py b/tests/integration/test_system_ddl_worker_queue/test.py index 4659e5b92e84..1bebf709a821 100644 --- a/tests/integration/test_system_ddl_worker_queue/test.py +++ b/tests/integration/test_system_ddl_worker_queue/test.py @@ -1,4 +1,5 @@ import pytest +import time from helpers.cluster import ClickHouseCluster @@ -25,46 +26,131 @@ def started_cluster(): try: cluster.start() - for i, node in enumerate([node1, node2]): - node.query("CREATE DATABASE testdb") - node.query( - """CREATE TABLE testdb.test_table(id UInt32, val String) ENGINE = ReplicatedMergeTree('/clickhouse/test/test_table1', '{}') ORDER BY id;""".format( - i - ) - ) - for i, node in enumerate([node3, node4]): - node.query("CREATE DATABASE testdb") - node.query( - """CREATE TABLE testdb.test_table(id UInt32, val String) ENGINE = ReplicatedMergeTree('/clickhouse/test/test_table2', '{}') ORDER BY id;""".format( - i - ) - ) yield cluster finally: cluster.shutdown() +def maintain_test_table(test_table): + tmark = time.time() # to guarantee ZK path uniqueness + + for i, node in enumerate([node1, node2]): + node.query(f"DROP TABLE IF EXISTS testdb.{test_table} SYNC") + node.query("DROP DATABASE IF EXISTS testdb") + + node.query("CREATE DATABASE testdb") + node.query( + f"CREATE TABLE testdb.{test_table}(id UInt32, val String) ENGINE = ReplicatedMergeTree('/clickhouse/test/{test_table}1-{tmark}', '{i}') ORDER BY id;" + ) + for i, node in enumerate([node3, node4]): + node.query(f"DROP TABLE IF EXISTS testdb.{test_table} SYNC") + node.query("DROP DATABASE IF EXISTS testdb") + + node.query("CREATE DATABASE testdb") + node.query( + f"CREATE TABLE testdb.{test_table}(id UInt32, val String) ENGINE = ReplicatedMergeTree('/clickhouse/test/{test_table}2-{tmark}', '{i}') ORDER BY id;" + ) + + def test_distributed_ddl_queue(started_cluster): + test_table = "test_table" + maintain_test_table(test_table) node1.query( - "INSERT INTO testdb.test_table SELECT number, toString(number) FROM numbers(100)" + f"INSERT INTO testdb.{test_table} SELECT number, toString(number) FROM numbers(100)" ) node3.query( - "INSERT INTO testdb.test_table SELECT number, toString(number) FROM numbers(100)" + f"INSERT INTO testdb.{test_table} SELECT number, toString(number) FROM numbers(100)" ) - node2.query("SYSTEM SYNC REPLICA testdb.test_table") - node4.query("SYSTEM SYNC REPLICA testdb.test_table") + node2.query(f"SYSTEM SYNC REPLICA testdb.{test_table}") + node4.query(f"SYSTEM SYNC REPLICA testdb.{test_table}") node1.query( - "ALTER TABLE testdb.test_table ON CLUSTER test_cluster ADD COLUMN somecolumn UInt8 AFTER val", + f"ALTER TABLE testdb.{test_table} ON CLUSTER test_cluster ADD COLUMN somecolumn UInt8 AFTER val", settings={"replication_alter_partitions_sync": "2"}, ) for node in nodes: - node.query("SYSTEM SYNC REPLICA testdb.test_table") - assert node.query("SELECT somecolumn FROM testdb.test_table LIMIT 1") == "0\n" + node.query(f"SYSTEM SYNC REPLICA testdb.{test_table}") + assert ( + node.query(f"SELECT somecolumn FROM testdb.{test_table} LIMIT 1") == "0\n" + ) assert ( node.query( "SELECT If((SELECT count(*) FROM system.distributed_ddl_queue WHERE cluster='test_cluster' AND entry='query-0000000000') > 0, 'ok', 'fail')" ) == "ok\n" ) + + node1.query( + f"ALTER TABLE testdb.{test_table} ON CLUSTER test_cluster DROP COLUMN somecolumn", + settings={"replication_alter_partitions_sync": "2"}, + ) + + +def test_distributed_ddl_rubbish(started_cluster): + test_table = "test_table_rubbish" + 
maintain_test_table(test_table) + node1.query( + f"ALTER TABLE testdb.{test_table} ON CLUSTER test_cluster ADD COLUMN somenewcolumn UInt8 AFTER val", + settings={"replication_alter_partitions_sync": "2"}, + ) + + zk_content = node1.query( + "SELECT name, value, path FROM system.zookeeper WHERE path LIKE '/clickhouse/task_queue/ddl%' SETTINGS allow_unrestricted_reads_from_keeper=true", + parse=True, + ).to_dict("records") + + original_query = "" + new_query = "query-artificial-" + str(time.monotonic_ns()) + + # Copy information about query (one that added 'somenewcolumn') with new query ID + # and broken query text (TABLE => TUBLE) + for row in zk_content: + if row["value"].find("somenewcolumn") >= 0: + original_query = row["name"] + break + + rows_to_insert = [] + + for row in zk_content: + if row["name"] == original_query: + rows_to_insert.append( + { + "name": new_query, + "path": row["path"], + "value": row["value"].replace("TABLE", "TUBLE"), + } + ) + continue + pos = row["path"].find(original_query) + if pos >= 0: + rows_to_insert.append( + { + "name": row["name"], + "path": row["path"].replace(original_query, new_query), + "value": row["value"], + } + ) + + # Ingest it to ZK + for row in rows_to_insert: + node1.query( + "insert into system.zookeeper (name, path, value) values ('{}', '{}', '{}')".format( + f'{row["name"]}', f'{row["path"]}', f'{row["value"]}' + ) + ) + + # Ensure that data is visible via system.distributed_ddl_queue + assert ( + int( + node1.query( + f"SELECT count(1) FROM system.distributed_ddl_queue WHERE entry='{new_query}' AND cluster=''" + ) + ) + == 4 + ) + + node1.query( + f"ALTER TABLE testdb.{test_table} ON CLUSTER test_cluster DROP COLUMN somenewcolumn", + settings={"replication_alter_partitions_sync": "2"}, + ) From efedec39ceafbbc7ab992e630309040a115da78b Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Fri, 6 Jun 2025 09:11:25 -0400 Subject: [PATCH 50/67] smarter pr number fetching for grype and report --- .github/create_workflow_report.py | 10 ++++++++++ .github/workflows/grype_scan.yml | 4 ++-- .github/workflows/release_branches.yml | 4 +--- tests/ci/version_helper.py | 8 ++++++++ 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/create_workflow_report.py b/.github/create_workflow_report.py index a7f30f72aedf..c34457f36488 100755 --- a/.github/create_workflow_report.py +++ b/.github/create_workflow_report.py @@ -506,6 +506,16 @@ def parse_args() -> argparse.Namespace: def main(): args = parse_args() + if args.pr_number is None or args.commit_sha is None: + run_details = get_run_details(args.actions_run_url) + if args.pr_number is None: + if len(run_details["pull_requests"]) > 0: + args.pr_number = run_details["pull_requests"][0]["number"] + else: + args.pr_number = 0 + if args.commit_sha is None: + args.commit_sha = run_details["head_commit"]["id"] + db_client = Client( host=os.getenv(DATABASE_HOST_VAR), user=os.getenv(DATABASE_USER_VAR), diff --git a/.github/workflows/grype_scan.yml b/.github/workflows/grype_scan.yml index e749448b81ba..d4ce9b57977e 100644 --- a/.github/workflows/grype_scan.yml +++ b/.github/workflows/grype_scan.yml @@ -49,7 +49,7 @@ jobs: run: | python3 ./tests/ci/version_helper.py | tee /tmp/version_info source /tmp/version_info - echo "docker_image=${{ inputs.docker_image }}:${{ github.event.pull_request.number || 0 }}-$CLICKHOUSE_VERSION_STRING" >> $GITHUB_OUTPUT + echo "docker_image=${{ inputs.docker_image }}:$PR_NUMBER-$CLICKHOUSE_VERSION_STRING" >> $GITHUB_OUTPUT echo 
"commit_sha=$CLICKHOUSE_VERSION_GITHASH" >> $GITHUB_OUTPUT - name: Run Grype Scan @@ -67,9 +67,9 @@ jobs: env: S3_BUCKET: "altinity-build-artifacts" COMMIT_SHA: ${{ steps.set_version.outputs.commit_sha || github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - PR_NUMBER: ${{ github.event.pull_request.number || 0 }} DOCKER_IMAGE: ${{ steps.set_version.outputs.docker_image || inputs.docker_image }} run: | + echo "PR_NUMBER=$PR_NUMBER" ./.github/grype/transform_and_upload_results_s3.sh - name: Create step summary diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index c5ca0080ec4b..2f61a658785d 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -655,14 +655,12 @@ jobs: CHECKS_DATABASE_HOST: ${{ secrets.CHECKS_DATABASE_HOST }} CHECKS_DATABASE_USER: ${{ secrets.CLICKHOUSE_TEST_STAT_LOGIN }} CHECKS_DATABASE_PASSWORD: ${{ secrets.CLICKHOUSE_TEST_STAT_PASSWORD }} - COMMIT_SHA: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - PR_NUMBER: ${{ github.event.pull_request.number || 0 }} ACTIONS_RUN_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }} shell: bash run: | pip install clickhouse-driver==0.2.8 numpy==1.26.4 pandas==2.0.3 - REPORT_LINK=$(python3 .github/create_workflow_report.py --pr-number $PR_NUMBER --commit-sha $COMMIT_SHA --actions-run-url $ACTIONS_RUN_URL --known-fails tests/broken_tests.json --cves) + REPORT_LINK=$(python3 .github/create_workflow_report.py --actions-run-url $ACTIONS_RUN_URL --known-fails tests/broken_tests.json --cves) IS_VALID_URL=$(echo $REPORT_LINK | grep -E '^https?://') if [[ -n $IS_VALID_URL ]]; then diff --git a/tests/ci/version_helper.py b/tests/ci/version_helper.py index afee29c9c193..f097d4a57b9f 100755 --- a/tests/ci/version_helper.py +++ b/tests/ci/version_helper.py @@ -4,6 +4,8 @@ from pathlib import Path from typing import Any, Dict, Iterable, List, Literal, Optional, Set, Tuple, Union +from pr_info import PRInfo # grype scan needs to know the PR number + from git_helper import TWEAK, Git, get_tags, git_runner, removeprefix, VersionType FILE_WITH_VERSION_PATH = "cmake/autogenerated_versions.txt" @@ -531,6 +533,12 @@ def main(): if args.update_part or args.update_cmake: update_cmake_version(version) + # grype scan needs to know the PR number + pr_info = PRInfo() + print(f"PR_NUMBER={pr_info.number}") + if args.export: + print(f"export PR_NUMBER") + for k, v in version.as_dict().items(): name = f"CLICKHOUSE_VERSION_{k.upper()}" print(f"{name}='{v}'") From 0166c918c3c8ddf9eb65e434d7795b3c19a5317d Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Fri, 6 Jun 2025 13:05:06 -0400 Subject: [PATCH 51/67] let the grype workflow compute the tag --- .github/grype/run_grype_scan.sh | 2 +- .github/workflows/grype_scan.yml | 11 ++++++++++- .github/workflows/release_branches.yml | 3 ++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/grype/run_grype_scan.sh b/.github/grype/run_grype_scan.sh index c5ce0b1b10d3..af428e37d669 100755 --- a/.github/grype/run_grype_scan.sh +++ b/.github/grype/run_grype_scan.sh @@ -3,7 +3,7 @@ set -e IMAGE=$1 -GRYPE_VERSION="v0.80.1" +GRYPE_VERSION=${GRYPE_VERSION:-"v0.92.2"} docker pull $IMAGE docker pull anchore/grype:${GRYPE_VERSION} diff --git a/.github/workflows/grype_scan.yml b/.github/workflows/grype_scan.yml index d4ce9b57977e..fd6368a0953b 100644 --- 
a/.github/workflows/grype_scan.yml +++ b/.github/workflows/grype_scan.yml @@ -15,11 +15,17 @@ on: description: 'Docker image. If no tag, it will be determined by version_helper.py' required: true type: string + tag-suffix: + description: 'Tag suffix. To be appended the version from version_helper.py' + required: false + type: string + default: "" env: PYTHONUNBUFFERED: 1 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + GRYPE_VERSION: "v0.92.2-arm64v8" jobs: grype_scan: @@ -46,10 +52,12 @@ jobs: - name: Set image tag if not given if: ${{ !contains(inputs.docker_image, ':') }} id: set_version + env: + TAG_SUFFIX: ${{ inputs.tag-suffix }} run: | python3 ./tests/ci/version_helper.py | tee /tmp/version_info source /tmp/version_info - echo "docker_image=${{ inputs.docker_image }}:$PR_NUMBER-$CLICKHOUSE_VERSION_STRING" >> $GITHUB_OUTPUT + echo "docker_image=${{ inputs.docker_image }}:$PR_NUMBER-$CLICKHOUSE_VERSION_STRING$TAG_SUFFIX" >> $GITHUB_OUTPUT echo "commit_sha=$CLICKHOUSE_VERSION_GITHASH" >> $GITHUB_OUTPUT - name: Run Grype Scan @@ -67,6 +75,7 @@ jobs: env: S3_BUCKET: "altinity-build-artifacts" COMMIT_SHA: ${{ steps.set_version.outputs.commit_sha || github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + PR_NUMBER: ${{ env.PR_NUMBER || github.event.pull_request.number || 0 }} DOCKER_IMAGE: ${{ steps.set_version.outputs.docker_image || inputs.docker_image }} run: | echo "PR_NUMBER=$PR_NUMBER" diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 8270b3de1e14..c16e821aced4 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -593,7 +593,8 @@ jobs: uses: ./.github/workflows/grype_scan.yml secrets: inherit with: - docker_image: altinityinfra/clickhouse-${{ matrix.image }}:${{ github.event.pull_request.number || 0 }}-${{ fromJson(needs.RunConfig.outputs.data).version }}${{ matrix.suffix }} + docker_image: altinityinfra/clickhouse-${{ matrix.image }} + tag-suffix: ${{ matrix.suffix }} FinishCheck: if: ${{ !cancelled() }} needs: From 1cf050ff7b7bdada692f2920d8ea5c692e172a1b Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Mon, 19 May 2025 14:22:30 +0200 Subject: [PATCH 52/67] update alpine --- docker/keeper/Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 646ef74164ba..880dba685b95 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -12,9 +12,7 @@ RUN arch=${TARGETARCH:-amd64} \ && ln -s "${rarch}-linux-gnu" /lib/linux-gnu -# As of Nov 3rd 2024, alpine:3.20.3 has 1 "unknown"-type CVE: -# * https://security.alpinelinux.org/vuln/CVE-2024-9143 -FROM alpine:3.20.3 +FROM alpine:3.21.3 ENV LANG=en_US.UTF-8 \ LANGUAGE=en_US:en \ From 28ade05fc218d2dcb569328ecefaed8bb762f872 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Fri, 6 Jun 2025 15:05:54 -0400 Subject: [PATCH 53/67] fix python deps --- .github/workflows/grype_scan.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/grype_scan.yml b/.github/workflows/grype_scan.yml index fd6368a0953b..860b77d60b3f 100644 --- a/.github/workflows/grype_scan.yml +++ b/.github/workflows/grype_scan.yml @@ -45,7 +45,7 @@ jobs: sudo apt-get install -y python3-pip python3-venv python3 -m venv venv source venv/bin/activate - pip install 
--upgrade requests chardet urllib3 + pip install --upgrade requests chardet urllib3 unidiff boto3 PyGithub pip install testflows==$TESTFLOWS_VERSION awscli==1.33.28 echo PATH=$PATH >>$GITHUB_ENV From eabac0f8afa5ff00a41938364d7c1014b2ec6650 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Mon, 9 Jun 2025 11:22:35 -0400 Subject: [PATCH 54/67] add branch name to release report --- .github/create_workflow_report.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/.github/create_workflow_report.py b/.github/create_workflow_report.py index 403b0c60f7e4..cbb616cd4807 100755 --- a/.github/create_workflow_report.py +++ b/.github/create_workflow_report.py @@ -5,6 +5,7 @@ from itertools import combinations import json from datetime import datetime +from functools import lru_cache import requests import pandas as pd @@ -196,6 +197,29 @@ """ +@lru_cache +def get_run_details(run_url: str) -> dict: + """ + Fetch run details for a given run URL. + """ + run_id = run_url.split("/")[-1] + + headers = { + "Authorization": f"token {os.getenv('GITHUB_TOKEN')}", + "Accept": "application/vnd.github.v3+json", + } + + url = f"https://api.github.com/repos/{GITHUB_REPO}/actions/runs/{run_id}" + response = requests.get(url, headers=headers) + + if response.status_code != 200: + raise Exception( + f"Failed to fetch run details: {response.status_code} {response.text}" + ) + + return response.json() + + def get_commit_statuses(sha: str) -> pd.DataFrame: """ Fetch commit statuses for a given SHA and return as a pandas DataFrame. @@ -589,7 +613,9 @@ def main(): ) if args.pr_number == "0": - pr_info_html = "Release" + run_details = get_run_details(args.actions_run_url) + branch_name = run_details.get("head_branch", "unknown branch") + pr_info_html = f"Release ({branch_name})" else: try: pr_info = get_pr_info_from_number(args.pr_number) From 3d3000f9e8d700b1dccebb61c580cf1819d4fc83 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Mon, 9 Jun 2025 12:40:31 -0400 Subject: [PATCH 55/67] use version from RunConfig --- .github/workflows/grype_scan.yml | 13 ++++++++++++- .github/workflows/release_branches.yml | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/grype_scan.yml b/.github/workflows/grype_scan.yml index 860b77d60b3f..e68c3e63e283 100644 --- a/.github/workflows/grype_scan.yml +++ b/.github/workflows/grype_scan.yml @@ -15,6 +15,11 @@ on: description: 'Docker image. If no tag, it will be determined by version_helper.py' required: true type: string + version: + description: 'Version tag. If no version, it will be determined by version_helper.py' + required: false + type: string + default: "" tag-suffix: description: 'Tag suffix. 
To be appended the version from version_helper.py' required: false @@ -54,10 +59,16 @@ jobs: id: set_version env: TAG_SUFFIX: ${{ inputs.tag-suffix }} + SPECIFIED_VERSION: ${{ inputs.version }} run: | python3 ./tests/ci/version_helper.py | tee /tmp/version_info source /tmp/version_info - echo "docker_image=${{ inputs.docker_image }}:$PR_NUMBER-$CLICKHOUSE_VERSION_STRING$TAG_SUFFIX" >> $GITHUB_OUTPUT + if [ -z "$SPECIFIED_VERSION" ]; then + VERSION=$CLICKHOUSE_VERSION_STRING + else + VERSION=$SPECIFIED_VERSION + fi + echo "docker_image=${{ inputs.docker_image }}:$PR_NUMBER-$VERSION$TAG_SUFFIX" >> $GITHUB_OUTPUT echo "commit_sha=$CLICKHOUSE_VERSION_GITHASH" >> $GITHUB_OUTPUT - name: Run Grype Scan diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index c16e821aced4..75abdf0346f9 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -594,6 +594,7 @@ jobs: secrets: inherit with: docker_image: altinityinfra/clickhouse-${{ matrix.image }} + version: ${{ fromJson(needs.RunConfig.outputs.data).version }} tag-suffix: ${{ matrix.suffix }} FinishCheck: if: ${{ !cancelled() }} From eee7504f536fe29a47531c9997a03d016419025e Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Mon, 9 Jun 2025 13:00:26 -0400 Subject: [PATCH 56/67] use report action from main branch --- .../actions/create_workflow_report/action.yml | 41 ++ .../ci_run_report.html.jinja | 269 ++++++++++ .../create_workflow_report.py | 463 ++++++++---------- .github/workflows/release_branches.yml | 21 +- 4 files changed, 525 insertions(+), 269 deletions(-) create mode 100644 .github/actions/create_workflow_report/action.yml create mode 100644 .github/actions/create_workflow_report/ci_run_report.html.jinja rename .github/{ => actions/create_workflow_report}/create_workflow_report.py (56%) diff --git a/.github/actions/create_workflow_report/action.yml b/.github/actions/create_workflow_report/action.yml new file mode 100644 index 000000000000..fde62d01e29d --- /dev/null +++ b/.github/actions/create_workflow_report/action.yml @@ -0,0 +1,41 @@ +name: Create and Upload Combined Report +description: Create and upload a combined CI report +inputs: + final: + description: "Control whether the report is final or a preview" + required: false + default: "false" +runs: + using: "composite" + steps: + - name: Create and upload workflow report + env: + PR_NUMBER: ${{ github.event.pull_request.number || 0 }} + COMMIT_SHA: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ACTIONS_RUN_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }} + FINAL: ${{ inputs.final }} + shell: bash + run: | + pip install clickhouse-driver==0.2.8 numpy==1.26.4 pandas==2.0.3 jinja2==3.1.5 + + CMD="python3 .github/actions/create_workflow_report/create_workflow_report.py" + ARGS="--pr-number $PR_NUMBER --commit-sha $COMMIT_SHA --actions-run-url $ACTIONS_RUN_URL --known-fails tests/broken_tests.json --cves" + + set +e + if [[ "$FINAL" == "false" ]]; then + REPORT_LINK=$($CMD $ARGS --mark-preview) + else + REPORT_LINK=$($CMD $ARGS) + fi + + echo $REPORT_LINK + + if [[ "$FINAL" == "true" ]]; then + IS_VALID_URL=$(echo $REPORT_LINK | grep -E '^https?://') + if [[ -n $IS_VALID_URL ]]; then + echo "Workflow Run Report: [View Report]($REPORT_LINK)" >> $GITHUB_STEP_SUMMARY + else + echo "Error: $REPORT_LINK" >> $GITHUB_STEP_SUMMARY + exit 1 + fi + fi diff --git 
a/.github/actions/create_workflow_report/ci_run_report.html.jinja b/.github/actions/create_workflow_report/ci_run_report.html.jinja new file mode 100644 index 000000000000..a92c1aa34e3a --- /dev/null +++ b/.github/actions/create_workflow_report/ci_run_report.html.jinja @@ -0,0 +1,269 @@ + + + + + + + + + {{ title }} + + + + +

+ +

{{ title }}

+ + + + + + + + + + + + + + + + + + + + + + + +
Pull Request {{ pr_info_html }}
Workflow Run {{ workflow_id }}
Commit {{ commit_sha }}
Build Report Build Report
Date {{ date }}
+ {% if is_preview %} +

This is a preview. The workflow is not yet finished.

+ {% endif %} +

Table of Contents

+ + + {%- if pr_number != 0 -%} +

New Fails in PR

+

Compared with base sha {{ base_sha }}

+ {{ new_fails_html }} + {%- endif %} + +

CI Jobs Status

+ {{ ci_jobs_status_html }} + +

Checks Errors

+ {{ checks_errors_html }} + +

Checks New Fails

+ {{ checks_fails_html }} + +

Regression New Fails

+ {{ regression_fails_html }} + +

Docker Images CVEs

+ {{ docker_images_cves_html }} + +

Checks Known Fails

+

+ Fail reason conventions:
+ KNOWN - Accepted fail and a fix is not planned
+ INVESTIGATE - We don't know why it fails
+ NEEDSFIX - Investigation done and a fix is needed to make it pass
+

+ {{ checks_known_fails_html }} + + + + \ No newline at end of file diff --git a/.github/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py similarity index 56% rename from .github/create_workflow_report.py rename to .github/actions/create_workflow_report/create_workflow_report.py index cbb616cd4807..8918b1db7728 100755 --- a/.github/create_workflow_report.py +++ b/.github/actions/create_workflow_report/create_workflow_report.py @@ -7,12 +7,12 @@ from datetime import datetime from functools import lru_cache -import requests import pandas as pd +from jinja2 import Environment, FileSystemLoader +import requests from clickhouse_driver import Client import boto3 from botocore.exceptions import NoCredentialsError -import pandas as pd DATABASE_HOST_VAR = "CHECKS_DATABASE_HOST" DATABASE_USER_VAR = "CHECKS_DATABASE_USER" @@ -20,181 +20,13 @@ S3_BUCKET = "altinity-build-artifacts" GITHUB_REPO = "Altinity/ClickHouse" +# Set up the Jinja2 environment +template_dir = os.path.dirname(__file__) -css = """ - /* Base colors for Altinity */ - :root { - --altinity-background: #000D45; - --altinity-accent: #189DCF; - --altinity-highlight: #FFC600; - --altinity-gray: #6c757d; - --altinity-light-gray: #f8f9fa; - --altinity-white: #ffffff; - } - - /* Body and heading fonts */ - body { - font-family: Arimo, "Proxima Nova", "Helvetica Neue", Helvetica, Arial, sans-serif; - font-size: 1rem; - background-color: var(--altinity-background); - color: var(--altinity-light-gray); - padding: 2rem; - } - - h1, h2, h3, h4, h5, h6 { - font-family: Figtree, "Proxima Nova", "Helvetica Neue", Helvetica, Arial, sans-serif; - color: var(--altinity-white); - } - - .logo { - width: auto; - height: 5em; - } - - /* General table styling */ - table { - min-width: min(900px, 98vw); - margin: 1rem 0; - border-collapse: collapse; - background-color: var(--altinity-white); - border: 1px solid var(--altinity-accent); - box-shadow: 0 0 8px rgba(0, 0, 0, 0.05); - color: var(--altinity-background); - } - - /* Table header styling */ - th { - background-color: var(--altinity-accent); - color: var(--altinity-white); - padding: 10px 16px; - text-align: left; - border: none; - border-bottom: 2px solid var(--altinity-background); - white-space: nowrap; - } - th.hth { - border-bottom: 1px solid var(--altinity-accent); - border-right: 2px solid var(--altinity-background); - } - - /* Table header sorting styling */ - th { - cursor: pointer; - } - th.no-sort { - pointer-events: none; - } - th::after, - th::before { - transition: color 0.2s ease-in-out; - font-size: 1.2em; - color: transparent; - } - th::after { - margin-left: 3px; - content: '\\025B8'; - } - th:hover::after { - color: inherit; - } - th.dir-d::after { - color: inherit; - content: '\\025BE'; - } - th.dir-u::after { - color: inherit; - content: '\\025B4'; - } - - /* Table body row styling */ - tr:hover { - background-color: var(--altinity-light-gray); - } - - /* Table cell styling */ - td { - padding: 8px 8px; - border: 1px solid var(--altinity-accent); - } - - /* Link styling */ - a { - color: var(--altinity-accent); - text-decoration: none; - } - a:hover { - color: var(--altinity-highlight); - text-decoration: underline; - } -""" - -script = """ - -""" - -logo = """ -

-""" +# Load the template +template = Environment(loader=FileSystemLoader(template_dir)).get_template( + "ci_run_report.html.jinja" +) @lru_cache @@ -316,6 +148,29 @@ def get_pr_info_from_number(pr_number: str) -> dict: return response.json() +@lru_cache +def get_run_details(run_url: str) -> dict: + """ + Fetch run details for a given run URL. + """ + run_id = run_url.split("/")[-1] + + headers = { + "Authorization": f"token {os.getenv('GITHUB_TOKEN')}", + "Accept": "application/vnd.github.v3+json", + } + + url = f"https://api.github.com/repos/{GITHUB_REPO}/actions/runs/{run_id}" + response = requests.get(url, headers=headers) + + if response.status_code != 200: + raise Exception( + f"Failed to fetch run details: {response.status_code} {response.text}" + ) + + return response.json() + + def get_checks_fails(client: Client, job_url: str): """ Get tests that did not succeed for the given job URL. @@ -433,14 +288,14 @@ def get_regression_fails(client: Client, job_url: str): architecture as arch, test_name, argMax(result, start_time) AS status, - job_url, job_name, - report_url as results_link + report_url as results_link, + job_url FROM `gh-data`.clickhouse_regression_results GROUP BY architecture, test_name, job_url, job_name, report_url ORDER BY length(test_name) DESC ) - WHERE job_url='{job_url}' + WHERE job_url LIKE '{job_url}%' AND status IN ('Fail', 'Error') """ df = client.query_dataframe(query) @@ -449,7 +304,105 @@ def get_regression_fails(client: Client, job_url: str): return df +def get_new_fails_this_pr( + client: Client, + pr_info: dict, + checks_fails: pd.DataFrame, + regression_fails: pd.DataFrame, +): + """ + Get tests that failed in the PR but passed in the base branch. + Compares both checks and regression test results. + """ + base_sha = pr_info.get("base", {}).get("sha") + if not base_sha: + raise Exception("No base SHA found for PR") + + # Modify tables to have the same columns + if len(checks_fails) > 0: + checks_fails = checks_fails.copy().drop(columns=["job_status"]) + if len(regression_fails) > 0: + regression_fails = regression_fails.copy() + regression_fails["job_name"] = regression_fails.apply( + lambda row: f"{row['arch']} {row['job_name']}".strip(), axis=1 + ) + regression_fails["test_status"] = regression_fails["status"] + + # Combine both types of fails and select only desired columns + desired_columns = ["job_name", "test_name", "test_status", "results_link"] + all_pr_fails = pd.concat([checks_fails, regression_fails], ignore_index=True)[ + desired_columns + ] + if len(all_pr_fails) == 0: + return pd.DataFrame() + + # Get all checks from the base branch that didn't fail + base_checks_query = f"""SELECT job_name, status as test_status, test_name, results_link + FROM ( + SELECT + check_name as job_name, + argMax(test_status, check_start_time) as status, + test_name, + report_url as results_link, + task_url + FROM `gh-data`.checks + WHERE commit_sha='{base_sha}' + GROUP BY check_name, test_name, report_url, task_url + ) + WHERE test_status NOT IN ('FAIL', 'ERROR') + ORDER BY job_name, test_name + """ + base_checks = client.query_dataframe(base_checks_query) + + # Get regression results from base branch that didn't fail + base_regression_query = f"""SELECT arch, job_name, status, test_name, results_link + FROM ( + SELECT + architecture as arch, + test_name, + argMax(result, start_time) AS status, + job_url, + job_name, + report_url as results_link + FROM `gh-data`.clickhouse_regression_results + WHERE results_link LIKE'%/{base_sha}/%' + GROUP BY architecture, 
test_name, job_url, job_name, report_url + ORDER BY length(test_name) DESC + ) + WHERE status NOT IN ('Fail', 'Error') + """ + base_regression = client.query_dataframe(base_regression_query) + if len(base_regression) > 0: + base_regression["job_name"] = base_regression.apply( + lambda row: f"{row['arch']} {row['job_name']}".strip(), axis=1 + ) + base_regression["test_status"] = base_regression["status"] + base_regression = base_regression.drop(columns=["arch", "status"]) + + # Combine base results + base_results = pd.concat([base_checks, base_regression], ignore_index=True) + + # Find tests that failed in PR but passed in base + pr_failed_tests = set(zip(all_pr_fails["job_name"], all_pr_fails["test_name"])) + base_passed_tests = set(zip(base_results["job_name"], base_results["test_name"])) + + new_fails = pr_failed_tests.intersection(base_passed_tests) + + # Filter PR results to only include new fails + mask = all_pr_fails.apply( + lambda row: (row["job_name"], row["test_name"]) in new_fails, axis=1 + ) + new_fails_df = all_pr_fails[mask] + + return new_fails_df + + def get_cves(pr_number, commit_sha): + """ + Fetch Grype results from S3. + + If no results are available for download, returns ... (Ellipsis). + """ s3_client = boto3.client("s3", endpoint_url=os.getenv("S3_URL")) s3_prefix = f"{pr_number}/{commit_sha}/grype/" @@ -462,6 +415,11 @@ def get_cves(pr_number, commit_sha): content["Prefix"] for content in response.get("CommonPrefixes", []) ] + if len(grype_result_dirs) == 0: + # We were asked to check the CVE data, but none was found, + # maybe this is a preview report and grype results are not available yet + return ... + for path in grype_result_dirs: file_key = f"{path}result.json" file_response = s3_client.get_object(Bucket=S3_BUCKET, Key=file_key) @@ -535,21 +493,21 @@ def format_results_as_html_table(results) -> str: ), }, escape=False, - ).replace(' border="1"', "") + border=0, + classes=["test-results-table"], + ) return html def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Create a combined CI report.") - parser.add_argument( + parser.add_argument( # Need the full URL rather than just the ID to query the databases "--actions-run-url", required=True, help="URL of the actions run" ) parser.add_argument( - "--pr-number", required=True, help="Pull request number for the S3 path" - ) - parser.add_argument( - "--commit-sha", required=True, help="Commit SHA for the S3 path" + "--pr-number", help="Pull request number for the S3 path", type=int ) + parser.add_argument("--commit-sha", help="Commit SHA for the S3 path") parser.add_argument( "--no-upload", action="store_true", help="Do not upload the report" ) @@ -592,6 +550,7 @@ def main(): "job_statuses": get_commit_statuses(args.commit_sha), "checks_fails": get_checks_fails(db_client, args.actions_run_url), "checks_known_fails": [], + "pr_new_fails": [], "checks_errors": get_checks_errors(db_client, args.actions_run_url), "regression_fails": get_regression_fails(db_client, args.actions_run_url), "docker_images_cves": ( @@ -599,6 +558,12 @@ def main(): ), } + # get_cves returns ... in the case where no Grype result files were found. + # This might occur when run in preview mode. + cves_not_checked = not args.cves or ( + args.mark_preview and fail_results["docker_images_cves"] is ... 
+ ) + if args.known_fails: if not os.path.exists(args.known_fails): print(f"Known fails file {args.known_fails} not found.") @@ -612,7 +577,7 @@ def main(): db_client, args.actions_run_url, known_fails ) - if args.pr_number == "0": + if args.pr_number == 0: run_details = get_run_details(args.actions_run_url) branch_name = run_details.get("head_branch", "unknown branch") pr_info_html = f"Release ({branch_name})" @@ -622,11 +587,17 @@ def main(): pr_info_html = f""" #{pr_info.get("number")} ({pr_info.get("base", {}).get('ref')} <- {pr_info.get("head", {}).get('ref')}) {pr_info.get("title")} """ + fail_results["pr_new_fails"] = get_new_fails_this_pr( + db_client, + pr_info, + fail_results["checks_fails"], + fail_results["regression_fails"], + ) except Exception as e: pr_info_html = e high_cve_count = 0 - if len(fail_results["docker_images_cves"]) > 0: + if not cves_not_checked and len(fail_results["docker_images_cves"]) > 0: high_cve_count = ( fail_results["docker_images_cves"]["severity"] .str.lower() @@ -634,72 +605,60 @@ def main(): .sum() ) - title = "ClickHouse® CI Workflow Run Report" - - html_report = f""" - - - - - - - {title} - - - {logo} -

{title}

- - - - - - - - - - - - - -
Pull Request {pr_info_html}
Workflow Run {args.actions_run_url.split('/')[-1]}
Commit {args.commit_sha}
Date {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC
- -

Table of Contents

-{'

This is a preview. FinishCheck has not completed.

' if args.mark_preview else ""} - - -

CI Jobs Status

-{format_results_as_html_table(fail_results['job_statuses'])} - -

Checks Errors

-{format_results_as_html_table(fail_results['checks_errors'])} - -

Checks New Fails

-{format_results_as_html_table(fail_results['checks_fails'])} - -

Regression New Fails

-{format_results_as_html_table(fail_results['regression_fails'])} - -

Docker Images CVEs

-{"

Not Checked

" if not args.cves else format_results_as_html_table(fail_results['docker_images_cves'])} - -

Checks Known Fails

-{"

Not Checked

" if not args.known_fails else format_results_as_html_table(fail_results['checks_known_fails'])} - -{script} - - -""" + # Define the context for rendering + context = { + "title": "ClickHouse® CI Workflow Run Report", + "github_repo": GITHUB_REPO, + "s3_bucket": S3_BUCKET, + "pr_info_html": pr_info_html, + "pr_number": args.pr_number, + "workflow_id": args.actions_run_url.split("/")[-1], + "commit_sha": args.commit_sha, + "base_sha": "" if args.pr_number == 0 else pr_info.get("base", {}).get("sha"), + "date": f"{datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC", + "is_preview": args.mark_preview, + "counts": { + "jobs_status": f"{sum(fail_results['job_statuses']['job_status'] != 'success')} fail/error", + "checks_errors": len(fail_results["checks_errors"]), + "checks_new_fails": len(fail_results["checks_fails"]), + "regression_new_fails": len(fail_results["regression_fails"]), + "cves": "N/A" if cves_not_checked else f"{high_cve_count} high/critical", + "checks_known_fails": ( + "N/A" + if not args.known_fails + else len(fail_results["checks_known_fails"]) + ), + "pr_new_fails": len(fail_results["pr_new_fails"]), + }, + "ci_jobs_status_html": format_results_as_html_table( + fail_results["job_statuses"] + ), + "checks_errors_html": format_results_as_html_table( + fail_results["checks_errors"] + ), + "checks_fails_html": format_results_as_html_table(fail_results["checks_fails"]), + "regression_fails_html": format_results_as_html_table( + fail_results["regression_fails"] + ), + "docker_images_cves_html": ( + "

Not Checked

" + if cves_not_checked + else format_results_as_html_table(fail_results["docker_images_cves"]) + ), + "checks_known_fails_html": ( + "

Not Checked

" + if not args.known_fails + else format_results_as_html_table(fail_results["checks_known_fails"]) + ), + "new_fails_html": format_results_as_html_table(fail_results["pr_new_fails"]), + } + + # Render the template with the context + rendered_html = template.render(context) + report_name = "ci_run_report.html" report_path = Path(report_name) - report_path.write_text(html_report, encoding="utf-8") + report_path.write_text(rendered_html, encoding="utf-8") if args.no_upload: print(f"Report saved to {report_path}") @@ -714,7 +673,7 @@ def main(): s3_client.put_object( Bucket=S3_BUCKET, Key=report_destination_key, - Body=html_report, + Body=rendered_html, ContentType="text/html; charset=utf-8", ) except NoCredentialsError: diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 75abdf0346f9..091f5fa41406 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -653,24 +653,11 @@ jobs: python3 ./tests/ci/ci_buddy.py --check-wf-status - name: Create and upload report if: ${{ !cancelled() }} + uses: ./.github/actions/create_workflow_report@b229c9e6f9d1e100ba32f271b6bba112d9894afb env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} CHECKS_DATABASE_HOST: ${{ secrets.CHECKS_DATABASE_HOST }} CHECKS_DATABASE_USER: ${{ secrets.CLICKHOUSE_TEST_STAT_LOGIN }} CHECKS_DATABASE_PASSWORD: ${{ secrets.CLICKHOUSE_TEST_STAT_PASSWORD }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - ACTIONS_RUN_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }} - shell: bash - run: | - pip install clickhouse-driver==0.2.8 numpy==1.26.4 pandas==2.0.3 - - REPORT_LINK=$(python3 .github/create_workflow_report.py --actions-run-url $ACTIONS_RUN_URL --known-fails tests/broken_tests.json --cves) - - echo $REPORT_LINK - - IS_VALID_URL=$(echo $REPORT_LINK | grep -E '^https?://') - if [[ -n $IS_VALID_URL ]]; then - echo "Combined CI Report: [View Report]($REPORT_LINK)" >> $GITHUB_STEP_SUMMARY - else - echo "Error: $REPORT_LINK" >> $GITHUB_STEP_SUMMARY - exit 1 - fi + with: + final: true From a4a4a422819073c487d24f61eaa777c93d658fe8 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Wed, 11 Jun 2025 09:10:03 +0500 Subject: [PATCH 57/67] update regression version --- .github/workflows/release_branches.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index f297b31e0212..7fae297e16ae 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -544,7 +544,7 @@ jobs: secrets: inherit with: runner_type: altinity-on-demand, altinity-type-cpx51, altinity-image-x86-app-docker-ce, altinity-setup-regression - commit: f9e29772f4d261b82c23189d89038eb7ba027865 + commit: e3c00be97a045aa04e9d1a6ec50cc64f4c387b70 arch: release build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 @@ -555,7 +555,7 @@ jobs: secrets: inherit with: runner_type: altinity-on-demand, altinity-type-cax41, altinity-image-arm-app-docker-ce, altinity-setup-regression - commit: f9e29772f4d261b82c23189d89038eb7ba027865 + commit: e3c00be97a045aa04e9d1a6ec50cc64f4c387b70 arch: aarch64 build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 From 27ce9e6c154265da72f1b7d4786e7f2ec96e1e81 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Wed, 11 Jun 2025 09:06:12 
-0400 Subject: [PATCH 58/67] support live updating report --- .github/workflows/release_branches.yml | 10 +++++++++- .github/workflows/reusable_test.yml | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 091f5fa41406..b6ba377143ba 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -69,6 +69,14 @@ jobs: - name: Re-create GH statuses for skipped jobs if any run: | python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ runner.temp }}/ci_run_data.json --update-gh-statuses + - name: Note report location to summary + env: + PR_NUMBER: ${{ github.event.pull_request.number || 0 }} + COMMIT_SHA: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + run: | + REPORT_LINK=https://s3.amazonaws.com/altinity-build-artifacts/$PR_NUMBER/$COMMIT_SHA/ci_run_report.html + echo "Workflow Run Report: [View Report]($REPORT_LINK)" >> $GITHUB_STEP_SUMMARY + BuildDockers: needs: [RunConfig] if: ${{ !failure() && !cancelled() }} @@ -653,7 +661,7 @@ jobs: python3 ./tests/ci/ci_buddy.py --check-wf-status - name: Create and upload report if: ${{ !cancelled() }} - uses: ./.github/actions/create_workflow_report@b229c9e6f9d1e100ba32f271b6bba112d9894afb + uses: ./.github/actions/create_workflow_report env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} CHECKS_DATABASE_HOST: ${{ secrets.CHECKS_DATABASE_HOST }} diff --git a/.github/workflows/reusable_test.yml b/.github/workflows/reusable_test.yml index 4cb00cc9e83f..64f41c3c28a4 100644 --- a/.github/workflows/reusable_test.yml +++ b/.github/workflows/reusable_test.yml @@ -163,6 +163,16 @@ jobs: if: ${{ !cancelled() }} run: | python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --post --job-name '${{inputs.test_name}}' + - name: Update workflow report + if: ${{ !cancelled() }} + uses: ./.github/actions/create_workflow_report + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CHECKS_DATABASE_HOST: ${{ secrets.CHECKS_DATABASE_HOST }} + CHECKS_DATABASE_USER: ${{ secrets.CLICKHOUSE_TEST_STAT_LOGIN }} + CHECKS_DATABASE_PASSWORD: ${{ secrets.CLICKHOUSE_TEST_STAT_PASSWORD }} + with: + final: false - name: Mark as done if: ${{ !cancelled() }} run: | From 34e9bcf53892155467d296433830e665004e3df0 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Thu, 12 Jun 2025 13:24:26 +0000 Subject: [PATCH 59/67] Adapt test_system_ddl_worker_queue/test.py::test_distributed_ddl_rubbish to 24.8 --- .../test_system_ddl_worker_queue/test.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_system_ddl_worker_queue/test.py b/tests/integration/test_system_ddl_worker_queue/test.py index 1bebf709a821..a743dbe3f396 100644 --- a/tests/integration/test_system_ddl_worker_queue/test.py +++ b/tests/integration/test_system_ddl_worker_queue/test.py @@ -1,5 +1,8 @@ import pytest import time +from io import StringIO +import csv +import logging from helpers.cluster import ClickHouseCluster @@ -95,10 +98,13 @@ def test_distributed_ddl_rubbish(started_cluster): settings={"replication_alter_partitions_sync": "2"}, ) - zk_content = node1.query( - "SELECT name, value, path FROM system.zookeeper WHERE path LIKE '/clickhouse/task_queue/ddl%' SETTINGS allow_unrestricted_reads_from_keeper=true", - parse=True, - ).to_dict("records") + zk_content_raw = node1.query( + "SELECT name, value, path FROM system.zookeeper WHERE path LIKE 
'/clickhouse/task_queue/ddl%' SETTINGS allow_unrestricted_reads_from_keeper=true FORMAT TabSeparatedWithNames", + # parse=True, + ) # .to_dict("records") + + dict_reader = csv.DictReader(StringIO(zk_content_raw), delimiter='\t') + zk_content = [row for row in dict_reader] original_query = "" new_query = "query-artificial-" + str(time.monotonic_ns()) @@ -150,7 +156,7 @@ def test_distributed_ddl_rubbish(started_cluster): == 4 ) - node1.query( - f"ALTER TABLE testdb.{test_table} ON CLUSTER test_cluster DROP COLUMN somenewcolumn", - settings={"replication_alter_partitions_sync": "2"}, - ) + # node1.query( + # f"ALTER TABLE testdb.{test_table} ON CLUSTER test_cluster DROP COLUMN somenewcolumn", + # settings={"replication_alter_partitions_sync": "2"}, + # ) From 3a36e8865a14786b4d96c2f0b2115ac3185c6e03 Mon Sep 17 00:00:00 2001 From: strtgbb <146047128+strtgbb@users.noreply.github.com> Date: Wed, 11 Jun 2025 13:49:19 -0400 Subject: [PATCH 60/67] small fixes --- .../create_workflow_report/create_workflow_report.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/actions/create_workflow_report/create_workflow_report.py b/.github/actions/create_workflow_report/create_workflow_report.py index 8918b1db7728..4f360b9f092d 100755 --- a/.github/actions/create_workflow_report/create_workflow_report.py +++ b/.github/actions/create_workflow_report/create_workflow_report.py @@ -470,7 +470,7 @@ def format_test_status(text: str) -> str: color = ( "red" if text.lower().startswith("fail") - else "orange" if text.lower() in ("error", "broken") else "green" + else "orange" if text.lower() in ("error", "broken", "pending") else "green" ) return f'{text}' @@ -560,9 +560,7 @@ def main(): # get_cves returns ... in the case where no Grype result files were found. # This might occur when run in preview mode. - cves_not_checked = not args.cves or ( - args.mark_preview and fail_results["docker_images_cves"] is ... - ) + cves_not_checked = not args.cves or fail_results["docker_images_cves"] is ... 
if args.known_fails: if not os.path.exists(args.known_fails): @@ -618,7 +616,7 @@ def main(): "date": f"{datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC", "is_preview": args.mark_preview, "counts": { - "jobs_status": f"{sum(fail_results['job_statuses']['job_status'] != 'success')} fail/error", + "jobs_status": f"{sum(fail_results['job_statuses']['job_status'] != 'success')} fail/error/pending", "checks_errors": len(fail_results["checks_errors"]), "checks_new_fails": len(fail_results["checks_fails"]), "regression_new_fails": len(fail_results["regression_fails"]), From 2ca9156e8f67293d54ea08f4f04ad0cf485b70c0 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> Date: Wed, 11 Jun 2025 22:44:03 +0000 Subject: [PATCH 61/67] Merge pull request #79743 from zvonand/fix-group-by-const-cols Analyzer: Fix wrong results for grouping sets with ColumnConst --- src/Interpreters/ActionsDAG.cpp | 4 +-- src/Interpreters/ActionsDAG.h | 2 +- src/Planner/PlannerExpressionAnalysis.cpp | 4 ++- ...757_enum_defaults_const_analyzer.reference | 2 +- ...ping_sets_analyzer_const_columns.reference | 27 +++++++++++++++++++ ...7_grouping_sets_analyzer_const_columns.sql | 26 ++++++++++++++++++ 6 files changed, 60 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/03447_grouping_sets_analyzer_const_columns.reference create mode 100644 tests/queries/0_stateless/03447_grouping_sets_analyzer_const_columns.sql diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 2a594839c6a9..d5b5695b24c5 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -134,11 +134,11 @@ ActionsDAG::ActionsDAG(const NamesAndTypesList & inputs_) outputs.push_back(&addInput(input.name, input.type)); } -ActionsDAG::ActionsDAG(const ColumnsWithTypeAndName & inputs_) +ActionsDAG::ActionsDAG(const ColumnsWithTypeAndName & inputs_, bool duplicate_const_columns) { for (const auto & input : inputs_) { - if (input.column && isColumnConst(*input.column)) + if (input.column && isColumnConst(*input.column) && duplicate_const_columns) { addInput(input); diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index ee2b3fbf4f28..c73968d175b5 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -109,7 +109,7 @@ class ActionsDAG ActionsDAG & operator=(ActionsDAG &&) = default; ActionsDAG & operator=(const ActionsDAG &) = delete; explicit ActionsDAG(const NamesAndTypesList & inputs_); - explicit ActionsDAG(const ColumnsWithTypeAndName & inputs_); + explicit ActionsDAG(const ColumnsWithTypeAndName & inputs_, bool duplicate_const_columns = true); const Nodes & getNodes() const { return nodes; } static Nodes detachNodes(ActionsDAG && dag) { return std::move(dag.nodes); } diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp index ed3f78193ee8..9f51d439795e 100644 --- a/src/Planner/PlannerExpressionAnalysis.cpp +++ b/src/Planner/PlannerExpressionAnalysis.cpp @@ -121,7 +121,9 @@ std::optional analyzeAggregation(const QueryTreeNodeP Names aggregation_keys; ActionsAndProjectInputsFlagPtr before_aggregation_actions = std::make_shared(); - before_aggregation_actions->dag = ActionsDAG(input_columns); + /// Here it is OK to materialize const columns: if column is used in GROUP BY, it may be expected to become non-const + /// See https://github.com/ClickHouse/ClickHouse/issues/70655 for example + before_aggregation_actions->dag = 
ActionsDAG(input_columns, false); before_aggregation_actions->dag.getOutputs().clear(); std::unordered_set before_aggregation_actions_output_node_names; diff --git a/tests/queries/0_stateless/00757_enum_defaults_const_analyzer.reference b/tests/queries/0_stateless/00757_enum_defaults_const_analyzer.reference index 6895acffed15..56ead34ad3b7 100644 --- a/tests/queries/0_stateless/00757_enum_defaults_const_analyzer.reference +++ b/tests/queries/0_stateless/00757_enum_defaults_const_analyzer.reference @@ -3,4 +3,4 @@ iphone 1 iphone 1 iphone 1 -iphone 1 +\N 1 diff --git a/tests/queries/0_stateless/03447_grouping_sets_analyzer_const_columns.reference b/tests/queries/0_stateless/03447_grouping_sets_analyzer_const_columns.reference new file mode 100644 index 000000000000..723208a70e1f --- /dev/null +++ b/tests/queries/0_stateless/03447_grouping_sets_analyzer_const_columns.reference @@ -0,0 +1,27 @@ +Const column in grouping set, analyzer on: +0 0 value 0 1 +0 0 value 1 1 +0 0 value 2 1 +0 0 value 3 1 +1 0 0 1 +1 0 1 1 +1 0 2 1 +1 0 3 1 +Non-const column in grouping set, analyzer on: +0 0 value 0 1 +0 0 value 1 1 +0 0 value 2 1 +0 0 value 3 1 +1 0 0 1 +1 0 1 1 +1 0 2 1 +1 0 3 1 +Const column in grouping set, analyzer off: +0 0 value 0 1 +0 0 value 1 1 +0 0 value 2 1 +0 0 value 3 1 +1 0 0 1 +1 0 1 1 +1 0 2 1 +1 0 3 1 diff --git a/tests/queries/0_stateless/03447_grouping_sets_analyzer_const_columns.sql b/tests/queries/0_stateless/03447_grouping_sets_analyzer_const_columns.sql new file mode 100644 index 000000000000..f12e5636bd3a --- /dev/null +++ b/tests/queries/0_stateless/03447_grouping_sets_analyzer_const_columns.sql @@ -0,0 +1,26 @@ +SET enable_analyzer=1; + +SELECT 'Const column in grouping set, analyzer on:'; + +SELECT grouping(key_a), grouping(key_b), key_a, key_b, count() FROM ( + SELECT 'value' as key_a, number as key_b FROM numbers(4) +) +GROUP BY GROUPING SETS((key_b), (key_a, key_b)) +ORDER BY (grouping(key_a), grouping(key_b), key_a, key_b); + +SELECT 'Non-const column in grouping set, analyzer on:'; + +SELECT grouping(key_a), grouping(key_b), key_a, key_b, count() FROM ( + SELECT materialize('value') as key_a, number as key_b FROM numbers(4) +) +GROUP BY GROUPING SETS((key_b), (key_a, key_b)) +ORDER BY (grouping(key_a), grouping(key_b), key_a, key_b); + +SELECT 'Const column in grouping set, analyzer off:'; + +SELECT grouping(key_a), grouping(key_b), key_a, key_b, count() FROM ( + SELECT 'value' as key_a, number as key_b FROM numbers(4) +) +GROUP BY GROUPING SETS((key_b), (key_a, key_b)) +ORDER BY (grouping(key_a), grouping(key_b), key_a, key_b) +SETTINGS allow_experimental_analyzer=0; From 3a11a8307451766527758825cab14bbd25aa3a04 Mon Sep 17 00:00:00 2001 From: Pervakov Grigorii Date: Fri, 6 Jun 2025 14:48:02 +0000 Subject: [PATCH 62/67] Merge pull request #79969 from filimonov/fix_index_match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix match(col, '^…') index analysis with escaped metacharacters to avoid wrong results and crashes --- src/Storages/MergeTree/KeyCondition.cpp | 11 +- ...56_match_index_prefix_extraction.reference | 52 ++++++++ .../03456_match_index_prefix_extraction.sql | 111 ++++++++++++++++++ 3 files changed, 170 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/03456_match_index_prefix_extraction.reference create mode 100644 tests/queries/0_stateless/03456_match_index_prefix_extraction.sql diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 
64d33da1d2c5..fbc33ec800f6 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -101,7 +101,7 @@ static String extractFixedPrefixFromRegularExpression(const String & regexp) const char * pos = begin; const char * end = regexp.data() + regexp.size(); - while (pos != end) + while (pos < end) { switch (*pos) { @@ -124,19 +124,22 @@ static String extractFixedPrefixFromRegularExpression(const String & regexp) case '$': case '.': case '[': + case ']': case '?': case '*': case '+': + case '\\': case '{': + case '}': + case '-': fixed_prefix += *pos; + ++pos; break; default: /// all other escape sequences are not supported - pos = end; - break; + pos = end; } - ++pos; break; } diff --git a/tests/queries/0_stateless/03456_match_index_prefix_extraction.reference b/tests/queries/0_stateless/03456_match_index_prefix_extraction.reference new file mode 100644 index 000000000000..72aa8c7e403f --- /dev/null +++ b/tests/queries/0_stateless/03456_match_index_prefix_extraction.reference @@ -0,0 +1,52 @@ +Condition: (path in [\'xxx(zzz\', \'xxx(zz{\')) +1 +Condition: (path in [\'xxx)zzz\', \'xxx)zz{\')) +1 +Condition: (path in [\'xxx^zzz\', \'xxx^zz{\')) +1 +Condition: (path in [\'xxx$zzz\', \'xxx$zz{\')) +1 +Condition: (path in [\'xxx.zzz\', \'xxx.zz{\')) +1 +Condition: (path in [\'xxx[zzz\', \'xxx[zz{\')) +1 +Condition: (path in [\'xxx]zzz\', \'xxx]zz{\')) +1 +Condition: (path in [\'xxx?zzz\', \'xxx?zz{\')) +1 +Condition: (path in [\'xxx*zzz\', \'xxx*zz{\')) +1 +Condition: (path in [\'xxx+zzz\', \'xxx+zz{\')) +1 +Condition: (path in [\'xxx\\\\zzz\', \'xxx\\\\zz{\')) +1 +Condition: (path in [\'xxx{zzz\', \'xxx{zz{\')) +1 +Condition: (path in [\'xxx}zzz\', \'xxx}zz{\')) +1 +Condition: (path in [\'xxx-zzz\', \'xxx-zz{\')) +1 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +15 +Condition: (path in [\'xxx\', \'xxy\')) +15 +Condition: (path in [\'xxx\', \'xxy\')) +15 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: (path in [\'xxx\', \'xxy\')) +0 +Condition: true +Condition: true diff --git a/tests/queries/0_stateless/03456_match_index_prefix_extraction.sql b/tests/queries/0_stateless/03456_match_index_prefix_extraction.sql new file mode 100644 index 000000000000..56eeb8bdb93d --- /dev/null +++ b/tests/queries/0_stateless/03456_match_index_prefix_extraction.sql @@ -0,0 +1,111 @@ +SET parallel_replicas_local_plan=1; + +drop table if exists foo; + +CREATE TABLE foo (id UInt8, path String) engine = MergeTree ORDER BY (path) SETTINGS index_granularity=1; + +INSERT INTO foo VALUES (1, 'xxx|yyy'), +(2, 'xxx(zzz'), +(3, 'xxx)zzz'), +(4, 'xxx^zzz'), +(5, 'xxx$zzz'), +(6, 'xxx.zzz'), +(7, 'xxx[zzz'), +(8, 'xxx]zzz'), +(9, 'xxx?zzz'), +(10, 'xxx*zzz'), +(11, 'xxx+zzz'), +(12, 'xxx\\zzz'), +(13, 'xxx{zzz'), +(14, 'xxx}zzz'), +(15, 'xxx-zzz'); + + +-- check if also escaped sequence are properly extracted +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\(zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\(zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\)zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, 
'^xxx\\)zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\^zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\^zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\$zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\$zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\.zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\.zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\[zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\[zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\]zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\]zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\?zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\?zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\*zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\*zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\+zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\+zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\\\zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\\\zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\{zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\{zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\}zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\}zzz') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\-zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\-zzz') SETTINGS force_primary_key = 1; + + +-- those regex chars prevent the index use (only 3 first chars used during index scan) +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\0bla')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\0bla') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx(bla)')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx(bla)') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx[bla]')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE 
match(path, '^xxx[bla]') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx^bla')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx^bla') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx.bla')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx.bla') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx+bla')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx+bla') SETTINGS force_primary_key = 1; + + +-- here the forth char is not used during index, because it has 0+ quantifier +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx{0,1}')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxxx{0,1}') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx?')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxxx?') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx*')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxxx*') SETTINGS force_primary_key = 1; + +-- some unsupported regex chars - only 3 first chars used during index scan +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\d+')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\d+') SETTINGS force_primary_key = 1; + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\w+')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\w+') SETTINGS force_primary_key = 1; + + +-- fully disabled for pipes - see https://github.com/ClickHouse/ClickHouse/pull/54696 +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxx\\|zzz')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxx\\|zzz') SETTINGS force_primary_key = 1; -- { serverError INDEX_NOT_USED } + +SELECT trimLeft(explain) FROM (EXPLAIN PLAN indexes=1 SELECT id FROM foo WHERE match(path, '^xxxx|foo')) WHERE explain like '%Condition%'; +SELECT count() FROM foo WHERE match(path, '^xxxx|foo') SETTINGS force_primary_key = 1; -- { serverError INDEX_NOT_USED } From 5f57c5fc03513d0ee77e8c46da2300ca995f6da1 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 17 Jun 2025 17:41:23 -0300 Subject: [PATCH 63/67] partial backport --- .../impl/CHKeyValuePairExtractor.h | 20 ++++++---- .../keyvaluepair/impl/NeedleFactory.h | 13 ++++++- .../keyvaluepair/impl/StateHandler.h | 5 +++ .../keyvaluepair/impl/StateHandlerImpl.h | 39 +++++++++++++++++-- ...t_key_value_pairs_multiple_input.reference | 11 ++++++ ...extract_key_value_pairs_multiple_input.sql | 12 ++++++ 6 files changed, 87 insertions(+), 13 deletions(-) diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h index 3895cf3e77db..645d87dfda63 100644 --- a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -102,7 +102,17 @@ class 
CHKeyValuePairExtractor : public KeyValuePairExtractor } case State::FLUSH_PAIR: { - return flushPair(file, key, value, row_offset); + incrementRowOffset(row_offset); + return state_handler.flushPair(file, key, value); + } + case State::FLUSH_PAIR_AFTER_QUOTED_VALUE: + { + incrementRowOffset(row_offset); + return state_handler.flushPairAfterQuotedValue(file, key, value); + } + case State::WAITING_PAIR_DELIMITER: + { + return state_handler.waitPairDelimiter(file); } case State::END: { @@ -111,8 +121,7 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor } } - NextState flushPair(const std::string_view & file, auto & key, - auto & value, uint64_t & row_offset) + void incrementRowOffset(uint64_t & row_offset) { row_offset++; @@ -120,11 +129,6 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor { throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs); } - - key.commit(); - value.commit(); - - return {0, file.empty() ? State::END : State::WAITING_KEY}; } void reset(auto & key, auto & value) diff --git a/src/Functions/keyvaluepair/impl/NeedleFactory.h b/src/Functions/keyvaluepair/impl/NeedleFactory.h index 83862a2281a5..5e58523fb277 100644 --- a/src/Functions/keyvaluepair/impl/NeedleFactory.h +++ b/src/Functions/keyvaluepair/impl/NeedleFactory.h @@ -20,7 +20,7 @@ template class NeedleFactory { public: - SearchSymbols getWaitNeedles(const Configuration & extractor_configuration) + SearchSymbols getWaitKeyNeedles(const Configuration & extractor_configuration) { const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; @@ -39,6 +39,17 @@ class NeedleFactory return SearchSymbols {std::string{needles.data(), needles.size()}}; } + SearchSymbols getWaitPairDelimiterNeedles(const Configuration & extractor_configuration) + { + const auto & pair_delimiters = extractor_configuration.pair_delimiters; + + std::vector needles; + + std::copy(pair_delimiters.begin(), pair_delimiters.end(), std::back_inserter(needles)); + + return SearchSymbols {std::string{needles.data(), needles.size()}}; + } + SearchSymbols getReadKeyNeedles(const Configuration & extractor_configuration) { const auto & [key_value_delimiter, quoting_character, pair_delimiters] diff --git a/src/Functions/keyvaluepair/impl/StateHandler.h b/src/Functions/keyvaluepair/impl/StateHandler.h index 178974e9d366..f102debec609 100644 --- a/src/Functions/keyvaluepair/impl/StateHandler.h +++ b/src/Functions/keyvaluepair/impl/StateHandler.h @@ -30,6 +30,11 @@ class StateHandler READING_QUOTED_VALUE, // In this state, both key and value have already been collected and should be flushed. Might jump to WAITING_KEY or END. FLUSH_PAIR, + // The `READING_QUOTED_VALUE` will assert the closing quoting character is found and then flush the pair. In this case, we should not + // move from `FLUSH_PAIR` directly to `WAITING_FOR_KEY` because a pair delimiter has not been found. Might jump to WAITING_FOR_PAIR_DELIMITER or END + FLUSH_PAIR_AFTER_QUOTED_VALUE, + // Might jump to WAITING_KEY or END. 
+ WAITING_PAIR_DELIMITER, END }; diff --git a/src/Functions/keyvaluepair/impl/StateHandlerImpl.h b/src/Functions/keyvaluepair/impl/StateHandlerImpl.h index cf31d30b9dc4..8b18b2050fdd 100644 --- a/src/Functions/keyvaluepair/impl/StateHandlerImpl.h +++ b/src/Functions/keyvaluepair/impl/StateHandlerImpl.h @@ -40,10 +40,11 @@ class StateHandlerImpl : public StateHandler * */ NeedleFactory needle_factory; - wait_needles = needle_factory.getWaitNeedles(configuration); + wait_key_needles = needle_factory.getWaitKeyNeedles(configuration); read_key_needles = needle_factory.getReadKeyNeedles(configuration); read_value_needles = needle_factory.getReadValueNeedles(configuration); read_quoted_needles = needle_factory.getReadQuotedNeedles(configuration); + wait_pair_delimiter_needles = needle_factory.getWaitPairDelimiterNeedles(configuration); } /* @@ -51,7 +52,7 @@ class StateHandlerImpl : public StateHandler * */ [[nodiscard]] NextState waitKey(std::string_view file) const { - if (const auto * p = find_first_not_symbols_or_null(file, wait_needles)) + if (const auto * p = find_first_not_symbols_or_null(file, wait_key_needles)) { const size_t character_position = p - file.begin(); if (isQuotingCharacter(*p)) @@ -284,7 +285,7 @@ class StateHandlerImpl : public StateHandler { value.append(file.begin() + pos, file.begin() + character_position); - return {next_pos, State::FLUSH_PAIR}; + return {next_pos, State::FLUSH_PAIR_AFTER_QUOTED_VALUE}; } pos = next_pos; @@ -292,14 +293,44 @@ class StateHandlerImpl : public StateHandler return {file.size(), State::END}; } + [[nodiscard]] NextState flushPair(std::string_view file, auto & key, auto & value) const + { + key.commit(); + value.commit(); + + return {0, file.empty() ? State::END : State::WAITING_KEY}; + } + + [[nodiscard]] NextState flushPairAfterQuotedValue(std::string_view file, auto & key, auto & value) const + { + key.commit(); + value.commit(); + + return {0, file.empty() ? State::END : State::WAITING_PAIR_DELIMITER}; + } + + [[nodiscard]] NextState waitPairDelimiter(std::string_view file) const + { + if (const auto * p = find_first_symbols_or_null(file, wait_pair_delimiter_needles)) + { + const size_t character_position = p - file.data(); + size_t next_pos = character_position + 1u; + + return {next_pos, State::WAITING_KEY}; + } + + return {file.size(), State::END}; + } + const Configuration configuration; private: - SearchSymbols wait_needles; + SearchSymbols wait_key_needles; SearchSymbols read_key_needles; SearchSymbols read_value_needles; SearchSymbols read_quoted_needles; + SearchSymbols wait_pair_delimiter_needles; /* * Helper method to copy bytes until `character_pos` and process possible escape sequence. 
Returns a pair containing a boolean diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference index 9a0cfdffcb50..d9a3c10c835d 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference @@ -381,3 +381,14 @@ WITH SELECT x; {'age':'31','name':'neymar','nationality':'brazil','team':'psg'} +-- after parsing a quoted value, the next key should only start after a pair delimiter +WITH + extractKeyValuePairs('key:"quoted_value"junk,second_key:0') as s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql index 4f3db3f166b4..ce05ee6a3f3b 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -516,3 +516,15 @@ WITH ) AS x SELECT x; + +-- after parsing a quoted value, the next key should only start after a pair delimiter +WITH + extractKeyValuePairs('key:"quoted_value"junk,second_key:0') as s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; From c8f5926c0c4b2ce5a36d97092b0953ea02b76234 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 17 Jun 2025 19:57:36 -0300 Subject: [PATCH 64/67] fix tests --- .../02499_extract_key_value_pairs_multiple_input.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference index d9a3c10c835d..bf6f30bfc1b1 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference @@ -392,3 +392,4 @@ WITH ) AS x SELECT x; +{'key':'quoted_value','second_key':'0'} From b24162488a669b7734e763da53805eb5e70b6fd1 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Wed, 18 Jun 2025 08:45:09 +0300 Subject: [PATCH 65/67] Poke CI once more From 7943791bbb540719ad8440bb0f9148feee30cf57 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Wed, 18 Jun 2025 09:20:26 +0300 Subject: [PATCH 66/67] Poke CI once more From 2558dfa8db4869b50bf5fce614afc47efc093342 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Wed, 25 Jun 2025 13:12:44 +0200 Subject: [PATCH 67/67] Bumped VERSION_TWEAK to have proper version --- cmake/autogenerated_versions.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index 28b744feeda5..d71dcfefe2c3 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -10,10 +10,10 @@ SET(VERSION_GITHASH c8a1e828dcf9832dc2d71adcbd50c698f93bb69b) #10000 for altinitystable candidates #20000 for altinityedge candidates -SET(VERSION_TWEAK 10000) +SET(VERSION_TWEAK 10500) SET(VERSION_FLAVOUR altinitytest) -SET(VERSION_DESCRIBE v24.8.14.10000.altinitytest) -SET(VERSION_STRING 24.8.14.10000.altinitytest) +SET(VERSION_DESCRIBE v24.8.14.10500.altinitytest) +SET(VERSION_STRING 24.8.14.10500.altinitytest) # end of autochange