From e1e3d4c64d0587fa7026e07b0f838d95681ac46d Mon Sep 17 00:00:00 2001 From: John Humlick <15677335+jhumlick@users.noreply.github.com> Date: Mon, 7 Apr 2025 16:50:09 -0700 Subject: [PATCH 1/2] libclamav: Add URI scanning support to PDF parser Threat Research requests scanning URIs in PDF files and adding them to the JSON report file. This change adds URI scanning support to the PDF parser, including support for object references to URIs in PDF files. Jira: CLAM-2588 Fix out-of-order references and other minor improvements. CLAM-2588, CLAM-2757 --- clamscan/clamscan.c | 6 +- clamscan/manager.c | 8 +- common/optparser.c | 3 +- etc/clamd.conf.sample | 12 +- libclamav/bytecode_api.h | 3 +- libclamav/clamav.h | 8 +- libclamav/others.h | 3 +- libclamav/pdf.c | 275 +++++++++++++----- libclamav/scanners.c | 16 +- ...ml_urls_test.py => save_html_uris_test.py} | 11 +- unit_tests/clamscan/save_pdf_uris_test.py | 85 ++++++ .../other_scanfiles/pdf/out-of-order.pdf | Bin 0 -> 831 bytes .../input/other_scanfiles/pdf/uri-and-ref.pdf | Bin 0 -> 831 bytes win32/conf_examples/clamd.conf.sample | 12 +- 14 files changed, 346 insertions(+), 96 deletions(-) rename unit_tests/clamscan/{save_html_urls_test.py => save_html_uris_test.py} (88%) create mode 100644 unit_tests/clamscan/save_pdf_uris_test.py create mode 100644 unit_tests/input/other_scanfiles/pdf/out-of-order.pdf create mode 100644 unit_tests/input/other_scanfiles/pdf/uri-and-ref.pdf diff --git a/clamscan/clamscan.c b/clamscan/clamscan.c index 9bfd93587e..83abe0fb86 100644 --- a/clamscan/clamscan.c +++ b/clamscan/clamscan.c @@ -254,8 +254,10 @@ void help(void) mprintf(LOGG_INFO, " --gen-json[=yes/no(*)] Generate JSON metadata for the scanned file(s). 
For testing & development use ONLY.\n"); mprintf(LOGG_INFO, " JSON will be printed if --debug is enabled.\n"); mprintf(LOGG_INFO, " A JSON file will dropped to the temp directory if --leave-temps is enabled.\n"); - mprintf(LOGG_INFO, " --json-store-html-urls[=yes(*)/no] Store html URLs in metadata.\n"); - mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'HTMLUrls'\n"); + mprintf(LOGG_INFO, " --json-store-html-uris[=yes(*)/no] Store html URIs in metadata.\n"); + mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'URIs'\n"); + mprintf(LOGG_INFO, " --json-store-pdf-uris[=yes(*)/no] Store pdf URIs in metadata.\n"); + mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'URIs'\n"); mprintf(LOGG_INFO, " --database=FILE/DIR -d FILE/DIR Load virus database from FILE or load all supported db files from DIR\n"); mprintf(LOGG_INFO, " --official-db-only[=yes/no(*)] Only load official signatures\n"); mprintf(LOGG_INFO, " --fail-if-cvd-older-than=days Return with a nonzero error code if virus database outdated.\n"); diff --git a/clamscan/manager.c b/clamscan/manager.c index d6b38a66d0..d861ec88e7 100644 --- a/clamscan/manager.c +++ b/clamscan/manager.c @@ -1574,8 +1574,12 @@ int scanmanager(const struct optstruct *opts) options.general |= CL_SCAN_GENERAL_HEURISTICS; } - if (optget(opts, "json-store-html-urls")->enabled) { - options.general |= CL_SCAN_GENERAL_STORE_HTML_URLS; + if (optget(opts, "json-store-html-uris")->enabled) { + options.general |= CL_SCAN_GENERAL_STORE_HTML_URIS; + } + + if (optget(opts, "json-store-pdf-uris")->enabled) { + options.general |= CL_SCAN_GENERAL_STORE_PDF_URIS; } /* TODO: Remove deprecated option in a future feature release */ diff --git a/common/optparser.c b/common/optparser.c index 5014f9e885..717011c531 100644 --- a/common/optparser.c +++ b/common/optparser.c @@ -389,7 +389,8 @@ const struct clam_option __clam_options[] = { 
{"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"}, {"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"}, - {"JsonStoreHTMLUrls", "json-store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML