Cisco-Talos · val-ms · Sep 16, 2022 · Sep 9, 2022 · Sep 9, 2022
diff --git a/libclamav/readdb.c b/libclamav/readdb.c
@@ -340,23 +340,37 @@ static cl_error_t readdb_load_regex_subsignature(struct cli_matcher *root, const
     char *end    = NULL;
 
     const char *trigger, *pattern, *cflags;
-    int subtokens_count;
+
 // The maximum number of `:` delimited fields in a regex subsignature.
 #define MAX_REGEX_SUB_TOKENS 4
     char *subtokens[MAX_REGEX_SUB_TOKENS + 1];
     const char *sig;
 
-    subtokens_count = cli_ldbtokenize(hexsig, ':', MAX_REGEX_SUB_TOKENS + 1, (const char **)subtokens, 0);
-    if (!subtokens_count) {
-        cli_errmsg("Invalid or unsupported ldb subsignature format\n");
-        status = CL_EMALFDB;
-        goto done;
-    }
+    if (0 == strncmp(virname, "YARA", 4)) {
+        // Do not tokenize on ':' in yara regex strings; ':' has no special meaning in them.
+        // Also, Yara regex strings may use '/' without escape characters, which confuses the "within_pcre" feature of `cli_ldbtokenize()`.
+        sig = hexsig;
 
-    if ((subtokens_count % 2) == 0)
-        offset = subtokens[0];
+    } else {
+        // LDB PCRE subsignatures have this structure:
+        // [Offset:]Trigger/PCRE/[Flags]
+        // We need to split on the ':' character in case the offset was specified.
 
-    sig = (subtokens_count % 2) ? subtokens[0] : subtokens[1];
+        size_t subtokens_count = cli_ldbtokenize(hexsig, ':', MAX_REGEX_SUB_TOKENS + 1, (const char **)subtokens, 0);
+        if (!subtokens_count) {
+            cli_errmsg("Invalid or unsupported ldb subsignature format\n");
+            status = CL_EMALFDB;
+            goto done;
+        }
+
+        if (subtokens_count == 2) {
+            // Offset was specified
+            offset = subtokens[0];
+            sig = subtokens[1];
+        } else {
+            sig = subtokens[0];
+        }
+    }
 
     /* get copied */
     hexcpy = cli_strdup(sig);

diff --git a/libclamav/str.c b/libclamav/str.c
@@ -857,33 +857,40 @@ cl_error_t cli_strntoul_wrap(const char *buf, size_t buf_size,
 size_t cli_ldbtokenize(char *buffer, const char delim, const size_t token_count,
                        const char **tokens, size_t token_skip)
 {
-    size_t tokens_found, i;
-    char *start = buffer;
+    size_t tokens_found = 0;
+    size_t token_index  = 0;
+    size_t buffer_index = 0;
+    bool within_pcre    = false;
 
-    for (tokens_found = 0; tokens_found < token_count;) {
-        tokens[tokens_found++] = buffer;
+    while (tokens_found < token_count) {
+        tokens[tokens_found++] = &buffer[buffer_index];
 
-        while (*buffer != '\0') {
-            if (*buffer == delim) {
+        while (buffer[buffer_index] != '\0') {
+            if (!within_pcre && (buffer[buffer_index] == delim)) {
                 break;
             } else if ((tokens_found > token_skip) &&
-                       ((buffer > start) && (*(buffer - 1) != '\\')) &&
-                       (*buffer == '/')) {
-                return tokens_found;
+                       // An unescaped '/' opens or closes the PCRE string, so it toggles `within_pcre`;
+                       // LDB PCRE rules must escape a literal '/' as "\/", and that sequence leaves the state unchanged.
+                       ((buffer_index > 0) && (buffer[buffer_index - 1] != '\\')) && (buffer[buffer_index] == '/')) {
+                within_pcre = !within_pcre;
             }
-            buffer++;
+            buffer_index++;
         }
 
-        if (*buffer != '\0') {
-            *buffer++ = '\0';
+        if (buffer[buffer_index] != '\0') {
+            buffer[buffer_index] = '\0';
+            buffer_index++;
         } else {
-            i = tokens_found;
-            while (i < token_count) {
-                tokens[i++] = NULL;
+            token_index = tokens_found;
+            while (token_index < token_count) {
+                tokens[token_index] = NULL;
+                token_index++;
             }
+
             return tokens_found;
         }
     }
+
     return tokens_found;
 }
 

diff --git a/unit_tests/clamscan_test.py b/unit_tests/clamscan_test.py
@@ -571,7 +571,7 @@ def test_clamscan_17_pcre_slash_colon(self):
         #
         yara_db = TC.path_tmp / 'regex-slash-colon.ldb'
         yara_db.write_text(
-            r'regex;Engine:81-255,Target:0;1;68656c6c6f20;0/hello blee/blah: bleh/'
+            r'regex;Engine:81-255,Target:0;1;68656c6c6f20;0/hello blee\/blah: bleh/'
         )
         command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'.format(
             valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, path_db=yara_db, testfiles=testfile,
@@ -616,3 +616,174 @@ def test_clamscan_17_pcre_slash_colon(self):
             'Infected files: 1',
         ]
         self.verify_output(output.out, expected=expected_results)
+
+    def test_clamscan_18_ldb_offset_pcre(self):
+        self.step_name('Test LDB regex rules with an offset')
+        # The offset feature starts the PCRE match a fixed number of bytes into the file.
+        # The offset is EXACT, meaning the match position is no longer a wildcard.
+        # The match must occur exactly that number of bytes from the start of the file.
+
+        # using 'MZ' prefix so it is detected as MSEXE and not TEXT. This is to avoid normalization.
+        testfile = TC.path_tmp / 'ldb_offset_pcre'
+        testfile.write_text('MZ hello blee')
+
+        # First without the offset, make sure it matches
+        yara_db = TC.path_tmp / 'ldb_pcre_no_offset.ldb'
+        yara_db.write_text(
+            r'ldb_pcre_no_offset;Engine:81-255,Target:0;0&1;68656c6c6f20;0/hello blee/'
+        )
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, path_db=yara_db, testfiles=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 1  # virus found
+
+        expected_results = [
+            'ldb_offset_pcre: ldb_pcre_no_offset.UNOFFICIAL FOUND',
+            'Infected files: 1',
+        ]
+        self.verify_output(output.out, expected=expected_results)
+
+        # Next, with the offset, but it won't match, because the regex pattern is "hello blee"
+        # and with the offset of 5 (from start of file) means it should start the pcre matching at "llo blee"
+        yara_db = TC.path_tmp / 'ldb_pcre_offset_no_match.ldb'
+        yara_db.write_text(
+            r'ldb_pcre_offset_no_match;Engine:81-255,Target:0;0&1;68656c6c6f20;5:0/hello blee/'
+        )
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, path_db=yara_db, testfiles=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 0  # virus NOT found
+
+        expected_results = [
+            'ldb_offset_pcre: OK',
+        ]
+        self.verify_output(output.out, expected=expected_results)
+
+        # Next, with the offset, and it SHOULD match, because the regex pattern is "llo blee"
+        # and with the offset of 5 (from start of file) means it should start the pcre matching at "llo blee"
+        yara_db = TC.path_tmp / 'ldb_pcre_offset_match.ldb'
+        yara_db.write_text(
+            r'ldb_pcre_offset_match;Engine:81-255,Target:0;0&1;68656c6c6f20;5:0/llo blee/'
+        )
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, path_db=yara_db, testfiles=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 1  # virus found
+
+        expected_results = [
+            'ldb_offset_pcre: ldb_pcre_offset_match.UNOFFICIAL FOUND',
+            'Infected files: 1',
+        ]
+        self.verify_output(output.out, expected=expected_results)
+
+    def test_clamscan_18_ldb_pcre_flag(self):
+        self.step_name('Test LDB regex rules with case insensitive flag')
+        # This test validates that the flags field, and more specifically the case-insensitive flag, is working.
+
+        # using 'MZ' prefix so it is detected as MSEXE and not TEXT. This is to avoid normalization.
+        testfile = TC.path_tmp / 'ldb_pcre_flag'
+        testfile.write_text('MZ hello blee / BlAh')
+
+        # First test withOUT the case-insensitive flag. It should NOT match.
+        yara_db = TC.path_tmp / 'ldb_pcre_case.ldb'
+        yara_db.write_text(
+            r'ldb_pcre_case;Engine:81-255,Target:0;0&1;68656c6c6f20;0/blah/'
+        )
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, path_db=yara_db, testfiles=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 0  # virus NOT found
+
+        expected_results = [
+            'ldb_pcre_flag: OK',
+        ]
+        self.verify_output(output.out, expected=expected_results)
+
+        # Next, test WITH the case-insensitive flag. It SHOULD match.
+        yara_db = TC.path_tmp / 'ldb_pcre_nocase.ldb'
+        yara_db.write_text(
+            r'ldb_pcre_nocase;Engine:81-255,Target:0;0&1;68656c6c6f20;0/blah/i'
+        )
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, path_db=yara_db, testfiles=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 1  # virus found
+
+        expected_results = [
+            'ldb_pcre_flag: ldb_pcre_nocase.UNOFFICIAL FOUND',
+            'Infected files: 1',
+        ]
+        self.verify_output(output.out, expected=expected_results)
+
+    def test_clamscan_18_ldb_multi_pcre(self):
+        self.step_name('Test LDB and Yara regex rules with / and : in the string work')
+        # This is a regression test for a bug where :'s in a PCRE regex would act
+        # as delimiters if there was also a / in the regex before the :
+
+        # using 'MZ' prefix so it is detected as MSEXE and not TEXT. This is to avoid normalization.
+        testfile = TC.path_tmp / 'ldb_multi_pcre'
+        testfile.write_text('MZ hello blee / BlAh')
+
+        # Verify first, with two subsigs that should both match, that the alert is found.
+        yara_db = TC.path_tmp / 'ldb_multi_pcre.ldb'
+        yara_db.write_text(
+            r'ldb_multi_pcre;Engine:81-255,Target:0;0&1&2;68656c6c6f20;0/hello blee/;0/blah/i'
+        )
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, path_db=yara_db, testfiles=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 1  # virus found
+
+        expected_results = [
+            'ldb_multi_pcre: ldb_multi_pcre.UNOFFICIAL FOUND',
+            'Infected files: 1',
+        ]
+        self.verify_output(output.out, expected=expected_results)
+
+        # Verify next that if one of the two subsigs does not match, the whole thing does not match.
+        yara_db = TC.path_tmp / 'ldb_multi_pcre.ldb'
+        yara_db.write_text(
+            r'ldb_multi_pcre;Engine:81-255,Target:0;0&1&2;68656c6c6f20;0/hello blee/;0/bloh/i'
+        )
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, path_db=yara_db, testfiles=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 0  # virus NOT found
+
+        expected_results = [
+            'ldb_multi_pcre: OK',
+            'Infected files: 0',
+        ]
+        self.verify_output(output.out, expected=expected_results)
+
+        # Verify next that if the other of the two subsigs does not match, the whole thing does not match.
+        yara_db = TC.path_tmp / 'ldb_multi_pcre.ldb'
+        yara_db.write_text(
+            r'ldb_multi_pcre;Engine:81-255,Target:0;0&1&2;68656c6c6f20;0/hella blee/;0/blah/i'
+        )
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} {testfiles}'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, path_db=yara_db, testfiles=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 0  # virus NOT found
+
+        expected_results = [
+            'ldb_multi_pcre: OK',
+            'Infected files: 0',
+        ]
+        self.verify_output(output.out, expected=expected_results)