Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 65 additions & 14 deletions libclamav/pdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ static void pdf_export_json(struct pdf_struct *);

static void ASCIIHexDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void ASCII85Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void AutomaticAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void EmbeddedFile_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void FlateDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Image_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
Expand Down Expand Up @@ -1652,12 +1653,13 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
* is an object stream. If so, collect the relevant info.
*/
dict_len = obj->stream - start;
if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) {
if (NULL != (pstr = pdf_getdict(start, &dict_len, "/ObjStm"))) {
int objstm_first = -1;
int objstm_length = -1;
int objstm_n = -1;

cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
cli_dbgmsg("pdf_extract_obj: Found /ObjStm\n");
pdf->stats.nobjstream++;

dict_len = obj->stream - start;
if (-1 == (objstm_first = pdf_readint(start, dict_len, "/First"))) {
Expand All @@ -1668,14 +1670,17 @@ cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t
cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
} else {
/* Add objstm to pdf struct, so it can be freed eventually */
pdf->nobjstms++;
pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
if (!pdf->objstms) {
pdf->objstms = malloc(sizeof(struct objstm_struct *));
} else {
pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * (pdf->nobjstms + 1));
}
if (!pdf->objstms) {
cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
status = CL_EMEM;
goto done;
}

pdf->nobjstms++;
objstm = malloc(sizeof(struct objstm_struct));
if (!objstm) {
cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
Expand Down Expand Up @@ -1955,6 +1960,7 @@ struct pdfname_action {
};

static struct pdfname_action pdfname_actions[] = {
{"AA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AutomaticAction_cb},
{"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb},
{"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
{"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
Expand Down Expand Up @@ -2139,7 +2145,7 @@ static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length)
{
const char *enc;

pdf->stats.ntrailer++;
enc = cli_memstr(s, length, "/Encrypt", 8);
if (enc) {
char *newID;
Expand Down Expand Up @@ -2223,6 +2229,7 @@ void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
if ((CL_SUCCESS == has_stream) ||
(CL_EFORMAT == has_stream)) {
/* Stream found. Store this fact and the stream bounds. */
pdf->stats.nstream++;
cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id >> 8, obj->id & 0xff, stream_size);
obj->flags |= (1 << OBJ_STREAM);
obj->stream = stream;
Expand Down Expand Up @@ -3902,6 +3909,8 @@ cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
if (!q || xrefCheck(q, q + bytesleft) == -1) {
cli_dbgmsg("cli_pdf: did not find valid xref\n");
pdf.flags |= 1 << BAD_PDF_TRAILER;
} else {
pdf.stats.nxref++;
}
}
}
Expand Down Expand Up @@ -4564,35 +4573,61 @@ static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfna
}
}

static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
UNUSEDPARAM(obj);
UNUSEDPARAM(act);

if (NULL == pdf)
cli_ctx *ctx = pdf->ctx;

if (!(pdf) || !(pdf->ctx->wrkproperty) || !(SCAN_COLLECT_METADATA)) {
return;
}

pdf->stats.nrichmedia++;
pdf->stats.nacroform++;
}

static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
static void AutomaticAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
UNUSEDPARAM(obj);
UNUSEDPARAM(act);
char *p1 = NULL;
const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
: (const char *)(obj->start + pdf->map);
cli_ctx *ctx = pdf->ctx;

if (NULL == pdf)
if (!(pdf) || !(pdf->ctx->wrkproperty) || !(SCAN_COLLECT_METADATA)) {
return;
}

pdf->stats.nacroform++;
// ToDO: Find a way to not count references to the same automatic action multiple times
pdf->stats.naa++;
}

static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
UNUSEDPARAM(obj);
UNUSEDPARAM(act);

cli_ctx *ctx = pdf->ctx;

if (!(pdf) || !(pdf->ctx->wrkproperty) || !(SCAN_COLLECT_METADATA)) {
return;
}

pdf->stats.nrichmedia++;
}

static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
UNUSEDPARAM(obj);
UNUSEDPARAM(act);

if (NULL == pdf)
cli_ctx *ctx = pdf->ctx;

if (!(pdf) || !(pdf->ctx->wrkproperty) || !(SCAN_COLLECT_METADATA)) {
return;
}

pdf->stats.nxfa++;
}
Expand Down Expand Up @@ -4761,6 +4796,8 @@ static void URI_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_a
return;
}

pdf->stats.nuri++;

if (obj->objstm) {
bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
} else {
Expand Down Expand Up @@ -5118,8 +5155,22 @@ static void pdf_export_json(struct pdf_struct *pdf)
cli_jsonint(pdfobj, "RichMediaCount", pdf->stats.nrichmedia);
if (pdf->stats.nacroform)
cli_jsonint(pdfobj, "AcroFormCount", pdf->stats.nacroform);
if (pdf->stats.nxfa)
if (pdf->stats.nacroform)
cli_jsonint(pdfobj, "XFACount", pdf->stats.nxfa);
if (pdf->stats.naa)
cli_jsonint(pdfobj, "AutomaticActionCount", pdf->stats.naa);
if (pdf->stats.nstream)
cli_jsonint(pdfobj, "StreamCount", pdf->stats.nstream);
if (pdf->nobjs)
cli_jsonint(pdfobj, "ObjectCount", pdf->nobjs);
if (pdf->stats.nobjstream)
cli_jsonint(pdfobj, "ObjectStreamCount", pdf->stats.nobjstream);
if (pdf->stats.ntrailer)
cli_jsonint(pdfobj, "TrailerCount", pdf->stats.ntrailer);
if (pdf->stats.nuri)
cli_jsonint(pdfobj, "URICount", pdf->stats.nuri);
if (pdf->stats.nxref)
cli_jsonint(pdfobj, "XRefCount", pdf->stats.nxref);
if (pdf->flags & (1 << BAD_PDF_VERSION))
cli_jsonbool(pdfobj, "BadVersion", 1);
if (pdf->flags & (1 << BAD_PDF_HEADERPOS))
Expand Down
44 changes: 25 additions & 19 deletions libclamav/pdf.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,39 +102,45 @@ struct pdf_stats_entry {
};

/*
 * Counters and document-information entries recorded for a PDF.
 * Each member is declared exactly once; counters are kept in alphabetical
 * order, followed by the document-information entries, also alphabetical.
 */
struct pdf_stats {
    int32_t naa;                              /* Number of Automatic Action objects */
    int32_t nacroform;                        /* Number of AcroForm objects */
    int32_t nactivex;                         /* Number of ActiveX objects */
    int32_t nascii85decode;                   /* Number of ASCII85Decode-filtered objects */
    int32_t nasciihexdecode;                  /* Number of ASCIIHexDecode-filtered objects */
    int32_t ncolors;                          /* Number of colors */
    int32_t ncrypt;                           /* Number of Crypt-filtered objects */
    int32_t ndctdecode;                       /* Number of DCTDecode-filtered objects */
    int32_t nembeddedfile;                    /* Number of embedded files */
    int32_t nfaxdecode;                       /* Number of CCITT-filtered objects */
    int32_t nflash;                           /* Number of flash objects */
    int32_t nflate;                           /* Number of flate-encoded objects */
    int32_t nimage;                           /* Number of image objects */
    int32_t ninvalidobjs;                     /* Number of invalid objects */
    int32_t njbig2decode;                     /* Number of JBIG2Decode-filtered objects */
    int32_t njpxdecode;                       /* Number of JPXDecode-filtered objects */
    int32_t njs;                              /* Number of javascript objects */
    int32_t nlaunch;                          /* Number of Launch objects */
    int32_t nlzw;                             /* Number of LZW-filtered objects */
    int32_t nobjstream;                       /* Number of object streams */
    int32_t nopenaction;                      /* Number of OpenAction objects */
    int32_t npage;                            /* Number of Page objects */
    int32_t nrichmedia;                       /* Number of RichMedia objects */
    int32_t nrunlengthdecode;                 /* Number of RunLengthDecode-filtered objects */
    int32_t nsigned;                          /* Number of Signed objects */
    int32_t nstandard;                        /* Number of Standard-filtered objects */
    int32_t nstream;                          /* Number of streams */
    int32_t ntrailer;                         /* Number of trailer objects */
    int32_t nuri;                             /* Number of URI objects */
    int32_t nxfa;                             /* Number of XFA objects */
    int32_t nxref;                            /* Number of xref objects */
    struct pdf_stats_entry *author;           /* Author of the PDF */
    struct pdf_stats_entry *creationdate;     /* Date the PDF was created */
    struct pdf_stats_entry *creator;          /* Application used to create the PDF */
    struct pdf_stats_entry *keywords;         /* Keywords of the PDF */
    struct pdf_stats_entry *modificationdate; /* Date the PDF was modified */
    struct pdf_stats_entry *producer;         /* Application used to produce the PDF */
    struct pdf_stats_entry *subject;          /* Subject of the PDF */
    struct pdf_stats_entry *title;            /* Title of the PDF */
};

enum enc_method {
Expand Down
72 changes: 72 additions & 0 deletions unit_tests/clamscan/pdf_stats_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright (C) 2020-2025 Cisco Systems, Inc. and/or its affiliates. All rights reserved.

"""
Run clamscan tests.
"""

import sys
import os
import re
import shutil

sys.path.append('../unit_tests')
import testcase


class TC(testcase.TestCase):
    """Checks the PDF statistics that clamscan writes to its metadata JSON."""

    @classmethod
    def setUpClass(cls):
        super().setUpClass()

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()

    def setUp(self):
        super().setUp()

    def tearDown(self):
        super().tearDown()

        # Remove scan temps directory between tests
        scan_temps = self.path_tmp / "TD"
        if scan_temps.exists():
            shutil.rmtree(scan_temps)

        self.verify_valgrind_log()

    def test_pdf_stats(self):
        """Scan the sample PDF with --gen-json and check the reported counts."""
        self.step_name('Test PDF Stats')

        # Scan temp directory; --leave-temps keeps the generated metadata there.
        tempdir = self.path_tmp / "TD"
        if not os.path.isdir(tempdir):
            os.makedirs(tempdir)

        inputs = TC.path_source / 'unit_tests' / 'input'
        testfile = inputs / 'other_scanfiles' / 'pdf' / 'pdf-stats-test.pdf'
        database = inputs / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb'

        template = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'
        command = template.format(
            valgrind=TC.valgrind,
            valgrind_args=TC.valgrind_args,
            clamscan=TC.clamscan,
            path_db=database,
            tempdir=tempdir,
            testfile=testfile,
        )
        output = self.execute_command(command)

        assert output.ec == 0  # clean

        # Fragments that must appear in the metadata JSON left in tempdir.
        expected_strings = [
            '"JavaScriptObjectCount":1,',
            '"EmbeddedFileCount":2,',
            '"JBIG2DecodeCount":2,',
            '"OpenActionCount":2,',
            '"LaunchCount":2,',
            '"PageCount":2,',
            '"RichMediaCount":2,',
            '"AcroFormCount":2,',
            '"XFACount":2,',
            '"AutomaticActionCount":2,',
            '"StreamCount":7,',
            '"ObjectCount":16,',
            '"ObjectStreamCount":1,',
            '"TrailerCount":1,',
            '"XRefCount":1'
        ]
        self.verify_metadata_json(tempdir, expected_strings)
Binary file not shown.
Loading