diff --git a/include/mupdf/pdf/document.h b/include/mupdf/pdf/document.h
index 4e7b90c895..a0d6769c22 100644
--- a/include/mupdf/pdf/document.h
+++ b/include/mupdf/pdf/document.h
@@ -777,6 +777,7 @@ typedef struct
 	int do_use_objstms; /* Use objstms if possible */
 	int compression_effort; /* 0 for default. 100 = max, 1 = min. */
 	int do_labels; /* Add labels to each object showing how it can be reached from the Root. */
+	int do_strip_invisible_text; /* Strip invisible text, requires sanitize. */
 } pdf_write_options;
 
 FZ_DATA extern const pdf_write_options pdf_default_write_options;
diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c
index add3f22607..2927003656 100644
--- a/source/pdf/pdf-write.c
+++ b/source/pdf/pdf-write.c
@@ -1879,6 +1879,7 @@ const char *fz_pdf_write_options_usage =
 	"\tlinearize: optimize for web browsers (no longer supported!)\n"
 	"\tclean: pretty-print graphics commands in content streams\n"
 	"\tsanitize: sanitize graphics commands in content streams\n"
+	"\tstrip-invisible-text: strip invisible text in content streams\n"
 	"\tgarbage: garbage collect unused objects\n"
 	"\tor garbage=compact: ... and compact cross reference table\n"
 	"\tor garbage=deduplicate: ... and remove duplicate objects\n"
@@ -1930,6 +1931,8 @@ pdf_parse_write_options(fz_context *ctx, pdf_write_options *opts, const char *ar
 		opts->do_clean = fz_option_eq(val, "yes");
 	if (fz_has_option(ctx, args, "sanitize", &val))
 		opts->do_sanitize = fz_option_eq(val, "yes");
+	if (fz_has_option(ctx, args, "strip-invisible-text", &val))
+		opts->do_strip_invisible_text = fz_option_eq(val, "yes");
 	if (fz_has_option(ctx, args, "incremental", &val))
 		opts->do_incremental = fz_option_eq(val, "yes");
 	if (fz_has_option(ctx, args, "objstms", &val))
@@ -2755,6 +2758,49 @@ void pdf_write_document(fz_context *ctx, pdf_document *doc, fz_output *out, cons
 	do_pdf_save_document(ctx, doc, &opts, in_opts);
 }
 
+/*
+	Strip invisible text from every page of the document.
+
+	Each page gets a redaction annotation covering its media box,
+	which is then applied with options that leave images and line
+	art alone and remove only invisible text.
+*/
+static void pdf_strip_invisible_text(fz_context *ctx, pdf_document *doc)
+{
+	int i;
+	int n = pdf_count_pages(ctx, doc);
+	fz_rect rect;
+	pdf_annot *annot = NULL;
+	pdf_page *page = NULL;
+	pdf_redact_options opts = { 0, PDF_REDACT_IMAGE_NONE, PDF_REDACT_LINE_ART_NONE, PDF_REDACT_TEXT_REMOVE_INVISIBLE };
+
+	fz_var(annot);
+	fz_var(page);
+
+	fz_try(ctx)
+	{
+		for (i = 0; i < n; i++)
+		{
+			page = pdf_load_page(ctx, doc, i);
+			annot = pdf_create_annot(ctx, page, PDF_ANNOT_REDACT);
+			rect = pdf_bound_page(ctx, page, FZ_MEDIA_BOX);
+			pdf_set_annot_rect(ctx, annot, rect);
+			pdf_redact_page(ctx, doc, page, &opts);
+			/* pdf_create_annot returns a reference owned by the caller. */
+			pdf_drop_annot(ctx, annot);
+			annot = NULL;
+			pdf_drop_page(ctx, page);
+			page = NULL;
+		}
+	}
+	fz_catch(ctx)
+	{
+		pdf_drop_annot(ctx, annot);
+		pdf_drop_page(ctx, page);
+		fz_rethrow(ctx);
+	}
+}
+
 void pdf_save_document(fz_context *ctx, pdf_document *doc, const char *filename, const pdf_write_options *in_opts)
 {
 	pdf_write_options opts_defaults = pdf_default_write_options;
@@ -2776,6 +2822,8 @@ void pdf_save_document(fz_context *ctx, pdf_document *doc, const char *filename,
 		fz_throw(ctx, FZ_ERROR_ARGUMENT, "Linearisation is no longer supported");
 	if (in_opts->do_incremental && in_opts->do_encrypt != PDF_ENCRYPT_KEEP)
 		fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't do incremental writes when changing encryption");
+	if (in_opts->do_strip_invisible_text && in_opts->do_sanitize == 0)
+		fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't strip invisible text without sanitizing");
 	if (in_opts->do_snapshot)
 	{
 		if (in_opts->do_incremental == 0 ||
@@ -2793,6 +2841,9 @@ void pdf_save_document(fz_context *ctx, pdf_document *doc, const char *filename,
 			fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't use these options when snapshotting!");
 	}
 
+	if (in_opts->do_strip_invisible_text)
+		pdf_strip_invisible_text(ctx, doc);
+
 	if (in_opts->do_appearance > 0)
 	{
 		int i, n = pdf_count_pages(ctx, doc);
@@ -2885,6 +2936,8 @@ pdf_format_write_options(fz_context *ctx, char *buffer, size_t buffer_len, const
 		ADD_OPT("clean=yes");
 	if (opts->do_sanitize)
 		ADD_OPT("sanitize=yes");
+	if (opts->do_strip_invisible_text)
+		ADD_OPT("strip-invisible-text=yes");
 	if (opts->do_incremental)
 		ADD_OPT("incremental=yes");
 	if (opts->do_encrypt == PDF_ENCRYPT_NONE)
diff --git a/source/tools/pdfclean.c b/source/tools/pdfclean.c
index a16c2533d0..7ead6fe52c 100644
--- a/source/tools/pdfclean.c
+++ b/source/tools/pdfclean.c
@@ -134,7 +134,7 @@ int pdfclean_main(int argc, char **argv)
 	opts.write = pdf_default_write_options;
 	opts.write.dont_regenerate_id = 1;
 
-	while ((c = fz_getopt_long(argc, argv, "ade:fgilmp:stcvzDAE:LO:U:P:SZ", longopts)) != -1)
+	while ((c = fz_getopt_long(argc, argv, "ade:fgiIlmp:stcvzDAE:LO:U:P:SZ", longopts)) != -1)
 	{
 		switch (c)
 		{
@@ -144,6 +144,7 @@ int pdfclean_main(int argc, char **argv)
 		case 'z': opts.write.do_compress += 1; break;
 		case 'f': opts.write.do_compress_fonts += 1; break;
 		case 'i': opts.write.do_compress_images += 1; break;
+		case 'I': opts.write.do_strip_invisible_text += 1; break;
 		case 'a': opts.write.do_ascii += 1; break;
 		case 'e': opts.write.compression_effort = fz_atoi(fz_optarg); break;
 		case 'g': opts.write.do_garbage += 1; break;