From f17c23ac6ff8551d44d5362bdf9742142d175129 Mon Sep 17 00:00:00 2001 From: Seth Moore Date: Mon, 8 Jun 2020 16:44:50 -0700 Subject: [PATCH] Add DLP sample for redacting all image text The sample shows how to remove all text found in an image with DLP. The sample is integrated into the existing redact.py CLI application. --- dlp/README.rst | 30 +++--------- dlp/redact.py | 116 +++++++++++++++++++++++++++++++++++++-------- dlp/redact_test.py | 14 ++++++ 3 files changed, 118 insertions(+), 42 deletions(-) diff --git a/dlp/README.rst b/dlp/README.rst index 9ef0fc3fa14..f36433beb60 100644 --- a/dlp/README.rst +++ b/dlp/README.rst @@ -136,37 +136,21 @@ To run this sample: $ python redact.py - usage: redact.py [-h] [--project PROJECT] - [--info_types INFO_TYPES [INFO_TYPES ...]] - [--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}] - [--mime_type MIME_TYPE] - filename output_filename + usage: redact.py [-h] {info_types,all_text} ... Sample app that uses the Data Loss Prevent API to redact the contents of an image file. positional arguments: - filename The path to the file to inspect. - output_filename The path to which the redacted image will be written. + {info_types,all_text} + Select which content should be redacted. + info_types Redact specific infoTypes from an image. + all_text Redact all text from an image. The MIME type of the + file is inferred via the Python standard library's + mimetypes module. optional arguments: -h, --help show this help message and exit - --project PROJECT The Google Cloud project id to use as a parent - resource. - --info_types INFO_TYPES [INFO_TYPES ...] - Strings representing info types to look for. A full - list of info categories and types is available from - the API. Examples include "FIRST_NAME", "LAST_NAME", - "EMAIL_ADDRESS". If unspecified, the three above - examples will be used. 
- --min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY} - A string representing the minimum likelihood threshold - that constitutes a match. - --mime_type MIME_TYPE - The MIME type of the file. If not specified, the type - is inferred via the Python standard library's - mimetypes module. - Metadata diff --git a/dlp/redact.py b/dlp/redact.py index 66072de7b28..8a1650a262d 100644 --- a/dlp/redact.py +++ b/dlp/redact.py @@ -121,23 +121,87 @@ def redact_image( # [END dlp_redact_image] +# [START dlp_redact_image_all_text] -if __name__ == "__main__": - default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") - parser = argparse.ArgumentParser(description=__doc__) +def redact_image_all_text( + project, + filename, + output_filename, +): + """Uses the Data Loss Prevention API to redact all text in an image. - parser.add_argument("filename", help="The path to the file to inspect.") - parser.add_argument( - "output_filename", - help="The path to which the redacted image will be written.", + Args: + project: The Google Cloud project id to use as a parent resource. + filename: The path to the file to inspect. + output_filename: The path to which the redacted image will be written. + + Returns: + None; the redacted image is written to output_filename. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct the image_redaction_configs, indicating to DLP that all text in + # the input image should be redacted. + image_redaction_configs = [{ + "redact_all_text": True, + }] + + # Construct the byte_item, containing the file's byte data. + with open(filename, mode="rb") as f: + byte_item = {"type": "IMAGE", "data": f.read()} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API.
+ response = dlp.redact_image( + parent, + image_redaction_configs=image_redaction_configs, + byte_item=byte_item, ) - parser.add_argument( + + # Write out the results. + with open(output_filename, mode="wb") as f: + f.write(response.redacted_image) + + print("Wrote {byte_count} to {filename}".format( + byte_count=len(response.redacted_image), filename=output_filename)) + + +# [END dlp_redact_image_all_text] + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + common_args_parser = argparse.ArgumentParser(add_help=False) + common_args_parser.add_argument( "--project", help="The Google Cloud project id to use as a parent resource.", default=default_project, ) - parser.add_argument( + common_args_parser.add_argument( + "filename", help="The path to the file to inspect.") + common_args_parser.add_argument( + "output_filename", + help="The path to which the redacted image will be written.", + ) + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select which content should be redacted.") + subparsers.required = True + + info_types_parser = subparsers.add_parser( + "info_types", + help="Redact specific infoTypes from an image.", + parents=[common_args_parser], + ) + info_types_parser.add_argument( "--info_types", nargs="+", help="Strings representing info types to look for. A full list of " @@ -146,7 +210,7 @@ def redact_image( "If unspecified, the three above examples will be used.", default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], ) - parser.add_argument( + info_types_parser.add_argument( "--min_likelihood", choices=[ "LIKELIHOOD_UNSPECIFIED", @@ -159,19 +223,33 @@ def redact_image( help="A string representing the minimum likelihood threshold that " "constitutes a match.", ) - parser.add_argument( + info_types_parser.add_argument( "--mime_type", help="The MIME type of the file. 
If not specified, the type is " "inferred via the Python standard library's mimetypes module.", ) + all_text_parser = subparsers.add_parser( + "all_text", + help="Redact all text from an image. The MIME type of the file is " + "inferred via the Python standard library's mimetypes module.", + parents=[common_args_parser], + ) + args = parser.parse_args() - redact_image( - args.project, - args.filename, - args.output_filename, - args.info_types, - min_likelihood=args.min_likelihood, - mime_type=args.mime_type, - ) + if args.content == "info_types": + redact_image( + args.project, + args.filename, + args.output_filename, + args.info_types, + min_likelihood=args.min_likelihood, + mime_type=args.mime_type, + ) + elif args.content == "all_text": + redact_image_all_text( + args.project, + args.filename, + args.output_filename, + ) diff --git a/dlp/redact_test.py b/dlp/redact_test.py index cb3740353b5..0cce514eb1a 100644 --- a/dlp/redact_test.py +++ b/dlp/redact_test.py @@ -44,3 +44,17 @@ def test_redact_image_file(tempdir, capsys): out, _ = capsys.readouterr() assert output_filepath in out + + +def test_redact_image_all_text(tempdir, capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + output_filepath = os.path.join(tempdir, "redacted.png") + + redact.redact_image_all_text( + GCLOUD_PROJECT, + test_filepath, + output_filepath, + ) + + out, _ = capsys.readouterr() + assert output_filepath in out