Skip to content

Commit 396bbb5

Browse files
KrishPatel13Krish Patelabrichr
authored
feat(openadapt.privacy.providers): add private_ai as a scrubbing provider (#486)
* ran black * add blank file for private ai * add class `PrivateAIScrubbingProvider` * add pvt_ai api key in config * ran `pre-commit run --all` * add scrub_text function * progress for image redaction * format * complete scrub_image * start scrub_pdf * add pdf redaction code * add more wrapper for invalid response from private ai * try to fix pytest * try to fix pytest * remove lst * remove unnecessary methods * text scrubbing test all passes * pdf_redaction test works * add test_image_redaction test in pytest * add easy ocr to poetry * pytest is fixed * remove version files * remove unnecessary files * add code to remove unnecessary files after pytest * addressed #486 (comment) * address comment #486 (comment) * reduce line chars * addressed comment: #486 (comment) * fix flake8 * use f strings * address comment: #486 (comment) * address comment: #486 (comment) * change to value error * remove .keys() * add constants * fix flake8 errors * use BytesIO * address comment #486 (comment) * ran black * final commit * remove unused code * refactor typo * rename `redact_file_path` to `redacted_file_path` * use BytesIO wherever possible * fix flake8 * add documentation links * Apply suggestions from code review * Update tests/openadapt/privacy/providers/test_private_ai_scrub.py * fix poetry.lock * poetry.lock --------- Co-authored-by: Krish Patel <[email protected]> Co-authored-by: Richard Abrich <[email protected]> Co-authored-by: Richard Abrich <[email protected]>
1 parent 5708256 commit 396bbb5

File tree

13 files changed

+730
-158
lines changed

13 files changed

+730
-158
lines changed

assets/sample_llc_1.pdf

-2.31 MB
Binary file not shown.

openadapt/privacy/providers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ class ScrubProvider: # pylint: disable=too-few-public-methods
99

1010
PRESIDIO = "PRESIDIO"
1111
COMPREHEND = "COMPREHEND"
12+
PRIVATE_AI = "PRIVATE_AI"
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
"""A Module for Private AI Scrubbing Provider."""
2+
3+
from io import BytesIO
4+
from typing import List
5+
import base64
6+
7+
from loguru import logger
8+
from PIL import Image
9+
import requests
10+
11+
from openadapt import config
12+
from openadapt.privacy.base import Modality, ScrubbingProvider, TextScrubbingMixin
13+
from openadapt.privacy.providers import ScrubProvider
14+
15+
# Private AI REST endpoints: one for plain text, one for base64-encoded files.
TEXT_URL = "https://api.private-ai.com/deid/v3/process/text"
BASE64_URL = "https://api.private-ai.com/deid/v3/process/files/base64"

# Value of the HTTP "Content-Type" request header (requests carry JSON bodies).
HEADER_CONTENT_TYPE = "application/json"

# MIME types declared for the file embedded in a base64 payload.
IMAGE_CONTENT_TYPE = "image/png"
PDF_CONTENT_TYPE = "application/pdf"

# Local file locations.
# NOTE(review): neither name is referenced by the provider class in this
# module; presumably they are consumed elsewhere (e.g. tests) -- confirm.
FILES_DIR = "assets/"
TEMP_IMAGEFILE_NAME = "temp_image_to_scrub.png"
22+
23+
24+
class PrivateAIScrubbingProvider(
    ScrubProvider, ScrubbingProvider, TextScrubbingMixin
):  # pylint: disable=abstract-method
    """Scrubbing provider that delegates PII/PHI redaction to the Private AI API.

    Text is posted to ``TEXT_URL``; images and PDFs are base64-encoded and
    posted to ``BASE64_URL``. Every request is authenticated with
    ``config.PRIVATE_AI_API_KEY``.
    """

    name: str = ScrubProvider.PRIVATE_AI
    capabilities: List[Modality] = [Modality.TEXT, Modality.PIL_IMAGE, Modality.PDF]

    @staticmethod
    def _request_headers() -> dict:
        """Return the HTTP headers shared by every Private AI request."""
        # NOTE(review): none of the requests below sets a timeout, so a stalled
        # connection blocks indefinitely -- consider adding one.
        return {
            "Content-Type": HEADER_CONTENT_TYPE,
            "X-API-KEY": config.PRIVATE_AI_API_KEY,
        }

    @staticmethod
    def _file_payload(file_bytes: bytes, content_type: str) -> dict:
        """Build the JSON payload for the base64 file-processing endpoint.

        Args:
            file_bytes (bytes): Raw content of the file to be redacted
            content_type (str): MIME type of the file

        Returns:
            dict: The payload to be sent as the JSON request body
        """
        return {
            "file": {
                "data": base64.b64encode(file_bytes).decode("ascii"),
                "content_type": content_type,
            },
            "entity_detection": {"accuracy": "high", "return_entity": True},
            "pdf_options": {"density": 150, "max_resolution": 2000},
            "audio_options": {"bleep_start_padding": 0, "bleep_end_padding": 0},
        }

    @staticmethod
    def _raise_on_error_detail(data: object) -> None:
        """Raise if the decoded response body is an error report.

        Args:
            data (object): The JSON-decoded response body

        Raises:
            ValueError: If the body is a dict containing the key "detail";
                the exception carries the value of that key.
        """
        if isinstance(data, dict) and "detail" in data:
            raise ValueError(data["detail"])

    @staticmethod
    def _decode_processed_file(data: dict) -> bytes:
        """Extract and base64-decode the redacted file from a response body.

        Args:
            data (dict): The JSON-decoded response of the file endpoint

        Returns:
            bytes: Raw content of the redacted file

        Raises:
            ValueError: If "processed_file" is missing or is not valid base64
        """
        processed_file = data.get("processed_file")
        if processed_file is None:
            raise ValueError("Private AI response does not contain 'processed_file'")
        return base64.b64decode(processed_file, validate=True)

    def scrub_text(self, text: str, is_separated: bool = False) -> str:
        """Scrub the text of all PII/PHI.

        Args:
            text (str): Text to be redacted
            is_separated (bool): Whether the text is separated with special
                characters (not used by this implementation)

        Returns:
            str: redacted text

        Raises:
            requests.HTTPError: If the API answers with an HTTP error status
            ValueError: If the response body reports an error ("detail" key)
        """
        payload = {
            "text": [text],
            "link_batch": False,
            "entity_detection": {
                "accuracy": "high",
                "return_entity": True,
            },
            "processed_text": {
                "type": "MARKER",
                "pattern": "[UNIQUE_NUMBERED_ENTITY_TYPE]",
            },
        }

        response = requests.post(
            TEXT_URL, json=payload, headers=self._request_headers()
        )
        response.raise_for_status()
        data = response.json()
        logger.debug(f"{data=}")

        # According to the PrivateAI API documentation,
        # https://docs.private-ai.com/reference/latest/operation/process_text_v3_process_text_post/
        # the response is a list of dicts when there is no error/issue in the request
        # else it is a dict with a key "detail" containing the error message
        self._raise_on_error_detail(data)

        # A single text was submitted, so only the first result is relevant.
        redacted_text = data[0].get("processed_text")
        logger.debug(f"{redacted_text=}")

        return redacted_text

    def scrub_image(
        self,
        image: Image.Image,
        fill_color: int = config.SCRUB_FILL_COLOR,  # pylint: disable=no-member
    ) -> Image.Image:
        """Scrub the image of all PII/PHI.

        Args:
            image (Image.Image): A PIL image to be redacted
            fill_color (int): The color used to fill the redacted regions (BGR).
                Not used by this implementation: the image returned by the
                API is passed through as-is.

        Returns:
            Image.Image: The redacted image with PII and PHI removed.

        Raises:
            ValueError: If the response body reports an error ("detail" key)
                or does not contain a valid "processed_file"
        """
        # Serialize the image to PNG in memory; the buffer is released on exit.
        with BytesIO() as buffer:
            image.save(buffer, format="PNG")
            image_data = buffer.getvalue()

        payload = self._file_payload(image_data, IMAGE_CONTENT_TYPE)
        http_response = requests.post(
            BASE64_URL, json=payload, headers=self._request_headers()
        )
        response = http_response.json()
        logger.debug(f"{response=}")

        # According to the PrivateAI API documentation,
        # https://docs.private-ai.com/reference/latest/operation/process_files_base64_v3_process_files_base64_post/
        # a failed request yields a dict with a key "detail" containing the
        # error message; otherwise the redacted file is read from the
        # "processed_file" key
        self._raise_on_error_detail(response)

        redacted_file_data = self._decode_processed_file(response)

        # The buffer is intentionally left open: Image.open reads lazily.
        return Image.open(BytesIO(redacted_file_data))

    def scrub_pdf(self, path_to_pdf: str) -> str:
        """Scrub the PDF of all PII/PHI.

        The redacted document is written next to the input file, with
        "_redacted.pdf" replacing the original extension.

        Args:
            path_to_pdf (str): Path to the PDF to be redacted

        Returns:
            str: Path to the redacted PDF

        Raises:
            ValueError: If the response body reports an error ("detail" key)
                or does not contain a valid "processed_file"
        """
        # Imported locally so this method does not depend on the module's
        # import block.
        import os

        with open(path_to_pdf, "rb") as pdf_file:
            pdf_data = pdf_file.read()

        payload = self._file_payload(pdf_data, PDF_CONTENT_TYPE)
        http_response = requests.post(
            BASE64_URL, json=payload, headers=self._request_headers()
        )
        response_data = http_response.json()

        # According to the PrivateAI API documentation,
        # https://docs.private-ai.com/reference/latest/operation/process_files_base64_v3_process_files_base64_post/
        # a failed request yields a dict with a key "detail" containing the
        # error message; otherwise the redacted file is read from the
        # "processed_file" key
        self._raise_on_error_detail(response_data)

        logger.debug(f"{response_data.get('entities')=}")
        logger.debug(f"{len(response_data.get('entities'))=}")

        # splitext strips only the final extension, so dots elsewhere in the
        # path (e.g. "./assets/file.pdf") are preserved.
        path_root, _ = os.path.splitext(path_to_pdf)
        redacted_file_path = f"{path_root}_redacted.pdf"

        # Decode before opening the output file so that a malformed response
        # does not leave an empty file behind.
        processed_file = self._decode_processed_file(response_data)
        with open(redacted_file_path, "wb") as redacted_file:
            redacted_file.write(processed_file)

        return redacted_file_path

0 commit comments

Comments
 (0)