Skip to content

Commit ffe67d4

Browse files
authored
Merge pull request #1355 from data-prep-kit/pending-release/1.1.2
- kfpv2 tests are failing and will be addressed in the ops repo. - tokenization2arrow is failing because it pulls from PyPI, and will be fixed once this release is pushed to PyPI.
2 parents 1f98783 + 0d51403 commit ffe67d4

80 files changed

Lines changed: 192 additions & 147 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.make.versions

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@ REQUIRED_PYTHON_VERSIONS=">=3.10,<3.13"
1818
# for integration testing or other
1919

2020
DPK_NEXT_RELEASE=1.0.0
21-
DPK_VERSION_SUFFIX=.dev1
21+
DPK_VERSION_SUFFIX=
2222

2323

2424
DPK_CONNECTOR_NEXT_RELEASE=0.2.4
2525
DPK_CONNECTOR_SUFFIX=.dev0
2626

2727
TRANSFORM_NEXT_RELEASE=1.1.2
28-
TRANSFORM_VERSION_SUFFIX=.dev0
28+
TRANSFORM_VERSION_SUFFIX=
2929

3030
################################################################################
3131
# Begin versions that the repo depends on.
@@ -75,11 +75,11 @@ endif
7575
docker-tag::
7676
@echo $(DOCKER_IMAGE_VERSION)
7777

78-
# Printout the current releast tag for the DPK wheel
78+
# Printout the current release tag for the DPK wheel
7979
dpk-tag::
8080
@echo $(DPK_VERSION)
8181

82-
# Printout the current releast tag for the trsnsdfotmd wheel
82+
# Printout the current release tag for the transform wheel
8383
transforms-tag::
8484
@echo $(TRANSFORMS_PKG_VERSION)
8585

data-processing-lib/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "data_prep_toolkit"
3-
version = "1.0.0.dev1"
3+
version = "1.0.0"
44
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
55
requires-python = ">=3.10,<3.13"
66
description = "Data Preparation Toolkit Library for Ray and Python"

data-processing-lib/python/requirements.txt

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
numpy < 1.29.0
22
pyarrow==16.1.0
3-
# Need to revisit boto3 version
43
# Later version 1.38.7 is breaking authentication.
5-
# other suggestion: boto3==1.34.69 and botocore==1.34.154
6-
# Go with a DMF compatible version for now
7-
boto3>=1.35.74,<1.35.94
4+
# check compatibility with other applications that depend on botocore(DMF, DSift, etc.)
5+
boto3>=1.35.74,<=1.38.18
86
mmh3
97
psutil
108
polars>=1.9.0

data-processing-lib/python/src/data_processing/data_access/data_access_local.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,21 @@ def __init__(
8888
files_to_use=files_to_use, files_to_checkpoint=files_to_checkpoint)
8989

9090
######
91+
## data_config = {'input_folder': str= 'path to input folder',
92+
## 'output_folder': str='path to output folder',
93+
## 'cache' : bool = True | False}
9194
## Calling DataAccessLocal.validate_config should have caught this in a production setting
9295
## but we still allow the class to be created with no configuration defined. Why ?
96+
self.tables = {}
97+
9398
if config is None:
9499
self.input_folder = None
95100
self.output_folder = None
101+
self.cache = False
96102
else:
97103
self.input_folder = os.path.abspath(config["input_folder"])
98104
self.output_folder = os.path.abspath(config["output_folder"])
105+
self.cache = config.get('cache', False)
99106
######
100107

101108
logger.debug(f"Local input folder: {self.input_folder}")
@@ -162,6 +169,10 @@ def get_table(self, path: str) -> tuple[pa.table, int]:
162169
Returns:
163170
pyarrow.Table: PyArrow table if read successfully, None otherwise.
164171
"""
172+
# if the table exists in memory, use it for faster access
173+
if self.tables.get(path):
174+
logger.debug('Table found in memory')
175+
return self.tables[path], 0
165176

166177
try:
167178
table = pq.read_table(path)
@@ -186,6 +197,10 @@ def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], i
186197
- size (int): The size of the file (bytes).
187198
If saving fails, file_info will be None.
188199
"""
200+
#save the table in memory for faster access
201+
if self.cache:
202+
self.tables[path] = table
203+
189204
# Get table size in memory
190205
size_in_memory = table.nbytes
191206
try:

data-processing-lib/python/test/data_processing_tests/data_access/data_access_local_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class TestInit:
3030
path_dict = {
3131
"input_folder": os.path.join(os.sep, "tmp", "input_guf"),
3232
"output_folder": os.path.join(os.sep, "tmp", "output_guf"),
33+
"cache": True,
3334
}
3435
dal = DataAccessLocal(path_dict, d_sets=["dset1", "dset2"], checkpoint=True, m_files=-1)
3536
size_stat_dict_empty = {"max_file_size": 0.0, "min_file_size": float(GB), "total_file_size": 0.0}
@@ -464,9 +465,11 @@ def test_successful_save(self):
464465
with patch("os.path.getsize") as mock_getsize, patch("os.path.basename") as mock_basename:
465466
mock_getsize.return_value = 1024
466467
mock_basename.return_value = "test_file.parquet"
468+
assert len(self.dal.tables) == 0
467469
size_in_memory, file_info, _ = self.dal.save_table(self.pq_file_path, self.table)
468470
os.remove(self.pq_file_path)
469471
# Assertions about return values
472+
assert len(self.dal.tables) > 0
470473
assert size_in_memory == self.table.nbytes
471474
assert file_info == {"name": "test_file.parquet", "size": 1024}
472475

kfp/kfp_ray_components/createRayClusterComponent.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ inputs:
1212

1313
implementation:
1414
container:
15-
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
15+
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:1.1.2"
1616
# command is a list of strings (command-line arguments).
1717
# The YAML language has two syntaxes for lists and you can use either of them.
1818
# Here we use the "flow syntax" - comma-separated strings inside square brackets.

kfp/kfp_ray_components/deleteRayClusterComponent.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ inputs:
99

1010
implementation:
1111
container:
12-
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
12+
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:1.1.2"
1313
# command is a list of strings (command-line arguments).
1414
# The YAML language has two syntaxes for lists and you can use either of them.
1515
# Here we use the "flow syntax" - comma-separated strings inside square brackets.

kfp/kfp_ray_components/executeRayJobComponent.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ inputs:
1212

1313
implementation:
1414
container:
15-
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
15+
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:1.1.2"
1616
# command is a list of strings (command-line arguments).
1717
# The YAML language has two syntaxes for lists and you can use either of them.
1818
# Here we use the "flow syntax" - comma-separated strings inside square brackets.

kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ inputs:
1313

1414
implementation:
1515
container:
16-
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
16+
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:1.1.2"
1717
# command is a list of strings (command-line arguments).
1818
# The YAML language has two syntaxes for lists and you can use either of them.
1919
# Here we use the "flow syntax" - comma-separated strings inside square brackets.

kfp/kfp_ray_components/executeSubWorkflowComponent.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ outputs:
2727

2828
implementation:
2929
container:
30-
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
30+
image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:1.1.2"
3131
# command is a list of strings (command-line arguments).
3232
# The YAML language has two syntaxes for lists, and you can use either of them.
3333
# Here we use the "flow syntax" - comma-separated strings inside square brackets.

0 commit comments

Comments
 (0)