diff --git a/.gitignore b/.gitignore index 21782abb7..0712f6d7b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ python-venv cache* !cache.py +minio-volume +scylladb-volume + # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/.mypy.ini b/.mypy.ini index a1adeaeda..e202650ed 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -27,6 +27,15 @@ ignore_missing_imports = True [mypy-google.cloud] ignore_missing_imports = True +[mypy-google.cloud.logging] +ignore_missing_imports = True + +[mypy-google.cloud.monitoring_v3] +ignore_missing_imports = True + +[mypy-google.cloud.storage] +ignore_missing_imports = True + [mypy-google.api_core] ignore_missing_imports = True diff --git a/benchmarks-data b/benchmarks-data index cbf461339..6a17a460f 160000 --- a/benchmarks-data +++ b/benchmarks-data @@ -1 +1 @@ -Subproject commit cbf461339656db3c6d6d8428e864da54c2dae2b7 +Subproject commit 6a17a460f289e166abb47ea6298fb939e80e8beb diff --git a/benchmarks/100.webapps/110.dynamic-html/config.json b/benchmarks/100.webapps/110.dynamic-html/config.json index 69bd3a573..25254c247 100644 --- a/benchmarks/100.webapps/110.dynamic-html/config.json +++ b/benchmarks/100.webapps/110.dynamic-html/config.json @@ -1,5 +1,6 @@ { "timeout": 10, "memory": 128, - "languages": ["python", "nodejs"] + "languages": ["python", "nodejs"], + "modules": [] } diff --git a/benchmarks/100.webapps/110.dynamic-html/input.py b/benchmarks/100.webapps/110.dynamic-html/input.py index ec57a2246..98dac88b2 100644 --- a/benchmarks/100.webapps/110.dynamic-html/input.py +++ b/benchmarks/100.webapps/110.dynamic-html/input.py @@ -5,10 +5,7 @@ 'large': 100000 } -def buckets_count(): - return (0, 0) - -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): input_config = {'username': 'testname'} input_config['random_len'] = size_generators[size] return input_config diff --git a/benchmarks/100.webapps/120.uploader/config.json b/benchmarks/100.webapps/120.uploader/config.json index cd8566fc6..cbc635670 100644 --- a/benchmarks/100.webapps/120.uploader/config.json +++ b/benchmarks/100.webapps/120.uploader/config.json @@ -1,5 +1,6 @@ { "timeout": 30, "memory": 128, - "languages": ["python", "nodejs"] + "languages": ["python", "nodejs"], + "modules": ["storage"] } diff --git a/benchmarks/100.webapps/120.uploader/input.py b/benchmarks/100.webapps/120.uploader/input.py index 19eb7d2e6..ce6169ccb 100644 --- a/benchmarks/100.webapps/120.uploader/input.py +++ b/benchmarks/100.webapps/120.uploader/input.py @@ -11,7 +11,7 @@ def buckets_count(): return (0, 1) -def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_func): input_config = {'object': {}, 'bucket': {}} input_config['object']['url'] = url_generators[size] input_config['bucket']['bucket'] = benchmarks_bucket diff --git a/benchmarks/100.webapps/130.crud-api/config.json b/benchmarks/100.webapps/130.crud-api/config.json new file mode 100644 index 000000000..25c6cb05e --- /dev/null +++ b/benchmarks/100.webapps/130.crud-api/config.json @@ -0,0 +1,11 @@ +{ + "timeout": 30, + "memory": 128, + "languages": [ + "python", + "nodejs" + ], + "modules": [ + "nosql" + ] +} diff --git a/benchmarks/100.webapps/130.crud-api/input.py b/benchmarks/100.webapps/130.crud-api/input.py new file mode 100644 index 
000000000..c019e7e8b --- /dev/null +++ b/benchmarks/100.webapps/130.crud-api/input.py @@ -0,0 +1,96 @@ +import uuid + + +def allocate_nosql() -> dict: + return {"shopping_cart": {"primary_key": "cart_id", "secondary_key": "product_id"}} + + +def generate_input( + data_dir, size, benchmarks_bucket, input_buckets, output_buckets, upload_func, nosql_upload +): + + input_config = {} + + cart_id = str(uuid.uuid4().hex) + write_cart_id = str(uuid.uuid4().hex) + + # Set initial data + + nosql_upload( + "130.crud-api", + "shopping_cart", + {"name": "Gothic Game", "price": 42, "quantity": 2}, + ("cart_id", cart_id), + ("product_id", "game-gothic"), + ) + nosql_upload( + "130.crud-api", + "shopping_cart", + {"name": "Gothic 2", "price": 142, "quantity": 3}, + ("cart_id", cart_id), + ("product_id", "game-gothic-2"), + ) + nosql_upload( + "130.crud-api", + "shopping_cart", + {"name": "SeBS Benchmark", "price": 1000, "quantity": 1}, + ("cart_id", cart_id), + ("product_id", "sebs-benchmark"), + ) + nosql_upload( + "130.crud-api", + "shopping_cart", + {"name": "Mint Linux", "price": 0, "quantity": 5}, + ("cart_id", cart_id), + ("product_id", "mint-linux"), + ) + + requests = [] + + if size == "test": + # retrieve a single entry + requests.append( + { + "route": "GET /cart/{id}", + "path": {"id": "game-gothic"}, + "body": { + "cart": cart_id, + }, + } + ) + elif size == "small": + requests.append( + { + "route": "GET /cart", + "body": { + "cart": cart_id, + }, + } + ) + elif size == "large": + # add many new entries + for i in range(5): + requests.append( + { + "route": "PUT /cart", + "body": { + "cart": write_cart_id, + "product_id": f"new-id-{i}", + "name": f"Test Item {i}", + "price": 100 * i, + "quantity": i, + }, + } + ) + requests.append( + { + "route": "GET /cart", + "body": { + "cart": write_cart_id, + }, + } + ) + + input_config["requests"] = requests + + return input_config diff --git a/benchmarks/100.webapps/130.crud-api/python/function.py b/benchmarks/100.webapps/130.crud-api/python/function.py new file mode 100644 index 000000000..0b5e0e8c0 --- /dev/null +++ b/benchmarks/100.webapps/130.crud-api/python/function.py @@ -0,0 +1,67 @@ +from . 
import nosql + +nosql_client = nosql.nosql.get_instance() + +nosql_table_name = "shopping_cart" + + +def add_product(cart_id: str, product_id: str, product_name: str, price: float, quantity: int): + + nosql_client.insert( + nosql_table_name, + ("cart_id", cart_id), + ("product_id", product_id), + {"price": price, "quantity": quantity, "name": product_name}, + ) + + +def get_products(cart_id: str, product_id: str): + return nosql_client.get(nosql_table_name, ("cart_id", cart_id), ("product_id", product_id)) + + +def query_products(cart_id: str): + + res = nosql_client.query( + nosql_table_name, + ("cart_id", cart_id), + "product_id", + ) + + products = [] + price_sum = 0 + quantity_sum = 0 + for product in res: + + products.append(product["name"]) + price_sum += product["price"] + quantity_sum += product["quantity"] + + avg_price = price_sum / quantity_sum if quantity_sum > 0 else 0.0 + + return {"products": products, "total_cost": price_sum, "avg_price": avg_price} + + +def handler(event): + + results = [] + + for request in event["requests"]: + + route = request["route"] + body = request["body"] + + if route == "PUT /cart": + add_product( + body["cart"], body["product_id"], body["name"], body["price"], body["quantity"] + ) + res = {} + elif route == "GET /cart/{id}": + res = get_products(body["cart"], request["path"]["id"]) + elif route == "GET /cart": + res = query_products(body["cart"]) + else: + raise RuntimeError(f"Unknown request route: {route}") + + results.append(res) + + return {"result": results} diff --git a/benchmarks/200.multimedia/210.thumbnailer/config.json b/benchmarks/200.multimedia/210.thumbnailer/config.json index e9fe5a458..8edb99e52 100644 --- a/benchmarks/200.multimedia/210.thumbnailer/config.json +++ b/benchmarks/200.multimedia/210.thumbnailer/config.json @@ -1,5 +1,6 @@ { "timeout": 60, "memory": 256, - "languages": ["python", "nodejs"] + "languages": ["python", "nodejs"], + "modules": ["storage"] } diff --git a/benchmarks/200.multimedia/210.thumbnailer/input.py b/benchmarks/200.multimedia/210.thumbnailer/input.py index d063e258e..8943effed 100644 --- a/benchmarks/200.multimedia/210.thumbnailer/input.py +++ b/benchmarks/200.multimedia/210.thumbnailer/input.py @@ -12,7 +12,7 @@ def buckets_count(): :param output_buckets: :param upload_func: upload function taking three params(bucket_idx, key, filepath) ''' -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): for file in glob.glob(os.path.join(data_dir, '*.jpg')): img = os.path.relpath(file, data_dir) diff --git a/benchmarks/200.multimedia/220.video-processing/config.json b/benchmarks/200.multimedia/220.video-processing/config.json index 75de3d4f4..94ede7925 100644 --- a/benchmarks/200.multimedia/220.video-processing/config.json +++ b/benchmarks/200.multimedia/220.video-processing/config.json @@ -1,5 +1,6 @@ { "timeout": 60, "memory": 512, - "languages": ["python"] + "languages": ["python"], + "modules": ["storage"] } diff --git a/benchmarks/200.multimedia/220.video-processing/input.py b/benchmarks/200.multimedia/220.video-processing/input.py index abd022901..6da31647f 100644 --- a/benchmarks/200.multimedia/220.video-processing/input.py +++ b/benchmarks/200.multimedia/220.video-processing/input.py @@ -12,7 +12,7 @@ def buckets_count(): :param output_buckets: :param upload_func: upload function taking three params(bucket_idx, key, filepath) ''' -def 
generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): for file in glob.glob(os.path.join(data_dir, '*.mp4')): img = os.path.relpath(file, data_dir) upload_func(0, img, file) diff --git a/benchmarks/300.utilities/311.compression/config.json b/benchmarks/300.utilities/311.compression/config.json index e9fe5a458..8edb99e52 100644 --- a/benchmarks/300.utilities/311.compression/config.json +++ b/benchmarks/300.utilities/311.compression/config.json @@ -1,5 +1,6 @@ { "timeout": 60, "memory": 256, - "languages": ["python", "nodejs"] + "languages": ["python", "nodejs"], + "modules": ["storage"] } diff --git a/benchmarks/300.utilities/311.compression/input.py b/benchmarks/300.utilities/311.compression/input.py index c69d68e57..5f88bc91a 100644 --- a/benchmarks/300.utilities/311.compression/input.py +++ b/benchmarks/300.utilities/311.compression/input.py @@ -22,7 +22,7 @@ def upload_files(data_root, data_dir, upload_func): :param output_buckets: :param upload_func: upload function taking three params(bucket_idx, key, filepath) ''' -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): # upload different datasets datasets = [] diff --git a/benchmarks/400.inference/411.image-recognition/config.json b/benchmarks/400.inference/411.image-recognition/config.json index 75de3d4f4..94ede7925 100644 --- a/benchmarks/400.inference/411.image-recognition/config.json +++ b/benchmarks/400.inference/411.image-recognition/config.json @@ -1,5 +1,6 @@ { "timeout": 60, "memory": 512, - "languages": ["python"] + "languages": ["python"], + "modules": ["storage"] } diff --git a/benchmarks/400.inference/411.image-recognition/input.py b/benchmarks/400.inference/411.image-recognition/input.py index 45ea3cee3..45d7215a6 100644 --- a/benchmarks/400.inference/411.image-recognition/input.py +++ b/benchmarks/400.inference/411.image-recognition/input.py @@ -21,7 +21,7 @@ def upload_files(data_root, data_dir, upload_func): :param output_buckets: :param upload_func: upload function taking three params(bucket_idx, key, filepath) ''' -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): # upload model model_name = 'resnet50-19c8e357.pth' diff --git a/benchmarks/500.scientific/501.graph-pagerank/config.json b/benchmarks/500.scientific/501.graph-pagerank/config.json index 40336fd00..e80fb4351 100644 --- a/benchmarks/500.scientific/501.graph-pagerank/config.json +++ b/benchmarks/500.scientific/501.graph-pagerank/config.json @@ -1,5 +1,6 @@ { "timeout": 120, "memory": 512, - "languages": ["python"] + "languages": ["python"], + "modules": [] } diff --git a/benchmarks/500.scientific/501.graph-pagerank/input.py b/benchmarks/500.scientific/501.graph-pagerank/input.py index a8f3f6c9f..025c28caf 100644 --- a/benchmarks/500.scientific/501.graph-pagerank/input.py +++ b/benchmarks/500.scientific/501.graph-pagerank/input.py @@ -4,8 +4,5 @@ 'large': 100000 } -def buckets_count(): - return (0, 0) - -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): return { 'size': 
size_generators[size] } diff --git a/benchmarks/500.scientific/502.graph-mst/config.json b/benchmarks/500.scientific/502.graph-mst/config.json index 40336fd00..e80fb4351 100644 --- a/benchmarks/500.scientific/502.graph-mst/config.json +++ b/benchmarks/500.scientific/502.graph-mst/config.json @@ -1,5 +1,6 @@ { "timeout": 120, "memory": 512, - "languages": ["python"] + "languages": ["python"], + "modules": [] } diff --git a/benchmarks/500.scientific/502.graph-mst/input.py b/benchmarks/500.scientific/502.graph-mst/input.py index a8f3f6c9f..025c28caf 100644 --- a/benchmarks/500.scientific/502.graph-mst/input.py +++ b/benchmarks/500.scientific/502.graph-mst/input.py @@ -4,8 +4,5 @@ 'large': 100000 } -def buckets_count(): - return (0, 0) - -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): return { 'size': size_generators[size] } diff --git a/benchmarks/500.scientific/503.graph-bfs/config.json b/benchmarks/500.scientific/503.graph-bfs/config.json index 40336fd00..e80fb4351 100644 --- a/benchmarks/500.scientific/503.graph-bfs/config.json +++ b/benchmarks/500.scientific/503.graph-bfs/config.json @@ -1,5 +1,6 @@ { "timeout": 120, "memory": 512, - "languages": ["python"] + "languages": ["python"], + "modules": [] } diff --git a/benchmarks/500.scientific/503.graph-bfs/input.py b/benchmarks/500.scientific/503.graph-bfs/input.py index a8f3f6c9f..025c28caf 100644 --- a/benchmarks/500.scientific/503.graph-bfs/input.py +++ b/benchmarks/500.scientific/503.graph-bfs/input.py @@ -4,8 +4,5 @@ 'large': 100000 } -def buckets_count(): - return (0, 0) - -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): return { 'size': size_generators[size] } diff --git a/benchmarks/500.scientific/504.dna-visualisation/config.json b/benchmarks/500.scientific/504.dna-visualisation/config.json index 712e3a5e6..ff297ac5b 100644 --- a/benchmarks/500.scientific/504.dna-visualisation/config.json +++ b/benchmarks/500.scientific/504.dna-visualisation/config.json @@ -1,5 +1,6 @@ { "timeout": 60, "memory": 2048, - "languages": ["python"] + "languages": ["python"], + "modules": ["storage"] } diff --git a/benchmarks/500.scientific/504.dna-visualisation/input.py b/benchmarks/500.scientific/504.dna-visualisation/input.py index 3f13010fc..a9f376ea2 100644 --- a/benchmarks/500.scientific/504.dna-visualisation/input.py +++ b/benchmarks/500.scientific/504.dna-visualisation/input.py @@ -3,7 +3,7 @@ def buckets_count(): return (1, 1) -def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func): +def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func, nosql_func): for file in glob.glob(os.path.join(data_dir, '*.fasta')): data = os.path.relpath(file, data_dir) diff --git a/benchmarks/wrappers/aws/python/nosql.py b/benchmarks/wrappers/aws/python/nosql.py new file mode 100644 index 000000000..72bc2d9da --- /dev/null +++ b/benchmarks/wrappers/aws/python/nosql.py @@ -0,0 +1,121 @@ +from decimal import Decimal +from os import environ +from typing import List, Optional, Union, Tuple + +import boto3 + + +class nosql: + + instance: Optional["nosql"] = None + + def __init__(self): + self.client = boto3.resource("dynamodb") + self._tables = {} + + # Based on: 
https://github.com/boto/boto3/issues/369#issuecomment-157205696 + def _remove_decimals(self, data: dict) -> Union[dict, list, int, float]: + + if isinstance(data, list): + return [self._remove_decimals(x) for x in data] + elif isinstance(data, dict): + return {k: self._remove_decimals(v) for k, v in data.items()} + elif isinstance(data, Decimal): + if data.as_integer_ratio()[1] == 1: + return int(data) + else: + return float(data) + else: + return data + + def _get_table(self, table_name: str): + + if table_name not in self._tables: + + env_name = f"NOSQL_STORAGE_TABLE_{table_name}" + + if env_name in environ: + aws_name = environ[env_name] + self._tables[table_name] = self.client.Table(aws_name) + else: + raise RuntimeError( + f"Couldn't find an environment variable {env_name} for table {table_name}" + ) + + return self._tables[table_name] + + def insert( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + data: dict, + ): + for key in (primary_key, secondary_key): + data[key[0]] = key[1] + + self._get_table(table_name).put_item(Item=data) + + def get( + self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + ) -> dict: + + data = {} + for key in (primary_key, secondary_key): + data[key[0]] = key[1] + + res = self._get_table(table_name).get_item(Key=data) + return self._remove_decimals(res["Item"]) + + def update( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + updates: dict, + ): + + key_data = {} + for key in (primary_key, secondary_key): + key_data[key[0]] = key[1] + + update_expression = "SET " + update_values = {} + update_names = {} + + # We use attribute names because DynamoDB reserves some keywords, like 'status' + for key, value in updates.items(): + + update_expression += f" #{key}_name = :{key}_value, " + update_values[f":{key}_value"] = value + update_names[f"#{key}_name"] = key + + update_expression = update_expression[:-2] + + self._get_table(table_name).update_item( + Key=key_data, + UpdateExpression=update_expression, + ExpressionAttributeValues=update_values, + ExpressionAttributeNames=update_names, + ) + + def query(self, table_name: str, primary_key: Tuple[str, str], _: str) -> List[dict]: + + res = self._get_table(table_name).query( + KeyConditionExpression=f"{primary_key[0]} = :keyvalue", + ExpressionAttributeValues={":keyvalue": primary_key[1]}, + )["Items"] + return self._remove_decimals(res) + + def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + data = {} + for key in (primary_key, secondary_key): + data[key[0]] = key[1] + + self._get_table(table_name).delete_item(Key=data) + + @staticmethod + def get_instance(): + if nosql.instance is None: + nosql.instance = nosql() + return nosql.instance diff --git a/benchmarks/wrappers/azure/python/handler.py b/benchmarks/wrappers/azure/python/handler.py index 5f7f14f2e..88e44baf6 100644 --- a/benchmarks/wrappers/azure/python/handler.py +++ b/benchmarks/wrappers/azure/python/handler.py @@ -4,13 +4,27 @@ import azure.functions as func +if 'NOSQL_STORAGE_DATABASE' in os.environ: + + from . import nosql + + nosql.nosql.get_instance( + os.environ['NOSQL_STORAGE_DATABASE'], + os.environ['NOSQL_STORAGE_URL'], + os.environ['NOSQL_STORAGE_CREDS'] + ) + +if 'STORAGE_CONNECTION_STRING' in os.environ: + + from . 
import storage + client = storage.storage.get_instance(os.environ['STORAGE_CONNECTION_STRING']) + # TODO: usual trigger # implement support for blob and others def main(req: func.HttpRequest, context: func.Context) -> func.HttpResponse: income_timestamp = datetime.datetime.now().timestamp() req_json = req.get_json() - if 'connection_string' in req_json: - os.environ['STORAGE_CONNECTION_STRING'] = req_json['connection_string'] + req_json['request-id'] = context.invocation_id req_json['income-timestamp'] = income_timestamp begin = datetime.datetime.now() diff --git a/benchmarks/wrappers/azure/python/nosql.py b/benchmarks/wrappers/azure/python/nosql.py new file mode 100644 index 000000000..f7dd94851 --- /dev/null +++ b/benchmarks/wrappers/azure/python/nosql.py @@ -0,0 +1,94 @@ +from typing import Dict, List, Optional, Tuple + +from azure.cosmos import CosmosClient, ContainerProxy + + +class nosql: + instance = None + client = None + + def __init__(self, url: str, credential: str, database: str): + self._client = CosmosClient(url=url, credential=credential) + self._db_client = self._client.get_database_client(database) + self._containers: Dict[str, ContainerProxy] = {} + + def _get_table(self, table_name: str): + + if table_name not in self._containers: + self._containers[table_name] = self._db_client.get_container_client(table_name) + + return self._containers[table_name] + + def insert( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + data: dict, + ): + + data[primary_key[0]] = primary_key[1] + # secondary key must have that name in CosmosDB + data["id"] = secondary_key[1] + + self._get_table(table_name).upsert_item(data) + + def get( + self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + ) -> dict: + res = self._get_table(table_name).read_item( + item=secondary_key[1], partition_key=primary_key[1] + ) + res[secondary_key[0]] = secondary_key[1] + + return res + + def update( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + updates: dict, + ): + + ops = [] + for key, value in updates.items(): + ops.append({"op": "add", "path": f"/{key}", "value": value}) + + self._get_table(table_name).patch_item( + item=secondary_key[1], partition_key=primary_key[1], patch_operations=ops + ) + + """ + This query must involve partition key - it does not scan across partitions. 
+ """ + + def query( + self, table_name: str, primary_key: Tuple[str, str], secondary_key_name: str + ) -> List[dict]: + + res = list( + self._get_table(table_name).query_items( + f"SELECT * FROM c WHERE c.{primary_key[0]} = '{primary_key[1]}'", + enable_cross_partition_query=False, + ) + ) + + # Emulate the kind key + for item in res: + item[secondary_key_name] = item["id"] + + return res + + def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + + self._get_table(table_name).delete_item(item=secondary_key[1], partition_key=primary_key[1]) + + @staticmethod + def get_instance( + database: Optional[str] = None, url: Optional[str] = None, credential: Optional[str] = None + ): + if nosql.instance is None: + assert database is not None and url is not None and credential is not None + nosql.instance = nosql(url, credential, database) + return nosql.instance diff --git a/benchmarks/wrappers/azure/python/storage.py b/benchmarks/wrappers/azure/python/storage.py index 74c08307f..42b129c89 100644 --- a/benchmarks/wrappers/azure/python/storage.py +++ b/benchmarks/wrappers/azure/python/storage.py @@ -1,6 +1,7 @@ import os import uuid +from typing import Optional from azure.storage.blob import BlobServiceClient @@ -8,10 +9,8 @@ class storage: instance = None client = None - def __init__(self): - self.client = BlobServiceClient.from_connection_string( - os.getenv('STORAGE_CONNECTION_STRING') - ) + def __init__(self, connection_string: str): + self.client = BlobServiceClient.from_connection_string(connection_string) @staticmethod def unique_name(name): @@ -52,7 +51,9 @@ def download_stream(self, container, file): client = self.client.get_blob_client(container=container, blob=file) return client.download_blob().readall() - def get_instance(): + @staticmethod + def get_instance(connection_string: Optional[str] = None): if storage.instance is None: - storage.instance = storage() + assert connection_string is not None + storage.instance = storage(connection_string) return storage.instance diff --git a/benchmarks/wrappers/gcp/python/handler.py b/benchmarks/wrappers/gcp/python/handler.py index b9017b523..9b6989611 100644 --- a/benchmarks/wrappers/gcp/python/handler.py +++ b/benchmarks/wrappers/gcp/python/handler.py @@ -2,6 +2,14 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '.python_packages/lib/site-packages')) +# This variable is defined by SeBS during function creation. +if 'NOSQL_STORAGE_DATABASE' in os.environ: + from function import nosql + + nosql.nosql.get_instance( + os.environ['NOSQL_STORAGE_DATABASE'] + ) + def handler(req): income_timestamp = datetime.datetime.now().timestamp() diff --git a/benchmarks/wrappers/gcp/python/nosql.py b/benchmarks/wrappers/gcp/python/nosql.py new file mode 100644 index 000000000..408712857 --- /dev/null +++ b/benchmarks/wrappers/gcp/python/nosql.py @@ -0,0 +1,131 @@ +from typing import List, Optional, Tuple + +from google.cloud import datastore + + +class nosql: + + instance: Optional["nosql"] = None + + """ + Each benchmark supports up to two keys - one for grouping items, + and for unique identification of each item. + + In Google Cloud Datastore, we determine different tables by using + its value for `kind` name. + + The primary key is assigned to the `kind` value. + + To implement sorting semantics, we use the ancestor relation: + the sorting key is used as the parent. + It is the assumption that all related items will have the same parent. 
+ """ + + def __init__(self, database: str): + self._client = datastore.Client(database=database) + + def insert( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + data: dict, + ): + + parent_key = self._client.key(primary_key[0], primary_key[1]) + key = self._client.key( + # kind determines the table + table_name, + # main ID key + secondary_key[1], + # organization key + parent=parent_key, + ) + + val = datastore.Entity(key=key) + val.update(data) + self._client.put(val) + + def update( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + data: dict, + ): + # There is no direct update - we have to fetch the entire entity and manually change fields. + parent_key = self._client.key(primary_key[0], primary_key[1]) + key = self._client.key( + # kind determines the table + table_name, + # main ID key + secondary_key[1], + # organization key + parent=parent_key, + ) + res = self._client.get(key) + if res is None: + res = datastore.Entity(key=key) + res.update(data) + self._client.put(res) + + def get( + self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + ) -> Optional[dict]: + + parent_key = self._client.key(primary_key[0], primary_key[1]) + key = self._client.key( + # kind determines the table + table_name, + # main ID key + secondary_key[1], + # organization key + parent=parent_key, + ) + + res = self._client.get(key) + if res is None: + return None + + # Emulate the kind key + res[secondary_key[0]] = secondary_key[1] + + return res + + """ + This query must involve partition key - it does not scan across partitions. + """ + + def query( + self, table_name: str, primary_key: Tuple[str, str], secondary_key_name: str + ) -> List[dict]: + + ancestor = self._client.key(primary_key[0], primary_key[1]) + query = self._client.query(kind=table_name, ancestor=ancestor) + res = list(query.fetch()) + + # Emulate the kind key + for item in res: + item[secondary_key_name] = item.key.name + + return res + + def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + parent_key = self._client.key(primary_key[0], primary_key[1]) + key = self._client.key( + # kind determines the table + table_name, + # main ID key + secondary_key[1], + # organization key + parent=parent_key, + ) + + return self._client.delete(key) + + @staticmethod + def get_instance(database: Optional[str] = None): + if nosql.instance is None: + assert database is not None + nosql.instance = nosql(database) + return nosql.instance diff --git a/benchmarks/wrappers/local/python/nosql.py b/benchmarks/wrappers/local/python/nosql.py new file mode 100644 index 000000000..0e816954c --- /dev/null +++ b/benchmarks/wrappers/local/python/nosql.py @@ -0,0 +1,131 @@ +from decimal import Decimal +from os import environ +from typing import List, Optional, Union, Tuple + +import boto3 + + +class nosql: + + instance: Optional["nosql"] = None + + def __init__(self): + + if environ["NOSQL_STORAGE_TYPE"] != "scylladb": + raise RuntimeError(f"Unsupported NoSQL storage type: {environ['NOSQL_STORAGE_TYPE']}!") + + self.client = boto3.resource( + "dynamodb", + region_name="None", + aws_access_key_id="None", + aws_secret_access_key="None", + endpoint_url=f"http://{environ['NOSQL_STORAGE_ENDPOINT']}", + ) + self._tables = {} + + # Based on: https://github.com/boto/boto3/issues/369#issuecomment-157205696 + def _remove_decimals(self, data: dict) -> Union[dict, list, int, float]: + + if 
isinstance(data, list): + return [self._remove_decimals(x) for x in data] + elif isinstance(data, dict): + return {k: self._remove_decimals(v) for k, v in data.items()} + elif isinstance(data, Decimal): + if data.as_integer_ratio()[1] == 1: + return int(data) + else: + return float(data) + else: + return data + + def _get_table(self, table_name: str): + + if table_name not in self._tables: + + env_name = f"NOSQL_STORAGE_TABLE_{table_name}" + + if env_name in environ: + aws_name = environ[env_name] + self._tables[table_name] = self.client.Table(aws_name) + else: + raise RuntimeError( + f"Couldn't find an environment variable {env_name} for table {table_name}" + ) + + return self._tables[table_name] + + def insert( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + data: dict, + ): + for key in (primary_key, secondary_key): + data[key[0]] = key[1] + + self._get_table(table_name).put_item(Item=data) + + def get( + self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + ) -> dict: + + data = {} + for key in (primary_key, secondary_key): + data[key[0]] = key[1] + + res = self._get_table(table_name).get_item(Key=data) + return self._remove_decimals(res["Item"]) + + def update( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + updates: dict, + ): + + key_data = {} + for key in (primary_key, secondary_key): + key_data[key[0]] = key[1] + + update_expression = "SET " + update_values = {} + update_names = {} + + # We use attribute names because DynamoDB reserves some keywords, like 'status' + for key, value in updates.items(): + + update_expression += f" #{key}_name = :{key}_value, " + update_values[f":{key}_value"] = value + update_names[f"#{key}_name"] = key + + update_expression = update_expression[:-2] + + self._get_table(table_name).update_item( + Key=key_data, + UpdateExpression=update_expression, + ExpressionAttributeValues=update_values, + ExpressionAttributeNames=update_names, + ) + + def query(self, table_name: str, primary_key: Tuple[str, str], _: str) -> List[dict]: + + res = self._get_table(table_name).query( + KeyConditionExpression=f"{primary_key[0]} = :keyvalue", + ExpressionAttributeValues={":keyvalue": primary_key[1]}, + )["Items"] + return self._remove_decimals(res) + + def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + data = {} + for key in (primary_key, secondary_key): + data[key[0]] = key[1] + + self._get_table(table_name).delete_item(Key=data) + + @staticmethod + def get_instance(): + if nosql.instance is None: + nosql.instance = nosql() + return nosql.instance diff --git a/benchmarks/wrappers/openwhisk/python/__main__.py b/benchmarks/wrappers/openwhisk/python/__main__.py index 21ead0d9b..3ae44f9c2 100644 --- a/benchmarks/wrappers/openwhisk/python/__main__.py +++ b/benchmarks/wrappers/openwhisk/python/__main__.py @@ -2,8 +2,6 @@ import datetime import os -import minio - def main(args): logging.getLogger().setLevel(logging.INFO) begin = datetime.datetime.now() @@ -14,6 +12,12 @@ def main(args): os.environ[arg] = args[arg] del args[arg] + key_list = list(args.keys()) + for arg in key_list: + if 'NOSQL_STORAGE_' in arg: + os.environ[arg] = args[arg] + del args[arg] + try: from function import function ret = function.handler(args) diff --git a/benchmarks/wrappers/openwhisk/python/nosql.py b/benchmarks/wrappers/openwhisk/python/nosql.py new file mode 100644 index 000000000..da8245009 --- /dev/null +++ 
b/benchmarks/wrappers/openwhisk/python/nosql.py @@ -0,0 +1,133 @@ +from decimal import Decimal +from os import environ +from typing import List, Optional, Union, Tuple + +import boto3 +from botocore.client import Config + +class nosql: + + instance: Optional["nosql"] = None + + def __init__(self): + + if environ["NOSQL_STORAGE_TYPE"] != "scylladb": + raise RuntimeError(f"Unsupported NoSQL storage type: {environ['NOSQL_STORAGE_TYPE']}!") + + config = Config(connect_timeout=5, retries={'max_attempts': 0}) + self.client = boto3.resource( + "dynamodb", + region_name="None", + aws_access_key_id="None", + aws_secret_access_key="None", + endpoint_url=f"http://{environ['NOSQL_STORAGE_ENDPOINT']}", + config=config + ) + self._tables = {} + + # Based on: https://github.com/boto/boto3/issues/369#issuecomment-157205696 + def _remove_decimals(self, data: dict) -> Union[dict, list, int, float]: + + if isinstance(data, list): + return [self._remove_decimals(x) for x in data] + elif isinstance(data, dict): + return {k: self._remove_decimals(v) for k, v in data.items()} + elif isinstance(data, Decimal): + if data.as_integer_ratio()[1] == 1: + return int(data) + else: + return float(data) + else: + return data + + def _get_table(self, table_name: str): + + if table_name not in self._tables: + + env_name = f"NOSQL_STORAGE_TABLE_{table_name}" + + if env_name in environ: + aws_name = environ[env_name] + self._tables[table_name] = self.client.Table(aws_name) + else: + raise RuntimeError( + f"Couldn't find an environment variable {env_name} for table {table_name}" + ) + + return self._tables[table_name] + + def insert( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + data: dict, + ): + for key in (primary_key, secondary_key): + data[key[0]] = key[1] + + self._get_table(table_name).put_item(Item=data) + + def get( + self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str] + ) -> dict: + + data = {} + for key in (primary_key, secondary_key): + data[key[0]] = key[1] + + res = self._get_table(table_name).get_item(Key=data) + return self._remove_decimals(res["Item"]) + + def update( + self, + table_name: str, + primary_key: Tuple[str, str], + secondary_key: Tuple[str, str], + updates: dict, + ): + + key_data = {} + for key in (primary_key, secondary_key): + key_data[key[0]] = key[1] + + update_expression = "SET " + update_values = {} + update_names = {} + + # We use attribute names because DynamoDB reserves some keywords, like 'status' + for key, value in updates.items(): + + update_expression += f" #{key}_name = :{key}_value, " + update_values[f":{key}_value"] = value + update_names[f"#{key}_name"] = key + + update_expression = update_expression[:-2] + + self._get_table(table_name).update_item( + Key=key_data, + UpdateExpression=update_expression, + ExpressionAttributeValues=update_values, + ExpressionAttributeNames=update_names, + ) + + def query(self, table_name: str, primary_key: Tuple[str, str], _: str) -> List[dict]: + + res = self._get_table(table_name).query( + KeyConditionExpression=f"{primary_key[0]} = :keyvalue", + ExpressionAttributeValues={":keyvalue": primary_key[1]}, + )["Items"] + return self._remove_decimals(res) + + def delete(self, table_name: str, primary_key: Tuple[str, str], secondary_key: Tuple[str, str]): + data = {} + for key in (primary_key, secondary_key): + data[key[0]] = key[1] + + self._get_table(table_name).delete_item(Key=data) + + @staticmethod + def get_instance(): + if nosql.instance is None: + 
nosql.instance = nosql() + return nosql.instance diff --git a/config/storage.json b/config/storage.json new file mode 100644 index 000000000..9ea14d31d --- /dev/null +++ b/config/storage.json @@ -0,0 +1,20 @@ +{ + "object": { + "type": "minio", + "minio": { + "mapped_port": 9011, + "version": "RELEASE.2024-07-16T23-46-41Z", + "data_volume": "minio-volume" + } + }, + "nosql": { + "type": "scylladb", + "scylladb": { + "mapped_port": 9012, + "version": "6.0", + "cpus": 1, + "memory": "750", + "data_volume": "scylladb-volume" + } + } +} diff --git a/config/systems.json b/config/systems.json index b38e2e045..5a4077a23 100644 --- a/config/systems.json +++ b/config/systems.json @@ -1,7 +1,7 @@ { "general": { "docker_repository": "spcleth/serverless-benchmarks", - "SeBS_version": "1.1.0" + "SeBS_version": "1.2.0" }, "local": { "experiments": { @@ -18,11 +18,13 @@ "languages": { "python": { "base_images": { - "3.7": "python:3.7-slim", - "3.8": "python:3.8-slim", - "3.9": "python:3.9-slim", - "3.10": "python:3.10-slim", - "3.11": "python:3.11-slim" + "x64": { + "3.7": "python:3.7-slim", + "3.8": "python:3.8-slim", + "3.9": "python:3.9-slim", + "3.10": "python:3.10-slim", + "3.11": "python:3.11-slim" + } }, "images": [ "run", @@ -31,17 +33,25 @@ "username": "docker_user", "deployment": { "files": [ - "storage.py" + "storage.py", + "nosql.py" ], - "packages": [] + "packages": [], + "module_packages": { + "nosql": [ + "boto3==1.28.3" + ] + } } }, "nodejs": { "base_images": { - "14": "node:14-slim", - "16": "node:16-slim", - "18": "node:18-slim", - "20": "node:20-slim" + "x64": { + "14": "node:14-slim", + "16": "node:16-slim", + "18": "node:18-slim", + "20": "node:20-slim" + } }, "images": [ "run", @@ -83,9 +93,11 @@ "files": [ "handler.py", "storage.py", + "nosql.py", "setup.py" ], - "packages": [] + "packages": [], + "module_packages": {} } }, "nodejs": { @@ -133,11 +145,18 @@ "deployment": { "files": [ "handler.py", - "storage.py" + "storage.py", + "nosql.py" ], - "packages": [ - "azure-storage-blob" - ] + "packages": [], + "module_packages": { + "storage": [ + "azure-storage-blob" + ], + "nosql": [ + "azure-cosmos" + ] + } } }, "nodejs": { @@ -192,11 +211,18 @@ "deployment": { "files": [ "handler.py", - "storage.py" + "storage.py", + "nosql.py" ], - "packages": [ - "google-cloud-storage" - ] + "packages": [], + "module_packages": { + "storage": [ + "google-cloud-storage" + ], + "nosql": [ + "google-cloud-datastore" + ] + } } }, "nodejs": { @@ -226,6 +252,11 @@ } } }, + "images": { + "manage": { + "username": "docker_user" + } + }, "architecture": ["x64"], "deployments": ["package"] }, @@ -248,10 +279,17 @@ "files": [ "__main__.py", "storage.py", - "setup.py" + "setup.py", + "nosql.py" ], - "packages": { - "minio": "^5.0.10" + "packages": [], + "module_packages": { + "storage": { + "minio": "^5.0.10" + }, + "nosql": [ + "boto3==1.28.3" + ] } } }, diff --git a/dockerfiles/aws/python/Dockerfile.function b/dockerfiles/aws/python/Dockerfile.function index 65ec420bf..d72f6d432 100644 --- a/dockerfiles/aws/python/Dockerfile.function +++ b/dockerfiles/aws/python/Dockerfile.function @@ -2,18 +2,30 @@ ARG BASE_IMAGE FROM $BASE_IMAGE ARG VERSION ENV PYTHON_VERSION=${VERSION} +ARG TARGET_ARCHITECTURE COPY . 
function/ -RUN touch function/__init__.py -RUN if test -f "function/requirements.txt.${PYTHON_VERSION}"; then \ - pip install --no-cache-dir \ +ENV PLATFORM_ARG="" + +RUN touch function/__init__.py \ + && if [[ "${TARGET_ARCHITECTURE}" == "arm64" ]]; then \ + export PLATFORM_ARG="--platform manylinux_2_17_aarch64 --only-binary=:all:"; \ + fi \ + && if [[ "${TARGET_ARCHITECTURE}" == "arm64" ]] && test -f "function/requirements.txt.arm.${PYTHON_VERSION}"; then \ + pip install --no-cache-dir ${PLATFORM_ARG} --target . \ + -r function/requirements.txt \ + -r function/requirements.txt.arm.${PYTHON_VERSION} \ + function/ && \ + pip cache purge; \ + elif test -f "function/requirements.txt.${PYTHON_VERSION}"; then \ + pip install --no-cache-dir ${PLATFORM_ARG} --target . \ -r function/requirements.txt \ -r function/requirements.txt.${PYTHON_VERSION} \ function/ && \ pip cache purge; \ else \ - pip install --no-cache-dir \ + pip install --no-cache-dir ${PLATFORM_ARG} --target . \ -r function/requirements.txt \ function/ && \ pip cache purge; \ diff --git a/dockerfiles/gcp/Dockerfile.manage b/dockerfiles/gcp/Dockerfile.manage new file mode 100644 index 000000000..8af120b2a --- /dev/null +++ b/dockerfiles/gcp/Dockerfile.manage @@ -0,0 +1,13 @@ +FROM ubuntu:24.04 + +RUN apt-get clean && apt-get update\ + && apt-get install -y ca-certificates curl gnupg apt-transport-https\ + && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg\ + && echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list\ + && apt-get update\ + && apt-get install -y google-cloud-cli\ + && apt-get purge -y --auto-remove curl lsb-release gnupg + +ENV GOOGLE_APPLICATION_CREDENTIALS=/credentials.json + +ENTRYPOINT ["/bin/bash"] diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 864bea70c..e292a4b04 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -5,6 +5,7 @@ | :--- | :---: | :---: | :---: | :---: | | Webapps | 110.dynamic-html | Python, Node.js | x64, arm64 | Generate dynamic HTML from a template. | | Webapps | 120.uploader | Python, Node.js | x64, arm64 | Uploader file from provided URL to cloud storage. | +| Webapps | 130.crud-api | Python | x64, arm64 | Simple CRUD application using NoSQL to store application data. | | Multimedia | 210.thumbnailer | Python, Node.js | x64, arm64 | Generate a thumbnail of an image. | | Multimedia | 220.video-processing | Python | x64, arm64 | Add a watermark and generate gif of a video file. | | Utilities | 311.compression | Python | x64, arm64 | Create a .zip file for a group of files in storage and return to user to download. | @@ -41,6 +42,11 @@ The benchmark represents a dynamic generation of webpage contents through a serv The benchmark implements the common workflow of uploading user-defined data to the persistent cloud storage. It accepts a URL, downloads file contents, and uploads them to the storage. Python implementation uses the standard library `requests`, while the Node.js version uses the third-party `requests` library installed with `npm`. +### CRUD API + +The benchmark implements a simple CRUD application simulating a webstore cart. It offers three basic methods: add new item (`PUT`), get an item (`GET`), and query all items in a cart. It uses the NoSQL storage, with each item stored using cart id as primary key and item id as secondary key. 
The Python implementation uses +cloud-native libraries to access the database. + ## Multimedia ### Thumbnailer diff --git a/docs/platforms.md b/docs/platforms.md index 83a949340..2069c36ae 100644 --- a/docs/platforms.md +++ b/docs/platforms.md @@ -12,7 +12,17 @@ points for each platform. > [!WARNING] > On many platforms, credentials can be provided as environment variables or through the SeBS configuration. SeBS will not store your credentials in the cache. When saving results, SeBS stores user benchmark and experiment configuration for documentation and reproducibility, except for credentials that are erased. If you provide the credentials through JSON input configuration, do not commit nor publish these files anywhere. -### Architectures +Supported platforms: +* [Amazon Web Services (AWS) Lambda](#aws-lambda) +* [Microsoft Azure Functions](#azure-functions) +* [Google Cloud (GCP) Functions](#google-cloud-functions) +* [OpenWhisk](#openwhisk) + +## Storage Configuration + +SeBS benchmarks rely on persistent object and NoSQL storage for input and output data. For configuration instructions regarding both object storage and NoSQL databases, please refer to the [storage documentation](storage.md). Storage configuration is particularly important for local deployments, OpenWhisk, and other open-source FaaS platforms. + +## Architectures By default, SeBS defaults functions built for the x64 (x86_64) architecture. On AWS, functions can also be build and deployed for ARM CPUs to benefit from Graviton CPUs available on Lambda. This change primarily affects functions that make use of dependencies with native builds, such as `torch`, `numpy` or `ffmpeg`. @@ -22,7 +32,7 @@ However, special care is needed to build Docker containers: since installation o binaries based on ARM containers on x86 CPUs. To build multi-platform images, we recommend to follow official [Docker guidelines](https://docs.docker.com/build/building/multi-platform/#build-multi-platform-images) and provide static QEMU installation. On Ubuntu-based distributions, this requires installing an OS package and executing a single Docker command to provide seamless emulation of ARM containers. -### Cloud Account Identifiers +## Cloud Account Identifiers SeBS ensures that all locally cached cloud resources are valid by storing a unique identifier associated with each cloud account. Furthermore, we store this identifier in experiment results to easily match results with the cloud account or subscription that was used to obtain them. We use non-sensitive identifiers such as account IDs on AWS, subscription IDs on Azure, and Google Cloud project IDs. @@ -135,6 +145,8 @@ The Google Cloud Free Tier gives free resources. It has two parts: - Always Free, which provides limited access to many common Google Cloud resources, free of charge. You need to create an account and add [service account](https://cloud.google.com/iam/docs/service-accounts) to permit operating on storage and functions. From the cloud problem, download the cloud credentials saved as a JSON file. +You should have at least write access to **Cloud Functions** (`Cloud Functions Admin`) and **Logging** Furthermore, SeBS needs the permissions to create Firestore databases through +Google Cloud CLI tool; the `Firestore Service Agent` role allows for that. You can pass the credentials either using the default GCP-specific environment variable: @@ -170,6 +182,8 @@ In the subsections below, we discuss the meaning and use of each parameter. 
To correctly deploy SeBS functions to OpenWhisk, following the subsections on *Toolchain* and *Docker* configuration is particularly important. +For storage configuration in OpenWhisk, refer to the [storage documentation](storage.md), which covers both object storage and NoSQL requirements specific to OpenWhisk deployments. + > [!WARNING] > Some benchmarks might require larger memory allocations, e.g., 2048 MB. Not all OpenWhisk deployments support this out-of-the-box. > The deployment section below shows an example of changing the default function memory limit from 512 MB to a higher value. @@ -270,73 +284,7 @@ However, Docker's [experimental `manifest` feature](https://docs.docker.com/engi allows checking image status without downloading its contents, saving bandwidth and time. To use that feature in SeBS, set the `experimentalManifest` flag to true. -### Storage - -To provide persistent object storage in OpenWhisk, users must first deploy an instance -of [`Minio`](https://github.com/minio/minio) storage. -The storage instance is deployed as a Docker container, and it can be retained -across many experiments. -OpenWhisk functions must be able to reach the storage instance. -Even on a local machine, it's necessary to configure the network address, as OpenWhisk functions -are running isolated from the host network and won't be able to reach other containers running on the Docker bridge. - -Use the following command to deploy the storage instance locally and map the host public port 9011 to Minio instance. - -```bash -./sebs.py storage start minio --port 9011 --output-json out_storage.json -``` - -The output will look similar to the one below. -As we can see, the storage container is running on the default Docker bridge network with address `172.17.0.2` and uses port `9000`. -From the host network, port `9011` is mapped to the container's port `9000` to allow external parties - such as OpenWhisk functions - to reach the storage. - -``` -{ - "address": "172.17.0.2:9000", - "mapped_port": 9011, - "access_key": "XXX", - "secret_key": "XXX", - "instance_id": "XXX", - "input_buckets": [], - "output_buckets": [], - "type": "minio" -} -``` - -The storage configuration found in `out_storage.json` needs to be provided to -SeBS via the SeBS configuration, however the address in `out_storage.json` is likely incorrect. By -default, it is a address in the local bridge network not accessible to most of -the Kubernetes cluster. It should be replaced with an external address of the -machine and the mapped port. You can typically find an externally accessible address via `ip addr`. 
- -For example, for an external address `10.10.1.15` (a LAN-local address on CloudLab) and mapped port `9011`, set the SeBS configuration as follows: - -``` -jq --argfile file1 out_storage.json '.deployment.openwhisk.storage = $file1 | .deployment.openwhisk.storage.address = "10.10.1.15:9011"' config/example.json > config/openwhisk.json -``` - -You can validate this is the correct address by use `curl` to access the Minio instance from another machine or container: - -``` -$ curl -i 10.10.1.15:9011/minio/health/live -HTTP/1.1 200 OK -Accept-Ranges: bytes -Content-Length: 0 -Content-Security-Policy: block-all-mixed-content -Server: MinIO -Strict-Transport-Security: max-age=31536000; includeSubDomains -Vary: Origin -X-Amz-Request-Id: 16F3D9B9FDFFA340 -X-Content-Type-Options: nosniff -X-Xss-Protection: 1; mode=block -Date: Mon, 30 May 2022 10:01:21 GMT -``` - -The `shutdownStorage` switch controls the behavior of SeBS. -When set to true, SeBS will remove the Minio instance after finishing all -work. -Otherwise, the container will be retained, and future experiments with SeBS -will automatically detect an existing Minio instance. -Reusing the Minio instance helps run experiments faster and smoothly since -SeBS does not have to re-upload function's data on each experiment. +### Storage +OpenWhisk has a `shutdownStorage` switch that controls the behavior of SeBS. +When set to true, SeBS will remove the Minio instance after finishing all work. diff --git a/docs/storage.md b/docs/storage.md new file mode 100644 index 000000000..627216041 --- /dev/null +++ b/docs/storage.md @@ -0,0 +1,123 @@ +# Storage Configuration + +SeBS benchmarks rely on persistent storage for both input and output data. +Most applications use object storage for storing inputs and outputs, while others can use NoSQL database. +On cloud platforms, you can use cloud-native storage services like S3, DynamoDB, CosmosDB, or Firestore. +SeBS will automatically allocate resources and configure them. +With open-source platforms like OpenWhisk or local deployment, SeBS needs a self-hosted storage instance. + +In this document, we explain how to deploy and configure storage systems for benchmarking with SeBS. +We use [Minio](https://github.com/minio/minio), a high-performance and S3-compatible object storage, and [ScyllaDB](https://github.com/scylladb/scylladb) +with an adapter that provides a DynamoDB-compatible interface. +The storage instance is deployed as a Docker container and can be retained across multiple experiments. +While we provide a default configuration that automatically deploys each storage instance, +you can deploy them on any cloud resource and adapt the configuration to fit your needs. + +## Starting Storage Services + +You can start the necessary storage services using the `storage` command in SeBS: + +```bash +# Start only object storage +./sebs.py storage start object config/storage.json --output-json storage_object.json + +# Start only NoSQL database +./sebs.py storage start nosql config/storage.json --output-json storage_nosql.json + +# Start both storage types +./sebs.py storage start all config/storage.json --output-json storage.json +``` + +The command deploys the requested storage services as Docker containers and generates a configuration file in JSON format. 
+This file contains all the necessary information to connect to the storage services, including endpoint addresses, credentials, and instance IDs: + +```json +{ + "object": { + "type": "minio", + "minio": { + "address": "172.17.0.2:9000", + "mapped_port": 9011, + "access_key": "XXX", + "secret_key": "XXX", + "instance_id": "XXX", + "output_buckets": [], + "input_buckets": [], + "version": "RELEASE.2024-07-16T23-46-41Z", + "data_volume": "minio-volume", + "type": "minio" + } + }, + "nosql": { + "type": "scylladb", + "scylladb": { + "address": "172.17.0.3:8000", + "mapped_port": 9012, + "alternator_port": 8000, + "access_key": "None", + "secret_key": "None", + "instance_id": "XXX", + "region": "None", + "cpus": 1, + "memory": "750", + "version": "6.0", + "data_volume": "scylladb-volume" + } + } +} +``` + +As we can see, the Minio container is running on the default Docker bridge network with address `172.17.0.2` and uses port `9000`. +The default configuration maps the container's port to the host, making the storage instance available directly without referring to the container's IP address. Minio is mapped to port 9011, and ScyllaDB is mapped to port 9012. + +## Network Configuration + +The storage instance must be accessible from the host network, and in some cases, from external networks. +For example, the storage can be deployed on a separate virtual machine or container. +Furthermore, even on a local machine, it's necessary to configure the network address, as OpenWhisk functions +are running isolated from the host network and won't be able to reach other containers running on the Docker bridge. + +When using Minio with cloud-hosted FaaS platforms like OpenWhisk or for local deployment, you need to ensure that the functions can reach the storage instance. +By default, the container runs on the Docker bridge network with an address (e.g., `172.17.0.2`) that is not accessible from outside the host. +Even when deploying both OpenWhisk and storage on the same system, the local bridge network is not accessible from the Kubernetes cluster. +To make it accessible, functions need to use the public IP address of the machine hosting the container instance and the mapped port. +You can typically find an externally accessible address via `ip addr`, and then replace the storage's address with the external address of the machine and the mapped port. + +For example, for an external address `10.10.1.15` (a LAN-local address on CloudLab) and mapped port `9011`, set the SeBS configuration as follows: + +```bash +# For a LAN-local address (e.g., on CloudLab) +jq --slurpfile file1 storage.json '.deployment.openwhisk.storage = $file1[0] | .deployment.openwhisk.storage.address = "10.10.1.15:9011"' config/example.json > config/openwhisk.json +``` + +You can validate the configuration of Minio with an HTTP request by using `curl`: + +```bash +$ curl -i 10.10.1.15:9011/minio/health/live +HTTP/1.1 200 OK +Accept-Ranges: bytes +Content-Length: 0 +Content-Security-Policy: block-all-mixed-content +Server: MinIO +Strict-Transport-Security: max-age=31536000; includeSubDomains +Vary: Origin +X-Amz-Request-Id: 16F3D9B9FDFFA340 +X-Content-Type-Options: nosniff +X-Xss-Protection: 1; mode=block +Date: Mon, 30 May 2022 10:01:21 GMT +``` + + +## Lifecycle Management + +By default, storage containers are retained after experiments complete. This allows you to run multiple experiments without redeploying and repopulating storage. 
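+
+The same configuration file can be passed back to SeBS when running benchmarks, so later experiments reuse the running containers. The following is only a sketch: it assumes the `benchmark invoke` subcommand and the repeatable `--storage-configuration` flag defined in `sebs.py`; the benchmark name, input size, and `config/example.json` path are illustrative.
+
+```bash
+# Hypothetical invocation: the target platform comes from config/example.json,
+# while storage.json (generated by `storage start`) points SeBS at the running
+# Minio and ScyllaDB containers.
+./sebs.py benchmark invoke 130.crud-api test \
+    --config config/example.json \
+    --storage-configuration storage.json
+```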
+ +When you're done with your experiments, you can stop the storage services: + +```bash +./sebs.py storage stop object storage.json + +./sebs.py storage stop nosql storage.json + +./sebs.py storage stop all storage.json +``` diff --git a/requirements.azure.txt b/requirements.azure.txt index f7d82499f..f5f8a5dc4 100644 --- a/requirements.azure.txt +++ b/requirements.azure.txt @@ -1 +1,2 @@ azure-storage-blob==12.10.0 +azure-cosmos diff --git a/requirements.gcp.txt b/requirements.gcp.txt index 9cb909162..60f591501 100644 --- a/requirements.gcp.txt +++ b/requirements.gcp.txt @@ -4,3 +4,4 @@ google-api-python-client==1.12.5 google-cloud-monitoring==2.0.0 google-api-python-client-stubs google-cloud-logging==2.0.0 +google-cloud-datastore diff --git a/sebs.py b/sebs.py index b94ea546f..80fb11ed3 100755 --- a/sebs.py +++ b/sebs.py @@ -6,15 +6,16 @@ import functools import os import traceback -from typing import cast, Optional +from typing import cast, List, Optional import click import sebs from sebs import SeBS from sebs.types import Storage as StorageTypes +from sebs.types import NoSQLStorage as NoSQLStorageTypes from sebs.regression import regression_suite -from sebs.utils import update_nested_dict, catch_interrupt +from sebs.utils import update_nested_dict, append_nested_dict, catch_interrupt from sebs.faas import System as FaaSSystem from sebs.faas.function import Trigger @@ -134,7 +135,7 @@ def parse_common_params( resource_prefix: Optional[str] = None, initialize_deployment: bool = True, ignore_cache: bool = False, - storage_configuration: Optional[str] = None + storage_configuration: Optional[List[str]] = None ): global sebs_client, deployment_client @@ -159,9 +160,13 @@ def parse_common_params( # set the path the configuration was loaded from update_nested_dict(config_obj, ["deployment", "local", "path"], config) - if storage_configuration: - cfg = json.load(open(storage_configuration, 'r')) - update_nested_dict(config_obj, ["deployment", deployment, "storage"], cfg) + if storage_configuration is not None: + + for cfg_f in storage_configuration: + sebs_client.logging.info(f"Loading storage configuration from {cfg_f}") + + cfg = json.load(open(cfg_f, 'r')) + append_nested_dict(config_obj, ["deployment", deployment, "storage"], cfg) if initialize_deployment: deployment_client = sebs_client.get_deployment( @@ -225,7 +230,7 @@ def benchmark(): type=str, help="Attach prefix to generated Docker image tag.", ) -@click.option("--storage-configuration", default=None, type=str, help="JSON configuration of deployed storage.") +@click.option("--storage-configuration", type=str, multiple=True, help="JSON configuration of deployed storage.") @common_params def invoke( benchmark, @@ -264,14 +269,13 @@ def invoke( if timeout is not None: benchmark_obj.benchmark_config.timeout = timeout + input_config = benchmark_obj.prepare_input(deployment_client.system_resources, size=benchmark_input_size, replace_existing=experiment_config.update_storage) + func = deployment_client.get_function( benchmark_obj, function_name if function_name else deployment_client.default_function_name(benchmark_obj), ) - storage = deployment_client.get_storage(replace_existing=experiment_config.update_storage) - input_config = benchmark_obj.prepare_input(storage=storage, size=benchmark_input_size) - result = sebs.experiments.ExperimentResult(experiment_config, deployment_client.config) result.begin() @@ -367,63 +371,109 @@ def regression(benchmark_input_size, benchmark_name, **kwargs): ) +""" + Storage operations have the following 
characteristics: + - Two operations, start and stop. + - Three options, object storage, NoSQL storage, and all. + - Port and additional settings. + + Configuration is read from a JSON. +""" + + @cli.group() def storage(): pass @storage.command("start") -@click.argument("storage", type=click.Choice([StorageTypes.MINIO])) +@click.argument("storage", type=click.Choice(["object", "nosql", "all"])) +@click.argument("config", type=click.Path(dir_okay=False, readable=True)) @click.option("--output-json", type=click.Path(dir_okay=False, writable=True), default=None) -@click.option("--port", type=int, default=9000) -def storage_start(storage, output_json, port): +def storage_start(storage, config, output_json): import docker sebs.utils.global_logging() - storage_type = sebs.SeBS.get_storage_implementation(StorageTypes(storage)) - storage_config, storage_resources = sebs.SeBS.get_storage_config_implementation( - StorageTypes(storage) - ) - config = storage_config() - resources = storage_resources() + user_storage_config = json.load(open(config, 'r')) + + if storage in ["object", "all"]: + + storage_type_name = user_storage_config["object"]["type"] + storage_type_enum = StorageTypes(storage_type_name) + + storage_type = sebs.SeBS.get_storage_implementation(storage_type_enum) + storage_config = sebs.SeBS.get_storage_config_implementation(storage_type_enum) + config = storage_config.deserialize(user_storage_config["object"][storage_type_name]) + + storage_instance = storage_type(docker.from_env(), None, None, True) + storage_instance.config = config + + storage_instance.start() + + user_storage_config["object"][storage_type_name] = storage_instance.serialize() + else: + user_storage_config.pop("object") + + if storage in ["nosql", "all"]: + + storage_type_name = user_storage_config["nosql"]["type"] + storage_type_enum = NoSQLStorageTypes(storage_type_name) + + storage_type = sebs.SeBS.get_nosql_implementation(storage_type_enum) + storage_config = sebs.SeBS.get_nosql_config_implementation(storage_type_enum) + config = storage_config.deserialize(user_storage_config["nosql"][storage_type_name]) + + storage_instance = storage_type(docker.from_env(), None, config) + + storage_instance.start() + + key, value = storage_instance.serialize() + user_storage_config["nosql"][key] = value + else: + user_storage_config.pop("nosql") - storage_instance = storage_type(docker.from_env(), None, resources, True) - logging.info(f"Starting storage {str(storage)} on port {port}.") - storage_instance.start(port) if output_json: logging.info(f"Writing storage configuration to {output_json}.") with open(output_json, "w") as f: - json.dump(storage_instance.serialize(), fp=f, indent=2) + json.dump(user_storage_config, fp=f, indent=2) else: logging.info("Writing storage configuration to stdout.") - logging.info(json.dumps(storage_instance.serialize(), indent=2)) + logging.info(json.dumps(user_storage_config, indent=2)) @storage.command("stop") +@click.argument("storage", type=click.Choice(["object", "nosql", "all"])) @click.argument("input-json", type=click.Path(exists=True, dir_okay=False, readable=True)) -def storage_stop(input_json): +def storage_stop(storage, input_json): sebs.utils.global_logging() with open(input_json, "r") as f: cfg = json.load(f) - storage_type = cfg["type"] - storage_cfg, storage_resources = sebs.SeBS.get_storage_config_implementation(storage_type) - config = storage_cfg.deserialize(cfg) + if storage in ["object", "all"]: - if "resources" in cfg: - resources = 
storage_resources.deserialize(cfg["resources"]) - else: - resources = storage_resources() + storage_type = cfg["object"]["type"] + + storage_cfg = sebs.SeBS.get_storage_config_implementation(storage_type) + config = storage_cfg.deserialize(cfg["object"][storage_type]) logging.info(f"Stopping storage deployment of {storage_type}.") - storage = sebs.SeBS.get_storage_implementation(storage_type).deserialize( - config, None, resources - ) - storage.stop() + storage_instance = sebs.SeBS.get_storage_implementation(storage_type).deserialize(config, None, None) + storage_instance.stop() logging.info(f"Stopped storage deployment of {storage_type}.") + if storage in ["nosql", "all"]: + + storage_type = cfg["nosql"]["type"] + + storage_cfg = sebs.SeBS.get_nosql_config_implementation(storage_type) + config = storage_cfg.deserialize(cfg["nosql"][storage_type]) + + logging.info(f"Stopping nosql deployment of {storage_type}.") + storage_instance = sebs.SeBS.get_nosql_implementation(storage_type).deserialize(config, None, None) + storage_instance.stop() + logging.info(f"Stopped nosql deployment of {storage_type}.") @cli.group() def local(): @@ -435,13 +485,9 @@ def local(): @click.argument("benchmark-input-size", type=click.Choice(["test", "small", "large"])) @click.argument("output", type=str) @click.option("--deployments", default=1, type=int, help="Number of deployed containers.") -@click.option("--storage-configuration", type=str, help="JSON configuration of deployed storage.") -@click.option( - "--measure-interval", - type=int, - default=-1, - help="Interval duration between memory measurements in ms.", -) +@click.option("--storage-configuration", type=str, multiple=True, help="JSON configuration of deployed storage.") +@click.option("--measure-interval", type=int, default=-1, + help="Interval duration between memory measurements in ms.") @click.option( "--remove-containers/--no-remove-containers", default=True, @@ -449,7 +495,7 @@ def local(): ) @click.option( "--architecture", - default="x64", + default=None, type=click.Choice(["x64", "arm64"]), help="Target architecture", ) @@ -472,7 +518,8 @@ def start( (config, output_dir, logging_filename, sebs_client, deployment_client) = parse_common_params( update_code=False, update_storage=False, deployment="local", storage_configuration=storage_configuration, - architecture=architecture, container_deployment=False, ignore_cache=False, **kwargs + container_deployment=False, architecture=architecture, + **kwargs ) deployment_client = cast(sebs.local.Local, deployment_client) deployment_client.remove_containers = remove_containers @@ -486,9 +533,12 @@ def start( experiment_config, logging_filename=logging_filename, ) - storage = deployment_client.get_storage(replace_existing=experiment_config.update_storage) - result.set_storage(storage) - input_config = benchmark_obj.prepare_input(storage=storage, size=benchmark_input_size) + input_config = benchmark_obj.prepare_input( + deployment_client.system_resources, + size=benchmark_input_size, + replace_existing=experiment_config.update_storage + ) + result.set_storage(deployment_client.system_resources.get_storage()) result.add_input(input_config) for i in range(deployments): @@ -517,12 +567,6 @@ def stop(input_json, output_json, **kwargs): sebs.utils.global_logging() logging.info(f"Stopping deployment from {os.path.abspath(input_json)}") - (config, output_dir, logging_filename, sebs_client, deployment_client) = parse_common_params( - update_code=False, update_storage=False, - deployment="local", **kwargs - ) - - 
deployment_client.res deployment = sebs.local.Deployment.deserialize(input_json, None) deployment.shutdown(output_json) @@ -588,7 +632,7 @@ def resources_list(resource, **kwargs): ) = parse_common_params(**kwargs) if resource == "buckets": - storage_client = deployment_client.get_storage(False) + storage_client = deployment_client.system_resources.get_storage(False) buckets = storage_client.list_buckets() sebs_client.logging.info("Storage buckets:") for idx, bucket in enumerate(buckets): @@ -629,8 +673,8 @@ def resources_remove(resource, prefix, wait, dry_run, **kwargs): deployment_client, ) = parse_common_params(**kwargs) - storage_client = deployment_client.get_storage(False) - if resource == "storage": + storage_client = deployment_client.system_resources.get_storage(False) + if resource == "buckets": buckets = storage_client.list_buckets() for idx, bucket in enumerate(buckets): @@ -660,6 +704,8 @@ def resources_remove(resource, prefix, wait, dry_run, **kwargs): deployment_client.config.resources.delete_resource_group( deployment_client.cli_instance, group, wait ) + else: + raise NotImplementedError(f"Resource {resource} not supported.") if __name__ == "__main__": diff --git a/sebs/aws/aws.py b/sebs/aws/aws.py index 25a098126..243a6f0f9 100644 --- a/sebs/aws/aws.py +++ b/sebs/aws/aws.py @@ -8,6 +8,8 @@ import boto3 import docker +from sebs.aws.dynamodb import DynamoDB +from sebs.aws.resources import AWSSystemResources from sebs.aws.s3 import S3 from sebs.aws.function import LambdaFunction from sebs.aws.container import ECRContainer @@ -19,7 +21,6 @@ from sebs.config import SeBSConfig from sebs.utils import LoggingHandlers from sebs.faas.function import Function, ExecutionResult, Trigger, FunctionConfig -from sebs.faas.storage import PersistentStorage from sebs.faas.system import System @@ -44,6 +45,10 @@ def function_type() -> "Type[Function]": def config(self) -> AWSConfig: return self._config + @property + def system_resources(self) -> AWSSystemResources: + return cast(AWSSystemResources, self._system_resources) + """ :param cache_client: Function cache instance :param config: Experiments config @@ -58,10 +63,16 @@ def __init__( docker_client: docker.client, logger_handlers: LoggingHandlers, ): - super().__init__(sebs_config, cache_client, docker_client) + super().__init__( + sebs_config, + cache_client, + docker_client, + AWSSystemResources(config, cache_client, docker_client, logger_handlers), + ) self.logging_handlers = logger_handlers self._config = config self.storage: Optional[S3] = None + self.nosql_storage: Optional[DynamoDB] = None def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] = None): # thread-safe @@ -70,7 +81,7 @@ def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] aws_secret_access_key=self.config.credentials.secret_key, ) self.get_lambda_client() - self.get_storage() + self.system_resources.initialize_session(self.session) self.initialize_resources(select_prefix=resource_prefix) self.ecr_client = ECRContainer( @@ -85,33 +96,6 @@ def get_lambda_client(self): ) return self.client - """ - Create a client instance for cloud storage. When benchmark and buckets - parameters are passed, then storage is initialized with required number - of buckets. Buckets may be created or retrieved from cache. - - :param benchmark: benchmark name - :param buckets: tuple of required input/output buckets - :param replace_existing: replace existing files in cached buckets? 
- :return: storage client - """ - - def get_storage(self, replace_existing: bool = False) -> PersistentStorage: - if not self.storage: - self.storage = S3( - self.session, - self.cache_client, - self.config.resources, - self.config.region, - access_key=self.config.credentials.access_key, - secret_key=self.config.credentials.secret_key, - replace_existing=replace_existing, - ) - self.storage.logging_handlers = self.logging_handlers - else: - self.storage.replace_existing = replace_existing - return self.storage - """ It would be sufficient to just pack the code and ship it as zip to AWS. However, to have a compatible function implementation across providers, @@ -209,7 +193,6 @@ def create_function( code_size = code_package.code_size code_bucket: Optional[str] = None func_name = AWS.format_function_name(func_name) - storage_client = self.get_storage() function_cfg = FunctionConfig.from_benchmark(code_package) architecture = function_cfg.architecture.value # we can either check for exception or use list_functions @@ -255,8 +238,9 @@ def create_function( else: code_package_name = cast(str, os.path.basename(package)) + storage_client = self.system_resources.get_storage() code_bucket = storage_client.get_bucket(Resources.StorageBucketType.DEPLOYMENT) - code_prefix = os.path.join(benchmark, architecture, code_package_name) + code_prefix = os.path.join(benchmark, code_package_name) storage_client.upload(code_bucket, package, code_prefix) self.logging.info( @@ -290,6 +274,9 @@ def create_function( self.wait_function_active(lambda_function) + # Update environment variables + self.update_function_configuration(lambda_function, code_package) + # Add LibraryTrigger to a new function from sebs.aws.triggers import LibraryTrigger @@ -353,8 +340,8 @@ def update_function( else: code_package_name = os.path.basename(package) - storage = cast(S3, self.get_storage()) - bucket = function.code_bucket(code_package.benchmark, storage) + storage = self.system_resources.get_storage() + bucket = function.code_bucket(code_package.benchmark, cast(S3, storage)) code_prefix = os.path.join(benchmark, architecture, code_package_name) storage.upload(bucket, package, code_prefix) @@ -368,23 +355,49 @@ def update_function( self.wait_function_updated(function) self.logging.info(f"Updated code of {name} function. ") # and update config - self.client.update_function_configuration( - FunctionName=name, - Timeout=function.config.timeout, - MemorySize=function.config.memory, - ) - self.wait_function_updated(function) - self.logging.info(f"Updated configuration of {name} function. ") - self.wait_function_updated(function) - self.logging.info("Published new function code") + self.update_function_configuration(function, code_package) + + def update_function_configuration( + self, function: Function, code_package: Benchmark, env_variables: dict = {} + ): + + # We can only update storage configuration once it has been processed for this benchmark + assert code_package.has_input_processed + + envs = env_variables.copy() + if code_package.uses_nosql: + + nosql_storage = self.system_resources.get_nosql_storage() + for original_name, actual_name in nosql_storage.get_tables( + code_package.benchmark + ).items(): + envs[f"NOSQL_STORAGE_TABLE_{original_name}"] = actual_name + + # AWS Lambda will overwrite existing variables + # If we modify them, we need to first read existing ones and append. + if len(envs) > 0: + + response = self.client.get_function_configuration(FunctionName=function.name) + # preserve old variables while adding new ones. 
+ # but for conflict, we select the new one + if "Environment" in response: + envs = {**response["Environment"]["Variables"], **envs} - def update_function_configuration(self, function: Function, benchmark: Benchmark): function = cast(LambdaFunction, function) - self.client.update_function_configuration( - FunctionName=function.name, - Timeout=function.config.timeout, - MemorySize=function.config.memory, - ) + # We only update envs if anything new was added + if len(envs) > 0: + self.client.update_function_configuration( + FunctionName=function.name, + Timeout=function.config.timeout, + MemorySize=function.config.memory, + Environment={"Variables": envs}, + ) + else: + self.client.update_function_configuration( + FunctionName=function.name, + Timeout=function.config.timeout, + MemorySize=function.config.memory, + ) self.wait_function_updated(function) self.logging.info(f"Updated configuration of {function.name} function. ") @@ -597,19 +610,16 @@ def create_trigger(self, func: Function, trigger_type: Trigger.TriggerType) -> T self.cache_client.update_function(function) return trigger - def _enforce_cold_start(self, function: Function): + def _enforce_cold_start(self, function: Function, code_package: Benchmark): func = cast(LambdaFunction, function) - self.get_lambda_client().update_function_configuration( - FunctionName=func.name, - Timeout=func.config.timeout, - MemorySize=func.config.memory, - Environment={"Variables": {"ForceColdStart": str(self.cold_start_counter)}}, + self.update_function_configuration( + func, code_package, {"ForceColdStart": str(self.cold_start_counter)} ) def enforce_cold_start(self, functions: List[Function], code_package: Benchmark): self.cold_start_counter += 1 for func in functions: - self._enforce_cold_start(func) + self._enforce_cold_start(func, code_package) self.logging.info("Sent function updates enforcing cold starts.") for func in functions: lambda_function = cast(LambdaFunction, func) diff --git a/sebs/aws/dynamodb.py b/sebs/aws/dynamodb.py new file mode 100644 index 000000000..0f3cc8782 --- /dev/null +++ b/sebs/aws/dynamodb.py @@ -0,0 +1,175 @@ +from collections import defaultdict +from typing import Dict, Optional, Tuple + +from sebs.cache import Cache +from sebs.faas.config import Resources +from sebs.faas.nosql import NoSQLStorage + +import boto3 +from boto3.dynamodb.types import TypeSerializer + + +class DynamoDB(NoSQLStorage): + @staticmethod + def typename() -> str: + return "AWS.DynamoDB" + + @staticmethod + def deployment_name(): + return "aws" + + def __init__( + self, + session: boto3.session.Session, + cache_client: Cache, + resources: Resources, + region: str, + access_key: str, + secret_key: str, + ): + super().__init__(region, cache_client, resources) + self.client = session.client( + "dynamodb", + region_name=region, + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + ) + + # Map benchmark -> name used by benchmark -> actual table_name in AWS + # Example "shopping_cart" -> "sebs-benchmarks--130.crud-api-shopping_cart" + self._tables: Dict[str, Dict[str, str]] = defaultdict(dict) + + self._serializer = TypeSerializer() + + def retrieve_cache(self, benchmark: str) -> bool: + + if benchmark in self._tables: + return True + + cached_storage = self.cache_client.get_nosql_config(self.deployment_name(), benchmark) + if cached_storage is not None: + self._tables[benchmark] = cached_storage["tables"] + return True + + return False + + def update_cache(self, benchmark: str): + + self._cache_client.update_nosql( + 
self.deployment_name(), + benchmark, + { + "tables": self._tables[benchmark], + }, + ) + + def get_tables(self, benchmark: str) -> Dict[str, str]: + return self._tables[benchmark] + + def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: + + if benchmark not in self._tables: + return None + + if table not in self._tables[benchmark]: + return None + + return self._tables[benchmark][table] + + def write_to_table( + self, + benchmark: str, + table: str, + data: dict, + primary_key: Tuple[str, str], + secondary_key: Optional[Tuple[str, str]] = None, + ): + + table_name = self._get_table_name(benchmark, table) + assert table_name is not None + + for key in (primary_key, secondary_key): + if key is not None: + data[key[0]] = key[1] + + serialized_data = {k: self._serializer.serialize(v) for k, v in data.items()} + self.client.put_item(TableName=table_name, Item=serialized_data) + + """ + AWS: create a DynamoDB Table + + In contrast to the hierarchy of database objects in Azure (account -> database -> container) + and GCP (database per benchmark), we need to create unique table names here. + """ + + def create_table( + self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None + ) -> str: + + table_name = f"sebs-benchmarks-{self._cloud_resources.resources_id}-{benchmark}-{name}" + + try: + + definitions = [{"AttributeName": primary_key, "AttributeType": "S"}] + key_schema = [{"AttributeName": primary_key, "KeyType": "HASH"}] + + if secondary_key is not None: + definitions.append({"AttributeName": secondary_key, "AttributeType": "S"}) + key_schema.append({"AttributeName": secondary_key, "KeyType": "RANGE"}) + + ret = self.client.create_table( + TableName=table_name, + BillingMode="PAY_PER_REQUEST", + AttributeDefinitions=definitions, # type: ignore + KeySchema=key_schema, # type: ignore + ) + + if ret["TableDescription"]["TableStatus"] == "CREATING": + + self.logging.info(f"Waiting for creation of DynamoDB table {name}") + waiter = self.client.get_waiter("table_exists") + waiter.wait(TableName=table_name, WaiterConfig={"Delay": 1}) + + self.logging.info(f"Created DynamoDB table {name} for benchmark {benchmark}") + self._tables[benchmark][name] = table_name + + return ret["TableDescription"]["TableName"] + + except self.client.exceptions.ResourceInUseException as e: + + if "already exists" in e.response["Error"]["Message"]: + + # We need this waiter. + # Otheriwise, we still might get later `ResourceNotFoundException` + # when uploading benchmark data. + self.logging.info(f"Waiting for the existing table {table_name} to be created") + waiter = self.client.get_waiter("table_exists") + waiter.wait(TableName=table_name, WaiterConfig={"Delay": 1}) + ret = self.client.describe_table(TableName=table_name) + + self.logging.info( + f"Using existing DynamoDB table {table_name} for benchmark {benchmark}" + ) + self._tables[benchmark][name] = table_name + return name + + if "being created" in e.response["Error"]["Message"]: + + self.logging.info(f"Waiting for the existing table {table_name} to be created") + waiter = self.client.get_waiter("table_exists") + waiter.wait(TableName=table_name, WaiterConfig={"Delay": 1}) + ret = self.client.describe_table(TableName=table_name) + + self.logging.info( + f"Using existing DynamoDB table {table_name} for benchmark {benchmark}" + ) + self._tables[benchmark][name] = table_name + return name + + raise RuntimeError(f"Creating DynamoDB failed, unknown reason! 
Error: {e}") + + def clear_table(self, name: str) -> str: + raise NotImplementedError() + + def remove_table(self, name: str) -> str: + raise NotImplementedError() diff --git a/sebs/aws/resources.py b/sebs/aws/resources.py new file mode 100644 index 000000000..5913c3928 --- /dev/null +++ b/sebs/aws/resources.py @@ -0,0 +1,82 @@ +from typing import cast, Optional + +from sebs.aws.s3 import S3 +from sebs.aws.dynamodb import DynamoDB +from sebs.aws.config import AWSConfig +from sebs.cache import Cache +from sebs.faas.resources import SystemResources +from sebs.faas.storage import PersistentStorage +from sebs.faas.nosql import NoSQLStorage +from sebs.utils import LoggingHandlers + +import boto3 +import docker + + +class AWSSystemResources(SystemResources): + @staticmethod + def typename() -> str: + return "AWS.SystemResources" + + @property + def config(self) -> AWSConfig: + return cast(AWSConfig, self._config) + + def __init__( + self, + config: AWSConfig, + cache_client: Cache, + docker_client: docker.client, + logger_handlers: LoggingHandlers, + ): + super().__init__(config, cache_client, docker_client) + + self._session: Optional[boto3.session.Session] = None + self._logging_handlers = logger_handlers + self._storage: Optional[S3] = None + self._nosql_storage: Optional[DynamoDB] = None + + def initialize_session(self, session: boto3.session.Session): + self._session = session + + """ + Create a client instance for cloud storage. When benchmark and buckets + parameters are passed, then storage is initialized with required number + of buckets. Buckets may be created or retrieved from cache. + + :param replace_existing: replace existing files in cached buckets? + :return: storage client + """ + + def get_storage(self, replace_existing: Optional[bool] = None) -> PersistentStorage: + + if not self._storage: + assert self._session is not None + self.logging.info("Initialize S3 storage instance.") + self._storage = S3( + self._session, + self._cache_client, + self.config.resources, + self.config.region, + access_key=self.config.credentials.access_key, + secret_key=self.config.credentials.secret_key, + replace_existing=replace_existing if replace_existing is not None else False, + ) + self._storage.logging_handlers = self._logging_handlers + elif replace_existing is not None: + self._storage.replace_existing = replace_existing + return self._storage + + def get_nosql_storage(self) -> NoSQLStorage: + if not self._nosql_storage: + assert self._session is not None + self.logging.info("Initialize DynamoDB NoSQL instance.") + self._nosql_storage = DynamoDB( + self._session, + self._cache_client, + self.config.resources, + self.config.region, + access_key=self.config.credentials.access_key, + secret_key=self.config.credentials.secret_key, + ) + return self._nosql_storage diff --git a/sebs/azure/azure.py b/sebs/azure/azure.py index 2f28fbe28..d848d724a 100644 --- a/sebs/azure/azure.py +++ b/sebs/azure/azure.py @@ -11,18 +11,19 @@ from sebs.azure.blob_storage import BlobStorage from sebs.azure.cli import AzureCLI +from sebs.azure.cosmosdb import CosmosDB from sebs.azure.function import AzureFunction from sebs.azure.config import AzureConfig, AzureResources +from sebs.azure.system_resources import AzureSystemResources from sebs.azure.triggers import AzureTrigger, HTTPTrigger from sebs.faas.function import Trigger from sebs.benchmark import Benchmark from sebs.cache import Cache from sebs.config import SeBSConfig from sebs.utils import LoggingHandlers, execute +from sebs.faas.function import Function, 
FunctionConfig, ExecutionResult +from sebs.faas.system import System from sebs.faas.config import Resources -from ..faas.function import Function, FunctionConfig, ExecutionResult -from ..faas.storage import PersistentStorage -from ..faas.system import System class Azure(System): @@ -46,6 +47,10 @@ def config(self) -> AzureConfig: def function_type() -> Type[Function]: return AzureFunction + @property + def cli_instance(self) -> AzureCLI: + return cast(AzureSystemResources, self._system_resources).cli_instance + def __init__( self, sebs_config: SeBSConfig, @@ -54,14 +59,15 @@ def __init__( docker_client: docker.client, logger_handlers: LoggingHandlers, ): - super().__init__(sebs_config, cache_client, docker_client) + super().__init__( + sebs_config, + cache_client, + docker_client, + AzureSystemResources(sebs_config, config, cache_client, docker_client, logger_handlers), + ) self.logging_handlers = logger_handlers self._config = config - def initialize_cli(self, cli: AzureCLI): - self.cli_instance = cli - self.cli_instance_stop = False - """ Start the Docker container running Azure CLI tools. """ @@ -71,30 +77,11 @@ def initialize( config: Dict[str, str] = {}, resource_prefix: Optional[str] = None, ): - if not hasattr(self, "cli_instance"): - self.cli_instance = AzureCLI(self.system_config, self.docker_client) - self.cli_instance_stop = True - - output = self.cli_instance.login( - appId=self.config.credentials.appId, - tenant=self.config.credentials.tenant, - password=self.config.credentials.password, - ) - - subscriptions = json.loads(output) - if len(subscriptions) == 0: - raise RuntimeError("Didn't find any valid subscription on Azure!") - if len(subscriptions) > 1: - raise RuntimeError("Found more than one valid subscription on Azure - not supported!") - - self.config.credentials.subscription_id = subscriptions[0]["id"] - self.initialize_resources(select_prefix=resource_prefix) self.allocate_shared_resource() def shutdown(self): - if self.cli_instance and self.cli_instance_stop: - self.cli_instance.shutdown() + cast(AzureSystemResources, self._system_resources).shutdown() super().shutdown() def find_deployments(self) -> List[str]: @@ -120,33 +107,6 @@ def find_deployments(self) -> List[str]: def allocate_shared_resource(self): self.config.resources.data_storage_account(self.cli_instance) - """ - Create wrapper object for Azure blob storage. - First ensure that storage account is created and connection string - is known. Then, create wrapper and create request number of buckets. - - Requires Azure CLI instance in Docker to obtain storage account details. 
- - :param benchmark: - :param buckets: number of input and output buckets - :param replace_existing: when true, replace existing files in input buckets - :return: Azure storage instance - """ - - def get_storage(self, replace_existing: bool = False) -> PersistentStorage: - if not hasattr(self, "storage"): - self.storage = BlobStorage( - self.config.region, - self.cache_client, - self.config.resources, - self.config.resources.data_storage_account(self.cli_instance).connection_string, - replace_existing=replace_existing, - ) - self.storage.logging_handlers = self.logging_handlers - else: - self.storage.replace_existing = replace_existing - return self.storage - # Directory structure # handler # - source files @@ -252,6 +212,8 @@ def publish_function( "Couldnt find function URL in the output: {}".format(ret.decode("utf-8")) ) + self.logging.info("Sleeping 30 seconds before attempting another query.") + resource_group = self.config.resources.resource_group(self.cli_instance) ret = self.cli_instance.execute( "az functionapp function show --function-name handler " @@ -268,7 +230,8 @@ def publish_function( except RuntimeError as e: error = str(e) # app not found - if "find app with name" in error and repeat_on_failure: + # Azure changed the description as some point + if ("find app with name" in error or "NotFound" in error) and repeat_on_failure: # Sleep because of problems when publishing immediately # after creating function app. time.sleep(30) @@ -304,13 +267,115 @@ def update_function( if container_deployment: raise NotImplementedError("Container deployment is not supported in Azure") + assert code_package.has_input_processed + + # Update environment variables first since it has a non-deterministic + # processing time. + self.update_envs(function, code_package) + # Mount code package in Docker instance container_dest = self._mount_function_code(code_package) - url = self.publish_function(function, code_package, container_dest, True) + function_url = self.publish_function(function, code_package, container_dest, True) + + # Avoid duplication of HTTP trigger + found_trigger = False + for trigger in function.triggers_all(): + + if isinstance(trigger, HTTPTrigger): + found_trigger = True + trigger.url = function_url + break + + if not found_trigger: + trigger = HTTPTrigger( + function_url, self.config.resources.data_storage_account(self.cli_instance) + ) + trigger.logging_handlers = self.logging_handlers + function.add_trigger(trigger) + + def update_envs(self, function: Function, code_package: Benchmark, env_variables: dict = {}): + envs = {} + if code_package.uses_nosql: + + nosql_storage = cast(CosmosDB, self._system_resources.get_nosql_storage()) - trigger = HTTPTrigger(url, self.config.resources.data_storage_account(self.cli_instance)) - trigger.logging_handlers = self.logging_handlers - function.add_trigger(trigger) + # If we use NoSQL, then the handle must be allocated + _, url, creds = nosql_storage.credentials() + db = nosql_storage.benchmark_database(code_package.benchmark) + envs["NOSQL_STORAGE_DATABASE"] = db + envs["NOSQL_STORAGE_URL"] = url + envs["NOSQL_STORAGE_CREDS"] = creds + + for original_name, actual_name in nosql_storage.get_tables( + code_package.benchmark + ).items(): + envs[f"NOSQL_STORAGE_TABLE_{original_name}"] = actual_name + + if code_package.uses_storage: + + envs["STORAGE_CONNECTION_STRING"] = self.config.resources.data_storage_account( + self.cli_instance + ).connection_string + + resource_group = self.config.resources.resource_group(self.cli_instance) + # 
Retrieve existing environment variables to prevent accidental overwrite + if len(envs) > 0: + + try: + self.logging.info( + f"Retrieving existing environment variables for function {function.name}" + ) + + # First read existing properties + response = self.cli_instance.execute( + f"az functionapp config appsettings list --name {function.name} " + f" --resource-group {resource_group} " + ) + old_envs = json.loads(response.decode()) + + # Find custom envs and copy them - unless they are overwritten now + for env in old_envs: + + # Ignore vars set automatically by Azure + found = False + for prefix in ["FUNCTIONS_", "WEBSITE_", "APPINSIGHTS_", "Azure"]: + if env["name"].startswith(prefix): + found = True + break + + # do not overwrite new value + if not found and env["name"] not in envs: + envs[env["name"]] = env["value"] + + except RuntimeError as e: + self.logging.error("Failed to retrieve environment variables!") + self.logging.error(e) + raise e + + if len(envs) > 0: + try: + env_string = "" + for k, v in envs.items(): + env_string += f" {k}={v}" + + self.logging.info(f"Exporting environment variables for function {function.name}") + self.cli_instance.execute( + f"az functionapp config appsettings set --name {function.name} " + f" --resource-group {resource_group} " + f" --settings {env_string} " + ) + + # if we don't do that, next invocation might still see old values + # Disabled since we swapped the order - we first update envs, then we publish. + # self.logging.info( + # "Sleeping for 10 seconds - Azure needs more time to propagate changes. " + # "Otherwise, functions might not see new variables and fail unexpectedly." + # ) + + except RuntimeError as e: + self.logging.error("Failed to set environment variable!") + self.logging.error(e) + raise e def update_function_configuration(self, function: Function, code_package: Benchmark): # FIXME: this does nothing currently - we don't specify timeout @@ -521,16 +586,10 @@ def download_metrics( def _enforce_cold_start(self, function: Function, code_package: Benchmark): - fname = function.name - resource_group = self.config.resources.resource_group(self.cli_instance) - - self.cli_instance.execute( - f"az functionapp config appsettings set --name {fname} " - f" --resource-group {resource_group} " - f" --settings ForceColdStart={self.cold_start_counter}" - ) + self.update_envs(function, code_package, {"ForceColdStart": str(self.cold_start_counter)}) - self.update_function(function, code_package, False, "") + # FIXME: is this sufficient to enforce cold starts? + # self.update_function(function, code_package, False, "") def enforce_cold_start(self, functions: List[Function], code_package: Benchmark): self.cold_start_counter += 1 diff --git a/sebs/azure/cloud_resources.py b/sebs/azure/cloud_resources.py new file mode 100644 index 000000000..e0d2a1ddd --- /dev/null +++ b/sebs/azure/cloud_resources.py @@ -0,0 +1,87 @@ +import json +from typing import Optional + +from sebs.azure.cli import AzureCLI + +""" + Keep a list of deployed special resources in Azure cloud. + + Currently, we have here CosmosDB accounts that require special handling. 
+""" + + +class CosmosDBAccount: + @property + def account_name(self) -> str: + return self._account_name + + @property + def url(self) -> str: + return self._url + + @property + def credential(self) -> str: + return self._credential + + def __init__(self, account_name: str, url: str, credential: str): + super().__init__() + self._account_name = account_name + self._url = url + self._credential = credential + + @staticmethod + def from_cache(account_name: str, url: str, credential: str) -> "CosmosDBAccount": + return CosmosDBAccount(account_name, url, credential) + + @staticmethod + def from_allocation( + account_name: str, resource_group: str, cli_instance: AzureCLI, url: Optional[str] + ) -> "CosmosDBAccount": + + if url is None: + url = CosmosDBAccount.query_url( + account_name, + resource_group, + cli_instance, + ) + + credential = CosmosDBAccount.query_credentials( + account_name, + resource_group, + cli_instance, + ) + + return CosmosDBAccount(account_name, url, credential) + + @staticmethod + def query_url(account_name: str, resource_group: str, cli_instance: AzureCLI) -> str: + + # Find the endpoint URL + ret = cli_instance.execute( + f" az cosmosdb show --name {account_name} " f" --resource-group {resource_group} " + ) + ret = json.loads(ret.decode("utf-8")) + return ret["documentEndpoint"] + + @staticmethod + def query_credentials(account_name: str, resource_group: str, cli_instance: AzureCLI) -> str: + + # Read the master key to access CosmosDB account + ret = cli_instance.execute( + f" az cosmosdb keys list --name {account_name} " f" --resource-group {resource_group} " + ) + ret = json.loads(ret.decode("utf-8")) + credential = ret["primaryMasterKey"] + + return credential + + def serialize(self) -> dict: + return { + "account_name": self._account_name, + "url": self._url, + "credential": self._credential, + } + + @staticmethod + def deserialize(obj: dict) -> "CosmosDBAccount": + return CosmosDBAccount.from_cache(obj["account_name"], obj["url"], obj["credential"]) diff --git a/sebs/azure/config.py b/sebs/azure/config.py index 73944864d..9aef0d8c0 100644 --- a/sebs/azure/config.py +++ b/sebs/azure/config.py @@ -3,10 +3,11 @@ import os import re import uuid -from typing import cast, Any, Dict, List, Optional # noqa +from typing import cast, Dict, List, Optional from sebs.azure.cli import AzureCLI +from sebs.azure.cloud_resources import CosmosDBAccount from sebs.cache import Cache from sebs.faas.config import Config, Credentials, Resources from sebs.utils import LoggingHandlers @@ -154,11 +155,13 @@ def __init__( resource_group: Optional[str] = None, storage_accounts: List["AzureResources.Storage"] = [], data_storage_account: Optional["AzureResources.Storage"] = None, + cosmosdb_account: Optional[CosmosDBAccount] = None, ): super().__init__(name="azure") self._resource_group = resource_group self._storage_accounts = storage_accounts self._data_storage_account = data_storage_account + self._cosmosdb_account = cosmosdb_account def set_region(self, region: str): self._region = region @@ -222,6 +225,68 @@ def delete_resource_group(self, cli_instance: AzureCLI, name: str, wait: bool = self.logging.error(ret.decode()) raise RuntimeError("Failed to delete the resource group!") + """ + Find or create a serverless CosmosDB account. + If not found, then create a new one based on the current resource ID. + Restriction: account names must be globally unique. + + Requires Azure CLI instance in Docker. 
+ """ + + def cosmosdb_account(self, cli_instance: AzureCLI) -> CosmosDBAccount: + # Create resource group if not known + if not self._cosmosdb_account: + + # Only hyphen and alphanumeric characters are allowed + account_name = f"sebs-cosmosdb-account-{self.resources_id}" + account_name = account_name.replace("_", "-") + account_name = account_name.replace(".", "-") + + accounts = self.list_cosmosdb_accounts(cli_instance) + if account_name in accounts: + + self.logging.info("Using existing CosmosDB account {}.".format(account_name)) + url = accounts[account_name] + + else: + + try: + self.logging.info(f"Starting allocation of CosmosDB account {account_name}") + self.logging.info("This can take few minutes :-)!") + ret = cli_instance.execute( + f" az cosmosdb create --name {account_name} " + f" --resource-group {self._resource_group} " + f' --locations regionName="{self._region}" ' + " --capabilities EnableServerless " + ) + ret_values = json.loads(ret.decode()) + url = ret_values["documentEndpoint"] + self.logging.info(f"Allocated CosmosDB account {account_name}") + except Exception: + self.logging.error("Failed to parse the response!") + self.logging.error(ret.decode()) + raise RuntimeError("Failed to parse response from Azure CLI!") + + self._cosmosdb_account = CosmosDBAccount.from_allocation( + account_name, self.resource_group(cli_instance), cli_instance, url + ) + + return self._cosmosdb_account + + def list_cosmosdb_accounts(self, cli_instance: AzureCLI) -> Dict[str, str]: + + ret = cli_instance.execute( + f" az cosmosdb list --resource-group {self._resource_group} " + " --query \"[?starts_with(name,'sebs-cosmosdb-account')]\" " + ) + try: + accounts = json.loads(ret.decode()) + return {x["name"]: x["documentEndpoint"] for x in accounts} + except Exception: + self.logging.error("Failed to parse the response!") + self.logging.error(ret.decode()) + raise RuntimeError("Failed to parse response from Azure CLI!") + """ Retrieve or create storage account associated with benchmark data. 
Last argument allows to override the resource - useful when handling @@ -317,17 +382,23 @@ def initialize(res: Resources, dct: dict): ] else: ret._storage_accounts = [] + if "data_storage_account" in dct: ret._data_storage_account = AzureResources.Storage.deserialize( dct["data_storage_account"] ) + if "cosmosdb_account" in dct: + ret._cosmosdb_account = CosmosDBAccount.deserialize(dct["cosmosdb_account"]) + def serialize(self) -> dict: out = super().serialize() if len(self._storage_accounts) > 0: out["storage_accounts"] = [x.serialize() for x in self._storage_accounts] if self._resource_group: out["resource_group"] = self._resource_group + if self._cosmosdb_account: + out["cosmosdb_account"] = self._cosmosdb_account.serialize() if self._data_storage_account: out["data_storage_account"] = self._data_storage_account.serialize() return out diff --git a/sebs/azure/cosmosdb.py b/sebs/azure/cosmosdb.py new file mode 100644 index 000000000..52f8086b1 --- /dev/null +++ b/sebs/azure/cosmosdb.py @@ -0,0 +1,200 @@ +from dataclasses import dataclass +from typing import cast, Dict, List, Optional, Tuple + +from sebs.azure.cli import AzureCLI +from sebs.azure.cloud_resources import CosmosDBAccount +from sebs.cache import Cache +from sebs.azure.config import AzureResources +from sebs.faas.nosql import NoSQLStorage + +from azure.cosmos import CosmosClient, DatabaseProxy, PartitionKey +from azure.cosmos.exceptions import CosmosResourceNotFoundError + + +@dataclass +class BenchmarkResources: + + database: str + containers: List[str] + # We allocate this dynamically - ignore when caching + database_client: Optional[DatabaseProxy] = None + + def serialize(self) -> dict: + return {"database": self.database, "containers": self.containers} + + @staticmethod + def deserialize(config: dict) -> "BenchmarkResources": + return BenchmarkResources(database=config["database"], containers=config["containers"]) + + +class CosmosDB(NoSQLStorage): + @staticmethod + def typename() -> str: + return "Azure.CosmosDB" + + @staticmethod + def deployment_name(): + return "azure" + + def __init__(self, cli: AzureCLI, cache_client: Cache, resources: AzureResources, region: str): + super().__init__(region, cache_client, resources) + self._cli_instance = cli + self._resource_group = resources.resource_group(self._cli_instance) + + self._benchmark_resources: Dict[str, BenchmarkResources] = {} + self._cosmos_client: Optional[CosmosClient] = None + self._cosmosdb_account: Optional[CosmosDBAccount] = None + + """ + Azure requires no table mappings: the name of container is the same as benchmark name. 
+ """ + + def get_tables(self, benchmark: str) -> Dict[str, str]: + return {} + + def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: + + if benchmark not in self._benchmark_resources: + return None + + if table not in self._benchmark_resources[benchmark].containers: + return None + + return table + + def retrieve_cache(self, benchmark: str) -> bool: + + if benchmark in self._benchmark_resources: + return True + + cached_storage = self.cache_client.get_nosql_config(self.deployment_name(), benchmark) + if cached_storage is not None: + self._benchmark_resources[benchmark] = BenchmarkResources.deserialize(cached_storage) + return True + + return False + + def update_cache(self, benchmark: str): + + self.cache_client.update_nosql( + self.deployment_name(), benchmark, self._benchmark_resources[benchmark].serialize() + ) + + def cosmos_client(self) -> CosmosClient: + + if self._cosmos_client is None: + + self._cosmosdb_account = cast(AzureResources, self._cloud_resources).cosmosdb_account( + self._cli_instance + ) + + self._cosmos_client = CosmosClient( + url=self._cosmosdb_account.url, credential=self._cosmosdb_account.credential + ) + + return self._cosmos_client + + def has_tables(self, benchmark: str) -> bool: + return benchmark in self._benchmark_resources + + def benchmark_database(self, benchmark: str) -> str: + return self._benchmark_resources[benchmark].database + + def credentials(self) -> Tuple[str, str, str]: + + # An update of function that uses fully cached data will have + # to initialize it separately + # There were no prior actions that initialized this variable + if self._cosmosdb_account is None: + self._cosmosdb_account = cast(AzureResources, self._cloud_resources).cosmosdb_account( + self._cli_instance + ) + + return ( + self._cosmosdb_account.account_name, + self._cosmosdb_account.url, + self._cosmosdb_account.credential, + ) + + def write_to_table( + self, + benchmark: str, + table: str, + data: dict, + primary_key: Tuple[str, str], + secondary_key: Optional[Tuple[str, str]] = None, + ): + res = self._benchmark_resources[benchmark] + table_name = self._get_table_name(benchmark, table) + assert table_name is not None + + data[primary_key[0]] = primary_key[1] + # secondary key must have that name in CosmosDB + # FIXME: support both options + assert secondary_key is not None + data["id"] = secondary_key[1] + + if res.database_client is None: + res.database_client = self.cosmos_client().get_database_client(benchmark) + + container_client = res.database_client.get_container_client(table_name) + container_client.create_item(data) + + def create_table( + self, benchmark: str, name: str, primary_key: str, _: Optional[str] = None + ) -> str: + + benchmark_resources = self._benchmark_resources.get(benchmark, None) + + if benchmark_resources is not None and name in benchmark_resources.containers: + self.logging.info(f"Using cached CosmosDB container {name}") + + """ + For some reason, creating the client is enough to verify existence of db/container. + We need to force the client to make some actions; that's why we call read. 
+ """ + # Each benchmark receives its own CosmosDB database + if benchmark_resources is None: + + # Get or allocate database + try: + db_client = self.cosmos_client().get_database_client(benchmark) + db_client.read() + + except CosmosResourceNotFoundError: + self.logging.info(f"Creating CosmosDB database {benchmark}") + db_client = self.cosmos_client().create_database(benchmark) + + benchmark_resources = BenchmarkResources( + database=benchmark, database_client=db_client, containers=[] + ) + self._benchmark_resources[benchmark] = benchmark_resources + + if benchmark_resources.database_client is None: + # Data loaded from cache will miss database client + benchmark_resources.database_client = self.cosmos_client().get_database_client( + benchmark + ) + + try: + + # verify it exists + benchmark_resources.database_client.get_container_client(name).read() + self.logging.info(f"Using existing CosmosDB container {name}") + + except CosmosResourceNotFoundError: + self.logging.info(f"Creating CosmosDB container {name}") + # no container with such name -> allocate + benchmark_resources.database_client.create_container( + id=name, partition_key=PartitionKey(path=f"/{primary_key}") + ) + + benchmark_resources.containers.append(name) + + return name + + def clear_table(self, name: str) -> str: + raise NotImplementedError() + + def remove_table(self, name: str) -> str: + raise NotImplementedError() diff --git a/sebs/azure/system_resources.py b/sebs/azure/system_resources.py new file mode 100644 index 000000000..0e3494d1c --- /dev/null +++ b/sebs/azure/system_resources.py @@ -0,0 +1,111 @@ +import json +from typing import cast, Optional + +from sebs.config import SeBSConfig +from sebs.azure.config import AzureConfig +from sebs.azure.blob_storage import BlobStorage +from sebs.azure.cosmosdb import CosmosDB +from sebs.azure.cli import AzureCLI +from sebs.cache import Cache +from sebs.faas.resources import SystemResources +from sebs.utils import LoggingHandlers + +import docker + + +class AzureSystemResources(SystemResources): + @staticmethod + def typename() -> str: + return "Azure.SystemResources" + + @property + def config(self) -> AzureConfig: + return cast(AzureConfig, self._config) + + def __init__( + self, + system_config: SeBSConfig, + config: AzureConfig, + cache_client: Cache, + docker_client: docker.client, + logger_handlers: LoggingHandlers, + ): + super().__init__(config, cache_client, docker_client) + + self._logging_handlers = logger_handlers + self._storage: Optional[BlobStorage] = None + self._nosql_storage: Optional[CosmosDB] = None + self._cli_instance: Optional[AzureCLI] = None + self._system_config = system_config + + """ + Create wrapper object for Azure blob storage. + First ensure that storage account is created and connection string + is known. Then, create wrapper and create request number of buckets. + + Requires Azure CLI instance in Docker to obtain storage account details. 
+ + :param replace_existing: when true, replace existing files in input buckets + :return: Azure storage instance + """ + + def get_storage(self, replace_existing: Optional[bool] = None) -> BlobStorage: + if self._storage is None: + self._storage = BlobStorage( + self.config.region, + self._cache_client, + self.config.resources, + self.config.resources.data_storage_account(self.cli_instance).connection_string, + replace_existing=replace_existing if replace_existing is not None else False, + ) + self._storage.logging_handlers = self.logging_handlers + elif replace_existing is not None: + self._storage.replace_existing = replace_existing + return self._storage + + def get_nosql_storage(self) -> CosmosDB: + if self._nosql_storage is None: + self._nosql_storage = CosmosDB( + self.cli_instance, self._cache_client, self.config.resources, self.config.region + ) + return self._nosql_storage + + def _login_cli(self): + + assert self._cli_instance is not None + + output = self._cli_instance.login( + appId=self.config.credentials.appId, + tenant=self.config.credentials.tenant, + password=self.config.credentials.password, + ) + + subscriptions = json.loads(output) + if len(subscriptions) == 0: + raise RuntimeError("Didn't find any valid subscription on Azure!") + if len(subscriptions) > 1: + raise RuntimeError("Found more than one valid subscription on Azure - not supported!") + + self.config.credentials.subscription_id = subscriptions[0]["id"] + + @property + def cli_instance(self) -> AzureCLI: + + if self._cli_instance is None: + self._cli_instance = AzureCLI(self._system_config, self._docker_client) + self._cli_instance_stop = True + + self._login_cli() + + return self._cli_instance + + def initialize_cli(self, cli: AzureCLI, login: bool = False): + self._cli_instance = cli + self._cli_instance_stop = False + + if login: + self._login_cli() + + def shutdown(self) -> None: + if self._cli_instance and self._cli_instance_stop: + self._cli_instance.shutdown() diff --git a/sebs/azure/triggers.py b/sebs/azure/triggers.py index 66be8c6de..4296a5880 100644 --- a/sebs/azure/triggers.py +++ b/sebs/azure/triggers.py @@ -31,7 +31,6 @@ def trigger_type() -> Trigger.TriggerType: def sync_invoke(self, payload: dict) -> ExecutionResult: - payload["connection_string"] = self.data_storage_account.connection_string return self._http_invoke(payload, self.url) def async_invoke(self, payload: dict) -> concurrent.futures.Future: diff --git a/sebs/benchmark.py b/sebs/benchmark.py index 331e8db4d..42adb4e7c 100644 --- a/sebs/benchmark.py +++ b/sebs/benchmark.py @@ -12,8 +12,9 @@ from sebs.config import SeBSConfig from sebs.cache import Cache from sebs.faas.config import Resources +from sebs.faas.resources import SystemResources from sebs.utils import find_benchmark, project_absolute_path, LoggingBase -from sebs.faas.storage import PersistentStorage +from sebs.types import BenchmarkModule from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -22,10 +23,13 @@ class BenchmarkConfig: - def __init__(self, timeout: int, memory: int, languages: List["Language"]): + def __init__( + self, timeout: int, memory: int, languages: List["Language"], modules: List[BenchmarkModule] + ): self._timeout = timeout self._memory = memory self._languages = languages + self._modules = modules @property def timeout(self) -> int: @@ -47,6 +51,10 @@ def memory(self, val: int): def languages(self) -> List["Language"]: return self._languages + @property + def modules(self) -> List[BenchmarkModule]: + return self._modules + # FIXME: 3.7+ python with 
future annotations @staticmethod def deserialize(json_object: dict) -> "BenchmarkConfig": @@ -56,6 +64,7 @@ def deserialize(json_object: dict) -> "BenchmarkConfig": json_object["timeout"], json_object["memory"], [Language.deserialize(x) for x in json_object["languages"]], + [BenchmarkModule(x) for x in json_object["modules"]], ) @@ -141,6 +150,18 @@ def language_name(self) -> str: def language_version(self): return self._language_version + @property + def has_input_processed(self) -> bool: + return self._input_processed + + @property + def uses_storage(self) -> bool: + return self._uses_storage + + @property + def uses_nosql(self) -> bool: + return self._uses_nosql + @property def architecture(self) -> str: return self._architecture @@ -210,6 +231,16 @@ def __init__( if config.update_code: self._is_cached_valid = False + # Load input module + + self._benchmark_data_path = find_benchmark(self._benchmark, "benchmarks-data") + self._benchmark_input_module = load_benchmark_input(self._benchmark_path) + + # Check if input has been processed + self._input_processed: bool = False + self._uses_storage: bool = False + self._uses_nosql: bool = False + """ Compute MD5 hash of an entire directory. """ @@ -328,14 +359,27 @@ def add_deployment_files(self, output_dir): shutil.copy2(file, os.path.join(output_dir)) def add_deployment_package_python(self, output_dir): + + destination_file = f"requirements.txt.{self._language_version}" + if not os.path.exists(os.path.join(output_dir, destination_file)): + destination_file = "requirements.txt" + # append to the end of requirements file - packages = self._system_config.deployment_packages( - self._deployment_name, self.language_name - ) - if len(packages): - with open(os.path.join(output_dir, "requirements.txt"), "a") as out: - for package in packages: - out.write(package) + with open(os.path.join(output_dir, destination_file), "a") as out: + + packages = self._system_config.deployment_packages( + self._deployment_name, self.language_name + ) + for package in packages: + out.write(package) + + module_packages = self._system_config.deployment_module_packages( + self._deployment_name, self.language_name + ) + for bench_module in self._benchmark_config.modules: + if bench_module.value in module_packages: + for package in module_packages[bench_module.value]: + out.write(package) def add_deployment_package_nodejs(self, output_dir): # modify package.json @@ -343,7 +387,11 @@ def add_deployment_package_nodejs(self, output_dir): self._deployment_name, self.language_name ) if len(packages): - package_config = os.path.join(output_dir, "package.json") + + package_config = os.path.join(output_dir, f"package.json.{self._language_version}") + if not os.path.exists(package_config): + package_config = os.path.join(output_dir, "package.json") + with open(package_config, "r") as package_file: package_json = json.load(package_file) for key, val in packages.items(): @@ -382,10 +430,11 @@ def install_dependencies(self, output_dir): ) else: repo_name = self._system_config.docker_repository() - image_name = "build.{deployment}.{language}.{runtime}".format( + image_name = "build.{deployment}.{language}.{runtime}-{version}".format( deployment=self._deployment_name, language=self.language_name, runtime=self.language_version, + version=self._system_config.version(), ) try: self._docker_client.images.get(repo_name + ":" + image_name) @@ -453,7 +502,7 @@ def install_dependencies(self, output_dir): environment={"APP": self.benchmark}, # user="1000:1000", user=uid, - # remove=True, + 
remove=True, detach=True, tty=True, command="/bin/bash", @@ -585,36 +634,74 @@ def build( :param size: Benchmark workload size """ - def prepare_input(self, storage: PersistentStorage, size: str): - benchmark_data_path = find_benchmark(self._benchmark, "benchmarks-data") - mod = load_benchmark_input(self._benchmark_path) + def prepare_input( + self, system_resources: SystemResources, size: str, replace_existing: bool = False + ): + + """ + Handle object storage buckets. + """ + if hasattr(self._benchmark_input_module, "buckets_count"): + + buckets = self._benchmark_input_module.buckets_count() + storage = system_resources.get_storage(replace_existing) + input, output = storage.benchmark_data(self.benchmark, buckets) - buckets = mod.buckets_count() - input, output = storage.benchmark_data(self.benchmark, buckets) + self._uses_storage = len(input) > 0 or len(output) > 0 + + storage_func = storage.uploader_func + bucket = storage.get_bucket(Resources.StorageBucketType.BENCHMARKS) + else: + input = [] + output = [] + storage_func = None + bucket = None + + """ + Handle key-value storage. + This part is optional - only selected benchmarks implement this. + """ + if hasattr(self._benchmark_input_module, "allocate_nosql"): + + nosql_storage = system_resources.get_nosql_storage() + for name, table_properties in self._benchmark_input_module.allocate_nosql().items(): + nosql_storage.create_benchmark_tables( + self._benchmark, + name, + table_properties["primary_key"], + table_properties.get("secondary_key"), + ) + + self._uses_nosql = True + nosql_func = nosql_storage.write_to_table + else: + nosql_func = None # buckets = mod.buckets_count() # storage.allocate_buckets(self.benchmark, buckets) # Get JSON and upload data as required by benchmark - input_config = mod.generate_input( - benchmark_data_path, - size, - storage.get_bucket(Resources.StorageBucketType.BENCHMARKS), - input, - output, - storage.uploader_func, + input_config = self._benchmark_input_module.generate_input( + self._benchmark_data_path, size, bucket, input, output, storage_func, nosql_func ) - self._cache_client.update_storage( - storage.deployment_name(), - self._benchmark, - { - "buckets": { - "input": storage.input_prefixes, - "output": storage.output_prefixes, - "input_uploaded": True, - } - }, - ) + # Cache only once we data is in the cloud. 
+ if hasattr(self._benchmark_input_module, "buckets_count"): + self._cache_client.update_storage( + storage.deployment_name(), + self._benchmark, + { + "buckets": { + "input": storage.input_prefixes, + "output": storage.output_prefixes, + "input_uploaded": True, + } + }, + ) + + if hasattr(self._benchmark_input_module, "allocate_nosql"): + nosql_storage.update_cache(self._benchmark) + + self._input_processed = True return input_config @@ -687,15 +774,23 @@ class BenchmarkModuleInterface: def buckets_count() -> Tuple[int, int]: pass + @staticmethod + @abstractmethod + def allocate_nosql() -> dict: + pass + @staticmethod @abstractmethod def generate_input( data_dir: str, size: str, - benchmarks_bucket: str, + benchmarks_bucket: Optional[str], input_paths: List[str], output_paths: List[str], - upload_func: Callable[[int, str, str], None], + upload_func: Optional[Callable[[int, str, str], None]], + nosql_func: Optional[ + Callable[[str, str, dict, Tuple[str, str], Optional[Tuple[str, str]]], None] + ], ) -> Dict[str, str]: pass diff --git a/sebs/cache.py b/sebs/cache.py index 1e4d1e00c..f690e747a 100644 --- a/sebs/cache.py +++ b/sebs/cache.py @@ -172,24 +172,51 @@ def get_functions( """ def get_storage_config(self, deployment: str, benchmark: str): + return self._get_resource_config(deployment, benchmark, "storage") + + def get_nosql_config(self, deployment: str, benchmark: str): + return self._get_resource_config(deployment, benchmark, "nosql") + + def _get_resource_config(self, deployment: str, benchmark: str, resource: str): cfg = self.get_benchmark_config(deployment, benchmark) - return cfg["storage"] if cfg and "storage" in cfg and not self.ignore_storage else None + return cfg[resource] if cfg and resource in cfg and not self.ignore_storage else None def update_storage(self, deployment: str, benchmark: str, config: dict): - benchmark_dir = os.path.join(self.cache_dir, benchmark) - config_path = os.path.join(benchmark_dir, "config.json") + if self.ignore_storage: + return - if self.ignore_storage or not os.path.exists(config_path): - self.logging.debug( - f"Skipping storage update: ignore_storage={self.ignore_storage}, " - "config exists={os.path.exists(config_path)} at {config_path}" - ) + self._update_resources(deployment, benchmark, "storage", config) + + def update_nosql(self, deployment: str, benchmark: str, config: dict): + if self.ignore_storage: + return + self._update_resources(deployment, benchmark, "nosql", config) + + def _update_resources(self, deployment: str, benchmark: str, resource: str, config: dict): + if self.ignore_storage: return + + """ + We now prepare benchmark data before caching the function. + Thus, we have to handle the case where the cache directory does not exist yet.
+ """ + + benchmark_dir = os.path.join(self.cache_dir, benchmark) + os.makedirs(benchmark_dir, exist_ok=True) + with self._lock: - with open(config_path, "r") as fp: - cached_config = json.load(fp) - cached_config[deployment]["storage"] = config - with open(config_path, "w") as fp: + if os.path.exists(os.path.join(benchmark_dir, "config.json")): + with open(os.path.join(benchmark_dir, "config.json"), "r") as fp: + cached_config = json.load(fp) + else: + cached_config = {} + + if deployment in cached_config: + cached_config[deployment][resource] = config + else: + cached_config[deployment] = {resource: config} + + with open(os.path.join(benchmark_dir, "config.json"), "w") as fp: json.dump(cached_config, fp, indent=2) def add_code_package( diff --git a/sebs/config.py b/sebs/config.py index d2e27845a..aa20a8e3f 100644 --- a/sebs/config.py +++ b/sebs/config.py @@ -26,6 +26,13 @@ def deployment_packages(self, deployment_name: str, language_name: str) -> Dict[ "packages" ] + def deployment_module_packages( + self, deployment_name: str, language_name: str + ) -> Dict[str, str]: + return self._system_config[deployment_name]["languages"][language_name]["deployment"][ + "module_packages" + ] + def deployment_files(self, deployment_name: str, language_name: str) -> List[str]: return self._system_config[deployment_name]["languages"][language_name]["deployment"][ "files" @@ -60,6 +67,9 @@ def benchmark_base_images( architecture ] + def version(self) -> str: + return self._system_config["general"].get("SeBS_version", "unknown") + def benchmark_image_name( self, system: str, diff --git a/sebs/experiments/invocation_overhead.py b/sebs/experiments/invocation_overhead.py index 335b184f5..ceccbc8bd 100644 --- a/sebs/experiments/invocation_overhead.py +++ b/sebs/experiments/invocation_overhead.py @@ -78,6 +78,12 @@ def prepare(self, sebs_client: "SeBS", deployment_client: FaaSSystem): self._benchmark = sebs_client.get_benchmark( "030.clock-synchronization", deployment_client, self.config ) + + self.benchmark_input = self._benchmark.prepare_input( + deployment_client.system_resources, size="test", replace_existing=True + ) + self._storage = deployment_client.system_resources.get_storage(replace_existing=True) + self._function = deployment_client.get_function(self._benchmark) triggers = self._function.triggers(Trigger.TriggerType.HTTP) @@ -88,8 +94,6 @@ def prepare(self, sebs_client: "SeBS", deployment_client: FaaSSystem): else: self._trigger = triggers[0] - self._storage = deployment_client.get_storage(replace_existing=True) - self.benchmark_input = self._benchmark.prepare_input(storage=self._storage, size="test") self._out_dir = os.path.join( sebs_client.output_dir, "invocation-overhead", self.settings["type"] ) diff --git a/sebs/experiments/network_ping_pong.py b/sebs/experiments/network_ping_pong.py index 303f6f531..6c44f8480 100644 --- a/sebs/experiments/network_ping_pong.py +++ b/sebs/experiments/network_ping_pong.py @@ -28,9 +28,14 @@ def prepare(self, sebs_client: "SeBS", deployment_client: FaaSSystem): benchmark = sebs_client.get_benchmark( "020.network-benchmark", deployment_client, self.config ) + + self.benchmark_input = benchmark.prepare_input( + deployment_client.system_resources, size="test", replace_existing=True + ) + self._storage = deployment_client.system_resources.get_storage(replace_existing=True) + self._function = deployment_client.get_function(benchmark) - self._storage = deployment_client.get_storage(replace_existing=True) - self.benchmark_input = 
benchmark.prepare_input(storage=self._storage, size="test") + self._out_dir = os.path.join(sebs_client.output_dir, "network-ping-pong") if not os.path.exists(self._out_dir): # shutil.rmtree(self._out_dir) diff --git a/sebs/experiments/perf_cost.py b/sebs/experiments/perf_cost.py index 38e4d4189..80ef2d34f 100644 --- a/sebs/experiments/perf_cost.py +++ b/sebs/experiments/perf_cost.py @@ -46,13 +46,16 @@ def prepare(self, sebs_client: "SeBS", deployment_client: FaaSSystem): self._benchmark = sebs_client.get_benchmark( settings["benchmark"], deployment_client, self.config ) - self._function = deployment_client.get_function(self._benchmark) + # prepare benchmark input - self._storage = deployment_client.get_storage(replace_existing=self.config.update_storage) self._benchmark_input = self._benchmark.prepare_input( - storage=self._storage, size=settings["input-size"] + deployment_client.system_resources, + size=settings["input-size"], + replace_existing=self.config.update_storage, ) + self._function = deployment_client.get_function(self._benchmark) + # add HTTP trigger triggers = self._function.triggers(Trigger.TriggerType.HTTP) if len(triggers) == 0: diff --git a/sebs/faas/container.py b/sebs/faas/container.py index 7ae9c4d37..b17525f7b 100644 --- a/sebs/faas/container.py +++ b/sebs/faas/container.py @@ -196,7 +196,11 @@ def build_base_image( "our documentation. We recommend QEMU as it can be configured to run automatically." ) - buildargs = {"VERSION": language_version, "BASE_IMAGE": builder_image} + buildargs = { + "VERSION": language_version, + "BASE_IMAGE": builder_image, + "TARGET_ARCHITECTURE": architecture, + } image, _ = self.docker_client.images.build( tag=image_uri, path=build_dir, buildargs=buildargs ) diff --git a/sebs/faas/nosql.py b/sebs/faas/nosql.py new file mode 100644 index 000000000..16f9ab119 --- /dev/null +++ b/sebs/faas/nosql.py @@ -0,0 +1,118 @@ +from abc import ABC +from abc import abstractmethod +from typing import Dict, Optional, Tuple + +from sebs.faas.config import Resources +from sebs.cache import Cache +from sebs.utils import LoggingBase + + +class NoSQLStorage(ABC, LoggingBase): + @staticmethod + @abstractmethod + def deployment_name() -> str: + pass + + @property + def cache_client(self) -> Cache: + return self._cache_client + + @property + def region(self): + return self._region + + def __init__(self, region: str, cache_client: Cache, resources: Resources): + super().__init__() + self._cache_client = cache_client + self._cached = False + self._region = region + self._cloud_resources = resources + + @abstractmethod + def get_tables(self, benchmark: str) -> Dict[str, str]: + pass + + @abstractmethod + def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: + pass + + @abstractmethod + def retrieve_cache(self, benchmark: str) -> bool: + pass + + @abstractmethod + def update_cache(self, benchmark: str): + pass + + def envs(self) -> dict: + return {} + + """ + Each table name follows this pattern: + sebs-benchmarks-{resource_id}-{benchmark-name}-{table-name} + + Each implementation should do the following: + (1) Retrieve cached data + (2) Create missing tables that do not exist + (3) Update cached data if anything new was created -> this is done separately + in benchmark.py once the data is uploaded by the benchmark.
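# Editor's sketch of the naming convention described above, assuming resource_id
# comes from the deployment's Resources object; the argument values are illustrative.
def example_table_name(resource_id: str, benchmark: str, table: str) -> str:
    return f"sebs-benchmarks-{resource_id}-{benchmark}-{table}"

# example_table_name("ab12cd", "130.crud-api", "shopping_cart")
# -> "sebs-benchmarks-ab12cd-130.crud-api-shopping_cart"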
+ """ + + def create_benchmark_tables( + self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None + ): + + if self.retrieve_cache(benchmark): + + table_name = self._get_table_name(benchmark, name) + if table_name is not None: + self.logging.info( + f"Using cached NoSQL table {table_name} for benchmark {benchmark}" + ) + return + + self.logging.info(f"Preparing to create a NoSQL table {name} for benchmark {benchmark}") + + self.create_table(benchmark, name, primary_key, secondary_key) + + """ + + AWS: DynamoDB Table + Azure: CosmosDB Container + Google Cloud: Firestore in Datastore Mode, Database + """ + + @abstractmethod + def create_table( + self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None + ) -> str: + pass + + @abstractmethod + def write_to_table( + self, + benchmark: str, + table: str, + data: dict, + primary_key: Tuple[str, str], + secondary_key: Optional[Tuple[str, str]] = None, + ): + pass + + """ + + AWS DynamoDB: Removing & recreating table is the cheapest & fastest option + + Azure CosmosDB: recreate container + + Google Cloud: also likely recreate + + """ + + @abstractmethod + def clear_table(self, name: str) -> str: + pass + + @abstractmethod + def remove_table(self, name: str) -> str: + pass diff --git a/sebs/faas/resources.py b/sebs/faas/resources.py new file mode 100644 index 000000000..140a719e6 --- /dev/null +++ b/sebs/faas/resources.py @@ -0,0 +1,43 @@ +from abc import abstractmethod, ABC +from typing import Optional + +import docker + +from sebs.cache import Cache +from sebs.faas.config import Config +from sebs.faas.storage import PersistentStorage +from sebs.faas.nosql import NoSQLStorage +from sebs.utils import LoggingBase + + +class SystemResources(ABC, LoggingBase): + def __init__(self, config: Config, cache_client: Cache, docker_client: docker.client): + + super().__init__() + + self._config = config + self._cache_client = cache_client + self._docker_client = docker_client + + """ + Access persistent storage instance. + It might be a remote and truly persistent service (AWS S3, Azure Blob..), + or a dynamically allocated local instance. + + :param replace_existing: replace benchmark input data if exists already + """ + + @abstractmethod + def get_storage(self, replace_existing: Optional[bool] = None) -> PersistentStorage: + pass + + """ + Access instance of NoSQL storage. + It might be a remote and truly persistent service (AWS DynamoDB, Azure CosmosDB..), + or a dynamically allocated local instance (ScyllaDB). 
+ + """ + + @abstractmethod + def get_nosql_storage(self) -> NoSQLStorage: + pass diff --git a/sebs/faas/system.py b/sebs/faas/system.py index c2aa7ee3e..9fbe0e273 100644 --- a/sebs/faas/system.py +++ b/sebs/faas/system.py @@ -9,9 +9,9 @@ from sebs.benchmark import Benchmark from sebs.cache import Cache from sebs.config import SeBSConfig +from sebs.faas.resources import SystemResources from sebs.faas.config import Resources from sebs.faas.function import Function, Trigger, ExecutionResult -from sebs.faas.storage import PersistentStorage from sebs.utils import LoggingBase from .config import Config @@ -30,6 +30,7 @@ def __init__( system_config: SeBSConfig, cache_client: Cache, docker_client: docker.client, + system_resources: SystemResources, ): super().__init__() self._system_config = system_config @@ -37,6 +38,8 @@ def __init__( self._cache_client = cache_client self._cold_start_counter = randrange(100) + self._system_resources = system_resources + @property def system_config(self) -> SeBSConfig: return self._system_config @@ -62,6 +65,10 @@ def cold_start_counter(self, val: int): def config(self) -> Config: pass + @property + def system_resources(self) -> SystemResources: + return self._system_resources + @staticmethod @abstractmethod def function_type() -> "Type[Function]": @@ -75,7 +82,7 @@ def find_deployments(self) -> List[str]: This can be overriden, e.g., in Azure that looks for unique """ - return self.get_storage().find_deployments() + return self.system_resources.get_storage().find_deployments() def initialize_resources(self, select_prefix: Optional[str]): @@ -119,7 +126,7 @@ def initialize_resources(self, select_prefix: Optional[str]): self.config.resources.resources_id = res_id self.logging.info(f"Generating unique resource name {res_id}") # ensure that the bucket is created - this allocates the new resource - self.get_storage().get_bucket(Resources.StorageBucketType.BENCHMARKS) + self.system_resources.get_storage().get_bucket(Resources.StorageBucketType.BENCHMARKS) """ Initialize the system. After the call the local or remote @@ -132,18 +139,6 @@ def initialize_resources(self, select_prefix: Optional[str]): def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] = None): pass - """ - Access persistent storage instance. - It might be a remote and truly persistent service (AWS S3, Azure Blob..), - or a dynamically allocated local instance. - - :param replace_existing: replace benchmark input data if exists already - """ - - @abstractmethod - def get_storage(self, replace_existing: bool = False) -> PersistentStorage: - pass - """ Apply the system-specific code packaging routine to build benchmark. The benchmark creates a code directory with the following structure: @@ -276,7 +271,24 @@ def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) be updated if the local version is different. """ functions = code_package.functions - if not functions or func_name not in functions: + + is_function_cached = not (not functions or func_name not in functions) + if is_function_cached: + # retrieve function + cached_function = functions[func_name] + code_location = code_package.code_location + + try: + function = self.function_type().deserialize(cached_function) + except RuntimeError as e: + + self.logging.error( + f"Cached function {cached_function['name']} is no longer available." + ) + self.logging.error(e) + is_function_cached = False + + if not is_function_cached: msg = ( "function name not provided." 
if not func_name @@ -295,10 +307,8 @@ def get_function(self, code_package: Benchmark, func_name: Optional[str] = None) code_package.query_cache() return function else: - # retrieve function - cached_function = functions[func_name] - code_location = code_package.code_location - function = self.function_type().deserialize(cached_function) + + assert function is not None self.cached_function(function) self.logging.info( "Using cached function {fname} in {loc}".format(fname=func_name, loc=code_location) ) diff --git a/sebs/gcp/cli.py b/sebs/gcp/cli.py new file mode 100644 index 000000000..65ca33bc2 --- /dev/null +++ b/sebs/gcp/cli.py @@ -0,0 +1,97 @@ +import logging +import os + +import docker + +from sebs.config import SeBSConfig +from sebs.gcp.config import GCPCredentials +from sebs.utils import LoggingBase + + +class GCloudCLI(LoggingBase): + @staticmethod + def typename() -> str: + return "GCP.CLI" + + def __init__( + self, credentials: GCPCredentials, system_config: SeBSConfig, docker_client: docker.client + ): + + super().__init__() + + repo_name = system_config.docker_repository() + image_name = "manage.gcp" + try: + docker_client.images.get(repo_name + ":" + image_name) + except docker.errors.ImageNotFound: + try: + logging.info( + "Docker pull of image {repo}:{image}".format(repo=repo_name, image=image_name) + ) + docker_client.images.pull(repo_name, image_name) + except docker.errors.APIError: + raise RuntimeError("Docker pull of image {} failed!".format(image_name)) + + volumes = { + os.path.abspath(credentials.gcp_credentials): { + "bind": "/credentials.json", + "mode": "ro", + } + } + self.docker_instance = docker_client.containers.run( + image=repo_name + ":" + image_name, + volumes=volumes, + remove=True, + stdout=True, + stderr=True, + detach=True, + tty=True, + ) + self.logging.info(f"Started gcloud CLI container: {self.docker_instance.id}.") + # while True: + # try: + # dkg = self.docker_instance.logs(stream=True, follow=True) + # next(dkg).decode("utf-8") + # break + # except StopIteration: + # pass + + """ + Execute the given command in the gcloud CLI container. + Throws an exception on failure (commands are expected to execute successfully). + """ + + def execute(self, cmd: str): + exit_code, out = self.docker_instance.exec_run(cmd) + if exit_code != 0: + raise RuntimeError( + "Command {} failed at gcloud CLI docker!\n Output {}".format( + cmd, out.decode("utf-8") + ) + ) + return out + + """ + Run the gcloud auth command on the Docker instance. + + Important: we cannot run "init" as this always requires authenticating through a browser. + Instead, we authenticate as a service account. + + Setting the cloud project will show a warning about missing permissions + for the Cloud Resource Manager API; the reason is unclear, and we do not seem to need it. + + Because of that, it will ask for verification to continue - which we do by passing "Y". + """ + + def login(self, project_name: str): + self.execute("gcloud auth login --cred-file=/credentials.json") + self.execute(f"/bin/bash -c 'gcloud config set project {project_name} <<< Y'") + self.logging.info("gcloud CLI login successful") + + """ + Shuts down the Docker instance.
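# Hedged usage sketch: GCPSystemResources (added later in this patch) logs in once and
# then issues commands through execute(); the command below mirrors the one used by the
# Datastore backend, and the database name here is a made-up example.
def example_describe_database(cli: "GCloudCLI", project_name: str, database_name: str) -> bytes:
    cli.login(project_name)
    # execute() raises RuntimeError on a non-zero exit code and returns the raw output.
    return cli.execute(
        "gcloud firestore databases describe "
        f" --database='{database_name}' "
        " --format='json'"
    )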
+ """ + + def shutdown(self): + self.logging.info("Stopping gcloud CLI manage Docker instance") + self.docker_instance.stop() diff --git a/sebs/gcp/datastore.py b/sebs/gcp/datastore.py new file mode 100644 index 000000000..ae747fb17 --- /dev/null +++ b/sebs/gcp/datastore.py @@ -0,0 +1,180 @@ +from dataclasses import dataclass +from typing import Dict, List, Tuple, Optional + +from sebs.cache import Cache +from sebs.faas.config import Resources +from sebs.faas.nosql import NoSQLStorage +from sebs.gcp.cli import GCloudCLI + +from google.cloud import datastore + + +@dataclass +class BenchmarkResources: + + database: str + kinds: List[str] + # We allocate this dynamically - ignore when caching + database_client: Optional[datastore.Client] = None + + def serialize(self) -> dict: + return {"database": self.database, "kinds": self.kinds} + + @staticmethod + def deserialize(config: dict) -> "BenchmarkResources": + return BenchmarkResources(database=config["database"], kinds=config["kinds"]) + + +class Datastore(NoSQLStorage): + @staticmethod + def typename() -> str: + return "GCP.Datastore" + + @staticmethod + def deployment_name(): + return "gcp" + + def __init__( + self, cli_instance: GCloudCLI, cache_client: Cache, resources: Resources, region: str + ): + super().__init__(region, cache_client, resources) + self._cli_instance = cli_instance + self._region = region + + # Mapping: benchmark -> Datastore database + self._benchmark_resources: Dict[str, BenchmarkResources] = {} + + """ + GCP requires no table mappings: the name of "kind" is the same as benchmark name. + """ + + def get_tables(self, benchmark: str) -> Dict[str, str]: + return {} + + def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: + + if benchmark not in self._benchmark_resources: + return None + + if table not in self._benchmark_resources[benchmark].kinds: + return None + + return table + + def retrieve_cache(self, benchmark: str) -> bool: + + if benchmark in self._benchmark_resources: + return True + + cached_storage = self.cache_client.get_nosql_config(self.deployment_name(), benchmark) + if cached_storage is not None: + self._benchmark_resources[benchmark] = BenchmarkResources.deserialize(cached_storage) + return True + + return False + + def update_cache(self, benchmark: str): + + self._cache_client.update_nosql( + self.deployment_name(), benchmark, self._benchmark_resources[benchmark].serialize() + ) + + def benchmark_database(self, benchmark: str) -> str: + return self._benchmark_resources[benchmark].database + + def write_to_table( + self, + benchmark: str, + table: str, + data: dict, + primary_key: Tuple[str, str], + secondary_key: Optional[Tuple[str, str]] = None, + ): + + res = self._benchmark_resources[benchmark] + table_name = self._get_table_name(benchmark, table) + + # FIXME: support both options + assert secondary_key is not None + + if res.database_client is None: + res.database_client = datastore.Client(database=res.database) + + parent_key = res.database_client.key(secondary_key[0], secondary_key[1]) + key = res.database_client.key( + # kind determines the table + table_name, + # main ID key + secondary_key[1], + # organization key + parent=parent_key, + ) + + val = datastore.Entity(key=key) + val.update(data) + res.database_client.put(val) + + def create_table( + self, benchmark: str, name: str, primary_key: str, _: Optional[str] = None + ) -> str: + + benchmark_resources = self._benchmark_resources.get(benchmark, None) + + if benchmark_resources is not None and name in 
benchmark_resources.kinds: + self.logging.info(f"Using cached Datastore kind {name}") + return name + + """ + No data for this benchmark -> we need to allocate a new Datastore database. + """ + + if benchmark_resources is None: + + database_name = f"sebs-benchmarks-{self._cloud_resources.resources_id}-{benchmark}" + + try: + + self._cli_instance.execute( + "gcloud firestore databases describe " + f" --database='{database_name}' " + " --format='json'" + ) + + except RuntimeError as e: + + if "NOT_FOUND" in str(e): + + """ + Allocate a new Firestore database, in datastore mode + """ + + self.logging.info(f"Allocating a new Firestore database {database_name}") + self._cli_instance.execute( + "gcloud firestore databases create " + f" --database='{database_name}' " + f" --location={self.region} " + f" --type='datastore-mode' " + ) + self.logging.info(f"Allocated a new Firestore database {database_name}") + + else: + + self.logging.error("Couldn't query Datastore instances!") + self.logging.error(e) + raise RuntimeError("Couldn't query Datastore instances!") + + db_client = datastore.Client(database=database_name) + benchmark_resources = BenchmarkResources( + database=database_name, kinds=[], database_client=db_client + ) + self._benchmark_resources[benchmark] = benchmark_resources + + benchmark_resources.kinds.append(name) + + return name + + def clear_table(self, name: str) -> str: + raise NotImplementedError() + + def remove_table(self, name: str) -> str: + raise NotImplementedError() diff --git a/sebs/gcp/gcp.py b/sebs/gcp/gcp.py index 87be34506..187d8cda8 100644 --- a/sebs/gcp/gcp.py +++ b/sebs/gcp/gcp.py @@ -11,16 +11,16 @@ from googleapiclient.discovery import build from googleapiclient.errors import HttpError -from google.cloud import monitoring_v3 +import google.cloud.monitoring_v3 as monitoring_v3 from sebs.cache import Cache from sebs.config import SeBSConfig from sebs.benchmark import Benchmark -from ..faas.function import Function, FunctionConfig, Trigger -from .storage import PersistentStorage +from sebs.faas.function import Function, FunctionConfig, Trigger from sebs.faas.config import Resources -from ..faas.system import System +from sebs.faas.system import System from sebs.gcp.config import GCPConfig +from sebs.gcp.resources import GCPSystemResources from sebs.gcp.storage import GCPStorage from sebs.gcp.function import GCPFunction from sebs.utils import LoggingHandlers @@ -43,9 +43,15 @@ def __init__( docker_client: docker.client, logging_handlers: LoggingHandlers, ): - super().__init__(system_config, cache_client, docker_client) + super().__init__( + system_config, + cache_client, + docker_client, + GCPSystemResources( + system_config, config, cache_client, docker_client, logging_handlers + ), + ) self._config = config - self.storage: Optional[GCPStorage] = None self.logging_handlers = logging_handlers @property @@ -74,36 +80,11 @@ def function_type() -> "Type[Function]": def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] = None): self.function_client = build("cloudfunctions", "v1", cache_discovery=False) - self.get_storage() self.initialize_resources(select_prefix=resource_prefix) def get_function_client(self): return self.function_client - """ - Access persistent storage instance. - It might be a remote and truly persistent service (AWS S3, Azure Blob..), - or a dynamically allocated local instance. 
- - :param replace_existing: replace benchmark input data if exists already - """ - - def get_storage( - self, - replace_existing: bool = False, - benchmark=None, - buckets=None, - ) -> PersistentStorage: - if not self.storage: - self.storage = GCPStorage( - self.config.region, self.cache_client, self.config.resources, replace_existing - ) - self.storage.logging_handlers = self.logging_handlers - else: - self.storage.replace_existing = replace_existing - return self.storage - - # @staticmethod def default_function_name( self, code_package: Benchmark, resources: Optional[Resources] = None ) -> str: @@ -120,9 +101,10 @@ def default_function_name( @staticmethod def format_function_name(func_name: str) -> str: # GCP functions must begin with a letter + # however, we now add by default `sebs` in the beginning func_name = func_name.replace("-", "_") func_name = func_name.replace(".", "_") - return f"function-{func_name}" + return func_name """ Apply the system-specific code packaging routine to build benchmark. @@ -171,10 +153,6 @@ def package_code( file = os.path.join(directory, file) shutil.move(file, function_dir) - requirements = open(os.path.join(directory, "requirements.txt"), "w") - requirements.write("google-cloud-storage") - requirements.close() - # rename handler function.py since in gcp it has to be caled main.py old_name, new_name = HANDLER[language_name] old_path = os.path.join(directory, old_name) @@ -222,7 +200,7 @@ def create_function( timeout = code_package.benchmark_config.timeout memory = code_package.benchmark_config.memory code_bucket: Optional[str] = None - storage_client = self.get_storage() + storage_client = self._system_resources.get_storage() location = self.config.region project_name = self.config.project_name function_cfg = FunctionConfig.from_benchmark(code_package) @@ -242,6 +220,9 @@ def create_function( try: get_req.execute() except HttpError: + + envs = self._generate_function_envs(code_package) + create_req = ( self.function_client.projects() .locations() @@ -259,11 +240,13 @@ def create_function( "httpsTrigger": {}, "ingressSettings": "ALLOW_ALL", "sourceArchiveUrl": "gs://" + code_bucket + "/" + code_prefix, + "environmentVariables": envs, }, ) ) create_req.execute() self.logging.info(f"Function {func_name} has been created!") + allow_unauthenticated_req = ( self.function_client.projects() .locations() @@ -279,7 +262,27 @@ def create_function( }, ) ) - allow_unauthenticated_req.execute() + + # Avoid infinite loop + MAX_RETRIES = 5 + counter = 0 + while counter < MAX_RETRIES: + try: + allow_unauthenticated_req.execute() + break + except HttpError: + + self.logging.info( + "Sleeping for 5 seconds because the created functions is not yet available!" + ) + time.sleep(5) + counter += 1 + else: + raise RuntimeError( + f"Failed to configure function {full_func_name} " + "for unauthenticated invocations!" 
+ ) + self.logging.info(f"Function {func_name} accepts now unauthenticated invocations!") function = GCPFunction( @@ -365,12 +368,14 @@ def update_function( function_cfg = FunctionConfig.from_benchmark(code_package) architecture = function_cfg.architecture.value code_package_name = os.path.basename(code_package.code_location) + storage = cast(GCPStorage, self._system_resources.get_storage()) code_package_name = f"{architecture}-{code_package_name}" - storage = cast(GCPStorage, self.get_storage()) - bucket = function.code_bucket(code_package.benchmark, storage) storage.upload(bucket, code_package.code_location, code_package_name) + + envs = self._generate_function_envs(code_package) + self.logging.info(f"Uploaded new code package to {bucket}/{code_package_name}") full_func_name = GCP.get_full_function_name( self.config.project_name, self.config.region, function.name @@ -389,6 +394,7 @@ def update_function( "timeout": str(function.config.timeout) + "s", "httpsTrigger": {}, "sourceArchiveUrl": "gs://" + bucket + "/" + code_package_name, + "environmentVariables": envs, }, ) ) @@ -412,24 +418,85 @@ def update_function( ) self.logging.info("Published new function code and configuration.") - def update_function_configuration(self, function: Function, benchmark: Benchmark): + def _update_envs(self, full_function_name: str, envs: dict) -> dict: + + get_req = ( + self.function_client.projects().locations().functions().get(name=full_function_name) + ) + response = get_req.execute() + + # preserve old variables while adding new ones. + # but for conflict, we select the new one + if "environmentVariables" in response: + envs = {**response["environmentVariables"], **envs} + + return envs + + def _generate_function_envs(self, code_package: Benchmark) -> dict: + + envs = {} + if code_package.uses_nosql: + + db = ( + cast(GCPSystemResources, self._system_resources) + .get_nosql_storage() + .benchmark_database(code_package.benchmark) + ) + envs["NOSQL_STORAGE_DATABASE"] = db + + return envs + + def update_function_configuration( + self, function: Function, code_package: Benchmark, env_variables: dict = {} + ): + + assert code_package.has_input_processed + function = cast(GCPFunction, function) full_func_name = GCP.get_full_function_name( self.config.project_name, self.config.region, function.name ) - req = ( - self.function_client.projects() - .locations() - .functions() - .patch( - name=full_func_name, - updateMask="availableMemoryMb,timeout", - body={ - "availableMemoryMb": function.config.memory, - "timeout": str(function.config.timeout) + "s", - }, + + envs = self._generate_function_envs(code_package) + envs = {**envs, **env_variables} + # GCP might overwrite existing variables + # If we modify them, we need to first read existing ones and append. 
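# Hedged illustration of the merge performed by _update_envs() above: variables already
# set on the function are preserved, while a conflicting new value wins; the names and
# values below are made up.
existing_envs = {"NOSQL_STORAGE_DATABASE": "sebs-benchmarks-ab12cd-130.crud-api", "cold_start": "1"}
new_envs = {"cold_start": "2"}
merged_envs = {**existing_envs, **new_envs}
assert merged_envs == {
    "NOSQL_STORAGE_DATABASE": "sebs-benchmarks-ab12cd-130.crud-api",
    "cold_start": "2",
}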
+ if len(envs) > 0: + envs = self._update_envs(full_func_name, envs) + + if len(envs) > 0: + + req = ( + self.function_client.projects() + .locations() + .functions() + .patch( + name=full_func_name, + updateMask="availableMemoryMb,timeout,environmentVariables", + body={ + "availableMemoryMb": function.config.memory, + "timeout": str(function.config.timeout) + "s", + "environmentVariables": envs, + }, + ) ) - ) + + else: + + req = ( + self.function_client.projects() + .locations() + .functions() + .patch( + name=full_func_name, + updateMask="availableMemoryMb,timeout", + body={ + "availableMemoryMb": function.config.memory, + "timeout": str(function.config.timeout) + "s", + }, + ) + ) + res = req.execute() versionId = res["metadata"]["versionId"] retries = 0 @@ -450,15 +517,20 @@ def update_function_configuration(self, function: Function, benchmark: Benchmark ) self.logging.info("Published new function configuration.") + return versionId + @staticmethod def get_full_function_name(project_name: str, location: str, func_name: str): return f"projects/{project_name}/locations/{location}/functions/{func_name}" def prepare_experiment(self, benchmark): - logs_bucket = self.storage.add_output_bucket(benchmark, suffix="logs") + logs_bucket = self._system_resources.get_storage().add_output_bucket( + benchmark, suffix="logs" + ) return logs_bucket def shutdown(self) -> None: + cast(GCPSystemResources, self._system_resources).shutdown() super().shutdown() def download_metrics( @@ -484,7 +556,7 @@ def wrapper(gen): There shouldn't be problem of waiting for complete results, since logs appear very quickly here. """ - from google.cloud import logging as gcp_logging + import google.cloud.logging as gcp_logging logging_client = gcp_logging.Client() logger = logging_client.logger("cloudfunctions.googleapis.com%2Fcloud-functions") @@ -579,24 +651,12 @@ def wrapper(gen): } ] - def _enforce_cold_start(self, function: Function): + def _enforce_cold_start(self, function: Function, code_package: Benchmark): - name = GCP.get_full_function_name( - self.config.project_name, self.config.region, function.name - ) self.cold_start_counter += 1 - req = ( - self.function_client.projects() - .locations() - .functions() - .patch( - name=name, - updateMask="environmentVariables", - body={"environmentVariables": {"cold_start": str(self.cold_start_counter)}}, - ) + new_version = self.update_function_configuration( + function, code_package, {"cold_start": str(self.cold_start_counter)} ) - res = req.execute() - new_version = res["metadata"]["versionId"] return new_version @@ -604,7 +664,7 @@ def enforce_cold_start(self, functions: List[Function], code_package: Benchmark) new_versions = [] for func in functions: - new_versions.append((self._enforce_cold_start(func), func)) + new_versions.append((self._enforce_cold_start(func, code_package), func)) self.cold_start_counter -= 1 # verify deployment diff --git a/sebs/gcp/resources.py b/sebs/gcp/resources.py new file mode 100644 index 000000000..0a7d5c14d --- /dev/null +++ b/sebs/gcp/resources.py @@ -0,0 +1,85 @@ +from typing import cast, Optional + +from sebs.config import SeBSConfig +from sebs.gcp.config import GCPConfig +from sebs.gcp.storage import GCPStorage +from sebs.gcp.datastore import Datastore +from sebs.gcp.cli import GCloudCLI +from sebs.cache import Cache +from sebs.faas.resources import SystemResources +from sebs.utils import LoggingHandlers + +import docker + + +class GCPSystemResources(SystemResources): + @staticmethod + def typename() -> str: + return 
"GCP.SystemResources" + + @property + def config(self) -> GCPConfig: + return cast(GCPConfig, self._config) + + def __init__( + self, + system_config: SeBSConfig, + config: GCPConfig, + cache_client: Cache, + docker_client: docker.client, + logger_handlers: LoggingHandlers, + ): + super().__init__(config, cache_client, docker_client) + + self._logging_handlers = logger_handlers + self._storage: Optional[GCPStorage] = None + self._nosql_storage: Optional[Datastore] = None + self._cli_instance: Optional[GCloudCLI] = None + self._system_config = system_config + + """ + Access persistent storage instance. + It might be a remote and truly persistent service (AWS S3, Azure Blob..), + or a dynamically allocated local instance. + + :param replace_existing: replace benchmark input data if exists already + """ + + def get_storage(self, replace_existing: Optional[bool] = None) -> GCPStorage: + if not self._storage: + self._storage = GCPStorage( + self.config.region, + self._cache_client, + self.config.resources, + replace_existing if replace_existing is not None else False, + ) + self._storage.logging_handlers = self._logging_handlers + elif replace_existing is not None: + self._storage.replace_existing = replace_existing + return self._storage + + def get_nosql_storage(self) -> Datastore: + if not self._nosql_storage: + self._nosql_storage = Datastore( + self.cli_instance, self._cache_client, self.config.resources, self.config.region + ) + return self._nosql_storage + + @property + def cli_instance(self) -> GCloudCLI: + if self._cli_instance is None: + self._cli_instance = GCloudCLI( + self.config.credentials, self._system_config, self._docker_client + ) + self._cli_instance_stop = True + + self._cli_instance.login(self.config.credentials.project_name) + return self._cli_instance + + def initialize_cli(self, cli: GCloudCLI): + self._cli_instance = cli + self._cli_instance_stop = False + + def shutdown(self) -> None: + if self._cli_instance and self._cli_instance_stop: + self._cli_instance.shutdown() diff --git a/sebs/gcp/storage.py b/sebs/gcp/storage.py index a5b0e0064..c578966f1 100644 --- a/sebs/gcp/storage.py +++ b/sebs/gcp/storage.py @@ -3,7 +3,7 @@ import uuid from typing import List, Optional -from google.cloud import storage as gcp_storage +import google.cloud.storage as gcp_storage from google.api_core import exceptions from sebs.cache import Cache diff --git a/sebs/local/config.py b/sebs/local/config.py index 6f503d0ca..0b512c67c 100644 --- a/sebs/local/config.py +++ b/sebs/local/config.py @@ -2,7 +2,8 @@ from sebs.cache import Cache from sebs.faas.config import Config, Credentials, Resources -from sebs.storage.minio import MinioConfig +from sebs.storage.resources import SelfHostedResources +from sebs.storage.config import NoSQLStorageConfig, PersistentStorageConfig from sebs.utils import LoggingHandlers @@ -21,38 +22,30 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Creden """ -class LocalResources(Resources): - def __init__(self, storage_cfg: Optional[MinioConfig] = None): +class LocalResources(SelfHostedResources): + def __init__( + self, + storage_cfg: Optional[PersistentStorageConfig] = None, + nosql_storage_cfg: Optional[NoSQLStorageConfig] = None, + ): self._path: str = "" - super().__init__(name="local") - self._storage = storage_cfg + super().__init__("local", storage_cfg, nosql_storage_cfg) self._allocated_ports: Set[int] = set() - @property - def storage_config(self) -> Optional[MinioConfig]: - return self._storage - @property def 
allocated_ports(self) -> set: return self._allocated_ports def serialize(self) -> dict: - out: dict = {} + out = super().serialize() + out["allocated_ports"] = list(self._allocated_ports) - if self._storage is not None: - out["storage"] = self._storage.serialize() return out @staticmethod def initialize(res: Resources, config: dict): resources = cast(LocalResources, res) - # Check for new config - if "storage" in config: - resources._storage = MinioConfig.deserialize(config["storage"]) - resources.logging.info( - "Using user-provided configuration of storage for local containers." - ) if "allocated_ports" in config: resources._allocated_ports = set(config["allocated_ports"]) @@ -62,14 +55,14 @@ def update_cache(self, cache: Cache): cache.update_config( val=list(self._allocated_ports), keys=["local", "resources", "allocated_ports"] ) - if self._storage is not None: - self._storage.update_cache(["local", "resources", "storage"], cache) @staticmethod def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Resources: ret = LocalResources() cached_config = cache.get_config("local") + ret._deserialize(ret, config, cached_config) + # Load cached values if cached_config and "resources" in cached_config: LocalResources.initialize(ret, cached_config["resources"]) diff --git a/sebs/local/function.py b/sebs/local/function.py index dc0e1a35f..f0104a4e0 100644 --- a/sebs/local/function.py +++ b/sebs/local/function.py @@ -72,6 +72,14 @@ def __init__( self._measurement_pid = measurement_pid + @property + def container(self) -> docker.models.containers.Container: + return self._instance + + @container.setter + def container(self, instance: docker.models.containers.Container): + self._instance = instance + @property def url(self) -> str: return self._url diff --git a/sebs/local/local.py b/sebs/local/local.py index b2567efc8..32b9f9ffb 100644 --- a/sebs/local/local.py +++ b/sebs/local/local.py @@ -10,12 +10,11 @@ from sebs.cache import Cache from sebs.config import SeBSConfig +from sebs.storage.resources import SelfHostedSystemResources from sebs.utils import LoggingHandlers, is_linux from sebs.local.config import LocalConfig -from sebs.local.storage import Minio from sebs.local.function import LocalFunction from sebs.faas.function import Function, FunctionConfig, ExecutionResult, Trigger -from sebs.faas.storage import PersistentStorage from sebs.faas.system import System from sebs.faas.config import Resources from sebs.benchmark import Benchmark @@ -69,7 +68,14 @@ def __init__( docker_client: docker.client, logger_handlers: LoggingHandlers, ): - super().__init__(sebs_config, cache_client, docker_client) + super().__init__( + sebs_config, + cache_client, + docker_client, + SelfHostedSystemResources( + "local", config, cache_client, docker_client, logger_handlers + ), + ) self.logging_handlers = logger_handlers self._config = config self._remove_containers = True @@ -80,30 +86,9 @@ def __init__( self.initialize_resources(select_prefix="local") """ - Create wrapper object for minio storage and fill buckets. - Starts minio as a Docker instance, using always fresh buckets. - - :param benchmark: - :param buckets: number of input and output buckets - :param replace_existing: not used. - :return: Azure storage instance + Shut down minio storage instance. 
""" - def get_storage(self, replace_existing: bool = False) -> PersistentStorage: - if not hasattr(self, "storage"): - - if not self.config.resources.storage_config: - raise RuntimeError( - "The local deployment is missing the configuration of pre-allocated storage!" - ) - self.storage = Minio.deserialize( - self.config.resources.storage_config, self.cache_client, self.config.resources - ) - self.storage.logging_handlers = self.logging_handlers - else: - self.storage.replace_existing = replace_existing - return self.storage - def shutdown(self): super().shutdown() @@ -155,34 +140,34 @@ def package_code( return directory, bytes_size, "" - def create_function( - self, - code_package: Benchmark, - func_name: str, - container_deployment: bool, - container_uri: str, - ) -> "LocalFunction": - - if container_deployment: - raise NotImplementedError("Container deployment is not supported in Local") + def _start_container( + self, code_package: Benchmark, func_name: str, func: Optional[LocalFunction] + ) -> LocalFunction: container_name = "{}:run.local.{}.{}".format( self._system_config.docker_repository(), code_package.language_name, code_package.language_version, ) - environment: Dict[str, str] = {} + + environment = { + "CONTAINER_UID": str(os.getuid()), + "CONTAINER_GID": str(os.getgid()), + "CONTAINER_USER": self._system_config.username(self.name(), code_package.language_name), + } if self.config.resources.storage_config: - environment = { - "MINIO_ADDRESS": self.config.resources.storage_config.address, - "MINIO_ACCESS_KEY": self.config.resources.storage_config.access_key, - "MINIO_SECRET_KEY": self.config.resources.storage_config.secret_key, - "CONTAINER_UID": str(os.getuid()), - "CONTAINER_GID": str(os.getgid()), - "CONTAINER_USER": self._system_config.username( - self.name(), code_package.language_name - ), - } + + environment = {**self.config.resources.storage_config.envs(), **environment} + + if code_package.uses_nosql: + + nosql_storage = self.system_resources.get_nosql_storage() + environment = {**environment, **nosql_storage.envs()} + + for original_name, actual_name in nosql_storage.get_tables( + code_package.benchmark + ).items(): + environment[f"NOSQL_STORAGE_TABLE_{original_name}"] = actual_name # FIXME: make CPUs configurable # FIXME: configure memory @@ -256,16 +241,20 @@ def create_function( ) pid = proc.pid - function_cfg = FunctionConfig.from_benchmark(code_package) - func = LocalFunction( - container, - port, - func_name, - code_package.benchmark, - code_package.hash, - function_cfg, - pid, - ) + if func is None: + function_cfg = FunctionConfig.from_benchmark(code_package) + func = LocalFunction( + container, + port, + func_name, + code_package.benchmark, + code_package.hash, + function_cfg, + pid, + ) + else: + func.container = container + func._measurement_pid = pid # Wait until server starts max_attempts = 10 @@ -281,16 +270,29 @@ def create_function( if attempts == max_attempts: raise RuntimeError( f"Couldn't start {func_name} function at container " - f"{container.id} , running on {func._url}" + f"{container.id} , running on {func.url}" ) self.logging.info( f"Started {func_name} function at container {container.id} , running on {func._url}" ) + return func + def create_function( + self, + code_package: Benchmark, + func_name: str, + container_deployment: bool, + container_uri: str, + ) -> "LocalFunction": + + if container_deployment: + raise NotImplementedError("Container deployment is not supported in Local") + return self._start_container(code_package, func_name, None) 
+ """ - FIXME: restart Docker? + Restart Docker container """ def update_function( @@ -300,7 +302,10 @@ def update_function( container_deployment: bool, container_uri: str, ): - pass + func = cast(LocalFunction, function) + func.stop() + self.logging.info("Allocating a new function container with updated code") + self._start_container(code_package, function.name, func) """ For local functions, we don't need to do anything for a cached function. diff --git a/sebs/local/storage.py b/sebs/local/storage.py deleted file mode 100644 index 9563deb49..000000000 --- a/sebs/local/storage.py +++ /dev/null @@ -1,27 +0,0 @@ -import docker - -from sebs.faas.config import Resources -from sebs.storage import minio -from sebs.storage.config import MinioConfig -from sebs.cache import Cache - - -class Minio(minio.Minio): - @staticmethod - def deployment_name() -> str: - return "local" - - def __init__( - self, - docker_client: docker.client, - cache_client: Cache, - res: Resources, - replace_existing: bool, - ): - super().__init__(docker_client, cache_client, res, replace_existing) - - @staticmethod - def deserialize( - cached_config: MinioConfig, cache_client: Cache, resources: Resources - ) -> "Minio": - return super(Minio, Minio)._deserialize(cached_config, cache_client, resources, Minio) diff --git a/sebs/openwhisk/config.py b/sebs/openwhisk/config.py index 056456080..bba54f7c7 100644 --- a/sebs/openwhisk/config.py +++ b/sebs/openwhisk/config.py @@ -3,7 +3,7 @@ from sebs.cache import Cache from sebs.faas.config import Credentials, Resources, Config from sebs.utils import LoggingHandlers -from sebs.storage.config import MinioConfig +from sebs.storage.resources import SelfHostedResources from typing import cast, Optional @@ -17,7 +17,7 @@ def serialize(self) -> dict: return {} -class OpenWhiskResources(Resources): +class OpenWhiskResources(SelfHostedResources): def __init__( self, registry: Optional[str] = None, @@ -30,7 +30,6 @@ def __init__( self._docker_username = username if username != "" else None self._docker_password = password if password != "" else None self._registry_updated = registry_updated - self._storage: Optional[MinioConfig] = None self._storage_updated = False @staticmethod @@ -49,10 +48,6 @@ def docker_username(self) -> Optional[str]: def docker_password(self) -> Optional[str]: return self._docker_password - @property - def storage_config(self) -> Optional[MinioConfig]: - return self._storage - @property def storage_updated(self) -> bool: return self._storage_updated @@ -78,6 +73,8 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Resour ret, cached_config["resources"] ) + ret._deserialize(ret, config, cached_config) + # Check for new config - overrides but check if it's different if "docker_registry" in config: @@ -109,33 +106,6 @@ def deserialize(config: dict, cache: Cache, handlers: LoggingHandlers) -> Resour ret.logging_handlers = handlers ret._registry_updated = True - # Check for new config - if "storage" in config: - ret._storage = MinioConfig.deserialize(config["storage"]) - ret.logging.info("Using user-provided configuration of storage for OpenWhisk.") - - # check if there has been an update - if not ( - cached_config - and "resources" in cached_config - and "storage" in cached_config["resources"] - and cached_config["resources"]["storage"] == config["storage"] - ): - ret.logging.info( - "User-provided configuration is different from cached storage, " - "we will update existing OpenWhisk actions." 
- ) - ret._storage_updated = True - - # Load cached values - elif ( - cached_config - and "resources" in cached_config - and "storage" in cached_config["resources"] - ): - ret._storage = MinioConfig.deserialize(cached_config["resources"]["storage"]) - ret.logging.info("Using cached configuration of storage for OpenWhisk.") - return ret def update_cache(self, cache: Cache): @@ -149,8 +119,6 @@ def update_cache(self, cache: Cache): cache.update_config( val=self.docker_password, keys=["openwhisk", "resources", "docker", "password"] ) - if self._storage: - self._storage.update_cache(["openwhisk", "resources", "storage"], cache) def serialize(self) -> dict: out: dict = { @@ -159,8 +127,6 @@ def serialize(self) -> dict: "docker_username": self.docker_username, "docker_password": self.docker_password, } - if self._storage: - out = {**out, "storage": self._storage.serialize()} return out diff --git a/sebs/openwhisk/function.py b/sebs/openwhisk/function.py index 624b1250f..daf851ca6 100644 --- a/sebs/openwhisk/function.py +++ b/sebs/openwhisk/function.py @@ -5,7 +5,7 @@ from sebs.benchmark import Benchmark from sebs.faas.function import Function, FunctionConfig, Runtime -from sebs.storage.config import MinioConfig +from sebs.storage.config import MinioConfig, ScyllaDBConfig @dataclass @@ -14,14 +14,16 @@ class OpenWhiskFunctionConfig(FunctionConfig): # FIXME: merge with higher level abstraction for images docker_image: str = "" namespace: str = "_" - storage: Optional[MinioConfig] = None + object_storage: Optional[MinioConfig] = None + nosql_storage: Optional[ScyllaDBConfig] = None @staticmethod def deserialize(data: dict) -> OpenWhiskFunctionConfig: keys = list(OpenWhiskFunctionConfig.__dataclass_fields__.keys()) data = {k: v for k, v in data.items() if k in keys} data["runtime"] = Runtime.deserialize(data["runtime"]) - data["storage"] = MinioConfig.deserialize(data["storage"]) + data["object_storage"] = MinioConfig.deserialize(data["object_storage"]) + data["nosql_storage"] = ScyllaDBConfig.deserialize(data["nosql_storage"]) return OpenWhiskFunctionConfig(**data) def serialize(self) -> dict: diff --git a/sebs/openwhisk/openwhisk.py b/sebs/openwhisk/openwhisk.py index 5501369b0..9c196fe25 100644 --- a/sebs/openwhisk/openwhisk.py +++ b/sebs/openwhisk/openwhisk.py @@ -6,11 +6,13 @@ from sebs.benchmark import Benchmark from sebs.cache import Cache -from sebs.faas import System, PersistentStorage +from sebs.faas import System from sebs.faas.function import Function, ExecutionResult, Trigger from sebs.openwhisk.container import OpenWhiskContainer -from sebs.openwhisk.storage import Minio from sebs.openwhisk.triggers import LibraryTrigger, HTTPTrigger +from sebs.storage.resources import SelfHostedSystemResources +from sebs.storage.minio import Minio +from sebs.storage.scylladb import ScyllaDB from sebs.utils import LoggingHandlers from sebs.faas.config import Resources from .config import OpenWhiskConfig @@ -29,7 +31,14 @@ def __init__( docker_client: docker.client, logger_handlers: LoggingHandlers, ): - super().__init__(system_config, cache_client, docker_client) + super().__init__( + system_config, + cache_client, + docker_client, + SelfHostedSystemResources( + "openwhisk", config, cache_client, docker_client, logger_handlers + ), + ) self._config = config self.logging_handlers = logger_handlers @@ -57,21 +66,6 @@ def initialize(self, config: Dict[str, str] = {}, resource_prefix: Optional[str] def config(self) -> OpenWhiskConfig: return self._config - def get_storage(self, replace_existing: bool = 
False) -> PersistentStorage: - if not hasattr(self, "storage"): - - if not self.config.resources.storage_config: - raise RuntimeError( - "OpenWhisk is missing the configuration of pre-allocated storage!" - ) - self.storage = Minio.deserialize( - self.config.resources.storage_config, self.cache_client, self.config.resources - ) - self.storage.logging_handlers = self.logging_handlers - else: - self.storage.replace_existing = replace_existing - return self.storage - def shutdown(self) -> None: if hasattr(self, "storage") and self.config.shutdownStorage: self.storage.stop() @@ -133,19 +127,40 @@ def package_code( self.logging.info("Zip archive size {:2f} MB".format(bytes_size / 1024.0 / 1024.0)) return benchmark_archive, bytes_size, image_uri - def storage_arguments(self) -> List[str]: - storage = cast(Minio, self.get_storage()) - return [ - "-p", - "MINIO_STORAGE_SECRET_KEY", - storage.config.secret_key, - "-p", - "MINIO_STORAGE_ACCESS_KEY", - storage.config.access_key, - "-p", - "MINIO_STORAGE_CONNECTION_URL", - storage.config.address, - ] + def storage_arguments(self, code_package: Benchmark) -> List[str]: + envs = [] + + if self.config.resources.storage_config: + + storage_envs = self.config.resources.storage_config.envs() + envs = [ + "-p", + "MINIO_STORAGE_SECRET_KEY", + storage_envs["MINIO_SECRET_KEY"], + "-p", + "MINIO_STORAGE_ACCESS_KEY", + storage_envs["MINIO_ACCESS_KEY"], + "-p", + "MINIO_STORAGE_CONNECTION_URL", + storage_envs["MINIO_ADDRESS"], + ] + + if code_package.uses_nosql: + + nosql_storage = self.system_resources.get_nosql_storage() + for key, value in nosql_storage.envs().items(): + envs.append("-p") + envs.append(key) + envs.append(value) + + for original_name, actual_name in nosql_storage.get_tables( + code_package.benchmark + ).items(): + envs.append("-p") + envs.append(f"NOSQL_STORAGE_TABLE_{original_name}") + envs.append(actual_name) + + return envs def create_function( self, @@ -170,7 +185,10 @@ def create_function( break function_cfg = OpenWhiskFunctionConfig.from_benchmark(code_package) - function_cfg.storage = cast(Minio, self.get_storage()).config + function_cfg.object_storage = cast(Minio, self.system_resources.get_storage()).config + function_cfg.nosql_storage = cast( + ScyllaDB, self.system_resources.get_nosql_storage() + ).config if function_found: # docker image is overwritten by the update res = OpenWhiskFunction( @@ -203,7 +221,7 @@ def create_function( str(code_package.benchmark_config.memory), "--timeout", str(code_package.benchmark_config.timeout * 1000), - *self.storage_arguments(), + *self.storage_arguments(code_package), code_package.code_location, ], stderr=subprocess.PIPE, @@ -261,7 +279,7 @@ def update_function( str(code_package.benchmark_config.memory), "--timeout", str(code_package.benchmark_config.timeout * 1000), - *self.storage_arguments(), + *self.storage_arguments(code_package), code_package.code_location, ], stderr=subprocess.PIPE, @@ -292,7 +310,7 @@ def update_function_configuration(self, function: Function, code_package: Benchm str(code_package.benchmark_config.memory), "--timeout", str(code_package.benchmark_config.timeout * 1000), - *self.storage_arguments(), + *self.storage_arguments(code_package), ], stderr=subprocess.PIPE, stdout=subprocess.PIPE, @@ -310,15 +328,25 @@ def update_function_configuration(self, function: Function, code_package: Benchm def is_configuration_changed(self, cached_function: Function, benchmark: Benchmark) -> bool: changed = super().is_configuration_changed(cached_function, benchmark) - storage = cast(Minio, 
self.get_storage()) + storage = cast(Minio, self.system_resources.get_storage()) function = cast(OpenWhiskFunction, cached_function) # check if now we're using a new storage - if function.config.storage != storage.config: + if function.config.object_storage != storage.config: self.logging.info( "Updating function configuration due to changed storage configuration." ) changed = True - function.config.storage = storage.config + function.config.object_storage = storage.config + + nosql_storage = cast(ScyllaDB, self.system_resources.get_nosql_storage()) + function = cast(OpenWhiskFunction, cached_function) + # check if now we're using a new storage + if function.config.nosql_storage != nosql_storage.config: + self.logging.info( + "Updating function configuration due to changed NoSQL storage configuration." + ) + changed = True + function.config.nosql_storage = nosql_storage.config return changed diff --git a/sebs/openwhisk/storage.py b/sebs/openwhisk/storage.py deleted file mode 100644 index 79e8e17cf..000000000 --- a/sebs/openwhisk/storage.py +++ /dev/null @@ -1,27 +0,0 @@ -import docker - -from sebs.faas.config import Resources -from sebs.storage import minio -from sebs.storage.config import MinioConfig -from sebs.cache import Cache - - -class Minio(minio.Minio): - @staticmethod - def deployment_name() -> str: - return "openwhisk" - - def __init__( - self, - docker_client: docker.client, - cache_client: Cache, - res: Resources, - replace_existing: bool, - ): - super().__init__(docker_client, cache_client, res, replace_existing) - - @staticmethod - def deserialize( - cached_config: MinioConfig, cache_client: Cache, resources: Resources - ) -> "Minio": - return super(Minio, Minio)._deserialize(cached_config, cache_client, resources, Minio) diff --git a/sebs/regression.py b/sebs/regression.py index 5450e4e04..579760a1c 100644 --- a/sebs/regression.py +++ b/sebs/regression.py @@ -16,6 +16,7 @@ benchmarks_python = [ "110.dynamic-html", "120.uploader", + "130.crud-api", "210.thumbnailer", "220.video-processing", "311.compression", @@ -45,18 +46,33 @@ class TestSequenceMeta(type): def __init__( - cls, name, bases, attrs, benchmarks, architectures, deployments, deployment_name, triggers + cls, + name, + bases, + attrs, + benchmarks, + architectures, + deployments, + deployment_name, + triggers, ): type.__init__(cls, name, bases, attrs) cls.deployment_name = deployment_name cls.triggers = triggers def __new__( - mcs, name, bases, dict, benchmarks, architectures, deployments, deployment_name, triggers + mcs, + name, + bases, + dict, + benchmarks, + architectures, + deployments, + deployment_name, + triggers, ): def gen_test(benchmark_name, architecture, deployment_type): def test(self): - log_name = f"Regression-{deployment_name}-{benchmark_name}-{deployment_type}" logger = logging.getLogger(log_name) logger.setLevel(logging.INFO) @@ -65,7 +81,9 @@ def test(self): self.experiment_config["architecture"] = architecture self.experiment_config["container_deployment"] = deployment_type == "container" - deployment_client = self.get_deployment(benchmark_name, architecture) + deployment_client = self.get_deployment( + benchmark_name, architecture, deployment_type + ) deployment_client.disable_rich_output() logging_wrapper.info( @@ -78,13 +96,14 @@ def test(self): benchmark = self.client.get_benchmark( benchmark_name, deployment_client, experiment_config ) - storage = deployment_client.get_storage( - replace_existing=experiment_config.update_storage + input_config = benchmark.prepare_input( + 
deployment_client.system_resources, + size="test", + replace_existing=experiment_config.update_storage, ) func = deployment_client.get_function( benchmark, deployment_client.default_function_name(benchmark) ) - input_config = benchmark.prepare_input(storage=storage, size="test") failure = False for trigger_type in triggers: @@ -121,11 +140,8 @@ def test(self): return test for benchmark in benchmarks: - for architecture in architectures: - for deployment_type in deployments: - # for trigger in triggers: test_name = f"test_{deployment_name}_{benchmark}" test_name += f"_{architecture}_{deployment_type}" @@ -149,19 +165,18 @@ class AWSTestSequencePython( def typename(self) -> str: return "AWSTestPython" - def get_deployment(self, benchmark_name, architecture): + def get_deployment(self, benchmark_name, architecture, deployment_type): deployment_name = "aws" assert cloud_config + + f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( cloud_config, - logging_filename=os.path.join( - self.client.output_dir, - f"regression_{deployment_name}_{benchmark_name}_{architecture}.log", - ), + logging_filename=os.path.join(self.client.output_dir, f), ) with AWSTestSequencePython.lock: - deployment_client.initialize(resource_prefix="regression") + deployment_client.initialize(resource_prefix="regr") return deployment_client @@ -174,18 +189,16 @@ class AWSTestSequenceNodejs( deployment_name="aws", triggers=[Trigger.TriggerType.LIBRARY, Trigger.TriggerType.HTTP], ): - def get_deployment(self, benchmark_name, architecture): + def get_deployment(self, benchmark_name, architecture, deployment_type): deployment_name = "aws" assert cloud_config + f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( cloud_config, - logging_filename=os.path.join( - self.client.output_dir, - f"regression_{deployment_name}_{benchmark_name}_{architecture}.log", - ), + logging_filename=os.path.join(self.client.output_dir, f), ) with AWSTestSequenceNodejs.lock: - deployment_client.initialize(resource_prefix="regression") + deployment_client.initialize(resource_prefix="regr") return deployment_client @@ -198,14 +211,13 @@ class AzureTestSequencePython( deployment_name="azure", triggers=[Trigger.TriggerType.HTTP], ): - def get_deployment(self, benchmark_name, architecture): + def get_deployment(self, benchmark_name, architecture, deployment_type): deployment_name = "azure" assert cloud_config with AzureTestSequencePython.lock: - if not AzureTestSequencePython.cfg: AzureTestSequencePython.cfg = self.client.get_deployment_config( - cloud_config, + cloud_config["deployment"], logging_filename=os.path.join( self.client.output_dir, f"regression_{deployment_name}_{benchmark_name}_{architecture}.log", @@ -217,15 +229,16 @@ def get_deployment(self, benchmark_name, architecture): self.client.config, self.client.docker_client ) + f = f"regression_{deployment_name}_{benchmark_name}_" + f += f"{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( cloud_config, - logging_filename=os.path.join( - self.client.output_dir, - f"regression_{deployment_name}_{benchmark_name}_{architecture}.log", - ), + logging_filename=os.path.join(self.client.output_dir, f), deployment_config=AzureTestSequencePython.cfg, ) - deployment_client.initialize_cli(cli=AzureTestSequencePython.cli) + deployment_client.system_resources.initialize_cli( + cli=AzureTestSequencePython.cli, 
login=True + ) deployment_client.initialize(resource_prefix="regr") return deployment_client @@ -239,13 +252,13 @@ class AzureTestSequenceNodejs( deployment_name="azure", triggers=[Trigger.TriggerType.HTTP], ): - def get_deployment(self, benchmark_name, architecture): + def get_deployment(self, benchmark_name, architecture, deployment_type): deployment_name = "azure" assert cloud_config with AzureTestSequenceNodejs.lock: if not AzureTestSequenceNodejs.cfg: AzureTestSequenceNodejs.cfg = self.client.get_deployment_config( - cloud_config, + cloud_config["deployment"], logging_filename=f"regression_{deployment_name}_{benchmark_name}.log", ) @@ -254,15 +267,14 @@ def get_deployment(self, benchmark_name, architecture): self.client.config, self.client.docker_client ) + f = f"regression_{deployment_name}_{benchmark_name}_" + f += f"{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( cloud_config, - logging_filename=os.path.join( - self.client.output_dir, - f"regression_{deployment_name}_{benchmark_name}_{architecture}.log", - ), + logging_filename=os.path.join(self.client.output_dir, f), deployment_config=AzureTestSequencePython.cfg, ) - deployment_client.initialize_cli(cli=AzureTestSequenceNodejs.cli) + deployment_client.system_resources.initialize_cli(cli=AzureTestSequenceNodejs.cli) deployment_client.initialize(resource_prefix="regr") return deployment_client @@ -276,18 +288,16 @@ class GCPTestSequencePython( deployment_name="gcp", triggers=[Trigger.TriggerType.HTTP], ): - def get_deployment(self, benchmark_name, architecture): + def get_deployment(self, benchmark_name, architecture, deployment_type): deployment_name = "gcp" assert cloud_config + f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( cloud_config, - logging_filename=os.path.join( - self.client.output_dir, - f"regression_{deployment_name}_{benchmark_name}_{architecture}.log", - ), + logging_filename=os.path.join(self.client.output_dir, f), ) with GCPTestSequencePython.lock: - deployment_client.initialize(resource_prefix="regression") + deployment_client.initialize(resource_prefix="regr") return deployment_client @@ -300,18 +310,16 @@ class GCPTestSequenceNodejs( deployment_name="gcp", triggers=[Trigger.TriggerType.HTTP], ): - def get_deployment(self, benchmark_name, architecture): + def get_deployment(self, benchmark_name, architecture, deployment_type): deployment_name = "gcp" assert cloud_config + f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( cloud_config, - logging_filename=os.path.join( - self.client.output_dir, - f"regression_{deployment_name}_{benchmark_name}_{architecture}.log", - ), + logging_filename=os.path.join(self.client.output_dir, f), ) with GCPTestSequenceNodejs.lock: - deployment_client.initialize(resource_prefix="regression") + deployment_client.initialize(resource_prefix="regr") return deployment_client @@ -324,18 +332,21 @@ class OpenWhiskTestSequencePython( deployment_name="openwhisk", triggers=[Trigger.TriggerType.HTTP], ): - def get_deployment(self, benchmark_name, architecture): + def get_deployment(self, benchmark_name, architecture, deployment_type): deployment_name = "openwhisk" assert cloud_config + + config_copy = cloud_config.copy() + config_copy["experiments"]["architecture"] = architecture + config_copy["experiments"]["container_deployment"] = deployment_type == "container" + + f = 
f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( - cloud_config, - logging_filename=os.path.join( - self.client.output_dir, - f"regression_{deployment_name}_{benchmark_name}_{architecture}.log", - ), + config_copy, + logging_filename=os.path.join(self.client.output_dir, f), ) with OpenWhiskTestSequencePython.lock: - deployment_client.initialize(resource_prefix="regression") + deployment_client.initialize(resource_prefix="regr") return deployment_client @@ -348,18 +359,21 @@ class OpenWhiskTestSequenceNodejs( deployment_name="openwhisk", triggers=[Trigger.TriggerType.HTTP], ): - def get_deployment(self, benchmark_name, architecture): + def get_deployment(self, benchmark_name, architecture, deployment_type): deployment_name = "openwhisk" assert cloud_config + + config_copy = cloud_config.copy() + config_copy["experiments"]["architecture"] = architecture + config_copy["experiments"]["container_deployment"] = deployment_type == "container" + + f = f"regression_{deployment_name}_{benchmark_name}_{architecture}_{deployment_type}.log" deployment_client = self.client.get_deployment( - cloud_config, - logging_filename=os.path.join( - self.client.output_dir, - f"regression_{deployment_name}_{benchmark_name}_{architecture}.log", - ), + config_copy, + logging_filename=os.path.join(self.client.output_dir, f), ) with OpenWhiskTestSequenceNodejs.lock: - deployment_client.initialize(resource_prefix="regression") + deployment_client.initialize(resource_prefix="regr") return deployment_client @@ -395,9 +409,12 @@ def status(self, *args, **kwargs): def filter_out_benchmarks( - benchmark: str, deployment_name: str, language: str, language_version: str, architecture: str + benchmark: str, + deployment_name: str, + language: str, + language_version: str, + architecture: str, ) -> bool: - # fmt: off if (deployment_name == "aws" and language == "python" and language_version in ["3.9", "3.10", "3.11"]): @@ -462,7 +479,6 @@ def regression_suite( # mypy is confused here for case in suite: for test in case: # type: ignore - # skip test_name = cast(unittest.TestCase, test)._testMethodName diff --git a/sebs/sebs.py b/sebs/sebs.py index ed3fc580a..309c0b253 100644 --- a/sebs/sebs.py +++ b/sebs/sebs.py @@ -3,7 +3,6 @@ import docker -import sebs.storage from sebs import types from sebs.local import Local from sebs.cache import Cache @@ -11,7 +10,9 @@ from sebs.benchmark import Benchmark from sebs.faas.system import System as FaaSSystem from sebs.faas.storage import PersistentStorage +from sebs.faas.nosql import NoSQLStorage from sebs.faas.config import Config +from sebs.storage import minio, config, scylladb from sebs.utils import has_platform, LoggingHandlers, LoggingBase from sebs.experiments.config import Config as ExperimentConfig @@ -199,19 +200,28 @@ def get_benchmark( @staticmethod def get_storage_implementation(storage_type: types.Storage) -> Type[PersistentStorage]: - _storage_implementations = {types.Storage.MINIO: sebs.storage.minio.Minio} + _storage_implementations = {types.Storage.MINIO: minio.Minio} + impl = _storage_implementations.get(storage_type) + assert impl + return impl + + @staticmethod + def get_nosql_implementation(storage_type: types.NoSQLStorage) -> Type[NoSQLStorage]: + _storage_implementations = {types.NoSQLStorage.SCYLLADB: scylladb.ScyllaDB} impl = _storage_implementations.get(storage_type) assert impl return impl @staticmethod def get_storage_config_implementation(storage_type: types.Storage): - 
_storage_implementations = { - types.Storage.MINIO: ( - sebs.storage.config.MinioConfig, - sebs.storage.config.MinioResources, - ) - } + _storage_implementations = {types.Storage.MINIO: config.MinioConfig} + impl = _storage_implementations.get(storage_type) + assert impl + return impl + + @staticmethod + def get_nosql_config_implementation(storage_type: types.NoSQLStorage): + _storage_implementations = {types.NoSQLStorage.SCYLLADB: config.ScyllaDBConfig} impl = _storage_implementations.get(storage_type) assert impl return impl diff --git a/sebs/storage/config.py b/sebs/storage/config.py index 9d0978973..cd47df391 100644 --- a/sebs/storage/config.py +++ b/sebs/storage/config.py @@ -1,37 +1,25 @@ -from typing import cast, List +from abc import ABC +from abc import abstractmethod +from typing import List from dataclasses import dataclass, field from sebs.cache import Cache -from sebs.faas.config import Resources -class MinioResources(Resources): - def __init__(self): - super().__init__(name="minio") - - @staticmethod - def initialize(res: Resources, dct: dict): - ret = cast(MinioResources, res) - super(MinioResources, MinioResources).initialize(ret, dct) - return ret - +@dataclass +class PersistentStorageConfig(ABC): + @abstractmethod def serialize(self) -> dict: - return super().serialize() + pass - @staticmethod - def deserialize(config: dict) -> "Resources": # type: ignore - - ret = MinioResources() - MinioResources.initialize(ret, {}) - return ret - - def update_cache(self, cache: Cache): - super().update_cache(cache) + @abstractmethod + def envs(self) -> dict: + pass @dataclass -class MinioConfig: +class MinioConfig(PersistentStorageConfig): address: str = "" mapped_port: int = -1 access_key: str = "" @@ -39,6 +27,8 @@ class MinioConfig: instance_id: str = "" output_buckets: List[str] = field(default_factory=list) input_buckets: List[str] = field(default_factory=lambda: []) + version: str = "" + data_volume: str = "" type: str = "minio" def update_cache(self, path: List[str], cache: Cache): @@ -55,9 +45,54 @@ def deserialize(data: dict) -> "MinioConfig": data = {k: v for k, v in data.items() if k in keys} cfg = MinioConfig(**data) - # cfg.resources = cast(MinioResources, MinioResources.deserialize(data["resources"])) return cfg def serialize(self) -> dict: - return self.__dict__ # , "resources": self.resources.serialize()} + return self.__dict__ + + def envs(self) -> dict: + return { + "MINIO_ADDRESS": self.address, + "MINIO_ACCESS_KEY": self.access_key, + "MINIO_SECRET_KEY": self.secret_key, + } + + +@dataclass +class NoSQLStorageConfig(ABC): + @abstractmethod + def serialize(self) -> dict: + pass + + +@dataclass +class ScyllaDBConfig(NoSQLStorageConfig): + address: str = "" + mapped_port: int = -1 + alternator_port: int = 8000 + access_key: str = "None" + secret_key: str = "None" + instance_id: str = "" + region: str = "None" + cpus: int = -1 + memory: int = -1 + version: str = "" + data_volume: str = "" + + def update_cache(self, path: List[str], cache: Cache): + + for key in ScyllaDBConfig.__dataclass_fields__.keys(): + cache.update_config(val=getattr(self, key), keys=[*path, key]) + + @staticmethod + def deserialize(data: dict) -> "ScyllaDBConfig": + keys = list(ScyllaDBConfig.__dataclass_fields__.keys()) + data = {k: v for k, v in data.items() if k in keys} + + cfg = ScyllaDBConfig(**data) + + return cfg + + def serialize(self) -> dict: + return self.__dict__ diff --git a/sebs/storage/minio.py b/sebs/storage/minio.py index 551584209..757d2025f 100644 --- a/sebs/storage/minio.py +++ 
b/sebs/storage/minio.py @@ -9,10 +9,10 @@ import minio from sebs.cache import Cache -from sebs.types import Storage as StorageTypes from sebs.faas.config import Resources from sebs.faas.storage import PersistentStorage from sebs.storage.config import MinioConfig +from sebs.utils import project_absolute_path from sebs.utils import is_linux @@ -44,6 +44,10 @@ def __init__( def config(self) -> MinioConfig: return self._cfg + @config.setter + def config(self, config: MinioConfig): + self._cfg = config + @staticmethod def _define_http_client(): """ @@ -63,17 +67,31 @@ def _define_http_client(): ), ) - def start(self, port: int = 9000): + def start(self): + + if self._cfg.data_volume == "": + minio_volume = os.path.join(project_absolute_path(), "minio-volume") + else: + minio_volume = self._cfg.data_volume + minio_volume = os.path.abspath(minio_volume) + + os.makedirs(minio_volume, exist_ok=True) + volumes = { + minio_volume: { + "bind": "/data", + "mode": "rw", + } + } - self._cfg.mapped_port = port self._cfg.access_key = secrets.token_urlsafe(32) self._cfg.secret_key = secrets.token_hex(32) self._cfg.address = "" self.logging.info("Minio storage ACCESS_KEY={}".format(self._cfg.access_key)) self.logging.info("Minio storage SECRET_KEY={}".format(self._cfg.secret_key)) try: + self.logging.info(f"Starting storage Minio on port {self._cfg.mapped_port}") self._storage_container = self._docker_client.containers.run( - "minio/minio:latest", + f"minio/minio:{self._cfg.version}", command="server /data", network_mode="bridge", ports={"9000": str(self._cfg.mapped_port)}, @@ -81,6 +99,7 @@ def start(self, port: int = 9000): "MINIO_ACCESS_KEY": self._cfg.access_key, "MINIO_SECRET_KEY": self._cfg.secret_key, }, + volumes=volumes, remove=True, stdout=True, stderr=True, @@ -234,19 +253,24 @@ def upload(self, bucket_name: str, filepath: str, key: str): raise NotImplementedError() def serialize(self) -> dict: - return { - **self._cfg.serialize(), - "type": StorageTypes.MINIO, - } + return self._cfg.serialize() + + """ + This implementation supports overriding this class. + The main Minio class is used to start/stop deployments. + + When overriding the implementation in Local/OpenWhisk/..., + we call the _deserialize and provide an alternative implementation. 
+ """ T = TypeVar("T", bound="Minio") @staticmethod def _deserialize( - cached_config: MinioConfig, cache_client: Cache, res: Resources, obj_type: Type[T] + cached_config: MinioConfig, cache_client: Cache, resources: Resources, obj_type: Type[T] ) -> T: docker_client = docker.from_env() - obj = obj_type(docker_client, cache_client, res, False) + obj = obj_type(docker_client, cache_client, resources, False) obj._cfg = cached_config if cached_config.instance_id: instance_id = cached_config.instance_id diff --git a/sebs/storage/resources.py b/sebs/storage/resources.py new file mode 100644 index 000000000..a85e725e1 --- /dev/null +++ b/sebs/storage/resources.py @@ -0,0 +1,196 @@ +from typing import cast, Optional, Tuple + +from sebs.cache import Cache +from sebs.faas.config import Config, Resources +from sebs.faas.resources import SystemResources +from sebs.faas.storage import PersistentStorage +from sebs.faas.nosql import NoSQLStorage +from sebs.storage.minio import Minio +from sebs.storage.scylladb import ScyllaDB +from sebs.storage.config import ( + NoSQLStorageConfig, + PersistentStorageConfig, + ScyllaDBConfig, + MinioConfig, +) +from sebs.utils import LoggingHandlers + +import docker + + +class SelfHostedResources(Resources): + def __init__( + self, + name: str, + storage_cfg: Optional[PersistentStorageConfig] = None, + nosql_storage_cfg: Optional[NoSQLStorageConfig] = None, + ): + super().__init__(name=name) + self._object_storage = storage_cfg + self._nosql_storage = nosql_storage_cfg + + @property + def storage_config(self) -> Optional[PersistentStorageConfig]: + return self._object_storage + + @property + def nosql_storage_config(self) -> Optional[NoSQLStorageConfig]: + return self._nosql_storage + + def serialize(self) -> dict: + out: dict = {} + + if self._object_storage is not None: + out = {**out, "storage": self._object_storage.serialize()} + + if self._nosql_storage is not None: + out = {**out, "nosql": self._nosql_storage.serialize()} + + return out + + def update_cache(self, cache: Cache): + super().update_cache(cache) + if self._object_storage is not None: + cast(MinioConfig, self._object_storage).update_cache( + [self._name, "resources", "storage"], cache + ) + if self._nosql_storage is not None: + cast(ScyllaDBConfig, self._nosql_storage).update_cache( + [self._name, "resources", "nosql"], cache + ) + + def _deserialize_storage( + self, config: dict, cached_config: Optional[dict], storage_type: str + ) -> Tuple[str, dict]: + storage_impl = "" + storage_config = {} + + # Check for new config + if "storage" in config and storage_type in config["storage"]: + storage_impl = config["storage"][storage_type]["type"] + storage_config = config["storage"][storage_type][storage_impl] + self.logging.info( + "Using user-provided configuration of storage " + f"type: {storage_type} for {self._name} containers." + ) + + # Load cached values + elif ( + cached_config is not None + and "resources" in cached_config + and "storage" in cached_config["resources"] + and "object" in cached_config["resources"]["storage"] + ): + storage_impl = cached_config["storage"]["object"]["type"] + storage_config = cached_config["storage"]["object"][storage_impl] + self.logging.info( + f"Using cached configuration of storage type: " + f"{storage_type} for {self._name} container." 
+ ) + + return storage_impl, storage_config + + @staticmethod + def _deserialize(ret: "SelfHostedResources", config: dict, cached_config: dict): + obj_storage_impl, obj_storage_cfg = ret._deserialize_storage( + config, cached_config, "object" + ) + + if obj_storage_impl == "minio": + ret._object_storage = MinioConfig.deserialize(obj_storage_cfg) + ret.logging.info("Deserializing access data to Minio storage") + elif obj_storage_impl != "": + ret.logging.warning(f"Unknown object storage type: {obj_storage_impl}") + else: + ret.logging.info("No object storage available") + + nosql_storage_impl, nosql_storage_cfg = ret._deserialize_storage( + config, cached_config, "nosql" + ) + + if nosql_storage_impl == "scylladb": + ret._nosql_storage = ScyllaDBConfig.deserialize(nosql_storage_cfg) + ret.logging.info("Deserializing access data to ScyllaDB NoSQL storage") + elif nosql_storage_impl != "": + ret.logging.warning(f"Unknown NoSQL storage type: {nosql_storage_impl}") + else: + ret.logging.info("No NoSQL storage available") + + +class SelfHostedSystemResources(SystemResources): + def __init__( + self, + name: str, + config: Config, + cache_client: Cache, + docker_client: docker.client, + logger_handlers: LoggingHandlers, + ): + super().__init__(config, cache_client, docker_client) + + self._name = name + self._logging_handlers = logger_handlers + self._storage: Optional[PersistentStorage] = None + self._nosql_storage: Optional[NoSQLStorage] = None + + """ + Create a wrapper object for the self-hosted Minio storage, + connecting to the pre-allocated Docker instance. + + :param replace_existing: when provided, overrides the flag controlling whether data in existing buckets is replaced + :return: Minio storage instance + """ + + def get_storage(self, replace_existing: Optional[bool] = None) -> PersistentStorage: + if self._storage is None: + storage_config = cast(SelfHostedResources, self._config.resources).storage_config + if storage_config is None: + self.logging.error( + f"The {self._name} deployment is missing the " + "configuration of pre-allocated storage!" + ) + raise RuntimeError(f"Cannot run {self._name} deployment without any object storage") + + if isinstance(storage_config, MinioConfig): + self._storage = Minio.deserialize( + storage_config, + self._cache_client, + self._config.resources, + ) + self._storage.logging_handlers = self._logging_handlers + else: + self.logging.error( + f"The {self._name} deployment does not support " + f"the object storage config type: {type(storage_config)}!" + ) + raise RuntimeError("Cannot work with the provided object storage!") + + elif replace_existing is not None: + self._storage.replace_existing = replace_existing + return self._storage + + def get_nosql_storage(self) -> NoSQLStorage: + if self._nosql_storage is None: + storage_config = cast(SelfHostedResources, self._config.resources).nosql_storage_config + if storage_config is None: + self.logging.error( + f"The {self._name} deployment is missing the configuration " + "of pre-allocated NoSQL storage!" + ) + raise RuntimeError("Cannot allocate NoSQL storage!") + + if isinstance(storage_config, ScyllaDBConfig): + self._nosql_storage = ScyllaDB.deserialize( + storage_config, self._cache_client, self._config.resources + ) + self._nosql_storage.logging_handlers = self._logging_handlers + else: + self.logging.error( + f"The {self._name} deployment does not support " + f"the NoSQL storage config type: {type(storage_config)}!" + ) + raise RuntimeError("Cannot work with the provided NoSQL storage!") + + return self._nosql_storage
diff --git a/sebs/storage/scylladb.py b/sebs/storage/scylladb.py new file mode 100644 index 000000000..aae97815d --- /dev/null +++ b/sebs/storage/scylladb.py @@ -0,0 +1,318 @@ +import json +import os +import platform +import time +from collections import defaultdict +from typing import Dict, Optional, Tuple, Type, TypeVar + +from sebs.cache import Cache +from sebs.faas.config import Resources +from sebs.faas.nosql import NoSQLStorage +from sebs.types import NoSQLStorage as StorageType +from sebs.storage.config import ScyllaDBConfig +from sebs.utils import project_absolute_path + +import boto3 +from boto3.dynamodb.types import TypeSerializer +import docker + + +class ScyllaDB(NoSQLStorage): + @staticmethod + def typename() -> str: + return f"{ScyllaDB.deployment_name()}.ScyllaDB" + + @staticmethod + def deployment_name() -> str: + return "scylladb" + + @property + def config(self) -> ScyllaDBConfig: + return self._cfg + + # the location does not matter + SCYLLADB_REGION = "None" + + def __init__( + self, + docker_client: docker.client, + cache_client: Cache, + config: ScyllaDBConfig, + resources: Optional[Resources] = None, + ): + + super().__init__(self.SCYLLADB_REGION, cache_client, resources) # type: ignore + self._docker_client = docker_client + self._storage_container: Optional[docker.container] = None + self._cfg = config + + # Map benchmark -> orig_name -> table_name + self._tables: Dict[str, Dict[str, str]] = defaultdict(dict) + self._serializer = TypeSerializer() + + if config.address != "": + self.client = boto3.client( + "dynamodb", + region_name="None", + aws_access_key_id="None", + aws_secret_access_key="None", + endpoint_url=f"http://{config.address}", + ) + + def start(self): + + if self._cfg.data_volume == "": + scylladb_volume = os.path.join(project_absolute_path(), "scylladb-volume") + else: + scylladb_volume = self._cfg.data_volume + scylladb_volume = os.path.abspath(scylladb_volume) + + os.makedirs(scylladb_volume, exist_ok=True) + volumes = { + scylladb_volume: { + "bind": "/var/lib/scylla/", + "mode": "rw", + } + } + + try: + + scylladb_args = "" + scylladb_args += f"--smp {self._cfg.cpus} " + scylladb_args += f"--memory {self._cfg.memory}M " + scylladb_args += "--overprovisioned 1 " + scylladb_args += "--alternator-port 8000 " + scylladb_args += "--alternator-write-isolation=only_rmw_uses_lwt " + + self.logging.info("Starting ScyllaDB storage") + self._storage_container = self._docker_client.containers.run( + f"scylladb/scylla:{self._cfg.version}", + command=scylladb_args, + name="some-scylla", + hostname="some-scylla", + network_mode="bridge", + volumes=volumes, + ports={"8000": str(self._cfg.mapped_port)}, + remove=True, + stdout=True, + stderr=True, + detach=True, + ) + self._cfg.instance_id = self._storage_container.id + + # Wait until it boots up + attempts = 0 + max_attempts = 30 + while attempts < max_attempts: + + exit_code, out = self._storage_container.exec_run("nodetool status") + + if exit_code == 0: + self.logging.info("Started ScyllaDB successfully!") + break + + time.sleep(1.0) + attempts += 1 + + if attempts == max_attempts: + self.logging.error("Failed to launch ScyllaDB!") + self.logging.error(f"Last result of nodetool status: {out}") + raise RuntimeError("Failed to launch ScyllaDB!") + + self.configure_connection() + except docker.errors.APIError as e: + self.logging.error("Starting ScyllaDB storage failed! Reason: {}".format(e)) + raise RuntimeError("Starting ScyllaDB storage unsuccessful") + except Exception as e: + self.logging.error("Starting ScyllaDB storage failed! Unknown error: {}".format(e)) + raise RuntimeError("Starting ScyllaDB storage unsuccessful") + + # FIXME: refactor this - duplicated code from minio + def configure_connection(self): + # reload the container; otherwise its attributes are not populated + if self._cfg.address == "": + + if self._storage_container is None: + raise RuntimeError( + "ScyllaDB container is not available! Make sure that you deployed " + "the ScyllaDB storage and provided configuration!" + ) + + self._storage_container.reload() + + # Check if the system is Linux and that it's not WSL + if platform.system() == "Linux" and "microsoft" not in platform.release().lower(): + networks = self._storage_container.attrs["NetworkSettings"]["Networks"] + self._cfg.address = "{IPAddress}:{Port}".format( + IPAddress=networks["bridge"]["IPAddress"], Port=self._cfg.alternator_port + ) + else: + # System is either WSL, Windows, or Mac + self._cfg.address = f"localhost:{self._cfg.mapped_port}" + + if not self._cfg.address: + self.logging.error( + f"Couldn't read the IP address of container from attributes " + f"{json.dumps(self._storage_container.attrs, indent=2)}" + ) + raise RuntimeError( + f"Incorrect detection of IP address for container with id {self._cfg.instance_id}" + ) + self.logging.info("Starting ScyllaDB instance at {}".format(self._cfg.address)) + + def stop(self): + if self._storage_container is not None: + self.logging.info(f"Stopping ScyllaDB container at {self._cfg.address}.") + self._storage_container.stop() + self.logging.info(f"Stopped ScyllaDB container at {self._cfg.address}.") + else: + self.logging.error("Stopping ScyllaDB was not successful, storage container not known!") + + def envs(self) -> dict: + return {"NOSQL_STORAGE_TYPE": "scylladb", "NOSQL_STORAGE_ENDPOINT": self._cfg.address} + + def serialize(self) -> Tuple[StorageType, dict]: + return StorageType.SCYLLADB, self._cfg.serialize() + + """ + This implementation supports overriding this class. + The main ScyllaDB class is used to start/stop deployments. + + When overriding the implementation in Local/OpenWhisk/..., + we call the _deserialize and provide an alternative implementation.
+ """ + + T = TypeVar("T", bound="ScyllaDB") + + @staticmethod + def _deserialize( + cached_config: ScyllaDBConfig, cache_client: Cache, resources: Resources, obj_type: Type[T] + ) -> T: + docker_client = docker.from_env() + obj = obj_type(docker_client, cache_client, cached_config, resources) + + if cached_config.instance_id: + instance_id = cached_config.instance_id + try: + obj._storage_container = docker_client.containers.get(instance_id) + except docker.errors.NotFound: + raise RuntimeError(f"Storage container {instance_id} does not exist!") + else: + obj._storage_container = None + return obj + + @staticmethod + def deserialize( + cached_config: ScyllaDBConfig, cache_client: Cache, resources: Resources + ) -> "ScyllaDB": + return ScyllaDB._deserialize(cached_config, cache_client, resources, ScyllaDB) + + def retrieve_cache(self, benchmark: str) -> bool: + + if benchmark in self._tables: + return True + + cached_storage = self.cache_client.get_nosql_config(self.deployment_name(), benchmark) + if cached_storage is not None: + self._tables[benchmark] = cached_storage["tables"] + return True + + return False + + def update_cache(self, benchmark: str): + + self._cache_client.update_nosql( + self.deployment_name(), + benchmark, + { + "tables": self._tables[benchmark], + }, + ) + + def get_tables(self, benchmark: str) -> Dict[str, str]: + return self._tables[benchmark] + + def _get_table_name(self, benchmark: str, table: str) -> Optional[str]: + + if benchmark not in self._tables: + return None + + if table not in self._tables[benchmark]: + return None + + return self._tables[benchmark][table] + + def write_to_table( + self, + benchmark: str, + table: str, + data: dict, + primary_key: Tuple[str, str], + secondary_key: Optional[Tuple[str, str]] = None, + ): + + table_name = self._get_table_name(benchmark, table) + assert table_name is not None + + for key in (primary_key, secondary_key): + if key is not None: + data[key[0]] = key[1] + + serialized_data = {k: self._serializer.serialize(v) for k, v in data.items()} + self.client.put_item(TableName=table_name, Item=serialized_data) + + """ + AWS: create a DynamoDB Table + + In contrast to the hierarchy of database objects in Azure (account -> database -> container) + and GCP (database per benchmark), we need to create unique table names here. 
+ """ + + def create_table( + self, benchmark: str, name: str, primary_key: str, secondary_key: Optional[str] = None + ) -> str: + + table_name = f"sebs-benchmarks-{self._cloud_resources.resources_id}-{benchmark}-{name}" + + try: + + definitions = [{"AttributeName": primary_key, "AttributeType": "S"}] + key_schema = [{"AttributeName": primary_key, "KeyType": "HASH"}] + + if secondary_key is not None: + definitions.append({"AttributeName": secondary_key, "AttributeType": "S"}) + key_schema.append({"AttributeName": secondary_key, "KeyType": "RANGE"}) + + ret = self.client.create_table( + TableName=table_name, + BillingMode="PAY_PER_REQUEST", + AttributeDefinitions=definitions, # type: ignore + KeySchema=key_schema, # type: ignore + ) + + if ret["TableDescription"]["TableStatus"] == "CREATING": + self.logging.info(f"Waiting for creation of DynamoDB table {name}") + waiter = self.client.get_waiter("table_exists") + waiter.wait(TableName=table_name) + + self.logging.info(f"Created DynamoDB table {name} for benchmark {benchmark}") + self._tables[benchmark][name] = table_name + + return ret["TableDescription"]["TableName"] + + except self.client.exceptions.ResourceInUseException as e: + + if "already exists" in e.response["Error"]["Message"]: + self.logging.info( + f"Using existing DynamoDB table {table_name} for benchmark {benchmark}" + ) + self._tables[benchmark][name] = table_name + return table_name + + raise RuntimeError(f"Creating DynamoDB table failed, unknown reason! Error: {e}") + + def clear_table(self, name: str) -> str: + raise NotImplementedError() + + def remove_table(self, name: str) -> str: + raise NotImplementedError()
diff --git a/sebs/types.py b/sebs/types.py index 2f26117e3..b87516fba 100644 --- a/sebs/types.py +++ b/sebs/types.py @@ -1,6 +1,11 @@ from enum import Enum +class BenchmarkModule(str, Enum): + STORAGE = "storage" + NOSQL = "nosql" + + class Platforms(str, Enum): AWS = "aws" AZURE = "azure" @@ -14,3 +19,10 @@ class Storage(str, Enum): AZURE_BLOB_STORAGE = "azure-blob-storage" GCP_STORAGE = "google-cloud-storage" MINIO = "minio" + + +class NoSQLStorage(str, Enum): + AWS_DYNAMODB = "aws-dynamodb" + AZURE_COSMOSDB = "azure-cosmosdb" + GCP_DATASTORE = "google-cloud-datastore" + SCYLLADB = "scylladb"
diff --git a/sebs/utils.py b/sebs/utils.py index a86ebb6a5..e7ab43f63 100644 --- a/sebs/utils.py +++ b/sebs/utils.py @@ -61,6 +61,14 @@ def update_nested_dict(cfg: dict, keys: List[str], value: Optional[str]): cfg[keys[-1]] = value +def append_nested_dict(cfg: dict, keys: List[str], value: Optional[dict]): + if value: + # make sure parent keys exist + for key in keys[:-1]: + cfg = cfg.setdefault(key, {}) + cfg[keys[-1]] = {**cfg[keys[-1]], **value} + + def find(name, path): for root, dirs, files in os.walk(path): if name in dirs:
diff --git a/tools/build_docker_images.py b/tools/build_docker_images.py index ab117ae9e..5336fb485 100755 --- a/tools/build_docker_images.py +++ b/tools/build_docker_images.py @@ -38,7 +38,7 @@ def build(image_type, system, language=None, version=None, version_name=None): if version: target += "." + version sebs_version = config["general"].get("SeBS_version", "unknown") - target += "."
+ sebs_version + target += "-" + sebs_version # if we pass an integer, the build will fail with 'connection reset by peer' buildargs = { @@ -65,7 +65,7 @@ def build(image_type, system, language=None, version=None, version_name=None): def build_language(system, language, language_config): configs = [] if "base_images" in language_config: - for version, base_image in language_config["base_images"].items(): + for version, base_image in language_config["base_images"]["x64"].items(): if args.language_version is not None and args.language_version == version: configs.append([version, base_image]) elif args.language_version is None:
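A note on the configuration that the new self-hosted resources consume: SelfHostedResources._deserialize_storage reads config["storage"]["object"] and config["storage"]["nosql"], each carrying a "type" key ("minio" or "scylladb") and a nested section named after that type, which is then handed to MinioConfig.deserialize or ScyllaDBConfig.deserialize. The sketch below shows that nesting as a plain Python dict; the field names come from the two dataclasses in sebs/storage/config.py, while the concrete values (ports, versions, volume paths) are illustrative placeholders rather than defaults shipped with this patch.

# Illustrative layout only -- values are placeholders, not defaults from this patch.
self_hosted_config = {
    "storage": {
        "object": {
            "type": "minio",
            "minio": {
                # fields of MinioConfig
                "address": "",  # filled in once the container is running
                "mapped_port": 9011,
                "access_key": "",
                "secret_key": "",
                "instance_id": "",
                "input_buckets": [],
                "output_buckets": [],
                "version": "latest",
                "data_volume": "minio-volume",
                "type": "minio",
            },
        },
        "nosql": {
            "type": "scylladb",
            "scylladb": {
                # fields of ScyllaDBConfig
                "address": "",
                "mapped_port": 9012,
                "alternator_port": 8000,
                "access_key": "None",
                "secret_key": "None",
                "instance_id": "",
                "region": "None",
                "cpus": 1,
                "memory": 1024,
                "version": "latest",
                "data_volume": "scylladb-volume",
            },
        },
    }
}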
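On the function side, ScyllaDB.envs() advertises the connection through NOSQL_STORAGE_TYPE and NOSQL_STORAGE_ENDPOINT, and the OpenWhisk storage_arguments() helper additionally passes one NOSQL_STORAGE_TABLE_<name> entry per table registered for the benchmark. The following is a minimal sketch of how handler code could pick these values up; it assumes they are visible as environment variables (on OpenWhisk they are set with -p as default action parameters, so the bundled wrappers may route them differently), and the helper names and the example table are hypothetical, not part of this patch.

import os

import boto3


def nosql_client():
    # ScyllaDB's Alternator endpoint speaks the DynamoDB protocol; the placeholder
    # credentials mirror those used by the ScyllaDB driver in this patch.
    endpoint = os.environ["NOSQL_STORAGE_ENDPOINT"]
    return boto3.client(
        "dynamodb",
        region_name="None",
        aws_access_key_id="None",
        aws_secret_access_key="None",
        endpoint_url=f"http://{endpoint}",
    )


def table_name(logical_name: str) -> str:
    # Maps the benchmark's logical table name to the actual table
    # created by ScyllaDB.create_table().
    return os.environ[f"NOSQL_STORAGE_TABLE_{logical_name}"]


# Hypothetical usage: read one item from a table named "results".
client = nosql_client()
item = client.get_item(
    TableName=table_name("results"),
    Key={"request_id": {"S": "example-id"}},
)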