Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sky/benchmark/benchmark_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _print_candidate_resources(
accelerator, count = list(resources.accelerators.items())[0]
accelerators = f'{accelerator}:{count}'
cloud = resources.cloud
vcpus = cloud.get_vcpus_from_instance_type(resources.instance_type)
# get_vcpus_mem_from_instance_type returns a (vcpus, memory) tuple; only the
# vCPU count is used below, so unpack it and discard the memory component.
# Binding the whole tuple to `vcpus` would make the `is None` check always
# false and `vcpus.is_integer()` raise AttributeError.
vcpus, _ = cloud.get_vcpus_mem_from_instance_type(resources.instance_type)
if vcpus is None:
vcpus = '-'
elif vcpus.is_integer():
Expand Down
76 changes: 59 additions & 17 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,14 @@ def _interactive_node_cli_command(cli_func):
help=('Number of vCPUs each instance must have '
'(e.g., ``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
'This is used to automatically select the instance type.'))
memory = click.option(
'--memory',
default=None,
type=str,
required=False,
help=('Amount of memory each instance must have in GB (e.g., '
'``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least '
'16GB))'))
gpus = click.option('--gpus',
default=None,
type=str,
Expand Down Expand Up @@ -281,6 +289,7 @@ def _interactive_node_cli_command(cli_func):
zone_option,
instance_type_option,
cpus,
memory,
*([gpus] if cli_func.__name__ == 'gpunode' else []),
*([tpus] if cli_func.__name__ == 'tpunode' else []),
spot_option,
Expand Down Expand Up @@ -575,6 +584,7 @@ def _parse_override_params(cloud: Optional[str] = None,
zone: Optional[str] = None,
gpus: Optional[str] = None,
cpus: Optional[str] = None,
memory: Optional[str] = None,
instance_type: Optional[str] = None,
use_spot: Optional[bool] = None,
image_id: Optional[str] = None,
Expand Down Expand Up @@ -606,6 +616,11 @@ def _parse_override_params(cloud: Optional[str] = None,
override_params['cpus'] = None
else:
override_params['cpus'] = cpus
if memory is not None:
if memory.lower() == 'none':
override_params['memory'] = None
else:
override_params['memory'] = memory
if instance_type is not None:
if instance_type.lower() == 'none':
override_params['instance_type'] = None
Expand Down Expand Up @@ -936,6 +951,7 @@ def _make_task_from_entrypoint_with_overrides(
zone: Optional[str] = None,
gpus: Optional[str] = None,
cpus: Optional[str] = None,
memory: Optional[str] = None,
instance_type: Optional[str] = None,
num_nodes: Optional[int] = None,
use_spot: Optional[bool] = None,
Expand Down Expand Up @@ -979,6 +995,7 @@ def _make_task_from_entrypoint_with_overrides(
zone=zone,
gpus=gpus,
cpus=cpus,
memory=memory,
instance_type=instance_type,
use_spot=use_spot,
image_id=image_id,
Expand Down Expand Up @@ -1127,6 +1144,13 @@ def cli():
help=('Number of vCPUs each instance must have (e.g., '
'``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
'This is used to automatically select the instance type.'))
@click.option(
'--memory',
default=None,
type=str,
required=False,
help=('Amount of memory each instance must have in GB (e.g., '
'``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))'))
@click.option('--disk-size',
default=None,
type=int,
Expand Down Expand Up @@ -1192,6 +1216,7 @@ def launch(
zone: Optional[str],
gpus: Optional[str],
cpus: Optional[str],
memory: Optional[str],
instance_type: Optional[str],
num_nodes: Optional[int],
use_spot: Optional[bool],
Expand Down Expand Up @@ -1237,6 +1262,7 @@ def launch(
zone=zone,
gpus=gpus,
cpus=cpus,
memory=memory,
instance_type=instance_type,
num_nodes=num_nodes,
use_spot=use_spot,
Expand Down Expand Up @@ -1383,6 +1409,7 @@ def exec(
zone=zone,
gpus=gpus,
cpus=None,
memory=None,
instance_type=instance_type,
use_spot=use_spot,
image_id=image_id,
Expand Down Expand Up @@ -2581,10 +2608,11 @@ def _down_or_stop(name: str):
def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
cloud: Optional[str], region: Optional[str], zone: Optional[str],
instance_type: Optional[str], cpus: Optional[str],
gpus: Optional[str], use_spot: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
memory: Optional[str], gpus: Optional[str],
use_spot: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
"""Launch or attach to an interactive GPU node.

Examples:
Expand Down Expand Up @@ -2623,8 +2651,8 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],

user_requested_resources = not (cloud is None and region is None and
zone is None and instance_type is None and
cpus is None and gpus is None and
use_spot is None)
cpus is None and memory is None and
gpus is None and use_spot is None)
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['gpunode']
cloud_provider = clouds.CLOUD_REGISTRY.from_str(cloud)
if gpus is None and instance_type is None:
Expand All @@ -2638,6 +2666,7 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
zone=zone,
instance_type=instance_type,
cpus=cpus,
memory=memory,
accelerators=gpus,
use_spot=use_spot,
disk_size=disk_size)
Expand All @@ -2662,10 +2691,10 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
cloud: Optional[str], region: Optional[str], zone: Optional[str],
instance_type: Optional[str], cpus: Optional[str],
use_spot: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
memory: Optional[str], use_spot: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
"""Launch or attach to an interactive CPU node.

Examples:
Expand Down Expand Up @@ -2703,7 +2732,8 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],

user_requested_resources = not (cloud is None and region is None and
zone is None and instance_type is None and
cpus is None and use_spot is None)
cpus is None and memory is None and
use_spot is None)
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['cpunode']
cloud_provider = clouds.CLOUD_REGISTRY.from_str(cloud)
if instance_type is None:
Expand All @@ -2715,6 +2745,7 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
zone=zone,
instance_type=instance_type,
cpus=cpus,
memory=memory,
use_spot=use_spot,
disk_size=disk_size)

Expand All @@ -2738,11 +2769,11 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
region: Optional[str], zone: Optional[str],
instance_type: Optional[str], cpus: Optional[str],
tpus: Optional[str], use_spot: Optional[bool],
tpu_vm: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
memory: Optional[str], tpus: Optional[str],
use_spot: Optional[bool], tpu_vm: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
"""Launch or attach to an interactive TPU node.

Examples:
Expand Down Expand Up @@ -2780,7 +2811,8 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],

user_requested_resources = not (region is None and zone is None and
instance_type is None and cpus is None and
tpus is None and use_spot is None)
memory is None and tpus is None and
use_spot is None)
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['tpunode']
accelerator_args = default_resources.accelerator_args
if tpu_vm:
Expand All @@ -2797,6 +2829,7 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
zone=zone,
instance_type=instance_type,
cpus=cpus,
memory=memory,
accelerators=tpus,
accelerator_args=accelerator_args,
use_spot=use_spot,
Expand Down Expand Up @@ -3148,6 +3181,13 @@ def spot():
help=('Number of vCPUs each instance must have (e.g., '
'``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
'This is used to automatically select the instance type.'))
@click.option(
'--memory',
default=None,
type=str,
required=False,
help=('Amount of memory each instance must have in GB (e.g., '
'``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))'))
@click.option('--spot-recovery',
default=None,
type=str,
Expand Down Expand Up @@ -3191,6 +3231,7 @@ def spot_launch(
zone: Optional[str],
gpus: Optional[str],
cpus: Optional[str],
memory: Optional[str],
instance_type: Optional[str],
num_nodes: Optional[int],
use_spot: Optional[bool],
Expand Down Expand Up @@ -3228,6 +3269,7 @@ def spot_launch(
zone=zone,
gpus=gpus,
cpus=cpus,
memory=memory,
instance_type=instance_type,
num_nodes=num_nodes,
use_spot=use_spot,
Expand Down
19 changes: 12 additions & 7 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,9 +266,12 @@ def is_same_cloud(self, other: clouds.Cloud):
return isinstance(other, AWS)

@classmethod
def get_default_instance_type(cls,
cpus: Optional[str] = None) -> Optional[str]:
def get_default_instance_type(
cls,
cpus: Optional[str] = None,
memory: Optional[str] = None) -> Optional[str]:
return service_catalog.get_default_instance_type(cpus=cpus,
memory=memory,
clouds='aws')

# TODO: factor the following three methods, as they are the same logic
Expand All @@ -282,12 +285,12 @@ def get_accelerators_from_instance_type(
instance_type, clouds='aws')

@classmethod
def get_vcpus_from_instance_type(
def get_vcpus_mem_from_instance_type(
cls,
instance_type: str,
) -> Optional[float]:
return service_catalog.get_vcpus_from_instance_type(instance_type,
clouds='aws')
) -> Tuple[Optional[float], Optional[float]]:
return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
clouds='aws')

def make_deploy_resources_variables(
self, resources: 'resources_lib.Resources', region: 'clouds.Region',
Expand Down Expand Up @@ -334,6 +337,7 @@ def _make(instance_list):
# attach the accelerators. Billed as part of the VM type.
accelerators=None,
cpus=None,
memory=None,
)
resource_list.append(r)
return resource_list
Expand All @@ -343,7 +347,7 @@ def _make(instance_list):
if accelerators is None:
# Return a default instance type with the given number of vCPUs and memory.
default_instance_type = AWS.get_default_instance_type(
cpus=resources.cpus)
cpus=resources.cpus, memory=resources.memory)
if default_instance_type is None:
return ([], [])
else:
Expand All @@ -357,6 +361,7 @@ def _make(instance_list):
acc_count,
use_spot=resources.use_spot,
cpus=resources.cpus,
memory=resources.memory,
region=resources.region,
zone=resources.zone,
clouds='aws')
Expand Down
19 changes: 12 additions & 7 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,12 @@ def is_same_cloud(self, other):
return isinstance(other, Azure)

@classmethod
def get_default_instance_type(cls,
cpus: Optional[str] = None) -> Optional[str]:
def get_default_instance_type(
cls,
cpus: Optional[str] = None,
memory: Optional[str] = None) -> Optional[str]:
return service_catalog.get_default_instance_type(cpus=cpus,
memory=memory,
clouds='azure')

def _get_image_config(self, gen_version, instance_type):
Expand Down Expand Up @@ -194,12 +197,12 @@ def get_accelerators_from_instance_type(
instance_type, clouds='azure')

@classmethod
def get_vcpus_from_instance_type(
def get_vcpus_mem_from_instance_type(
cls,
instance_type: str,
) -> Optional[float]:
return service_catalog.get_vcpus_from_instance_type(instance_type,
clouds='azure')
) -> Tuple[Optional[float], Optional[float]]:
return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
clouds='azure')

@classmethod
def get_zone_shell_cmd(cls) -> Optional[str]:
Expand Down Expand Up @@ -256,6 +259,7 @@ def _make(instance_list):
# attach the accelerators. Billed as part of the VM type.
accelerators=None,
cpus=None,
memory=None,
)
resource_list.append(r)
return resource_list
Expand All @@ -265,7 +269,7 @@ def _make(instance_list):
if accelerators is None:
# Return a default instance type with the given number of vCPUs and memory.
default_instance_type = Azure.get_default_instance_type(
cpus=resources.cpus)
cpus=resources.cpus, memory=resources.memory)
if default_instance_type is None:
return ([], [])
else:
Expand All @@ -278,6 +282,7 @@ def _make(instance_list):
acc,
acc_count,
cpus=resources.cpus,
memory=resources.memory,
use_spot=resources.use_spot,
region=resources.region,
zone=resources.zone,
Expand Down
20 changes: 13 additions & 7 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ def make_deploy_resources_variables(
raise NotImplementedError

@classmethod
def get_vcpus_from_instance_type(cls,
instance_type: str) -> Optional[float]:
"""Returns the number of virtual CPUs that the instance type offers."""
def get_vcpus_mem_from_instance_type(
cls, instance_type: str) -> Tuple[Optional[float], Optional[float]]:
"""Returns the #vCPUs and memory that the instance type offers."""
raise NotImplementedError

@classmethod
Expand All @@ -248,15 +248,21 @@ def get_accelerators_from_instance_type(
raise NotImplementedError

@classmethod
def get_default_instance_type(cls,
cpus: Optional[str] = None) -> Optional[str]:
"""Returns the default instance type with the given number of vCPUs.
def get_default_instance_type(
cls,
cpus: Optional[str] = None,
memory: Optional[str] = None) -> Optional[str]:
"""Returns the default instance type with the given #vCPUs and memory.

For example, if cpus='4', this method returns the default instance type
with 4 vCPUs. If cpus='4+', this method returns the default instance
type with 4 or more vCPUs.

When cpus is None, this method will never return None.
If 'memory=4', this method returns the default instance type with 4GB
memory. If 'memory=4+', this method returns the default instance
type with 4GB or more memory.

When both cpus and memory are None, this method will never return None.
This method may return None if the cloud's default instance family
does not have a VM with the given number of vCPUs or amount of memory
(e.g., when cpus='7').
"""
Expand Down
Loading