Skip to content
Merged
Show file tree
Hide file tree
Changes from 50 commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
d5b5511
Add cpu option
WoosukKwon Jan 24, 2023
3e0fa4a
yapf
WoosukKwon Jan 24, 2023
22fe74b
Add cpu in CLI
WoosukKwon Jan 24, 2023
386b1dc
Fix GCP and Do not raise errors
WoosukKwon Jan 24, 2023
23a0ea5
yapf
WoosukKwon Jan 24, 2023
28b0c5a
Fix TPU VM
WoosukKwon Jan 24, 2023
0d8cf90
Print error msg for cpu mismatches
WoosukKwon Jan 24, 2023
a825cf7
Fix error when default instance is not found
WoosukKwon Jan 24, 2023
d65634c
Merge branch 'master' into woosuk-cpu
WoosukKwon Jan 24, 2023
4959f8f
Fix a comment
WoosukKwon Jan 24, 2023
5d57fd5
Minor
WoosukKwon Jan 24, 2023
9067308
Add _impl
WoosukKwon Jan 24, 2023
797a632
Define _DEFAULT_NUM_VCPUS as constant
WoosukKwon Jan 24, 2023
b4e0b00
yapf
WoosukKwon Jan 24, 2023
b95c56b
Ban specifying CPU in sky exec
WoosukKwon Jan 24, 2023
9324e52
yapf
WoosukKwon Jan 24, 2023
12f4bbe
Fix type error
WoosukKwon Jan 25, 2023
09ad032
xlarge -> large
WoosukKwon Jan 25, 2023
83a7a9d
Fix AWS
WoosukKwon Jan 25, 2023
c400dea
Fix comments on default instance family
WoosukKwon Jan 25, 2023
75cf94a
Fix sky exec
WoosukKwon Jan 25, 2023
afd80fe
minor
WoosukKwon Jan 25, 2023
89ba768
Rename
WoosukKwon Jan 25, 2023
e4ae323
fuzzy_candidate_list -> []
WoosukKwon Jan 25, 2023
6103c2e
Fix version up comment
WoosukKwon Jan 25, 2023
1961191
Minor
WoosukKwon Jan 25, 2023
f856b66
Fix help msg
WoosukKwon Jan 25, 2023
40ac63b
Add docstring for cpu
WoosukKwon Jan 25, 2023
2ac7471
Document get_default_instance_type
WoosukKwon Jan 25, 2023
62f78a2
prevent sky launch -c --cpu and sky exec --cpu
WoosukKwon Jan 25, 2023
b141f75
Minor fix
WoosukKwon Jan 25, 2023
bc7cd0a
cpu -> cpus
WoosukKwon Jan 25, 2023
a6f4355
yapf
WoosukKwon Jan 25, 2023
8033bbf
vcpus -> vCPUs
WoosukKwon Jan 25, 2023
add2811
n2 -> n2-standard
WoosukKwon Jan 25, 2023
aa926af
Fix docstring
WoosukKwon Jan 25, 2023
0368415
Add comment on assertion
WoosukKwon Jan 25, 2023
54faa77
Fix docstring
WoosukKwon Jan 25, 2023
8f9dabc
Fix docstring
WoosukKwon Jan 25, 2023
97dc60e
Add assert in less_demanding_than
WoosukKwon Jan 25, 2023
4599f36
roll back
WoosukKwon Jan 25, 2023
01021c9
Merge branch 'master' into woosuk-cpu
WoosukKwon Jan 27, 2023
4949000
Remove is_same_resources
WoosukKwon Jan 27, 2023
fec1b7a
Add TODO
WoosukKwon Jan 27, 2023
13607fd
Check cpus format again in _filter_with_cpus
WoosukKwon Jan 27, 2023
2059309
yapf
WoosukKwon Jan 27, 2023
6a2b554
Add tests
WoosukKwon Jan 27, 2023
99a66e7
Fix tests
WoosukKwon Jan 27, 2023
ad4827b
Fix
WoosukKwon Jan 27, 2023
2c08184
Fix
WoosukKwon Jan 27, 2023
014f63c
Fix tests
WoosukKwon Jan 28, 2023
1f8a031
Merge branch 'master' into woosuk-cpu
WoosukKwon Jan 28, 2023
62aa150
Fix example_app
WoosukKwon Jan 28, 2023
4ea39d7
Add cpus in yaml spec
WoosukKwon Jan 28, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 59 additions & 18 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,13 @@ def _interactive_node_cli_command(cli_func):
default=None,
type=str,
help='Instance type to use.')
cpus = click.option(
'--cpus',
default=None,
type=str,
help=('Number of vCPUs each instance must have '
'(e.g., ``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
'This is used to automatically select the instance type.'))
gpus = click.option('--gpus',
default=None,
type=str,
Expand Down Expand Up @@ -268,6 +275,7 @@ def _interactive_node_cli_command(cli_func):
region_option,
zone_option,
instance_type_option,
cpus,
*([gpus] if cli_func.__name__ == 'gpunode' else []),
*([tpus] if cli_func.__name__ == 'tpunode' else []),
spot_option,
Expand Down Expand Up @@ -556,6 +564,7 @@ def _parse_override_params(cloud: Optional[str] = None,
region: Optional[str] = None,
zone: Optional[str] = None,
gpus: Optional[str] = None,
cpus: Optional[str] = None,
instance_type: Optional[str] = None,
use_spot: Optional[bool] = None,
image_id: Optional[str] = None,
Expand All @@ -582,6 +591,11 @@ def _parse_override_params(cloud: Optional[str] = None,
override_params['accelerators'] = None
else:
override_params['accelerators'] = gpus
if cpus is not None:
if cpus.lower() == 'none':
override_params['cpus'] = None
else:
override_params['cpus'] = cpus
if instance_type is not None:
if instance_type.lower() == 'none':
override_params['instance_type'] = None
Expand Down Expand Up @@ -908,6 +922,7 @@ def _make_task_from_entrypoint_with_overrides(
region: Optional[str] = None,
zone: Optional[str] = None,
gpus: Optional[str] = None,
cpus: Optional[str] = None,
instance_type: Optional[str] = None,
num_nodes: Optional[int] = None,
use_spot: Optional[bool] = None,
Expand Down Expand Up @@ -949,6 +964,7 @@ def _make_task_from_entrypoint_with_overrides(
region=region,
zone=zone,
gpus=gpus,
cpus=cpus,
instance_type=instance_type,
use_spot=use_spot,
image_id=image_id,
Expand Down Expand Up @@ -1090,6 +1106,13 @@ def cli():
default=False,
help='If used, runs locally inside a docker container.')
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
@click.option('--cpus',
default=None,
type=str,
required=False,
help=('Number of vCPUs each instance must have (e.g., '
'``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
'This is used to automatically select the instance type.'))
@click.option('--disk-size',
default=None,
type=int,
Expand Down Expand Up @@ -1154,6 +1177,7 @@ def launch(
region: Optional[str],
zone: Optional[str],
gpus: Optional[str],
cpus: Optional[str],
instance_type: Optional[str],
num_nodes: Optional[int],
use_spot: Optional[bool],
Expand Down Expand Up @@ -1198,6 +1222,7 @@ def launch(
region=region,
zone=zone,
gpus=gpus,
cpus=cpus,
instance_type=instance_type,
num_nodes=num_nodes,
use_spot=use_spot,
Expand Down Expand Up @@ -1343,6 +1368,7 @@ def exec(
region=region,
zone=zone,
gpus=gpus,
cpus=None,
instance_type=instance_type,
use_spot=use_spot,
image_id=image_id,
Expand Down Expand Up @@ -2414,11 +2440,11 @@ def _down_or_stop(name: str):
# pylint: disable=redefined-outer-name
def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
cloud: Optional[str], region: Optional[str], zone: Optional[str],
instance_type: Optional[str], gpus: Optional[str],
use_spot: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
instance_type: Optional[str], cpus: Optional[str],
gpus: Optional[str], use_spot: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
"""Launch or attach to an interactive GPU node.

Examples:
Expand Down Expand Up @@ -2457,7 +2483,8 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],

user_requested_resources = not (cloud is None and region is None and
zone is None and instance_type is None and
gpus is None and use_spot is None)
cpus is None and gpus is None and
use_spot is None)
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['gpunode']
cloud_provider = clouds.CLOUD_REGISTRY.from_str(cloud)
if gpus is None and instance_type is None:
Expand All @@ -2470,6 +2497,7 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
region=region,
zone=zone,
instance_type=instance_type,
cpus=cpus,
accelerators=gpus,
use_spot=use_spot,
disk_size=disk_size)
Expand All @@ -2493,10 +2521,11 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
# pylint: disable=redefined-outer-name
def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
cloud: Optional[str], region: Optional[str], zone: Optional[str],
instance_type: Optional[str], use_spot: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
instance_type: Optional[str], cpus: Optional[str],
use_spot: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
"""Launch or attach to an interactive CPU node.

Examples:
Expand Down Expand Up @@ -2534,7 +2563,7 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],

user_requested_resources = not (cloud is None and region is None and
zone is None and instance_type is None and
use_spot is None)
cpus is None and use_spot is None)
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['cpunode']
cloud_provider = clouds.CLOUD_REGISTRY.from_str(cloud)
if instance_type is None:
Expand All @@ -2545,6 +2574,7 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
region=region,
zone=zone,
instance_type=instance_type,
cpus=cpus,
use_spot=use_spot,
disk_size=disk_size)

Expand All @@ -2567,11 +2597,12 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
# pylint: disable=redefined-outer-name
def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
region: Optional[str], zone: Optional[str],
instance_type: Optional[str], tpus: Optional[str],
use_spot: Optional[bool], tpu_vm: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
instance_type: Optional[str], cpus: Optional[str],
tpus: Optional[str], use_spot: Optional[bool],
tpu_vm: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
"""Launch or attach to an interactive TPU node.

Examples:
Expand Down Expand Up @@ -2608,8 +2639,8 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
name = _default_interactive_node_name('tpunode')

user_requested_resources = not (region is None and zone is None and
instance_type is None and tpus is None and
use_spot is None)
instance_type is None and cpus is None and
tpus is None and use_spot is None)
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['tpunode']
accelerator_args = default_resources.accelerator_args
if tpu_vm:
Expand All @@ -2625,6 +2656,7 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
region=region,
zone=zone,
instance_type=instance_type,
cpus=cpus,
accelerators=tpus,
accelerator_args=accelerator_args,
use_spot=use_spot,
Expand Down Expand Up @@ -2967,6 +2999,13 @@ def spot():
**_get_shell_complete_args(_complete_file_name))
# TODO(zhwu): Add --dryrun option to test the launch command.
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
@click.option('--cpus',
default=None,
type=str,
required=False,
help=('Number of vCPUs each instance must have (e.g., '
'``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
'This is used to automatically select the instance type.'))
@click.option('--spot-recovery',
default=None,
type=str,
Expand Down Expand Up @@ -3009,6 +3048,7 @@ def spot_launch(
region: Optional[str],
zone: Optional[str],
gpus: Optional[str],
cpus: Optional[str],
instance_type: Optional[str],
num_nodes: Optional[int],
use_spot: Optional[bool],
Expand Down Expand Up @@ -3047,6 +3087,7 @@ def spot_launch(
region=region,
zone=zone,
gpus=gpus,
cpus=cpus,
instance_type=instance_type,
num_nodes=num_nodes,
use_spot=use_spot,
Expand Down
23 changes: 14 additions & 9 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,10 +274,10 @@ def is_same_cloud(self, other: clouds.Cloud):
return isinstance(other, AWS)

@classmethod
def get_default_instance_type(cls) -> str:
# General-purpose instance with 8 vCPUs and 32 GB RAM.
# Intel Ice Lake 8375C
return 'm6i.2xlarge'
def get_default_instance_type(cls,
cpus: Optional[str] = None) -> Optional[str]:
return service_catalog.get_default_instance_type(cpus=cpus,
clouds='aws')

# TODO: factor the following three methods, as they are the same logic
# between Azure and AWS.
Expand Down Expand Up @@ -334,12 +334,11 @@ def make_deploy_resources_variables(

def get_feasible_launchable_resources(self,
resources: 'resources_lib.Resources'):
fuzzy_candidate_list: List[str] = []
if resources.instance_type is not None:
assert resources.is_launchable(), resources
# Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x).
resources = resources.copy(accelerators=None)
return ([resources], fuzzy_candidate_list)
return ([resources], [])

def _make(instance_list):
resource_list = []
Expand All @@ -350,16 +349,21 @@ def _make(instance_list):
# Setting this to None as AWS doesn't separately bill /
# attach the accelerators. Billed as part of the VM type.
accelerators=None,
cpus=None,
)
resource_list.append(r)
return resource_list

# Currently, handle a filter on accelerators only.
accelerators = resources.accelerators
if accelerators is None:
# No requirements to filter, so just return a default VM type.
return (_make([AWS.get_default_instance_type()]),
fuzzy_candidate_list)
# Return a default instance type with the given number of vCPUs.
default_instance_type = AWS.get_default_instance_type(
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A thought for discussion: this now dynamically picks a VM type based on the requested number of CPUs, which means it's no longer a "default" constant. It is more like a service_catalog.get_instance_type_for_cpu(), a la L371.

Should we remove this interface (Cloud.get_default_instance_type()) then? Wdyt? (I can see arguments for both choices.)

Copy link
Copy Markdown
Collaborator Author

@WoosukKwon WoosukKwon Jan 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it can be named get_instance_type_for_cpu(), because it limits the search space to the default instance family and uses 8 as the default number of vCPUs. I think get_default_instance_type is fine; one can easily understand what it means.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The naming part sounds good. Sorry, I did not spell this out in my comment, but I was thinking that removing one interface method from Cloud would make it easier to add new clouds. Also, it's a shallow method (a 1-line dispatch). How about removing it and just calling service_catalog.get_default_instance_type() directly here?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it still be useful to explicitly show what the default type is?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mind clarifying what you meant?

Currently, the Cloud interface includes a shallow method, which does not show the default type:

    @classmethod
    def get_default_instance_type(cls,
                                  cpu: Optional[str] = None) -> Optional[str]:
        return service_catalog.get_default_instance_type(cpu=cpu, clouds='aws')

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean that one can easily find out which instance type is the default for AWS by calling AWS.get_default_instance_type, which is good.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense.

cpus=resources.cpus)
if default_instance_type is None:
return ([], [])
else:
return (_make([default_instance_type]), [])

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -368,6 +372,7 @@ def _make(instance_list):
acc,
acc_count,
use_spot=resources.use_spot,
cpus=resources.cpus,
region=resources.region,
zone=resources.zone,
clouds='aws')
Expand Down
26 changes: 16 additions & 10 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@ def is_same_cloud(self, other):
return isinstance(other, Azure)

@classmethod
def get_default_instance_type(cls) -> str:
# General-purpose instance with 8 vCPUs and 32 GB RAM.
# Intel Ice Lake 8370C
return 'Standard_D8_v5'
def get_default_instance_type(cls,
cpus: Optional[str] = None) -> Optional[str]:
return service_catalog.get_default_instance_type(cpus=cpus,
clouds='azure')

def _get_image_config(self, gen_version, instance_type):
# az vm image list \
Expand Down Expand Up @@ -250,12 +250,11 @@ def get_feasible_launchable_resources(self, resources):
# TODO(zhwu): our azure subscription offer ID does not support spot.
# Need to support it.
return ([], [])
fuzzy_candidate_list = []
if resources.instance_type is not None:
assert resources.is_launchable(), resources
# Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x).
resources = resources.copy(accelerators=None)
return ([resources], fuzzy_candidate_list)
return ([resources], [])

def _make(instance_list):
resource_list = []
Expand All @@ -265,23 +264,30 @@ def _make(instance_list):
instance_type=instance_type,
# Setting this to None as Azure doesn't separately bill /
# attach the accelerators. Billed as part of the VM type.
accelerators=None)
accelerators=None,
cpus=None,
)
resource_list.append(r)
return resource_list

# Currently, handle a filter on accelerators only.
accelerators = resources.accelerators
if accelerators is None:
# No requirements to filter, so just return a default VM type.
return (_make([Azure.get_default_instance_type()]),
fuzzy_candidate_list)
# Return a default instance type with the given number of vCPUs.
default_instance_type = Azure.get_default_instance_type(
cpus=resources.cpus)
if default_instance_type is None:
return ([], [])
else:
return (_make([default_instance_type]), [])

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
(instance_list, fuzzy_candidate_list
) = service_catalog.get_instance_type_for_accelerator(
acc,
acc_count,
cpus=resources.cpus,
use_spot=resources.use_spot,
region=resources.region,
zone=resources.zone,
Expand Down
13 changes: 12 additions & 1 deletion sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,18 @@ def get_accelerators_from_instance_type(
raise NotImplementedError

@classmethod
def get_default_instance_type(cls) -> str:
def get_default_instance_type(cls,
cpus: Optional[str] = None) -> Optional[str]:
"""Returns the default instance type with the given number of vCPUs.

For example, if cpus='4', this method returns the default instance type
with 4 vCPUs. If cpus='4+', this method returns the default instance
type with 4 or more vCPUs.

When cpus is None, this method will never return None.
This method may return None if the cloud's default instance family
does not have a VM with the given number of vCPUs (e.g., when cpus='7').
"""
raise NotImplementedError

@classmethod
Expand Down
Loading