Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 53 additions & 18 deletions src/sagemaker/hyperpod/cli/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
TEMP_KUBE_CONFIG_FILE,
OutputFormat,
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We also need to add telemetry for other functions, such as list_cluster.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added telemetry for list_cluster.

from sagemaker.hyperpod.cli.telemetry.user_agent import (
from sagemaker.hyperpod.common.telemetry.user_agent import (
get_user_agent_extra_suffix,
)
from sagemaker.hyperpod.cli.service.list_pods import (
Expand All @@ -61,8 +61,17 @@
from sagemaker.hyperpod.cli.utils import (
get_eks_cluster_name,
)
from sagemaker.hyperpod.common.utils import get_cluster_context as get_cluster_context_util
from sagemaker.hyperpod.observability.utils import get_monitoring_config, is_observability_addon_enabled
from sagemaker.hyperpod.common.utils import (
get_cluster_context as get_cluster_context_util,
)
from sagemaker.hyperpod.observability.utils import (
get_monitoring_config,
is_observability_addon_enabled,
)
from sagemaker.hyperpod.common.telemetry.telemetry_logging import (
_hyperpod_telemetry_emitter,
)
from sagemaker.hyperpod.common.telemetry.constants import Feature

RATE_LIMIT = 4
RATE_LIMIT_PERIOD = 1 # 1 second
Expand Down Expand Up @@ -103,12 +112,13 @@
multiple=True,
help="Optional. The namespace that you want to check the capacity for. Only SageMaker managed namespaces are supported.",
)
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_cluster")
def list_cluster(
region: Optional[str],
output: Optional[str],
clusters: Optional[str],
debug: bool,
namespace: Optional[List]
namespace: Optional[List],
):
"""List SageMaker Hyperpod Clusters with cluster metadata.

Expand Down Expand Up @@ -261,8 +271,14 @@ def rate_limited_operation(
for ns in namespace:
sm_managed_namespace = k8s_client.get_sagemaker_managed_namespace(ns)
if sm_managed_namespace:
quota_allocation_id = sm_managed_namespace.metadata.labels[SAGEMAKER_QUOTA_ALLOCATION_LABEL]
cluster_queue_name = HYPERPOD_NAMESPACE_PREFIX + quota_allocation_id + SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX
quota_allocation_id = sm_managed_namespace.metadata.labels[
SAGEMAKER_QUOTA_ALLOCATION_LABEL
]
cluster_queue_name = (
HYPERPOD_NAMESPACE_PREFIX
+ quota_allocation_id
+ SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX
)
cluster_queue = k8s_client.get_cluster_queue(cluster_queue_name)
nominal_quota = _get_cluster_queue_nominal_quota(cluster_queue)
quota_usage = _get_cluster_queue_quota_usage(cluster_queue)
Expand All @@ -282,8 +298,19 @@ def rate_limited_operation(
nodes_summary["deep_health_check_passed"],
]
for ns in namespace:
capacities.append(ns_nominal_quota.get(ns).get(instance_type, {}).get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A"))
capacities.append(_get_available_quota(ns_nominal_quota.get(ns), ns_quota_usage.get(ns), instance_type, NVIDIA_GPU_RESOURCE_LIMIT_KEY))
capacities.append(
ns_nominal_quota.get(ns)
.get(instance_type, {})
.get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A")
)
capacities.append(
_get_available_quota(
ns_nominal_quota.get(ns),
ns_quota_usage.get(ns),
instance_type,
NVIDIA_GPU_RESOURCE_LIMIT_KEY,
)
)
cluster_capacities.append(capacities)
except Exception as e:
logger.error(f"Error processing cluster {cluster_name}: {e}, continue...")
Expand All @@ -305,7 +332,7 @@ def _get_cluster_queue_nominal_quota(cluster_queue):
if resource_name == NVIDIA_GPU_RESOURCE_LIMIT_KEY:
quota = int(quota)
nominal_quota[flavor_name][resource_name] = quota

return nominal_quota


Expand Down Expand Up @@ -336,7 +363,7 @@ def _get_available_quota(nominal, usage, flavor, resource_name):
# Some resources need to be further processed by parsing unit like memory, e.g 10Gi
if nominal_quota is not None and usage_quota is not None:
return int(nominal_quota) - int(usage_quota)

return "N/A"


Expand All @@ -358,7 +385,9 @@ def _restructure_output(summary_list, namespaces):
for node_summary in summary_list:
node_summary["Namespaces"] = {}
for ns in namespaces:
available_accelerators = node_summary[ns + AVAILABLE_ACCELERATOR_DEVICES_KEY]
available_accelerators = node_summary[
ns + AVAILABLE_ACCELERATOR_DEVICES_KEY
]
total_accelerators = node_summary[ns + TOTAL_ACCELERATOR_DEVICES_KEY]
quota_accelerator_info = {
AVAILABLE_ACCELERATOR_DEVICES_KEY: available_accelerators,
Expand Down Expand Up @@ -425,9 +454,9 @@ def _aggregate_nodes_info(

# Accelerator Devices available = Allocatable devices - Allocated devices
if node_name in nodes_resource_allocated_dict:
nodes_summary[instance_type]["accelerator_devices_available"] -= (
nodes_resource_allocated_dict[node_name]
)
nodes_summary[instance_type][
"accelerator_devices_available"
] -= nodes_resource_allocated_dict[node_name]

logger.debug(f"nodes_summary: {nodes_summary}")
return nodes_summary
Expand Down Expand Up @@ -550,7 +579,6 @@ def get_cluster_context(
sys.exit(1)



@click.command()
@click.option("--grafana", is_flag=True, help="Returns Grafana Dashboard URL")
@click.option("--prometheus", is_flag=True, help="Returns Prometheus Workspace URL")
Expand All @@ -572,14 +600,21 @@ def get_monitoring(grafana: bool, prometheus: bool, list: bool) -> None:
print(f"Grafana dashboard URL: {monitor_config.grafanaURL}")
if list:
metrics_data = monitor_config.availableMetrics
print(tabulate([[k, v.get('level', v.get('enabled'))] for k, v in metrics_data.items()],
headers=['Metric', 'Level/Enabled'], tablefmt='presto'))
print(
tabulate(
[
[k, v.get("level", v.get("enabled"))]
for k, v in metrics_data.items()
],
headers=["Metric", "Level/Enabled"],
tablefmt="presto",
)
)
except Exception as e:
logger.error(f"Failed to get metrics: {e}")
sys.exit(1)



def _update_kube_config(
eks_name: str,
region: Optional[str],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import
from .telemetry_logging import _hyperpod_telemetry_emitter
60 changes: 60 additions & 0 deletions src/sagemaker/hyperpod/common/telemetry/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from __future__ import absolute_import
from enum import Enum


class Feature(Enum):
    """Enumeration of feature names used in telemetry.

    Each member carries an integer value. ``str(member)`` is overridden to
    return the bare member name (e.g. ``"HYPERPOD"``) instead of the default
    ``"Feature.HYPERPOD"`` form.
    """

    HYPERPOD = 6  # Added to support telemetry in sagemaker-hyperpod-cli

    def __str__(self) -> str:  # pylint: disable=E0307
        """Return the feature name."""
        return self.name


class Status(Enum):
    """Enumeration of status values used in telemetry.

    ``SUCCESS`` maps to ``1`` and ``FAILURE`` to ``0``. ``str(member)`` is
    overridden to return the bare member name (e.g. ``"SUCCESS"``) instead of
    the default ``"Status.SUCCESS"`` form.
    """

    SUCCESS = 1
    FAILURE = 0

    def __str__(self) -> str:  # pylint: disable=E0307
        """Return the status name."""
        return self.name


class Region(str, Enum):
    """Telemetry: List of all supported AWS regions.

    Members mix in ``str``, so each one compares equal to its region code
    (``Region.US_EAST_1 == "us-east-1"``). Calling ``Region(code)`` returns
    the matching member and raises ``ValueError`` for a code that is not
    listed here. The trailing comment on each member is the short location
    code that accompanies the region in this list.
    """

    # Classic
    US_EAST_1 = "us-east-1"  # IAD
    US_EAST_2 = "us-east-2"  # CMH
    US_WEST_1 = "us-west-1"  # SFO
    US_WEST_2 = "us-west-2"  # PDX
    AP_NORTHEAST_1 = "ap-northeast-1"  # NRT
    AP_NORTHEAST_2 = "ap-northeast-2"  # ICN
    AP_NORTHEAST_3 = "ap-northeast-3"  # KIX
    AP_SOUTH_1 = "ap-south-1"  # BOM
    AP_SOUTHEAST_1 = "ap-southeast-1"  # SIN
    AP_SOUTHEAST_2 = "ap-southeast-2"  # SYD
    CA_CENTRAL_1 = "ca-central-1"  # YUL
    EU_CENTRAL_1 = "eu-central-1"  # FRA
    EU_NORTH_1 = "eu-north-1"  # ARN
    EU_WEST_1 = "eu-west-1"  # DUB
    EU_WEST_2 = "eu-west-2"  # LHR
    EU_WEST_3 = "eu-west-3"  # CDG
    SA_EAST_1 = "sa-east-1"  # GRU
    # Opt-in
    AP_EAST_1 = "ap-east-1"  # HKG
    AP_SOUTHEAST_3 = "ap-southeast-3"  # CGK
    AF_SOUTH_1 = "af-south-1"  # CPT
    EU_SOUTH_1 = "eu-south-1"  # MXP
    ME_SOUTH_1 = "me-south-1"  # BAH
    MX_CENTRAL_1 = "mx-central-1"  # QRO
    AP_SOUTHEAST_7 = "ap-southeast-7"  # BKK
    AP_SOUTH_2 = "ap-south-2"  # HYD
    AP_SOUTHEAST_4 = "ap-southeast-4"  # MEL
    EU_CENTRAL_2 = "eu-central-2"  # ZRH
    EU_SOUTH_2 = "eu-south-2"  # ZAZ
    IL_CENTRAL_1 = "il-central-1"  # TLV
    ME_CENTRAL_1 = "me-central-1"  # DXB
Loading
Loading