Skip to content

Commit 60e56f4

Browse files
pintaoz-aws and pintaoz authored
Add container in get_logs_from_pod (#66)
Co-authored-by: pintaoz <[email protected]>
1 parent e165fa7 commit 60e56f4

File tree

1 file changed

+17
-10
lines changed

1 file changed

+17
-10
lines changed

sagemaker-hyperpod/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 17 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@ def create(self):
6464
print("Successful submitted HyperPodPytorchJob!")
6565
except Exception as e:
6666
print(f"Failed to create HyperPodPytorchJob {self.metadata.name}!")
67-
_handel_exception(e, self.metadata.name, self.metadata.namespace)
67+
_handle_exception(e, self.metadata.name, self.metadata.namespace)
6868

6969
@classmethod
7070
def list(cls, namespace="default") -> List["HyperPodPytorchJob"]:
@@ -85,7 +85,7 @@ def list(cls, namespace="default") -> List["HyperPodPytorchJob"]:
8585
return _load_hp_job_list(hp_job_list)
8686
except Exception as e:
8787
print(f"Failed to list HyperpodPytorchJobs!")
88-
_handel_exception(e, "", namespace)
88+
_handle_exception(e, "", namespace)
8989

9090
def delete(self):
9191
if not validate_cluster_connection():
@@ -106,7 +106,7 @@ def delete(self):
106106
print(f"Successful deleted HyperPodPytorchJob!")
107107
except Exception as e:
108108
print(f"Failed to delete HyperPodPytorchJob {self.metadata.name}!")
109-
_handel_exception(e, self.metadata.name, self.metadata.namespace)
109+
_handle_exception(e, self.metadata.name, self.metadata.namespace)
110110

111111
@classmethod
112112
def get(cls, name, namespace="default") -> "HyperPodPytorchJob":
@@ -128,7 +128,7 @@ def get(cls, name, namespace="default") -> "HyperPodPytorchJob":
128128
return _load_hp_job(response)
129129
except Exception as e:
130130
print(f"Failed to describe HyperPodPytorchJob {name}: {e}")
131-
_handel_exception(e, name, namespace)
131+
_handle_exception(e, name, namespace)
132132

133133
def refresh(self) -> "HyperPodPytorchJob":
134134
if not validate_cluster_connection():
@@ -151,7 +151,7 @@ def refresh(self) -> "HyperPodPytorchJob":
151151
)
152152
except Exception as e:
153153
print(f"Failed to refresh HyperPodPytorchJob {self.metadata.name}!")
154-
_handel_exception(e, self.metadata.name, self.metadata.namespace)
154+
_handle_exception(e, self.metadata.name, self.metadata.namespace)
155155

156156
def list_pods(self) -> List[str]:
157157
if not validate_cluster_connection():
@@ -172,28 +172,35 @@ def list_pods(self) -> List[str]:
172172
return pods
173173
except Exception as e:
174174
print(f"Failed to list pod in namespace {self.metadata.namespace}!")
175-
_handel_exception(e, self.metadata.name, self.metadata.namespace)
175+
_handle_exception(e, self.metadata.name, self.metadata.namespace)
176176

177-
def get_logs_from_pod(self, pod_name: str) -> str:
177+
def get_logs_from_pod(self, pod_name: str, container: Optional[str] = None) -> str:
178178
if not validate_cluster_connection():
179179
raise Exception(
180180
"Failed to connect to the Kubernetes cluster. Please check your kubeconfig."
181181
)
182182

183+
if container is None:
184+
# If container name is not set, get logs from the first container in the pod
185+
container = self.replicaSpecs[0].template.spec.containers[0].name
186+
183187
try:
184188
config.load_kube_config()
185189
v1 = client.CoreV1Api()
186190

187191
logs = v1.read_namespaced_pod_log(
188-
name=pod_name, namespace=self.metadata.namespace, timestamps=True
192+
name=pod_name,
193+
namespace=self.metadata.namespace,
194+
timestamps=True,
195+
container=container,
189196
)
190197
return logs
191198
except Exception as e:
192199
print(f"Failed to get logs from pod {pod_name}!")
193-
_handel_exception(e, self.metadata.name, self.metadata.namespace)
200+
_handle_exception(e, self.metadata.name, self.metadata.namespace)
194201

195202

196-
def _handel_exception(e: Exception, name: str, namespace: str):
203+
def _handle_exception(e: Exception, name: str, namespace: str):
197204
print("exception type", type(e))
198205
if isinstance(e, ApiException):
199206
if e.status == 401:

0 commit comments

Comments
 (0)