|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 1, |
| 6 | + "id": "f8caf125-ab9d-4655-a59e-edbaeed9e919", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [ |
| 9 | + { |
| 10 | + "name": "stdout", |
| 11 | + "output_type": "stream", |
| 12 | + "text": [ |
| 13 | + "Orchestrator Cluster Name\n", |
| 14 | + "-------------- ----------------------------\n", |
| 15 | + "EKS hp-cluster-for-inf-Beta2try1\n", |
| 16 | + "Updated context arn:aws:eks:us-east-2:637423555983:cluster/EKSClusterForInf-Beta2try1 in /tmp/kubeconfig\n", |
| 17 | + "Successfully set current cluster as: hp-cluster-for-inf-Beta2try1\n" |
| 18 | + ] |
| 19 | + } |
| 20 | + ], |
| 21 | + "source": [ |
| 22 | + "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager\n", |
| 23 | + "\n", |
| 24 | + "HyperPodManager.list_clusters(region='us-east-2')\n", |
| 25 | + "HyperPodManager.set_context('hp-cluster-for-inf-Beta2try1', region='us-east-2')" |
| 26 | + ] |
| 27 | + }, |
| 28 | + { |
| 29 | + "cell_type": "code", |
| 30 | + "execution_count": 3, |
| 31 | + "id": "32f976ba-d113-4e73-9698-2e5d8c7c44f6", |
| 32 | + "metadata": {}, |
| 33 | + "outputs": [], |
| 34 | + "source": [ |
| 35 | + "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker\n", |
| 36 | + "from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint" |
| 37 | + ] |
| 38 | + }, |
| 39 | + { |
| 40 | + "cell_type": "code", |
| 41 | + "execution_count": 4, |
| 42 | + "id": "67f9a718-524a-4a3d-9885-31385c995c18", |
| 43 | + "metadata": {}, |
| 44 | + "outputs": [], |
| 45 | + "source": [ |
| 46 | + "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://tls-bucket-inf1-beta2')\n", |
| 47 | + "\n", |
| 48 | + "model_source_config = ModelSourceConfig(\n", |
| 49 | + " model_source_type='fsx',\n", |
| 50 | + " model_location=\"deepseek-1-5b\",\n", |
| 51 | + " fsx_storage=FsxStorage(\n", |
| 52 | + " file_system_id='fs-0e6a92495c35a81f2'\n", |
| 53 | + " ),\n", |
| 54 | + ")\n", |
| 55 | + "\n", |
| 56 | + "environment_variables = [\n", |
| 57 | + " EnvironmentVariables(name=\"HF_MODEL_ID\", value=\"/opt/ml/model\"),\n", |
| 58 | + " EnvironmentVariables(name=\"SAGEMAKER_PROGRAM\", value=\"inference.py\"),\n", |
| 59 | + " EnvironmentVariables(name=\"SAGEMAKER_SUBMIT_DIRECTORY\", value=\"/opt/ml/model/code\"),\n", |
| 60 | + " EnvironmentVariables(name=\"MODEL_CACHE_ROOT\", value=\"/opt/ml/model\"),\n", |
| 61 | + " EnvironmentVariables(name=\"SAGEMAKER_ENV\", value=\"1\"),\n", |
| 62 | + "]\n", |
| 63 | + "\n", |
| 64 | + "worker = Worker(\n", |
| 65 | + " image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0',\n", |
| 66 | + " model_volume_mount=ModelVolumeMount(\n", |
| 67 | + " name='model-weights',\n", |
| 68 | + " ),\n", |
| 69 | + " model_invocation_port=ModelInvocationPort(container_port=8080),\n", |
| 70 | + " resources=Resources(\n", |
| 71 | + " requests={\"cpu\": \"30000m\", \"nvidia.com/gpu\": 1, \"memory\": \"100Gi\"},\n", |
| 72 | + " limits={\"nvidia.com/gpu\": 1}\n", |
| 73 | + " ),\n", |
| 74 | + " environment_variables=environment_variables,\n", |
| 75 | + ")" |
| 76 | + ] |
| 77 | + }, |
| 78 | + { |
| 79 | + "cell_type": "code", |
| 80 | + "execution_count": 13, |
| 81 | + "id": "ae599413-e275-47c3-9dca-05b80b1bb6e2", |
| 82 | + "metadata": {}, |
| 83 | + "outputs": [], |
| 84 | + "source": [ |
| 85 | + "fsx_endpoint = HPEndpoint(\n", |
| 86 | + " endpoint_name='test-endpoint-name-fsx-zhaoqi-pysdk-06-28-1',\n", |
| 87 | + " instance_type='ml.g5.8xlarge',\n", |
| 88 | + " model_name='deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1',\n", |
| 89 | + " tls_config=tls_config,\n", |
| 90 | + " model_source_config=model_source_config,\n", |
| 91 | + " worker=worker,\n", |
| 92 | + ")" |
| 93 | + ] |
| 94 | + }, |
| 95 | + { |
| 96 | + "cell_type": "code", |
| 97 | + "execution_count": 8, |
| 98 | + "id": "bf04c78e-745d-4530-8342-216cf1fbcfc4", |
| 99 | + "metadata": {}, |
| 100 | + "outputs": [ |
| 101 | + { |
| 102 | + "name": "stdout", |
| 103 | + "output_type": "stream", |
| 104 | + "text": [ |
| 105 | + "Deploying model and its endpoint... The process may take a few minutes.\n" |
| 106 | + ] |
| 107 | + } |
| 108 | + ], |
| 109 | + "source": [ |
| 110 | + "fsx_endpoint.create()" |
| 111 | + ] |
| 112 | + }, |
| 113 | + { |
| 114 | + "cell_type": "code", |
| 115 | + "execution_count": 22, |
| 116 | + "id": "ec2cfae4-a056-465d-823a-75f5b6b9a495", |
| 117 | + "metadata": {}, |
| 118 | + "outputs": [], |
| 119 | + "source": [ |
| 120 | + "fsx_endpoint.refresh()" |
| 121 | + ] |
| 122 | + }, |
| 123 | + { |
| 124 | + "cell_type": "code", |
| 125 | + "execution_count": 23, |
| 126 | + "id": "1ab21f53-36d6-4230-a5ad-1622ebcbe32c", |
| 127 | + "metadata": { |
| 128 | + "scrolled": true |
| 129 | + }, |
| 130 | + "outputs": [ |
| 131 | + { |
| 132 | + "name": "stdout", |
| 133 | + "output_type": "stream", |
| 134 | + "text": [ |
| 135 | + "endpointName: test-endpoint-name-fsx-zhaoqi-pysdk-06-28-1\n", |
| 136 | + "instanceType: ml.g5.8xlarge\n", |
| 137 | + "invocationEndpoint: invocations\n", |
| 138 | + "modelName: deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1\n", |
| 139 | + "modelSourceConfig:\n", |
| 140 | + " fsxStorage:\n", |
| 141 | + " fileSystemId: fs-0e6a92495c35a81f2\n", |
| 142 | + " modelLocation: deepseek-1-5b\n", |
| 143 | + " modelSourceType: fsx\n", |
| 144 | + " prefetchEnabled: false\n", |
| 145 | + "namespace: default\n", |
| 146 | + "replicas: 1\n", |
| 147 | + "status:\n", |
| 148 | + " conditions:\n", |
| 149 | + " - lastTransitionTime: '2025-06-29T01:19:45Z'\n", |
| 150 | + " message: Deployment or SageMaker endpoint registration creation for model is in\n", |
| 151 | + " progress\n", |
| 152 | + " reason: InProgress\n", |
| 153 | + " status: 'True'\n", |
| 154 | + " type: DeploymentInProgress\n", |
| 155 | + " - lastTransitionTime: '2025-06-29T01:24:39Z'\n", |
| 156 | + " message: Deployment and SageMaker endpoint registration for model have been created\n", |
| 157 | + " successfully\n", |
| 158 | + " reason: Success\n", |
| 159 | + " status: 'True'\n", |
| 160 | + " type: DeploymentComplete\n", |
| 161 | + " deploymentStatus:\n", |
| 162 | + " deploymentObjectOverallState: DeploymentComplete\n", |
| 163 | + " lastUpdated: '2025-06-29T01:24:39Z'\n", |
| 164 | + " name: deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1\n", |
| 165 | + " reason: NativeDeploymentObjectFound\n", |
| 166 | + " status:\n", |
| 167 | + " availableReplicas: 1\n", |
| 168 | + " conditions:\n", |
| 169 | + " - lastTransitionTime: '2025-06-29T01:19:55Z'\n", |
| 170 | + " lastUpdateTime: '2025-06-29T01:19:55Z'\n", |
| 171 | + " message: Deployment has minimum availability.\n", |
| 172 | + " reason: MinimumReplicasAvailable\n", |
| 173 | + " status: 'True'\n", |
| 174 | + " type: Available\n", |
| 175 | + " - lastTransitionTime: '2025-06-29T01:19:37Z'\n", |
| 176 | + " lastUpdateTime: '2025-06-29T01:19:55Z'\n", |
| 177 | + " message: ReplicaSet \"deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1-54fb8fbfc8\"\n", |
| 178 | + " has successfully progressed.\n", |
| 179 | + " reason: NewReplicaSetAvailable\n", |
| 180 | + " status: 'True'\n", |
| 181 | + " type: Progressing\n", |
| 182 | + " observedGeneration: 1\n", |
| 183 | + " readyReplicas: 1\n", |
| 184 | + " replicas: 1\n", |
| 185 | + " updatedReplicas: 1\n", |
| 186 | + " endpoints:\n", |
| 187 | + " sagemaker:\n", |
| 188 | + " endpointArn: arn:aws:sagemaker:us-east-2:637423555983:endpoint/test-endpoint-name-fsx-zhaoqi-pysdk-06-28-1\n", |
| 189 | + " state: CreationCompleted\n", |
| 190 | + " replicas: 1\n", |
| 191 | + " selector: app=deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1,deploying-service=hyperpod-inference\n", |
| 192 | + " state: DeploymentComplete\n", |
| 193 | + " tlsCertificate:\n", |
| 194 | + " certificateARN: arn:aws:acm:us-east-2:637423555983:certificate/08c7f68c-e9bb-4069-8a68-b5ee72ea351a\n", |
| 195 | + " certificateDomainNames:\n", |
| 196 | + " - internal-k8s-default-albdeeps-a0dda9b7ad-724026463.us-east-2.elb.amazonaws.com\n", |
| 197 | + " certificateName: deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1-certificate\n", |
| 198 | + " importedCertificates:\n", |
| 199 | + " - arn:aws:acm:us-east-2:637423555983:certificate/08c7f68c-e9bb-4069-8a68-b5ee72ea351a\n", |
| 200 | + " issuerName: deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1-issuer\n", |
| 201 | + " lastCertExpiryTime: '2026-06-29T01:19:44Z'\n", |
| 202 | + " tlsCertificateOutputS3Bucket: tls-bucket-inf1-beta2\n", |
| 203 | + " tlsCertificateS3Keys:\n", |
| 204 | + " - 52c2qrh8q2ts/default-deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1-1751159976/deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1-certificate-1782695984.pem\n", |
| 205 | + "tlsConfig:\n", |
| 206 | + " tlsCertificateOutputS3Uri: s3://tls-bucket-inf1-beta2\n", |
| 207 | + "worker:\n", |
| 208 | + " environmentVariables:\n", |
| 209 | + " - name: HF_MODEL_ID\n", |
| 210 | + " value: /opt/ml/model\n", |
| 211 | + " - name: SAGEMAKER_PROGRAM\n", |
| 212 | + " value: inference.py\n", |
| 213 | + " - name: SAGEMAKER_SUBMIT_DIRECTORY\n", |
| 214 | + " value: /opt/ml/model/code\n", |
| 215 | + " - name: MODEL_CACHE_ROOT\n", |
| 216 | + " value: /opt/ml/model\n", |
| 217 | + " - name: SAGEMAKER_ENV\n", |
| 218 | + " value: '1'\n", |
| 219 | + " image: 763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0\n", |
| 220 | + " modelInvocationPort:\n", |
| 221 | + " containerPort: 8080\n", |
| 222 | + " name: http\n", |
| 223 | + " modelVolumeMount:\n", |
| 224 | + " mountPath: /opt/ml/model\n", |
| 225 | + " name: model-weights\n", |
| 226 | + " resources:\n", |
| 227 | + " limits:\n", |
| 228 | + " nvidia.com/gpu: 1\n", |
| 229 | + " requests:\n", |
| 230 | + " cpu: 30000m\n", |
| 231 | + " memory: 100Gi\n", |
| 232 | + " nvidia.com/gpu: 1\n", |
| 233 | + "\n" |
| 234 | + ] |
| 235 | + } |
| 236 | + ], |
| 237 | + "source": [ |
| 238 | + "# print refreshed config\n", |
| 239 | + "import yaml\n", |
| 240 | + "print(yaml.dump(fsx_endpoint.model_dump(exclude_none=True)))" |
| 241 | + ] |
| 242 | + }, |
| 243 | + { |
| 244 | + "cell_type": "code", |
| 245 | + "execution_count": 18, |
| 246 | + "id": "d606c862-0f03-4cd9-9324-c5960d6be362", |
| 247 | + "metadata": {}, |
| 248 | + "outputs": [ |
| 249 | + { |
| 250 | + "data": { |
| 251 | + "text/plain": [ |
| 252 | + "[HPEndpoint(InitialReplicaCount=None, autoScalingSpec=None, endpointName='test-endpoint-name-fsx-zhaoqi-pysdk', instanceType='ml.g5.8xlarge', invocationEndpoint='invocations', metrics=None, modelName='deepseek15b-fsx-test-zhaoqi-pysdk', modelSourceConfig=ModelSourceConfig(fsxStorage=FsxStorage(dnsName=None, fileSystemId='fs-0e6a92495c35a81f2', mountName=None), modelLocation='deepseek-1-5b', modelSourceType='fsx', prefetchEnabled=False, s3Storage=None), modelVersion=None, replicas=1, tags=None, tlsConfig=TlsConfig(tlsCertificateOutputS3Uri='s3://tls-bucket-inf1-beta2'), worker=Worker(environmentVariables=[EnvironmentVariables(name='HF_MODEL_ID', value='/opt/ml/model', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_PROGRAM', value='inference.py', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_SUBMIT_DIRECTORY', value='/opt/ml/model/code', valueFrom=None), EnvironmentVariables(name='MODEL_CACHE_ROOT', value='/opt/ml/model', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_ENV', value='1', valueFrom=None)], image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0', modelInvocationPort=ModelInvocationPort(containerPort=8080, name='http'), modelVolumeMount=ModelVolumeMount(mountPath='/opt/ml/model', name='model-weights'), resources=Resources(claims=None, limits={'nvidia.com/gpu': 1}, requests={'cpu': '30000m', 'memory': '100Gi', 'nvidia.com/gpu': 1})), namespace='default', status=None),\n", |
| 253 | + " HPEndpoint(InitialReplicaCount=None, autoScalingSpec=None, endpointName='test-endpoint-name-fsx-zhaoqi-pysdk-06-28-1', instanceType='ml.g5.8xlarge', invocationEndpoint='invocations', metrics=None, modelName='deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1', modelSourceConfig=ModelSourceConfig(fsxStorage=FsxStorage(dnsName=None, fileSystemId='fs-0e6a92495c35a81f2', mountName=None), modelLocation='deepseek-1-5b', modelSourceType='fsx', prefetchEnabled=False, s3Storage=None), modelVersion=None, replicas=1, tags=None, tlsConfig=TlsConfig(tlsCertificateOutputS3Uri='s3://tls-bucket-inf1-beta2'), worker=Worker(environmentVariables=[EnvironmentVariables(name='HF_MODEL_ID', value='/opt/ml/model', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_PROGRAM', value='inference.py', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_SUBMIT_DIRECTORY', value='/opt/ml/model/code', valueFrom=None), EnvironmentVariables(name='MODEL_CACHE_ROOT', value='/opt/ml/model', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_ENV', value='1', valueFrom=None)], image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0', modelInvocationPort=ModelInvocationPort(containerPort=8080, name='http'), modelVolumeMount=ModelVolumeMount(mountPath='/opt/ml/model', name='model-weights'), resources=Resources(claims=None, limits={'nvidia.com/gpu': 1}, requests={'cpu': '30000m', 'memory': '100Gi', 'nvidia.com/gpu': 1})), namespace='default', status=None)]" |
| 254 | + ] |
| 255 | + }, |
| 256 | + "execution_count": 18, |
| 257 | + "metadata": {}, |
| 258 | + "output_type": "execute_result" |
| 259 | + } |
| 260 | + ], |
| 261 | + "source": [ |
| 262 | + "# list all endpoints\n", |
| 263 | + "endpoints = HPEndpoint.list()\n", |
| 264 | + "endpoints" |
| 265 | + ] |
| 266 | + }, |
| 267 | + { |
| 268 | + "cell_type": "code", |
| 269 | + "execution_count": 30, |
| 270 | + "id": "7159577e-d9c3-4f95-8c94-0df869b658cb", |
| 271 | + "metadata": {}, |
| 272 | + "outputs": [], |
| 273 | + "source": [ |
| 274 | + "# get endpoint\n", |
| 275 | + "endpoint = HPEndpoint.get(name='deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1')\n", |
| 276 | + "\n", |
| 277 | + "# another way to get endpoint object\n", |
| 278 | + "# endpoint = HPEndpoint.list()[0]" |
| 279 | + ] |
| 280 | + }, |
| 281 | + { |
| 282 | + "cell_type": "code", |
| 283 | + "execution_count": 31, |
| 284 | + "id": "59d65df2-fe0c-4e3c-b584-dfd10e9e7264", |
| 285 | + "metadata": {}, |
| 286 | + "outputs": [ |
| 287 | + { |
| 288 | + "data": { |
| 289 | + "text/plain": [ |
| 290 | + "b'[{\"generated_text\":\"What is the capital of Japan? Or, if more than one city is capital, list the first one.\\\\nThe capital city of France is Paris, and the capital of Russia is Moscow, so each country has its own capital, but Japan only has one.\\\\nWhat is the capital of Japan? Or if there is more than one capital city, just list the first one.\\\\nIn Japan, although some cities (100-200 population) have their own capital, perhaps\\\\nthe typical capital might...,\\\\nWait, the\"}]'" |
| 291 | + ] |
| 292 | + }, |
| 293 | + "execution_count": 31, |
| 294 | + "metadata": {}, |
| 295 | + "output_type": "execute_result" |
| 296 | + } |
| 297 | + ], |
| 298 | + "source": [ |
| 299 | + "# prepare request payload\n", |
| 300 | + "data='{\"inputs\": \"What is the capital of Japan?\"}'\n", |
| 301 | + "\n", |
| 302 | + "# invoke\n", |
| 303 | + "endpoint.invoke(body=data).body.read()" |
| 304 | + ] |
| 305 | + }, |
| 306 | + { |
| 307 | + "cell_type": "code", |
| 308 | + "execution_count": null, |
| 309 | + "id": "d2b3a2ce-6531-4d8d-bd17-e1daabf557cc", |
| 310 | + "metadata": {}, |
| 311 | + "outputs": [], |
| 312 | + "source": [] |
| 313 | + } |
| 314 | + ], |
| 315 | + "metadata": { |
| 316 | + "kernelspec": { |
| 317 | + "display_name": "Python 3 (ipykernel)", |
| 318 | + "language": "python", |
| 319 | + "name": "python3" |
| 320 | + }, |
| 321 | + "language_info": { |
| 322 | + "codemirror_mode": { |
| 323 | + "name": "ipython", |
| 324 | + "version": 3 |
| 325 | + }, |
| 326 | + "file_extension": ".py", |
| 327 | + "mimetype": "text/x-python", |
| 328 | + "name": "python", |
| 329 | + "nbconvert_exporter": "python", |
| 330 | + "pygments_lexer": "ipython3", |
| 331 | + "version": "3.12.3" |
| 332 | + } |
| 333 | + }, |
| 334 | + "nbformat": 4, |
| 335 | + "nbformat_minor": 5 |
| 336 | +} |
0 commit comments