|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 1, |
| 6 | + "id": "f8caf125-ab9d-4655-a59e-edbaeed9e919", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [ |
| 9 | + { |
| 10 | + "name": "stdout", |
| 11 | + "output_type": "stream", |
| 12 | + "text": [ |
| 13 | + "Orchestrator Cluster Name\n", |
| 14 | + "-------------- ----------------------------\n", |
| 15 | + "EKS hp-cluster-for-inf-Beta2try1\n", |
| 16 | + "Updated context arn:aws:eks:us-east-2:637423555983:cluster/EKSClusterForInf-Beta2try1 in /tmp/kubeconfig\n", |
| 17 | + "Successfully set current cluster as: hp-cluster-for-inf-Beta2try1\n" |
| 18 | + ] |
| 19 | + } |
| 20 | + ], |
| 21 | + "source": [ |
| 22 | + "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager\n", |
| 23 | + "\n", |
| 24 | + "HyperPodManager.list_clusters(region='us-east-2')\n", |
| 25 | + "HyperPodManager.set_context('hp-cluster-for-inf-Beta2try1', region='us-east-2')" |
| 26 | + ] |
| 27 | + }, |
| 28 | + { |
| 29 | + "cell_type": "code", |
| 30 | + "execution_count": 3, |
| 31 | + "id": "32f976ba-d113-4e73-9698-2e5d8c7c44f6", |
| 32 | + "metadata": {}, |
| 33 | + "outputs": [], |
| 34 | + "source": [ |
| 35 | + "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker\n", |
| 36 | + "from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint" |
| 37 | + ] |
| 38 | + }, |
| 39 | + { |
| 40 | + "cell_type": "code", |
| 41 | + "execution_count": 4, |
| 42 | + "id": "67f9a718-524a-4a3d-9885-31385c995c18", |
| 43 | + "metadata": {}, |
| 44 | + "outputs": [], |
| 45 | + "source": [ |
| 46 | + "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://tls-bucket-inf1-beta2')\n", |
| 47 | + "\n", |
| 48 | + "model_source_config = ModelSourceConfig(\n", |
| 49 | + " model_source_type='fsx',\n", |
| 50 | + " model_location=\"deepseek-1-5b\",\n", |
| 51 | + " fsx_storage=FsxStorage(\n", |
| 52 | + " file_system_id='fs-0e6a92495c35a81f2'\n", |
| 53 | + " ),\n", |
| 54 | + ")\n", |
| 55 | + "\n", |
| 56 | + "environment_variables = [\n", |
| 57 | + " EnvironmentVariables(name=\"HF_MODEL_ID\", value=\"/opt/ml/model\"),\n", |
| 58 | + " EnvironmentVariables(name=\"SAGEMAKER_PROGRAM\", value=\"inference.py\"),\n", |
| 59 | + " EnvironmentVariables(name=\"SAGEMAKER_SUBMIT_DIRECTORY\", value=\"/opt/ml/model/code\"),\n", |
| 60 | + " EnvironmentVariables(name=\"MODEL_CACHE_ROOT\", value=\"/opt/ml/model\"),\n", |
| 61 | + " EnvironmentVariables(name=\"SAGEMAKER_ENV\", value=\"1\"),\n", |
| 62 | + "]\n", |
| 63 | + "\n", |
| 64 | + "worker = Worker(\n", |
| 65 | + " image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0',\n", |
| 66 | + " model_volume_mount=ModelVolumeMount(\n", |
| 67 | + " name='model-weights',\n", |
| 68 | + " ),\n", |
| 69 | + " model_invocation_port=ModelInvocationPort(container_port=8080),\n", |
| 70 | + " resources=Resources(\n", |
| 71 | + " requests={\"cpu\": \"30000m\", \"nvidia.com/gpu\": 1, \"memory\": \"100Gi\"},\n", |
| 72 | + " limits={\"nvidia.com/gpu\": 1}\n", |
| 73 | + " ),\n", |
| 74 | + " environment_variables=environment_variables,\n", |
| 75 | + ")" |
| 76 | + ] |
| 77 | + }, |
| 78 | + { |
| 79 | + "cell_type": "code", |
| 80 | + "execution_count": 13, |
| 81 | + "id": "ae599413-e275-47c3-9dca-05b80b1bb6e2", |
| 82 | + "metadata": {}, |
| 83 | + "outputs": [], |
| 84 | + "source": [ |
| 85 | + "fsx_endpoint = HPEndpoint(\n", |
| 86 | + " endpoint_name='test-endpoint-name-fsx-zhaoqi-pysdk-06-28-1',\n", |
| 87 | + " instance_type='ml.g5.8xlarge',\n", |
| 88 | + " model_name='deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1',\n", |
| 89 | + " tls_config=tls_config,\n", |
| 90 | + " model_source_config=model_source_config,\n", |
| 91 | + " worker=worker,\n", |
| 92 | + ")" |
| 93 | + ] |
| 94 | + }, |
| 95 | + { |
| 96 | + "cell_type": "code", |
| 97 | + "execution_count": 8, |
| 98 | + "id": "bf04c78e-745d-4530-8342-216cf1fbcfc4", |
| 99 | + "metadata": {}, |
| 100 | + "outputs": [ |
| 101 | + { |
| 102 | + "name": "stdout", |
| 103 | + "output_type": "stream", |
| 104 | + "text": [ |
| 105 | + "Deploying model and its endpoint... The process may take a few minutes.\n" |
| 106 | + ] |
| 107 | + } |
| 108 | + ], |
| 109 | + "source": [ |
| 110 | + "fsx_endpoint.create()" |
| 111 | + ] |
| 112 | + }, |
| 113 | + { |
| 114 | + "cell_type": "code", |
| 115 | + "execution_count": 22, |
| 116 | + "id": "ec2cfae4-a056-465d-823a-75f5b6b9a495", |
| 117 | + "metadata": {}, |
| 118 | + "outputs": [], |
| 119 | + "source": [ |
| 120 | + "fsx_endpoint.refresh()" |
| 121 | + ] |
| 122 | + }, |
| 123 | + { |
| 124 | + "cell_type": "code", |
| 125 | + "execution_count": 23, |
| 126 | + "id": "1ab21f53-36d6-4230-a5ad-1622ebcbe32c", |
| 127 | + "metadata": { |
| 128 | + "scrolled": true |
| 129 | + }, |
| 130 | + "outputs": [ |
| 131 | + { |
| 132 | + "name": "stdout", |
| 133 | + "output_type": "stream", |
| 134 | + "text": [ |
| 135 | + "endpointName: test-endpoint-name-fsx-zhaoqi-pysdk-06-28-1\n", |
| 136 | + "instanceType: ml.g5.8xlarge\n", |
| 137 | + "invocationEndpoint: invocations\n", |
| 138 | + "modelName: deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1\n", |
| 139 | + "modelSourceConfig:\n", |
| 140 | + " fsxStorage:\n", |
| 141 | + " fileSystemId: fs-0e6a92495c35a81f2\n", |
| 142 | + " modelLocation: deepseek-1-5b\n", |
| 143 | + " modelSourceType: fsx\n", |
| 144 | + " prefetchEnabled: false\n", |
| 145 | + "namespace: default\n", |
| 146 | + "replicas: 1\n", |
| 147 | + "status:\n", |
| 148 | + " conditions:\n", |
| 149 | + " - lastTransitionTime: '2025-06-29T01:19:45Z'\n", |
| 150 | + " message: Deployment or SageMaker endpoint registration creation for model is in\n", |
| 151 | + " progress\n", |
| 152 | + " reason: InProgress\n", |
| 153 | + " status: 'True'\n", |
| 154 | + " type: DeploymentInProgress\n", |
| 155 | + " - lastTransitionTime: '2025-06-29T01:24:39Z'\n", |
| 156 | + " message: Deployment and SageMaker endpoint registration for model have been created\n", |
| 157 | + " successfully\n", |
| 158 | + " reason: Success\n", |
| 159 | + " status: 'True'\n", |
| 160 | + " type: DeploymentComplete\n", |
| 161 | + " deploymentStatus:\n", |
| 162 | + " deploymentObjectOverallState: DeploymentComplete\n", |
| 163 | + " lastUpdated: '2025-06-29T01:24:39Z'\n", |
| 164 | + " name: deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1\n", |
| 165 | + " reason: NativeDeploymentObjectFound\n", |
| 166 | + " status:\n", |
| 167 | + " availableReplicas: 1\n", |
| 168 | + " conditions:\n", |
| 169 | + " - lastTransitionTime: '2025-06-29T01:19:55Z'\n", |
| 170 | + " lastUpdateTime: '2025-06-29T01:19:55Z'\n", |
| 171 | + " message: Deployment has minimum availability.\n", |
| 172 | + " reason: MinimumReplicasAvailable\n", |
| 173 | + " status: 'True'\n", |
| 174 | + " type: Available\n", |
| 175 | + " - lastTransitionTime: '2025-06-29T01:19:37Z'\n", |
| 176 | + " lastUpdateTime: '2025-06-29T01:19:55Z'\n", |
| 177 | + " message: ReplicaSet \"deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1-54fb8fbfc8\"\n", |
| 178 | + " has successfully progressed.\n", |
| 179 | + " reason: NewReplicaSetAvailable\n", |
| 180 | + " status: 'True'\n", |
| 181 | + " type: Progressing\n", |
| 182 | + " observedGeneration: 1\n", |
| 183 | + " readyReplicas: 1\n", |
| 184 | + " replicas: 1\n", |
| 185 | + " updatedReplicas: 1\n", |
| 186 | + " endpoints:\n", |
| 187 | + " sagemaker:\n", |
| 188 | + " endpointArn: arn:aws:sagemaker:us-east-2:637423555983:endpoint/test-endpoint-name-fsx-zhaoqi-pysdk-06-28-1\n", |
| 189 | + " state: CreationCompleted\n", |
| 190 | + " replicas: 1\n", |
| 191 | + " selector: app=deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1,deploying-service=hyperpod-inference\n", |
| 192 | + " state: DeploymentComplete\n", |
| 193 | + " tlsCertificate:\n", |
| 194 | + " certificateARN: arn:aws:acm:us-east-2:637423555983:certificate/08c7f68c-e9bb-4069-8a68-b5ee72ea351a\n", |
| 195 | + " certificateDomainNames:\n", |
| 196 | + " - internal-k8s-default-albdeeps-a0dda9b7ad-724026463.us-east-2.elb.amazonaws.com\n", |
| 197 | + " certificateName: deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1-certificate\n", |
| 198 | + " importedCertificates:\n", |
| 199 | + " - arn:aws:acm:us-east-2:637423555983:certificate/08c7f68c-e9bb-4069-8a68-b5ee72ea351a\n", |
| 200 | + " issuerName: deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1-issuer\n", |
| 201 | + " lastCertExpiryTime: '2026-06-29T01:19:44Z'\n", |
| 202 | + " tlsCertificateOutputS3Bucket: tls-bucket-inf1-beta2\n", |
| 203 | + " tlsCertificateS3Keys:\n", |
| 204 | + " - 52c2qrh8q2ts/default-deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1-1751159976/deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1-certificate-1782695984.pem\n", |
| 205 | + "tlsConfig:\n", |
| 206 | + " tlsCertificateOutputS3Uri: s3://tls-bucket-inf1-beta2\n", |
| 207 | + "worker:\n", |
| 208 | + " environmentVariables:\n", |
| 209 | + " - name: HF_MODEL_ID\n", |
| 210 | + " value: /opt/ml/model\n", |
| 211 | + " - name: SAGEMAKER_PROGRAM\n", |
| 212 | + " value: inference.py\n", |
| 213 | + " - name: SAGEMAKER_SUBMIT_DIRECTORY\n", |
| 214 | + " value: /opt/ml/model/code\n", |
| 215 | + " - name: MODEL_CACHE_ROOT\n", |
| 216 | + " value: /opt/ml/model\n", |
| 217 | + " - name: SAGEMAKER_ENV\n", |
| 218 | + " value: '1'\n", |
| 219 | + " image: 763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0\n", |
| 220 | + " modelInvocationPort:\n", |
| 221 | + " containerPort: 8080\n", |
| 222 | + " name: http\n", |
| 223 | + " modelVolumeMount:\n", |
| 224 | + " mountPath: /opt/ml/model\n", |
| 225 | + " name: model-weights\n", |
| 226 | + " resources:\n", |
| 227 | + " limits:\n", |
| 228 | + " nvidia.com/gpu: 1\n", |
| 229 | + " requests:\n", |
| 230 | + " cpu: 30000m\n", |
| 231 | + " memory: 100Gi\n", |
| 232 | + " nvidia.com/gpu: 1\n", |
| 233 | + "\n" |
| 234 | + ] |
| 235 | + } |
| 236 | + ], |
| 237 | + "source": [ |
| 238 | + "# print refreshed config\n", |
| 239 | + "import yaml\n", |
| 240 | + "print(yaml.dump(fsx_endpoint.model_dump(exclude_none=True)))" |
| 241 | + ] |
| 242 | + }, |
| 243 | + { |
| 244 | + "cell_type": "code", |
| 245 | + "execution_count": 18, |
| 246 | + "id": "d606c862-0f03-4cd9-9324-c5960d6be362", |
| 247 | + "metadata": {}, |
| 248 | + "outputs": [ |
| 249 | + { |
| 250 | + "data": { |
| 251 | + "text/plain": [ |
| 252 | + "[HPEndpoint(InitialReplicaCount=None, autoScalingSpec=None, endpointName='test-endpoint-name-fsx-zhaoqi-pysdk', instanceType='ml.g5.8xlarge', invocationEndpoint='invocations', metrics=None, modelName='deepseek15b-fsx-test-zhaoqi-pysdk', modelSourceConfig=ModelSourceConfig(fsxStorage=FsxStorage(dnsName=None, fileSystemId='fs-0e6a92495c35a81f2', mountName=None), modelLocation='deepseek-1-5b', modelSourceType='fsx', prefetchEnabled=False, s3Storage=None), modelVersion=None, replicas=1, tags=None, tlsConfig=TlsConfig(tlsCertificateOutputS3Uri='s3://tls-bucket-inf1-beta2'), worker=Worker(environmentVariables=[EnvironmentVariables(name='HF_MODEL_ID', value='/opt/ml/model', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_PROGRAM', value='inference.py', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_SUBMIT_DIRECTORY', value='/opt/ml/model/code', valueFrom=None), EnvironmentVariables(name='MODEL_CACHE_ROOT', value='/opt/ml/model', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_ENV', value='1', valueFrom=None)], image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0', modelInvocationPort=ModelInvocationPort(containerPort=8080, name='http'), modelVolumeMount=ModelVolumeMount(mountPath='/opt/ml/model', name='model-weights'), resources=Resources(claims=None, limits={'nvidia.com/gpu': 1}, requests={'cpu': '30000m', 'memory': '100Gi', 'nvidia.com/gpu': 1})), namespace='default', status=None),\n", |
| 253 | + " HPEndpoint(InitialReplicaCount=None, autoScalingSpec=None, endpointName='test-endpoint-name-fsx-zhaoqi-pysdk-06-28-1', instanceType='ml.g5.8xlarge', invocationEndpoint='invocations', metrics=None, modelName='deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1', modelSourceConfig=ModelSourceConfig(fsxStorage=FsxStorage(dnsName=None, fileSystemId='fs-0e6a92495c35a81f2', mountName=None), modelLocation='deepseek-1-5b', modelSourceType='fsx', prefetchEnabled=False, s3Storage=None), modelVersion=None, replicas=1, tags=None, tlsConfig=TlsConfig(tlsCertificateOutputS3Uri='s3://tls-bucket-inf1-beta2'), worker=Worker(environmentVariables=[EnvironmentVariables(name='HF_MODEL_ID', value='/opt/ml/model', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_PROGRAM', value='inference.py', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_SUBMIT_DIRECTORY', value='/opt/ml/model/code', valueFrom=None), EnvironmentVariables(name='MODEL_CACHE_ROOT', value='/opt/ml/model', valueFrom=None), EnvironmentVariables(name='SAGEMAKER_ENV', value='1', valueFrom=None)], image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0', modelInvocationPort=ModelInvocationPort(containerPort=8080, name='http'), modelVolumeMount=ModelVolumeMount(mountPath='/opt/ml/model', name='model-weights'), resources=Resources(claims=None, limits={'nvidia.com/gpu': 1}, requests={'cpu': '30000m', 'memory': '100Gi', 'nvidia.com/gpu': 1})), namespace='default', status=None)]" |
| 254 | + ] |
| 255 | + }, |
| 256 | + "execution_count": 18, |
| 257 | + "metadata": {}, |
| 258 | + "output_type": "execute_result" |
| 259 | + } |
| 260 | + ], |
| 261 | + "source": [ |
| 262 | + "# list all endpoints\n", |
| 263 | + "endpoints = HPEndpoint.list()\n", |
| 264 | + "endpoints" |
| 265 | + ] |
| 266 | + }, |
| 267 | + { |
| 268 | + "cell_type": "code", |
| 269 | + "execution_count": 30, |
| 270 | + "id": "7159577e-d9c3-4f95-8c94-0df869b658cb", |
| 271 | + "metadata": {}, |
| 272 | + "outputs": [], |
| 273 | + "source": [ |
| 274 | + "# get endpoint\n", |
| 275 | + "endpoint = HPEndpoint.get(name='deepseek15b-fsx-test-zhaoqi-pysdk-06-28-1')\n", |
| 276 | + "\n", |
| 277 | + "# another way to get endpoint object\n", |
| 278 | + "# endpoint = HPEndpoint.list()[0]" |
| 279 | + ] |
| 280 | + }, |
| 281 | + { |
| 282 | + "cell_type": "code", |
| 283 | + "execution_count": 31, |
| 284 | + "id": "59d65df2-fe0c-4e3c-b584-dfd10e9e7264", |
| 285 | + "metadata": {}, |
| 286 | + "outputs": [ |
| 287 | + { |
| 288 | + "data": { |
| 289 | + "text/plain": [ |
| 290 | + "b'[{\"generated_text\":\"What is the capital of Japan? Or, if more than one city is capital, list the first one.\\\\nThe capital city of France is Paris, and the capital of Russia is Moscow, so each country has its own capital, but Japan only has one.\\\\nWhat is the capital of Japan? Or if there is more than one capital city, just list the first one.\\\\nIn Japan, although some cities (100-200 population) have their own capital, perhaps\\\\nthe typical capital might...,\\\\nWait, the\"}]'" |
| 291 | + ] |
| 292 | + }, |
| 293 | + "execution_count": 31, |
| 294 | + "metadata": {}, |
| 295 | + "output_type": "execute_result" |
| 296 | + } |
| 297 | + ], |
| 298 | + "source": [ |
| 299 | + "# prepare request payload\n", |
| 300 | + "data='{\"inputs\": \"What is the capital of Japan?\"}'\n", |
| 301 | + "\n", |
| 302 | + "# invoke\n", |
| 303 | + "endpoint.invoke(body=data).body.read()" |
| 304 | + ] |
| 305 | + }, |
| 306 | + { |
| 307 | + "cell_type": "code", |
| 308 | + "execution_count": null, |
| 309 | + "id": "d2b3a2ce-6531-4d8d-bd17-e1daabf557cc", |
| 310 | + "metadata": {}, |
| 311 | + "outputs": [], |
| 312 | + "source": [] |
| 313 | + } |
| 314 | + ], |
| 315 | + "metadata": { |
| 316 | + "kernelspec": { |
| 317 | + "display_name": "Python 3 (ipykernel)", |
| 318 | + "language": "python", |
| 319 | + "name": "python3" |
| 320 | + }, |
| 321 | + "language_info": { |
| 322 | + "codemirror_mode": { |
| 323 | + "name": "ipython", |
| 324 | + "version": 3 |
| 325 | + }, |
| 326 | + "file_extension": ".py", |
| 327 | + "mimetype": "text/x-python", |
| 328 | + "name": "python", |
| 329 | + "nbconvert_exporter": "python", |
| 330 | + "pygments_lexer": "ipython3", |
| 331 | + "version": "3.12.3" |
| 332 | + } |
| 333 | + }, |
| 334 | + "nbformat": 4, |
| 335 | + "nbformat_minor": 5 |
| 336 | +} |
0 commit comments