Fixing notebook build and validating changes

carlyrichmond · carlyrichmond · commit 453f04429c05 · 2025-10-15T16:28:05.000+01:00
diff --git a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb
@@ -9,6 +9,15 @@
     "This notebook by Quynh Nguyen shows how cross-lingual vector search overcomes language barriers, enabling you to query and retrieve information in any language from both single and multilingual datasets. It accompanies the piece *Lost In Translation? Multilingual Embedding Models Are All You Need* from [Elasticsearch Labs](https://www.elastic.co/search-labs)."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "\n",
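+    "Before running this notebook, ensure you have deployed the `.multilingual-e5-small_linux-x86_64` model from the [*Trained Models* view](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-deploy-model), as covered in the accompanying article. The code cells reference the model through `MODEL_ID` (set to `.multilingual-e5-small`), so update `MODEL_ID` if your deployment uses a different ID."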
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 26,
@@ -102,9 +111,12 @@
     "from elasticsearch import Elasticsearch\n",
     "\n",
     "try:\n",
-    "    es = Elasticsearch(hosts=[elastic_endpoint], api_key=api_key,\n",
-    "    # Disable SSL verification if using localhost\n",
-    "    verify_certs=False)\n",
+    "    es = Elasticsearch(\n",
+    "        hosts=[elastic_endpoint],\n",
+    "        api_key=api_key,\n",
+    "        # Disable SSL verification if using localhost\n",
+    "        verify_certs=False,\n",
+    "    )\n",
     "    es.ping()\n",
     "\n",
     "    print(\"Successfully connected to Elasticsearch\")\n",
@@ -150,9 +162,11 @@
    "outputs": [],
    "source": [
     "# Helper function to prettify the ES response\n",
-    "prettify_response = lambda response: json.dumps(response.body.get(\"hits\").get(\"hits\"), indent=2, ensure_ascii=False)\n",
+    "prettify_response = lambda response: json.dumps(\n",
+    "    response.body.get(\"hits\").get(\"hits\"), indent=2, ensure_ascii=False\n",
+    ")\n",
     "# Default E5 model id\n",
-    "MODEL_ID = \".multilingual-e5-small\"\n"
+    "MODEL_ID = \".multilingual-e5-small\""
    ]
   },
   {
@@ -241,13 +255,6 @@
     "print(\"Indexing complete!\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -272,34 +279,23 @@
     "pipeline_body = {\n",
     "    \"description\": \"Pipeline to run the descriptions text_field through our inference text embedding model\",\n",
     "    \"processors\": [\n",
-    "        {\n",
-    "            \"set\": {\n",
-    "                \"field\": \"temp_desc\",\n",
-    "                \"value\": \"passage: {{description}}\"\n",
-    "            }\n",
-    "        },\n",
+    "        {\"set\": {\"field\": \"temp_desc\", \"value\": \"passage: {{description}}\"}},\n",
     "        {\n",
     "            \"inference\": {\n",
-    "                \"field_map\": {\n",
-    "                    \"temp_desc\": \"text_field\"\n",
-    "                },\n",
+    "                \"field_map\": {\"temp_desc\": \"text_field\"},\n",
     "                \"model_id\": MODEL_ID,\n",
-    "                \"target_field\": \"vector_description\"\n",
+    "                \"target_field\": \"vector_description\",\n",
     "            }\n",
     "        },\n",
-    "        {\n",
-    "            \"remove\": {\n",
-    "                \"field\": \"temp_desc\"\n",
-    "            }\n",
-    "        }\n",
-    "    ]\n",
+    "        {\"remove\": {\"field\": \"temp_desc\"}},\n",
+    "    ],\n",
     "}\n",
     "\n",
     "try:\n",
     "    es.ingest.put_pipeline(id=\"vectorize_descriptions\", body=pipeline_body)\n",
     "    print(\"Pipeline 'vectorize_descriptions' created successfully.\")\n",
     "except Exception as e:\n",
-    "    print(f\"Error creating pipeline: {str(e)}\")\n"
+    "    print(f\"Error creating pipeline: {str(e)}\")"
    ]
   },
   {
@@ -334,27 +330,17 @@
     "index_body = {\n",
     "    \"mappings\": {\n",
     "        \"properties\": {\n",
-    "            \"description\": {\n",
-    "                \"type\": \"text\"\n",
-    "            },\n",
-    "            \"en\": {\n",
-    "                \"type\": \"text\"\n",
-    "            },\n",
-    "            \"image_url\": {\n",
-    "                \"type\": \"keyword\"\n",
-    "            },\n",
-    "            \"language\": {\n",
-    "                \"type\": \"keyword\"\n",
-    "            },\n",
+    "            \"description\": {\"type\": \"text\"},\n",
+    "            \"en\": {\"type\": \"text\"},\n",
+    "            \"image_url\": {\"type\": \"keyword\"},\n",
+    "            \"language\": {\"type\": \"keyword\"},\n",
     "            \"vector_description.predicted_value\": {\n",
     "                \"type\": \"dense_vector\",\n",
     "                \"dims\": 384,\n",
     "                \"index\": True,\n",
     "                \"similarity\": \"cosine\",\n",
-    "                \"index_options\": {\n",
-    "                    \"type\": \"bbq_hnsw\"\n",
-    "                }\n",
-    "            }\n",
+    "                \"index_options\": {\"type\": \"bbq_hnsw\"},\n",
+    "            },\n",
     "        }\n",
     "    }\n",
     "}\n",
@@ -364,7 +350,7 @@
     "    es.indices.create(index=\"coco_multilingual\", body=index_body)\n",
     "    print(\"Index 'coco_multilingual' created successfully.\")\n",
     "except Exception as e:\n",
-    "    print(f\"Error creating index: {str(e)}\")\n"
+    "    print(f\"Error creating index: {str(e)}\")"
    ]
   },
   {
@@ -389,25 +375,19 @@
     }
    ],
    "source": [
-    "\n",
     "reindex_body = {\n",
-    "    \"source\": {\n",
-    "        \"index\": \"coco\"\n",
-    "    },\n",
-    "    \"dest\": {\n",
-    "        \"index\": \"coco_multilingual\",\n",
-    "        \"pipeline\": \"vectorize_descriptions\"\n",
-    "    }\n",
+    "    \"source\": {\"index\": \"coco\"},\n",
+    "    \"dest\": {\"index\": \"coco_multilingual\", \"pipeline\": \"vectorize_descriptions\"},\n",
     "}\n",
     "\n",
     "response = es.reindex(\n",
     "    body=reindex_body,\n",
     "    # Not waiting for completion here cause this process might take a while\n",
-    "    wait_for_completion=False\n",
+    "    wait_for_completion=False,\n",
     ")\n",
     "\n",
     "print(\"Reindex task started. Task info:\")\n",
-    "print(response)\n"
+    "print(response)"
    ]
   },
   {
@@ -534,31 +514,21 @@
    "source": [
     "query_body = {\n",
     "    \"size\": 10,\n",
-    "    \"_source\": [\n",
-    "        \"description\", \"language\", \"en\"\n",
-    "    ],\n",
+    "    \"_source\": [\"description\", \"language\", \"en\"],\n",
     "    \"knn\": {\n",
     "        \"field\": \"vector_description.predicted_value\",\n",
     "        \"k\": 10,\n",
     "        \"num_candidates\": 100,\n",
     "        \"query_vector_builder\": {\n",
-    "            \"text_embedding\": {\n",
-    "                \"model_id\": MODEL_ID,\n",
-    "                \"model_text\": \"query: kitty\"\n",
-    "            }\n",
-    "        }\n",
-    "    }\n",
+    "            \"text_embedding\": {\"model_id\": MODEL_ID, \"model_text\": \"query: kitty\"}\n",
+    "        },\n",
+    "    },\n",
     "}\n",
     "\n",
     "response = es.search(index=\"coco_multilingual\", body=query_body)\n",
     "print(prettify_response(response))"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": 35,
@@ -1076,24 +1046,22 @@
    "source": [
     "query_body = {\n",
     "    \"size\": 100,\n",
-    "    \"_source\": [\n",
-    "        \"description\", \"language\", \"en\"\n",
-    "    ],\n",
+    "    \"_source\": [\"description\", \"language\", \"en\"],\n",
     "    \"knn\": {\n",
     "        \"field\": \"vector_description.predicted_value\",\n",
     "        \"k\": 50,\n",
     "        \"num_candidates\": 1000,\n",
     "        \"query_vector_builder\": {\n",
     "            \"text_embedding\": {\n",
     "                \"model_id\": MODEL_ID,\n",
-    "                \"model_text\": \"query: kitty lying on something\"\n",
+    "                \"model_text\": \"query: kitty lying on something\",\n",
     "            }\n",
-    "        }\n",
-    "    }\n",
+    "        },\n",
+    "    },\n",
     "}\n",
     "\n",
     "response = es.search(index=\"coco_multilingual\", body=query_body)\n",
-    "print(prettify_response(response))\n"
+    "print(prettify_response(response))"
    ]
   },
   {
@@ -1613,24 +1581,19 @@
    "source": [
     "query_body = {\n",
     "    \"size\": 100,\n",
-    "    \"_source\": [\n",
-    "        \"description\", \"language\", \"en\"\n",
-    "    ],\n",
+    "    \"_source\": [\"description\", \"language\", \"en\"],\n",
     "    \"knn\": {\n",
     "        \"field\": \"vector_description.predicted_value\",\n",
     "        \"k\": 50,\n",
     "        \"num_candidates\": 1000,\n",
     "        \"query_vector_builder\": {\n",
-    "            \"text_embedding\": {\n",
-    "                \"model_id\": MODEL_ID,\n",
-    "                \"model_text\": \"query: 고양이\"\n",
-    "            }\n",
-    "        }\n",
-    "    }\n",
+    "            \"text_embedding\": {\"model_id\": MODEL_ID, \"model_text\": \"query: 고양이\"}\n",
+    "        },\n",
+    "    },\n",
     "}\n",
     "\n",
     "response = es.search(index=\"coco_multilingual\", body=query_body)\n",
-    "print(prettify_response(response))\n"
+    "print(prettify_response(response))"
    ]
   }
  ],