Skip to content

Commit 453f044

Browse files
committed
Fixing notebook build and validating changes
1 parent 07dc30b commit 453f044

File tree

1 file changed

+50
-87
lines changed

1 file changed

+50
-87
lines changed

supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb

Lines changed: 50 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@
99
"This notebook by Quynh Nguyen shows how cross-lingual vector search overcomes language barriers, enabling you to query and retrieve information in any language from both monolingual and multilingual datasets. It accompanies the piece *Lost In Translation? Multilingual Embedding Models Are All You Need* from [Elasticsearch Labs](https://www.elastic.co/search-labs)."
1010
]
1111
},
12+
{
13+
"cell_type": "markdown",
14+
"metadata": {},
15+
"source": [
16+
"## Prerequisites\n",
17+
"\n",
18+
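"Before running this notebook, please ensure you have deployed the `.multilingual-e5-small_linux-x86_64` model within the [*Trained Models* view](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-deploy-model) as covered in the accompanying article. The code cells reference the model through the `MODEL_ID` variable (set to `.multilingual-e5-small` by default), so make sure that value matches the ID of the model you actually deployed."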
19+
]
20+
},
1221
{
1322
"cell_type": "code",
1423
"execution_count": 26,
@@ -102,9 +111,12 @@
102111
"from elasticsearch import Elasticsearch\n",
103112
"\n",
104113
"try:\n",
105-
" es = Elasticsearch(hosts=[elastic_endpoint], api_key=api_key,\n",
106-
" # Disable SSL verification if using localhost\n",
107-
" verify_certs=False)\n",
114+
" es = Elasticsearch(\n",
115+
" hosts=[elastic_endpoint],\n",
116+
" api_key=api_key,\n",
117+
" # Disable SSL verification if using localhost\n",
118+
" verify_certs=False,\n",
119+
" )\n",
108120
" es.ping()\n",
109121
"\n",
110122
" print(\"Successfully connected to Elasticsearch\")\n",
@@ -150,9 +162,11 @@
150162
"outputs": [],
151163
"source": [
152164
"# Helper function to prettify the ES response\n",
153-
"prettify_response = lambda response: json.dumps(response.body.get(\"hits\").get(\"hits\"), indent=2, ensure_ascii=False)\n",
165+
"prettify_response = lambda response: json.dumps(\n",
166+
" response.body.get(\"hits\").get(\"hits\"), indent=2, ensure_ascii=False\n",
167+
")\n",
154168
"# Default E5 model id\n",
155-
"MODEL_ID = \".multilingual-e5-small\"\n"
169+
"MODEL_ID = \".multilingual-e5-small\""
156170
]
157171
},
158172
{
@@ -241,13 +255,6 @@
241255
"print(\"Indexing complete!\")"
242256
]
243257
},
244-
{
245-
"cell_type": "code",
246-
"execution_count": null,
247-
"metadata": {},
248-
"outputs": [],
249-
"source": []
250-
},
251258
{
252259
"cell_type": "markdown",
253260
"metadata": {},
@@ -272,34 +279,23 @@
272279
"pipeline_body = {\n",
273280
" \"description\": \"Pipeline to run the descriptions text_field through our inference text embedding model\",\n",
274281
" \"processors\": [\n",
275-
" {\n",
276-
" \"set\": {\n",
277-
" \"field\": \"temp_desc\",\n",
278-
" \"value\": \"passage: {{description}}\"\n",
279-
" }\n",
280-
" },\n",
282+
" {\"set\": {\"field\": \"temp_desc\", \"value\": \"passage: {{description}}\"}},\n",
281283
" {\n",
282284
" \"inference\": {\n",
283-
" \"field_map\": {\n",
284-
" \"temp_desc\": \"text_field\"\n",
285-
" },\n",
285+
" \"field_map\": {\"temp_desc\": \"text_field\"},\n",
286286
" \"model_id\": MODEL_ID,\n",
287-
" \"target_field\": \"vector_description\"\n",
287+
" \"target_field\": \"vector_description\",\n",
288288
" }\n",
289289
" },\n",
290-
" {\n",
291-
" \"remove\": {\n",
292-
" \"field\": \"temp_desc\"\n",
293-
" }\n",
294-
" }\n",
295-
" ]\n",
290+
" {\"remove\": {\"field\": \"temp_desc\"}},\n",
291+
" ],\n",
296292
"}\n",
297293
"\n",
298294
"try:\n",
299295
" es.ingest.put_pipeline(id=\"vectorize_descriptions\", body=pipeline_body)\n",
300296
" print(\"Pipeline 'vectorize_descriptions' created successfully.\")\n",
301297
"except Exception as e:\n",
302-
" print(f\"Error creating pipeline: {str(e)}\")\n"
298+
" print(f\"Error creating pipeline: {str(e)}\")"
303299
]
304300
},
305301
{
@@ -334,27 +330,17 @@
334330
"index_body = {\n",
335331
" \"mappings\": {\n",
336332
" \"properties\": {\n",
337-
" \"description\": {\n",
338-
" \"type\": \"text\"\n",
339-
" },\n",
340-
" \"en\": {\n",
341-
" \"type\": \"text\"\n",
342-
" },\n",
343-
" \"image_url\": {\n",
344-
" \"type\": \"keyword\"\n",
345-
" },\n",
346-
" \"language\": {\n",
347-
" \"type\": \"keyword\"\n",
348-
" },\n",
333+
" \"description\": {\"type\": \"text\"},\n",
334+
" \"en\": {\"type\": \"text\"},\n",
335+
" \"image_url\": {\"type\": \"keyword\"},\n",
336+
" \"language\": {\"type\": \"keyword\"},\n",
349337
" \"vector_description.predicted_value\": {\n",
350338
" \"type\": \"dense_vector\",\n",
351339
" \"dims\": 384,\n",
352340
" \"index\": True,\n",
353341
" \"similarity\": \"cosine\",\n",
354-
" \"index_options\": {\n",
355-
" \"type\": \"bbq_hnsw\"\n",
356-
" }\n",
357-
" }\n",
342+
" \"index_options\": {\"type\": \"bbq_hnsw\"},\n",
343+
" },\n",
358344
" }\n",
359345
" }\n",
360346
"}\n",
@@ -364,7 +350,7 @@
364350
" es.indices.create(index=\"coco_multilingual\", body=index_body)\n",
365351
" print(\"Index 'coco_multilingual' created successfully.\")\n",
366352
"except Exception as e:\n",
367-
" print(f\"Error creating index: {str(e)}\")\n"
353+
" print(f\"Error creating index: {str(e)}\")"
368354
]
369355
},
370356
{
@@ -389,25 +375,19 @@
389375
}
390376
],
391377
"source": [
392-
"\n",
393378
"reindex_body = {\n",
394-
" \"source\": {\n",
395-
" \"index\": \"coco\"\n",
396-
" },\n",
397-
" \"dest\": {\n",
398-
" \"index\": \"coco_multilingual\",\n",
399-
" \"pipeline\": \"vectorize_descriptions\"\n",
400-
" }\n",
379+
" \"source\": {\"index\": \"coco\"},\n",
380+
" \"dest\": {\"index\": \"coco_multilingual\", \"pipeline\": \"vectorize_descriptions\"},\n",
401381
"}\n",
402382
"\n",
403383
"response = es.reindex(\n",
404384
" body=reindex_body,\n",
405385
"    # Not waiting for completion here because the reindex might take a while\n",
406-
" wait_for_completion=False\n",
386+
" wait_for_completion=False,\n",
407387
")\n",
408388
"\n",
409389
"print(\"Reindex task started. Task info:\")\n",
410-
"print(response)\n"
390+
"print(response)"
411391
]
412392
},
413393
{
@@ -534,31 +514,21 @@
534514
"source": [
535515
"query_body = {\n",
536516
" \"size\": 10,\n",
537-
" \"_source\": [\n",
538-
" \"description\", \"language\", \"en\"\n",
539-
" ],\n",
517+
" \"_source\": [\"description\", \"language\", \"en\"],\n",
540518
" \"knn\": {\n",
541519
" \"field\": \"vector_description.predicted_value\",\n",
542520
" \"k\": 10,\n",
543521
" \"num_candidates\": 100,\n",
544522
" \"query_vector_builder\": {\n",
545-
" \"text_embedding\": {\n",
546-
" \"model_id\": MODEL_ID,\n",
547-
" \"model_text\": \"query: kitty\"\n",
548-
" }\n",
549-
" }\n",
550-
" }\n",
523+
" \"text_embedding\": {\"model_id\": MODEL_ID, \"model_text\": \"query: kitty\"}\n",
524+
" },\n",
525+
" },\n",
551526
"}\n",
552527
"\n",
553528
"response = es.search(index=\"coco_multilingual\", body=query_body)\n",
554529
"print(prettify_response(response))"
555530
]
556531
},
557-
{
558-
"cell_type": "markdown",
559-
"metadata": {},
560-
"source": []
561-
},
562532
{
563533
"cell_type": "code",
564534
"execution_count": 35,
@@ -1076,24 +1046,22 @@
10761046
"source": [
10771047
"query_body = {\n",
10781048
" \"size\": 100,\n",
1079-
" \"_source\": [\n",
1080-
" \"description\", \"language\", \"en\"\n",
1081-
" ],\n",
1049+
" \"_source\": [\"description\", \"language\", \"en\"],\n",
10821050
" \"knn\": {\n",
10831051
" \"field\": \"vector_description.predicted_value\",\n",
10841052
" \"k\": 50,\n",
10851053
" \"num_candidates\": 1000,\n",
10861054
" \"query_vector_builder\": {\n",
10871055
" \"text_embedding\": {\n",
10881056
" \"model_id\": MODEL_ID,\n",
1089-
" \"model_text\": \"query: kitty lying on something\"\n",
1057+
" \"model_text\": \"query: kitty lying on something\",\n",
10901058
" }\n",
1091-
" }\n",
1092-
" }\n",
1059+
" },\n",
1060+
" },\n",
10931061
"}\n",
10941062
"\n",
10951063
"response = es.search(index=\"coco_multilingual\", body=query_body)\n",
1096-
"print(prettify_response(response))\n"
1064+
"print(prettify_response(response))"
10971065
]
10981066
},
10991067
{
@@ -1613,24 +1581,19 @@
16131581
"source": [
16141582
"query_body = {\n",
16151583
" \"size\": 100,\n",
1616-
" \"_source\": [\n",
1617-
" \"description\", \"language\", \"en\"\n",
1618-
" ],\n",
1584+
" \"_source\": [\"description\", \"language\", \"en\"],\n",
16191585
" \"knn\": {\n",
16201586
" \"field\": \"vector_description.predicted_value\",\n",
16211587
" \"k\": 50,\n",
16221588
" \"num_candidates\": 1000,\n",
16231589
" \"query_vector_builder\": {\n",
1624-
" \"text_embedding\": {\n",
1625-
" \"model_id\": MODEL_ID,\n",
1626-
" \"model_text\": \"query: 고양이\"\n",
1627-
" }\n",
1628-
" }\n",
1629-
" }\n",
1590+
" \"text_embedding\": {\"model_id\": MODEL_ID, \"model_text\": \"query: 고양이\"}\n",
1591+
" },\n",
1592+
" },\n",
16301593
"}\n",
16311594
"\n",
16321595
"response = es.search(index=\"coco_multilingual\", body=query_body)\n",
1633-
"print(prettify_response(response))\n"
1596+
"print(prettify_response(response))"
16341597
]
16351598
}
16361599
],

0 commit comments

Comments
 (0)