|
9 | 9 | "This notebook by Quynh Nguyen shows how cross-lingual vector search overcomes language barriers, enabling you to query and retrieve information in any language from both single and multilingual datasets. It accompanies the piece *Lost In Translation? Multilingual Embedding Models Are All You Need* from [Elasticsearch Labs](https://www.elastic.co/search-labs)." |
10 | 10 | ] |
11 | 11 | }, |
| 12 | + { |
| 13 | + "cell_type": "markdown", |
| 14 | + "metadata": {}, |
| 15 | + "source": [ |
| 16 | + "## Prerequisites\n", |
| 17 | + "\n", |
| 18 | + "Before running this notebook, make sure the `.multilingual-e5-small_linux-x86_64` model is deployed in the [*Trained Models* view](https://www.elastic.co/docs/explore-analyze/machine-learning/nlp/ml-nlp-deploy-model), as covered in the accompanying article." |
| 19 | + ] |
| 20 | + }, |
12 | 21 | { |
13 | 22 | "cell_type": "code", |
14 | 23 | "execution_count": 26, |
|
102 | 111 | "from elasticsearch import Elasticsearch\n", |
103 | 112 | "\n", |
104 | 113 | "try:\n", |
105 | | - " es = Elasticsearch(hosts=[elastic_endpoint], api_key=api_key,\n", |
106 | | - " # Disable SSL verification if using localhost\n", |
107 | | - " verify_certs=False)\n", |
| 114 | + " es = Elasticsearch(\n", |
| 115 | + " hosts=[elastic_endpoint],\n", |
| 116 | + " api_key=api_key,\n", |
| 117 | + " # Disable SSL verification if using localhost\n", |
| 118 | + " verify_certs=False,\n", |
| 119 | + " )\n", |
108 | 120 | " es.ping()\n", |
109 | 121 | "\n", |
110 | 122 | " print(\"Successfully connected to Elasticsearch\")\n", |
|
150 | 162 | "outputs": [], |
151 | 163 | "source": [ |
152 | 164 | "# Helper function to prettify the ES response\n", |
153 | | - "prettify_response = lambda response: json.dumps(response.body.get(\"hits\").get(\"hits\"), indent=2, ensure_ascii=False)\n", |
| 165 | + "prettify_response = lambda response: json.dumps(\n", |
| 166 | + " response.body.get(\"hits\").get(\"hits\"), indent=2, ensure_ascii=False\n", |
| 167 | + ")\n", |
154 | 168 | "# Default E5 model id\n", |
155 | | - "MODEL_ID = \".multilingual-e5-small\"\n" |
| 169 | + "MODEL_ID = \".multilingual-e5-small\"" |
156 | 170 | ] |
157 | 171 | }, |
158 | 172 | { |
|
241 | 255 | "print(\"Indexing complete!\")" |
242 | 256 | ] |
243 | 257 | }, |
244 | | - { |
245 | | - "cell_type": "code", |
246 | | - "execution_count": null, |
247 | | - "metadata": {}, |
248 | | - "outputs": [], |
249 | | - "source": [] |
250 | | - }, |
251 | 258 | { |
252 | 259 | "cell_type": "markdown", |
253 | 260 | "metadata": {}, |
|
272 | 279 | "pipeline_body = {\n", |
273 | 280 | " \"description\": \"Pipeline to run the descriptions text_field through our inference text embedding model\",\n", |
274 | 281 | " \"processors\": [\n", |
275 | | - " {\n", |
276 | | - " \"set\": {\n", |
277 | | - " \"field\": \"temp_desc\",\n", |
278 | | - " \"value\": \"passage: {{description}}\"\n", |
279 | | - " }\n", |
280 | | - " },\n", |
| 282 | + " {\"set\": {\"field\": \"temp_desc\", \"value\": \"passage: {{description}}\"}},\n", |
281 | 283 | " {\n", |
282 | 284 | " \"inference\": {\n", |
283 | | - " \"field_map\": {\n", |
284 | | - " \"temp_desc\": \"text_field\"\n", |
285 | | - " },\n", |
| 285 | + " \"field_map\": {\"temp_desc\": \"text_field\"},\n", |
286 | 286 | " \"model_id\": MODEL_ID,\n", |
287 | | - " \"target_field\": \"vector_description\"\n", |
| 287 | + " \"target_field\": \"vector_description\",\n", |
288 | 288 | " }\n", |
289 | 289 | " },\n", |
290 | | - " {\n", |
291 | | - " \"remove\": {\n", |
292 | | - " \"field\": \"temp_desc\"\n", |
293 | | - " }\n", |
294 | | - " }\n", |
295 | | - " ]\n", |
| 290 | + " {\"remove\": {\"field\": \"temp_desc\"}},\n", |
| 291 | + " ],\n", |
296 | 292 | "}\n", |
297 | 293 | "\n", |
298 | 294 | "try:\n", |
299 | 295 | " es.ingest.put_pipeline(id=\"vectorize_descriptions\", body=pipeline_body)\n", |
300 | 296 | " print(\"Pipeline 'vectorize_descriptions' created successfully.\")\n", |
301 | 297 | "except Exception as e:\n", |
302 | | - " print(f\"Error creating pipeline: {str(e)}\")\n" |
| 298 | + " print(f\"Error creating pipeline: {str(e)}\")" |
303 | 299 | ] |
304 | 300 | }, |
305 | 301 | { |
|
334 | 330 | "index_body = {\n", |
335 | 331 | " \"mappings\": {\n", |
336 | 332 | " \"properties\": {\n", |
337 | | - " \"description\": {\n", |
338 | | - " \"type\": \"text\"\n", |
339 | | - " },\n", |
340 | | - " \"en\": {\n", |
341 | | - " \"type\": \"text\"\n", |
342 | | - " },\n", |
343 | | - " \"image_url\": {\n", |
344 | | - " \"type\": \"keyword\"\n", |
345 | | - " },\n", |
346 | | - " \"language\": {\n", |
347 | | - " \"type\": \"keyword\"\n", |
348 | | - " },\n", |
| 333 | + " \"description\": {\"type\": \"text\"},\n", |
| 334 | + " \"en\": {\"type\": \"text\"},\n", |
| 335 | + " \"image_url\": {\"type\": \"keyword\"},\n", |
| 336 | + " \"language\": {\"type\": \"keyword\"},\n", |
349 | 337 | " \"vector_description.predicted_value\": {\n", |
350 | 338 | " \"type\": \"dense_vector\",\n", |
351 | 339 | " \"dims\": 384,\n", |
352 | 340 | " \"index\": True,\n", |
353 | 341 | " \"similarity\": \"cosine\",\n", |
354 | | - " \"index_options\": {\n", |
355 | | - " \"type\": \"bbq_hnsw\"\n", |
356 | | - " }\n", |
357 | | - " }\n", |
| 342 | + " \"index_options\": {\"type\": \"bbq_hnsw\"},\n", |
| 343 | + " },\n", |
358 | 344 | " }\n", |
359 | 345 | " }\n", |
360 | 346 | "}\n", |
|
364 | 350 | " es.indices.create(index=\"coco_multilingual\", body=index_body)\n", |
365 | 351 | " print(\"Index 'coco_multilingual' created successfully.\")\n", |
366 | 352 | "except Exception as e:\n", |
367 | | - " print(f\"Error creating index: {str(e)}\")\n" |
| 353 | + " print(f\"Error creating index: {str(e)}\")" |
368 | 354 | ] |
369 | 355 | }, |
370 | 356 | { |
|
389 | 375 | } |
390 | 376 | ], |
391 | 377 | "source": [ |
392 | | - "\n", |
393 | 378 | "reindex_body = {\n", |
394 | | - " \"source\": {\n", |
395 | | - " \"index\": \"coco\"\n", |
396 | | - " },\n", |
397 | | - " \"dest\": {\n", |
398 | | - " \"index\": \"coco_multilingual\",\n", |
399 | | - " \"pipeline\": \"vectorize_descriptions\"\n", |
400 | | - " }\n", |
| 379 | + " \"source\": {\"index\": \"coco\"},\n", |
| 380 | + " \"dest\": {\"index\": \"coco_multilingual\", \"pipeline\": \"vectorize_descriptions\"},\n", |
401 | 381 | "}\n", |
402 | 382 | "\n", |
403 | 383 | "response = es.reindex(\n", |
404 | 384 | " body=reindex_body,\n", |
405 | 385 | " # Do not wait for completion here because reindexing might take a while\n", |
406 | | - " wait_for_completion=False\n", |
| 386 | + " wait_for_completion=False,\n", |
407 | 387 | ")\n", |
408 | 388 | "\n", |
409 | 389 | "print(\"Reindex task started. Task info:\")\n", |
410 | | - "print(response)\n" |
| 390 | + "print(response)" |
411 | 391 | ] |
412 | 392 | }, |
413 | 393 | { |
|
534 | 514 | "source": [ |
535 | 515 | "query_body = {\n", |
536 | 516 | " \"size\": 10,\n", |
537 | | - " \"_source\": [\n", |
538 | | - " \"description\", \"language\", \"en\"\n", |
539 | | - " ],\n", |
| 517 | + " \"_source\": [\"description\", \"language\", \"en\"],\n", |
540 | 518 | " \"knn\": {\n", |
541 | 519 | " \"field\": \"vector_description.predicted_value\",\n", |
542 | 520 | " \"k\": 10,\n", |
543 | 521 | " \"num_candidates\": 100,\n", |
544 | 522 | " \"query_vector_builder\": {\n", |
545 | | - " \"text_embedding\": {\n", |
546 | | - " \"model_id\": MODEL_ID,\n", |
547 | | - " \"model_text\": \"query: kitty\"\n", |
548 | | - " }\n", |
549 | | - " }\n", |
550 | | - " }\n", |
| 523 | + " \"text_embedding\": {\"model_id\": MODEL_ID, \"model_text\": \"query: kitty\"}\n", |
| 524 | + " },\n", |
| 525 | + " },\n", |
551 | 526 | "}\n", |
552 | 527 | "\n", |
553 | 528 | "response = es.search(index=\"coco_multilingual\", body=query_body)\n", |
554 | 529 | "print(prettify_response(response))" |
555 | 530 | ] |
556 | 531 | }, |
557 | | - { |
558 | | - "cell_type": "markdown", |
559 | | - "metadata": {}, |
560 | | - "source": [] |
561 | | - }, |
562 | 532 | { |
563 | 533 | "cell_type": "code", |
564 | 534 | "execution_count": 35, |
|
1076 | 1046 | "source": [ |
1077 | 1047 | "query_body = {\n", |
1078 | 1048 | " \"size\": 100,\n", |
1079 | | - " \"_source\": [\n", |
1080 | | - " \"description\", \"language\", \"en\"\n", |
1081 | | - " ],\n", |
| 1049 | + " \"_source\": [\"description\", \"language\", \"en\"],\n", |
1082 | 1050 | " \"knn\": {\n", |
1083 | 1051 | " \"field\": \"vector_description.predicted_value\",\n", |
1084 | 1052 | " \"k\": 50,\n", |
1085 | 1053 | " \"num_candidates\": 1000,\n", |
1086 | 1054 | " \"query_vector_builder\": {\n", |
1087 | 1055 | " \"text_embedding\": {\n", |
1088 | 1056 | " \"model_id\": MODEL_ID,\n", |
1089 | | - " \"model_text\": \"query: kitty lying on something\"\n", |
| 1057 | + " \"model_text\": \"query: kitty lying on something\",\n", |
1090 | 1058 | " }\n", |
1091 | | - " }\n", |
1092 | | - " }\n", |
| 1059 | + " },\n", |
| 1060 | + " },\n", |
1093 | 1061 | "}\n", |
1094 | 1062 | "\n", |
1095 | 1063 | "response = es.search(index=\"coco_multilingual\", body=query_body)\n", |
1096 | | - "print(prettify_response(response))\n" |
| 1064 | + "print(prettify_response(response))" |
1097 | 1065 | ] |
1098 | 1066 | }, |
1099 | 1067 | { |
|
1613 | 1581 | "source": [ |
1614 | 1582 | "query_body = {\n", |
1615 | 1583 | " \"size\": 100,\n", |
1616 | | - " \"_source\": [\n", |
1617 | | - " \"description\", \"language\", \"en\"\n", |
1618 | | - " ],\n", |
| 1584 | + " \"_source\": [\"description\", \"language\", \"en\"],\n", |
1619 | 1585 | " \"knn\": {\n", |
1620 | 1586 | " \"field\": \"vector_description.predicted_value\",\n", |
1621 | 1587 | " \"k\": 50,\n", |
1622 | 1588 | " \"num_candidates\": 1000,\n", |
1623 | 1589 | " \"query_vector_builder\": {\n", |
1624 | | - " \"text_embedding\": {\n", |
1625 | | - " \"model_id\": MODEL_ID,\n", |
1626 | | - " \"model_text\": \"query: 고양이\"\n", |
1627 | | - " }\n", |
1628 | | - " }\n", |
1629 | | - " }\n", |
| 1590 | + " \"text_embedding\": {\"model_id\": MODEL_ID, \"model_text\": \"query: 고양이\"}\n", |
| 1591 | + " },\n", |
| 1592 | + " },\n", |
1630 | 1593 | "}\n", |
1631 | 1594 | "\n", |
1632 | 1595 | "response = es.search(index=\"coco_multilingual\", body=query_body)\n", |
1633 | | - "print(prettify_response(response))\n" |
| 1596 | + "print(prettify_response(response))" |
1634 | 1597 | ] |
1635 | 1598 | } |
1636 | 1599 | ], |
|
0 commit comments