From c42cbe74e57e7ead3275ccf5aadebdbef70fc2ab Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Mon, 3 Jun 2024 22:54:45 -0500 Subject: [PATCH 01/17] Fetch Surrounding Chunks commit of Fetch Surrounding Chunks python notebook --- .../fetch-surrounding-chunks.ipynb | 4048 +++++++++++++++++ 1 file changed, 4048 insertions(+) create mode 100644 notebooks/document-chunking/fetch-surrounding-chunks.ipynb diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb new file mode 100644 index 00000000..9bd766c9 --- /dev/null +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -0,0 +1,4048 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "9c160e35cf414c528b5bffe05725a7d9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e87bc6913a7747728aed4b60a645bc2c", + "IPY_MODEL_9fa94c466004402bb293e4aa0bdc82f4", + "IPY_MODEL_95316b2f654a4ddc99c92d7c60c2f417" + ], + "layout": "IPY_MODEL_0fc0b516e82941dc934c26eba22d9e01" + } + }, + "e87bc6913a7747728aed4b60a645bc2c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d93a6e3ddd364921a7c2a24451d27ffc", + "placeholder": "​", + "style": "IPY_MODEL_2155cf3c7b2043d0a41fc011bf4f0e04", + "value": "tokenizer_config.json: 100%" + } + }, + "9fa94c466004402bb293e4aa0bdc82f4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_500a70f25097484bbec10c0ffd402595", + "max": 48, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9d43995246744e26a8053c21e2c5fcfa", + "value": 48 + } + }, + "95316b2f654a4ddc99c92d7c60c2f417": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2ed85f5360ba4feda6469aabd0324e7a", + "placeholder": "​", + "style": "IPY_MODEL_808af1e1f2464a928ee23398c837ff48", + "value": " 48.0/48.0 [00:00<00:00, 1.58kB/s]" + } + }, + "0fc0b516e82941dc934c26eba22d9e01": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d93a6e3ddd364921a7c2a24451d27ffc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": 
null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2155cf3c7b2043d0a41fc011bf4f0e04": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "500a70f25097484bbec10c0ffd402595": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9d43995246744e26a8053c21e2c5fcfa": { + 
"model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2ed85f5360ba4feda6469aabd0324e7a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "808af1e1f2464a928ee23398c837ff48": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "31906527169a4c08801dc6b21936188d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a6570ce51dfc46f383d855e28534bf73", + "IPY_MODEL_41cc49a71a164065bc833d080027e4d2", + "IPY_MODEL_748e7f3c8da243e9b5320654ec8e8146" + ], + "layout": "IPY_MODEL_a88953429ab6436fb4f01b6b1e2cf6ff" + } + }, + "a6570ce51dfc46f383d855e28534bf73": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0a2671c90a048548314c2e3d21e19e7", + "placeholder": "​", + "style": "IPY_MODEL_5f2080a5d12241638447a5851d0c8db3", + "value": "vocab.txt: 100%" + } + }, + "41cc49a71a164065bc833d080027e4d2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ce44d2f323d45838633a750f2386525", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_406f3564a217478d8f60dee5e1fb6dbf", + "value": 231508 + } + }, + "748e7f3c8da243e9b5320654ec8e8146": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_61ae734ac8d441fd9b3ea198aff3f2c7", + "placeholder": "​", + "style": "IPY_MODEL_bc52c57fa6464ab39823cd3ddb9d7d78", + "value": " 232k/232k [00:00<00:00, 2.88MB/s]" + } + }, + "a88953429ab6436fb4f01b6b1e2cf6ff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b0a2671c90a048548314c2e3d21e19e7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5f2080a5d12241638447a5851d0c8db3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7ce44d2f323d45838633a750f2386525": { + "model_module": "@jupyter-widgets/base", + "model_name": 
"LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "406f3564a217478d8f60dee5e1fb6dbf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "61ae734ac8d441fd9b3ea198aff3f2c7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bc52c57fa6464ab39823cd3ddb9d7d78": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "02f735a438bf4058a9cfacf8d2b8660f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_331b178397164de49408dc50ce417a36", + "IPY_MODEL_07ee43d2a1684fb0b1445755802b6ea5", + "IPY_MODEL_c867bce7e34b4800903eb9ec99f34784" + ], + "layout": 
"IPY_MODEL_8169e16a9b0146f5a57a015601c2ebcb" + } + }, + "331b178397164de49408dc50ce417a36": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_35ca86faebfd43faaef0202389d958fd", + "placeholder": "​", + "style": "IPY_MODEL_f04f37ba10e9498ea61acdce637431ee", + "value": "tokenizer.json: 100%" + } + }, + "07ee43d2a1684fb0b1445755802b6ea5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_527bfa6067c84b94a1e70dfadfd4b78e", + "max": 466062, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_312e85864e074b958d86325b6417a0fa", + "value": 466062 + } + }, + "c867bce7e34b4800903eb9ec99f34784": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3b0fc37739334025b037a5270c9515bf", + 
"placeholder": "​", + "style": "IPY_MODEL_0dfb7f264674449b92a390324d17c4cf", + "value": " 466k/466k [00:00<00:00, 6.88MB/s]" + } + }, + "8169e16a9b0146f5a57a015601c2ebcb": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "35ca86faebfd43faaef0202389d958fd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f04f37ba10e9498ea61acdce637431ee": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "527bfa6067c84b94a1e70dfadfd4b78e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "312e85864e074b958d86325b6417a0fa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3b0fc37739334025b037a5270c9515bf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": 
null, + "top": null, + "visibility": null, + "width": null + } + }, + "0dfb7f264674449b92a390324d17c4cf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7e58bf25549d4b428f231d528e8fef54": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_461ca08f677a4cba9ec2a388c2e346f3", + "IPY_MODEL_90d31fb52af949b0a2b41e3613827233", + "IPY_MODEL_1afbe347ab364b28b887f49dad54f5d7" + ], + "layout": "IPY_MODEL_8483a759cc0e4e12834fc7d08dab3b7e" + } + }, + "461ca08f677a4cba9ec2a388c2e346f3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_59d8ffb31bb340eba7e0dcebfbbdd977", + "placeholder": "​", + "style": "IPY_MODEL_2b62b542c091466cbae559e29ec797bd", + "value": "config.json: 100%" + } + }, + "90d31fb52af949b0a2b41e3613827233": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_092a4de220ba4ca2a23a0f273aba601b", + "max": 570, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e0c46565371f437a85a26d44c5b20c5b", + "value": 570 + } + }, + "1afbe347ab364b28b887f49dad54f5d7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4831ce9114e5437ea8a24919557c40e2", + "placeholder": "​", + "style": "IPY_MODEL_53344cac458d4d5ebdc504744c18b7de", + "value": " 570/570 [00:00<00:00, 19.9kB/s]" + } + }, + "8483a759cc0e4e12834fc7d08dab3b7e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "59d8ffb31bb340eba7e0dcebfbbdd977": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2b62b542c091466cbae559e29ec797bd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "092a4de220ba4ca2a23a0f273aba601b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e0c46565371f437a85a26d44c5b20c5b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + 
"4831ce9114e5437ea8a24919557c40e2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "53344cac458d4d5ebdc504744c18b7de": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f34bf7b0bb424a8e8c00ff75309bbe6f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d604bc170c02491fae573c702e790893", + "IPY_MODEL_d8a3bdb8be354365944ab587738280d3", + "IPY_MODEL_89ad2dee66324ae896eec71924aee670" + ], + "layout": "IPY_MODEL_31c0c3d684564d5fb87d2e25e6de96eb" + } + }, + "d604bc170c02491fae573c702e790893": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b7bef190ebed494eb8773ec21d9b7160", + "placeholder": "​", + "style": "IPY_MODEL_6c822f08434c4212931dcf097a80b7d4", + "value": "tokenizer_config.json: 100%" + } + }, + "d8a3bdb8be354365944ab587738280d3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0f8d4b5000174234bded5d4e017aa4e9", + "max": 418, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_243f1e7de5414b82aaa4b50482dd964d", + "value": 418 + } + }, + "89ad2dee66324ae896eec71924aee670": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0961f276155348f98c12d1be4ad78e62", + "placeholder": "​", + "style": "IPY_MODEL_95c6cffafe1b4345a905be485b787728", + "value": " 418/418 [00:00<00:00, 13.9kB/s]" + } + }, + "31c0c3d684564d5fb87d2e25e6de96eb": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b7bef190ebed494eb8773ec21d9b7160": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6c822f08434c4212931dcf097a80b7d4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0f8d4b5000174234bded5d4e017aa4e9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "243f1e7de5414b82aaa4b50482dd964d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0961f276155348f98c12d1be4ad78e62": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": 
null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "95c6cffafe1b4345a905be485b787728": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "43072f923bd24566ae0e20ca9aa3cdc5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5a4f80526c2c4b53a1bce182e9b3e5fa", + "IPY_MODEL_400b0a8ef7c64477bf4f02a16b5508a0", + "IPY_MODEL_b03347c3201849778ea3129314ac340c" + ], + "layout": "IPY_MODEL_3542b02e36ce4e03850b37c28d88da30" + } + }, + "5a4f80526c2c4b53a1bce182e9b3e5fa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7b2abc768054422f8af3d21837400b4a", + "placeholder": "​", + "style": "IPY_MODEL_d8e4ceae237d4381aa5e44b020d7564e", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "400b0a8ef7c64477bf4f02a16b5508a0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e5ec838bb84644b6a27e3eaec9d7ac74", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_30ba8c556cc34fde96b530ed66ac376d", + "value": 5069051 + } + }, + "b03347c3201849778ea3129314ac340c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_711439e7dcab4c10ab4300bdbe6b86aa", + "placeholder": "​", + "style": "IPY_MODEL_15887401b0814d9386fb4d02d6279412", + "value": " 5.07M/5.07M [00:00<00:00, 19.7MB/s]" + } + }, + "3542b02e36ce4e03850b37c28d88da30": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7b2abc768054422f8af3d21837400b4a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d8e4ceae237d4381aa5e44b020d7564e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e5ec838bb84644b6a27e3eaec9d7ac74": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "30ba8c556cc34fde96b530ed66ac376d": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "711439e7dcab4c10ab4300bdbe6b86aa": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "15887401b0814d9386fb4d02d6279412": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bec0cae37feb48a4add318d970d8ef96": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_feb8127671424fa68b9b93a7547e40eb", + "IPY_MODEL_94c89aa435e44c8d9369305c21ca028c", + "IPY_MODEL_7abe3bc7884e4afeb9995f7d7acc8c0f" + ], + "layout": "IPY_MODEL_7edccbff1ca145eabd4af6f9da32442a" + } + }, + "feb8127671424fa68b9b93a7547e40eb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9c11fce2ab1f4811a3d98f9154818825", + "placeholder": "​", + "style": "IPY_MODEL_2cf5d1a84ed947ddb16e5f8e6984b01e", + "value": "tokenizer.json: 100%" + } + }, + "94c89aa435e44c8d9369305c21ca028c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_d5de868032e640d5a07c34c9917190c3", + "max": 17082660, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d6295baf48b24c929c3ab4a317356e2b", + "value": 17082660 + } + }, + "7abe3bc7884e4afeb9995f7d7acc8c0f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_df2ed1f8e3754f3a8f30be35935e82f3", + "placeholder": "​", + "style": "IPY_MODEL_2f125504e41344c088231f0307d7cb92", + "value": " 17.1M/17.1M [00:00<00:00, 76.7MB/s]" + } + }, + "7edccbff1ca145eabd4af6f9da32442a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, 
+ "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9c11fce2ab1f4811a3d98f9154818825": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2cf5d1a84ed947ddb16e5f8e6984b01e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d5de868032e640d5a07c34c9917190c3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d6295baf48b24c929c3ab4a317356e2b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "df2ed1f8e3754f3a8f30be35935e82f3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f125504e41344c088231f0307d7cb92": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6a6665e93675459394536fd9f846fbea": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_aed65947759c47c58abe86f1ee279b86", + "IPY_MODEL_b5d7fb93223c4c458e7c80e59daea4d2", + "IPY_MODEL_540140c9c2f541b3a82f7a59e4f0b867" + ], + "layout": "IPY_MODEL_d5212aa2a4f74de1970c07e282f0e2bc" + } + }, + "aed65947759c47c58abe86f1ee279b86": 
{ + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4a75d002a4ae4fc99625420ec6e580ee", + "placeholder": "​", + "style": "IPY_MODEL_695f77c019db487ea60171277073efe6", + "value": "special_tokens_map.json: 100%" + } + }, + "b5d7fb93223c4c458e7c80e59daea4d2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bc3fad5fe0194399add875a6d78907bd", + "max": 280, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_42c418329198400bb77cdd3a654a96de", + "value": 280 + } + }, + "540140c9c2f541b3a82f7a59e4f0b867": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4ef5fe3e9ea84b8c8cb1a90a8208bdb9", + "placeholder": "​", + "style": "IPY_MODEL_735b9a74223f4941a2837a1108889f63", + "value": " 
280/280 [00:00<00:00, 15.7kB/s]" + } + }, + "d5212aa2a4f74de1970c07e282f0e2bc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a75d002a4ae4fc99625420ec6e580ee": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "695f77c019db487ea60171277073efe6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bc3fad5fe0194399add875a6d78907bd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "42c418329198400bb77cdd3a654a96de": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4ef5fe3e9ea84b8c8cb1a90a8208bdb9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"735b9a74223f4941a2837a1108889f63": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Fetch surronding chucks (N-1, N+1)\n", + "\n", + "This notebook is designed to handle the ingestion of book text (Harry Potter and the Sorcerer's Stone) into an Elasticsearch Cloud instance. It includes partitioning the book text into chapters and chunking the chapter text, which are then ingested into Elasticsearch. The setup utilizes a nested structure, and for each chunk, it stores dense and sparse (ELSER) vector representations along with the text representation.\n", + "\n", + "Searches are performed using dense vector comparisons, sparse vector comparisons, and text search in parallel to demonstrate the power of hybrid search strategies. 
Additionally, the notebook is configured to retrieve adjacent chunks (n-1 and n+1), allowing for a more contextual understanding of the search results.\n", + "\n" + ], + "metadata": { + "id": "aAUkwshINwV7" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Install required python libraries\n" + ], + "metadata": { + "id": "MUEpppV7SeLu" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nXuL8wsQNq8G", + "outputId": "80261fea-a44b-429b-e55d-5947e7ac8b6c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.10/dist-packages (8.13.2)\n", + "Requirement already satisfied: elastic-transport<9,>=8.13 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.13.1)\n", + "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch) (2.0.7)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch) (2024.2.2)\n" + ] + } + ], + "source": [ + "!pip install elasticsearch\n", + "import json\n", + "import time\n", + "import urllib.request\n", + "import re\n", + "import pandas as pd\n", + "from transformers import AutoTokenizer, BertTokenizer\n", + "from elasticsearch import Elasticsearch, helpers\n", + "from google.colab import userdata\n", + "import textwrap" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Elasticsearch and Tokenizer Configuration\n" + ], + "metadata": { + "id": "_d4RWjNAN6Q9" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Elasticsearch and Tokenizer Configuration\n", + "\n", + "This section sets up the necessary configurations for connecting to Elasticsearch and initializing the tokenizers used for text processing.\n", + "\n", + "### Configuration Details:\n", + "1. 
**Elasticsearch Credentials**:\n", + " - `es_username`: The username for Elasticsearch authentication.\n", + " - `es_password`: The password for Elasticsearch authentication, securely fetched using Google Colab's `userdata` module.\n", + " - `es_cloudid`: The Cloud ID for the Elasticsearch cluster.\n", + "\n", + "2. **Index Settings**:\n", + " - `raw_source_index`: The name of the index for the raw dataset (`harry_potter_dataset-raw`).\n", + " - `index_name`: The name of the enriched dataset index (`harry_potter_dataset_enriched`).\n", + " - `delete_raw_source_index`: A boolean flag indicating whether the raw data index should be deleted before ingesting new data.\n", + "\n", + "3. **Embedding Model**:\n", + " - `dense_embedding_model`: Specifies the Elasticsearch model ID used for generating dense embeddings (`sentence-transformers__all-minilm-l6-v2`).\n", + "\n", + "4. **Tokenizer Initialization**:\n", + " - `bert_tokenizer`: Initializes the BERT tokenizer (`bert-base-uncased`) for English text processing.\n", + " - `e5_tokenizer`: Initializes the Multilingual E5 tokenizer (`intfloat/multilingual-e5-base`) for handling diverse datasets.\n", + "\n", + "5. 
**Chunking Parameters**:\n", + " - `SEMANTIC_SEARCH_TOKEN_LIMIT`: Sets the token limit for each chunk (500 tokens per chunk, considering space for special tokens).\n", + " - `ELSER_TOKEN_OVERLAP`: Defines the overlap ratio between chunks (default is 0%, customizable for context continuity).\n", + "\n", + "These configurations ensure that the necessary components are properly set up for efficient text processing, indexing, and search operations in Elasticsearch.\n" + ], + "metadata": { + "id": "2w7uTCYdQ0m6" + } + }, + { + "cell_type": "code", + "source": [ + "# Elasticsearch and Tokenizer Configuration\n", + "es_username = \"elastic\"\n", + "es_password = userdata.get(\"es_password\")\n", + "es_cloudid = userdata.get(\"es_cloudid\")\n", + "\n", + "raw_source_index = \"harry_potter_dataset-raw\"\n", + "index_name = \"harry_potter_dataset_enriched\"\n", + "\n", + "dense_embedding_model = \"sentence-transformers__all-minilm-l6-v2\"\n", + "elser_model_id = \".elser_model_2_linux-x86_64\"\n", + "\n", + "bert_tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", + "e5_tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-base\")\n", + "\n", + "\n", + "SEMANTIC_SEARCH_TOKEN_LIMIT = 500\n", + "ELSER_TOKEN_OVERLAP = 0.0" + ], + "metadata": { + "id": "LQzCw0pgN4ll", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 328, + "referenced_widgets": [ + "9c160e35cf414c528b5bffe05725a7d9", + "e87bc6913a7747728aed4b60a645bc2c", + "9fa94c466004402bb293e4aa0bdc82f4", + "95316b2f654a4ddc99c92d7c60c2f417", + "0fc0b516e82941dc934c26eba22d9e01", + "d93a6e3ddd364921a7c2a24451d27ffc", + "2155cf3c7b2043d0a41fc011bf4f0e04", + "500a70f25097484bbec10c0ffd402595", + "9d43995246744e26a8053c21e2c5fcfa", + "2ed85f5360ba4feda6469aabd0324e7a", + "808af1e1f2464a928ee23398c837ff48", + "31906527169a4c08801dc6b21936188d", + "a6570ce51dfc46f383d855e28534bf73", + "41cc49a71a164065bc833d080027e4d2", + "748e7f3c8da243e9b5320654ec8e8146", + 
"a88953429ab6436fb4f01b6b1e2cf6ff", + "b0a2671c90a048548314c2e3d21e19e7", + "5f2080a5d12241638447a5851d0c8db3", + "7ce44d2f323d45838633a750f2386525", + "406f3564a217478d8f60dee5e1fb6dbf", + "61ae734ac8d441fd9b3ea198aff3f2c7", + "bc52c57fa6464ab39823cd3ddb9d7d78", + "02f735a438bf4058a9cfacf8d2b8660f", + "331b178397164de49408dc50ce417a36", + "07ee43d2a1684fb0b1445755802b6ea5", + "c867bce7e34b4800903eb9ec99f34784", + "8169e16a9b0146f5a57a015601c2ebcb", + "35ca86faebfd43faaef0202389d958fd", + "f04f37ba10e9498ea61acdce637431ee", + "527bfa6067c84b94a1e70dfadfd4b78e", + "312e85864e074b958d86325b6417a0fa", + "3b0fc37739334025b037a5270c9515bf", + "0dfb7f264674449b92a390324d17c4cf", + "7e58bf25549d4b428f231d528e8fef54", + "461ca08f677a4cba9ec2a388c2e346f3", + "90d31fb52af949b0a2b41e3613827233", + "1afbe347ab364b28b887f49dad54f5d7", + "8483a759cc0e4e12834fc7d08dab3b7e", + "59d8ffb31bb340eba7e0dcebfbbdd977", + "2b62b542c091466cbae559e29ec797bd", + "092a4de220ba4ca2a23a0f273aba601b", + "e0c46565371f437a85a26d44c5b20c5b", + "4831ce9114e5437ea8a24919557c40e2", + "53344cac458d4d5ebdc504744c18b7de", + "f34bf7b0bb424a8e8c00ff75309bbe6f", + "d604bc170c02491fae573c702e790893", + "d8a3bdb8be354365944ab587738280d3", + "89ad2dee66324ae896eec71924aee670", + "31c0c3d684564d5fb87d2e25e6de96eb", + "b7bef190ebed494eb8773ec21d9b7160", + "6c822f08434c4212931dcf097a80b7d4", + "0f8d4b5000174234bded5d4e017aa4e9", + "243f1e7de5414b82aaa4b50482dd964d", + "0961f276155348f98c12d1be4ad78e62", + "95c6cffafe1b4345a905be485b787728", + "43072f923bd24566ae0e20ca9aa3cdc5", + "5a4f80526c2c4b53a1bce182e9b3e5fa", + "400b0a8ef7c64477bf4f02a16b5508a0", + "b03347c3201849778ea3129314ac340c", + "3542b02e36ce4e03850b37c28d88da30", + "7b2abc768054422f8af3d21837400b4a", + "d8e4ceae237d4381aa5e44b020d7564e", + "e5ec838bb84644b6a27e3eaec9d7ac74", + "30ba8c556cc34fde96b530ed66ac376d", + "711439e7dcab4c10ab4300bdbe6b86aa", + "15887401b0814d9386fb4d02d6279412", + "bec0cae37feb48a4add318d970d8ef96", + 
"feb8127671424fa68b9b93a7547e40eb", + "94c89aa435e44c8d9369305c21ca028c", + "7abe3bc7884e4afeb9995f7d7acc8c0f", + "7edccbff1ca145eabd4af6f9da32442a", + "9c11fce2ab1f4811a3d98f9154818825", + "2cf5d1a84ed947ddb16e5f8e6984b01e", + "d5de868032e640d5a07c34c9917190c3", + "d6295baf48b24c929c3ab4a317356e2b", + "df2ed1f8e3754f3a8f30be35935e82f3", + "2f125504e41344c088231f0307d7cb92", + "6a6665e93675459394536fd9f846fbea", + "aed65947759c47c58abe86f1ee279b86", + "b5d7fb93223c4c458e7c80e59daea4d2", + "540140c9c2f541b3a82f7a59e4f0b867", + "d5212aa2a4f74de1970c07e282f0e2bc", + "4a75d002a4ae4fc99625420ec6e580ee", + "695f77c019db487ea60171277073efe6", + "bc3fad5fe0194399add875a6d78907bd", + "42c418329198400bb77cdd3a654a96de", + "4ef5fe3e9ea84b8c8cb1a90a8208bdb9", + "735b9a74223f4941a2837a1108889f63" + ] + }, + "outputId": "7ed04793-8bb9-49c6-b090-82111d9835f6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/48.0 [00:00= len(tokens):\n", + " break\n", + " return result\n", + "\n", + "\n", + "def check_task_status(es, task_id):\n", + " while True:\n", + " task_response = es.tasks.get(task_id=task_id)\n", + " if task_response[\"completed\"]:\n", + " print(\"Reindexing complete.\")\n", + " break\n", + " else:\n", + " print(\"Indexing...\")\n", + " time.sleep(10)" + ], + "metadata": { + "id": "xB2a9-qtONbQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "##Ingest Pipelines" + ], + "metadata": { + "id": "izMU8HqqP7ld" + } + }, + { + "cell_type": "code", + "source": [ + "# Define the ingest pipeline configuration\n", + "pipeline_body = {\n", + " \"description\": \"Pipeline for processing book passages\",\n", + " \"processors\": [\n", + " {\n", + " \"foreach\": {\n", + " \"field\": \"passages\",\n", + " \"processor\": {\n", + " \"inference\": {\n", + " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", + " \"model_id\": 
\"sentence-transformers__all-minilm-l6-v2\",\n", + " \"target_field\": \"_ingest._value.vector\",\n", + " \"on_failure\": [\n", + " {\n", + " \"append\": {\n", + " \"field\": \"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + " {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", + " }\n", + " ],\n", + " }\n", + " }\n", + " ],\n", + " }\n", + " },\n", + " }\n", + " },\n", + " {\n", + " \"foreach\": {\n", + " \"field\": \"passages\",\n", + " \"processor\": {\n", + " \"inference\": {\n", + " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", + " \"model_id\": elser_model_id,\n", + " \"target_field\": \"_ingest._value.content_embedding\",\n", + " \"on_failure\": [\n", + " {\n", + " \"append\": {\n", + " \"field\": \"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + " {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", + " }\n", + " ],\n", + " }\n", + " }\n", + " ],\n", + " }\n", + " },\n", + " }\n", + " },\n", + " ],\n", + "}\n", + "\n", + "# Create or update the pipeline\n", + "pipeline_id = \"books_dataset_chunker\"\n", + "es = create_es_client()\n", + "es.ingest.put_pipeline(id=pipeline_id, body=pipeline_body)\n", + "print(f\"Ingest pipeline '{pipeline_id}' created/updated successfully.\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iUOFJK48OamP", + "outputId": "5dc25103-a2ee-4a19-e184-92ec65c29187" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Ingest pipeline 'books_dataset_chunker' created/updated successfully.\n" + 
] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Index Settings" + ], + "metadata": { + "id": "6ZkRwEGdQBRP" + } + }, + { + "cell_type": "code", + "source": [ + "index_settings = {\n", + " \"settings\": {\n", + " \"number_of_shards\": 2,\n", + " \"number_of_replicas\": 0,\n", + " \"default_pipeline\": \"books_dataset_chunker\",\n", + " },\n", + " \"mappings\": {\n", + " \"dynamic\": \"false\",\n", + " \"properties\": {\n", + " \"book_title\": {\"type\": \"keyword\"},\n", + " \"chapter\": {\"type\": \"keyword\"},\n", + " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", + " \"passages\": {\n", + " \"type\": \"nested\",\n", + " \"properties\": {\n", + " \"content_embedding\": {\n", + " \"properties\": {\n", + " \"is_truncated\": {\"type\": \"boolean\"},\n", + " \"model_id\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\n", + " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", + " },\n", + " },\n", + " \"predicted_value\": {\"type\": \"sparse_vector\"},\n", + " }\n", + " },\n", + " \"text\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"vector\": {\n", + " \"properties\": {\n", + " \"is_truncated\": {\"type\": \"boolean\"},\n", + " \"model_id\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\n", + " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", + " },\n", + " },\n", + " \"predicted_value\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 384,\n", + " \"index\": True,\n", + " \"similarity\": \"dot_product\",\n", + " },\n", + " }\n", + " },\n", + " \"chunk_number\": {\"type\": \"integer\"},\n", + " },\n", + " },\n", + " },\n", + " },\n", + "}\n", + "\n", + "raw_source_index_settings = {\n", + " \"settings\": {\"number_of_shards\": 2, \"number_of_replicas\": 0},\n", + " \"mappings\": {\n", + " \"dynamic\": \"false\",\n", + " \"properties\": {\n", + " \"book_title\": {\"type\": \"keyword\"},\n", + " 
\"chapter\": {\"type\": \"keyword\"},\n", + " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", + " \"passages\": {\n", + " \"type\": \"nested\",\n", + " \"properties\": {\n", + " \"text\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"chunk_number\": {\"type\": \"integer\"},\n", + " },\n", + " },\n", + " },\n", + " },\n", + "}\n", + "\n", + "# Manage indices\n", + "manage_index(\n", + " es,\n", + " index_name,\n", + " index_settings[\"settings\"],\n", + " index_settings[\"mappings\"],\n", + " delete_index=True,\n", + ")\n", + "manage_index(\n", + " es,\n", + " raw_source_index,\n", + " raw_source_index_settings[\"settings\"],\n", + " raw_source_index_settings[\"mappings\"],\n", + " delete_index=True,\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vZ3Z5gZbOgjF", + "outputId": "996f6ca5-d27d-4ea0-ed4d-07570b9942ad" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Index harry_potter_dataset_enriched exists. Deleting it...\n", + "Index harry_potter_dataset_enriched deleted!\n", + "Index harry_potter_dataset_enriched created successfully!\n", + "Index harry_potter_dataset-raw exists. Deleting it...\n", + "Index harry_potter_dataset-raw deleted!\n", + "Index harry_potter_dataset-raw created successfully!\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Fetch and Process the Book Text\n", + "\n", + "This section downloads the full text of \"Harry Potter and the Sorcerer's Stone\" from a specified URL and processes it to extract chapters and their titles. The text is then structured into a pandas DataFrame for further analysis and indexing.\n", + "\n", + "### Key Steps:\n", + "1. **Download Text**: The book is fetched using `urllib.request` from the provided URL.\n", + "2. 
**Extract Chapters**: The text is split into chapters based on predefined patterns, omitting the text before the first chapter.\n", + "3. **Capture Chapter Titles**: Chapter titles are extracted and paired with their respective texts.\n", + "4. **Data Structuring**:\n", + " - Convert the list of chapter titles and texts into a DataFrame.\n", + " - Assign sequential numbers to chapters.\n", + " - Add the book title as metadata.\n", + " - Apply a text chunking function to split each chapter into manageable passages.\n", + "\n", + "This prepares the text data for efficient indexing and advanced search operations in Elasticsearch.\n" + ], + "metadata": { + "id": "NPtbLhVOQUF3" + } + }, + { + "cell_type": "code", + "source": [ + "# Fetch and process the book text\n", + "potter_book_url = \"https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt\"\n", + "response = urllib.request.urlopen(potter_book_url)\n", + "harry_potter_book_text = response.read().decode(\"utf-8\")\n", + "chapter_pattern = re.compile(r\"CHAPTER [A-Z]+\", re.IGNORECASE)\n", + "chapters = chapter_pattern.split(harry_potter_book_text)[1:]\n", + "chapter_titles = re.findall(chapter_pattern, harry_potter_book_text)\n", + "chapters_with_titles = list(zip(chapter_titles, chapters))\n", + "\n", + "print(\"Total chapters found:\", len(chapters))\n", + "if chapters_with_titles:\n", + " print(\"First chapter title:\", chapters_with_titles[0][0])\n", + " print(\"Text sample from first chapter:\", chapters_with_titles[0][1][:500])\n", + "\n", + "\n", + "# Structuring chapters into a DataFrame\n", + "df = pd.DataFrame(chapters_with_titles, columns=[\"chapter_title\", \"chapter_full_text\"])\n", + "df[\"chapter\"] = df.index + 1\n", + "df[\"book_title\"] = \"Harry Potter and the Sorcerer’s Stone\"\n", + "df[\"passages\"] = df[\"chapter_full_text\"].apply(lambda text: chunk(text))" + ], + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "0L4YI96xOuKn", + "outputId": "7f9c63c7-82d8-4490-aabb-c3629872d80d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Total chapters found: 17\n", + "First chapter title: CHAPTER ONE\n", + "Text sample from first chapter: \n", + "\n", + "THE BOY WHO LIVED\n", + "\n", + "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\n", + "that they were perfectly normal, thank you very much. They were the last\n", + "people you'd expect to be involved in anything strange or mysterious,\n", + "because they just didn't hold with such nonsense.\n", + "\n", + "Mr. Dursley was the director of a firm called Grunnings, which made\n", + "drills. He was a big, beefy man with hardly any neck, although he did\n", + "have a very large mustache. Mrs. Dursley was thin and blonde and had\n", + "nearly t\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (6535 > 512). Running this sequence through the model will result in indexing errors\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Indexing DataFrame into Elasticsearch\n", + "\n", + "This section uploads the structured data from a pandas DataFrame into a specified Elasticsearch index. The DataFrame contains chapter information from \"Harry Potter and the Sorcerer's Stone\", including chapter titles, full texts, and additional metadata.\n", + "\n", + "### Key Operation:\n", + "- **Index Data**: The `index_dataframe` function is called with the Elasticsearch client, the raw source index name, and the DataFrame as arguments. 
This operation effectively uploads the data into Elasticsearch, making it searchable and ready for further processing.\n" + ], + "metadata": { + "id": "DKK4574EQaTl" + } + }, + { + "cell_type": "code", + "source": [ + "index_dataframe(es, raw_source_index, df)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7ReLAtz1O1HF", + "outputId": "3bf70ccc-804d-4718-e2a7-13dc0008e073" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Indexing documents to harry_potter_dataset-raw...\n", + "Successfully indexed 17 documents.\n", + "Failed to index 0 documents.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Asynchronous Reindexing in Elasticsearch\n", + "\n", + "This section initiates an asynchronous reindex operation to transfer data from the raw source index to the enriched index in Elasticsearch. This process runs in the background, allowing other operations to continue without waiting for completion.\n", + "\n", + "### Key Steps:\n", + "1. **Start Reindex**: The reindex operation is triggered from the `raw_source_index` to the `index_name`, with `wait_for_completion` set to `False` to allow asynchronous execution.\n", + "2. **Retrieve Task ID**: The task ID of the reindex operation is captured and printed for monitoring purposes.\n", + "3. 
**Monitor Progress**: The `check_task_status` function continuously checks the status of the reindex task, providing updates every 10 seconds until the operation is complete.\n" + ], + "metadata": { + "id": "pA5QroYdQgcM" + } + }, + { + "cell_type": "code", + "source": [ + "# Start the reindex operation asynchronously\n", + "response = es.reindex(\n", + " body={\"source\": {\"index\": raw_source_index}, \"dest\": {\"index\": index_name}},\n", + " wait_for_completion=False,\n", + ")\n", + "task_id = response[\"task\"]\n", + "print(\"Task ID:\", task_id)\n", + "check_task_status(es, task_id)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HOCX_lbmO3zl", + "outputId": "014309de-8ec6-4cf8-b647-6bf0e6f512d8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Task ID: _ov-FtHBSkqocXXBG6nu4A:68576798\n", + "Indexing...\n", + "Reindexing complete.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Custom Search Query Construction and Execution\n", + "\n", + "This section constructs and executes a custom search query in Elasticsearch, utilizing a hybrid approach combining vector and text-based search methods to enhance search accuracy and relevance. The specific example used is a user query about the \"Nimbus 2000\".\n", + "\n", + "### Key Steps:\n", + "1. **Define User Query**: The user query is specified as \"what is a nimbus 2000\".\n", + "2. **Set Boost Factors**:\n", + " - `knn_boost_factor`: A value to amplify the importance of the vector-based search component.\n", + " - `text_expansion_boost`: A value to modify the weight of the text-based search component.\n", + "3. **Build Query**: The `build_custom_query` function constructs the search query, incorporating both dense vector and text expansion components.\n", + "4. **Execute Search**: The query is executed against the specified Elasticsearch index.\n", + "5. 
**Identify Relevant Passages**:\n", + " - The search results are analyzed to find the passage with the highest relevance score.\n", + " - The ID and chunk number of the best matching passage are captured and printed.\n", + "6. **Fetch Surrounding Chunks**: Constructs and executes a query to retrieve chunks adjacent to the identified passage for broader context. If the matched chunk is the first chunk, fetches n, n+1, and n+2. If the chunk is the last chunk in the chapter, fetches n, n-1, and n-2. For other chunks, fetches n-1, n, and n+1.\n", + "7. **Display Results**: Outputs text from the relevant and adjacent passages." + ], + "metadata": { + "id": "xJBDwRmDQq4n" + } + }, + { + "cell_type": "code", + "source": [ + "# Custom Search Query Construction\n", + "user_query = \"what is a nimbus 2000\"\n", + "\n", + "\n", + "knn_boost_factor = 20\n", + "text_expansion_boost = 1\n", + "query = build_custom_query(\n", + " build_vector(user_query),\n", + " user_query,\n", + " knn_boost_factor,\n", + " text_expansion_boost,\n", + " debug=False,\n", + ")\n", + "\n", + "# Searching and identifying relevant passages\n", + "results = es.search(index=index_name, body=query, _source=False)\n", + "\n", + "hit_id = None\n", + "chunk_number = None\n", + "\n", + "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", + " highest_score = -1\n", + " best_hit = None\n", + " hit_id = results[\"hits\"][\"hits\"][0][\"_id\"]\n", + " chapter_number = results[\"hits\"][\"hits\"][0][\"fields\"][\"chapter\"][0]\n", + " if \"inner_hits\" in results[\"hits\"][\"hits\"][0]:\n", + " for hit_type in [\"text_hits\", \"dense_hit\", \"sparse_hits\"]:\n", + " if hit_type in results[\"hits\"][\"hits\"][0][\"inner_hits\"]:\n", + " inner_hit = results[\"hits\"][\"hits\"][0][\"inner_hits\"][hit_type][\"hits\"]\n", + " if inner_hit[\"hits\"]:\n", + " max_score = inner_hit[\"max_score\"]\n", + " if max_score and max_score > highest_score:\n", + " highest_score = max_score\n", + " 
best_hit = inner_hit[\"hits\"][0]\n", + "\n", + " if best_hit:\n", + " first_passage_text = best_hit[\"_source\"][\"text\"]\n", + " chunk_number = best_hit[\"_source\"][\"chunk_number\"]\n", + " # print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", + " print(\n", + " f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\"\n", + " )\n", + " print(f\"\\n\")\n", + " else:\n", + " print(f\"ID: {hit_id}, No relevant passages found.\")\n", + "else:\n", + " print(\"No results found.\")\n", + "\n", + "print(f\"Fetch Surrounding Chunks\")\n", + "print(f\"------------------------\")\n", + "\n", + "max_chapter_chunk_result = es.search(\n", + " index=index_name,\n", + " body=get_max_chunk_number_query(chapter_number, debug=False),\n", + " _source=False,\n", + ")\n", + "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", + " \"max_chunk\"\n", + "][\"value\"]\n", + "\n", + "adjacent_chunks_query = get_adjacent_chunks_query(\n", + " hit_id, chunk_number, max_chunk_number, debug=False\n", + ")\n", + "results = es.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", + "print_text_from_results(results)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u7NFZBRJO3t7", + "outputId": "6f9ec0d9-bb1d-4235-da45-1c8040ac7036" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Matched Chunk ID: wz8m148BbBK3er50L0-W, Chunk Number: 3, Text:\n", + "t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. 
it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. 
there are seven players on each side.\n", + "\n", + "\n", + "Fetch Surrounding Chunks\n", + "------------------------\n", + "\n", + "\n", + "Text from Chunk 2: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. 
\" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Text from Chunk 3: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. 
the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Text from Chunk 4: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. 
they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n" + ] + } + ] + } + ] +} \ No newline at end of file From 9c6a3528d5432cfb5dd197476cc3694a192de833 Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Tue, 4 Jun 2024 15:33:59 -0500 Subject: [PATCH 02/17] added pip install pandas added !pip install pandas --- .../fetch-surrounding-chunks.ipynb | 7677 +++++++++-------- 1 file changed, 3839 insertions(+), 3838 deletions(-) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index 9bd766c9..733cd64f 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -1,3115 +1,367 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "aAUkwshINwV7" + }, + "source": [ + "# Fetch surrounding chunks (N-1, N+1)\n", + "\n", +
"This notebook is designed to handle the ingestion of book text (Harry Potter and the Sorcerer's Stone) into an Elasticsearch Cloud instance. It includes partitioning the book text into chapters and chunking the chapter text, which are then ingested into Elasticsearch. The setup utilizes a nested structure, and for each chunk, it stores dense and sparse (ELSER) vector representations along with the text representation.\n", + "\n", + "Searches are performed using dense vector comparisons, sparse vector comparisons, and text search in parallel to demonstrate the power of hybrid search strategies. Additionally, the notebook is configured to retrieve adjacent chunks (n-1 and n+1), allowing for a more contextual understanding of the search results.\n", + "\n" + ] }, - "language_info": { - "name": "python" + { + "cell_type": "markdown", + "metadata": { + "id": "MUEpppV7SeLu" + }, + "source": [ + "## Install required python libraries\n" + ] }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "9c160e35cf414c528b5bffe05725a7d9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e87bc6913a7747728aed4b60a645bc2c", - "IPY_MODEL_9fa94c466004402bb293e4aa0bdc82f4", - "IPY_MODEL_95316b2f654a4ddc99c92d7c60c2f417" - ], - "layout": "IPY_MODEL_0fc0b516e82941dc934c26eba22d9e01" - } + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "e87bc6913a7747728aed4b60a645bc2c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": 
[], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d93a6e3ddd364921a7c2a24451d27ffc", - "placeholder": "​", - "style": "IPY_MODEL_2155cf3c7b2043d0a41fc011bf4f0e04", - "value": "tokenizer_config.json: 100%" - } + "id": "nXuL8wsQNq8G", + "outputId": "80261fea-a44b-429b-e55d-5947e7ac8b6c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.10/dist-packages (8.13.2)\n", + "Requirement already satisfied: elastic-transport<9,>=8.13 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.13.1)\n", + "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch) (2.0.7)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch) (2024.2.2)\n" + ] + } + ], + "source": [ + "!pip install elasticsearch\n", + "!pip install pandas\n", + "import json\n", + "import time\n", + "import urllib.request\n", + "import re\n", + "import pandas as pd\n", + "from transformers import AutoTokenizer, BertTokenizer\n", + "from elasticsearch import Elasticsearch, helpers\n", + "from google.colab import userdata\n", + "import textwrap" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_d4RWjNAN6Q9" + }, + "source": [ + "# Elasticsearch and Tokenizer Configuration\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2w7uTCYdQ0m6" + }, + "source": [ + "## Elasticsearch and Tokenizer Configuration\n", + "\n", + "This section sets up the necessary configurations for connecting to Elasticsearch and initializing the tokenizers used 
for text processing.\n", + "\n", + "### Configuration Details:\n", + "1. **Elasticsearch Credentials**:\n", + " - `es_username`: The username for Elasticsearch authentication.\n", + " - `es_password`: The password for Elasticsearch authentication, securely fetched using Google Colab's `userdata` module.\n", + " - `es_cloudid`: The Cloud ID for the Elasticsearch cluster.\n", + "\n", + "2. **Index Settings**:\n", + " - `raw_source_index`: The name of the index for the raw dataset (`harry_potter_dataset-raw`).\n", + " - `index_name`: The name of the enriched dataset index (`harry_potter_dataset_enriched`).\n", + " - `delete_raw_source_index`: A boolean flag indicating whether the raw data index should be deleted before ingesting new data.\n", + "\n", + "3. **Embedding Model**:\n", + " - `dense_embedding_model`: Specifies the model used for generating dense embeddings (`sentence-transformers/all-minilm-l6-v2`).\n", + "\n", + "4. **Tokenizer Initialization**:\n", + " - `bert_tokenizer`: Initializes the BERT tokenizer (`bert-base-uncased`) for English text processing.\n", + " - `e5_tokenizer`: Initializes the Multilingual E5 tokenizer (`intfloat/multilingual-e5-base`) for handling diverse datasets.\n", + "\n", + "5. 
**Chunking Parameters**:\n", + " - `SEMANTIC_SEARCH_TOKEN_LIMIT`: Sets the token limit for each chunk (500 tokens per chunk, considering space for special tokens).\n", + " - `ELSER_TOKEN_OVERLAP`: Defines the overlap ratio between chunks (default is 0%, customizable for context continuity).\n", + "\n", + "These configurations ensure that the necessary components are properly set up for efficient text processing, indexing, and search operations in Elasticsearch.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 328, + "referenced_widgets": [ + "9c160e35cf414c528b5bffe05725a7d9", + "e87bc6913a7747728aed4b60a645bc2c", + "9fa94c466004402bb293e4aa0bdc82f4", + "95316b2f654a4ddc99c92d7c60c2f417", + "0fc0b516e82941dc934c26eba22d9e01", + "d93a6e3ddd364921a7c2a24451d27ffc", + "2155cf3c7b2043d0a41fc011bf4f0e04", + "500a70f25097484bbec10c0ffd402595", + "9d43995246744e26a8053c21e2c5fcfa", + "2ed85f5360ba4feda6469aabd0324e7a", + "808af1e1f2464a928ee23398c837ff48", + "31906527169a4c08801dc6b21936188d", + "a6570ce51dfc46f383d855e28534bf73", + "41cc49a71a164065bc833d080027e4d2", + "748e7f3c8da243e9b5320654ec8e8146", + "a88953429ab6436fb4f01b6b1e2cf6ff", + "b0a2671c90a048548314c2e3d21e19e7", + "5f2080a5d12241638447a5851d0c8db3", + "7ce44d2f323d45838633a750f2386525", + "406f3564a217478d8f60dee5e1fb6dbf", + "61ae734ac8d441fd9b3ea198aff3f2c7", + "bc52c57fa6464ab39823cd3ddb9d7d78", + "02f735a438bf4058a9cfacf8d2b8660f", + "331b178397164de49408dc50ce417a36", + "07ee43d2a1684fb0b1445755802b6ea5", + "c867bce7e34b4800903eb9ec99f34784", + "8169e16a9b0146f5a57a015601c2ebcb", + "35ca86faebfd43faaef0202389d958fd", + "f04f37ba10e9498ea61acdce637431ee", + "527bfa6067c84b94a1e70dfadfd4b78e", + "312e85864e074b958d86325b6417a0fa", + "3b0fc37739334025b037a5270c9515bf", + "0dfb7f264674449b92a390324d17c4cf", + "7e58bf25549d4b428f231d528e8fef54", + "461ca08f677a4cba9ec2a388c2e346f3", + 
"90d31fb52af949b0a2b41e3613827233", + "1afbe347ab364b28b887f49dad54f5d7", + "8483a759cc0e4e12834fc7d08dab3b7e", + "59d8ffb31bb340eba7e0dcebfbbdd977", + "2b62b542c091466cbae559e29ec797bd", + "092a4de220ba4ca2a23a0f273aba601b", + "e0c46565371f437a85a26d44c5b20c5b", + "4831ce9114e5437ea8a24919557c40e2", + "53344cac458d4d5ebdc504744c18b7de", + "f34bf7b0bb424a8e8c00ff75309bbe6f", + "d604bc170c02491fae573c702e790893", + "d8a3bdb8be354365944ab587738280d3", + "89ad2dee66324ae896eec71924aee670", + "31c0c3d684564d5fb87d2e25e6de96eb", + "b7bef190ebed494eb8773ec21d9b7160", + "6c822f08434c4212931dcf097a80b7d4", + "0f8d4b5000174234bded5d4e017aa4e9", + "243f1e7de5414b82aaa4b50482dd964d", + "0961f276155348f98c12d1be4ad78e62", + "95c6cffafe1b4345a905be485b787728", + "43072f923bd24566ae0e20ca9aa3cdc5", + "5a4f80526c2c4b53a1bce182e9b3e5fa", + "400b0a8ef7c64477bf4f02a16b5508a0", + "b03347c3201849778ea3129314ac340c", + "3542b02e36ce4e03850b37c28d88da30", + "7b2abc768054422f8af3d21837400b4a", + "d8e4ceae237d4381aa5e44b020d7564e", + "e5ec838bb84644b6a27e3eaec9d7ac74", + "30ba8c556cc34fde96b530ed66ac376d", + "711439e7dcab4c10ab4300bdbe6b86aa", + "15887401b0814d9386fb4d02d6279412", + "bec0cae37feb48a4add318d970d8ef96", + "feb8127671424fa68b9b93a7547e40eb", + "94c89aa435e44c8d9369305c21ca028c", + "7abe3bc7884e4afeb9995f7d7acc8c0f", + "7edccbff1ca145eabd4af6f9da32442a", + "9c11fce2ab1f4811a3d98f9154818825", + "2cf5d1a84ed947ddb16e5f8e6984b01e", + "d5de868032e640d5a07c34c9917190c3", + "d6295baf48b24c929c3ab4a317356e2b", + "df2ed1f8e3754f3a8f30be35935e82f3", + "2f125504e41344c088231f0307d7cb92", + "6a6665e93675459394536fd9f846fbea", + "aed65947759c47c58abe86f1ee279b86", + "b5d7fb93223c4c458e7c80e59daea4d2", + "540140c9c2f541b3a82f7a59e4f0b867", + "d5212aa2a4f74de1970c07e282f0e2bc", + "4a75d002a4ae4fc99625420ec6e580ee", + "695f77c019db487ea60171277073efe6", + "bc3fad5fe0194399add875a6d78907bd", + "42c418329198400bb77cdd3a654a96de", + "4ef5fe3e9ea84b8c8cb1a90a8208bdb9", + 
"735b9a74223f4941a2837a1108889f63" + ] }, - "9fa94c466004402bb293e4aa0bdc82f4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_500a70f25097484bbec10c0ffd402595", - "max": 48, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_9d43995246744e26a8053c21e2c5fcfa", - "value": 48 - } + "id": "LQzCw0pgN4ll", + "outputId": "7ed04793-8bb9-49c6-b090-82111d9835f6" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9c160e35cf414c528b5bffe05725a7d9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/48.0 [00:00=8.13 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.13.1)\n", - "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch) (2.0.7)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch) (2024.2.2)\n" - ] - } - ], - "source": [ - "!pip install elasticsearch\n", - "import json\n", - "import time\n", - "import urllib.request\n", - "import re\n", - "import pandas as pd\n", - "from transformers import AutoTokenizer, BertTokenizer\n", - "from elasticsearch import Elasticsearch, helpers\n", - "from google.colab import userdata\n", - "import textwrap" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Elasticsearch and Tokenizer Configuration\n" - ], - "metadata": { - "id": "_d4RWjNAN6Q9" - } - }, - { - 
"cell_type": "markdown", - "source": [ - "## Elasticsearch and Tokenizer Configuration\n", - "\n", - "This section sets up the necessary configurations for connecting to Elasticsearch and initializing the tokenizers used for text processing.\n", - "\n", - "### Configuration Details:\n", - "1. **Elasticsearch Credentials**:\n", - " - `es_username`: The username for Elasticsearch authentication.\n", - " - `es_password`: The password for Elasticsearch authentication, securely fetched using Google Colab's `userdata` module.\n", - " - `es_cloudid`: The Cloud ID for the Elasticsearch cluster.\n", - "\n", - "2. **Index Settings**:\n", - " - `raw_source_index`: The name of the index for the raw dataset (`harry_potter_dataset-raw`).\n", - " - `index_name`: The name of the enriched dataset index (`harry_potter_dataset_enriched`).\n", - " - `delete_raw_source_index`: A boolean flag indicating whether the raw data index should be deleted before ingesting new data.\n", - "\n", - "3. **Embedding Model**:\n", - " - `dense_embedding_model`: Specifies the model used for generating dense embeddings (`sentence-transformers/all-minilm-l6-v2`).\n", - "\n", - "4. **Tokenizer Initialization**:\n", - " - `bert_tokenizer`: Initializes the BERT tokenizer (`bert-base-uncased`) for English text processing.\n", - " - `e5_tokenizer`: Initializes the Multilingual E5 tokenizer (`intfloat/multilingual-e5-base`) for handling diverse datasets.\n", - "\n", - "5. 
**Chunking Parameters**:\n", - " - `SEMANTIC_SEARCH_TOKEN_LIMIT`: Sets the token limit for each chunk (500 tokens per chunk, considering space for special tokens).\n", - " - `ELSER_TOKEN_OVERLAP`: Defines the overlap ratio between chunks (default is 0%, customizable for context continuity).\n", - "\n", - "These configurations ensure that the necessary components are properly set up for efficient text processing, indexing, and search operations in Elasticsearch.\n" - ], - "metadata": { - "id": "2w7uTCYdQ0m6" - } - }, - { - "cell_type": "code", - "source": [ - "# Elasticsearch and Tokenizer Configuration\n", - "es_username = \"elastic\"\n", - "es_password = userdata.get(\"es_password\")\n", - "es_cloudid = userdata.get(\"es_cloudid\")\n", - "\n", - "raw_source_index = \"harry_potter_dataset-raw\"\n", - "index_name = \"harry_potter_dataset_enriched\"\n", - "\n", - "dense_embedding_model = \"sentence-transformers__all-minilm-l6-v2\"\n", - "elser_model_id = \".elser_model_2_linux-x86_64\"\n", - "\n", - "bert_tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", - "e5_tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-base\")\n", - "\n", - "\n", - "SEMANTIC_SEARCH_TOKEN_LIMIT = 500\n", - "ELSER_TOKEN_OVERLAP = 0.0" - ], - "metadata": { - "id": "LQzCw0pgN4ll", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 328, - "referenced_widgets": [ - "9c160e35cf414c528b5bffe05725a7d9", - "e87bc6913a7747728aed4b60a645bc2c", - "9fa94c466004402bb293e4aa0bdc82f4", - "95316b2f654a4ddc99c92d7c60c2f417", - "0fc0b516e82941dc934c26eba22d9e01", - "d93a6e3ddd364921a7c2a24451d27ffc", - "2155cf3c7b2043d0a41fc011bf4f0e04", - "500a70f25097484bbec10c0ffd402595", - "9d43995246744e26a8053c21e2c5fcfa", - "2ed85f5360ba4feda6469aabd0324e7a", - "808af1e1f2464a928ee23398c837ff48", - "31906527169a4c08801dc6b21936188d", - "a6570ce51dfc46f383d855e28534bf73", - "41cc49a71a164065bc833d080027e4d2", - "748e7f3c8da243e9b5320654ec8e8146", - 
"a88953429ab6436fb4f01b6b1e2cf6ff", - "b0a2671c90a048548314c2e3d21e19e7", - "5f2080a5d12241638447a5851d0c8db3", - "7ce44d2f323d45838633a750f2386525", - "406f3564a217478d8f60dee5e1fb6dbf", - "61ae734ac8d441fd9b3ea198aff3f2c7", - "bc52c57fa6464ab39823cd3ddb9d7d78", - "02f735a438bf4058a9cfacf8d2b8660f", - "331b178397164de49408dc50ce417a36", - "07ee43d2a1684fb0b1445755802b6ea5", - "c867bce7e34b4800903eb9ec99f34784", - "8169e16a9b0146f5a57a015601c2ebcb", - "35ca86faebfd43faaef0202389d958fd", - "f04f37ba10e9498ea61acdce637431ee", - "527bfa6067c84b94a1e70dfadfd4b78e", - "312e85864e074b958d86325b6417a0fa", - "3b0fc37739334025b037a5270c9515bf", - "0dfb7f264674449b92a390324d17c4cf", - "7e58bf25549d4b428f231d528e8fef54", - "461ca08f677a4cba9ec2a388c2e346f3", - "90d31fb52af949b0a2b41e3613827233", - "1afbe347ab364b28b887f49dad54f5d7", - "8483a759cc0e4e12834fc7d08dab3b7e", - "59d8ffb31bb340eba7e0dcebfbbdd977", - "2b62b542c091466cbae559e29ec797bd", - "092a4de220ba4ca2a23a0f273aba601b", - "e0c46565371f437a85a26d44c5b20c5b", - "4831ce9114e5437ea8a24919557c40e2", - "53344cac458d4d5ebdc504744c18b7de", - "f34bf7b0bb424a8e8c00ff75309bbe6f", - "d604bc170c02491fae573c702e790893", - "d8a3bdb8be354365944ab587738280d3", - "89ad2dee66324ae896eec71924aee670", - "31c0c3d684564d5fb87d2e25e6de96eb", - "b7bef190ebed494eb8773ec21d9b7160", - "6c822f08434c4212931dcf097a80b7d4", - "0f8d4b5000174234bded5d4e017aa4e9", - "243f1e7de5414b82aaa4b50482dd964d", - "0961f276155348f98c12d1be4ad78e62", - "95c6cffafe1b4345a905be485b787728", - "43072f923bd24566ae0e20ca9aa3cdc5", - "5a4f80526c2c4b53a1bce182e9b3e5fa", - "400b0a8ef7c64477bf4f02a16b5508a0", - "b03347c3201849778ea3129314ac340c", - "3542b02e36ce4e03850b37c28d88da30", - "7b2abc768054422f8af3d21837400b4a", - "d8e4ceae237d4381aa5e44b020d7564e", - "e5ec838bb84644b6a27e3eaec9d7ac74", - "30ba8c556cc34fde96b530ed66ac376d", - "711439e7dcab4c10ab4300bdbe6b86aa", - "15887401b0814d9386fb4d02d6279412", - "bec0cae37feb48a4add318d970d8ef96", - 
"feb8127671424fa68b9b93a7547e40eb", - "94c89aa435e44c8d9369305c21ca028c", - "7abe3bc7884e4afeb9995f7d7acc8c0f", - "7edccbff1ca145eabd4af6f9da32442a", - "9c11fce2ab1f4811a3d98f9154818825", - "2cf5d1a84ed947ddb16e5f8e6984b01e", - "d5de868032e640d5a07c34c9917190c3", - "d6295baf48b24c929c3ab4a317356e2b", - "df2ed1f8e3754f3a8f30be35935e82f3", - "2f125504e41344c088231f0307d7cb92", - "6a6665e93675459394536fd9f846fbea", - "aed65947759c47c58abe86f1ee279b86", - "b5d7fb93223c4c458e7c80e59daea4d2", - "540140c9c2f541b3a82f7a59e4f0b867", - "d5212aa2a4f74de1970c07e282f0e2bc", - "4a75d002a4ae4fc99625420ec6e580ee", - "695f77c019db487ea60171277073efe6", - "bc3fad5fe0194399add875a6d78907bd", - "42c418329198400bb77cdd3a654a96de", - "4ef5fe3e9ea84b8c8cb1a90a8208bdb9", - "735b9a74223f4941a2837a1108889f63" - ] - }, - "outputId": "7ed04793-8bb9-49c6-b090-82111d9835f6" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "tokenizer_config.json: 0%| | 0.00/48.0 [00:00= len(tokens):\n", + " break\n", + " return result\n", + "\n", + "\n", + "def check_task_status(es, task_id):\n", + " while True:\n", + " task_response = es.tasks.get(task_id=task_id)\n", + " if task_response[\"completed\"]:\n", + " print(\"Reindexing complete.\")\n", + " break\n", + " else:\n", + " print(\"Indexing...\")\n", + " time.sleep(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "izMU8HqqP7ld" + }, + "source": [ + "##Ingest Pipelines" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iUOFJK48OamP", + "outputId": "5dc25103-a2ee-4a19-e184-92ec65c29187" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ingest pipeline 'books_dataset_chunker' created/updated successfully.\n" + ] + } + ], + "source": [ + "# Define the ingest pipeline configuration\n", + "pipeline_body = {\n", + " \"description\": \"Pipeline for 
processing book passages\",\n", + " \"processors\": [\n", + " {\n", + " \"foreach\": {\n", + " \"field\": \"passages\",\n", + " \"processor\": {\n", + " \"inference\": {\n", + " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", + " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", + " \"target_field\": \"_ingest._value.vector\",\n", + " \"on_failure\": [\n", " {\n", - " \"nested\": {\n", - " \"path\": \"passages\",\n", - " \"query\": {\"match\": {\"passages.text\": user_query}},\n", - " \"inner_hits\": {\n", - " \"name\": \"text_hits\",\n", - " \"size\": 1,\n", - " \"_source\": [\n", - " \"passages.text\",\n", - " \"passages.chunk_number\",\n", - " ],\n", - " },\n", + " \"append\": {\n", + " \"field\": \"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + " {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", + " }\n", + " ],\n", " }\n", - " },\n", + " }\n", + " ],\n", + " }\n", + " },\n", + " }\n", + " },\n", + " {\n", + " \"foreach\": {\n", + " \"field\": \"passages\",\n", + " \"processor\": {\n", + " \"inference\": {\n", + " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", + " \"model_id\": elser_model_id,\n", + " \"target_field\": \"_ingest._value.content_embedding\",\n", + " \"on_failure\": [\n", " {\n", - " \"nested\": {\n", - " \"path\": \"passages\",\n", - " \"query\": {\n", - " \"script_score\": {\n", - " \"query\": {\n", - " \"knn\": {\n", - " \"field\": \"passages.vector.predicted_value\",\n", - " \"query_vector\": query_vector,\n", - " \"num_candidates\": 50,\n", - " }\n", - " },\n", - " \"script\": {\n", - " \"source\": \"Math.log(1 + _score * params.boost_factor)\",\n", - " \"params\": {\n", - " \"boost_factor\": knn_boost_factor\n", - " },\n", - " },\n", + " \"append\": {\n", + " \"field\": 
\"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + " {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", " }\n", - " },\n", - " \"inner_hits\": {\n", - " \"name\": \"dense_hit\",\n", - " \"size\": 1,\n", - " \"_source\": [\n", - " \"passages.text\",\n", - " \"passages.chunk_number\",\n", - " ],\n", - " },\n", + " ],\n", " }\n", + " }\n", + " ],\n", + " }\n", + " },\n", + " }\n", + " },\n", + " ],\n", + "}\n", + "\n", + "# Create or update the pipeline\n", + "pipeline_id = \"books_dataset_chunker\"\n", + "es = create_es_client()\n", + "es.ingest.put_pipeline(id=pipeline_id, body=pipeline_body)\n", + "print(f\"Ingest pipeline '{pipeline_id}' created/updated successfully.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6ZkRwEGdQBRP" + }, + "source": [ + "##Index Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vZ3Z5gZbOgjF", + "outputId": "996f6ca5-d27d-4ea0-ed4d-07570b9942ad" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index harry_potter_dataset_enriched exists. Deleting it...\n", + "Index harry_potter_dataset_enriched deleted!\n", + "Index harry_potter_dataset_enriched created successfully!\n", + "Index harry_potter_dataset-raw exists. 
Deleting it...\n", + "Index harry_potter_dataset-raw deleted!\n", + "Index harry_potter_dataset-raw created successfully!\n" + ] + } + ], + "source": [ + "index_settings = {\n", + " \"settings\": {\n", + " \"number_of_shards\": 2,\n", + " \"number_of_replicas\": 0,\n", + " \"default_pipeline\": \"books_dataset_chunker\",\n", + " },\n", + " \"mappings\": {\n", + " \"dynamic\": \"false\",\n", + " \"properties\": {\n", + " \"book_title\": {\"type\": \"keyword\"},\n", + " \"chapter\": {\"type\": \"keyword\"},\n", + " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", + " \"passages\": {\n", + " \"type\": \"nested\",\n", + " \"properties\": {\n", + " \"content_embedding\": {\n", + " \"properties\": {\n", + " \"is_truncated\": {\"type\": \"boolean\"},\n", + " \"model_id\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\n", + " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", + " },\n", " },\n", - " {\n", - " \"nested\": {\n", - " \"path\": \"passages\",\n", - " \"query\": {\n", - " \"script_score\": {\n", - " \"query\": {\n", - " \"bool\": {\n", - " \"should\": [\n", - " {\n", - " \"text_expansion\": {\n", - " \"passages.content_embedding.predicted_value\": {\n", - " \"model_id\": \".elser_model_2_linux-x86_64\",\n", - " \"model_text\": user_query,\n", - " }\n", - " }\n", - " }\n", - " ]\n", - " }\n", - " },\n", - " \"script\": {\n", - " \"source\": \"_score * params.boost_factor\",\n", - " \"params\": {\n", - " \"boost_factor\": text_expansion_boost\n", - " },\n", - " },\n", - " }\n", - " },\n", - " \"inner_hits\": {\n", - " \"name\": \"sparse_hits\",\n", - " \"size\": 1,\n", - " \"_source\": [\n", - " \"passages.text\",\n", - " \"passages.chunk_number\",\n", - " ],\n", - " },\n", - " }\n", + " \"predicted_value\": {\"type\": \"sparse_vector\"},\n", + " }\n", + " },\n", + " \"text\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"vector\": {\n", + " 
\"properties\": {\n", + " \"is_truncated\": {\"type\": \"boolean\"},\n", + " \"model_id\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\n", + " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", + " },\n", " },\n", - " ]\n", - " }\n", + " \"predicted_value\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 384,\n", + " \"index\": True,\n", + " \"similarity\": \"dot_product\",\n", + " },\n", + " }\n", + " },\n", + " \"chunk_number\": {\"type\": \"integer\"},\n", " },\n", - " \"score_mode\": \"sum\",\n", - " \"boost_mode\": \"sum\",\n", - " }\n", + " },\n", " },\n", - " }\n", - " if debug:\n", - " print(json.dumps(query, indent=4))\n", - " return query\n", - "\n", - "\n", - "def get_adjacent_chunks_query(doc_id, base_chunk_number, max_chunk_number, debug=False):\n", - " # Determine the chunk numbers to query based on the base_chunk_number\n", - " if base_chunk_number == 1:\n", - " chunk_numbers = [\n", - " base_chunk_number,\n", - " base_chunk_number + 1,\n", - " base_chunk_number + 2,\n", - " ]\n", - " elif base_chunk_number == max_chunk_number:\n", - " chunk_numbers = [\n", - " base_chunk_number,\n", - " base_chunk_number - 1,\n", - " base_chunk_number - 2,\n", - " ]\n", - " else:\n", - " chunk_numbers = [\n", - " base_chunk_number - 1,\n", - " base_chunk_number,\n", - " base_chunk_number + 1,\n", - " ]\n", + " },\n", + "}\n", "\n", - " # Construct the query\n", - " query = {\n", - " \"_source\": False,\n", - " \"query\": {\n", - " \"bool\": {\n", - " \"must\": [\n", - " {\"term\": {\"_id\": doc_id}},\n", - " {\n", - " \"nested\": {\n", - " \"path\": \"passages\",\n", - " \"query\": {\n", - " \"bool\": {\n", - " \"should\": [\n", - " {\"term\": {\"passages.chunk_number\": num}}\n", - " for num in chunk_numbers\n", - " ]\n", - " }\n", - " },\n", - " \"inner_hits\": {\n", - " \"_source\": [\"passages.text\", \"passages.chunk_number\"]\n", - " },\n", - " }\n", + "raw_source_index_settings = {\n", + " \"settings\": {\"number_of_shards\": 2, 
\"number_of_replicas\": 0},\n", + " \"mappings\": {\n", + " \"dynamic\": \"false\",\n", + " \"properties\": {\n", + " \"book_title\": {\"type\": \"keyword\"},\n", + " \"chapter\": {\"type\": \"keyword\"},\n", + " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", + " \"passages\": {\n", + " \"type\": \"nested\",\n", + " \"properties\": {\n", + " \"text\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", " },\n", - " ]\n", - " }\n", + " \"chunk_number\": {\"type\": \"integer\"},\n", + " },\n", + " },\n", " },\n", - " }\n", + " },\n", + "}\n", + "\n", + "# Manage indices\n", + "manage_index(\n", + " es,\n", + " index_name,\n", + " index_settings[\"settings\"],\n", + " index_settings[\"mappings\"],\n", + " delete_index=True,\n", + ")\n", + "manage_index(\n", + " es,\n", + " raw_source_index,\n", + " raw_source_index_settings[\"settings\"],\n", + " raw_source_index_settings[\"mappings\"],\n", + " delete_index=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NPtbLhVOQUF3" + }, + "source": [ + "## Fetch and Process the Book Text\n", + "\n", + "This section downloads the full text of \"Harry Potter and the Sorcerer's Stone\" from a specified URL and processes it to extract chapters and their titles. The text is then structured into a pandas DataFrame for further analysis and indexing.\n", + "\n", + "### Key Steps:\n", + "1. **Download Text**: The book is fetched using `urllib.request` from the provided URL.\n", + "2. **Extract Chapters**: The text is split into chapters based on predefined patterns, omitting the text before the first chapter.\n", + "3. **Capture Chapter Titles**: Chapter titles are extracted and paired with their respective texts.\n", + "4. 
**Data Structuring**:\n", + " - Convert the list of chapter titles and texts into a DataFrame.\n", + " - Assign sequential numbers to chapters.\n", + " - Add the book title as metadata.\n", + " - Apply a text chunking function to split each chapter into manageable passages.\n", + "\n", + "This prepares the text data for efficient indexing and advanced search operations in Elasticsearch.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0L4YI96xOuKn", + "outputId": "7f9c63c7-82d8-4490-aabb-c3629872d80d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total chapters found: 17\n", + "First chapter title: CHAPTER ONE\n", + "Text sample from first chapter: \n", + "\n", + "THE BOY WHO LIVED\n", + "\n", + "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\n", + "that they were perfectly normal, thank you very much. They were the last\n", + "people you'd expect to be involved in anything strange or mysterious,\n", + "because they just didn't hold with such nonsense.\n", + "\n", + "Mr. Dursley was the director of a firm called Grunnings, which made\n", + "drills. He was a big, beefy man with hardly any neck, although he did\n", + "have a very large mustache. Mrs. Dursley was thin and blonde and had\n", + "nearly t\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (6535 > 512). 
Running this sequence through the model will result in indexing errors\n" + ] + } + ], + "source": [ + "# Fetch and process the book text\n", + "potter_book_url = \"https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt\"\n", + "response = urllib.request.urlopen(potter_book_url)\n", + "harry_potter_book_text = response.read().decode(\"utf-8\")\n", + "chapter_pattern = re.compile(r\"CHAPTER [A-Z]+\", re.IGNORECASE)\n", + "chapters = chapter_pattern.split(harry_potter_book_text)[1:]\n", + "chapter_titles = re.findall(chapter_pattern, harry_potter_book_text)\n", + "chapters_with_titles = list(zip(chapter_titles, chapters))\n", + "\n", + "print(\"Total chapters found:\", len(chapters))\n", + "if chapters_with_titles:\n", + " print(\"First chapter title:\", chapters_with_titles[0][0])\n", + " print(\"Text sample from first chapter:\", chapters_with_titles[0][1][:500])\n", + "\n", + "\n", + "# Structuring chapters into a DataFrame\n", + "df = pd.DataFrame(chapters_with_titles, columns=[\"chapter_title\", \"chapter_full_text\"])\n", + "df[\"chapter\"] = df.index + 1\n", + "df[\"book_title\"] = \"Harry Potter and the Sorcerer’s Stone\"\n", + "df[\"passages\"] = df[\"chapter_full_text\"].apply(lambda text: chunk(text))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DKK4574EQaTl" + }, + "source": [ + "## Indexing DataFrame into Elasticsearch\n", + "\n", + "This section uploads the structured data from a pandas DataFrame into a specified Elasticsearch index. The DataFrame contains chapter information from \"Harry Potter and the Sorcerer's Stone\", including chapter titles, full texts, and additional metadata.\n", "\n", - " if debug:\n", - " print(json.dumps(query, indent=4))\n", + "### Key Operation:\n", + "- **Index Data**: The `index_dataframe` function is called with the Elasticsearch client, the raw source index name, and the DataFrame as arguments. 
This operation effectively uploads the data into Elasticsearch, making it searchable and ready for further processing.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7ReLAtz1O1HF", + "outputId": "3bf70ccc-804d-4718-e2a7-13dc0008e073" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Indexing documents to harry_potter_dataset-raw...\n", + "Successfully indexed 17 documents.\n", + "Failed to index 0 documents.\n" + ] + } + ], + "source": [ + "index_dataframe(es, raw_source_index, df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pA5QroYdQgcM" + }, + "source": [ + "## Asynchronous Reindexing in Elasticsearch\n", "\n", - " return query\n", + "This section initiates an asynchronous reindex operation to transfer data from the raw source index to the enriched index in Elasticsearch. This process runs in the background, allowing other operations to continue without waiting for completion.\n", "\n", + "### Key Steps:\n", + "1. **Start Reindex**: The reindex operation is triggered from the `raw_source_index` to the `index_name`, with `wait_for_completion` set to `False` to allow asynchronous execution.\n", + "2. **Retrieve Task ID**: The task ID of the reindex operation is captured and printed for monitoring purposes.\n", + "3. 
**Monitor Progress**: The `check_task_status` function continuously checks the status of the reindex task, providing updates every 10 seconds until the operation is complete.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HOCX_lbmO3zl", + "outputId": "014309de-8ec6-4cf8-b647-6bf0e6f512d8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task ID: _ov-FtHBSkqocXXBG6nu4A:68576798\n", + "Indexing...\n", + "Reindexing complete.\n" + ] + } + ], + "source": [ + "# Start the reindex operation asynchronously\n", + "response = es.reindex(\n", + " body={\"source\": {\"index\": raw_source_index}, \"dest\": {\"index\": index_name}},\n", + " wait_for_completion=False,\n", + ")\n", + "task_id = response[\"task\"]\n", + "print(\"Task ID:\", task_id)\n", + "check_task_status(es, task_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xJBDwRmDQq4n" + }, + "source": [ + "## Custom Search Query Construction and Execution\n", "\n", - "def get_max_chunk_number_query(chapter_number, debug=False):\n", - " # Construct the query\n", - " query = {\n", - " \"size\": 0,\n", - " \"query\": {\"term\": {\"chapter\": chapter_number}},\n", - " \"aggs\": {\n", - " \"max_chunk_number\": {\n", - " \"nested\": {\"path\": \"passages\"},\n", - " \"aggs\": {\"max_chunk\": {\"max\": {\"field\": \"passages.chunk_number\"}}},\n", - " }\n", - " },\n", - " }\n", + "This section constructs and executes a custom search query in Elasticsearch, utilizing a hybrid approach combining vector and text-based search methods to enhance search accuracy and relevance. The specific example used is a user query about the \"Nimbus 2000\".\n", "\n", - " if debug:\n", - " print(json.dumps(query, indent=4))\n", + "### Key Steps:\n", + "1. **Define User Query**: The user query is specified as \"what is a nimbus 2000\".\n", + "2. 
**Set Boost Factors**:\n", + " - `knn_boost_factor`: A value to amplify the importance of the vector-based search component.\n", + " - `text_expansion_boost`: A value to modify the weight of the text-based search component.\n", + "3. **Build Query**: The `build_custom_query` function constructs the search query, incorporating both dense vector and text expansion components.\n", + "4. **Execute Search**: The query is executed against the specified Elasticsearch index.\n", + "5. **Identify Relevant Passages**:\n", + " - The search results are analyzed to find the passage with the highest relevance score.\n", + " - The ID and chunk number of the best matching passage are captured and printed.\n", + "6. **Fetch Surrounding Chunks**: Constructs and executes a query to retrieve chunks adjacent to the identified passage for broader context. If the matched chunk is the first chunk, fetches n, n+1, and n+2. If the chunk is the last chunk in the chapter, fetches n, n-1, and n-2. For other chunks, fetches n-1, n, and n+1.\n", + "7. **Display Results**: Outputs text from the relevant and adjacent passages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u7NFZBRJO3t7", + "outputId": "6f9ec0d9-bb1d-4235-da45-1c8040ac7036" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Matched Chunk ID: wz8m148BbBK3er50L0-W, Chunk Number: 3, Text:\n", + "t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. 
he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Fetch Surrounding Chunks\n", + "------------------------\n", + "\n", + "\n", + "Text from Chunk 2: t speaking to us? \" said harry. 
\" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. 
inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Text from Chunk 3: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. 
\" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Text from Chunk 4: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. 
what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n" + ] + } + ], + "source": [ + "# Custom Search Query Construction\n", + "user_query = \"what is a nimbus 2000\"\n", "\n", - " return query\n", "\n", + "knn_boost_factor = 20\n", + "text_expansion_boost = 1\n", + "query = build_custom_query(\n", + " build_vector(user_query),\n", + " user_query,\n", + " knn_boost_factor,\n", + " text_expansion_boost,\n", + " debug=False,\n", + ")\n", + "\n", + "# Searching and identifying relevant passages\n", + "results = es.search(index=index_name, body=query, _source=False)\n", "\n", - "def print_text_from_results(results):\n", - " if results[\"hits\"][\"hits\"]:\n", - " for hit in results[\"hits\"][\"hits\"]:\n", - " if \"inner_hits\" in hit and \"passages\" in hit[\"inner_hits\"]:\n", - " nested_hits = hit[\"inner_hits\"][\"passages\"][\"hits\"][\"hits\"]\n", - " for nested_hit in nested_hits:\n", - " chunk_number = nested_hit[\"_source\"][\"chunk_number\"]\n", - " text = nested_hit[\"_source\"][\"text\"]\n", - " # print(f\"Text from Chunk {chunk_number}: {text}\")\n", - " print(\n", - " f\"\\n\\nText from Chunk {chunk_number}: {textwrap.fill(first_passage_text, width=200)}\"\n", - " )\n", - " else:\n", - " print(\"No hits found.\")\n", + "hit_id = None\n", + "chunk_number = None\n", "\n", + 
"if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", + " highest_score = -1\n", + " best_hit = None\n", + " hit_id = results[\"hits\"][\"hits\"][0][\"_id\"]\n", + " chapter_number = results[\"hits\"][\"hits\"][0][\"fields\"][\"chapter\"][0]\n", + " if \"inner_hits\" in results[\"hits\"][\"hits\"][0]:\n", + " for hit_type in [\"text_hits\", \"dense_hit\", \"sparse_hits\"]:\n", + " if hit_type in results[\"hits\"][\"hits\"][0][\"inner_hits\"]:\n", + " inner_hit = results[\"hits\"][\"hits\"][0][\"inner_hits\"][hit_type][\"hits\"]\n", + " if inner_hit[\"hits\"]:\n", + " max_score = inner_hit[\"max_score\"]\n", + " if max_score and max_score > highest_score:\n", + " highest_score = max_score\n", + " best_hit = inner_hit[\"hits\"][0]\n", "\n", - "def chunk(\n", - " text, chunk_size=SEMANTIC_SEARCH_TOKEN_LIMIT, overlap_ratio=ELSER_TOKEN_OVERLAP\n", - "):\n", - " step_size = round(chunk_size * (1 - overlap_ratio))\n", - " tokens = bert_tokenizer.encode(text)\n", - " tokens = tokens[1:-1] # remove special beginning and end tokens\n", - " result = []\n", - " chunk_number = 1\n", - " for i in range(0, len(tokens), step_size):\n", - " end = i + chunk_size\n", - " chunk_text = bert_tokenizer.decode(tokens[i:end])\n", - " result.append({\"text\": chunk_text, \"chunk_number\": chunk_number})\n", - " chunk_number += 1\n", - " if end >= len(tokens):\n", - " break\n", - " return result\n", + " if best_hit:\n", + " first_passage_text = best_hit[\"_source\"][\"text\"]\n", + " chunk_number = best_hit[\"_source\"][\"chunk_number\"]\n", + " # print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", + " print(\n", + " f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\"\n", + " )\n", + " print(f\"\\n\")\n", + " else:\n", + " print(f\"ID: {hit_id}, No relevant passages found.\")\n", + "else:\n", + " print(\"No results found.\")\n", "\n", + "print(f\"Fetch 
Surrounding Chunks\")\n", + "print(f\"------------------------\")\n", "\n", - "def check_task_status(es, task_id):\n", - " while True:\n", - " task_response = es.tasks.get(task_id=task_id)\n", - " if task_response[\"completed\"]:\n", - " print(\"Reindexing complete.\")\n", - " break\n", - " else:\n", - " print(\"Indexing...\")\n", - " time.sleep(10)" - ], - "metadata": { - "id": "xB2a9-qtONbQ" - }, - "execution_count": null, - "outputs": [] + "max_chapter_chunk_result = es.search(\n", + " index=index_name,\n", + " body=get_max_chunk_number_query(chapter_number, debug=False),\n", + " _source=False,\n", + ")\n", + "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", + " \"max_chunk\"\n", + "][\"value\"]\n", + "\n", + "adjacent_chunks_query = get_adjacent_chunks_query(\n", + " hit_id, chunk_number, max_chunk_number, debug=False\n", + ")\n", + "results = es.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", + "print_text_from_results(results)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] }, - { - "cell_type": "markdown", - "source": [ - "##Ingest Pipelines" - ], - "metadata": { - "id": "izMU8HqqP7ld" - } + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - { - "cell_type": "code", - "source": [ - "# Define the ingest pipeline configuration\n", - "pipeline_body = {\n", - " \"description\": \"Pipeline for processing book passages\",\n", - " \"processors\": [\n", - " {\n", - " \"foreach\": {\n", - " \"field\": \"passages\",\n", - " \"processor\": {\n", - " \"inference\": {\n", - " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", - " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", - " \"target_field\": \"_ingest._value.vector\",\n", - " \"on_failure\": [\n", - " {\n", - " \"append\": {\n", - " \"field\": \"_source._ingest.inference_errors\",\n", - " \"value\": [\n", - " {\n", - " \"message\": \"Processor 'inference' in pipeline 
'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", - " \"pipeline\": \"ml-inference-title-vector\",\n", - " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", - " }\n", - " ],\n", - " }\n", - " }\n", - " ],\n", - " }\n", - " },\n", - " }\n", - " },\n", - " {\n", - " \"foreach\": {\n", - " \"field\": \"passages\",\n", - " \"processor\": {\n", - " \"inference\": {\n", - " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", - " \"model_id\": elser_model_id,\n", - " \"target_field\": \"_ingest._value.content_embedding\",\n", - " \"on_failure\": [\n", - " {\n", - " \"append\": {\n", - " \"field\": \"_source._ingest.inference_errors\",\n", - " \"value\": [\n", - " {\n", - " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", - " \"pipeline\": \"ml-inference-title-vector\",\n", - " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", - " }\n", - " ],\n", - " }\n", - " }\n", - " ],\n", - " }\n", - " },\n", - " }\n", - " },\n", - " ],\n", - "}\n", - "\n", - "# Create or update the pipeline\n", - "pipeline_id = \"books_dataset_chunker\"\n", - "es = create_es_client()\n", - "es.ingest.put_pipeline(id=pipeline_id, body=pipeline_body)\n", - "print(f\"Ingest pipeline '{pipeline_id}' created/updated successfully.\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "02f735a438bf4058a9cfacf8d2b8660f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + 
"children": [ + "IPY_MODEL_331b178397164de49408dc50ce417a36", + "IPY_MODEL_07ee43d2a1684fb0b1445755802b6ea5", + "IPY_MODEL_c867bce7e34b4800903eb9ec99f34784" + ], + "layout": "IPY_MODEL_8169e16a9b0146f5a57a015601c2ebcb" + } + }, + "07ee43d2a1684fb0b1445755802b6ea5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_527bfa6067c84b94a1e70dfadfd4b78e", + "max": 466062, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_312e85864e074b958d86325b6417a0fa", + "value": 466062 + } + }, + "092a4de220ba4ca2a23a0f273aba601b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": 
null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0961f276155348f98c12d1be4ad78e62": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0dfb7f264674449b92a390324d17c4cf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0f8d4b5000174234bded5d4e017aa4e9": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0fc0b516e82941dc934c26eba22d9e01": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "15887401b0814d9386fb4d02d6279412": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1afbe347ab364b28b887f49dad54f5d7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4831ce9114e5437ea8a24919557c40e2", + "placeholder": "​", + "style": "IPY_MODEL_53344cac458d4d5ebdc504744c18b7de", + "value": " 570/570 [00:00<00:00, 19.9kB/s]" + } + }, + "2155cf3c7b2043d0a41fc011bf4f0e04": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "description_width": "" + } + }, + "243f1e7de5414b82aaa4b50482dd964d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2b62b542c091466cbae559e29ec797bd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2cf5d1a84ed947ddb16e5f8e6984b01e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2ed85f5360ba4feda6469aabd0324e7a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + 
"display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f125504e41344c088231f0307d7cb92": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "30ba8c556cc34fde96b530ed66ac376d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "312e85864e074b958d86325b6417a0fa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, 
+ "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "31906527169a4c08801dc6b21936188d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a6570ce51dfc46f383d855e28534bf73", + "IPY_MODEL_41cc49a71a164065bc833d080027e4d2", + "IPY_MODEL_748e7f3c8da243e9b5320654ec8e8146" + ], + "layout": "IPY_MODEL_a88953429ab6436fb4f01b6b1e2cf6ff" + } + }, + "31c0c3d684564d5fb87d2e25e6de96eb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + 
"padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "331b178397164de49408dc50ce417a36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_35ca86faebfd43faaef0202389d958fd", + "placeholder": "​", + "style": "IPY_MODEL_f04f37ba10e9498ea61acdce637431ee", + "value": "tokenizer.json: 100%" + } + }, + "3542b02e36ce4e03850b37c28d88da30": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } 
+ }, + "35ca86faebfd43faaef0202389d958fd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3b0fc37739334025b037a5270c9515bf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "400b0a8ef7c64477bf4f02a16b5508a0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e5ec838bb84644b6a27e3eaec9d7ac74", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_30ba8c556cc34fde96b530ed66ac376d", + "value": 5069051 + } + }, + "406f3564a217478d8f60dee5e1fb6dbf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "41cc49a71a164065bc833d080027e4d2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ce44d2f323d45838633a750f2386525", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_406f3564a217478d8f60dee5e1fb6dbf", + "value": 231508 + } + }, + "42c418329198400bb77cdd3a654a96de": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "43072f923bd24566ae0e20ca9aa3cdc5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5a4f80526c2c4b53a1bce182e9b3e5fa", + "IPY_MODEL_400b0a8ef7c64477bf4f02a16b5508a0", + "IPY_MODEL_b03347c3201849778ea3129314ac340c" + ], + "layout": "IPY_MODEL_3542b02e36ce4e03850b37c28d88da30" + } + }, + "461ca08f677a4cba9ec2a388c2e346f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", 
+ "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_59d8ffb31bb340eba7e0dcebfbbdd977", + "placeholder": "​", + "style": "IPY_MODEL_2b62b542c091466cbae559e29ec797bd", + "value": "config.json: 100%" + } + }, + "4831ce9114e5437ea8a24919557c40e2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a75d002a4ae4fc99625420ec6e580ee": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4ef5fe3e9ea84b8c8cb1a90a8208bdb9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "500a70f25097484bbec10c0ffd402595": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "527bfa6067c84b94a1e70dfadfd4b78e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": 
null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "53344cac458d4d5ebdc504744c18b7de": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "540140c9c2f541b3a82f7a59e4f0b867": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4ef5fe3e9ea84b8c8cb1a90a8208bdb9", + "placeholder": "​", + "style": "IPY_MODEL_735b9a74223f4941a2837a1108889f63", + "value": " 280/280 [00:00<00:00, 15.7kB/s]" + } + }, + "59d8ffb31bb340eba7e0dcebfbbdd977": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5a4f80526c2c4b53a1bce182e9b3e5fa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7b2abc768054422f8af3d21837400b4a", + "placeholder": "​", + "style": "IPY_MODEL_d8e4ceae237d4381aa5e44b020d7564e", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "5f2080a5d12241638447a5851d0c8db3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "61ae734ac8d441fd9b3ea198aff3f2c7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "695f77c019db487ea60171277073efe6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6a6665e93675459394536fd9f846fbea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_aed65947759c47c58abe86f1ee279b86", + "IPY_MODEL_b5d7fb93223c4c458e7c80e59daea4d2", + "IPY_MODEL_540140c9c2f541b3a82f7a59e4f0b867" + ], + "layout": "IPY_MODEL_d5212aa2a4f74de1970c07e282f0e2bc" + } + }, + "6c822f08434c4212931dcf097a80b7d4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "711439e7dcab4c10ab4300bdbe6b86aa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "735b9a74223f4941a2837a1108889f63": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "748e7f3c8da243e9b5320654ec8e8146": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_61ae734ac8d441fd9b3ea198aff3f2c7", + "placeholder": "​", + "style": "IPY_MODEL_bc52c57fa6464ab39823cd3ddb9d7d78", + "value": " 232k/232k [00:00<00:00, 2.88MB/s]" + } + }, + "7abe3bc7884e4afeb9995f7d7acc8c0f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_df2ed1f8e3754f3a8f30be35935e82f3", + "placeholder": "​", + "style": "IPY_MODEL_2f125504e41344c088231f0307d7cb92", + "value": " 17.1M/17.1M 
[00:00<00:00, 76.7MB/s]" + } + }, + "7b2abc768054422f8af3d21837400b4a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7ce44d2f323d45838633a750f2386525": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7e58bf25549d4b428f231d528e8fef54": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_461ca08f677a4cba9ec2a388c2e346f3", + "IPY_MODEL_90d31fb52af949b0a2b41e3613827233", + "IPY_MODEL_1afbe347ab364b28b887f49dad54f5d7" + ], + "layout": "IPY_MODEL_8483a759cc0e4e12834fc7d08dab3b7e" + } + }, + "7edccbff1ca145eabd4af6f9da32442a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "808af1e1f2464a928ee23398c837ff48": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8169e16a9b0146f5a57a015601c2ebcb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8483a759cc0e4e12834fc7d08dab3b7e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "89ad2dee66324ae896eec71924aee670": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0961f276155348f98c12d1be4ad78e62", + 
"placeholder": "​", + "style": "IPY_MODEL_95c6cffafe1b4345a905be485b787728", + "value": " 418/418 [00:00<00:00, 13.9kB/s]" + } + }, + "90d31fb52af949b0a2b41e3613827233": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_092a4de220ba4ca2a23a0f273aba601b", + "max": 570, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e0c46565371f437a85a26d44c5b20c5b", + "value": 570 + } + }, + "94c89aa435e44c8d9369305c21ca028c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d5de868032e640d5a07c34c9917190c3", + "max": 17082660, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d6295baf48b24c929c3ab4a317356e2b", + "value": 17082660 + } + }, + "95316b2f654a4ddc99c92d7c60c2f417": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": 
"1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2ed85f5360ba4feda6469aabd0324e7a", + "placeholder": "​", + "style": "IPY_MODEL_808af1e1f2464a928ee23398c837ff48", + "value": " 48.0/48.0 [00:00<00:00, 1.58kB/s]" + } + }, + "95c6cffafe1b4345a905be485b787728": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9c11fce2ab1f4811a3d98f9154818825": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"9c160e35cf414c528b5bffe05725a7d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e87bc6913a7747728aed4b60a645bc2c", + "IPY_MODEL_9fa94c466004402bb293e4aa0bdc82f4", + "IPY_MODEL_95316b2f654a4ddc99c92d7c60c2f417" + ], + "layout": "IPY_MODEL_0fc0b516e82941dc934c26eba22d9e01" + } + }, + "9d43995246744e26a8053c21e2c5fcfa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9fa94c466004402bb293e4aa0bdc82f4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_500a70f25097484bbec10c0ffd402595", + "max": 48, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9d43995246744e26a8053c21e2c5fcfa", + "value": 48 + } + }, + "a6570ce51dfc46f383d855e28534bf73": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0a2671c90a048548314c2e3d21e19e7", + "placeholder": "​", + "style": "IPY_MODEL_5f2080a5d12241638447a5851d0c8db3", + "value": "vocab.txt: 100%" + } + }, + "a88953429ab6436fb4f01b6b1e2cf6ff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aed65947759c47c58abe86f1ee279b86": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": 
[], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4a75d002a4ae4fc99625420ec6e580ee", + "placeholder": "​", + "style": "IPY_MODEL_695f77c019db487ea60171277073efe6", + "value": "special_tokens_map.json: 100%" + } + }, + "b03347c3201849778ea3129314ac340c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_711439e7dcab4c10ab4300bdbe6b86aa", + "placeholder": "​", + "style": "IPY_MODEL_15887401b0814d9386fb4d02d6279412", + "value": " 5.07M/5.07M [00:00<00:00, 19.7MB/s]" + } + }, + "b0a2671c90a048548314c2e3d21e19e7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, 
+ "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b5d7fb93223c4c458e7c80e59daea4d2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bc3fad5fe0194399add875a6d78907bd", + "max": 280, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_42c418329198400bb77cdd3a654a96de", + "value": 280 + } + }, + "b7bef190ebed494eb8773ec21d9b7160": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bc3fad5fe0194399add875a6d78907bd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bc52c57fa6464ab39823cd3ddb9d7d78": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bec0cae37feb48a4add318d970d8ef96": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_feb8127671424fa68b9b93a7547e40eb", + "IPY_MODEL_94c89aa435e44c8d9369305c21ca028c", + "IPY_MODEL_7abe3bc7884e4afeb9995f7d7acc8c0f" + ], + "layout": "IPY_MODEL_7edccbff1ca145eabd4af6f9da32442a" + } + }, + "c867bce7e34b4800903eb9ec99f34784": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3b0fc37739334025b037a5270c9515bf", + "placeholder": "​", + "style": "IPY_MODEL_0dfb7f264674449b92a390324d17c4cf", + "value": " 466k/466k [00:00<00:00, 6.88MB/s]" + } + }, + "d5212aa2a4f74de1970c07e282f0e2bc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + 
"display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d5de868032e640d5a07c34c9917190c3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"d604bc170c02491fae573c702e790893": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b7bef190ebed494eb8773ec21d9b7160", + "placeholder": "​", + "style": "IPY_MODEL_6c822f08434c4212931dcf097a80b7d4", + "value": "tokenizer_config.json: 100%" + } + }, + "d6295baf48b24c929c3ab4a317356e2b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - "id": "iUOFJK48OamP", - "outputId": "5dc25103-a2ee-4a19-e184-92ec65c29187" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Ingest pipeline 'books_dataset_chunker' created/updated successfully.\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "##Index Settings" - ], - "metadata": { - "id": "6ZkRwEGdQBRP" - } - }, - { - "cell_type": "code", - "source": [ - "index_settings = {\n", - " \"settings\": {\n", - " \"number_of_shards\": 2,\n", - " \"number_of_replicas\": 0,\n", - " \"default_pipeline\": \"books_dataset_chunker\",\n", - " },\n", - " \"mappings\": {\n", - " \"dynamic\": \"false\",\n", - " \"properties\": {\n", - " \"book_title\": {\"type\": \"keyword\"},\n", - " \"chapter\": {\"type\": \"keyword\"},\n", - " \"chapter_full_text\": {\"type\": 
\"text\", \"index\": False},\n", - " \"passages\": {\n", - " \"type\": \"nested\",\n", - " \"properties\": {\n", - " \"content_embedding\": {\n", - " \"properties\": {\n", - " \"is_truncated\": {\"type\": \"boolean\"},\n", - " \"model_id\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", - " },\n", - " },\n", - " \"predicted_value\": {\"type\": \"sparse_vector\"},\n", - " }\n", - " },\n", - " \"text\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", - " },\n", - " \"vector\": {\n", - " \"properties\": {\n", - " \"is_truncated\": {\"type\": \"boolean\"},\n", - " \"model_id\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", - " },\n", - " },\n", - " \"predicted_value\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 384,\n", - " \"index\": True,\n", - " \"similarity\": \"dot_product\",\n", - " },\n", - " }\n", - " },\n", - " \"chunk_number\": {\"type\": \"integer\"},\n", - " },\n", - " },\n", - " },\n", - " },\n", - "}\n", - "\n", - "raw_source_index_settings = {\n", - " \"settings\": {\"number_of_shards\": 2, \"number_of_replicas\": 0},\n", - " \"mappings\": {\n", - " \"dynamic\": \"false\",\n", - " \"properties\": {\n", - " \"book_title\": {\"type\": \"keyword\"},\n", - " \"chapter\": {\"type\": \"keyword\"},\n", - " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", - " \"passages\": {\n", - " \"type\": \"nested\",\n", - " \"properties\": {\n", - " \"text\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", - " },\n", - " \"chunk_number\": {\"type\": \"integer\"},\n", - " },\n", - " },\n", - " },\n", - " },\n", - "}\n", - "\n", - "# Manage indices\n", - "manage_index(\n", - " es,\n", - " index_name,\n", - " index_settings[\"settings\"],\n", - " 
index_settings[\"mappings\"],\n", - " delete_index=True,\n", - ")\n", - "manage_index(\n", - " es,\n", - " raw_source_index,\n", - " raw_source_index_settings[\"settings\"],\n", - " raw_source_index_settings[\"mappings\"],\n", - " delete_index=True,\n", - ")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "d8a3bdb8be354365944ab587738280d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0f8d4b5000174234bded5d4e017aa4e9", + "max": 418, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_243f1e7de5414b82aaa4b50482dd964d", + "value": 418 + } }, - "id": "vZ3Z5gZbOgjF", - "outputId": "996f6ca5-d27d-4ea0-ed4d-07570b9942ad" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Index harry_potter_dataset_enriched exists. Deleting it...\n", - "Index harry_potter_dataset_enriched deleted!\n", - "Index harry_potter_dataset_enriched created successfully!\n", - "Index harry_potter_dataset-raw exists. Deleting it...\n", - "Index harry_potter_dataset-raw deleted!\n", - "Index harry_potter_dataset-raw created successfully!\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Fetch and Process the Book Text\n", - "\n", - "This section downloads the full text of \"Harry Potter and the Sorcerer's Stone\" from a specified URL and processes it to extract chapters and their titles. The text is then structured into a pandas DataFrame for further analysis and indexing.\n", - "\n", - "### Key Steps:\n", - "1. 
**Download Text**: The book is fetched using `urllib.request` from the provided URL.\n", - "2. **Extract Chapters**: The text is split into chapters based on predefined patterns, omitting the text before the first chapter.\n", - "3. **Capture Chapter Titles**: Chapter titles are extracted and paired with their respective texts.\n", - "4. **Data Structuring**:\n", - " - Convert the list of chapter titles and texts into a DataFrame.\n", - " - Assign sequential numbers to chapters.\n", - " - Add the book title as metadata.\n", - " - Apply a text chunking function to split each chapter into manageable passages.\n", - "\n", - "This prepares the text data for efficient indexing and advanced search operations in Elasticsearch.\n" - ], - "metadata": { - "id": "NPtbLhVOQUF3" - } - }, - { - "cell_type": "code", - "source": [ - "# Fetch and process the book text\n", - "potter_book_url = \"https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt\"\n", - "response = urllib.request.urlopen(potter_book_url)\n", - "harry_potter_book_text = response.read().decode(\"utf-8\")\n", - "chapter_pattern = re.compile(r\"CHAPTER [A-Z]+\", re.IGNORECASE)\n", - "chapters = chapter_pattern.split(harry_potter_book_text)[1:]\n", - "chapter_titles = re.findall(chapter_pattern, harry_potter_book_text)\n", - "chapters_with_titles = list(zip(chapter_titles, chapters))\n", - "\n", - "print(\"Total chapters found:\", len(chapters))\n", - "if chapters_with_titles:\n", - " print(\"First chapter title:\", chapters_with_titles[0][0])\n", - " print(\"Text sample from first chapter:\", chapters_with_titles[0][1][:500])\n", - "\n", - "\n", - "# Structuring chapters into a DataFrame\n", - "df = pd.DataFrame(chapters_with_titles, columns=[\"chapter_title\", \"chapter_full_text\"])\n", - "df[\"chapter\"] = df.index + 1\n", - "df[\"book_title\"] = \"Harry Potter and the Sorcerer’s Stone\"\n", - "df[\"passages\"] = 
df[\"chapter_full_text\"].apply(lambda text: chunk(text))" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "d8e4ceae237d4381aa5e44b020d7564e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d93a6e3ddd364921a7c2a24451d27ffc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "id": "0L4YI96xOuKn", - "outputId": "7f9c63c7-82d8-4490-aabb-c3629872d80d" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - 
"name": "stdout", - "text": [ - "Total chapters found: 17\n", - "First chapter title: CHAPTER ONE\n", - "Text sample from first chapter: \n", - "\n", - "THE BOY WHO LIVED\n", - "\n", - "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\n", - "that they were perfectly normal, thank you very much. They were the last\n", - "people you'd expect to be involved in anything strange or mysterious,\n", - "because they just didn't hold with such nonsense.\n", - "\n", - "Mr. Dursley was the director of a firm called Grunnings, which made\n", - "drills. He was a big, beefy man with hardly any neck, although he did\n", - "have a very large mustache. Mrs. Dursley was thin and blonde and had\n", - "nearly t\n" - ] + "df2ed1f8e3754f3a8f30be35935e82f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "output_type": 
"stream", - "name": "stderr", - "text": [ - "Token indices sequence length is longer than the specified maximum sequence length for this model (6535 > 512). Running this sequence through the model will result in indexing errors\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Indexing DataFrame into Elasticsearch\n", - "\n", - "This section uploads the structured data from a pandas DataFrame into a specified Elasticsearch index. The DataFrame contains chapter information from \"Harry Potter and the Sorcerer's Stone\", including chapter titles, full texts, and additional metadata.\n", - "\n", - "### Key Operation:\n", - "- **Index Data**: The `index_dataframe` function is called with the Elasticsearch client, the raw source index name, and the DataFrame as arguments. This operation effectively uploads the data into Elasticsearch, making it searchable and ready for further processing.\n" - ], - "metadata": { - "id": "DKK4574EQaTl" - } - }, - { - "cell_type": "code", - "source": [ - "index_dataframe(es, raw_source_index, df)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "e0c46565371f437a85a26d44c5b20c5b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - "id": "7ReLAtz1O1HF", - "outputId": "3bf70ccc-804d-4718-e2a7-13dc0008e073" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Indexing documents to harry_potter_dataset-raw...\n", - "Successfully indexed 17 documents.\n", - "Failed to index 0 documents.\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - 
"## Asynchronous Reindexing in Elasticsearch\n", - "\n", - "This section initiates an asynchronous reindex operation to transfer data from the raw source index to the enriched index in Elasticsearch. This process runs in the background, allowing other operations to continue without waiting for completion.\n", - "\n", - "### Key Steps:\n", - "1. **Start Reindex**: The reindex operation is triggered from the `raw_source_index` to the `index_name`, with `wait_for_completion` set to `False` to allow asynchronous execution.\n", - "2. **Retrieve Task ID**: The task ID of the reindex operation is captured and printed for monitoring purposes.\n", - "3. **Monitor Progress**: The `check_task_status` function continuously checks the status of the reindex task, providing updates every 10 seconds until the operation is complete.\n" - ], - "metadata": { - "id": "pA5QroYdQgcM" - } - }, - { - "cell_type": "code", - "source": [ - "# Start the reindex operation asynchronously\n", - "response = es.reindex(\n", - " body={\"source\": {\"index\": raw_source_index}, \"dest\": {\"index\": index_name}},\n", - " wait_for_completion=False,\n", - ")\n", - "task_id = response[\"task\"]\n", - "print(\"Task ID:\", task_id)\n", - "check_task_status(es, task_id)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "e5ec838bb84644b6a27e3eaec9d7ac74": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e87bc6913a7747728aed4b60a645bc2c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d93a6e3ddd364921a7c2a24451d27ffc", + "placeholder": "​", + "style": "IPY_MODEL_2155cf3c7b2043d0a41fc011bf4f0e04", + "value": "tokenizer_config.json: 100%" + } }, - "id": "HOCX_lbmO3zl", - "outputId": "014309de-8ec6-4cf8-b647-6bf0e6f512d8" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Task ID: _ov-FtHBSkqocXXBG6nu4A:68576798\n", - "Indexing...\n", - "Reindexing complete.\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Custom Search Query Construction and Execution\n", - "\n", - "This section constructs and executes a custom search query in Elasticsearch, utilizing a hybrid approach combining vector and text-based search methods to enhance search accuracy and relevance. The specific example used is a user query about the \"Nimbus 2000\".\n", - "\n", - "### Key Steps:\n", - "1. 
**Define User Query**: The user query is specified as \"what is a nimbus 2000\".\n", - "2. **Set Boost Factors**:\n", - " - `knn_boost_factor`: A value to amplify the importance of the vector-based search component.\n", - " - `text_expansion_boost`: A value to modify the weight of the text-based search component.\n", - "3. **Build Query**: The `build_custom_query` function constructs the search query, incorporating both dense vector and text expansion components.\n", - "4. **Execute Search**: The query is executed against the specified Elasticsearch index.\n", - "5. **Identify Relevant Passages**:\n", - " - The search results are analyzed to find the passage with the highest relevance score.\n", - " - The ID and chunk number of the best matching passage are captured and printed.\n", - "6. **Fetch Surrounding Chunks**: Constructs and executes a query to retrieve chunks adjacent to the identified passage for broader context. If the matched chunk is the first chunk, fetches n, n+1, and n+2. If the chunk is the last chunk in the chapter, fetches n, n-1, and n-2. For other chunks, fetches n-1, n, and n+1.\n", - "7. **Display Results**: Outputs text from the relevant and adjacent passages." 
- ], - "metadata": { - "id": "xJBDwRmDQq4n" - } - }, - { - "cell_type": "code", - "source": [ - "# Custom Search Query Construction\n", - "user_query = \"what is a nimbus 2000\"\n", - "\n", - "\n", - "knn_boost_factor = 20\n", - "text_expansion_boost = 1\n", - "query = build_custom_query(\n", - " build_vector(user_query),\n", - " user_query,\n", - " knn_boost_factor,\n", - " text_expansion_boost,\n", - " debug=False,\n", - ")\n", - "\n", - "# Searching and identifying relevant passages\n", - "results = es.search(index=index_name, body=query, _source=False)\n", - "\n", - "hit_id = None\n", - "chunk_number = None\n", - "\n", - "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", - " highest_score = -1\n", - " best_hit = None\n", - " hit_id = results[\"hits\"][\"hits\"][0][\"_id\"]\n", - " chapter_number = results[\"hits\"][\"hits\"][0][\"fields\"][\"chapter\"][0]\n", - " if \"inner_hits\" in results[\"hits\"][\"hits\"][0]:\n", - " for hit_type in [\"text_hits\", \"dense_hit\", \"sparse_hits\"]:\n", - " if hit_type in results[\"hits\"][\"hits\"][0][\"inner_hits\"]:\n", - " inner_hit = results[\"hits\"][\"hits\"][0][\"inner_hits\"][hit_type][\"hits\"]\n", - " if inner_hit[\"hits\"]:\n", - " max_score = inner_hit[\"max_score\"]\n", - " if max_score and max_score > highest_score:\n", - " highest_score = max_score\n", - " best_hit = inner_hit[\"hits\"][0]\n", - "\n", - " if best_hit:\n", - " first_passage_text = best_hit[\"_source\"][\"text\"]\n", - " chunk_number = best_hit[\"_source\"][\"chunk_number\"]\n", - " # print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", - " print(\n", - " f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\"\n", - " )\n", - " print(f\"\\n\")\n", - " else:\n", - " print(f\"ID: {hit_id}, No relevant passages found.\")\n", - "else:\n", - " print(\"No results found.\")\n", - "\n", - "print(f\"Fetch Surrounding 
Chunks\")\n", - "print(f\"------------------------\")\n", - "\n", - "max_chapter_chunk_result = es.search(\n", - " index=index_name,\n", - " body=get_max_chunk_number_query(chapter_number, debug=False),\n", - " _source=False,\n", - ")\n", - "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", - " \"max_chunk\"\n", - "][\"value\"]\n", - "\n", - "adjacent_chunks_query = get_adjacent_chunks_query(\n", - " hit_id, chunk_number, max_chunk_number, debug=False\n", - ")\n", - "results = es.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", - "print_text_from_results(results)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "f04f37ba10e9498ea61acdce637431ee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "id": "u7NFZBRJO3t7", - "outputId": "6f9ec0d9-bb1d-4235-da45-1c8040ac7036" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Matched Chunk ID: wz8m148BbBK3er50L0-W, Chunk Number: 3, Text:\n", - "t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. 
\" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", - "\n", - "\n", - "Fetch Surrounding Chunks\n", - "------------------------\n", - "\n", - "\n", - "Text from Chunk 2: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. 
harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. 
\" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", - "\n", - "\n", - "Text from Chunk 3: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. 
\" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", - "\n", - "\n", - "Text from Chunk 4: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. 
the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n" - ] + "f34bf7b0bb424a8e8c00ff75309bbe6f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d604bc170c02491fae573c702e790893", + "IPY_MODEL_d8a3bdb8be354365944ab587738280d3", + "IPY_MODEL_89ad2dee66324ae896eec71924aee670" + ], + "layout": "IPY_MODEL_31c0c3d684564d5fb87d2e25e6de96eb" + } + }, + "feb8127671424fa68b9b93a7547e40eb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9c11fce2ab1f4811a3d98f9154818825", + "placeholder": "​", + "style": "IPY_MODEL_2cf5d1a84ed947ddb16e5f8e6984b01e", + "value": "tokenizer.json: 100%" + 
} } - ] + } } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} From ab39662e2e76aad59080febca2958c2ecf5b388f Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Tue, 4 Jun 2024 15:59:40 -0500 Subject: [PATCH 03/17] added pip install google.colab fixed issue during checks. installed google.colab --- notebooks/document-chunking/fetch-surrounding-chunks.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index 733cd64f..d7cc1ba1 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -48,6 +48,7 @@ "source": [ "!pip install elasticsearch\n", "!pip install pandas\n", + "!pip install google.colab\n", "import json\n", "import time\n", "import urllib.request\n", From acff5fa4d3fc15ab871ce401cee266f120396889 Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Tue, 4 Jun 2024 16:18:24 -0500 Subject: [PATCH 04/17] updated notebook to use api key updated notebook to use api key instead of username and password similar to notebook here: https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/search/00-quick-start.ipynb#scrollTo=f38e0397 --- .../fetch-surrounding-chunks.ipynb | 4858 +++++++++-------- 1 file changed, 2458 insertions(+), 2400 deletions(-) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index d7cc1ba1..0cc262f6 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -1,1315 +1,365 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "aAUkwshINwV7" - }, - "source": [ - "# Fetch surronding chucks (N-1, N+1)\n", - "\n", - "This notebook is designed to handle the ingestion of book text (Harry Potter and the 
Sorcerer's Stone) into an Elasticsearch Cloud instance. It includes partitioning the book text into chapters and chunking the chapter text, which are then ingested into Elasticsearch. The setup utilizes a nested structure, and for each chunk, it stores dense and sparse (ELSER) vector representations along with the text representation.\n", - "\n", - "Searches are performed using dense vector comparisons, sparse vector comparisons, and text search in parallel to demonstrate the power of hybrid search strategies. Additionally, the notebook is configured to retrieve adjacent chunks (n-1 and n+1), allowing for a more contextual understanding of the search results.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MUEpppV7SeLu" - }, - "source": [ - "## Install required python libraries\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nXuL8wsQNq8G", - "outputId": "80261fea-a44b-429b-e55d-5947e7ac8b6c" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.10/dist-packages (8.13.2)\n", - "Requirement already satisfied: elastic-transport<9,>=8.13 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.13.1)\n", - "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch) (2.0.7)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch) (2024.2.2)\n" - ] - } - ], - "source": [ - "!pip install elasticsearch\n", - "!pip install pandas\n", - "!pip install google.colab\n", - "import json\n", - "import time\n", - "import urllib.request\n", - "import re\n", - "import pandas as pd\n", - "from transformers import AutoTokenizer, BertTokenizer\n", - "from elasticsearch import 
Elasticsearch, helpers\n", - "from google.colab import userdata\n", - "import textwrap" - ] + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] }, - { - "cell_type": "markdown", - "metadata": { - "id": "_d4RWjNAN6Q9" - }, - "source": [ - "# Elasticsearch and Tokenizer Configuration\n" - ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" }, - { - "cell_type": "markdown", - "metadata": { - "id": "2w7uTCYdQ0m6" - }, - "source": [ - "## Elasticsearch and Tokenizer Configuration\n", - "\n", - "This section sets up the necessary configurations for connecting to Elasticsearch and initializing the tokenizers used for text processing.\n", - "\n", - "### Configuration Details:\n", - "1. **Elasticsearch Credentials**:\n", - " - `es_username`: The username for Elasticsearch authentication.\n", - " - `es_password`: The password for Elasticsearch authentication, securely fetched using Google Colab's `userdata` module.\n", - " - `es_cloudid`: The Cloud ID for the Elasticsearch cluster.\n", - "\n", - "2. **Index Settings**:\n", - " - `raw_source_index`: The name of the index for the raw dataset (`harry_potter_dataset-raw`).\n", - " - `index_name`: The name of the enriched dataset index (`harry_potter_dataset_enriched`).\n", - " - `delete_raw_source_index`: A boolean flag indicating whether the raw data index should be deleted before ingesting new data.\n", - "\n", - "3. **Embedding Model**:\n", - " - `dense_embedding_model`: Specifies the model used for generating dense embeddings (`sentence-transformers/all-minilm-l6-v2`).\n", - "\n", - "4. **Tokenizer Initialization**:\n", - " - `bert_tokenizer`: Initializes the BERT tokenizer (`bert-base-uncased`) for English text processing.\n", - " - `e5_tokenizer`: Initializes the Multilingual E5 tokenizer (`intfloat/multilingual-e5-base`) for handling diverse datasets.\n", - "\n", - "5. 
**Chunking Parameters**:\n", - " - `SEMANTIC_SEARCH_TOKEN_LIMIT`: Sets the token limit for each chunk (500 tokens per chunk, considering space for special tokens).\n", - " - `ELSER_TOKEN_OVERLAP`: Defines the overlap ratio between chunks (default is 0%, customizable for context continuity).\n", - "\n", - "These configurations ensure that the necessary components are properly set up for efficient text processing, indexing, and search operations in Elasticsearch.\n" - ] + "language_info": { + "name": "python" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 328, - "referenced_widgets": [ - "9c160e35cf414c528b5bffe05725a7d9", - "e87bc6913a7747728aed4b60a645bc2c", - "9fa94c466004402bb293e4aa0bdc82f4", - "95316b2f654a4ddc99c92d7c60c2f417", - "0fc0b516e82941dc934c26eba22d9e01", - "d93a6e3ddd364921a7c2a24451d27ffc", - "2155cf3c7b2043d0a41fc011bf4f0e04", - "500a70f25097484bbec10c0ffd402595", - "9d43995246744e26a8053c21e2c5fcfa", - "2ed85f5360ba4feda6469aabd0324e7a", - "808af1e1f2464a928ee23398c837ff48", - "31906527169a4c08801dc6b21936188d", - "a6570ce51dfc46f383d855e28534bf73", - "41cc49a71a164065bc833d080027e4d2", - "748e7f3c8da243e9b5320654ec8e8146", - "a88953429ab6436fb4f01b6b1e2cf6ff", - "b0a2671c90a048548314c2e3d21e19e7", - "5f2080a5d12241638447a5851d0c8db3", - "7ce44d2f323d45838633a750f2386525", - "406f3564a217478d8f60dee5e1fb6dbf", - "61ae734ac8d441fd9b3ea198aff3f2c7", - "bc52c57fa6464ab39823cd3ddb9d7d78", - "02f735a438bf4058a9cfacf8d2b8660f", - "331b178397164de49408dc50ce417a36", - "07ee43d2a1684fb0b1445755802b6ea5", - "c867bce7e34b4800903eb9ec99f34784", - "8169e16a9b0146f5a57a015601c2ebcb", - "35ca86faebfd43faaef0202389d958fd", - "f04f37ba10e9498ea61acdce637431ee", - "527bfa6067c84b94a1e70dfadfd4b78e", - "312e85864e074b958d86325b6417a0fa", - "3b0fc37739334025b037a5270c9515bf", - "0dfb7f264674449b92a390324d17c4cf", - "7e58bf25549d4b428f231d528e8fef54", - 
"461ca08f677a4cba9ec2a388c2e346f3", - "90d31fb52af949b0a2b41e3613827233", - "1afbe347ab364b28b887f49dad54f5d7", - "8483a759cc0e4e12834fc7d08dab3b7e", - "59d8ffb31bb340eba7e0dcebfbbdd977", - "2b62b542c091466cbae559e29ec797bd", - "092a4de220ba4ca2a23a0f273aba601b", - "e0c46565371f437a85a26d44c5b20c5b", - "4831ce9114e5437ea8a24919557c40e2", - "53344cac458d4d5ebdc504744c18b7de", - "f34bf7b0bb424a8e8c00ff75309bbe6f", - "d604bc170c02491fae573c702e790893", - "d8a3bdb8be354365944ab587738280d3", - "89ad2dee66324ae896eec71924aee670", - "31c0c3d684564d5fb87d2e25e6de96eb", - "b7bef190ebed494eb8773ec21d9b7160", - "6c822f08434c4212931dcf097a80b7d4", - "0f8d4b5000174234bded5d4e017aa4e9", - "243f1e7de5414b82aaa4b50482dd964d", - "0961f276155348f98c12d1be4ad78e62", - "95c6cffafe1b4345a905be485b787728", - "43072f923bd24566ae0e20ca9aa3cdc5", - "5a4f80526c2c4b53a1bce182e9b3e5fa", - "400b0a8ef7c64477bf4f02a16b5508a0", - "b03347c3201849778ea3129314ac340c", - "3542b02e36ce4e03850b37c28d88da30", - "7b2abc768054422f8af3d21837400b4a", - "d8e4ceae237d4381aa5e44b020d7564e", - "e5ec838bb84644b6a27e3eaec9d7ac74", - "30ba8c556cc34fde96b530ed66ac376d", - "711439e7dcab4c10ab4300bdbe6b86aa", - "15887401b0814d9386fb4d02d6279412", - "bec0cae37feb48a4add318d970d8ef96", - "feb8127671424fa68b9b93a7547e40eb", - "94c89aa435e44c8d9369305c21ca028c", - "7abe3bc7884e4afeb9995f7d7acc8c0f", - "7edccbff1ca145eabd4af6f9da32442a", - "9c11fce2ab1f4811a3d98f9154818825", - "2cf5d1a84ed947ddb16e5f8e6984b01e", - "d5de868032e640d5a07c34c9917190c3", - "d6295baf48b24c929c3ab4a317356e2b", - "df2ed1f8e3754f3a8f30be35935e82f3", - "2f125504e41344c088231f0307d7cb92", - "6a6665e93675459394536fd9f846fbea", - "aed65947759c47c58abe86f1ee279b86", - "b5d7fb93223c4c458e7c80e59daea4d2", - "540140c9c2f541b3a82f7a59e4f0b867", - "d5212aa2a4f74de1970c07e282f0e2bc", - "4a75d002a4ae4fc99625420ec6e580ee", - "695f77c019db487ea60171277073efe6", - "bc3fad5fe0194399add875a6d78907bd", - "42c418329198400bb77cdd3a654a96de", - 
"4ef5fe3e9ea84b8c8cb1a90a8208bdb9", - "735b9a74223f4941a2837a1108889f63" - ] + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "9c160e35cf414c528b5bffe05725a7d9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e87bc6913a7747728aed4b60a645bc2c", + "IPY_MODEL_9fa94c466004402bb293e4aa0bdc82f4", + "IPY_MODEL_95316b2f654a4ddc99c92d7c60c2f417" + ], + "layout": "IPY_MODEL_0fc0b516e82941dc934c26eba22d9e01" + } }, - "id": "LQzCw0pgN4ll", - "outputId": "7ed04793-8bb9-49c6-b090-82111d9835f6" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9c160e35cf414c528b5bffe05725a7d9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "tokenizer_config.json: 0%| | 0.00/48.0 [00:00= len(tokens):\n", - " break\n", - " return result\n", - "\n", - "\n", - "def check_task_status(es, task_id):\n", - " while True:\n", - " task_response = es.tasks.get(task_id=task_id)\n", - " if task_response[\"completed\"]:\n", - " print(\"Reindexing complete.\")\n", - " break\n", - " else:\n", - " print(\"Indexing...\")\n", - " time.sleep(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "izMU8HqqP7ld" - }, - "source": [ - "##Ingest Pipelines" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "iUOFJK48OamP", - "outputId": "5dc25103-a2ee-4a19-e184-92ec65c29187" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ingest pipeline 'books_dataset_chunker' created/updated successfully.\n" - ] - } - 
], - "source": [ - "# Define the ingest pipeline configuration\n", - "pipeline_body = {\n", - " \"description\": \"Pipeline for processing book passages\",\n", - " \"processors\": [\n", - " {\n", - " \"foreach\": {\n", - " \"field\": \"passages\",\n", - " \"processor\": {\n", - " \"inference\": {\n", - " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", - " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", - " \"target_field\": \"_ingest._value.vector\",\n", - " \"on_failure\": [\n", - " {\n", - " \"append\": {\n", - " \"field\": \"_source._ingest.inference_errors\",\n", - " \"value\": [\n", - " {\n", - " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", - " \"pipeline\": \"ml-inference-title-vector\",\n", - " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", - " }\n", - " ],\n", - " }\n", - " }\n", - " ],\n", - " }\n", - " },\n", - " }\n", - " },\n", - " {\n", - " \"foreach\": {\n", - " \"field\": \"passages\",\n", - " \"processor\": {\n", - " \"inference\": {\n", - " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", - " \"model_id\": elser_model_id,\n", - " \"target_field\": \"_ingest._value.content_embedding\",\n", - " \"on_failure\": [\n", - " {\n", - " \"append\": {\n", - " \"field\": \"_source._ingest.inference_errors\",\n", - " \"value\": [\n", - " {\n", - " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", - " \"pipeline\": \"ml-inference-title-vector\",\n", - " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", - " }\n", - " ],\n", - " }\n", - " }\n", - " ],\n", - " }\n", - " },\n", - " }\n", - " },\n", - " ],\n", - "}\n", - "\n", - "# Create or update the pipeline\n", - "pipeline_id = \"books_dataset_chunker\"\n", - "es = create_es_client()\n", - "es.ingest.put_pipeline(id=pipeline_id, body=pipeline_body)\n", - "print(f\"Ingest pipeline 
'{pipeline_id}' created/updated successfully.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6ZkRwEGdQBRP" - }, - "source": [ - "##Index Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "vZ3Z5gZbOgjF", - "outputId": "996f6ca5-d27d-4ea0-ed4d-07570b9942ad" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index harry_potter_dataset_enriched exists. Deleting it...\n", - "Index harry_potter_dataset_enriched deleted!\n", - "Index harry_potter_dataset_enriched created successfully!\n", - "Index harry_potter_dataset-raw exists. Deleting it...\n", - "Index harry_potter_dataset-raw deleted!\n", - "Index harry_potter_dataset-raw created successfully!\n" - ] - } - ], - "source": [ - "index_settings = {\n", - " \"settings\": {\n", - " \"number_of_shards\": 2,\n", - " \"number_of_replicas\": 0,\n", - " \"default_pipeline\": \"books_dataset_chunker\",\n", - " },\n", - " \"mappings\": {\n", - " \"dynamic\": \"false\",\n", - " \"properties\": {\n", - " \"book_title\": {\"type\": \"keyword\"},\n", - " \"chapter\": {\"type\": \"keyword\"},\n", - " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", - " \"passages\": {\n", - " \"type\": \"nested\",\n", - " \"properties\": {\n", - " \"content_embedding\": {\n", - " \"properties\": {\n", - " \"is_truncated\": {\"type\": \"boolean\"},\n", - " \"model_id\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", - " },\n", - " },\n", - " \"predicted_value\": {\"type\": \"sparse_vector\"},\n", - " }\n", - " },\n", - " \"text\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", - " },\n", - " \"vector\": {\n", - " \"properties\": {\n", - " \"is_truncated\": {\"type\": \"boolean\"},\n", - " \"model_id\": {\n", - " \"type\": 
\"text\",\n", - " \"fields\": {\n", - " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", - " },\n", - " },\n", - " \"predicted_value\": {\n", - " \"type\": \"dense_vector\",\n", - " \"dims\": 384,\n", - " \"index\": True,\n", - " \"similarity\": \"dot_product\",\n", - " },\n", - " }\n", - " },\n", - " \"chunk_number\": {\"type\": \"integer\"},\n", - " },\n", - " },\n", - " },\n", - " },\n", - "}\n", - "\n", - "raw_source_index_settings = {\n", - " \"settings\": {\"number_of_shards\": 2, \"number_of_replicas\": 0},\n", - " \"mappings\": {\n", - " \"dynamic\": \"false\",\n", - " \"properties\": {\n", - " \"book_title\": {\"type\": \"keyword\"},\n", - " \"chapter\": {\"type\": \"keyword\"},\n", - " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", - " \"passages\": {\n", - " \"type\": \"nested\",\n", - " \"properties\": {\n", - " \"text\": {\n", - " \"type\": \"text\",\n", - " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", - " },\n", - " \"chunk_number\": {\"type\": \"integer\"},\n", - " },\n", - " },\n", - " },\n", - " },\n", - "}\n", - "\n", - "# Manage indices\n", - "manage_index(\n", - " es,\n", - " index_name,\n", - " index_settings[\"settings\"],\n", - " index_settings[\"mappings\"],\n", - " delete_index=True,\n", - ")\n", - "manage_index(\n", - " es,\n", - " raw_source_index,\n", - " raw_source_index_settings[\"settings\"],\n", - " raw_source_index_settings[\"mappings\"],\n", - " delete_index=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NPtbLhVOQUF3" - }, - "source": [ - "## Fetch and Process the Book Text\n", - "\n", - "This section downloads the full text of \"Harry Potter and the Sorcerer's Stone\" from a specified URL and processes it to extract chapters and their titles. The text is then structured into a pandas DataFrame for further analysis and indexing.\n", - "\n", - "### Key Steps:\n", - "1. 
**Download Text**: The book is fetched using `urllib.request` from the provided URL.\n", - "2. **Extract Chapters**: The text is split into chapters based on predefined patterns, omitting the text before the first chapter.\n", - "3. **Capture Chapter Titles**: Chapter titles are extracted and paired with their respective texts.\n", - "4. **Data Structuring**:\n", - " - Convert the list of chapter titles and texts into a DataFrame.\n", - " - Assign sequential numbers to chapters.\n", - " - Add the book title as metadata.\n", - " - Apply a text chunking function to split each chapter into manageable passages.\n", - "\n", - "This prepares the text data for efficient indexing and advanced search operations in Elasticsearch.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0L4YI96xOuKn", - "outputId": "7f9c63c7-82d8-4490-aabb-c3629872d80d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total chapters found: 17\n", - "First chapter title: CHAPTER ONE\n", - "Text sample from first chapter: \n", - "\n", - "THE BOY WHO LIVED\n", - "\n", - "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\n", - "that they were perfectly normal, thank you very much. They were the last\n", - "people you'd expect to be involved in anything strange or mysterious,\n", - "because they just didn't hold with such nonsense.\n", - "\n", - "Mr. Dursley was the director of a firm called Grunnings, which made\n", - "drills. He was a big, beefy man with hardly any neck, although he did\n", - "have a very large mustache. Mrs. Dursley was thin and blonde and had\n", - "nearly t\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Token indices sequence length is longer than the specified maximum sequence length for this model (6535 > 512). 
Running this sequence through the model will result in indexing errors\n" - ] - } - ], - "source": [ - "# Fetch and process the book text\n", - "potter_book_url = \"https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt\"\n", - "response = urllib.request.urlopen(potter_book_url)\n", - "harry_potter_book_text = response.read().decode(\"utf-8\")\n", - "chapter_pattern = re.compile(r\"CHAPTER [A-Z]+\", re.IGNORECASE)\n", - "chapters = chapter_pattern.split(harry_potter_book_text)[1:]\n", - "chapter_titles = re.findall(chapter_pattern, harry_potter_book_text)\n", - "chapters_with_titles = list(zip(chapter_titles, chapters))\n", - "\n", - "print(\"Total chapters found:\", len(chapters))\n", - "if chapters_with_titles:\n", - " print(\"First chapter title:\", chapters_with_titles[0][0])\n", - " print(\"Text sample from first chapter:\", chapters_with_titles[0][1][:500])\n", - "\n", - "\n", - "# Structuring chapters into a DataFrame\n", - "df = pd.DataFrame(chapters_with_titles, columns=[\"chapter_title\", \"chapter_full_text\"])\n", - "df[\"chapter\"] = df.index + 1\n", - "df[\"book_title\"] = \"Harry Potter and the Sorcerer’s Stone\"\n", - "df[\"passages\"] = df[\"chapter_full_text\"].apply(lambda text: chunk(text))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DKK4574EQaTl" - }, - "source": [ - "## Indexing DataFrame into Elasticsearch\n", - "\n", - "This section uploads the structured data from a pandas DataFrame into a specified Elasticsearch index. The DataFrame contains chapter information from \"Harry Potter and the Sorcerer's Stone\", including chapter titles, full texts, and additional metadata.\n", - "\n", - "### Key Operation:\n", - "- **Index Data**: The `index_dataframe` function is called with the Elasticsearch client, the raw source index name, and the DataFrame as arguments. 
This operation effectively uploads the data into Elasticsearch, making it searchable and ready for further processing.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7ReLAtz1O1HF", - "outputId": "3bf70ccc-804d-4718-e2a7-13dc0008e073" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Indexing documents to harry_potter_dataset-raw...\n", - "Successfully indexed 17 documents.\n", - "Failed to index 0 documents.\n" - ] - } - ], - "source": [ - "index_dataframe(es, raw_source_index, df)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pA5QroYdQgcM" - }, - "source": [ - "## Asynchronous Reindexing in Elasticsearch\n", - "\n", - "This section initiates an asynchronous reindex operation to transfer data from the raw source index to the enriched index in Elasticsearch. This process runs in the background, allowing other operations to continue without waiting for completion.\n", - "\n", - "### Key Steps:\n", - "1. **Start Reindex**: The reindex operation is triggered from the `raw_source_index` to the `index_name`, with `wait_for_completion` set to `False` to allow asynchronous execution.\n", - "2. **Retrieve Task ID**: The task ID of the reindex operation is captured and printed for monitoring purposes.\n", - "3. 
**Monitor Progress**: The `check_task_status` function continuously checks the status of the reindex task, providing updates every 10 seconds until the operation is complete.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "9d43995246744e26a8053c21e2c5fcfa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - "id": "HOCX_lbmO3zl", - "outputId": "014309de-8ec6-4cf8-b647-6bf0e6f512d8" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Task ID: _ov-FtHBSkqocXXBG6nu4A:68576798\n", - "Indexing...\n", - "Reindexing complete.\n" - ] - } - ], - "source": [ - "# Start the reindex operation asynchronously\n", - "response = es.reindex(\n", - " body={\"source\": {\"index\": raw_source_index}, \"dest\": {\"index\": index_name}},\n", - " wait_for_completion=False,\n", - ")\n", - "task_id = response[\"task\"]\n", - "print(\"Task ID:\", task_id)\n", - "check_task_status(es, task_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xJBDwRmDQq4n" - }, - "source": [ - "## Custom Search Query Construction and Execution\n", - "\n", - "This section constructs and executes a custom search query in Elasticsearch, utilizing a hybrid approach combining vector and text-based search methods to enhance search accuracy and relevance. The specific example used is a user query about the \"Nimbus 2000\".\n", - "\n", - "### Key Steps:\n", - "1. **Define User Query**: The user query is specified as \"what is a nimbus 2000\".\n", - "2. 
**Set Boost Factors**:\n", - " - `knn_boost_factor`: A value to amplify the importance of the vector-based search component.\n", - " - `text_expansion_boost`: A value to modify the weight of the text-based search component.\n", - "3. **Build Query**: The `build_custom_query` function constructs the search query, incorporating both dense vector and text expansion components.\n", - "4. **Execute Search**: The query is executed against the specified Elasticsearch index.\n", - "5. **Identify Relevant Passages**:\n", - " - The search results are analyzed to find the passage with the highest relevance score.\n", - " - The ID and chunk number of the best matching passage are captured and printed.\n", - "6. **Fetch Surrounding Chunks**: Constructs and executes a query to retrieve chunks adjacent to the identified passage for broader context. If the matched chunk is the first chunk, fetches n, n+1, and n+2. If the chunk is the last chunk in the chapter, fetches n, n-1, and n-2. For other chunks, fetches n-1, n, and n+1.\n", - "7. **Display Results**: Outputs text from the relevant and adjacent passages." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "2ed85f5360ba4feda6469aabd0324e7a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "id": "u7NFZBRJO3t7", - "outputId": "6f9ec0d9-bb1d-4235-da45-1c8040ac7036" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Matched Chunk ID: wz8m148BbBK3er50L0-W, Chunk Number: 3, Text:\n", - "t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. 
it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. 
there are seven players on each side.\n", - "\n", - "\n", - "Fetch Surrounding Chunks\n", - "------------------------\n", - "\n", - "\n", - "Text from Chunk 2: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. 
\" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", - "\n", - "\n", - "Text from Chunk 3: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. 
the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", - "\n", - "\n", - "Text from Chunk 4: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. 
they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n" - ] - } - ], - "source": [ - "# Custom Search Query Construction\n", - "user_query = \"what is a nimbus 2000\"\n", - "\n", - "\n", - "knn_boost_factor = 20\n", - "text_expansion_boost = 1\n", - "query = build_custom_query(\n", - " build_vector(user_query),\n", - " user_query,\n", - " knn_boost_factor,\n", - " text_expansion_boost,\n", - " debug=False,\n", - ")\n", - "\n", - "# Searching and identifying relevant passages\n", - "results = es.search(index=index_name, body=query, _source=False)\n", - "\n", - "hit_id = None\n", - "chunk_number = None\n", - "\n", - "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", - " highest_score = -1\n", - " best_hit = None\n", - " hit_id = results[\"hits\"][\"hits\"][0][\"_id\"]\n", - " chapter_number = results[\"hits\"][\"hits\"][0][\"fields\"][\"chapter\"][0]\n", - " if \"inner_hits\" in results[\"hits\"][\"hits\"][0]:\n", - " for hit_type in [\"text_hits\", \"dense_hit\", \"sparse_hits\"]:\n", - " if hit_type in 
results[\"hits\"][\"hits\"][0][\"inner_hits\"]:\n", - " inner_hit = results[\"hits\"][\"hits\"][0][\"inner_hits\"][hit_type][\"hits\"]\n", - " if inner_hit[\"hits\"]:\n", - " max_score = inner_hit[\"max_score\"]\n", - " if max_score and max_score > highest_score:\n", - " highest_score = max_score\n", - " best_hit = inner_hit[\"hits\"][0]\n", - "\n", - " if best_hit:\n", - " first_passage_text = best_hit[\"_source\"][\"text\"]\n", - " chunk_number = best_hit[\"_source\"][\"chunk_number\"]\n", - " # print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", - " print(\n", - " f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\"\n", - " )\n", - " print(f\"\\n\")\n", - " else:\n", - " print(f\"ID: {hit_id}, No relevant passages found.\")\n", - "else:\n", - " print(\"No results found.\")\n", - "\n", - "print(f\"Fetch Surrounding Chunks\")\n", - "print(f\"------------------------\")\n", - "\n", - "max_chapter_chunk_result = es.search(\n", - " index=index_name,\n", - " body=get_max_chunk_number_query(chapter_number, debug=False),\n", - " _source=False,\n", - ")\n", - "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", - " \"max_chunk\"\n", - "][\"value\"]\n", - "\n", - "adjacent_chunks_query = get_adjacent_chunks_query(\n", - " hit_id, chunk_number, max_chunk_number, debug=False\n", - ")\n", - "results = es.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", - "print_text_from_results(results)" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "02f735a438bf4058a9cfacf8d2b8660f": { + "808af1e1f2464a928ee23398c837ff48": { "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", 
"model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "31906527169a4c08801dc6b21936188d": { + "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", + "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", @@ -1321,17 +371,38 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_331b178397164de49408dc50ce417a36", - "IPY_MODEL_07ee43d2a1684fb0b1445755802b6ea5", - "IPY_MODEL_c867bce7e34b4800903eb9ec99f34784" + "IPY_MODEL_a6570ce51dfc46f383d855e28534bf73", + "IPY_MODEL_41cc49a71a164065bc833d080027e4d2", + "IPY_MODEL_748e7f3c8da243e9b5320654ec8e8146" ], - "layout": "IPY_MODEL_8169e16a9b0146f5a57a015601c2ebcb" + "layout": "IPY_MODEL_a88953429ab6436fb4f01b6b1e2cf6ff" } }, - "07ee43d2a1684fb0b1445755802b6ea5": { + "a6570ce51dfc46f383d855e28534bf73": { "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0a2671c90a048548314c2e3d21e19e7", + "placeholder": "​", + "style": "IPY_MODEL_5f2080a5d12241638447a5851d0c8db3", + "value": "vocab.txt: 100%" + } + }, + "41cc49a71a164065bc833d080027e4d2": { + "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", @@ -1344,18 
+415,39 @@ "bar_style": "success", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_527bfa6067c84b94a1e70dfadfd4b78e", - "max": 466062, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_312e85864e074b958d86325b6417a0fa", - "value": 466062 + "layout": "IPY_MODEL_7ce44d2f323d45838633a750f2386525", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_406f3564a217478d8f60dee5e1fb6dbf", + "value": 231508 + } + }, + "748e7f3c8da243e9b5320654ec8e8146": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_61ae734ac8d441fd9b3ea198aff3f2c7", + "placeholder": "​", + "style": "IPY_MODEL_bc52c57fa6464ab39823cd3ddb9d7d78", + "value": " 232k/232k [00:00<00:00, 2.88MB/s]" } }, - "092a4de220ba4ca2a23a0f273aba601b": { + "a88953429ab6436fb4f01b6b1e2cf6ff": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1404,10 +496,10 @@ "width": null } }, - "0961f276155348f98c12d1be4ad78e62": { + "b0a2671c90a048548314c2e3d21e19e7": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1456,10 +548,10 @@ "width": null } }, - "0dfb7f264674449b92a390324d17c4cf": { + "5f2080a5d12241638447a5851d0c8db3": { "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", @@ -1471,10 +563,10 @@ "description_width": "" } }, - "0f8d4b5000174234bded5d4e017aa4e9": { + "7ce44d2f323d45838633a750f2386525": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1523,10 +615,26 @@ "width": null } }, - "0fc0b516e82941dc934c26eba22d9e01": { + "406f3564a217478d8f60dee5e1fb6dbf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "61ae734ac8d441fd9b3ea198aff3f2c7": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1575,10 +683,10 @@ "width": null } }, - "15887401b0814d9386fb4d02d6279412": { + "bc52c57fa6464ab39823cd3ddb9d7d78": { "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", @@ -1590,92 +698,150 @@ "description_width": "" } }, - "1afbe347ab364b28b887f49dad54f5d7": { + "02f735a438bf4058a9cfacf8d2b8660f": { "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", "model_module_version": "1.5.0", - 
"model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4831ce9114e5437ea8a24919557c40e2", - "placeholder": "​", - "style": "IPY_MODEL_53344cac458d4d5ebdc504744c18b7de", - "value": " 570/570 [00:00<00:00, 19.9kB/s]" + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_331b178397164de49408dc50ce417a36", + "IPY_MODEL_07ee43d2a1684fb0b1445755802b6ea5", + "IPY_MODEL_c867bce7e34b4800903eb9ec99f34784" + ], + "layout": "IPY_MODEL_8169e16a9b0146f5a57a015601c2ebcb" } }, - "2155cf3c7b2043d0a41fc011bf4f0e04": { + "331b178397164de49408dc50ce417a36": { "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", "state": { + "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "HTMLModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_35ca86faebfd43faaef0202389d958fd", + "placeholder": "​", + "style": "IPY_MODEL_f04f37ba10e9498ea61acdce637431ee", + "value": "tokenizer.json: 100%" } }, - "243f1e7de5414b82aaa4b50482dd964d": { + "07ee43d2a1684fb0b1445755802b6ea5": { "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", "state": { + 
"_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", + "_model_name": "FloatProgressModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_527bfa6067c84b94a1e70dfadfd4b78e", + "max": 466062, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_312e85864e074b958d86325b6417a0fa", + "value": 466062 } }, - "2b62b542c091466cbae559e29ec797bd": { + "c867bce7e34b4800903eb9ec99f34784": { "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", "state": { + "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "HTMLModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3b0fc37739334025b037a5270c9515bf", + "placeholder": "​", + "style": "IPY_MODEL_0dfb7f264674449b92a390324d17c4cf", + "value": " 466k/466k [00:00<00:00, 6.88MB/s]" } }, - "2cf5d1a84ed947ddb16e5f8e6984b01e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", + "8169e16a9b0146f5a57a015601c2ebcb": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { - 
"_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null } }, - "2ed85f5360ba4feda6469aabd0324e7a": { + "35ca86faebfd43faaef0202389d958fd": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1724,79 +890,25 @@ "width": null } }, - "2f125504e41344c088231f0307d7cb92": { + "f04f37ba10e9498ea61acdce637431ee": { "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "30ba8c556cc34fde96b530ed66ac376d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "312e85864e074b958d86325b6417a0fa": { - "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", + "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", - "bar_color": null, "description_width": "" } }, - "31906527169a4c08801dc6b21936188d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_a6570ce51dfc46f383d855e28534bf73", - "IPY_MODEL_41cc49a71a164065bc833d080027e4d2", - "IPY_MODEL_748e7f3c8da243e9b5320654ec8e8146" - ], - "layout": "IPY_MODEL_a88953429ab6436fb4f01b6b1e2cf6ff" - } - }, - "31c0c3d684564d5fb87d2e25e6de96eb": { + "527bfa6067c84b94a1e70dfadfd4b78e": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + 
"model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1845,31 +957,26 @@ "width": null } }, - "331b178397164de49408dc50ce417a36": { + "312e85864e074b958d86325b6417a0fa": { "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", - "model_name": "HTMLModel", "state": { - "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", + "_model_name": "ProgressStyleModel", "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_35ca86faebfd43faaef0202389d958fd", - "placeholder": "​", - "style": "IPY_MODEL_f04f37ba10e9498ea61acdce637431ee", - "value": "tokenizer.json: 100%" + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" } }, - "3542b02e36ce4e03850b37c28d88da30": { + "3b0fc37739334025b037a5270c9515bf": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1918,10 +1025,113 @@ "width": null } }, - "35ca86faebfd43faaef0202389d958fd": { + "0dfb7f264674449b92a390324d17c4cf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7e58bf25549d4b428f231d528e8fef54": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_461ca08f677a4cba9ec2a388c2e346f3", + "IPY_MODEL_90d31fb52af949b0a2b41e3613827233", + "IPY_MODEL_1afbe347ab364b28b887f49dad54f5d7" + ], + "layout": "IPY_MODEL_8483a759cc0e4e12834fc7d08dab3b7e" + } + }, + "461ca08f677a4cba9ec2a388c2e346f3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_59d8ffb31bb340eba7e0dcebfbbdd977", + "placeholder": "​", + "style": "IPY_MODEL_2b62b542c091466cbae559e29ec797bd", + "value": "config.json: 100%" + } + }, + "90d31fb52af949b0a2b41e3613827233": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_092a4de220ba4ca2a23a0f273aba601b", + "max": 570, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e0c46565371f437a85a26d44c5b20c5b", + 
"value": 570 + } + }, + "1afbe347ab364b28b887f49dad54f5d7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4831ce9114e5437ea8a24919557c40e2", + "placeholder": "​", + "style": "IPY_MODEL_53344cac458d4d5ebdc504744c18b7de", + "value": " 570/570 [00:00<00:00, 19.9kB/s]" + } + }, + "8483a759cc0e4e12834fc7d08dab3b7e": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -1970,10 +1180,10 @@ "width": null } }, - "3b0fc37739334025b037a5270c9515bf": { + "59d8ffb31bb340eba7e0dcebfbbdd977": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2022,133 +1232,25 @@ "width": null } }, - "400b0a8ef7c64477bf4f02a16b5508a0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e5ec838bb84644b6a27e3eaec9d7ac74", - "max": 5069051, - "min": 0, - "orientation": 
"horizontal", - "style": "IPY_MODEL_30ba8c556cc34fde96b530ed66ac376d", - "value": 5069051 - } - }, - "406f3564a217478d8f60dee5e1fb6dbf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "41cc49a71a164065bc833d080027e4d2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7ce44d2f323d45838633a750f2386525", - "max": 231508, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_406f3564a217478d8f60dee5e1fb6dbf", - "value": 231508 - } - }, - "42c418329198400bb77cdd3a654a96de": { + "2b62b542c091466cbae559e29ec797bd": { "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", + "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", - "bar_color": null, "description_width": "" } }, - "43072f923bd24566ae0e20ca9aa3cdc5": { - "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_5a4f80526c2c4b53a1bce182e9b3e5fa", - "IPY_MODEL_400b0a8ef7c64477bf4f02a16b5508a0", - "IPY_MODEL_b03347c3201849778ea3129314ac340c" - ], - "layout": "IPY_MODEL_3542b02e36ce4e03850b37c28d88da30" - } - }, - "461ca08f677a4cba9ec2a388c2e346f3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_59d8ffb31bb340eba7e0dcebfbbdd977", - "placeholder": "​", - "style": "IPY_MODEL_2b62b542c091466cbae559e29ec797bd", - "value": "config.json: 100%" - } - }, - "4831ce9114e5437ea8a24919557c40e2": { + "092a4de220ba4ca2a23a0f273aba601b": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2197,10 +1299,26 @@ "width": null } }, - "4a75d002a4ae4fc99625420ec6e580ee": { + "e0c46565371f437a85a26d44c5b20c5b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4831ce9114e5437ea8a24919557c40e2": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2249,10 +1367,113 @@ "width": null } }, - "4ef5fe3e9ea84b8c8cb1a90a8208bdb9": { + "53344cac458d4d5ebdc504744c18b7de": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f34bf7b0bb424a8e8c00ff75309bbe6f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d604bc170c02491fae573c702e790893", + "IPY_MODEL_d8a3bdb8be354365944ab587738280d3", + "IPY_MODEL_89ad2dee66324ae896eec71924aee670" + ], + "layout": "IPY_MODEL_31c0c3d684564d5fb87d2e25e6de96eb" + } + }, + "d604bc170c02491fae573c702e790893": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b7bef190ebed494eb8773ec21d9b7160", + "placeholder": "​", + "style": "IPY_MODEL_6c822f08434c4212931dcf097a80b7d4", + "value": "tokenizer_config.json: 100%" + } + }, + "d8a3bdb8be354365944ab587738280d3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0f8d4b5000174234bded5d4e017aa4e9", + "max": 418, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_243f1e7de5414b82aaa4b50482dd964d", + "value": 418 + } + }, + "89ad2dee66324ae896eec71924aee670": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0961f276155348f98c12d1be4ad78e62", + "placeholder": "​", + "style": "IPY_MODEL_95c6cffafe1b4345a905be485b787728", + "value": " 418/418 [00:00<00:00, 13.9kB/s]" + } + }, + "31c0c3d684564d5fb87d2e25e6de96eb": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": 
"1.2.0", @@ -2301,10 +1522,10 @@ "width": null } }, - "500a70f25097484bbec10c0ffd402595": { + "b7bef190ebed494eb8773ec21d9b7160": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2353,10 +1574,25 @@ "width": null } }, - "527bfa6067c84b94a1e70dfadfd4b78e": { + "6c822f08434c4212931dcf097a80b7d4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0f8d4b5000174234bded5d4e017aa4e9": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2405,46 +1641,26 @@ "width": null } }, - "53344cac458d4d5ebdc504744c18b7de": { + "243f1e7de5414b82aaa4b50482dd964d": { "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", + "bar_color": null, "description_width": "" } }, - "540140c9c2f541b3a82f7a59e4f0b867": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - 
"_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4ef5fe3e9ea84b8c8cb1a90a8208bdb9", - "placeholder": "​", - "style": "IPY_MODEL_735b9a74223f4941a2837a1108889f63", - "value": " 280/280 [00:00<00:00, 15.7kB/s]" - } - }, - "59d8ffb31bb340eba7e0dcebfbbdd977": { + "0961f276155348f98c12d1be4ad78e62": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2493,10 +1709,47 @@ "width": null } }, - "5a4f80526c2c4b53a1bce182e9b3e5fa": { + "95c6cffafe1b4345a905be485b787728": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "43072f923bd24566ae0e20ca9aa3cdc5": { "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5a4f80526c2c4b53a1bce182e9b3e5fa", + "IPY_MODEL_400b0a8ef7c64477bf4f02a16b5508a0", + "IPY_MODEL_b03347c3201849778ea3129314ac340c" + ], + "layout": 
"IPY_MODEL_3542b02e36ce4e03850b37c28d88da30" + } + }, + "5a4f80526c2c4b53a1bce182e9b3e5fa": { + "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", + "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", @@ -2514,25 +1767,55 @@ "value": "sentencepiece.bpe.model: 100%" } }, - "5f2080a5d12241638447a5851d0c8db3": { + "400b0a8ef7c64477bf4f02a16b5508a0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e5ec838bb84644b6a27e3eaec9d7ac74", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_30ba8c556cc34fde96b530ed66ac376d", + "value": 5069051 + } + }, + "b03347c3201849778ea3129314ac340c": { "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", "state": { + "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "HTMLModel", "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_711439e7dcab4c10ab4300bdbe6b86aa", + "placeholder": "​", + "style": "IPY_MODEL_15887401b0814d9386fb4d02d6279412", + "value": " 5.07M/5.07M [00:00<00:00, 
19.7MB/s]" } }, - "61ae734ac8d441fd9b3ea198aff3f2c7": { + "3542b02e36ce4e03850b37c28d88da30": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2581,62 +1864,10 @@ "width": null } }, - "695f77c019db487ea60171277073efe6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "6a6665e93675459394536fd9f846fbea": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_aed65947759c47c58abe86f1ee279b86", - "IPY_MODEL_b5d7fb93223c4c458e7c80e59daea4d2", - "IPY_MODEL_540140c9c2f541b3a82f7a59e4f0b867" - ], - "layout": "IPY_MODEL_d5212aa2a4f74de1970c07e282f0e2bc" - } - }, - "6c822f08434c4212931dcf097a80b7d4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - 
"711439e7dcab4c10ab4300bdbe6b86aa": { + "7b2abc768054422f8af3d21837400b4a": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2685,10 +1916,10 @@ "width": null } }, - "735b9a74223f4941a2837a1108889f63": { + "d8e4ceae237d4381aa5e44b020d7564e": { "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", @@ -2700,52 +1931,10 @@ "description_width": "" } }, - "748e7f3c8da243e9b5320654ec8e8146": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_61ae734ac8d441fd9b3ea198aff3f2c7", - "placeholder": "​", - "style": "IPY_MODEL_bc52c57fa6464ab39823cd3ddb9d7d78", - "value": " 232k/232k [00:00<00:00, 2.88MB/s]" - } - }, - "7abe3bc7884e4afeb9995f7d7acc8c0f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_df2ed1f8e3754f3a8f30be35935e82f3", - "placeholder": "​", - "style": 
"IPY_MODEL_2f125504e41344c088231f0307d7cb92", - "value": " 17.1M/17.1M [00:00<00:00, 76.7MB/s]" - } - }, - "7b2abc768054422f8af3d21837400b4a": { + "e5ec838bb84644b6a27e3eaec9d7ac74": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2794,10 +1983,26 @@ "width": null } }, - "7ce44d2f323d45838633a750f2386525": { + "30ba8c556cc34fde96b530ed66ac376d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "711439e7dcab4c10ab4300bdbe6b86aa": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2846,10 +2051,25 @@ "width": null } }, - "7e58bf25549d4b428f231d528e8fef54": { + "15887401b0814d9386fb4d02d6279412": { "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bec0cae37feb48a4add318d970d8ef96": { + "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", + "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", @@ -2861,17 +2081,83 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_461ca08f677a4cba9ec2a388c2e346f3", - "IPY_MODEL_90d31fb52af949b0a2b41e3613827233", - "IPY_MODEL_1afbe347ab364b28b887f49dad54f5d7" + "IPY_MODEL_feb8127671424fa68b9b93a7547e40eb", + "IPY_MODEL_94c89aa435e44c8d9369305c21ca028c", + "IPY_MODEL_7abe3bc7884e4afeb9995f7d7acc8c0f" ], - "layout": "IPY_MODEL_8483a759cc0e4e12834fc7d08dab3b7e" + "layout": "IPY_MODEL_7edccbff1ca145eabd4af6f9da32442a" + } + }, + "feb8127671424fa68b9b93a7547e40eb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9c11fce2ab1f4811a3d98f9154818825", + "placeholder": "​", + "style": "IPY_MODEL_2cf5d1a84ed947ddb16e5f8e6984b01e", + "value": "tokenizer.json: 100%" + } + }, + "94c89aa435e44c8d9369305c21ca028c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d5de868032e640d5a07c34c9917190c3", + "max": 17082660, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d6295baf48b24c929c3ab4a317356e2b", + "value": 17082660 + } + }, + "7abe3bc7884e4afeb9995f7d7acc8c0f": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_df2ed1f8e3754f3a8f30be35935e82f3", + "placeholder": "​", + "style": "IPY_MODEL_2f125504e41344c088231f0307d7cb92", + "value": " 17.1M/17.1M [00:00<00:00, 76.7MB/s]" } }, "7edccbff1ca145eabd4af6f9da32442a": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2920,25 +2206,10 @@ "width": null } }, - "808af1e1f2464a928ee23398c837ff48": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "8169e16a9b0146f5a57a015601c2ebcb": { + "9c11fce2ab1f4811a3d98f9154818825": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -2987,10 +2258,25 @@ "width": null } }, - "8483a759cc0e4e12834fc7d08dab3b7e": { + "2cf5d1a84ed947ddb16e5f8e6984b01e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d5de868032e640d5a07c34c9917190c3": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3039,115 +2325,26 @@ "width": null } }, - "89ad2dee66324ae896eec71924aee670": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0961f276155348f98c12d1be4ad78e62", - "placeholder": "​", - "style": "IPY_MODEL_95c6cffafe1b4345a905be485b787728", - "value": " 418/418 [00:00<00:00, 13.9kB/s]" - } - }, - "90d31fb52af949b0a2b41e3613827233": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_092a4de220ba4ca2a23a0f273aba601b", - "max": 570, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e0c46565371f437a85a26d44c5b20c5b", - "value": 570 - } - }, - 
"94c89aa435e44c8d9369305c21ca028c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d5de868032e640d5a07c34c9917190c3", - "max": 17082660, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_d6295baf48b24c929c3ab4a317356e2b", - "value": 17082660 - } - }, - "95316b2f654a4ddc99c92d7c60c2f417": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2ed85f5360ba4feda6469aabd0324e7a", - "placeholder": "​", - "style": "IPY_MODEL_808af1e1f2464a928ee23398c837ff48", - "value": " 48.0/48.0 [00:00<00:00, 1.58kB/s]" - } - }, - "95c6cffafe1b4345a905be485b787728": { + "d6295baf48b24c929c3ab4a317356e2b": { "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", + "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", + "bar_color": null, "description_width": "" } }, - 
"9c11fce2ab1f4811a3d98f9154818825": { + "df2ed1f8e3754f3a8f30be35935e82f3": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3196,10 +2393,25 @@ "width": null } }, - "9c160e35cf414c528b5bffe05725a7d9": { + "2f125504e41344c088231f0307d7cb92": { "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6a6665e93675459394536fd9f846fbea": { + "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", + "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", @@ -3211,33 +2423,38 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_e87bc6913a7747728aed4b60a645bc2c", - "IPY_MODEL_9fa94c466004402bb293e4aa0bdc82f4", - "IPY_MODEL_95316b2f654a4ddc99c92d7c60c2f417" + "IPY_MODEL_aed65947759c47c58abe86f1ee279b86", + "IPY_MODEL_b5d7fb93223c4c458e7c80e59daea4d2", + "IPY_MODEL_540140c9c2f541b3a82f7a59e4f0b867" ], - "layout": "IPY_MODEL_0fc0b516e82941dc934c26eba22d9e01" + "layout": "IPY_MODEL_d5212aa2a4f74de1970c07e282f0e2bc" } }, - "9d43995246744e26a8053c21e2c5fcfa": { + "aed65947759c47c58abe86f1ee279b86": { "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", "state": { + "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", + "_model_name": "HTMLModel", "_view_count": null, - 
"_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4a75d002a4ae4fc99625420ec6e580ee", + "placeholder": "​", + "style": "IPY_MODEL_695f77c019db487ea60171277073efe6", + "value": "special_tokens_map.json: 100%" } }, - "9fa94c466004402bb293e4aa0bdc82f4": { + "b5d7fb93223c4c458e7c80e59daea4d2": { "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", @@ -3250,18 +2467,18 @@ "bar_style": "success", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_500a70f25097484bbec10c0ffd402595", - "max": 48, + "layout": "IPY_MODEL_bc3fad5fe0194399add875a6d78907bd", + "max": 280, "min": 0, "orientation": "horizontal", - "style": "IPY_MODEL_9d43995246744e26a8053c21e2c5fcfa", - "value": 48 + "style": "IPY_MODEL_42c418329198400bb77cdd3a654a96de", + "value": 280 } }, - "a6570ce51dfc46f383d855e28534bf73": { + "540140c9c2f541b3a82f7a59e4f0b867": { "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", "model_name": "HTMLModel", + "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", @@ -3273,16 +2490,16 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_b0a2671c90a048548314c2e3d21e19e7", + "layout": "IPY_MODEL_4ef5fe3e9ea84b8c8cb1a90a8208bdb9", "placeholder": "​", - "style": "IPY_MODEL_5f2080a5d12241638447a5851d0c8db3", - "value": "vocab.txt: 100%" + "style": "IPY_MODEL_735b9a74223f4941a2837a1108889f63", + "value": " 280/280 [00:00<00:00, 15.7kB/s]" } }, - "a88953429ab6436fb4f01b6b1e2cf6ff": { 
+ "d5212aa2a4f74de1970c07e282f0e2bc": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3302,81 +2519,39 @@ "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "aed65947759c47c58abe86f1ee279b86": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4a75d002a4ae4fc99625420ec6e580ee", - "placeholder": "​", - "style": "IPY_MODEL_695f77c019db487ea60171277073efe6", - "value": "special_tokens_map.json: 100%" - } - }, - "b03347c3201849778ea3129314ac340c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_711439e7dcab4c10ab4300bdbe6b86aa", - "placeholder": "​", - "style": "IPY_MODEL_15887401b0814d9386fb4d02d6279412", - "value": " 5.07M/5.07M [00:00<00:00, 19.7MB/s]" + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null } }, - "b0a2671c90a048548314c2e3d21e19e7": { + "4a75d002a4ae4fc99625420ec6e580ee": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3425,34 +2600,25 @@ "width": null } }, - "b5d7fb93223c4c458e7c80e59daea4d2": { + "695f77c019db487ea60171277073efe6": { "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", "state": { - "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", + "_model_name": "DescriptionStyleModel", "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_bc3fad5fe0194399add875a6d78907bd", - "max": 280, - 
"min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_42c418329198400bb77cdd3a654a96de", - "value": 280 + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" } }, - "b7bef190ebed494eb8773ec21d9b7160": { + "bc3fad5fe0194399add875a6d78907bd": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3501,10 +2667,26 @@ "width": null } }, - "bc3fad5fe0194399add875a6d78907bd": { + "42c418329198400bb77cdd3a654a96de": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4ef5fe3e9ea84b8c8cb1a90a8208bdb9": { "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", "model_name": "LayoutModel", + "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", @@ -3553,10 +2735,10 @@ "width": null } }, - "bc52c57fa6464ab39823cd3ddb9d7d78": { + "735b9a74223f4941a2837a1108889f63": { "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", @@ -3567,484 +2749,1360 @@ "_view_name": "StyleView", "description_width": "" } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Fetch surronding chucks (N-1, N+1)\n", + "\n", + "This notebook is designed to 
handle the ingestion of book text (Harry Potter and the Sorcerer's Stone) into an Elasticsearch Cloud instance. It includes partitioning the book text into chapters and chunking the chapter text, which are then ingested into Elasticsearch. The setup utilizes a nested structure, and for each chunk, it stores dense and sparse (ELSER) vector representations along with the text representation.\n", + "\n", + "Searches are performed using dense vector comparisons, sparse vector comparisons, and text search in parallel to demonstrate the power of hybrid search strategies. Additionally, the notebook is configured to retrieve adjacent chunks (n-1 and n+1), allowing for a more contextual understanding of the search results.\n", + "\n" + ], + "metadata": { + "id": "aAUkwshINwV7" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Install required python libraries\n" + ], + "metadata": { + "id": "MUEpppV7SeLu" + } + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nXuL8wsQNq8G", + "outputId": "2257cb06-4809-4be0-f698-f4a8ba11488e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.10/dist-packages (8.12.0)\n", + "Requirement already satisfied: elastic-transport<9,>=8 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.13.0)\n", + "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (2.0.7)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (2024.2.2)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.23.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install elasticsearch\n", + "!pip install pandas\n", + "\n", + "import json\n", + "import time\n", + "import urllib.request\n", + "import re\n", + "import pandas as pd\n", + "from transformers import AutoTokenizer, BertTokenizer\n", + "from elasticsearch import Elasticsearch, helpers\n", + "from google.colab import userdata\n", + "import textwrap" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Elasticsearch and Tokenizer Configuration\n" + ], + "metadata": { + "id": "_d4RWjNAN6Q9" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Elasticsearch and Tokenizer Configuration\n", + "\n", + "This section sets up the necessary configurations for connecting to Elasticsearch and initializing the tokenizers used for text processing.\n", + "\n", + "### Configuration Details:\n", + "1. 
**Elasticsearch Credentials**:\n", + " - `ELASTIC_CLOUD_ID`: The Cloud ID for the Elasticsearch cluster, securely fetched using the `getpass` function.\n", + " - `ELASTIC_API_KEY`: The API key for Elasticsearch authentication, securely fetched using the `getpass` function.\n", + "\n", + "2. **Index Settings**:\n", + " - `raw_source_index`: The name of the index for the raw dataset (`harry_potter_dataset-raw`).\n", + " - `index_name`: The name of the enriched dataset index (`harry_potter_dataset_enriched`).\n", + "\n", + "3. **Embedding Model**:\n", + " - `dense_embedding_model`: Specifies the model ID used for generating dense embeddings (`sentence-transformers__all-minilm-l6-v2`).\n", + " - `elser_model_id`: Specifies the ELSER model ID (`.elser_model_2_linux-x86_64`).\n", + "\n", + "4. **Tokenizer Initialization**:\n", + " - `bert_tokenizer`: Initializes the BERT tokenizer (`bert-base-uncased`) for English text processing.\n", + " - `e5_tokenizer`: Initializes the Multilingual E5 tokenizer (`intfloat/multilingual-e5-base`) for handling diverse datasets.\n", + "\n", + "5.
**Chunking Parameters**:\n", + " - `SEMANTIC_SEARCH_TOKEN_LIMIT`: Sets the token limit for each chunk (500 tokens per chunk, considering space for special tokens).\n", + " - `ELSER_TOKEN_OVERLAP`: Defines the overlap ratio between chunks (default is 0%, customizable for context continuity).\n", + "\n", + "These configurations ensure that the necessary components are properly set up for efficient text processing, indexing, and search operations in Elasticsearch.\n" + ], + "metadata": { + "id": "2w7uTCYdQ0m6" + } + }, + { + "cell_type": "code", + "source": [ + "from elasticsearch import Elasticsearch\n", + "from getpass import getpass\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", + "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", + "\n", + "raw_source_index = \"harry_potter_dataset-raw\"\n", + "index_name = \"harry_potter_dataset_enriched\"\n", + "\n", + "dense_embedding_model = \"sentence-transformers__all-minilm-l6-v2\"\n", + "elser_model_id = \".elser_model_2_linux-x86_64\"\n", + "\n", + "bert_tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", + "e5_tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-base\")\n", + "\n", + "\n", + "SEMANTIC_SEARCH_TOKEN_LIMIT = 500\n", + "ELSER_TOKEN_OVERLAP = 0.0" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LGQAjG6PERfx", + "outputId": "8a37251e-2df3-4359-ad3e-bd340731c7d1" + }, + "execution_count": 5, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Elastic Cloud ID: ··········\n", + "Elastic Api Key: ··········\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Elasticsearch and Tokenizer Configuration\n", + "es_username = \"elastic\"\n", + 
"es_password = userdata.get(\"es_password\")\n", + "es_cloudid = userdata.get(\"es_cloudid\")\n", + "\n", + "raw_source_index = \"harry_potter_dataset-raw\"\n", + "index_name = \"harry_potter_dataset_enriched\"\n", + "\n", + "dense_embedding_model = \"sentence-transformers__all-minilm-l6-v2\"\n", + "elser_model_id = \".elser_model_2_linux-x86_64\"\n", + "\n", + "bert_tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", + "e5_tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-base\")\n", + "\n", + "\n", + "SEMANTIC_SEARCH_TOKEN_LIMIT = 500\n", + "ELSER_TOKEN_OVERLAP = 0.0" + ], + "metadata": { + "id": "LQzCw0pgN4ll", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 328, + "referenced_widgets": [ + "9c160e35cf414c528b5bffe05725a7d9", + "e87bc6913a7747728aed4b60a645bc2c", + "9fa94c466004402bb293e4aa0bdc82f4", + "95316b2f654a4ddc99c92d7c60c2f417", + "0fc0b516e82941dc934c26eba22d9e01", + "d93a6e3ddd364921a7c2a24451d27ffc", + "2155cf3c7b2043d0a41fc011bf4f0e04", + "500a70f25097484bbec10c0ffd402595", + "9d43995246744e26a8053c21e2c5fcfa", + "2ed85f5360ba4feda6469aabd0324e7a", + "808af1e1f2464a928ee23398c837ff48", + "31906527169a4c08801dc6b21936188d", + "a6570ce51dfc46f383d855e28534bf73", + "41cc49a71a164065bc833d080027e4d2", + "748e7f3c8da243e9b5320654ec8e8146", + "a88953429ab6436fb4f01b6b1e2cf6ff", + "b0a2671c90a048548314c2e3d21e19e7", + "5f2080a5d12241638447a5851d0c8db3", + "7ce44d2f323d45838633a750f2386525", + "406f3564a217478d8f60dee5e1fb6dbf", + "61ae734ac8d441fd9b3ea198aff3f2c7", + "bc52c57fa6464ab39823cd3ddb9d7d78", + "02f735a438bf4058a9cfacf8d2b8660f", + "331b178397164de49408dc50ce417a36", + "07ee43d2a1684fb0b1445755802b6ea5", + "c867bce7e34b4800903eb9ec99f34784", + "8169e16a9b0146f5a57a015601c2ebcb", + "35ca86faebfd43faaef0202389d958fd", + "f04f37ba10e9498ea61acdce637431ee", + "527bfa6067c84b94a1e70dfadfd4b78e", + "312e85864e074b958d86325b6417a0fa", + "3b0fc37739334025b037a5270c9515bf", + 
"0dfb7f264674449b92a390324d17c4cf", + "7e58bf25549d4b428f231d528e8fef54", + "461ca08f677a4cba9ec2a388c2e346f3", + "90d31fb52af949b0a2b41e3613827233", + "1afbe347ab364b28b887f49dad54f5d7", + "8483a759cc0e4e12834fc7d08dab3b7e", + "59d8ffb31bb340eba7e0dcebfbbdd977", + "2b62b542c091466cbae559e29ec797bd", + "092a4de220ba4ca2a23a0f273aba601b", + "e0c46565371f437a85a26d44c5b20c5b", + "4831ce9114e5437ea8a24919557c40e2", + "53344cac458d4d5ebdc504744c18b7de", + "f34bf7b0bb424a8e8c00ff75309bbe6f", + "d604bc170c02491fae573c702e790893", + "d8a3bdb8be354365944ab587738280d3", + "89ad2dee66324ae896eec71924aee670", + "31c0c3d684564d5fb87d2e25e6de96eb", + "b7bef190ebed494eb8773ec21d9b7160", + "6c822f08434c4212931dcf097a80b7d4", + "0f8d4b5000174234bded5d4e017aa4e9", + "243f1e7de5414b82aaa4b50482dd964d", + "0961f276155348f98c12d1be4ad78e62", + "95c6cffafe1b4345a905be485b787728", + "43072f923bd24566ae0e20ca9aa3cdc5", + "5a4f80526c2c4b53a1bce182e9b3e5fa", + "400b0a8ef7c64477bf4f02a16b5508a0", + "b03347c3201849778ea3129314ac340c", + "3542b02e36ce4e03850b37c28d88da30", + "7b2abc768054422f8af3d21837400b4a", + "d8e4ceae237d4381aa5e44b020d7564e", + "e5ec838bb84644b6a27e3eaec9d7ac74", + "30ba8c556cc34fde96b530ed66ac376d", + "711439e7dcab4c10ab4300bdbe6b86aa", + "15887401b0814d9386fb4d02d6279412", + "bec0cae37feb48a4add318d970d8ef96", + "feb8127671424fa68b9b93a7547e40eb", + "94c89aa435e44c8d9369305c21ca028c", + "7abe3bc7884e4afeb9995f7d7acc8c0f", + "7edccbff1ca145eabd4af6f9da32442a", + "9c11fce2ab1f4811a3d98f9154818825", + "2cf5d1a84ed947ddb16e5f8e6984b01e", + "d5de868032e640d5a07c34c9917190c3", + "d6295baf48b24c929c3ab4a317356e2b", + "df2ed1f8e3754f3a8f30be35935e82f3", + "2f125504e41344c088231f0307d7cb92", + "6a6665e93675459394536fd9f846fbea", + "aed65947759c47c58abe86f1ee279b86", + "b5d7fb93223c4c458e7c80e59daea4d2", + "540140c9c2f541b3a82f7a59e4f0b867", + "d5212aa2a4f74de1970c07e282f0e2bc", + "4a75d002a4ae4fc99625420ec6e580ee", + "695f77c019db487ea60171277073efe6", + 
"bc3fad5fe0194399add875a6d78907bd", + "42c418329198400bb77cdd3a654a96de", + "4ef5fe3e9ea84b8c8cb1a90a8208bdb9", + "735b9a74223f4941a2837a1108889f63" + ] }, - "bec0cae37feb48a4add318d970d8ef96": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_feb8127671424fa68b9b93a7547e40eb", - "IPY_MODEL_94c89aa435e44c8d9369305c21ca028c", - "IPY_MODEL_7abe3bc7884e4afeb9995f7d7acc8c0f" + "outputId": "7ed04793-8bb9-49c6-b090-82111d9835f6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/48.0 [00:00= len(tokens):\n", + " break\n", + " return result\n", + "\n", + "\n", + "def check_task_status(es, task_id):\n", + " while True:\n", + " task_response = es.tasks.get(task_id=task_id)\n", + " if task_response[\"completed\"]:\n", + " print(\"Reindexing complete.\")\n", + " break\n", + " else:\n", + " print(\"Indexing...\")\n", + " time.sleep(10)" + ], + "metadata": { + "id": "xB2a9-qtONbQ" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "##Ingest Pipelines" + ], + "metadata": { + "id": "izMU8HqqP7ld" + } + }, + { + "cell_type": "code", + "source": [ + "# Define the ingest pipeline configuration\n", + "pipeline_body = {\n", + " \"description\": \"Pipeline for processing book passages\",\n", + " \"processors\": [\n", + " {\n", + " \"foreach\": {\n", + " \"field\": \"passages\",\n", + " \"processor\": {\n", + " \"inference\": {\n", + " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", + " \"model_id\": 
\"sentence-transformers__all-minilm-l6-v2\",\n", + " \"target_field\": \"_ingest._value.vector\",\n", + " \"on_failure\": [\n", + " {\n", + " \"append\": {\n", + " \"field\": \"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + " {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", + " }\n", + " ],\n", + " }\n", + " }\n", + " ],\n", + " }\n", + " },\n", + " }\n", + " },\n", + " {\n", + " \"foreach\": {\n", + " \"field\": \"passages\",\n", + " \"processor\": {\n", + " \"inference\": {\n", + " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", + " \"model_id\": elser_model_id,\n", + " \"target_field\": \"_ingest._value.content_embedding\",\n", + " \"on_failure\": [\n", + " {\n", + " \"append\": {\n", + " \"field\": \"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + " {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", + " }\n", + " ],\n", + " }\n", + " }\n", + " ],\n", + " }\n", + " },\n", + " }\n", + " },\n", + " ],\n", + "}\n", + "\n", + "# Create or update the pipeline\n", + "pipeline_id = \"books_dataset_chunker\"\n", + "es = create_es_client()\n", + "es.ingest.put_pipeline(id=pipeline_id, body=pipeline_body)\n", + "print(f\"Ingest pipeline '{pipeline_id}' created/updated successfully.\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "e5ec838bb84644b6a27e3eaec9d7ac74": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": 
"LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } + "id": "iUOFJK48OamP", + "outputId": "b277277d-5aab-46bc-8a98-457c627a46d2" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Ingest pipeline 'books_dataset_chunker' created/updated successfully.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "##Index Settings" + ], + "metadata": { + "id": "6ZkRwEGdQBRP" + } + }, + { + "cell_type": "code", + "source": [ + "index_settings = {\n", + " \"settings\": {\n", + " \"number_of_shards\": 2,\n", + " \"number_of_replicas\": 0,\n", + " \"default_pipeline\": \"books_dataset_chunker\",\n", + " },\n", + " \"mappings\": {\n", + " \"dynamic\": \"false\",\n", + " \"properties\": {\n", + " \"book_title\": {\"type\": \"keyword\"},\n", + " \"chapter\": {\"type\": \"keyword\"},\n", + " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", + " \"passages\": {\n", + " \"type\": \"nested\",\n", + " \"properties\": {\n", + " \"content_embedding\": {\n", + " \"properties\": {\n", + " 
\"is_truncated\": {\"type\": \"boolean\"},\n", + " \"model_id\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\n", + " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", + " },\n", + " },\n", + " \"predicted_value\": {\"type\": \"sparse_vector\"},\n", + " }\n", + " },\n", + " \"text\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"vector\": {\n", + " \"properties\": {\n", + " \"is_truncated\": {\"type\": \"boolean\"},\n", + " \"model_id\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\n", + " \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n", + " },\n", + " },\n", + " \"predicted_value\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 384,\n", + " \"index\": True,\n", + " \"similarity\": \"dot_product\",\n", + " },\n", + " }\n", + " },\n", + " \"chunk_number\": {\"type\": \"integer\"},\n", + " },\n", + " },\n", + " },\n", + " },\n", + "}\n", + "\n", + "raw_source_index_settings = {\n", + " \"settings\": {\"number_of_shards\": 2, \"number_of_replicas\": 0},\n", + " \"mappings\": {\n", + " \"dynamic\": \"false\",\n", + " \"properties\": {\n", + " \"book_title\": {\"type\": \"keyword\"},\n", + " \"chapter\": {\"type\": \"keyword\"},\n", + " \"chapter_full_text\": {\"type\": \"text\", \"index\": False},\n", + " \"passages\": {\n", + " \"type\": \"nested\",\n", + " \"properties\": {\n", + " \"text\": {\n", + " \"type\": \"text\",\n", + " \"fields\": {\"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}},\n", + " },\n", + " \"chunk_number\": {\"type\": \"integer\"},\n", + " },\n", + " },\n", + " },\n", + " },\n", + "}\n", + "\n", + "# Manage indices\n", + "manage_index(\n", + " es,\n", + " index_name,\n", + " index_settings[\"settings\"],\n", + " index_settings[\"mappings\"],\n", + " delete_index=True,\n", + ")\n", + "manage_index(\n", + " es,\n", + " raw_source_index,\n", + " raw_source_index_settings[\"settings\"],\n", + " 
raw_source_index_settings[\"mappings\"],\n", + " delete_index=True,\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "e87bc6913a7747728aed4b60a645bc2c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d93a6e3ddd364921a7c2a24451d27ffc", - "placeholder": "​", - "style": "IPY_MODEL_2155cf3c7b2043d0a41fc011bf4f0e04", - "value": "tokenizer_config.json: 100%" - } + "id": "vZ3Z5gZbOgjF", + "outputId": "e93982a7-be52-4b3c-9f2b-910b1e188f79" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Index harry_potter_dataset_enriched exists. Deleting it...\n", + "Index harry_potter_dataset_enriched deleted!\n", + "Index harry_potter_dataset_enriched created successfully!\n", + "Index harry_potter_dataset-raw exists. Deleting it...\n", + "Index harry_potter_dataset-raw deleted!\n", + "Index harry_potter_dataset-raw created successfully!\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Fetch and Process the Book Text\n", + "\n", + "This section downloads the full text of \"Harry Potter and the Sorcerer's Stone\" from a specified URL and processes it to extract chapters and their titles. The text is then structured into a pandas DataFrame for further analysis and indexing.\n", + "\n", + "### Key Steps:\n", + "1. **Download Text**: The book is fetched using `urllib.request` from the provided URL.\n", + "2. **Extract Chapters**: The text is split into chapters based on predefined patterns, omitting the text before the first chapter.\n", + "3. 
**Capture Chapter Titles**: Chapter titles are extracted and paired with their respective texts.\n", + "4. **Data Structuring**:\n", + " - Convert the list of chapter titles and texts into a DataFrame.\n", + " - Assign sequential numbers to chapters.\n", + " - Add the book title as metadata.\n", + " - Apply a text chunking function to split each chapter into manageable passages.\n", + "\n", + "This prepares the text data for efficient indexing and advanced search operations in Elasticsearch.\n" + ], + "metadata": { + "id": "NPtbLhVOQUF3" + } + }, + { + "cell_type": "code", + "source": [ + "# Fetch and process the book text\n", + "potter_book_url = \"https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt\"\n", + "response = urllib.request.urlopen(potter_book_url)\n", + "harry_potter_book_text = response.read().decode(\"utf-8\")\n", + "chapter_pattern = re.compile(r\"CHAPTER [A-Z]+\", re.IGNORECASE)\n", + "chapters = chapter_pattern.split(harry_potter_book_text)[1:]\n", + "chapter_titles = re.findall(chapter_pattern, harry_potter_book_text)\n", + "chapters_with_titles = list(zip(chapter_titles, chapters))\n", + "\n", + "print(\"Total chapters found:\", len(chapters))\n", + "if chapters_with_titles:\n", + " print(\"First chapter title:\", chapters_with_titles[0][0])\n", + " print(\"Text sample from first chapter:\", chapters_with_titles[0][1][:500])\n", + "\n", + "\n", + "# Structuring chapters into a DataFrame\n", + "df = pd.DataFrame(chapters_with_titles, columns=[\"chapter_title\", \"chapter_full_text\"])\n", + "df[\"chapter\"] = df.index + 1\n", + "df[\"book_title\"] = \"Harry Potter and the Sorcerer’s Stone\"\n", + "df[\"passages\"] = df[\"chapter_full_text\"].apply(lambda text: chunk(text))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "f04f37ba10e9498ea61acdce637431ee": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": 
"1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } + "id": "0L4YI96xOuKn", + "outputId": "a2dc28b9-4ece-485e-b160-0b51d0fa9ebd" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (6535 > 512). Running this sequence through the model will result in indexing errors\n" + ] }, - "f34bf7b0bb424a8e8c00ff75309bbe6f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_d604bc170c02491fae573c702e790893", - "IPY_MODEL_d8a3bdb8be354365944ab587738280d3", - "IPY_MODEL_89ad2dee66324ae896eec71924aee670" - ], - "layout": "IPY_MODEL_31c0c3d684564d5fb87d2e25e6de96eb" - } + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Total chapters found: 17\n", + "First chapter title: CHAPTER ONE\n", + "Text sample from first chapter: \n", + "\n", + "THE BOY WHO LIVED\n", + "\n", + "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\n", + "that they were perfectly normal, thank you very much. They were the last\n", + "people you'd expect to be involved in anything strange or mysterious,\n", + "because they just didn't hold with such nonsense.\n", + "\n", + "Mr. 
Dursley was the director of a firm called Grunnings, which made\n", + "drills. He was a big, beefy man with hardly any neck, although he did\n", + "have a very large mustache. Mrs. Dursley was thin and blonde and had\n", + "nearly t\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Indexing DataFrame into Elasticsearch\n", + "\n", + "This section uploads the structured data from a pandas DataFrame into a specified Elasticsearch index. The DataFrame contains chapter information from \"Harry Potter and the Sorcerer's Stone\", including chapter titles, full texts, and additional metadata.\n", + "\n", + "### Key Operation:\n", + "- **Index Data**: The `index_dataframe` function is called with the Elasticsearch client, the raw source index name, and the DataFrame as arguments. This operation effectively uploads the data into Elasticsearch, making it searchable and ready for further processing.\n" + ], + "metadata": { + "id": "DKK4574EQaTl" + } + }, + { + "cell_type": "code", + "source": [ + "index_dataframe(es, raw_source_index, df)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "feb8127671424fa68b9b93a7547e40eb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9c11fce2ab1f4811a3d98f9154818825", - "placeholder": "​", - "style": "IPY_MODEL_2cf5d1a84ed947ddb16e5f8e6984b01e", - "value": "tokenizer.json: 100%" - } + "id": "7ReLAtz1O1HF", + "outputId": "eb6054fe-9e12-4c55-92f5-40c00d055d45" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Indexing 
documents to harry_potter_dataset-raw...\n", + "Successfully indexed 17 documents.\n", + "Failed to index 0 documents.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Asynchronous Reindexing in Elasticsearch\n", + "\n", + "This section initiates an asynchronous reindex operation to transfer data from the raw source index to the enriched index in Elasticsearch. This process runs in the background, allowing other operations to continue without waiting for completion.\n", + "\n", + "### Key Steps:\n", + "1. **Start Reindex**: The reindex operation is triggered from the `raw_source_index` to the `index_name`, with `wait_for_completion` set to `False` to allow asynchronous execution.\n", + "2. **Retrieve Task ID**: The task ID of the reindex operation is captured and printed for monitoring purposes.\n", + "3. **Monitor Progress**: The `check_task_status` function continuously checks the status of the reindex task, providing updates every 10 seconds until the operation is complete.\n" + ], + "metadata": { + "id": "pA5QroYdQgcM" + } + }, + { + "cell_type": "code", + "source": [ + "# Start the reindex operation asynchronously\n", + "response = es.reindex(\n", + " body={\"source\": {\"index\": raw_source_index}, \"dest\": {\"index\": index_name}},\n", + " wait_for_completion=False,\n", + ")\n", + "task_id = response[\"task\"]\n", + "print(\"Task ID:\", task_id)\n", + "check_task_status(es, task_id)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HOCX_lbmO3zl", + "outputId": "1e9834c9-dc8f-4bb3-c676-1f3b62133ca0" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Task ID: vU-Vl_2tRMaigpDD944gmQ:105797433\n", + "Indexing...\n", + "Reindexing complete.\n" + ] } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Custom Search Query Construction and Execution\n", + "\n", + "This section constructs and executes a custom search query in 
Elasticsearch, utilizing a hybrid approach combining vector and text-based search methods to enhance search accuracy and relevance. The specific example used is a user query about the \"Nimbus 2000\".\n", + "\n", + "### Key Steps:\n", + "1. **Define User Query**: The user query is specified as \"what is a nimbus 2000\".\n", + "2. **Set Boost Factors**:\n", + " - `knn_boost_factor`: A value to amplify the importance of the vector-based search component.\n", + " - `text_expansion_boost`: A value to modify the weight of the text-based search component.\n", + "3. **Build Query**: The `build_custom_query` function constructs the search query, incorporating both dense vector and text expansion components.\n", + "4. **Execute Search**: The query is executed against the specified Elasticsearch index.\n", + "5. **Identify Relevant Passages**:\n", + " - The search results are analyzed to find the passage with the highest relevance score.\n", + " - The ID and chunk number of the best matching passage are captured and printed.\n", + "6. **Fetch Surrounding Chunks**: Constructs and executes a query to retrieve chunks adjacent to the identified passage for broader context. If the matched chunk is the first chunk, fetches n, n+1, and n+2. If the chunk is the last chunk in the chapter, fetches n, n-1, and n-2. For other chunks, fetches n-1, n, and n+1.\n", + "7. **Display Results**: Outputs text from the relevant and adjacent passages." 
+ ], + "metadata": { + "id": "xJBDwRmDQq4n" } + }, + { + "cell_type": "code", + "source": [ + "# Custom Search Query Construction\n", + "user_query = \"what is a nimbus 2000\"\n", + "\n", + "\n", + "knn_boost_factor = 20\n", + "text_expansion_boost = 1\n", + "query = build_custom_query(\n", + " build_vector(user_query),\n", + " user_query,\n", + " knn_boost_factor,\n", + " text_expansion_boost,\n", + " debug=False,\n", + ")\n", + "\n", + "# Searching and identifying relevant passages\n", + "results = es.search(index=index_name, body=query, _source=False)\n", + "\n", + "hit_id = None\n", + "chunk_number = None\n", + "\n", + "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", + " highest_score = -1\n", + " best_hit = None\n", + " hit_id = results[\"hits\"][\"hits\"][0][\"_id\"]\n", + " chapter_number = results[\"hits\"][\"hits\"][0][\"fields\"][\"chapter\"][0]\n", + " if \"inner_hits\" in results[\"hits\"][\"hits\"][0]:\n", + " for hit_type in [\"text_hits\", \"dense_hit\", \"sparse_hits\"]:\n", + " if hit_type in results[\"hits\"][\"hits\"][0][\"inner_hits\"]:\n", + " inner_hit = results[\"hits\"][\"hits\"][0][\"inner_hits\"][hit_type][\"hits\"]\n", + " if inner_hit[\"hits\"]:\n", + " max_score = inner_hit[\"max_score\"]\n", + " if max_score and max_score > highest_score:\n", + " highest_score = max_score\n", + " best_hit = inner_hit[\"hits\"][0]\n", + "\n", + " if best_hit:\n", + " first_passage_text = best_hit[\"_source\"][\"text\"]\n", + " chunk_number = best_hit[\"_source\"][\"chunk_number\"]\n", + " # print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", + " print(\n", + " f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\"\n", + " )\n", + " print(f\"\\n\")\n", + " else:\n", + " print(f\"ID: {hit_id}, No relevant passages found.\")\n", + "else:\n", + " print(\"No results found.\")\n", + "\n", + "print(f\"Fetch Surrounding 
Chunks\")\n", + "print(f\"------------------------\")\n", + "\n", + "max_chapter_chunk_result = es.search(\n", + " index=index_name,\n", + " body=get_max_chunk_number_query(chapter_number, debug=False),\n", + " _source=False,\n", + ")\n", + "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", + " \"max_chunk\"\n", + "][\"value\"]\n", + "\n", + "adjacent_chunks_query = get_adjacent_chunks_query(\n", + " hit_id, chunk_number, max_chunk_number, debug=False\n", + ")\n", + "results = es.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", + "print_text_from_results(results)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u7NFZBRJO3t7", + "outputId": "19461a64-d3eb-4940-9884-bd89d5e23910" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Matched Chunk ID: rV8Y5Y8BQsZxvNJ9cO4t, Chunk Number: 3, Text:\n", + "t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. 
hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Fetch Surrounding Chunks\n", + "------------------------\n", + "\n", + "\n", + "Text from Chunk 2: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. 
even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Text from Chunk 3: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. 
it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. 
there are seven players on each side.\n", + "\n", + "\n", + "Text from Chunk 4: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. 
i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n" + ] + } + ] } - }, - "nbformat": 4, - "nbformat_minor": 0 -} + ] +} \ No newline at end of file From 2ed877b209ecf9bfc680419270fa9f53d97436cf Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Wed, 5 Jun 2024 00:17:08 -0500 Subject: [PATCH 05/17] Updated Notebook Updated notebook to handle downloading required models such as elser and sentence transformer minilm --- .../fetch-surrounding-chunks.ipynb | 3404 +---------------- 1 file changed, 176 insertions(+), 3228 deletions(-) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index 0cc262f6..fe583d3f 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -11,2746 +11,6 @@ }, "language_info": { "name": "python" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "9c160e35cf414c528b5bffe05725a7d9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e87bc6913a7747728aed4b60a645bc2c", - "IPY_MODEL_9fa94c466004402bb293e4aa0bdc82f4", - "IPY_MODEL_95316b2f654a4ddc99c92d7c60c2f417" - ], - "layout": "IPY_MODEL_0fc0b516e82941dc934c26eba22d9e01" - } - }, - "e87bc6913a7747728aed4b60a645bc2c": { - "model_module": 
"@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d93a6e3ddd364921a7c2a24451d27ffc", - "placeholder": "​", - "style": "IPY_MODEL_2155cf3c7b2043d0a41fc011bf4f0e04", - "value": "tokenizer_config.json: 100%" - } - }, - "9fa94c466004402bb293e4aa0bdc82f4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_500a70f25097484bbec10c0ffd402595", - "max": 48, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_9d43995246744e26a8053c21e2c5fcfa", - "value": 48 - } - }, - "95316b2f654a4ddc99c92d7c60c2f417": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2ed85f5360ba4feda6469aabd0324e7a", - "placeholder": "​", - "style": "IPY_MODEL_808af1e1f2464a928ee23398c837ff48", - "value": " 48.0/48.0 [00:00<00:00, 
1.58kB/s]" - } - }, - "0fc0b516e82941dc934c26eba22d9e01": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d93a6e3ddd364921a7c2a24451d27ffc": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - 
"grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2155cf3c7b2043d0a41fc011bf4f0e04": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "500a70f25097484bbec10c0ffd402595": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - 
"object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9d43995246744e26a8053c21e2c5fcfa": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "2ed85f5360ba4feda6469aabd0324e7a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "808af1e1f2464a928ee23398c837ff48": { - 
"model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "31906527169a4c08801dc6b21936188d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_a6570ce51dfc46f383d855e28534bf73", - "IPY_MODEL_41cc49a71a164065bc833d080027e4d2", - "IPY_MODEL_748e7f3c8da243e9b5320654ec8e8146" - ], - "layout": "IPY_MODEL_a88953429ab6436fb4f01b6b1e2cf6ff" - } - }, - "a6570ce51dfc46f383d855e28534bf73": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b0a2671c90a048548314c2e3d21e19e7", - "placeholder": "​", - "style": "IPY_MODEL_5f2080a5d12241638447a5851d0c8db3", - "value": "vocab.txt: 100%" - } - }, - "41cc49a71a164065bc833d080027e4d2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7ce44d2f323d45838633a750f2386525", - "max": 231508, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_406f3564a217478d8f60dee5e1fb6dbf", - "value": 231508 - } - }, - "748e7f3c8da243e9b5320654ec8e8146": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_61ae734ac8d441fd9b3ea198aff3f2c7", - "placeholder": "​", - "style": "IPY_MODEL_bc52c57fa6464ab39823cd3ddb9d7d78", - "value": " 232k/232k [00:00<00:00, 2.88MB/s]" - } - }, - "a88953429ab6436fb4f01b6b1e2cf6ff": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - 
"grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b0a2671c90a048548314c2e3d21e19e7": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5f2080a5d12241638447a5851d0c8db3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7ce44d2f323d45838633a750f2386525": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "406f3564a217478d8f60dee5e1fb6dbf": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "61ae734ac8d441fd9b3ea198aff3f2c7": { - "model_module": "@jupyter-widgets/base", - "model_name": 
"LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bc52c57fa6464ab39823cd3ddb9d7d78": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "02f735a438bf4058a9cfacf8d2b8660f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - 
"_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_331b178397164de49408dc50ce417a36", - "IPY_MODEL_07ee43d2a1684fb0b1445755802b6ea5", - "IPY_MODEL_c867bce7e34b4800903eb9ec99f34784" - ], - "layout": "IPY_MODEL_8169e16a9b0146f5a57a015601c2ebcb" - } - }, - "331b178397164de49408dc50ce417a36": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_35ca86faebfd43faaef0202389d958fd", - "placeholder": "​", - "style": "IPY_MODEL_f04f37ba10e9498ea61acdce637431ee", - "value": "tokenizer.json: 100%" - } - }, - "07ee43d2a1684fb0b1445755802b6ea5": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_527bfa6067c84b94a1e70dfadfd4b78e", - "max": 466062, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_312e85864e074b958d86325b6417a0fa", - "value": 466062 - } - }, - "c867bce7e34b4800903eb9ec99f34784": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": 
"HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3b0fc37739334025b037a5270c9515bf", - "placeholder": "​", - "style": "IPY_MODEL_0dfb7f264674449b92a390324d17c4cf", - "value": " 466k/466k [00:00<00:00, 6.88MB/s]" - } - }, - "8169e16a9b0146f5a57a015601c2ebcb": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "35ca86faebfd43faaef0202389d958fd": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f04f37ba10e9498ea61acdce637431ee": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "527bfa6067c84b94a1e70dfadfd4b78e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - 
"grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "312e85864e074b958d86325b6417a0fa": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "3b0fc37739334025b037a5270c9515bf": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": 
null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0dfb7f264674449b92a390324d17c4cf": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7e58bf25549d4b428f231d528e8fef54": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_461ca08f677a4cba9ec2a388c2e346f3", - "IPY_MODEL_90d31fb52af949b0a2b41e3613827233", - "IPY_MODEL_1afbe347ab364b28b887f49dad54f5d7" - ], - "layout": "IPY_MODEL_8483a759cc0e4e12834fc7d08dab3b7e" - } - }, - "461ca08f677a4cba9ec2a388c2e346f3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": 
"IPY_MODEL_59d8ffb31bb340eba7e0dcebfbbdd977", - "placeholder": "​", - "style": "IPY_MODEL_2b62b542c091466cbae559e29ec797bd", - "value": "config.json: 100%" - } - }, - "90d31fb52af949b0a2b41e3613827233": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_092a4de220ba4ca2a23a0f273aba601b", - "max": 570, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e0c46565371f437a85a26d44c5b20c5b", - "value": 570 - } - }, - "1afbe347ab364b28b887f49dad54f5d7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4831ce9114e5437ea8a24919557c40e2", - "placeholder": "​", - "style": "IPY_MODEL_53344cac458d4d5ebdc504744c18b7de", - "value": " 570/570 [00:00<00:00, 19.9kB/s]" - } - }, - "8483a759cc0e4e12834fc7d08dab3b7e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": 
null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "59d8ffb31bb340eba7e0dcebfbbdd977": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": 
null, - "top": null, - "visibility": null, - "width": null - } - }, - "2b62b542c091466cbae559e29ec797bd": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "092a4de220ba4ca2a23a0f273aba601b": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e0c46565371f437a85a26d44c5b20c5b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", 
- "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "4831ce9114e5437ea8a24919557c40e2": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "53344cac458d4d5ebdc504744c18b7de": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - 
"f34bf7b0bb424a8e8c00ff75309bbe6f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_d604bc170c02491fae573c702e790893", - "IPY_MODEL_d8a3bdb8be354365944ab587738280d3", - "IPY_MODEL_89ad2dee66324ae896eec71924aee670" - ], - "layout": "IPY_MODEL_31c0c3d684564d5fb87d2e25e6de96eb" - } - }, - "d604bc170c02491fae573c702e790893": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b7bef190ebed494eb8773ec21d9b7160", - "placeholder": "​", - "style": "IPY_MODEL_6c822f08434c4212931dcf097a80b7d4", - "value": "tokenizer_config.json: 100%" - } - }, - "d8a3bdb8be354365944ab587738280d3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0f8d4b5000174234bded5d4e017aa4e9", - "max": 418, - "min": 0, - "orientation": "horizontal", 
- "style": "IPY_MODEL_243f1e7de5414b82aaa4b50482dd964d", - "value": 418 - } - }, - "89ad2dee66324ae896eec71924aee670": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0961f276155348f98c12d1be4ad78e62", - "placeholder": "​", - "style": "IPY_MODEL_95c6cffafe1b4345a905be485b787728", - "value": " 418/418 [00:00<00:00, 13.9kB/s]" - } - }, - "31c0c3d684564d5fb87d2e25e6de96eb": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - 
}, - "b7bef190ebed494eb8773ec21d9b7160": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6c822f08434c4212931dcf097a80b7d4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0f8d4b5000174234bded5d4e017aa4e9": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "243f1e7de5414b82aaa4b50482dd964d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "0961f276155348f98c12d1be4ad78e62": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - 
"flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "95c6cffafe1b4345a905be485b787728": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "43072f923bd24566ae0e20ca9aa3cdc5": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_5a4f80526c2c4b53a1bce182e9b3e5fa", - "IPY_MODEL_400b0a8ef7c64477bf4f02a16b5508a0", - "IPY_MODEL_b03347c3201849778ea3129314ac340c" - ], - "layout": "IPY_MODEL_3542b02e36ce4e03850b37c28d88da30" - } - }, - "5a4f80526c2c4b53a1bce182e9b3e5fa": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - 
"state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7b2abc768054422f8af3d21837400b4a", - "placeholder": "​", - "style": "IPY_MODEL_d8e4ceae237d4381aa5e44b020d7564e", - "value": "sentencepiece.bpe.model: 100%" - } - }, - "400b0a8ef7c64477bf4f02a16b5508a0": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e5ec838bb84644b6a27e3eaec9d7ac74", - "max": 5069051, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_30ba8c556cc34fde96b530ed66ac376d", - "value": 5069051 - } - }, - "b03347c3201849778ea3129314ac340c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_711439e7dcab4c10ab4300bdbe6b86aa", - "placeholder": "​", - "style": "IPY_MODEL_15887401b0814d9386fb4d02d6279412", - "value": " 5.07M/5.07M [00:00<00:00, 19.7MB/s]" - } - }, - "3542b02e36ce4e03850b37c28d88da30": { - "model_module": 
"@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7b2abc768054422f8af3d21837400b4a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - 
"grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d8e4ceae237d4381aa5e44b020d7564e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e5ec838bb84644b6a27e3eaec9d7ac74": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - 
"overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "30ba8c556cc34fde96b530ed66ac376d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "711439e7dcab4c10ab4300bdbe6b86aa": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "15887401b0814d9386fb4d02d6279412": { - "model_module": "@jupyter-widgets/controls", - "model_name": 
"DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "bec0cae37feb48a4add318d970d8ef96": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_feb8127671424fa68b9b93a7547e40eb", - "IPY_MODEL_94c89aa435e44c8d9369305c21ca028c", - "IPY_MODEL_7abe3bc7884e4afeb9995f7d7acc8c0f" - ], - "layout": "IPY_MODEL_7edccbff1ca145eabd4af6f9da32442a" - } - }, - "feb8127671424fa68b9b93a7547e40eb": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9c11fce2ab1f4811a3d98f9154818825", - "placeholder": "​", - "style": "IPY_MODEL_2cf5d1a84ed947ddb16e5f8e6984b01e", - "value": "tokenizer.json: 100%" - } - }, - "94c89aa435e44c8d9369305c21ca028c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - 
"_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d5de868032e640d5a07c34c9917190c3", - "max": 17082660, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_d6295baf48b24c929c3ab4a317356e2b", - "value": 17082660 - } - }, - "7abe3bc7884e4afeb9995f7d7acc8c0f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_df2ed1f8e3754f3a8f30be35935e82f3", - "placeholder": "​", - "style": "IPY_MODEL_2f125504e41344c088231f0307d7cb92", - "value": " 17.1M/17.1M [00:00<00:00, 76.7MB/s]" - } - }, - "7edccbff1ca145eabd4af6f9da32442a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - 
"justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9c11fce2ab1f4811a3d98f9154818825": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2cf5d1a84ed947ddb16e5f8e6984b01e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d5de868032e640d5a07c34c9917190c3": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d6295baf48b24c929c3ab4a317356e2b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "df2ed1f8e3754f3a8f30be35935e82f3": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", 
- "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2f125504e41344c088231f0307d7cb92": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "6a6665e93675459394536fd9f846fbea": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - 
"box_style": "", - "children": [ - "IPY_MODEL_aed65947759c47c58abe86f1ee279b86", - "IPY_MODEL_b5d7fb93223c4c458e7c80e59daea4d2", - "IPY_MODEL_540140c9c2f541b3a82f7a59e4f0b867" - ], - "layout": "IPY_MODEL_d5212aa2a4f74de1970c07e282f0e2bc" - } - }, - "aed65947759c47c58abe86f1ee279b86": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4a75d002a4ae4fc99625420ec6e580ee", - "placeholder": "​", - "style": "IPY_MODEL_695f77c019db487ea60171277073efe6", - "value": "special_tokens_map.json: 100%" - } - }, - "b5d7fb93223c4c458e7c80e59daea4d2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_bc3fad5fe0194399add875a6d78907bd", - "max": 280, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_42c418329198400bb77cdd3a654a96de", - "value": 280 - } - }, - "540140c9c2f541b3a82f7a59e4f0b867": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4ef5fe3e9ea84b8c8cb1a90a8208bdb9", - "placeholder": "​", - "style": "IPY_MODEL_735b9a74223f4941a2837a1108889f63", - "value": " 280/280 [00:00<00:00, 15.7kB/s]" - } - }, - "d5212aa2a4f74de1970c07e282f0e2bc": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4a75d002a4ae4fc99625420ec6e580ee": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - 
"align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "695f77c019db487ea60171277073efe6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "bc3fad5fe0194399add875a6d78907bd": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - 
"grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "42c418329198400bb77cdd3a654a96de": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "4ef5fe3e9ea84b8c8cb1a90a8208bdb9": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": 
null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "735b9a74223f4941a2837a1108889f63": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } } }, "cells": [ @@ -2779,37 +39,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nXuL8wsQNq8G", - "outputId": "2257cb06-4809-4be0-f698-f4a8ba11488e" + "id": "nXuL8wsQNq8G" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: elasticsearch in /usr/local/lib/python3.10/dist-packages (8.12.0)\n", - "Requirement already satisfied: elastic-transport<9,>=8 in /usr/local/lib/python3.10/dist-packages (from elasticsearch) (8.13.0)\n", - "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (2.0.7)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8->elasticsearch) (2024.2.2)\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0mRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)\n", - "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.23.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], + "outputs": [], "source": [ - "!pip install elasticsearch\n", + "!pip install elasticsearch==8.13.2\n", "!pip install pandas\n", + "!python -m pip install eland\n", "\n", "import json\n", "import time\n", @@ -2817,20 +55,10 @@ "import re\n", "import pandas as pd\n", "from transformers import AutoTokenizer, BertTokenizer\n", - "from elasticsearch import Elasticsearch, helpers\n", - "from google.colab import userdata\n", + "from elasticsearch import Elasticsearch, helpers, exceptions\n", "import textwrap" ] }, - { - "cell_type": "markdown", - "source": [ - "# Elasticsearch and Tokenizer Configuration\n" - ], - "metadata": { - "id": "_d4RWjNAN6Q9" - } - }, { "cell_type": "markdown", "source": [ @@ -2847,13 +75,16 @@ " - `raw_source_index`: The name of the index for the raw dataset (`harry_potter_dataset-raw`).\n", " - `index_name`: The name of the enriched dataset index (`harry_potter_dataset_enriched`).\n", "\n", - "3. 
**Embedding Model**:\n", - " - `dense_embedding_model`: Specifies the model used for generating dense embeddings (`sentence-transformers/all-minilm-l6-v2`).\n", + "3. **Embedding Models**:\n", + " - `dense_embedding_model`: Specifies the model used for generating dense embeddings (`sentence-transformers__all-minilm-l6-v2`).\n", + " - `dense_huggingface_model_id`: The Hugging Face model ID for the dense embeddings (`sentence-transformers/all-MiniLM-L6-v2`).\n", + " - `dense_model_number_of_allocators`: The number of allocators for the dense embedding model (2).\n", + "\n", " - `elser_model_id`: Specifies the ELSER model ID (`.elser_model_2_linux-x86_64`).\n", + " - `elser_model_number_of_allocators`: The number of allocators for the ELSER model (2).\n", "\n", "4. **Tokenizer Initialization**:\n", " - `bert_tokenizer`: Initializes the BERT tokenizer (`bert-base-uncased`) for English text processing.\n", - " - `e5_tokenizer`: Initializes the Multilingual E5 tokenizer (`intfloat/multilingual-e5-base`) for handling diverse datasets.\n", "\n", "5. 
**Chunking Parameters**:\n", " - `SEMANTIC_SEARCH_TOKEN_LIMIT`: Sets the token limit for each chunk (500 tokens per chunk, considering space for special tokens).\n", @@ -2881,276 +112,160 @@ "index_name = \"harry_potter_dataset_enriched\"\n", "\n", "dense_embedding_model = \"sentence-transformers__all-minilm-l6-v2\"\n", + "dense_huggingface_model_id = \"sentence-transformers/all-MiniLM-L6-v2\"\n", + "dense_model_number_of_allocators = 2\n", + "\n", "elser_model_id = \".elser_model_2_linux-x86_64\"\n", + "elser_model_number_of_allocators = 2\n", "\n", "bert_tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", - "e5_tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-base\")\n", "\n", "\n", "SEMANTIC_SEARCH_TOKEN_LIMIT = 500\n", - "ELSER_TOKEN_OVERLAP = 0.0" + "ELSER_TOKEN_OVERLAP = 0.0\n", + "\n", + "\n", + "# Create the client instance\n", + "esclient = Elasticsearch(\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")\n", + "print(esclient.info())" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LGQAjG6PERfx", - "outputId": "8a37251e-2df3-4359-ad3e-bd340731c7d1" + "id": "LGQAjG6PERfx" }, - "execution_count": 5, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Elastic Cloud ID: ··········\n", - "Elastic Api Key: ··········\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { - "cell_type": "code", + "cell_type": "markdown", "source": [ - "# Elasticsearch and Tokenizer Configuration\n", - "es_username = \"elastic\"\n", - "es_password = userdata.get(\"es_password\")\n", - "es_cloudid = userdata.get(\"es_cloudid\")\n", "\n", - "raw_source_index = \"harry_potter_dataset-raw\"\n", - "index_name = \"harry_potter_dataset_enriched\"\n", + "## Import model\n", + "Using the eland_import_hub_model script, download and install all-MiniLM-L6-v2 transformer model. 
Setting the NLP --task-type as text_embedding.\n", "\n", - "dense_embedding_model = \"sentence-transformers__all-minilm-l6-v2\"\n", - "elser_model_id = \".elser_model_2_linux-x86_64\"\n", + "To get the cloud id, go to Elastic cloud and On the deployment overview page, copy down the Cloud ID.\n", "\n", - "bert_tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", - "e5_tokenizer = AutoTokenizer.from_pretrained(\"intfloat/multilingual-e5-base\")\n", + "To authenticate your request, You could use API key. Alternatively, you can use your cloud deployment username and password." + ], + "metadata": { + "id": "rOWheQ-uJE2C" + } + }, + { + "cell_type": "code", + "source": [ + "!eland_import_hub_model --cloud-id $ELASTIC_CLOUD_ID --hub-model-id {dense_huggingface_model_id} --task-type text_embedding --es-api-key $ELASTIC_API_KEY --start --clear-previous\n", + "resp = esclient.ml.update_trained_model_deployment(\n", + " model_id=dense_embedding_model,\n", + " body={\"number_of_allocations\": dense_model_number_of_allocators},\n", + ")\n", + "print(resp)" + ], + "metadata": { + "id": "4NH8JJkQJDit" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Download and Deploy ELSER Model\n", "\n", + "In this example, we are going to download and deploy the ELSER model in our ML node. Make sure you have an ML node in order to run the ELSER model." + ], + "metadata": { + "id": "f1SXd1uhhhhe" + } + }, + { + "cell_type": "code", + "source": [ + "# delete model if already downloaded and deployed\n", + "try:\n", + " esclient.ml.delete_trained_model(model_id=elser_model_id, force=True)\n", + " print(\"Model deleted successfully, We will proceed with creating one\")\n", + "except exceptions.NotFoundError:\n", + " print(\"Model doesn't exist, but We will proceed with creating one\")\n", + "\n", + "# Creates the ELSER model configuration. 
Automatically downloads the model if it doesn't exist.\n", + "esclient.ml.put_trained_model(\n", + " model_id=elser_model_id, input={\"field_names\": [\"text_field\"]}\n", + ")" + ], + "metadata": { + "id": "vL68fse9hhAN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The above command will download the ELSER model. This will take a few minutes to complete. Use the following command to check the status of the model download." + ], + "metadata": { + "id": "2R54LYIqwC-f" + } + }, + { + "cell_type": "code", + "source": [ + "while True:\n", + " status = esclient.ml.get_trained_models(\n", + " model_id=elser_model_id, include=\"definition_status\"\n", + " )\n", "\n", - "SEMANTIC_SEARCH_TOKEN_LIMIT = 500\n", - "ELSER_TOKEN_OVERLAP = 0.0" + " if status[\"trained_model_configs\"][0][\"fully_defined\"]:\n", + " print(\"ELSER Model is downloaded and ready to be deployed.\")\n", + " break\n", + " else:\n", + " print(\"ELSER Model is downloaded but not ready to be deployed.\")\n", + " time.sleep(5)" ], "metadata": { - "id": "LQzCw0pgN4ll", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 328, - "referenced_widgets": [ - "9c160e35cf414c528b5bffe05725a7d9", - "e87bc6913a7747728aed4b60a645bc2c", - "9fa94c466004402bb293e4aa0bdc82f4", - "95316b2f654a4ddc99c92d7c60c2f417", - "0fc0b516e82941dc934c26eba22d9e01", - "d93a6e3ddd364921a7c2a24451d27ffc", - "2155cf3c7b2043d0a41fc011bf4f0e04", - "500a70f25097484bbec10c0ffd402595", - "9d43995246744e26a8053c21e2c5fcfa", - "2ed85f5360ba4feda6469aabd0324e7a", - "808af1e1f2464a928ee23398c837ff48", - "31906527169a4c08801dc6b21936188d", - "a6570ce51dfc46f383d855e28534bf73", - "41cc49a71a164065bc833d080027e4d2", - "748e7f3c8da243e9b5320654ec8e8146", - "a88953429ab6436fb4f01b6b1e2cf6ff", - "b0a2671c90a048548314c2e3d21e19e7", - "5f2080a5d12241638447a5851d0c8db3", - "7ce44d2f323d45838633a750f2386525", - "406f3564a217478d8f60dee5e1fb6dbf", - "61ae734ac8d441fd9b3ea198aff3f2c7", - 
"bc52c57fa6464ab39823cd3ddb9d7d78", - "02f735a438bf4058a9cfacf8d2b8660f", - "331b178397164de49408dc50ce417a36", - "07ee43d2a1684fb0b1445755802b6ea5", - "c867bce7e34b4800903eb9ec99f34784", - "8169e16a9b0146f5a57a015601c2ebcb", - "35ca86faebfd43faaef0202389d958fd", - "f04f37ba10e9498ea61acdce637431ee", - "527bfa6067c84b94a1e70dfadfd4b78e", - "312e85864e074b958d86325b6417a0fa", - "3b0fc37739334025b037a5270c9515bf", - "0dfb7f264674449b92a390324d17c4cf", - "7e58bf25549d4b428f231d528e8fef54", - "461ca08f677a4cba9ec2a388c2e346f3", - "90d31fb52af949b0a2b41e3613827233", - "1afbe347ab364b28b887f49dad54f5d7", - "8483a759cc0e4e12834fc7d08dab3b7e", - "59d8ffb31bb340eba7e0dcebfbbdd977", - "2b62b542c091466cbae559e29ec797bd", - "092a4de220ba4ca2a23a0f273aba601b", - "e0c46565371f437a85a26d44c5b20c5b", - "4831ce9114e5437ea8a24919557c40e2", - "53344cac458d4d5ebdc504744c18b7de", - "f34bf7b0bb424a8e8c00ff75309bbe6f", - "d604bc170c02491fae573c702e790893", - "d8a3bdb8be354365944ab587738280d3", - "89ad2dee66324ae896eec71924aee670", - "31c0c3d684564d5fb87d2e25e6de96eb", - "b7bef190ebed494eb8773ec21d9b7160", - "6c822f08434c4212931dcf097a80b7d4", - "0f8d4b5000174234bded5d4e017aa4e9", - "243f1e7de5414b82aaa4b50482dd964d", - "0961f276155348f98c12d1be4ad78e62", - "95c6cffafe1b4345a905be485b787728", - "43072f923bd24566ae0e20ca9aa3cdc5", - "5a4f80526c2c4b53a1bce182e9b3e5fa", - "400b0a8ef7c64477bf4f02a16b5508a0", - "b03347c3201849778ea3129314ac340c", - "3542b02e36ce4e03850b37c28d88da30", - "7b2abc768054422f8af3d21837400b4a", - "d8e4ceae237d4381aa5e44b020d7564e", - "e5ec838bb84644b6a27e3eaec9d7ac74", - "30ba8c556cc34fde96b530ed66ac376d", - "711439e7dcab4c10ab4300bdbe6b86aa", - "15887401b0814d9386fb4d02d6279412", - "bec0cae37feb48a4add318d970d8ef96", - "feb8127671424fa68b9b93a7547e40eb", - "94c89aa435e44c8d9369305c21ca028c", - "7abe3bc7884e4afeb9995f7d7acc8c0f", - "7edccbff1ca145eabd4af6f9da32442a", - "9c11fce2ab1f4811a3d98f9154818825", - "2cf5d1a84ed947ddb16e5f8e6984b01e", - 
"d5de868032e640d5a07c34c9917190c3", - "d6295baf48b24c929c3ab4a317356e2b", - "df2ed1f8e3754f3a8f30be35935e82f3", - "2f125504e41344c088231f0307d7cb92", - "6a6665e93675459394536fd9f846fbea", - "aed65947759c47c58abe86f1ee279b86", - "b5d7fb93223c4c458e7c80e59daea4d2", - "540140c9c2f541b3a82f7a59e4f0b867", - "d5212aa2a4f74de1970c07e282f0e2bc", - "4a75d002a4ae4fc99625420ec6e580ee", - "695f77c019db487ea60171277073efe6", - "bc3fad5fe0194399add875a6d78907bd", - "42c418329198400bb77cdd3a654a96de", - "4ef5fe3e9ea84b8c8cb1a90a8208bdb9", - "735b9a74223f4941a2837a1108889f63" - ] - }, - "outputId": "7ed04793-8bb9-49c6-b090-82111d9835f6" + "id": "wE3KHB3BwCVk" }, "execution_count": null, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "tokenizer_config.json: 0%| | 0.00/48.0 [00:00 512). Running this sequence through the model will result in indexing errors\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Total chapters found: 17\n", - "First chapter title: CHAPTER ONE\n", - "Text sample from first chapter: \n", - "\n", - "THE BOY WHO LIVED\n", - "\n", - "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\n", - "that they were perfectly normal, thank you very much. They were the last\n", - "people you'd expect to be involved in anything strange or mysterious,\n", - "because they just didn't hold with such nonsense.\n", - "\n", - "Mr. Dursley was the director of a firm called Grunnings, which made\n", - "drills. He was a big, beefy man with hardly any neck, although he did\n", - "have a very large mustache. Mrs. 
Dursley was thin and blonde and had\n", - "nearly t\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -3869,27 +911,13 @@ { "cell_type": "code", "source": [ - "index_dataframe(es, raw_source_index, df)" + "index_dataframe(esclient, raw_source_index, df)" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7ReLAtz1O1HF", - "outputId": "eb6054fe-9e12-4c55-92f5-40c00d055d45" + "id": "7ReLAtz1O1HF" }, - "execution_count": 11, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Indexing documents to harry_potter_dataset-raw...\n", - "Successfully indexed 17 documents.\n", - "Failed to index 0 documents.\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -3911,33 +939,19 @@ "cell_type": "code", "source": [ "# Start the reindex operation asynchronously\n", - "response = es.reindex(\n", + "response = esclient.reindex(\n", " body={\"source\": {\"index\": raw_source_index}, \"dest\": {\"index\": index_name}},\n", " wait_for_completion=False,\n", ")\n", "task_id = response[\"task\"]\n", "print(\"Task ID:\", task_id)\n", - "check_task_status(es, task_id)" + "check_task_status(esclient, task_id)" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HOCX_lbmO3zl", - "outputId": "1e9834c9-dc8f-4bb3-c676-1f3b62133ca0" + "id": "HOCX_lbmO3zl" }, - "execution_count": 12, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Task ID: vU-Vl_2tRMaigpDD944gmQ:105797433\n", - "Indexing...\n", - "Reindexing complete.\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -3981,7 +995,7 @@ ")\n", "\n", "# Searching and identifying relevant passages\n", - "results = es.search(index=index_name, body=query, _source=False)\n", + "results = esclient.search(index=index_name, body=query, _source=False)\n", "\n", "hit_id = None\n", "chunk_number = None\n", @@ -4017,7 
+1031,7 @@ "print(f\"Fetch Surrounding Chunks\")\n", "print(f\"------------------------\")\n", "\n", - "max_chapter_chunk_result = es.search(\n", + "max_chapter_chunk_result = esclient.search(\n", " index=index_name,\n", " body=get_max_chunk_number_query(chapter_number, debug=False),\n", " _source=False,\n", @@ -4029,80 +1043,14 @@ "adjacent_chunks_query = get_adjacent_chunks_query(\n", " hit_id, chunk_number, max_chunk_number, debug=False\n", ")\n", - "results = es.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", + "results = esclient.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", "print_text_from_results(results)" ], "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "u7NFZBRJO3t7", - "outputId": "19461a64-d3eb-4940-9884-bd89d5e23910" + "id": "u7NFZBRJO3t7" }, - "execution_count": 13, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Matched Chunk ID: rV8Y5Y8BQsZxvNJ9cO4t, Chunk Number: 3, Text:\n", - "t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. 
held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", - "\n", - "\n", - "Fetch Surrounding Chunks\n", - "------------------------\n", - "\n", - "\n", - "Text from Chunk 2: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. 
even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", - "\n", - "\n", - "Text from Chunk 3: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. 
it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. 
there are seven players on each side.\n", - "\n", - "\n", - "Text from Chunk 4: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. 
i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n" - ] - } - ] + "execution_count": null, + "outputs": [] } ] } \ No newline at end of file From fe8141e346a1d1880d310fee49f6bcba5af70074 Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Wed, 5 Jun 2024 14:02:24 -0500 Subject: [PATCH 06/17] updated notebook var var chapter_number was not initialized. Fixed. --- .../fetch-surrounding-chunks.ipynb | 53 +++++++------------ 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index fe583d3f..9ce5d56e 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -986,42 +986,35 @@ "\n", "knn_boost_factor = 20\n", "text_expansion_boost = 1\n", - "query = build_custom_query(\n", - " build_vector(user_query),\n", - " user_query,\n", - " knn_boost_factor,\n", - " text_expansion_boost,\n", - " debug=False,\n", - ")\n", + "query = build_custom_query(build_vector(user_query), user_query, knn_boost_factor, text_expansion_boost, debug=False)\n", "\n", "# Searching and identifying relevant passages\n", "results = esclient.search(index=index_name, body=query, _source=False)\n", "\n", "hit_id = None\n", "chunk_number = None\n", + "chapter_number None\n", "\n", - "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", + "if results and results.get('hits') and results['hits'].get('hits'):\n", " highest_score = -1\n", " best_hit = None\n", - " hit_id = results[\"hits\"][\"hits\"][0][\"_id\"]\n", - " chapter_number = results[\"hits\"][\"hits\"][0][\"fields\"][\"chapter\"][0]\n", - " if 
\"inner_hits\" in results[\"hits\"][\"hits\"][0]:\n", - " for hit_type in [\"text_hits\", \"dense_hit\", \"sparse_hits\"]:\n", - " if hit_type in results[\"hits\"][\"hits\"][0][\"inner_hits\"]:\n", - " inner_hit = results[\"hits\"][\"hits\"][0][\"inner_hits\"][hit_type][\"hits\"]\n", - " if inner_hit[\"hits\"]:\n", - " max_score = inner_hit[\"max_score\"]\n", + " hit_id = results['hits']['hits'][0]['_id']\n", + " chapter_number = results['hits']['hits'][0]['fields']['chapter'][0]\n", + " if 'inner_hits' in results['hits']['hits'][0]:\n", + " for hit_type in ['text_hits', 'dense_hit', 'sparse_hits']:\n", + " if hit_type in results['hits']['hits'][0]['inner_hits']:\n", + " inner_hit = results['hits']['hits'][0]['inner_hits'][hit_type]['hits']\n", + " if inner_hit['hits']:\n", + " max_score = inner_hit['max_score']\n", " if max_score and max_score > highest_score:\n", " highest_score = max_score\n", - " best_hit = inner_hit[\"hits\"][0]\n", + " best_hit = inner_hit['hits'][0]\n", "\n", " if best_hit:\n", - " first_passage_text = best_hit[\"_source\"][\"text\"]\n", - " chunk_number = best_hit[\"_source\"][\"chunk_number\"]\n", - " # print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", - " print(\n", - " f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\"\n", - " )\n", + " first_passage_text = best_hit['_source']['text']\n", + " chunk_number = best_hit['_source']['chunk_number']\n", + " #print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", + " print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\")\n", " print(f\"\\n\")\n", " else:\n", " print(f\"ID: {hit_id}, No relevant passages found.\")\n", @@ -1031,18 +1024,10 @@ "print(f\"Fetch Surrounding Chunks\")\n", "print(f\"------------------------\")\n", "\n", - "max_chapter_chunk_result = 
esclient.search(\n", - " index=index_name,\n", - " body=get_max_chunk_number_query(chapter_number, debug=False),\n", - " _source=False,\n", - ")\n", - "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", - " \"max_chunk\"\n", - "][\"value\"]\n", + "max_chapter_chunk_result = esclient.search(index=index_name, body=get_max_chunk_number_query(chapter_number, debug=False), _source=False)\n", + "max_chunk_number = max_chapter_chunk_result['aggregations']['max_chunk_number']['max_chunk']['value']\n", "\n", - "adjacent_chunks_query = get_adjacent_chunks_query(\n", - " hit_id, chunk_number, max_chunk_number, debug=False\n", - ")\n", + "adjacent_chunks_query = get_adjacent_chunks_query(hit_id, chunk_number, max_chunk_number, debug=False)\n", "results = esclient.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", "print_text_from_results(results)" ], From 5b49345f9c6a917766860dd5d7e7c0f21d51898d Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Wed, 5 Jun 2024 17:49:46 -0500 Subject: [PATCH 07/17] updated notebook for chapter_number bug chapter_number = None. 
forgot = sign --- .../fetch-surrounding-chunks.ipynb | 54 ++++++++++++------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index 9ce5d56e..beb4428e 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -986,35 +986,43 @@ "\n", "knn_boost_factor = 20\n", "text_expansion_boost = 1\n", - "query = build_custom_query(build_vector(user_query), user_query, knn_boost_factor, text_expansion_boost, debug=False)\n", + "query = build_custom_query(\n", + " build_vector(user_query),\n", + " user_query,\n", + " knn_boost_factor,\n", + " text_expansion_boost,\n", + " debug=False,\n", + ")\n", "\n", "# Searching and identifying relevant passages\n", "results = esclient.search(index=index_name, body=query, _source=False)\n", "\n", "hit_id = None\n", "chunk_number = None\n", - "chapter_number None\n", + "chapter_number = None\n", "\n", - "if results and results.get('hits') and results['hits'].get('hits'):\n", + "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", " highest_score = -1\n", " best_hit = None\n", - " hit_id = results['hits']['hits'][0]['_id']\n", - " chapter_number = results['hits']['hits'][0]['fields']['chapter'][0]\n", - " if 'inner_hits' in results['hits']['hits'][0]:\n", - " for hit_type in ['text_hits', 'dense_hit', 'sparse_hits']:\n", - " if hit_type in results['hits']['hits'][0]['inner_hits']:\n", - " inner_hit = results['hits']['hits'][0]['inner_hits'][hit_type]['hits']\n", - " if inner_hit['hits']:\n", - " max_score = inner_hit['max_score']\n", + " hit_id = results[\"hits\"][\"hits\"][0][\"_id\"]\n", + " chapter_number = results[\"hits\"][\"hits\"][0][\"fields\"][\"chapter\"][0]\n", + " if \"inner_hits\" in results[\"hits\"][\"hits\"][0]:\n", + " for hit_type in [\"text_hits\", \"dense_hit\", \"sparse_hits\"]:\n", + 
" if hit_type in results[\"hits\"][\"hits\"][0][\"inner_hits\"]:\n", + " inner_hit = results[\"hits\"][\"hits\"][0][\"inner_hits\"][hit_type][\"hits\"]\n", + " if inner_hit[\"hits\"]:\n", + " max_score = inner_hit[\"max_score\"]\n", " if max_score and max_score > highest_score:\n", " highest_score = max_score\n", - " best_hit = inner_hit['hits'][0]\n", + " best_hit = inner_hit[\"hits\"][0]\n", "\n", " if best_hit:\n", - " first_passage_text = best_hit['_source']['text']\n", - " chunk_number = best_hit['_source']['chunk_number']\n", - " #print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", - " print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\")\n", + " first_passage_text = best_hit[\"_source\"][\"text\"]\n", + " chunk_number = best_hit[\"_source\"][\"chunk_number\"]\n", + " # print(f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text: {first_passage_text}\")\n", + " print(\n", + " f\"Matched Chunk ID: {hit_id}, Chunk Number: {chunk_number}, Text:\\n{textwrap.fill(first_passage_text, width=200)}\"\n", + " )\n", " print(f\"\\n\")\n", " else:\n", " print(f\"ID: {hit_id}, No relevant passages found.\")\n", @@ -1024,10 +1032,18 @@ "print(f\"Fetch Surrounding Chunks\")\n", "print(f\"------------------------\")\n", "\n", - "max_chapter_chunk_result = esclient.search(index=index_name, body=get_max_chunk_number_query(chapter_number, debug=False), _source=False)\n", - "max_chunk_number = max_chapter_chunk_result['aggregations']['max_chunk_number']['max_chunk']['value']\n", + "max_chapter_chunk_result = esclient.search(\n", + " index=index_name,\n", + " body=get_max_chunk_number_query(chapter_number, debug=False),\n", + " _source=False,\n", + ")\n", + "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", + " \"max_chunk\"\n", + "][\"value\"]\n", "\n", - "adjacent_chunks_query = 
get_adjacent_chunks_query(hit_id, chunk_number, max_chunk_number, debug=False)\n", + "adjacent_chunks_query = get_adjacent_chunks_query(\n", + " hit_id, chunk_number, max_chunk_number, debug=False\n", + ")\n", "results = esclient.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", "print_text_from_results(results)" ], From 8fc393d7b0ccae850e9df64adf2e8a5ee1d0b0f9 Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Wed, 5 Jun 2024 22:33:49 -0500 Subject: [PATCH 08/17] updated noted added es_model_id --- .../document-chunking/fetch-surrounding-chunks.ipynb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index beb4428e..7557da7f 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -78,7 +78,9 @@ "3. **Embedding Models**:\n", " - `dense_embedding_model`: Specifies the model used for generating dense embeddings (`sentence-transformers__all-minilm-l6-v2`).\n", " - `dense_huggingface_model_id`: The Hugging Face model ID for the dense embeddings (`sentence-transformers/all-MiniLM-L6-v2`).\n", + " - `dense_es_model_id`: Model ID name within Elastic (`sentence-transformers__all-minilm-l6-v2`).\n", " - `dense_model_number_of_allocators`: The number of allocators for the dense embedding model (2).\n", + " \n", "\n", " - `elser_model_id`: Specifies the ELSER model ID (`.elser_model_2_linux-x86_64`).\n", " - `elser_model_number_of_allocators`: The number of allocators for the ELSER model (2).\n", @@ -113,6 +115,7 @@ "\n", "dense_embedding_model = \"sentence-transformers__all-minilm-l6-v2\"\n", "dense_huggingface_model_id = \"sentence-transformers/all-MiniLM-L6-v2\"\n", + "dense_es_model_id = \"sentence-transformers__all-minilm-l6-v2\"\n", "dense_model_number_of_allocators = 2\n", "\n", "elser_model_id = 
\".elser_model_2_linux-x86_64\"\n", @@ -156,7 +159,7 @@ { "cell_type": "code", "source": [ - "!eland_import_hub_model --cloud-id $ELASTIC_CLOUD_ID --hub-model-id {dense_huggingface_model_id} --task-type text_embedding --es-api-key $ELASTIC_API_KEY --start --clear-previous\n", + "!eland_import_hub_model --cloud-id $ELASTIC_CLOUD_ID --es-model-id {dense_es_model_id} --hub-model-id {dense_huggingface_model_id} --task-type text_embedding --es-api-key $ELASTIC_API_KEY --start --clear-previous\n", "resp = esclient.ml.update_trained_model_deployment(\n", " model_id=dense_embedding_model,\n", " body={\"number_of_allocations\": dense_model_number_of_allocators},\n", @@ -243,7 +246,7 @@ { "cell_type": "code", "source": [ - "# Start trained model deployment if not already deployed\n", + "# Start ELSER model deployment if not already deployed\n", "esclient.ml.start_trained_model_deployment(\n", " model_id=elser_model_id,\n", " number_of_allocations=elser_model_number_of_allocators,\n", @@ -670,7 +673,7 @@ " \"processor\": {\n", " \"inference\": {\n", " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", - " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", + " \"model_id\": dense_es_model_id,\n", " \"target_field\": \"_ingest._value.vector\",\n", " \"on_failure\": [\n", " {\n", From df1211ed5cbd7d7504cb4279770f3441c023e1c6 Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Wed, 5 Jun 2024 22:44:44 -0500 Subject: [PATCH 09/17] updated notebook remove es_model_id as it is not needed. 
--- .../fetch-surrounding-chunks.ipynb | 314 ++++++++++++++++-- 1 file changed, 279 insertions(+), 35 deletions(-) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index 7557da7f..db5dc3cf 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -39,11 +39,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": { - "id": "nXuL8wsQNq8G" + "id": "nXuL8wsQNq8G", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "8d79cd0b-1603-4efe-a039-0b494f3dae5a" }, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: elasticsearch==8.13.2 in /usr/local/lib/python3.10/dist-packages (8.13.2)\n", + "Requirement already satisfied: elastic-transport<9,>=8.13 in /usr/local/lib/python3.10/dist-packages (from elasticsearch==8.13.2) (8.13.0)\n", + "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch==8.13.2) (2.0.7)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch==8.13.2) (2024.2.2)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.23.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mRequirement already satisfied: eland in /usr/local/lib/python3.10/dist-packages (8.13.1)\n", + "Requirement already satisfied: elasticsearch<9,>=8.3 in /usr/local/lib/python3.10/dist-packages (from eland) (8.13.2)\n", + "Requirement already satisfied: pandas<2,>=1.5 in /usr/local/lib/python3.10/dist-packages (from eland) (1.5.3)\n", + "Requirement already satisfied: matplotlib>=3.6 in /usr/local/lib/python3.10/dist-packages (from eland) (3.7.1)\n", + "Requirement already satisfied: numpy<2,>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from eland) (1.23.5)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from eland) (23.2)\n", + "Requirement already satisfied: elastic-transport<9,>=8.13 in /usr/local/lib/python3.10/dist-packages (from elasticsearch<9,>=8.3->eland) (8.13.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.2.0)\n", + "Requirement already satisfied: 
cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (4.47.2)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.4.5)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (9.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (3.1.1)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<2,>=1.5->eland) (2023.4)\n", + "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch<9,>=8.3->eland) (2.0.7)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch<9,>=8.3->eland) (2024.2.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib>=3.6->eland) (1.16.0)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], "source": [ "!pip install elasticsearch==8.13.2\n", "!pip install pandas\n", @@ -76,9 +118,8 @@ " - `index_name`: The name of the enriched dataset index (`harry_potter_dataset_enriched`).\n", "\n", "3. 
**Embedding Models**:\n", - " - `dense_embedding_model`: Specifies the model used for generating dense embeddings (`sentence-transformers__all-minilm-l6-v2`).\n", + " - `dense_embedding_model_id`: Specifies the model used for generating dense embeddings (`sentence-transformers__all-minilm-l6-v2`).\n", " - `dense_huggingface_model_id`: The Hugging Face model ID for the dense embeddings (`sentence-transformers/all-MiniLM-L6-v2`).\n", - " - `dense_es_model_id`: Model ID name within Elastic (`sentence-transformers__all-minilm-l6-v2`).\n", " - `dense_model_number_of_allocators`: The number of allocators for the dense embedding model (2).\n", " \n", "\n", @@ -113,9 +154,8 @@ "raw_source_index = \"harry_potter_dataset-raw\"\n", "index_name = \"harry_potter_dataset_enriched\"\n", "\n", - "dense_embedding_model = \"sentence-transformers__all-minilm-l6-v2\"\n", + "dense_embedding_model_id = \"sentence-transformers__all-minilm-l6-v2\"\n", "dense_huggingface_model_id = \"sentence-transformers/all-MiniLM-L6-v2\"\n", - "dense_es_model_id = \"sentence-transformers__all-minilm-l6-v2\"\n", "dense_model_number_of_allocators = 2\n", "\n", "elser_model_id = \".elser_model_2_linux-x86_64\"\n", @@ -136,10 +176,24 @@ "print(esclient.info())" ], "metadata": { - "id": "LGQAjG6PERfx" + "id": "LGQAjG6PERfx", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6389acbb-f13e-4850-8e66-cd814d39bb83" }, - "execution_count": null, - "outputs": [] + "execution_count": 61, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Elastic Cloud ID: ··········\n", + "Elastic Api Key: ··········\n", + "{'name': 'instance-0000000001', 'cluster_name': '951b9d7d79064735b681a5a2d7921825', 'cluster_uuid': 'ITHi4ramTZq6OIR5dEB9Eg', 'version': {'number': '8.14.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '8d96bbe3bf5fed931f3119733895458eab75dca9', 'build_date': '2024-06-03T10:05:49.073003402Z', 'build_snapshot': False, 'lucene_version': 
'9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n" + ] + } + ] }, { "cell_type": "markdown", @@ -159,18 +213,50 @@ { "cell_type": "code", "source": [ - "!eland_import_hub_model --cloud-id $ELASTIC_CLOUD_ID --es-model-id {dense_es_model_id} --hub-model-id {dense_huggingface_model_id} --task-type text_embedding --es-api-key $ELASTIC_API_KEY --start --clear-previous\n", + "!eland_import_hub_model --cloud-id $ELASTIC_CLOUD_ID --es-model-id {dense_embedding_model_id} --hub-model-id {dense_huggingface_model_id} --task-type text_embedding --es-api-key $ELASTIC_API_KEY --start --clear-previous\n", "resp = esclient.ml.update_trained_model_deployment(\n", - " model_id=dense_embedding_model,\n", + " model_id=dense_embedding_model_id,\n", " body={\"number_of_allocations\": dense_model_number_of_allocators},\n", ")\n", "print(resp)" ], "metadata": { - "id": "4NH8JJkQJDit" + "id": "4NH8JJkQJDit", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "35415b4a-6ccd-4820-fedc-5f0a63ba1c83" }, - "execution_count": null, - "outputs": [] + "execution_count": 62, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/usr/local/lib/python3.10/dist-packages/eland/ml/_optional.py:116: UserWarning: Eland requires version '1.3' or newer of 'sklearn' (version '1.2.2' currently installed). Use pip or conda to update sklearn.\n", + " warnings.warn(msg, UserWarning)\n", + "2024-06-06 03:41:36,972 INFO : Establishing connection to Elasticsearch\n", + "2024-06-06 03:41:37,026 INFO : Connected to cluster named '951b9d7d79064735b681a5a2d7921825' (version: 8.14.0)\n", + "2024-06-06 03:41:37,026 INFO : Loading HuggingFace transformer tokenizer and model 'sentence-transformers/all-MiniLM-L6-v2'\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. 
Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "STAGE:2024-06-06 03:41:37 636871:636871 ActivityProfilerController.cpp:312] Completed Stage: Warm Up\n", + "STAGE:2024-06-06 03:41:37 636871:636871 ActivityProfilerController.cpp:318] Completed Stage: Collection\n", + "STAGE:2024-06-06 03:41:37 636871:636871 ActivityProfilerController.cpp:322] Completed Stage: Post Processing\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "2024-06-06 03:41:39,201 INFO : Stopping deployment for model with id 'sentence-transformers__all-minilm-l6-v2'\n", + "2024-06-06 03:41:39,339 INFO : Deleting model with id 'sentence-transformers__all-minilm-l6-v2'\n", + "2024-06-06 03:41:39,493 INFO : Creating model with id 'sentence-transformers__all-minilm-l6-v2'\n", + "2024-06-06 03:41:39,520 INFO : Uploading model definition\n", + "100% 87/87 [00:14<00:00, 5.80 parts/s]\n", + "2024-06-06 03:41:54,517 INFO : Uploading model vocabulary\n", + "2024-06-06 03:41:54,697 INFO : Starting model deployment\n", + "2024-06-06 03:41:57,028 INFO : Model successfully imported with id 'sentence-transformers__all-minilm-l6-v2'\n", + "{'assignment': {'task_parameters': {'model_id': 'sentence-transformers__all-minilm-l6-v2', 'deployment_id': 'sentence-transformers__all-minilm-l6-v2', 'model_bytes': 90303458, 'threads_per_allocation': 1, 'number_of_allocations': 2, 'queue_capacity': 1024, 'cache_size': '90303458b', 'priority': 'normal', 'per_deployment_memory_bytes': 90269696, 'per_allocation_memory_bytes': 236040288}, 'routing_table': {'hO-ZlcEkQW2sb3rdra9ERg': {'current_allocations': 1, 'target_allocations': 2, 'routing_state': 'started', 'reason': ''}}, 'assignment_state': 'started', 'start_time': 
'2024-06-06T03:41:54.73081128Z', 'max_assigned_allocations': 1}}\n" + ] + } + ] }, { "cell_type": "markdown", @@ -648,7 +734,7 @@ "metadata": { "id": "xB2a9-qtONbQ" }, - "execution_count": null, + "execution_count": 64, "outputs": [] }, { @@ -673,7 +759,7 @@ " \"processor\": {\n", " \"inference\": {\n", " \"field_map\": {\"_ingest._value.text\": \"text_field\"},\n", - " \"model_id\": dense_es_model_id,\n", + " \"model_id\": dense_embedding_model_id,\n", " \"target_field\": \"_ingest._value.vector\",\n", " \"on_failure\": [\n", " {\n", @@ -728,10 +814,22 @@ "print(f\"Ingest pipeline '{pipeline_id}' created/updated successfully.\")" ], "metadata": { - "id": "iUOFJK48OamP" + "id": "iUOFJK48OamP", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "b7feb26f-a084-4d48-dbba-4a53cc0b0255" }, - "execution_count": null, - "outputs": [] + "execution_count": 65, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Ingest pipeline 'books_dataset_chunker' created/updated successfully.\n" + ] + } + ] }, { "cell_type": "markdown", @@ -839,10 +937,27 @@ ")" ], "metadata": { - "id": "vZ3Z5gZbOgjF" + "id": "vZ3Z5gZbOgjF", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "5a1ed103-d9be-42ae-daac-bab2daca51be" }, - "execution_count": null, - "outputs": [] + "execution_count": 67, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Index harry_potter_dataset_enriched exists. Deleting it...\n", + "Index harry_potter_dataset_enriched deleted!\n", + "Index harry_potter_dataset_enriched created successfully!\n", + "Index harry_potter_dataset-raw exists. 
Deleting it...\n", + "Index harry_potter_dataset-raw deleted!\n", + "Index harry_potter_dataset-raw created successfully!\n" + ] + } + ] }, { "cell_type": "markdown", @@ -892,10 +1007,43 @@ "df[\"passages\"] = df[\"chapter_full_text\"].apply(lambda text: chunk(text))" ], "metadata": { - "id": "0L4YI96xOuKn" + "id": "0L4YI96xOuKn", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "68318a23-b10f-49ab-a329-ec32b1d49993" }, - "execution_count": null, - "outputs": [] + "execution_count": 68, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (6535 > 512). Running this sequence through the model will result in indexing errors\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Total chapters found: 17\n", + "First chapter title: CHAPTER ONE\n", + "Text sample from first chapter: \n", + "\n", + "THE BOY WHO LIVED\n", + "\n", + "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\n", + "that they were perfectly normal, thank you very much. They were the last\n", + "people you'd expect to be involved in anything strange or mysterious,\n", + "because they just didn't hold with such nonsense.\n", + "\n", + "Mr. Dursley was the director of a firm called Grunnings, which made\n", + "drills. He was a big, beefy man with hardly any neck, although he did\n", + "have a very large mustache. Mrs. 
Dursley was thin and blonde and had\n", + "nearly t\n" + ] + } + ] }, { "cell_type": "markdown", @@ -917,10 +1065,24 @@ "index_dataframe(esclient, raw_source_index, df)" ], "metadata": { - "id": "7ReLAtz1O1HF" + "id": "7ReLAtz1O1HF", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e07cace3-8c74-4a72-b2a9-10a7f22d99fd" }, - "execution_count": null, - "outputs": [] + "execution_count": 69, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Indexing documents to harry_potter_dataset-raw...\n", + "Successfully indexed 17 documents.\n", + "Failed to index 0 documents.\n" + ] + } + ] }, { "cell_type": "markdown", @@ -951,10 +1113,26 @@ "check_task_status(esclient, task_id)" ], "metadata": { - "id": "HOCX_lbmO3zl" + "id": "HOCX_lbmO3zl", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "4e8a2859-6c28-42ff-b956-7183c80ede9e" }, - "execution_count": null, - "outputs": [] + "execution_count": 70, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Task ID: _m32HYljRgqsVl7G-4wPtw:23883\n", + "Indexing...\n", + "Indexing...\n", + "Indexing...\n", + "Reindexing complete.\n" + ] + } + ] }, { "cell_type": "markdown", @@ -1051,10 +1229,76 @@ "print_text_from_results(results)" ], "metadata": { - "id": "u7NFZBRJO3t7" + "id": "u7NFZBRJO3t7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "c932fb1c-5bdf-45ef-c6d9-452c49273be4" }, - "execution_count": null, - "outputs": [] + "execution_count": 71, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Matched Chunk ID: 6rWk648BZDaSvPZ6OJE_, Chunk Number: 3, Text:\n", + "t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. 
it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. 
there are seven players on each side.\n", + "\n", + "\n", + "Fetch Surrounding Chunks\n", + "------------------------\n", + "\n", + "\n", + "Text from Chunk 2: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. 
\" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Text from Chunk 3: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. 
the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "\n", + "\n", + "Text from Chunk 4: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", + "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", + "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", + "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", + "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", + "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. 
they reminded\n", + "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", + "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", + "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", + "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", + "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n" + ] + } + ] } ] } \ No newline at end of file From fac740dc5556bd6aa813c8b350c2408d29d686e2 Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Wed, 5 Jun 2024 22:57:51 -0500 Subject: [PATCH 10/17] var ini ini fetch-surrounding-chunks --- notebooks/document-chunking/fetch-surrounding-chunks.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index db5dc3cf..0d5b5e0d 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -1181,6 +1181,7 @@ "hit_id = None\n", "chunk_number = None\n", "chapter_number = None\n", + "max_chapter_chunk_result = None\n", "\n", "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", " highest_score = -1\n", From 215f85a324e0411ac329050f7cf7e0d11ae4983c Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Wed, 5 Jun 2024 23:00:43 -0500 
Subject: [PATCH 11/17] dense_embedding_model_id dense_embedding_model_id was missing from query. renamed. --- notebooks/document-chunking/fetch-surrounding-chunks.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index 0d5b5e0d..7976ae53 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -429,7 +429,7 @@ "def build_vector(text):\n", " docs = [{\"text_field\": text}]\n", " response = esclient.ml.infer_trained_model(\n", - " model_id=dense_embedding_model, docs=docs\n", + " model_id=dense_embedding_model_id, docs=docs\n", " )\n", " return response.get(\"inference_results\", [{}])[0].get(\"predicted_value\", [])\n", "\n", From 79e98203b995fd59cb223d2cfc9b45680521dfbc Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Wed, 5 Jun 2024 23:41:29 -0500 Subject: [PATCH 12/17] update for debug for debugging changed max_chapter_chunk_result --- .../fetch-surrounding-chunks.ipynb | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index 7976ae53..a7924c5a 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "metadata": { "id": "nXuL8wsQNq8G", "colab": { @@ -182,7 +182,7 @@ }, "outputId": "6389acbb-f13e-4850-8e66-cd814d39bb83" }, - "execution_count": 61, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -227,7 +227,7 @@ }, "outputId": "35415b4a-6ccd-4820-fedc-5f0a63ba1c83" }, - "execution_count": 62, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -734,7 +734,7 @@ "metadata": { 
"id": "xB2a9-qtONbQ" }, - "execution_count": 64, + "execution_count": null, "outputs": [] }, { @@ -820,7 +820,7 @@ }, "outputId": "b7feb26f-a084-4d48-dbba-4a53cc0b0255" }, - "execution_count": 65, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -943,7 +943,7 @@ }, "outputId": "5a1ed103-d9be-42ae-daac-bab2daca51be" }, - "execution_count": 67, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -1013,7 +1013,7 @@ }, "outputId": "68318a23-b10f-49ab-a329-ec32b1d49993" }, - "execution_count": 68, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -1071,7 +1071,7 @@ }, "outputId": "e07cace3-8c74-4a72-b2a9-10a7f22d99fd" }, - "execution_count": 69, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -1119,7 +1119,7 @@ }, "outputId": "4e8a2859-6c28-42ff-b956-7183c80ede9e" }, - "execution_count": 70, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -1181,8 +1181,11 @@ "hit_id = None\n", "chunk_number = None\n", "chapter_number = None\n", + "max_chunk_number = None\n", + "max_chapter_chunk_result = None\n", "max_chapter_chunk_result = None\n", "\n", + "\n", "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", " highest_score = -1\n", " best_hit = None\n", @@ -1214,10 +1217,11 @@ "print(f\"Fetch Surrounding Chunks\")\n", "print(f\"------------------------\")\n", "\n", + "max_chunk_query = get_max_chunk_number_query(chapter_number, debug=False)\n", + "\n", + "# max_chapter_chunk_result = esclient.search(index=index_name, body=get_max_chunk_number_query(chapter_number, debug=False), _source=False)\n", "max_chapter_chunk_result = esclient.search(\n", - " index=index_name,\n", - " body=get_max_chunk_number_query(chapter_number, debug=False),\n", - " _source=False,\n", + " index=index_name, body=max_chunk_query, _source=False\n", ")\n", "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", " \"max_chunk\"\n", @@ -1234,9 
+1238,9 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "c932fb1c-5bdf-45ef-c6d9-452c49273be4" + "outputId": "fa6b58d7-ce04-47f2-c42a-289f13056213" }, - "execution_count": 71, + "execution_count": null, "outputs": [ { "output_type": "stream", From 21698d0ae2914e7133dc5a869f91e77710df80ae Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Thu, 6 Jun 2024 00:01:18 -0500 Subject: [PATCH 13/17] updated notebook added error handling --- .../fetch-surrounding-chunks.ipynb | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb index a7924c5a..2ee34c6d 100644 --- a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb +++ b/notebooks/document-chunking/fetch-surrounding-chunks.ipynb @@ -1183,7 +1183,7 @@ "chapter_number = None\n", "max_chunk_number = None\n", "max_chapter_chunk_result = None\n", - "max_chapter_chunk_result = None\n", + "max_chunk_query = None\n", "\n", "\n", "if results and results.get(\"hits\") and results[\"hits\"].get(\"hits\"):\n", @@ -1214,33 +1214,43 @@ "else:\n", " print(\"No results found.\")\n", "\n", - "print(f\"Fetch Surrounding Chunks\")\n", - "print(f\"------------------------\")\n", + "# Fetch Surrounding Chunks if chapter_number is not None\n", + "if chapter_number is not None:\n", + " print(f\"Fetch Surrounding Chunks\")\n", + " print(f\"------------------------\")\n", + "\n", + " # max_chunk_query = get_max_chunk_number_query(chapter_number, debug=False)\n", + " # max_chapter_chunk_result = esclient.search(index=index_name, body=max_chunk_query, _source=False)\n", + " max_chapter_chunk_result = esclient.search(\n", + " index=index_name,\n", + " body=get_max_chunk_number_query(chapter_number, debug=False),\n", + " _source=False,\n", + " )\n", + " max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", + " \"max_chunk\"\n", + " 
][\"value\"]\n", "\n", - "max_chunk_query = get_max_chunk_number_query(chapter_number, debug=False)\n", + " adjacent_chunks_query = get_adjacent_chunks_query(\n", + " hit_id, chunk_number, max_chunk_number, debug=False\n", + " )\n", + " results = esclient.search(\n", + " index=index_name, body=adjacent_chunks_query, _source=False\n", + " )\n", + " print_text_from_results(results)\n", + "else:\n", + " print(\"Skipping fetch of surrounding chunks due to no initial results.\")\n", "\n", - "# max_chapter_chunk_result = esclient.search(index=index_name, body=get_max_chunk_number_query(chapter_number, debug=False), _source=False)\n", - "max_chapter_chunk_result = esclient.search(\n", - " index=index_name, body=max_chunk_query, _source=False\n", - ")\n", - "max_chunk_number = max_chapter_chunk_result[\"aggregations\"][\"max_chunk_number\"][\n", - " \"max_chunk\"\n", - "][\"value\"]\n", "\n", - "adjacent_chunks_query = get_adjacent_chunks_query(\n", - " hit_id, chunk_number, max_chunk_number, debug=False\n", - ")\n", - "results = esclient.search(index=index_name, body=adjacent_chunks_query, _source=False)\n", - "print_text_from_results(results)" + "# max_chapter_chunk_result = esclient.search(index=index_name, body=get_max_chunk_number_query(chapter_number, debug=False), _source=False)" ], "metadata": { "id": "u7NFZBRJO3t7", "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "fa6b58d7-ce04-47f2-c42a-289f13056213" + "outputId": "69b06dea-5189-40e7-83c4-bcc8baac5b91" }, - "execution_count": null, + "execution_count": 77, "outputs": [ { "output_type": "stream", From 9b51ed8a6523596e9470525b9b444ed88d90b9f8 Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Thu, 6 Jun 2024 08:42:40 -0500 Subject: [PATCH 14/17] Ini Push removed fetch surrounding chunks from doc chunking folder and into supporting blog content --- .../fetch-surrounding-chunks/README.md | 8 ++++++++ .../fetch-surrounding-chunks.ipynb | 0 2 files changed, 8 insertions(+) create mode 100644 
supporting-blog-content/fetch-surrounding-chunks/README.md rename {notebooks/document-chunking => supporting-blog-content/fetch-surrounding-chunks}/fetch-surrounding-chunks.ipynb (100%) diff --git a/supporting-blog-content/fetch-surrounding-chunks/README.md b/supporting-blog-content/fetch-surrounding-chunks/README.md new file mode 100644 index 00000000..9bd5d9f0 --- /dev/null +++ b/supporting-blog-content/fetch-surrounding-chunks/README.md @@ -0,0 +1,8 @@ +# Fetch Surrounding Chunks (N-1, N+1) + +This notebook is designed to handle the ingestion of book text (Harry Potter and the Sorcerer's Stone) into an Elasticsearch Cloud instance. It includes partitioning the book text into chapters and chunking the chapter text, which are then ingested into Elasticsearch. The setup utilizes a nested structure, and for each chunk, it stores dense and sparse (ELSER) vector representations along with the text representation. + +Searches are performed using dense vector comparisons, sparse vector comparisons, and text search in parallel to demonstrate the power of hybrid search strategies. Additionally, the notebook is configured to retrieve adjacent chunks (n-1 and n+1), allowing for a more contextual understanding of the search results. + +## Elasticsearch Version +Versions of Elasticsearch `8.13` and `8.14` were tested with this notebook. 
The notebook will not work with previous versions of Elasticsearch. diff --git a/notebooks/document-chunking/fetch-surrounding-chunks.ipynb b/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb similarity index 100% rename from notebooks/document-chunking/fetch-surrounding-chunks.ipynb rename to supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb From b3eb75697ae013ffbe667a563559337ede5b6f9b Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Thu, 6 Jun 2024 10:36:57 -0500 Subject: [PATCH 15/17] added open in colab added open in colab --- .../fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb b/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb index 2ee34c6d..e2b74417 100644 --- a/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb +++ b/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb @@ -19,6 +19,8 @@ "source": [ "# Fetch surronding chucks (N-1, N+1)\n", "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb)\n", + "\n", "This notebook is designed to handle the ingestion of book text (Harry Potter and the Sorcerer's Stone) into an Elasticsearch Cloud instance. It includes partitioning the book text into chapters and chunking the chapter text, which are then ingested into Elasticsearch.
The setup utilizes a nested structure, and for each chunk, it stores dense and sparse (ELSER) vector representations along with the text representation.\n", "\n", "Searches are performed using dense vector comparisons, sparse vector comparisons, and text search in parallel to demonstrate the power of hybrid search strategies. Additionally, the notebook is configured to retrieve adjacent chunks (n-1 and n+1), allowing for a more contextual understanding of the search results.\n", From 328b96181bd135940e5c697779fd89b41a34ea19 Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Mon, 10 Jun 2024 12:23:20 -0500 Subject: [PATCH 16/17] Bug fix Valentin Crettaz found during his review dup chunks were printed out. fixed the issue. --- .../fetch-surrounding-chunks.ipynb | 131 +++++++++++------- 1 file changed, 78 insertions(+), 53 deletions(-) diff --git a/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb b/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb index 055fbe7d..96dd84e2 100644 --- a/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb +++ b/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb @@ -19,8 +19,6 @@ "source": [ "# Fetch surronding chucks (N-1, N+1)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://ela.st/fetch-surrounding-chunks)\n", - "\n", "This notebook is designed to handle the ingestion of book text (Harry Potter and the Sorcerer's Stone) into an Elasticsearch Cloud instance. It includes partitioning the book text into chapters and chunking the chapter text, which are then ingested into Elasticsearch. 
The setup utilizes a nested structure, and for each chunk, it stores dense and sparse (ELSER) vector representations along with the text representation.\n", "\n", "Searches are performed using dense vector comparisons, sparse vector comparisons, and text search in parallel to demonstrate the power of hybrid search strategies. Additionally, the notebook is configured to retrieve adjacent chunks (n-1 and n+1), allowing for a more contextual understanding of the search results.\n", @@ -41,13 +39,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "nXuL8wsQNq8G", "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "8d79cd0b-1603-4efe-a039-0b494f3dae5a" + "outputId": "f7de69ed-828c-485c-8411-78b404515239" }, "outputs": [ { @@ -182,17 +180,35 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "6389acbb-f13e-4850-8e66-cd814d39bb83" + "outputId": "56ab2605-5bf2-4bb0-a7b1-5916b9e3d0ab" }, - "execution_count": null, + "execution_count": 3, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Elastic Cloud ID: ··········\n", - "Elastic Api Key: ··········\n", - "{'name': 'instance-0000000001', 'cluster_name': '951b9d7d79064735b681a5a2d7921825', 'cluster_uuid': 'ITHi4ramTZq6OIR5dEB9Eg', 'version': {'number': '8.14.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '8d96bbe3bf5fed931f3119733895458eab75dca9', 'build_date': '2024-06-03T10:05:49.073003402Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n" + "Elastic Api Key: ··········\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the 
Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'name': 'instance-0000000008', 'cluster_name': 'ccb3490b2f684a1bad1e54a3de285244', 'cluster_uuid': 'P63_NMvcQmaOmnsL4ZkBxQ', 'version': {'number': '8.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'da95df118650b55a500dcc181889ac35c6d8da7c', 'build_date': '2024-05-06T22:04:45.107454559Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n" ] } ] @@ -225,9 +241,10 @@ "metadata": { "id": "4NH8JJkQJDit", "colab": { - "base_uri": "https://localhost:8080/" + "base_uri": "https://localhost:8080/", + "height": 595 }, - "outputId": "35415b4a-6ccd-4820-fedc-5f0a63ba1c83" + "outputId": "fdd46986-c5e6-43ff-9f86-4a685e8b3f0e" }, "execution_count": null, "outputs": [ @@ -237,25 +254,35 @@ "text": [ "/usr/local/lib/python3.10/dist-packages/eland/ml/_optional.py:116: UserWarning: Eland requires version '1.3' or newer of 'sklearn' (version '1.2.2' currently installed). 
Use pip or conda to update sklearn.\n", " warnings.warn(msg, UserWarning)\n", - "2024-06-06 03:41:36,972 INFO : Establishing connection to Elasticsearch\n", - "2024-06-06 03:41:37,026 INFO : Connected to cluster named '951b9d7d79064735b681a5a2d7921825' (version: 8.14.0)\n", - "2024-06-06 03:41:37,026 INFO : Loading HuggingFace transformer tokenizer and model 'sentence-transformers/all-MiniLM-L6-v2'\n", + "2024-06-06 05:29:18,918 INFO : Establishing connection to Elasticsearch\n", + "2024-06-06 05:29:18,982 INFO : Connected to cluster named '951b9d7d79064735b681a5a2d7921825' (version: 8.14.0)\n", + "2024-06-06 05:29:18,982 INFO : Loading HuggingFace transformer tokenizer and model 'sentence-transformers/all-MiniLM-L6-v2'\n", "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", - "STAGE:2024-06-06 03:41:37 636871:636871 ActivityProfilerController.cpp:312] Completed Stage: Warm Up\n", - "STAGE:2024-06-06 03:41:37 636871:636871 ActivityProfilerController.cpp:318] Completed Stage: Collection\n", - "STAGE:2024-06-06 03:41:37 636871:636871 ActivityProfilerController.cpp:322] Completed Stage: Post Processing\n", + "STAGE:2024-06-06 05:29:19 656714:656714 ActivityProfilerController.cpp:312] Completed Stage: Warm Up\n", + "STAGE:2024-06-06 05:29:19 656714:656714 ActivityProfilerController.cpp:318] Completed Stage: Collection\n", + "STAGE:2024-06-06 05:29:19 656714:656714 ActivityProfilerController.cpp:322] Completed Stage: Post Processing\n", "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. 
Please use `model.hf_quantizer.is_trainable` instead\n", " warnings.warn(\n", - "2024-06-06 03:41:39,201 INFO : Stopping deployment for model with id 'sentence-transformers__all-minilm-l6-v2'\n", - "2024-06-06 03:41:39,339 INFO : Deleting model with id 'sentence-transformers__all-minilm-l6-v2'\n", - "2024-06-06 03:41:39,493 INFO : Creating model with id 'sentence-transformers__all-minilm-l6-v2'\n", - "2024-06-06 03:41:39,520 INFO : Uploading model definition\n", - "100% 87/87 [00:14<00:00, 5.80 parts/s]\n", - "2024-06-06 03:41:54,517 INFO : Uploading model vocabulary\n", - "2024-06-06 03:41:54,697 INFO : Starting model deployment\n", - "2024-06-06 03:41:57,028 INFO : Model successfully imported with id 'sentence-transformers__all-minilm-l6-v2'\n", - "{'assignment': {'task_parameters': {'model_id': 'sentence-transformers__all-minilm-l6-v2', 'deployment_id': 'sentence-transformers__all-minilm-l6-v2', 'model_bytes': 90303458, 'threads_per_allocation': 1, 'number_of_allocations': 2, 'queue_capacity': 1024, 'cache_size': '90303458b', 'priority': 'normal', 'per_deployment_memory_bytes': 90269696, 'per_allocation_memory_bytes': 236040288}, 'routing_table': {'hO-ZlcEkQW2sb3rdra9ERg': {'current_allocations': 1, 'target_allocations': 2, 'routing_state': 'started', 'reason': ''}}, 'assignment_state': 'started', 'start_time': '2024-06-06T03:41:54.73081128Z', 'max_assigned_allocations': 1}}\n" + "2024-06-06 05:29:21,155 INFO : Stopping deployment for model with id 'sentence-transformers__all-minilm-l6-v2'\n", + "2024-06-06 05:29:21,307 INFO : Deleting model with id 'sentence-transformers__all-minilm-l6-v2'\n", + "2024-06-06 05:29:21,496 INFO : Creating model with id 'sentence-transformers__all-minilm-l6-v2'\n", + "2024-06-06 05:29:21,522 INFO : Uploading model definition\n", + "100% 87/87 [00:15<00:00, 5.68 parts/s]\n", + "2024-06-06 05:29:36,852 INFO : Uploading model vocabulary\n", + "2024-06-06 05:29:37,035 INFO : Starting model deployment\n", + "2024-06-06 05:29:39,224 INFO 
: Model successfully imported with id 'sentence-transformers__all-minilm-l6-v2'\n" + ] + }, + { + "output_type": "error", + "ename": "AttributeError", + "evalue": "'MlClient' object has no attribute 'update_trained_model_deployment'", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'eland_import_hub_model --cloud-id $ELASTIC_CLOUD_ID --es-model-id {dense_embedding_model_id} --hub-model-id {dense_huggingface_model_id} --task-type text_embedding --es-api-key $ELASTIC_API_KEY --start --clear-previous'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m resp = esclient.ml.update_trained_model_deployment(\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mmodel_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdense_embedding_model_id\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"number_of_allocations\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdense_model_number_of_allocators\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m )\n", + "\u001b[0;31mAttributeError\u001b[0m: 'MlClient' object has no attribute 'update_trained_model_deployment'" ] } ] @@ -699,7 +726,7 @@ " text = nested_hit[\"_source\"][\"text\"]\n", " # print(f\"Text from Chunk {chunk_number}: {text}\")\n", " print(\n", - " f\"\\n\\nText from Chunk {chunk_number}: {textwrap.fill(first_passage_text, width=200)}\"\n", + " f\"\\n\\nText from Chunk {chunk_number}: 
{textwrap.fill(text, width=200)}\"\n", " )\n", " else:\n", " print(\"No hits found.\")\n", @@ -736,7 +763,7 @@ "metadata": { "id": "xB2a9-qtONbQ" }, - "execution_count": null, + "execution_count": 5, "outputs": [] }, { @@ -1250,15 +1277,15 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "69b06dea-5189-40e7-83c4-bcc8baac5b91" + "outputId": "01d444cf-17f6-40c1-f5af-e24db219e581" }, - "execution_count": 77, + "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "Matched Chunk ID: 6rWk648BZDaSvPZ6OJE_, Chunk Number: 3, Text:\n", + "Matched Chunk ID: rV8Y5Y8BQsZxvNJ9cO4t, Chunk Number: 3, Text:\n", "t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", @@ -1276,17 +1303,16 @@ "------------------------\n", "\n", "\n", - "Text from Chunk 2: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. 
even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", + "Text from Chunk 2: ##wrap the broomstick in private before their first class, but halfway across the entrance hall they found the way upstairs barred by crabbe and goyle. malfoy seized the package from harry and felt\n", + "it. \" that's a broomstick, \" he said, throwing it back to harry with a mixture of jealousy and spite on his face. \" you'll be in for it this time, potter, first years aren't allowed them. 
\" ron\n", + "couldn't resist it. \" it's not any old broomstick, \" he said, \" it's a nimbus two thousand. what did you say you've got at home, malfoy, a comet two sixty? \" ron grinned at harry. \" comets look\n", + "flashy, but they're not in the same league as the nimbus. \" \" what would you know about it, weasley, you couldn't afford half the handle, \" malfoy snapped back. \" i suppose you and your brothers have\n", + "to save up twig by twig. \" before ron could answer, professor flitwick appeared at malfoy's elbow. \" not arguing, i hope, boys? \" he squeaked. \" potter's been sent a broomstick, professor, \" said\n", + "malfoy quickly. \" yes, yes, that's right, \" said professor flitwick, beaming at harry. \" professor mcgonagall told me all about the special circumstances, potter. and what model is it? \" \" a nimbus\n", + "two thousand, sit, \" said harry, fighting not to laugh at the look of horror on malfoy's face. \" and it's really thanks to malfoy here that i've got it, \" he added. harry and ron headed upstairs,\n", + "smothering their laughter at malfoy's obvious rage and confusion. \" well, it's true, \" harry chortled as they reached the top of the marble staircase, \" if he hadn't stolen neville's remembrall i\n", + "wouln't be on the team.... \" \" so i suppose you think that's a reward for breaking rules? \" came an angry voice from just behind them. hermione was stomping up the stairs, looking disapprovingly at\n", + "the package in harry's hand. \" i thought you weren '\n", "\n", "\n", "Text from Chunk 3: t speaking to us? \" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", @@ -1302,17 +1328,16 @@ "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n", "\n", "\n", - "Text from Chunk 4: t speaking to us? 
\" said harry. \" yes, don't stop now, \" said ron, \" it's doing us so much good. \" hermione marched away with her nose in the air. harry had a lot of trouble keeping his mind on his\n", - "lessons that day. it kept wandering up to the dormitory where his new broomstick was lying under his bed, or straying off to the quidditch field where he'd be learning to play that night. he bolted\n", - "his dinner that evening without noticing what he was eating, and then rushed upstairs with ron to unwrap the nimbus two thousand at last. \" wow, \" ron sighed, as the broomstick rolled onto harry's\n", - "bedspread. even harry, who knew nothing about the different brooms, thought it looked wonderful. sleek and shiny, with a mahogany handle, it had a long tail of neat, straight twigs and nimbus two\n", - "thousand written in gold near the top. as seven o'clock drew nearer, harry left the castle and set off in the dusk toward the quidditch field. held never been inside the stadium before. hundreds of\n", - "seats were raised in stands around the field so that the spectators were high enough to see what was going on. at either end of the field were three golden poles with hoops on the end. they reminded\n", - "harry of the little plastic sticks muggle children blew bubbles through, except that they were fifty feet high. too eager to fly again to wait for wood, harry mounted his broomstick and kicked off\n", - "from the ground. what a feeling - - he swooped in and out of the goal posts and then sped up and down the field. the nimbus two thousand turned wherever he wanted at his lightest touch. \" hey, potter,\n", - "come down!'oliver wood had arrived. fie was carrying a large wooden crate under his arm. harry landed next to him. \" very nice, \" said wood, his eyes glinting. \" i see what mcgonagall meant... you\n", - "really are a natural. i'm just going to teach you the rules this evening, then you'll be joining team practice three times a week. \" he opened the crate. 
inside were four different - sized balls. \"\n", - "right, \" said wood. \" now, quidditch is easy enough to understand, even if it's not too easy to play. there are seven players on each side.\n" + "Text from Chunk 4: three of them are called chasers. \" \" three chasers, \" harry repeated, as wood took out a bright red ball about the size of a soccer ball. \" this ball's called the quaffle, \" said wood. \" the chasers\n", + "throw the quaffle to each other and try and get it through one of the hoops to score a goal. ten points every time the quaffle goes through one of the hoops. follow me? \" \" the chasers throw the\n", + "quaffle and put it through the hoops to score, \" harry recited. \" so - - that's sort of like basketball on broomsticks with six hoops, isn't it? \" \" what's basketball? \" said wood curiously. \" never\n", + "mind, \" said harry quickly. \" now, there's another player on each side who's called the keeper - i'm keeper for gryffindor. i have to fly around our hoops and stop the other team from scoring. \" \"\n", + "three chasers, one keeper, \" said harry, who was determined to remember it all. \" and they play with the quaffle. okay, got that. so what are they for? \" he pointed at the three balls left inside the\n", + "box. \" i'll show you now, \" said wood. \" take this. \" he handed harry a small club, a bit like a short baseball bat. \" i'm going to show you what the bludgers do, \" wood said. \" these two are the\n", + "bludgers. \" he showed harry two identical balls, jet black and slightly smaller than the red quaffle. harry noticed that they seemed to be straining to escape the straps holding them inside the box. \"\n", + "stand back, \" wood warned harry. he bent down and freed one of the bludgers. at once, the black ball rose high in the air and then pelted straight at harry's face. 
harry swung at it with the bat to\n", + "stop it from breaking his nose, and sent it zigzagging away into the air - - it zoomed around their heads and then shot at wood, who dived on top of it and managed to pin it to the ground. \" see? \"\n", + "wood panted, forcing the struggling bludger back into the crate and strapping it down safely. \" the bludgers rocket around, trying to knock players off their\n" ] } ] From 22d6cf539b63faf9c1ab3152120ea4315d8a1c0b Mon Sep 17 00:00:00 2001 From: Sunile Manjee Date: Mon, 10 Jun 2024 14:58:15 -0500 Subject: [PATCH 17/17] updated Remove warnings from output --- .../fetch-surrounding-chunks.ipynb | 137 ++---------------- 1 file changed, 10 insertions(+), 127 deletions(-) diff --git a/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb b/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb index 96dd84e2..9df7202a 100644 --- a/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb +++ b/supporting-blog-content/fetch-surrounding-chunks/fetch-surrounding-chunks.ipynb @@ -39,53 +39,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { - "id": "nXuL8wsQNq8G", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "f7de69ed-828c-485c-8411-78b404515239" + "id": "nXuL8wsQNq8G" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: elasticsearch==8.13.2 in /usr/local/lib/python3.10/dist-packages (8.13.2)\n", - "Requirement already satisfied: elastic-transport<9,>=8.13 in /usr/local/lib/python3.10/dist-packages (from elasticsearch==8.13.2) (8.13.0)\n", - "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch==8.13.2) (2.0.7)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from 
elastic-transport<9,>=8.13->elasticsearch==8.13.2) (2024.2.2)\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0mRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)\n", - "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.23.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0mRequirement already satisfied: eland in /usr/local/lib/python3.10/dist-packages (8.13.1)\n", - "Requirement already satisfied: elasticsearch<9,>=8.3 in /usr/local/lib/python3.10/dist-packages (from eland) (8.13.2)\n", - "Requirement already satisfied: pandas<2,>=1.5 in /usr/local/lib/python3.10/dist-packages (from eland) (1.5.3)\n", - "Requirement already satisfied: matplotlib>=3.6 in /usr/local/lib/python3.10/dist-packages (from eland) (3.7.1)\n", - "Requirement already satisfied: numpy<2,>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from eland) (1.23.5)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from eland) (23.2)\n", - "Requirement already satisfied: elastic-transport<9,>=8.13 in /usr/local/lib/python3.10/dist-packages (from elasticsearch<9,>=8.3->eland) (8.13.0)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.2.0)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (4.47.2)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (1.4.5)\n", - "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (9.4.0)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (3.1.1)\n", - "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.6->eland) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in 
/usr/local/lib/python3.10/dist-packages (from pandas<2,>=1.5->eland) (2023.4)\n", - "Requirement already satisfied: urllib3<3,>=1.26.2 in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch<9,>=8.3->eland) (2.0.7)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from elastic-transport<9,>=8.13->elasticsearch<9,>=8.3->eland) (2024.2.2)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib>=3.6->eland) (1.16.0)\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], + "outputs": [], "source": [ "!pip install elasticsearch==8.13.2\n", "!pip install pandas\n", @@ -176,42 +134,10 @@ "print(esclient.info())" ], "metadata": { - "id": "LGQAjG6PERfx", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "56ab2605-5bf2-4bb0-a7b1-5916b9e3d0ab" + "id": "LGQAjG6PERfx" }, - "execution_count": 3, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Elastic Cloud ID: ··········\n", - "Elastic Api Key: ··········\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", - "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", - "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", - "You will be able to reuse this secret in all of your notebooks.\n", - "Please note that authentication is recommended but still optional to access public models or datasets.\n", - " warnings.warn(\n" - ] - }, - { - 
"output_type": "stream", - "name": "stdout", - "text": [ - "{'name': 'instance-0000000008', 'cluster_name': 'ccb3490b2f684a1bad1e54a3de285244', 'cluster_uuid': 'P63_NMvcQmaOmnsL4ZkBxQ', 'version': {'number': '8.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'da95df118650b55a500dcc181889ac35c6d8da7c', 'build_date': '2024-05-06T22:04:45.107454559Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -239,53 +165,10 @@ "print(resp)" ], "metadata": { - "id": "4NH8JJkQJDit", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 595 - }, - "outputId": "fdd46986-c5e6-43ff-9f86-4a685e8b3f0e" + "id": "4NH8JJkQJDit" }, "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "/usr/local/lib/python3.10/dist-packages/eland/ml/_optional.py:116: UserWarning: Eland requires version '1.3' or newer of 'sklearn' (version '1.2.2' currently installed). Use pip or conda to update sklearn.\n", - " warnings.warn(msg, UserWarning)\n", - "2024-06-06 05:29:18,918 INFO : Establishing connection to Elasticsearch\n", - "2024-06-06 05:29:18,982 INFO : Connected to cluster named '951b9d7d79064735b681a5a2d7921825' (version: 8.14.0)\n", - "2024-06-06 05:29:18,982 INFO : Loading HuggingFace transformer tokenizer and model 'sentence-transformers/all-MiniLM-L6-v2'\n", - "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", - " warnings.warn(\n", - "STAGE:2024-06-06 05:29:19 656714:656714 ActivityProfilerController.cpp:312] Completed Stage: Warm Up\n", - "STAGE:2024-06-06 05:29:19 656714:656714 ActivityProfilerController.cpp:318] Completed Stage: Collection\n", - "STAGE:2024-06-06 05:29:19 656714:656714 ActivityProfilerController.cpp:322] Completed Stage: Post Processing\n", - "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", - " warnings.warn(\n", - "2024-06-06 05:29:21,155 INFO : Stopping deployment for model with id 'sentence-transformers__all-minilm-l6-v2'\n", - "2024-06-06 05:29:21,307 INFO : Deleting model with id 'sentence-transformers__all-minilm-l6-v2'\n", - "2024-06-06 05:29:21,496 INFO : Creating model with id 'sentence-transformers__all-minilm-l6-v2'\n", - "2024-06-06 05:29:21,522 INFO : Uploading model definition\n", - "100% 87/87 [00:15<00:00, 5.68 parts/s]\n", - "2024-06-06 05:29:36,852 INFO : Uploading model vocabulary\n", - "2024-06-06 05:29:37,035 INFO : Starting model deployment\n", - "2024-06-06 05:29:39,224 INFO : Model successfully imported with id 'sentence-transformers__all-minilm-l6-v2'\n" - ] - }, - { - "output_type": "error", - "ename": "AttributeError", - "evalue": "'MlClient' object has no attribute 'update_trained_model_deployment'", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'eland_import_hub_model --cloud-id 
$ELASTIC_CLOUD_ID --es-model-id {dense_embedding_model_id} --hub-model-id {dense_huggingface_model_id} --task-type text_embedding --es-api-key $ELASTIC_API_KEY --start --clear-previous'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m resp = esclient.ml.update_trained_model_deployment(\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mmodel_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdense_embedding_model_id\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"number_of_allocations\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdense_model_number_of_allocators\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m )\n", - "\u001b[0;31mAttributeError\u001b[0m: 'MlClient' object has no attribute 'update_trained_model_deployment'" - ] - } - ] + "outputs": [] }, { "cell_type": "markdown", @@ -763,7 +646,7 @@ "metadata": { "id": "xB2a9-qtONbQ" }, - "execution_count": 5, + "execution_count": null, "outputs": [] }, { @@ -1279,7 +1162,7 @@ }, "outputId": "01d444cf-17f6-40c1-f5af-e24db219e581" }, - "execution_count": 6, + "execution_count": null, "outputs": [ { "output_type": "stream",