{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4", "authorship_tag": "ABX9TyMohoDhmmKsuh9OLDHor3GB", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "c281b60e104f4c5da547bbdd7208d4bc": { "model_module": "@jupyter-widgets/controls", "model_name": "VBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [ "IPY_MODEL_2e2fabac70484c1c8b16fa6ca8fd8537", "IPY_MODEL_bf53c635fa374420ad850eea22cd1e31", "IPY_MODEL_065d59126a734c1aa096ba40cd4a129f", "IPY_MODEL_e8855d5678a342f5a33171aa74d3b7bc" ], "layout": "IPY_MODEL_1c8a6b959f9c4443a92f58eff1b03077" } }, "74b084c97f6f46d293a197bf9804460c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_9fb5726f91734b1da149784680dc9624", "placeholder": "​", "style": "IPY_MODEL_202a8eb11eda4e58942113fbeacfdc3d", "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" } }, "1409574c4f9742e7a711965dd2c8ad87": { "model_module": "@jupyter-widgets/controls", "model_name": "PasswordModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "PasswordModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "PasswordView", "continuous_update": true, "description": "Token:", "description_tooltip": null, "disabled": false, "layout": "IPY_MODEL_970d4d3daf854f92bd650dc4da99e1bc", "placeholder": "​", "style": "IPY_MODEL_24b1e007921046b1adc61db0f2bf9fc7", "value": "" } }, "704ecf9409244e0b93612d6a11476346": { "model_module": "@jupyter-widgets/controls", "model_name": "CheckboxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "CheckboxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "CheckboxView", "description": "Add token as git credential?", "description_tooltip": null, "disabled": false, "indent": true, "layout": "IPY_MODEL_24d3d72f5de54de8a1ded4e528dde332", "style": "IPY_MODEL_e90cb0ce526a4556bc643ba6c5485661", "value": true } }, "b1a8d3a9a379415393d9e7d995a40788": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ButtonView", "button_style": "", "description": "Login", "disabled": false, "icon": "", "layout": "IPY_MODEL_76e7372656b745c889b9283b76c04148", "style": "IPY_MODEL_ce0204c7e1ff4a51b2648284a2492262", "tooltip": "" } }, "f928772f92724579b068e984d9eef387": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6dbb8e8a5ebb40a4ba910b09dde27e1a", "placeholder": "​", "style": "IPY_MODEL_7944af54f2564920822d5d4b348896c4", "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. 
" } }, "1c8a6b959f9c4443a92f58eff1b03077": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "9fb5726f91734b1da149784680dc9624": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "202a8eb11eda4e58942113fbeacfdc3d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "970d4d3daf854f92bd650dc4da99e1bc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, 
"justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "24b1e007921046b1adc61db0f2bf9fc7": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "24d3d72f5de54de8a1ded4e528dde332": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e90cb0ce526a4556bc643ba6c5485661": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "76e7372656b745c889b9283b76c04148": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ce0204c7e1ff4a51b2648284a2492262": { "model_module": 
"@jupyter-widgets/controls", "model_name": "ButtonStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "button_color": null, "font_weight": "" } }, "6dbb8e8a5ebb40a4ba910b09dde27e1a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7944af54f2564920822d5d4b348896c4": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "1b55372f62494ca0baabf87f7e7f4ba8": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_bf612001ad354ea19de6ee45a166a43c", "placeholder": "​", "style": "IPY_MODEL_a8e4691970b14955bfb4865bcef5e912", "value": "Connecting..." 
} }, "bf612001ad354ea19de6ee45a166a43c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a8e4691970b14955bfb4865bcef5e912": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "2e2fabac70484c1c8b16fa6ca8fd8537": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7eb6de1a979b46f7b234724073f8bc3a", "placeholder": "​", "style": "IPY_MODEL_6ae4640196da492fadafeb63f4bc89d2", "value": "Token is valid (permission: write)." } }, "bf53c635fa374420ad850eea22cd1e31": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_cef83433dbea4f529f43722fe78a8baf", "placeholder": "​", "style": "IPY_MODEL_845ba8115d5140ac9ee22af4a9e6a03b", "value": "Your token has been saved in your configured git credential helpers (store)." 
} }, "065d59126a734c1aa096ba40cd4a129f": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_cdd888041aca4dcf8adc785309071fc6", "placeholder": "​", "style": "IPY_MODEL_cf63214cb4f8442999fa5b971035fe4f", "value": "Your token has been saved to /root/.cache/huggingface/token" } }, "e8855d5678a342f5a33171aa74d3b7bc": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7d9b22f2b7fe4a749f989e247bce446a", "placeholder": "​", "style": "IPY_MODEL_7f8e268db8144adfb09d089784d8411a", "value": "Login successful" } }, "7eb6de1a979b46f7b234724073f8bc3a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6ae4640196da492fadafeb63f4bc89d2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "cef83433dbea4f529f43722fe78a8baf": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, 
"grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "845ba8115d5140ac9ee22af4a9e6a03b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "cdd888041aca4dcf8adc785309071fc6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cf63214cb4f8442999fa5b971035fe4f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "7d9b22f2b7fe4a749f989e247bce446a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, 
"right": null, "top": null, "visibility": null, "width": null } }, "7f8e268db8144adfb09d089784d8411a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "# Quantize Llama 2 models using GGUF and llama.cpp\n", "> 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n", "\n", "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n", "\n", "## Usage\n", "\n", "* `MODEL_ID`: The ID of the model to quantize (e.g., `mlabonne/EvolCodeLlama-7b`).\n", "* `QUANTIZATION_METHOD`: The quantization method to use.\n", "\n", "## Quantization methods\n", "\n", "The names of the quantization methods follow the naming convention: \"q\" + the number of bits + the variant used (detailed below). Here is a list of all the possible quant methods and their corresponding use cases, based on model cards made by [TheBloke](https://huggingface.co/TheBloke/):\n", "\n", "* `q2_k`: Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.\n", "* `q3_k_l`: Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K\n", "* `q3_k_m`: Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K\n", "* `q3_k_s`: Uses Q3_K for all tensors\n", "* `q4_0`: Original quant method, 4-bit.\n", "* `q4_1`: Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.\n", "* `q4_k_m`: Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K\n", "* `q4_k_s`: Uses Q4_K for all tensors\n", "* `q5_0`: Higher accuracy, higher resource usage and slower inference.\n", "* `q5_1`: Even higher accuracy, resource usage and slower inference.\n", "* `q5_k_m`: Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K\n", "* `q5_k_s`: Uses Q5_K for all tensors\n", "* `q6_k`: Uses Q8_K for all tensors\n", "* `q8_0`: Almost indistinguishable from float16. High resource use and slow. Not recommended for most users.\n", "\n", "As a rule of thumb, **I recommend using Q5_K_M** as it preserves most of the model's performance. Alternatively, you can use Q4_K_M if you want to save some memory. In general, K_M versions are better than K_S versions. I cannot recommend Q2_K or Q3_* versions, as they drastically decrease model performance." 
], "metadata": { "id": "8y_Rk94LzG7I" } }, { "cell_type": "code", "source": [ "# Variables\n", "MODEL_ID = \"mlabonne/EvolCodeLlama-7b\"\n", "QUANTIZATION_METHODS = [\"q4_k_m\", \"q5_k_m\"]\n", "\n", "# Constants\n", "MODEL_NAME = MODEL_ID.split('/')[-1]\n", "\n", "# Install llama.cpp\n", "!git clone https://github.com/ggerganov/llama.cpp\n", "!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make\n", "!pip install -r llama.cpp/requirements.txt\n", "\n", "# Download model\n", "!git lfs install\n", "!git clone https://huggingface.co/{MODEL_ID}\n", "\n", "# Convert to fp16\n", "fp16 = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin\"\n", "!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}\n", "\n", "# Quantize the model for each method in the QUANTIZATION_METHODS list\n", "for method in QUANTIZATION_METHODS:\n", " qtype = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf\"\n", " !./llama.cpp/quantize {fp16} {qtype} {method}" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fD24jJxq7t3k", "outputId": "94954934-0829-44e9-a5e5-262c17e162d0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "ggml_init_cublas: found 1 CUDA devices:\n", " Device 0: Tesla T4, compute capability 7.5\n", "main: build = 1100 (dd0dc36)\n", "main: quantizing 'EvolCodeLlama-7b/evolcodellama-7b.gguf.fp16.bin' to 'EvolCodeLlama-7b/evolcodellama-7b.gguf.q4_k_s.bin' as Q4_K_S\n", "llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from EvolCodeLlama-7b/evolcodellama-7b.gguf.fp16.bin (version GGUF V1 (support until nov 2023))\n", "llama_model_loader: - tensor 0: token_embd.weight f16 [ 4096, 32016, 1, 1 ]\n", "llama_model_loader: - tensor 1: blk.0.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 2: blk.0.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 3: blk.0.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 5: blk.0.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 6: blk.0.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 7: blk.0.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 10: blk.1.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 11: blk.1.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 12: blk.1.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 13: blk.1.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 14: blk.1.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 15: blk.1.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 16: blk.1.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 19: blk.2.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 20: blk.2.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 21: blk.2.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 22: blk.2.attn_output.weight f16 
[ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 23: blk.2.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 24: blk.2.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 25: blk.2.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 28: blk.3.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 29: blk.3.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 30: blk.3.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 31: blk.3.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 32: blk.3.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 33: blk.3.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 34: blk.3.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 37: blk.4.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 38: blk.4.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 39: blk.4.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 40: blk.4.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 41: blk.4.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 42: blk.4.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 43: blk.4.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 46: blk.5.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 47: blk.5.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 48: blk.5.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 49: blk.5.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 50: blk.5.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 51: blk.5.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 52: blk.5.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 55: blk.6.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 56: blk.6.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 57: blk.6.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 58: blk.6.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 59: blk.6.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 60: blk.6.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 61: blk.6.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 64: blk.7.attn_q.weight f16 [ 4096, 4096, 1, 1 
]\n", "llama_model_loader: - tensor 65: blk.7.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 66: blk.7.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 67: blk.7.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 68: blk.7.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 69: blk.7.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 70: blk.7.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 73: blk.8.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 74: blk.8.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 75: blk.8.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 76: blk.8.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 77: blk.8.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 78: blk.8.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 79: blk.8.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 82: blk.9.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 83: blk.9.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 84: blk.9.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 85: blk.9.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 86: blk.9.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 87: blk.9.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 88: blk.9.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 91: blk.10.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 92: blk.10.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 93: blk.10.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 94: blk.10.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 95: blk.10.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 96: blk.10.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 97: blk.10.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 100: blk.11.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 101: blk.11.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 102: blk.11.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 103: blk.11.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 104: blk.11.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 105: blk.11.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 106: blk.11.ffn_down.weight f16 [ 11008, 
4096, 1, 1 ]\n", "llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 109: blk.12.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 110: blk.12.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 111: blk.12.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 112: blk.12.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 113: blk.12.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 114: blk.12.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 115: blk.12.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 118: blk.13.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 119: blk.13.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 120: blk.13.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 121: blk.13.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 122: blk.13.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 123: blk.13.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 124: blk.13.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 127: blk.14.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 128: blk.14.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 129: blk.14.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 130: blk.14.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 131: blk.14.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 132: blk.14.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 133: blk.14.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 136: blk.15.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 137: blk.15.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 138: blk.15.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 139: blk.15.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 140: blk.15.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 141: blk.15.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 142: blk.15.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 145: blk.16.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 146: blk.16.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 147: blk.16.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", 
"llama_model_loader: - tensor 148: blk.16.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 149: blk.16.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 150: blk.16.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 151: blk.16.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 154: blk.17.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 155: blk.17.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 156: blk.17.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 157: blk.17.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 158: blk.17.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 159: blk.17.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 160: blk.17.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 163: blk.18.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 164: blk.18.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 165: blk.18.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 166: blk.18.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 167: blk.18.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 168: blk.18.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 169: blk.18.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 172: blk.19.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 173: blk.19.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 174: blk.19.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 175: blk.19.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 176: blk.19.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 177: blk.19.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 178: blk.19.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 181: blk.20.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 182: blk.20.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 183: blk.20.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 184: blk.20.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 185: blk.20.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 186: blk.20.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 187: blk.20.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", 
"llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 190: blk.21.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 191: blk.21.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 192: blk.21.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 193: blk.21.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 194: blk.21.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 195: blk.21.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 196: blk.21.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 199: blk.22.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 200: blk.22.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 201: blk.22.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 202: blk.22.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 203: blk.22.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 204: blk.22.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 205: blk.22.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 208: blk.23.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 209: blk.23.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 210: blk.23.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 211: blk.23.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 212: blk.23.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 213: blk.23.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 214: blk.23.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 217: blk.24.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 218: blk.24.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 219: blk.24.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 220: blk.24.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 221: blk.24.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 222: blk.24.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 223: blk.24.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 226: blk.25.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 227: blk.25.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 228: blk.25.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 229: blk.25.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: 
- tensor 230: blk.25.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 231: blk.25.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 232: blk.25.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 235: blk.26.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 236: blk.26.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 237: blk.26.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 238: blk.26.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 239: blk.26.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 240: blk.26.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 241: blk.26.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 244: blk.27.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 245: blk.27.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 246: blk.27.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 247: blk.27.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 248: blk.27.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 249: blk.27.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 250: blk.27.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 253: blk.28.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 254: blk.28.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 255: blk.28.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 256: blk.28.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 257: blk.28.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 258: blk.28.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 259: blk.28.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 262: blk.29.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 263: blk.29.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 264: blk.29.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 265: blk.29.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 266: blk.29.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 267: blk.29.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 268: blk.29.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 271: 
blk.30.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 272: blk.30.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 273: blk.30.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 274: blk.30.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 275: blk.30.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 276: blk.30.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 277: blk.30.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 280: blk.31.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 281: blk.31.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 282: blk.31.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 283: blk.31.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 284: blk.31.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 285: blk.31.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 286: blk.31.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 290: output.weight f16 [ 4096, 32016, 1, 1 ]\n", "llama_model_loader: - kv 0: general.architecture str \n", "llama_model_loader: - kv 1: general.name str \n", "llama_model_loader: - kv 2: llama.context_length u32 \n", "llama_model_loader: - kv 3: llama.embedding_length u32 \n", "llama_model_loader: - kv 4: llama.block_count u32 \n", "llama_model_loader: - kv 5: llama.feed_forward_length u32 \n", "llama_model_loader: - kv 6: llama.rope.dimension_count u32 \n", "llama_model_loader: - kv 7: llama.attention.head_count u32 \n", "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 \n", "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 \n", "llama_model_loader: - kv 10: llama.rope.freq_base f32 \n", "llama_model_loader: - kv 11: general.file_type u32 \n", "llama_model_loader: - kv 12: tokenizer.ggml.model str \n", "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr \n", "llama_model_loader: - kv 14: tokenizer.ggml.scores arr \n", "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr \n", "llama_model_loader: - type f32: 65 tensors\n", "llama_model_loader: - type f16: 226 tensors\n", "llama_model_quantize_internal: meta size = 741408 bytes\n", "[ 1/ 291] token_embd.weight - [ 4096, 32016, 1, 1], type = f16, quantizing to q4_K .. size = 250.12 MB -> 70.35 MB | hist: \n", "[ 2/ 291] blk.0.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 3/ 291] blk.0.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 4/ 291] blk.0.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MB -> 11.00 MB | hist: \n", "[ 5/ 291] blk.0.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", "[ 6/ 291] blk.0.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 7/ 291] blk.0.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 8/ 291] blk.0.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MB -> 29.56 MB | hist: \n", "[ 9/ 291] blk.0.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 10/ 291] blk.0.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 11/ 291] blk.1.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 12/ 291] blk.1.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 13/ 291] blk.1.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MB -> 11.00 MB | hist: \n", "[ 14/ 291] blk.1.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 15/ 291] blk.1.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 16/ 291] blk.1.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 17/ 291] blk.1.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MB -> 29.56 MB | hist: \n", "[ 18/ 291] blk.1.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 19/ 291] blk.1.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 20/ 291] blk.2.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 21/ 291] blk.2.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 22/ 291] blk.2.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MB -> 11.00 MB | hist: \n", "[ 23/ 291] blk.2.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 24/ 291] blk.2.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 25/ 291] blk.2.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 26/ 291] blk.2.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MB -> 29.56 MB | hist: \n", "[ 27/ 291] blk.2.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 28/ 291] blk.2.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 29/ 291] blk.3.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 30/ 291] blk.3.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 31/ 291] blk.3.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MB -> 11.00 MB | hist: \n", "[ 32/ 291] blk.3.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 33/ 291] blk.3.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 34/ 291] blk.3.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", "[ 35/ 291] blk.3.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MB -> 29.56 MB | hist: \n", "[ 36/ 291] blk.3.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 37/ 291] blk.3.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 38/ 291] blk.4.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 39/ 291] blk.4.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 40/ 291] blk.4.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 41/ 291] blk.4.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 42/ 291] blk.4.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 43/ 291] blk.4.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 44/ 291] blk.4.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 45/ 291] blk.4.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 46/ 291] blk.4.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 47/ 291] blk.5.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 48/ 291] blk.5.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 49/ 291] blk.5.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 50/ 291] blk.5.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 51/ 291] blk.5.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 52/ 291] blk.5.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 53/ 291] blk.5.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 54/ 291] blk.5.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 55/ 291] blk.5.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 56/ 291] blk.6.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 57/ 291] blk.6.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 58/ 291] blk.6.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 59/ 291] blk.6.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 60/ 291] blk.6.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 61/ 291] blk.6.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 62/ 291] blk.6.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", "[ 63/ 291] blk.6.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 64/ 291] blk.6.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 65/ 291] blk.7.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 66/ 291] blk.7.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 67/ 291] blk.7.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 68/ 291] blk.7.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 69/ 291] blk.7.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 70/ 291] blk.7.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 71/ 291] blk.7.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 72/ 291] blk.7.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 73/ 291] blk.7.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 74/ 291] blk.8.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 75/ 291] blk.8.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 76/ 291] blk.8.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 77/ 291] blk.8.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 78/ 291] blk.8.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 79/ 291] blk.8.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 80/ 291] blk.8.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 81/ 291] blk.8.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 82/ 291] blk.8.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 83/ 291] blk.9.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 84/ 291] blk.9.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 85/ 291] blk.9.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 86/ 291] blk.9.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 87/ 291] blk.9.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 88/ 291] blk.9.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 89/ 291] blk.9.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", "[ 90/ 291] blk.9.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 91/ 291] blk.9.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 92/ 291] blk.10.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 93/ 291] blk.10.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 94/ 291] blk.10.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 95/ 291] blk.10.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 96/ 291] blk.10.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 97/ 291] blk.10.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 98/ 291] blk.10.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 99/ 291] blk.10.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 100/ 291] blk.10.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 101/ 291] blk.11.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 102/ 291] blk.11.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 103/ 291] blk.11.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 104/ 291] blk.11.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 105/ 291] blk.11.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 106/ 291] blk.11.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 107/ 291] blk.11.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 108/ 291] blk.11.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 109/ 291] blk.11.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 110/ 291] blk.12.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 111/ 291] blk.12.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 112/ 291] blk.12.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 113/ 291] blk.12.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 114/ 291] blk.12.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 115/ 291] blk.12.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 116/ 291] blk.12.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", "[ 117/ 291] blk.12.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 118/ 291] blk.12.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 119/ 291] blk.13.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 120/ 291] blk.13.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 121/ 291] blk.13.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 122/ 291] blk.13.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 123/ 291] blk.13.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 124/ 291] blk.13.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 125/ 291] blk.13.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 126/ 291] blk.13.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 127/ 291] blk.13.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 128/ 291] blk.14.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 129/ 291] blk.14.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 130/ 291] blk.14.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 131/ 291] blk.14.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 132/ 291] blk.14.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 133/ 291] blk.14.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 134/ 291] blk.14.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 135/ 291] blk.14.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 136/ 291] blk.14.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 137/ 291] blk.15.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 138/ 291] blk.15.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 139/ 291] blk.15.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 140/ 291] blk.15.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 141/ 291] blk.15.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 142/ 291] blk.15.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 143/ 291] blk.15.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", "[ 144/ 291] blk.15.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 145/ 291] blk.15.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 146/ 291] blk.16.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 147/ 291] blk.16.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 148/ 291] blk.16.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 149/ 291] blk.16.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 150/ 291] blk.16.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 151/ 291] blk.16.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 152/ 291] blk.16.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 153/ 291] blk.16.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 154/ 291] blk.16.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 155/ 291] blk.17.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 156/ 291] blk.17.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 157/ 291] blk.17.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 158/ 291] blk.17.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 159/ 291] blk.17.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 160/ 291] blk.17.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 161/ 291] blk.17.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 162/ 291] blk.17.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 163/ 291] blk.17.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 164/ 291] blk.18.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 165/ 291] blk.18.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 166/ 291] blk.18.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 167/ 291] blk.18.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 168/ 291] blk.18.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 169/ 291] blk.18.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 170/ 291] blk.18.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", "[ 171/ 291] blk.18.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 172/ 291] blk.18.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 173/ 291] blk.19.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 174/ 291] blk.19.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 175/ 291] blk.19.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 176/ 291] blk.19.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 177/ 291] blk.19.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 178/ 291] blk.19.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 179/ 291] blk.19.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 180/ 291] blk.19.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 181/ 291] blk.19.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 182/ 291] blk.20.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 183/ 291] blk.20.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 184/ 291] blk.20.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 185/ 291] blk.20.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 186/ 291] blk.20.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 187/ 291] blk.20.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 188/ 291] blk.20.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 189/ 291] blk.20.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 190/ 291] blk.20.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 191/ 291] blk.21.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 192/ 291] blk.21.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 193/ 291] blk.21.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 194/ 291] blk.21.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 195/ 291] blk.21.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 196/ 291] blk.21.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 197/ 291] blk.21.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", "[ 198/ 291] blk.21.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 199/ 291] blk.21.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 200/ 291] blk.22.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 201/ 291] blk.22.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 202/ 291] blk.22.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 203/ 291] blk.22.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 204/ 291] blk.22.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 205/ 291] blk.22.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 206/ 291] blk.22.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 207/ 291] blk.22.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 208/ 291] blk.22.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 209/ 291] blk.23.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 210/ 291] blk.23.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 211/ 291] blk.23.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 212/ 291] blk.23.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 213/ 291] blk.23.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 214/ 291] blk.23.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 215/ 291] blk.23.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 216/ 291] blk.23.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 217/ 291] blk.23.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 218/ 291] blk.24.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 219/ 291] blk.24.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 220/ 291] blk.24.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 221/ 291] blk.24.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 222/ 291] blk.24.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 223/ 291] blk.24.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 224/ 291] blk.24.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", "[ 225/ 291] blk.24.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 226/ 291] blk.24.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 227/ 291] blk.25.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 228/ 291] blk.25.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 229/ 291] blk.25.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 230/ 291] blk.25.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 231/ 291] blk.25.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 232/ 291] blk.25.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 233/ 291] blk.25.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 234/ 291] blk.25.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 235/ 291] blk.25.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 236/ 291] blk.26.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 237/ 291] blk.26.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 238/ 291] blk.26.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 239/ 291] blk.26.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 240/ 291] blk.26.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 241/ 291] blk.26.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 242/ 291] blk.26.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 243/ 291] blk.26.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 244/ 291] blk.26.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 245/ 291] blk.27.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 246/ 291] blk.27.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 247/ 291] blk.27.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 248/ 291] blk.27.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 249/ 291] blk.27.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 250/ 291] blk.27.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 251/ 291] blk.27.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", "[ 252/ 291] blk.27.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 253/ 291] blk.27.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 254/ 291] blk.28.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 255/ 291] blk.28.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 256/ 291] blk.28.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 257/ 291] blk.28.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 258/ 291] blk.28.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 259/ 291] blk.28.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 260/ 291] blk.28.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 261/ 291] blk.28.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 262/ 291] blk.28.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 263/ 291] blk.29.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 264/ 291] blk.29.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 265/ 291] blk.29.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 266/ 291] blk.29.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 267/ 291] blk.29.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 268/ 291] blk.29.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 269/ 291] blk.29.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 270/ 291] blk.29.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 271/ 291] blk.29.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 272/ 291] blk.30.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 273/ 291] blk.30.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 274/ 291] blk.30.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 275/ 291] blk.30.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 276/ 291] blk.30.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 277/ 291] blk.30.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 278/ 291] blk.30.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", "[ 279/ 291] blk.30.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 280/ 291] blk.30.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 281/ 291] blk.31.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 282/ 291] blk.31.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 283/ 291] blk.31.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 284/ 291] blk.31.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", "[ 285/ 291] blk.31.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 286/ 291] blk.31.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 287/ 291] blk.31.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", "[ 288/ 291] blk.31.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 289/ 291] blk.31.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 290/ 291] output_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", "[ 291/ 291] output.weight - [ 4096, 32016, 1, 1], type = f16, quantizing to q6_K .. size = 250.12 MB -> 102.59 MB | hist: \n", "llama_model_quantize_internal: model size = 12853.27 MB\n", "llama_model_quantize_internal: quant size = 3677.45 MB\n", "\n", "main: quantize time = 1089230.46 ms\n", "main: total time = 1089230.46 ms\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Run inference\n", "\n", "Here is a simple script to run your quantized models. I'm offloading every layer (35 for a 7B parameter model) to the GPU to speed up inference."
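, "\n", "\n", "If you'd rather call the model directly from Python, the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) bindings wrap the same backend. A minimal sketch, assuming you `pip install llama-cpp-python` with CUDA support and reuse the `qtype` and `prompt` variables from the script below:\n", "\n", "```python\n", "from llama_cpp import Llama\n", "\n", "# Load the quantized GGUF file, offloading all layers to the GPU (the -ngl 35 equivalent)\n", "llm = Llama(model_path=qtype, n_gpu_layers=35)\n", "\n", "# Generate up to 128 tokens, mirroring -n 128 in the CLI call\n", "output = llm(prompt, max_tokens=128, echo=True)\n", "print(output[\"choices\"][0][\"text\"])\n", "```"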
], "metadata": { "id": "WqI1CPiXI4dP" } }, { "cell_type": "code", "source": [ "import os\n", "\n", "model_list = [file for file in os.listdir(MODEL_NAME) if \"gguf\" in file]\n", "\n", "prompt = input(\"Enter your prompt: \")\n", "chosen_method = input(\"Name of the model (options: \" + \", \".join(model_list) + \"): \")\n", "\n", "# Verify the chosen method is in the list\n", "if chosen_method not in model_list:\n", " print(\"Invalid name\")\n", "else:\n", " qtype = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf\"\n", " !./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p \"{prompt}\"" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vNPL9WYg78l-", "outputId": "3c3e7d2f-f0de-429d-fd97-dab480bc514a" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Enter your prompt: prompt\n", "Please specify the quantization method to run the model (options: q4_k_s): q4_k_s\n", "main: build = 1100 (dd0dc36)\n", "main: seed = 1693227123\n", "ggml_init_cublas: found 1 CUDA devices:\n", " Device 0: Tesla T4, compute capability 7.5\n", "llama_model_loader: loaded meta data with 17 key-value pairs and 291 tensors from EvolCodeLlama-7b/evolcodellama-7b.gguf.q4_k_s.bin (version GGUF V2 (latest))\n", "llama_model_loader: - tensor 0: token_embd.weight q4_K [ 4096, 32016, 1, 1 ]\n", "llama_model_loader: - tensor 1: blk.0.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 2: blk.0.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 3: blk.0.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 4: blk.0.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 7: blk.0.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 10: blk.1.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 11: blk.1.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 12: blk.1.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 13: blk.1.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 14: blk.1.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 15: blk.1.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 16: blk.1.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 19: blk.2.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 20: blk.2.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 21: blk.2.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 22: blk.2.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 23: blk.2.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 24: blk.2.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 25: blk.2.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 
4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 28: blk.3.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 29: blk.3.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 30: blk.3.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 31: blk.3.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 32: blk.3.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 33: blk.3.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 34: blk.3.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 37: blk.4.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 38: blk.4.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 39: blk.4.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 40: blk.4.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 41: blk.4.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 42: blk.4.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 43: blk.4.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 46: blk.5.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 47: blk.5.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 48: blk.5.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 49: blk.5.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 50: blk.5.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 51: blk.5.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 52: blk.5.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 55: blk.6.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 56: blk.6.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 57: blk.6.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 58: blk.6.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 59: blk.6.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 60: blk.6.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 61: blk.6.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 64: blk.7.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 65: blk.7.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 66: blk.7.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 67: blk.7.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 68: 
blk.7.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 69: blk.7.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 70: blk.7.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 73: blk.8.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 74: blk.8.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 75: blk.8.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 76: blk.8.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 77: blk.8.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 78: blk.8.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 79: blk.8.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 82: blk.9.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 83: blk.9.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 84: blk.9.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 85: blk.9.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 86: blk.9.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 87: blk.9.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 88: blk.9.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 91: blk.10.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 92: blk.10.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 93: blk.10.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 94: blk.10.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 95: blk.10.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 96: blk.10.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 97: blk.10.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 100: blk.11.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 101: blk.11.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 102: blk.11.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 103: blk.11.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 104: blk.11.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 105: blk.11.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 106: blk.11.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 109: blk.12.attn_q.weight q4_K [ 4096, 4096, 1, 
1 ]\n", "llama_model_loader: - tensor 110: blk.12.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 111: blk.12.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 112: blk.12.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 113: blk.12.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 114: blk.12.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 115: blk.12.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 118: blk.13.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 119: blk.13.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 120: blk.13.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 121: blk.13.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 122: blk.13.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 123: blk.13.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 124: blk.13.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 127: blk.14.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 128: blk.14.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 129: blk.14.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 130: blk.14.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 131: blk.14.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 132: blk.14.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 133: blk.14.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 136: blk.15.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 137: blk.15.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 138: blk.15.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 139: blk.15.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 140: blk.15.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 141: blk.15.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 142: blk.15.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 145: blk.16.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 146: blk.16.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 147: blk.16.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 148: blk.16.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 149: blk.16.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 150: blk.16.ffn_up.weight q4_K [ 
4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 151: blk.16.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 154: blk.17.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 155: blk.17.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 156: blk.17.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 157: blk.17.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 158: blk.17.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 159: blk.17.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 160: blk.17.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 163: blk.18.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 164: blk.18.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 165: blk.18.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 166: blk.18.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 167: blk.18.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 168: blk.18.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 169: blk.18.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 172: blk.19.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 173: blk.19.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 174: blk.19.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 175: blk.19.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 176: blk.19.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 177: blk.19.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 178: blk.19.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 181: blk.20.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 182: blk.20.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 183: blk.20.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 184: blk.20.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 185: blk.20.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 186: blk.20.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 187: blk.20.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 190: blk.21.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 191: 
blk.21.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 192: blk.21.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 193: blk.21.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 194: blk.21.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 195: blk.21.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 196: blk.21.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 199: blk.22.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 200: blk.22.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 201: blk.22.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 202: blk.22.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 203: blk.22.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 204: blk.22.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 205: blk.22.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 208: blk.23.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 209: blk.23.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 210: blk.23.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 211: blk.23.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 212: blk.23.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 213: blk.23.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 214: blk.23.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 217: blk.24.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 218: blk.24.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 219: blk.24.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 220: blk.24.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 221: blk.24.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 222: blk.24.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 223: blk.24.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 226: blk.25.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 227: blk.25.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 228: blk.25.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 229: blk.25.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 230: blk.25.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 231: blk.25.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", 
"llama_model_loader: - tensor 232: blk.25.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 235: blk.26.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 236: blk.26.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 237: blk.26.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 238: blk.26.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 239: blk.26.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 240: blk.26.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 241: blk.26.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 244: blk.27.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 245: blk.27.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 246: blk.27.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 247: blk.27.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 248: blk.27.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 249: blk.27.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 250: blk.27.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 253: blk.28.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 254: blk.28.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 255: blk.28.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 256: blk.28.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 257: blk.28.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 258: blk.28.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 259: blk.28.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 262: blk.29.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 263: blk.29.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 264: blk.29.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 265: blk.29.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 266: blk.29.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 267: blk.29.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 268: blk.29.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 271: blk.30.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 272: blk.30.attn_k.weight q4_K [ 4096, 4096, 1, 
1 ]\n", "llama_model_loader: - tensor 273: blk.30.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 275: blk.30.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 276: blk.30.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 277: blk.30.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 280: blk.31.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 281: blk.31.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 282: blk.31.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 283: blk.31.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 284: blk.31.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 285: blk.31.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", "llama_model_loader: - tensor 286: blk.31.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", "llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n", "llama_model_loader: - tensor 290: output.weight q6_K [ 4096, 32016, 1, 1 ]\n", "llama_model_loader: - kv 0: general.architecture str \n", "llama_model_loader: - kv 1: general.name str \n", "llama_model_loader: - kv 2: llama.context_length u32 \n", "llama_model_loader: - kv 3: llama.embedding_length u32 \n", "llama_model_loader: - kv 4: llama.block_count u32 \n", "llama_model_loader: - kv 5: llama.feed_forward_length u32 \n", "llama_model_loader: - kv 6: llama.rope.dimension_count u32 \n", "llama_model_loader: - kv 7: llama.attention.head_count u32 \n", "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 \n", "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 \n", "llama_model_loader: - kv 10: llama.rope.freq_base f32 \n", "llama_model_loader: - kv 11: general.file_type u32 \n", "llama_model_loader: - kv 12: tokenizer.ggml.model str \n", "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr \n", "llama_model_loader: - kv 14: tokenizer.ggml.scores arr \n", "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr \n", "llama_model_loader: - kv 16: general.quantization_version u32 \n", "llama_model_loader: - type f32: 65 tensors\n", "llama_model_loader: - type q4_K: 217 tensors\n", "llama_model_loader: - type q5_K: 8 tensors\n", "llama_model_loader: - type q6_K: 1 tensors\n", "llm_load_print_meta: format = GGUF V2 (latest)\n", "llm_load_print_meta: arch = llama\n", "llm_load_print_meta: vocab type = SPM\n", "llm_load_print_meta: n_vocab = 32016\n", "llm_load_print_meta: n_merges = 0\n", "llm_load_print_meta: n_ctx_train = 16384\n", "llm_load_print_meta: n_ctx = 512\n", "llm_load_print_meta: n_embd = 4096\n", "llm_load_print_meta: n_head = 32\n", "llm_load_print_meta: n_head_kv = 32\n", "llm_load_print_meta: n_layer = 32\n", "llm_load_print_meta: n_rot = 128\n", "llm_load_print_meta: n_gqa = 1\n", "llm_load_print_meta: f_norm_eps = 1.0e-05\n", "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n", "llm_load_print_meta: n_ff = 11008\n", "llm_load_print_meta: freq_base = 1000000.0\n", "llm_load_print_meta: 
freq_scale = 1\n", "llm_load_print_meta: model type = 7B\n", "llm_load_print_meta: model ftype = mostly Q4_K - Small\n", "llm_load_print_meta: model size = 6.74 B\n", "llm_load_print_meta: general.name = LLaMA\n", "llm_load_print_meta: BOS token = 1 ''\n", "llm_load_print_meta: EOS token = 2 ''\n", "llm_load_print_meta: UNK token = 0 ''\n", "llm_load_print_meta: LF token = 13 '<0x0A>'\n", "llm_load_tensors: ggml ctx size = 0.09 MB\n", "llm_load_tensors: using CUDA for GPU acceleration\n", "llm_load_tensors: mem required = 70.44 MB (+ 256.00 MB per state)\n", "llm_load_tensors: offloading 32 repeating layers to GPU\n", "llm_load_tensors: offloading non-repeating layers to GPU\n", "llm_load_tensors: offloading v cache to GPU\n", "llm_load_tensors: offloading k cache to GPU\n", "llm_load_tensors: offloaded 35/35 layers to GPU\n", "llm_load_tensors: VRAM used: 3864 MB\n", "..................................................................................................\n", "llama_new_context_with_model: kv self size = 256.00 MB\n", "llama_new_context_with_model: compute buffer total size = 71.94 MB\n", "llama_new_context_with_model: VRAM scratch buffer: 70.53 MB\n", "\n", "system_info: n_threads = 2 / 2 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n", "sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000\n", "generate: n_ctx = 512, n_batch = 512, n_predict = 128, n_keep = 0\n", "\n", "\n", "\u001b[33m prompt\u001b[0m.\t\t\t\t\n", "\t\t\t\t\tif( !this->m_pMiscSettings ) { return; }\t// If no misc settings, do nothing\n", "\t\t\t\t\t\n", "\t\t\t\t\t// Get the value of the checkbox for \"Always on top\"\n", "\t\t\t\t\tbool alwaysOnTop = this->m_pMiscSettings->GetBool(L\"AlwaysOnTop\", false);\n", "\t\t\t\t\tthis->SetWindowPos((alwaysOnTop ? HWND_TOPMOST : HWND_NOTOPMOST\n", "llama_print_timings: load time = 1392.10 ms\n", "llama_print_timings: sample time = 147.99 ms / 128 runs ( 1.16 ms per token, 864.92 tokens per second)\n", "llama_print_timings: prompt eval time = 261.80 ms / 2 tokens ( 130.90 ms per token, 7.64 tokens per second)\n", "llama_print_timings: eval time = 5923.18 ms / 127 runs ( 46.64 ms per token, 21.44 tokens per second)\n", "llama_print_timings: total time = 6370.96 ms\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Push to hub\n", "\n", "To push your model to the hub, you'll need to input your Hugging Face token (https://huggingface.co/settings/tokens) in Google Colab's \"Secrets\" tab. The following code creates a new repo with the \"-GGUF\" suffix. Don't forget to change the `username` variable." 
], "metadata": { "id": "Ar8pO7bb80US" } }, { "cell_type": "code", "source": [ "!pip install -q huggingface_hub\n", "from huggingface_hub import create_repo, HfApi\n", "from google.colab import userdata\n", "\n", "# Defined in the secrets tab in Google Colab\n", "hf_token = userdata.get('huggingface')\n", "\n", "api = HfApi()\n", "username = \"mlabonne\"\n", "\n", "# Create empty repo\n", "create_repo(\n", " repo_id = f\"{username}/{MODEL_NAME}-GGUF\",\n", " repo_type=\"model\",\n", " exist_ok=True,\n", " token=hf_token\n", ")\n", "\n", "# Upload gguf files\n", "api.upload_folder(\n", " folder_path=MODEL_NAME,\n", " repo_id=f\"{username}/{MODEL_NAME}-GGUF\",\n", " allow_patterns=f\"*.gguf\",\n", " token=hf_token\n", ")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 163, "referenced_widgets": [ "c281b60e104f4c5da547bbdd7208d4bc", "74b084c97f6f46d293a197bf9804460c", "1409574c4f9742e7a711965dd2c8ad87", "704ecf9409244e0b93612d6a11476346", "b1a8d3a9a379415393d9e7d995a40788", "f928772f92724579b068e984d9eef387", "1c8a6b959f9c4443a92f58eff1b03077", "9fb5726f91734b1da149784680dc9624", "202a8eb11eda4e58942113fbeacfdc3d", "970d4d3daf854f92bd650dc4da99e1bc", "24b1e007921046b1adc61db0f2bf9fc7", "24d3d72f5de54de8a1ded4e528dde332", "e90cb0ce526a4556bc643ba6c5485661", "76e7372656b745c889b9283b76c04148", "ce0204c7e1ff4a51b2648284a2492262", "6dbb8e8a5ebb40a4ba910b09dde27e1a", "7944af54f2564920822d5d4b348896c4", "1b55372f62494ca0baabf87f7e7f4ba8", "bf612001ad354ea19de6ee45a166a43c", "a8e4691970b14955bfb4865bcef5e912", "2e2fabac70484c1c8b16fa6ca8fd8537", "bf53c635fa374420ad850eea22cd1e31", "065d59126a734c1aa096ba40cd4a129f", "e8855d5678a342f5a33171aa74d3b7bc", "7eb6de1a979b46f7b234724073f8bc3a", "6ae4640196da492fadafeb63f4bc89d2", "cef83433dbea4f529f43722fe78a8baf", "845ba8115d5140ac9ee22af4a9e6a03b", "cdd888041aca4dcf8adc785309071fc6", "cf63214cb4f8442999fa5b971035fe4f", "7d9b22f2b7fe4a749f989e247bce446a", "7f8e268db8144adfb09d089784d8411a" ] }, "id": "UOyKfUD-8jmh", "outputId": "3c8df47b-f350-4251-a19f-4b9fb1116381" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/268.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.7/268.8 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] }, { "output_type": "display_data", "data": { "text/plain": [ "VBox(children=(HTML(value='