Commit fae0ad2681 by Maxime Labonne (branch pull/17/head), 6 months ago

Fine_tune_a_Mistral_7b_model_with_DPO.ipynb
@@ -0,0 +1,763 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"machine_shape": "hm",
"gpuType": "A100",
"authorship_tag": "ABX9TyOJJCuqxZQnS1q+Fvz5+URG",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"22773c721a7c4221a9c14cd388461d4c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_6b54841f5de1482694c360095dae3039",
"IPY_MODEL_448ccbc85e624ec3b3e71931a7ee4ff6",
"IPY_MODEL_173769f6f465485f8848a11bf269850b"
],
"layout": "IPY_MODEL_60978b9b4e8348f0a71ce3e35c73bcff"
}
},
"6b54841f5de1482694c360095dae3039": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_6a38dcbaf4674b448329ac0a16587d2a",
"placeholder": "",
"style": "IPY_MODEL_7eaeada2158e493189449af91f643553",
"value": "Loading checkpoint shards: 100%"
}
},
"448ccbc85e624ec3b3e71931a7ee4ff6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_6e32854952b340008edca0139d3471d6",
"max": 3,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_db6d7cfcdade4b4baa213a5d0abc07d7",
"value": 3
}
},
"173769f6f465485f8848a11bf269850b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_9083029642744c43b7705532cbe0cf79",
"placeholder": "",
"style": "IPY_MODEL_d028a98caa13425b907ceb513119006e",
"value": " 3/3 [00:11<00:00, 2.89s/it]"
}
},
"60978b9b4e8348f0a71ce3e35c73bcff": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6a38dcbaf4674b448329ac0a16587d2a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"7eaeada2158e493189449af91f643553": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"6e32854952b340008edca0139d3471d6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"db6d7cfcdade4b4baa213a5d0abc07d7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"9083029642744c43b7705532cbe0cf79": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d028a98caa13425b907ceb513119006e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/mlabonne/llm-course/blob/main/Fine_tune_a_Mistral_7b_model_with_DPO.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Fine-tune a Mistral-7b model with DPO\n",
"\n",
"❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne)."
],
"metadata": {
"id": "Pa8905-YsHAn"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_zIBL8IssExG"
},
"outputs": [],
"source": [
"!pip install -q datasets trl peft bitsandbytes sentencepiece wandb"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import gc\n",
"import torch\n",
"\n",
"import transformers\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig\n",
"from datasets import load_dataset\n",
"from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training\n",
"from trl import DPOTrainer\n",
"import bitsandbytes as bnb\n",
"from google.colab import userdata\n",
"import wandb\n",
"\n",
"# Defined in the secrets tab in Google Colab\n",
"hf_token = userdata.get('huggingface')\n",
"wb_token = userdata.get('wandb')\n",
"wandb.login(key=wb_token)\n",
"\n",
"model_name = \"teknium/OpenHermes-2.5-Mistral-7B\"\n",
"new_model = \"NeuralHermes-2.5-Mistral-7B\""
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YpdkZsMNylvp",
"outputId": "6c2df234-1ce7-4cd2-a7e3-567e7536319f"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/trl/trainer/ppo_config.py:141: UserWarning: The `optimize_cuda_cache` arguement will be deprecated soon, please use `optimize_device_cache` instead.\n",
" warnings.warn(\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mmlabonne\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Format dataset"
],
"metadata": {
"id": "d8CvUgROUDw-"
}
},
{
"cell_type": "code",
"source": [
"def chatml_format(example):\n",
" # Format system\n",
" if len(example['system']) > 0:\n",
" message = {\"role\": \"system\", \"content\": example['system']}\n",
" system = tokenizer.apply_chat_template([message], tokenize=False)\n",
" else:\n",
" system = \"\"\n",
"\n",
" # Format instruction\n",
" message = {\"role\": \"user\", \"content\": example['question']}\n",
" prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)\n",
"\n",
" # Format chosen answer\n",
" chosen = example['chatgpt'] + \"<|im_end|>\\n\"\n",
"\n",
" # Format rejected answer\n",
" rejected = example['llama2-13b-chat'] + \"<|im_end|>\\n\"\n",
"\n",
" return {\n",
" \"prompt\": system + prompt,\n",
" \"chosen\": chosen,\n",
" \"rejected\": rejected,\n",
" }\n",
"\n",
"# Load dataset\n",
"dataset = load_dataset(\"Intel/orca_dpo_pairs\")['train']\n",
"\n",
"# Save columns\n",
"original_columns = dataset.column_names\n",
"\n",
"# Tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.padding_side = \"left\"\n",
"\n",
"# Format dataset\n",
"dataset = dataset.map(\n",
" chatml_format,\n",
" remove_columns=original_columns\n",
")\n",
"\n",
"# Print sample\n",
"dataset[1]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MCD77GZ60DOT",
"outputId": "c7c6773c-5545-4fee-bfa3-6fa6d69c0f3f"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'prompt': '<|im_start|>system\\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|im_end|>\\n<|im_start|>user\\nGenerate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One<|im_end|>\\n<|im_start|>assistant\\n',\n",
" 'chosen': 'Midsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.<|im_end|>\\n',\n",
" 'rejected': ' Sure! Here\\'s a sentence that describes all the data you provided:\\n\\n\"Midsummer House is a moderately priced Chinese restaurant with a customer rating of 3 out of 5, located near All Bar One, offering a variety of delicious dishes.\"<|im_end|>\\n'}"
]
},
"metadata": {},
"execution_count": 3
}
]
},
{
"cell_type": "markdown",
"source": [
"## Train model with DPO"
],
"metadata": {
"id": "DeT5eUK_UJgK"
}
},
{
"cell_type": "code",
"source": [
"# LoRA configuration\n",
"peft_config = LoraConfig(\n",
" r=16,\n",
" lora_alpha=16,\n",
" lora_dropout=0.05,\n",
" bias=\"none\",\n",
" task_type=\"CAUSAL_LM\",\n",
" target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']\n",
")\n",
"\n",
"# Model to fine-tune\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_name,\n",
" torch_dtype=torch.float16,\n",
" load_in_4bit=True\n",
")\n",
"model.config.use_cache = False\n",
"\n",
"# Reference model\n",
"ref_model = AutoModelForCausalLM.from_pretrained(\n",
" model_name,\n",
" torch_dtype=torch.float16,\n",
" load_in_4bit=True\n",
")\n",
"\n",
"# Training arguments\n",
"training_args = TrainingArguments(\n",
" per_device_train_batch_size=4,\n",
" gradient_accumulation_steps=4,\n",
" gradient_checkpointing=True,\n",
" learning_rate=5e-5,\n",
" lr_scheduler_type=\"cosine\",\n",
" max_steps=200,\n",
" save_strategy=\"no\",\n",
" logging_steps=1,\n",
" output_dir=new_model,\n",
" optim=\"paged_adamw_32bit\",\n",
" warmup_steps=100,\n",
" bf16=True,\n",
" report_to=\"wandb\",\n",
")\n",
"\n",
"# Create DPO trainer\n",
"dpo_trainer = DPOTrainer(\n",
" model,\n",
" ref_model,\n",
" args=training_args,\n",
" train_dataset=dataset,\n",
" tokenizer=tokenizer,\n",
" peft_config=peft_config,\n",
" beta=0.1,\n",
" max_prompt_length=1024,\n",
" max_length=1536,\n",
")\n",
"\n",
"# Fine-tune model with DPO\n",
"dpo_trainer.train()"
],
"metadata": {
"id": "rKPILNOLR-aK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Upload model"
],
"metadata": {
"id": "3LdhPpcrUM3H"
}
},
{
"cell_type": "code",
"source": [
"# Save artifacts\n",
"dpo_trainer.model.save_pretrained(\"final_checkpoint\")\n",
"tokenizer.save_pretrained(\"final_checkpoint\")\n",
"\n",
"# Flush memory\n",
"del dpo_trainer, model, ref_model\n",
"gc.collect()\n",
"torch.cuda.empty_cache()\n",
"\n",
"# Reload model in FP16 (instead of NF4)\n",
"base_model = AutoModelForCausalLM.from_pretrained(\n",
" model_name,\n",
" return_dict=True,\n",
" torch_dtype=torch.float16,\n",
")\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"\n",
"# Merge base model with the adapter\n",
"model = PeftModel.from_pretrained(base_model, \"final_checkpoint\")\n",
"model = model.merge_and_unload()\n",
"\n",
"# Save model and tokenizer\n",
"model.save_pretrained(new_model)\n",
"tokenizer.save_pretrained(new_model)\n",
"\n",
"# Push them to the HF Hub\n",
"model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)\n",
"tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)"
],
"metadata": {
"id": "h7cIvxcTfBC4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Inference"
],
"metadata": {
"id": "G6EFsmS4UOgV"
}
},
{
"cell_type": "code",
"source": [
"# Format prompt\n",
"message = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant chatbot.\"},\n",
" {\"role\": \"user\", \"content\": \"What is a Large Language Model?\"}\n",
"]\n",
"tokenizer = AutoTokenizer.from_pretrained(new_model)\n",
"prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)\n",
"\n",
"# Create pipeline\n",
"pipeline = transformers.pipeline(\n",
" \"text-generation\",\n",
" model=new_model,\n",
" tokenizer=tokenizer\n",
")\n",
"\n",
"# Generate text\n",
"sequences = pipeline(\n",
" prompt,\n",
" do_sample=True,\n",
" temperature=0.7,\n",
" top_p=0.9,\n",
" num_return_sequences=1,\n",
" max_length=200,\n",
")\n",
"print(sequences[0]['generated_text'])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 251,
"referenced_widgets": [
"22773c721a7c4221a9c14cd388461d4c",
"6b54841f5de1482694c360095dae3039",
"448ccbc85e624ec3b3e71931a7ee4ff6",
"173769f6f465485f8848a11bf269850b",
"60978b9b4e8348f0a71ce3e35c73bcff",
"6a38dcbaf4674b448329ac0a16587d2a",
"7eaeada2158e493189449af91f643553",
"6e32854952b340008edca0139d3471d6",
"db6d7cfcdade4b4baa213a5d0abc07d7",
"9083029642744c43b7705532cbe0cf79",
"d028a98caa13425b907ceb513119006e"
]
},
"id": "LAEUZFjvlJOv",
"outputId": "9b5720c7-49ef-45c7-e5a7-f38d64899b1e"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "22773c721a7c4221a9c14cd388461d4c"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1473: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )\n",
" warnings.warn(\n",
"Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"<|im_start|>system\n",
"You are a helpful assistant chatbot.<|im_end|>\n",
"<|im_start|>user\n",
"What is a Large Language Model?<|im_end|>\n",
"<|im_start|>assistant\n",
"A large language model is a type of artificial intelligence (AI) system that has been trained on vast amounts of text data. These models are designed to understand and generate human language, allowing them to perform various natural language processing tasks, such as text generation, language translation, and question answering. Large language models typically use deep learning techniques, like recurrent neural networks (RNNs) or transformers, to learn patterns and relationships in the data, enabling them to generate coherent and contextually relevant responses. The size of these models, in terms of the number of parameters and the volume of data they are trained on, plays a significant role in their ability to comprehend and produce complex language structures.\n"
]
}
]
}
]
}

README.md
@@ -1,5 +1,7 @@
# 🗣️ Large Language Model Course
<p align="center"><a href="https://twitter.com/maximelabonne">Follow me on X</a><a href="https://mlabonne.github.io/blog">Blog</a><a href="https://github.com/PacktPublishing/Hands-On-Graph-Neural-Networks-Using-Python">Hands-on GNN</a></p>
The LLM course is divided into three parts:
1. 🧩 **LLM Fundamentals** covers essential knowledge about mathematics, Python, and neural networks.
@@ -144,13 +146,14 @@ While it's easy to find raw data from Wikipedia and other websites, it's difficu
Pre-training is a very long and costly process, which is why this is not the focus of this course. It's good to have some level of understanding of what happens during pre-training, but hands-on experience is not required.
* **Data pipeline**: Pre-training requires huge datasets (e.g., [Llama 2](https://arxiv.org/abs/2307.09288) was trained on 2 trillion tokens) that need to be filtered, tokenized, and collated with a pre-defined vocabulary.
* **Causal language modeling**: Learn the difference between causal and masked language modeling, as well as the loss function used in this case (a minimal sketch of this loss is given after this list). For efficient pre-training, learn more about [Megatron-LM](https://github.com/NVIDIA/Megatron-LM).
* **Scaling laws**: The [scaling laws](https://arxiv.org/pdf/2001.08361.pdf) describe the expected model performance based on the model size, dataset size, and the amount of compute used for training.
* **High-Performance Computing**: Out of scope here, but a deeper knowledge of HPC (hardware, distributed workloads, etc.) becomes fundamental if you're planning to create your own LLM from scratch.
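
To make the causal language modeling objective concrete, here is a minimal sketch of the next-token prediction loss. This is an illustrative example rather than the course's own code: the random tensors stand in for real model logits and token ids.

```python
import torch
import torch.nn.functional as F

# Toy stand-ins: in practice, logits come from a model forward pass on a tokenized batch.
batch_size, seq_len, vocab_size = 2, 8, 32000
logits = torch.randn(batch_size, seq_len, vocab_size)
input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))

# Causal LM loss: the prediction at position t is scored against the token at position t+1.
shift_logits = logits[:, :-1, :].reshape(-1, vocab_size)
shift_labels = input_ids[:, 1:].reshape(-1)
loss = F.cross_entropy(shift_logits, shift_labels)
print(f"next-token cross-entropy: {loss.item():.2f}")
```

Masked language modeling, by contrast, predicts randomly masked tokens from bidirectional context, which is why causal models are the ones used for text generation.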
📚 **References**:
* [LLMDataHub](https://github.com/Zjh-819/LLMDataHub) by Junhao Zhao: Curated list of datasets for pre-training, fine-tuning, and RLHF.
* [Training a causal language model from scratch](https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt) by Hugging Face: Pre-train a GPT-2 model from scratch using the transformers library.
* [Megatron-LM](https://github.com/NVIDIA/Megatron-LM): State-of-the-art library to efficiently pre-train models.
* [TinyLlama](https://github.com/jzhang38/TinyLlama) by Zhang et al.: Check this project to get a good understanding of how a Llama model is trained from scratch.
* [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) by Hugging Face: Explains the difference between causal and masked language modeling and how to quickly fine-tune a DistilGPT-2 model.
* [Chinchilla's wild implications](https://www.lesswrong.com/posts/6Fpvch8RR29qLEWNH/chinchilla-s-wild-implications) by nostalgebraist: Discusses the scaling laws and explains what they mean for LLMs in general.
