mirror of https://github.com/arc53/DocsGPT
sage-guide
parent
a4483cf255
commit
1570b9c9c5
@ -0,0 +1,120 @@
|
|||||||
|
# How to Deploy LLMs on SageMaker for DocsGPT
|
||||||
|
|
||||||
|
This guide uses some of the methods from [Phil Schmid's guides](https://www.philschmid.de/), so if you want to dive deeper into the topic, check out his guides.
|
||||||
|
|
||||||
|
### 1. Create a new Python notebook on SageMaker and prep dependencies and permissions
|
||||||
|
|
||||||
|
Install dependencies
|
||||||
|
|
||||||
|
```python
|
||||||
|
!pip install "sagemaker>=2.175.0" --upgrade --quiet
|
||||||
|
```
|
||||||
|
|
||||||
|
Check permissions
|
||||||
|
|
||||||
|
```python
|
||||||
|
import sagemaker
|
||||||
|
import boto3
|
||||||
|
sess = sagemaker.Session()
|
||||||
|
# sagemaker session bucket -> used for uploading data, models and logs
|
||||||
|
# sagemaker will automatically create this bucket if it does not exist
|
||||||
|
sagemaker_session_bucket=None
|
||||||
|
if sagemaker_session_bucket is None and sess is not None:
|
||||||
|
# set to default bucket if a bucket name is not given
|
||||||
|
sagemaker_session_bucket = sess.default_bucket()
|
||||||
|
|
||||||
|
try:
|
||||||
|
role = sagemaker.get_execution_role()
|
||||||
|
except ValueError:
|
||||||
|
iam = boto3.client('iam')
|
||||||
|
role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
|
||||||
|
|
||||||
|
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
|
||||||
|
|
||||||
|
print(f"sagemaker role arn: {role}")
|
||||||
|
print(f"sagemaker session region: {sess.boto_region_name}")
|
||||||
|
print(f"sagemaker session bucket: {sess.default_bucket()}")
|
||||||
|
```
|
||||||
|
|
||||||
|
Get the Hugging Face LLM image URI for the container
|
||||||
|
|
||||||
|
```python
|
||||||
|
from sagemaker.huggingface import get_huggingface_llm_image_uri
|
||||||
|
|
||||||
|
# retrieve the llm image uri
|
||||||
|
llm_image = get_huggingface_llm_image_uri(
|
||||||
|
"huggingface",
|
||||||
|
version="1.1.0",
|
||||||
|
)
|
||||||
|
|
||||||
|
# print ecr image uri
|
||||||
|
print(f"llm image uri: {llm_image}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Prepare the Model
|
||||||
|
|
||||||
|
Running this code will create a model with some default parameters. You can change these parameters to suit your needs.
|
||||||
|
There are two ways you can choose which model to use.
|
||||||
|
|
||||||
|
You can either use the `model_id` from the huggingface.co/models page, or you can use the `model_data` from a previous training job.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import json
|
||||||
|
from sagemaker.huggingface import HuggingFaceModel
|
||||||
|
|
||||||
|
# sagemaker config
|
||||||
|
instance_type = "ml.g5.xlarge"
|
||||||
|
number_of_gpu = 1
|
||||||
|
health_check_timeout = 600
|
||||||
|
|
||||||
|
# Define Model and Endpoint configuration parameter
|
||||||
|
config = {
|
||||||
|
'HF_MODEL_ID': "/opt/ml/model", # model_id from hf.co/models
|
||||||
|
#'HF_MODEL_ID': "Arc53/DocsGPT-7B",
|
||||||
|
'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
|
||||||
|
'MAX_INPUT_LENGTH': json.dumps(7000), # Max length of input text
|
||||||
|
'MAX_TOTAL_TOKENS': json.dumps(8000), # Max length of the generation (including input text)
|
||||||
|
'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192), # Limits the number of tokens that can be processed in parallel during the generation
|
||||||
|
'MAX_BATCH_PREFILL_TOKENS': json.dumps(7000),
|
||||||
|
}
|
||||||
|
|
||||||
|
# create HuggingFaceModel with the image uri
|
||||||
|
llm_model = HuggingFaceModel(
|
||||||
|
model_data="s3://docsgpt/models/hf-tensors/docsgpt-7b-O-hq-64-alpha-2023-11-22-15-04-04-455/model.tar.gz",
|
||||||
|
role=role,
|
||||||
|
image_uri=llm_image,
|
||||||
|
env=config
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Deploy the Model
|
||||||
|
|
||||||
|
Running this line will create a Model in the SageMaker console. Next, it will create an endpoint configuration, and finally it will create an endpoint.
|
||||||
|
|
||||||
|
```python
|
||||||
|
llm = llm_model.deploy(
|
||||||
|
initial_instance_count=1,
|
||||||
|
endpoint_name="docsgpt-7b",
|
||||||
|
instance_type=instance_type,
|
||||||
|
container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Connect it to the application
|
||||||
|
|
||||||
|
Change your `.env` file and set the following variables:
|
||||||
|
|
||||||
|
```python
|
||||||
|
SAGEMAKER_ENDPOINT: str = None # SageMaker endpoint name (docsgpt-7b)
|
||||||
|
SAGEMAKER_REGION: str = None # SageMaker region name
|
||||||
|
SAGEMAKER_ACCESS_KEY: str = None # SageMaker access key
|
||||||
|
SAGEMAKER_SECRET_KEY: str = None # SageMaker secret key
|
||||||
|
```
|
||||||
|
|
||||||
|
> **_NOTE:_** If you are using the same AWS account for the application and SageMaker, you can leave the access and secret keys empty.
|
||||||
|
|
||||||
|
Also make sure you switch to the appropriate embeddings if you want everything to run locally, for example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
EMBEDDINGS_NAME=huggingface_sentence-transformers/all-mpnet-base-v2
|
||||||
|
```
|
Loading…
Reference in New Issue