From 3f011ecd3a56d42f928ab9fb81fc207dc5a852a2 Mon Sep 17 00:00:00 2001
From: Zack
Date: Sun, 10 Dec 2023 10:14:18 -0800
Subject: [PATCH] feat: add huggingface token

---
 app.py       | 41 ++++++++++++++++++++++-------------------
 serving.yaml | 20 --------------------
 2 files changed, 22 insertions(+), 39 deletions(-)
 delete mode 100644 serving.yaml

diff --git a/app.py b/app.py
index 62cd433b..1c86ba74 100644
--- a/app.py
+++ b/app.py
@@ -1,4 +1,5 @@
 import boto3
+from transformers import AutoTokenizer
 from botocore.exceptions import NoCredentialsError
 import tokenize
 import requests
@@ -321,15 +322,13 @@ def clear_history():
     yield gr.update(visible=True, value=return_msg)
 
 
-# Add this function to fetch the tokenizer from the Hugging Face Model Hub API
-def fetch_tokenizer(model_name: str):
-    response = requests.get(f"https://huggingface.co/{model_name}/resolve/main/tokenizer_config.json")
-    if response.status_code == 200:
-        tokenizer_config = response.json()
-        return tokenizer_config.get("tokenizer_class")
-    else:
-        return "Tokenizer not found for the selected model"
+def fetch_tokenizer(model_name: str):
+    try:
+        AutoTokenizer.from_pretrained(model_name)
+        return f"Tokenizer for {model_name} loaded successfully."
+    except Exception as e:
+        return f"Error loading tokenizer: {e}"
 
 
 # Add this function to handle the button click
 def deploy_on_sky_pilot(model_name: str, tokenizer: str, accelerators: str):
@@ -340,7 +339,7 @@ def deploy_on_sky_pilot(model_name: str, tokenizer: str, accelerators: str):
         },
         "envs": {
             "MODEL_NAME": model_name,
-            "TOKENIZER": tokenizer
+            "TOKENIZER": model_name  # env values must be plain strings; default to the model's own tokenizer repo
         },
         "setup": "conda create -n vllm python=3.9 -y\nconda activate vllm\ngit clone https://github.com/vllm-project/vllm.git\ncd vllm\npip install .\npip install gradio",
         "run": "conda activate vllm\necho 'Starting vllm api server...'\npython -u -m vllm.entrypoints.api_server --model $MODEL_NAME --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE --tokenizer $TOKENIZER 2>&1 | tee api_server.log &\necho 'Waiting for vllm api server to start...'\nwhile ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done\necho 'Starting gradio server...'\npython vllm/examples/gradio_webserver.py"
@@ -385,6 +384,7 @@ with gr.Blocks() as demo:
             SCENEX_API_KEY = gr.Textbox(label="Scenex api key:", placeholder="Key to use sceneXplain", type="text")
             STEAMSHIP_API_KEY = gr.Textbox(label="Steamship api key:", placeholder="Key to use image generation", type="text")
             HUGGINGFACE_API_KEY = gr.Textbox(label="Huggingface api key:", placeholder="Key to use models in huggingface hub", type="text")
+            HUGGINGFACE_TOKEN = gr.Textbox(label="HuggingFace Token:", placeholder="Token for huggingface", type="text")
             AMADEUS_ID = gr.Textbox(label="Amadeus id:", placeholder="Id to use Amadeus", type="text")
             AMADEUS_KEY = gr.Textbox(label="Amadeus key:", placeholder="Key to use Amadeus", type="text")
             AWS_ACCESS_KEY_ID = gr.Textbox(label="AWS Access Key ID:", placeholder="AWS Access Key ID", type="text")
@@ -416,10 +416,19 @@ with gr.Blocks() as demo:
             buttonDownload = gr.Button("Download Model");
             buttonDownload.click(fn=download_model, inputs=[model_url, memory_utilization]);
             model_chosen = gr.Dropdown(
-                list(available_models), value=DEFAULTMODEL, multiselect=False, label="Model provided",
+                list(available_models),
+                value=DEFAULTMODEL,
+                multiselect=False,
+                label="Model provided",
                 info="Choose the model to solve your question, Default means ChatGPT."
-
-            )
+            )
+            tokenizer_output = gr.Textbox(label="Tokenizer")
+            model_chosen.change(fetch_tokenizer, inputs=model_chosen, outputs=tokenizer_output)
+            available_accelerators = ["A100", "V100", "P100", "K80", "T4", "P4"]
+            accelerators = gr.Dropdown(available_accelerators, label="Accelerators:")
+            buttonDeploy = gr.Button("Deploy on SkyPilot")
+
+            buttonDeploy.click(deploy_on_sky_pilot, [model_chosen, tokenizer_output, accelerators])
         with gr.Row():
             tools_search = gr.Textbox(
                 lines=1,
@@ -434,13 +443,6 @@ with gr.Blocks() as demo:
                 info="Choose the tools to solve your question.",
             )
 
-            tokenizer_output = gr.outputs.Textbox()
-            model_chosen.change(fetch_tokenizer, outputs=tokenizer_output)
-            available_accelerators = ["A100", "V100", "P100", "K80", "T4", "P4"]
-            accelerators = gr.Dropdown(available_accelerators, label="Accelerators:")
-            buttonDeploy = gr.Button("Deploy on SkyPilot")
-
-            buttonDeploy.click(deploy_on_sky_pilot, [model_chosen, tokenizer_output, accelerators])
 
             # TODO finish integrating model flow
             # with gr.Tab("model"):
@@ -468,6 +470,7 @@ with gr.Blocks() as demo:
                 SCENEX_API_KEY,
                 STEAMSHIP_API_KEY,
                 HUGGINGFACE_API_KEY,
+                HUGGINGFACE_TOKEN,
                 AMADEUS_ID,
                 AMADEUS_KEY,
             ], outputs=key_set_btn)
diff --git a/serving.yaml b/serving.yaml
deleted file mode 100644
index 31027155..00000000
--- a/serving.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-envs:
-  MODEL_NAME: decapoda-research/llama-13b-hf
-  TOKENIZER: Tokenizer not found for the selected model
-resources:
-  accelerators: A100
-run: |-
-  conda activate vllm
-  echo 'Starting vllm api server...'
-  python -u -m vllm.entrypoints.api_server --model $MODEL_NAME --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE --tokenizer $TOKENIZER 2>&1 | tee api_server.log &
-  echo 'Waiting for vllm api server to start...'
-  while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
-  echo 'Starting gradio server...'
-  python vllm/examples/gradio_webserver.py
-setup: |-
-  conda create -n vllm python=3.9 -y
-  conda activate vllm
-  git clone https://github.com/vllm-project/vllm.git
-  cd vllm
-  pip install .
-  pip install gradio
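
Review note (not part of the patch): the new HUGGINGFACE_TOKEN field only reaches set_environ, so fetch_tokenizer still downloads tokenizers anonymously and gated repos (e.g. meta-llama models) will fail. A minimal sketch of how the token could be threaded through, assuming the `token` keyword of transformers' AutoTokenizer.from_pretrained (older releases spell it `use_auth_token`); the two-input wiring in the trailing comment is hypothetical, not code from app.py:

    from transformers import AutoTokenizer

    def fetch_tokenizer(model_name: str, hf_token: str = "") -> str:
        """Report whether the tokenizer for model_name can be loaded."""
        try:
            # An empty textbox falls back to any cached `huggingface-cli login` token.
            AutoTokenizer.from_pretrained(model_name, token=hf_token or None)
            return f"Tokenizer for {model_name} loaded successfully."
        except Exception as e:
            return f"Error loading tokenizer: {e}"

    # Hypothetical wiring inside the Blocks layout:
    # model_chosen.change(fetch_tokenizer,
    #                     inputs=[model_chosen, HUGGINGFACE_TOKEN],
    #                     outputs=tokenizer_output)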
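
Second note: deploy_on_sky_pilot builds a task dict mirroring the deleted serving.yaml, but the diff never shows the dict being submitted. One way to hand it to SkyPilot from Python, sketched under the assumption that the dict contains only plain strings (sky.Task.from_yaml and sky.launch are SkyPilot API calls; the launch_task helper and the cluster name are made up for illustration):

    import tempfile

    import sky
    import yaml

    def launch_task(task: dict, cluster_name: str = "vllm-serving") -> None:
        """Serialize the task dict to YAML and launch it with SkyPilot."""
        with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
            yaml.safe_dump(task, f)
        # The file is flushed and closed once the `with` block exits.
        sky.launch(sky.Task.from_yaml(f.name), cluster_name=cluster_name)

Exporting the token as an env on the task would also let the remote vLLM server fetch gated weights; recent huggingface_hub releases honor HF_TOKEN (older ones use HUGGING_FACE_HUB_TOKEN).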