Spaces:

rphrp1985
/

zerogpu

Running on Zero

rphrp1985 committed on Jun 12, 2024

Commit

4ad0753

verified ·

1 Parent(s): bd8e143

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -8,6 +8,9 @@ import os
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
@@ -35,7 +38,8 @@ tokenizer = AutoTokenizer.from_pretrained(
     model_id
     , token= token,)
-model = AutoModelForCausalLM.from_pretrained(model_id, token= token,
                                                  # torch_dtype= torch.uint8,
                                              torch_dtype=torch.float16,
                                               # torch_dtype=torch.fl,
@@ -50,6 +54,11 @@ model = AutoModelForCausalLM.from_pretrained(model_id, token= token,
 #
 @spaces.GPU(duration=60)
 def respond(

 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
+from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
     model_id
     , token= token,)
+with init_empty_weights():
+    model = AutoModelForCausalLM.from_pretrained(model_id, token= token,
                                                  # torch_dtype= torch.uint8,
                                              torch_dtype=torch.float16,
                                               # torch_dtype=torch.fl,
 #
+device_map = infer_auto_device_map(model, max_memory={0: "80GB", 1: "80GB", "cpu": "65GB"})
+# Load the model with the inferred device map
+model = load_checkpoint_and_dispatch(model, "path_to_checkpoint", device_map=device_map, no_split_module_classes=["GPTJBlock"])
 @spaces.GPU(duration=60)
 def respond(