Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,9 @@ import os
|
|
| 8 |
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
|
| 9 |
"""
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
subprocess.run(
|
| 13 |
"pip install flash-attn --no-build-isolation",
|
|
@@ -35,7 +38,8 @@ tokenizer = AutoTokenizer.from_pretrained(
|
|
| 35 |
model_id
|
| 36 |
, token= token,)
|
| 37 |
|
| 38 |
-
|
|
|
|
| 39 |
# torch_dtype= torch.uint8,
|
| 40 |
torch_dtype=torch.float16,
|
| 41 |
# torch_dtype=torch.fl,
|
|
@@ -50,6 +54,11 @@ model = AutoModelForCausalLM.from_pretrained(model_id, token= token,
|
|
| 50 |
#
|
| 51 |
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
@spaces.GPU(duration=60)
|
| 55 |
def respond(
|
|
|
|
| 8 |
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
|
| 12 |
+
|
| 13 |
+
|
| 14 |
|
| 15 |
subprocess.run(
|
| 16 |
"pip install flash-attn --no-build-isolation",
|
|
|
|
| 38 |
model_id
|
| 39 |
, token= token,)
|
| 40 |
|
| 41 |
+
with init_empty_weights():
|
| 42 |
+
model = AutoModelForCausalLM.from_pretrained(model_id, token= token,
|
| 43 |
# torch_dtype= torch.uint8,
|
| 44 |
torch_dtype=torch.float16,
|
| 45 |
# torch_dtype=torch.fl,
|
|
|
|
| 54 |
#
|
| 55 |
|
| 56 |
|
| 57 |
+
device_map = infer_auto_device_map(model, max_memory={0: "80GB", 1: "80GB", "cpu": "65GB"})
|
| 58 |
+
|
| 59 |
+
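# Load the checkpoint weights into the empty-initialized model and dispatch them
# across devices according to the inferred device map.
# NOTE(review): "path_to_checkpoint" and no_split_module_classes=["GPTJBlock"] look
# like placeholder values; confirm the real checkpoint location and the decoder
# block class of the model behind `model_id` before relying on this call.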
|
| 60 |
+
model = load_checkpoint_and_dispatch(model, "path_to_checkpoint", device_map=device_map, no_split_module_classes=["GPTJBlock"])
|
| 61 |
+
|
| 62 |
|
| 63 |
@spaces.GPU(duration=60)
|
| 64 |
def respond(
|