Updated README with complete working model loading instructions
Browse files
README.md
CHANGED
|
@@ -66,52 +66,90 @@ config_path = hf_hub_download(
|
|
| 66 |
filename="config.json"
|
| 67 |
)
|
| 68 |
|
| 69 |
-
# Step 3: Load the model
|
| 70 |
-
#
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
#
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# Encode the prompt
|
| 76 |
input_ids = tokenizer.encode_as_ids(prompt)
|
| 77 |
-
input_tensor = torch.tensor(input_ids).unsqueeze(0)
|
| 78 |
|
| 79 |
-
# Generate
|
| 80 |
-
|
|
|
|
| 81 |
|
| 82 |
# Decode the output
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
pass
|
| 86 |
```
|
| 87 |
|
| 88 |
### Example Usage
|
| 89 |
|
| 90 |
```python
|
| 91 |
# Example 1: Simple text generation
|
| 92 |
-
prompt = "
|
| 93 |
-
|
| 94 |
-
|
|
|
|
| 95 |
|
| 96 |
# Example 2: Encode and decode text
|
| 97 |
text = "ଓଡିଆ ଭାଷା ଏକ ସୁନ୍ଦର ଭାଷା"
|
| 98 |
encoded = tokenizer.encode_as_ids(text)
|
|
|
|
| 99 |
print(f"Encoded: {encoded}")
|
| 100 |
|
| 101 |
decoded = tokenizer.decode(encoded)
|
| 102 |
print(f"Decoded: {decoded}")
|
| 103 |
```
|
| 104 |
|
| 105 |
-
### Full Implementation Example
|
| 106 |
-
|
| 107 |
-
For a complete working example with the model architecture:
|
| 108 |
-
|
| 109 |
-
```python
|
| 110 |
-
# The full model architecture and implementation
|
| 111 |
-
# is available in the repository files.
|
| 112 |
-
# Please refer to the model implementation for complete code.
|
| 113 |
-
```
|
| 114 |
-
|
| 115 |
## Training Details
|
| 116 |
|
| 117 |
### Training Hyperparameters
|
|
@@ -127,15 +165,15 @@ For a complete working example with the model architecture:
|
|
| 127 |
The model was trained on a combination of:
|
| 128 |
1. **OdiaGenAIdata/fine_web2_odia_pt** - High-quality Odia web text
|
| 129 |
2. **bigscience-data/roots_indic-or_indic_nlp_corpus** - Odia corpus from Indic NLP
|
|
|
|
| 130 |
|
| 131 |
-
Total training samples: ~
|
| 132 |
|
| 133 |
## Limitations
|
| 134 |
|
| 135 |
- Maximum context length is 256 tokens
|
| 136 |
- Trained specifically on Odia text, may not perform well on other languages
|
| 137 |
- May generate repetitive text for very long sequences
|
| 138 |
-
- The model requires the custom GPT architecture code to run
|
| 139 |
|
| 140 |
## Intended Use
|
| 141 |
|
|
|
|
| 66 |
filename="config.json"
|
| 67 |
)
|
| 68 |
|
| 69 |
+
# Step 3: Load the model architecture and weights
|
| 70 |
+
# First, download the model architecture file
|
| 71 |
+
architecture_path = hf_hub_download(
|
| 72 |
+
repo_id="rupakrpk93/odia_tokenizers_test",
|
| 73 |
+
filename="model_architecture.py"
|
| 74 |
+
)
|
| 75 |
|
| 76 |
+
# Import the model classes
# NOTE: this executes Python code downloaded from the Hub —
# only run it for repositories you trust
|
| 77 |
+
import sys
|
| 78 |
+
import importlib.util
|
| 79 |
+
spec = importlib.util.spec_from_file_location("model_architecture", architecture_path)
|
| 80 |
+
model_module = importlib.util.module_from_spec(spec)
|
| 81 |
+
sys.modules["model_architecture"] = model_module
|
| 82 |
+
spec.loader.exec_module(model_module)
|
| 83 |
+
|
| 84 |
+
# Import the classes we need
|
| 85 |
+
GPTConfig = model_module.GPTConfig
|
| 86 |
+
GPT = model_module.GPT
|
| 87 |
+
|
| 88 |
+
# Create model configuration
|
| 89 |
+
config = GPTConfig()
|
| 90 |
+
|
| 91 |
+
# Initialize and load the model
|
| 92 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 93 |
+
model = GPT(config)
|
| 94 |
+
|
| 95 |
+
# Load the pretrained weights
|
| 96 |
+
checkpoint = torch.load(model_path, map_location=device)  # NOTE(review): consider weights_only=True for untrusted checkpoints — confirm torch version supports it
|
| 97 |
+
|
| 98 |
+
# Check if the state_dict is nested and extract it if necessary
|
| 99 |
+
if isinstance(checkpoint, dict) and 'model' in checkpoint:
|
| 100 |
+
state_dict = checkpoint['model']
|
| 101 |
+
else:
|
| 102 |
+
state_dict = checkpoint
|
| 103 |
+
|
| 104 |
+
# Remove the 'model.' prefix from keys if present
|
| 105 |
+
from collections import OrderedDict
|
| 106 |
+
new_state_dict = OrderedDict()
|
| 107 |
+
for k, v in state_dict.items():
|
| 108 |
+
if k.startswith('model.'):
|
| 109 |
+
new_state_dict[k[6:]] = v # Remove 'model.' prefix
|
| 110 |
+
else:
|
| 111 |
+
new_state_dict[k] = v
|
| 112 |
+
|
| 113 |
+
model.load_state_dict(new_state_dict)
|
| 114 |
+
|
| 115 |
+
model = model.to(device)
|
| 116 |
+
model.eval()
|
| 117 |
+
print(f"Model loaded successfully on {device}")
|
| 118 |
+
|
| 119 |
+
# Step 4: Generate text function
|
| 120 |
+
def generate_odia_text(prompt, max_length=100, temperature=0.8):
|
| 121 |
# Encode the prompt
|
| 122 |
input_ids = tokenizer.encode_as_ids(prompt)
|
| 123 |
+
input_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)
|
| 124 |
|
| 125 |
+
# Generate
|
| 126 |
+
with torch.no_grad():
|
| 127 |
+
output = model.generate(input_tensor, max_length, temperature=temperature)
|
| 128 |
|
| 129 |
# Decode the output
|
| 130 |
+
generated_text = tokenizer.decode(output.squeeze().tolist())
|
| 131 |
+
return generated_text
|
|
|
|
| 132 |
```
|
| 133 |
|
| 134 |
### Example Usage
|
| 135 |
|
| 136 |
```python
|
| 137 |
# Example 1: Simple text generation
|
| 138 |
+
prompt = "ସେ କାଲି ସ୍କୁଲକୁ"
|
| 139 |
+
generated_text = generate_odia_text(prompt, max_length=200)
|
| 140 |
+
print(f"Prompt: {prompt}")
|
| 141 |
+
print(f"Generated: {generated_text}")
|
| 142 |
|
| 143 |
# Example 2: Encode and decode text
|
| 144 |
text = "ଓଡିଆ ଭାଷା ଏକ ସୁନ୍ଦର ଭାଷା"
|
| 145 |
encoded = tokenizer.encode_as_ids(text)
|
| 146 |
+
print(f"Original: {text}")
|
| 147 |
print(f"Encoded: {encoded}")
|
| 148 |
|
| 149 |
decoded = tokenizer.decode(encoded)
|
| 150 |
print(f"Decoded: {decoded}")
|
| 151 |
```
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
## Training Details
|
| 154 |
|
| 155 |
### Training Hyperparameters
|
|
|
|
| 165 |
The model was trained on a combination of:
|
| 166 |
1. **OdiaGenAIdata/fine_web2_odia_pt** - High-quality Odia web text
|
| 167 |
2. **bigscience-data/roots_indic-or_indic_nlp_corpus** - Odia corpus from Indic NLP
|
| 168 |
+
3. **Custom curated Odia dataset** - Additional hand-curated Odia texts
|
| 169 |
|
| 170 |
+
Total training samples: approximately 4 million texts
|
| 171 |
|
| 172 |
## Limitations
|
| 173 |
|
| 174 |
- Maximum context length is 256 tokens
|
| 175 |
- Trained specifically on Odia text, may not perform well on other languages
|
| 176 |
- May generate repetitive text for very long sequences
|
|
|
|
| 177 |
|
| 178 |
## Intended Use
|
| 179 |
|