Updated README with complete working model loading instructions
Browse files
README.md
CHANGED
|
@@ -66,52 +66,90 @@ config_path = hf_hub_download(
|
|
| 66 |
filename="config.json"
|
| 67 |
)
|
| 68 |
|
| 69 |
-
# Step 3: Load the model
|
| 70 |
-
#
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
#
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# Encode the prompt
|
| 76 |
input_ids = tokenizer.encode_as_ids(prompt)
|
| 77 |
-
input_tensor = torch.tensor(input_ids).unsqueeze(0)
|
| 78 |
|
| 79 |
-
# Generate
|
| 80 |
-
|
|
|
|
| 81 |
|
| 82 |
# Decode the output
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
pass
|
| 86 |
```
|
| 87 |
|
| 88 |
### Example Usage
|
| 89 |
|
| 90 |
```python
|
| 91 |
# Example 1: Simple text generation
|
| 92 |
-
prompt = "
|
| 93 |
-
|
| 94 |
-
|
|
|
|
| 95 |
|
| 96 |
# Example 2: Encode and decode text
|
| 97 |
text = "ଓଡିଆ ଭାଷା ଏକ ସୁନ୍ଦର ଭାଷା"
|
| 98 |
encoded = tokenizer.encode_as_ids(text)
|
|
|
|
| 99 |
print(f"Encoded: {encoded}")
|
| 100 |
|
| 101 |
decoded = tokenizer.decode(encoded)
|
| 102 |
print(f"Decoded: {decoded}")
|
| 103 |
```
|
| 104 |
|
| 105 |
-
### Full Implementation Example
|
| 106 |
-
|
| 107 |
-
For a complete working example with the model architecture:
|
| 108 |
-
|
| 109 |
-
```python
|
| 110 |
-
# The full model architecture and implementation
|
| 111 |
-
# is available in the repository files.
|
| 112 |
-
# Please refer to the model implementation for complete code.
|
| 113 |
-
```
|
| 114 |
-
|
| 115 |
## Training Details
|
| 116 |
|
| 117 |
### Training Hyperparameters
|
|
@@ -127,15 +165,15 @@ For a complete working example with the model architecture:
|
|
| 127 |
The model was trained on a combination of:
|
| 128 |
1. **OdiaGenAIdata/fine_web2_odia_pt** - High-quality Odia web text
|
| 129 |
2. **bigscience-data/roots_indic-or_indic_nlp_corpus** - Odia corpus from Indic NLP
|
|
|
|
| 130 |
|
| 131 |
-
Total training samples: ~
|
| 132 |
|
| 133 |
## Limitations
|
| 134 |
|
| 135 |
- Maximum context length is 256 tokens
|
| 136 |
- Trained specifically on Odia text, may not perform well on other languages
|
| 137 |
- May generate repetitive text for very long sequences
|
| 138 |
-
- The model requires the custom GPT architecture code to run
|
| 139 |
|
| 140 |
## Intended Use
|
| 141 |
|
|
|
|
| 66 |
filename="config.json"
|
| 67 |
)
|
| 68 |
|
| 69 |
+
# Step 3: Load the model architecture and weights
|
| 70 |
+
# First, download the model architecture file
|
| 71 |
+
architecture_path = hf_hub_download(
|
| 72 |
+
repo_id="rupakrpk93/odia_tokenizers_test",
|
| 73 |
+
filename="model_architecture.py"
|
| 74 |
+
)
|
| 75 |
|
| 76 |
+
# Import the model classes
# NOTE: this executes Python code downloaded from the Hub —
# only run it for repositories you trust
|
| 77 |
+
import sys
|
| 78 |
+
import importlib.util
|
| 79 |
+
spec = importlib.util.spec_from_file_location("model_architecture", architecture_path)
|
| 80 |
+
model_module = importlib.util.module_from_spec(spec)
|
| 81 |
+
sys.modules["model_architecture"] = model_module
|
| 82 |
+
spec.loader.exec_module(model_module)
|
| 83 |
+
|
| 84 |
+
# Import the classes we need
|
| 85 |
+
GPTConfig = model_module.GPTConfig
|
| 86 |
+
GPT = model_module.GPT
|
| 87 |
+
|
| 88 |
+
# Create model configuration
|
| 89 |
+
config = GPTConfig()
|
| 90 |
+
|
| 91 |
+
# Initialize and load the model
|
| 92 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 93 |
+
model = GPT(config)
|
| 94 |
+
|
| 95 |
+
# Load the pretrained weights
|
| 96 |
+
checkpoint = torch.load(model_path, map_location=device)  # NOTE(review): consider weights_only=True for untrusted checkpoints — confirm torch version supports it
|
| 97 |
+
|
| 98 |
+
# Check if the state_dict is nested and extract it if necessary
|
| 99 |
+
if isinstance(checkpoint, dict) and 'model' in checkpoint:
|
| 100 |
+
state_dict = checkpoint['model']
|
| 101 |
+
else:
|
| 102 |
+
state_dict = checkpoint
|
| 103 |
+
|
| 104 |
+
# Remove the 'model.' prefix from keys if present
|
| 105 |
+
from collections import OrderedDict
|
| 106 |
+
new_state_dict = OrderedDict()
|
| 107 |
+
for k, v in state_dict.items():
|
| 108 |
+
if k.startswith('model.'):
|
| 109 |
+
new_state_dict[k[6:]] = v # Remove 'model.' prefix
|
| 110 |
+
else:
|
| 111 |
+
new_state_dict[k] = v
|
| 112 |
+
|
| 113 |
+
model.load_state_dict(new_state_dict)
|
| 114 |
+
|
| 115 |
+
model = model.to(device)
|
| 116 |
+
model.eval()
|
| 117 |
+
print(f"Model loaded successfully on {device}")
|
| 118 |
+
|
| 119 |
+
# Step 4: Generate text function
|
| 120 |
+
def generate_odia_text(prompt, max_length=100, temperature=0.8):
|
| 121 |
# Encode the prompt
|
| 122 |
input_ids = tokenizer.encode_as_ids(prompt)
|
| 123 |
+
input_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)
|
| 124 |
|
| 125 |
+
# Generate
|
| 126 |
+
with torch.no_grad():
|
| 127 |
+
output = model.generate(input_tensor, max_length, temperature=temperature)
|
| 128 |
|
| 129 |
# Decode the output
|
| 130 |
+
generated_text = tokenizer.decode(output.squeeze().tolist())
|
| 131 |
+
return generated_text
|
|
|
|
| 132 |
```
|
| 133 |
|
| 134 |
### Example Usage
|
| 135 |
|
| 136 |
```python
|
| 137 |
# Example 1: Simple text generation
|
| 138 |
+
prompt = "ସେ କାଲି ସ୍କୁଲକୁ"
|
| 139 |
+
generated_text = generate_odia_text(prompt, max_length=200)
|
| 140 |
+
print(f"Prompt: {prompt}")
|
| 141 |
+
print(f"Generated: {generated_text}")
|
| 142 |
|
| 143 |
# Example 2: Encode and decode text
|
| 144 |
text = "ଓଡିଆ ଭାଷା ଏକ ସୁନ୍ଦର ଭାଷା"
|
| 145 |
encoded = tokenizer.encode_as_ids(text)
|
| 146 |
+
print(f"Original: {text}")
|
| 147 |
print(f"Encoded: {encoded}")
|
| 148 |
|
| 149 |
decoded = tokenizer.decode(encoded)
|
| 150 |
print(f"Decoded: {decoded}")
|
| 151 |
```
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
## Training Details
|
| 154 |
|
| 155 |
### Training Hyperparameters
|
|
|
|
| 165 |
The model was trained on a combination of:
|
| 166 |
1. **OdiaGenAIdata/fine_web2_odia_pt** - High-quality Odia web text
|
| 167 |
2. **bigscience-data/roots_indic-or_indic_nlp_corpus** - Odia corpus from Indic NLP
|
| 168 |
+
3. **Custom curated Odia dataset** - Additional hand-curated Odia texts
|
| 169 |
|
| 170 |
+
Total training samples: approximately 4 million texts
|
| 171 |
|
| 172 |
## Limitations
|
| 173 |
|
| 174 |
- Maximum context length is 256 tokens
|
| 175 |
- Trained specifically on Odia text, may not perform well on other languages
|
| 176 |
- May generate repetitive text for very long sequences
|
|
|
|
| 177 |
|
| 178 |
## Intended Use
|
| 179 |
|