Upload 8 files

Browse files

Files changed (8) hide show

config.json +77 -0
model.safetensors +3 -0
readme.md +138 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +59 -0
training_args.bin +3 -0
vocab.txt +0 -0

config.json ADDED Viewed

	@@ -0,0 +1,77 @@

+{
+  "architectures": [
+    "BertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "eos_token_ids": 0,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "O",
+    "1": "B-PER",
+    "2": "I-PER",
+    "3": "B-ORG",
+    "4": "I-ORG",
+    "5": "B-LOC",
+    "6": "I-LOC",
+    "7": "B-GPE",
+    "8": "I-GPE",
+    "9": "B-PROD",
+    "10": "I-PROD",
+    "11": "B-TITLE",
+    "12": "I-TITLE",
+    "13": "B-EVENT",
+    "14": "I-EVENT",
+    "15": "B-DATE",
+    "16": "I-DATE",
+    "17": "B-TIME",
+    "18": "I-TIME",
+    "19": "B-MONEY",
+    "20": "I-MONEY",
+    "21": "B-PERCENT",
+    "22": "I-PERCENT"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "B-DATE": 15,
+    "B-EVENT": 13,
+    "B-GPE": 7,
+    "B-LOC": 5,
+    "B-MONEY": 19,
+    "B-ORG": 3,
+    "B-PER": 1,
+    "B-PERCENT": 21,
+    "B-PROD": 9,
+    "B-TIME": 17,
+    "B-TITLE": 11,
+    "I-DATE": 16,
+    "I-EVENT": 14,
+    "I-GPE": 8,
+    "I-LOC": 6,
+    "I-MONEY": 20,
+    "I-ORG": 4,
+    "I-PER": 2,
+    "I-PERCENT": 22,
+    "I-PROD": 10,
+    "I-TIME": 18,
+    "I-TITLE": 12,
+    "O": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.57.3",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 50000
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fac4ed3fbb3bbf7a4684f46c54d16cc9db87b55decefa55f1453c4dfcbb7f5c5
+size 495497116

readme.md ADDED Viewed

	@@ -0,0 +1,138 @@

+# Estonian NER Model - Fine-tuned on Synthetic Government Data
+This model is a domain-adapted version of [tartuNLP/EstBERT_NER_v2](https://huggingface.co/tartuNLP/EstBERT_NER_v2), further fine-tuned on synthetically generated Estonian text focusing on government services and public administration communications.
+## Model Description
+**Base Model:** tartuNLP/EstBERT_NER_v2
+**Language:** Estonian (et)
+**Task:** Token Classification (Named Entity Recognition)
+**Training Data:** Synthetic data generated using Google Gemini-3-pro API
+This model specializes in extracting named entities from Estonian government and public service-related text, including citizen communications with government agencies.
+## Supported Entity Types
+The model recognizes 11 entity types:
+- **PER**: Person names
+- **ORG**: Organizations, companies, government agencies
+- **LOC**: Locations, addresses, streets, buildings
+- **GPE**: Geopolitical entities (cities, counties, countries)
+- **PROD**: Products
+- **TITLE**: Titles, positions
+- **EVENT**: Events
+- **DATE**: Dates
+- **TIME**: Time expressions
+- **MONEY**: Monetary values
+- **PERCENT**: Percentages
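+Each entity type is annotated with BIO tagging (`B-` marks the first token of an entity, `I-` marks the following tokens inside the same entity, and `O` marks tokens outside any entity).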
+## Training Data
+The model was fine-tuned on synthetically generated data created specifically for Estonian government and public service domains. The synthetic dataset includes:
+- **Generation Method**: Google Gemini-3-pro API with structured prompts
+- **Domain Coverage**: 22+ Estonian government agencies including Töötukassa (Unemployment Insurance Fund), Maksu- ja Tolliamet (Tax and Customs Board), Politsei- ja Piirivalveamet (Police and Border Guard Board), and others
+- **Topics**: Various government services like unemployment benefits, tax declarations, social insurance, permits, registrations, etc.
+- **Style Diversity**: Multiple writing styles (formal, casual, shorthand, mixed) to improve robustness
+### Why Synthetic Data?
+Synthetic data generation allowed us to:
+1. Create domain-specific training examples for government services
+2. Ensure comprehensive coverage of Estonian public sector terminology
+3. Include diverse writing styles found in citizen-government communications
+4. Control entity distribution and annotation quality
+## Training Details
+- **Base Model**: tartuNLP/EstBERT_NER_v2
+- **Training Epochs**: 10
+- **Batch Size**: 16
+- **Learning Rate**: 5e-5
+- **Max Sequence Length**: 512 tokens
+- **Optimizer**: AdamW (weight decay: 0.01)
+- **Training Framework**: Hugging Face Transformers + PyTorch
+## Usage
+```python
+from transformers import BertTokenizerFast, BertForTokenClassification
+from transformers import pipeline
+# Load model and tokenizer (replace {model_name} with the name of this model repository)
+tokenizer = BertTokenizerFast.from_pretrained('buerokratt/{model_name}')
+model = BertForTokenClassification.from_pretrained('buerokratt/{model_name}')
+# Create NER pipeline
+nlp = pipeline("ner", model=model, tokenizer=tokenizer)
+# Example text (replace the empty string with the Estonian text to analyze)
+text = ""
+# Get predictions
+ner_results = nlp(text)
+for entity in ner_results:
+    print(f"{entity['word']}: {entity['entity']}")
+```
+### Overall Metrics
+| Metric | Score |
+|--------|-------|
+| **Micro F1-Score** | 0.8544 |
+| **Macro F1-Score** | 0.8561 |
+| **Micro Precision** | 0.8404 |
+| **Micro Recall** | 0.8689 |
+### Per-Entity Performance
+| Entity | Precision | Recall | F1-Score |
+|--------|-----------|--------|----------|
+| **GPE** | 0.7778 | 0.7925 | 0.7850 |
+| **LOC** | 0.9796 | 0.9412 | 0.9600 |
+| **ORG** | 0.7778 | 0.8077 | 0.7925 |
+| **PER** | 0.8393 | 0.9400 | 0.8868 |
+## Intended Use
+This model is optimized for:
+- Processing Estonian government service inquiries
+- Extracting entities from citizen communications
+- Analyzing public administration texts
+- Information extraction from Estonian bureaucratic documents
+## Limitations
+- **Domain Specificity**: Optimized for government/public service text; may underperform on other domains
+- **Synthetic Training Data**: While diverse, synthetic data may not capture all real-world linguistic variations
+- **Base Model Limitations**: Inherits limitations from EstBERT_NER_v2
+## Citation
+If you use this model, please cite the base EstBERT_NER model:
+```bibtex
+@misc{tanvir2020estbert,
+      title={EstBERT: A Pretrained Language-Specific BERT for Estonian},
+      author={Hasan Tanvir and Claudia Kittask and Kairit Sirts},
+      year={2020},
+      eprint={2011.04784},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+## Acknowledgments
+- **Base Model**: [tartuNLP/EstBERT_NER_v2](https://huggingface.co/tartuNLP/EstBERT_NER_v2) by the NLP research group at the University of Tartu
+- **Synthetic Data Generation**: Google Gemini-3-pro API
+- **Training Framework**: Hugging Face Transformers

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "full_tokenizer_file": null,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5cf00e8b082841964594b63f61be358d3d2e9b0aff969c52544197526d8f5c95
+size 5368

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff