"""Explore the vocabulary of the Salesforce CodeGen tokenizer.

Running this script loads the ``Salesforce/codegen-350M-mono`` tokenizer and
prints a short report: the vocabulary size, a handful of sample entries,
whether some programming-related words exist as single tokens, how a few
example strings are tokenized, an ID -> text lookup, the special tokens, and
a hint on how to export the vocabulary to JSON.
"""

from itertools import islice

from transformers import AutoTokenizer

# Horizontal rule reused for the report header and footer.
RULE = "=" * 80

# Load the tokenizer (which contains the vocabulary)
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")

print(RULE)
print("CODEGEN VOCABULARY EXPLORATION")
print(RULE)

# 1. Vocabulary size
vocab_size = len(tokenizer)
print(f"\n1. Vocabulary Size: {vocab_size:,} tokens")

# 2. Get the vocabulary as a dictionary (token -> id)
vocab = tokenizer.get_vocab()
print(f"\n2. Vocabulary type: {type(vocab)}")

# 3. Show some example tokens
print("\n3. Sample tokens from vocabulary:")
# islice takes the first 20 entries in iteration order without first copying
# every (token, id) pair of the vocabulary into a throwaway list.
for token, token_id in islice(vocab.items(), 20):
    print(f" ID {token_id:5d}: '{token}'")

# 4. Search for specific tokens
print("\n4. Programming-related tokens:")
search_terms = [
    "length",
    "def",
    "class",
    "function",
    "return",
    "import",
    "for",
    "while",
]
for term in search_terms:
    # Single lookup; compare against None because 0 is a valid token ID.
    token_id = vocab.get(term)
    if token_id is not None:
        print(f" '{term}' -> Token ID: {token_id}")
    else:
        print(f" '{term}' -> NOT found as single token")

# 5. Show how a word gets tokenized
print("\n5. Tokenization examples:")
examples = ["length", "quicksort", "def", "uncommon_variable_name", "print"]
for example in examples:
    tokens = tokenizer.tokenize(example)
    # Special tokens are excluded so the IDs cover only the example text.
    token_ids = tokenizer.encode(example, add_special_tokens=False)
    print(f" '{example}':")
    print(f" Tokens: {tokens}")
    print(f" IDs: {token_ids}")

# 6. Reverse lookup - get token from ID
print("\n6. Reverse lookup (ID -> token):")
interesting_ids = [0, 1, 2, 100, 1000, 5000, 10000]
for token_id in interesting_ids:
    # decode() is given a one-element list so each ID is rendered on its own.
    token = tokenizer.decode([token_id])
    print(f" ID {token_id:5d} -> '{token}'")

# 7. Special tokens
print("\n7. Special tokens:")
print(
    f" BOS (beginning of sequence): {tokenizer.bos_token} "
    f"(ID: {tokenizer.bos_token_id})"
)
print(
    f" EOS (end of sequence): {tokenizer.eos_token} "
    f"(ID: {tokenizer.eos_token_id})"
)
print(f" PAD (padding): {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
print(f" UNK (unknown): {tokenizer.unk_token} (ID: {tokenizer.unk_token_id})")

# 8. Export vocabulary to file (optional)
# Only the instructions are printed; nothing is written to disk here.
print("\n8. Export options:")
print(" To export full vocabulary to JSON:")
print(" import json")
print(" with open('codegen_vocabulary.json', 'w') as f:")
print(" json.dump(vocab, f, indent=2)")

print("\n" + RULE)
print("TIP: The vocabulary is fixed - you cannot add new tokens at inference time!")
print(RULE)