"""Streamlit chat app for Coptic translation and linguistic analysis.

Uses the swiss-ai/Apertus-8B-Instruct model through the HuggingFace
Inference API, plus an optional local Coptic lexicon loaded from TEI XML
(e.g. the Comprehensive Coptic Lexicon) or separated-text formats.
"""

import os
import re
import xml.etree.ElementTree as ET

import streamlit as st
from huggingface_hub import InferenceClient

# Coptic alphabet helper: uppercase letter -> conventional English name.
COPTIC_ALPHABET = {
    'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon',
    'Ⲋ': 'Zeta', 'Ⲏ': 'Eta', 'Ⲑ': 'Theta', 'Ⲓ': 'Iota', 'Ⲕ': 'Kappa',
    'Ⲗ': 'Lambda', 'Ⲙ': 'Mu', 'Ⲛ': 'Nu', 'Ⲝ': 'Xi', 'Ⲟ': 'Omicron',
    'Ⲡ': 'Pi', 'Ⲣ': 'Rho', 'Ⲥ': 'Sigma', 'Ⲧ': 'Tau', 'Ⲩ': 'Upsilon',
    'Ⲫ': 'Phi', 'Ⲭ': 'Chi', 'Ⲯ': 'Psi', 'Ⲱ': 'Omega',
    'Ϣ': 'Shai', 'Ϥ': 'Fai', 'Ϧ': 'Khei', 'Ϩ': 'Hori',
    'Ϫ': 'Gangia', 'Ϭ': 'Shima', 'Ϯ': 'Ti'
}


# Coptic linguistic prompts (will be formatted with target language)
def get_coptic_prompts(target_language):
    """Return a task-name -> prompt-prefix mapping phrased for *target_language*.

    Keys: 'dialect_analysis', 'translation', 'transcription',
    'morphology', 'lexicon_lookup'.
    """
    return {
        'dialect_analysis': f"Analyze the Coptic dialect of this text and identify linguistic features. Respond in {target_language}:",
        'translation': (
            f"You are a professional Coptic translator. Translate the following Coptic text to {target_language}.\n\n"
            "IMPORTANT: Provide ONLY the direct translation. Do not include:\n"
            "- The original Coptic text\n"
            "- Explanations or commentary\n"
            "- Notes about context or meaning\n"
            f"- Any text other than the {target_language} translation\n\n"
            "Coptic text to translate:"
        ),
        'transcription': f"Provide a romanized transcription of this Coptic text. Respond in {target_language}:",
        'morphology': f"Analyze the morphological structure of these Coptic words. Respond in {target_language}:",
        'lexicon_lookup': f"Look up these Coptic words and provide definitions with Greek etymologies. Respond in {target_language}:",
    }


# Lexicon loader
@st.cache_data
def load_coptic_lexicon(file_path=None):
    """Load a Coptic lexicon into a ``{headword: definition}`` dict.

    Supports TEI XML (Comprehensive Coptic Lexicon layout) and plain-text
    files whose columns are separated by TAB, '|', ',' or ';'.  Returns an
    empty dict when the path is missing; parse errors are surfaced via
    ``st.error`` and also yield whatever was collected so far.
    """
    if not file_path or not os.path.exists(file_path):
        return {}

    lexicon = {}
    try:
        # Handle XML format (TEI structure for Comprehensive Coptic Lexicon)
        if file_path.endswith('.xml'):
            tree = ET.parse(file_path)
            root = tree.getroot()
            # TEI namespace used by all element lookups below.
            ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
            entries = root.findall('.//tei:entry', ns)
            for entry in entries[:100]:  # limit to first 100 entries for performance
                coptic_word = ""
                definition = ""

                # Headword: prefer the lemma form, fall back to any form.
                # NOTE: explicit `is None` checks are required here —
                # ElementTree elements with no children are falsy, so the
                # `find(...) or find(...)` idiom can discard a found element.
                form = entry.find('.//tei:form[@type="lemma"]', ns)
                if form is None:
                    form = entry.find('.//tei:form', ns)
                if form is not None:
                    orth = form.find('.//tei:orth', ns)
                    if orth is not None and orth.text:
                        coptic_word = orth.text.strip()

                # Definition: join up to the first two senses.
                definitions = []
                for sense in entry.findall('.//tei:sense', ns)[:2]:
                    def_elem = sense.find('.//tei:def', ns)
                    if def_elem is not None and def_elem.text:
                        definitions.append(def_elem.text.strip())
                if definitions:
                    definition = "; ".join(definitions)

                if coptic_word and definition:
                    # Clean headword but preserve Coptic and Greek Unicode ranges.
                    coptic_word = re.sub(
                        r'[^\u2C80-\u2CFF\u03B0-\u03FF\u1F00-\u1FFF\w\s\-]',
                        '',
                        coptic_word,
                    ).strip()
                    if coptic_word:
                        lexicon[coptic_word] = definition[:200]  # cap definition length
        # Handle text formats
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    # First separator present in the line wins.
                    separator = next(
                        (sep for sep in ['\t', '|', ',', ';'] if sep in line),
                        None,
                    )
                    if separator:
                        parts = line.split(separator, 1)
                        if len(parts) >= 2:
                            lexicon[parts[0].strip()] = parts[1].strip()
    except Exception as e:
        st.error(f"Error loading lexicon: {str(e)}")
    return lexicon


# Language detection and UI
LANGUAGES = {
    'en': 'English',
    'es': 'Español',
    'fr': 'Français',
    'de': 'Deutsch',
    'zh': '中文',
    'ja': '日本語',
    'ar': 'العربية',
    'hi': 'हिन्दी',
    'cop': 'Coptic (ⲘⲉⲧⲢⲉⲙ̀ⲛⲭⲏⲙⲓ)',
    'cop-sa': 'Sahidic Coptic',
    'cop-bo': 'Bohairic Coptic',
}

st.set_page_config(page_title="Apertus Chat", layout="wide")

# Language selector
selected_lang = st.selectbox(
    "Language / Langue / Idioma",
    options=list(LANGUAGES.keys()),
    format_func=lambda x: LANGUAGES[x],
)

# Stays None unless a Coptic language is selected in the sidebar; the chat
# handler below checks this instead of the fragile `'analysis_type' in locals()`.
analysis_type = None

# Sidebar for Coptic tools
with st.sidebar:
    st.header("Coptic Tools")

    # HuggingFace API Token input
    st.subheader("🔑 API Configuration")
    hf_token_input = st.text_input(
        "HuggingFace API Token",
        type="password",
        help="Required for Apertus-8B translation. Get your token at: https://huggingface.co/settings/tokens",
    )
    if hf_token_input:
        st.success("✅ API token configured")
    else:
        st.warning("⚠️ Translation requires an API token")
        st.markdown("[Get your free HF token →](https://huggingface.co/settings/tokens)")
    st.divider()

    # Lexicon file uploader
    st.subheader("📚 Lexicon Upload")
    lexicon_file = st.file_uploader(
        "Upload Coptic Lexicon (optional)",
        type=['txt', 'tsv', 'csv', 'xml'],
        help="Supports: Text (TAB/pipe separated), XML (TEI format), CSV\nNote: Comprehensive lexicon is pre-loaded",
    )

    # Load lexicon
    if lexicon_file:
        try:
            # Check file size (max 20MB)
            file_size = len(lexicon_file.getvalue())
            if file_size > 20 * 1024 * 1024:
                st.error("❌ File too large (max 20MB)")
                coptic_lexicon = {}
            else:
                # Save uploaded file temporarily; `finally` guarantees cleanup
                # even when parsing raises.
                temp_path = f"temp_lexicon.{lexicon_file.name.split('.')[-1]}"
                try:
                    with open(temp_path, "wb") as f:
                        f.write(lexicon_file.getbuffer())
                    coptic_lexicon = load_coptic_lexicon(temp_path)
                finally:
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
                if coptic_lexicon:
                    st.success(f"✅ Loaded {len(coptic_lexicon)} lexicon entries from {lexicon_file.name}")
                else:
                    st.warning("⚠️ File uploaded but no valid entries found")
                    coptic_lexicon = {}
        except Exception as e:
            st.error(f"❌ Error loading file: {str(e)}")
            st.info("💡 Supported formats: Plain text (TAB/pipe separated), XML (TEI), CSV")
            coptic_lexicon = {}
    else:
        # Try to load the comprehensive lexicon if available
        comprehensive_lexicon_path = "Comprehensive_Coptic_Lexicon-v1.2-2020.xml"
        if os.path.exists(comprehensive_lexicon_path):
            coptic_lexicon = load_coptic_lexicon(comprehensive_lexicon_path)
            if coptic_lexicon:
                st.info(f"📚 Loaded Comprehensive Coptic Lexicon: {len(coptic_lexicon)} entries")
            else:
                coptic_lexicon = {}
        else:
            coptic_lexicon = {}

    # Coptic alphabet reference
    # FIX: st.expander must be used as a context manager; the original
    # `if st.expander(...)` rendered nothing inside the expander.
    with st.expander("Coptic Alphabet"):
        for letter, name in COPTIC_ALPHABET.items():
            st.text(f"{letter} - {name}")

    # Lexicon search
    if coptic_lexicon:
        st.subheader("Lexicon Search")

        # Initialize session state for search term
        if "search_term" not in st.session_state:
            st.session_state.search_term = ""

        # Virtual Coptic keyboard
        st.write("**Virtual Keyboard:**")
        coptic_letters = [
            'ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ',
            'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ', 'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ',
            'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ',
            'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ',
        ]

        # Keyboard layout: four rows of eight columns; clicking a key
        # appends the letter to the session-state search term.
        keyboard_rows = [st.columns(8) for _ in range(4)]
        for i, letter in enumerate(coptic_letters):
            if keyboard_rows[i // 8][i % 8].button(letter, key=f"key_{letter}"):
                st.session_state.search_term += letter
                st.rerun()

        # Control buttons
        col_space, col_back, col_clear = st.columns(3)
        with col_space:
            if st.button("Space"):
                st.session_state.search_term += " "
                st.rerun()
        with col_back:
            if st.button("⌫ Backspace"):
                st.session_state.search_term = st.session_state.search_term[:-1]
                st.rerun()
        with col_clear:
            if st.button("Clear"):
                st.session_state.search_term = ""
                st.rerun()

        # Search input - directly use session state WITHOUT key parameter to avoid conflicts
        search_term = st.text_input("Search Coptic word:", value=st.session_state.search_term)

        # Update session state if user types directly
        if search_term != st.session_state.search_term:
            st.session_state.search_term = search_term

        if search_term:
            if search_term in coptic_lexicon:
                st.write(f"**{search_term}**")
                st.write(coptic_lexicon[search_term])
            else:
                # Partial matches
                matches = [k for k in coptic_lexicon.keys() if search_term in k]
                if matches:
                    st.write("Partial matches:")
                    for match in matches[:5]:  # Show first 5 matches
                        st.write(f"**{match}** → {coptic_lexicon[match][:100]}...")
                else:
                    st.write("No matches found")

    # Linguistic analysis options for Coptic input
    if selected_lang in ['cop', 'cop-sa', 'cop-bo']:
        st.subheader("Analysis Type")
        analysis_type = st.selectbox(
            "Choose analysis:",
            options=['translation', 'dialect_analysis', 'transcription', 'morphology', 'lexicon_lookup'],
            format_func=lambda x: x.replace('_', ' ').title(),
        )

        # Target language selector for translation
        if analysis_type == 'translation':
            st.subheader("Target Language")
            target_lang = st.selectbox(
                "Translate to:",
                options=[k for k in LANGUAGES.keys() if k not in ['cop', 'cop-sa', 'cop-bo']],
                format_func=lambda x: LANGUAGES[x],
                index=0,  # Default to English
            )
            target_language_name = LANGUAGES[target_lang]
        else:
            # For non-translation tasks, use English as default output language
            target_language_name = "English"
    else:
        # For non-translation tasks, use English as default output language
        target_language_name = "English"

# Get prompts for the target language
COPTIC_PROMPTS = get_coptic_prompts(target_language_name)

# Use HuggingFace Inference API instead of loading model locally
# This is much faster and doesn't require GPU
MODEL_NAME = "swiss-ai/Apertus-8B-Instruct-2509"


def get_inference_client(token=None):
    """Initialize HuggingFace Inference API client with provided token.

    Falls back to the Space secret ``HF_TOKEN`` when no token is given;
    returns ``None`` when no token is available or initialization fails.
    """
    try:
        if token:
            return InferenceClient(token=token)
        # Try to get token from Space secrets as fallback
        if hasattr(st, 'secrets') and 'HF_TOKEN' in st.secrets:
            return InferenceClient(token=st.secrets['HF_TOKEN'])
        return None
    except Exception as e:
        st.error(f"Error initializing inference client: {e}")
        return None


# Chat interface
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# User input
if prompt := st.chat_input("Type your message..."):
    # Check if API token is available
    if not hf_token_input:
        st.error("⚠️ Please enter your HuggingFace API token in the sidebar to use translation.")
        st.stop()

    # Initialize inference client with user token
    inference_client = get_inference_client(hf_token_input)
    if not inference_client:
        st.error("❌ Failed to initialize inference client. Please check your API token.")
        st.stop()

    # Add Coptic-specific prompt prefix if applicable
    if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and analysis_type is not None:
        full_prompt = f"{COPTIC_PROMPTS[analysis_type]} {prompt}"

        # Add lexicon context for lexicon lookup
        if analysis_type == 'lexicon_lookup' and coptic_lexicon:
            lexicon_matches = [
                f"{word} = {coptic_lexicon[word]}"
                for word in prompt.split()
                if word in coptic_lexicon
            ]
            if lexicon_matches:
                full_prompt += f"\n\nLexicon entries found: {'; '.join(lexicon_matches)}"
    else:
        full_prompt = prompt

    st.session_state.messages.append({"role": "user", "content": full_prompt})
    with st.chat_message("user"):
        st.markdown(full_prompt)

    # Generate response using HuggingFace Inference API
    with st.chat_message("assistant"):
        try:
            with st.spinner("🤖 Generating response..."):
                # Prepare messages with system instruction for better control
                if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and analysis_type == 'translation':
                    # For translation: strict system message
                    messages = [
                        {"role": "system", "content": "You are a professional Coptic-to-modern-language translator. Provide only direct translations without explanations, commentary, or repeating the source text."},
                        {"role": "user", "content": full_prompt},
                    ]
                else:
                    # For other tasks: standard chat
                    messages = [{"role": "user", "content": full_prompt}]

                response_stream = inference_client.chat_completion(
                    model=MODEL_NAME,
                    messages=messages,
                    max_tokens=512,
                    temperature=0.5,  # Lower temperature for more focused translations
                    top_p=0.9,
                    stream=True,
                )

                # Stream the response (chunk renamed so it doesn't shadow the
                # chat-history loop variable `message` above).
                response_placeholder = st.empty()
                full_response = ""
                for chunk in response_stream:
                    delta = chunk.choices[0].delta.content
                    if delta:
                        full_response += delta
                        response_placeholder.markdown(full_response + "▌")
                response_placeholder.markdown(full_response)

            st.session_state.messages.append({"role": "assistant", "content": full_response})
        except Exception as e:
            st.error(f"❌ Error generating response: {str(e)}")
            st.info("💡 Please verify your API token is valid and has not expired.")