import gradio as gr
import os
import sys

# Add project root to path so detect_sync can be imported
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from detect_sync import load_model, detect_offset

# Initialize the model once at startup
print("Loading FCN-SyncNet model...")
fcn_model = load_model("checkpoints/syncnet_fcn_epoch2.pth")
print("Model loaded successfully!")


def analyze_video(video_file):
    """
    Analyze a video file for audio-video synchronization.

    Args:
        video_file: Path to the uploaded video file

    Returns:
        str: Markdown-formatted analysis results
    """
    try:
        if video_file is None:
            return "❌ Please upload a video file"

        print(f"Processing video: {video_file}")

        # Detect the offset
        result = detect_offset(fcn_model, video_file, verbose=True)
        offset = result['offset_frames']
        conf = result['confidence']
        proc_time = result['processing_time']

        # Interpret the offset sign
        if offset > 0:
            sync_status = f"🔊 Audio leads video by {offset} frames"
            description = "Audio is playing before the corresponding video frames"
        elif offset < 0:
            sync_status = f"🎬 Video leads audio by {abs(offset)} frames"
            description = "Video is playing before the corresponding audio"
        else:
            sync_status = "✅ Audio and video are synchronized"
            description = "Perfect synchronization detected"

        # Map confidence to a qualitative label
        if conf > 0.8:
            conf_text = "Very High"
            conf_emoji = "🟢"
        elif conf > 0.6:
            conf_text = "High"
            conf_emoji = "🟡"
        elif conf > 0.4:
            conf_text = "Medium"
            conf_emoji = "🟠"
        else:
            conf_text = "Low"
            conf_emoji = "🔴"

        result_text = f"""
## 📊 Sync Detection Results

### {sync_status}

**Description:** {description}

---

### 📈 Detailed Metrics
- **Offset:** {offset} frames
- **Confidence:** {conf_emoji} {conf:.2%} ({conf_text})
- **Processing Time:** {proc_time:.2f}s

---

### 💡 Interpretation
- **Positive offset:** Audio is ahead of video (video is delayed)
- **Negative offset:** Video is ahead of audio (audio is delayed)
- **Zero offset:** Perfect synchronization

---

### ⚡ Model Info
- **Model:** FCN-SyncNet (Calibrated)
- **Processing:** ~3x faster than the original SyncNet
- **Calibration:** Applied (offset=3, scale=-0.5, baseline=-15)
"""
        return result_text

    except Exception as e:
        return f"❌ Error processing video: {str(e)}\n\nPlease ensure the video has both audio and video tracks."


# Create the Gradio interface
with gr.Blocks(title="FCN-SyncNet: Audio-Video Sync Detection", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🎬 FCN-SyncNet: Real-Time Audio-Visual Synchronization Detection

Upload a video to detect the audio-video synchronization offset. This model uses a
Fully Convolutional Network (FCN) for fast and accurate sync detection.

### How it works:
1. Upload a video file (MP4, AVI, MOV, etc.)
2. The model extracts audio-visual features
3. Correlation analysis detects the offset
4. Calibration ensures accurate results (see the sketch in the About section below)

### Performance:
- **Speed:** ~3x faster than the original SyncNet
- **Accuracy:** Matches the original SyncNet's performance
- **Real-time capable:** Can process HLS streams
""")

    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Upload Video")
            analyze_btn = gr.Button("🔍 Analyze Sync", variant="primary", size="lg")

        with gr.Column():
            output_text = gr.Markdown(label="Results")

    analyze_btn.click(
        fn=analyze_video,
        inputs=video_input,
        outputs=output_text
    )

    gr.Markdown("""
---
## 📚 About

This project implements a **Fully Convolutional Network (FCN)** approach to
audio-visual synchronization detection, built upon the original SyncNet architecture.
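For intuition, here is a minimal sketch of the correlation-plus-calibration step.
The helper name, the cosine-similarity scoring, and the linear calibration form are
illustrative assumptions; the shipped logic lives in `detect_sync.detect_offset`, and
only the constants (offset=3, scale=-0.5, baseline=-15) come from this app's output:

```python
import numpy as np

CAL_OFFSET, CAL_SCALE, CAL_BASELINE = 3, -0.5, -15  # constants reported by this app

# Hypothetical helper, NOT the shipped implementation. Assumes audio_emb and
# video_emb are NumPy arrays of shape (T, D): equal-length, frame-aligned
# embedding sequences produced by the FCN.
def estimate_offset(audio_emb, video_emb, max_shift=15):
    scores = []
    for shift in range(-max_shift, max_shift + 1):
        if shift >= 0:   # audio leads video by `shift` frames
            a, v = audio_emb[shift:], video_emb[:len(video_emb) - shift]
        else:            # video leads audio
            a, v = audio_emb[:shift], video_emb[-shift:]
        # Mean cosine similarity between time-aligned embedding pairs
        cos = np.sum(a * v, axis=1) / (
            np.linalg.norm(a, axis=1) * np.linalg.norm(v, axis=1))
        scores.append(cos.mean())
    raw = int(np.argmax(scores)) - max_shift
    # Assumed *linear* calibration; the exact formula is internal to detect_sync
    return CAL_OFFSET + CAL_SCALE * (raw - CAL_BASELINE)
```

Scanning candidate shifts and picking the best-correlated one, instead of regressing
the offset directly, is what sidesteps the regression-to-the-mean problem noted below.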
### Key Features:
- ✅ **3x faster** than the original SyncNet
- ✅ **Calibrated output** corrects regression-to-the-mean bias
- ✅ **Real-time capable** for HLS streams
- ✅ **High accuracy**, matching the original SyncNet

### Research Journey:
- Tried direct regression (suffered from regression to the mean)
- Tried classification (lost offset precision)
- **Solution:** correlation over candidate shifts plus a calibration formula (sketched above)

### GitHub:
[github.com/R-V-Abhishek/Syncnet_FCN](https://github.com/R-V-Abhishek/Syncnet_FCN)

---
*Built with ❤️ using Gradio and PyTorch*
""")

if __name__ == "__main__":
    demo.launch()
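# Optional: when deploying, Gradio's standard launch parameters can pin the host
# and port (these are documented Gradio options, not part of the original script):
#   demo.launch(server_name="0.0.0.0", server_port=7860)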