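"""Gradio demo for FCN-SyncNet audio-video synchronization detection.

Loads a calibrated FCN-SyncNet checkpoint and exposes an upload-and-analyze
interface that reports the detected audio-video offset in frames.
"""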
import gradio as gr
import os
import sys
# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from detect_sync import load_model, detect_offset
# Initialize model
print("Loading FCN-SyncNet model...")
fcn_model = load_model("checkpoints/syncnet_fcn_epoch2.pth")
print("Model loaded successfully!")
def analyze_video(video_file):
"""
Analyze a video file for audio-video synchronization
Args:
video_file: Uploaded video file path
Returns:
str: Analysis results
"""
try:
if video_file is None:
return "β Please upload a video file"
print(f"Processing video: {video_file}")
# Detect offset
result = detect_offset(fcn_model, video_file, verbose=True)
offset = result['offset_frames']
conf = result['confidence']
proc_time = result['processing_time']
# Interpret results
        if offset > 0:
            sync_status = f"🎵 Audio leads video by {offset} frames"
            description = "Audio is playing before the corresponding video frames"
        elif offset < 0:
            sync_status = f"🎬 Video leads audio by {abs(offset)} frames"
            description = "Video is playing before the corresponding audio"
        else:
            sync_status = "✅ Audio and video are synchronized"
            description = "Perfect synchronization detected"
# Confidence interpretation
        if conf > 0.8:
            conf_text = "Very High"
            conf_emoji = "🟢"
        elif conf > 0.6:
            conf_text = "High"
            conf_emoji = "🟡"
        elif conf > 0.4:
            conf_text = "Medium"
            conf_emoji = "🟠"
        else:
            conf_text = "Low"
            conf_emoji = "🔴"
result_text = f"""
## π Sync Detection Results
### {sync_status}
**Description:** {description}
---
### π Detailed Metrics
- **Offset:** {offset} frames
- **Confidence:** {conf_emoji} {conf:.2%} ({conf_text})
- **Processing Time:** {proc_time:.2f}s
---
### π‘ Interpretation
- **Positive offset:** Audio is ahead of video (delayed video sync)
- **Negative offset:** Video is ahead of audio (delayed audio sync)
- **Zero offset:** Perfect synchronization
---
### β‘ Model Info
- **Model:** FCN-SyncNet (Calibrated)
- **Processing:** ~3x faster than original SyncNet
- **Calibration:** Applied (offset=3, scale=-0.5, baseline=-15)
"""
return result_text
except Exception as e:
return f"β Error processing video: {str(e)}\n\nPlease ensure the video has both audio and video tracks."
# Create Gradio interface
with gr.Blocks(title="FCN-SyncNet: Audio-Video Sync Detection", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# π¬ FCN-SyncNet: Real-Time Audio-Visual Synchronization Detection
Upload a video to detect audio-video synchronization offset. This model uses a Fully Convolutional Network (FCN)
for fast and accurate sync detection.
### How it works:
1. Upload a video file (MP4, AVI, MOV, etc.)
2. The model extracts audio-visual features
3. Correlation analysis detects the offset
4. Calibration ensures accurate results
### Performance:
- **Speed:** ~3x faster than original SyncNet
- **Accuracy:** Matches original SyncNet performance
- **Real-time capable:** Can process HLS streams
""")
with gr.Row():
with gr.Column():
video_input = gr.Video(label="Upload Video")
analyze_btn = gr.Button("π Analyze Sync", variant="primary", size="lg")
with gr.Column():
output_text = gr.Markdown(label="Results")
analyze_btn.click(
fn=analyze_video,
inputs=video_input,
outputs=output_text
)
gr.Markdown("""
---
## π About
This project implements a **Fully Convolutional Network (FCN)** approach to audio-visual synchronization detection,
built upon the original SyncNet architecture.
### Key Features:
- β
**3x faster** than original SyncNet
- β
**Calibrated output** corrects regression-to-mean bias
- β
**Real-time capable** for HLS streams
- β
**High accuracy** matches original SyncNet
### Research Journey:
- Tried regression (regression-to-mean problem)
- Tried classification (loss of precision)
- **Solution:** Correlation method + calibration formula
### GitHub:
[github.com/R-V-Abhishek/Syncnet_FCN](https://github.com/R-V-Abhishek/Syncnet_FCN)
---
*Built with β€οΈ using Gradio and PyTorch*
""")
if __name__ == "__main__":
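    # launch() accepts the usual Gradio options, e.g. demo.launch(share=True)
    # for a temporary public link or demo.launch(server_port=7860) to pin the port.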
demo.launch()