import gradio as gr
import os
import sys

# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from detect_sync import load_model, detect_offset
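# detect_sync is this repo's inference module: load_model() restores the
# trained FCN checkpoint and detect_offset() runs the sync search on a video.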

# Initialize model
print("Loading FCN-SyncNet model...")
fcn_model = load_model("checkpoints/syncnet_fcn_epoch2.pth")
print("Model loaded successfully!")

def analyze_video(video_file):
    """
    Analyze a video file for audio-video synchronization
    
    Args:
        video_file: Uploaded video file path
        
    Returns:
        str: Analysis results
    """
    try:
        if video_file is None:
            return "❌ Please upload a video file"
        
        print(f"Processing video: {video_file}")
        
        # Detect offset
        result = detect_offset(fcn_model, video_file, verbose=True)
        
        offset = result['offset_frames']
        conf = result['confidence']
        proc_time = result['processing_time']
        
        # Interpret results
        if offset > 0:
            sync_status = f"πŸ”Š Audio leads video by {offset} frames"
            description = "Audio is playing before the corresponding video frames"
        elif offset < 0:
            sync_status = f"🎬 Video leads audio by {abs(offset)} frames"
            description = "Video is playing before the corresponding audio"
        else:
            sync_status = "βœ… Audio and video are synchronized"
            description = "No offset detected at frame-level resolution"
        
        # Confidence interpretation
        if conf > 0.8:
            conf_text = "Very High"
            conf_emoji = "🟒"
        elif conf > 0.6:
            conf_text = "High"
            conf_emoji = "🟑"
        elif conf > 0.4:
            conf_text = "Medium"
            conf_emoji = "🟠"
        else:
            conf_text = "Low"
            conf_emoji = "πŸ”΄"
        
        result_text = f"""
## πŸ“Š Sync Detection Results

### {sync_status}

**Description:** {description}

---

### πŸ“ˆ Detailed Metrics

- **Offset:** {offset} frames
- **Confidence:** {conf_emoji} {conf:.2%} ({conf_text})
- **Processing Time:** {proc_time:.2f}s

---

### πŸ’‘ Interpretation

- **Positive offset:** Audio is ahead of video (the video track lags behind)
- **Negative offset:** Video is ahead of audio (the audio track lags behind)
- **Zero offset:** No offset detected at frame resolution

---

### ⚑ Model Info

- **Model:** FCN-SyncNet (Calibrated)
- **Processing:** ~3x faster than original SyncNet
- **Calibration:** Applied (offset=3, scale=-0.5, baseline=-15)
        """
        
        return result_text
        
    except Exception as e:
        return f"❌ Error processing video: {str(e)}\n\nPlease ensure the video has both audio and video tracks."
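
# Illustrative sketch only (an assumption, not the repo's actual code): the
# "correlation method" described in the About text below compares audio and
# visual embeddings at every candidate shift and keeps the shift with the
# highest mean cosine similarity. The function and argument names here are
# hypothetical; the real logic lives in detect_sync.detect_offset().
def _correlation_offset_sketch(audio_emb, video_emb, max_shift=15):
    """audio_emb, video_emb: (T, D) numpy arrays of per-frame embeddings."""
    import numpy as np
    best_shift, best_score = 0, float("-inf")
    for shift in range(-max_shift, max_shift + 1):
        # Overlap the two streams under this candidate shift.
        a = audio_emb[max(0, shift):]
        v = video_emb[max(0, -shift):]
        n = min(len(a), len(v))
        if n == 0:
            continue
        a, v = a[:n], v[:n]
        # Mean cosine similarity over the overlapping frames.
        num = (a * v).sum(axis=1)
        den = np.linalg.norm(a, axis=1) * np.linalg.norm(v, axis=1) + 1e-8
        score = float((num / den).mean())
        if score > best_score:
            best_shift, best_score = shift, score
    # detect_offset() additionally applies a linear calibration; the UI text
    # quotes its constants (offset=3, scale=-0.5, baseline=-15) but the exact
    # formula is not reproduced here.
    return best_shift, best_score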

# Create Gradio interface
with gr.Blocks(title="FCN-SyncNet: Audio-Video Sync Detection", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎬 FCN-SyncNet: Real-Time Audio-Visual Synchronization Detection
    
    Upload a video to detect audio-video synchronization offset. This model uses a Fully Convolutional Network (FCN) 
    for fast and accurate sync detection.
    
    ### How it works:
    1. Upload a video file (MP4, AVI, MOV, etc.)
    2. The model extracts audio-visual features
    3. Correlation analysis detects the offset
    4. Calibration ensures accurate results
    
    ### Performance:
    - **Speed:** ~3x faster than original SyncNet
    - **Accuracy:** Matches original SyncNet performance
    - **Real-time capable:** Can process HLS streams
    """)
    
    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Upload Video")
            analyze_btn = gr.Button("πŸ” Analyze Sync", variant="primary", size="lg")
        
        with gr.Column():
            output_text = gr.Markdown(label="Results")
    
    analyze_btn.click(
        fn=analyze_video,
        inputs=video_input,
        outputs=output_text
    )
    
    gr.Markdown("""
    ---
    
    ## πŸ“š About
    
    This project implements a **Fully Convolutional Network (FCN)** approach to audio-visual synchronization detection,
    built upon the original SyncNet architecture.
    
    ### Key Features:
    - βœ… **3x faster** than original SyncNet
    - βœ… **Calibrated output** corrects regression-to-mean bias
    - βœ… **Real-time capable** for HLS streams
    - βœ… **High accuracy** matches original SyncNet
    
    ### Research Journey:
    - Tried regression: suffered from regression-to-the-mean (predictions collapsed toward the average offset)
    - Tried classification: lost frame-level precision
    - **Solution:** Correlation method combined with a calibration formula
    
    ### GitHub:
    [github.com/R-V-Abhishek/Syncnet_FCN](https://github.com/R-V-Abhishek/Syncnet_FCN)
    
    ---
    
    *Built with ❀️ using Gradio and PyTorch*
    """)

if __name__ == "__main__":
    demo.launch()
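    # demo.launch(share=True) would additionally create a temporary public
    # URL; server_name="0.0.0.0" serves on all interfaces (e.g., in Docker).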