Spaces:
Sleeping
Sleeping
File size: 5,893 Bytes
1d3a5fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
#!/bin/bash
# Cluster connection configuration
CLUSTER_HOST="killarney"
CLUSTER_USER="gsa"
# Job configuration
SCRIPT_NAME="gradio_job.slurm"
APP_PATH="/home/gsa/quick-tokenizer-accuracy/app.py"
JOB_NAME="gradio-app"
PARTITION="l40s"
NODES=1
NTASKS_PER_NODE=1
CPUS_PER_TASK=4
MEM="8G"
TIME="02:00:00"
GRADIO_PORT=7860
ACCOUNT="aip-craffel"
script_location="/project/aip-craffel/gsa/$SCRIPT_NAME"
ENV_PATH="/home/gsa/tokenizers/.venv/bin/activate"
OUTPUT_PATH="/project/aip-craffel/gsa/.slurm"
# Function to cleanup temporary files
cleanup() {
echo "Cleaning up..."
if [ -f "$SCRIPT_NAME" ]; then
rm "$SCRIPT_NAME"
fi
exit 0
}
# Set trap for cleanup on script exit
trap cleanup EXIT INT TERM
# Generate SLURM job script locally
cat > "$SCRIPT_NAME" << EOF
#!/bin/bash
#SBATCH --job-name=$JOB_NAME
#SBATCH --partition=$PARTITION
#SBATCH --nodes=$NODES
#SBATCH --ntasks-per-node=$NTASKS_PER_NODE
#SBATCH --cpus-per-task=$CPUS_PER_TASK
#SBATCH --mem=$MEM
#SBATCH --time=$TIME
#SBATCH --account=$ACCOUNT
#SBATCH --output=$OUTPUT_PATH/%j.out
# Print job info
echo "Job started on node: \$(hostname)"
echo "Job ID: \$SLURM_JOB_ID"
echo "Allocated nodes: \$SLURM_JOB_NODELIST"
echo "Working directory: \$(pwd)"
echo "Starting time: \$(date)"
# Load necessary modules
module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6 python/3.10.13
# Activate virtual environment
source $ENV_PATH
# Set up environment
export GRADIO_SERVER_NAME="0.0.0.0"
export GRADIO_SERVER_PORT=$GRADIO_PORT
# Start Gradio app
echo "Starting Gradio app on port $GRADIO_PORT..."
python $APP_PATH --no-browser
# Keep the job alive
echo "Gradio app finished at: \$(date)"
EOF
echo "Generated SLURM job script: $SCRIPT_NAME"
# Transfer the job script to the cluster and submit it
scp "$SCRIPT_NAME" "$CLUSTER_USER@$CLUSTER_HOST:$script_location"
if [ $? -ne 0 ]; then
echo "Error: Failed to transfer job script to cluster"
exit 1
fi
echo "Submitting job to cluster..."
JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'cd /project && sbatch --parsable $script_location'")
if [ $? -ne 0 ]; then
echo "Error: Failed to submit job to cluster"
exit 1
fi
echo "Job submitted with ID: $JOB_ID"
# Monitor job status from local machine
echo "Monitoring job status from local machine..."
while true; do
JOB_STATUS=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%T\" 2>/dev/null'")
echo "Job status: $JOB_STATUS"
if [ -z "$JOB_STATUS" ]; then
echo "Error: Job $JOB_ID not found. It may have failed to start."
echo "Checking job output..."
ssh "$CLUSTER_USER@$CLUSTER_HOST" "ls -la ${JOB_ID}.* 2>/dev/null && echo 'Output files:' && cat ${JOB_ID}.out ${JOB_ID}.err 2>/dev/null"
exit 1
elif [ "$JOB_STATUS" = "RUNNING" ]; then
echo "Job is now running!"
break
elif [ "$JOB_STATUS" = "PENDING" ]; then
echo "Job is pending... (waiting for resources)"
sleep 5
else
echo "Job status: $JOB_STATUS"
if [[ "$JOB_STATUS" =~ ^(FAILED|CANCELLED|TIMEOUT|COMPLETED)$ ]]; then
echo "Job ended with status: $JOB_STATUS"
echo "Checking job output files..."
ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat ${JOB_ID}.out ${JOB_ID}.err 2>/dev/null"
exit 1
fi
sleep 5
fi
done
# Get the allocated node
NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'")
echo "Job is running on node: $NODE"
# Wait a moment for the Gradio app to start
echo "Waiting for Gradio app to initialize..."
sleep 10
# Check if Gradio is actually running
echo "Checking if Gradio app started successfully..."
# GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "ssh $NODE 'ps aux | grep gradio | grep -v grep' 2>/dev/null")
GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")
# Get NODE locally
NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \
"bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'")
# Check Gradio process on that node
GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \
"bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")
# Handle process check
if [ -n "$GRADIO_CHECK" ]; then
echo "✓ Gradio app appears to be running"
else
echo "⚠ Warning: Gradio app may not have started properly"
echo "Check the job output:"
ssh "$CLUSTER_USER@$CLUSTER_HOST" \
"bash -l -c 'tail ${JOB_ID}.out'"
fi
# Connection info
cat <<EOF
=========================================
Gradio app should be running on:
Cluster: $CLUSTER_HOST
Node: $NODE
Port: $GRADIO_PORT
To access from your local machine:
ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST
Then open: http://localhost:$GRADIO_PORT
Alternative direct SSH with forwarding:
ssh -L $GRADIO_PORT:localhost:$GRADIO_PORT $CLUSTER_USER@$NODE.$CLUSTER_HOST
Check job status:
ssh $CLUSTER_USER@$CLUSTER_HOST 'squeue -j $JOB_ID '
Cancel job:
ssh $CLUSTER_USER@$CLUSTER_HOST 'scancel $JOB_ID '
=========================================
EOF
# Optional port forwarding
read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
echo ""
if [[ $REPLY =~ ^[Yy]$ ]]; then
echo "Setting up port forwarding..."
ssh -L "${GRADIO_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
-t "echo 'Port forwarding active: localhost:${GRADIO_PORT} -> ${NODE}:${GRADIO_PORT}'; bash"
echo ""
echo "Port forwarding ended."
else
echo "Skipping port forwarding."
echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
fi
echo ""
echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
echo "Don't forget to cancel it when done: ssh $CLUSTER_USER@$CLUSTER_HOST 'scancel $JOB_ID'"
|