Trouter-Library committed
Commit 788f4ae · verified · 1 Parent(s): 8e6e44b

Create benchmark.py

Files changed (1): benchmark.py (+464, -0)
benchmark.py ADDED
@@ -0,0 +1,464 @@
+"""
+Helion-OSC Comprehensive Benchmark Suite
+Performance benchmarking and comparison with other models
+"""
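+# Example invocation (illustrative only; the flags below are defined in main()
+# and the file name follows this commit's "Create benchmark.py"):
+#   python benchmark.py --model DeepXR/Helion-OSC --output-dir ./benchmark_results --plot
+#   python benchmark.py --compare DeepXR/Helion-OSC <another-model-id>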
+
+import torch
+import time
+import psutil
+import numpy as np
+from typing import Dict, List, Any, Optional
+from dataclasses import dataclass, asdict
+import json
+import logging
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from tqdm import tqdm
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class BenchmarkResult:
+    """Single benchmark result"""
+    model_name: str
+    task: str
+    prompt_length: int
+    generation_length: int
+    temperature: float
+    inference_time: float
+    tokens_per_second: float
+    memory_used_mb: float
+    gpu_memory_mb: Optional[float]
+    success: bool
+    error: Optional[str] = None
+
+
+@dataclass
+class AggregatedResults:
+    """Aggregated benchmark results"""
+    model_name: str
+    total_tests: int
+    successful_tests: int
+    failed_tests: int
+    avg_inference_time: float
+    avg_tokens_per_second: float
+    avg_memory_mb: float
+    min_inference_time: float
+    max_inference_time: float
+    std_inference_time: float
+
+
+class PerformanceBenchmark:
+    """Performance benchmarking utilities"""
+
+    def __init__(self, model_name: str = "DeepXR/Helion-OSC"):
+        self.model_name = model_name
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        logger.info(f"Loading model: {model_name}")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
+            device_map="auto" if self.device == "cuda" else None
+        )
+
+        if self.device == "cpu":
+            self.model = self.model.to(self.device)
+
+        self.model.eval()
+        self.results: List[BenchmarkResult] = []
+
+    def get_memory_usage(self) -> tuple:
+        """Get current memory usage"""
+        process = psutil.Process()
+        ram_mb = process.memory_info().rss / 1024 / 1024
+
+        gpu_mb = None
+        if torch.cuda.is_available():
+            gpu_mb = torch.cuda.memory_allocated() / 1024 / 1024
+
+        return ram_mb, gpu_mb
+
+    def benchmark_inference(
+        self,
+        prompt: str,
+        task: str,
+        max_length: int = 512,
+        temperature: float = 0.7,
+        num_runs: int = 1
+    ) -> List[BenchmarkResult]:
+        """Benchmark inference performance"""
+        run_results = []
+
+        for run in range(num_runs):
+            try:
+                # Tokenize
+                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+                prompt_length = inputs.input_ids.shape[1]
+
+                # Warm up GPU
+                if run == 0 and self.device == "cuda":
+                    with torch.no_grad():
+                        _ = self.model.generate(**inputs, max_length=prompt_length + 10)
+                    torch.cuda.synchronize()
+
+                # Clear cache
+                if self.device == "cuda":
+                    torch.cuda.empty_cache()
+
+                # Measure memory before
+                ram_before, gpu_before = self.get_memory_usage()
+
+                # Generate
+                start_time = time.time()
+
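+                # Note: generate()'s max_length caps the total sequence length
+                # (prompt + new tokens), so a long prompt leaves fewer new tokens.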
+                with torch.no_grad():
+                    outputs = self.model.generate(
+                        **inputs,
+                        max_length=max_length,
+                        temperature=temperature,
+                        do_sample=temperature > 0,
+                        pad_token_id=self.tokenizer.eos_token_id
+                    )
+
+                if self.device == "cuda":
+                    torch.cuda.synchronize()
+
+                end_time = time.time()
+
+                # Measure memory after
+                ram_after, gpu_after = self.get_memory_usage()
+
+                # Calculate metrics
+                inference_time = end_time - start_time
+                generation_length = outputs.shape[1] - prompt_length
+                tokens_per_second = generation_length / inference_time if inference_time > 0 else 0
+                memory_used = ram_after - ram_before
+                gpu_memory = (gpu_after - gpu_before) if gpu_after and gpu_before else None
+
+                result = BenchmarkResult(
+                    model_name=self.model_name,
+                    task=task,
+                    prompt_length=prompt_length,
+                    generation_length=generation_length,
+                    temperature=temperature,
+                    inference_time=inference_time,
+                    tokens_per_second=tokens_per_second,
+                    memory_used_mb=memory_used,
+                    gpu_memory_mb=gpu_memory,
+                    success=True
+                )
+
+                run_results.append(result)
+                self.results.append(result)
+
+            except Exception as e:
+                logger.error(f"Benchmark failed: {e}")
+                result = BenchmarkResult(
+                    model_name=self.model_name,
+                    task=task,
+                    prompt_length=0,
+                    generation_length=0,
+                    temperature=temperature,
+                    inference_time=0,
+                    tokens_per_second=0,
+                    memory_used_mb=0,
+                    gpu_memory_mb=None,
+                    success=False,
+                    error=str(e)
+                )
+                run_results.append(result)
+                self.results.append(result)
+
+        return run_results
+
+    def run_benchmark_suite(self) -> List[BenchmarkResult]:
+        """Run comprehensive benchmark suite"""
+        logger.info("Starting comprehensive benchmark suite...")
+
+        test_cases = [
+            {
+                "prompt": "def fibonacci(n):",
+                "task": "simple_function",
+                "max_length": 256,
+                "temperature": 0.7
+            },
+            {
+                "prompt": "Write a Python class for a binary search tree with insert, search, and delete methods:",
+                "task": "complex_class",
+                "max_length": 1024,
+                "temperature": 0.7
+            },
+            {
+                "prompt": "Implement quicksort algorithm in Python with detailed comments:",
+                "task": "algorithm",
+                "max_length": 512,
+                "temperature": 0.5
+            },
+            {
+                "prompt": "Solve: What is the derivative of f(x) = x^3 + 2x^2 - 5x + 3?",
+                "task": "math_simple",
+                "max_length": 256,
+                "temperature": 0.3
+            },
+            {
+                "prompt": "Prove using mathematical induction that the sum of first n natural numbers is n(n+1)/2:",
+                "task": "math_proof",
+                "max_length": 1024,
+                "temperature": 0.2
+            },
+            {
+                "prompt": "Design a RESTful API for a todo list application with proper documentation:",
+                "task": "system_design",
+                "max_length": 2048,
+                "temperature": 0.7
+            },
+        ]
+
+        all_results = []
+
+        for test_case in tqdm(test_cases, desc="Running benchmarks"):
+            results = self.benchmark_inference(
+                prompt=test_case["prompt"],
+                task=test_case["task"],
+                max_length=test_case["max_length"],
+                temperature=test_case["temperature"],
+                num_runs=3
+            )
+            all_results.extend(results)
+
+        logger.info("Benchmark suite completed!")
+        return all_results
+
+    def aggregate_results(self) -> AggregatedResults:
+        """Aggregate benchmark results"""
+        if not self.results:
+            raise ValueError("No benchmark results available")
+
+        successful = [r for r in self.results if r.success]
+
+        if not successful:
+            raise ValueError("No successful benchmark runs")
+
+        inference_times = [r.inference_time for r in successful]
+        tokens_per_sec = [r.tokens_per_second for r in successful]
+        memory_usage = [r.memory_used_mb for r in successful]
+
+        return AggregatedResults(
+            model_name=self.model_name,
+            total_tests=len(self.results),
+            successful_tests=len(successful),
+            failed_tests=len(self.results) - len(successful),
+            avg_inference_time=np.mean(inference_times),
+            avg_tokens_per_second=np.mean(tokens_per_sec),
+            avg_memory_mb=np.mean(memory_usage),
+            min_inference_time=np.min(inference_times),
+            max_inference_time=np.max(inference_times),
+            std_inference_time=np.std(inference_times)
+        )
+
+    def save_results(self, output_file: str = "benchmark_results.json"):
+        """Save benchmark results to file"""
+        results_dict = [asdict(r) for r in self.results]
+
+        with open(output_file, 'w') as f:
+            json.dump(results_dict, f, indent=2)
+
+        logger.info(f"Results saved to {output_file}")
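+        # The saved JSON (a list of result records) can be reloaded later with
+        # json.load() or pandas.read_json() for further analysis.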
+
+    def generate_report(self, output_file: str = "benchmark_report.txt"):
+        """Generate human-readable benchmark report"""
+        agg = self.aggregate_results()
+
+        report = f"""
+{'='*80}
+HELION-OSC BENCHMARK REPORT
+{'='*80}
+
+Model: {agg.model_name}
+Device: {self.device}
+
+OVERALL STATISTICS
+{'='*80}
+Total Tests: {agg.total_tests}
+Successful: {agg.successful_tests}
+Failed: {agg.failed_tests}
+Success Rate: {(agg.successful_tests/agg.total_tests)*100:.2f}%
+
+PERFORMANCE METRICS
+{'='*80}
+Average Inference Time: {agg.avg_inference_time:.4f} seconds
+Min Inference Time: {agg.min_inference_time:.4f} seconds
+Max Inference Time: {agg.max_inference_time:.4f} seconds
+Std Inference Time: {agg.std_inference_time:.4f} seconds
+
+Average Tokens/Second: {agg.avg_tokens_per_second:.2f}
+Average Memory Usage: {agg.avg_memory_mb:.2f} MB
+
+PER-TASK BREAKDOWN
+{'='*80}
+"""
+
+        # Group by task
+        df = pd.DataFrame([asdict(r) for r in self.results if r.success])
+        if not df.empty:
+            task_stats = df.groupby('task').agg({
+                'inference_time': ['mean', 'min', 'max'],
+                'tokens_per_second': 'mean',
+                'memory_used_mb': 'mean'
+            })
+
+            report += task_stats.to_string()
+
+        report += f"\n\n{'='*80}\n"
+
+        with open(output_file, 'w') as f:
+            f.write(report)
+
+        logger.info(f"Report saved to {output_file}")
+        print(report)
+
+    def plot_results(self, output_dir: str = "./benchmark_plots"):
+        """Generate visualization plots"""
+        import os
+        os.makedirs(output_dir, exist_ok=True)
+
+        df = pd.DataFrame([asdict(r) for r in self.results if r.success])
+
+        if df.empty:
+            logger.warning("No data to plot")
+            return
+
+        # Set style
+        sns.set_style("whitegrid")
+
+        # Plot 1: Inference time by task
+        plt.figure(figsize=(12, 6))
+        sns.barplot(data=df, x='task', y='inference_time')
+        plt.xticks(rotation=45, ha='right')
+        plt.title('Inference Time by Task')
+        plt.ylabel('Time (seconds)')
+        plt.tight_layout()
+        plt.savefig(f"{output_dir}/inference_time_by_task.png", dpi=300)
+        plt.close()
+
+        # Plot 2: Tokens per second by task
+        plt.figure(figsize=(12, 6))
+        sns.barplot(data=df, x='task', y='tokens_per_second')
+        plt.xticks(rotation=45, ha='right')
+        plt.title('Tokens Per Second by Task')
+        plt.ylabel('Tokens/Second')
+        plt.tight_layout()
+        plt.savefig(f"{output_dir}/tokens_per_second_by_task.png", dpi=300)
+        plt.close()
+
+        # Plot 3: Memory usage by task
+        plt.figure(figsize=(12, 6))
+        sns.barplot(data=df, x='task', y='memory_used_mb')
+        plt.xticks(rotation=45, ha='right')
+        plt.title('Memory Usage by Task')
+        plt.ylabel('Memory (MB)')
+        plt.tight_layout()
+        plt.savefig(f"{output_dir}/memory_usage_by_task.png", dpi=300)
+        plt.close()
+
+        # Plot 4: Scatter plot - generation length vs inference time
+        plt.figure(figsize=(10, 6))
+        sns.scatterplot(data=df, x='generation_length', y='inference_time', hue='task', s=100)
+        plt.title('Generation Length vs Inference Time')
+        plt.xlabel('Generation Length (tokens)')
+        plt.ylabel('Inference Time (seconds)')
+        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+        plt.tight_layout()
+        plt.savefig(f"{output_dir}/length_vs_time.png", dpi=300)
+        plt.close()
+
+        logger.info(f"Plots saved to {output_dir}")
+
+
+class ComparisonBenchmark:
+    """Compare multiple models"""
+
+    def __init__(self, model_names: List[str]):
+        self.model_names = model_names
+        self.benchmarks = {}
+
+    def run_comparison(self):
+        """Run benchmarks for all models"""
+        for model_name in self.model_names:
+            logger.info(f"\nBenchmarking {model_name}...")
+            try:
+                benchmark = PerformanceBenchmark(model_name)
+                benchmark.run_benchmark_suite()
+                self.benchmarks[model_name] = benchmark
+            except Exception as e:
+                logger.error(f"Failed to benchmark {model_name}: {e}")
+
+    def generate_comparison_report(self, output_file: str = "comparison_report.txt"):
+        """Generate comparison report"""
+        report = f"""
+{'='*80}
+MODEL COMPARISON REPORT
+{'='*80}
+
+"""
+
+        for model_name, benchmark in self.benchmarks.items():
+            agg = benchmark.aggregate_results()
+            report += f"""
+Model: {model_name}
+{'='*80}
+Avg Inference Time: {agg.avg_inference_time:.4f}s
+Avg Tokens/Second: {agg.avg_tokens_per_second:.2f}
+Avg Memory Usage: {agg.avg_memory_mb:.2f} MB
+Success Rate: {(agg.successful_tests/agg.total_tests)*100:.2f}%
+
+"""
+
+        with open(output_file, 'w') as f:
+            f.write(report)
+
+        print(report)
+        logger.info(f"Comparison report saved to {output_file}")
+
+
+def main():
+    """Main benchmark script"""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Benchmark Helion-OSC model")
+    parser.add_argument("--model", type=str, default="DeepXR/Helion-OSC")
+    parser.add_argument("--output-dir", type=str, default="./benchmark_results")
+    parser.add_argument("--compare", nargs='+', help="List of models to compare")
+    parser.add_argument("--plot", action="store_true", help="Generate plots")
+
+    args = parser.parse_args()
+
+    import os
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    if args.compare:
+        # Comparison mode
+        comparison = ComparisonBenchmark(args.compare)
+        comparison.run_comparison()
+        comparison.generate_comparison_report(
+            os.path.join(args.output_dir, "comparison_report.txt")
+        )
+    else:
+        # Single model benchmark
+        benchmark = PerformanceBenchmark(args.model)
+        benchmark.run_benchmark_suite()
+        benchmark.save_results(os.path.join(args.output_dir, "benchmark_results.json"))
+        benchmark.generate_report(os.path.join(args.output_dir, "benchmark_report.txt"))
+
+        if args.plot:
+            benchmark.plot_results(os.path.join(args.output_dir, "plots"))
+
+
+if __name__ == "__main__":
+    main()