Spaces:
Sleeping
Sleeping
Commit
·
e1255d1
1
Parent(s):
b623f54
fix temp? but seems not work?
Browse files- benchmark.py +3 -0
- geo_bot.py +9 -1
- main.py +14 -6
benchmark.py
CHANGED
|
@@ -71,6 +71,7 @@ class MapGuesserBenchmark:
|
|
| 71 |
self,
|
| 72 |
models: Optional[List[str]] = None,
|
| 73 |
max_samples: Optional[int] = None,
|
|
|
|
| 74 |
**kwargs,
|
| 75 |
) -> Dict:
|
| 76 |
if not self.golden_labels:
|
|
@@ -88,6 +89,7 @@ class MapGuesserBenchmark:
|
|
| 88 |
print(f"🚀 Starting LIVE benchmark:")
|
| 89 |
print(f" Models: {models_to_test}")
|
| 90 |
print(f" Samples: {len(test_samples)}")
|
|
|
|
| 91 |
|
| 92 |
all_results = []
|
| 93 |
for model_name in models_to_test:
|
|
@@ -100,6 +102,7 @@ class MapGuesserBenchmark:
|
|
| 100 |
model_name=model_class_name,
|
| 101 |
use_selenium=True,
|
| 102 |
headless=self.headless,
|
|
|
|
| 103 |
) as bot:
|
| 104 |
for i, sample in enumerate(test_samples):
|
| 105 |
print('########################################################')
|
|
|
|
| 71 |
self,
|
| 72 |
models: Optional[List[str]] = None,
|
| 73 |
max_samples: Optional[int] = None,
|
| 74 |
+
temperature: float = 0.0,
|
| 75 |
**kwargs,
|
| 76 |
) -> Dict:
|
| 77 |
if not self.golden_labels:
|
|
|
|
| 89 |
print(f"🚀 Starting LIVE benchmark:")
|
| 90 |
print(f" Models: {models_to_test}")
|
| 91 |
print(f" Samples: {len(test_samples)}")
|
| 92 |
+
print(f" Temperature: {temperature}")
|
| 93 |
|
| 94 |
all_results = []
|
| 95 |
for model_name in models_to_test:
|
|
|
|
| 102 |
model_name=model_class_name,
|
| 103 |
use_selenium=True,
|
| 104 |
headless=self.headless,
|
| 105 |
+
temperature=temperature,
|
| 106 |
) as bot:
|
| 107 |
for i, sample in enumerate(test_samples):
|
| 108 |
print('########################################################')
|
geo_bot.py
CHANGED
|
@@ -63,9 +63,17 @@ class GeoBot:
|
|
| 63 |
model_name: str,
|
| 64 |
use_selenium: bool = True,
|
| 65 |
headless: bool = False,
|
|
|
|
| 66 |
):
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
self.model_name = model_name
|
|
|
|
| 69 |
self.use_selenium = use_selenium
|
| 70 |
self.controller = (
|
| 71 |
MapCrunchController(headless=headless) if use_selenium else None
|
|
|
|
| 63 |
model_name: str,
|
| 64 |
use_selenium: bool = True,
|
| 65 |
headless: bool = False,
|
| 66 |
+
temperature: float = 0.0,
|
| 67 |
):
|
| 68 |
+
# Initialize model with temperature parameter
|
| 69 |
+
model_kwargs = {
|
| 70 |
+
"model": model_name,
|
| 71 |
+
"temperature": temperature,
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
self.model = model(**model_kwargs)
|
| 75 |
self.model_name = model_name
|
| 76 |
+
self.temperature = temperature
|
| 77 |
self.use_selenium = use_selenium
|
| 78 |
self.controller = (
|
| 79 |
MapCrunchController(headless=headless) if use_selenium else None
|
main.py
CHANGED
|
@@ -12,13 +12,13 @@ from benchmark import MapGuesserBenchmark
|
|
| 12 |
from config import MODELS_CONFIG, DATA_PATHS, SUCCESS_THRESHOLD_KM
|
| 13 |
|
| 14 |
|
| 15 |
-
def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
|
| 16 |
"""
|
| 17 |
Runs the AI Agent in a benchmark loop over multiple samples,
|
| 18 |
using multi-step exploration for each.
|
| 19 |
"""
|
| 20 |
print(
|
| 21 |
-
f"Starting Agent Mode (as a benchmark): model={model_name}, steps={steps}, samples={samples}"
|
| 22 |
)
|
| 23 |
|
| 24 |
try:
|
|
@@ -44,7 +44,7 @@ def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
|
|
| 44 |
all_results = []
|
| 45 |
|
| 46 |
with GeoBot(
|
| 47 |
-
model=model_class, model_name=model_instance_name, headless=headless
|
| 48 |
) as bot:
|
| 49 |
for i, sample in enumerate(test_samples):
|
| 50 |
print(
|
|
@@ -107,11 +107,11 @@ def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
|
|
| 107 |
print("\nAgent Mode finished.")
|
| 108 |
|
| 109 |
|
| 110 |
-
def benchmark_mode(models: list, samples: int, headless: bool):
|
| 111 |
"""Runs the benchmark on pre-collected data."""
|
| 112 |
-
print(f"Starting Benchmark Mode: models={models}, samples={samples}")
|
| 113 |
benchmark = MapGuesserBenchmark(headless=headless)
|
| 114 |
-
summary = benchmark.run_benchmark(models=models, max_samples=samples)
|
| 115 |
if summary:
|
| 116 |
print("\n--- Benchmark Complete! Summary ---")
|
| 117 |
for model, stats in summary.items():
|
|
@@ -152,6 +152,12 @@ def main():
|
|
| 152 |
choices=list(MODELS_CONFIG.keys()),
|
| 153 |
help="[Benchmark] Models to benchmark.",
|
| 154 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
args = parser.parse_args()
|
| 157 |
|
|
@@ -161,12 +167,14 @@ def main():
|
|
| 161 |
steps=args.steps,
|
| 162 |
headless=args.headless,
|
| 163 |
samples=args.samples,
|
|
|
|
| 164 |
)
|
| 165 |
elif args.mode == "benchmark":
|
| 166 |
benchmark_mode(
|
| 167 |
models=args.models or [args.model],
|
| 168 |
samples=args.samples,
|
| 169 |
headless=args.headless,
|
|
|
|
| 170 |
)
|
| 171 |
|
| 172 |
|
|
|
|
| 12 |
from config import MODELS_CONFIG, DATA_PATHS, SUCCESS_THRESHOLD_KM
|
| 13 |
|
| 14 |
|
| 15 |
+
def agent_mode(model_name: str, steps: int, headless: bool, samples: int, temperature: float = 0.0):
|
| 16 |
"""
|
| 17 |
Runs the AI Agent in a benchmark loop over multiple samples,
|
| 18 |
using multi-step exploration for each.
|
| 19 |
"""
|
| 20 |
print(
|
| 21 |
+
f"Starting Agent Mode (as a benchmark): model={model_name}, steps={steps}, samples={samples}, temperature={temperature}"
|
| 22 |
)
|
| 23 |
|
| 24 |
try:
|
|
|
|
| 44 |
all_results = []
|
| 45 |
|
| 46 |
with GeoBot(
|
| 47 |
+
model=model_class, model_name=model_instance_name, headless=headless, temperature=temperature
|
| 48 |
) as bot:
|
| 49 |
for i, sample in enumerate(test_samples):
|
| 50 |
print(
|
|
|
|
| 107 |
print("\nAgent Mode finished.")
|
| 108 |
|
| 109 |
|
| 110 |
+
def benchmark_mode(models: list, samples: int, headless: bool, temperature: float = 0.0):
|
| 111 |
"""Runs the benchmark on pre-collected data."""
|
| 112 |
+
print(f"Starting Benchmark Mode: models={models}, samples={samples}, temperature={temperature}")
|
| 113 |
benchmark = MapGuesserBenchmark(headless=headless)
|
| 114 |
+
summary = benchmark.run_benchmark(models=models, max_samples=samples, temperature=temperature)
|
| 115 |
if summary:
|
| 116 |
print("\n--- Benchmark Complete! Summary ---")
|
| 117 |
for model, stats in summary.items():
|
|
|
|
| 152 |
choices=list(MODELS_CONFIG.keys()),
|
| 153 |
help="[Benchmark] Models to benchmark.",
|
| 154 |
)
|
| 155 |
+
parser.add_argument(
|
| 156 |
+
"--temperature",
|
| 157 |
+
type=float,
|
| 158 |
+
default=0.0,
|
| 159 |
+
help="Temperature parameter for LLM sampling (0.0 = deterministic, higher = more random). Default: 0.0",
|
| 160 |
+
)
|
| 161 |
|
| 162 |
args = parser.parse_args()
|
| 163 |
|
|
|
|
| 167 |
steps=args.steps,
|
| 168 |
headless=args.headless,
|
| 169 |
samples=args.samples,
|
| 170 |
+
temperature=args.temperature,
|
| 171 |
)
|
| 172 |
elif args.mode == "benchmark":
|
| 173 |
benchmark_mode(
|
| 174 |
models=args.models or [args.model],
|
| 175 |
samples=args.samples,
|
| 176 |
headless=args.headless,
|
| 177 |
+
temperature=args.temperature,
|
| 178 |
)
|
| 179 |
|
| 180 |
|