From f47cff1edf4d0962c885903147ef46f63b36d5fc Mon Sep 17 00:00:00 2001 From: Shawn Date: Fri, 30 Aug 2024 06:10:52 -0700 Subject: [PATCH] Update SWE-bench instructions, lint (#20) --- programmer/evaluate.py | 4 ++-- programmer/swebench/README.md | 11 +++++++++++ programmer/swebench/evaluate.py | 24 +++--------------------- programmer/swebench/run_instance.py | 4 ++-- 4 files changed, 18 insertions(+), 25 deletions(-) diff --git a/programmer/evaluate.py b/programmer/evaluate.py index 0cb20ea..97c3e62 100644 --- a/programmer/evaluate.py +++ b/programmer/evaluate.py @@ -3,7 +3,7 @@ import weave from agent import AgentState -from config import agent +from config import agent_4o_basic # Need to serialize AgentState as json for now since we can't save weave Objects # in Dataset set. @@ -40,7 +40,7 @@ def final_answer_substr(expected_substr: str, model_output: str): @weave.op() def model_agent_bridge(state: str): - return agent.run(AgentState(**json.loads(state))).model_dump_json() + return agent_4o_basic.run(AgentState(**json.loads(state))).model_dump_json() if __name__ == "__main__": diff --git a/programmer/swebench/README.md b/programmer/swebench/README.md index 582e5a7..b24a441 100644 --- a/programmer/swebench/README.md +++ b/programmer/swebench/README.md @@ -1,5 +1,12 @@ # SWE Bench programmer evaluation +This is a custom setup to run fast SWE-bench evals on programmer. The steps are: +- serve swebench docker containers from a remote machine + - setup an x86 machine (I use a gcp e2-standard-32) + - build the swebench instance images. For SWE-bench_Verified this builds about 550 images. + - run [containerserver](../containerserver/README.md) on the machine. containerserver serves an HTTP interface into the Docker containers. +- on your local machine, run python -m programmer.swebench.run_instance or python -m programmer.swebench.evaluate + ## Build SWE-bench images First do setup (below) then run this command to build all the images. --cache_level instance tells the script not to delete the instance images, which are what we want to use with container-manager. @@ -13,6 +20,10 @@ python -m swebench.harness.run_evaluation \ --cache_level instance ``` +## Run containerserver + +See [containerserver](../containerserver/README.md) for setup and running containerserver. + ## remote machine setup instructions on gcp VM ubuntu 20.04 diff --git a/programmer/swebench/evaluate.py b/programmer/swebench/evaluate.py index 38a5725..84bceee 100644 --- a/programmer/swebench/evaluate.py +++ b/programmer/swebench/evaluate.py @@ -7,14 +7,7 @@ from .swebench_model import SWEBenchProgrammerModel from .score import score_swebench from ..agent import Agent -from ..config import SYSTEM_MESSAGE -from ..tools import ( - list_files, - run_command, - view_image, - read_lines_from_file, - replace_lines_in_file, -) +from ..config import agent_4o_basic def load_raw_dataset(name: str, split: str): @@ -62,22 +55,11 @@ def main(): # ds = load_weave_dataset("SWE-bench_Verified", "test", instance_ids=instance_ids) ds = load_weave_dataset("SWE-bench_Verified", "test", limit=50, shuffle_seed=42) eval = weave.Evaluation( - name="SWE-bench_Verified", dataset=ds, scorers=[score_swebench], trials=5 + name="SWE-bench_Verified", dataset=ds, scorers=[score_swebench], trials=1 ) model = SWEBenchProgrammerModel( - agent=Agent( - model_name="gpt-4o-2024-08-06", - temperature=0.7, - system_message=SYSTEM_MESSAGE, - tools=[ - list_files, - run_command, - view_image, - read_lines_from_file, - replace_lines_in_file, - ], - ), + agent=agent_4o_basic, max_runtime_seconds=180, ) res = asyncio.run(eval.evaluate(model)) diff --git a/programmer/swebench/run_instance.py b/programmer/swebench/run_instance.py index f8846d5..f8ed0b8 100644 --- a/programmer/swebench/run_instance.py +++ b/programmer/swebench/run_instance.py @@ -11,7 +11,7 @@ from ..swebench.swebench_model import SWEBenchProgrammerModel from ..swebench.score import score_swebench -from ..config import agent_replace +from ..config import agent_4o_basic def main(): @@ -42,7 +42,7 @@ def main(): print("SOLUTION\n", instance["patch"]) print() - model = SWEBenchProgrammerModel(agent=agent_replace) + model = SWEBenchProgrammerModel(agent=agent_4o_basic) model_output = model.predict(instance) score = score_swebench(instance, model_output["answer"]) print("SCORE\n", score)