Skip to content

Commit

Permalink
Update SWE-bench instructions, lint (#20)
Browse files Browse the repository at this point in the history
  • Loading branch information
shawnlewis authored Aug 30, 2024
1 parent 88467a2 commit f47cff1
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 25 deletions.
4 changes: 2 additions & 2 deletions programmer/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import weave

from agent import AgentState
from config import agent
from config import agent_4o_basic

# Need to serialize AgentState as json for now since we can't save weave Objects
# in Dataset set.
Expand Down Expand Up @@ -40,7 +40,7 @@ def final_answer_substr(expected_substr: str, model_output: str):

@weave.op()
def model_agent_bridge(state: str):
return agent.run(AgentState(**json.loads(state))).model_dump_json()
return agent_4o_basic.run(AgentState(**json.loads(state))).model_dump_json()


if __name__ == "__main__":
Expand Down
11 changes: 11 additions & 0 deletions programmer/swebench/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# SWE Bench programmer evaluation

This is a custom setup to run fast SWE-bench evals on programmer. The steps are:
- serve swebench docker containers from a remote machine
- set up an x86 machine (I use a GCP e2-standard-32)
- build the swebench instance images. For SWE-bench_Verified this builds about 550 images.
- run [containerserver](../containerserver/README.md) on the machine. containerserver serves an HTTP interface into the Docker containers.
- on your local machine, run `python -m programmer.swebench.run_instance` or `python -m programmer.swebench.evaluate`

## Build SWE-bench images

First do the setup (below), then run this command to build all the images. `--cache_level instance` tells the script not to delete the instance images, which are what we want to use with container-manager.
Expand All @@ -13,6 +20,10 @@ python -m swebench.harness.run_evaluation \
--cache_level instance
```

## Run containerserver

See [containerserver](../containerserver/README.md) for setup and running containerserver.


## Remote machine setup instructions on a GCP VM (Ubuntu 20.04)

Expand Down
24 changes: 3 additions & 21 deletions programmer/swebench/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,7 @@
from .swebench_model import SWEBenchProgrammerModel
from .score import score_swebench
from ..agent import Agent
from ..config import SYSTEM_MESSAGE
from ..tools import (
list_files,
run_command,
view_image,
read_lines_from_file,
replace_lines_in_file,
)
from ..config import agent_4o_basic


def load_raw_dataset(name: str, split: str):
Expand Down Expand Up @@ -62,22 +55,11 @@ def main():
# ds = load_weave_dataset("SWE-bench_Verified", "test", instance_ids=instance_ids)
ds = load_weave_dataset("SWE-bench_Verified", "test", limit=50, shuffle_seed=42)
eval = weave.Evaluation(
name="SWE-bench_Verified", dataset=ds, scorers=[score_swebench], trials=5
name="SWE-bench_Verified", dataset=ds, scorers=[score_swebench], trials=1
)

model = SWEBenchProgrammerModel(
agent=Agent(
model_name="gpt-4o-2024-08-06",
temperature=0.7,
system_message=SYSTEM_MESSAGE,
tools=[
list_files,
run_command,
view_image,
read_lines_from_file,
replace_lines_in_file,
],
),
agent=agent_4o_basic,
max_runtime_seconds=180,
)
res = asyncio.run(eval.evaluate(model))
Expand Down
4 changes: 2 additions & 2 deletions programmer/swebench/run_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from ..swebench.swebench_model import SWEBenchProgrammerModel
from ..swebench.score import score_swebench
from ..config import agent_replace
from ..config import agent_4o_basic


def main():
Expand Down Expand Up @@ -42,7 +42,7 @@ def main():
print("SOLUTION\n", instance["patch"])
print()

model = SWEBenchProgrammerModel(agent=agent_replace)
model = SWEBenchProgrammerModel(agent=agent_4o_basic)
model_output = model.predict(instance)
score = score_swebench(instance, model_output["answer"])
print("SCORE\n", score)
Expand Down

0 comments on commit f47cff1

Please sign in to comment.