feat: Add SWE-bench benchmarking integration (#415) #670

Open · wants to merge 1 commit into base: main
43 changes: 10 additions & 33 deletions requirements.txt
@@ -1,33 +1,10 @@
-flask
-flask-cors
-toml
-urllib3
-requests
-colorama
-fastlogging
-Jinja2
-mistletoe
-markdownify
-pdfminer.six
-playwright
-pytest-playwright
-tiktoken
-ollama
-openai
-anthropic
-google-generativeai
-sqlmodel
-keybert
-GitPython
-netlify-py
-Markdown
-xhtml2pdf
-mistralai
-Flask-SocketIO
-eventlet
-groq
-duckduckgo-search
-orjson
-gevent
-gevent-websocket
-curl_cffi
+# Core dependencies
+datasets>=2.0.0
+docker>=6.0.0
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+pytest-cov>=4.1.0
+
+# SWE-bench dependencies
+swebench>=0.1.0
+huggingface-hub>=0.19.0
18 changes: 18 additions & 0 deletions src/benchmark/swebench/__init__.py
@@ -0,0 +1,18 @@
"""
SWE-bench integration module for Devika.

This module provides integration with the SWE-bench benchmark for evaluating
code generation capabilities on real-world GitHub issues.
"""

from .swebench import SWEBenchRunner
from .dataset import SWEBenchDataset
from .evaluator import SWEBenchEvaluator
from .reporter import SWEBenchReporter

__all__ = [
'SWEBenchRunner',
'SWEBenchDataset',
'SWEBenchEvaluator',
'SWEBenchReporter',
]
38 changes: 38 additions & 0 deletions src/benchmark/swebench/dataset.py
@@ -0,0 +1,38 @@
"""SWE-bench dataset loading and management."""

from typing import Dict, List, Optional
from datasets import load_dataset

class SWEBenchDataset:
"""Handler for SWE-bench dataset operations."""

def __init__(self, dataset_name: str = "princeton-nlp/SWE-bench"):
"""Initialize dataset handler.

Args:
dataset_name: HuggingFace dataset name
"""
self.dataset_name = dataset_name
self.dataset = None

def load_instances(self, instance_ids: Optional[List[str]] = None) -> List[Dict]:
"""Load benchmark instances.

Args:
instance_ids: Optional list of specific instances to load

Returns:
List of benchmark instances
"""
if self.dataset is None:
self.dataset = load_dataset(self.dataset_name, split='test')

if instance_ids:
instances = [
inst for inst in self.dataset
if inst['instance_id'] in instance_ids
]
else:
instances = list(self.dataset)

return instances
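
A minimal usage sketch for the dataset handler above. The import path assumes the repository root is on sys.path (as with the rest of the src package), and the instance ID is only an example:

from src.benchmark.swebench.dataset import SWEBenchDataset

# Load the full SWE-bench test split (downloaded from HuggingFace on first use)
dataset = SWEBenchDataset()
all_instances = dataset.load_instances()

# Or filter to specific instances; the ID below is illustrative
subset = dataset.load_instances(instance_ids=["astropy__astropy-12907"])
print(f"{len(subset)} of {len(all_instances)} instances selected")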
139 changes: 139 additions & 0 deletions src/benchmark/swebench/evaluator.py
@@ -0,0 +1,139 @@
"""Docker-based evaluation harness for SWE-bench."""

import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Dict, List, Optional

logger = logging.getLogger(__name__)

class SWEBenchEvaluator:
"""Evaluator for running SWE-bench in Docker containers."""

def __init__(self, max_workers: int = 4, working_dir: Optional[Path] = None):
"""Initialize evaluator.

Args:
max_workers: Number of parallel workers
working_dir: Working directory for evaluation files
"""
self.max_workers = max_workers
self.working_dir = working_dir or Path(tempfile.mkdtemp(prefix='swebench_'))
self.working_dir.mkdir(parents=True, exist_ok=True)

def evaluate_instances(
self,
instances: List[Dict],
run_id: Optional[str] = None
) -> Dict:
"""Evaluate benchmark instances.

Args:
instances: List of benchmark instances to evaluate
run_id: Optional identifier for this evaluation run

Returns:
Dictionary containing evaluation results
"""
results = {}
run_dir = self.working_dir / (run_id or 'default')
run_dir.mkdir(parents=True, exist_ok=True)

# Save predictions for batch evaluation
predictions_dir = run_dir / 'predictions'
predictions_dir.mkdir(parents=True, exist_ok=True)

for instance in instances:
try:
# Save instance prediction
instance_dir = predictions_dir / instance['instance_id']
instance_dir.mkdir(parents=True, exist_ok=True)
with open(instance_dir / 'prediction.json', 'w') as f:
json.dump(instance, f, indent=2)
except Exception as e:
logger.error(f"Error preparing {instance['instance_id']}: {e}")
results[instance['instance_id']] = {
'status': 'error',
'error': f"Failed to prepare instance: {str(e)}"
}

# Run batch evaluation using SWE-bench harness
try:
result = self._run_docker_evaluation(predictions_dir, run_id)
results.update(self._parse_evaluation_results(result))
except Exception as e:
logger.error(f"Docker evaluation failed: {e}")
for instance in instances:
if instance['instance_id'] not in results:
results[instance['instance_id']] = {
'status': 'error',
'error': f"Docker evaluation failed: {str(e)}"
}

return results

    def _run_docker_evaluation(self, predictions_dir: Path, run_id: Optional[str]) -> str:
"""Run Docker-based evaluation using SWE-bench harness.

Args:
predictions_dir: Directory containing instance predictions
run_id: Identifier for this evaluation run

Returns:
Raw evaluation output
"""
cmd = [
'python', '-m', 'swebench.harness.run_evaluation',
'--predictions_path', str(predictions_dir),
'--max_workers', str(self.max_workers),
'--run_id', run_id or 'default'
]

try:
result = subprocess.run(
cmd,
capture_output=True,
text=True,
check=True
)
return result.stdout
except subprocess.CalledProcessError as e:
logger.error(f"Docker evaluation command failed: {e.output}")
raise RuntimeError(f"Docker evaluation failed: {str(e)}")

def _parse_evaluation_results(self, output: str) -> Dict:
"""Parse evaluation output to extract metrics.

Args:
output: Raw evaluation output string

Returns:
Dictionary containing parsed metrics per instance
"""
results = {}
try:
# Extract results from evaluation output
# Format: instance_id: {metrics}
for line in output.splitlines():
if ':' in line:
instance_id, metrics_str = line.split(':', 1)
instance_id = instance_id.strip()
try:
metrics = json.loads(metrics_str.strip())
results[instance_id] = {
'status': 'success',
'metrics': metrics
}
except json.JSONDecodeError:
results[instance_id] = {
'status': 'error',
'error': f"Failed to parse metrics: {metrics_str}"
}
except Exception as e:
logger.error(f"Failed to parse evaluation results: {e}")
raise RuntimeError(f"Failed to parse evaluation results: {str(e)}")

return results
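
For reference, a hedged sketch of the line format that _parse_evaluation_results expects. The instance IDs and metric keys below are illustrative; the actual swebench harness output may be shaped differently:

from src.benchmark.swebench.evaluator import SWEBenchEvaluator

# Illustrative output only; not guaranteed to match the real harness format
sample_output = (
    'astropy__astropy-12907: {"resolved": true, "tests_passed": 12}\n'
    'django__django-11099: not-valid-json'
)

evaluator = SWEBenchEvaluator(max_workers=1)
parsed = evaluator._parse_evaluation_results(sample_output)
# parsed["astropy__astropy-12907"] -> {"status": "success", "metrics": {...}}
# parsed["django__django-11099"]   -> {"status": "error", "error": "Failed to parse metrics: ..."}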
53 changes: 53 additions & 0 deletions src/benchmark/swebench/reporter.py
@@ -0,0 +1,53 @@
"""Results reporting for SWE-bench benchmark."""

import json
from pathlib import Path
from typing import Dict

class SWEBenchReporter:
"""Reporter for SWE-bench benchmark results."""

def generate_report(self, results: Dict) -> Dict:
"""Generate benchmark report.

Args:
results: Dictionary containing benchmark results

Returns:
Dictionary containing formatted report
"""
report = {
'summary': self._generate_summary(results),
'details': results
}
return report

def save_report(self, report: Dict, output_file: Path):
"""Save benchmark report to file.

Args:
report: Dictionary containing benchmark report
output_file: Path to save report
"""
with open(output_file, 'w') as f:
json.dump(report, f, indent=2)

def _generate_summary(self, results: Dict) -> Dict:
"""Generate summary statistics from results.

Args:
results: Dictionary containing benchmark results

Returns:
Dictionary containing summary statistics
"""
total = len(results)
successful = sum(1 for r in results.values() if r.get('status') == 'success')
failed = sum(1 for r in results.values() if r.get('status') == 'error')

return {
'total_instances': total,
'successful': successful,
'failed': failed,
'success_rate': successful / total if total > 0 else 0
}
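
A short sketch of the reporter in isolation. The results dict below mirrors the per-instance shape produced by the evaluator; the keys and values are made up for illustration:

from pathlib import Path
from src.benchmark.swebench.reporter import SWEBenchReporter

results = {
    "instance_a": {"status": "success", "metrics": {"resolved": True}},
    "instance_b": {"status": "error", "error": "Docker evaluation failed"},
}

reporter = SWEBenchReporter()
report = reporter.generate_report(results)
# report["summary"] -> {"total_instances": 2, "successful": 1, "failed": 1, "success_rate": 0.5}
reporter.save_report(report, Path("swebench_report.json"))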
62 changes: 62 additions & 0 deletions src/benchmark/swebench/swebench.py
@@ -0,0 +1,62 @@
"""Main SWE-bench runner implementation."""

import logging
from pathlib import Path
from typing import Dict, List, Optional

from .dataset import SWEBenchDataset
from .evaluator import SWEBenchEvaluator
from .reporter import SWEBenchReporter

logger = logging.getLogger(__name__)

class SWEBenchRunner:
"""Main class for running SWE-bench benchmarks."""

def __init__(
self,
dataset_name: str = "princeton-nlp/SWE-bench",
max_workers: int = 4,
working_dir: Optional[Path] = None
):
"""Initialize SWE-bench runner.

Args:
dataset_name: HuggingFace dataset name
max_workers: Number of parallel workers for evaluation
working_dir: Working directory for benchmark files
"""
self.dataset = SWEBenchDataset(dataset_name)
self.evaluator = SWEBenchEvaluator(max_workers=max_workers)
self.reporter = SWEBenchReporter()
self.working_dir = working_dir or Path.cwd() / "swebench_results"
self.working_dir.mkdir(parents=True, exist_ok=True)

def run_benchmark(
self,
instance_ids: Optional[List[str]] = None,
run_id: Optional[str] = None
) -> Dict:
"""Run benchmark evaluation.

Args:
instance_ids: Optional list of specific instances to evaluate
run_id: Optional identifier for this benchmark run

Returns:
Dictionary containing benchmark results
"""
logger.info("Loading benchmark dataset...")
instances = self.dataset.load_instances(instance_ids)

logger.info("Running evaluations...")
results = self.evaluator.evaluate_instances(instances, run_id)

logger.info("Generating report...")
report = self.reporter.generate_report(results)

# Save results
results_file = self.working_dir / f"results_{run_id or 'default'}.json"
self.reporter.save_report(report, results_file)

return report
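
End to end, a hedged usage sketch of the runner. It assumes Docker and the swebench package are installed, that the src package is importable, and the instance ID is only an example:

import logging
from src.benchmark.swebench.swebench import SWEBenchRunner

logging.basicConfig(level=logging.INFO)

runner = SWEBenchRunner(max_workers=2)
report = runner.run_benchmark(
    instance_ids=["astropy__astropy-12907"],  # example ID; omit to evaluate the full test split
    run_id="smoke_test",
)
print(report["summary"])
# The full report is also written to ./swebench_results/results_smoke_test.json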
1 change: 1 addition & 0 deletions tests/benchmark/__init__.py
@@ -0,0 +1 @@
"""Benchmark test package."""
28 changes: 28 additions & 0 deletions tests/benchmark/conftest.py
@@ -0,0 +1,28 @@
"""Pytest configuration for benchmark tests."""

import pytest
from pathlib import Path

@pytest.fixture
def sample_instance():
"""Sample benchmark instance for testing."""
return {
'instance_id': 'test_instance',
'repo': 'test/repo',
'issue': 'Sample issue description',
'patch': 'Sample patch content'
}

@pytest.fixture
def sample_results():
"""Sample benchmark results for testing."""
return {
'test_instance_1': {
'status': 'success',
'metrics': {'accuracy': 0.95}
},
'test_instance_2': {
'status': 'error',
'error': 'Test error message'
}
}
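
A minimal example of a test that would consume these fixtures (hypothetical file, e.g. tests/benchmark/test_reporter.py; not part of this diff):

"""Example test exercising the reporter with the shared fixtures."""

from src.benchmark.swebench.reporter import SWEBenchReporter

def test_generate_report_summary(sample_results):
    """The summary should count one success and one error from the fixture."""
    reporter = SWEBenchReporter()
    report = reporter.generate_report(sample_results)

    assert report["summary"]["total_instances"] == 2
    assert report["summary"]["successful"] == 1
    assert report["summary"]["failed"] == 1
    assert report["details"] == sample_results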