Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add scripts for question generation, LLM-based question/issue a… #50

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions scripts/question_gen/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
#
# SPDX-License-Identifier: Apache-2.0

import json
from pathlib import Path
from typing import Dict, List, Optional

import openai

from repoqa.provider.request.openai import make_auto_request

# Delimiter written between an issue's question body and its discussion
# thread in the dumped issue text files.
_RULE = "-" * 50
ISSUE_SEP = (
    _RULE + "\nThe below is the discussion and comments on the question:\n" + _RULE
)
# Separator between generated question/answer elements in a model response.
GEN_QUESTION_ANS_SEP = "\n==========\n"


def extract_answers(jsonl_file: Path, key: str) -> Dict[str, Dict[str, str]]:
    """Index the records of a JSONL file by the given identifier field.

    Args:
        jsonl_file: Path to a JSONL file where each line is one JSON object.
        key: Field to index by; must be "issue_id" or "question_id".

    Returns:
        Mapping from each record's ``key`` value to the full record. A later
        record with a duplicate key overwrites an earlier one.
    """
    assert key in [
        "issue_id",
        "question_id",
    ], "Key must be either 'issue_id' or 'question_id'"
    answers: Dict[str, Dict[str, str]] = {}
    with open(jsonl_file, "r") as f:
        # Iterate the file object directly instead of readlines(): same
        # behavior, but avoids materializing the whole file in memory.
        for line in f:
            data = json.loads(line)
            answers[data[key]] = data
    return answers


def retrieve_code_context_files(
    dataset_path: str,
    issue_content: str,
    repo_name: str,
    relevant_file_paths: Optional[List[str]] = None,
) -> Dict[str, str]:
    """Fetch the code of files relevant to an issue from the dataset.

    Args:
        dataset_path: Path to the JSON dataset, ``{language: [repo records]}``.
        issue_content: Text of the issue; used by the LLM-based file selector
            when ``relevant_file_paths`` is not given.
        repo_name: Repository identifier to look up (``repo["repo"]``).
        relevant_file_paths: Optional explicit list of file paths. When None,
            ``get_potential_context_files`` asks an LLM to pick candidates.

    Returns:
        Mapping ``{file_path: code}`` restricted to the relevant files.

    Raises:
        ValueError: If ``repo_name`` is not present in the dataset.
    """
    with open(dataset_path, "r") as f:
        lists = json.load(f)

    # Language keys are irrelevant to the lookup, so iterate values only.
    for repos in lists.values():
        for repo in repos:
            if repo["repo"] != repo_name:
                continue
            repo_content = repo["content"]  # dict of {file_path: code}
            if relevant_file_paths is None:
                relevant_file_paths = get_potential_context_files(
                    repo_content, repo_name, issue_content
                )
            # Single comprehension covers both cases (the original duplicated
            # it in each arm of the if/else).
            return {
                file_path: repo_content[file_path]
                for file_path in relevant_file_paths
            }

    raise ValueError(f"Repository {repo_name} not found in the dataset")


def truncate_context_files_if_too_large(
    issue_or_question_id: str, code_context_dict: Dict[str, str], max_lines: int = 2000
) -> str:
    """Concatenate code files into one context string, capped at max_lines.

    Files are ordered from smallest to largest (by newline count) so that as
    many whole files as possible fit under the limit; once adding the next
    file would exceed ``max_lines``, the remaining files are dropped and a
    warning is printed.

    Note: the return annotation previously claimed ``Dict[str, str]``, but a
    single formatted string is (and always was) returned — fixed to ``str``.

    Args:
        issue_or_question_id: Identifier used only in the warning message.
        code_context_dict: Mapping ``{file_path: code}``.
        max_lines: Maximum number of newline characters allowed in the result.

    Returns:
        The formatted ``"File: <path>\\n\\n<code>"`` sections joined together.
    """
    # Sort by line count, smallest first, so truncation keeps the most files.
    code_context_dict = dict(
        sorted(code_context_dict.items(), key=lambda x: x[1].count("\n"))
    )
    code_context = "\n\n".join(
        f"File: {file_path}\n\n{code}"
        for file_path, code in code_context_dict.items()
    )
    if code_context.count("\n") > max_lines:
        # Rebuild, stopping before the file that would cross the limit.
        code_context = ""
        for idx, (file_path, code) in enumerate(code_context_dict.items()):
            if code_context.count("\n") + code.count("\n") > max_lines:
                print(
                    f"[WARNING] Code context of issue or question {issue_or_question_id} is too large, limiting to {idx} files"
                )
                break
            code_context += f"File: {file_path}\n\n{code}\n\n"
    return code_context


def get_potential_context_files(
    repo_content: Dict[str, str], repo_name: str, issue_content: str
) -> List[str]:
    """Ask GPT-4o to pick the repository files most relevant to an issue.

    Sends the issue text plus the full file listing to the model and parses
    its comma-separated reply into a list of paths. Every returned path is
    asserted to exist in ``repo_content``.
    """
    # Use OpenAI GPT-4 to decide which code context is relevant to the issue.
    client = openai.Client()
    file_list = "\n".join(repo_content)
    prompt = f"Here is a real-world GitHub issue from the repository {repo_name}:\n\n{issue_content}\n\nThe below is a list of all code files in the repository:\n\n{file_list}\n\nPlease select up to 10 code files that may be relevant to the issue above.\n\nPlease return the file paths in a list split by ', ' like 'path/to/file1.py, path/to/file2.py, path/to/file3.py'.\n\n Do not reply anything else other than the file paths."

    reply = make_auto_request(
        client, prompt, "gpt-4o", max_tokens=1000, temperature=0, n=1
    )
    chosen_paths = reply.choices[0].message.content.split(", ")
    for path in chosen_paths:
        assert (
            path in repo_content
        ), f"File path {path} is not in the repository content"
    return chosen_paths


def get_code_context_from_gen_question_jsonl(
    gen_question_jsonl_file: Path, repo_name: str, middle_func_name: str
) -> str:
    """Return the code snippet recorded for a (repo, function) pair.

    Scans the question-generation JSONL file line by line and returns the
    ``code`` field of the first record matching both the repository and the
    function name.

    Raises:
        ValueError: If no matching record exists in the file.
    """
    with open(gen_question_jsonl_file, "r") as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            is_match = (
                record["repo"] == repo_name and record["name"] == middle_func_name
            )
            if is_match:
                return record["code"]
    raise ValueError(
        f"Function {middle_func_name} not found in the generated question JSONL file"
    )
107 changes: 107 additions & 0 deletions scripts/question_gen/answer_gen_question.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
#
# SPDX-License-Identifier: Apache-2.0

import json
import os
from pathlib import Path
from typing import Optional

import openai

from repoqa.provider.request.openai import make_auto_request
from scripts.question_gen import (
GEN_QUESTION_ANS_SEP,
retrieve_code_context_files,
truncate_context_files_if_too_large,
)


def question_answer_gen(
    repo: str,
    question_content: str,
    model: str,
    code_context: Optional[str] = None,
    base_url: str = None,
    backend: str = "openai",
    max_new_tokens: int = 2048,
) -> str:
    """Ask an LLM for a brief answer to a generated repository question.

    When ``code_context`` is given it is appended to the prompt so the model
    can ground its answer. Only the "openai" backend is implemented.
    """
    if backend != "openai":
        raise NotImplementedError("Only openai is supported for now")
    client = openai.Client()

    prompt = f"Here is a question on the repository {repo}:\n\n{question_content}\n\nPlease provide a brief answer to the issue above.\n\n"
    if code_context is not None:
        prompt = f"{prompt}\n\n Here is the code context that may be relevant to this issue:\n\n{code_context}\n\n"
    completion = make_auto_request(
        client,
        prompt,
        model,
        max_tokens=max_new_tokens,
        temperature=0.2,
        n=1,
    )

    return completion.choices[0].message.content


def _parse_qa_element(element: str, mid_func_name: str):
    """Parse one '**Question_k**: ...\\n**Answer_k**: ...' response element.

    Returns ``(question_id, question, gt_answer)``. Splits on the FIRST ':'
    only, so questions and answers that themselves contain a colon are no
    longer truncated (the original used an unbounded split and kept [1]).
    """
    lines = element.split("\n")
    # E.g. **Question_1**: What is the primary purpose of the tool in this repository?\n**Answer_1**: The tool is designed as an uncompromising code formatter for Python, aiming to standardize the formatting of Python code across projects.
    q_label, question = lines[0].split(":", 1)
    question_id = q_label.strip().replace("**", "") + "#" + mid_func_name
    # NOTE(review): assumes the ground-truth answer sits on the second line
    # of the element -- confirm against the question-generation prompt.
    gt_answer = lines[1].split(":", 1)[1].strip()
    return question_id, question.strip(), gt_answer


def main(
    dataset_path: str,
    gen_question_jsonl_file: str,
    model: str = "gpt-4o",
    output_path: str = "gen_question_answer.jsonl",
    use_batch_api: bool = False,
):
    """Answer every generated question with and without code context.

    Reads question records from ``gen_question_jsonl_file``, obtains two
    model answers per question (with and without the function's code as
    context), and appends one JSON record per question to ``output_path``.

    Args:
        dataset_path: JSON dataset path (checked for the .json suffix only).
        gen_question_jsonl_file: JSONL of generated questions per function.
        model: Chat model used to produce both answers.
        output_path: JSONL file the results are appended to.
        use_batch_api: Unsupported; must be False.
    """
    assert use_batch_api == False, "Batch API is not supported yet."
    assert dataset_path.endswith(".json"), "Dataset must be a JSON file, check README"
    assert os.path.exists(
        gen_question_jsonl_file
    ), "Generated question JSONL file does not exist"

    # Append mode so interrupted runs can be resumed without losing output.
    with open(output_path, "a") as f_out:
        with open(gen_question_jsonl_file, "r") as f:
            for line in f:
                data = json.loads(line)
                repo = data["repo"]
                mid_func_name = data["name"]
                code_context = data["code"]
                for element in data["response"].split(GEN_QUESTION_ANS_SEP):
                    if element.strip() == "":
                        continue
                    question_id, question, gt_answer = _parse_qa_element(
                        element, mid_func_name
                    )

                    gen_question_answer_no_context = question_answer_gen(
                        repo, question, model, backend="openai"
                    )
                    gen_question_answer_with_context = question_answer_gen(
                        repo, question, model, code_context, backend="openai"
                    )

                    result = {
                        "repo": repo,
                        "question_id": question_id,
                        "question": question,
                        "gt_answer": gt_answer,
                        "model": model,
                        "answer_no_context": gen_question_answer_no_context,
                        "answer_with_context": gen_question_answer_with_context,
                    }
                    json.dump(result, f_out)
                    f_out.write("\n")
                    f_out.flush()  # persist each record immediately


if __name__ == "__main__":
    from fire import Fire

    Fire(main)
113 changes: 113 additions & 0 deletions scripts/question_gen/answer_gh_issue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
#
# SPDX-License-Identifier: Apache-2.0

import json
import os
from pathlib import Path
from typing import Optional

import openai

from repoqa.provider.request.openai import make_auto_request
from scripts.question_gen import (
ISSUE_SEP,
retrieve_code_context_files,
truncate_context_files_if_too_large,
)


def strip_issue_question(issue_content: str) -> str:
    """Extract the question part of a dumped issue, dropping the discussion.

    The dump format is ``<question>ISSUE_SEP<replies>``. If the first reply
    was written by the issue author (marked "(QUESTIONER) replies:"), that
    self-reply is kept as part of the question.

    Fixes over the original:
    - No longer raises IndexError when ``ISSUE_SEP`` is absent; the whole
      content is treated as the question.
    - When no later "replies: " marker exists, the entire self-reply is kept
      (the original's loop left ``idx`` at the last index and silently
      dropped the final line).
    """
    parts = issue_content.split(ISSUE_SEP)
    issue_question_content = parts[0].strip()
    if len(parts) < 2:
        # No discussion section at all.
        return issue_question_content
    issue_replies = parts[1].strip()

    reply_lines = issue_replies.split("\n")
    # 0-idx is the first reply
    if "(QUESTIONER) replies:" in reply_lines[0]:
        for idx, line in enumerate(reply_lines):
            if "replies: " in line and idx > 0:
                break
        else:
            # No further reply marker: keep the whole tail as the self-reply.
            idx = len(reply_lines)
        issue_self_reply = "\n".join(reply_lines[:idx])
        issue_question_content = f"{issue_question_content}\n\n{issue_self_reply}"

    return issue_question_content


def issue_answer_gen(
    repo: str,
    issue_content: str,
    model: str,
    code_context: Optional[str] = None,
    base_url: str = None,
    backend: str = "openai",
    max_new_tokens: int = 2048,
) -> str:
    """Ask an LLM to briefly answer a GitHub issue.

    The discussion thread is stripped from the issue first, so only the
    question (plus any questioner self-reply) reaches the model. When
    ``code_context`` is given it is appended to the prompt. Only the
    "openai" backend is implemented.
    """
    issue_question_content = strip_issue_question(issue_content)
    if backend != "openai":
        raise NotImplementedError("Only openai is supported for now")
    client = openai.Client()

    prompt = f"Here is a real-world GitHub issue from the repository {repo}:\n\n{issue_question_content}\n\nPlease provide a brief answer to the issue above.\n\n"
    if code_context is not None:
        prompt = f"{prompt}\n\n Here is the code context that may be relevant to this issue:\n\n{code_context}\n\n"
    completion = make_auto_request(
        client,
        prompt,
        model,
        max_tokens=max_new_tokens,
        temperature=0.2,
        n=1,
    )

    return completion.choices[0].message.content


def main(
    dataset_path: str,
    issue_dir: str,
    max_ctx_lines: int = 2000,
    model: str = "gpt-4o",  # we use the best gpt-4o as ground truth to filter issues
    output_path: str = "gh_issue_answer.jsonl",
    use_batch_api: bool = False,
):
    """Answer every dumped GitHub issue with and without code context.

    For each ``*.txt`` issue dump in ``issue_dir``, retrieves relevant code
    from the dataset, truncates it to ``max_ctx_lines``, obtains two model
    answers (with and without the context), and appends one JSON record per
    issue to ``output_path``.

    Args:
        dataset_path: JSON dataset containing repository contents.
        issue_dir: Directory of ``<owner>_<repo>_<number>.txt`` issue dumps.
        max_ctx_lines: Cap on the number of code-context lines per prompt.
        model: Chat model used to generate both answers.
        output_path: JSONL file the results are appended to.
        use_batch_api: Unsupported; must be False.
    """
    assert use_batch_api == False, "Batch API is not supported yet."
    assert dataset_path.endswith(".json"), "Dataset must be a JSON file, check README"
    assert os.path.exists(issue_dir), "Issue directory does not exist"

    # Append mode so interrupted runs can be resumed without losing output.
    with open(output_path, "a") as f_out:
        for issue_file in Path(issue_dir).glob("*.txt"):
            issue_content = issue_file.read_text()
            issue_file_name = issue_file.stem  # .stem already drops ".txt"
            # NOTE(review): assumes owner and repo names contain no "_";
            # file names look like "<owner>_<repo>_<number>".
            issue_repo_name = "/".join(issue_file_name.split("_")[:2])
            code_context_dict = retrieve_code_context_files(
                dataset_path, issue_content, issue_repo_name
            )
            limited_code_context = truncate_context_files_if_too_large(
                issue_file_name, code_context_dict, max_ctx_lines
            )
            issue_answer_no_context = issue_answer_gen(
                issue_repo_name, issue_content, model
            )
            issue_answer_with_context = issue_answer_gen(
                issue_repo_name,
                issue_content,
                model,
                code_context=limited_code_context,
            )

            result = {
                "repo": issue_repo_name,
                # .stem already removed ".txt"; the original's extra
                # .replace(".txt", "") was a no-op and has been dropped.
                "issue_id": issue_file_name,
                "code_context_files": list(code_context_dict.keys()),
                "answer_no_context": issue_answer_no_context,
                "answer_with_context": issue_answer_with_context,
            }
            json.dump(result, f_out)
            f_out.write("\n")
            f_out.flush()  # persist each record immediately


if __name__ == "__main__":
    from fire import Fire

    Fire(main)
20 changes: 20 additions & 0 deletions scripts/question_gen/dump_issue_json.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
#
# SPDX-License-Identifier: Apache-2.0

# Dump the JSON metadata (title, body, comments, ...) of every GitHub issue
# listed in issue_demo/issue_demo_list.txt into dumped_issue_json/.
# Requires the `gh` CLI to be installed and authenticated.

script_dir=$(dirname "${BASH_SOURCE[0]}")
issue_list_file=$(realpath "${script_dir}/issue_demo/issue_demo_list.txt")
dumped_issue_json_dir=$(realpath "${script_dir}/dumped_issue_json")
mkdir -p "${dumped_issue_json_dir}"

# Each line is an issue URL: https://github.com/<owner>/<repo>/issues/<number>
# `read -r` keeps backslashes in URLs intact.
while read -r issue_url; do
    [ -z "${issue_url}" ] && continue  # skip blank lines
    # Split the URL on '/' into its components (word splitting is intended,
    # so this expansion is deliberately unquoted).
    array=(${issue_url//\// })
    owner_name=${array[2]}
    repo_name=${array[3]}
    issue_number=${array[5]}

    # echo "Dumping issue json for ${owner_name}/${repo_name}#${issue_number}"
    issue_json_file=${dumped_issue_json_dir}/${owner_name}_${repo_name}_${issue_number}.json
    gh issue view "${issue_number}" -R "${owner_name}/${repo_name}" \
        --json title,body,createdAt,updatedAt,author,labels,comments > "${issue_json_file}"
done < "${issue_list_file}"
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"author":{"id":"MDQ6VXNlcjcxMDg4MzQ=","is_bot":false,"login":"iosifnicolae2","name":"Iosif Nicolae"},"body":"Hi.\r\nIs json type supported?\r\n\r\nThank you!","comments":[{"id":"IC_kwDODGaNZc5GCPMh","author":{"login":"Enmk"},"authorAssociation":"COLLABORATOR","body":"Hi! Not supported RN, but any PRs are very welcome!","createdAt":"2022-07-05T12:16:58Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/ClickHouse/clickhouse-cpp/issues/188#issuecomment-1174991649","viewerDidAuthor":false}],"createdAt":"2022-06-16T10:48:04Z","labels":[{"id":"MDU6TGFiZWwxNTUyMTUzNjI2","name":"question","description":"Further information is requested","color":"d876e3"}],"title":"Is json type supported?","updatedAt":"2022-10-04T13:20:09Z"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"author":{"id":"MDQ6VXNlcjI0MjY2NzQ0","is_bot":false,"login":"constructivetim","name":""},"body":"ho do i stop this appearing every time I start java?\r\n\r\n|\r\n | Password4j\r\n + \\ .: v1.7.3 :.\r\n \\\\.G_.*=.\r\n `(H'/.\\| ✅ Argon2\r\n .>' (_--. ✅ scrypt\r\n _=/d ,^\\ ✅ bcrypt\r\n ~~ \\)-'-' ✅ PBKDF2-SHA1/SHA384/SHA512/256/SHA256/SHA512/224/SHA224/SHA512\r\n / |\r\n ' '\r\n ⭐ If you enjoy Password4j, please star the project at https://github.com/Password4j/password4j\r\n 🪲 Report any issue at https://github.com/Password4j/password4j/issues\r\n\r\n\r\n","comments":[{"id":"IC_kwDODqvDdM5yGdqn","author":{"login":"firaja"},"authorAssociation":"MEMBER","body":"Hi @constructivetim you can disable the startup banner by setting `global.banner=false` in your `psw4j.properties`\r\n\r\nHere the doc for the property https://github.com/Password4j/password4j/wiki/Properties#globalbanner-boolean","createdAt":"2024-01-29T09:31:20Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/Password4j/password4j/issues/136#issuecomment-1914296999","viewerDidAuthor":false},{"id":"IC_kwDODqvDdM5yGrje","author":{"login":"constructivetim"},"authorAssociation":"NONE","body":"thanks very much","createdAt":"2024-01-29T10:03:11Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[{"content":"THUMBS_UP","users":{"totalCount":1}}],"url":"https://github.com/Password4j/password4j/issues/136#issuecomment-1914353886","viewerDidAuthor":false}],"createdAt":"2024-01-25T12:00:03Z","labels":[{"id":"MDU6TGFiZWwyNzIwMTA1OTk0","name":"priority: low","description":"","color":"0e8a16"},{"id":"MDU6TGFiZWwyNzIwMTA2OTE1","name":"status: confirmed","description":"","color":"215cea"},{"id":"MDU6TGFiZWwyNzIwMTA5Mzk5","name":"type: question","description":"Further information is requested","color":"d876e3"}],"title":"stdout polluted with friendly message","updatedAt":"2024-01-29T11:28:30Z"}
Loading