Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add scripts for question generation, LLM-based question/issue a… #50

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions scripts/question_gen/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
#
# SPDX-License-Identifier: Apache-2.0

import json
from pathlib import Path
from typing import Dict, List, Optional

import openai

from repoqa.provider.request.openai import make_auto_request

# Delimiter written between an issue's question body and its discussion
# thread in the dumped issue text files.
_RULE = "-" * 50
ISSUE_SEP = (
    _RULE + "\nThe below is the discussion and comments on the question:\n" + _RULE
)
# Separator between generated question/answer elements in a model response.
GEN_QUESTION_ANS_SEP = "\n==========\n"


def extract_answers(jsonl_file: Path, key: str) -> Dict[str, Dict[str, str]]:
    """Index the records of a JSONL file by the given identifier field.

    Args:
        jsonl_file: Path to a JSONL file where each line is one JSON object.
        key: Field to index by; must be "issue_id" or "question_id".

    Returns:
        Mapping from each record's ``key`` value to the full record. A later
        record with a duplicate key overwrites an earlier one.
    """
    assert key in [
        "issue_id",
        "question_id",
    ], "Key must be either 'issue_id' or 'question_id'"
    answers: Dict[str, Dict[str, str]] = {}
    with open(jsonl_file, "r") as f:
        # Iterate the file object directly instead of readlines(): same
        # behavior, but avoids materializing the whole file in memory.
        for line in f:
            data = json.loads(line)
            answers[data[key]] = data
    return answers


def retrieve_code_context_files(
    dataset_path: str,
    issue_content: str,
    repo_name: str,
    relevant_file_paths: Optional[List[str]] = None,
) -> Dict[str, str]:
    """Fetch the code of files relevant to an issue from the dataset.

    Args:
        dataset_path: Path to the JSON dataset, ``{language: [repo records]}``.
        issue_content: Text of the issue; used by the LLM-based file selector
            when ``relevant_file_paths`` is not given.
        repo_name: Repository identifier to look up (``repo["repo"]``).
        relevant_file_paths: Optional explicit list of file paths. When None,
            ``get_potential_context_files`` asks an LLM to pick candidates.

    Returns:
        Mapping ``{file_path: code}`` restricted to the relevant files.

    Raises:
        ValueError: If ``repo_name`` is not present in the dataset.
    """
    with open(dataset_path, "r") as f:
        lists = json.load(f)

    # Language keys are irrelevant to the lookup, so iterate values only.
    for repos in lists.values():
        for repo in repos:
            if repo["repo"] != repo_name:
                continue
            repo_content = repo["content"]  # dict of {file_path: code}
            if relevant_file_paths is None:
                relevant_file_paths = get_potential_context_files(
                    repo_content, repo_name, issue_content
                )
            # Single comprehension covers both cases (the original duplicated
            # it in each arm of the if/else).
            return {
                file_path: repo_content[file_path]
                for file_path in relevant_file_paths
            }

    raise ValueError(f"Repository {repo_name} not found in the dataset")


def truncate_context_files_if_too_large(
    issue_or_question_id: str, code_context_dict: Dict[str, str], max_lines: int = 2000
) -> str:
    """Concatenate code files into one context string, capped at max_lines.

    Files are ordered from smallest to largest (by newline count) so that as
    many whole files as possible fit under the limit; once adding the next
    file would exceed ``max_lines``, the remaining files are dropped and a
    warning is printed.

    Note: the return annotation previously claimed ``Dict[str, str]``, but a
    single formatted string is (and always was) returned — fixed to ``str``.

    Args:
        issue_or_question_id: Identifier used only in the warning message.
        code_context_dict: Mapping ``{file_path: code}``.
        max_lines: Maximum number of newline characters allowed in the result.

    Returns:
        The formatted ``"File: <path>\\n\\n<code>"`` sections joined together.
    """
    # Sort by line count, smallest first, so truncation keeps the most files.
    code_context_dict = dict(
        sorted(code_context_dict.items(), key=lambda x: x[1].count("\n"))
    )
    code_context = "\n\n".join(
        f"File: {file_path}\n\n{code}"
        for file_path, code in code_context_dict.items()
    )
    if code_context.count("\n") > max_lines:
        # Rebuild, stopping before the file that would cross the limit.
        code_context = ""
        for idx, (file_path, code) in enumerate(code_context_dict.items()):
            if code_context.count("\n") + code.count("\n") > max_lines:
                print(
                    f"[WARNING] Code context of issue or question {issue_or_question_id} is too large, limiting to {idx} files"
                )
                break
            code_context += f"File: {file_path}\n\n{code}\n\n"
    return code_context


def get_potential_context_files(
    repo_content: Dict[str, str], repo_name: str, issue_content: str
) -> List[str]:
    """Ask GPT-4o to pick the repository files most relevant to an issue.

    Sends the issue text plus the full file listing to the model and parses
    its comma-separated reply into a list of paths. Every returned path is
    asserted to exist in ``repo_content``.
    """
    # Use OpenAI GPT-4 to decide which code context is relevant to the issue.
    client = openai.Client()
    file_list = "\n".join(repo_content)
    prompt = f"Here is a real-world GitHub issue from the repository {repo_name}:\n\n{issue_content}\n\nThe below is a list of all code files in the repository:\n\n{file_list}\n\nPlease select up to 10 code files that may be relevant to the issue above.\n\nPlease return the file paths in a list split by ', ' like 'path/to/file1.py, path/to/file2.py, path/to/file3.py'.\n\n Do not reply anything else other than the file paths."

    reply = make_auto_request(
        client, prompt, "gpt-4o", max_tokens=1000, temperature=0, n=1
    )
    chosen_paths = reply.choices[0].message.content.split(", ")
    for path in chosen_paths:
        assert (
            path in repo_content
        ), f"File path {path} is not in the repository content"
    return chosen_paths


def get_code_context_from_gen_question_jsonl(
    gen_question_jsonl_file: Path, repo_name: str, middle_func_name: str
) -> str:
    """Return the code snippet recorded for a (repo, function) pair.

    Scans the question-generation JSONL file line by line and returns the
    ``code`` field of the first record matching both the repository and the
    function name.

    Raises:
        ValueError: If no matching record exists in the file.
    """
    with open(gen_question_jsonl_file, "r") as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            is_match = (
                record["repo"] == repo_name and record["name"] == middle_func_name
            )
            if is_match:
                return record["code"]
    raise ValueError(
        f"Function {middle_func_name} not found in the generated question JSONL file"
    )
107 changes: 107 additions & 0 deletions scripts/question_gen/answer_gen_question.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
#
# SPDX-License-Identifier: Apache-2.0

import json
import os
from pathlib import Path
from typing import Optional

import openai

from repoqa.provider.request.openai import make_auto_request
from scripts.question_gen import (
GEN_QUESTION_ANS_SEP,
retrieve_code_context_files,
truncate_context_files_if_too_large,
)


def question_answer_gen(
    repo: str,
    question_content: str,
    model: str,
    code_context: Optional[str] = None,
    base_url: str = None,
    backend: str = "openai",
    max_new_tokens: int = 2048,
) -> str:
    """Ask an LLM for a brief answer to a generated repository question.

    When ``code_context`` is given it is appended to the prompt so the model
    can ground its answer. Only the "openai" backend is implemented.
    """
    if backend != "openai":
        raise NotImplementedError("Only openai is supported for now")
    client = openai.Client()

    prompt = f"Here is a question on the repository {repo}:\n\n{question_content}\n\nPlease provide a brief answer to the issue above.\n\n"
    if code_context is not None:
        prompt = f"{prompt}\n\n Here is the code context that may be relevant to this issue:\n\n{code_context}\n\n"
    completion = make_auto_request(
        client,
        prompt,
        model,
        max_tokens=max_new_tokens,
        temperature=0.2,
        n=1,
    )

    return completion.choices[0].message.content


def _parse_qa_element(element: str, mid_func_name: str):
    """Parse one '**Question_k**: ...\\n**Answer_k**: ...' response element.

    Returns ``(question_id, question, gt_answer)``. Splits on the FIRST ':'
    only, so questions and answers that themselves contain a colon are no
    longer truncated (the original used an unbounded split and kept [1]).
    """
    lines = element.split("\n")
    # E.g. **Question_1**: What is the primary purpose of the tool in this repository?\n**Answer_1**: The tool is designed as an uncompromising code formatter for Python, aiming to standardize the formatting of Python code across projects.
    q_label, question = lines[0].split(":", 1)
    question_id = q_label.strip().replace("**", "") + "#" + mid_func_name
    # NOTE(review): assumes the ground-truth answer sits on the second line
    # of the element -- confirm against the question-generation prompt.
    gt_answer = lines[1].split(":", 1)[1].strip()
    return question_id, question.strip(), gt_answer


def main(
    dataset_path: str,
    gen_question_jsonl_file: str,
    model: str = "gpt-4o",
    output_path: str = "gen_question_answer.jsonl",
    use_batch_api: bool = False,
):
    """Answer every generated question with and without code context.

    Reads question records from ``gen_question_jsonl_file``, obtains two
    model answers per question (with and without the function's code as
    context), and appends one JSON record per question to ``output_path``.

    Args:
        dataset_path: JSON dataset path (checked for the .json suffix only).
        gen_question_jsonl_file: JSONL of generated questions per function.
        model: Chat model used to produce both answers.
        output_path: JSONL file the results are appended to.
        use_batch_api: Unsupported; must be False.
    """
    assert use_batch_api == False, "Batch API is not supported yet."
    assert dataset_path.endswith(".json"), "Dataset must be a JSON file, check README"
    assert os.path.exists(
        gen_question_jsonl_file
    ), "Generated question JSONL file does not exist"

    # Append mode so interrupted runs can be resumed without losing output.
    with open(output_path, "a") as f_out:
        with open(gen_question_jsonl_file, "r") as f:
            for line in f:
                data = json.loads(line)
                repo = data["repo"]
                mid_func_name = data["name"]
                code_context = data["code"]
                for element in data["response"].split(GEN_QUESTION_ANS_SEP):
                    if element.strip() == "":
                        continue
                    question_id, question, gt_answer = _parse_qa_element(
                        element, mid_func_name
                    )

                    gen_question_answer_no_context = question_answer_gen(
                        repo, question, model, backend="openai"
                    )
                    gen_question_answer_with_context = question_answer_gen(
                        repo, question, model, code_context, backend="openai"
                    )

                    result = {
                        "repo": repo,
                        "question_id": question_id,
                        "question": question,
                        "gt_answer": gt_answer,
                        "model": model,
                        "answer_no_context": gen_question_answer_no_context,
                        "answer_with_context": gen_question_answer_with_context,
                    }
                    json.dump(result, f_out)
                    f_out.write("\n")
                    f_out.flush()  # persist each record immediately


if __name__ == "__main__":
    from fire import Fire

    Fire(main)
113 changes: 113 additions & 0 deletions scripts/question_gen/answer_gh_issue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
#
# SPDX-License-Identifier: Apache-2.0

import json
import os
from pathlib import Path
from typing import Optional

import openai

from repoqa.provider.request.openai import make_auto_request
from scripts.question_gen import (
ISSUE_SEP,
retrieve_code_context_files,
truncate_context_files_if_too_large,
)


def strip_issue_question(issue_content: str) -> str:
    """Extract the question part of a dumped issue, dropping the discussion.

    The dump format is ``<question>ISSUE_SEP<replies>``. If the first reply
    was written by the issue author (marked "(QUESTIONER) replies:"), that
    self-reply is kept as part of the question.

    Fixes over the original:
    - No longer raises IndexError when ``ISSUE_SEP`` is absent; the whole
      content is treated as the question.
    - When no later "replies: " marker exists, the entire self-reply is kept
      (the original's loop left ``idx`` at the last index and silently
      dropped the final line).
    """
    parts = issue_content.split(ISSUE_SEP)
    issue_question_content = parts[0].strip()
    if len(parts) < 2:
        # No discussion section at all.
        return issue_question_content
    issue_replies = parts[1].strip()

    reply_lines = issue_replies.split("\n")
    # 0-idx is the first reply
    if "(QUESTIONER) replies:" in reply_lines[0]:
        for idx, line in enumerate(reply_lines):
            if "replies: " in line and idx > 0:
                break
        else:
            # No further reply marker: keep the whole tail as the self-reply.
            idx = len(reply_lines)
        issue_self_reply = "\n".join(reply_lines[:idx])
        issue_question_content = f"{issue_question_content}\n\n{issue_self_reply}"

    return issue_question_content


def issue_answer_gen(
    repo: str,
    issue_content: str,
    model: str,
    code_context: Optional[str] = None,
    base_url: str = None,
    backend: str = "openai",
    max_new_tokens: int = 2048,
) -> str:
    """Ask an LLM to briefly answer a GitHub issue.

    The discussion thread is stripped from the issue first, so only the
    question (plus any questioner self-reply) reaches the model. When
    ``code_context`` is given it is appended to the prompt. Only the
    "openai" backend is implemented.
    """
    issue_question_content = strip_issue_question(issue_content)
    if backend != "openai":
        raise NotImplementedError("Only openai is supported for now")
    client = openai.Client()

    prompt = f"Here is a real-world GitHub issue from the repository {repo}:\n\n{issue_question_content}\n\nPlease provide a brief answer to the issue above.\n\n"
    if code_context is not None:
        prompt = f"{prompt}\n\n Here is the code context that may be relevant to this issue:\n\n{code_context}\n\n"
    completion = make_auto_request(
        client,
        prompt,
        model,
        max_tokens=max_new_tokens,
        temperature=0.2,
        n=1,
    )

    return completion.choices[0].message.content


def main(
    dataset_path: str,
    issue_dir: str,
    max_ctx_lines: int = 2000,
    model: str = "gpt-4o",  # we use the best gpt-4o as ground truth to filter issues
    output_path: str = "gh_issue_answer.jsonl",
    use_batch_api: bool = False,
):
    """Answer every dumped GitHub issue with and without code context.

    For each ``*.txt`` issue dump in ``issue_dir``, retrieves relevant code
    from the dataset, truncates it to ``max_ctx_lines``, obtains two model
    answers (with and without the context), and appends one JSON record per
    issue to ``output_path``.

    Args:
        dataset_path: JSON dataset containing repository contents.
        issue_dir: Directory of ``<owner>_<repo>_<number>.txt`` issue dumps.
        max_ctx_lines: Cap on the number of code-context lines per prompt.
        model: Chat model used to generate both answers.
        output_path: JSONL file the results are appended to.
        use_batch_api: Unsupported; must be False.
    """
    assert use_batch_api == False, "Batch API is not supported yet."
    assert dataset_path.endswith(".json"), "Dataset must be a JSON file, check README"
    assert os.path.exists(issue_dir), "Issue directory does not exist"

    # Append mode so interrupted runs can be resumed without losing output.
    with open(output_path, "a") as f_out:
        for issue_file in Path(issue_dir).glob("*.txt"):
            issue_content = issue_file.read_text()
            issue_file_name = issue_file.stem  # .stem already drops ".txt"
            # NOTE(review): assumes owner and repo names contain no "_";
            # file names look like "<owner>_<repo>_<number>".
            issue_repo_name = "/".join(issue_file_name.split("_")[:2])
            code_context_dict = retrieve_code_context_files(
                dataset_path, issue_content, issue_repo_name
            )
            limited_code_context = truncate_context_files_if_too_large(
                issue_file_name, code_context_dict, max_ctx_lines
            )
            issue_answer_no_context = issue_answer_gen(
                issue_repo_name, issue_content, model
            )
            issue_answer_with_context = issue_answer_gen(
                issue_repo_name,
                issue_content,
                model,
                code_context=limited_code_context,
            )

            result = {
                "repo": issue_repo_name,
                # .stem already removed ".txt"; the original's extra
                # .replace(".txt", "") was a no-op and has been dropped.
                "issue_id": issue_file_name,
                "code_context_files": list(code_context_dict.keys()),
                "answer_no_context": issue_answer_no_context,
                "answer_with_context": issue_answer_with_context,
            }
            json.dump(result, f_out)
            f_out.write("\n")
            f_out.flush()  # persist each record immediately


if __name__ == "__main__":
    from fire import Fire

    Fire(main)
20 changes: 20 additions & 0 deletions scripts/question_gen/dump_issue_json.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
#
# SPDX-License-Identifier: Apache-2.0

# Dump the JSON metadata (title, body, comments, ...) of every GitHub issue
# listed in issue_demo/issue_demo_list.txt into dumped_issue_json/.
# Requires the `gh` CLI to be installed and authenticated.

script_dir=$(dirname "${BASH_SOURCE[0]}")
issue_list_file=$(realpath "${script_dir}/issue_demo/issue_demo_list.txt")
dumped_issue_json_dir=$(realpath "${script_dir}/dumped_issue_json")
mkdir -p "${dumped_issue_json_dir}"

# Each line is an issue URL: https://github.com/<owner>/<repo>/issues/<number>
# `read -r` keeps backslashes in URLs intact.
while read -r issue_url; do
    [ -z "${issue_url}" ] && continue  # skip blank lines
    # Split the URL on '/' into its components (word splitting is intended,
    # so this expansion is deliberately unquoted).
    array=(${issue_url//\// })
    owner_name=${array[2]}
    repo_name=${array[3]}
    issue_number=${array[5]}

    # echo "Dumping issue json for ${owner_name}/${repo_name}#${issue_number}"
    issue_json_file=${dumped_issue_json_dir}/${owner_name}_${repo_name}_${issue_number}.json
    gh issue view "${issue_number}" -R "${owner_name}/${repo_name}" \
        --json title,body,createdAt,updatedAt,author,labels,comments > "${issue_json_file}"
done < "${issue_list_file}"
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"author":{"id":"MDQ6VXNlcjcxMDg4MzQ=","is_bot":false,"login":"iosifnicolae2","name":"Iosif Nicolae"},"body":"Hi.\r\nIs json type supported?\r\n\r\nThank you!","comments":[{"id":"IC_kwDODGaNZc5GCPMh","author":{"login":"Enmk"},"authorAssociation":"COLLABORATOR","body":"Hi! Not supported RN, but any PRs are very welcome!","createdAt":"2022-07-05T12:16:58Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/ClickHouse/clickhouse-cpp/issues/188#issuecomment-1174991649","viewerDidAuthor":false}],"createdAt":"2022-06-16T10:48:04Z","labels":[{"id":"MDU6TGFiZWwxNTUyMTUzNjI2","name":"question","description":"Further information is requested","color":"d876e3"}],"title":"Is json type supported?","updatedAt":"2022-10-04T13:20:09Z"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"author":{"id":"MDQ6VXNlcjI0MjY2NzQ0","is_bot":false,"login":"constructivetim","name":""},"body":"ho do i stop this appearing every time I start java?\r\n\r\n|\r\n | Password4j\r\n + \\ .: v1.7.3 :.\r\n \\\\.G_.*=.\r\n `(H'/.\\| ✅ Argon2\r\n .>' (_--. ✅ scrypt\r\n _=/d ,^\\ ✅ bcrypt\r\n ~~ \\)-'-' ✅ PBKDF2-SHA1/SHA384/SHA512/256/SHA256/SHA512/224/SHA224/SHA512\r\n / |\r\n ' '\r\n ⭐ If you enjoy Password4j, please star the project at https://github.com/Password4j/password4j\r\n 🪲 Report any issue at https://github.com/Password4j/password4j/issues\r\n\r\n\r\n","comments":[{"id":"IC_kwDODqvDdM5yGdqn","author":{"login":"firaja"},"authorAssociation":"MEMBER","body":"Hi @constructivetim you can disable the startup banner by setting `global.banner=false` in your `psw4j.properties`\r\n\r\nHere the doc for the property https://github.com/Password4j/password4j/wiki/Properties#globalbanner-boolean","createdAt":"2024-01-29T09:31:20Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/Password4j/password4j/issues/136#issuecomment-1914296999","viewerDidAuthor":false},{"id":"IC_kwDODqvDdM5yGrje","author":{"login":"constructivetim"},"authorAssociation":"NONE","body":"thanks very much","createdAt":"2024-01-29T10:03:11Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[{"content":"THUMBS_UP","users":{"totalCount":1}}],"url":"https://github.com/Password4j/password4j/issues/136#issuecomment-1914353886","viewerDidAuthor":false}],"createdAt":"2024-01-25T12:00:03Z","labels":[{"id":"MDU6TGFiZWwyNzIwMTA1OTk0","name":"priority: low","description":"","color":"0e8a16"},{"id":"MDU6TGFiZWwyNzIwMTA2OTE1","name":"status: confirmed","description":"","color":"215cea"},{"id":"MDU6TGFiZWwyNzIwMTA5Mzk5","name":"type: question","description":"Further information is requested","color":"d876e3"}],"title":"stdout polluted with friendly message","updatedAt":"2024-01-29T11:28:30Z"}
Loading