From f5091bef14cdb3f2e77b7ed8327c004fc821f172 Mon Sep 17 00:00:00 2001 From: Jun Yang Date: Sun, 8 Sep 2024 21:24:42 -0500 Subject: [PATCH] feat: add scripts for question generation, LLM-based question/issue answer, (ablation of context) --- scripts/question_gen/__init__.py | 121 +++++++++++ scripts/question_gen/answer_gen_question.py | 107 +++++++++ scripts/question_gen/answer_gh_issue.py | 113 ++++++++++ scripts/question_gen/dump_issue_json.sh | 20 ++ .../ClickHouse_clickhouse-cpp_188.json | 1 + .../Password4j_password4j_136.json | 1 + .../drogonframework_drogon_560.json | 1 + .../drogonframework_drogon_695.json | 1 + .../mrdoob_three.js_22160.json | 1 + .../dumped_issue_json/nsqio_nsq_1309.json | 1 + .../dumped_issue_json/oatpp_oatpp_374.json | 1 + .../polybar_polybar_2241.json | 1 + .../xenova_transformers.js_337.json | 1 + .../xenova_transformers.js_421.json | 1 + .../ClickHouse_clickhouse-cpp_188.txt | 9 + .../Password4j_password4j_136.txt | 26 +++ .../drogonframework_drogon_560.txt | 76 +++++++ .../drogonframework_drogon_695.txt | 12 + .../mrdoob_three.js_22160.txt | 16 ++ .../few_shot_examples/nsqio_nsq_1309.txt | 6 + .../few_shot_examples/oatpp_oatpp_374.txt | 11 + .../polybar_polybar_2241.txt | 14 ++ .../xenova_transformers.js_337.txt | 70 ++++++ .../xenova_transformers.js_421.txt | 75 +++++++ scripts/question_gen/gen.py | 205 ++++++++++++++++++ .../issue_demo/issue_demo_list.txt | 10 + .../ClickHouse_clickhouse-cpp_188.txt | 9 + .../Password4j_password4j_136.txt | 26 +++ .../drogonframework_drogon_560.txt | 76 +++++++ .../drogonframework_drogon_695.txt | 12 + .../issue_dialogue/mrdoob_three.js_22160.txt | 16 ++ .../issue_dialogue/nsqio_nsq_1309.txt | 6 + .../issue_dialogue/oatpp_oatpp_374.txt | 11 + .../issue_dialogue/polybar_polybar_2241.txt | 14 ++ .../xenova_transformers.js_337.txt | 70 ++++++ .../xenova_transformers.js_421.txt | 75 +++++++ scripts/question_gen/llm_judge_answer.py | 168 ++++++++++++++ scripts/question_gen/make_readable_issue.py | 47 ++++ 38 files changed, 1431 insertions(+) create mode 100644 scripts/question_gen/__init__.py create mode 100644 scripts/question_gen/answer_gen_question.py create mode 100644 scripts/question_gen/answer_gh_issue.py create mode 100755 scripts/question_gen/dump_issue_json.sh create mode 100644 scripts/question_gen/dumped_issue_json/ClickHouse_clickhouse-cpp_188.json create mode 100644 scripts/question_gen/dumped_issue_json/Password4j_password4j_136.json create mode 100644 scripts/question_gen/dumped_issue_json/drogonframework_drogon_560.json create mode 100644 scripts/question_gen/dumped_issue_json/drogonframework_drogon_695.json create mode 100644 scripts/question_gen/dumped_issue_json/mrdoob_three.js_22160.json create mode 100644 scripts/question_gen/dumped_issue_json/nsqio_nsq_1309.json create mode 100644 scripts/question_gen/dumped_issue_json/oatpp_oatpp_374.json create mode 100644 scripts/question_gen/dumped_issue_json/polybar_polybar_2241.json create mode 100644 scripts/question_gen/dumped_issue_json/xenova_transformers.js_337.json create mode 100644 scripts/question_gen/dumped_issue_json/xenova_transformers.js_421.json create mode 100644 scripts/question_gen/few_shot_examples/ClickHouse_clickhouse-cpp_188.txt create mode 100644 scripts/question_gen/few_shot_examples/Password4j_password4j_136.txt create mode 100644 scripts/question_gen/few_shot_examples/drogonframework_drogon_560.txt create mode 100644 scripts/question_gen/few_shot_examples/drogonframework_drogon_695.txt create mode 100644 
scripts/question_gen/few_shot_examples/mrdoob_three.js_22160.txt
 create mode 100644 scripts/question_gen/few_shot_examples/nsqio_nsq_1309.txt
 create mode 100644 scripts/question_gen/few_shot_examples/oatpp_oatpp_374.txt
 create mode 100644 scripts/question_gen/few_shot_examples/polybar_polybar_2241.txt
 create mode 100644 scripts/question_gen/few_shot_examples/xenova_transformers.js_337.txt
 create mode 100644 scripts/question_gen/few_shot_examples/xenova_transformers.js_421.txt
 create mode 100644 scripts/question_gen/gen.py
 create mode 100644 scripts/question_gen/issue_demo/issue_demo_list.txt
 create mode 100644 scripts/question_gen/issue_dialogue/ClickHouse_clickhouse-cpp_188.txt
 create mode 100644 scripts/question_gen/issue_dialogue/Password4j_password4j_136.txt
 create mode 100644 scripts/question_gen/issue_dialogue/drogonframework_drogon_560.txt
 create mode 100644 scripts/question_gen/issue_dialogue/drogonframework_drogon_695.txt
 create mode 100644 scripts/question_gen/issue_dialogue/mrdoob_three.js_22160.txt
 create mode 100644 scripts/question_gen/issue_dialogue/nsqio_nsq_1309.txt
 create mode 100644 scripts/question_gen/issue_dialogue/oatpp_oatpp_374.txt
 create mode 100644 scripts/question_gen/issue_dialogue/polybar_polybar_2241.txt
 create mode 100644 scripts/question_gen/issue_dialogue/xenova_transformers.js_337.txt
 create mode 100644 scripts/question_gen/issue_dialogue/xenova_transformers.js_421.txt
 create mode 100644 scripts/question_gen/llm_judge_answer.py
 create mode 100644 scripts/question_gen/make_readable_issue.py

diff --git a/scripts/question_gen/__init__.py b/scripts/question_gen/__init__.py
new file mode 100644
index 0000000..925deda
--- /dev/null
+++ b/scripts/question_gen/__init__.py
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import openai
+
+from repoqa.provider.request.openai import make_auto_request
+
+ISSUE_SEP = (
+    "-" * 50
+    + "\nThe below is the discussion and comments on the question:"
+    + "\n"
+    + "-" * 50
+)
+GEN_QUESTION_ANS_SEP = "\n==========\n"
+
+
+def extract_answers(jsonl_file: Path, key: str) -> Dict[str, Dict[str, str]]:
+    assert key in [
+        "issue_id",
+        "question_id",
+    ], "Key must be either 'issue_id' or 'question_id'"
+    answers = {}
+    with open(jsonl_file, "r") as f:
+        for line in f.readlines():
+            data = json.loads(line)
+            answers[data[key]] = data
+    return answers
+
+
+def retrieve_code_context_files(
+    dataset_path: str,
+    issue_content: str,
+    repo_name: str,
+    relevant_file_paths: Optional[List[str]] = None,
+) -> Dict[str, str]:
+    with open(dataset_path, "r") as f:
+        lists = json.load(f)
+
+    for lang, repos in lists.items():
+        for repo in repos:
+            if repo["repo"] == repo_name:
+                repo_content = repo["content"]  # dict of {file_path: code}
+                if relevant_file_paths is not None:
+                    repo_content_relevant = {
+                        file_path: repo_content[file_path]
+                        for file_path in relevant_file_paths
+                    }
+                else:
+                    relevant_file_paths = get_potential_context_files(
+                        repo_content, repo_name, issue_content
+                    )
+                    repo_content_relevant = {
+                        file_path: repo_content[file_path]
+                        for file_path in relevant_file_paths
+                    }
+                return repo_content_relevant
+
+    raise ValueError(f"Repository {repo_name} not found in the dataset")
+
+
+def truncate_context_files_if_too_large(
+    issue_or_question_id: str, code_context_dict: Dict[str, str], max_lines: int = 2000
+) -> str:
+    # sort the code context by lines of code from smallest to largest
+    code_context_dict = dict(
+        sorted(code_context_dict.items(), key=lambda x: x[1].count("\n"))
+    )
+    code_context = "\n\n".join(
+        [
+            f"File: {file_path}\n\n{code}"
+            for file_path, code in code_context_dict.items()
+        ]
+    )
+    if code_context.count("\n") > max_lines:
+        # keep only as many whole files as fit within the first max_lines lines
+        code_context = ""
+        for idx, (file_path, code) in enumerate(code_context_dict.items()):
+            if code_context.count("\n") + code.count("\n") > max_lines:
+                print(
+                    f"[WARNING] Code context of issue or question {issue_or_question_id} is too large, limiting to {idx} files"
+                )
+                break
+            code_context += f"File: {file_path}\n\n{code}\n\n"
+    return code_context
+
+
+def get_potential_context_files(
+    repo_content: Dict[str, str], repo_name: str, issue_content: str
+) -> List[str]:
+    # use OpenAI GPT-4 to decide which code context is relevant to the issue
+    client = openai.Client()
+    file_list = "\n".join([f"{file_path}" for file_path in repo_content.keys()])
+    prompt = f"Here is a real-world GitHub issue from the repository {repo_name}:\n\n{issue_content}\n\nThe below is a list of all code files in the repository:\n\n{file_list}\n\nPlease select up to 10 code files that may be relevant to the issue above.\n\nPlease return the file paths in a list split by ', ' like 'path/to/file1.py, path/to/file2.py, path/to/file3.py'.\n\n Do not reply anything else other than the file paths."
+
+    output = make_auto_request(
+        client, prompt, "gpt-4o", max_tokens=1000, temperature=0, n=1
+    )
+    relevant_file_paths = output.choices[0].message.content.split(", ")
+    for path in relevant_file_paths:
+        assert (
+            path in repo_content.keys()
+        ), f"File path {path} is not in the repository content"
+    return relevant_file_paths
+
+
+def get_code_context_from_gen_question_jsonl(
+    gen_question_jsonl_file: Path, repo_name: str, middle_func_name: str
+) -> str:
+    with open(gen_question_jsonl_file, "r") as f:
+        for line in f:
+            data = json.loads(line)
+            if data["repo"] == repo_name and data["name"] == middle_func_name:
+                return data["code"]
+    raise ValueError(
+        f"Function {middle_func_name} not found in the generated question JSONL file"
+    )
diff --git a/scripts/question_gen/answer_gen_question.py b/scripts/question_gen/answer_gen_question.py
new file mode 100644
index 0000000..8f75a02
--- /dev/null
+++ b/scripts/question_gen/answer_gen_question.py
@@ -0,0 +1,107 @@
+# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+from pathlib import Path
+from typing import Optional
+
+import openai
+
+from repoqa.provider.request.openai import make_auto_request
+from scripts.question_gen import (
+    GEN_QUESTION_ANS_SEP,
+    retrieve_code_context_files,
+    truncate_context_files_if_too_large,
+)
+
+
+def question_answer_gen(
+    repo: str,
+    question_content: str,
+    model: str,
+    code_context: Optional[str] = None,
+    base_url: str = None,
+    backend: str = "openai",
+    max_new_tokens: int = 2048,
+) -> str:
+    if backend == "openai":
+        client = openai.Client()
+    else:
+        raise NotImplementedError("Only openai is supported for now")
+
+    prompt = f"Here is a question on the repository {repo}:\n\n{question_content}\n\nPlease provide a brief answer to the issue above.\n\n"
+    if code_context is not None:
+        prompt = f"{prompt}\n\n Here is the code context that may be relevant to this issue:\n\n{code_context}\n\n"
+    output = make_auto_request(
+        client,
+        prompt,
+        model,
+        max_tokens=max_new_tokens,
+        temperature=0.2,
+        n=1,
+    )
+
+    return output.choices[0].message.content
+
+
+def main(
+    dataset_path: str,
+    gen_question_jsonl_file: str,
+    model: str = "gpt-4o",
+    output_path: str = "gen_question_answer.jsonl",
+    use_batch_api: bool = False,
+):
+    assert use_batch_api == False, "Batch API is not supported yet."
+    assert dataset_path.endswith(".json"), "Dataset must be a JSON file, check README"
+    assert os.path.exists(
+        gen_question_jsonl_file
+    ), "Generated question JSONL file does not exist"
+
+    with open(output_path, "+a") as f_out:
+        with open(gen_question_jsonl_file, "r") as f:
+            for line in f:
+                data = json.loads(line)
+                repo = data["repo"]
+                mid_func_name = data["name"]
+                code_context = data["code"]
+                response = data["response"]
+                elements = response.split(GEN_QUESTION_ANS_SEP)
+                for element in elements:
+                    if element.strip() == "":
+                        continue
+                    # E.g. **Question_1**: What is the primary purpose of the tool in this repository?\n**Answer_1**: The tool is designed as an uncompromising code formatter for Python, aiming to standardize the formatting of Python code across projects.
+                    question_id = (
+                        element.split("\n")[0].split(":")[0].strip().replace("**", "")
+                        + "#"
+                        + mid_func_name
+                    )
+                    # split on the first colon only so questions/answers that contain ':' are kept intact
+                    question = element.split("\n")[0].split(":", 1)[1].strip()
+                    gt_answer = element.split("\n")[1].split(":", 1)[1].strip()
+
+                    gen_question_answer_no_context = question_answer_gen(
+                        repo, question, model, backend="openai"
+                    )
+                    gen_question_answer_with_context = question_answer_gen(
+                        repo, question, model, code_context, backend="openai"
+                    )
+
+                    result = {
+                        "repo": repo,
+                        "question_id": question_id,
+                        "question": question,
+                        "gt_answer": gt_answer,
+                        "model": model,
+                        "answer_no_context": gen_question_answer_no_context,
+                        "answer_with_context": gen_question_answer_with_context,
+                    }
+                    json.dump(result, f_out)
+                    f_out.write("\n")
+                    f_out.flush()
+
+
+if __name__ == "__main__":
+    from fire import Fire
+
+    Fire(main)
diff --git a/scripts/question_gen/answer_gh_issue.py b/scripts/question_gen/answer_gh_issue.py
new file mode 100644
index 0000000..bacb29a
--- /dev/null
+++ b/scripts/question_gen/answer_gh_issue.py
@@ -0,0 +1,113 @@
+# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+from pathlib import Path
+from typing import Optional
+
+import openai
+
+from repoqa.provider.request.openai import make_auto_request
+from scripts.question_gen import (
+    ISSUE_SEP,
+    retrieve_code_context_files,
+    truncate_context_files_if_too_large,
+)
+
+
+def strip_issue_question(issue_content: str) -> str:
+    issue_question_content = issue_content.split(ISSUE_SEP)[0].strip()
+    issue_replies = issue_content.split(ISSUE_SEP)[1].strip()
+
+    # 0-idx is the first reply
+    if "(QUESTIONER) replies:" in issue_replies.split("\n")[0]:
+        for idx, line in enumerate(issue_replies.split("\n")):
+            if "replies: " in line and idx > 0:
+                break
+        issue_self_reply = "\n".join(issue_replies.split("\n")[:idx])
+        issue_question_content = f"{issue_question_content}\n\n{issue_self_reply}"
+
+    return issue_question_content
+
+
+def issue_answer_gen(
+    repo: str,
+    issue_content: str,
+    model: str,
+    code_context: Optional[str] = None,
+    base_url: str = None,
+    backend: str = "openai",
+    max_new_tokens: int = 2048,
+) -> str:
+    issue_question_content = strip_issue_question(issue_content)
+    if backend == "openai":
+        client = openai.Client()
+    else:
+        raise NotImplementedError("Only openai is supported for now")
+
+    prompt = f"Here is a real-world GitHub issue from the repository {repo}:\n\n{issue_question_content}\n\nPlease provide a brief answer to the issue above.\n\n"
+    if code_context is not None:
+        prompt = f"{prompt}\n\n Here is the code context that may be relevant to this issue:\n\n{code_context}\n\n"
+    output = make_auto_request(
+        client,
+        prompt,
+        model,
+        max_tokens=max_new_tokens,
+        temperature=0.2,
+        n=1,
+    )
+
+    return output.choices[0].message.content
+
+
+def main(
+    dataset_path: str,
+    issue_dir: str,
+    max_ctx_lines: int = 2000,
+    model: str = "gpt-4o",  # we use the best gpt-4o as ground truth to filter issues
+    output_path: str = "gh_issue_answer.jsonl",
+    use_batch_api: bool = False,
+):
+    assert use_batch_api == False, "Batch API is not supported yet."
+    assert dataset_path.endswith(".json"), "Dataset must be a JSON file, check README"
+    assert os.path.exists(issue_dir), "Issue directory does not exist"
+
+    with open(output_path, "+a") as f_out:
+        for issue_file in Path(issue_dir).glob("*.txt"):
+            issue_content = issue_file.read_text()
+            issue_file_name = issue_file.stem
+            issue_repo_name = "/".join(issue_file_name.split("_")[:2])
+            code_context_dict = retrieve_code_context_files(
+                dataset_path, issue_content, issue_repo_name
+            )
+            limited_code_context = truncate_context_files_if_too_large(
+                issue_file_name, code_context_dict, max_ctx_lines
+            )
+            issue_answer_no_context = issue_answer_gen(
+                issue_repo_name, issue_content, model
+            )
+            issue_answer_with_context = issue_answer_gen(
+                issue_repo_name,
+                issue_content,
+                model,
+                code_context=limited_code_context,
+            )
+
+            result = {
+                "repo": issue_repo_name,
+                "issue_id": issue_file_name.replace(".txt", ""),
+                "code_context_files": list(code_context_dict.keys()),
+                "answer_no_context": issue_answer_no_context,
+                "answer_with_context": issue_answer_with_context,
+            }
+            json.dump(result, f_out)
+            f_out.write("\n")
+            f_out.flush()
+
+
+if __name__ == "__main__":
+    from fire import Fire
+
+    Fire(main)
diff --git a/scripts/question_gen/dump_issue_json.sh b/scripts/question_gen/dump_issue_json.sh
new file mode 100755
index 0000000..105e402
--- /dev/null
+++ b/scripts/question_gen/dump_issue_json.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
+#
+# SPDX-License-Identifier: Apache-2.0
+
+issue_list_file=$(realpath $(dirname "${BASH_SOURCE[0]}")/issue_demo/issue_demo_list.txt)
+dumped_issue_json_dir=$(realpath $(dirname "${BASH_SOURCE[0]}")/dumped_issue_json)
+mkdir -p ${dumped_issue_json_dir}
+
+while read issue_url; do
+    array=(${issue_url//\// })
+    owner_name=${array[2]}
+    repo_name=${array[3]}
+    issue_number=${array[5]}
+
+    # echo "Dumping issue json for ${owner_name}/${repo_name}#${issue_number}"
+    issue_json_file=${dumped_issue_json_dir}/${owner_name}_${repo_name}_${issue_number}.json
+    gh issue view ${issue_number} -R ${owner_name}/${repo_name} --json title,body,createdAt,updatedAt,author,labels,comments > ${issue_json_file}
+done < $issue_list_file
diff --git a/scripts/question_gen/dumped_issue_json/ClickHouse_clickhouse-cpp_188.json b/scripts/question_gen/dumped_issue_json/ClickHouse_clickhouse-cpp_188.json
new file mode 100644
index 0000000..5384f24
--- /dev/null
+++ b/scripts/question_gen/dumped_issue_json/ClickHouse_clickhouse-cpp_188.json
@@ -0,0 +1 @@
+{"author":{"id":"MDQ6VXNlcjcxMDg4MzQ=","is_bot":false,"login":"iosifnicolae2","name":"Iosif Nicolae"},"body":"Hi.\r\nIs json type supported?\r\n\r\nThank you!","comments":[{"id":"IC_kwDODGaNZc5GCPMh","author":{"login":"Enmk"},"authorAssociation":"COLLABORATOR","body":"Hi!
Not supported RN, but any PRs are very welcome!","createdAt":"2022-07-05T12:16:58Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/ClickHouse/clickhouse-cpp/issues/188#issuecomment-1174991649","viewerDidAuthor":false}],"createdAt":"2022-06-16T10:48:04Z","labels":[{"id":"MDU6TGFiZWwxNTUyMTUzNjI2","name":"question","description":"Further information is requested","color":"d876e3"}],"title":"Is json type supported?","updatedAt":"2022-10-04T13:20:09Z"} diff --git a/scripts/question_gen/dumped_issue_json/Password4j_password4j_136.json b/scripts/question_gen/dumped_issue_json/Password4j_password4j_136.json new file mode 100644 index 0000000..fbe931b --- /dev/null +++ b/scripts/question_gen/dumped_issue_json/Password4j_password4j_136.json @@ -0,0 +1 @@ +{"author":{"id":"MDQ6VXNlcjI0MjY2NzQ0","is_bot":false,"login":"constructivetim","name":""},"body":"ho do i stop this appearing every time I start java?\r\n\r\n|\r\n | Password4j\r\n + \\ .: v1.7.3 :.\r\n \\\\.G_.*=.\r\n `(H'/.\\| ✅ Argon2\r\n .>' (_--. ✅ scrypt\r\n _=/d ,^\\ ✅ bcrypt\r\n ~~ \\)-'-' ✅ PBKDF2-SHA1/SHA384/SHA512/256/SHA256/SHA512/224/SHA224/SHA512\r\n / |\r\n ' '\r\n ⭐ If you enjoy Password4j, please star the project at https://github.com/Password4j/password4j\r\n 🪲 Report any issue at https://github.com/Password4j/password4j/issues\r\n\r\n\r\n","comments":[{"id":"IC_kwDODqvDdM5yGdqn","author":{"login":"firaja"},"authorAssociation":"MEMBER","body":"Hi @constructivetim you can disable the startup banner by setting `global.banner=false` in your `psw4j.properties`\r\n\r\nHere the doc for the property https://github.com/Password4j/password4j/wiki/Properties#globalbanner-boolean","createdAt":"2024-01-29T09:31:20Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/Password4j/password4j/issues/136#issuecomment-1914296999","viewerDidAuthor":false},{"id":"IC_kwDODqvDdM5yGrje","author":{"login":"constructivetim"},"authorAssociation":"NONE","body":"thanks very much","createdAt":"2024-01-29T10:03:11Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[{"content":"THUMBS_UP","users":{"totalCount":1}}],"url":"https://github.com/Password4j/password4j/issues/136#issuecomment-1914353886","viewerDidAuthor":false}],"createdAt":"2024-01-25T12:00:03Z","labels":[{"id":"MDU6TGFiZWwyNzIwMTA1OTk0","name":"priority: low","description":"","color":"0e8a16"},{"id":"MDU6TGFiZWwyNzIwMTA2OTE1","name":"status: confirmed","description":"","color":"215cea"},{"id":"MDU6TGFiZWwyNzIwMTA5Mzk5","name":"type: question","description":"Further information is requested","color":"d876e3"}],"title":"stdout polluted with friendly message","updatedAt":"2024-01-29T11:28:30Z"} diff --git a/scripts/question_gen/dumped_issue_json/drogonframework_drogon_560.json b/scripts/question_gen/dumped_issue_json/drogonframework_drogon_560.json new file mode 100644 index 0000000..5a8d16a --- /dev/null +++ b/scripts/question_gen/dumped_issue_json/drogonframework_drogon_560.json @@ -0,0 +1 @@ +{"author":{"id":"MDQ6VXNlcjcwMTAyMzc5","is_bot":false,"login":"gtorrico","name":""},"body":"Thank you for creating such an performant yet easy to use framework.\r\n\r\nSome users might like to use stricter warnings or treat warnings as errors in their code. 
Exporting the includes as system libraries [should allow](https://foonathan.net/2018/10/cmake-warnings/#preventing-warnings-in-header-files) the compiler to ignore warnings generated when including `drogon` header files but I still got warnings after changing `drogon/CMakeLists.txt` in my submodule copy of `drogon` like so:\r\n```diff\r\ndiff --git a/CMakeLists.txt b/CMakeLists.txt\r\nindex 0df11f1..dcb5c1a 100755\r\n--- a/CMakeLists.txt\r\n+++ b/CMakeLists.txt\r\n@@ -52,7 +52,12 @@ endif(HAS_ANY AND HAS_STRING_VIEW)\r\n\r\n target_include_directories(\r\n ${PROJECT_NAME}\r\n- PUBLIC $\r\n+ PRIVATE $\r\n+ $\r\n+ $\r\n+ $\r\n+ $\r\n+ SYSTEM INTERFACE $\r\n $\r\n $\r\n $\r\n```\r\n\r\nAs a workaround I did this in my `CMakeLists.txt`:\r\n```CMake\r\nadd_subdirectory(drogon)\r\n\r\nget_target_property(DROGON_INTERFACE_INCLUDES drogon INTERFACE_INCLUDE_DIRECTORIES)\r\nadd_library(drogon_incs INTERFACE)\r\ntarget_include_directories(drogon_incs SYSTEM INTERFACE ${DROGON_INTERFACE_INCLUDES})\r\n\r\nadd_executable(${exe_target} ${exe_sources}) \r\ntarget_link_libraries(${exe_target} PRIVATE drogon_incs)\r\ntarget_link_libraries(${exe_target} PRIVATE drogon)\r\ntarget_compile_features(${exe_target} PRIVATE cxx_std_20) \r\ntarget_compile_options(${exe_target} PRIVATE ${extra_warning_flags}) \r\n```\r\n\r\nIdeally this workaround would not be needed, but I don't know enough about `CMake` to know why my local changes to `drogon/CMakeLists.txt` failed to work.","comments":[{"id":"MDEyOklzc3VlQ29tbWVudDY4ODE1MzEyMw==","author":{"login":"an-tao"},"authorAssociation":"MEMBER","body":"please change drogon/CMakeLists.txt as follows:\r\n\r\n```diff\r\ndiff --git a/CMakeLists.txt b/CMakeLists.txt\r\nindex 0df11f1..c14aff7 100755\r\n--- a/CMakeLists.txt\r\n+++ b/CMakeLists.txt\r\n@@ -52,7 +52,7 @@ endif(HAS_ANY AND HAS_STRING_VIEW)\r\n \r\n target_include_directories(\r\n ${PROJECT_NAME}\r\n- PUBLIC $\r\n+SYSTEM PUBLIC $\r\n $\r\n $\r\n $\r\n```\r\n\r\nAnd then add drogon as a subdirectory\r\n\r\n```cmake\r\n add_subdirectory(drogon)\r\n target_link_libraries(${PROJECT_NAME} PRIVATE drogon)\r\n```\r\n\r\nthen you could set the compiler flag of your application like:\r\n\r\n```\r\ntarget_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wextra -pedantic -Werror)\r\n```","createdAt":"2020-09-07T08:18:27Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/drogonframework/drogon/issues/560#issuecomment-688153123","viewerDidAuthor":false},{"id":"MDEyOklzc3VlQ29tbWVudDY4ODQyODk5Mw==","author":{"login":"gtorrico"},"authorAssociation":"NONE","body":"Thank you that worked!","createdAt":"2020-09-07T16:39:55Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/drogonframework/drogon/issues/560#issuecomment-688428993","viewerDidAuthor":false}],"createdAt":"2020-09-07T03:10:07Z","labels":[{"id":"MDU6TGFiZWw5MTU0Nzc5NDE=","name":"question","description":"Further information is requested","color":"d876e3"}],"title":"how can drogon users have stricter warnings in their code?","updatedAt":"2020-09-07T16:39:55Z"} diff --git a/scripts/question_gen/dumped_issue_json/drogonframework_drogon_695.json b/scripts/question_gen/dumped_issue_json/drogonframework_drogon_695.json new file mode 100644 index 0000000..cfe98b3 --- /dev/null +++ b/scripts/question_gen/dumped_issue_json/drogonframework_drogon_695.json @@ -0,0 +1 @@ 
+{"author":{"id":"MDQ6VXNlcjI4NDI4NDQ=","is_bot":false,"login":"gyb997","name":"guoyibin"},"body":"Ok, I can parse the post request by set the \r\nContent-Type: application/x-www-form-urlencoded\r\n\r\nbut how can I do it with multipart/form-data","comments":[{"id":"MDEyOklzc3VlQ29tbWVudDc2NzQ4NjA2Nw==","author":{"login":"gyb997"},"authorAssociation":"NONE","body":"sorry ,I got it ~\r\njust use the MultiPartParser class","createdAt":"2021-01-26T11:36:55Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/drogonframework/drogon/issues/695#issuecomment-767486067","viewerDidAuthor":false},{"id":"MDEyOklzc3VlQ29tbWVudDc2NzQ4OTE3Nw==","author":{"login":"an-tao"},"authorAssociation":"MEMBER","body":"Yes, you are right.","createdAt":"2021-01-26T11:43:53Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/drogonframework/drogon/issues/695#issuecomment-767489177","viewerDidAuthor":false}],"createdAt":"2021-01-26T11:35:03Z","labels":[{"id":"MDU6TGFiZWw5MTU0Nzc5NDE=","name":"question","description":"Further information is requested","color":"d876e3"}],"title":"How can I parse the \"form-data\" post request?","updatedAt":"2021-01-31T06:11:58Z"} diff --git a/scripts/question_gen/dumped_issue_json/mrdoob_three.js_22160.json b/scripts/question_gen/dumped_issue_json/mrdoob_three.js_22160.json new file mode 100644 index 0000000..db4ef00 --- /dev/null +++ b/scripts/question_gen/dumped_issue_json/mrdoob_three.js_22160.json @@ -0,0 +1 @@ +{"author":{"id":"MDQ6VXNlcjQyMjc1MTI=","is_bot":false,"login":"h4k1m0u","name":"Hakim Benoudjit"},"body":"`ShapeUtils.area()` is basically returning a negative value in certain cases (I'm not sure what's the meaning of a negative area in geometry).\r\nI think [this algorithm](https://www.mathopenref.com/coordpolygonarea.html) was used but an absolute value was forgotten.\r\n\r\n**Platform:**\r\n - Device: Desktop\r\n - OS: Windows\r\n - Browser: Firefox\r\n - Three.js version: r124 (function didn't seem to have changed in dev)\r\n","comments":[{"id":"IC_kwDOAAjKyc40qIOq","author":{"login":"Mugen87"},"authorAssociation":"COLLABORATOR","body":"The result of `ShapeUtils.area()` is singed in order to determine the winding order of polygons.\r\n\r\nhttps://github.com/mrdoob/three.js/blob/35bdc42a8115c7404997b9ef9b9e7fdb832a5099/src/extras/ShapeUtils.js#L22-L26","createdAt":"2021-07-20T14:54:12Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[{"content":"THUMBS_UP","users":{"totalCount":1}}],"url":"https://github.com/mrdoob/three.js/issues/22160#issuecomment-883458986","viewerDidAuthor":false}],"createdAt":"2021-07-20T14:41:45Z","labels":[{"id":"MDU6TGFiZWw1ODY3Mw==","name":"Question","description":"","color":"dad9ff"}],"title":"ShapeUtils.area() returns negative area","updatedAt":"2021-07-20T14:54:18Z"} diff --git a/scripts/question_gen/dumped_issue_json/nsqio_nsq_1309.json b/scripts/question_gen/dumped_issue_json/nsqio_nsq_1309.json new file mode 100644 index 0000000..d540ebe --- /dev/null +++ b/scripts/question_gen/dumped_issue_json/nsqio_nsq_1309.json @@ -0,0 +1 @@ +{"author":{"id":"MDQ6VXNlcjE0ODE4ODc4","is_bot":false,"login":"kevinsir","name":""},"body":"How can I set the max times when the message retry? I mean when set the max times=3 nsq will only delivery this message 3 times . When it is all error. 
it will be discarded","comments":[{"id":"MDEyOklzc3VlQ29tbWVudDc0MTEzNDEzMw==","author":{"login":"ploxiln"},"authorAssociation":"MEMBER","body":"This is configured in the nsq consumer library, for example in the go-nsq `Config.MaxAttempts`: https://godoc.org/github.com/nsqio/go-nsq#Config","createdAt":"2020-12-08T22:24:59Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/nsqio/nsq/issues/1309#issuecomment-741134133","viewerDidAuthor":false}],"createdAt":"2020-12-08T13:44:05Z","labels":[{"id":"MDU6TGFiZWw0ODkwNTI2","name":"question","description":"","color":"f7c6c7"}],"title":"How can I set the max times when the message retry","updatedAt":"2020-12-26T02:13:26Z"} diff --git a/scripts/question_gen/dumped_issue_json/oatpp_oatpp_374.json b/scripts/question_gen/dumped_issue_json/oatpp_oatpp_374.json new file mode 100644 index 0000000..fd6ff0d --- /dev/null +++ b/scripts/question_gen/dumped_issue_json/oatpp_oatpp_374.json @@ -0,0 +1 @@ +{"author":{"id":"MDQ6VXNlcjE5ODcxMDkw","is_bot":false,"login":"suvidh","name":""},"body":"I basically do not want my website to have \"not secure\" tag..","comments":[{"id":"MDEyOklzc3VlQ29tbWVudDc1NzM2OTAzOA==","author":{"login":"lganzzzo"},"authorAssociation":"MEMBER","body":"Hey @suvidh ,\r\n\r\nSure, just use one of the provided SSL adaptors:\r\n- [oatpp-openssl](https://github.com/oatpp/oatpp-openssl)\r\n- [oatpp-libressl](https://github.com/oatpp/oatpp-libressl)\r\n- [oatpp-mbedtls](https://github.com/oatpp/oatpp-mbedtls)","createdAt":"2021-01-09T21:20:59Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/oatpp/oatpp/issues/374#issuecomment-757369038","viewerDidAuthor":false}],"createdAt":"2021-01-09T20:16:09Z","labels":[{"id":"MDU6TGFiZWw4NjUwOTg2MDc=","name":"Question","description":"Further information is requested","color":"d876e3"}],"title":"Does it support Https?","updatedAt":"2021-01-10T01:06:37Z"} diff --git a/scripts/question_gen/dumped_issue_json/polybar_polybar_2241.json b/scripts/question_gen/dumped_issue_json/polybar_polybar_2241.json new file mode 100644 index 0000000..06368ec --- /dev/null +++ b/scripts/question_gen/dumped_issue_json/polybar_polybar_2241.json @@ -0,0 +1 @@ +{"author":{"id":"MDQ6VXNlcjU3NTU5NzEx","is_bot":false,"login":"kelvin-hey","name":"Kelvin Hey"},"body":"I want to remove the system tray from the polybar, to display only the bar itself.\r\n\r\nIs that possible?\r\n\r\nIf yes, how to make.","comments":[{"id":"MDEyOklzc3VlQ29tbWVudDcyNTY5NzAxMQ==","author":{"login":"patrick96"},"authorAssociation":"MEMBER","body":"`tray-position = none`","createdAt":"2020-11-11T22:29:28Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[{"content":"THUMBS_UP","users":{"totalCount":1}}],"url":"https://github.com/polybar/polybar/issues/2241#issuecomment-725697011","viewerDidAuthor":false},{"id":"MDEyOklzc3VlQ29tbWVudDcyNTcxMzg5OA==","author":{"login":"kelvin-hey"},"authorAssociation":"NONE","body":"Many thanks.\r\n\r\nI used tray-position = undefined and work too.","createdAt":"2020-11-11T23:14:42Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[{"content":"LAUGH","users":{"totalCount":1}}],"url":"https://github.com/polybar/polybar/issues/2241#issuecomment-725713898","viewerDidAuthor":false}],"createdAt":"2020-11-11T21:28:16Z","labels":[{"id":"MDU6TGFiZWwzNzkxMTc2OTM=","name":"question","description":"","color":"cc317c"}],"title":"is 
possible to remove system tray?","updatedAt":"2020-11-11T23:14:42Z"} diff --git a/scripts/question_gen/dumped_issue_json/xenova_transformers.js_337.json b/scripts/question_gen/dumped_issue_json/xenova_transformers.js_337.json new file mode 100644 index 0000000..364598a --- /dev/null +++ b/scripts/question_gen/dumped_issue_json/xenova_transformers.js_337.json @@ -0,0 +1 @@ +{"author":{"id":"MDQ6VXNlcjExNjc1NzU=","is_bot":false,"login":"josephrocca","name":""},"body":"My tokenizer files are hosted within this folder:\r\n```\r\nhttps://example.com/public/models/TheBloke/Llama-2-13B-GPTQ/\r\n```\r\nFirst I load the lib:\r\n```js\r\nlet { AutoTokenizer } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.6.1');\r\n```\r\nThen I tried what I thought would be the most obvious/intuitive API:\r\n```js\r\nawait AutoTokenizer.from_pretrained(\"/public/models/TheBloke/Llama-2-13B-GPTQ\")\r\n// requests: https://example.com/models/public/models/TheBloke/Llama-2-13B-GPTQ/tokenizer.json\r\n```\r\nThis is strongly counter-intuitive to me. If I add a `/` at the start of the URL, it shouldn't add anything before that. A path that starts with `/` on the web always means \"append this to the origin\".\r\n\r\nSo I read the docs, and it seems to suggest that you need to put at `.` on the end:\r\n```js\r\nawait AutoTokenizer.from_pretrained(\"/public/models/TheBloke/Llama-2-13B-GPTQ/.\")\r\n// requests: https://example.com/models/public/models/TheBloke/Llama-2-13B-GPTQ/tokenizer.json \r\n```\r\nNope. So the next obvious step was to just give it an absolute URL and be done with it:\r\n```js\r\nawait AutoTokenizer.from_pretrained(\"https://example.com/public/models/TheBloke/Llama-2-13B-GPTQ\")\r\n// requests: 'https://huggingface.co/https://example.com/public/models/TheBloke/Llama-2-13B-GPTQ/resolve/main/tokenizer_config.json\r\n```\r\nOof.\r\n\r\nSo I'm a bit confused here 😵‍💫\r\n\r\nGoing to keep trying, but I've spent 20 minutes on this so far, so posting here so you can improve the DX around this, even if I do manage to solve it myself soon.","comments":[{"id":"IC_kwDOI9T9VM5ntnIg","author":{"login":"xenova"},"authorAssociation":"OWNER","body":"If your app is hosted on a different site as the models, the terminology I use is `remoteModels`:\r\n\r\nYou can try set `env.remoteHost` and/or `env.remotePathTemplate` (see [docs](https://huggingface.co/docs/transformers.js/api/env#:~:text=Hugging%20Face%20Hub.-,remoteHost,-string))\r\n\r\n![image](https://github.com/xenova/transformers.js/assets/26504141/ce72a93e-8471-4d5d-b3e4-f38e455543bd)\r\n\r\n\r\nThese values default to:\r\n```\r\nremoteHost: 'https://huggingface.co/',\r\nremotePathTemplate: '{model}/resolve/{revision}/',\r\n```\r\n(see [code](https://github.com/xenova/transformers.js/blob/5b31129218e2f6ea001f8477a094f4f3f15a2502/src/env.js#L101-L102)).\r\n\r\nOpen to suggestions on how to improve DX :) ","createdAt":"2023-09-28T21:10:11Z","includesCreatedEdit":true,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/xenova/transformers.js/issues/337#issuecomment-1740010016","viewerDidAuthor":false},{"id":"IC_kwDOI9T9VM5ntnWd","author":{"login":"xenova"},"authorAssociation":"OWNER","body":"If your app is hosted on the same site as the models (terminology: `localModels`), then you can modify `env.localModelPath`. 
It defaults to `/models/` as you see above.\r\n```js\r\nimport { env } from '@xenova/transformers';\r\nenv.localModelPath = '/path/to/local/models/';\r\n// subsequent requests will go to https://example.com/path/to/local/models/\r\n```","createdAt":"2023-09-28T21:11:04Z","includesCreatedEdit":true,"isMinimized":false,"minimizedReason":"","reactionGroups":[{"content":"THUMBS_UP","users":{"totalCount":1}}],"url":"https://github.com/xenova/transformers.js/issues/337#issuecomment-1740010909","viewerDidAuthor":false},{"id":"IC_kwDOI9T9VM5ntzpk","author":{"login":"josephrocca"},"authorAssociation":"CONTRIBUTOR","body":"Ah, thanks! :pray: I think the main DX improvement here would just be to mention that here where it says:\r\n\r\n> A path to a *directory* containing tokenizer files, e.g., `./my_model_directory/.`:\r\n\r\n![image](https://github.com/xenova/transformers.js/assets/1167575/4f0f42ac-f2de-46dd-a632-061edb4f3d0b)\r\n\r\nHere's the line: https://github.com/xenova/transformers.js/blob/main/src/tokenizers.js#L3812C11-L3812C11\r\n\r\nI was going to make a quick pull request just now using the Github web editor but wasn't really sure on the wording here. Maybe something like \"`env.localModelPath` will be prepended to the path\" where `env.localModelPath` is also a link to https://huggingface.co/docs/transformers.js/custom_usage ?\r\n\r\nBut I'd also vote for URLs starting with `/` and `https://` to be treated differently. Same with `blob:` and `data:` too I guess, since I did consider embedding the tokenizer files in the page for quicker load. Also it definitely seems like the default `localModelPath` path should just be empty rather than \"models\". That would have been another route to reducing confusion/friction for me here.\r\n\r\nAll of that said, I haven't thought about this long so don't weight these thoughts too much!","createdAt":"2023-09-28T22:03:04Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[{"content":"THUMBS_UP","users":{"totalCount":1}},{"content":"HEART","users":{"totalCount":1}}],"url":"https://github.com/xenova/transformers.js/issues/337#issuecomment-1740061284","viewerDidAuthor":false}],"createdAt":"2023-09-28T21:00:41Z","labels":[{"id":"LA_kwDOI9T9VM8AAAABMvEY-w","name":"question","description":"Further information is requested","color":"d876e3"}],"title":"[Question] How do I specify a non-huggingface URL (that doesn't start with `/models/`) in `AutoTokenizer.from_pretrained`?","updatedAt":"2023-09-28T22:03:05Z"} diff --git a/scripts/question_gen/dumped_issue_json/xenova_transformers.js_421.json b/scripts/question_gen/dumped_issue_json/xenova_transformers.js_421.json new file mode 100644 index 0000000..a3492aa --- /dev/null +++ b/scripts/question_gen/dumped_issue_json/xenova_transformers.js_421.json @@ -0,0 +1 @@ +{"author":{"id":"MDQ6VXNlcjk2NTI5NQ==","is_bot":false,"login":"devfacet","name":"Fatih Cetinkaya"},"body":"@xenova : First of all thank you so much for your amazing work with this open source library. It opens up many possibilities.\r\n\r\nOne thing that caught my attention which is [FeatureExtractionPipeline](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline) can accept any amount of input regardless of the models' [sequence lengths](https://huggingface.co/spaces/mteb/leaderboard). Does it truncate or tokenize the data internally before applying it to the model? 
Is there documentation or an explanation about the implementation details?","comments":[{"id":"IC_kwDOI9T9VM5tGV2d","author":{"login":"xenova"},"authorAssociation":"OWNER","body":"Hi there 👋 Thanks so much for your kind words! 🤗 \r\n\r\nYes, it does perform truncation of the (tokenized) input prior to model execution (see the parent class' `_call` function):\r\nhttps://github.com/xenova/transformers.js/blob/768a2e26d7f34746caa2b102f55dbd270c5d6f36/src/pipelines.js#L126-L130\r\n\r\nThe amount it truncates is determined by the tokenizer's `max_model_length` (which can be found in the tokenizer_config.json, e.g., [here](https://huggingface.co/Xenova/bert-base-cased/blob/main/tokenizer_config.json#L6))","createdAt":"2023-11-28T17:47:11Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/xenova/transformers.js/issues/421#issuecomment-1830378909","viewerDidAuthor":false},{"id":"IC_kwDOI9T9VM5tG5AP","author":{"login":"devfacet"},"authorAssociation":"NONE","body":"Is there a way to set `truncation` to `false` and return error if the given text is larger than `model_max_length`? Or do I need to implement that logic for myself before passing text to the `extractor` function?","createdAt":"2023-11-28T19:18:51Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/xenova/transformers.js/issues/421#issuecomment-1830522895","viewerDidAuthor":false},{"id":"IC_kwDOI9T9VM5tHQOp","author":{"login":"xenova"},"authorAssociation":"OWNER","body":"Sure! You can decompose the pipeline into it's separate parts: (1) Tokenization, followed by (2) Inference. Here's some example code:\r\n\r\n[Test in jsFiddle](https://jsfiddle.net/bksa5pgr/)\r\n\r\n```js\r\nimport { env, AutoModel, AutoTokenizer } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.9.0';\r\nenv.allowLocalModels=false; // disable local model check\r\n\r\nconst model_id = 'Xenova/all-MiniLM-L6-v2';\r\n\r\nconst tokenizer = await AutoTokenizer.from_pretrained(model_id);\r\nconst model = await AutoModel.from_pretrained(model_id);\r\n\r\n// Example function that generates token embeddings for text,\r\n// but throws an error if the text is too long\r\nasync function generateTokenEmbeddings(text){\r\n // (1) Tokenization\r\n const model_inputs = tokenizer(text);\r\n const numberOfTokens = model_inputs.input_ids.dims.at(-1);\r\n\r\n // Check that inputs are valid\r\n if(numberOfTokens > tokenizer.model_max_length){\r\n throw new Error(`Input is larger than max model length (${numberOfTokens} > ${tokenizer.model_max_length})`);\r\n }\r\n\r\n // Input is valid\r\n console.log(`'${text.slice(0,20)}...' 
is valid (${numberOfTokens} tokens).`)\r\n\t\r\n // (2) Run model\r\n const output = await model(model_inputs);\r\n console.log(output);\r\n}\r\n\r\nconst text = \"Hello world!\"\r\nawait generateTokenEmbeddings(text) // Works\r\n// {\r\n// last_hidden_state: {\r\n// data: [object Float32Array],\r\n// dims: [1, 5, 384],\r\n// size: 1920,\r\n// type: \"float32\"\r\n// }\r\n//}\r\n\r\nconst text2 = \"This won't work \".repeat(200)\r\nawait generateTokenEmbeddings(text2) // this throws an error\r\n// Error: \"Input is larger than max model length (1002 > 512)\"\r\n\r\n```\r\n\r\n\r\n\r\n(see [tokenizers](https://huggingface.co/docs/transformers.js/api/tokenizers) and [models](https://huggingface.co/docs/transformers.js/api/models) docs for more information).","createdAt":"2023-11-28T19:50:48Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[{"content":"THUMBS_UP","users":{"totalCount":1}}],"url":"https://github.com/xenova/transformers.js/issues/421#issuecomment-1830618025","viewerDidAuthor":false},{"id":"IC_kwDOI9T9VM5tHSRz","author":{"login":"devfacet"},"authorAssociation":"NONE","body":"Great! Thank you 🙏 ","createdAt":"2023-11-28T19:56:55Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/xenova/transformers.js/issues/421#issuecomment-1830626419","viewerDidAuthor":false},{"id":"IC_kwDOI9T9VM5tflCj","author":{"login":"devfacet"},"authorAssociation":"NONE","body":"@xenova : Quick question: Some models have a typo (e.g., https://huggingface.co/Xenova/e5-small-v2/blob/a59d88d9e737bbaf6becc14ed014a9a7c82067e4/tokenizer_config.json#L7) which results in invalid `model_max_length` values. Should I track them myself, or is there another way to calculate `model_max_length` values?","createdAt":"2023-12-02T02:09:19Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[],"url":"https://github.com/xenova/transformers.js/issues/421#issuecomment-1836994723","viewerDidAuthor":false},{"id":"IC_kwDOI9T9VM5tgEqx","author":{"login":"xenova"},"authorAssociation":"OWNER","body":"Oh yes good point. In that case, you can use `model.config.max_position_embeddings` (in fact, this is probably the better option). To be safe, you can take the minimum of the two and use that instead.","createdAt":"2023-12-02T11:20:50Z","includesCreatedEdit":false,"isMinimized":false,"minimizedReason":"","reactionGroups":[{"content":"THUMBS_UP","users":{"totalCount":1}}],"url":"https://github.com/xenova/transformers.js/issues/421#issuecomment-1837124273","viewerDidAuthor":false}],"createdAt":"2023-11-28T17:28:28Z","labels":[{"id":"LA_kwDOI9T9VM8AAAABMvEY-w","name":"question","description":"Further information is requested","color":"d876e3"}],"title":"[Question] FeatureExtractionPipeline input length","updatedAt":"2023-12-02T11:20:52Z"} diff --git a/scripts/question_gen/few_shot_examples/ClickHouse_clickhouse-cpp_188.txt b/scripts/question_gen/few_shot_examples/ClickHouse_clickhouse-cpp_188.txt new file mode 100644 index 0000000..0faef34 --- /dev/null +++ b/scripts/question_gen/few_shot_examples/ClickHouse_clickhouse-cpp_188.txt @@ -0,0 +1,9 @@ +iosifnicolae2(QUESTIONER) asks: Hi. +Is json type supported? + +Thank you! +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +Enmk(COLLABORATOR) replies: Hi! Not supported RN, but any PRs are very welcome! 
diff --git a/scripts/question_gen/few_shot_examples/Password4j_password4j_136.txt b/scripts/question_gen/few_shot_examples/Password4j_password4j_136.txt new file mode 100644 index 0000000..eea9871 --- /dev/null +++ b/scripts/question_gen/few_shot_examples/Password4j_password4j_136.txt @@ -0,0 +1,26 @@ +constructivetim(QUESTIONER) asks: ho do i stop this appearing every time I start java? + +| + | Password4j + + \ .: v1.7.3 :. + \\.G_.*=. + `(H'/.\| ✅ Argon2 + .>' (_--. ✅ scrypt + _=/d ,^\ ✅ bcrypt + ~~ \)-'-' ✅ PBKDF2-SHA1/SHA384/SHA512/256/SHA256/SHA512/224/SHA224/SHA512 + / | + ' ' + ⭐ If you enjoy Password4j, please star the project at https://github.com/Password4j/password4j + 🪲 Report any issue at https://github.com/Password4j/password4j/issues + + + +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +firaja(MEMBER) replies: Hi @constructivetim you can disable the startup banner by setting `global.banner=false` in your `psw4j.properties` + +Here the doc for the property https://github.com/Password4j/password4j/wiki/Properties#globalbanner-boolean + +constructivetim(QUESTIONER) replies: thanks very much diff --git a/scripts/question_gen/few_shot_examples/drogonframework_drogon_560.txt b/scripts/question_gen/few_shot_examples/drogonframework_drogon_560.txt new file mode 100644 index 0000000..092764a --- /dev/null +++ b/scripts/question_gen/few_shot_examples/drogonframework_drogon_560.txt @@ -0,0 +1,76 @@ +gtorrico(QUESTIONER) asks: Thank you for creating such an performant yet easy to use framework. + +Some users might like to use stricter warnings or treat warnings as errors in their code. Exporting the includes as system libraries [should allow](https://foonathan.net/2018/10/cmake-warnings/#preventing-warnings-in-header-files) the compiler to ignore warnings generated when including `drogon` header files but I still got warnings after changing `drogon/CMakeLists.txt` in my submodule copy of `drogon` like so: +```diff +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 0df11f1..dcb5c1a 100755 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -52,7 +52,12 @@ endif(HAS_ANY AND HAS_STRING_VIEW) + + target_include_directories( + ${PROJECT_NAME} +- PUBLIC $ ++ PRIVATE $ ++ $ ++ $ ++ $ ++ $ ++ SYSTEM INTERFACE $ + $ + $ + $ +``` + +As a workaround I did this in my `CMakeLists.txt`: +```CMake +add_subdirectory(drogon) + +get_target_property(DROGON_INTERFACE_INCLUDES drogon INTERFACE_INCLUDE_DIRECTORIES) +add_library(drogon_incs INTERFACE) +target_include_directories(drogon_incs SYSTEM INTERFACE ${DROGON_INTERFACE_INCLUDES}) + +add_executable(${exe_target} ${exe_sources}) +target_link_libraries(${exe_target} PRIVATE drogon_incs) +target_link_libraries(${exe_target} PRIVATE drogon) +target_compile_features(${exe_target} PRIVATE cxx_std_20) +target_compile_options(${exe_target} PRIVATE ${extra_warning_flags}) +``` + +Ideally this workaround would not be needed, but I don't know enough about `CMake` to know why my local changes to `drogon/CMakeLists.txt` failed to work. 
+-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +an-tao(MEMBER) replies: please change drogon/CMakeLists.txt as follows: + +```diff +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 0df11f1..c14aff7 100755 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -52,7 +52,7 @@ endif(HAS_ANY AND HAS_STRING_VIEW) + + target_include_directories( + ${PROJECT_NAME} +- PUBLIC $ ++SYSTEM PUBLIC $ + $ + $ + $ +``` + +And then add drogon as a subdirectory + +```cmake + add_subdirectory(drogon) + target_link_libraries(${PROJECT_NAME} PRIVATE drogon) +``` + +then you could set the compiler flag of your application like: + +``` +target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wextra -pedantic -Werror) +``` + +gtorrico(QUESTIONER) replies: Thank you that worked! diff --git a/scripts/question_gen/few_shot_examples/drogonframework_drogon_695.txt b/scripts/question_gen/few_shot_examples/drogonframework_drogon_695.txt new file mode 100644 index 0000000..733b9d9 --- /dev/null +++ b/scripts/question_gen/few_shot_examples/drogonframework_drogon_695.txt @@ -0,0 +1,12 @@ +gyb997(QUESTIONER) asks: Ok, I can parse the post request by set the +Content-Type: application/x-www-form-urlencoded + +but how can I do it with multipart/form-data +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +gyb997(QUESTIONER) replies: sorry ,I got it ~ +just use the MultiPartParser class + +an-tao(MEMBER) replies: Yes, you are right. diff --git a/scripts/question_gen/few_shot_examples/mrdoob_three.js_22160.txt b/scripts/question_gen/few_shot_examples/mrdoob_three.js_22160.txt new file mode 100644 index 0000000..53f746e --- /dev/null +++ b/scripts/question_gen/few_shot_examples/mrdoob_three.js_22160.txt @@ -0,0 +1,16 @@ +h4k1m0u(QUESTIONER) asks: `ShapeUtils.area()` is basically returning a negative value in certain cases (I'm not sure what's the meaning of a negative area in geometry). +I think [this algorithm](https://www.mathopenref.com/coordpolygonarea.html) was used but an absolute value was forgotten. + +**Platform:** + - Device: Desktop + - OS: Windows + - Browser: Firefox + - Three.js version: r124 (function didn't seem to have changed in dev) + +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +Mugen87(COLLABORATOR) replies: The result of `ShapeUtils.area()` is singed in order to determine the winding order of polygons. + +https://github.com/mrdoob/three.js/blob/35bdc42a8115c7404997b9ef9b9e7fdb832a5099/src/extras/ShapeUtils.js#L22-L26 diff --git a/scripts/question_gen/few_shot_examples/nsqio_nsq_1309.txt b/scripts/question_gen/few_shot_examples/nsqio_nsq_1309.txt new file mode 100644 index 0000000..d44e40b --- /dev/null +++ b/scripts/question_gen/few_shot_examples/nsqio_nsq_1309.txt @@ -0,0 +1,6 @@ +kevinsir(QUESTIONER) asks: How can I set the max times when the message retry? I mean when set the max times=3 nsq will only delivery this message 3 times . When it is all error. 
it will be discarded +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +ploxiln(MEMBER) replies: This is configured in the nsq consumer library, for example in the go-nsq `Config.MaxAttempts`: https://godoc.org/github.com/nsqio/go-nsq#Config diff --git a/scripts/question_gen/few_shot_examples/oatpp_oatpp_374.txt b/scripts/question_gen/few_shot_examples/oatpp_oatpp_374.txt new file mode 100644 index 0000000..7da1ee2 --- /dev/null +++ b/scripts/question_gen/few_shot_examples/oatpp_oatpp_374.txt @@ -0,0 +1,11 @@ +suvidh(QUESTIONER) asks: I basically do not want my website to have "not secure" tag.. +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +lganzzzo(MEMBER) replies: Hey @suvidh , + +Sure, just use one of the provided SSL adaptors: +- [oatpp-openssl](https://github.com/oatpp/oatpp-openssl) +- [oatpp-libressl](https://github.com/oatpp/oatpp-libressl) +- [oatpp-mbedtls](https://github.com/oatpp/oatpp-mbedtls) diff --git a/scripts/question_gen/few_shot_examples/polybar_polybar_2241.txt b/scripts/question_gen/few_shot_examples/polybar_polybar_2241.txt new file mode 100644 index 0000000..11fc23d --- /dev/null +++ b/scripts/question_gen/few_shot_examples/polybar_polybar_2241.txt @@ -0,0 +1,14 @@ +kelvin-hey(QUESTIONER) asks: I want to remove the system tray from the polybar, to display only the bar itself. + +Is that possible? + +If yes, how to make. +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +patrick96(MEMBER) replies: `tray-position = none` + +kelvin-hey(QUESTIONER) replies: Many thanks. + +I used tray-position = undefined and work too. diff --git a/scripts/question_gen/few_shot_examples/xenova_transformers.js_337.txt b/scripts/question_gen/few_shot_examples/xenova_transformers.js_337.txt new file mode 100644 index 0000000..e3a6abb --- /dev/null +++ b/scripts/question_gen/few_shot_examples/xenova_transformers.js_337.txt @@ -0,0 +1,70 @@ +josephrocca(QUESTIONER) asks: My tokenizer files are hosted within this folder: +``` +https://example.com/public/models/TheBloke/Llama-2-13B-GPTQ/ +``` +First I load the lib: +```js +let { AutoTokenizer } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.6.1'); +``` +Then I tried what I thought would be the most obvious/intuitive API: +```js +await AutoTokenizer.from_pretrained("/public/models/TheBloke/Llama-2-13B-GPTQ") +// requests: https://example.com/models/public/models/TheBloke/Llama-2-13B-GPTQ/tokenizer.json +``` +This is strongly counter-intuitive to me. If I add a `/` at the start of the URL, it shouldn't add anything before that. A path that starts with `/` on the web always means "append this to the origin". + +So I read the docs, and it seems to suggest that you need to put at `.` on the end: +```js +await AutoTokenizer.from_pretrained("/public/models/TheBloke/Llama-2-13B-GPTQ/.") +// requests: https://example.com/models/public/models/TheBloke/Llama-2-13B-GPTQ/tokenizer.json +``` +Nope. 
So the next obvious step was to just give it an absolute URL and be done with it: +```js +await AutoTokenizer.from_pretrained("https://example.com/public/models/TheBloke/Llama-2-13B-GPTQ") +// requests: 'https://huggingface.co/https://example.com/public/models/TheBloke/Llama-2-13B-GPTQ/resolve/main/tokenizer_config.json +``` +Oof. + +So I'm a bit confused here 😵‍💫 + +Going to keep trying, but I've spent 20 minutes on this so far, so posting here so you can improve the DX around this, even if I do manage to solve it myself soon. +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +xenova(OWNER) replies: If your app is hosted on a different site as the models, the terminology I use is `remoteModels`: + +You can try set `env.remoteHost` and/or `env.remotePathTemplate` (see [docs](https://huggingface.co/docs/transformers.js/api/env#:~:text=Hugging%20Face%20Hub.-,remoteHost,-string)) + +![image](https://github.com/xenova/transformers.js/assets/26504141/ce72a93e-8471-4d5d-b3e4-f38e455543bd) + + +These values default to: +``` +remoteHost: 'https://huggingface.co/', +remotePathTemplate: '{model}/resolve/{revision}/', +``` +(see [code](https://github.com/xenova/transformers.js/blob/5b31129218e2f6ea001f8477a094f4f3f15a2502/src/env.js#L101-L102)). + +Open to suggestions on how to improve DX :) + +xenova(OWNER) replies: If your app is hosted on the same site as the models (terminology: `localModels`), then you can modify `env.localModelPath`. It defaults to `/models/` as you see above. +```js +import { env } from '@xenova/transformers'; +env.localModelPath = '/path/to/local/models/'; +// subsequent requests will go to https://example.com/path/to/local/models/ +``` + +josephrocca(QUESTIONER) replies: Ah, thanks! :pray: I think the main DX improvement here would just be to mention that here where it says: + +> A path to a *directory* containing tokenizer files, e.g., `./my_model_directory/.`: + +![image](https://github.com/xenova/transformers.js/assets/1167575/4f0f42ac-f2de-46dd-a632-061edb4f3d0b) + +Here's the line: https://github.com/xenova/transformers.js/blob/main/src/tokenizers.js#L3812C11-L3812C11 + +I was going to make a quick pull request just now using the Github web editor but wasn't really sure on the wording here. Maybe something like "`env.localModelPath` will be prepended to the path" where `env.localModelPath` is also a link to https://huggingface.co/docs/transformers.js/custom_usage ? + +But I'd also vote for URLs starting with `/` and `https://` to be treated differently. Same with `blob:` and `data:` too I guess, since I did consider embedding the tokenizer files in the page for quicker load. Also it definitely seems like the default `localModelPath` path should just be empty rather than "models". That would have been another route to reducing confusion/friction for me here. + +All of that said, I haven't thought about this long so don't weight these thoughts too much! diff --git a/scripts/question_gen/few_shot_examples/xenova_transformers.js_421.txt b/scripts/question_gen/few_shot_examples/xenova_transformers.js_421.txt new file mode 100644 index 0000000..c7e0d83 --- /dev/null +++ b/scripts/question_gen/few_shot_examples/xenova_transformers.js_421.txt @@ -0,0 +1,75 @@ +devfacet(QUESTIONER) asks: @xenova : First of all thank you so much for your amazing work with this open source library. It opens up many possibilities. 
+ +One thing that caught my attention which is [FeatureExtractionPipeline](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline) can accept any amount of input regardless of the models' [sequence lengths](https://huggingface.co/spaces/mteb/leaderboard). Does it truncate or tokenize the data internally before applying it to the model? Is there documentation or an explanation about the implementation details? +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +xenova(OWNER) replies: Hi there 👋 Thanks so much for your kind words! 🤗 + +Yes, it does perform truncation of the (tokenized) input prior to model execution (see the parent class' `_call` function): +https://github.com/xenova/transformers.js/blob/768a2e26d7f34746caa2b102f55dbd270c5d6f36/src/pipelines.js#L126-L130 + +The amount it truncates is determined by the tokenizer's `max_model_length` (which can be found in the tokenizer_config.json, e.g., [here](https://huggingface.co/Xenova/bert-base-cased/blob/main/tokenizer_config.json#L6)) + +devfacet(QUESTIONER) replies: Is there a way to set `truncation` to `false` and return error if the given text is larger than `model_max_length`? Or do I need to implement that logic for myself before passing text to the `extractor` function? + +xenova(OWNER) replies: Sure! You can decompose the pipeline into it's separate parts: (1) Tokenization, followed by (2) Inference. Here's some example code: + +[Test in jsFiddle](https://jsfiddle.net/bksa5pgr/) + +```js +import { env, AutoModel, AutoTokenizer } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.9.0'; +env.allowLocalModels=false; // disable local model check + +const model_id = 'Xenova/all-MiniLM-L6-v2'; + +const tokenizer = await AutoTokenizer.from_pretrained(model_id); +const model = await AutoModel.from_pretrained(model_id); + +// Example function that generates token embeddings for text, +// but throws an error if the text is too long +async function generateTokenEmbeddings(text){ + // (1) Tokenization + const model_inputs = tokenizer(text); + const numberOfTokens = model_inputs.input_ids.dims.at(-1); + + // Check that inputs are valid + if(numberOfTokens > tokenizer.model_max_length){ + throw new Error(`Input is larger than max model length (${numberOfTokens} > ${tokenizer.model_max_length})`); + } + + // Input is valid + console.log(`'${text.slice(0,20)}...' is valid (${numberOfTokens} tokens).`) + + // (2) Run model + const output = await model(model_inputs); + console.log(output); +} + +const text = "Hello world!" +await generateTokenEmbeddings(text) // Works +// { +// last_hidden_state: { +// data: [object Float32Array], +// dims: [1, 5, 384], +// size: 1920, +// type: "float32" +// } +//} + +const text2 = "This won't work ".repeat(200) +await generateTokenEmbeddings(text2) // this throws an error +// Error: "Input is larger than max model length (1002 > 512)" + +``` + + + +(see [tokenizers](https://huggingface.co/docs/transformers.js/api/tokenizers) and [models](https://huggingface.co/docs/transformers.js/api/models) docs for more information). + +devfacet(QUESTIONER) replies: Great! Thank you 🙏 + +devfacet(QUESTIONER) replies: @xenova : Quick question: Some models have a typo (e.g., https://huggingface.co/Xenova/e5-small-v2/blob/a59d88d9e737bbaf6becc14ed014a9a7c82067e4/tokenizer_config.json#L7) which results in invalid `model_max_length` values. 
Should I track them myself, or is there another way to calculate `model_max_length` values? + +xenova(OWNER) replies: Oh yes good point. In that case, you can use `model.config.max_position_embeddings` (in fact, this is probably the better option). To be safe, you can take the minimum of the two and use that instead. diff --git a/scripts/question_gen/gen.py b/scripts/question_gen/gen.py new file mode 100644 index 0000000..0b869a9 --- /dev/null +++ b/scripts/question_gen/gen.py @@ -0,0 +1,205 @@ +# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team +# +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +import random +from pathlib import Path +from typing import List + +import openai +from tqdm import tqdm + +from repoqa.provider.request.openai import make_auto_request +from repoqa.utility import topological_sort + +CAPTURE_HEAD = "" +CAPTURE_TAIL = "" + +NUM_QUESTIONS = 5 + + +def get_few_shot_examples(few_shot_example_dir: Path) -> List[str]: + return [ + example_txt.read_text().rstrip() + for example_txt in few_shot_example_dir.glob("*.txt") + ] + + +def make_prompt(repo_name: str, code: str, few_shot_examples: List[str] = None) -> str: + instruction = f'Imagine you are a developer who is new to the repository, and you may have some questions regarding the repo. Can you ask {NUM_QUESTIONS} factual questions regarding the repo "{repo_name}" below and provide **brief** answers correspondingly?' + few_shot_instruction = "Here are some examples of questions and answers mined from real-world GitHub issues that you can learn from:" + if len(few_shot_examples) > 0: + few_shot_prompt = ( + few_shot_instruction + "\n\n" + "\n\n".join(few_shot_examples) + "\n\n" + ) + else: + few_shot_prompt = "" + return f"""\ +{instruction} + +``` +{code} +``` + +{instruction} + +{few_shot_prompt} + +Please follow format to complete the skeleton below: + +{CAPTURE_HEAD} +========== +**Question_1**: ... +**Answer_1**: ... +========== +**Question_2**: ... +**Answer_2**: ... +========== +**Question_3**: ... +**Answer_3**: ... +========== +... +{CAPTURE_TAIL} + +{instruction} + +Notes: +1. DO NOT reveal function names ({repo_name}) and variable names +2. Start with {CAPTURE_HEAD} and end with {CAPTURE_TAIL} +3. Customize the description to differentiate it from other functions +""" + + +# Question generation from given repo code snippets +def main( + dataset_path: str, + code_ctx_lines: int = 1000, + output_desc_path: str = "question_generation.jsonl", + use_batch_api: bool = False, + verbose: bool = False, + debug: bool = False, + num_fewshots: int = 0, # 0 for zero-shot, otherwise few-shot +): + assert use_batch_api == False, "Batch API is not supported yet." 
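+    # Only the synchronous OpenAI client path is implemented below;
+    # batch-API submission is not supported yet (hence the assertion above).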
+ + assert dataset_path.endswith(".json"), "Dataset must be a JSON file, check README" + with open(dataset_path, "r") as f: + lists = json.load(f) + + # resume from output_desc_path + if output_desc_path.endswith(".jsonl") and os.path.exists(output_desc_path): + with open(output_desc_path, "r") as f: + results = [json.loads(line) for line in f] + else: + # {repo, name, prompt, response} + results = [] + + # a set of inference task to run; each item is a tuple of {repo, name, prompt} + tasks = [] + for lang, repos in lists.items(): + print(f"🔥 Collecting unannotated needle functions for {lang}") + for repo in tqdm(repos): + if not repo.get("dependency"): + print( + f"⚠️ Skipping {repo['repo']} ({lang}) as it does not have `dependency` -- do dependency analysis first" + ) + continue + ordered_paths = topological_sort(repo["dependency"]) + repo_lines = [] + for path in ordered_paths: + repo_lines.extend(repo["content"][path].split("\n")) + + def get_code(global_start_line, global_end_line): + return "\n".join( + repo_lines[ + max(0, global_start_line - code_ctx_lines) : min( + global_end_line + code_ctx_lines, len(repo_lines) + ) + ] + ) + + existing_needles = set( + [item["name"] for item in results if item["repo"] == repo["repo"]] + ) + + for needle in repo["needles"][:1]: + needle_fn_name = needle[ + "name" + ] # the function in the middle of the context + if needle_fn_name in existing_needles: + continue + code = get_code( + needle["global_start_line"], needle["global_end_line"] + ) # to be fixed + print("*" * 80) + print(code) + print("*" * 80) + all_few_shot_examples = get_few_shot_examples( + Path(__file__).parent / "few_shot_examples" + ) + + random.seed(42) + if num_fewshots > 0: + few_shot_examples = random.sample( + all_few_shot_examples, num_fewshots + ) + else: + few_shot_examples = [] + prompt = make_prompt(repo["repo"], code, few_shot_examples) + if verbose: + print(prompt) + print("-" * 80) + tasks.append( + { + "repo": repo["repo"], + "name": needle_fn_name, + "prompt": prompt, + "code": code, + } + ) + + print(f"🔥 {len(tasks)} needle functions to be annotated in total") + client = openai.Client() + with open(output_desc_path, "+a") as f_out: + for task in tqdm(tasks): + print(f"🔥 Annotating {task['name']} in {task['repo']}") + output = make_auto_request( + client, + task["prompt"], + model="gpt-4-turbo", + max_tokens=2048, + temperature=0.2, + n=1, + ) + raw_response = output.choices[0].message.content + result = { + "repo": task["repo"], + "name": task["name"], + "prompt": task["prompt"], + "middle_function_name": task["name"], + "code": task["code"], + "raw_response": raw_response, + "response": raw_response.split(CAPTURE_HEAD)[-1].split(CAPTURE_TAIL)[0], + } + json.dump(result, f_out) + f_out.write("\n") + f_out.flush() + + if debug: + print("[PROMPT]", "-" * 80) + # the prompt is too long, so we print the last 200 lines + print("\n".join(task["prompt"].split("\n")[-200:])) + print("[RESPONSE]", "-" * 80) + print(raw_response) + print("-" * 80) + print("Enter to continue... 
or b to break:") + if input() == "b": + break + + +if __name__ == "__main__": + from fire import Fire + + Fire(main) diff --git a/scripts/question_gen/issue_demo/issue_demo_list.txt b/scripts/question_gen/issue_demo/issue_demo_list.txt new file mode 100644 index 0000000..c0b0d8f --- /dev/null +++ b/scripts/question_gen/issue_demo/issue_demo_list.txt @@ -0,0 +1,10 @@ +https://github.com/oatpp/oatpp/issues/374 +https://github.com/ClickHouse/clickhouse-cpp/issues/188 +https://github.com/polybar/polybar/issues/2241 +https://github.com/drogonframework/drogon/issues/695 +https://github.com/mrdoob/three.js/issues/22160 +https://github.com/drogonframework/drogon/issues/560 +https://github.com/Password4j/password4j/issues/136 +https://github.com/xenova/transformers.js/issues/421 +https://github.com/xenova/transformers.js/issues/337 +https://github.com/nsqio/nsq/issues/1309 diff --git a/scripts/question_gen/issue_dialogue/ClickHouse_clickhouse-cpp_188.txt b/scripts/question_gen/issue_dialogue/ClickHouse_clickhouse-cpp_188.txt new file mode 100644 index 0000000..0faef34 --- /dev/null +++ b/scripts/question_gen/issue_dialogue/ClickHouse_clickhouse-cpp_188.txt @@ -0,0 +1,9 @@ +iosifnicolae2(QUESTIONER) asks: Hi. +Is json type supported? + +Thank you! +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +Enmk(COLLABORATOR) replies: Hi! Not supported RN, but any PRs are very welcome! diff --git a/scripts/question_gen/issue_dialogue/Password4j_password4j_136.txt b/scripts/question_gen/issue_dialogue/Password4j_password4j_136.txt new file mode 100644 index 0000000..eea9871 --- /dev/null +++ b/scripts/question_gen/issue_dialogue/Password4j_password4j_136.txt @@ -0,0 +1,26 @@ +constructivetim(QUESTIONER) asks: ho do i stop this appearing every time I start java? + +| + | Password4j + + \ .: v1.7.3 :. + \\.G_.*=. + `(H'/.\| ✅ Argon2 + .>' (_--. ✅ scrypt + _=/d ,^\ ✅ bcrypt + ~~ \)-'-' ✅ PBKDF2-SHA1/SHA384/SHA512/256/SHA256/SHA512/224/SHA224/SHA512 + / | + ' ' + ⭐ If you enjoy Password4j, please star the project at https://github.com/Password4j/password4j + 🪲 Report any issue at https://github.com/Password4j/password4j/issues + + + +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +firaja(MEMBER) replies: Hi @constructivetim you can disable the startup banner by setting `global.banner=false` in your `psw4j.properties` + +Here the doc for the property https://github.com/Password4j/password4j/wiki/Properties#globalbanner-boolean + +constructivetim(QUESTIONER) replies: thanks very much diff --git a/scripts/question_gen/issue_dialogue/drogonframework_drogon_560.txt b/scripts/question_gen/issue_dialogue/drogonframework_drogon_560.txt new file mode 100644 index 0000000..092764a --- /dev/null +++ b/scripts/question_gen/issue_dialogue/drogonframework_drogon_560.txt @@ -0,0 +1,76 @@ +gtorrico(QUESTIONER) asks: Thank you for creating such an performant yet easy to use framework. + +Some users might like to use stricter warnings or treat warnings as errors in their code. 
Exporting the includes as system libraries [should allow](https://foonathan.net/2018/10/cmake-warnings/#preventing-warnings-in-header-files) the compiler to ignore warnings generated when including `drogon` header files but I still got warnings after changing `drogon/CMakeLists.txt` in my submodule copy of `drogon` like so: +```diff +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 0df11f1..dcb5c1a 100755 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -52,7 +52,12 @@ endif(HAS_ANY AND HAS_STRING_VIEW) + + target_include_directories( + ${PROJECT_NAME} +- PUBLIC $ ++ PRIVATE $ ++ $ ++ $ ++ $ ++ $ ++ SYSTEM INTERFACE $ + $ + $ + $ +``` + +As a workaround I did this in my `CMakeLists.txt`: +```CMake +add_subdirectory(drogon) + +get_target_property(DROGON_INTERFACE_INCLUDES drogon INTERFACE_INCLUDE_DIRECTORIES) +add_library(drogon_incs INTERFACE) +target_include_directories(drogon_incs SYSTEM INTERFACE ${DROGON_INTERFACE_INCLUDES}) + +add_executable(${exe_target} ${exe_sources}) +target_link_libraries(${exe_target} PRIVATE drogon_incs) +target_link_libraries(${exe_target} PRIVATE drogon) +target_compile_features(${exe_target} PRIVATE cxx_std_20) +target_compile_options(${exe_target} PRIVATE ${extra_warning_flags}) +``` + +Ideally this workaround would not be needed, but I don't know enough about `CMake` to know why my local changes to `drogon/CMakeLists.txt` failed to work. +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +an-tao(MEMBER) replies: please change drogon/CMakeLists.txt as follows: + +```diff +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 0df11f1..c14aff7 100755 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -52,7 +52,7 @@ endif(HAS_ANY AND HAS_STRING_VIEW) + + target_include_directories( + ${PROJECT_NAME} +- PUBLIC $ ++SYSTEM PUBLIC $ + $ + $ + $ +``` + +And then add drogon as a subdirectory + +```cmake + add_subdirectory(drogon) + target_link_libraries(${PROJECT_NAME} PRIVATE drogon) +``` + +then you could set the compiler flag of your application like: + +``` +target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wextra -pedantic -Werror) +``` + +gtorrico(QUESTIONER) replies: Thank you that worked! diff --git a/scripts/question_gen/issue_dialogue/drogonframework_drogon_695.txt b/scripts/question_gen/issue_dialogue/drogonframework_drogon_695.txt new file mode 100644 index 0000000..733b9d9 --- /dev/null +++ b/scripts/question_gen/issue_dialogue/drogonframework_drogon_695.txt @@ -0,0 +1,12 @@ +gyb997(QUESTIONER) asks: Ok, I can parse the post request by set the +Content-Type: application/x-www-form-urlencoded + +but how can I do it with multipart/form-data +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +gyb997(QUESTIONER) replies: sorry ,I got it ~ +just use the MultiPartParser class + +an-tao(MEMBER) replies: Yes, you are right. diff --git a/scripts/question_gen/issue_dialogue/mrdoob_three.js_22160.txt b/scripts/question_gen/issue_dialogue/mrdoob_three.js_22160.txt new file mode 100644 index 0000000..53f746e --- /dev/null +++ b/scripts/question_gen/issue_dialogue/mrdoob_three.js_22160.txt @@ -0,0 +1,16 @@ +h4k1m0u(QUESTIONER) asks: `ShapeUtils.area()` is basically returning a negative value in certain cases (I'm not sure what's the meaning of a negative area in geometry). 
+I think [this algorithm](https://www.mathopenref.com/coordpolygonarea.html) was used but an absolute value was forgotten. + +**Platform:** + - Device: Desktop + - OS: Windows + - Browser: Firefox + - Three.js version: r124 (function didn't seem to have changed in dev) + +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +Mugen87(COLLABORATOR) replies: The result of `ShapeUtils.area()` is singed in order to determine the winding order of polygons. + +https://github.com/mrdoob/three.js/blob/35bdc42a8115c7404997b9ef9b9e7fdb832a5099/src/extras/ShapeUtils.js#L22-L26 diff --git a/scripts/question_gen/issue_dialogue/nsqio_nsq_1309.txt b/scripts/question_gen/issue_dialogue/nsqio_nsq_1309.txt new file mode 100644 index 0000000..d44e40b --- /dev/null +++ b/scripts/question_gen/issue_dialogue/nsqio_nsq_1309.txt @@ -0,0 +1,6 @@ +kevinsir(QUESTIONER) asks: How can I set the max times when the message retry? I mean when set the max times=3 nsq will only delivery this message 3 times . When it is all error. it will be discarded +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +ploxiln(MEMBER) replies: This is configured in the nsq consumer library, for example in the go-nsq `Config.MaxAttempts`: https://godoc.org/github.com/nsqio/go-nsq#Config diff --git a/scripts/question_gen/issue_dialogue/oatpp_oatpp_374.txt b/scripts/question_gen/issue_dialogue/oatpp_oatpp_374.txt new file mode 100644 index 0000000..7da1ee2 --- /dev/null +++ b/scripts/question_gen/issue_dialogue/oatpp_oatpp_374.txt @@ -0,0 +1,11 @@ +suvidh(QUESTIONER) asks: I basically do not want my website to have "not secure" tag.. +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +lganzzzo(MEMBER) replies: Hey @suvidh , + +Sure, just use one of the provided SSL adaptors: +- [oatpp-openssl](https://github.com/oatpp/oatpp-openssl) +- [oatpp-libressl](https://github.com/oatpp/oatpp-libressl) +- [oatpp-mbedtls](https://github.com/oatpp/oatpp-mbedtls) diff --git a/scripts/question_gen/issue_dialogue/polybar_polybar_2241.txt b/scripts/question_gen/issue_dialogue/polybar_polybar_2241.txt new file mode 100644 index 0000000..11fc23d --- /dev/null +++ b/scripts/question_gen/issue_dialogue/polybar_polybar_2241.txt @@ -0,0 +1,14 @@ +kelvin-hey(QUESTIONER) asks: I want to remove the system tray from the polybar, to display only the bar itself. + +Is that possible? + +If yes, how to make. +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +patrick96(MEMBER) replies: `tray-position = none` + +kelvin-hey(QUESTIONER) replies: Many thanks. + +I used tray-position = undefined and work too. 
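The issue_dialogue files above (and the matching few_shot_examples) all share the same fixed separator block between the opening question and the follow-up comments, so they can be split apart mechanically. A minimal sketch, assuming the package layout added by this patch; the `split_dialogue` helper and the sample path are illustrative only, not part of the patch:

```python
from pathlib import Path

# ISSUE_SEP is the 50-dash / header-line / 50-dash block visible in the
# dialogue files above; it is exposed by the scripts.question_gen package
# (make_readable_issue.py imports it from there).
from scripts.question_gen import ISSUE_SEP


def split_dialogue(dialogue_path: Path) -> tuple[str, str]:
    """Split a readable issue dialogue into (question, discussion)."""
    text = dialogue_path.read_text()
    question, _, discussion = text.partition(ISSUE_SEP)
    return question.strip(), discussion.strip()


question, discussion = split_dialogue(
    Path("scripts/question_gen/issue_dialogue/polybar_polybar_2241.txt")
)
print(question)  # kelvin-hey(QUESTIONER) asks: I want to remove the system tray...
```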
diff --git a/scripts/question_gen/issue_dialogue/xenova_transformers.js_337.txt b/scripts/question_gen/issue_dialogue/xenova_transformers.js_337.txt new file mode 100644 index 0000000..e3a6abb --- /dev/null +++ b/scripts/question_gen/issue_dialogue/xenova_transformers.js_337.txt @@ -0,0 +1,70 @@ +josephrocca(QUESTIONER) asks: My tokenizer files are hosted within this folder: +``` +https://example.com/public/models/TheBloke/Llama-2-13B-GPTQ/ +``` +First I load the lib: +```js +let { AutoTokenizer } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.6.1'); +``` +Then I tried what I thought would be the most obvious/intuitive API: +```js +await AutoTokenizer.from_pretrained("/public/models/TheBloke/Llama-2-13B-GPTQ") +// requests: https://example.com/models/public/models/TheBloke/Llama-2-13B-GPTQ/tokenizer.json +``` +This is strongly counter-intuitive to me. If I add a `/` at the start of the URL, it shouldn't add anything before that. A path that starts with `/` on the web always means "append this to the origin". + +So I read the docs, and it seems to suggest that you need to put at `.` on the end: +```js +await AutoTokenizer.from_pretrained("/public/models/TheBloke/Llama-2-13B-GPTQ/.") +// requests: https://example.com/models/public/models/TheBloke/Llama-2-13B-GPTQ/tokenizer.json +``` +Nope. So the next obvious step was to just give it an absolute URL and be done with it: +```js +await AutoTokenizer.from_pretrained("https://example.com/public/models/TheBloke/Llama-2-13B-GPTQ") +// requests: 'https://huggingface.co/https://example.com/public/models/TheBloke/Llama-2-13B-GPTQ/resolve/main/tokenizer_config.json +``` +Oof. + +So I'm a bit confused here 😵‍💫 + +Going to keep trying, but I've spent 20 minutes on this so far, so posting here so you can improve the DX around this, even if I do manage to solve it myself soon. +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +xenova(OWNER) replies: If your app is hosted on a different site as the models, the terminology I use is `remoteModels`: + +You can try set `env.remoteHost` and/or `env.remotePathTemplate` (see [docs](https://huggingface.co/docs/transformers.js/api/env#:~:text=Hugging%20Face%20Hub.-,remoteHost,-string)) + +![image](https://github.com/xenova/transformers.js/assets/26504141/ce72a93e-8471-4d5d-b3e4-f38e455543bd) + + +These values default to: +``` +remoteHost: 'https://huggingface.co/', +remotePathTemplate: '{model}/resolve/{revision}/', +``` +(see [code](https://github.com/xenova/transformers.js/blob/5b31129218e2f6ea001f8477a094f4f3f15a2502/src/env.js#L101-L102)). + +Open to suggestions on how to improve DX :) + +xenova(OWNER) replies: If your app is hosted on the same site as the models (terminology: `localModels`), then you can modify `env.localModelPath`. It defaults to `/models/` as you see above. +```js +import { env } from '@xenova/transformers'; +env.localModelPath = '/path/to/local/models/'; +// subsequent requests will go to https://example.com/path/to/local/models/ +``` + +josephrocca(QUESTIONER) replies: Ah, thanks! 
:pray: I think the main DX improvement here would just be to mention that here where it says: + +> A path to a *directory* containing tokenizer files, e.g., `./my_model_directory/.`: + +![image](https://github.com/xenova/transformers.js/assets/1167575/4f0f42ac-f2de-46dd-a632-061edb4f3d0b) + +Here's the line: https://github.com/xenova/transformers.js/blob/main/src/tokenizers.js#L3812C11-L3812C11 + +I was going to make a quick pull request just now using the Github web editor but wasn't really sure on the wording here. Maybe something like "`env.localModelPath` will be prepended to the path" where `env.localModelPath` is also a link to https://huggingface.co/docs/transformers.js/custom_usage ? + +But I'd also vote for URLs starting with `/` and `https://` to be treated differently. Same with `blob:` and `data:` too I guess, since I did consider embedding the tokenizer files in the page for quicker load. Also it definitely seems like the default `localModelPath` path should just be empty rather than "models". That would have been another route to reducing confusion/friction for me here. + +All of that said, I haven't thought about this long so don't weight these thoughts too much! diff --git a/scripts/question_gen/issue_dialogue/xenova_transformers.js_421.txt b/scripts/question_gen/issue_dialogue/xenova_transformers.js_421.txt new file mode 100644 index 0000000..c7e0d83 --- /dev/null +++ b/scripts/question_gen/issue_dialogue/xenova_transformers.js_421.txt @@ -0,0 +1,75 @@ +devfacet(QUESTIONER) asks: @xenova : First of all thank you so much for your amazing work with this open source library. It opens up many possibilities. + +One thing that caught my attention which is [FeatureExtractionPipeline](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline) can accept any amount of input regardless of the models' [sequence lengths](https://huggingface.co/spaces/mteb/leaderboard). Does it truncate or tokenize the data internally before applying it to the model? Is there documentation or an explanation about the implementation details? +-------------------------------------------------- +The below is the discussion and comments on the question: +-------------------------------------------------- + +xenova(OWNER) replies: Hi there 👋 Thanks so much for your kind words! 🤗 + +Yes, it does perform truncation of the (tokenized) input prior to model execution (see the parent class' `_call` function): +https://github.com/xenova/transformers.js/blob/768a2e26d7f34746caa2b102f55dbd270c5d6f36/src/pipelines.js#L126-L130 + +The amount it truncates is determined by the tokenizer's `max_model_length` (which can be found in the tokenizer_config.json, e.g., [here](https://huggingface.co/Xenova/bert-base-cased/blob/main/tokenizer_config.json#L6)) + +devfacet(QUESTIONER) replies: Is there a way to set `truncation` to `false` and return error if the given text is larger than `model_max_length`? Or do I need to implement that logic for myself before passing text to the `extractor` function? + +xenova(OWNER) replies: Sure! You can decompose the pipeline into it's separate parts: (1) Tokenization, followed by (2) Inference. 
Here's some example code: + +[Test in jsFiddle](https://jsfiddle.net/bksa5pgr/) + +```js +import { env, AutoModel, AutoTokenizer } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.9.0'; +env.allowLocalModels=false; // disable local model check + +const model_id = 'Xenova/all-MiniLM-L6-v2'; + +const tokenizer = await AutoTokenizer.from_pretrained(model_id); +const model = await AutoModel.from_pretrained(model_id); + +// Example function that generates token embeddings for text, +// but throws an error if the text is too long +async function generateTokenEmbeddings(text){ + // (1) Tokenization + const model_inputs = tokenizer(text); + const numberOfTokens = model_inputs.input_ids.dims.at(-1); + + // Check that inputs are valid + if(numberOfTokens > tokenizer.model_max_length){ + throw new Error(`Input is larger than max model length (${numberOfTokens} > ${tokenizer.model_max_length})`); + } + + // Input is valid + console.log(`'${text.slice(0,20)}...' is valid (${numberOfTokens} tokens).`) + + // (2) Run model + const output = await model(model_inputs); + console.log(output); +} + +const text = "Hello world!" +await generateTokenEmbeddings(text) // Works +// { +// last_hidden_state: { +// data: [object Float32Array], +// dims: [1, 5, 384], +// size: 1920, +// type: "float32" +// } +//} + +const text2 = "This won't work ".repeat(200) +await generateTokenEmbeddings(text2) // this throws an error +// Error: "Input is larger than max model length (1002 > 512)" + +``` + + + +(see [tokenizers](https://huggingface.co/docs/transformers.js/api/tokenizers) and [models](https://huggingface.co/docs/transformers.js/api/models) docs for more information). + +devfacet(QUESTIONER) replies: Great! Thank you 🙏 + +devfacet(QUESTIONER) replies: @xenova : Quick question: Some models have a typo (e.g., https://huggingface.co/Xenova/e5-small-v2/blob/a59d88d9e737bbaf6becc14ed014a9a7c82067e4/tokenizer_config.json#L7) which results in invalid `model_max_length` values. Should I track them myself, or is there another way to calculate `model_max_length` values? + +xenova(OWNER) replies: Oh yes good point. In that case, you can use `model.config.max_position_embeddings` (in fact, this is probably the better option). To be safe, you can take the minimum of the two and use that instead. 
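The `llm_judge_answer.py` script added next compares two answers per issue or generated question, read from a JSONL file (e.g. `gh_issue_answer.jsonl`). A minimal sketch of writing one such record, with field names taken from that script; the identifier, repository name, and paths are invented placeholders:

```python
import json

# One JSON object per line; llm_judge_answer.py indexes records by "issue_id"
# (or "question_id" for generated questions).
record = {
    "issue_id": "nsqio_nsq_1309",                    # placeholder identifier
    "repo": "nsq",                                   # placeholder repo name
    "code_context_files": ["path/to/relevant_file.go"],  # placeholder paths
    "answer_no_context": "Answer produced without repository context ...",
    "answer_with_context": "Answer produced with repository context ...",
}

with open("gh_issue_answer.jsonl", "a") as f:
    f.write(json.dumps(record) + "\n")
```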
diff --git a/scripts/question_gen/llm_judge_answer.py b/scripts/question_gen/llm_judge_answer.py new file mode 100644 index 0000000..f5f7fc3 --- /dev/null +++ b/scripts/question_gen/llm_judge_answer.py @@ -0,0 +1,168 @@ +# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team +# +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +from pathlib import Path + +import openai + +from repoqa.provider.request.openai import make_auto_request +from scripts.question_gen import ( + extract_answers, + get_code_context_from_gen_question_jsonl, + retrieve_code_context_files, + truncate_context_files_if_too_large, +) + + +def judge_two_answers( + repo: str, + issue_or_question_content: str, + issue_or_question_id: str, + ans_1: str, + ans_2: str, + model: str, + code_context: str, + base_url: str = None, + gt_answer: str = None, + backend: str = "openai", + max_new_tokens: int = 2048, +) -> str: + if backend == "openai": + client = openai.Client() + else: + raise NotImplementedError("Only openai is supported for now") + + if ( + issue_or_question_id.startswith(repo) + and issue_or_question_id.replace(repo + "_", "").isdigit() + ): + prompt = f"Here is a real-world GitHub issue from the repository {repo} (note that the issue is closed and may contain the answer of the question, but only the question was visible to the answerer):" + else: + prompt = f"Here is a question on the repository {repo}:" + prompt += f"""\n\n{issue_or_question_content}. + The below are two answers to the issue/question above. Please judge which one is better and give an explanation. + + {ans_1} + + + {ans_2} + + + Please answer in the following format: + ANSWER_1 or ANSWER_2 + + EXPLANATION + + """ + + if code_context is not None: + prompt = f"{prompt}\n\n Here is the code context that may be relevant to this issue:\n\n{code_context}\n\n" + + if gt_answer is not None: + prompt = f"{prompt}\n\n Here is the ground truth answer that you should take as the reference:\n\n{gt_answer}\n\nNote that you should prefer the answer that is more similar/consistent to the ground truth answer." + + output = make_auto_request( + client, + prompt, + model, + max_tokens=max_new_tokens, + temperature=0.2, + n=1, + ) + + return output.choices[0].message.content + + +def main( + key: str, + dataset_path: str, + answer_path: str, # eg. "gh_issue_answer.jsonl" + output_path: str, # eg. "judge_gh_issue_answer.jsonl" + judge_model: str = "gpt-4o", + max_ctx_lines: int = 2000, + use_batch_api: bool = False, + issue_dir: str = None, + gen_question_jsonl_file: str = None, +): + assert use_batch_api == False, "Batch API is not supported yet." 
+ assert dataset_path.endswith(".json"), "Dataset must be a JSON file, check README" + assert key in [ + "issue_id", + "question_id", + ], "Key must be either 'issue_id' or 'question_id'" + if key == "issue_id": + assert os.path.exists(issue_dir), "Issue directory does not exist" + else: + assert os.path.exists( + gen_question_jsonl_file + ), "Generated question JSONL file does not exist" + + answers = extract_answers(answer_path, key) + with open(output_path, "+a") as f_out: + # for issue_file in Path(issue_dir).glob("*.txt"): + # for issue_or_question_id in answers.keys(): + for issue_or_question_id in answers.keys(): + repo_name = answers[issue_or_question_id]["repo"] + if key == "issue_id": + issue_or_question_file = Path(issue_dir) / f"{issue_or_question_id}.txt" + issue_or_question_content = issue_or_question_file.read_text() + + code_context_dict = retrieve_code_context_files( + dataset_path, + issue_or_question_content, + repo_name, + answers[issue_or_question_id]["code_context_files"], + ) + # cut the context files if too large + code_context = truncate_context_files_if_too_large( + issue_or_question_id, code_context_dict, max_ctx_lines + ) + else: + issue_or_question_content = answers[issue_or_question_id]["question"] + middle_func_name = issue_or_question_id.split("#")[1] + code_context = get_code_context_from_gen_question_jsonl( + gen_question_jsonl_file, repo_name, middle_func_name + ) + + issue_answer_no_context = answers[issue_or_question_id]["answer_no_context"] + issue_answer_with_context = answers[issue_or_question_id][ + "answer_with_context" + ] + judge_result = judge_two_answers( + repo_name, + issue_or_question_content, + issue_or_question_id, + issue_answer_no_context, + issue_answer_with_context, + judge_model, + code_context, + ) + + better_answer_id = judge_result.split("")[0].strip() + explanation = ( + judge_result.split("")[1] + .split("")[0] + .strip() + ) + better_answer = ( + "no_context" if better_answer_id == "ANSWER_1" else "with_context" + ) + + result = { + "repo": repo_name, + "issue_or_question_id": issue_or_question_id, + "better_answer": better_answer, + "explanation": explanation, + } + json.dump(result, f_out) + f_out.write("\n") + f_out.flush() + + +if __name__ == "__main__": + from fire import Fire + + Fire(main) diff --git a/scripts/question_gen/make_readable_issue.py b/scripts/question_gen/make_readable_issue.py new file mode 100644 index 0000000..e4311f1 --- /dev/null +++ b/scripts/question_gen/make_readable_issue.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team +# +# SPDX-License-Identifier: Apache-2.0 + +import json +from pathlib import Path + +from scripts.question_gen import ISSUE_SEP + +QUESTIONER_ROLE = "QUESTIONER" + + +def format_issue_dialogue(question_pair, comment_tuples): + question_author, question = question_pair + dialogue = f"{question_author}({QUESTIONER_ROLE}) asks: {question}" + dialogue += ISSUE_SEP + for comment_author, comment_author_role, comment in comment_tuples: + if comment_author == question_author: + comment_author_role = QUESTIONER_ROLE + dialogue += f"\n\n{comment_author}({comment_author_role}) replies: {comment}" + return dialogue + + +def make_readable_issue_from_json(issue_json_file): + with open(issue_json_file, "r") as f: + data = json.load(f) + question = data["body"] + question_author = data["author"]["login"] + question_pair = (question_author, question) + comment_tuples = [] + for comment in data["comments"]: + comment_tuples.append( + (comment["author"]["login"], 
comment["authorAssociation"], comment["body"]) + ) + + return format_issue_dialogue(question_pair, comment_tuples) + + +if __name__ == "__main__": + dumped_issue_json_dir = Path(__file__).parent / "dumped_issue_json" + issue_dialogue_dir = Path(__file__).parent / "issue_dialogue" + issue_dialogue_dir.mkdir(exist_ok=True) + for issue_json_file in dumped_issue_json_dir.iterdir(): + issue_dialogue = make_readable_issue_from_json(issue_json_file) + issue_dialogue_file = issue_dialogue_dir / (issue_json_file.stem + ".txt") + with open(issue_dialogue_file, "w") as f: + f.write(issue_dialogue)