Environment:
Python 3.8
evaluate: a4bdc10
OS: Ubuntu 24

Trace:
```
=========================================================== FAILURES ============================================================
__________________________________________ LocalModuleTest.test_load_metric_code_eval ___________________________________________
[gw0] linux -- Python 3.8.19 /var/git/repos/evaluate/.venv/bin/python

self = <tests.test_metric_common.LocalModuleTest testMethod=test_load_metric_code_eval>, evaluation_module_name = 'code_eval'
evaluation_module_type = 'metric'

    def test_load(self, evaluation_module_name, evaluation_module_type):
        doctest.ELLIPSIS_MARKER = "[...]"
        evaluation_module = importlib.import_module(
            evaluate.loading.evaluation_module_factory(
                os.path.join(evaluation_module_type + "s", evaluation_module_name), module_type=evaluation_module_type
            ).module_path
        )
        evaluation_instance = evaluate.loading.import_main_class(evaluation_module.__name__)
        # check parameters
        parameters = inspect.signature(evaluation_instance._compute).parameters
        self.assertTrue(all([p.kind != p.VAR_KEYWORD for p in parameters.values()]))  # no **kwargs
        # run doctest
        with self.patch_intensive_calls(evaluation_module_name, evaluation_module.__name__):
            with self.use_local_metrics(evaluation_module_type):
                try:
>                   results = doctest.testmod(evaluation_module, verbose=True, raise_on_error=True)

tests/test_metric_common.py:117:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/lib/python3.8/doctest.py:1956: in testmod
    runner.run(test)
/usr/lib/python3.8/doctest.py:1844: in run
    r = DocTestRunner.run(self, test, compileflags, out, False)
/usr/lib/python3.8/doctest.py:1483: in run
    return self.__run(test, compileflags, out)
/usr/lib/python3.8/doctest.py:1388: in __run
    self.report_unexpected_exception(out, test, example,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <doctest.DebugRunner object at 0x785042c7ebe0>
out = <built-in method write of _io.TextIOWrapper object at 0x7851f6fd0520>
test = <DocTest evaluate_modules.metrics.code_eval.78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176.code_eval...dules/metrics/code_eval/78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176/code_eval.py:144 (5 examples)>
example = <doctest.Example object at 0x785042c7efd0>
exc_info = (<class 'ValueError'>, ValueError('\n################################################################################\...##############################################################################'), <traceback object at 0x785043fb5680>)

    def report_unexpected_exception(self, out, test, example, exc_info):
>       raise UnexpectedException(test, example, exc_info)
E       doctest.UnexpectedException: <DocTest evaluate_modules.metrics.code_eval.78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176.code_eval.CodeEval from /tmp/pytest-of-jpodivin/pytest-1/popen-gw0/cache/modules/evaluate_modules/metrics/code_eval/78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176/code_eval.py:144 (5 examples)>

/usr/lib/python3.8/doctest.py:1850: UnexpectedException

During handling of the above exception, another exception occurred:

self = <tests.test_metric_common.LocalModuleTest testMethod=test_load_metric_code_eval>, evaluation_module_name = 'code_eval'
evaluation_module_type = 'metric'

    def test_load(self, evaluation_module_name, evaluation_module_type):
        doctest.ELLIPSIS_MARKER = "[...]"
        evaluation_module = importlib.import_module(
            evaluate.loading.evaluation_module_factory(
                os.path.join(evaluation_module_type + "s", evaluation_module_name), module_type=evaluation_module_type
            ).module_path
        )
        evaluation_instance = evaluate.loading.import_main_class(evaluation_module.__name__)
        # check parameters
        parameters = inspect.signature(evaluation_instance._compute).parameters
        self.assertTrue(all([p.kind != p.VAR_KEYWORD for p in parameters.values()]))  # no **kwargs
        # run doctest
        with self.patch_intensive_calls(evaluation_module_name, evaluation_module.__name__):
            with self.use_local_metrics(evaluation_module_type):
                try:
                    results = doctest.testmod(evaluation_module, verbose=True, raise_on_error=True)
                except doctest.UnexpectedException as e:
>                   raise e.exc_info[1]  # raise the exception that doctest caught

tests/test_metric_common.py:119:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/lib/python3.8/doctest.py:1336: in __run
    exec(compile(example.source, filename, "single",
<doctest evaluate_modules.metrics.code_eval.78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176.code_eval.CodeEval[3]>:1: in <module>
    ???
.venv/lib/python3.8/site-packages/evaluate/module.py:467: in compute
    output = self._compute(**inputs, **compute_kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = EvaluationModule(name: "code_eval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='str...
    predictions=candidates, k=[1, 2])
    >>> print(pass_at_k)
    {'pass@1': 0.5, 'pass@2': 1.0}
""", stored examples: 0)
predictions = [['def add(a,b): return a*b', 'def add(a, b): return a+b']], references = ['assert add(2,3)==5'], k = [1, 2]
num_workers = 4, timeout = 3.0

    def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
        """Returns the scores"""

        if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
>           raise ValueError(_WARNING)
E           ValueError:
E           ################################################################################
E                                             !!!WARNING!!!
E           ################################################################################
E           The "code_eval" metric executes untrusted model-generated code in Python.
E           Although it is highly unlikely that model-generated code will do something
E           overtly malicious in response to this test suite, model-generated code may act
E           destructively due to a lack of model capability or alignment.
E           Users are strongly encouraged to sandbox this evaluation suite so that it
E           does not perform destructive actions on their host or network. For more
E           information on how OpenAI sandboxes its code, see the paper "Evaluating Large
E           Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
E
E           Once you have read this disclaimer and taken appropriate precautions,
E           set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this
E           with:
E
E           >>> import os
E           >>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
E
E           ################################################################################
```
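For context, the failure comes from the safety guard in the metric's `_compute`, which refuses to run unless `HF_ALLOW_CODE_EVAL` is set to `"1"`, so the doctest fails whenever that variable is unset. Below is a minimal sketch of opting in locally and reproducing the doctest inputs shown in the trace; it assumes you have read the disclaimer above and accept that the metric executes model-generated code, ideally inside a sandbox:

```python
import os

# Opt in to executing model-generated code, as required by the
# code_eval safety guard shown in the trace above.
# Only do this in a sandboxed environment.
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

import evaluate

code_eval = evaluate.load("code_eval")
pass_at_k, results = code_eval.compute(
    references=["assert add(2,3)==5"],
    predictions=[["def add(a,b): return a*b", "def add(a, b): return a+b"]],
    k=[1, 2],
)
print(pass_at_k)  # expected: {'pass@1': 0.5, 'pass@2': 1.0}
```

Equivalently, exporting `HF_ALLOW_CODE_EVAL=1` in the shell before invoking pytest lets the `code_eval` doctest run as part of the test suite.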