diff --git a/examples/tool_inference.py b/examples/tool_inference.py
new file mode 100644
index 000000000..34b08af5c
--- /dev/null
+++ b/examples/tool_inference.py
@@ -0,0 +1,45 @@
+import os
+import argparse
+from lmflow.args import InferencerArguments
+from lmflow.args import ModelArguments
+from lmflow.args import DatasetArguments
+from lmflow.models import hf_decoder_model
+from lmflow.pipeline.inferencer import ToolInferencer
+def main():
+ parser = argparse.ArgumentParser()
+    parser.add_argument('--gpu', type=str, default='0',
+                        help='GPU id; this script currently only supports a single GPU')
+    parser.add_argument('--model', type=str, default='codellama/CodeLlama-7b-instruct-hf',
+                        help='code generation model name or path; currently only '
+                             'HuggingFace decoder-only models are supported')
+ params = parser.parse_args()
+ os.environ["CUDA_VISIBLE_DEVICES"] = params.gpu
+
+ model_args = ModelArguments(model_name_or_path=params.model)
+ model = hf_decoder_model.HFDecoderModel(model_args)
+ inferencer_args = InferencerArguments()
+ data_args = DatasetArguments()
+
+ toolinf = ToolInferencer(model_args, data_args, inferencer_args)
+
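+    # Simple interactive loop: prompt the model for code, show the generated
+    # result, and optionally execute it; Ctrl-D (EOF) exits.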
+ while True:
+ try:
+ text = input("Tool Inference: ")
+ toolinf_res = toolinf.inference(model, text)
+            # Strip special tokens left in the decoded text (assumed here to be
+            # the <s>/</s> BOS/EOS markers).
+            toolinf_res = toolinf_res.replace("<s>", "")
+            toolinf_res = toolinf_res.replace("</s>", "")
+ print('\n\nResult:')
+ print(toolinf_res)
+ print('\n\n')
+ run_code = input("Run code? (y/n): ")
+            if run_code == 'y':
+                toolinf.code_exec(toolinf_res)
+            # Any other answer skips execution and returns to the prompt.
+ except EOFError:
+ break
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/run_tool.sh b/scripts/run_tool.sh
new file mode 100644
index 000000000..45904f9ea
--- /dev/null
+++ b/scripts/run_tool.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+model="gorilla-llm/gorilla-7b-hf-delta-v1"
+python examples/tool_inference.py \
+    --model ${model}
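+# The --gpu flag of examples/tool_inference.py can pin the device, e.g.:
+#   python examples/tool_inference.py --model ${model} --gpu 0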
diff --git a/src/lmflow/pipeline/inferencer.py b/src/lmflow/pipeline/inferencer.py
index b9d66f486..fb11ab564 100644
--- a/src/lmflow/pipeline/inferencer.py
+++ b/src/lmflow/pipeline/inferencer.py
@@ -15,6 +15,7 @@
import logging
from typing import Dict, List
from concurrent.futures import ThreadPoolExecutor
+import subprocess
from transformers import AutoConfig
import torch.distributed as dist
@@ -553,3 +554,88 @@ def speculative_sampling(input_ids: torch.Tensor,
def stream_inference(self):
raise NotImplementedError("Streaming output for SpeculativeInferencer is not supported yet")
+
+class ToolInferencer(Inferencer):
+ """
+ Initializes the `ToolInferencer` class with given arguments.
+
+ Parameters
+ ------------
+ model_args : ModelArguments object.
+ Contains the arguments required to load the model.
+
+ data_args : DatasetArguments object.
+ Contains the arguments required to load the dataset.
+
+ inferencer_args : InferencerArguments object.
+ Contains the arguments required to perform inference.
+
+
+ """
+ def __init__(self, model_args, data_args, inferencer_args):
+ super().__init__(model_args, data_args, inferencer_args)
+
+ self.model = HFDecoderModel(self.model_args)
+
+ def inference(
+ self,
+ model: HFDecoderModel,
+ input: str,
+        max_new_tokens: int = 1024,
+ ):
+ """
+ Perform inference for a model
+
+ Parameters
+ ------------
+ model : HFDecoderModel object.
+ TunableModel to perform inference
+
+ input : str.
+ The input text (i.e., the prompt) for the model.
+
+ max_new_tokens : int.
+ The maximum number of tokens to be generated by the model.
+
+ Returns:
+
+ output : str.
+ The output text generated by the model.
+ """
+ if self.inferencer_args.device == "gpu":
+ input_id = model.encode(input, return_tensors="pt").to(device=self.local_rank)
+ elif self.inferencer_args.device == "cpu":
+ input_id = model.encode(input, return_tensors="pt").to(device='cpu')
+ logger.debug(f"input_id: {input_id}")
+        output_id = model.inference(
+            input_id,
+            use_accelerator=True,
+            max_new_tokens=max_new_tokens,
+        )
+        output = model.decode(output_id[0])
+        # Drop the echoed prompt so only the newly generated text is returned.
+        output = output.replace(input, "")
+ return output
+
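+    # Illustrative note: for a prompt such as "Write Python code that prints 1+1",
+    # `inference` returns only the generated code (prompt stripped), which can be
+    # passed directly to `code_exec` below.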
+    def code_exec(self, code):
+        """
+        Execute the generated code in a Python subprocess.
+
+        Parameters
+        ------------
+        code : str.
+            The Python code to execute.
+
+        Returns
+        ------------
+        The captured stdout (str) if the code runs successfully; otherwise the
+        `subprocess.CompletedProcess` object, so callers can inspect
+        `returncode` and `stderr`.
+        """
+        result = subprocess.run(["python", "-c", code], capture_output=True, text=True)
+
+        if result.returncode == 0:
+            print("Successfully executed, the result is:")
+            print(result.stdout)
+            return result.stdout
+        else:
+            print("Error:")
+            print(result.stderr)
+            return result
diff --git a/tests/models/test_tool_inferencer.py b/tests/models/test_tool_inferencer.py
new file mode 100644
index 000000000..fd856bf57
--- /dev/null
+++ b/tests/models/test_tool_inferencer.py
@@ -0,0 +1,36 @@
+import unittest
+
+from lmflow.args import InferencerArguments
+from lmflow.args import ModelArguments
+from lmflow.args import DatasetArguments
+from lmflow.pipeline.inferencer import ToolInferencer
+
+CODE_1 = "print(\"hello world\")"
+RES_1 = "hello world\n"
+CODE_2 = "b=a+1\nprint(b)"
+RES_2 = """Traceback (most recent call last):
+ File "", line 1, in
+NameError: name 'a' is not defined
+"""
+
+class ToolInferencerTest(unittest.TestCase):
+    def setUp(self):
+        # Build the inferencer once per test; ToolInferencer loads the HF model internally.
+        model_args = ModelArguments(model_name_or_path="codellama/CodeLlama-7b-instruct-hf")
+        inferencer_args = InferencerArguments()
+        data_args = DatasetArguments()
+        self.toolinf = ToolInferencer(model_args, data_args, inferencer_args)
+
+    def test_code_exec_1(self, code=CODE_1, expected_output=RES_1):
+        toolinf_res = self.toolinf.code_exec(code)
+        self.assertEqual(toolinf_res, expected_output)
+
+    def test_code_exec_2(self, code=CODE_2):
+        toolinf_res = self.toolinf.code_exec(code)
+        self.assertNotEqual(toolinf_res.returncode, 0)
+
+if __name__ == '__main__':
+    unittest.main()