From d30d1e4d52620dffa12ec39e76aae3c57b802a22 Mon Sep 17 00:00:00 2001
From: Sander Niels Hummerich <64867257+hummerichsander@users.noreply.github.com>
Date: Wed, 25 Sep 2024 12:46:34 +0000
Subject: [PATCH 1/2] refresh environment variables and add them to README

---
 README.md                                 |  8 +++++++-
 example_env                               |  3 +--
 openai_api_server_mock/chat/surrogates.py | 16 ++++++++++++----
 openai_api_server_mock/settings.py        |  1 -
 sandbox.ipynb                             | 10 +++++-----
 5 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 16df7d1..c335298 100644
--- a/README.md
+++ b/README.md
@@ -5,10 +5,16 @@ Available endpoints:
 - /v1/chat/completion
 
 Instead of running a LLM model to generate completions, it simply returns a response generated by surrogate models. Available surrogate models are:
-- "yes_no": returns random "yes" or "no" response
+- "yes_no": returns random "Yes" or "No" response
+- "ja_nein": returns random "Ja" or "Nein" response
 - "lorem_ipsum": returns random "lorem ipsum" text
 
 ## Run via docker:
 ```bash
 docker pull ghcr.io/hummerichsander/openai_api_server_mock:v ... # replace ... with the latest version
 ```
+
+Environment variables:
+- `CONTEXT_SIZE`: context size for the model (default: 4096)
+- `SLEEP_TIME`: sleep time in seconds before returning the response (default: 0)
+- `MAX_CONCURRENT_REQUESTS`: maximum number of concurrent requests (default: 10^9)
diff --git a/example_env b/example_env
index e63d5d7..41d7391 100644
--- a/example_env
+++ b/example_env
@@ -1,5 +1,4 @@
-MODEL_CONTEXT_SIZE=4096
-SURROGATE="yes_no"
+CONTEXT_SIZE=4096
 SLEEP_TIME=1
 MAX_CONCURRENT_REQUESTS=1
 LANGUAGE="en"
\ No newline at end of file
diff --git a/openai_api_server_mock/chat/surrogates.py b/openai_api_server_mock/chat/surrogates.py
index f7ca263..902d8c4 100644
--- a/openai_api_server_mock/chat/surrogates.py
+++ b/openai_api_server_mock/chat/surrogates.py
@@ -89,15 +89,23 @@ class YesNoSurrogate(ModelSurrogate):
 
     @classmethod
     async def generate(cls, n: int, messages: List[Message]) -> List[str]:
-        if settings.language == "en":
-            return ["Yes" if random.random() > 0.5 else "No"]
-        elif settings.language == "de":
-            return ["Ja" if random.random() > 0.5 else "Nein"]
+        return ["Yes" if random.random() > 0.5 else "No"]
 
 
 YesNoSurrogate.register()
 
 
+class JaNeinSurrogate(ModelSurrogate):
+    name: str = "ja_nein"
+
+    @classmethod
+    async def generate(cls, n: int, messages: List[Message]) -> List[str]:
+        return ["Ja" if random.random() > 0.5 else "Nein"]
+
+
+JaNeinSurrogate.register()
+
+
 async def get_surrogate(model: str) -> ModelSurrogate:
     global available_surrogates
     for surrogate in available_surrogates:
diff --git a/openai_api_server_mock/settings.py b/openai_api_server_mock/settings.py
index 2dede47..8045944 100644
--- a/openai_api_server_mock/settings.py
+++ b/openai_api_server_mock/settings.py
@@ -5,7 +5,6 @@
 
 class Settings(BaseSettings):
     context_size: int = Field(alias="CONTEXT_SIZE", default=4096)
-    surrogate: str = Field(alias="SURROGATE", default="lorem_ipsum")
     sleep_time: int = Field(alias="SLEEP_TIME", default=0)
     max_concurrent_requests: int = Field(
         alias="MAX_CONCURRENT_REQUESTS", default=10**9
diff --git a/sandbox.ipynb b/sandbox.ipynb
index 1d04f78..57e04d7 100644
--- a/sandbox.ipynb
+++ b/sandbox.ipynb
@@ -20,12 +20,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "chat_completion = client.chat.completions.create(\n",
-    "    model=\"yes_no\",\n",
+    "    model=\"ja_nein\",\n",
     "    messages=[\n",
     "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
     "        {\"role\": \"user\", \"content\": \"Is the sky blue?\"}\n",
@@ -38,16 +38,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[Choice(finish_reason='stop', index=0, logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='No', bytes=None, logprob=-0.4558056105339685, top_logprobs=[TopLogprob(token='Yes', bytes=None, logprob=-2.1267604392490442), TopLogprob(token='No', bytes=None, logprob=-0.7188313398698458), TopLogprob(token='Yes', bytes=None, logprob=-3.7428107344910946)])], refusal=None), message=ChatCompletionMessage(content='Yes', refusal=None, role='assistant', function_call=None, tool_calls=None, name=None))]"
+       "[Choice(finish_reason='length', index=0, logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='Nein', bytes=None, logprob=-0.05135242454878156, top_logprobs=[TopLogprob(token='Ja', bytes=None, logprob=-0.9180391264546016), TopLogprob(token='Nein', bytes=None, logprob=-0.11234122861118023), TopLogprob(token='Ja', bytes=None, logprob=-2.7463193707941906)])], refusal=None), message=ChatCompletionMessage(content='Ja', refusal=None, role='assistant', function_call=None, tool_calls=None, name=None))]"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }

From 08f511525ddc6e18e6c74a9e93c8bc7f3dc05a30 Mon Sep 17 00:00:00 2001
From: Sander Niels Hummerich <64867257+hummerichsander@users.noreply.github.com>
Date: Wed, 25 Sep 2024 12:49:19 +0000
Subject: [PATCH 2/2] Refactor README to include available endpoints and
 environment variables

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index c335298..c3ceb84 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,11 @@
 This is a simple fastapi based server mock that implements the OpenAI API.
 
 Available endpoints:
+
 - /v1/chat/completion
 
 Instead of running a LLM model to generate completions, it simply returns a response generated by surrogate models. Available surrogate models are:
+
 - "yes_no": returns random "Yes" or "No" response
 - "ja_nein": returns random "Ja" or "Nein" response
 - "lorem_ipsum": returns random "lorem ipsum" text
@@ -15,6 +17,7 @@ docker pull ghcr.io/hummerichsander/openai_api_server_mock:v ... # replace ... w
 ```
 
 Environment variables:
+
 - `CONTEXT_SIZE`: context size for the model (default: 4096)
 - `SLEEP_TIME`: sleep time in seconds before returning the response (default: 0)
 - `MAX_CONCURRENT_REQUESTS`: maximum number of concurrent requests (default: 10^9)