Skip to content

Commit

Permalink
add turboboost_tps param
Browse files Browse the repository at this point in the history
  • Loading branch information
zainhas committed Jan 2, 2025
1 parent 5cd3742 commit acc31b2
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 0 deletions.
17 changes: 17 additions & 0 deletions src/together/resources/chat/completions.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def create(
response_format: Dict[str, str | Dict[str, Any]] | None = None,
tools: List[Dict[str, Any]] | None = None,
tool_choice: str | Dict[str, str | Dict[str, str]] | None = None,
turboboost_tps: float | None = None,
**kwargs: Any,
) -> ChatCompletionResponse | Iterator[ChatCompletionChunk]:
"""
Expand Down Expand Up @@ -103,6 +104,13 @@ def create(
via {"type": "function", "function": {"name": "my_function"}} forces the model to call that function.
Sets to `auto` if None.
Defaults to None.
turboboost_tps (float, optional): A parameter that controls the speed-quality tradeoff between
the draft model (faster, lower quality) and target model (slower, higher quality).
Values range from 0.0 (regular speculative decoding with target model quality) to
1.0 (draft model speed with 100% acceptance rate). Higher values increase speed
while potentially reducing quality.
                Defaults to None (the server treats None as 0.0).
Returns:
ChatCompletionResponse | Iterator[ChatCompletionChunk]: Object containing the completions
Expand Down Expand Up @@ -135,6 +143,7 @@ def create(
response_format=response_format,
tools=tools,
tool_choice=tool_choice,
turboboost_tps=turboboost_tps,
**kwargs,
).model_dump(exclude_none=True)

Expand Down Expand Up @@ -183,6 +192,7 @@ async def create(
response_format: Dict[str, Any] | None = None,
tools: Dict[str, str | Dict[str, str | Dict[str, Any]]] | None = None,
tool_choice: str | Dict[str, str | Dict[str, str]] | None = None,
turboboost_tps: float | None = None,
**kwargs: Any,
) -> AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse:
"""
Expand Down Expand Up @@ -245,6 +255,12 @@ async def create(
via {"type": "function", "function": {"name": "my_function"}} forces the model to call that function.
Sets to `auto` if None.
Defaults to None.
turboboost_tps (float, optional): A parameter that controls the speed-quality tradeoff between
the draft model (faster, lower quality) and target model (slower, higher quality).
Values range from 0.0 (regular speculative decoding with target model quality) to
1.0 (draft model speed with 100% acceptance rate). Higher values increase speed
while potentially reducing quality.
                Defaults to None (the server treats None as 0.0).
Returns:
AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse: Object containing the completions
Expand Down Expand Up @@ -277,6 +293,7 @@ async def create(
response_format=response_format,
tools=tools,
tool_choice=tool_choice,
turboboost_tps=turboboost_tps,
**kwargs,
).model_dump(exclude_none=True)

Expand Down
16 changes: 16 additions & 0 deletions src/together/resources/completions.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def create(
echo: bool | None = None,
n: int | None = None,
safety_model: str | None = None,
turboboost_tps: float | None = None,
**kwargs: Any,
) -> CompletionResponse | Iterator[CompletionChunk]:
"""
Expand Down Expand Up @@ -88,6 +89,12 @@ def create(
safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
Defaults to None.
turboboost_tps (float, optional): A parameter that controls the speed-quality tradeoff between
the draft model (faster, lower quality) and target model (slower, higher quality).
Values range from 0.0 (regular speculative decoding with target model quality) to
1.0 (draft model speed with 100% acceptance rate). Higher values increase speed
while potentially reducing quality.
                Defaults to None (the server treats None as 0.0).
Returns:
CompletionResponse | Iterator[CompletionChunk]: Object containing the completions
Expand Down Expand Up @@ -117,6 +124,7 @@ def create(
echo=echo,
n=n,
safety_model=safety_model,
turboboost_tps=turboboost_tps,
**kwargs,
).model_dump(exclude_none=True)

Expand Down Expand Up @@ -162,6 +170,7 @@ async def create(
echo: bool | None = None,
n: int | None = None,
safety_model: str | None = None,
turboboost_tps: float | None = None,
**kwargs: Any,
) -> AsyncGenerator[CompletionChunk, None] | CompletionResponse:
"""
Expand Down Expand Up @@ -212,6 +221,12 @@ async def create(
safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
Defaults to None.
turboboost_tps (float, optional): A parameter that controls the speed-quality tradeoff between
the draft model (faster, lower quality) and target model (slower, higher quality).
Values range from 0.0 (regular speculative decoding with target model quality) to
1.0 (draft model speed with 100% acceptance rate). Higher values increase speed
while potentially reducing quality.
                Defaults to None (the server treats None as 0.0).
Returns:
AsyncGenerator[CompletionChunk, None] | CompletionResponse: Object containing the completions
Expand Down Expand Up @@ -241,6 +256,7 @@ async def create(
echo=echo,
n=n,
safety_model=safety_model,
turboboost_tps=turboboost_tps,
**kwargs,
).model_dump(exclude_none=True)

Expand Down

0 comments on commit acc31b2

Please sign in to comment.