diff --git a/src/together/resources/chat/completions.py b/src/together/resources/chat/completions.py index 5e4b44b0..d43cbe58 100644 --- a/src/together/resources/chat/completions.py +++ b/src/together/resources/chat/completions.py @@ -41,6 +41,7 @@ def create( response_format: Dict[str, str | Dict[str, Any]] | None = None, tools: List[Dict[str, Any]] | None = None, tool_choice: str | Dict[str, str | Dict[str, str]] | None = None, + turboboost_tps: float | None = None, **kwargs: Any, ) -> ChatCompletionResponse | Iterator[ChatCompletionChunk]: """ @@ -103,6 +104,13 @@ def create( via {"type": "function", "function": {"name": "my_function"}} forces the model to call that function. Sets to `auto` if None. Defaults to None. + turboboost_tps (float, optional): A parameter that controls the speed-quality tradeoff between + the draft model (faster, lower quality) and target model (slower, higher quality). + Values range from 0.0 (regular speculative decoding with target model quality) to + 1.0 (draft model speed with 100% acceptance rate). Higher values increase speed + while potentially reducing quality. + Defaults to None (parameter omitted from the request; the API then applies its default of 0.0). + Returns: ChatCompletionResponse | Iterator[ChatCompletionChunk]: Object containing the completions @@ -135,6 +143,7 @@ def create( response_format=response_format, tools=tools, tool_choice=tool_choice, + turboboost_tps=turboboost_tps, **kwargs, ).model_dump(exclude_none=True) @@ -183,6 +192,7 @@ async def create( response_format: Dict[str, Any] | None = None, tools: Dict[str, str | Dict[str, str | Dict[str, Any]]] | None = None, tool_choice: str | Dict[str, str | Dict[str, str]] | None = None, + turboboost_tps: float | None = None, **kwargs: Any, ) -> AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse: """ @@ -245,6 +255,12 @@ async def create( via {"type": "function", "function": {"name": "my_function"}} forces the model to call that function. Sets to `auto` if None. Defaults to None. 
+ turboboost_tps (float, optional): A parameter that controls the speed-quality tradeoff between + the draft model (faster, lower quality) and target model (slower, higher quality). + Values range from 0.0 (regular speculative decoding with target model quality) to + 1.0 (draft model speed with 100% acceptance rate). Higher values increase speed + while potentially reducing quality. + Defaults to None (parameter omitted from the request; the API then applies its default of 0.0). Returns: AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse: Object containing the completions @@ -277,6 +293,7 @@ async def create( response_format=response_format, tools=tools, tool_choice=tool_choice, + turboboost_tps=turboboost_tps, **kwargs, ).model_dump(exclude_none=True) diff --git a/src/together/resources/completions.py b/src/together/resources/completions.py index de484715..cc70c1f5 100644 --- a/src/together/resources/completions.py +++ b/src/together/resources/completions.py @@ -38,6 +38,7 @@ def create( echo: bool | None = None, n: int | None = None, safety_model: str | None = None, + turboboost_tps: float | None = None, **kwargs: Any, ) -> CompletionResponse | Iterator[CompletionChunk]: """ @@ -88,6 +89,12 @@ def create( safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation models found [here](https://docs.together.ai/docs/inference-models#moderation-models). Defaults to None. + turboboost_tps (float, optional): A parameter that controls the speed-quality tradeoff between + the draft model (faster, lower quality) and target model (slower, higher quality). + Values range from 0.0 (regular speculative decoding with target model quality) to + 1.0 (draft model speed with 100% acceptance rate). Higher values increase speed + while potentially reducing quality. + Defaults to None (parameter omitted from the request; the API then applies its default of 0.0). 
Returns: CompletionResponse | Iterator[CompletionChunk]: Object containing the completions @@ -117,6 +124,7 @@ def create( echo=echo, n=n, safety_model=safety_model, + turboboost_tps=turboboost_tps, **kwargs, ).model_dump(exclude_none=True) @@ -162,6 +170,7 @@ async def create( echo: bool | None = None, n: int | None = None, safety_model: str | None = None, + turboboost_tps: float | None = None, **kwargs: Any, ) -> AsyncGenerator[CompletionChunk, None] | CompletionResponse: """ @@ -212,6 +221,12 @@ async def create( safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation models found [here](https://docs.together.ai/docs/inference-models#moderation-models). Defaults to None. + turboboost_tps (float, optional): A parameter that controls the speed-quality tradeoff between + the draft model (faster, lower quality) and target model (slower, higher quality). + Values range from 0.0 (regular speculative decoding with target model quality) to + 1.0 (draft model speed with 100% acceptance rate). Higher values increase speed + while potentially reducing quality. + Defaults to None (parameter omitted from the request; the API then applies its default of 0.0). Returns: AsyncGenerator[CompletionChunk, None] | CompletionResponse: Object containing the completions @@ -241,6 +256,7 @@ async def create( echo=echo, n=n, safety_model=safety_model, + turboboost_tps=turboboost_tps, **kwargs, ).model_dump(exclude_none=True)