@@ -66,6 +66,12 @@ export interface ChatCompletion {
6666 */
6767 object : 'chat.completion' ;
6868
69+ /**
70+ * The service tier used for processing the request. This field is only included if
71+ * the `service_tier` parameter is specified in the request.
72+ */
73+ service_tier ?: 'scale' | 'default' | null ;
74+
6975 /**
7076 * This fingerprint represents the backend configuration that the model runs with.
7177 *
@@ -205,6 +211,12 @@ export interface ChatCompletionChunk {
205211 */
206212 object : 'chat.completion.chunk' ;
207213
214+ /**
215+ * The service tier used for processing the request. This field is only included if
216+ * the `service_tier` parameter is specified in the request.
217+ */
218+ service_tier ?: 'scale' | 'default' | null ;
219+
208220 /**
209221 * This fingerprint represents the backend configuration that the model runs with.
210222 * Can be used in conjunction with the `seed` request parameter to understand when
@@ -800,6 +812,19 @@ export interface ChatCompletionCreateParamsBase {
800812 */
801813 seed ?: number | null ;
802814
815+ /**
816+ * Specifies the latency tier to use for processing the request. This parameter is
817+ * relevant for customers subscribed to the scale tier service:
818+ *
819+ * - If set to `'auto'`, the system will utilize scale tier credits until they are
820+ * exhausted.
821+ * - If set to `'default'`, the request will be processed in the shared cluster.
822+ *
823+ * When this parameter is set, the response body will include the `service_tier`
824+ * utilized.
825+ */
826+ service_tier ?: 'auto' | 'default' | null ;
827+
803828 /**
804829 * Up to 4 sequences where the API will stop generating further tokens.
805830 */
0 commit comments