Commit fc877d3

allow shared prefix question and system prompt variance and calculate… (#301)
This pull request introduces several enhancements and new features to the inference performance benchmarking and reporting framework. The main focus is on supporting Service Level Objective (SLO) tracking for latency metrics (TTFT and TPOT), making prompt and output length distributions more flexible, and improving metric calculation and reporting. The changes touch data models, configuration, data generation, metric collection, and reporting.

**Key changes include:**

### SLO Tracking and Metric Enhancements

* Added new fields to `RequestLifecycleMetric` (`ttft`, `tpot`, `ttft_slo`, `tpot_slo`, `ttft_slo_met`, `tpot_slo_met`, `ntpot`) to track time-to-first-token, time-per-output-token, their SLO thresholds, and attainment status. (`inference_perf/apis/base.py`)
* Extended `APIConfig` to allow configuration of SLO thresholds and header names for TTFT and TPOT, and updated the OpenAI client to calculate these metrics and evaluate SLO attainment for each request. (`inference_perf/config.py`, `inference_perf/client/modelserver/openai_client.py`) [[1]](diffhunk://#diff-b20b7de6376037a1e80b0a93291951ae95cfa9893a3bf5fb2530c08a68304596R35-R37) [[2]](diffhunk://#diff-205d24014798b80a3f0ec5bca09dd11a20da8cf3edb8c6279aac366cc62f9313L203-R252)
* Introduced a `calculate_slo_metrics` function to aggregate SLO attainment statistics and goodput, and integrated these metrics into the summary reporting. (`inference_perf/reportgen/base.py`)

### Flexible Prompt and Output Length Distribution

* Added support for specifying standard deviation, min, and max for both question and output lengths in the `SharedPrefix` config, and updated the data generator to use these parameters for more realistic prompt and output length distributions. (`inference_perf/config.py`, `inference_perf/datagen/shared_prefix_datagen.py`)
* Ensured that prompt and user session shuffling is handled correctly to avoid ordering effects in data generation. (`inference_perf/datagen/shared_prefix_datagen.py`)

### Streaming API and Payload Improvements

* Updated `to_payload` methods for the chat and completion APIs to include `stream_options` when streaming, and fixed a parameter name for clarity in the user session completion API data. (`inference_perf/apis/chat.py`, `inference_perf/apis/completion.py`, `inference_perf/apis/user_session.py`)

### Test Updates

* Updated streaming API tests to account for the new `stream_options` field in the payload. (`tests/apis/test_completion.py`)

Example added in the stage_x_lifecycle_metric.json:

    "slo_metrics": {
        "ttft_slo": {
            "attainment_pct": 83,
            "requests_met": 166,
            "requests_failed": 34,
            "total_requests": 200,
            "slo": 2
        },
        "tpot_slo": {
            "attainment_pct": 100,
            "requests_met": 200,
            "requests_failed": 0,
            "total_requests": 200,
            "slo": 0.2
        },
        "combined_slo": {
            "attainment_pct": 83,
            "requests_met": 166,
            "requests_failed": 34,
            "total_requests": 200,
            "ttft_slo": 2,
            "tpot_slo": 0.2,
            "goodput_rate": 23397.1484983487
        }
    }
1 parent e3e690b commit fc877d3
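The commit message above references `calculate_slo_metrics` in `inference_perf/reportgen/base.py`, whose body is not shown on this page. The sketch below is a minimal standalone illustration of how the `slo_metrics` block in the example could be aggregated; the `RequestSample` container and the goodput definition (output tokens from requests meeting both SLOs, divided by stage duration) are assumptions for the sketch, not the actual implementation.

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class RequestSample:
    """Per-request values as a reporting stage might see them (illustrative container)."""
    ttft: float                    # measured time to first token, seconds
    tpot: float                    # measured time per output token, seconds
    output_tokens: int
    ttft_slo_sec: Optional[float]  # threshold recorded on the lifecycle metric, if any
    tpot_slo_sec: Optional[float]


def summarize_slo(samples: List[RequestSample], duration_sec: float) -> dict:
    """Aggregate SLO attainment roughly the way the stage report's 'slo_metrics' block suggests."""
    def met_ttft(s: RequestSample) -> bool:
        return s.ttft_slo_sec is None or s.ttft <= s.ttft_slo_sec

    def met_tpot(s: RequestSample) -> bool:
        return s.tpot_slo_sec is None or s.tpot <= s.tpot_slo_sec

    def attainment(met: int, total: int) -> dict:
        return {
            "attainment_pct": round(100 * met / total) if total else 0,
            "requests_met": met,
            "requests_failed": total - met,
            "total_requests": total,
        }

    both = [s for s in samples if met_ttft(s) and met_tpot(s)]
    # Assumed goodput definition: output tokens from SLO-compliant requests per second of stage time.
    goodput = sum(s.output_tokens for s in both) / duration_sec if duration_sec else 0.0

    return {
        "ttft_slo": attainment(sum(met_ttft(s) for s in samples), len(samples)),
        "tpot_slo": attainment(sum(met_tpot(s) for s in samples), len(samples)),
        "combined_slo": {**attainment(len(both), len(samples)), "goodput_rate": goodput},
    }
```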

11 files changed

Lines changed: 537 additions & 79 deletions


docs/config.md

Lines changed: 25 additions & 10 deletions
@@ -22,15 +22,20 @@ This document provides complete documentation for all configuration options avai
 
 ### API Configuration
 
-Controls the API interaction behavior:
+Controls the API interaction behavior. If SLO headers are present, each request is evaluated for SLO compliance and SLO-related metrics are reported:
 
 ```yaml
 api:
-  type: completion # API type (completion|chat) (default: completion), completion is the default since the chat API is not typically enabled on model servers such as vLLM by default without additional configuration.
-  streaming: false # Enable/disable streaming (default: false), needs to be enabled for metrics like TTFT, ITL and TPOT to be measured
-  headers: # Add custom http headers to the request sent to the inference server
+  type: completion # API type (completion|chat). completion is default since chat may require extra server config
+  streaming: true # Enable streaming for TTFT, ITL, and TPOT metrics
+  headers: # Optional custom HTTP headers
     x-inference-model: llama
     x-routing-strategy: round-robin
+    x-slo-tpot-ms: "2"
+    x-slo-ttft-ms: "1000"
+  slo_unit: "ms" # Optional SLO unit (e.g., ms, s), default is ms
+  slo_tpot_header: "x-slo-tpot-ms" # Optional header name for TPOT SLO Header, default is x-slo-tpot-ms
+  slo_ttft_header: "x-slo-ttft-ms" # Optional header name for TTFT SLO Header, default is x-slo-ttft-ms
 ```
 
 ### Data Generation
@@ -53,12 +58,22 @@ data:
     mean: 50
     std_dev: 10
     total_count: 100
-  shared_prefix:
-    num_unique_system_prompts: 10 # Number of distinct shared prefixes (formerly num_groups)
-    num_users_per_system_prompt: 10 # Number of unique questions per shared prefix (formerly num_prompts_per_group)
-    system_prompt_len: 100 # Length of the shared prefix (in tokens)
-    question_len: 50 # Length of the unique question part (in tokens)
-    output_len: 50 # Target length for the model's generated output (in tokens)
+  shared_prefix: # For shared_prefix type
+    num_groups: 10 # Number of shared prefix groups
+    num_prompts_per_group: 10 # Unique questions per group
+    system_prompt_len: 100 # Shared prefix length (tokens)
+    question_len: 50 # Default question length (tokens), used when question_distribution is absent
+    output_len: 50 # Default output length (tokens), used when output_distribution is absent
+    question_distribution: # Optional: distribution for question lengths (overrides question_len)
+      min: 10
+      max: 1024
+      mean: 50
+      std_dev: 5
+    output_distribution: # Optional: distribution for output lengths (overrides output_len)
+      min: 10
+      max: 1024
+      mean: 50
+      std_dev: 5
 ```
 
 ### Load Configuration
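As a quick check of the documented `api` block, the new fields map one-to-one onto `APIConfig`; the snippet below simply constructs the model with the values from the example (it assumes the `inference_perf` package is importable and uses only the fields added in this commit).

```python
from inference_perf.config import APIConfig

# Values taken from the YAML example above; `type` is omitted and defaults to completion.
api_cfg = APIConfig(
    streaming=True,
    headers={
        "x-inference-model": "llama",
        "x-routing-strategy": "round-robin",
        "x-slo-tpot-ms": "2",
        "x-slo-ttft-ms": "1000",
    },
    slo_unit="ms",
    slo_tpot_header="x-slo-tpot-ms",
    slo_ttft_header="x-slo-ttft-ms",
)
print(api_cfg)
```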

inference_perf/apis/base.py

Lines changed: 4 additions & 0 deletions
@@ -43,6 +43,10 @@ class RequestLifecycleMetric(BaseModel):
     info: InferenceInfo
     error: Optional[ErrorResponseInfo]
 
+    ttft_slo_sec: Optional[float] = None
+    tpot_slo_sec: Optional[float] = None
+
+
 
 class InferenceAPIData(BaseModel):
     # loadgen should assign this request to prefered worker if possible
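The two new fields only carry thresholds (converted to seconds by the client); attainment is decided downstream by comparing them against the measured TTFT and TPOT. The helper below is a hypothetical illustration of that comparison, assuming TTFT is the first token timestamp minus request start and TPOT is the mean inter-token gap after the first token; it is not code from this commit.

```python
from typing import List, Optional


def evaluate_request(
    request_start: float,
    output_token_times: List[float],
    ttft_slo_sec: Optional[float],
    tpot_slo_sec: Optional[float],
) -> dict:
    """Derive TTFT/TPOT from per-token timestamps and compare them against SLO thresholds."""
    ttft = output_token_times[0] - request_start
    # Mean gap over the remaining tokens; guard against single-token responses.
    if len(output_token_times) > 1:
        tpot = (output_token_times[-1] - output_token_times[0]) / (len(output_token_times) - 1)
    else:
        tpot = 0.0
    return {
        "ttft": ttft,
        "tpot": tpot,
        "ttft_slo_met": ttft_slo_sec is None or ttft <= ttft_slo_sec,
        "tpot_slo_met": tpot_slo_sec is None or tpot <= tpot_slo_sec,
    }


print(evaluate_request(0.0, [0.8, 0.85, 0.9, 0.95], ttft_slo_sec=1.0, tpot_slo_sec=0.2))
# TTFT 0.8 s and TPOT ~0.05 s, so both SLOs are met in this made-up case.
```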

inference_perf/apis/chat.py

Lines changed: 8 additions & 6 deletions
@@ -44,12 +44,14 @@ async def to_payload(
         if self.max_tokens == 0:
             self.max_tokens = max_tokens
         return {
-            "model": effective_model_name,
-            "messages": [{"role": m.role, "content": m.content} for m in self.messages],
-            "max_tokens": self.max_tokens,
-            "ignore_eos": ignore_eos,
-            "stream": streaming,
-        }
+            "model": effective_model_name,
+            "messages": [{"role": m.role, "content": m.content} for m in self.messages],
+            "max_tokens": self.max_tokens,
+            "ignore_eos": ignore_eos,
+            "stream": streaming,
+            **({"stream_options": {"include_usage": "true"}} if streaming else {}),
+        }
+
 
     async def process_response(
         self, response: ClientResponse, config: APIConfig, tokenizer: CustomTokenizer, lora_adapter: Optional[str] = None

inference_perf/apis/completion.py

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ async def to_payload(
             "max_tokens": self.max_tokens,
             "ignore_eos": ignore_eos,
             "stream": streaming,
+            **({"stream_options": {"include_usage": "true"}} if streaming else {}),
         }
 
     async def process_response(
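For reference, this is the payload shape the completion `to_payload` now produces (the chat variant differs only in sending `messages` instead of `prompt`); the model and prompt values are made up, and `stream_options` is included only when streaming is enabled.

```python
streaming = True
payload = {
    "model": "llama",
    "prompt": "What is an SLO?",
    "max_tokens": 50,
    "ignore_eos": True,
    "stream": streaming,
    # Only present when streaming; asks the server to attach usage (token counts) to the stream.
    **({"stream_options": {"include_usage": "true"}} if streaming else {}),
}
print(payload)  # with streaming=False the "stream_options" key is omitted entirely
```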

inference_perf/apis/user_session.py

Lines changed: 2 additions & 2 deletions
@@ -47,14 +47,14 @@ class UserSessionCompletionAPIData(CompletionAPIData):
     user_session: LocalUserSession = Field(exclude=True)
     target_round: int
 
-    async def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool, streaming: bool) -> dict[str, Any]:
+    async def to_payload(self, effective_model_name: str, max_tokens: int, ignore_eos: bool, streaming: bool) -> dict[str, Any]:
         self._session_context = await self.user_session.get_context(self.target_round)
         # TODO: Currently, only prompt style (concat messages) support. Adding support for messages style payload.
         self.prompt = self._session_context + " " + self.prompt
         # TODO: The combined prompt (session context + current prompt) might exceed the model's
         # maximum sequence length. Implement truncation logic/strategy to prevent
         # errors/failures from the inference server.
-        return await super().to_payload(model_name, max_tokens, ignore_eos, streaming)
+        return await super().to_payload(effective_model_name, max_tokens, ignore_eos, streaming)
 
     def update_inference_info(self, inference_info: InferenceInfo) -> None:
         inference_info.extra_info["user_session"] = self.user_session.user_session_id

inference_perf/client/modelserver/openai_client.py

Lines changed: 30 additions & 6 deletions
@@ -28,8 +28,7 @@
 import ssl
 
 logger = logging.getLogger(__name__)
-
-
+
 class openAIModelServerClient(ModelServerClient):
     _session: "openAIModelServerClientSession | None" = None
     _session_lock = asyncio.Lock()
@@ -189,8 +188,7 @@ async def process_request(
                     error_type=f"{response.status} {response.reason}",
                 )
 
-            self.client.metrics_collector.record_metric(
-                RequestLifecycleMetric(
+            metric = RequestLifecycleMetric(
                     stage_id=stage_id,
                     request_data=request_data,
                     response_data=response_content,
@@ -199,8 +197,34 @@ async def process_request(
                     start_time=start,
                     end_time=end_time,
                     scheduled_time=scheduled_time,
-                )
-            )
+            )
+
+            # Grab TTFT and TPOT thresholds from request headers if available for streaming requests with token-level timestamps
+            if response_info.output_token_times:
+                ttft_threshold = None
+                tpot_threshold = None
+                slo_unit = getattr(self.client.api_config, "slo_unit", None) or "ms"
+
+                default_ttft_header = f"x-slo-ttft-{slo_unit}"
+                default_tpot_header = f"x-slo-tpot-{slo_unit}"
+                ttft_header = getattr(self.client.api_config, "slo_ttft_header", None) or default_ttft_header
+                tpot_header = getattr(self.client.api_config, "slo_tpot_header", None) or default_tpot_header
+                if self.client.api_config.headers:
+                    ttft_threshold = self.client.api_config.headers.get(ttft_header)
+                    tpot_threshold = self.client.api_config.headers.get(tpot_header)
+
+                unit = slo_unit.lower()
+                unit_to_s = {"s": 1.0, "ms": 0.001, "us": 0.000001}
+                factor = unit_to_s.get(unit, 1.0)
+
+                if ttft_threshold is not None:
+                    metric.ttft_slo_sec = float(ttft_threshold) * factor
+
+                if tpot_threshold is not None:
+                    metric.tpot_slo_sec = float(tpot_threshold) * factor
+            # Record the metric
+            self.client.metrics_collector.record_metric(metric)
+
         except Exception as e:
             if isinstance(e, asyncio.exceptions.TimeoutError):
                 logger.error("request timed out:", exc_info=True)
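The threshold handling above can be read as two steps: resolve the header names (explicit `slo_ttft_header`/`slo_tpot_header`, else `x-slo-ttft-<unit>`/`x-slo-tpot-<unit>`), then convert the header values to seconds. The standalone function below restates that logic for illustration only; in the commit it lives inline in `process_request`.

```python
from typing import Optional, Tuple


def slo_thresholds_sec(
    headers: Optional[dict],
    slo_unit: Optional[str] = None,
    slo_ttft_header: Optional[str] = None,
    slo_tpot_header: Optional[str] = None,
) -> Tuple[Optional[float], Optional[float]]:
    """Resolve SLO header names and convert their values into seconds."""
    unit = slo_unit or "ms"
    ttft_header = slo_ttft_header or f"x-slo-ttft-{unit}"
    tpot_header = slo_tpot_header or f"x-slo-tpot-{unit}"
    factor = {"s": 1.0, "ms": 0.001, "us": 0.000001}.get(unit.lower(), 1.0)
    ttft_raw = headers.get(ttft_header) if headers else None
    tpot_raw = headers.get(tpot_header) if headers else None
    return (
        float(ttft_raw) * factor if ttft_raw is not None else None,
        float(tpot_raw) * factor if tpot_raw is not None else None,
    )


# With the headers from docs/config.md: 1000 ms TTFT -> 1.0 s, 2 ms TPOT -> 0.002 s.
print(slo_thresholds_sec({"x-slo-ttft-ms": "1000", "x-slo-tpot-ms": "2"}))
```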

inference_perf/config.py

Lines changed: 5 additions & 1 deletion
@@ -32,6 +32,9 @@ class APIConfig(BaseModel):
     type: APIType = APIType.Completion
     streaming: bool = False
     headers: Optional[dict[str, str]] = None
+    slo_unit: Optional[str] = None
+    slo_tpot_header: Optional[str] = None
+    slo_ttft_header: Optional[str] = None
 
 
 class TraceFormat(Enum):
@@ -82,6 +85,8 @@ class SharedPrefix(BaseModel):
     system_prompt_len: int = 100
     question_len: int = 50
     output_len: int = 50
+    question_distribution: Optional[Distribution] = None
+    output_distribution: Optional[Distribution] = None
     enable_multi_turn_chat: bool = False
 
 
@@ -99,7 +104,6 @@ class DataConfig(BaseModel):
     # Trace file is only supported for random dataset at this moment
     trace: Optional[TraceConfig] = None
 
-
 class ModelServerType(Enum):
     VLLM = "vllm"
     SGLANG = "sglang"
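`Distribution` is referenced by the new `SharedPrefix` fields but its definition sits elsewhere in `inference_perf/config.py` and is not part of this diff. A minimal pydantic sketch with the fields the YAML example and the data generator rely on is shown below; the real model may carry defaults or validators.

```python
from pydantic import BaseModel


class Distribution(BaseModel):
    # Field names inferred from the docs/config.md example and the datagen call sites;
    # this is a sketch, not the definition shipped in inference_perf/config.py.
    min: int
    max: int
    mean: float
    std_dev: float


# Fallback the data generator builds when no distribution block is configured:
q_len = 50
question_dist = Distribution(min=q_len, max=q_len, mean=q_len, std_dev=0)
print(question_dist)
```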

inference_perf/datagen/shared_prefix_datagen.py

Lines changed: 64 additions & 9 deletions
@@ -1,11 +1,12 @@
 import random
 from typing import Generator, List, Optional
+from inference_perf.utils.distribution import generate_distribution
 import numpy as np
 
 from inference_perf.apis.base import InferenceAPIData, LazyLoadInferenceAPIData
 from inference_perf.apis.completion import CompletionAPIData
 from inference_perf.apis.user_session import LocalUserSession, UserSessionCompletionAPIData
-from inference_perf.config import APIConfig, APIType, DataConfig
+from inference_perf.config import APIConfig, APIType, DataConfig, Distribution
 from inference_perf.utils.custom_tokenizer import CustomTokenizer
 from .base import DataGenerator, LazyLoadDataMixin
 
@@ -42,12 +43,43 @@ def __init__(self, api_config: APIConfig, config: DataConfig, tokenizer: Optiona
         self.num_groups: int = self.shared_prefix.num_groups
         self.num_prompts_per_group: int = self.shared_prefix.num_prompts_per_group
         self.system_prompt_len: int = self.shared_prefix.system_prompt_len
-        self.question_len: int = self.shared_prefix.question_len
-        self.output_len: int = self.shared_prefix.output_len
         self.enable_multi_turn_chat: bool = self.shared_prefix.enable_multi_turn_chat
+
+        # Use distribution configs, or fall back to question_len/output_len with std_dev=0
+        q_len = self.shared_prefix.question_len
+        o_len = self.shared_prefix.output_len
+        question_dist = self.shared_prefix.question_distribution or Distribution(min=q_len, max=q_len, mean=q_len, std_dev=0)
+        output_dist = self.shared_prefix.output_distribution or Distribution(min=o_len, max=o_len, mean=o_len, std_dev=0)
+
+        # Generate separate distributions for each group
+        self.question_len_list_per_group: List[List[int]] = []
+        self.output_len_list_per_group: List[List[int]] = []
+
+        for _ in range(self.num_groups):
+            question_lens = generate_distribution(
+                question_dist.min,
+                question_dist.max,
+                question_dist.mean,
+                question_dist.std_dev,
+                self.shared_prefix.num_prompts_per_group,
+            )
+            self.question_len_list_per_group.append(question_lens.tolist())
+
+            output_lens = generate_distribution(
+                output_dist.min,
+                output_dist.max,
+                output_dist.mean,
+                output_dist.std_dev,
+                self.shared_prefix.num_prompts_per_group,
+            )
+            self.output_len_list_per_group.append(output_lens.tolist())
+
+
+
 
         self.prompts: List[str] = []
         self.user_sessions: List[LocalUserSession] = []
+        self.flat_output_lens: List[int] = []
         self._generate_prompts()
 
     def get_supported_apis(self) -> List[APIType]:
@@ -64,17 +96,19 @@ def is_prefered_worker_requested(self) -> bool:
 
     def load_lazy_data(self, data: LazyLoadInferenceAPIData) -> InferenceAPIData:
         i = data.data_index % len(self.prompts)
+        output_len = self.flat_output_lens[i]
+
         if self.enable_multi_turn_chat:
             user_id = data.data_index % len(self.user_sessions)
             round = data.data_index // len(self.user_sessions)
             return UserSessionCompletionAPIData(
                 prompt=self.prompts[i],
-                max_tokens=self.output_len,
+                max_tokens=output_len,
                 user_session=self.user_sessions[user_id],
                 target_round=round,
             )
         else:
-            return CompletionAPIData(prompt=self.prompts[i], max_tokens=self.output_len)
+            return CompletionAPIData(prompt=self.prompts[i], max_tokens=output_len)
 
     def get_data(self) -> Generator[InferenceAPIData, None, None]:
         if not self.prompts:
@@ -99,17 +133,27 @@ def _generate_prompts(self) -> None:
             # This check is defensive; __init__ should have already validated this.
             raise ValueError("Tokenizer is not available for generating prompts.")
 
+        if self.shared_prefix is None:
+            raise ValueError("Shared prefix is not available for generating prompts.")
+
         hf_tokenizer = self.tokenizer.get_tokenizer()
 
         for group_id in range(self.num_groups):
            # Generate a shared prefix (system prompt)
            shared_prefix_token_ids = self._generate_random_token_ids(self.system_prompt_len)
            shared_prefix_text = hf_tokenizer.decode(shared_prefix_token_ids, skip_special_tokens=True)
 
+            # Batch generate all question token IDs for this group
+            all_question_token_ids = [
+                self._generate_random_token_ids(self.question_len_list_per_group[group_id][prompt_id])
+                for prompt_id in range(self.num_prompts_per_group)
+            ]
+
+            # Batch decode all questions at once (much faster than individual decode calls)
+            all_question_texts = hf_tokenizer.batch_decode(all_question_token_ids, skip_special_tokens=True)
+
             for prompt_id in range(self.num_prompts_per_group):
-                # Generate a unique question
-                question_token_ids = self._generate_random_token_ids(self.question_len)
-                question_text = hf_tokenizer.decode(question_token_ids, skip_special_tokens=True)
+                question_text = all_question_texts[prompt_id]
 
                 if self.enable_multi_turn_chat:
                     # multi turn chat, create user to keep conversation
@@ -125,9 +169,20 @@ def _generate_prompts(self) -> None:
 
                 self.prompts.append(question_text)
 
+        # Flatten output lengths to match prompts ordering
+        self.flat_output_lens = [
+            self.output_len_list_per_group[g][p]
+            for g in range(self.num_groups)
+            for p in range(self.num_prompts_per_group)
+        ]
+
         # Shuffle the generated prompts to ensure randomness if served sequentially by different workers
         if self.enable_multi_turn_chat:
             # no need to sync shuffles - multi-round initial prompt does not include system prompt
             random.shuffle(self.user_sessions)
         else:
-            random.shuffle(self.prompts)
+            # Shuffle prompts and output lengths in sync
+            combined = list(zip(self.prompts, self.flat_output_lens, strict=True))
+            random.shuffle(combined)
+            self.prompts, self.flat_output_lens = [list(t) for t in zip(*combined, strict=True)]
+
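`generate_distribution` comes from `inference_perf.utils.distribution` and is not part of this commit. Judging from the call sites (`min`, `max`, `mean`, `std_dev`, count) and the `.tolist()` calls on its result, it plausibly samples lengths from a normal distribution clipped to `[min, max]` and returns a NumPy integer array; the stand-in below assumes exactly that and is not the library function.

```python
import numpy as np


def generate_distribution(min_len: int, max_len: int, mean: float, std_dev: float, count: int) -> np.ndarray:
    """Stand-in with assumed behavior: draw `count` lengths from N(mean, std_dev),
    round them, and clip into [min_len, max_len]."""
    if std_dev == 0:
        return np.full(count, int(mean), dtype=int)
    samples = np.random.normal(loc=mean, scale=std_dev, size=count)
    return np.clip(np.rint(samples), min_len, max_len).astype(int)


# Example: ten question lengths matching the question_distribution block from docs/config.md.
print(generate_distribution(10, 1024, 50, 5, count=10).tolist())
```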
