# config-shared-prefix.yml
# Origin: fork of kubernetes-sigs/inference-perf
# Benchmark configuration using the shared-prefix data generator.
# Each top-level key configures one section; nested keys belong to the
# section they are indented under.

# Load generation: constant-type load run as a sequence of stages.
load:
  type: constant
  interval: 15  # NOTE(review): presumably seconds - confirm against the tool docs
  stages:
    # NOTE(review): rate/duration units are not stated here
    # (presumably requests per second / seconds) - confirm.
    - rate: 1
      duration: 30
    - rate: 2
      duration: 30

# API flavour used when sending requests.
api:
  type: completion

# Model server under test.
server:
  type: vllm
  model_name: HuggingFaceTB/SmolLM2-135M-Instruct
  # NOTE(review): 0.0.0.0 is a wildcard bind address; as a client target it
  # may not resolve on every platform - consider localhost if this fails.
  base_url: http://0.0.0.0:8000
  ignore_eos: true

# Tokenizer; set to the same model identifier as server.model_name.
tokenizer:
  pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct

# Request data: the type selects the generator, and the block named after
# the type holds that generator's settings.
data:
  type: shared_prefix
  shared_prefix:
    # Number of distinct shared prefixes (formerly num_groups)
    num_unique_system_prompts: 10
    # Number of unique questions per shared prefix
    # (formerly num_prompts_per_group)
    num_users_per_system_prompt: 10
    system_prompt_len: 100  # Length of the shared prefix (in tokens)
    question_len: 50  # Length of the unique question part (in tokens)
    output_len: 50  # Target length for the model's generated output (in tokens)

# Metrics backend: the type selects the client, and the block named after
# the type holds its connection settings.
metrics:
  type: prometheus
  prometheus:
    url: http://localhost:9090
    scrape_interval: 15  # NOTE(review): presumably seconds - confirm

# Report outputs to produce for the request lifecycle.
report:
  request_lifecycle:
    summary: true
    per_stage: true
    per_request: true