forked from kubernetes-sigs/inference-perf
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig-shared-prefix-multi-turn.yml
More file actions
35 lines (35 loc) · 1.3 KB
/
config-shared-prefix-multi-turn.yml
File metadata and controls
35 lines (35 loc) · 1.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
---
# Benchmark config: shared-prefix workload with multi-turn chat enabled.
load:
  type: constant
  num_workers: 2
  worker_max_concurrency: 10
  stages:
    - rate: 20  # Send all 20 users' requests per second
      duration: 5
api:
  type: completion
server:
  type: vllm
  model_name: HuggingFaceTB/SmolLM2-135M-Instruct
  base_url: http://0.0.0.0:8000
  ignore_eos: true
tokenizer:
  pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
data:
  type: shared_prefix
  shared_prefix:
    num_unique_system_prompts: 2  # Number of distinct prefix, Note: the number of users is num_unique_system_prompts * num_users_per_system_prompt
    num_users_per_system_prompt: 10  # Number of unique questions per group (prefix)
    system_prompt_len: 100  # Length of the first prefix (in tokens), simulate initialization of a system prompt
    question_len: 50  # Length of the unique question part (in tokens)
    output_len: 50  # Target length for the model's generated output (in tokens)
    enable_multi_turn_chat: true  # enable multi-turn chat, it will create user session to keep the conversation. The chat context will be appended for the each request.
metrics:
  type: prometheus
  prometheus:
    url: http://localhost:9090
    scrape_interval: 15
report:
  request_lifecycle:
    summary: true
    per_stage: true
    per_request: true