forked from kubernetes-sigs/inference-perf
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig-shared-prefix-multi-turn.yml
More file actions
35 lines (35 loc) · 1.3 KB
/
config-shared-prefix-multi-turn.yml
File metadata and controls
35 lines (35 loc) · 1.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
---
# Benchmark config: shared-prefix workload with multi-turn chat enabled.
load:
  type: constant
  num_workers: 2
  worker_max_concurrency: 10
  stages:
    - rate: 20  # Send all 20 users' requests per second
      duration: 5
api:
  type: completion
server:
  type: vllm
  model_name: HuggingFaceTB/SmolLM2-135M-Instruct
  base_url: http://0.0.0.0:8000
  ignore_eos: true
tokenizer:
  pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
data:
  type: shared_prefix
  shared_prefix:
    num_unique_system_prompts: 2  # Number of distinct prefix, Note: the number of users is num_unique_system_prompts * num_users_per_system_prompt
    num_users_per_system_prompt: 10  # Number of unique questions per group (prefix)
    system_prompt_len: 100  # Length of the first prefix (in tokens), simulate initialization of a system prompt
    question_len: 50  # Length of the unique question part (in tokens)
    output_len: 50  # Target length for the model's generated output (in tokens)
    enable_multi_turn_chat: true  # enable multi-turn chat, it will create user session to keep the conversation. The chat context will be appended for the each request.
metrics:
  type: prometheus
  prometheus:
    url: http://localhost:9090
    scrape_interval: 15
report:
  request_lifecycle:
    summary: true
    per_stage: true
    per_request: true