# inference-perf-multi: examples/vllm/config-shared-prefix.yml
# Load-generation schedule: a constant-type load made of two stages.
# NOTE(review): units are not stated in this file - rate is presumably
# requests per second and duration/interval are presumably seconds; confirm
# against the tool's configuration reference.
load:
  type: constant
  interval: 15
  stages:
    - rate: 1
      duration: 30
    - rate: 2
      duration: 30
# Request API flavour sent to the server.
# NOTE(review): other accepted values are not visible here; check the tool docs.
api:
  type: completion
# Model server under test: its kind, the model it serves, and where to reach it.
server:
  type: vllm
  model_name: HuggingFaceTB/SmolLM2-135M-Instruct
  # Use the loopback address to reach a locally running server. 0.0.0.0 is a
  # bind ("listen on all interfaces") address, not a portable connect target:
  # it happens to work on Linux/macOS but fails on other platforms.
  base_url: http://127.0.0.1:8000
  # NOTE(review): presumably makes generation continue past the end-of-sequence
  # token so responses reach the requested output length; confirm.
  ignore_eos: true
# Tokenizer to load; set to the same identifier as server.model_name in this file.
tokenizer:
  pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
# Dataset definition: prompts built from a shared prefix plus a unique question.
# All lengths below are expressed in tokens.
data:
  type: shared_prefix
  shared_prefix:
    num_unique_system_prompts: 10     # Number of distinct shared prefixes (formerly num_groups)
    num_users_per_system_prompt: 10   # Number of unique questions per shared prefix (formerly num_prompts_per_group)
    system_prompt_len: 100            # Length of the shared prefix (in tokens)
    question_len: 50                  # Length of the unique question part (in tokens)
    output_len: 50                    # Target length for the model's generated output (in tokens)
# Metrics backend: a Prometheus instance reachable at the URL below.
metrics:
  type: prometheus
  prometheus:
    url: http://localhost:9090
    # NOTE(review): presumably seconds, and presumably expected to match the
    # scrape interval configured on the Prometheus side; confirm.
    scrape_interval: 15
# Report selection: all three request_lifecycle outputs are switched on.
report:
  request_lifecycle:
    summary: true
    per_stage: true
    per_request: true