From 281c39bbd64ddfe69fb0a097db6c0dd4ec44cd7d Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 5 Aug 2024 13:50:37 +0000 Subject: [PATCH 01/16] add profiler --- examples/offline_profile.py | 287 ++++++++++++ neuralmagic/tools/profiler/print_table.py | 77 ++++ neuralmagic/tools/profiler/visualize_trace.py | 431 ++++++++++++++++++ vllm/profiler/__init__.py | 5 + vllm/profiler/nm_profile.py | 349 ++++++++++++++ vllm/profiler/utils.py | 145 ++++++ vllm/worker/model_runner.py | 10 +- 7 files changed, 1300 insertions(+), 4 deletions(-) create mode 100644 examples/offline_profile.py create mode 100644 neuralmagic/tools/profiler/print_table.py create mode 100644 neuralmagic/tools/profiler/visualize_trace.py create mode 100644 vllm/profiler/__init__.py create mode 100644 vllm/profiler/nm_profile.py create mode 100644 vllm/profiler/utils.py diff --git a/examples/offline_profile.py b/examples/offline_profile.py new file mode 100644 index 000000000000..c5c2346c908a --- /dev/null +++ b/examples/offline_profile.py @@ -0,0 +1,287 @@ +import argparse +import inspect +import json +import sys +from dataclasses import asdict, dataclass +from typing import Optional + +import torch + +from vllm import LLM, SamplingParams +from vllm.profiler import nm_profile + +BATCH_SIZE_DEFAULT = 1 +PROMPT_LEN_DEFAULT = 256 +OUTPUT_LEN_DEFAULT = 2 + + +@dataclass +class ProfileContext: + model: str + tokenizer: str + model_revision: str + quantization: str + max_model_len: int + max_num_batched_tokens: int + prompt_len: int + output_len: int + batch_size: int + dtype: str + tensor_parallel_size: int + allow_cuda_graphs: bool + + +def get_dtype(dtype: str): + if dtype == "torch.float": + return torch.float + else: + return dtype + + +def run_profile(context: ProfileContext, csv_output: Optional[str], + json_output: Optional[str]): + print("Run profile with:") + for key, value in asdict(context).items(): + print(f" {key} = {value}") + + # Create sampling params + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + max_tokens=args.output_len, + ignore_eos=True) + + # Create LLM + llm = LLM(model=context.model, + tokenizer=context.tokenizer + if context.tokenizer is not None else context.model, + revision=context.model_revision, + enforce_eager=not context.allow_cuda_graphs, + tensor_parallel_size=context.tensor_parallel_size, + gpu_memory_utilization=0.9, + max_model_len=context.max_model_len, + quantization=context.quantization, + dtype=get_dtype(context.dtype), + max_num_batched_tokens=context.max_num_batched_tokens) + + batch_size = context.batch_size + prompt_len = context.prompt_len + output_len = context.output_len + + scheduler_config = llm.llm_engine.scheduler_config + max_model_len = llm.llm_engine.model_config.max_model_len + max_num_batched_tokens = scheduler_config.max_num_batched_tokens + max_num_seqs = scheduler_config.max_num_seqs + + if batch_size * prompt_len > max_num_batched_tokens: + print(f"ERROR: chosen batch_size * prompt_len " + f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is " + f"larger than max_num_batched_tokens ({max_num_batched_tokens}) " + f"and therefore cannot be run in a single profile step, please " + f"choose a smaller batch size or prompt length, or increase " + f"--max_num_batched_tokens") + sys.exit(-1) + if batch_size >= max_num_seqs: + print( + f"ERROR: chosen batch_size ({batch_size}) is larger than " + f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a " + f"single profile step, please choose a smaller batch size") + 
sys.exit(-1) + print("llm.llm_engine.model_config.max_model_len: ", + llm.llm_engine.model_config.max_model_len) + if prompt_len + output_len > llm.llm_engine.model_config.max_model_len: + print( + f"ERROR: chosen prompt_len + output_len ({prompt_len} + " + f"{output_len} = {prompt_len + output_len}) is larger than the " + f"model's max_model_len ({max_model_len}), please choose a smaller " + f"prompt_len or output_len, or increase --max-model-len") + sys.exit(-1) + + for i in range(batch_size): + prompt_token_ids = torch.randint( + llm.llm_engine.model_config.get_vocab_size(), + size=(prompt_len, )).tolist() + + llm.llm_engine.add_request( + request_id=f"seq{i}", + inputs={'prompt_token_ids': prompt_token_ids}, + params=sampling_params) + + with nm_profile() as prefill_prof: + llm.llm_engine.step() # First step is prefill + + decode_results_list = [] + for x in range(args.output_len - 1): + with nm_profile() as decode_prof: + llm.llm_engine.step() + decode_results_list.append(decode_prof.results) + + prefill_results = prefill_prof.results + has_decode = len(decode_results_list) > 0 + + print("=" * 80) + print(f"= Prefill Model Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})") + print("=" * 80) + print() + prefill_results.print_model_table() + + if has_decode: + print() + print("=" * 80) + print(f"= First Decode Step Model Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})") + print("=" * 80) + print() + decode_results_list[0].print_model_table() + + print() + print("=" * 80) + print(f"= Prefill Summary Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})") + print("=" * 80) + print() + prefill_results.print_summary_table() + + if has_decode: + print() + print("=" * 80) + print(f"= First Decode Step Summary Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})") + print("=" * 80) + print() + decode_results_list[0].print_summary_table() + + if csv_output: + csv_filename_base = csv_output.rstrip(".csv") + prefill_results.export_model_stats_table_csv( + csv_filename_base + "_prefill_model_table.csv") + prefill_results.export_summary_stats_table_csv( + csv_filename_base + "_prefill_summary_table.csv") + + if has_decode: + decode_results_list[0].export_model_stats_table_csv(\ + csv_filename_base + "_decode_model_table.csv") + decode_results_list[0].export_summary_stats_table_csv( + csv_filename_base + "_decode_summary_table.csv") + + if json_output: + cuda_devices = [ + torch.cuda.get_device_properties(dev_idx) + for dev_idx in range(torch.cuda.device_count()) + ] + + json_dict = { + "context": { + "python_version": f"{sys.version}", + "torch_version": f"{torch.__version__}", + "torch_cuda_version": f"{torch.version.cuda}", + "cuda_devices": f"{cuda_devices}", + **asdict(context) + }, + "prefill": prefill_results.convert_stats_to_dict(), + } + + if has_decode: + for idx, dr in enumerate(decode_results_list): + json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict() + + for idx, dr in enumerate(decode_results_list[1:]): + json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict() + + with open(json_output.rstrip(".json") + ".json", "w+") as f: + json.dump(json_dict, f, indent=2) + pass + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model", + type=str, + required=True, + help='The name or path of a HuggingFace Transformers model.') + parser.add_argument("--tokenizer", + type=str, + default=None, + help="path to the tokenizer") + + parser.add_argument("--model-revision", type=str, default=None) + 
parser.add_argument( + "--csv", + type=str, + default=None, + help="Export the results as multiple csv file. This should be the root " + "filename, will create _prefill_model_table.csv, " + "_prefill_summary_table.csv, " + "_decode_model_table.csv, and " + "_decode_summary_table.csv") + parser.add_argument( + "--json", + type=str, + default=None, + help="Export the results as a json file. This should be the filename") + parser.add_argument( + "--quantization", + "-q", + type=str, + choices=['awq', 'gptq', 'squeezellm', 'marlin', 'smoothquant', None], + default=None, + help="The method used to quantize the model weights, " + "options are \"marlin\", \"awq\", \"gptq\", " + "\"squeezellm\", \"smoothquant\"") + parser.add_argument("--dtype", + type=str, + default='auto', + help="model dtype") + parser.add_argument( + "--max-model-len", + type=int, + default=None, + help="Maximum length of a sequence (including prompt and output)") + parser.add_argument( + "--max-num-batched-tokens", + type=int, + default=None, + help="Maximum number of tokens to be processed in a single iteration. " + " Should be greater than batch-size * prompt-len so the prefill can " + " run in a single iteration.") + parser.add_argument( + "--prompt-len", + type=int, + default=PROMPT_LEN_DEFAULT, + help=f"Length of the random prompt to use when profiling, all batched " + f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}") + parser.add_argument("--batch-size", + type=int, + default=BATCH_SIZE_DEFAULT, + help=f"Number of requests to run as a single batch, " + f"default={BATCH_SIZE_DEFAULT}") + parser.add_argument("--tensor-parallel-size", + "-tp", + type=int, + default=1, + help="Number of GPUs to use i.e. tensor parallelism, " + "default=1") + parser.add_argument( + "--allow-cuda-graphs", + action='store_true', + help="Enables cuda graphs to be used, well remove a lot of the module " + "level info in the profiler results since almost everything runs in " + "the graph where we do not have access to an informative stack trace") + parser.add_argument( + "--output-len", + type=int, + default=OUTPUT_LEN_DEFAULT, + help="Number of llm steps to run (includes prefill and decode) " + "- default={OUTPUT_LEN_DEFAULT}") + + args = parser.parse_args() + + context = ProfileContext( + **{ + k: v + for k, v in vars(args).items() + if k in inspect.signature(ProfileContext).parameters + }) + run_profile(context, csv_output=args.csv, json_output=args.json) diff --git a/neuralmagic/tools/profiler/print_table.py b/neuralmagic/tools/profiler/print_table.py new file mode 100644 index 000000000000..9081583a9f95 --- /dev/null +++ b/neuralmagic/tools/profiler/print_table.py @@ -0,0 +1,77 @@ +import argparse +import json +from typing import Dict + +from vllm.profiler.nm_profile import ModelStatsEntry, SummaryStatsEntry +from vllm.profiler.utils import TablePrinter, indent_string + + +def flatten_entries(entry_cls, profile_dict: Dict): + entries_and_depth = [] + + def get_entries(node, curr_depth=0): + entries_and_depth.append((entry_cls(**node["entry"]), curr_depth)) + + for child in node["children"]: + get_entries( + child, + curr_depth=curr_depth + 1, + ) + + for root in profile_dict: + get_entries(root) + + return entries_and_depth + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--json-trace", + type=str, + required=True, + help="json trace file output by " + "examples/offline_profile.py") + parser.add_argument("--phase", + type=str, + choices=["prefill", "decode_1"], + required=True, + 
help="The phase to print the table for.") + parser.add_argument("--table", + type=str, + choices=["summary", "model"], + default="summary", + help="Which table to print, the summary table or the " + "layerwise model table") + + args = parser.parse_args() + + with open(args.json_trace, "r") as f: + profile_data = json.load(f) + + if args.table == "summary": + entries_and_depths = flatten_entries( + SummaryStatsEntry, profile_data[args.phase]["summary_stats"]) + column_widths = dict(name=80, + cuda_time_us=12, + pct_cuda_time=12, + invocations=15) + elif args.table == "model": + entries_and_depths = flatten_entries( + ModelStatsEntry, profile_data[args.phase]["model_stats"]) + column_widths = dict(name=60, + cpu_time_us=12, + cuda_time_us=12, + pct_cuda_time=12, + trace=60) + + # ident entry names based on the depth + entries = [] + for entry, depth in entries_and_depths: + entry.name = indent_string( + entry.name, + indent=depth, + indent_style=lambda indent: "|" + "-" * indent + " ") + entries.append(entry) + + TablePrinter(type(entries[0]), column_widths).print_table(entries) diff --git a/neuralmagic/tools/profiler/visualize_trace.py b/neuralmagic/tools/profiler/visualize_trace.py new file mode 100644 index 000000000000..c1284226fec6 --- /dev/null +++ b/neuralmagic/tools/profiler/visualize_trace.py @@ -0,0 +1,431 @@ +import argparse +import copy +import json +import math +from pathlib import Path +from typing import Any, List, Optional, Tuple + +import matplotlib.pyplot as plt +import pandas as pd + +## JSON parsing utils #### + + +def largest_dist_from_leaf(node: dict, depth: int = 0): + if len(node["children"]) == 0: + return depth + return max([ + largest_dist_from_leaf(child, depth=depth + 1) + for child in node["children"] + ]) + + +def get_entries_at_depth(depth: int, + entries_and_traces: List[Tuple[Any, Any]], + node: dict, + curr_depth: int = 0, + trace=()): + # assert that the query is at kernel or module level + assert depth == -1 or depth == -2 + + if curr_depth == 0 and largest_dist_from_leaf(node) <= (abs(depth) - 1): + # The tree is not tall enough! + entries_and_traces.append((node["entry"], trace)) + return + + if largest_dist_from_leaf(node) == (abs(depth) - 1): + entries_and_traces.append((node["entry"], trace)) + + trace = (node["entry"]["name"], ) + trace + for child in node["children"]: + get_entries_at_depth(depth, + entries_and_traces, + child, + curr_depth=curr_depth + 1, + trace=trace) + + +def fold_nodes(root: dict, nodes_to_fold: List[str]): + + stack: List[dict] = [root] + while len(stack) != 0: + node = stack.pop() + if node['entry']['name'] in nodes_to_fold: + node["children"] = [] + continue + for child in node["children"]: + stack.append(child) + return root + + +## Operation name cleanup utils #### + + +def trim_string_back(string: str, width: int) -> str: + if len(string) > width: + offset = len(string) - width + 3 + string = string[:-offset] + if len(string) > 3: + string = string + "..." 
+ return string + + +def shorten_plot_legend_strings(legend, max_char_len: int): + for t in legend.get_texts(): + t.set_text( + trim_string_back(abbreviate_known_names(t.get_text()), + max_char_len)) + + +def abbreviate_known_names(name: str) -> str: + abbreviations = { + "MergedColumnParallelLinear": "MCPLinear", + "QKVParallelLinear": "QKVPLinear", + "RowParallelLinear": "RPLinear", + "weight=": "w=", + "bfloat16": "bf16", + "float16": "f16", + } + for key, value in abbreviations.items(): + name = name.replace(key, value) + return name + + +def attempt_to_make_names_unique(entries_and_traces): + names, non_unique_names = (set(), set()) + + def all_the_same(items) -> bool: + return all(i == items[0] for i in items) + + for entry, _ in entries_and_traces: + if entry["name"] in names: + non_unique_names.add(entry["name"]) + else: + names.add(entry["name"]) + + for name in non_unique_names: + entries_and_traces_with_name = [(entry, trace) + for entry, trace in entries_and_traces + if entry["name"] == name] + + zipped_traces = list( + zip(*[trace for _, trace in entries_and_traces_with_name])) + first_trace_difference = next( + (i for i, trace_eles in enumerate(zipped_traces) + if not all_the_same(trace_eles)), None) + + if first_trace_difference is None: + # can't create a unique name, leave them names as the + # are they will get aggregated by the pivot_table call + continue + + for entry, trace in entries_and_traces_with_name: + entry["name"] = " <- ".join((entry["name"], ) + + trace[:first_trace_difference + 1]) + + +## Operation grouping utils #### +''' + Group operations in the given dataframe by some high-level ops like, + - gemms + - attention + - rms_norm + etc. +''' + + +def group_trace_by_operations(trace_df: pd.DataFrame) -> pd.DataFrame: + + def is_rms_norm(op_name: str): + if "rms_norm_kernel" in op_name: + return True + + def is_attention_block(op_name: str): + if "flash_fwd" in op_name or \ + "reshape_and_cache_flash_kernel" in op_name: + return True + + def is_quant(op_name: str): + if "scaled_fp8_quant" in op_name or \ + "scaled_int8_quant" in op_name: + return True + + def is_gemm_op(op_name: str): + if is_quant(op_name): + return False + if "xmma_gemm" in op_name or \ + "gemv2T_kernel" in op_name or \ + "splitKreduce" in op_name or \ + "void cutlass::Kernel" in op_name or \ + "void cutlass::device_kernel" in op_name or \ + "s16816gemm" in op_name: + return True + + def is_elementwise_op(op_name: str): + return "elementwise_kernel" in op_name + + def is_mem_op(op_name: str): + return "memcpy" in op_name.lower() or \ + "memset" in op_name.lower() + + def is_vocab_embedding_op(op_name: str): + return "vocabparallelembed" in op_name.lower() + + headers = list(trace_df) + ops = copy.deepcopy(headers) + + attention_ops = list(filter(lambda x: is_attention_block(x), ops)) + ops = list(filter(lambda x: x not in attention_ops, ops)) + + quant_ops = list(filter(lambda x: is_quant(x), ops)) + ops = list(filter(lambda x: x not in quant_ops, ops)) + + gemm_ops = list(filter(lambda x: is_gemm_op(x), ops)) + ops = list(filter(lambda x: x not in gemm_ops, ops)) + + rms_norm_ops = list(filter(lambda x: is_rms_norm(x), ops)) + ops = list(filter(lambda x: x not in rms_norm_ops, ops)) + + vocab_embed_ops = list(filter(lambda x: is_vocab_embedding_op(x), ops)) + ops = list(filter(lambda x: x not in vocab_embed_ops, ops)) + + mem_ops = list(filter(lambda x: is_mem_op(x), ops)) + ops = list(filter(lambda x: x not in mem_ops, ops)) + + elementwise_ops = list(filter(lambda x: is_elementwise_op(x), 
ops)) + ops = list(filter(lambda x: x not in elementwise_ops, ops)) + + if len(attention_ops): + trace_df['attention'] = trace_df[attention_ops].agg("sum", axis=1) + if len(quant_ops): + trace_df['quant_ops'] = trace_df[quant_ops].agg("sum", axis=1) + if len(gemm_ops): + trace_df['gemm_ops'] = trace_df[gemm_ops].agg("sum", axis=1) + if len(rms_norm_ops): + trace_df['rms_norm_ops'] = trace_df[rms_norm_ops].agg("sum", axis=1) + if len(vocab_embed_ops): + trace_df['vocab_embed_ops'] = trace_df[vocab_embed_ops].agg("sum", + axis=1) + if len(mem_ops): + trace_df['mem_ops'] = trace_df[mem_ops].agg("sum", axis=1) + if len(elementwise_ops): + trace_df['elementwise_ops'] = trace_df[elementwise_ops].agg("sum", + axis=1) + + trace_df.drop(attention_ops + quant_ops + gemm_ops + rms_norm_ops + + vocab_embed_ops + mem_ops + elementwise_ops, + axis=1, + inplace=True) + return trace_df + + +## Data plotting utils #### + + +def plot_trace_df(traces_df: pd.DataFrame, + plot_metric: str, + plot_title: str, + output: Optional[Path] = None): + + phases = traces_df['phase'].unique() + traces_df = traces_df.pivot_table(index="phase", + columns="name", + values=plot_metric, + aggfunc="sum") + + traces_df = group_trace_by_operations(traces_df) + + # Make the figure + fig, ax = plt.subplots(1, figsize=(5, 8), sharex=True) + + # Draw the stacked bars + ops = list(traces_df) + bottom = [0] * len(phases) + for op in ops: + values = [traces_df[op][phase] for phase in phases] + values = list(map(lambda x: 0.0 if math.isnan(x) else x, values)) + ax.bar(phases, values, label=op, bottom=bottom) + bottom = [bottom[j] + values[j] for j in range(len(phases))] + + # Write the values as text on the bars + for bar in ax.patches: + if bar.get_height() != 0: + ax.text(bar.get_x() + bar.get_width() / 2, + bar.get_height() / 2 + bar.get_y(), + f"{round(bar.get_height(), 2)}", + ha='center', + color='w', + weight='bold', + size=5) + + # Setup legend + handles, labels = plt.gca().get_legend_handles_labels() + legend = fig.legend(handles, + labels, + loc='center left', + bbox_to_anchor=(1, 1)) + shorten_plot_legend_strings(legend, 50) + + # Setup labels and title + plt.setp(ax.get_xticklabels(), rotation=90) + ax.set_ylabel(plot_metric) + plt.suptitle(plot_title) + + plt.savefig(output, bbox_inches='tight') + print("Created: ", output) + + +def main( + json_trace: Path, + output_directory: Path, + depth: int, # Fetch/Plot operations at this depth of the Json tree + plot_metric: str, + make_names_unique: bool, + top_k: int, + json_nodes_to_fold: List[str]): + + def prepare_data(profile_json: dict, step_keys: List[str]) -> pd.DataFrame: + + def get_entries_and_traces(key: str): + entries_and_traces: List[Tuple[Any, Any]] = [] + for root in profile_json[key]["summary_stats"]: + # Fold nodes in the traces as per user request. i.e. simply + # make the requested nodes leaf-nodes. 
+ root = fold_nodes(root, json_nodes_to_fold) + get_entries_at_depth(depth, entries_and_traces, root) + return entries_and_traces + + def keep_only_top_entries(df: pd.DataFrame, + metric: str, + top_k: int = 9) -> pd.DataFrame: + df.loc[df.nsmallest(len(df) - top_k + 1, metric).index, + ["name"]] = "others" + return df + + # Get data for each key + traces = list(map(lambda x: get_entries_and_traces(x), step_keys)) + + # Attempt some cleanup + if make_names_unique: + for trace in traces: + attempt_to_make_names_unique(trace) + + # To pandas dataframe + trace_dfs = list( + map(lambda t: pd.DataFrame([entry for entry, _ in t]).fillna(0), + traces)) + + # Respect top_k + if top_k: + trace_dfs = list( + map( + lambda trace_df: keep_only_top_entries( + trace_df, "cuda_time_us", top_k), trace_dfs)) + + # Fill in information about the step-keys + for trace_df, step_key in zip(trace_dfs, step_keys): + trace_df['phase'] = step_key + + # Combine all data frames so they can be put in a single plot + traces_df = pd.concat(trace_dfs) + + # Add a derived metric `cuda_time_ms` + traces_df["cuda_time_ms"] = traces_df["cuda_time_us"] / 1000 + traces_df = traces_df.fillna(0) + + return traces_df + + def make_plot_title_suffix(profile_json: dict) -> str: + context = profile_json["context"] + sparsity = context.get('sparsity', None) + return (f"{context['model']}\n" + f"Batch={context['batch_size']}, " + f"PromptLen={context['prompt_len']}, " + f"OutputLen={context['output_len']}," + f"NumGpus={context['tensor_parallel_size']}" + f"{', Sparsity ' + sparsity if sparsity else ''}") + + profile_json = None + with open(json_trace, "r") as f: + profile_json = json.load(f) + assert profile_json is not None + + # Get all `llm.generate.step()` profile + step_traces = list(profile_json.keys()) + assert (step_traces[0] == 'context') + step_traces = step_traces[1:] # have only prefill and decodes + prefills = list(filter(lambda x: "prefill" in x, step_traces)) + all_decodes = list(filter(lambda x: "decode" in x, step_traces)) + assert len(prefills) + len(all_decodes) == len(step_traces) + assert len(prefills) == 1 + + decodes = all_decodes[::args.step_plot_interval] + if decodes[-1] != all_decodes[-1]: + # Always have the last decode + decodes.append(all_decodes[-1]) + + prefill_traces = prepare_data(profile_json, prefills) + decode_traces = prepare_data(profile_json, decodes) + + plot_title_suffix = make_plot_title_suffix(profile_json) + + plot_trace_df(prefill_traces, plot_metric, "prefill " + plot_title_suffix, + output_directory / Path("prefill.png")) + plot_trace_df(decode_traces, plot_metric, "decodes " + plot_title_suffix, + output_directory / Path("decode_steps.png")) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--json-trace", + type=str, + required=True, + help="json trace file output by examples/offline_profile.py") + parser.add_argument("--output-directory", + type=str, + required=False, + help="Directory to output plots") + parser.add_argument("--level", + type=str, + default="module", + choices=["module", "kernel"]) + parser.add_argument("--top-k", + type=int, + default=12, + help="Only graph the top `top_k` entries by time.") + parser.add_argument("--fold-json-node", + nargs='+', + default=['Sampler', 'LogitsProcessor'], + help='Do not plot the children of these nodes. Let, \ + the node represent the aggregate of all its \ + children') + parser.add_argument("--plot-metric", + type=str, + default="cuda_time_ms", + help='Metric to plot. 
some options are cuda_time_ms, \ + pct_cuda_time') + parser.add_argument( + "--step-plot-interval", + type=int, + default=4, + help="For every `step_plot_interval` steps, plot 1 step") + + args = parser.parse_args() + + # Prepare/Extract relevant args + make_names_unique = False + if args.level == "module": + depth = -2 + make_names_unique = True + elif args.level == "kernel": + depth = -1 + else: + raise Exception(f"Unexpected level value ({args.level})") + + output_directory = args.output_directory if args.output_directory else Path( + args.json_trace).parent + + main(Path(args.json_trace), output_directory, depth, args.plot_metric, + make_names_unique, args.top_k, args.fold_json_node) diff --git a/vllm/profiler/__init__.py b/vllm/profiler/__init__.py new file mode 100644 index 000000000000..93ec4a800e60 --- /dev/null +++ b/vllm/profiler/__init__.py @@ -0,0 +1,5 @@ +from .nm_profile import nm_profile + +__all__ = [ + "nm_profile", +] diff --git a/vllm/profiler/nm_profile.py b/vllm/profiler/nm_profile.py new file mode 100644 index 000000000000..5901362ecd82 --- /dev/null +++ b/vllm/profiler/nm_profile.py @@ -0,0 +1,349 @@ +import copy +from collections import defaultdict +from dataclasses import asdict, dataclass, field +from typing import Callable, Dict, List, Optional, Tuple, TypeAlias, Union + +import pandas as pd +from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult +from torch._C._profiler import _EventType, _ExperimentalConfig, _ProfilerEvent +from torch.autograd.profiler import FunctionEvent +from torch.profiler import ProfilerActivity, profile + +from vllm.profiler.utils import (TablePrinter, event_has_module, + event_is_torch_op, event_module_repr, + event_torch_op_stack_trace, indent_string) + + +@dataclass +class _ModuleTreeNode: + event: _ProfilerEvent + parent: Optional['_ModuleTreeNode'] = None + children: List['_ModuleTreeNode'] = field(default_factory=list) + trace: str = "" + + @property + def is_leaf(self): + return (self.event.children is None or len(self.event.children) == 0) + + @property + def is_torch_op(self): + return event_is_torch_op(self.event) + + @property + def is_cuda(self): + return (self.event.tag == _EventType.Kineto + and self.event.typed[1].device_type == DeviceType.CUDA) + + +@dataclass +class SummaryStatsEntry: + name: str + cuda_time_us: float + pct_cuda_time: float + invocations: int + + +@dataclass +class ModelStatsEntry: + name: str + cpu_time_us: float + cuda_time_us: float + pct_cuda_time: float + trace: str + + +StatsEntry: TypeAlias = Union[ModelStatsEntry, SummaryStatsEntry] + + +@dataclass +class _StatsTreeNode: + entry: StatsEntry + children: List[StatsEntry] + parent: Optional[StatsEntry] + + +@dataclass +class NMProfileResults(profile): + _kineto_results: _ProfilerResult + _kineto_event_correlation_map: Dict[int, + List[_KinetoEvent]] = field(init=False) + _event_correlation_map: Dict[int, List[FunctionEvent]] = field(init=False) + _module_tree: List[_ModuleTreeNode] = field(init=False) + _model_stats_tree: List[_StatsTreeNode] = field(init=False) + _summary_stats_tree: List[_StatsTreeNode] = field(init=False) + + def __post_init__(self): + self._build_correlation_map() + self._build_module_tree() + self._build_stats_trees() + + def print_model_table(self, column_widths: Dict[str, int] = None): + _column_widths = dict(name=60, + cpu_time_us=12, + cuda_time_us=12, + pct_cuda_time=12, + trace=60) + if column_widths: + _column_widths.update(**column_widths) + filtered_model_table = [ + (depth, row) + for depth, row in 
self._flatten_stats_tree(self._model_stats_tree) + if row.cuda_time_us > 0 or row.cpu_time_us > 0 + ] + TablePrinter(ModelStatsEntry, _column_widths).print_table( + self._indent_row_names_based_on_depth( + filtered_model_table, + indent_style=lambda indent: "|" + "-" * indent + " ")) + + def print_summary_table(self, column_widths: Dict[str, int] = None): + _column_widths = dict(name=80, + cuda_time_us=12, + pct_cuda_time=12, + invocations=15) + if column_widths: + _column_widths.update(**column_widths) + filtered_summary_table = [(depth, row) + for depth, row in self._flatten_stats_tree( + self._summary_stats_tree) + if row.cuda_time_us > 0] + TablePrinter(SummaryStatsEntry, _column_widths).print_table( + self._indent_row_names_based_on_depth( + filtered_summary_table, + indent_style=lambda indent: "|" + "-" * indent + " ")) + + def export_model_stats_table_csv(self, filename: str): + df = pd.DataFrame([ + asdict(row) + for _, row in self._flatten_stats_tree(self._model_stats_tree) + ]) + df.to_csv(filename) + + def export_summary_stats_table_csv(self, filename: str): + df = pd.DataFrame([ + asdict(row) + for _, row in self._flatten_stats_tree(self._summary_stats_tree) + ]) + df.to_csv(filename) + + def convert_stats_to_dict(self) -> str: + return { + "summary_stats": + self._convert_stats_tree_to_dict(self._summary_stats_tree), + "model_stats": + self._convert_stats_tree_to_dict(self._model_stats_tree) + } + + @staticmethod + def _indent_row_names_based_on_depth(depths_rows: List[Tuple[int, + StatsEntry]], + indent_style: Union[Callable[[int], + str], + str] = " "): + indented_rows = [] + for depth, row in depths_rows: + if row.cuda_time_us == 0: + continue + indented_row = copy.deepcopy(row) + indented_row.name = indent_string(indented_row.name, depth, + indent_style) + indented_rows.append(indented_row) + return indented_rows + + def _build_correlation_map(self): + self._kineto_event_correlation_map = defaultdict(list) + for event in self._kineto_results.events(): + self._kineto_event_correlation_map[event.correlation_id()].append( + event) + + def _build_module_tree(self): + self._module_tree = [] + event_tree = self._kineto_results.experimental_event_tree() + + def _df_traversal(event: _ProfilerEvent, + curr_node: Optional[_ModuleTreeNode] = None): + if event_has_module(event): + node = _ModuleTreeNode(event=event, parent=curr_node) + if curr_node: + curr_node.children.append(node) + else: + self._module_tree.append(node) + curr_node = node + + is_leaf = (event.children is None or len(event.children) == 0) + if is_leaf and curr_node: + node = _ModuleTreeNode( + event=event, + parent=curr_node, + trace=event_torch_op_stack_trace( + event, until=lambda x: event_has_module(x))) + curr_node.children.append(node) + curr_node = node + + for child in event.children: + _df_traversal(child, curr_node) + + for root in event_tree: + _df_traversal(root) + + def _get_kineto_gpu_event(self, node: _ModuleTreeNode): + if node.event.tag != _EventType.Kineto: + return None + correlated_kineto_events = self._kineto_event_correlation_map.get( + node.event.correlation_id, []) + iterator = (x for x in correlated_kineto_events + if x.device_type() == DeviceType.CUDA + and x.name() == node.event.name) + return next(iterator, None) + + def _cumulative_cuda_time(self, node: _ModuleTreeNode): + 'Return cuda time in microseconds' + + def _cumulative_cuda_time_recursive(node: _ModuleTreeNode): + if node.is_leaf and (gpu_kineto_event := + self._get_kineto_gpu_event(node)): + return gpu_kineto_event.duration_ns() 
/ 1000.0 + else: + cumulative_cuda_time = 0 + for child in node.children: + cumulative_cuda_time += _cumulative_cuda_time_recursive( + child) + return cumulative_cuda_time + + return _cumulative_cuda_time_recursive(node) + + def _total_cuda_time(self): + return sum( + [self._cumulative_cuda_time(root) for root in self._module_tree]) + + def _build_stats_trees(self): + summary_dict: Dict[str, self.StatsTreeNode] = {} + total_cuda_time = self._total_cuda_time() + + def pct_cuda_time(cuda_time_us): + return (cuda_time_us / total_cuda_time) * 100 + + def build_summary_stats_tree_df( + node: _ModuleTreeNode, + parent: Optional[_StatsTreeNode] = None, + summary_trace: Tuple[str] = ()): + + if event_has_module(node.event): + name = event_module_repr(node.event) + cuda_time_us = self._cumulative_cuda_time(node) + elif (gpu_kineto_event := self._get_kineto_gpu_event(node)): + name = gpu_kineto_event.name() + cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0 + else: + return None + + summary_trace = summary_trace + (name, ) + if summary_trace in summary_dict: + entry = summary_dict[summary_trace].entry + entry.cuda_time_us += cuda_time_us + entry.invocations += 1 + entry.pct_cuda_time = pct_cuda_time(entry.cuda_time_us) + else: + new_node = _StatsTreeNode(entry=SummaryStatsEntry( + name=name, + cuda_time_us=cuda_time_us, + pct_cuda_time=pct_cuda_time(cuda_time_us), + invocations=1), + children=[], + parent=parent) + if parent: + parent.children.append(new_node) + summary_dict[summary_trace] = new_node + + for child in node.children: + build_summary_stats_tree_df(child, summary_dict[summary_trace], + summary_trace) + + return summary_dict[summary_trace] + + self._summary_stats_tree = [] + for root in self._module_tree: + self._summary_stats_tree.append(build_summary_stats_tree_df(root)) + + def build_model_stats_tree_df(node: _ModuleTreeNode, + parent: Optional[_StatsTreeNode] = None): + if event_has_module(node.event, ): + name = event_module_repr(node.event) + cuda_time_us = self._cumulative_cuda_time(node) + cpu_time_us = node.event.duration_time_ns / 1000 + trace = "" + elif (gpu_kineto_event := self._get_kineto_gpu_event(node)): + name = gpu_kineto_event.name() + cuda_time_us = gpu_kineto_event.duration_ns() / 1000.0 + cpu_time_us = 0 + trace = node.trace + else: + return None + + new_node = _StatsTreeNode(entry=ModelStatsEntry( + name=name, + cpu_time_us=cpu_time_us, + cuda_time_us=cuda_time_us, + pct_cuda_time=pct_cuda_time(cuda_time_us), + trace=trace), + parent=parent, + children=[]) + if parent: + parent.children.append(new_node) + + for child in node.children: + build_model_stats_tree_df(child, new_node) + + return new_node + + self._model_stats_tree = [] + for root in self._module_tree: + self._model_stats_tree.append(build_model_stats_tree_df(root)) + + def _flatten_stats_tree( + self, tree: List[_StatsTreeNode]) -> List[Tuple[int, StatsEntry]]: + entries: List[Tuple[int, StatsEntry]] = [] + + def df_traversal(node: _StatsTreeNode, depth=0): + entries.append((depth, node.entry)) + for child in node.children: + df_traversal(child, depth=depth + 1) + + for root in tree: + df_traversal(root) + + return entries + + def _convert_stats_tree_to_dict(self, + tree: List[_StatsTreeNode]) -> List[Dict]: + root_dicts: List[Dict] = [] + + def df_traversal(node: _StatsTreeNode, curr_json_list: List[Dict]): + curr_json_list.append({ + "entry": asdict(node.entry), + "children": [] + }) + for child in node.children: + df_traversal(child, curr_json_list[-1]["children"]) + + for root in tree: + 
df_traversal(root, root_dicts) + + return root_dicts + + +class nm_profile(profile): + + def __init__(self): + super().__init__( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + record_shapes=True, + with_stack=True, + with_modules=True, + experimental_config=_ExperimentalConfig(verbose=True)) + + def __enter__(self): + return super().__enter__() + + def __exit__(self, exc_type, exc_val, exc_tb): + super().__exit__(exc_type, exc_val, exc_tb) + self.results = NMProfileResults(self.profiler.kineto_results) diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py new file mode 100644 index 000000000000..033035e43432 --- /dev/null +++ b/vllm/profiler/utils.py @@ -0,0 +1,145 @@ +import dataclasses +from typing import Callable, Dict, List, Type, Union + +from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata + +# +# String / Print Manipulation +# + + +def trim_string_front(string, width): + if len(string) > width: + offset = len(string) - width + 3 + string = string[offset:] + if len(string) > 3: + string = "..." + string[3:] + return string + + +def trim_string_back(string, width): + if len(string) > width: + offset = len(string) - width + 3 + string = string[:-offset] + if len(string) > 3: + string = string + "..." + return string + + +class TablePrinter: + + def __init__(self, row_cls: Type[dataclasses.dataclass], + column_widths: Dict[str, int]): + self.row_cls = row_cls + self.fieldnames = [x.name for x in dataclasses.fields(row_cls)] + self.column_widths = column_widths + assert set(self.column_widths.keys()) == set(self.fieldnames) + + def print_table(self, rows: List[dataclasses.dataclass]): + self._print_header() + self._print_line() + for row in rows: + self._print_row(row) + + def _print_header(self): + for i, f in enumerate(self.fieldnames): + last = (i == len(self.fieldnames) - 1) + col_width = self.column_widths[f] + print(trim_string_back(f, col_width).ljust(col_width), + end=" | " if not last else "\n") + + def _print_row(self, row): + assert isinstance(row, self.row_cls) + + for i, f in enumerate(self.fieldnames): + last = (i == len(self.fieldnames) - 1) + col_width = self.column_widths[f] + val = getattr(row, f) + + val_str = "" + if isinstance(val, str): + val_str = trim_string_back(val, col_width).ljust(col_width) + elif type(val) in [float, int]: + val_str = f"{float(val):>.2f}".rjust(col_width) + else: + val_str = f"{val}".rjust(col_width) + print(val_str, end=" | " if not last else "\n") + + def _print_line(self): + total_col_width = 0 + for column_width in self.column_widths.values(): + total_col_width += column_width + print("=" * (total_col_width + 3 * (len(self.column_widths) - 1))) + + +def indent_string(string: str, + indent: int, + indent_style: Union[Callable[[int], str], str] = " ") -> str: + if indent: + if isinstance(indent_style, str): + return indent_style * indent + string + else: + return indent_style(indent) + string + else: + return string + + +# +# _ProfilerEvent utils +# + + +def event_has_module(event: _ProfilerEvent) -> bool: + event_type, typed_event = event.typed + if event_type == _EventType.PyCall: + return typed_event.module is not None + return False + + +def event_is_torch_op(event: _ProfilerEvent) -> bool: + return event.tag == _EventType.TorchOp + + +def event_arg_repr(arg) -> str: + if arg is None or type(arg) in [float, int, bool, str]: + return f"{arg}" + elif isinstance(arg, list): + return f"[{', '.join([event_arg_repr(x) for x in arg])}]" + elif isinstance(arg, tuple): + return f"({', 
'.join([event_arg_repr(x) for x in arg])})" + else: + assert isinstance(arg, + _TensorMetadata), f"Unsupported type: {type(arg)}" + sizes_str = ', '.join([str(x) for x in arg.sizes]) + return f"{str(arg.dtype).replace('torch.', '')}[{sizes_str}]" + + +def event_torch_op_repr(event: _ProfilerEvent) -> str: + assert event.tag == _EventType.TorchOp + args_str = ', '.join([event_arg_repr(x) for x in event.typed[1].inputs]) + return f"{event.name}({args_str})".replace("aten::", "") + + +def event_module_repr(event: _ProfilerEvent) -> str: + assert event_has_module(event) + module = event.typed[1].module + if module.parameters and len(module.parameters) > 0: + args_str = ', '.join( + [f'{x[0]}={event_arg_repr(x[1])}' for x in module.parameters]) + return f"{module.cls_name}({args_str})" + else: + return module.cls_name + + +def event_torch_op_stack_trace(curr_event: _ProfilerEvent, + until: Callable[[_ProfilerEvent], bool]) -> str: + trace = "" + curr_event = curr_event.parent + while curr_event and not until(curr_event): + if event_is_torch_op(curr_event): + if len(trace) > 0: + trace += " <- " + trace += event_torch_op_repr(curr_event) + curr_event = curr_event.parent + + return trace diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0f3c379cee8f..5e916558072e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,3 +1,5 @@ +# This file has been modified by Neural Magic + import dataclasses import gc import inspect @@ -1742,10 +1744,13 @@ def execute_model( return [output] -class CUDAGraphRunner: +# NOTE: this is nn.Module so the profiler can properly capture/group +# kernels calls made within the graph +class CUDAGraphRunner(nn.Module): def __init__(self, model: nn.Module, backend_name: str, attn_state: AttentionState, is_encoder_decoder_model: bool): + super().__init__() self.model = model self.backend_name = backend_name self.attn_state = attn_state @@ -1892,9 +1897,6 @@ def forward( return self.output_buffers - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - def _get_graph_batch_size(batch_size: int) -> int: """Returns the padded batch size given actual batch size. 
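A minimal sketch of driving the profiler added by this patch outside of
examples/offline_profile.py, using only names the patch itself defines; the
model name and prompt token ids below are illustrative placeholders rather
than values taken from the patch:

    # Sketch: profile a single engine step with the nm_profile context manager.
    # nm_profile wraps torch.profiler.profile with with_stack/with_modules and
    # exposes an NMProfileResults object via .results on exit.
    from vllm import LLM, SamplingParams
    from vllm.profiler import nm_profile

    llm = LLM(model="facebook/opt-125m", enforce_eager=True)  # placeholder model
    llm.llm_engine.add_request(
        request_id="seq0",
        inputs={"prompt_token_ids": [0] * 128},  # placeholder prompt tokens
        params=SamplingParams(max_tokens=2, ignore_eos=True))

    with nm_profile() as prof:
        llm.llm_engine.step()  # the first step runs the prefill

    prof.results.print_summary_table()            # layerwise CUDA-time summary
    stats = prof.results.convert_stats_to_dict()  # nested dicts, as used for the JSON export

The same NMProfileResults object backs the CSV/JSON exports in the example
script and the print_table.py / visualize_trace.py tools added under
neuralmagic/tools/profiler.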
From 87de44bc088da6d376e9090dddc048dd4026f88f Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 10 Sep 2024 15:16:05 +0000 Subject: [PATCH 02/16] refactor --- examples/offline_profile.py | 8 ++++---- vllm/profiler/__init__.py | 4 ++-- vllm/profiler/{nm_profile.py => layerwise_profile.py} | 6 +++--- .../profiler/print_layerwise_table.py | 2 +- .../profiler/visualize_layerwise_profile.py | 4 ++++ 5 files changed, 14 insertions(+), 10 deletions(-) rename vllm/profiler/{nm_profile.py => layerwise_profile.py} (98%) rename neuralmagic/tools/profiler/print_table.py => vllm/profiler/print_layerwise_table.py (96%) rename neuralmagic/tools/profiler/visualize_trace.py => vllm/profiler/visualize_layerwise_profile.py (99%) diff --git a/examples/offline_profile.py b/examples/offline_profile.py index c5c2346c908a..fb44c69a49a8 100644 --- a/examples/offline_profile.py +++ b/examples/offline_profile.py @@ -8,7 +8,7 @@ import torch from vllm import LLM, SamplingParams -from vllm.profiler import nm_profile +from vllm.profiler import layerwise_profile BATCH_SIZE_DEFAULT = 1 PROMPT_LEN_DEFAULT = 256 @@ -78,7 +78,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], f"larger than max_num_batched_tokens ({max_num_batched_tokens}) " f"and therefore cannot be run in a single profile step, please " f"choose a smaller batch size or prompt length, or increase " - f"--max_num_batched_tokens") + f"--max-num-batched-tokens") sys.exit(-1) if batch_size >= max_num_seqs: print( @@ -106,12 +106,12 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], inputs={'prompt_token_ids': prompt_token_ids}, params=sampling_params) - with nm_profile() as prefill_prof: + with layerwise_profile() as prefill_prof: llm.llm_engine.step() # First step is prefill decode_results_list = [] for x in range(args.output_len - 1): - with nm_profile() as decode_prof: + with layerwise_profile() as decode_prof: llm.llm_engine.step() decode_results_list.append(decode_prof.results) diff --git a/vllm/profiler/__init__.py b/vllm/profiler/__init__.py index 93ec4a800e60..3e25f5cc283f 100644 --- a/vllm/profiler/__init__.py +++ b/vllm/profiler/__init__.py @@ -1,5 +1,5 @@ -from .nm_profile import nm_profile +from .layerwise_profile import layerwise_profile __all__ = [ - "nm_profile", + "layerwise_profile", ] diff --git a/vllm/profiler/nm_profile.py b/vllm/profiler/layerwise_profile.py similarity index 98% rename from vllm/profiler/nm_profile.py rename to vllm/profiler/layerwise_profile.py index 5901362ecd82..e074cc3f0d2a 100644 --- a/vllm/profiler/nm_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -63,7 +63,7 @@ class _StatsTreeNode: @dataclass -class NMProfileResults(profile): +class LayerwiseProfileResults(profile): _kineto_results: _ProfilerResult _kineto_event_correlation_map: Dict[int, List[_KinetoEvent]] = field(init=False) @@ -331,7 +331,7 @@ def df_traversal(node: _StatsTreeNode, curr_json_list: List[Dict]): return root_dicts -class nm_profile(profile): +class layerwise_profile(profile): def __init__(self): super().__init__( @@ -346,4 +346,4 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) - self.results = NMProfileResults(self.profiler.kineto_results) + self.results = LayerwiseProfileResults(self.profiler.kineto_results) diff --git a/neuralmagic/tools/profiler/print_table.py b/vllm/profiler/print_layerwise_table.py similarity index 96% rename from neuralmagic/tools/profiler/print_table.py rename to vllm/profiler/print_layerwise_table.py 
index 9081583a9f95..91bfec747f4a 100644 --- a/neuralmagic/tools/profiler/print_table.py +++ b/vllm/profiler/print_layerwise_table.py @@ -2,7 +2,7 @@ import json from typing import Dict -from vllm.profiler.nm_profile import ModelStatsEntry, SummaryStatsEntry +from vllm.profiler.layerwise_profile import ModelStatsEntry, SummaryStatsEntry from vllm.profiler.utils import TablePrinter, indent_string diff --git a/neuralmagic/tools/profiler/visualize_trace.py b/vllm/profiler/visualize_layerwise_profile.py similarity index 99% rename from neuralmagic/tools/profiler/visualize_trace.py rename to vllm/profiler/visualize_layerwise_profile.py index c1284226fec6..ea27152670d3 100644 --- a/neuralmagic/tools/profiler/visualize_trace.py +++ b/vllm/profiler/visualize_layerwise_profile.py @@ -1,3 +1,4 @@ +import os import argparse import copy import json @@ -426,6 +427,9 @@ def make_plot_title_suffix(profile_json: dict) -> str: output_directory = args.output_directory if args.output_directory else Path( args.json_trace).parent + + if not os.path.exists(output_directory): + os.makedirs(output_directory) main(Path(args.json_trace), output_directory, depth, args.plot_metric, make_names_unique, args.top_k, args.fold_json_node) From 69189059e328c133bf3bf8e850a5008b65e94c78 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 10 Sep 2024 15:27:02 +0000 Subject: [PATCH 03/16] remove cruft --- vllm/worker/model_runner.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5e916558072e..36753b8580f6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,5 +1,3 @@ -# This file has been modified by Neural Magic - import dataclasses import gc import inspect From 219bfdb158578d7a9af5fba89dad0c78164dafae Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 16 Sep 2024 16:14:07 +0000 Subject: [PATCH 04/16] format --- vllm/profiler/visualize_layerwise_profile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/profiler/visualize_layerwise_profile.py b/vllm/profiler/visualize_layerwise_profile.py index ea27152670d3..880d3f760d6b 100644 --- a/vllm/profiler/visualize_layerwise_profile.py +++ b/vllm/profiler/visualize_layerwise_profile.py @@ -1,8 +1,8 @@ -import os import argparse import copy import json import math +import os from pathlib import Path from typing import Any, List, Optional, Tuple @@ -427,7 +427,7 @@ def make_plot_title_suffix(profile_json: dict) -> str: output_directory = args.output_directory if args.output_directory else Path( args.json_trace).parent - + if not os.path.exists(output_directory): os.makedirs(output_directory) From c566f94ea07cd9c0f6f9ef8043f8092b8b42513c Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 16 Sep 2024 21:11:11 +0000 Subject: [PATCH 05/16] fix duplicate entries in the tp > 1 case --- vllm/profiler/layerwise_profile.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index e074cc3f0d2a..9d9f427e807f 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -161,6 +161,11 @@ def _build_module_tree(self): def _df_traversal(event: _ProfilerEvent, curr_node: Optional[_ModuleTreeNode] = None): + + # For the tensor parallel case for now only look at task 1 + if event.start_tid != 1: + return + if event_has_module(event): node = _ModuleTreeNode(event=event, parent=curr_node) if curr_node: From ac2271b8cf2e77e62bdc274ae55845c3c8e16d7f Mon Sep 17 
00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 16 Sep 2024 20:34:28 +0000 Subject: [PATCH 06/16] add reduce kernel grouping --- vllm/profiler/visualize_layerwise_profile.py | 95 +++++++++++++++++++- 1 file changed, 91 insertions(+), 4 deletions(-) diff --git a/vllm/profiler/visualize_layerwise_profile.py b/vllm/profiler/visualize_layerwise_profile.py index 880d3f760d6b..65ee3ae108ae 100644 --- a/vllm/profiler/visualize_layerwise_profile.py +++ b/vllm/profiler/visualize_layerwise_profile.py @@ -172,6 +172,36 @@ def is_mem_op(op_name: str): def is_vocab_embedding_op(op_name: str): return "vocabparallelembed" in op_name.lower() + # nccl ops + def is_nccl_op(op_name: str): + return "nccl" in op_name.lower() + + def is_nccl_all_reduce(op_name: str): + return is_nccl_op(op_name) and \ + ("all_reduce" in op_name.lower() or \ + "allreduce" in op_name.lower()) + + def is_nccl_gather(op_name: str): + return is_nccl_op(op_name) and \ + "gather" in op_name.lower() + + def is_nccl_broadcast(op_name: str): + return is_nccl_op(op_name) and \ + "broadcast" in op_name.lower() + + # Reduce ops types + def is_cross_device_reduce_1stage(op_name: str): + return "cross_device_reduce_1stage" in op_name + + def is_cross_device_reduce_2stage(op_name: str): + return "cross_device_reduce_2stage" in op_name + + def is_custom_ar_all_reduce_unreg(op_name: str): + return "_C_custom_ar::all_reduce_unreg" in op_name + + def is_reduce_kernel(op_name: str): + return "reduce_kernel" in op_name + headers = list(trace_df) ops = copy.deepcopy(headers) @@ -196,6 +226,33 @@ def is_vocab_embedding_op(op_name: str): elementwise_ops = list(filter(lambda x: is_elementwise_op(x), ops)) ops = list(filter(lambda x: x not in elementwise_ops, ops)) + nccl_all_reduce_ops = list(filter(lambda x: is_nccl_all_reduce(x), ops)) + ops = list(filter(lambda x: x not in nccl_all_reduce_ops, ops)) + + nccl_gather_ops = list(filter(lambda x: is_nccl_gather(x), ops)) + ops = list(filter(lambda x: x not in nccl_gather_ops, ops)) + + nccl_broadcast_ops = list(filter(lambda x: is_nccl_broadcast(x), ops)) + ops = list(filter(lambda x: x not in nccl_broadcast_ops, ops)) + + nccl_other_ops = list(filter(lambda x: is_nccl_op(x), ops)) + ops = list(filter(lambda x: x not in nccl_other_ops, ops)) + + cross_device_reduce_1stage_ops = list( + filter(lambda x: is_cross_device_reduce_1stage(x), ops)) + ops = list(filter(lambda x: x not in cross_device_reduce_1stage_ops, ops)) + + cross_device_reduce_2stage_ops = list( + filter(lambda x: is_cross_device_reduce_2stage(x), ops)) + ops = list(filter(lambda x: x not in cross_device_reduce_2stage_ops, ops)) + + custom_ar_all_reduce_unreg_ops = list( + filter(lambda x: is_custom_ar_all_reduce_unreg(x), ops)) + ops = list(filter(lambda x: x not in custom_ar_all_reduce_unreg_ops, ops)) + + reduce_kernel_ops = list(filter(lambda x: is_reduce_kernel(x), ops)) + ops = list(filter(lambda x: x not in reduce_kernel_ops, ops)) + if len(attention_ops): trace_df['attention'] = trace_df[attention_ops].agg("sum", axis=1) if len(quant_ops): @@ -213,10 +270,40 @@ def is_vocab_embedding_op(op_name: str): trace_df['elementwise_ops'] = trace_df[elementwise_ops].agg("sum", axis=1) - trace_df.drop(attention_ops + quant_ops + gemm_ops + rms_norm_ops + - vocab_embed_ops + mem_ops + elementwise_ops, - axis=1, - inplace=True) + if len(nccl_all_reduce_ops): + trace_df['nccl_all_reduce_ops'] = trace_df[nccl_all_reduce_ops].agg( + "sum", axis=1) + if len(nccl_gather_ops): + trace_df['nccl_gather_ops'] = 
trace_df[nccl_gather_ops].agg("sum", + axis=1) + if len(nccl_broadcast_ops): + trace_df['nccl_broadcast_ops'] = trace_df[nccl_broadcast_ops].agg( + "sum", axis=1) + if len(nccl_other_ops): + trace_df['nccl_other_ops'] = trace_df[nccl_other_ops].agg("sum", + axis=1) + + if len(cross_device_reduce_1stage_ops): + trace_df['cross_device_reduce_1stage_ops'] = trace_df[ + cross_device_reduce_1stage_ops].agg("sum", axis=1) + if len(cross_device_reduce_2stage_ops): + trace_df['cross_device_reduce_2stage_ops'] = trace_df[ + cross_device_reduce_2stage_ops].agg("sum", axis=1) + if len(custom_ar_all_reduce_unreg_ops): + trace_df['custom_ar_all_reduce_unreg_ops'] = trace_df[ + custom_ar_all_reduce_unreg_ops].agg("sum", axis=1) + if len(reduce_kernel_ops): + trace_df['reduce_kernel_ops'] = trace_df[reduce_kernel_ops].agg("sum", + axis=1) + + trace_df.drop( + attention_ops + quant_ops + gemm_ops + rms_norm_ops + vocab_embed_ops + + mem_ops + elementwise_ops + nccl_all_reduce_ops + nccl_gather_ops + + nccl_broadcast_ops + nccl_other_ops + cross_device_reduce_1stage_ops + + cross_device_reduce_2stage_ops + custom_ar_all_reduce_unreg_ops + + reduce_kernel_ops, + axis=1, + inplace=True) return trace_df From cc32fe53784513007258b342aef615714b06b01b Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 7 Oct 2024 16:35:34 +0000 Subject: [PATCH 07/16] move to tools folder --- {vllm => tools}/profiler/print_layerwise_table.py | 0 {vllm => tools}/profiler/visualize_layerwise_profile.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {vllm => tools}/profiler/print_layerwise_table.py (100%) rename {vllm => tools}/profiler/visualize_layerwise_profile.py (100%) diff --git a/vllm/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py similarity index 100% rename from vllm/profiler/print_layerwise_table.py rename to tools/profiler/print_layerwise_table.py diff --git a/vllm/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py similarity index 100% rename from vllm/profiler/visualize_layerwise_profile.py rename to tools/profiler/visualize_layerwise_profile.py From 201379e9d22df32b3f9c5c11e005f56fc4a823ca Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 7 Oct 2024 13:04:40 -0400 Subject: [PATCH 08/16] Update examples/offline_profile.py Co-authored-by: Michael Goin --- examples/offline_profile.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/offline_profile.py b/examples/offline_profile.py index fb44c69a49a8..c4be8551a902 100644 --- a/examples/offline_profile.py +++ b/examples/offline_profile.py @@ -225,11 +225,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], "--quantization", "-q", type=str, - choices=['awq', 'gptq', 'squeezellm', 'marlin', 'smoothquant', None], - default=None, - help="The method used to quantize the model weights, " - "options are \"marlin\", \"awq\", \"gptq\", " - "\"squeezellm\", \"smoothquant\"") + choices=[*QUANTIZATION_METHODS, None], + default=None) parser.add_argument("--dtype", type=str, default='auto', From 9ed13d34791d239bc73b4f6412a8d81d62039f4d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 7 Oct 2024 17:14:23 +0000 Subject: [PATCH 09/16] review comments --- examples/offline_profile.py | 41 +++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/examples/offline_profile.py b/examples/offline_profile.py index c4be8551a902..a2957140f837 100644 --- a/examples/offline_profile.py +++ 
b/examples/offline_profile.py @@ -1,14 +1,16 @@ -import argparse import inspect import json import sys +from argparse import RawTextHelpFormatter from dataclasses import asdict, dataclass from typing import Optional import torch from vllm import LLM, SamplingParams +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.profiler import layerwise_profile +from vllm.utils import FlexibleArgumentParser BATCH_SIZE_DEFAULT = 1 PROMPT_LEN_DEFAULT = 256 @@ -194,7 +196,31 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser(description=""" +Profile a model + + example: + ``` + python examples/offline_profile.py \\ + --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ + --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 + + ``` + + then you can use various tools to analyze the json output + terminal ascii tables: + ``` + python tools/profiler/print_layerwise_table.py \\ + --json-trace Llama31-8b-FP8.json --phase prefill --table summary + ``` + or create matplotlib stacked bar charts: + ``` + python tools/profiler/visualize_layerwise_profile.py \\ + --json-trace Llama31-8b-FP8.json \\ + --output-directory profile_breakdown --plot-metric pct_cuda_time + ``` +""", + formatter_class=RawTextHelpFormatter) parser.add_argument( "--model", @@ -221,12 +247,11 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], type=str, default=None, help="Export the results as a json file. This should be the filename") - parser.add_argument( - "--quantization", - "-q", - type=str, - choices=[*QUANTIZATION_METHODS, None], - default=None) + parser.add_argument("--quantization", + "-q", + type=str, + choices=[*QUANTIZATION_METHODS, None], + default=None) parser.add_argument("--dtype", type=str, default='auto', From 016dffb273973dcb20277d217d1bad1fd01ce738 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 7 Oct 2024 18:00:02 +0000 Subject: [PATCH 10/16] add offline profile to "Examples Test" --- .buildkite/test-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4385f250856e..398fdc5f0ae2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -184,6 +184,7 @@ steps: - python3 offline_inference_vision_language_multi_image.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py + - python3 offline_profile.py --model facebook/opt-125m - label: Prefix Caching Test # 9min #mirror_hardwares: [amd] From 58e33f4db76d87538a61f93cccf4699352503153 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 8 Oct 2024 14:58:49 +0000 Subject: [PATCH 11/16] allow trace saving and add a warmup run --- examples/offline_profile.py | 55 +++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/examples/offline_profile.py b/examples/offline_profile.py index a2957140f837..42576d1912c1 100644 --- a/examples/offline_profile.py +++ b/examples/offline_profile.py @@ -1,6 +1,7 @@ import inspect import json import sys +import os from argparse import RawTextHelpFormatter from dataclasses import asdict, dataclass from typing import Optional @@ -31,6 +32,7 @@ class 
ProfileContext: dtype: str tensor_parallel_size: int allow_cuda_graphs: bool + save_traces_folder: Optional[str] def get_dtype(dtype: str): @@ -98,25 +100,40 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], f"prompt_len or output_len, or increase --max-model-len") sys.exit(-1) - for i in range(batch_size): - prompt_token_ids = torch.randint( - llm.llm_engine.model_config.get_vocab_size(), - size=(prompt_len, )).tolist() - - llm.llm_engine.add_request( - request_id=f"seq{i}", - inputs={'prompt_token_ids': prompt_token_ids}, - params=sampling_params) + def add_requests(): + for i in range(batch_size): + prompt_token_ids = torch.randint( + llm.llm_engine.model_config.get_vocab_size(), + size=(prompt_len, )).tolist() + + llm.llm_engine.add_request( + request_id=f"seq{i}", + inputs={'prompt_token_ids': prompt_token_ids}, + params=sampling_params) + def abort_requests(): + for i in range(batch_size): + llm.llm_engine.abort_request(f"seq{i}") + + # Warm up run + print("Warm up run ...") + add_requests() + llm.llm_engine.step() # Prefill + llm.llm_engine.step() # Decode + abort_requests() + + print("Profile run ...") + add_requests() with layerwise_profile() as prefill_prof: llm.llm_engine.step() # First step is prefill - decode_results_list = [] + decode_profs = [] for x in range(args.output_len - 1): with layerwise_profile() as decode_prof: llm.llm_engine.step() - decode_results_list.append(decode_prof.results) + decode_profs.append(decode_prof) + decode_results_list = [prof.results for prof in decode_profs] prefill_results = prefill_prof.results has_decode = len(decode_results_list) > 0 @@ -195,6 +212,17 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], pass + if context.save_traces_folder is not None: + os.makedirs(context.save_traces_folder, exist_ok=True) + prefill_prof.profiler.export_chrome_trace( + context.save_traces_folder + "/prefill.json") + for idx, decode_prof in enumerate(decode_profs): + decode_prof.profiler.export_chrome_trace( + context.save_traces_folder + f"/decode_{idx + 1}.json") + print("Traces saved as prefill.json and decode_1.json, etc." + f" in folder {context.save_traces_folder}") + + if __name__ == "__main__": parser = FlexibleArgumentParser(description=""" Profile a model @@ -247,6 +275,11 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], type=str, default=None, help="Export the results as a json file. This should be the filename") + parser.add_argument("--save-traces-folder", + type=str, + help="Save chrome traces for the prefill and decode " + "will save traces as prefill.json and decode_1.json, " + "etc. 
inside this folder") parser.add_argument("--quantization", "-q", type=str, From d1de959fc3d5256c376061f8e9ebd5fdddf4f15a Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 8 Oct 2024 16:25:05 +0000 Subject: [PATCH 12/16] fix deprecation warning --- examples/offline_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_profile.py b/examples/offline_profile.py index 42576d1912c1..9dd0768ce880 100644 --- a/examples/offline_profile.py +++ b/examples/offline_profile.py @@ -108,7 +108,7 @@ def add_requests(): llm.llm_engine.add_request( request_id=f"seq{i}", - inputs={'prompt_token_ids': prompt_token_ids}, + prompt={'prompt_token_ids': prompt_token_ids}, params=sampling_params) def abort_requests(): for i in range(batch_size): From b6db3bd1a86b4ee4ce4440a81f84f317376586f4 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 8 Oct 2024 16:27:01 +0000 Subject: [PATCH 13/16] format --- examples/offline_profile.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/offline_profile.py b/examples/offline_profile.py index 9dd0768ce880..15aa7607a556 100644 --- a/examples/offline_profile.py +++ b/examples/offline_profile.py @@ -1,7 +1,7 @@ import inspect import json -import sys import os +import sys from argparse import RawTextHelpFormatter from dataclasses import asdict, dataclass from typing import Optional @@ -110,17 +110,18 @@ def add_requests(): request_id=f"seq{i}", prompt={'prompt_token_ids': prompt_token_ids}, params=sampling_params) + def abort_requests(): for i in range(batch_size): llm.llm_engine.abort_request(f"seq{i}") - + # Warm up run print("Warm up run ...") add_requests() - llm.llm_engine.step() # Prefill - llm.llm_engine.step() # Decode + llm.llm_engine.step() # Prefill + llm.llm_engine.step() # Decode abort_requests() - + print("Profile run ...") add_requests() @@ -211,11 +212,10 @@ def abort_requests(): json.dump(json_dict, f, indent=2) pass - if context.save_traces_folder is not None: os.makedirs(context.save_traces_folder, exist_ok=True) - prefill_prof.profiler.export_chrome_trace( - context.save_traces_folder + "/prefill.json") + prefill_prof.profiler.export_chrome_trace(context.save_traces_folder + + "/prefill.json") for idx, decode_prof in enumerate(decode_profs): decode_prof.profiler.export_chrome_trace( context.save_traces_folder + f"/decode_{idx + 1}.json") From aee8e574c9512a693ec96adbb389e04c5da683d3 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 9 Oct 2024 21:16:35 +0000 Subject: [PATCH 14/16] review comments --- examples/offline_profile.py | 24 +++++++++++++----------- tools/profiler/print_layerwise_table.py | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/examples/offline_profile.py b/examples/offline_profile.py index 15aa7607a556..a16e0abd5b68 100644 --- a/examples/offline_profile.py +++ b/examples/offline_profile.py @@ -138,36 +138,37 @@ def abort_requests(): prefill_results = prefill_prof.results has_decode = len(decode_results_list) > 0 - print("=" * 80) + LINE_WIDTH = 80 + print("=" * LINE_WIDTH) print(f"= Prefill Model Table " f"(prompt_len={prompt_len}, batch_size={batch_size})") - print("=" * 80) + print("=" * LINE_WIDTH) print() prefill_results.print_model_table() if has_decode: print() - print("=" * 80) + print("=" * LINE_WIDTH) print(f"= First Decode Step Model Table " f"(prompt_len={prompt_len}, batch_size={batch_size})") - print("=" * 80) + print("=" * LINE_WIDTH) print() decode_results_list[0].print_model_table() print() - print("=" * 
80) + print("=" * LINE_WIDTH) print(f"= Prefill Summary Table " f"(prompt_len={prompt_len}, batch_size={batch_size})") - print("=" * 80) + print("=" * LINE_WIDTH) print() prefill_results.print_summary_table() if has_decode: print() - print("=" * 80) + print("=" * LINE_WIDTH) print(f"= First Decode Step Summary Table " f"(prompt_len={prompt_len}, batch_size={batch_size})") - print("=" * 80) + print("=" * LINE_WIDTH) print() decode_results_list[0].print_summary_table() @@ -321,9 +322,10 @@ def abort_requests(): parser.add_argument( "--allow-cuda-graphs", action='store_true', - help="Enables cuda graphs to be used, well remove a lot of the module " - "level info in the profiler results since almost everything runs in " - "the graph where we do not have access to an informative stack trace") + help="Enables cuda graphs to be used. This wo; remove a lot of the " + "module level info in the profiler results since almost everything " + "runs in the graph where we do not have access to an informative stack " + "trace") parser.add_argument( "--output-len", type=int, diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 91bfec747f4a..bbd24b085e3a 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -65,7 +65,7 @@ def get_entries(node, curr_depth=0): pct_cuda_time=12, trace=60) - # ident entry names based on the depth + # indent entry names based on the depth entries = [] for entry, depth in entries_and_depths: entry.name = indent_string( From c117052abc8bd2ab59145626bc4392d49d54d1f1 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 9 Oct 2024 21:30:18 +0000 Subject: [PATCH 15/16] pull in all engine args --- examples/offline_profile.py | 78 ++++--------------------------------- 1 file changed, 8 insertions(+), 70 deletions(-) diff --git a/examples/offline_profile.py b/examples/offline_profile.py index a16e0abd5b68..c9fceb03848c 100644 --- a/examples/offline_profile.py +++ b/examples/offline_profile.py @@ -9,7 +9,7 @@ import torch from vllm import LLM, SamplingParams -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.engine.arg_utils import EngineArgs from vllm.profiler import layerwise_profile from vllm.utils import FlexibleArgumentParser @@ -20,18 +20,10 @@ @dataclass class ProfileContext: - model: str - tokenizer: str - model_revision: str - quantization: str - max_model_len: int - max_num_batched_tokens: int + engine_args: EngineArgs prompt_len: int output_len: int batch_size: int - dtype: str - tensor_parallel_size: int - allow_cuda_graphs: bool save_traces_folder: Optional[str] @@ -55,18 +47,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], ignore_eos=True) # Create LLM - llm = LLM(model=context.model, - tokenizer=context.tokenizer - if context.tokenizer is not None else context.model, - revision=context.model_revision, - enforce_eager=not context.allow_cuda_graphs, - tensor_parallel_size=context.tensor_parallel_size, - gpu_memory_utilization=0.9, - max_model_len=context.max_model_len, - quantization=context.quantization, - dtype=get_dtype(context.dtype), - max_num_batched_tokens=context.max_num_batched_tokens) - + llm = LLM(**asdict(context.engine_args)) batch_size = context.batch_size prompt_len = context.prompt_len output_len = context.output_len @@ -232,8 +213,8 @@ def abort_requests(): ``` python examples/offline_profile.py \\ --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ - --prompt-len 512 
--max-num-batched-tokens 8196 --json Llama31-8b-FP8 - + --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ + --enforce-eager ``` then you can use various tools to analyze the json output @@ -250,18 +231,6 @@ def abort_requests(): ``` """, formatter_class=RawTextHelpFormatter) - - parser.add_argument( - "--model", - type=str, - required=True, - help='The name or path of a HuggingFace Transformers model.') - parser.add_argument("--tokenizer", - type=str, - default=None, - help="path to the tokenizer") - - parser.add_argument("--model-revision", type=str, default=None) parser.add_argument( "--csv", type=str, @@ -281,27 +250,6 @@ def abort_requests(): help="Save chrome traces for the prefill and decode " "will save traces as prefill.json and decode_1.json, " "etc. inside this folder") - parser.add_argument("--quantization", - "-q", - type=str, - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument("--dtype", - type=str, - default='auto', - help="model dtype") - parser.add_argument( - "--max-model-len", - type=int, - default=None, - help="Maximum length of a sequence (including prompt and output)") - parser.add_argument( - "--max-num-batched-tokens", - type=int, - default=None, - help="Maximum number of tokens to be processed in a single iteration. " - " Should be greater than batch-size * prompt-len so the prefill can " - " run in a single iteration.") parser.add_argument( "--prompt-len", type=int, @@ -313,19 +261,6 @@ def abort_requests(): default=BATCH_SIZE_DEFAULT, help=f"Number of requests to run as a single batch, " f"default={BATCH_SIZE_DEFAULT}") - parser.add_argument("--tensor-parallel-size", - "-tp", - type=int, - default=1, - help="Number of GPUs to use i.e. tensor parallelism, " - "default=1") - parser.add_argument( - "--allow-cuda-graphs", - action='store_true', - help="Enables cuda graphs to be used. 
This wo; remove a lot of the " - "module level info in the profiler results since almost everything " - "runs in the graph where we do not have access to an informative stack " - "trace") parser.add_argument( "--output-len", type=int, @@ -333,9 +268,12 @@ def abort_requests(): help="Number of llm steps to run (includes prefill and decode) " "- default={OUTPUT_LEN_DEFAULT}") + EngineArgs.add_cli_args(parser) + args = parser.parse_args() context = ProfileContext( + engine_args=EngineArgs.from_cli_args(args), **{ k: v for k, v in vars(args).items() From c1a5507b2f20475b9a2d885cfd9a676da6586c1f Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 9 Oct 2024 21:32:46 +0000 Subject: [PATCH 16/16] rename save traces --- examples/offline_profile.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/offline_profile.py b/examples/offline_profile.py index c9fceb03848c..1d415b82cddb 100644 --- a/examples/offline_profile.py +++ b/examples/offline_profile.py @@ -24,7 +24,7 @@ class ProfileContext: prompt_len: int output_len: int batch_size: int - save_traces_folder: Optional[str] + save_chrome_traces_folder: Optional[str] def get_dtype(dtype: str): @@ -194,15 +194,15 @@ def abort_requests(): json.dump(json_dict, f, indent=2) pass - if context.save_traces_folder is not None: - os.makedirs(context.save_traces_folder, exist_ok=True) - prefill_prof.profiler.export_chrome_trace(context.save_traces_folder + - "/prefill.json") + if context.save_chrome_traces_folder is not None: + os.makedirs(context.save_chrome_traces_folder, exist_ok=True) + prefill_prof.profiler.export_chrome_trace( + context.save_chrome_traces_folder + "/prefill.json") for idx, decode_prof in enumerate(decode_profs): decode_prof.profiler.export_chrome_trace( - context.save_traces_folder + f"/decode_{idx + 1}.json") + context.save_chrome_traces_folder + f"/decode_{idx + 1}.json") print("Traces saved as prefill.json and decode_1.json, etc." - f" in folder {context.save_traces_folder}") + f" in folder {context.save_chrome_traces_folder}") if __name__ == "__main__": @@ -245,7 +245,7 @@ def abort_requests(): type=str, default=None, help="Export the results as a json file. This should be the filename") - parser.add_argument("--save-traces-folder", + parser.add_argument("--save-chrome-traces-folder", type=str, help="Save chrome traces for the prefill and decode " "will save traces as prefill.json and decode_1.json, "
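
For readers following the series end-to-end, below is a condensed sketch of the profiling flow the patches converge on: a warm-up prefill/decode step, one `layerwise_profile()` context per engine step, summary tables, and per-step chrome traces. It assumes the `layerwise_profile` API exactly as added by these patches (a `.results` object with the print/export helpers and a `.profiler` exposing `export_chrome_trace`); it is illustrative and not a copy of `examples/offline_profile.py`.

```python
import os

import torch

from vllm import LLM, SamplingParams
from vllm.profiler import layerwise_profile


def profile_llm(llm: LLM, batch_size: int, prompt_len: int, output_len: int,
                traces_folder: str) -> None:
    """Condensed version of the flow implemented in examples/offline_profile.py."""
    sampling_params = SamplingParams(temperature=0.8,
                                     top_p=0.95,
                                     max_tokens=output_len,
                                     ignore_eos=True)
    vocab_size = llm.llm_engine.model_config.get_vocab_size()

    def add_requests():
        # Random prompts are fine here; only per-layer kernel timings matter.
        for i in range(batch_size):
            prompt_token_ids = torch.randint(vocab_size,
                                             size=(prompt_len, )).tolist()
            llm.llm_engine.add_request(
                request_id=f"seq{i}",
                prompt={'prompt_token_ids': prompt_token_ids},
                params=sampling_params)

    def abort_requests():
        for i in range(batch_size):
            llm.llm_engine.abort_request(f"seq{i}")

    # Warm-up run so one-time startup costs do not pollute the profile.
    add_requests()
    llm.llm_engine.step()  # Prefill
    llm.llm_engine.step()  # Decode
    abort_requests()

    # Profiled run: wrap each engine step in its own layerwise_profile context.
    add_requests()
    with layerwise_profile() as prefill_prof:
        llm.llm_engine.step()  # First step is prefill
    decode_profs = []
    for _ in range(output_len - 1):
        with layerwise_profile() as decode_prof:
            llm.llm_engine.step()
        decode_profs.append(decode_prof)

    prefill_prof.results.print_summary_table()
    if decode_profs:
        decode_profs[0].results.print_summary_table()

    # One chrome trace per step, mirroring the --save-chrome-traces-folder path.
    os.makedirs(traces_folder, exist_ok=True)
    prefill_prof.profiler.export_chrome_trace(traces_folder + "/prefill.json")
    for idx, decode_prof in enumerate(decode_profs):
        decode_prof.profiler.export_chrome_trace(
            traces_folder + f"/decode_{idx + 1}.json")
```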
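
PATCH 15 replaces the script's hand-rolled model/tokenizer/quantization flags with vLLM's own engine argument plumbing. A minimal sketch of that pattern, assuming `EngineArgs.add_cli_args`, `EngineArgs.from_cli_args`, and `FlexibleArgumentParser` behave as the diff uses them (all three appear in the patch):

```python
from dataclasses import asdict

from vllm import LLM
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(description="Profile a model")

# Script-specific knobs stay explicit ...
parser.add_argument("--prompt-len", type=int, default=256)
parser.add_argument("--batch-size", type=int, default=1)

# ... while every engine option (--model, --quantization, --dtype,
# --max-num-batched-tokens, --tensor-parallel-size, --enforce-eager, ...)
# is registered by EngineArgs itself, so the script never drifts out of
# sync with the engine's supported flags.
EngineArgs.add_cli_args(parser)

args = parser.parse_args()
engine_args = EngineArgs.from_cli_args(args)

# EngineArgs is a dataclass, so it can be splatted straight into LLM().
llm = LLM(**asdict(engine_args))
```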
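
The trace post-processing at the top of this section repeats one pandas idiom per kernel family: sum a group of per-kernel columns row-wise into a single `*_ops` column, then drop the originals. A generic restatement of that idiom, where `fold_op_columns` and `op_groups` are illustrative names not present in the patch:

```python
from typing import Dict, List

import pandas as pd


def fold_op_columns(trace_df: pd.DataFrame,
                    op_groups: Dict[str, List[str]]) -> pd.DataFrame:
    """Collapse per-kernel columns into one column per op group, as
    visualize_layerwise_profile.py does for the nccl/gemm/attention/... ops."""
    to_drop: List[str] = []
    for group_name, columns in op_groups.items():
        if len(columns):
            # Row-wise sum over every kernel column belonging to this group.
            trace_df[group_name] = trace_df[columns].agg("sum", axis=1)
        to_drop.extend(columns)
    trace_df.drop(to_drop, axis=1, inplace=True)
    return trace_df
```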