Merge pull request vllm-project#20 from tdg5/exp-2
Fix logging lint errors
rkooo567 committed Apr 24, 2024
2 parents e1e567e + 0f92b19 commit e27e61e
Showing 15 changed files with 102 additions and 81 deletions.
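The pattern is the same across all 15 files: eager f-string interpolation inside logging calls is replaced with lazy %-style arguments, which is what lint rules such as pylint's logging-fstring-interpolation (W1203) flag. A minimal, generic sketch of the two styles (the logger name and message below are illustrative, not taken from this diff):

import logging

logger = logging.getLogger("vllm.example")  # hypothetical logger name
quantization = "awq"

# Eager: the f-string is evaluated even when WARNING records are filtered out.
logger.warning(f"{quantization} quantization is not fully optimized yet.")

# Lazy: logging interpolates the arguments only if the record is actually
# emitted, so suppressed messages skip the formatting cost entirely.
logger.warning("%s quantization is not fully optimized yet.", quantization)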
16 changes: 8 additions & 8 deletions vllm/config.py
@@ -167,9 +167,9 @@ def _verify_quantization(self) -> None:
f"supported in ROCm.")
if self.quantization != "marlin":
logger.warning(
f"{self.quantization} quantization is not fully "
"%s quantization is not fully "
"optimized yet. The speed can be slower than "
"non-quantized models.")
"non-quantized models.", self.quantization)

def _verify_cuda_graph(self) -> None:
if self.max_context_len_to_capture is None:
@@ -360,7 +360,7 @@ def verify_with_parallel_config(
if cpu_memory_usage > 0.7 * total_cpu_memory:
raise ValueError("Too large swap space. " + msg)
elif cpu_memory_usage > 0.4 * total_cpu_memory:
logger.warning("Possibly too large swap space. " + msg)
logger.warning("Possibly too large swap space. %s", msg)


@dataclass
@@ -898,8 +898,8 @@ def verify_with_model_config(self, model_config: ModelConfig):
"awq", "gptq"
]:
# TODO support marlin and squeezellm
logger.warning(f"{model_config.quantization} quantization is not "
"tested with LoRA yet.")
logger.warning("%s quantization is not tested with LoRA yet.",
model_config.quantization)

def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
if scheduler_config.max_num_batched_tokens > 65528:
@@ -1008,7 +1008,7 @@ def _get_and_verify_dtype(
pass
else:
# Casting between float16 and bfloat16 is allowed with a warning.
logger.warning(f"Casting {config_dtype} to {torch_dtype}.")
logger.warning("Casting %s to %s.", config_dtype, torch_dtype)

return torch_dtype

@@ -1051,8 +1051,8 @@ def _get_and_verify_max_len(
logger.warning(
"The model's config.json does not contain any of the following "
"keys to determine the original maximum length of the model: "
f"{possible_keys}. Assuming the model's maximum length is "
f"{default_max_len}.")
"%d. Assuming the model's maximum length is %d.", possible_keys,
default_max_len)
derived_max_model_len = default_max_len

rope_scaling = getattr(hf_config, "rope_scaling", None)
8 changes: 4 additions & 4 deletions vllm/distributed/device_communicators/custom_all_reduce.py
@@ -37,7 +37,7 @@ def init_custom_ar() -> None:
return

if world_size not in _SUPPORTED_WORLD_SIZES:
logger.warn(
logger.warning(
"Custom allreduce is disabled due to an unsupported world size: "
"%d. Supported world sizes: %s. To silence this warning, specify"
" disable_custom_all_reduce=True explicitly.", world_size,
@@ -47,7 +47,7 @@ def init_custom_ar() -> None:
# note: num dev can be larger than world_size if we're only using
# first few GPUs
if num_dev < world_size:
logger.warn(
logger.warning(
"Cannot test GPU P2P because not all GPUs are visible to the "
"current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
" is set.")
@@ -62,7 +62,7 @@ def init_custom_ar() -> None:
# this checks hardware and driver support for NVLink
full_nvlink = _is_full_nvlink(device_ids)
if world_size > 2 and not full_nvlink:
logger.warn(
logger.warning(
"Custom allreduce is disabled because it's not supported on more"
" than two PCIe-only GPUs. To silence this warning, specify"
" disable_custom_all_reduce=True explicitly.")
@@ -71,7 +71,7 @@ def init_custom_ar() -> None:
# this is expensive to compute at the first time
# then we cache the result
if not _can_p2p(rank, world_size):
logger.warn(
logger.warning(
"Custom allreduce is disabled because your platform lacks GPU P2P"
" capability or P2P test failed. To silence this warning, specify"
" disable_custom_all_reduce=True explicitly.")
15 changes: 8 additions & 7 deletions vllm/distributed/device_communicators/pynccl.py
@@ -43,15 +43,16 @@
nccl = ctypes.CDLL(so_file)
except Exception as e:
logger.error(
f"Failed to load NCCL library from {so_file} ."
"Failed to load NCCL library from %s ."
"It is expected if you are not running on NVIDIA/AMD GPUs."
"Otherwise, the nccl library might not exist, be corrupted "
f"or it does not support the current platform {platform.platform()}."
f"One solution is to download libnccl2 version 2.18 from "
f"https://developer.download.nvidia.com/compute/cuda/repos/ "
f"and extract the libnccl.so.2 file. If you already have the "
f"library, please set the environment variable VLLM_NCCL_SO_PATH"
" to point to the correct nccl library path.")
"or it does not support the current platform %s."
"One solution is to download libnccl2 version 2.18 from "
"https://developer.download.nvidia.com/compute/cuda/repos/ "
"and extract the libnccl.so.2 file. If you already have the "
"library, please set the environment variable VLLM_NCCL_SO_PATH"
" to point to the correct nccl library path.", so_file,
platform.platform())
raise e

# === export types and functions from nccl to Python ===
4 changes: 2 additions & 2 deletions vllm/distributed/device_communicators/pynccl_utils.py
@@ -14,7 +14,7 @@
except Exception as e:
# in non-NVIDIA environments, we can't import the nccl module
# e.g. when running on machines with AMD GPUs
logger.info(f"Failed to import NCCL library: {e}")
logger.info("Failed to import NCCL library: %s", e)
logger.info("It is expected if you are not running on NVIDIA GPUs.")
pass

@@ -40,7 +40,7 @@ def set_pynccl_stream(stream: torch.cuda.Stream):
def init_process_group(group: Optional[ProcessGroup] = None) -> None:
assert not is_initialized()
global comm
logger.info(f"vLLM is using nccl=={ncclGetVersion()}")
logger.info("vLLM is using nccl==%s", ncclGetVersion())
comm = NCCLCommunicator(group=group)


6 changes: 4 additions & 2 deletions vllm/distributed/parallel_state.py
@@ -57,8 +57,10 @@ def init_distributed_environment(
local_rank: int = -1,
backend: str = "nccl",
):
logger.debug(f"{world_size=} {rank=} {local_rank=} "
f"{distributed_init_method=} {backend=}")
logger.debug(
"world_size=%d rank=%d local_rank=%d "
"distributed_init_method=%s backend=%s", world_size, rank, local_rank,
distributed_init_method, backend)
if not torch.distributed.is_initialized():
assert distributed_init_method is not None, (
"distributed_init_method must be provided when initializing "
61 changes: 36 additions & 25 deletions vllm/engine/llm_engine.py
@@ -96,29 +96,38 @@ def __init__(
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
) -> None:
logger.info(
f"Initializing an LLM engine (v{vllm.__version__}) with config: "
f"model={model_config.model!r}, "
f"speculative_config={speculative_config!r}, "
f"tokenizer={model_config.tokenizer!r}, "
f"skip_tokenizer_init={model_config.skip_tokenizer_init}, "
f"tokenizer_mode={model_config.tokenizer_mode}, "
f"revision={model_config.revision}, "
f"tokenizer_revision={model_config.tokenizer_revision}, "
f"trust_remote_code={model_config.trust_remote_code}, "
f"dtype={model_config.dtype}, "
f"max_seq_len={model_config.max_model_len}, "
f"download_dir={load_config.download_dir!r}, "
f"load_format={load_config.load_format}, "
f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
f"disable_custom_all_reduce="
f"{parallel_config.disable_custom_all_reduce}, "
f"quantization={model_config.quantization}, "
f"enforce_eager={model_config.enforce_eager}, "
f"kv_cache_dtype={cache_config.cache_dtype}, "
f"quantization_param_path={model_config.quantization_param_path}, "
f"device_config={device_config.device}, "
f"decoding_config={decoding_config!r}, "
f"seed={model_config.seed})")
"Initializing an LLM engine (v%s) with config: "
"model=%r, speculative_config=%r, tokenizer=%r, "
"skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
"tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, "
"max_seq_len=%d, download_dir=%r, load_format=%s, "
"tensor_parallel_size=%d, disable_custom_all_reduce=%s"
"quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
"quantization_param_path=%s, device_config=%s, "
"decoding_config=%r, seed=%d)",
vllm.__version__,
model_config.model,
speculative_config,
model_config.tokenizer,
model_config.skip_tokenizer_init,
model_config.tokenizer_mode,
model_config.revision,
model_config.tokenizer_revision,
model_config.trust_remote_code,
model_config.dtype,
model_config.max_model_len,
load_config.download_dir,
load_config.load_format,
parallel_config.tensor_parallel_size,
parallel_config.disable_custom_all_reduce,
model_config.quantization,
model_config.enforce_eager,
cache_config.cache_dtype,
model_config.quantization_param_path,
device_config.device,
decoding_config,
model_config.seed,
)
# TODO(woosuk): Print more configs in debug mode.

self.model_config = model_config
@@ -237,8 +246,10 @@ def _initialize_kv_caches(self) -> None:

if self.cache_config.num_gpu_blocks_override is not None:
num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
logger.info(f"Overriding {num_gpu_blocks=} with "
f"{num_gpu_blocks_override=}")
logger.info(
"Overriding num_gpu_blocks=%d with "
"num_gpu_blocks_override=%d", num_gpu_blocks,
num_gpu_blocks_override)
num_gpu_blocks = num_gpu_blocks_override

self.cache_config.num_gpu_blocks = num_gpu_blocks
21 changes: 13 additions & 8 deletions vllm/engine/metrics.py
@@ -227,14 +227,19 @@ def log(self, stats: Stats) -> None:

# Log to stdout.
logger.info(
f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, "
f"Avg generation throughput: "
f"{generation_throughput:.1f} tokens/s, "
f"Running: {stats.num_running} reqs, "
f"Swapped: {stats.num_swapped} reqs, "
f"Pending: {stats.num_waiting} reqs, "
f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%, "
f"CPU KV cache usage: {stats.cpu_cache_usage * 100:.1f}%")
"Avg prompt throughput: %.1f tokens/s, "
"Avg generation throughput: %.1f tokens/s, "
"Running: %d reqs, Swapped: %d reqs, "
"Pending: %d reqs, GPU KV cache usage: %.1f%, "
"CPU KV cache usage: %.1f%",
prompt_throughput,
generation_throughput,
stats.num_running,
stats.num_swapped,
stats.num_waiting,
stats.gpu_cache_usage * 100,
stats.cpu_cache_usage * 100,
)

# Reset tracked stats for next interval.
self.num_prompt_tokens = []
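A detail worth noting for %-style messages like the cache-usage line above (the snippet below is a generic illustration, not code from this repository): a literal percent sign must be escaped as %%, otherwise interpolation raises a ValueError when the record is emitted.

import logging

logger = logging.getLogger("vllm.example")  # hypothetical logger name
gpu_cache_usage = 0.123

# Correct: "%%" renders a single literal "%" after interpolation.
logger.info("GPU KV cache usage: %.1f%%", gpu_cache_usage * 100)

# Incorrect: a bare "%" followed by ordinary text is an invalid format
# specifier, and the error surfaces only when the message is formatted.
# logger.info("GPU KV cache usage: %.1f%, ...", gpu_cache_usage * 100)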
4 changes: 2 additions & 2 deletions vllm/entrypoints/openai/api_server.py
@@ -148,8 +148,8 @@ async def authentication(request: Request, call_next):
raise ValueError(f"Invalid middleware {middleware}. "
f"Must be a function or a class.")

logger.info(f"vLLM API server version {vllm.__version__}")
logger.info(f"args: {args}")
logger.info("vLLM API server version %s", vllm.__version__)
logger.info("args: %s", args)

if args.served_model_name is not None:
served_model_names = args.served_model_name
11 changes: 5 additions & 6 deletions vllm/entrypoints/openai/serving_chat.py
@@ -57,8 +57,7 @@ async def create_chat_completion(
tokenize=False,
add_generation_prompt=request.add_generation_prompt)
except Exception as e:
logger.error(
f"Error in applying chat template from request: {str(e)}")
logger.error("Error in applying chat template from request: %s", e)
return self.create_error_response(str(e))

request_id = f"cmpl-{random_uuid()}"
@@ -338,11 +337,11 @@ def _load_chat_template(self, chat_template):
tokenizer.chat_template = codecs.decode(
chat_template, "unicode_escape")

logger.info(
f"Using supplied chat template:\n{tokenizer.chat_template}")
logger.info("Using supplied chat template:\n%s",
tokenizer.chat_template)
elif tokenizer.chat_template is not None:
logger.info(
f"Using default chat template:\n{tokenizer.chat_template}")
logger.info("Using default chat template:\n%s",
tokenizer.chat_template)
else:
logger.warning(
"No chat template provided. Chat API will not work.")
4 changes: 2 additions & 2 deletions vllm/executor/gpu_executor.py
@@ -116,8 +116,8 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
# NOTE: This is logged in the executor because there can be >1 worker
# with other executors. We could log in the engine level, but work
# remains to abstract away the device for non-GPU configurations.
logger.info(f"# GPU blocks: {num_gpu_blocks}, "
f"# CPU blocks: {num_cpu_blocks}")
logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
num_cpu_blocks)

self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

4 changes: 2 additions & 2 deletions vllm/executor/ray_gpu_executor.py
@@ -214,8 +214,8 @@ def initialize_cache(self, num_gpu_blocks: int,
# NOTE: We log here to avoid multiple logs when number of workers is
# greater than one. We could log in the engine, but not all executors
# have GPUs.
logger.info(f"# GPU blocks: {num_gpu_blocks}, "
f"# CPU blocks: {num_cpu_blocks}")
logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
num_cpu_blocks)

self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks
2 changes: 1 addition & 1 deletion vllm/logger.py
@@ -126,7 +126,7 @@ def enable_trace_function_call(log_file_path: str,
"VLLM_TRACE_FUNCTION is enabled. It will record every"
" function executed by Python. This will slow down the code. It "
"is suggested to be used for debugging hang or crashes only.")
logger.info(f"Trace frame log is saved to {log_file_path}")
logger.info("Trace frame log is saved to %s", log_file_path)
if root_dir is None:
# by default, this is the vllm root directory
root_dir = os.path.dirname(os.path.dirname(__file__))
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -296,8 +296,8 @@ def get_moe_configs(E: int, N: int,
os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
if os.path.exists(config_file_path):
with open(config_file_path) as f:
logger.info(
f"Using configuration from {config_file_path} for MoE layer.")
logger.info("Using configuration from %s for MoE layer.",
config_file_path)
# If a configuration has been found, return it
return {int(key): val for key, val in json.load(f).items()}

3 changes: 2 additions & 1 deletion vllm/spec_decode/spec_decode_worker.py
@@ -183,7 +183,8 @@ def execute_model(
"speculative decoding "
"requires non-None seq_group_metadata_list")

logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}")
logger.info("spec_decode_worker.execute_model num_lookahead_slots=%d",
num_lookahead_slots)

# If no spec tokens, call the proposer and scorer workers normally.
# Used for prefill.
20 changes: 11 additions & 9 deletions vllm/utils.py
@@ -286,8 +286,9 @@ def get_open_port() -> int:
def update_environment_variables(envs: Dict[str, str]):
for k, v in envs.items():
if k in os.environ and os.environ[k] != v:
logger.warning(f"Overwriting environment variable {k} "
f"from '{os.environ[k]}' to '{v}'")
logger.warning(
"Overwriting environment variable %s "
"from '%s' to '%s'", k, os.environ[k], v)
os.environ[k] = v


@@ -307,11 +308,12 @@ def get_nvcc_cuda_version() -> Optional[Version]:
if not cuda_home:
cuda_home = '/usr/local/cuda'
if os.path.isfile(cuda_home + '/bin/nvcc'):
logger.info(f'CUDA_HOME is not found in the environment. '
f'Using {cuda_home} as CUDA_HOME.')
logger.info(
'CUDA_HOME is not found in the environment. '
'Using %s as CUDA_HOME.', cuda_home)
else:
logger.warning(
f'Not found nvcc in {cuda_home}. Skip cuda version check!')
logger.warning('Not found nvcc in %s. Skip cuda version check!',
cuda_home)
return None
nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
universal_newlines=True)
@@ -596,14 +598,14 @@ def find_nccl_library():
# manually load the nccl library
if so_file:
logger.info(
f"Found nccl from environment variable VLLM_NCCL_SO_PATH={so_file}"
)
"Found nccl from environment variable VLLM_NCCL_SO_PATH=%s",
so_file)
else:
if torch.version.cuda is not None:
so_file = vllm_nccl_path or find_library("libnccl.so.2")
elif torch.version.hip is not None:
so_file = find_library("librccl.so.1")
else:
raise ValueError("NCCL only supports CUDA and ROCm backends.")
logger.info(f"Found nccl from library {so_file}")
logger.info("Found nccl from library %s", so_file)
return so_file
