fix: Fixed bugs and addressed review comments
Signed-off-by: Anurag Dixit <anuragd@nvidia.com>
Anurag Dixit committed Dec 15, 2021
1 parent a8016ff commit 588e1d1
Showing 5 changed files with 63 additions and 23 deletions.
22 changes: 18 additions & 4 deletions examples/benchmark/py/README.md
@@ -8,6 +8,14 @@ This is a comprehensive Python benchmark suite to run perf runs using different

Note: For ONNX models, users can convert the ONNX model to a TensorRT serialized engine and then use this package.
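
For example, a serialized engine can be built from an ONNX file with the TensorRT Python API roughly as follows (a minimal sketch assuming a TensorRT 8.x install; `model.onnx` and `model.plan` are placeholder filenames):

```python
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Parse the ONNX model into a TensorRT network definition.
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)
with open("model.onnx", "rb") as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("Failed to parse ONNX model")

# Build and save a serialized engine that the benchmark can load (e.g. as model.plan).
config = builder.create_builder_config()
serialized_engine = builder.build_serialized_network(network, config)
with open("model.plan", "wb") as f:
    f.write(serialized_engine)
```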

## Prerequisite

The benchmark scripts depend on the following Python packages in addition to those listed in requirements.txt (a quick environment check is sketched after the list):

1. Torch-TensorRT
2. Torch
3. TensorRT
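
A quick way to confirm these packages are importable before running the benchmark (a minimal sketch; the `__version__` attributes follow the usual convention and are assumed here):

```python
# Sanity-check the benchmark prerequisites before running perf_run.py.
import torch
import torch_tensorrt
import tensorrt

print("Torch          :", torch.__version__)
print("Torch-TensorRT :", torch_tensorrt.__version__)
print("TensorRT       :", tensorrt.__version__)
print("CUDA available :", torch.cuda.is_available())
```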

## Structure

```
@@ -42,14 +50,20 @@ There are two sample configuration files added.

| Name | Supported Values | Description |
| --- | --- | --- |
| backend | all, torch, torch_tensorrt, tensorrt | Supported backends for inference |
| backend | all, torch, torch_tensorrt, tensorrt | Supported backends for inference. |
| input | - | Input binding names. Expected to list the shape of each input binding |
| model | - | Configure the model filename and name |
| filename | - | Model file name to load from disk |
| filename | - | Model file name to load from disk. |
| name | - | Model name |
| runtime | - | Runtime configurations |
| device | 0 | Target device ID to run inference. Range depends on available GPUs |
| precision | fp32, fp16 or half, int8 | Target precision to run inference |
| precision | fp32, fp16 or half, int8 | Target precision to run inference. int8 cannot be used with 'all' backend |
| calibration_cache | - | Calibration cache file expected for torch_tensorrt runtime in int8 precision |

Note:
1. Torch runtime perf measurement is not yet supported for int8.
2. TorchScript module filenames should end with .jit.pt; otherwise the file is treated as a TensorRT engine.
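
The settings above live in a plain YAML file. One way such a config might be read (a minimal sketch assuming PyYAML; the loader actually used by `perf_run.py` may differ):

```python
import yaml

# Load a benchmark configuration such as config/vgg16.yml (sketch only).
with open("config/vgg16.yml", "r") as f:
    params = yaml.safe_load(f)

backends    = params.get("backend")                         # e.g. ["torch_tensorrt"]
num_inputs  = params.get("input").get("num_inputs", 1)      # number of input bindings
input0_dims = params.get("input").get("input0")             # shape of the first binding
model_file  = params.get("model").get("filename")           # *.jit.pt or a TensorRT *.plan
precisions  = params.get("runtime").get("precision", ["fp32"])
device_id   = params.get("runtime").get("device", 0)
print(backends, num_inputs, input0_dims, model_file, precisions, device_id)
```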



Additional sample use case:
@@ -64,7 +78,7 @@ input:
- 3
- 224
- 224
num_of_input: 1
num_inputs: 1
model:
filename: model.plan
name: vgg16
2 changes: 1 addition & 1 deletion examples/benchmark/py/config/vgg16.yml
@@ -7,7 +7,7 @@ input:
- 3
- 224
- 224
num_of_input: 1
num_inputs: 1
model:
filename: vgg16_traced.jit.pt
name: vgg16
5 changes: 4 additions & 1 deletion examples/benchmark/py/config/vgg16_trt.yml
@@ -6,12 +6,15 @@ input:
- 3
- 224
- 224
num_of_input: 1
num_inputs: 1
model:
filename: model.plan
name: vgg16
calibration_cache:
- vgg16.cache
runtime:
device: 0
precision:
- fp32
- fp16
- int8
52 changes: 35 additions & 17 deletions examples/benchmark/py/perf_run.py
@@ -78,6 +78,10 @@ def run_torch_tensorrt(model, input_tensors, params, precision):
"inputs": input_tensors,
"enabled_precisions": {precision_to_dtype(precision)}
}

if precision == 'int8':
compile_settings.update({"calib": params.get('calibration_cache')})


model = torchtrt.compile(model, **compile_settings)
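
The enabled_precisions entry above is produced by a precision_to_dtype helper defined elsewhere in perf_run.py; its body is outside this diff, but a plausible mapping (an assumption, not the committed implementation) looks like:

```python
import torch

def precision_to_dtype(precision):
    # Map the config's precision strings onto torch dtypes (sketch only; the
    # helper defined in perf_run.py may handle more aliases).
    if precision in ("fp32", "float32"):
        return torch.float32
    if precision in ("fp16", "half", "float16"):
        return torch.half
    if precision == "int8":
        return torch.int8
    raise ValueError("Unsupported precision: {}".format(precision))
```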

@@ -166,26 +170,35 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False):
k += 1

timings = []
with torch.no_grad():
with engine.create_execution_context() as context:
for i in range(WARMUP_ITER):
context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream)
torch.cuda.synchronize()

for i in range(iters):
start_time = timeit.default_timer()
context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream)
torch.cuda.synchronize()
end_time = timeit.default_timer()
meas_time = end_time - start_time
timings.append(meas_time)
print("Iterations {}: {:.6f} s".format(i, end_time - start_time))
with engine.create_execution_context() as context:
for i in range(WARMUP_ITER):
context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream)
torch.cuda.synchronize()

for i in range(iters):
start_time = timeit.default_timer()
context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream)
torch.cuda.synchronize()
end_time = timeit.default_timer()
meas_time = end_time - start_time
timings.append(meas_time)
print("Iterations {}: {:.6f} s".format(i, end_time - start_time))

printStats("TensorRT", timings, precision)

# Deploys inference run for different backend configurations
def run(model, input_tensors, params, precision, is_trt_engine = False):
for backend in params.get('backend'):

if precision == 'int8':
if backend == 'all' or backend == 'torch':
print("int8 precision is not supported for torch runtime in this script yet")
return False

if (backend == 'all' or backend == 'torch_tensorrt') and params.get('calibration_cache', None) is None:
print("int8 precision expects calibration cache file for inference")
return False

if backend == 'all':
run_torch(model, input_tensors, params, precision)
run_torch_tensorrt(model, input_tensors, params, precision)
@@ -280,20 +293,25 @@ def load_model(params):
# Create random input tensor of certain size
torch.manual_seed(12345)

num_input = params.get('input').get('num_of_input')
num_input = params.get('input').get('num_inputs')
for precision in params.get('runtime').get('precision', 'fp32'):
input_tensors = []
num_input = params.get('input').get('num_of_input', 1)
num_input = params.get('input').get('num_inputs', 1)
for i in range(num_input):
inp_tensor = params.get('input').get('input' + str(i))
input_tensors.append(torch.randint(0, 2, tuple(d for d in inp_tensor), dtype=precision_to_dtype(precision)).cuda())

if is_trt_engine:
print("Warning, TensorRT engine file is configured. Please make sure the precision matches with the TRT engine for reliable results")

if not is_trt_engine and (precision == "fp16" or precision == "half"):
# If model is TensorRT serialized engine then model.half will report failure
model = model.half()

# Run inference
run(model, input_tensors, params, precision, is_trt_engine)
status = run(model, input_tensors, params, precision, is_trt_engine)
if status == False:
continue

# Generate report
print('Model Summary:')
5 changes: 5 additions & 0 deletions examples/benchmark/py/requirements.txt
@@ -0,0 +1,5 @@
timeit
numpy
argparse
pyyaml
pandas
