14 changes: 14 additions & 0 deletions core/runtime/TRTEngine.cpp
@@ -237,6 +237,12 @@ TRTEngine::TRTEngine(
out_binding_names[pyt_idx] = binding_name;
}
num_io = std::make_pair(inputs_size, outputs);

this->io_size = this->cuda_engine->getNbIOTensors();
for (int64_t i = 0; i < this->in_binding_names.size(); i++) {
this->isShapeInferenceIO[this->in_binding_names[i]] =
this->cuda_engine->isShapeInferenceIO(this->in_binding_names[i].c_str());
}
}

#ifndef NDEBUG
@@ -281,6 +287,14 @@ void TRTEngine::enable_profiling() {
exec_ctx->setProfiler(trt_engine_profiler.get());
}

void TRTEngine::set_output_tensors_as_unowned(bool enable) {
this->output_tensors_are_unowned = enable;
}

bool TRTEngine::are_output_tensors_unowned() {
return this->output_tensors_are_unowned;
}

void TRTEngine::set_profile_format(std::string format) {
if (format == "trex") {
this->trt_engine_profiler->set_profile_format(TraceFormat::kTREX);
6 changes: 6 additions & 0 deletions core/runtime/TRTEngine.h
@@ -103,6 +103,9 @@ struct TRTEngine : torch::CustomClassHolder {
std::shared_ptr<nvinfer1::ICudaEngine> cuda_engine;
std::shared_ptr<nvinfer1::IExecutionContext> exec_ctx;
std::pair<uint64_t, uint64_t> num_io;
uint64_t io_size;
std::map<std::string, bool> isShapeInferenceIO;
bool output_tensors_are_unowned = false;
std::string name;
RTDevice device_info;

@@ -159,6 +162,8 @@ struct TRTEngine : torch::CustomClassHolder {
int64_t get_automatic_device_memory_budget();
std::vector<at::Tensor> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
void set_pre_allocated_outputs(bool enable);
void set_output_tensors_as_unowned(bool enable);
bool are_output_tensors_unowned();
TorchTRTRuntimeStates runtime_states;
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
static const char BINDING_DELIM = '%';
@@ -176,6 +181,7 @@
std::string shape_key = "None";
bool use_pre_allocated_outputs = false;
std::vector<at::Tensor> pre_allocated_outputs;
std::vector<at::Tensor> allocated_outputs;

// Output Allocator-Related Functionality
bool requires_output_allocator = false; // engine requires output allocator
57 changes: 32 additions & 25 deletions core/runtime/execute_engine.cpp
@@ -117,7 +117,7 @@ void setup_input_tensors(
auto shape = core::util::toVec(dims);
LOG_DEBUG("Input Name: " << name << " Shape: " << dims);

if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
if (compiled_engine->isShapeInferenceIO[name]) {
// Shape tensor inputs are casted to int64 explicitly.
// Refer to
// https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
@@ -145,10 +145,10 @@
// Create a new persistent input buffer
compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
}

TORCHTRT_CHECK(
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");

if (need_cudagraphs_record or compiled_engine->allocated_outputs.size() == 0) {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
}
if (cudagraphs_enabled) {
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
@@ -217,7 +217,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
compiled_engine->cudagraph.reset();
}

std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
std::vector<at::Tensor> outputs;

// Initialize inputs and outputs to be available throughout the succeeding scopes
{ // Input Setup
@@ -226,10 +226,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
input_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
}

setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
// Check if input shapes can be inferred.
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
int32_t const io_size = static_cast<int32_t>(compiled_engine->io_size);
std::vector<char const*> names(io_size);
int32_t const nbNames = compiled_engine->exec_ctx->inferShapes(names.size(), names.data());
TORCHTRT_CHECK(
@@ -240,6 +239,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}

{ // Output Setup
bool new_outputs = false;
std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
if (compiled_engine->profile_execution) {
output_profiler_guard =
@@ -248,26 +248,33 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
if (can_use_pre_allocated_outputs) {
outputs = compiled_engine->pre_allocated_outputs;
} else {
outputs = create_output_tensors(compiled_engine);
if (compiled_engine->allocated_outputs.size() == 0 or compiled_engine->output_tensors_are_unowned or
shape_changed) {
compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
new_outputs = true;
}
outputs = compiled_engine->allocated_outputs;
}

for (auto output_indices : compiled_engine->out_binding_map) {
auto pyt_idx = output_indices.second;
std::string name = compiled_engine->out_binding_names[pyt_idx];
if (need_cudagraphs_record) {
// If we are recording the cuda graph then we need to update the persistent output buffer
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
}
if (new_outputs) {
for (auto output_indices : compiled_engine->out_binding_map) {
auto pyt_idx = output_indices.second;
std::string name = compiled_engine->out_binding_names[pyt_idx];
if (need_cudagraphs_record) {
// If we are recording the cuda graph then we need to update the persistent output buffer
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
}

if (cudagraphs_enabled) {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
"Error while setting the output tensor address");
} else {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
"Error while setting the output tensor address");
if (cudagraphs_enabled) {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
"Error while setting the output tensor address");
} else {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
"Error while setting the output tensor address");
}
}
}
}
2 changes: 2 additions & 0 deletions core/runtime/register_jit_hooks.cpp
@@ -90,6 +90,8 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
.def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
.def("infer_outputs", &TRTEngine::infer_outputs)
.def("reset_captured_graph", &TRTEngine::reset_captured_graph)
.def("set_output_tensors_as_unowned", &TRTEngine::set_output_tensors_as_unowned)
.def("are_output_tensors_unowned", &TRTEngine::are_output_tensors_unowned)
.def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
.def_readwrite("use_output_allocator_outputs", &TRTEngine::use_output_allocator_outputs)
.def_property(
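For context, a brief sketch of how the two new bindings could be called from Python once an engine is set up; the engine attribute and a live (non-lazy) engine are assumptions here, not something this diff guarantees.

def mark_engine_outputs_unowned(trt_module) -> None:
    # trt_module is assumed to wrap a TorchScript TRTEngine custom-class instance
    # (e.g. torch.classes.tensorrt.Engine) on its engine attribute, as TorchTensorRTModule does.
    trt_module.engine.set_output_tensors_as_unowned(True)
    assert trt_module.engine.are_output_tensors_unowned()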
14 changes: 12 additions & 2 deletions py/torch_tensorrt/dynamo/_compiler.py
@@ -557,7 +557,7 @@ def compile(
stacklevel=2,
)

if kwargs.get("use_explicit_typing", False) == False:
if not kwargs.get("use_explicit_typing", False):
warnings.warn(
"`use_explicit_typing` is deprecated. This setting will be removed and you should enable autocast instead.",
DeprecationWarning,
@@ -1070,14 +1070,24 @@ def preserve_module_specs(
) as f:
f.write(trt_module.get_layer_info())

# Mark output tensors as unowned only for the last TRT module, since the user has direct access to that module's output tensors

# Parse the graph I/O and store it in dryrun tracker
parse_graph_io(gm, dryrun_tracker)

# Replace all FX Modules with TRT Modules
for name, trt_module in trt_modules.items():
setattr(partitioned_module, name, trt_module)
if settings.lazy_engine_init and not settings.enable_cross_compile_for_windows:
getattr(partitioned_module, name).setup_engine()
trt_module = getattr(partitioned_module, name)
trt_module.setup_engine()

output_node = list(partitioned_module.graph.nodes)[-1]
for arg in output_node.args:
target = arg[0].target
if "acc" not in target:
continue
getattr(partitioned_module, target).set_output_tensors_as_unowned(True)

# Reset settings object to user specification after fallback to global partitioning mode
if fast_partitioner_failed:
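For context, a small self-contained sketch of the FX graph layout the loop above relies on: the graph's final node is the output node and its args hold the returned values, which for partitioned modules are typically call_module nodes named like _run_on_acc_0 by the partitioner. The toy module below is purely illustrative and not part of this change.

import torch
import torch.fx

class Toy(torch.nn.Module):
    def forward(self, x):
        return x + 1

gm = torch.fx.symbolic_trace(Toy())
output_node = list(gm.graph.nodes)[-1]
print(output_node.op)       # "output"
print(output_node.args[0])  # the node(s) whose results the graph returns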
80 changes: 61 additions & 19 deletions py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -174,6 +174,7 @@ def __init__(
self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
self._caller_stream: Optional[torch.cuda.Stream] = None
self._engine_stream: Optional[torch.cuda.Stream] = None
self.output_tensors: Optional[List[torch.Tensor]] = None

# TODO: Make the below a Dictionary {shape: cudagraph}
self.shape_key: Optional[str] = None
@@ -218,10 +219,27 @@ def __init__(
self.requires_output_allocator = requires_output_allocator
self.output_allocator: Optional[DynamicOutputAllocator] = None
self.use_output_allocator_outputs = False

self.device = torch.cuda.current_device()
self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
# If the output tensors are not owned by the engine (output_tensors_are_unowned=True), new output tensors must be created on each forward pass
self.output_tensors_are_unowned = False
if self.serialized_engine is not None and not self.settings.lazy_engine_init:
self.setup_engine()

def set_output_tensors_as_unowned(self, enabled: bool) -> None:
"""
Set whether the output tensors of this engine are owned solely by the Torch-TensorRT runtime or may be shared with the user.
If the tensors are not owned by the runtime, they must be recreated on every forward call, which may have performance implications.
Typically only the final engine in a graph needs unowned output tensors, and intermediate engines gain performance by managing their own standing memory.
Therefore this should be set to True only for the final module in a graph and left False for intermediate modules.

Args:
enabled: bool
If True, the output tensors are treated as unowned (potentially shared with the user).

"""
self.output_tensors_are_unowned = enabled

def get_streamable_device_memory_budget(self) -> Any:
return self.engine.streamable_weights_size

@@ -288,16 +306,25 @@ def setup_engine(self) -> None:
for output_name in self.output_names
]
self.output_shapes = [
self.engine.get_tensor_shape(output_name)
tuple(self.context.get_tensor_shape(output_name))
for output_name in self.output_names
]

self.shape_key = "".join(
str(tuple(t)).replace(" ", "") for t in self.input_shapes
)

if self.requires_output_allocator:
self.create_output_allocator()

if torch_tensorrt.runtime.get_cudagraphs_mode():
self.cudagraph = torch.cuda.CUDAGraph()

self.is_shape_inference_io = {
input_name: self.engine.is_shape_inference_io(input_name)
for input_name in self.input_names
}

def _check_initialized(self) -> None:
if not self.initialized:
raise RuntimeError("PythonTorchTensorRTModule is not initialized.")
@@ -383,16 +410,19 @@ def setup_input_tensors(

# For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
# as per TensorRT requirements
if self.engine.is_shape_inference_io(input_name):
if self.is_shape_inference_io[input_name]:
# Shape tensor inputs are casted to int64 explicitly
# Currently Torch CPU pointers are not working; numpy pointers are used instead
# to refer to underlying memory
inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy()
self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data)
else:
self.context.set_input_shape(
input_name, tuple(contiguous_inputs[i].shape)
)
if (
need_cudagraphs_record or self.output_tensors is None
): # First time execution:
self.context.set_input_shape(
input_name, tuple(contiguous_inputs[i].shape)
)
if cudagraphs_enabled:
self._input_buffers[i].copy_(contiguous_inputs[i])
self.context.set_tensor_address(
@@ -411,7 +441,7 @@ def create_output_tensors(self) -> List[torch.Tensor]:
output = torch.empty(
size=self.output_shapes[o],
dtype=self.output_dtypes[o],
device=torch.cuda.current_device(),
device=self.device,
)
outputs.append(output)
return outputs
@@ -460,7 +490,9 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}."

self.setup_input_tensors(
contiguous_inputs, self.cudagraphs_enabled, need_cudagraphs_record
contiguous_inputs,
self.cudagraphs_enabled,
need_cudagraphs_record,
)

if shape_changed:
@@ -482,15 +514,22 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
if can_use_pre_allocated_outputs:
outputs = self.pre_allocated_outputs
else:
self.output_shapes = [
tuple(self.context.get_tensor_shape(output_name))
for output_name in self.output_names
]
if shape_changed or self.output_tensors is None:
self.output_shapes = [
tuple(self.context.get_tensor_shape(output_name))
for output_name in self.output_names
]
if DYNAMIC_DIM in self.output_shapes:
raise ValueError(
"Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
)
outputs = self.create_output_tensors()
if (
self.output_tensors is None
or self.output_tensors_are_unowned
or shape_changed
):
self.output_tensors = self.create_output_tensors()
outputs = self.output_tensors

for o, output_name in enumerate(self.output_names):
if need_cudagraphs_record:
@@ -751,13 +790,13 @@ def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
# Representation of input shapes to a given model
# Shapes are concatenated as so:
# x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
tensor_inputs = []
for t in inputs:
if not isinstance(t, torch.Tensor):
return True
tensor_inputs.append(t)
if not all(isinstance(t, torch.Tensor) for t in inputs):
return True

new_shape_key = "".join(
str(tuple(t.shape)).replace(" ", "") for t in tensor_inputs
str(tuple(t.shape)).replace(" ", "")
for t in inputs
if isinstance(t, torch.Tensor)
)

# If the new shape key differs from the existing one,
@@ -768,3 +807,6 @@ def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
return True

return False

def are_output_tensors_unowned(self) -> bool:
return self.output_tensors_are_unowned
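A minimal usage sketch of what the unowned-output flag protects against, assuming a single-output compiled module trt_gm whose final engine would otherwise reuse its cached output buffers across calls; all names below are illustrative, not part of this diff.

import torch

def outputs_stay_valid(trt_gm: torch.nn.Module, x1: torch.Tensor, x2: torch.Tensor) -> bool:
    first = trt_gm(x1)    # caller keeps a reference to the first result
    second = trt_gm(x2)   # with unowned outputs, a fresh tensor is allocated here,
                          # so first is not silently overwritten by the second run
    return first.data_ptr() != second.data_ptr()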
13 changes: 13 additions & 0 deletions py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
@@ -156,6 +156,11 @@ def _pack_engine_info(self) -> List[str | bytes]:
metadata = {
"settings": self.settings,
"weight_name_map": self.weight_name_map,
"requires_new_output_tensor": (
False
if self.engine is None
else self.engine.get_requires_new_output_tensor()
),
}
target_platform = (
Platform.current_platform()
@@ -284,6 +289,8 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None:
metadata = TorchTensorRTModule.decode_metadata(serialized_metadata)
self.settings = metadata["settings"]
self.weight_name_map = metadata["weight_name_map"]
self.output_tensors_are_unowned = metadata["output_tensors_are_unowned"]
self.engine.set_output_tensors_as_unowned(self.output_tensors_are_unowned)

else:
self.engine = None
@@ -355,6 +362,12 @@ def enable_profiling(
self.engine.enable_profiling()
self.engine.set_profile_format(profile_format)

def set_output_tensors_as_unowned(self, enabled: bool) -> None:
self.engine.set_output_tensors_as_unowned(enabled)

def are_output_tensors_unowned(self) -> bool:
return self.engine.are_output_tensors_unowned() # type: ignore[no-any-return]

def disable_profiling(self) -> None:
"""Disable the profiler"""
if self.engine is None:
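A hedged sketch of the serialization round trip this metadata entry enables; it assumes get_extra_state and set_extra_state are wired up as usual for this module, that the engine is live (non-lazy), and that the flag is packed and restored under the same key.

def flag_survives_roundtrip(trt_module) -> bool:
    # trt_module is assumed to be a TorchTensorRTModule with a live engine.
    trt_module.set_output_tensors_as_unowned(True)
    state = trt_module.get_extra_state()   # packs the engine blob plus encoded metadata
    trt_module.set_extra_state(state)      # rebuilds the engine and restores the metadata
    return trt_module.are_output_tensors_unowned()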