Skip to content

Commit

Permalink
Merge pull request #1061 from spcl/fpga_kernel_detection_option
Browse files Browse the repository at this point in the history
Add kernel detection flag
  • Loading branch information
TizianoDeMatteis committed Jul 20, 2022
2 parents 899a1f3 + 112afe8 commit 16b1d35
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 36 deletions.
46 changes: 23 additions & 23 deletions dace/codegen/targets/fpga.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,22 +486,26 @@ def generate_state(self, sdfg: dace.SDFG, state: dace.SDFGState, function_stream
# Then, try to split these components further
subgraphs = dace.sdfg.concurrent_subgraphs(state)

start_kernel = 0
for sg in subgraphs:
# Determine kernels in state
num_kernels, dependencies = self.partition_kernels(sg, default_kernel=start_kernel)
if num_kernels > 1:
# For each kernel, derive the corresponding subgraphs
# and keep track of dependencies
kernels.extend(self._kernels_subgraphs(sg, dependencies))
self._kernels_dependencies.update(dependencies)
else:
kernels.append((sg, start_kernel))
start_kernel = start_kernel + num_kernels
if Config.get_bool("compiler", "fpga", "concurrent_kernel_detection"):
start_kernel = 0
for sg in subgraphs:
# Determine kernels in state
num_kernels, dependencies = self.partition_kernels(sg, default_kernel=start_kernel)
if num_kernels > 1:
# For each kernel, derive the corresponding subgraphs
# and keep track of dependencies
kernels.extend(self._kernels_subgraphs(sg, dependencies))
self._kernels_dependencies.update(dependencies)
else:
kernels.append((sg, start_kernel))
start_kernel = start_kernel + num_kernels

# There is no need to generate additional kernels if the number of found kernels
# is equal to the number of connected components: use PEs instead (only one kernel)
if len(subgraphs) == len(kernels):
# There is no need to generate additional kernels if the number of found kernels
# is equal to the number of connected components: use PEs instead (only one kernel)
if len(subgraphs) == len(kernels):
kernels = [(state, 0)]
else:
# Only one FPGA kernel (possibly with multiple PEs)
kernels = [(state, 0)]

self._num_kernels = len(kernels)
Expand Down Expand Up @@ -920,8 +924,7 @@ def make_parameters(self, sdfg: SDFG, state: SDFGState, subgraphs):
trace_type, trace_bank = parse_location_bank(trace_desc)
if (bank is not None and bank_type is not None
and (bank != trace_bank or bank_type != trace_type)):
raise cgx.CodegenError("Found inconsistent memory bank "
f"specifier for {trace_name}.")
raise cgx.CodegenError("Found inconsistent memory bank " f"specifier for {trace_name}.")
bank = trace_bank
bank_type = trace_type

Expand Down Expand Up @@ -1460,8 +1463,7 @@ def _emit_copy(self, sdfg, state_id, src_node, src_storage, dst_node, dst_storag

if (not sum(copy_shape) == 1 and
(not isinstance(memlet.subset, subsets.Range) or any([step != 1 for _, _, step in memlet.subset]))):
raise NotImplementedError("Only contiguous copies currently "
"supported for FPGA codegen.")
raise NotImplementedError("Only contiguous copies currently " "supported for FPGA codegen.")

if host_to_device or device_to_device:
host_dtype = sdfg.data(src_node.data).dtype
Expand Down Expand Up @@ -1709,8 +1711,7 @@ def _emit_copy(self, sdfg, state_id, src_node, src_storage, dst_node, dst_storag
@staticmethod
def make_opencl_parameter(name, desc):
if isinstance(desc, dt.Array):
return (f"hlslib::ocl::Buffer<{desc.dtype.ctype}, "
f"hlslib::ocl::Access::readWrite> &{name}")
return (f"hlslib::ocl::Buffer<{desc.dtype.ctype}, " f"hlslib::ocl::Access::readWrite> &{name}")
else:
return (desc.as_arg(with_types=True, name=name))

Expand Down Expand Up @@ -1970,8 +1971,7 @@ def _generate_MapEntry(self, sdfg, dfg, state_id, node, function_stream, callsit
elif np.issubdtype(np.dtype(end_type.dtype.type), np.unsignedinteger):
loop_var_type = "size_t"
except (UnboundLocalError):
raise UnboundLocalError('Pipeline scopes require '
'specialized bound values')
raise UnboundLocalError('Pipeline scopes require ' 'specialized bound values')
except (TypeError):
# Raised when the evaluation of begin or skip fails.
# This could occur, for example, if they are defined in terms of other symbols, which
Expand Down
11 changes: 11 additions & 0 deletions dace/config_schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,17 @@ required:
Target Xilinx ("xilinx") or Intel ("intel_fpga") FPGAs when
generating code.
concurrent_kernel_detection:
type: bool
default: false
title: Detect parts of an SDFG that can run in parallel
description: >
If set to false, DaCe will generate a single FPGA kernel for each SDFG
state, placing each weakly connected component found in the state in a
different Processing Element.
If true, a heuristic will further inspect each weakly connected component
for other parallelism opportunities (e.g., branches of the SDFG
that can be executed in parallel), creating the corresponding kernels.
#############################################
# FPGA (Xilinx) compiler flags
xilinx:
Expand Down
5 changes: 3 additions & 2 deletions tests/fpga/fpga_instrumentation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from dace.sdfg.utils import is_fpga_kernel
from dace.fpga_testing import fpga_test
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
from dace import config
import numpy as np
import re


def make_sdfg(make_tmp_local: bool):
"""
Creates an SDFG that has a left and a right branch writing into two
Expand Down Expand Up @@ -114,7 +114,8 @@ def test_instrumentation_single():
@fpga_test()
def test_instrumentation_multiple():
sdfg = make_sdfg(False)
run_program(sdfg)
with config.set_temporary("compiler", "fpga", "concurrent_kernel_detection", value=True):
run_program(sdfg)
report = sdfg.get_latest_report()
# The report should contain six runtime entries: the per-kernel runtimes plus the state-level runtimes
assert len(re.findall(r"[0-9\.]+\s+[0-9\.]+\s+[0-9\.]+\s+[0-9\.]+\s+", str(report))) == 6
Expand Down
27 changes: 16 additions & 11 deletions tests/fpga/kernel_detection_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from dace.sdfg.utils import is_fpga_kernel
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
from dace.fpga_testing import fpga_test
from dace import config


def count_kernels(sdfg: dace.SDFG):
Expand Down Expand Up @@ -49,7 +50,6 @@ def test_kernels_inside_component_0():
The four maps should belong to three distinct kernels
:return:
'''

@dace.program
def kernels_inside_component_0(x: dace.float32[8], y: dace.float32[8], v: dace.float32[8], w: dace.float32[8],
z: dace.float32[8]):
Expand All @@ -69,7 +69,8 @@ def kernels_inside_component_0(x: dace.float32[8], y: dace.float32[8], v: dace.f
if is_fpga_kernel(sdfg, state):
state.instrument = dace.InstrumentationType.FPGA

res = sdfg(x=x, y=y, v=v, w=w, z=z)
with config.set_temporary("compiler", "fpga", "concurrent_kernel_detection", value=True):
res = sdfg(x=x, y=y, v=v, w=w, z=z)
assert count_kernels(sdfg) == 3
assert np.allclose(res, x + y + v + w + z)

Expand Down Expand Up @@ -103,7 +104,6 @@ def test_kernels_inside_component_1():
The five Maps should belong to 5 distinct kernels
'''

@dace.program
def kernels_inside_component_1(x: dace.float32[8], y: dace.float32[8], v: dace.float32[8], w: dace.float32[8],
z: dace.float32[8], t: dace.float32[8], alpha: dace.float32, beta: dace.float32):
Expand All @@ -124,7 +124,9 @@ def kernels_inside_component_1(x: dace.float32[8], y: dace.float32[8], v: dace.f

sdfg = kernels_inside_component_1.to_sdfg()
sdfg.apply_transformations([FPGATransformSDFG, InlineSDFG])
program = sdfg.compile()

with config.set_temporary("compiler", "fpga", "concurrent_kernel_detection", value=True):
program = sdfg.compile()
assert count_kernels(sdfg) == 5
program(x=x, y=y, v=v, w=w, z=z, t=t, alpha=alpha, beta=beta)
ref_z = alpha * (x + y + v + w)
Expand Down Expand Up @@ -154,7 +156,6 @@ def test_kernels_inside_component_2():
:return:
'''

@dace.program
def kernels_inside_component_2(x: dace.float32[8], y: dace.float32[8], v: dace.float32[8], z: dace.float32[8],
t: dace.float32[8]):
Expand All @@ -169,7 +170,8 @@ def kernels_inside_component_2(x: dace.float32[8], y: dace.float32[8], v: dace.f

sdfg = kernels_inside_component_2.to_sdfg()
sdfg.apply_transformations([FPGATransformSDFG, InlineSDFG])
program = sdfg.compile()
with config.set_temporary("compiler", "fpga", "concurrent_kernel_detection", value=True):
program = sdfg.compile()

# NOTE: here we have only one kernel since subgraph detection already
# detects two PEs
Expand Down Expand Up @@ -216,7 +218,9 @@ def kernels_lns_inside_component(A: dace.float32[8, 8], x: dace.float32[8], B: d

sdfg = kernels_lns_inside_component.to_sdfg()
sdfg.apply_transformations([FPGATransformSDFG, InlineSDFG])
program = sdfg.compile()

with config.set_temporary("compiler", "fpga", "concurrent_kernel_detection", value=True):
program = sdfg.compile()

assert count_kernels(sdfg) == 3
z = program(A=A, x=x, B=B, y=y)
Expand Down Expand Up @@ -246,7 +250,6 @@ def test_kernels_inside_components_0():
The three maps should belong to three distinct kernels
'''

@dace.program
def kernels_inside_components_0(x: dace.float32[8], y: dace.float32[8], v: dace.float32[8], w: dace.float32[8],
xx: dace.float32[8], yy: dace.float32[8], vv: dace.float32[8], ww: dace.float32[8]):
Expand All @@ -265,7 +268,9 @@ def kernels_inside_components_0(x: dace.float32[8], y: dace.float32[8], v: dace.

sdfg = kernels_inside_components_0.to_sdfg()
sdfg.apply_transformations([FPGATransformSDFG, InlineSDFG])
program = sdfg.compile()

with config.set_temporary("compiler", "fpga", "concurrent_kernel_detection", value=True):
program = sdfg.compile()

assert count_kernels(sdfg) == 6
z, zz = program(x=x, y=y, v=v, w=w, xx=xx, yy=yy, vv=vv, ww=ww)
Expand Down Expand Up @@ -294,7 +299,6 @@ def test_kernels_inside_components_multiple_states():
The three maps should belong to three distinct kernels
:return:
'''

def make_sdfg(dtype=dace.float32):
sdfg = dace.SDFG("multiple_kernels_multiple_states")
n = dace.symbol("size")
Expand Down Expand Up @@ -543,7 +547,8 @@ def make_sdfg(dtype=dace.float32):
zz = np.random.rand(8).astype(np.float32)

sdfg = make_sdfg()
program = sdfg.compile()
with config.set_temporary("compiler", "fpga", "concurrent_kernel_detection", value=True):
program = sdfg.compile()
assert count_kernels(sdfg) == 6
program(z=z, zz=zz, x=x, y=y, v=v, w=w, xx=xx, yy=yy, vv=vv, ww=ww, size=8)
assert np.allclose(z, x + y + v + w)
Expand Down

0 comments on commit 16b1d35

Please sign in to comment.