From 8a0d4d1f5ffd63d32559cd168d3c4c574e9f7308 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 19 Jul 2024 13:54:07 +0000 Subject: [PATCH 1/2] compiler: Fix parlang reductions over >= 4 loops --- devito/arch/archinfo.py | 26 +++++++++++--- devito/ir/clusters/algorithms.py | 58 +++++++++++++++++++----------- devito/passes/clusters/blocking.py | 15 ++++++-- tests/test_gpu_common.py | 23 ++++++++++-- 4 files changed, 93 insertions(+), 29 deletions(-) diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py index 0d35fa5aeb..8c8f00e362 100644 --- a/devito/arch/archinfo.py +++ b/devito/arch/archinfo.py @@ -1,17 +1,17 @@ """Collection of utilities to detect properties of the underlying architecture.""" -from subprocess import PIPE, Popen, DEVNULL, run - from functools import cached_property -import cpuinfo +from subprocess import PIPE, Popen, DEVNULL, run import ctypes -import numpy as np -import psutil import re import os import sys import json +import cpuinfo +import numpy as np +import psutil + from devito.logger import warning from devito.tools import as_tuple, all_equal, memoized_func @@ -664,6 +664,16 @@ def max_mem_trans_size(self, dtype): assert self.max_mem_trans_nbytes % np.dtype(dtype).itemsize == 0 return int(self.max_mem_trans_nbytes / np.dtype(dtype).itemsize) + def limits(self, compiler=None, language=None): + """ + Return the architecture-specific limits for the given compiler and + language. 
+ """ + return { + 'max-par-dims': np.inf, + 'max-block-dims': np.inf, + } + class Cpu64(Platform): @@ -832,6 +842,12 @@ def memavail(self, deviceid=0): except (AttributeError, KeyError): return None + def limits(self, compiler=None, language=None): + return { + 'max-par-dims': 3, + 'max-block-dims': 3, + } + class IntelDevice(Device): diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index fb144e5770..e4e3c9deac 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -504,13 +504,10 @@ def reduction_comms(clusters): return processed -def normalize(clusters, **kwargs): - options = kwargs['options'] - sregistry = kwargs['sregistry'] - +def normalize(clusters, sregistry=None, options=None, platform=None, **kwargs): clusters = normalize_nested_indexeds(clusters, sregistry) if options['mapify-reduce']: - clusters = normalize_reductions_dense(clusters, sregistry) + clusters = normalize_reductions_dense(clusters, sregistry, platform) else: clusters = normalize_reductions_minmax(clusters) clusters = normalize_reductions_sparse(clusters, sregistry) @@ -594,31 +591,49 @@ def normalize_reductions_minmax(cluster): return init + [cluster.rebuild(processed)] -def normalize_reductions_dense(cluster, sregistry): +def normalize_reductions_dense(cluster, sregistry, platform): """ Extract the right-hand sides of reduction Eq's in to temporaries. """ - return _normalize_reductions_dense(cluster, sregistry, {}) + return _normalize_reductions_dense(cluster, {}, sregistry, platform) @cluster_pass(mode='dense') -def _normalize_reductions_dense(cluster, sregistry, mapper): - dims = [d for d in cluster.ispace.itdims - if cluster.properties.is_parallel_atomic(d)] - if not dims: +def _normalize_reductions_dense(cluster, mapper, sregistry, platform): + """ + Transform augmented expressions whose left-hand side is a scalar into + map-reduces. 
+ + Examples + -------- + Given an increment expression such as + + s += f(u[x], v[x], ...) + + Turn it into + + r[x] = f(u[x], v[x], ...) + s += r[x] + """ + # The candidate Dimensions along which to perform the map part + candidates = [d for d in cluster.ispace.itdims + if cluster.properties.is_parallel_atomic(d)] + if not candidates: return cluster + # If there are more parallel dimensions than the maximum allowed by the + # target platform, we must restrain the number of candidates + max_par_dims = platform.limits()['max-par-dims'] + dims = candidates[-max_par_dims:] + + # All other dimensions must be sequentialized because the output Array + # is constrained to `dims` + sequentialize = candidates[:-max_par_dims] + processed = [] + properties = cluster.properties for e in cluster.exprs: if e.is_Reduction: - # Transform `e` into what is in essence an explicit map-reduce - # For example, turn: - # `s += f(u[x], v[x], ...)` - # into - # `r[x] = f(u[x], v[x], ...)` - # `s += r[x]` - # This makes it much easier to parallelize the map part regardless - # of the target backend lhs, rhs = e.args try: @@ -650,10 +665,13 @@ def _normalize_reductions_dense(cluster, sregistry, mapper): processed.extend([Eq(a.indexify(), rhs), e.func(lhs, a.indexify())]) + + for d in sequentialize: + properties = properties.sequentialize(d) else: processed.append(e) - return cluster.rebuild(processed) + return cluster.rebuild(exprs=processed, properties=properties) @cluster_pass(mode='sparse') diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index c91da4fe08..9ee0882190 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -164,8 +164,14 @@ def _has_atomic_blockable_dim(self, cluster, d): return any(cluster.properties.is_parallel_atomic(i) for i in set(cluster.ispace.itdims) - {d}) - def _has_enough_large_blockable_dims(self, cluster, d): - return len([i for i in set(cluster.ispace.itdims) - {d} + def 
_has_enough_large_blockable_dims(self, cluster, d, nested=False): + if nested: + _, ispace = cluster.ispace.split(d) + dims = set(ispace.itdims) + else: + ispace = cluster.ispace + dims = set(cluster.ispace.itdims) - {d} + return len([i for i in dims if (cluster.properties.is_parallel_relaxed(i) and not self._has_short_trip_count(i))]) >= 3 @@ -191,6 +197,11 @@ def callback(self, clusters, prefix): # to have more than three large blockable Dimensions return clusters + if self._has_enough_large_blockable_dims(c, d, nested=True): + # Virtually all device programming models forbid parallelism + # along more than three dimensions + return clusters + if any(self._has_short_trip_count(i) for i in c.ispace.itdims): properties = c.properties.block(d, 'small') elif self._has_data_reuse(c): diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 8f100a1082..3980b60cd2 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -8,13 +8,14 @@ from devito import (Constant, Eq, Inc, Grid, Function, ConditionalDimension, Dimension, MatrixSparseTimeFunction, SparseTimeFunction, SubDimension, SubDomain, SubDomainSet, TimeFunction, - Operator, configuration, switchconfig, TensorTimeFunction) + Operator, configuration, switchconfig, TensorTimeFunction, + Buffer) from devito.arch import get_gpu_info from devito.exceptions import InvalidArgument from devito.ir import (Conditional, Expression, Section, FindNodes, FindSymbols, retrieve_iteration_tree) from devito.passes.iet.languages.openmp import OmpIteration -from devito.types import DeviceID, DeviceRM, Lock, NPThreads, PThreadArray +from devito.types import DeviceID, DeviceRM, Lock, NPThreads, PThreadArray, Symbol from conftest import skipif @@ -147,6 +148,24 @@ def test_incr_perfect_outer(self): op() assert np.all(w.data == 10) + def test_reduction_many_dims(self): + grid = Grid(shape=(25, 25, 25)) + + u = TimeFunction(name='u', grid=grid, time_order=1, save=Buffer(1)) + s = Symbol(name='s', 
dtype=np.float32)
+
+        eqns = [Eq(s, 0),
+                Inc(s, 2*u + 1)]
+
+        op0 = Operator(eqns)
+        op1 = Operator(eqns, opt=('advanced', {'mapify-reduce': True}))
+
+        tree, = retrieve_iteration_tree(op0)
+        assert 'collapse(4) reduction(+:s)' in str(tree.root.pragmas[0])
+
+        tree, = retrieve_iteration_tree(op1)
+        assert 'collapse(3) reduction(+:s)' in str(tree[1].pragmas[0])
+

 class Bundle(SubDomain):
     """

From 8c62ae9dec7f535c68cc1dfe1678ea19065afde3 Mon Sep 17 00:00:00 2001
From: Fabio Luporini
Date: Mon, 22 Jul 2024 07:36:31 +0000
Subject: [PATCH 2/2] compiler: Remember to further improve block size

---
 devito/arch/archinfo.py   | 4 ++--
 devito/types/dimension.py | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py
index 8c8f00e362..225909cca1 100644
--- a/devito/arch/archinfo.py
+++ b/devito/arch/archinfo.py
@@ -670,8 +670,8 @@ def limits(self, compiler=None, language=None):
         language.
         """
         return {
-            'max-par-dims': np.inf,
-            'max-block-dims': np.inf,
+            'max-par-dims': sys.maxsize,
+            'max-block-dims': sys.maxsize,
         }
 

diff --git a/devito/types/dimension.py b/devito/types/dimension.py
index 9f10f98903..69e9633092 100644
--- a/devito/types/dimension.py
+++ b/devito/types/dimension.py
@@ -1327,6 +1327,8 @@ def _arg_values(self, interval, grid=None, args=None, **kwargs):
             # no value supplied -> the sub-block will span the entire block
             return {name: args[self.parent.step.name]}
         else:
+            # TODO: Check the args for space order and apply heuristics (e.g.,
+            # `2*space_order`?) for even better block sizes
             value = self._arg_defaults()[name]
             if value <= args[self.root.max_name] - args[self.root.min_name] + 1:
                 return {name: value}