From 8a0d4d1f5ffd63d32559cd168d3c4c574e9f7308 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 19 Jul 2024 13:54:07 +0000 Subject: [PATCH 1/2] compiler: Fix parlang reductions over >= 4 loops --- devito/arch/archinfo.py | 26 +++++++++++--- devito/ir/clusters/algorithms.py | 58 +++++++++++++++++++----------- devito/passes/clusters/blocking.py | 15 ++++++-- tests/test_gpu_common.py | 23 ++++++++++-- 4 files changed, 93 insertions(+), 29 deletions(-) diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py index 0d35fa5aeb..8c8f00e362 100644 --- a/devito/arch/archinfo.py +++ b/devito/arch/archinfo.py @@ -1,17 +1,17 @@ """Collection of utilities to detect properties of the underlying architecture.""" -from subprocess import PIPE, Popen, DEVNULL, run - from functools import cached_property -import cpuinfo +from subprocess import PIPE, Popen, DEVNULL, run import ctypes -import numpy as np -import psutil import re import os import sys import json +import cpuinfo +import numpy as np +import psutil + from devito.logger import warning from devito.tools import as_tuple, all_equal, memoized_func @@ -664,6 +664,16 @@ def max_mem_trans_size(self, dtype): assert self.max_mem_trans_nbytes % np.dtype(dtype).itemsize == 0 return int(self.max_mem_trans_nbytes / np.dtype(dtype).itemsize) + def limits(self, compiler=None, language=None): + """ + Return the architecture-specific limits for the given compiler and + language. 
+ """ + return { + 'max-par-dims': np.inf, + 'max-block-dims': np.inf, + } + class Cpu64(Platform): @@ -832,6 +842,12 @@ def memavail(self, deviceid=0): except (AttributeError, KeyError): return None + def limits(self, compiler=None, language=None): + return { + 'max-par-dims': 3, + 'max-block-dims': 3, + } + class IntelDevice(Device): diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index fb144e5770..e4e3c9deac 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -504,13 +504,10 @@ def reduction_comms(clusters): return processed -def normalize(clusters, **kwargs): - options = kwargs['options'] - sregistry = kwargs['sregistry'] - +def normalize(clusters, sregistry=None, options=None, platform=None, **kwargs): clusters = normalize_nested_indexeds(clusters, sregistry) if options['mapify-reduce']: - clusters = normalize_reductions_dense(clusters, sregistry) + clusters = normalize_reductions_dense(clusters, sregistry, platform) else: clusters = normalize_reductions_minmax(clusters) clusters = normalize_reductions_sparse(clusters, sregistry) @@ -594,31 +591,49 @@ def normalize_reductions_minmax(cluster): return init + [cluster.rebuild(processed)] -def normalize_reductions_dense(cluster, sregistry): +def normalize_reductions_dense(cluster, sregistry, platform): """ Extract the right-hand sides of reduction Eq's in to temporaries. """ - return _normalize_reductions_dense(cluster, sregistry, {}) + return _normalize_reductions_dense(cluster, {}, sregistry, platform) @cluster_pass(mode='dense') -def _normalize_reductions_dense(cluster, sregistry, mapper): - dims = [d for d in cluster.ispace.itdims - if cluster.properties.is_parallel_atomic(d)] - if not dims: +def _normalize_reductions_dense(cluster, mapper, sregistry, platform): + """ + Transform augmented expressions whose left-hand side is a scalar into + map-reduces. 
+ + Examples + -------- + Given an increment expression such as + + s += f(u[x], v[x], ...) + + Turn it into + + r[x] = f(u[x], v[x], ...) + s += r[x] + """ + # The candidate Dimensions along which to perform the map part + candidates = [d for d in cluster.ispace.itdims + if cluster.properties.is_parallel_atomic(d)] + if not candidates: return cluster + # If there are more parallel dimensions than the maximum allowed by the + # target platform, we must restrain the number of candidates + max_par_dims = platform.limits()['max-par-dims'] + dims = candidates[-max_par_dims:] + + # All other dimensions must be sequentialized because the output Array + # is constrained to `dims` + sequentialize = candidates[:-max_par_dims] + processed = [] + properties = cluster.properties for e in cluster.exprs: if e.is_Reduction: - # Transform `e` into what is in essence an explicit map-reduce - # For example, turn: - # `s += f(u[x], v[x], ...)` - # into - # `r[x] = f(u[x], v[x], ...)` - # `s += r[x]` - # This makes it much easier to parallelize the map part regardless - # of the target backend lhs, rhs = e.args try: @@ -650,10 +665,13 @@ def _normalize_reductions_dense(cluster, sregistry, mapper): processed.extend([Eq(a.indexify(), rhs), e.func(lhs, a.indexify())]) + + for d in sequentialize: + properties = properties.sequentialize(d) else: processed.append(e) - return cluster.rebuild(processed) + return cluster.rebuild(exprs=processed, properties=properties) @cluster_pass(mode='sparse') diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index c91da4fe08..9ee0882190 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -164,8 +164,14 @@ def _has_atomic_blockable_dim(self, cluster, d): return any(cluster.properties.is_parallel_atomic(i) for i in set(cluster.ispace.itdims) - {d}) - def _has_enough_large_blockable_dims(self, cluster, d): - return len([i for i in set(cluster.ispace.itdims) - {d} + def 
_has_enough_large_blockable_dims(self, cluster, d, nested=False): + if nested: + _, ispace = cluster.ispace.split(d) + dims = set(ispace.itdims) + else: + ispace = cluster.ispace + dims = set(cluster.ispace.itdims) - {d} + return len([i for i in dims if (cluster.properties.is_parallel_relaxed(i) and not self._has_short_trip_count(i))]) >= 3 @@ -191,6 +197,11 @@ def callback(self, clusters, prefix): # to have more than three large blockable Dimensions return clusters + if self._has_enough_large_blockable_dims(c, d, nested=True): + # Virtually all device programming models forbid parallelism + # along more than three dimensions + return clusters + if any(self._has_short_trip_count(i) for i in c.ispace.itdims): properties = c.properties.block(d, 'small') elif self._has_data_reuse(c): diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 8f100a1082..3980b60cd2 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -8,13 +8,14 @@ from devito import (Constant, Eq, Inc, Grid, Function, ConditionalDimension, Dimension, MatrixSparseTimeFunction, SparseTimeFunction, SubDimension, SubDomain, SubDomainSet, TimeFunction, - Operator, configuration, switchconfig, TensorTimeFunction) + Operator, configuration, switchconfig, TensorTimeFunction, + Buffer) from devito.arch import get_gpu_info from devito.exceptions import InvalidArgument from devito.ir import (Conditional, Expression, Section, FindNodes, FindSymbols, retrieve_iteration_tree) from devito.passes.iet.languages.openmp import OmpIteration -from devito.types import DeviceID, DeviceRM, Lock, NPThreads, PThreadArray +from devito.types import DeviceID, DeviceRM, Lock, NPThreads, PThreadArray, Symbol from conftest import skipif @@ -147,6 +148,24 @@ def test_incr_perfect_outer(self): op() assert np.all(w.data == 10) + def test_reduction_many_dims(self): + grid = Grid(shape=(25, 25, 25)) + + u = TimeFunction(name='u', grid=grid, time_order=1, save=Buffer(1)) + s = Symbol(name='s', 
dtype=np.float32)
+
+        eqns = [Eq(s, 0),
+                Inc(s, 2*u + 1)]
+
+        op0 = Operator(eqns)
+        op1 = Operator(eqns, opt=('advanced', {'mapify-reduce': True}))
+
+        tree, = retrieve_iteration_tree(op0)
+        assert 'collapse(4) reduction(+:s)' in str(tree.root.pragmas[0])
+
+        tree, = retrieve_iteration_tree(op1)
+        assert 'collapse(3) reduction(+:s)' in str(tree[1].pragmas[0])
+

 class Bundle(SubDomain):
     """

From 8c62ae9dec7f535c68cc1dfe1678ea19065afde3 Mon Sep 17 00:00:00 2001
From: Fabio Luporini
Date: Mon, 22 Jul 2024 07:36:31 +0000
Subject: [PATCH 2/2] compiler: Remember to further improve block size

---
 devito/arch/archinfo.py   | 4 ++--
 devito/types/dimension.py | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py
index 8c8f00e362..225909cca1 100644
--- a/devito/arch/archinfo.py
+++ b/devito/arch/archinfo.py
@@ -670,8 +670,8 @@ def limits(self, compiler=None, language=None):
         language.
         """
         return {
-            'max-par-dims': np.inf,
-            'max-block-dims': np.inf,
+            'max-par-dims': sys.maxsize,
+            'max-block-dims': sys.maxsize,
         }
 

diff --git a/devito/types/dimension.py b/devito/types/dimension.py
index 9f10f98903..69e9633092 100644
--- a/devito/types/dimension.py
+++ b/devito/types/dimension.py
@@ -1327,6 +1327,8 @@ def _arg_values(self, interval, grid=None, args=None, **kwargs):
             # no value supplied -> the sub-block will span the entire block
             return {name: args[self.parent.step.name]}
         else:
+            # TODO: Check the args for space order and apply heuristics (e.g.,
+            # `2*space_order`?) for even better block sizes
             value = self._arg_defaults()[name]
             if value <= args[self.root.max_name] - args[self.root.min_name] + 1:
                 return {name: value}