Merge pull request #447 from itamarst/446-dask-process
#446: dask.persist() support.
itamarst committed Jan 23, 2020
2 parents 21765e8 + 9bf62d9 commit 4470cc1
Showing 5 changed files with 181 additions and 29 deletions.
11 changes: 11 additions & 0 deletions docs/source/news.rst
@@ -1,6 +1,17 @@
 What's New
 ==========
 
+1.12.0
+^^^^^^
+
+Features:
+
+* Dask support now includes tracing of ``dask.persist()``, via the wrapper API ``eliot.dask.persist_with_trace()``.
+
+Bug fixes:
+
+* Dask graph edge cases that previously weren't handled correctly (unhashable values, lists, and literals in task graphs, and Dask dataframes) now work.
+
 1.11.0
 ^^^^^^
 
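For context, here is a minimal sketch of the new API in use. This is an illustration, not code from this commit; the action type "my_computation" and the log file name are made up:

```python
# Hypothetical example: persist_with_trace() is a drop-in replacement
# for dask.persist() that logs each task as an Eliot action tied back
# to the calling context.
import dask.bag
from eliot import start_action, to_file
from eliot.dask import persist_with_trace

to_file(open("eliot.log", "a"))  # write Eliot's log to disk

bag = dask.bag.from_sequence([1, 2, 3]).map(lambda x: x * 2)
with start_action(action_type="my_computation"):
    # Same return shape as dask.persist(): a tuple of collections.
    [persisted] = persist_with_trace(bag)
    print(persisted.compute())  # [2, 4, 6]
```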
3 changes: 2 additions & 1 deletion docs/source/scientific-computing.rst
Original file line number Diff line number Diff line change
@@ -44,8 +44,9 @@ In order to do this you will need to:
 * Ensure all worker processes write the Eliot logs to disk (if you're using the ``multiprocessing`` or ``distributed`` backends).
 * If you're using multiple worker machines, aggregate all log files into a single place, so you can more easily analyze them with e.g. `eliot-tree <https://github.com/jonathanj/eliottree>`_.
 * Replace ``dask.compute()`` with ``eliot.dask.compute_with_trace()``.
+* Replace ``dask.persist()`` with ``eliot.dask.persist_with_trace()``.
 
-In the following example, you can see how this works for a Dask run using ``distributed``, the recommended Dask scheduler.
+In the following example, you can see how this works for a Dask run using ``distributed``, the recommended Dask scheduler for more sophisticated use cases.
 We'll be using multiple worker processes, but only use a single machine:
 
 .. literalinclude:: ../../examples/dask_eliot.py
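The steps in the bullet list above can be seen end to end in the following hedged sketch. It is our own minimal version, not the examples/dask_eliot.py file that the literalinclude pulls in; the per-PID log file naming is an assumption:

```python
# Each worker process writes its own Eliot log file; compute_with_trace()
# then links the worker-side tasks back to the client-side action tree.
from os import getpid

from dask.bag import from_sequence
from dask.distributed import Client
from eliot import start_action, to_file
from eliot.dask import compute_with_trace


def _start_logging():
    # One log file per process; aggregate them later, e.g. with eliot-tree.
    to_file(open("worker-{}.log".format(getpid()), "a"))


if __name__ == "__main__":
    client = Client(n_workers=2)
    client.run(_start_logging)  # configure logging on every worker
    _start_logging()            # and on the client process itself
    bag = from_sequence(range(10)).map(lambda x: x * 2).sum()
    with start_action(action_type="main"):
        print(compute_with_trace(bag))  # (90,)
```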
76 changes: 55 additions & 21 deletions eliot/dask.py
@@ -2,8 +2,17 @@
 
 from pyrsistent import PClass, field
 
-from dask import compute, optimize
-from dask.core import toposort, get_dependencies
+from dask import compute, optimize, persist
+
+try:
+    from dask.distributed import Future
+except ImportError:
+
+    class Future(object):
+        pass
+
+
+from dask.core import toposort, get_dependencies, ishashable
 from . import start_action, current_action, Action


@@ -75,6 +84,22 @@ def compute_with_trace(*args):
     return compute(*optimized, optimize_graph=False)
 
 
+def persist_with_trace(*args):
+    """Do Dask persist(), but with added Eliot tracing.
+
+    Known issues:
+
+    1. Retries will confuse Eliot. Probably need different
+       distributed-tree mechanism within Eliot to solve that.
+    """
+    # 1. Create top-level Eliot Action:
+    with start_action(action_type="dask:persist"):
+        # In order to reduce logging verbosity, add logging to the already
+        # optimized graph:
+        optimized = optimize(*args, optimizations=[_add_logging])
+        return persist(*optimized, optimize_graph=False)
+
+
 def _add_logging(dsk, ignore=None):
     """
     Add logging to a Dask graph.
@@ -101,34 +126,43 @@ def simplify(k):
     key_names = {}
     for key in keys:
         value = dsk[key]
-        if not callable(value) and value in keys:
+        if not callable(value) and ishashable(value) and value in keys:
             # It's an alias for another key:
             key_names[key] = key_names[value]
         else:
             key_names[key] = simplify(key)
 
-    # 2. Create Eliot child Actions for each key, in topological order:
-    key_to_action_id = {key: str(ctx.serialize_task_id(), "utf-8") for key in keys}
+    # Values in the graph can be either:
+    #
+    # 1. A list of other values.
+    # 2. A tuple, where first value might be a callable, aka a task.
+    # 3. A literal of some sort.
+    def maybe_wrap(key, value):
+        if isinstance(value, list):
+            return [maybe_wrap(key, v) for v in value]
+        elif isinstance(value, tuple):
+            func = value[0]
+            args = value[1:]
+            if not callable(func):
+                # Not a callable, so nothing to wrap.
+                return value
+            wrapped_func = _RunWithEliotContext(
+                task_id=str(ctx.serialize_task_id(), "utf-8"),
+                func=func,
+                key=key_names[key],
+                dependencies=[key_names[k] for k in get_dependencies(dsk, key)],
+            )
+            return (wrapped_func,) + args
+        else:
+            return value
+
-    # 3. Replace function with wrapper that logs appropriate Action:
+    # Replace function with wrapper that logs appropriate Action; iterate in
+    # topological order so action task levels are in reasonable order.
     for key in keys:
-        func = dsk[key][0]
-        args = dsk[key][1:]
-        if not callable(func):
-            # This key is just an alias for another key, no need to add
-            # logging:
-            result[key] = dsk[key]
-            continue
-        wrapped_func = _RunWithEliotContext(
-            task_id=key_to_action_id[key],
-            func=func,
-            key=key_names[key],
-            dependencies=[key_names[k] for k in get_dependencies(dsk, key)],
-        )
-        result[key] = (wrapped_func,) + tuple(args)
+        result[key] = maybe_wrap(key, dsk[key])
 
     assert set(result.keys()) == set(dsk.keys())
     return result
 
 
-__all__ = ["compute_with_trace"]
+__all__ = ["compute_with_trace", "persist_with_trace"]
117 changes: 110 additions & 7 deletions eliot/tests/test_dask.py
@@ -3,15 +3,23 @@
 from unittest import TestCase, skipUnless
 
 from ..testing import capture_logging, LoggedAction, LoggedMessage
-from .. import start_action, Message
+from .. import start_action, log_message
 
 try:
     import dask
     from dask.bag import from_sequence
+    from dask.distributed import Client
+    import dask.dataframe as dd
+    import pandas as pd
 except ImportError:
     dask = None
 else:
-    from ..dask import compute_with_trace, _RunWithEliotContext, _add_logging
+    from ..dask import (
+        compute_with_trace,
+        _RunWithEliotContext,
+        _add_logging,
+        persist_with_trace,
+    )
 
 
 @skipUnless(dask, "Dask not available.")
@@ -28,30 +36,74 @@ def test_compute(self):
         bag = bag.fold(lambda x, y: x + y)
         self.assertEqual(dask.compute(bag), compute_with_trace(bag))
 
+    def test_future(self):
+        """compute_with_trace() can handle Futures."""
+        client = Client(processes=False)
+        self.addCleanup(client.shutdown)
+        [bag] = dask.persist(from_sequence([1, 2, 3]))
+        bag = bag.map(lambda x: x * 5)
+        result = dask.compute(bag)
+        self.assertEqual(result, ([5, 10, 15],))
+        self.assertEqual(result, compute_with_trace(bag))
+
+    def test_persist_result(self):
+        """persist_with_trace() runs the same logic as persist()."""
+        client = Client(processes=False)
+        self.addCleanup(client.shutdown)
+        bag = from_sequence([1, 2, 3])
+        bag = bag.map(lambda x: x * 7)
+        self.assertEqual(
+            [b.compute() for b in dask.persist(bag)],
+            [b.compute() for b in persist_with_trace(bag)],
+        )
+
+    def test_persist_pandas(self):
+        """persist_with_trace() with a Pandas dataframe.
+
+        This ensures we don't blow up, as used to happen.
+        """
+        df = pd.DataFrame()
+        df = dd.from_pandas(df, npartitions=1)
+        persist_with_trace(df)
+
     @capture_logging(None)
-    def test_logging(self, logger):
+    def test_persist_logging(self, logger):
+        """persist_with_trace() preserves Eliot context."""
+
+        def persister(bag):
+            [bag] = persist_with_trace(bag)
+            return dask.compute(bag)
+
+        self.assert_logging(logger, persister, "dask:persist")
+
+    @capture_logging(None)
+    def test_compute_logging(self, logger):
         """compute_with_trace() preserves Eliot context."""
+        self.assert_logging(logger, compute_with_trace, "dask:compute")
+
+    def assert_logging(self, logger, run_with_trace, top_action_name):
+        """Utility function for _with_trace() logging tests."""
 
         def mult(x):
-            Message.log(message_type="mult")
+            log_message(message_type="mult")
             return x * 4
 
         def summer(x, y):
-            Message.log(message_type="finally")
+            log_message(message_type="finally")
             return x + y
 
         bag = from_sequence([1, 2])
         bag = bag.map(mult).fold(summer)
         with start_action(action_type="act1"):
-            compute_with_trace(bag)
+            run_with_trace(bag)
 
         [logged_action] = LoggedAction.ofType(logger.messages, "act1")
         self.assertEqual(
             logged_action.type_tree(),
             {
                 "act1": [
                     {
-                        "dask:compute": [
+                        top_action_name: [
                             {"eliot:remote_task": ["dask:task", "mult"]},
                             {"eliot:remote_task": ["dask:task", "mult"]},
                             {"eliot:remote_task": ["dask:task", "finally"]},
@@ -83,6 +135,8 @@ def summer(x, y):
 class AddLoggingTests(TestCase):
     """Tests for _add_logging()."""
 
+    maxDiff = None
+
     def test_add_logging_to_full_graph(self):
         """_add_logging() recreates Dask graph with wrappers."""
         bag = from_sequence([1, 2, 3])
@@ -104,3 +158,52 @@ def test_add_logging_to_full_graph(self):
             logging_removed[key] = value
 
         self.assertEqual(logging_removed, graph)
+
+    def test_add_logging_explicit(self):
+        """_add_logging() on more edge cases of the graph."""
+
+        def add(s):
+            return s + "s"
+
+        def add2(s):
+            return s + "s"
+
+        # b runs first, then d, then a and c.
+        graph = {
+            "a": "d",
+            "d": [1, 2, (add, "b")],
+            ("b", 0): 1,
+            "c": (add2, "d"),
+        }
+
+        with start_action(action_type="bleh") as action:
+            task_id = action.task_uuid
+            self.assertEqual(
+                _add_logging(graph),
+                {
+                    "d": [
+                        1,
+                        2,
+                        (
+                            _RunWithEliotContext(
+                                task_id=task_id + "@/2",
+                                func=add,
+                                key="d",
+                                dependencies=["b"],
+                            ),
+                            "b",
+                        ),
+                    ],
+                    "a": "d",
+                    ("b", 0): 1,
+                    "c": (
+                        _RunWithEliotContext(
+                            task_id=task_id + "@/3",
+                            func=add2,
+                            key="c",
+                            dependencies=["d"],
+                        ),
+                        "d",
+                    ),
+                },
+            )
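A note on the "@/2" and "@/3" suffixes asserted above: Eliot serializes a task ID as the task UUID followed by the next task level, so each successive serialize_task_id() call inside the "bleh" action yields the next level. A minimal sketch, assuming Eliot's "<uuid>@<level>" serialization format:

```python
# Sketch of where the suffixes come from: the action's start message
# consumes level /1, so the first serialized task ID gets /2, then /3.
from eliot import start_action

with start_action(action_type="bleh") as action:
    first = action.serialize_task_id()   # b"<task_uuid>@/2"
    second = action.serialize_task_id()  # b"<task_uuid>@/3"
    assert first.endswith(b"@/2") and second.endswith(b"@/3")
```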
3 changes: 3 additions & 0 deletions tox.ini
@@ -32,6 +32,9 @@ deps = cffi
 basepython = python3.7
 deps = cffi
        dask[bag]
+       dask[distributed]
+       dask[pandas]
+       pandas
 
 [testenv:py38]
 basepython = python3.8
