-
-
Notifications
You must be signed in to change notification settings - Fork 22
Compatibility fixes with distributed 1.21.3 #63
Changes from 3 commits
c189dee
10677a3
cc20f43
ed352bd
feda1f7
3d6f165
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
from distributed import Scheduler | ||
from distributed.utils import log_errors | ||
from distributed.deploy import adaptive | ||
from six import string_types | ||
from tornado import gen | ||
|
||
from .core import get_session | ||
|
@@ -34,7 +35,7 @@ class Adaptive(adaptive.Adaptive): | |
... """ Remove worker addresses from cluster """ | ||
''' | ||
def __init__(self, cluster=None, scheduler=None, interval=1000, | ||
startup_cost=1, scale_factor=2): | ||
startup_cost=1, scale_factor=2, **kwargs): | ||
if cluster is None: | ||
raise TypeError("`Adaptive.__init__() missing required argument: " | ||
"`cluster`") | ||
|
@@ -50,7 +51,8 @@ def __init__(self, cluster=None, scheduler=None, interval=1000, | |
|
||
super(Adaptive, self).__init__(scheduler, cluster, interval, | ||
startup_cost=startup_cost, | ||
scale_factor=scale_factor) | ||
scale_factor=scale_factor, | ||
**kwargs) | ||
|
||
def get_busy_workers(self): | ||
s = self.scheduler | ||
|
@@ -77,10 +79,20 @@ def get_scale_up_kwargs(self): | |
kwargs = {'n': max(instances, len(self.get_busy_workers()))} | ||
memory = [] | ||
if self.scheduler.unrunnable: | ||
for key in self.scheduler.unrunnable: | ||
for task in self.scheduler.unrunnable: | ||
if isinstance(task, string_types): | ||
# Backwards compatibility for distributed pre-1.21.0 | ||
key = task | ||
prefix = key | ||
else: | ||
# In distributed==1.21.0, the scheduler now stores TaskState objects | ||
# instead of string keys in its task collections: | ||
# https://github.com/dask/distributed/pull/1594 | ||
key = task.key | ||
prefix = task.prefix | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any thoughts on bumping our There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure, I think eventually we will definitely want to so that we don't have to deal with backwards-compatibility, but as of right now There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm comfortable bumping up requirements. If people want to use old versions of distributed then they can use old versions of dask-drmaa as well. I think we should maintain a little bit of slack between the various dask-foo projects, but not too much. |
||
duration = 0 | ||
memory = [] | ||
duration += self.scheduler.task_duration.get(key, 0.1) | ||
duration += self.scheduler.task_duration.get(prefix, 0.1) | ||
|
||
if key in self.scheduler.resource_restrictions: | ||
m = self.scheduler.resource_restrictions[key].get('memory') | ||
|
@@ -93,7 +105,17 @@ def get_scale_up_kwargs(self): | |
return kwargs | ||
|
||
@gen.coroutine | ||
def _retire_workers(self): | ||
def _retire_workers(self, workers=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why add this option? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NVM. Comes from PR ( dask/distributed#1797 ). We might want to revisit whether we should be carrying this function at all or just using the parent class' functionality. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jakirkham Currently There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, sorry, missed where this was coming from on the first pass. Would we be able to reuse that method or do you see issues with that approach? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, agreed that we should try to move back to the base implementation. I am not really sure why the implementation diverged from the base class to begin with, although it definitely seemed intentional: c51a15a#diff-d2ee7bfcb2312cc404b8b4953eaa2576L47. I haven't had a chance to step through the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added as issue ( #65 ). |
||
if workers is None: | ||
workers = self.workers_to_close() | ||
if not workers: | ||
raise gen.Return(workers) | ||
with log_errors(): | ||
workers = yield self.scheduler.retire_workers(close_workers=True) | ||
logger.info("Retiring workers {}".format(workers)) | ||
result = yield self.scheduler.retire_workers(workers, | ||
remove=True, | ||
close_workers=True) | ||
if result: | ||
logger.info("Retiring workers {}".format(result)) | ||
# Diverges from distributed.Adaptive here: | ||
# ref c51a15a35a8a64c21c1182bfd9209cb6b7d95380 | ||
raise gen.Return(result) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,8 +8,6 @@ | |
from distributed import Client | ||
from distributed.utils_test import loop, inc, slowinc | ||
|
||
|
||
@pytest.mark.skip(reason="currently times out for an unknown reason") | ||
def test_adaptive_memory(loop): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this seem to be working reliably now or is it still a little flaky? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As far as I've tried, it seems to work reliably now 😁 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If it acts up again, we can always reopen issue ( #58 ). Guessing you fixed it though as the issue cropped up with Distributed 1.21. Thanks for working on it. |
||
with SGECluster(scheduler_port=0, cleanup_interval=100) as cluster: | ||
adapt = Adaptive(cluster, cluster.scheduler) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add `six` to `requirements.txt`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI, `six` is already in the `requirements.txt` for `distributed`, but I suppose there is no harm in duplicating it here, and it is more explicit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry — I spend part of my days dealing with package management issues. I have generally found that explicit requirements make things easier to manage.