-
Notifications
You must be signed in to change notification settings - Fork 4.2k
/
test_stream_impl.py
365 lines (303 loc) · 13.4 KB
/
test_stream_impl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""The TestStream implementation for the DirectRunner
The DirectRunner implements TestStream as the _TestStream class which is used
to store the events in memory, the _WatermarkController which is used to set the
watermark and emit events, and the multiplexer which sends events to the correct
tagged PCollection.
"""
# pytype: skip-file
import itertools
import logging
from queue import Empty as EmptyException
from queue import Queue
from threading import Thread
from typing import Union
import grpc
from apache_beam import ParDo
from apache_beam import coders
from apache_beam import pvalue
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.portability.api import beam_runner_api_pb2_grpc
from apache_beam.testing.test_stream import ElementEvent
from apache_beam.testing.test_stream import ProcessingTimeEvent
from apache_beam.testing.test_stream import WatermarkEvent
from apache_beam.transforms import PTransform
from apache_beam.transforms import core
from apache_beam.transforms import window
from apache_beam.transforms.window import TimestampedValue
from apache_beam.utils import timestamp
from apache_beam.utils.timestamp import Duration
from apache_beam.utils.timestamp import Timestamp
_LOGGER = logging.getLogger(__name__)
class _EndOfStream:
pass
class _WatermarkController(PTransform):
"""A runner-overridable PTransform Primitive to control the watermark.
Expected implementation behavior:
- If the instance recieves a WatermarkEvent, it sets its output watermark to
the specified value then drops the event.
- If the instance receives an ElementEvent, it emits all specified elements
to the Global Window with the event time set to the element's timestamp.
"""
def __init__(self, output_tag):
self.output_tag = output_tag
def get_windowing(self, _):
return core.Windowing(window.GlobalWindows())
def expand(self, pcoll):
ret = pvalue.PCollection.from_(pcoll)
ret.tag = self.output_tag
return ret
class _ExpandableTestStream(PTransform):
def __init__(self, test_stream):
self.test_stream = test_stream
def expand(self, pbegin):
"""Expands the TestStream into the DirectRunner implementation.
Takes the TestStream transform and creates a _TestStream -> multiplexer ->
_WatermarkController.
"""
assert isinstance(pbegin, pvalue.PBegin)
# If there is only one tag there is no need to add the multiplexer.
if len(self.test_stream.output_tags) == 1:
return (
pbegin
| _TestStream(
self.test_stream.output_tags,
events=self.test_stream._events,
coder=self.test_stream.coder,
endpoint=self.test_stream._endpoint)
| _WatermarkController(list(self.test_stream.output_tags)[0]))
# Multiplex to the correct PCollection based upon the event tag.
def mux(event):
if event.tag:
yield pvalue.TaggedOutput(event.tag, event)
else:
yield event
mux_output = (
pbegin
| _TestStream(
self.test_stream.output_tags,
events=self.test_stream._events,
coder=self.test_stream.coder,
endpoint=self.test_stream._endpoint)
| 'TestStream Multiplexer' >> ParDo(mux).with_outputs())
# Apply a way to control the watermark per output. It is necessary to
# have an individual _WatermarkController per PCollection because the
# calculation of the input watermark of a transform is based on the event
# timestamp of the elements flowing through it. Meaning, it is impossible
# to control the output watermarks of the individual PCollections solely
# on the event timestamps.
outputs = {}
for tag in self.test_stream.output_tags:
label = '_WatermarkController[{}]'.format(tag)
outputs[tag] = (mux_output[tag] | label >> _WatermarkController(tag))
return outputs
class _TestStream(PTransform):
"""Test stream that generates events on an unbounded PCollection of elements.
Each event emits elements, advances the watermark or advances the processing
time. After all of the specified elements are emitted, ceases to produce
output.
Expected implementation behavior:
- If the instance receives a WatermarkEvent with the WATERMARK_CONTROL_TAG
then the instance sets its own watermark hold at the specified value and
drops the event.
- If the instance receives any other WatermarkEvent or ElementEvent, it
passes it to the consumer.
"""
# This tag is used on WatermarkEvents to control the watermark at the root
# TestStream.
WATERMARK_CONTROL_TAG = '_TestStream_Watermark'
def __init__(
self,
output_tags,
coder=coders.FastPrimitivesCoder(),
events=None,
endpoint=None):
assert coder is not None
self.coder = coder
self._raw_events = events
self._events = self._add_watermark_advancements(output_tags, events)
self.output_tags = output_tags
self.endpoint = endpoint
def _watermark_starts(self, output_tags):
"""Sentinel values to hold the watermark of outputs to -inf.
The output watermarks of the output PCollections (fake unbounded sources) in
a TestStream are controlled by watermark holds. This sets the hold of each
output PCollection so that the individual holds can be controlled by the
given events.
"""
return [WatermarkEvent(timestamp.MIN_TIMESTAMP, tag) for tag in output_tags]
def _watermark_stops(self, output_tags):
"""Sentinel values to close the watermark of outputs."""
return [WatermarkEvent(timestamp.MAX_TIMESTAMP, tag) for tag in output_tags]
def _test_stream_start(self):
"""Sentinel value to move the watermark hold of the TestStream to +inf.
This sets a hold to +inf such that the individual holds of the output
PCollections are allowed to modify their individial output watermarks with
their holds. This is because the calculation of the output watermark is a
min over all input watermarks.
"""
return [
WatermarkEvent(
timestamp.MAX_TIMESTAMP - timestamp.TIME_GRANULARITY,
_TestStream.WATERMARK_CONTROL_TAG)
]
def _test_stream_stop(self):
"""Sentinel value to close the watermark of the TestStream."""
return [
WatermarkEvent(
timestamp.MAX_TIMESTAMP, _TestStream.WATERMARK_CONTROL_TAG)
]
def _test_stream_init(self):
"""Sentinel value to hold the watermark of the TestStream to -inf.
This sets a hold to ensure that the output watermarks of the output
PCollections do not advance to +inf before their watermark holds are set.
"""
return [
WatermarkEvent(
timestamp.MIN_TIMESTAMP, _TestStream.WATERMARK_CONTROL_TAG)
]
def _set_up(self, output_tags):
return (
self._test_stream_init() + self._watermark_starts(output_tags) +
self._test_stream_start())
def _tear_down(self, output_tags):
return self._watermark_stops(output_tags) + self._test_stream_stop()
def _add_watermark_advancements(self, output_tags, events):
"""Adds watermark advancements to the given events.
The following watermark advancements can be done on the runner side.
However, it makes the logic on the runner side much more complicated than
it needs to be.
In order for watermarks to be properly advanced in a TestStream, a specific
sequence of watermark holds must be sent:
1. Hold the root watermark at -inf (this prevents the pipeline from
immediately returning).
2. Hold the watermarks at the WatermarkControllerss at -inf (this prevents
the pipeline from immediately returning).
3. Advance the root watermark to +inf - 1 (this allows the downstream
WatermarkControllers to control their watermarks via holds).
4. Advance watermarks as normal.
5. Advance WatermarkController watermarks to +inf
6. Advance root watermark to +inf.
"""
if not events:
return []
return self._set_up(output_tags) + events + self._tear_down(output_tags)
def get_windowing(self, unused_inputs):
return core.Windowing(window.GlobalWindows())
def expand(self, pcoll):
return pvalue.PCollection(pcoll.pipeline, is_bounded=False)
def _infer_output_coder(self, input_type=None, input_coder=None):
return self.coder
@staticmethod
def events_from_script(events):
"""Yields the in-memory events.
"""
return itertools.chain(events)
@staticmethod
def _stream_events_from_rpc(endpoint, output_tags, coder, channel, is_alive):
"""Yields the events received from the given endpoint.
This is the producer thread that reads events from the TestStreamService and
puts them onto the shared queue. At the end of the stream, an _EndOfStream
is placed on the channel to signify a successful end.
"""
stub_channel = grpc.insecure_channel(endpoint)
stub = beam_runner_api_pb2_grpc.TestStreamServiceStub(stub_channel)
# Request the PCollections that we are looking for from the service.
event_request = beam_runner_api_pb2.EventsRequest(
output_ids=[str(tag) for tag in output_tags])
event_stream = stub.Events(event_request)
try:
for e in event_stream:
channel.put(_TestStream.test_stream_payload_to_events(e, coder))
if not is_alive():
return
except grpc.RpcError as e:
# Do not raise an exception in the non-error status codes. These can occur
# when the Python interpreter shuts down or when in a notebook environment
# when the kernel is interrupted.
if e.code() in (grpc.StatusCode.CANCELLED, grpc.StatusCode.UNAVAILABLE):
return
raise e
finally:
# Gracefully stop the job if there is an exception.
channel.put(_EndOfStream())
@staticmethod
def events_from_rpc(endpoint, output_tags, coder, evaluation_context):
"""Yields the events received from the given endpoint.
This method starts a new thread that reads from the TestStreamService and
puts the events onto a shared queue. This method then yields all elements
from the queue. Unfortunately, this is necessary because the GRPC API does
not allow for non-blocking calls when utilizing a streaming RPC. It is
officially suggested from the docs to use a producer/consumer pattern to
handle streaming RPCs. By doing so, this gives this method control over when
to cancel reading from the RPC if the server takes too long to respond.
"""
# Shared variable with the producer queue. This shuts down the producer if
# the consumer exits early.
shutdown_requested = False
def is_alive():
return not (shutdown_requested or evaluation_context.shutdown_requested)
# The shared queue that allows the producer and consumer to communicate.
channel: 'Queue[Union[test_stream.Event, _EndOfStream]]' = ( # noqa: F821
Queue())
event_stream = Thread(
target=_TestStream._stream_events_from_rpc,
args=(endpoint, output_tags, coder, channel, is_alive))
event_stream.setDaemon(True)
event_stream.start()
# This pumps the shared queue for events until the _EndOfStream sentinel is
# reached. If the TestStreamService takes longer than expected, the queue
# will timeout and an EmptyException will be raised. This also sets the
# shared is_alive sentinel to shut down the producer.
while True:
try:
# Raise an EmptyException if there are no events during the last timeout
# period.
event = channel.get(timeout=30)
if isinstance(event, _EndOfStream):
break
yield event
except EmptyException as e:
_LOGGER.warning(
'TestStream timed out waiting for new events from service.'
' Stopping pipeline.')
shutdown_requested = True
raise e
@staticmethod
def test_stream_payload_to_events(payload, coder):
"""Returns a TestStream Python event object from a TestStream event Proto.
"""
if payload.HasField('element_event'):
element_event = payload.element_event
elements = [
TimestampedValue(
coder.decode(e.encoded_element), Timestamp(micros=e.timestamp))
for e in element_event.elements
]
return ElementEvent(timestamped_values=elements, tag=element_event.tag)
if payload.HasField('watermark_event'):
watermark_event = payload.watermark_event
return WatermarkEvent(
Timestamp(micros=watermark_event.new_watermark),
tag=watermark_event.tag)
if payload.HasField('processing_time_event'):
processing_time_event = payload.processing_time_event
return ProcessingTimeEvent(
Duration(micros=processing_time_event.advance_duration))
raise RuntimeError(
'Received a proto without the specified fields: {}'.format(payload))