Source code for apache_beam.runners.interactive.caching.streaming_cache

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pytype: skip-file

import logging
import os
import shutil
import tempfile
import time
import traceback
from collections import OrderedDict
# We don't have an explicit pathlib dependency because this code only works with
# the interactive target installed which has an indirect dependency on pathlib
# through ipython>=5.9.0.
from pathlib import Path

from google.protobuf.message import DecodeError

import apache_beam as beam
from apache_beam import coders
from apache_beam.portability.api import beam_interactive_api_pb2
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners.interactive.cache_manager import CacheManager
from apache_beam.runners.interactive.cache_manager import SafeFastPrimitivesCoder
from apache_beam.runners.interactive.caching.cacheable import CacheKey
from apache_beam.testing.test_stream import OutputFormat
from apache_beam.testing.test_stream import ReverseTestStream
from apache_beam.utils import timestamp

_LOGGER = logging.getLogger(__name__)


[docs]class StreamingCacheSink(beam.PTransform): """A PTransform that writes TestStreamFile(Header|Records)s to file. This transform takes in an arbitrary element stream and writes the list of TestStream events (as TestStreamFileRecords) to file. When replayed, this will produce the best-effort replay of the original job (e.g. some elements may be produced slightly out of order from the original stream). Note that this PTransform is assumed to be only run on a single machine where the following assumptions are correct: elements come in ordered, no two transforms are writing to the same file. This PTransform is assumed to only run correctly with the DirectRunner. TODO(https://github.com/apache/beam/issues/20002): Generalize this to more source/sink types aside from file based. Also, generalize to cases where there might be multiple workers writing to the same sink. """ def __init__( self, cache_dir, filename, sample_resolution_sec, coder=SafeFastPrimitivesCoder()): self._cache_dir = cache_dir self._filename = filename self._sample_resolution_sec = sample_resolution_sec self._coder = coder self._path = os.path.join(self._cache_dir, self._filename) @property def path(self): """Returns the path the sink leads to.""" return self._path @property def size_in_bytes(self): """Returns the space usage in bytes of the sink.""" try: return os.stat(self._path).st_size except OSError: _LOGGER.debug( 'Failed to calculate cache size for file %s, the file might have not ' 'been created yet. Return 0. %s', self._path, traceback.format_exc()) return 0
[docs] def expand(self, pcoll): class StreamingWriteToText(beam.DoFn): """DoFn that performs the writing. Note that the other file writing methods cannot be used in streaming contexts. """ def __init__(self, full_path, coder=SafeFastPrimitivesCoder()): self._full_path = full_path self._coder = coder # Try and make the given path. Path(os.path.dirname(full_path)).mkdir(parents=True, exist_ok=True) def start_bundle(self): # Open the file for 'append-mode' and writing 'bytes'. self._fh = open(self._full_path, 'ab') def finish_bundle(self): self._fh.close() def process(self, e): """Appends the given element to the file. """ self._fh.write(self._coder.encode(e) + b'\n') return ( pcoll | ReverseTestStream( output_tag=self._filename, sample_resolution_sec=self._sample_resolution_sec, output_format=OutputFormat.SERIALIZED_TEST_STREAM_FILE_RECORDS, coder=self._coder) | beam.ParDo( StreamingWriteToText(full_path=self._path, coder=self._coder)))
[docs]class StreamingCacheSource: """A class that reads and parses TestStreamFile(Header|Reader)s. This source operates in the following way: 1. Wait for up to `timeout_secs` for the file to be available. 2. Read, parse, and emit the entire contents of the file 3. Wait for more events to come or until `is_cache_complete` returns True 4. If there are more events, then go to 2 5. Otherwise, stop emitting. This class is used to read from file and send its to the TestStream via the StreamingCacheManager.Reader. """ def __init__(self, cache_dir, labels, is_cache_complete=None, coder=None): if not coder: coder = SafeFastPrimitivesCoder() if not is_cache_complete: is_cache_complete = lambda _: True self._cache_dir = cache_dir self._coder = coder self._labels = labels self._path = os.path.join(self._cache_dir, *self._labels) self._is_cache_complete = is_cache_complete self._pipeline_id = CacheKey.from_str(labels[-1]).pipeline_id def _wait_until_file_exists(self, timeout_secs=30): """Blocks until the file exists for a maximum of timeout_secs. """ # Wait for up to `timeout_secs` for the file to be available. start = time.time() while not os.path.exists(self._path): time.sleep(1) if time.time() - start > timeout_secs: pcollection_var = CacheKey.from_str(self._labels[-1]).var raise RuntimeError( 'Timed out waiting for cache file for PCollection `{}` to be ' 'available with path {}.'.format(pcollection_var, self._path)) return open(self._path, mode='rb') def _emit_from_file(self, fh, tail): """Emits the TestStreamFile(Header|Record)s from file. This returns a generator to be able to read all lines from the given file. If `tail` is True, then it will wait until the cache is complete to exit. Otherwise, it will read the file only once. """ # Always read at least once to read the whole file. while True: pos = fh.tell() line = fh.readline() # Check if we are at EOF or if we have an incomplete line. if not line or (line and line[-1] != b'\n'[0]): # Read at least the first line to get the header. if not tail and pos != 0: break # Complete reading only when the cache is complete. if self._is_cache_complete(self._pipeline_id): break # Otherwise wait for new data in the file to be written. time.sleep(0.5) fh.seek(pos) else: # The first line at pos = 0 is always the header. Read the line without # the new line. to_decode = line[:-1] if pos == 0: proto_cls = beam_interactive_api_pb2.TestStreamFileHeader else: proto_cls = beam_interactive_api_pb2.TestStreamFileRecord msg = self._try_parse_as(proto_cls, to_decode) if msg: yield msg else: break def _try_parse_as(self, proto_cls, to_decode): try: msg = proto_cls() msg.ParseFromString(self._coder.decode(to_decode)) except DecodeError: _LOGGER.error( 'Could not parse as %s. This can indicate that the cache is ' 'corruputed. Please restart the kernel. ' '\nfile: %s \nmessage: %s', proto_cls, self._path, to_decode) msg = None return msg
[docs] def read(self, tail): """Reads all TestStreamFile(Header|TestStreamFileRecord)s from file. This returns a generator to be able to read all lines from the given file. If `tail` is True, then it will wait until the cache is complete to exit. Otherwise, it will read the file only once. """ with self._wait_until_file_exists() as f: for e in self._emit_from_file(f, tail): yield e
# TODO(victorhc): Add support for cache_dir locations that are on GCS
[docs]class StreamingCache(CacheManager): """Abstraction that holds the logic for reading and writing to cache. """ def __init__( self, cache_dir, is_cache_complete=None, sample_resolution_sec=0.1, saved_pcoders=None): self._sample_resolution_sec = sample_resolution_sec self._is_cache_complete = is_cache_complete if cache_dir: self._cache_dir = cache_dir else: self._cache_dir = tempfile.mkdtemp( prefix='ib-', dir=os.environ.get('TEST_TMPDIR', None)) # List of saved pcoders keyed by PCollection path. It is OK to keep this # list in memory because once FileBasedCacheManager object is # destroyed/re-created it loses the access to previously written cache # objects anyways even if cache_dir already exists. In other words, # it is not possible to resume execution of Beam pipeline from the # saved cache if FileBasedCacheManager has been reset. # # However, if we are to implement better cache persistence, one needs # to take care of keeping consistency between the cached PCollection # and its PCoder type. self._saved_pcoders = saved_pcoders or {} self._default_pcoder = SafeFastPrimitivesCoder() # The sinks to capture data from capturable sources. # Dict([str, StreamingCacheSink]) self._capture_sinks = {} self._capture_keys = set()
[docs] def size(self, *labels): if self.exists(*labels): return os.path.getsize(os.path.join(self._cache_dir, *labels)) return 0
@property def capture_size(self): return sum([sink.size_in_bytes for _, sink in self._capture_sinks.items()]) @property def capture_paths(self): return list(self._capture_sinks.keys()) @property def capture_keys(self): return self._capture_keys
[docs] def exists(self, *labels): if labels and any(labels): path = os.path.join(self._cache_dir, *labels) return os.path.exists(path) return False
# TODO(srohde): Modify this to return the correct version.
[docs] def read(self, *labels, **args): """Returns a generator to read all records from file.""" tail = args.pop('tail', False) # Only immediately return when the file doesn't exist when the user wants a # snapshot of the cache (when tail is false). if not self.exists(*labels) and not tail: return iter([]), -1 reader = StreamingCacheSource( self._cache_dir, labels, self._is_cache_complete, self.load_pcoder(*labels)).read(tail=tail) # Return an empty iterator if there is nothing in the file yet. This can # only happen when tail is False. try: header = next(reader) except StopIteration: return iter([]), -1 return StreamingCache.Reader([header], [reader]).read(), 1
[docs] def read_multiple(self, labels, tail=True): """Returns a generator to read all records from file. Does tail until the cache is complete. This is because it is used in the TestStreamServiceController to read from file which is only used during pipeline runtime which needs to block. """ readers = [ StreamingCacheSource( self._cache_dir, l, self._is_cache_complete, self.load_pcoder(*l)).read(tail=tail) for l in labels ] headers = [next(r) for r in readers] return StreamingCache.Reader(headers, readers).read()
[docs] def write(self, values, *labels): """Writes the given values to cache. """ directory = os.path.join(self._cache_dir, *labels[:-1]) filepath = os.path.join(directory, labels[-1]) if not os.path.exists(directory): os.makedirs(directory) with open(filepath, 'ab') as f: for v in values: if isinstance(v, (beam_interactive_api_pb2.TestStreamFileHeader, beam_interactive_api_pb2.TestStreamFileRecord)): val = v.SerializeToString() else: raise TypeError( 'Values given to streaming cache should be either ' 'TestStreamFileHeader or TestStreamFileRecord.') f.write(self.load_pcoder(*labels).encode(val) + b'\n')
[docs] def clear(self, *labels): directory = os.path.join(self._cache_dir, *labels[:-1]) filepath = os.path.join(directory, labels[-1]) self._capture_keys.discard(labels[-1]) if os.path.exists(filepath): os.remove(filepath) return True return False
[docs] def source(self, *labels): """Returns the StreamingCacheManager source. This is beam.Impulse() because unbounded sources will be marked with this and then the PipelineInstrument will replace these with a TestStream. """ return beam.Impulse()
[docs] def sink(self, labels, is_capture=False): """Returns a StreamingCacheSink to write elements to file. Note that this is assumed to only work in the DirectRunner as the underlying StreamingCacheSink assumes a single machine to have correct element ordering. """ filename = labels[-1] cache_dir = os.path.join(self._cache_dir, *labels[:-1]) sink = StreamingCacheSink( cache_dir, filename, self._sample_resolution_sec, self.load_pcoder(*labels)) if is_capture: self._capture_sinks[sink.path] = sink self._capture_keys.add(filename) return sink
[docs] def save_pcoder(self, pcoder, *labels): self._saved_pcoders[os.path.join(self._cache_dir, *labels)] = pcoder
[docs] def load_pcoder(self, *labels): saved_pcoder = self._saved_pcoders.get( os.path.join(self._cache_dir, *labels), None) if saved_pcoder is None or isinstance(saved_pcoder, coders.FastPrimitivesCoder): return self._default_pcoder return saved_pcoder
[docs] def cleanup(self): if os.path.exists(self._cache_dir): def on_fail_to_cleanup(function, path, excinfo): _LOGGER.warning( 'Failed to clean up temporary files: %s. You may' 'manually delete them if necessary. Error was: %s', path, excinfo) shutil.rmtree(self._cache_dir, onerror=on_fail_to_cleanup) self._saved_pcoders = {} self._capture_sinks = {} self._capture_keys = set()
[docs] class Reader(object): """Abstraction that reads from PCollection readers. This class is an Abstraction layer over multiple PCollection readers to be used for supplying a TestStream service with events. This class is also responsible for holding the state of the clock, injecting clock advancement events, and watermark advancement events. """ def __init__(self, headers, readers): # This timestamp is used as the monotonic clock to order events in the # replay. self._monotonic_clock = timestamp.Timestamp.of(0) # The PCollection cache readers. self._readers = {} # The file headers that are metadata for that particular PCollection. # The header allows for metadata about an entire stream, so that the data # isn't copied per record. self._headers = {header.tag: header for header in headers} self._readers = OrderedDict( ((h.tag, r) for (h, r) in zip(headers, readers))) # The most recently read timestamp per tag. self._stream_times = { tag: timestamp.Timestamp(seconds=0) for tag in self._headers } def _test_stream_events_before_target(self, target_timestamp): """Reads the next iteration of elements from each stream. Retrieves an element from each stream iff the most recently read timestamp from that stream is less than the target_timestamp. Since the amount of events may not fit into memory, this StreamingCache reads at most one element from each stream at a time. """ records = [] for tag, r in self._readers.items(): # The target_timestamp is the maximum timestamp that was read from the # stream. Some readers may have elements that are less than this. Thus, # we skip all readers that already have elements that are at this # timestamp so that we don't read everything into memory. if self._stream_times[tag] >= target_timestamp: continue try: record = next(r).recorded_event if record.HasField('processing_time_event'): self._stream_times[tag] += timestamp.Duration( micros=record.processing_time_event.advance_duration) records.append((tag, record, self._stream_times[tag])) except StopIteration: pass return records def _merge_sort(self, previous_events, new_events): return sorted( previous_events + new_events, key=lambda x: x[2], reverse=True) def _min_timestamp_of(self, events): return events[-1][2] if events else timestamp.MAX_TIMESTAMP def _event_stream_caught_up_to_target(self, events, target_timestamp): empty_events = not events stream_is_past_target = self._min_timestamp_of(events) > target_timestamp return empty_events or stream_is_past_target
[docs] def read(self): """Reads records from PCollection readers. """ # The largest timestamp read from the different streams. target_timestamp = timestamp.MAX_TIMESTAMP # The events from last iteration that are past the target timestamp. unsent_events = [] # Emit events until all events have been read. while True: # Read the next set of events. The read events will most likely be # out of order if there are multiple readers. Here we sort them into # a more manageable state. new_events = self._test_stream_events_before_target(target_timestamp) events_to_send = self._merge_sort(unsent_events, new_events) if not events_to_send: break # Get the next largest timestamp in the stream. This is used as the # timestamp for readers to "catch-up" to. This will only read from # readers with a timestamp less than this. target_timestamp = self._min_timestamp_of(events_to_send) # Loop through the elements with the correct timestamp. while not self._event_stream_caught_up_to_target(events_to_send, target_timestamp): # First advance the clock to match the time of the stream. This has # a side-effect of also advancing this cache's clock. tag, r, curr_timestamp = events_to_send.pop() if curr_timestamp > self._monotonic_clock: yield self._advance_processing_time(curr_timestamp) # Then, send either a new element or watermark. if r.HasField('element_event'): r.element_event.tag = tag yield r elif r.HasField('watermark_event'): r.watermark_event.tag = tag yield r unsent_events = events_to_send target_timestamp = self._min_timestamp_of(unsent_events)
def _advance_processing_time(self, new_timestamp): """Advances the internal clock and returns an AdvanceProcessingTime event. """ advancy_by = new_timestamp.micros - self._monotonic_clock.micros e = beam_runner_api_pb2.TestStreamPayload.Event( processing_time_event=beam_runner_api_pb2.TestStreamPayload.Event. AdvanceProcessingTime(advance_duration=advancy_by)) self._monotonic_clock = new_timestamp return e