#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Module to augment interactive flavor into the given pipeline.
For internal use only; no backward-compatibility guarantees.
"""
# pytype: skip-file

import copy
from typing import Dict
from typing import Optional
from typing import Set

import apache_beam as beam
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners.interactive import background_caching_job
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive.caching.cacheable import Cacheable
from apache_beam.runners.interactive.caching.read_cache import ReadCache
from apache_beam.runners.interactive.caching.write_cache import WriteCache


class AugmentedPipeline:
  """A pipeline with augmented interactive flavor that caches intermediate
  PCollections defined by the user, reads computed PCollections as source and
  prunes unnecessary pipeline parts for fast computation.
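
  Example (an illustrative sketch, not part of the public API contract;
  assumes a live interactive environment)::

    import apache_beam as beam
    from apache_beam.runners.interactive import interactive_beam as ib

    p = beam.Pipeline()
    squares = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)
    ib.watch(locals())  # Lets the interactive environment track squares.

    aug = AugmentedPipeline(p)
    proto = aug.augmented_pipeline  # Proto with cache transforms wired in.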
  """
  def __init__(
      self,
      user_pipeline: beam.Pipeline,
      pcolls: Optional[Set[beam.pvalue.PCollection]] = None):
    """
    Initializes a pipelilne for augmenting interactive flavor.
    Args:
      user_pipeline: a beam.Pipeline instance defined by the user.
      pcolls: cacheable pcolls to be computed/retrieved. If the set is
        empty, all intermediate pcolls assigned to variables are applicable.
    """
    assert not pcolls or all(
        pcoll.pipeline is user_pipeline for pcoll in pcolls
    ), 'All %s need to belong to %s' % (pcolls, user_pipeline)
    self._user_pipeline = user_pipeline
    self._pcolls = pcolls
    self._cache_manager = ie.current_env().get_cache_manager(
        self._user_pipeline, create_if_absent=True)
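    # has_source_to_cache may swap in a streaming cache manager for pipelines
    # with recordable unbounded sources, so re-fetch the current cache manager
    # afterwards.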
    if background_caching_job.has_source_to_cache(self._user_pipeline):
      self._cache_manager = ie.current_env().get_cache_manager(
          self._user_pipeline)
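    # Snapshot the user pipeline as a runner API proto and reuse its component
    # id map so that ids stay stable for components shared across repeated
    # augmentations.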
    _, self._context = self._user_pipeline.to_runner_api(return_context=True)
    self._context.component_id_map = copy.copy(
        self._user_pipeline.component_id_map)
    self._cacheables = self.cacheables()

  @property
  def augmented_pipeline(self) -> beam_runner_api_pb2.Pipeline:
    return self.augment()

  # TODO(https://github.com/apache/beam/issues/20526): Support generating a
  # background recording job that contains unbound source recording transforms
  # only.
  @property
  def background_recording_pipeline(self) -> beam_runner_api_pb2.Pipeline:
    raise NotImplementedError

  def cacheables(self) -> Dict[beam.pvalue.PCollection, Cacheable]:
    """Finds all the cacheable intermediate PCollections in the pipeline with
    their metadata.
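
    Example (an illustrative sketch; assumes user_pipeline and its watched
    variables as in the class-level example)::

      aug = AugmentedPipeline(user_pipeline)
      for pcoll, cacheable in aug.cacheables().items():
        print(cacheable.var)  # The variable name the user assigned the pcoll.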
    """
    cacheables = {}
    for watching in ie.current_env().watching():
      for key, val in watching:
        if (isinstance(val, beam.pvalue.PCollection) and
            val.pipeline is self._user_pipeline and
            (not self._pcolls or val in self._pcolls)):
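          # Versions derive from Python object identity, so re-defining the
          # PCollection (e.g. by re-running a notebook cell) yields a new
          # version.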
          cacheables[val] = Cacheable(
              var=key,
              pcoll=val,
              version=str(id(val)),
              producer_version=str(id(val.producer)))
    return cacheables

  def augment(self) -> beam_runner_api_pb2.Pipeline:
    """Augments the pipeline with cache. Always calculates a new result.
    For a cacheable PCollection, if cache exists, read cache; else, write cache.
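
    Example (an illustrative sketch; assumes user_pipeline as in the
    class-level example)::

      proto = AugmentedPipeline(user_pipeline).augment()
      # proto is a beam_runner_api_pb2.Pipeline with ReadCache/WriteCache
      # transforms wired in for the cacheable PCollections.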
    """
    pipeline = self._user_pipeline.to_runner_api()
    # Find pcolls eligible for reading or writing cache.
    readcache_pcolls = set()
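    # A pcoll can be read from cache only when a complete ('full') cache
    # exists and the pcoll has been computed in the current environment;
    # every other cacheable gets a write-cache transform.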
    for pcoll, cacheable in self._cacheables.items():
      key = repr(cacheable.to_key())
      if (self._cache_manager.exists('full', key) and
          pcoll in ie.current_env().computed_pcollections):
        readcache_pcolls.add(pcoll)
    writecache_pcolls = set(
        self._cacheables.keys()).difference(readcache_pcolls)
    # Wire in additional transforms to read cache and write cache.
    for readcache_pcoll in readcache_pcolls:
      ReadCache(
          pipeline,
          self._context,
          self._cache_manager,
          self._cacheables[readcache_pcoll]).read_cache()
    for writecache_pcoll in writecache_pcolls:
      WriteCache(
          pipeline,
          self._context,
          self._cache_manager,
          self._cacheables[writecache_pcoll]).write_cache()
    # TODO(https://github.com/apache/beam/issues/20526): Support streaming, add
    # pruning logic, and integrate pipeline fragment logic.
    return pipeline