# Source code for apache_beam.runners.interactive.pipeline_analyzer

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Analyzes and modifies pipelines that utilize the PCollection cache.

This module is experimental. No backwards-compatibility guarantees.
"""

# pytype: skip-file

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

import apache_beam as beam
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners.interactive import cache_manager as cache


class PipelineAnalyzer(object):
  """Derives the pipeline proto to execute from a pipeline and a cache.

  The whole analysis runs once, from the constructor. Its results are exposed
  through the getter methods of this class.
  """

  def __init__(self, cache_manager, pipeline_proto, underlying_runner,
               options=None, desired_cache_labels=None):
    """Constructor of PipelineAnalyzer.

    Args:
      cache_manager: (CacheManager)
      pipeline_proto: (Pipeline proto)
      underlying_runner: (PipelineRunner)
      options: (PipelineOptions)
      desired_cache_labels: (Set[str]) a set of labels of the PCollection
        queried by the user.
    """
    self._cache_manager = cache_manager
    self._pipeline_proto = pipeline_proto
    # Only used for membership tests, so keep it as a set.
    self._desired_cache_labels = set(desired_cache_labels or ())

    self._pipeline = beam.pipeline.Pipeline.from_runner_api(
        self._pipeline_proto,
        runner=underlying_runner,
        options=options)
    # context returned from to_runner_api is more informative than that returned
    # from from_runner_api.
    _, self._context = self._pipeline.to_runner_api(
        return_context=True, use_fake_coders=True)
    self._pipeline_info = PipelineInfo(self._pipeline_proto.components)

    # Result of the analysis that can be queried by the user.
    self._pipeline_proto_to_execute = None
    self._top_level_referenced_pcoll_ids = None
    self._top_level_required_transforms = None

    self._caches_used = set()
    self._read_cache_ids = set()
    self._write_cache_ids = set()

    # used for _insert_producing_transforms()
    self._analyzed_pcoll_ids = set()

    self._analyze_pipeline()

  def _analyze_pipeline(self):
    """Analyzes the pipeline and sets the variables that can be queried.

    This function construct Pipeline proto to execute by
      1. Start from target PCollections and recursively insert the producing
         PTransforms of those PCollections, where the producing PTransforms are
         either ReadCache or PTransforms in the original pipeline.
      2. Append WriteCache PTransforms in the pipeline.

    After running this function, the following variables will be set:
      self._pipeline_proto_to_execute
      self._top_level_referenced_pcoll_ids
      self._top_level_required_transforms
      self._caches_used
      self._read_cache_ids
      self._write_cache_ids
    """
    # We filter PTransforms to be executed bottom-up from these PCollections.
    desired_pcollections = self._desired_pcollections(self._pipeline_info)

    required_transforms = collections.OrderedDict()
    top_level_required_transforms = collections.OrderedDict()

    for pcoll_id in desired_pcollections:
      # TODO(qinyeli): Collections consumed by no-output transforms.
      self._insert_producing_transforms(pcoll_id,
                                        required_transforms,
                                        top_level_required_transforms)

    top_level_referenced_pcoll_ids = self._referenced_pcoll_ids(
        top_level_required_transforms)

    for pcoll_id in self._pipeline_info.all_pcollections():
      if pcoll_id not in top_level_referenced_pcoll_ids:
        continue

      # Write the full cache for desired PCollections, unless the PCollection
      # is itself being read from the cache in this run.
      if (pcoll_id in desired_pcollections
          and pcoll_id not in self._caches_used):
        self._insert_caching_transforms(pcoll_id,
                                        required_transforms,
                                        top_level_required_transforms)

      # Every referenced top level PCollection gets a sample cache if it does
      # not have one yet.
      if not self._cache_manager.exists(
          'sample', self._pipeline_info.cache_label(pcoll_id)):
        self._insert_caching_transforms(pcoll_id,
                                        required_transforms,
                                        top_level_required_transforms,
                                        sample=True)

    # Synthetic root whose children are the required top level transforms.
    required_transforms['_root'] = beam_runner_api_pb2.PTransform(
        subtransforms=list(top_level_required_transforms))

    referenced_pcollections = {}
    for pcoll_id in self._referenced_pcoll_ids(required_transforms):
      obj = self._context.pcollections.get_by_id(pcoll_id)
      referenced_pcollections[pcoll_id] = (
          self._context.pcollections.get_proto(obj))

    # Serialize the context once; coders and windowing strategies are both
    # taken from the same components proto.
    context_components = self._context.to_runner_api()

    pipeline_to_execute = beam_runner_api_pb2.Pipeline()
    pipeline_to_execute.root_transform_ids[:] = ['_root']
    set_proto_map(pipeline_to_execute.components.transforms,
                  required_transforms)
    set_proto_map(pipeline_to_execute.components.pcollections,
                  referenced_pcollections)
    set_proto_map(pipeline_to_execute.components.coders,
                  context_components.coders)
    set_proto_map(pipeline_to_execute.components.windowing_strategies,
                  context_components.windowing_strategies)

    self._pipeline_proto_to_execute = pipeline_to_execute
    self._top_level_referenced_pcoll_ids = top_level_referenced_pcoll_ids
    self._top_level_required_transforms = top_level_required_transforms

  # -------------------------------------------------------------------------- #
  # Getters
  # -------------------------------------------------------------------------- #

  def pipeline_info(self):
    """Returns PipelineInfo of the original pipeline.
    """
    return self._pipeline_info

  def pipeline_proto_to_execute(self):
    """Returns Pipeline proto to be executed.
    """
    return self._pipeline_proto_to_execute

  def tl_referenced_pcoll_ids(self):
    """Returns a set of PCollection IDs referenced by top level PTransforms.
    """
    return self._top_level_referenced_pcoll_ids

  def tl_required_trans_ids(self):
    """Returns a list of required top level PTransform IDs, in insertion order.
    """
    return list(self._top_level_required_transforms)

  def caches_used(self):
    """Returns a set of PCollection IDs to read from cache.
    """
    return self._caches_used

  def read_cache_ids(self):
    """Returns a set of ReadCache PTransform IDs inserted.
    """
    return self._read_cache_ids

  def write_cache_ids(self):
    """Returns a set of WriteCache PTransform IDs inserted.
    """
    return self._write_cache_ids

  # -------------------------------------------------------------------------- #
  # Helper methods for _analyze_pipeline()
  # -------------------------------------------------------------------------- #

  def _insert_producing_transforms(self,
                                   pcoll_id,
                                   required_transforms,
                                   top_level_required_transforms,
                                   leaf=False):
    """Inserts PTransforms producing the given PCollection into the dicts.

    If a full cache exists for the PCollection (and leaf is False), a ReadCache
    transform is inserted as its producer. Otherwise the original producing
    transforms are inserted, after recursively inserting the producers of
    their inputs.

    Args:
      pcoll_id: (str)
      required_transforms: (Dict[str, PTransform proto])
      top_level_required_transforms: (Dict[str, PTransform proto])
      leaf: (bool) when True, the PCollection is never read from cache, even
        if a full cache exists for it.

    Modifies:
      required_transforms
      top_level_required_transforms
      self._analyzed_pcoll_ids
      self._caches_used
      self._read_cache_ids
    """
    # Each PCollection is analyzed at most once.
    if pcoll_id in self._analyzed_pcoll_ids:
      return
    self._analyzed_pcoll_ids.add(pcoll_id)

    cache_label = self._pipeline_info.cache_label(pcoll_id)
    if self._cache_manager.exists('full', cache_label) and not leaf:
      self._caches_used.add(pcoll_id)

      dummy_pcoll = (self._pipeline
                     | 'Load%s' % cache_label >> cache.ReadCache(
                         self._cache_manager, cache_label))

      read_cache = self._top_level_producer(dummy_pcoll)
      read_cache_id = self._context.transforms.get_id(read_cache)
      read_cache_proto = read_cache.to_runner_api(self._context)
      # Rewire the ReadCache output to the ID of the PCollection it replaces.
      read_cache_proto.outputs['None'] = pcoll_id
      top_level_required_transforms[read_cache_id] = read_cache_proto
      self._read_cache_ids.add(read_cache_id)

      for transform in self._include_subtransforms(read_cache):
        transform_id = self._context.transforms.get_id(transform)
        transform_proto = transform.to_runner_api(self._context)
        # Any sub transform emitting the dummy PCollection is rewired too.
        if dummy_pcoll in transform.outputs.values():
          transform_proto.outputs['None'] = pcoll_id
        required_transforms[transform_id] = transform_proto

    else:
      pcoll = self._context.pcollections.get_by_id(pcoll_id)

      top_level_transform = self._top_level_producer(pcoll)
      for transform in self._include_subtransforms(top_level_transform):
        transform_id = self._context.transforms.get_id(transform)
        transform_proto = self._context.transforms.get_proto(transform)

        # Inserting ancestor PTransforms.
        for input_id in transform_proto.inputs.values():
          self._insert_producing_transforms(input_id,
                                            required_transforms,
                                            top_level_required_transforms)
        required_transforms[transform_id] = transform_proto

      # Must be inserted after inserting ancestor PTransforms.
      top_level_id = self._context.transforms.get_id(top_level_transform)
      top_level_proto = self._context.transforms.get_proto(top_level_transform)
      top_level_required_transforms[top_level_id] = top_level_proto

  def _insert_caching_transforms(self,
                                 pcoll_id,
                                 required_transforms,
                                 top_level_required_transforms,
                                 sample=False):
    """Inserts PTransforms caching the given PCollection into the dicts.

    Args:
      pcoll_id: (str)
      required_transforms: (Dict[str, PTransform proto])
      top_level_required_transforms: (Dict[str, PTransform proto])
      sample: (bool) whether to cache sample or cache full.

    Modifies:
      required_transforms
      top_level_required_transforms
      self._write_cache_ids
    """
    cache_label = self._pipeline_info.cache_label(pcoll_id)
    pcoll = self._context.pcollections.get_by_id(pcoll_id)

    if sample:
      pdone = pcoll | 'CacheSample%s' % cache_label >> cache.WriteCache(
          self._cache_manager, cache_label, sample=True,
          sample_size=10)
    else:
      pdone = pcoll | 'CacheFull%s' % cache_label >> cache.WriteCache(
          self._cache_manager, cache_label)

    write_cache = self._top_level_producer(pdone)
    write_cache_id = self._context.transforms.get_id(write_cache)
    write_cache_proto = write_cache.to_runner_api(self._context)
    top_level_required_transforms[write_cache_id] = write_cache_proto
    self._write_cache_ids.add(write_cache_id)

    for transform in self._include_subtransforms(write_cache):
      transform_id = self._context.transforms.get_id(transform)
      transform_proto = transform.to_runner_api(self._context)
      required_transforms[transform_id] = transform_proto

  def _desired_pcollections(self, pipeline_info):
    """Returns IDs of desired (queried or leaf) PCollections.

    Args:
      pipeline_info: (PipelineInfo) info of the original pipeline.

    Returns:
      (Set[str]) a set of PCollections IDs of either leaf PCollections or
      PCollections referenced by the user. These PCollections should be cached
      at the end of pipeline execution.
    """
    desired_pcollections = set(pipeline_info.leaf_pcollections())
    for pcoll_id in pipeline_info.all_pcollections():
      if pipeline_info.cache_label(pcoll_id) in self._desired_cache_labels:
        desired_pcollections.add(pcoll_id)
    return desired_pcollections

  def _referenced_pcoll_ids(self, required_transforms):
    """Returns PCollection IDs referenced in the given transforms.

    Args:
      required_transforms: (Dict[str, PTransform proto]) mapping ID to protos.

    Returns:
      (Set[str]) PCollection IDs referenced as either input or output in the
        given transforms.
    """
    referenced_pcoll_ids = set()
    for transform_proto in required_transforms.values():
      referenced_pcoll_ids.update(transform_proto.inputs.values())
      referenced_pcoll_ids.update(transform_proto.outputs.values())
    return referenced_pcoll_ids

  def _top_level_producer(self, pcoll):
    """Given a PCollection, returns the top level producing PTransform.

    Walks up the parent links from the producer until it reaches the transform
    whose parent has no parent of its own.

    Args:
      pcoll: (PCollection)

    Returns:
      (AppliedPTransform) top level producing AppliedPTransform of pcoll.
    """
    top_level_transform = pcoll.producer
    while top_level_transform.parent.parent:
      top_level_transform = top_level_transform.parent
    return top_level_transform

  def _include_subtransforms(self, transform):
    """Depth-first yield the PTransform itself and its sub transforms.

    The parts of each transform are visited in reverse order.

    Args:
      transform: (AppliedPTransform)

    Yields:
      The input AppliedPTransform itself and all its sub transforms.
    """
    yield transform
    for subtransform in transform.parts[::-1]:
      for yielded in self._include_subtransforms(subtransform):
        yield yielded


class PipelineInfo(object):
  """Provides access to pipeline metadata.

  Built from a components proto, it indexes which leaf PTransform produces and
  which leaf PTransforms consume each PCollection, and lazily computes a
  Derivation (and thus a cache label) per PCollection.
  """

  def __init__(self, proto):
    """Indexes producers and consumers of every PCollection.

    Args:
      proto: components proto exposing `transforms` and `pcollections` maps.
    """
    self._proto = proto
    self._producers = {}
    self._consumers = collections.defaultdict(list)
    for transform_id, transform_proto in self._proto.transforms.items():
      # Composite transforms are skipped; only leaf transforms are indexed.
      if transform_proto.subtransforms:
        continue
      # Identify producers of each PCollection. A PTransform is a producer of
      # a PCollection if it outputs the PCollection but does not consume the
      # same PCollection as input. The latter part of the definition is to avoid
      # infinite recursions when constructing the PCollection's derivation.
      transform_inputs = set(transform_proto.inputs.values())
      for tag, pcoll_id in transform_proto.outputs.items():
        if pcoll_id in transform_inputs:
          # A transform is not the producer of a PCollection if it consumes the
          # PCollection as an input.
          continue
        self._producers[pcoll_id] = transform_id, tag
      for pcoll_id in transform_inputs:
        self._consumers[pcoll_id].append(transform_id)
    # Memoized Derivation per PCollection ID, filled by _derivation().
    self._derivations = {}

  def all_pcollections(self):
    """Returns the IDs of all PCollections in the components proto."""
    return self._proto.pcollections.keys()

  def leaf_pcollections(self):
    """Yields the IDs of PCollections that no leaf PTransform consumes."""
    for pcoll_id in self._proto.pcollections:
      # .get() avoids inserting empty entries into the defaultdict.
      if not self._consumers.get(pcoll_id):
        yield pcoll_id

  def producer(self, pcoll_id):
    """Returns (transform_id, output_tag) of the PCollection's producer.

    Raises:
      KeyError: if no producer was recorded for pcoll_id.
    """
    return self._producers[pcoll_id]

  def cache_label(self, pcoll_id):
    """Returns the cache label given the PCollection ID."""
    return self._derivation(pcoll_id).cache_label()

  def _derivation(self, pcoll_id):
    """Returns the (memoized) Derivation of the given PCollection.

    The Derivation is built recursively from the Derivations of the inputs of
    the producing PTransform.

    Raises:
      KeyError: if no producer was recorded for pcoll_id or one of its
        ancestors.
    """
    if pcoll_id not in self._derivations:
      transform_id, output_tag = self._producers[pcoll_id]
      transform_proto = self._proto.transforms[transform_id]
      self._derivations[pcoll_id] = self.Derivation({
          input_tag: self._derivation(input_id)
          for input_tag, input_id in transform_proto.inputs.items()
      }, transform_proto, output_tag)
    return self._derivations[pcoll_id]

  class Derivation(object):
    """Records derivation info of a PCollection. Helper for PipelineInfo."""

    def __init__(self, inputs, transform_proto, output_tag):
      """Constructor of Derivation.

      Args:
        inputs: (Dict[str, Derivation]) maps PCollection names to Derivations.
        transform_proto: (Transform proto) the producing PTransform.
        output_tag: (str) local name of the PCollection in analysis.
      """
      self._inputs = inputs
      self._transform_info = {
          # TODO(qinyeli): remove name field when collision is resolved.
          'name': transform_proto.unique_name,
          'urn': transform_proto.spec.urn,
          'payload': transform_proto.spec.payload.decode('latin1')
      }
      self._output_tag = output_tag
      self._hash = None

    def __eq__(self, other):
      # Defer to the other operand for foreign types instead of returning None.
      if not isinstance(other, PipelineInfo.Derivation):
        return NotImplemented
      # pylint: disable=protected-access
      # The output tag is compared too because it takes part in __hash__;
      # equal objects must have equal hashes.
      return (self._inputs == other._inputs and
              self._transform_info == other._transform_info and
              self._output_tag == other._output_tag)

    def __ne__(self, other):
      # TODO(BEAM-5949): Needed for Python 2 compatibility.
      return not self == other

    def __hash__(self):
      # Computed once and memoized; combines the transform info, the hashes of
      # the input derivations weighted by their tags, and the output tag.
      if self._hash is None:
        self._hash = (hash(tuple(sorted(self._transform_info.items())))
                      + sum(hash(tag) * hash(derivation)
                            for tag, derivation in self._inputs.items())
                      + hash(self._output_tag))
      return self._hash

    def cache_label(self):
      """Returns the label 'Pcoll-<hex>' derived from this object's hash."""
      # TODO(qinyeli): Collision resistance?
      return 'Pcoll-%x' % abs(hash(self))

    def json(self):
      """Returns a dict with the inputs, transform info and output tag."""
      return {
          'inputs': self._inputs,
          'transform': self._transform_info,
          'output_tag': self._output_tag
      }

    def __repr__(self):
      return str(self.json())


# TODO(qinyeli) move to proto_utils
def set_proto_map(proto_map, new_value):
  """Replaces the content of a proto map with the given entries.

  The map is cleared first, then every value is copied in with CopyFrom, so
  the map ends up holding copies rather than the objects passed in.

  Args:
    proto_map: map whose entries are created on item access and support
      CopyFrom (presumably a protobuf message map -- confirm against callers).
    new_value: (Dict[str, proto]) entries to copy into proto_map.
  """
  proto_map.clear()
  for key, value in new_value.items():
    proto_map[key].CopyFrom(value)