#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Module for tracking a chain of beam_sql magics applied.

For internal use only; no backwards-compatibility guarantees.
"""

# pytype: skip-file

import importlib
import logging
from dataclasses import dataclass
from typing import Any
from typing import Dict
from typing import Optional
from typing import Set
from typing import Union

import apache_beam as beam
from apache_beam.internal import pickler
from apache_beam.runners.interactive.sql.utils import register_coder_for_schema
from apache_beam.runners.interactive.utils import create_var_in_main
from apache_beam.runners.interactive.utils import pcoll_by_name
from apache_beam.runners.interactive.utils import progress_indicated
from apache_beam.transforms.sql import SqlTransform
from apache_beam.utils.interactive_utils import is_in_ipython

_LOGGER = logging.getLogger(__name__)


@dataclass
class SqlNode:
  """Each SqlNode represents a beam_sql magic applied.

  Attributes:
    output_name: the watched unique name of the beam_sql output. Can be used
      as an identifier.
    source: the inputs consumed by this node. Can be a pipeline or a set of
      PCollections represented by their variable names watched. When it's a
      pipeline, the node computes from raw values in the query, so the output
      can be consumed by any SqlNode in any SqlChain.
    query: the SQL query applied by this node.
    schemas: the schemas (NamedTuple classes) used by this node.
    evaluated: the pipelines this node has been evaluated for.
    next: the next SqlNode applied chronologically.
    execution_count: the execution count if in an IPython env.
  """
  output_name: str
  source: Union[beam.Pipeline, Set[str]]
  query: str
  schemas: Set[Any] = None
  evaluated: Set[beam.Pipeline] = None
  next: Optional['SqlNode'] = None
  execution_count: int = 0

  def __post_init__(self):
    if not self.schemas:
      self.schemas = set()
    if not self.evaluated:
      self.evaluated = set()
    if is_in_ipython():
      from IPython import get_ipython
      self.execution_count = get_ipython().execution_count

  def __hash__(self):
    return hash(
        (self.output_name, self.source, self.query, self.execution_count))

  def to_pipeline(self, pipeline: Optional[beam.Pipeline]) -> beam.Pipeline:
    """Converts the chain into an executable pipeline."""
    if pipeline not in self.evaluated:
      # The whole chain should form a single pipeline.
      source = self.source
      if isinstance(self.source, beam.Pipeline):
        if pipeline:  # use the known pipeline
          source = pipeline
        else:  # use the source pipeline
          pipeline = self.source
      else:
        name_to_pcoll = pcoll_by_name()
        if len(self.source) == 1:
          source = name_to_pcoll.get(next(iter(self.source)))
        else:
          source = {s: name_to_pcoll.get(s) for s in self.source}
      if isinstance(source, beam.Pipeline):
        output = source | 'beam_sql_{}_{}'.format(
            self.output_name, self.execution_count) >> SqlTransform(self.query)
      else:
        output = source | 'schema_loaded_beam_sql_{}_{}'.format(
            self.output_name, self.execution_count
        ) >> SchemaLoadedSqlTransform(
            self.output_name, self.query, self.schemas, self.execution_count)
      _ = create_var_in_main(self.output_name, output)
      self.evaluated.add(pipeline)
    if self.next:
      return self.next.to_pipeline(pipeline)
    else:
      return pipeline
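

# A minimal usage sketch (hypothetical, not part of the module API): a SqlNode
# whose source is a set of watched PCollection variable names. It assumes an
# interactive session where a schema-aware PCollection assigned to a variable
# named 'items' has already been defined and watched; all names and the query
# are illustrative.
def _example_sql_node_usage(user_pipeline):
  node = SqlNode(
      output_name='cheap_items',
      source={'items'},  # consume the watched PCollection named 'items'
      query='SELECT * FROM PCOLLECTION WHERE price < 10')
  # Looks up 'items' via pcoll_by_name(), applies the SQL against it on the
  # user pipeline, and binds the output to 'cheap_items' in __main__.
  return node.to_pipeline(pipeline=user_pipeline)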


class SchemaLoadedSqlTransform(beam.PTransform):
  """PTransform that loads schema before executing SQL.

  When submitting a pipeline to remote runner for execution, schemas defined
  in the main module are not available without save_main_session. However,
  save_main_session might fail when there is anything unpicklable. This DoFn
  makes sure only the schemas needed are pickled locally and restored later
  on workers.
  """
  def __init__(self, output_name, query, schemas, execution_count):
    self.output_name = output_name
    self.query = query
    self.schemas = schemas
    self.execution_count = execution_count
    # TODO(BEAM-8123): clean up this attribute or the whole wrapper PTransform.
    # Dill does not preserve everything. On the other hand, save_main_session
    # is not stable. Until cloudpickle replaces dill in Beam, we work around
    # it by explicitly pickling annotations and load schemas in remote main
    # sessions.
    self.schema_annotations = [s.__annotations__ for s in self.schemas]

  class _SqlTransformDoFn(beam.DoFn):
    """The DoFn yields all its input without any transform but a setup to
    configure the main session.
    """
    def __init__(self, schemas, annotations):
      self.pickled_schemas = [pickler.dumps(s) for s in schemas]
      self.pickled_annotations = [pickler.dumps(a) for a in annotations]

    def setup(self):
      main_session = importlib.import_module('__main__')
      for pickled_schema, pickled_annotation in zip(
          self.pickled_schemas, self.pickled_annotations):
        schema = pickler.loads(pickled_schema)
        schema.__annotations__ = pickler.loads(pickled_annotation)
        if not hasattr(main_session, schema.__name__) or not hasattr(
            getattr(main_session, schema.__name__), '__annotations__'):
          # Restore the schema in the main session on the [remote] worker.
          setattr(main_session, schema.__name__, schema)
        register_coder_for_schema(schema)

    def process(self, e):
      yield e

  def expand(self, source):
    """Applies the SQL transform.

    If a PCollection uses a schema defined in the main session, use the
    additional DoFn to restore it on the worker.
    """
    if isinstance(source, dict):
      schema_loaded = {
          tag: pcoll | 'load_schemas_{}_tag_{}_{}'.format(
              self.output_name, tag, self.execution_count) >> beam.ParDo(
                  self._SqlTransformDoFn(self.schemas, self.schema_annotations))
          if pcoll.element_type in self.schemas else pcoll
          for tag, pcoll in source.items()
      }
    elif isinstance(source, beam.pvalue.PCollection):
      schema_loaded = source | 'load_schemas_{}_{}'.format(
          self.output_name, self.execution_count) >> beam.ParDo(
              self._SqlTransformDoFn(self.schemas, self.schema_annotations)
          ) if source.element_type in self.schemas else source
    else:
      raise ValueError(
          '{} should be either a single PCollection or a dict of named '
          'PCollections.'.format(source))
    return schema_loaded | 'beam_sql_{}_{}'.format(
        self.output_name, self.execution_count) >> SqlTransform(self.query)
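

# A minimal sketch (hypothetical, not part of the module API) of applying
# SchemaLoadedSqlTransform directly to a PCollection whose element type is a
# NamedTuple schema; SqlNode.to_pipeline normally builds this wiring. The
# 'Item' schema, the query, and the names are illustrative assumptions.
def _example_schema_loaded_sql():
  import typing

  class Item(typing.NamedTuple):
    id: int
    price: float

  beam.coders.registry.register_coder(Item, beam.coders.RowCoder)
  p = beam.Pipeline()
  items = (
      p
      | beam.Create([Item(id=1, price=2.5)]).with_output_types(Item))
  # The inner _SqlTransformDoFn re-creates the Item schema in the workers'
  # __main__ session before SqlTransform runs the query; a single input
  # PCollection is referenced as the PCOLLECTION table in Beam SQL.
  return items | SchemaLoadedSqlTransform(
      'cheap_items',
      'SELECT id, price FROM PCOLLECTION WHERE price < 10',
      {Item},
      0)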


@dataclass
class SqlChain:
  """A chain of SqlNodes.

  Attributes:
    nodes: all nodes by their output_names.
    root: the first SqlNode applied chronologically.
    current: the last node applied.
    user_pipeline: the user-defined pipeline this chain originates from. If
      None, the whole chain just computes from raw values in queries.
      Otherwise, at least some of the nodes in the chain have queried against
      PCollections.
  """
  nodes: Dict[str, SqlNode] = None
  root: Optional[SqlNode] = None
  current: Optional[SqlNode] = None
  user_pipeline: Optional[beam.Pipeline] = None

  def __post_init__(self):
    if not self.nodes:
      self.nodes = {}

  @progress_indicated
  def to_pipeline(self) -> beam.Pipeline:
    """Converts the chain into a beam pipeline."""
    pipeline_to_execute = self.root.to_pipeline(self.user_pipeline)
    # The pipeline definitely contains an external transform: SqlTransform.
    pipeline_to_execute.contains_external_transforms = True
    return pipeline_to_execute

  def append(self, node: SqlNode) -> 'SqlChain':
    """Appends a node to the chain."""
    if self.current:
      self.current.next = node
    else:
      self.root = node
    self.current = node
    self.nodes[node.output_name] = node
    return self

  def get(self, output_name: str) -> Optional[SqlNode]:
    """Gets a node from the chain based on the given output_name."""
    return self.nodes.get(output_name, None)
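

# A minimal sketch (hypothetical, not part of the module API) of chaining two
# beam_sql nodes: the first computes a PCollection from raw values, the second
# queries the first node's output. The output names, queries, and the
# assumption that 'orders' gets watched via create_var_in_main are
# illustrative.
def _example_sql_chain_usage():
  p = beam.Pipeline()
  chain = SqlChain(user_pipeline=p)
  chain.append(
      SqlNode(
          output_name='orders',
          source=p,
          query='SELECT CAST(1 AS INT) AS id, CAST(5.0 AS DOUBLE) AS price'))
  chain.append(
      SqlNode(
          output_name='expensive_orders',
          source={'orders'},
          # A single input PCollection is exposed to SqlTransform as the
          # PCOLLECTION table.
          query='SELECT * FROM PCOLLECTION WHERE price > 3'))
  # Walks the chain from the root and returns one executable pipeline that
  # contains both SqlTransform applications.
  return chain.to_pipeline()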