Source code for apache_beam.runners.interactive.sql.utils
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Module of utilities for SQL magics.
For internal use only; no backward-compatibility guarantees.
"""
# pytype: skip-file
import logging
import os
import tempfile
from dataclasses import dataclass
from typing import Any
from typing import Callable
from typing import Dict
from typing import NamedTuple
from typing import Optional
from typing import Type
from typing import Union
import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.runners.interactive.utils import create_var_in_main
from apache_beam.runners.interactive.utils import progress_indicated
from apache_beam.runners.runner import create_runner
from apache_beam.typehints.native_type_compatibility import match_is_named_tuple
from apache_beam.utils.interactive_utils import is_in_ipython
_LOGGER = logging.getLogger(__name__)
[docs]def register_coder_for_schema(
schema: NamedTuple, verbose: bool = False) -> None:
"""Registers a RowCoder for the given schema if hasn't.
Notifies the user of what code has been implicitly executed.
"""
assert match_is_named_tuple(schema), (
'Schema %s is not a typing.NamedTuple.' % schema)
coder = beam.coders.registry.get_coder(schema)
if not isinstance(coder, beam.coders.RowCoder):
if verbose:
_LOGGER.warning(
'Schema %s has not been registered to use a RowCoder. '
'Automatically registering it by running: '
'beam.coders.registry.register_coder(%s, '
'beam.coders.RowCoder)',
schema.__name__,
schema.__name__)
beam.coders.registry.register_coder(schema, beam.coders.RowCoder)
[docs]def find_pcolls(
sql: str,
pcolls: Dict[str, beam.PCollection],
verbose: bool = False) -> Dict[str, beam.PCollection]:
"""Finds all PCollections used in the given sql query.
It does a simple word by word match and calls ib.collect for each PCollection
found.
"""
found = {}
for word in sql.split():
if word in pcolls:
found[word] = pcolls[word]
if found:
if verbose:
_LOGGER.info('Found PCollections used in the magic: %s.', found)
_LOGGER.info('Collecting data...')
return found
[docs]def replace_single_pcoll_token(sql: str, pcoll_name: str) -> str:
"""Replaces the pcoll_name used in the sql with 'PCOLLECTION'.
For sql query using only a single PCollection, the PCollection needs to be
referred to as 'PCOLLECTION' instead of its variable/tag name.
"""
words = sql.split()
token_locations = []
i = 0
for word in words:
if word.lower() == 'from':
token_locations.append(i + 1)
i += 2
continue
i += 1
for token_location in token_locations:
if token_location < len(words) and words[token_location] == pcoll_name:
words[token_location] = 'PCOLLECTION'
return ' '.join(words)
[docs]def pformat_namedtuple(schema: NamedTuple) -> str:
return '{}({})'.format(
schema.__name__,
', '.join([
'{}: {}'.format(k, v.__name__ if hasattr(v, '__name__') else repr(v))
for k,
v in schema.__annotations__.items()
]))
[docs]def pformat_dict(raw_input: Dict[str, Any]) -> str:
return '{{\n{}\n}}'.format(
',\n'.join(['{}: {}'.format(k, v) for k, v in raw_input.items()]))
[docs]@dataclass
class OptionsEntry:
"""An entry of PipelineOptions that can be visualized through ipywidgets to
take inputs in IPython notebooks interactively.
Attributes:
label: The value of the Label widget.
help: The help message of the entry, usually the same to the help in
PipelineOptions.
cls: The PipelineOptions class/subclass the options belong to.
arg_builder: Builds the argument/option. If it's a str, this entry
assigns the input ipywidget's value directly to the argument. If it's a
Dict, use the corresponding Callable to assign the input value to each
argument. If Callable is None, fallback to assign the input value
directly. This allows building multiple similar PipelineOptions
arguments from a single input, such as staging_location and
temp_location in GoogleCloudOptions.
default: The default value of the entry, None if absent.
"""
label: str
help: str
cls: Type[PipelineOptions]
arg_builder: Union[str, Dict[str, Optional[Callable]]]
default: Optional[str] = None
def __post_init__(self):
# The attribute holds an ipywidget, currently only supports Text.
# The str value can be accessed by self.input.value.
self.input = None
[docs]class OptionsForm:
"""A form visualized to take inputs from users in IPython Notebooks and
generate PipelineOptions to run pipelines.
"""
def __init__(self):
self.options = PipelineOptions()
self.entries = []
[docs] def add(self, entry: OptionsEntry) -> 'OptionsForm':
"""Adds an OptionsEntry to the form.
"""
self.entries.append(entry)
return self
[docs] def to_options(self) -> PipelineOptions:
"""Builds the PipelineOptions based on user inputs.
Can only be invoked after display_for_input.
"""
for entry in self.entries:
assert entry.input, (
'to_options invoked before display_for_input. '
'Wrong usage.')
view = self.options.view_as(entry.cls)
if isinstance(entry.arg_builder, str):
setattr(view, entry.arg_builder, entry.input.value)
else:
for arg, builder in entry.arg_builder.items():
if builder:
setattr(view, arg, builder(entry.input.value))
else:
setattr(view, arg, entry.input.value)
self.additional_options()
return self.options
[docs] def display_for_input(self) -> 'OptionsForm':
"""Displays the widgets to take user inputs."""
from IPython.display import display
from ipywidgets import GridBox
from ipywidgets import Label
from ipywidgets import Layout
from ipywidgets import Text
widgets = []
for entry in self.entries:
text_label = Label(value=entry.label)
text_input = entry.input if entry.input else Text(
value=entry.default if entry.default else '')
text_help = Label(value=entry.help)
entry.input = text_input
widgets.append(text_label)
widgets.append(text_input)
widgets.append(text_help)
grid = GridBox(widgets, layout=Layout(grid_template_columns='1fr 2fr 6fr'))
display(grid)
self.display_actions()
return self
[docs] def display_actions(self):
"""Displays actionable widgets to utilize the options, run pipelines and
etc."""
pass
[docs]class DataflowOptionsForm(OptionsForm):
"""A form to take inputs from users in IPython Notebooks to build
PipelineOptions to run pipelines on Dataflow.
Only contains minimum fields needed.
"""
@staticmethod
def _build_default_project() -> str:
"""Builds a default project id."""
try:
# pylint: disable=c-extension-no-member
import google.auth
return google.auth.default()[1]
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
_LOGGER.warning('There is some issue with your gcloud auth: %s', e)
return 'your-project-id'
@staticmethod
def _build_req_file_from_pkgs(pkgs) -> Optional[str]:
"""Builds a requirements file that contains all additional PYPI packages
needed."""
if pkgs:
deps = pkgs.split(',')
req_file = os.path.join(
tempfile.mkdtemp(prefix='beam-sql-dataflow-'), 'req.txt')
with open(req_file, 'a') as f:
for dep in deps:
f.write(dep.strip() + '\n')
return req_file
return None
def __init__(
self,
output_name: str,
output_pcoll: beam.PCollection,
verbose: bool = False):
"""Inits the OptionsForm for setting up Dataflow jobs."""
super().__init__()
self.p = output_pcoll.pipeline
self.output_name = output_name
self.output_pcoll = output_pcoll
self.verbose = verbose
self.notice_shown = False
self.add(
OptionsEntry(
label='Project Id',
help='Name of the Cloud project owning the Dataflow job.',
cls=GoogleCloudOptions,
arg_builder='project',
default=DataflowOptionsForm._build_default_project())
).add(
OptionsEntry(
label='Region',
help='The Google Compute Engine region for creating Dataflow job.',
cls=GoogleCloudOptions,
arg_builder='region',
default='us-central1')
).add(
OptionsEntry(
label='GCS Bucket',
help=(
'GCS path to stage code packages needed by workers and save '
'temporary workflow jobs.'),
cls=GoogleCloudOptions,
arg_builder={
'staging_location': lambda x: x + '/staging',
'temp_location': lambda x: x + '/temp'
},
default='gs://YOUR_GCS_BUCKET_HERE')
).add(
OptionsEntry(
label='Additional Packages',
help=(
'PYPI packages installed, comma-separated. If None, leave '
'this field empty.'),
cls=SetupOptions,
arg_builder={
'requirements_file': lambda x: DataflowOptionsForm.
_build_req_file_from_pkgs(x)
},
default=''))
[docs] def additional_options(self):
# Use the latest Java SDK by default.
sdk_overrides = self.options.view_as(
WorkerOptions).sdk_harness_container_image_overrides
override = '.*java.*,apache/beam_java11_sdk:latest'
if sdk_overrides and override not in sdk_overrides:
sdk_overrides.append(override)
else:
self.options.view_as(
WorkerOptions).sdk_harness_container_image_overrides = [override]
[docs] def display_actions(self):
from IPython.display import HTML
from IPython.display import display
from ipywidgets import Button
from ipywidgets import GridBox
from ipywidgets import Layout
from ipywidgets import Output
options_output_area = Output()
run_output_area = Output()
run_btn = Button(
description='Run on Dataflow',
button_style='success',
tooltip=(
'Submit to Dataflow for execution with the configured options. The '
'output PCollection\'s data will be written to the GCS bucket you '
'configure.'))
show_options_btn = Button(
description='Show Options',
button_style='info',
tooltip='Show current pipeline options configured.')
def _run_on_dataflow(btn):
with run_output_area:
run_output_area.clear_output()
@progress_indicated
def _inner():
options = self.to_options()
# Caches the output_pcoll to a GCS bucket.
try:
execution_count = 0
if is_in_ipython():
from IPython import get_ipython
execution_count = get_ipython().execution_count
output_location = '{}/{}'.format(
options.view_as(GoogleCloudOptions).staging_location,
self.output_name)
_ = self.output_pcoll | 'WriteOuput{}_{}ToGCS'.format(
self.output_name,
execution_count) >> WriteToText(output_location)
_LOGGER.info(
'Data of output PCollection %s will be written to %s',
self.output_name,
output_location)
except (KeyboardInterrupt, SystemExit):
raise
except: # pylint: disable=bare-except
# The transform has been added before, noop.
pass
if self.verbose:
_LOGGER.info(
'Running the pipeline on Dataflow with pipeline options %s.',
pformat_dict(options.display_data()))
result = create_runner('DataflowRunner').run_pipeline(self.p, options)
cloud_options = options.view_as(GoogleCloudOptions)
url = (
'https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s'
% (cloud_options.region, result.job_id(), cloud_options.project))
display(
HTML(
'Click <a href="%s" target="_new">here</a> for the details '
'of your Dataflow job.' % url))
result_name = 'result_{}'.format(self.output_name)
create_var_in_main(result_name, result)
if self.verbose:
_LOGGER.info(
'The pipeline result of the run can be accessed from variable '
'%s. The current status is %s.',
result_name,
result)
try:
btn.disabled = True
_inner()
finally:
btn.disabled = False
run_btn.on_click(_run_on_dataflow)
def _show_options(btn):
with options_output_area:
options_output_area.clear_output()
options = self.to_options()
options_name = 'options_{}'.format(self.output_name)
create_var_in_main(options_name, options)
_LOGGER.info(
'The pipeline options configured is: %s.',
pformat_dict(options.display_data()))
show_options_btn.on_click(_show_options)
grid = GridBox([run_btn, show_options_btn],
layout=Layout(grid_template_columns='repeat(2, 200px)'))
display(grid)
# Implicitly initializes the options variable before 1st time showing
# options.
options_name_inited, _ = create_var_in_main('options_{}'.format(
self.output_name), self.to_options())
if not self.notice_shown:
_LOGGER.info(
'The pipeline options can be configured through variable %s. You '
'may also add additional options or sink transforms such as write '
'to BigQuery in other notebook cells. Come back to click "Run on '
'Dataflow" button once you complete additional configurations. '
'Optionally, you can chain more beam_sql magics with DataflowRunner '
'and click "Run on Dataflow" in their outputs.',
options_name_inited)
self.notice_shown = True
display(options_output_area)
display(run_output_area)