Source code for apache_beam.testing.benchmarks.nexmark.queries.query10

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Query 10, 'Log to sharded files' (Not in original suite.)

Every window_size_sec, save all events from the last period into
NUM_SHARD_PER_WORKER * max_num_workers log files.
"""

import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.transforms import trigger
from apache_beam.transforms import window
from apache_beam.utils.timestamp import Duration

# Number of log shards per worker; multiplied by max_num_workers below to
# obtain num_log_shards.
NUM_SHARD_PER_WORKER = 5
# Delay handed to trigger.AfterProcessingTime in load()
# (presumably seconds -- confirm against the Beam trigger API).
LATE_BATCHING_PERIOD = 10

# Base path used to build the LOG-* and INDEX-* file names. While it is None
# (or otherwise falsy), output_file_for() and index_path_for() produce no file
# name and the write DoFns skip their write branch.
output_path = None
# Worker count used only to derive num_log_shards.
max_num_workers = 5

# Total number of shards that ShardEventsDoFn distributes events over.
num_log_shards = NUM_SHARD_PER_WORKER * max_num_workers


class OutputFile(object):
  """Plain record describing one log file for a (window, shard, pane).

  The constructor stores its arguments unchanged; no validation is done.

  Attributes:
    max_timestamp: max timestamp of the window the file belongs to.
    shard: name of the shard whose events the file holds.
    index: index of the pane that produced the file.
    timing: timing of the pane that produced the file.
    filename: path of the file, or None when no output path is configured.
  """
  def __init__(self, max_timestamp, shard, index, timing, filename):
    self.max_timestamp = max_timestamp
    self.shard = shard
    self.index = index
    self.timing = timing
    self.filename = filename

  def __repr__(self):
    # Readable representation for logs and debugging output.
    return (
        '%s(max_timestamp=%r, shard=%r, index=%r, timing=%r, filename=%r)' % (
            type(self).__name__,
            self.max_timestamp,
            self.shard,
            self.index,
            self.timing,
            self.filename))


def open_writable_gcs_file(options, filename):
  """Placeholder for opening a writable GCS file; currently a no-op.

  Args:
    options: options object supplied by the caller (currently unused).
    filename: path of the file that would be opened (currently unused).

  Returns:
    None. Nothing is opened, so callers have no handle to write to.
  """
  # TODO: [https://github.com/apache/beam/issues/20670] it seems that beam team
  #   has not yet decided about this method and it is left blank and
  #   unspecified.
  return None


def output_file_for(window, shard, pane):
  """
  Builds the OutputFile descriptor for one pane of one shard in a window.

  Args:
    window: window the pane belongs to; only its max_timestamp() is used.
      Inside this function the name shadows the imported ``window`` module.
    shard: shard name, embedded verbatim in the file name.
    pane: pane info; its ``index`` and ``timing`` attributes are read.

  Returns:
    an OutputFile object constructed with pane, window and shard. Its
      filename is None when the module-level output_path is not set.
  """
  max_timestamp = window.max_timestamp()
  if output_path:
    filename = '%s/LOG-%s-%s-%03d-%s' % (
        output_path, max_timestamp, shard, pane.index, pane.timing)
  else:
    # No output location configured: keep the descriptor but without a path.
    filename = None
  return OutputFile(max_timestamp, shard, pane.index, pane.timing, filename)


def index_path_for(window):
  """
  Builds the path of the index file for a window.

  Args:
    window: window whose max_timestamp() is embedded in the file name. Inside
      this function the name shadows the imported ``window`` module.

  Returns:
    path to the index file containing all shard names or None if no output_path
      is set
  """
  if not output_path:
    return None
  return '%s/INDEX-%s' % (output_path, window.max_timestamp())


def load(events, metadata=None, pipeline_options=None):
  """
  Builds the Query 10 pipeline on top of ``events``.

  Events are keyed by shard, windowed, grouped and passed to WriteEventDoFn;
  the resulting OutputFile records are windowed and grouped again and passed
  to WriteIndexDoFn.

  Args:
    events: input collection of events the transforms are applied to.
    metadata: mapping read with ``.get`` for the keys 'window_size_sec' and
      'max_log_events'. Despite the None default it is required: passing None
      raises AttributeError.
    pipeline_options: forwarded as an extra argument to WriteEventDoFn and
      WriteIndexDoFn, which call ``view_as`` on it.

  Returns:
    the output of the final 'query10_write_index' ParDo.
  """
  # Read the configuration once, before any transform is applied, so that a
  # missing metadata mapping fails immediately.
  window_size_sec = metadata.get('window_size_sec')
  max_log_events = metadata.get('max_log_events')
  # Use a 1 day allowed lateness so that any forgotten hold will stall
  # the pipeline for that period and be very noticeable.
  allowed_lateness_sec = 1 * 24 * 60 * 60
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The trigger runs its two sub-triggers in order.
      # First:  repeatedly fire after at least max_log_events elements in the
      #         pane, until the watermark passes the end of the window.
      # Then:   repeatedly fire after at least max_log_events elements in the
      #         pane, or once processing time passes the first element in the
      #         pane + LATE_BATCHING_PERIOD.
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(window_size_sec),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(trigger.AfterCount(max_log_events)),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(max_log_events),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(allowed_lateness_sec))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(window_size_sec),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(allowed_lateness_sec))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))


class ShardEventsDoFn(beam.DoFn):
  """Keys every element by a shard name derived from the element's hash."""
  def process(self, element):
    """Yields ``(shard_name, element)`` for the given element.

    The shard number is ``hash(element)`` reduced modulo the module-level
    num_log_shards, formatted as 'shard-NNNNN-of-NNNNN'.
    """
    # NOTE(review): hash() of str/bytes-based values differs between Python
    # processes unless PYTHONHASHSEED is fixed; if the element's hash derives
    # from such values, shard assignment may differ between workers -- confirm.
    # abs() is a no-op while num_log_shards is positive, because Python's %
    # already takes the sign of the divisor.
    shard_number = abs(hash(element) % num_log_shards)
    shard = 'shard-%05d-of-%05d' % (shard_number, num_log_shards)
    yield shard, element


class WriteEventDoFn(beam.DoFn):
  """Turns one grouped (shard, events) pane into an OutputFile record.

  The actual file write is a placeholder, since open_writable_gcs_file does
  nothing.
  """
  def process(
      self,
      element,
      pipeline_options,
      window=beam.DoFn.WindowParam,
      pane_info=beam.DoFn.PaneInfoParam):
    """Yields ``(None, OutputFile)`` for the pane being processed.

    Args:
      element: pair whose first item is the shard name and whose second item
        is the iterable of events for that shard.
      pipeline_options: options object; viewed as GoogleCloudOptions.
      window: window of the element, supplied through DoFn.WindowParam.
      pane_info: pane of the element, supplied through DoFn.PaneInfoParam.
    """
    shard = element[0]
    options = pipeline_options.view_as(GoogleCloudOptions)
    output_file = output_file_for(window, shard, pane_info)
    if output_file.filename:
      # not do anything because open_writable_gcs_file does not do anything
      open_writable_gcs_file(options, output_file.filename)
      for event in element[1]:  # pylint: disable=unused-variable
        # write to file
        pass
    # Every record gets the same None key, so the following GroupByKey
    # collects all of a window's files into a single group.
    yield None, output_file


class WriteIndexDoFn(beam.DoFn):
  """Placeholder for writing the per-window index of log files.

  The actual file write is a placeholder, since open_writable_gcs_file does
  nothing. Nothing is emitted downstream.
  """
  def process(self, element, pipeline_options, window=beam.DoFn.WindowParam):
    """Visits the grouped output files of one window; returns None.

    Args:
      element: pair whose second item is the iterable of grouped values
        (the OutputFile records emitted by WriteEventDoFn).
      pipeline_options: options object; viewed as GoogleCloudOptions.
      window: window of the element, supplied through DoFn.WindowParam.
    """
    options = pipeline_options.view_as(GoogleCloudOptions)
    filename = index_path_for(window)
    if filename:
      # not do anything because open_writable_gcs_file does not do anything
      open_writable_gcs_file(options, filename)
      for output_file in element[1]:  # pylint: disable=unused-variable
        # write to file
        pass