# Source code for apache_beam.runners.dataflow.dataflow_job_service

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import logging
import sys

from apache_beam.portability.api import beam_job_api_pb2
from apache_beam.runners.dataflow import dataflow_runner
from apache_beam.runners.portability import local_job_service
from apache_beam.runners.portability import local_job_service_main
from apache_beam.runners.portability import portable_runner


class DataflowBeamJob(local_job_service.BeamJob):
  """A representation of a single Beam job to be run on the Dataflow runner.

  Overrides the runner invocation and cancellation of
  ``local_job_service.BeamJob`` so that the pipeline is handed to
  ``dataflow_runner.DataflowRunner`` and job state changes reported by
  Dataflow are forwarded through ``set_state``.
  """
  def _invoke_runner(self):
    """Actually calls Dataflow and waits for completion.

    The pipeline proto is run with a fresh ``DataflowRunner`` and the outcome
    is stored on ``self.result``. When that result has no truthy ``has_job``
    attribute the job is marked ``DONE`` right away; otherwise the Dataflow
    job is polled and each reported state is translated and passed to
    ``set_state``.

    Returns:
      The value returned by ``DataflowRunner.run_pipeline`` (may be None).
    """
    runner = dataflow_runner.DataflowRunner()
    self.result = runner.run_pipeline(
        None, self.pipeline_options(), self._pipeline_proto)
    # The result can be None if there is no need to send a request
    # to the service (e.g. template creation).
    # getattr() with a default covers both a None result and a result
    # object that lacks (or has a falsy) `has_job`.
    if not getattr(self.result, 'has_job', None):
      self.set_state(beam_job_api_pb2.JobState.DONE)
      return self.result
    # Prefer this to result.wait_until_finish() to get state updates
    # and avoid creating an extra thread (which also messes with logging).
    # The callback converts in two steps: Dataflow API state -> pipeline
    # state -> runner API state, which is what set_state() is given.
    dataflow_runner.DataflowRunner.poll_for_job_completion(
        runner,
        self.result,
        None,
        lambda dataflow_state: self.set_state(
            portable_runner.PipelineResult.pipeline_state_to_runner_api_state(
                self.result.api_jobstate_to_pipeline_state(dataflow_state))))
    return self.result

  def cancel(self):
    """Cancels the underlying job unless it already reached a terminal state.

    Delegates to ``self.result.cancel()``.
    """
    # NOTE(review): this dereferences self.result unconditionally; presumably
    # it is only reached after _invoke_runner() has stored a result -- confirm
    # what should happen if a cancel request arrives earlier.
    if not self.is_terminal_state(self.state):
      self.result.cancel()


def run(argv, beam_job_type=DataflowBeamJob):
  """Starts a job service that runs submitted Beam jobs as `beam_job_type`.

  Recognized flags:
    -p / --port / --job_port: integer port on which to serve the job api
      (default 0).
    --staging_dir: passed to ``LocalJobServicer`` as its first argument.

  Args:
    argv: Sequence of command-line arguments. A leading element equal to this
      module's ``__file__`` is dropped before parsing. An empty sequence is
      accepted and results in all defaults being used.
    beam_job_type: Job class forwarded to ``LocalJobServicer`` via its
      ``beam_job_type`` keyword.
  """
  # Guard against an empty argv: indexing argv[0] unconditionally would raise
  # IndexError for run([]).
  if argv and argv[0] == __file__:
    argv = argv[1:]
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-p',
      '--port',
      '--job_port',
      type=int,
      default=0,
      help='port on which to serve the job api')
  parser.add_argument('--staging_dir')
  options = parser.parse_args(argv)

  job_servicer = local_job_service.LocalJobServicer(
      options.staging_dir, beam_job_type=beam_job_type)
  # Report the port returned by the server rather than the requested one
  # (presumably they differ when the default 0 is requested -- confirm against
  # start_grpc_server).
  port = job_servicer.start_grpc_server(options.port)
  try:
    local_job_service_main.serve("Listening for beam jobs on port %d." % port)
  finally:
    # Always stop the servicer, whether serve() returns or raises.
    job_servicer.stop()


if __name__ == '__main__':
  # Script entry point: set up logging, then start the job service with the
  # process's command-line arguments.
  logging.basicConfig()
  # Set the level explicitly on the root logger so INFO applies even when
  # basicConfig() found handlers already installed and did nothing.
  root_logger = logging.getLogger()
  root_logger.setLevel(logging.INFO)
  run(sys.argv)