Source code for apache_beam.io.gcp.datastore.v1.helper

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Cloud Datastore helper functions.

For internal use only; no backwards-compatibility guarantees.
"""

import errno
import logging
import sys
import time
from builtins import object
from socket import error as SocketError

from future.builtins import next
from past.builtins import unicode

# pylint: disable=ungrouped-imports
from apache_beam.internal.gcp import auth
from apache_beam.utils import retry

# Protect against environments where datastore library is not available.
# pylint: disable=wrong-import-order, wrong-import-position
try:
  from google.cloud.proto.datastore.v1 import datastore_pb2
  from google.cloud.proto.datastore.v1 import entity_pb2
  from google.cloud.proto.datastore.v1 import query_pb2
  from google.rpc import code_pb2
  from googledatastore import PropertyFilter, CompositeFilter
  from googledatastore import helper as datastore_helper
  from googledatastore.connection import Datastore
  from googledatastore.connection import RPCError
except ImportError:
  pass
# pylint: enable=wrong-import-order, wrong-import-position

# pylint: enable=ungrouped-imports


[docs]def key_comparator(k1, k2):
  """A comparator for Datastore keys.

  Comparison is only valid for keys in the same partition. The comparison here
  is between the list of paths for each key.
  """

  if k1.partition_id != k2.partition_id:
    raise ValueError('Cannot compare keys with different partition ids.')

  k2_iter = iter(k2.path)

  for k1_path in k1.path:
    k2_path = next(k2_iter, None)
    if not k2_path:
      return 1

    result = compare_path(k1_path, k2_path)

    if result != 0:
      return result

  k2_path = next(k2_iter, None)
  if k2_path:
    return -1
  return 0


[docs]def compare_path(p1, p2):
  """A comparator for key path.

  A path has either an `id` or a `name` field defined. The
  comparison works with the following rules:

  1. If one path has `id` defined while the other doesn't, then the
  one with `id` defined is considered smaller.
  2. If both paths have `id` defined, then their ids are compared.
  3. If no `id` is defined for both paths, then their `names` are compared.
  """

  result = (p1.kind > p2.kind) - (p1.kind < p2.kind)
  if result != 0:
    return result

  if p1.HasField('id'):
    if not p2.HasField('id'):
      return -1

    return (p1.id > p2.id) - (p1.id < p2.id)

  if p2.HasField('id'):
    return 1

  return (p1.name > p2.name) - (p1.name < p2.name)


[docs]def get_datastore(project):
  """Returns a Cloud Datastore client."""
  credentials = auth.get_service_credentials()
  return Datastore(project, credentials, host='batch-datastore.googleapis.com')


[docs]def make_request(project, namespace, query):
  """Make a Cloud Datastore request for the given query."""
  req = datastore_pb2.RunQueryRequest()
  req.partition_id.CopyFrom(make_partition(project, namespace))

  req.query.CopyFrom(query)
  return req


[docs]def make_partition(project, namespace):
  """Make a PartitionId for the given project and namespace."""
  partition = entity_pb2.PartitionId()
  partition.project_id = project
  if namespace is not None:
    partition.namespace_id = namespace

  return partition


[docs]def retry_on_rpc_error(exception):
  """A retry filter for Cloud Datastore RPCErrors."""
  if isinstance(exception, RPCError):
    err_code = exception.code
    # TODO(BEAM-2156): put these codes in a global list and use that instead.
    return (err_code == code_pb2.DEADLINE_EXCEEDED or
            err_code == code_pb2.UNAVAILABLE or
            err_code == code_pb2.UNKNOWN or
            err_code == code_pb2.INTERNAL)

  if isinstance(exception, SocketError):
    return (exception.errno == errno.ECONNRESET or
            exception.errno == errno.ETIMEDOUT)

  return False


[docs]def fetch_entities(project, namespace, query, datastore):
  """A helper method to fetch entities from Cloud Datastore.

  Args:
    project: Project ID
    namespace: Cloud Datastore namespace
    query: Query to be read from
    datastore: Cloud Datastore Client

  Returns:
    An iterator of entities.
  """
  return QueryIterator(project, namespace, query, datastore)


[docs]def is_key_valid(key):
  """Returns True if a Cloud Datastore key is complete.

  A key is complete if its last element has either an id or a name.
  """
  if not key.path:
    return False
  return key.path[-1].HasField('id') or key.path[-1].HasField('name')


[docs]def write_mutations(datastore, project, mutations, throttler,
                    rpc_stats_callback=None, throttle_delay=1):
  """A helper function to write a batch of mutations to Cloud Datastore.

  If a commit fails, it will be retried upto 5 times. All mutations in the
  batch will be committed again, even if the commit was partially successful.
  If the retry limit is exceeded, the last exception from Cloud Datastore will
  be raised.

  Args:
    datastore: googledatastore.connection.Datastore
    project: str, project id
    mutations: list of google.cloud.proto.datastore.v1.datastore_pb2.Mutation
    rpc_stats_callback: a function to call with arguments `successes` and
        `failures` and `throttled_secs`; this is called to record successful
        and failed RPCs to Datastore and time spent waiting for throttling.
    throttler: AdaptiveThrottler, to use to select requests to be throttled.
    throttle_delay: float, time in seconds to sleep when throttled.

  Returns a tuple of:
    CommitResponse, the response from Datastore;
    int, the latency of the successful RPC in milliseconds.
  """
  commit_request = datastore_pb2.CommitRequest()
  commit_request.mode = datastore_pb2.CommitRequest.NON_TRANSACTIONAL
  commit_request.project_id = project
  for mutation in mutations:
    commit_request.mutations.add().CopyFrom(mutation)

  @retry.with_exponential_backoff(num_retries=5,
                                  retry_filter=retry_on_rpc_error)
  def commit(request):
    # Client-side throttling.
    while throttler.throttle_request(time.time()*1000):
      logging.info("Delaying request for %ds due to previous failures",
                   throttle_delay)
      time.sleep(throttle_delay)
      rpc_stats_callback(throttled_secs=throttle_delay)

    try:
      start_time = time.time()
      response = datastore.commit(request)
      end_time = time.time()

      rpc_stats_callback(successes=1)
      throttler.successful_request(start_time*1000)
      commit_time_ms = int((end_time-start_time)*1000)
      return response, commit_time_ms
    except (RPCError, SocketError):
      if rpc_stats_callback:
        rpc_stats_callback(errors=1)
      raise

  response, commit_time_ms = commit(commit_request)
  return response, commit_time_ms


[docs]def make_latest_timestamp_query(namespace):
  """Make a Query to fetch the latest timestamp statistics."""
  query = query_pb2.Query()
  if namespace is None:
    query.kind.add().name = '__Stat_Total__'
  else:
    query.kind.add().name = '__Stat_Ns_Total__'

  # Descending order of `timestamp`
  datastore_helper.add_property_orders(query, "-timestamp")
  # Only get the latest entity
  query.limit.value = 1
  return query


[docs]def make_kind_stats_query(namespace, kind, latest_timestamp):
  """Make a Query to fetch the latest kind statistics."""
  kind_stat_query = query_pb2.Query()
  if namespace is None:
    kind_stat_query.kind.add().name = '__Stat_Kind__'
  else:
    kind_stat_query.kind.add().name = '__Stat_Ns_Kind__'

  kind_filter = datastore_helper.set_property_filter(
      query_pb2.Filter(), 'kind_name', PropertyFilter.EQUAL,
      unicode(kind))
  timestamp_filter = datastore_helper.set_property_filter(
      query_pb2.Filter(), 'timestamp', PropertyFilter.EQUAL,
      latest_timestamp)

  datastore_helper.set_composite_filter(kind_stat_query.filter,
                                        CompositeFilter.AND, kind_filter,
                                        timestamp_filter)
  return kind_stat_query


[docs]class QueryIterator(object):
  """A iterator class for entities of a given query.

  Entities are read in batches. Retries on failures.
  """
  # Maximum number of results to request per query.
  _BATCH_SIZE = 500

  def __init__(self, project, namespace, query, datastore):
    self._query = query
    self._datastore = datastore
    self._project = project
    self._namespace = namespace
    self._start_cursor = None
    self._limit = self._query.limit.value or sys.maxsize
    self._req = make_request(project, namespace, query)

  @retry.with_exponential_backoff(num_retries=5,
                                  retry_filter=retry_on_rpc_error)
  def _next_batch(self):
    """Fetches the next batch of entities."""
    if self._start_cursor is not None:
      self._req.query.start_cursor = self._start_cursor

    # set batch size
    self._req.query.limit.value = min(self._BATCH_SIZE, self._limit)
    resp = self._datastore.run_query(self._req)
    return resp

  def __iter__(self):
    more_results = True
    while more_results:
      resp = self._next_batch()
      for entity_result in resp.batch.entity_results:
        yield entity_result.entity

      self._start_cursor = resp.batch.end_cursor
      num_results = len(resp.batch.entity_results)
      self._limit -= num_results

      # Check if we need to read more entities.
      # True when query limit hasn't been satisfied and there are more entities
      # to be read. The latter is true if the response has a status
      # `NOT_FINISHED` or if the number of results read in the previous batch
      # is equal to `_BATCH_SIZE` (all indications that there is more data be
      # read).
      more_results = ((self._limit > 0) and
                      ((num_results == self._BATCH_SIZE) or
                       (resp.batch.more_results ==
                        query_pb2.QueryResultBatch.NOT_FINISHED)))