Source code for apache_beam.io.tfrecordio

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""TFRecord sources and sinks."""

# pytype: skip-file

import codecs
import logging
import struct
from functools import partial

import crcmod

from apache_beam import coders
from apache_beam.io import filebasedsink
from apache_beam.io.filebasedsource import FileBasedSource
from apache_beam.io.filebasedsource import ReadAllFiles
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.iobase import Read
from apache_beam.io.iobase import Write
from apache_beam.transforms import PTransform

__all__ = ['ReadFromTFRecord', 'ReadAllFromTFRecord', 'WriteToTFRecord']

_LOGGER = logging.getLogger(__name__)


def _default_crc32c_fn(value):
  """Calculates crc32c of a bytes object using either snappy or crcmod."""

  if not _default_crc32c_fn.fn:
    try:
      import snappy  # pylint: disable=import-error
      # Support multiple versions of python-snappy:
      # https://github.com/andrix/python-snappy/pull/53
      if getattr(snappy, '_crc32c', None):
        _default_crc32c_fn.fn = snappy._crc32c  # pylint: disable=protected-access
      elif getattr(snappy, '_snappy', None):
        _default_crc32c_fn.fn = snappy._snappy._crc32c  # pylint: disable=protected-access
    except ImportError:
      pass

    if not _default_crc32c_fn.fn:
      _LOGGER.warning(
          'Couldn\'t find python-snappy so the implementation of '
          '_TFRecordUtil._masked_crc32c is not as fast as it could '
          'be.')
      _default_crc32c_fn.fn = crcmod.predefined.mkPredefinedCrcFun('crc-32c')
  return _default_crc32c_fn.fn(value)


_default_crc32c_fn.fn = None  # type: ignore


class _TFRecordUtil(object):
  """Provides basic TFRecord encoding/decoding with consistency checks.

  For detailed TFRecord format description see:
    https://www.tensorflow.org/versions/r1.11/api_guides/python/python_io#TFRecords_Format_Details

  Note that masks and length are represented in LittleEndian order.
  """
  @classmethod
  def _masked_crc32c(cls, value, crc32c_fn=_default_crc32c_fn):
    """Compute a masked crc32c checksum for a value.

    Args:
      value: A bytes object for which we compute the crc.
      crc32c_fn: A function that can compute a crc32c.
        This is a performance hook that also helps with testing. Callers are
        not expected to make use of it directly.
    Returns:
      Masked crc32c checksum.
    """

    crc = crc32c_fn(value)
    return (((crc >> 15) | (crc << 17)) + 0xa282ead8) & 0xffffffff

  @staticmethod
  def encoded_num_bytes(record):
    """Return the number of bytes consumed by a record in its encoded form."""
    # 16 = 8 (Length) + 4 (crc of length) + 4 (crc of data)
    return len(record) + 16

  @classmethod
  def write_record(cls, file_handle, value):
    """Encode a value as a TFRecord.

    Args:
      file_handle: The file to write to.
      value: A bytes object representing content of the record.
    """
    encoded_length = struct.pack(b'<Q', len(value))
    file_handle.write(
        b''.join([
            encoded_length,
            struct.pack(b'<I', cls._masked_crc32c(encoded_length)),
            value,
            struct.pack(b'<I', cls._masked_crc32c(value))
        ]))

  @classmethod
  def read_record(cls, file_handle):
    """Read a record from a TFRecords file.

    Args:
      file_handle: The file to read from.
    Returns:
      None if EOF is reached; the paylod of the record otherwise.
    Raises:
      ValueError: If file appears to not be a valid TFRecords file.
    """
    buf_length_expected = 12
    buf = file_handle.read(buf_length_expected)
    if not buf:
      return None  # EOF Reached.

    # Validate all length related payloads.
    if len(buf) != buf_length_expected:
      raise ValueError(
          'Not a valid TFRecord. Fewer than %d bytes: %s' %
          (buf_length_expected, codecs.encode(buf, 'hex')))
    length, length_mask_expected = struct.unpack('<QI', buf)
    length_mask_actual = cls._masked_crc32c(buf[:8])
    if length_mask_actual != length_mask_expected:
      raise ValueError(
          'Not a valid TFRecord. Mismatch of length mask: %s' %
          codecs.encode(buf, 'hex'))

    # Validate all data related payloads.
    buf_length_expected = length + 4
    buf = file_handle.read(buf_length_expected)
    if len(buf) != buf_length_expected:
      raise ValueError(
          'Not a valid TFRecord. Fewer than %d bytes: %s' %
          (buf_length_expected, codecs.encode(buf, 'hex')))
    data, data_mask_expected = struct.unpack('<%dsI' % length, buf)
    data_mask_actual = cls._masked_crc32c(data)
    if data_mask_actual != data_mask_expected:
      raise ValueError(
          'Not a valid TFRecord. Mismatch of data mask: %s' %
          codecs.encode(buf, 'hex'))

    # All validation checks passed.
    return data


class _TFRecordSource(FileBasedSource):
  """A File source for reading files of TFRecords.

  For detailed TFRecords format description see:
    https://www.tensorflow.org/versions/r1.11/api_guides/python/python_io#TFRecords_Format_Details
  """
  def __init__(self, file_pattern, coder, compression_type, validate):
    """Initialize a TFRecordSource.  See ReadFromTFRecord for details."""
    super().__init__(
        file_pattern=file_pattern,
        compression_type=compression_type,
        splittable=False,
        validate=validate)
    self._coder = coder

  def read_records(self, file_name, offset_range_tracker):
    if offset_range_tracker.start_position():
      raise ValueError(
          'Start position not 0:%s' % offset_range_tracker.start_position())

    current_offset = offset_range_tracker.start_position()
    with self.open_file(file_name) as file_handle:
      while True:
        if not offset_range_tracker.try_claim(current_offset):
          raise RuntimeError('Unable to claim position: %s' % current_offset)
        record = _TFRecordUtil.read_record(file_handle)
        if record is None:
          return  # Reached EOF
        else:
          current_offset += _TFRecordUtil.encoded_num_bytes(record)
          yield self._coder.decode(record)


def _create_tfrecordio_source(
    file_pattern=None, coder=None, compression_type=None):
  # We intentionally disable validation for ReadAll pattern so that reading does
  # not fail for globs (elements) that are empty.
  return _TFRecordSource(file_pattern, coder, compression_type, validate=False)


[docs]class ReadAllFromTFRecord(PTransform):
  """A ``PTransform`` for reading a ``PCollection`` of TFRecord files."""
  def __init__(
      self,
      coder=coders.BytesCoder(),
      compression_type=CompressionTypes.AUTO,
      with_filename=False):
    """Initialize the ``ReadAllFromTFRecord`` transform.

    Args:
      coder: Coder used to decode each record.
      compression_type: Used to handle compressed input files. Default value
          is CompressionTypes.AUTO, in which case the file_path's extension will
          be used to detect the compression.
      with_filename: If True, returns a Key Value with the key being the file
        name and the value being the actual data. If False, it only returns
        the data.
    """
    super().__init__()
    source_from_file = partial(
        _create_tfrecordio_source,
        compression_type=compression_type,
        coder=coder)
    # Desired and min bundle sizes do not matter since TFRecord files are
    # unsplittable.
    self._read_all_files = ReadAllFiles(
        splittable=False,
        compression_type=compression_type,
        desired_bundle_size=0,
        min_bundle_size=0,
        source_from_file=source_from_file,
        with_filename=with_filename)

[docs]  def expand(self, pvalue):
    return pvalue | 'ReadAllFiles' >> self._read_all_files


[docs]class ReadFromTFRecord(PTransform):
  """Transform for reading TFRecord sources."""
  def __init__(
      self,
      file_pattern,
      coder=coders.BytesCoder(),
      compression_type=CompressionTypes.AUTO,
      validate=True):
    """Initialize a ReadFromTFRecord transform.

    Args:
      file_pattern: A file glob pattern to read TFRecords from.
      coder: Coder used to decode each record.
      compression_type: Used to handle compressed input files. Default value
          is CompressionTypes.AUTO, in which case the file_path's extension will
          be used to detect the compression.
      validate: Boolean flag to verify that the files exist during the pipeline
          creation time.

    Returns:
      A ReadFromTFRecord transform object.
    """
    super().__init__()
    self._source = _TFRecordSource(
        file_pattern, coder, compression_type, validate)

[docs]  def expand(self, pvalue):
    return pvalue.pipeline | Read(self._source)


class _TFRecordSink(filebasedsink.FileBasedSink):
  """Sink for writing TFRecords files.

  For detailed TFRecord format description see:
    https://www.tensorflow.org/versions/r1.11/api_guides/python/python_io#TFRecords_Format_Details
  """
  def __init__(
      self,
      file_path_prefix,
      coder,
      file_name_suffix,
      num_shards,
      shard_name_template,
      compression_type):
    """Initialize a TFRecordSink. See WriteToTFRecord for details."""

    super().__init__(
        file_path_prefix=file_path_prefix,
        coder=coder,
        file_name_suffix=file_name_suffix,
        num_shards=num_shards,
        shard_name_template=shard_name_template,
        mime_type='application/octet-stream',
        compression_type=compression_type)

  def write_encoded_record(self, file_handle, value):
    _TFRecordUtil.write_record(file_handle, value)


[docs]class WriteToTFRecord(PTransform):
  """Transform for writing to TFRecord sinks."""
  def __init__(
      self,
      file_path_prefix,
      coder=coders.BytesCoder(),
      file_name_suffix='',
      num_shards=0,
      shard_name_template=None,
      compression_type=CompressionTypes.AUTO):
    """Initialize WriteToTFRecord transform.

    Args:
      file_path_prefix: The file path to write to. The files written will begin
        with this prefix, followed by a shard identifier (see num_shards), and
        end in a common extension, if given by file_name_suffix.
      coder: Coder used to encode each record.
      file_name_suffix: Suffix for the files written.
      num_shards: The number of files (shards) used for output. If not set, the
        default value will be used.
      shard_name_template: A template string containing placeholders for
        the shard number and shard count. When constructing a filename for a
        particular shard number, the upper-case letters 'S' and 'N' are
        replaced with the 0-padded shard number and shard count respectively.
        This argument can be '' in which case it behaves as if num_shards was
        set to 1 and only one file will be generated. The default pattern used
        is '-SSSSS-of-NNNNN' if None is passed as the shard_name_template.
      compression_type: Used to handle compressed output files. Typical value
          is CompressionTypes.AUTO, in which case the file_path's extension will
          be used to detect the compression.

    Returns:
      A WriteToTFRecord transform object.
    """
    super().__init__()
    self._sink = _TFRecordSink(
        file_path_prefix,
        coder,
        file_name_suffix,
        num_shards,
        shard_name_template,
        compression_type)

[docs]  def expand(self, pcoll):
    return pcoll | Write(self._sink)