Source code for apache_beam.io.mongodbio

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""This module implements IO classes to read and write data on MongoDB.


Read from MongoDB
-----------------
:class:`ReadFromMongoDB` is a ``PTransform`` that reads from a configured
MongoDB source and returns a ``PCollection`` of dict representing MongoDB
documents.
To configure the MongoDB source, the URI to connect to the MongoDB server, the
database name and the collection name need to be provided.

Example usage::

  pipeline | ReadFromMongoDB(uri='mongodb://localhost:27017',
                             db='testdb',
                             coll='input')

To read from MongoDB Atlas, use the ``bucket_auto`` option to enable the
``$bucketAuto`` MongoDB aggregation instead of the ``splitVector``
command, which is a high-privilege function that cannot be assigned
to any user in Atlas.

Example usage::

  pipeline | ReadFromMongoDB(uri='mongodb+srv://user:pwd@cluster0.mongodb.net',
                             db='testdb',
                             coll='input',
                             bucket_auto=True)


Write to MongoDB:
-----------------
:class:`WriteToMongoDB` is a ``PTransform`` that writes MongoDB documents to
the configured sink. The write is conducted through a MongoDB ``bulk_write``
of ``ReplaceOne`` operations. If the document's ``_id`` field already exists
in the MongoDB collection, the existing document is overwritten; otherwise, a
new document is inserted.

Example usage::

  pipeline | WriteToMongoDB(uri='mongodb://localhost:27017',
                            db='testdb',
                            coll='output',
                            batch_size=10)


No backward compatibility guarantees. Everything in this module is experimental.
"""

# pytype: skip-file

import copy
import itertools
import json
import logging
import math
import struct

import apache_beam as beam
from apache_beam.io import iobase
from apache_beam.io.range_trackers import OrderedPositionRangeTracker
from apache_beam.transforms import DoFn
from apache_beam.transforms import PTransform
from apache_beam.transforms import Reshuffle
from apache_beam.utils.annotations import experimental

_LOGGER = logging.getLogger(__name__)

try:
  # Mongodb has its own bundled bson, which is not compatible with bson package.
  # (https://github.com/py-bson/bson/issues/82). Try to import objectid and if
  # it fails because bson package is installed, MongoDB IO will not work but at
  # least rest of the SDK will work.
  from bson import objectid

  # pymongo also internally depends on bson.
  from pymongo import ASCENDING
  from pymongo import DESCENDING
  from pymongo import MongoClient
  from pymongo import ReplaceOne
except ImportError:
  objectid = None
  _LOGGER.warning("Could not find a compatible bson package.")

__all__ = ['ReadFromMongoDB', 'WriteToMongoDB']


@experimental()
class ReadFromMongoDB(PTransform):
  """A ``PTransform`` to read MongoDB documents into a ``PCollection``.

  The transform wraps a :class:`_BoundedMongoSource` built from the
  constructor arguments and reads it through ``iobase.Read``.
  """
  def __init__(
      self,
      uri='mongodb://localhost:27017',
      db=None,
      coll=None,
      filter=None,
      projection=None,
      extra_client_params=None,
      bucket_auto=False):
    """Initialize a :class:`ReadFromMongoDB`

    Args:
      uri (str): The MongoDB connection string following the URI format.
      db (str): The MongoDB database name.
      coll (str): The MongoDB collection name.
      filter: A `bson.SON
        <https://api.mongodb.com/python/current/api/bson/son.html>`_ object
        specifying elements which must be present for a document to be
        included in the result set.
      projection: A list of field names that should be returned in the result
        set or a dict specifying the fields to include or exclude.
      extra_client_params(dict): Optional `MongoClient
        <https://api.mongodb.com/python/current/api/pymongo/mongo_client.html>`_
        parameters.
      bucket_auto (bool): If :data:`True`, use MongoDB `$bucketAuto`
        aggregation to split collection into bundles instead of `splitVector`
        command, which does not work with MongoDB Atlas.
        If :data:`False` (the default), use `splitVector` command for
        bundling.

    Returns:
      :class:`~apache_beam.transforms.ptransform.PTransform`

    Raises:
      ValueError: If ``db`` or ``coll`` is not a ``str``.
    """
    if extra_client_params is None:
      extra_client_params = {}
    # db and coll default to None only to keep them keyword-friendly; both
    # are mandatory, so reject anything that is not a string up front.
    if not isinstance(db, str):
      raise ValueError(
          'ReadFromMongoDB db param must be specified as a string')
    if not isinstance(coll, str):
      raise ValueError(
          'ReadFromMongoDB coll param must be specified as a string')
    self._mongo_source = _BoundedMongoSource(
        uri=uri,
        db=db,
        coll=coll,
        filter=filter,
        projection=projection,
        extra_client_params=extra_client_params,
        bucket_auto=bucket_auto)

  def expand(self, pcoll):
    """Apply ``iobase.Read`` over the configured MongoDB source."""
    return pcoll | iobase.Read(self._mongo_source)
class _BoundedMongoSource(iobase.BoundedSource):
  """A ``BoundedSource`` that reads one MongoDB collection by ``_id`` range.

  Bundles and reads are expressed as half-open ``[start, stop)`` ranges of
  ``_id`` values. Split points come either from the ``splitVector`` command
  or, when ``bucket_auto`` is set, from a ``$bucketAuto`` aggregation.
  """
  def __init__(
      self,
      uri=None,
      db=None,
      coll=None,
      filter=None,
      projection=None,
      extra_client_params=None,
      bucket_auto=False):
    """
    Args:
      uri: Connection string passed as first argument to ``MongoClient``.
      db: Database name.
      coll: Collection name.
      filter: Optional query filter; ``None`` is stored as an empty dict.
      projection: Optional projection forwarded to ``find``.
      extra_client_params: Optional dict of extra keyword arguments for
        ``MongoClient``; ``None`` is stored as an empty dict.
      bucket_auto: Whether to bundle with ``$bucketAuto`` instead of
        ``splitVector``.
    """
    if extra_client_params is None:
      extra_client_params = {}
    if filter is None:
      filter = {}
    self.uri = uri
    self.db = db
    self.coll = coll
    self.filter = filter
    self.projection = projection
    self.spec = extra_client_params
    self.bucket_auto = bucket_auto

  def estimate_size(self):
    """Return the ``size`` field of the collection's ``collstats`` result."""
    with MongoClient(self.uri, **self.spec) as client:
      return client[self.db].command('collstats', self.coll).get('size')

  def _estimate_average_document_size(self):
    """Return the ``avgObjSize`` field of the ``collstats`` result."""
    with MongoClient(self.uri, **self.spec) as client:
      return client[self.db].command('collstats', self.coll).get('avgObjSize')

  def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Yield ``iobase.SourceBundle`` objects covering the ``_id`` range.

    Args:
      desired_bundle_size: Desired bundle size; it is converted to whole
        megabytes (``// 1024 // 1024``) with a minimum of 1.
      start_position: Inclusive start ``_id``; ``None`` means the smallest
        ``_id`` in the collection.
      stop_position: Exclusive stop ``_id``; ``None`` means one past the
        largest ``_id`` in the collection.
    """
    desired_bundle_size_in_mb = desired_bundle_size // 1024 // 1024

    # for desired bundle size, if desired chunk size smaller than 1mb, use
    # MongoDB default split size of 1mb.
    if desired_bundle_size_in_mb < 1:
      desired_bundle_size_in_mb = 1

    # Must be evaluated before the None positions are replaced below.
    is_initial_split = start_position is None and stop_position is None
    start_position, stop_position = self._replace_none_positions(
        start_position, stop_position)

    if self.bucket_auto:
      # Use $bucketAuto for bundling
      split_keys = []
      weights = []
      for bucket in self._get_auto_buckets(desired_bundle_size_in_mb,
                                           start_position,
                                           stop_position,
                                           is_initial_split):
        split_keys.append({'_id': bucket['_id']['max']})
        weights.append(bucket['count'])
    else:
      # Use splitVector for bundling
      split_keys = self._get_split_keys(
          desired_bundle_size_in_mb, start_position, stop_position)
      # Every splitVector chunk gets the same weight.
      weights = itertools.cycle((desired_bundle_size_in_mb, ))

    bundle_start = start_position
    for split_key_id, weight in zip(split_keys, weights):
      if bundle_start >= stop_position:
        break
      bundle_end = min(stop_position, split_key_id['_id'])
      yield iobase.SourceBundle(
          weight=weight,
          source=self,
          start_position=bundle_start,
          stop_position=bundle_end)
      bundle_start = bundle_end
    # add range of last split_key to stop_position
    if bundle_start < stop_position:
      # bucket_auto mode can come here if not split due to single document
      weight = 1 if self.bucket_auto else desired_bundle_size_in_mb
      yield iobase.SourceBundle(
          weight=weight,
          source=self,
          start_position=bundle_start,
          stop_position=stop_position)

  def get_range_tracker(self, start_position, stop_position):
    """Return an ``_ObjectIdRangeTracker`` with ``None`` bounds resolved."""
    start_position, stop_position = self._replace_none_positions(
        start_position, stop_position)
    return _ObjectIdRangeTracker(start_position, stop_position)

  def read(self, range_tracker):
    """Yield documents in ascending ``_id`` order while claims succeed."""
    with MongoClient(self.uri, **self.spec) as client:
      all_filters = self._merge_id_filter(
          range_tracker.start_position(), range_tracker.stop_position())
      docs_cursor = client[self.db][self.coll].find(
          filter=all_filters,
          projection=self.projection).sort([('_id', ASCENDING)])
      for doc in docs_cursor:
        # Stop as soon as the tracker refuses a position.
        if not range_tracker.try_claim(doc['_id']):
          return
        yield doc

  def display_data(self):
    """Extend the parent's display data with this source's settings."""
    res = super().display_data()
    res['database'] = self.db
    res['collection'] = self.coll
    # Filter values that json cannot encode are replaced by a marker string.
    res['filter'] = json.dumps(
        self.filter, default=lambda x: 'not_serializable(%s)' % str(x))
    res['projection'] = str(self.projection)
    res['bucket_auto'] = self.bucket_auto
    return res

  def _get_split_keys(self, desired_chunk_size_in_mb, start_pos, end_pos):
    """Return the ``splitKeys`` of a ``splitVector`` command on the range."""
    # calls mongodb splitVector command to get document ids at split position
    if start_pos >= _ObjectIdHelper.increment_id(end_pos, -1):
      # single document not splittable
      return []
    with MongoClient(self.uri, **self.spec) as client:
      name_space = '%s.%s' % (self.db, self.coll)
      return (
          client[self.db].command(
              'splitVector',
              name_space,
              keyPattern={'_id': 1},  # Ascending index
              min={'_id': start_pos},
              max={'_id': end_pos},
              maxChunkSize=desired_chunk_size_in_mb)['splitKeys'])

  def _get_auto_buckets(
      self, desired_chunk_size_in_mb, start_pos, end_pos, is_initial_split):
    """Return ``$bucketAuto`` buckets over ``_id`` for the given range.

    Returns an empty list when the range holds at most one id or the
    estimated data size is zero.
    """
    if start_pos >= _ObjectIdHelper.increment_id(end_pos, -1):
      # single document not splittable
      return []

    if is_initial_split and not self.filter:
      # total collection size
      size_in_mb = self.estimate_size() / float(1 << 20)
    else:
      # size of documents within start/end id range and possibly filtered
      documents_count = self._count_id_range(start_pos, end_pos)
      avg_document_size = self._estimate_average_document_size()
      size_in_mb = documents_count * avg_document_size / float(1 << 20)

    if size_in_mb == 0:
      # no documents not splittable (maybe a result of filtering)
      return []

    bucket_count = math.ceil(size_in_mb / desired_chunk_size_in_mb)
    with MongoClient(self.uri, **self.spec) as client:
      pipeline = [
          {
              # filter by positions and by the custom filter if any
              '$match': self._merge_id_filter(start_pos, end_pos)
          },
          {
              '$bucketAuto': {
                  'groupBy': '$_id', 'buckets': bucket_count
              }
          },
      ]
      buckets = list(client[self.db][self.coll].aggregate(pipeline))
      if buckets:
        # Align the last bucket with the exclusive end of the requested
        # range. NOTE(review): presumably needed because the final
        # $bucketAuto max is an existing _id rather than end_pos - confirm.
        buckets[-1]['_id']['max'] = end_pos

      return buckets

  def _merge_id_filter(self, start_position, stop_position):
    """Combine the user filter (if any) with the ``_id`` range filter."""
    # Merge the default filter (if any) with refined _id field range
    # of range_tracker.
    # $gte specifies start position (inclusive)
    # and $lt specifies the end position (exclusive),
    # see more at
    # https://docs.mongodb.com/manual/reference/operator/query/gte/ and
    # https://docs.mongodb.com/manual/reference/operator/query/lt/
    id_filter = {'_id': {'$gte': start_position, '$lt': stop_position}}
    if self.filter:
      all_filters = {
          # see more at
          # https://docs.mongodb.com/manual/reference/operator/query/and/
          '$and': [self.filter.copy(), id_filter]
      }
    else:
      all_filters = id_filter

    return all_filters

  def _get_head_document_id(self, sort_order):
    """Return the first ``_id`` of the collection under ``sort_order``.

    Raises:
      ValueError: If the query returns no document.
    """
    with MongoClient(self.uri, **self.spec) as client:
      cursor = client[self.db][self.coll].find(
          filter={}, projection=[]).sort([('_id', sort_order)]).limit(1)
      try:
        return cursor[0]['_id']
      except IndexError:
        raise ValueError('Empty Mongodb collection')

  def _replace_none_positions(self, start_position, stop_position):
    """Replace ``None`` bounds with the collection's actual ``_id`` bounds."""
    if start_position is None:
      start_position = self._get_head_document_id(ASCENDING)
    if stop_position is None:
      last_doc_id = self._get_head_document_id(DESCENDING)
      # increment last doc id binary value by 1 to make sure the last document
      # is not excluded
      stop_position = _ObjectIdHelper.increment_id(last_doc_id, 1)

    return start_position, stop_position

  def _count_id_range(self, start_position, stop_position):
    """Count documents in ``[start_position, stop_position)``."""
    # Number of documents between start_position (inclusive)
    # and stop_position (exclusive), respecting the custom filter if any.
    with MongoClient(self.uri, **self.spec) as client:
      return client[self.db][self.coll].count_documents(
          filter=self._merge_id_filter(start_position, stop_position))


class _ObjectIdHelper(object):
  """A Utility class to manipulate bson object ids."""
  @classmethod
  def id_to_int(cls, id):
    """
    Args:
      id: ObjectId required for each MongoDB document _id field.

    Returns: Converted integer value of ObjectId's 12 bytes binary value.
    """
    # converts object id binary to integer
    # id object is bytes type with size of 12
    high, middle, low = struct.unpack('>III', id.binary)
    return (high << 64) + (middle << 32) + low

  @classmethod
  def int_to_id(cls, number):
    """
    Args:
      number(int): The integer value to be used to convert to ObjectId.

    Returns: The ObjectId that has the 12 bytes binary converted from the
      integer value.

    Raises:
      ValueError: If ``number`` is outside ``[0, 2 ** 96)``.
    """
    # converts integer value to object id. Int value should be less than
    # (2 ^ 96) so it can be convert to 12 bytes required by object id.
    if number < 0 or number >= (1 << 96):
      raise ValueError('number value must be within [0, %s)' % (1 << 96))
    ints = [(number & 0xffffffff0000000000000000) >> 64,
            (number & 0x00000000ffffffff00000000) >> 32,
            number & 0x0000000000000000ffffffff]

    packed = struct.pack('>III', *ints)
    return objectid.ObjectId(packed)

  @classmethod
  def increment_id(cls, object_id, inc):
    """
    Args:
      object_id: The ObjectId to change.
      inc(int): The incremental int value to be added to ObjectId.

    Returns: A new ObjectId whose integer value is that of ``object_id``
      plus ``inc``.

    Raises:
      ValueError: If the resulting value is outside ``[0, 2 ** 96)``.
    """
    # increment object_id binary value by inc value and return new object id.
    id_number = _ObjectIdHelper.id_to_int(object_id)
    new_number = id_number + inc
    if new_number < 0 or new_number >= (1 << 96):
      # The parentheses matter: ``1 << 96 - id_number`` would parse as
      # ``1 << (96 - id_number)`` and fail with "negative shift count".
      raise ValueError(
          'invalid incremental, inc value must be within ['
          '%s, %s)' % (0 - id_number, (1 << 96) - id_number))
    return _ObjectIdHelper.int_to_id(new_number)


class _ObjectIdRangeTracker(OrderedPositionRangeTracker):
  """RangeTracker for tracking mongodb _id of bson ObjectId type."""
  def position_to_fraction(self, pos, start, end):
    """Return where ``pos`` lies between ``start`` and ``end`` as a ratio."""
    pos_number = _ObjectIdHelper.id_to_int(pos)
    start_number = _ObjectIdHelper.id_to_int(start)
    end_number = _ObjectIdHelper.id_to_int(end)
    return (pos_number - start_number) / (end_number - start_number)

  def fraction_to_position(self, fraction, start, end):
    """Return the ObjectId at ``fraction`` of the way from start to end.

    The result is clamped so that it is never less than or equal to
    ``start`` nor greater than or equal to ``end``.
    """
    start_number = _ObjectIdHelper.id_to_int(start)
    end_number = _ObjectIdHelper.id_to_int(end)
    total = end_number - start_number
    pos = int(total * fraction + start_number)
    # make sure split position is larger than start position and smaller than
    # end position.
    if pos <= start_number:
      return _ObjectIdHelper.increment_id(start, 1)
    if pos >= end_number:
      return _ObjectIdHelper.increment_id(end, -1)
    return _ObjectIdHelper.int_to_id(pos)
@experimental()
class WriteToMongoDB(PTransform):
  """A ``PTransform`` that writes a ``PCollection`` of MongoDB documents to
  the configured MongoDB server.

  To keep document writes idempotent, so that bundles can be retried without
  creating duplicates, two transforms are applied before the final write
  stage: a ``GenerateId`` transform and a ``Reshuffle`` transform.::

                  -----------------------------------------------
    Pipeline -->  |GenerateId --> Reshuffle --> WriteToMongoSink|
                  -----------------------------------------------
                                  (WriteToMongoDB)

  The ``GenerateId`` transform adds a random and unique *_id* field to the
  documents that do not already have one, using the same format as the
  MongoDB default. The ``Reshuffle`` transform makes sure that no fusion
  happens between ``GenerateId`` and the final write stage, so that the set
  of documents and their unique IDs are not regenerated if the final write
  step is retried due to a failure. This prevents duplicate writes of the
  same document with different unique IDs.
  """
  def __init__(
      self,
      uri='mongodb://localhost:27017',
      db=None,
      coll=None,
      batch_size=100,
      extra_client_params=None):
    """
    Args:
      uri (str): The MongoDB connection string following the URI format
      db (str): The MongoDB database name
      coll (str): The MongoDB collection name
      batch_size(int): Number of documents per bulk_write to MongoDB,
        default to 100
      extra_client_params(dict): Optional `MongoClient
       <https://api.mongodb.com/python/current/api/pymongo/mongo_client.html>`_
       parameters as keyword arguments

    Returns:
      :class:`~apache_beam.transforms.ptransform.PTransform`

    Raises:
      ValueError: If ``db`` or ``coll`` is not a ``str``.
    """
    db_is_valid = isinstance(db, str)
    if not db_is_valid:
      raise ValueError('WriteToMongoDB db param must be specified as a string')
    coll_is_valid = isinstance(coll, str)
    if not coll_is_valid:
      raise ValueError(
          'WriteToMongoDB coll param must be specified as a string')

    self._uri = uri
    self._db = db
    self._coll = coll
    self._batch_size = batch_size
    # A missing client-parameter mapping is stored as a fresh empty dict.
    self._spec = {} if extra_client_params is None else extra_client_params

  def expand(self, pcoll):
    """Chain id generation, a reshuffle and the batched MongoDB write."""
    write_fn = _WriteMongoFn(
        self._uri, self._db, self._coll, self._batch_size, self._spec)
    return (
        pcoll
        | beam.ParDo(_GenerateObjectIdFn())
        | Reshuffle()
        | beam.ParDo(write_fn))
class _GenerateObjectIdFn(DoFn):
  """``DoFn`` that makes sure every output document carries an ``_id``."""
  def process(self, element, *args, **kwargs):
    """Yield ``element``, or a shallow copy with a generated ``_id``."""
    # if _id field already exist we keep it as it is, otherwise the ptransform
    # generates a new _id field to achieve idempotent write to mongodb.
    if '_id' not in element:
      # A DoFn must not modify its input element, so the generated id is
      # attached to a shallow copy instead of the received object.
      element = copy.copy(element)
      # object.ObjectId() generates a unique identifier that follows mongodb
      # default format, if _id is not present in document, mongodb server
      # generates it with this same function upon write. However the
      # uniqueness of generated id may not be guaranteed if the work load are
      # distributed across too many processes. See more on the ObjectId format
      # https://docs.mongodb.com/manual/reference/bson-types/#objectid.
      element['_id'] = objectid.ObjectId()

    yield element


class _WriteMongoFn(DoFn):
  """``DoFn`` that buffers documents and writes them in batches."""
  def __init__(
      self, uri=None, db=None, coll=None, batch_size=100, extra_params=None):
    """
    Args:
      uri: Connection string handed to ``_MongoSink``.
      db: Database name.
      coll: Collection name.
      batch_size: Number of buffered documents that triggers a flush.
      extra_params: Optional dict of extra ``MongoClient`` keyword
        arguments; ``None`` is stored as an empty dict.
    """
    if extra_params is None:
      extra_params = {}
    self.uri = uri
    self.db = db
    self.coll = coll
    self.spec = extra_params
    self.batch_size = batch_size
    self.batch = []

  def finish_bundle(self):
    """Write whatever is still buffered."""
    self._flush()

  def process(self, element, *args, **kwargs):
    """Buffer ``element`` and flush once ``batch_size`` is reached."""
    self.batch.append(element)
    if len(self.batch) >= self.batch_size:
      self._flush()

  def _flush(self):
    """Write the buffered documents through a fresh ``_MongoSink``."""
    if not self.batch:
      return
    with _MongoSink(self.uri, self.db, self.coll, self.spec) as sink:
      sink.write(self.batch)
      # Only reached when write() did not raise, so a failed batch is kept.
      self.batch = []

  def display_data(self):
    """Extend the parent's display data with the sink settings."""
    res = super().display_data()
    res['database'] = self.db
    res['collection'] = self.coll
    res['batch_size'] = self.batch_size
    return res


class _MongoSink(object):
  """Context manager that upserts documents into one MongoDB collection."""
  def __init__(self, uri=None, db=None, coll=None, extra_params=None):
    """
    Args:
      uri: Connection string passed to ``MongoClient`` as ``host``.
      db: Database name.
      coll: Collection name.
      extra_params: Optional dict of extra ``MongoClient`` keyword
        arguments; ``None`` is stored as an empty dict.
    """
    if extra_params is None:
      extra_params = {}
    self.uri = uri
    self.db = db
    self.coll = coll
    self.spec = extra_params
    # Created lazily by write() or __enter__().
    self.client = None

  def write(self, documents):
    """Upsert ``documents`` with one ``bulk_write`` of ``ReplaceOne`` ops."""
    if self.client is None:
      self.client = MongoClient(host=self.uri, **self.spec)
    # match document based on _id field, if not found in current collection,
    # insert new one, otherwise overwrite it.
    requests = [
        ReplaceOne(
            filter={'_id': doc.get('_id', None)},
            replacement=doc,
            upsert=True) for doc in documents
    ]
    resp = self.client[self.db][self.coll].bulk_write(requests)
    # Arguments are passed lazily so the message is only built when DEBUG
    # logging is enabled.
    _LOGGER.debug(
        'BulkWrite to MongoDB result in nModified:%d, nUpserted:%d, '
        'nMatched:%d, Errors:%s',
        resp.modified_count,
        resp.upserted_count,
        resp.matched_count,
        resp.bulk_api_result.get('writeErrors'))

  def __enter__(self):
    """Open the client if needed and return this sink."""
    if self.client is None:
      self.client = MongoClient(host=self.uri, **self.spec)
    return self

  def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the client; exceptions from the ``with`` body propagate."""
    if self.client is not None:
      self.client.close()
      # Drop the closed client so a later write() opens a new connection.
      self.client = None