#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""This module implements IO classes to read and write data on MongoDB.
Read from MongoDB
-----------------
:class:`ReadFromMongoDB` is a ``PTransform`` that reads from a configured
MongoDB source and returns a ``PCollection`` of dict representing MongoDB
documents.
To configure the MongoDB source, the URI used to connect to the MongoDB server,
the database name and the collection name need to be provided.
Example usage::
pipeline | ReadFromMongoDB(uri='mongodb://localhost:27017',
db='testdb',
coll='input')
To read from MongoDB Atlas, use the ``bucket_auto`` option to enable the
``$bucketAuto`` MongoDB aggregation instead of the ``splitVector``
command, which is a high-privilege function that cannot be assigned
to any user in Atlas.
Example usage::
pipeline | ReadFromMongoDB(uri='mongodb+srv://user:pwd@cluster0.mongodb.net',
db='testdb',
coll='input',
bucket_auto=True)
Write to MongoDB:
-----------------
:class:`WriteToMongoDB` is a ``PTransform`` that writes MongoDB documents to
the configured sink. The write is conducted through a MongoDB bulk_write of
``ReplaceOne`` operations. If a document's _id field already exists in the
MongoDB collection, the stored document is overwritten; otherwise, a new
document is inserted.
Example usage::
pipeline | WriteToMongoDB(uri='mongodb://localhost:27017',
db='testdb',
coll='output',
batch_size=10)
No backward compatibility guarantees. Everything in this module is experimental.
"""
# pytype: skip-file
from __future__ import absolute_import
from __future__ import division
import itertools
import json
import logging
import math
import struct
import apache_beam as beam
from apache_beam.io import iobase
from apache_beam.io.range_trackers import OrderedPositionRangeTracker
from apache_beam.transforms import DoFn
from apache_beam.transforms import PTransform
from apache_beam.transforms import Reshuffle
from apache_beam.utils.annotations import experimental
_LOGGER = logging.getLogger(__name__)
try:
# MongoDB has its own bundled bson, which is not compatible with the bson
# package (https://github.com/py-bson/bson/issues/82). Try to import objectid
# and if it fails because the bson package is installed, MongoDB IO will not
# work but at least the rest of the SDK will work.
from bson import objectid
# pymongo also internally depends on bson.
from pymongo import ASCENDING
from pymongo import DESCENDING
from pymongo import MongoClient
from pymongo import ReplaceOne
except ImportError:
objectid = None
_LOGGER.warning("Could not find a compatible bson package.")
__all__ = ['ReadFromMongoDB', 'WriteToMongoDB']
@experimental()
class ReadFromMongoDB(PTransform):
  """A ``PTransform`` to read MongoDB documents into a ``PCollection``.

  The constructor arguments are used to build a ``_BoundedMongoSource``, which
  :meth:`expand` reads through ``iobase.Read``.
  """
  def __init__(
      self,
      uri='mongodb://localhost:27017',
      db=None,
      coll=None,
      filter=None,
      projection=None,
      extra_client_params=None,
      bucket_auto=False):
    """Initialize a :class:`ReadFromMongoDB`

    Args:
      uri (str): The MongoDB connection string following the URI format.
      db (str): The MongoDB database name.
      coll (str): The MongoDB collection name.
      filter: A `bson.SON
        <https://api.mongodb.com/python/current/api/bson/son.html>`_ object
        specifying elements which must be present for a document to be included
        in the result set.
      projection: A list of field names that should be returned in the result
        set or a dict specifying the fields to include or exclude.
      extra_client_params(dict): Optional `MongoClient
        <https://api.mongodb.com/python/current/api/pymongo/mongo_client.html>`_
        parameters.
      bucket_auto (bool): If :data:`True`, use MongoDB `$bucketAuto` aggregation
        to split collection into bundles instead of `splitVector` command,
        which does not work with MongoDB Atlas.
        If :data:`False` (the default), use `splitVector` command for bundling.

    Returns:
      :class:`~apache_beam.transforms.ptransform.PTransform`

    Raises:
      ValueError: If ``db`` or ``coll`` is not a ``str``.
    """
    if extra_client_params is None:
      extra_client_params = {}
    # db and coll default to None only to keep them keyword-friendly; both
    # are mandatory, so anything that is not a string is rejected up front.
    if not isinstance(db, str):
      raise ValueError(
          'ReadFromMongoDB db param must be specified as a '
          'string')
    if not isinstance(coll, str):
      raise ValueError(
          'ReadFromMongoDB coll param must be specified as a '
          'string')
    self._mongo_source = _BoundedMongoSource(
        uri=uri,
        db=db,
        coll=coll,
        filter=filter,
        projection=projection,
        extra_client_params=extra_client_params,
        bucket_auto=bucket_auto)

  def expand(self, pcoll):
    """Applies an ``iobase.Read`` of the configured source to ``pcoll``."""
    return pcoll | iobase.Read(self._mongo_source)
class _BoundedMongoSource(iobase.BoundedSource):
  """A bounded source reading the documents of one MongoDB collection.

  Positions handled by this source are document ``_id`` values. A range is
  half-open: the start position is inclusive and the stop position is
  exclusive (see :meth:`_merge_id_filter`).
  """
  def __init__(
      self,
      uri=None,
      db=None,
      coll=None,
      filter=None,
      projection=None,
      extra_client_params=None,
      bucket_auto=False):
    """
    Args:
      uri: Connection string passed to ``MongoClient``.
      db: Name of the database to read from.
      coll: Name of the collection to read from.
      filter: Optional query filter; ``None`` is stored as an empty dict.
      projection: Optional projection passed to ``find``.
      extra_client_params: Optional dict of extra ``MongoClient`` keyword
        arguments; ``None`` is stored as an empty dict.
      bucket_auto: If true, bundles are computed with ``$bucketAuto`` instead
        of the ``splitVector`` command.
    """
    if extra_client_params is None:
      extra_client_params = {}
    if filter is None:
      filter = {}
    self.uri = uri
    self.db = db
    self.coll = coll
    self.filter = filter
    self.projection = projection
    self.spec = extra_client_params
    self.bucket_auto = bucket_auto

  def estimate_size(self):
    """Returns the ``size`` value of the collection's ``collstats``."""
    return self._get_collection_stat('size')

  def _estimate_average_document_size(self):
    """Returns the ``avgObjSize`` value of the collection's ``collstats``."""
    return self._get_collection_stat('avgObjSize')

  def _get_collection_stat(self, name):
    """Runs ``collstats`` on the collection and returns the entry ``name``.

    Returns ``None`` when the command result has no such entry.
    """
    with MongoClient(self.uri, **self.spec) as client:
      return client[self.db].command('collstats', self.coll).get(name)

  def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Yields ``iobase.SourceBundle`` objects covering the requested range.

    Args:
      desired_bundle_size: Desired bundle size in bytes.
      start_position: Inclusive start ``_id``; ``None`` means the smallest
        ``_id`` in the collection.
      stop_position: Exclusive stop ``_id``; ``None`` means one past the
        largest ``_id`` in the collection.

    Raises:
      ValueError: If a position has to be looked up and the collection is
        empty.
    """
    # for desired bundle size, if desired chunk size smaller than 1mb, use
    # MongoDB default split size of 1mb.
    desired_bundle_size_in_mb = max(desired_bundle_size // 1024 // 1024, 1)

    # Must be evaluated before the None positions are replaced below.
    is_initial_split = start_position is None and stop_position is None
    start_position, stop_position = self._replace_none_positions(
        start_position, stop_position)

    if self.bucket_auto:
      # Use $bucketAuto for bundling; each bundle is weighted by the number
      # of documents in its bucket.
      split_keys = []
      weights = []
      for bucket in self._get_auto_buckets(desired_bundle_size_in_mb,
                                           start_position,
                                           stop_position,
                                           is_initial_split):
        split_keys.append({'_id': bucket['_id']['max']})
        weights.append(bucket['count'])
    else:
      # Use splitVector for bundling; every bundle gets the same weight.
      split_keys = self._get_split_keys(
          desired_bundle_size_in_mb, start_position, stop_position)
      weights = itertools.cycle((desired_bundle_size_in_mb, ))

    bundle_start = start_position
    for split_key_id, weight in zip(split_keys, weights):
      if bundle_start >= stop_position:
        break
      bundle_end = min(stop_position, split_key_id['_id'])
      yield iobase.SourceBundle(
          weight=weight,
          source=self,
          start_position=bundle_start,
          stop_position=bundle_end)
      bundle_start = bundle_end
    # add range of last split_key to stop_position
    if bundle_start < stop_position:
      # bucket_auto mode can come here if not split due to single document
      weight = 1 if self.bucket_auto else desired_bundle_size_in_mb
      yield iobase.SourceBundle(
          weight=weight,
          source=self,
          start_position=bundle_start,
          stop_position=stop_position)

  def get_range_tracker(self, start_position, stop_position):
    """Returns an ``_ObjectIdRangeTracker`` for the given range.

    ``None`` positions are replaced by the bounds of the collection.
    """
    start_position, stop_position = self._replace_none_positions(
        start_position, stop_position)
    return _ObjectIdRangeTracker(start_position, stop_position)

  def read(self, range_tracker):
    """Yields the documents of the tracker's range in ascending ``_id`` order.

    Reading stops as soon as the tracker refuses to claim a document's
    ``_id``.
    """
    with MongoClient(self.uri, **self.spec) as client:
      all_filters = self._merge_id_filter(
          range_tracker.start_position(), range_tracker.stop_position())
      docs_cursor = client[self.db][self.coll].find(
          filter=all_filters,
          projection=self.projection).sort([('_id', ASCENDING)])
      for doc in docs_cursor:
        if not range_tracker.try_claim(doc['_id']):
          return
        yield doc

  def display_data(self):
    """Extends the inherited display data with this source's configuration."""
    res = super(_BoundedMongoSource, self).display_data()
    res['database'] = self.db
    res['collection'] = self.coll
    # Filter values that json cannot encode are rendered as a placeholder
    # string instead of failing.
    res['filter'] = json.dumps(
        self.filter, default=lambda x: 'not_serializable(%s)' % str(x))
    res['projection'] = str(self.projection)
    res['bucket_auto'] = self.bucket_auto
    return res

  def _get_split_keys(self, desired_chunk_size_in_mb, start_pos, end_pos):
    """Returns the ``splitKeys`` of a ``splitVector`` command for the range.

    Returns an empty list when the range cannot hold more than one document.
    """
    # calls mongodb splitVector command to get document ids at split position
    if start_pos >= _ObjectIdHelper.increment_id(end_pos, -1):
      # single document not splittable
      return []
    with MongoClient(self.uri, **self.spec) as client:
      name_space = '%s.%s' % (self.db, self.coll)
      return (
          client[self.db].command(
              'splitVector',
              name_space,
              keyPattern={'_id': 1},  # Ascending index
              min={'_id': start_pos},
              max={'_id': end_pos},
              maxChunkSize=desired_chunk_size_in_mb)['splitKeys'])

  def _get_auto_buckets(
      self, desired_chunk_size_in_mb, start_pos, end_pos, is_initial_split):
    """Returns the ``$bucketAuto`` buckets of the range, grouped by ``_id``.

    Args:
      desired_chunk_size_in_mb: Desired size of one bucket in megabytes.
      start_pos: Inclusive start ``_id`` of the range.
      end_pos: Exclusive end ``_id`` of the range.
      is_initial_split: Whether the whole collection is being split, in which
        case (without a custom filter) the collection size is used directly.

    Returns:
      A list of bucket documents, empty when the range is not splittable.
    """
    if start_pos >= _ObjectIdHelper.increment_id(end_pos, -1):
      # single document not splittable
      return []

    if is_initial_split and not self.filter:
      # total collection size
      size_in_mb = self.estimate_size() / float(1 << 20)
    else:
      # size of documents within start/end id range and possibly filtered
      documents_count = self._count_id_range(start_pos, end_pos)
      avg_document_size = self._estimate_average_document_size()
      size_in_mb = documents_count * avg_document_size / float(1 << 20)

    if size_in_mb == 0:
      # no documents not splittable (maybe a result of filtering)
      return []

    # math.ceil returns a float on Python 2; the bucket count handed to
    # $bucketAuto is kept integral on every interpreter.
    bucket_count = int(math.ceil(size_in_mb / desired_chunk_size_in_mb))
    with MongoClient(self.uri, **self.spec) as client:
      pipeline = [
          {
              # filter by positions and by the custom filter if any
              '$match': self._merge_id_filter(start_pos, end_pos)
          },
          {
              '$bucketAuto': {
                  'groupBy': '$_id', 'buckets': bucket_count
              }
          }
      ]
      buckets = list(client[self.db][self.coll].aggregate(pipeline))
      if buckets:
        # Stretch the last bucket up to end_pos so that the bundles built
        # from the buckets cover the whole requested range.
        buckets[-1]['_id']['max'] = end_pos

      return buckets

  def _merge_id_filter(self, start_position, stop_position):
    """Returns the query filter for the given ``_id`` range.

    The custom filter, if any, is combined with the range through ``$and``.
    """
    # Merge the default filter (if any) with refined _id field range
    # of range_tracker.
    # $gte specifies start position (inclusive)
    # and $lt specifies the end position (exclusive),
    # see more at
    # https://docs.mongodb.com/manual/reference/operator/query/gte/ and
    # https://docs.mongodb.com/manual/reference/operator/query/lt/
    id_filter = {'_id': {'$gte': start_position, '$lt': stop_position}}
    if self.filter:
      all_filters = {
          # see more at
          # https://docs.mongodb.com/manual/reference/operator/query/and/
          '$and': [self.filter.copy(), id_filter]
      }
    else:
      all_filters = id_filter

    return all_filters

  def _get_head_document_id(self, sort_order):
    """Returns the ``_id`` of the first document in the given ``_id`` order.

    Raises:
      ValueError: If the collection holds no document.
    """
    with MongoClient(self.uri, **self.spec) as client:
      cursor = client[self.db][self.coll].find(
          filter={}, projection=[]).sort([('_id', sort_order)]).limit(1)
      try:
        return cursor[0]['_id']
      except IndexError:
        raise ValueError('Empty Mongodb collection')

  def _replace_none_positions(self, start_position, stop_position):
    """Replaces ``None`` positions by the bounds of the collection."""
    if start_position is None:
      start_position = self._get_head_document_id(ASCENDING)
    if stop_position is None:
      last_doc_id = self._get_head_document_id(DESCENDING)
      # increment last doc id binary value by 1 to make sure the last document
      # is not excluded
      stop_position = _ObjectIdHelper.increment_id(last_doc_id, 1)
    return start_position, stop_position

  def _count_id_range(self, start_position, stop_position):
    """Counts the documents matching the filter within the ``_id`` range."""
    # Number of documents between start_position (inclusive)
    # and stop_position (exclusive), respecting the custom filter if any.
    with MongoClient(self.uri, **self.spec) as client:
      return client[self.db][self.coll].count_documents(
          filter=self._merge_id_filter(start_position, stop_position))
class _ObjectIdHelper(object):
  """A Utility class to manipulate bson object ids."""
  @classmethod
  def id_to_int(cls, id):
    """
    Args:
      id: ObjectId required for each MongoDB document _id field.

    Returns: Converted integer value of ObjectId's 12 bytes binary value.
    """
    # converts object id binary to integer
    # id object is bytes type with size of 12, read as three big-endian
    # unsigned 32-bit words.
    high, mid, low = struct.unpack('>III', id.binary)
    return (high << 64) + (mid << 32) + low

  @classmethod
  def int_to_id(cls, number):
    """
    Args:
      number(int): The integer value to be used to convert to ObjectId.

    Returns: The ObjectId that has the 12 bytes binary converted from the
      integer value.

    Raises:
      ValueError: If ``number`` is outside ``[0, 2 ** 96)``.
    """
    # converts integer value to object id. Int value should be less than
    # (2 ^ 96) so it can be convert to 12 bytes required by object id.
    if number < 0 or number >= (1 << 96):
      raise ValueError('number value must be within [0, %s)' % (1 << 96))
    ints = [(number & 0xffffffff0000000000000000) >> 64,
            (number & 0x00000000ffffffff00000000) >> 32,
            number & 0x0000000000000000ffffffff]

    packed = struct.pack('>III', *ints)
    return objectid.ObjectId(packed)

  @classmethod
  def increment_id(cls, object_id, inc):
    """
    Args:
      object_id: The ObjectId to change.
      inc(int): The incremental int value to be added to ObjectId.

    Returns: A new ObjectId whose integer value is the integer value of
      ``object_id`` plus ``inc``.

    Raises:
      ValueError: If the resulting value is outside ``[0, 2 ** 96)``.
    """
    # increment object_id binary value by inc value and return new object id.
    id_number = cls.id_to_int(object_id)
    new_number = id_number + inc
    if new_number < 0 or new_number >= (1 << 96):
      # The parentheses matter: "-" binds tighter than "<<", so without them
      # the upper bound would be computed as 1 << (96 - id_number).
      raise ValueError(
          'invalid incremental, inc value must be within ['
          '%s, %s)' % (0 - id_number, (1 << 96) - id_number))
    return cls.int_to_id(new_number)
class _ObjectIdRangeTracker(OrderedPositionRangeTracker):
  """RangeTracker for tracking mongodb _id of bson ObjectId type."""
  def position_to_fraction(self, pos, start, end):
    """Maps ``pos`` to its relative location between ``start`` and ``end``.

    All three ids are converted to their integer values first.
    """
    as_int = _ObjectIdHelper.id_to_int
    pos_value = as_int(pos)
    lower = as_int(start)
    upper = as_int(end)
    covered = pos_value - lower
    span = upper - lower
    return covered / span

  def fraction_to_position(self, fraction, start, end):
    """Maps ``fraction`` of the range ``start``..``end`` back to an ObjectId.

    A result that would fall on or before ``start`` becomes ``start + 1``, and
    one that would fall on or after ``end`` becomes ``end - 1``.
    """
    lower = _ObjectIdHelper.id_to_int(start)
    upper = _ObjectIdHelper.id_to_int(end)
    span = upper - lower
    candidate = int(span * fraction + lower)
    # make sure split position is larger than start position and smaller than
    # end position.
    if candidate <= lower:
      result = _ObjectIdHelper.increment_id(start, 1)
    elif candidate >= upper:
      result = _ObjectIdHelper.increment_id(end, -1)
    else:
      result = _ObjectIdHelper.int_to_id(candidate)
    return result
@experimental()
class WriteToMongoDB(PTransform):
  """A ``PTransform`` writing a ``PCollection`` of MongoDB documents to the
  configured MongoDB server.

  To keep the writes idempotent, so that bundles can be retried without
  creating duplicates, two transforms run before the final write stage: an id
  generation step and a ``Reshuffle``::

                 -----------------------------------------------
    Pipeline --> |GenerateId --> Reshuffle --> WriteToMongoSink|
                 -----------------------------------------------
                                  (WriteToMongoDB)

  The id generation step adds a unique *_id* field, in the same format as the
  MongoDB default, to every document that does not have one yet. The
  ``Reshuffle`` step prevents fusion between the id generation and the final
  write stage, so that the documents and their ids are not regenerated when
  the write step is retried after a failure. This avoids writing the same
  document twice under different ids.
  """
  def __init__(
      self,
      uri='mongodb://localhost:27017',
      db=None,
      coll=None,
      batch_size=100,
      extra_client_params=None):
    """
    Args:
      uri (str): The MongoDB connection string following the URI format
      db (str): The MongoDB database name
      coll (str): The MongoDB collection name
      batch_size(int): Number of documents per bulk_write to MongoDB,
        default to 100
      extra_client_params(dict): Optional `MongoClient
       <https://api.mongodb.com/python/current/api/pymongo/mongo_client.html>`_
       parameters as keyword arguments

    Returns:
      :class:`~apache_beam.transforms.ptransform.PTransform`
    """
    # Both names are mandatory even though the signature defaults them.
    if not isinstance(db, str):
      raise ValueError('WriteToMongoDB db param must be specified as a string')
    if not isinstance(coll, str):
      raise ValueError(
          'WriteToMongoDB coll param must be specified as a string')
    self._uri = uri
    self._db = db
    self._coll = coll
    self._batch_size = batch_size
    self._spec = {} if extra_client_params is None else extra_client_params

  def expand(self, pcoll):
    """Chains id generation, a reshuffle and the batched MongoDB write."""
    with_ids = pcoll | beam.ParDo(_GenerateObjectIdFn())
    reshuffled = with_ids | Reshuffle()
    write_fn = _WriteMongoFn(
        self._uri, self._db, self._coll, self._batch_size, self._spec)
    return reshuffled | beam.ParDo(write_fn)
class _GenerateObjectIdFn(DoFn):
  """DoFn that gives every element lacking an ``_id`` a new ObjectId."""
  def process(self, element, *args, **kwargs):
    # An existing _id is kept untouched; a missing one is generated here so
    # that the later write to mongodb is idempotent.
    # objectid.ObjectId() generates a unique identifier that follows mongodb
    # default format, if _id is not present in document, mongodb server
    # generates it with this same function upon write. However the
    # uniqueness of generated id may not be guaranteed if the work load are
    # distributed across too many processes. See more on the ObjectId format
    # https://docs.mongodb.com/manual/reference/bson-types/#objectid.
    has_id = '_id' in element
    if not has_id:
      element['_id'] = objectid.ObjectId()
    yield element
class _WriteMongoFn(DoFn):
  """DoFn that buffers elements and writes them to MongoDB in batches."""
  def __init__(
      self, uri=None, db=None, coll=None, batch_size=100, extra_params=None):
    """
    Args:
      uri: Connection string handed to the sink.
      db: Target database name.
      coll: Target collection name.
      batch_size: Number of buffered elements that triggers a write.
      extra_params: Optional dict of extra client parameters; ``None`` is
        stored as an empty dict.
    """
    self.uri = uri
    self.db = db
    self.coll = coll
    self.spec = {} if extra_params is None else extra_params
    self.batch_size = batch_size
    self.batch = []

  def finish_bundle(self):
    # Write whatever is still buffered at the end of the bundle.
    self._flush()

  def process(self, element, *args, **kwargs):
    pending = self.batch
    pending.append(element)
    if len(pending) < self.batch_size:
      return
    self._flush()

  def _flush(self):
    """Writes the buffered elements through a ``_MongoSink`` and resets the
    buffer; does nothing when the buffer is empty."""
    if not self.batch:
      return
    with _MongoSink(self.uri, self.db, self.coll, self.spec) as sink:
      sink.write(self.batch)
      self.batch = []

  def display_data(self):
    res = super(_WriteMongoFn, self).display_data()
    res.update({
        'database': self.db,
        'collection': self.coll,
        'batch_size': self.batch_size,
    })
    return res
class _MongoSink(object):
  """Context manager that bulk-writes documents to one MongoDB collection.

  The client is created lazily, either on entering the context or on the
  first :meth:`write`, and is closed and dropped on leaving the context, so
  the sink can be entered again afterwards.
  """
  def __init__(self, uri=None, db=None, coll=None, extra_params=None):
    """
    Args:
      uri: Connection string passed to ``MongoClient`` as ``host``.
      db: Target database name.
      coll: Target collection name.
      extra_params: Optional dict of extra ``MongoClient`` keyword arguments;
        ``None`` is stored as an empty dict.
    """
    if extra_params is None:
      extra_params = {}
    self.uri = uri
    self.db = db
    self.coll = coll
    self.spec = extra_params
    self.client = None

  def _ensure_client(self):
    """Returns the current client, creating it first when there is none."""
    if self.client is None:
      self.client = MongoClient(host=self.uri, **self.spec)
    return self.client

  def write(self, documents):
    """Upserts ``documents`` with one ``bulk_write`` of ``ReplaceOne`` requests.

    Args:
      documents: Iterable of dict-like documents. Each one is matched on its
        ``_id`` entry (``None`` when missing).
    """
    # match document based on _id field, if not found in current collection,
    # insert new one, otherwise overwrite it.
    requests = [
        ReplaceOne(
            filter={'_id': doc.get('_id', None)},
            replacement=doc,
            upsert=True) for doc in documents
    ]
    if not requests:
      # Nothing to send; avoids opening a connection for an empty batch.
      return
    resp = self._ensure_client()[self.db][self.coll].bulk_write(requests)
    # Arguments are handed to the logger so the message is only formatted
    # when debug logging is enabled.
    _LOGGER.debug(
        'BulkWrite to MongoDB result in nModified:%d, nUpserted:%d, '
        'nMatched:%d, Errors:%s',
        resp.modified_count,
        resp.upserted_count,
        resp.matched_count,
        resp.bulk_api_result.get('writeErrors'))

  def __enter__(self):
    self._ensure_client()
    return self

  def __exit__(self, exc_type, exc_val, exc_tb):
    if self.client is not None:
      self.client.close()
      # Drop the closed handle so a later use creates a fresh client.
      self.client = None