#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Utilities for relating schema-aware PCollections and DataFrame transforms.
The utilities here enforce the type mapping defined in
:mod:`apache_beam.typehints.pandas_type_compatibility`.
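
A hedged, end-to-end sketch (``Example``, ``p``, and the element values are
illustrative assumptions, not part of this module)::

  import typing

  class Example(typing.NamedTuple):
    word: str
    count: int

  rows = p | beam.Create([Example('a', 1)]).with_output_types(Example)
  frames = rows | BatchRowsAsDataFrame()  # batches rows into pd.DataFrames
  rows_again = frames | UnbatchPandas(generate_proxy(Example))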
"""

# pytype: skip-file

import warnings
from typing import Any
from typing import Dict
from typing import NamedTuple
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import TypeVar
from typing import Union

import pandas as pd

import apache_beam as beam
from apache_beam import typehints
from apache_beam.transforms.util import BatchElements
from apache_beam.typehints.pandas_type_compatibility import INDEX_OPTION_NAME
from apache_beam.typehints.pandas_type_compatibility import create_pandas_batch_converter
from apache_beam.typehints.pandas_type_compatibility import dtype_from_typehint
from apache_beam.typehints.pandas_type_compatibility import dtype_to_fieldtype
from apache_beam.typehints.row_type import RowTypeConstraint
from apache_beam.typehints.schemas import named_fields_from_element_type
from apache_beam.typehints.typehints import normalize

__all__ = (
'BatchRowsAsDataFrame',
'generate_proxy',
'UnbatchPandas',
'element_type_from_dataframe')

T = TypeVar('T', bound=NamedTuple)


@typehints.with_input_types(T)
@typehints.with_output_types(pd.DataFrame)
class BatchRowsAsDataFrame(beam.PTransform):
"""A transform that batches schema-aware PCollection elements into DataFrames
Batching parameters are inherited from
:class:`~apache_beam.transforms.util.BatchElements`.
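
  A hedged usage sketch (``Example`` is an assumed ``typing.NamedTuple`` row
  type, as in the module docstring; not part of this module)::

    with beam.Pipeline() as p:
      frames = (
          p
          | beam.Create([Example('a', 1), Example('b', 2)
                        ]).with_output_types(Example)
          | BatchRowsAsDataFrame())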
"""
def __init__(self, *args, proxy=None, **kwargs):
self._batch_elements_transform = BatchElements(*args, **kwargs)
self._proxy = proxy

  def expand(self, pcoll):
if self._proxy is not None:
# Generate typehint
proxy = self._proxy
element_typehint = _element_typehint_from_proxy(proxy)
else:
# Generate proxy
proxy = generate_proxy(pcoll.element_type)
element_typehint = pcoll.element_type
converter = create_pandas_batch_converter(
element_type=element_typehint, batch_type=type(proxy))
return (
pcoll | self._batch_elements_transform
| beam.Map(converter.produce_batch))


def generate_proxy(element_type):
# type: (type) -> pd.DataFrame
"""Generate a proxy pandas object for the given PCollection element_type.
Currently only supports generating a DataFrame proxy from a schema-aware
PCollection or a Series proxy from a primitively typed PCollection.
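
  A hedged sketch of both cases (``Example`` is the assumed row type from the
  module docstring)::

    generate_proxy(int)      # an empty pd.Series with an integer dtype
    generate_proxy(Example)  # an empty pd.DataFrame, one column per field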
"""
dtype = dtype_from_typehint(element_type)
if dtype is not object:
return pd.Series(dtype=dtype)
else:
fields = named_fields_from_element_type(element_type)
proxy = pd.DataFrame(columns=[name for name, _ in fields])
for name, typehint in fields:
dtype = dtype_from_typehint(typehint)
proxy[name] = proxy[name].astype(dtype)
return proxy


def element_type_from_dataframe(proxy, include_indexes=False):
# type: (pd.DataFrame, bool) -> type
"""Generate an element_type for an element-wise PCollection from a proxy
pandas object. Currently only supports converting the element_type for
a schema-aware PCollection to a proxy DataFrame.
Currently only supports generating a DataFrame proxy from a schema-aware
PCollection.
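
  A hedged sketch (the column names and dtypes are illustrative)::

    proxy = pd.DataFrame({
        'word': pd.Series(dtype='object'),
        'count': pd.Series(dtype='int64'),
    })
    row_type = element_type_from_dataframe(proxy)
    # row_type is a generated NamedTuple-like type with fields
    # 'word' and 'count'.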
"""
return element_typehint_from_dataframe_proxy(proxy, include_indexes).user_type


def _element_typehint_from_proxy(
proxy: pd.core.generic.NDFrame, include_indexes: bool = False):
if isinstance(proxy, pd.DataFrame):
return element_typehint_from_dataframe_proxy(
proxy, include_indexes=include_indexes)
elif isinstance(proxy, pd.Series):
if include_indexes:
warnings.warn(
"include_indexes=True for a Series input. Note that this "
"parameter is _not_ respected for DeferredSeries "
"conversion.")
return dtype_to_fieldtype(proxy.dtype)
else:
raise TypeError(f"Proxy '{proxy}' has unsupported type '{type(proxy)}'")


def element_typehint_from_dataframe_proxy(
proxy: pd.DataFrame, include_indexes: bool = False) -> RowTypeConstraint:
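  """Generate a :class:`RowTypeConstraint` for the given DataFrame proxy.

  When ``include_indexes=True``, every index level must have a unique,
  non-conflicting name; each level becomes a schema field tagged with
  ``INDEX_OPTION_NAME``.
  """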
output_columns = []
if include_indexes:
remaining_index_names = list(proxy.index.names)
i = 0
    while remaining_index_names:
index_name = remaining_index_names.pop(0)
if index_name is None:
raise ValueError(
"Encountered an unnamed index. Cannot convert to a "
"schema-aware PCollection with include_indexes=True. "
"Please name all indexes or consider not including "
"indexes.")
elif index_name in remaining_index_names:
raise ValueError(
"Encountered multiple indexes with the name '%s'. "
"Cannot convert to a schema-aware PCollection with "
"include_indexes=True. Please ensure all indexes have "
"unique names or consider not including indexes." % index_name)
elif index_name in proxy.columns:
raise ValueError(
"Encountered an index that has the same name as one "
"of the columns, '%s'. Cannot convert to a "
"schema-aware PCollection with include_indexes=True. "
"Please ensure all indexes have unique names or "
"consider not including indexes." % index_name)
else:
        # This index level has a unique, non-conflicting name; include it
        # as a schema field.
output_columns.append(
(index_name, proxy.index.get_level_values(i).dtype))
i += 1
output_columns.extend(zip(proxy.columns, proxy.dtypes))
fields = [(column, dtype_to_fieldtype(dtype))
for (column, dtype) in output_columns]
field_options: Optional[Dict[str, Sequence[Tuple[str, Any]]]]
if include_indexes:
field_options = {
index_name: [(INDEX_OPTION_NAME, None)]
for index_name in proxy.index.names
}
else:
field_options = None
return RowTypeConstraint.from_fields(fields, field_options=field_options)


def _unbatch_transform(proxy, include_indexes):
element_typehint = normalize(
_element_typehint_from_proxy(proxy, include_indexes=include_indexes))
converter = create_pandas_batch_converter(
element_type=element_typehint, batch_type=type(proxy))
return beam.FlatMap(
converter.explode_batch).with_output_types(element_typehint)


@typehints.with_input_types(Union[pd.DataFrame, pd.Series])
class UnbatchPandas(beam.PTransform):
"""A transform that explodes a PCollection of DataFrame or Series. DataFrame
is converterd to a schema-aware PCollection, while Series is converted to its
underlying type.
Args:
include_indexes: (optional, default: False) When unbatching a DataFrame
if include_indexes=True, attempt to include index columns in the output
schema for expanded DataFrames. Raises an error if any of the index
levels are unnamed (name=None), or if any of the names are not unique
among all column and index names.
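
  A hedged usage sketch (``frames`` is an assumed PCollection of DataFrames,
  e.g. the output of :class:`BatchRowsAsDataFrame`, and ``Example`` its row
  type)::

    rows = frames | UnbatchPandas(generate_proxy(Example))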
"""
def __init__(self, proxy, include_indexes=False):
self._proxy = proxy
self._include_indexes = include_indexes

  def expand(self, pcoll):
return pcoll | _unbatch_transform(self._proxy, self._include_indexes)