# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""hypothesis strategies for generating schema types.

Intended for internal use only, no backward-compatibility guarantees."""

import keyword
import unicodedata
from typing import Mapping
from typing import Optional
from typing import Sequence

from hypothesis import strategies as st

from apache_beam.typehints import row_type
from apache_beam.typehints.schemas import _PRIMITIVES

# Leaf types for the recursive ``types()`` strategy: the first element of
# every entry in the schemas module's ``_PRIMITIVES`` table.
PRIMITIVES = [entry[0] for entry in _PRIMITIVES]

def field_names():
    """Strategy to produce valid field names for Beam schema types.

    Candidate names are assembled from characters outside a denylist of
    unicode categories and then filtered, so every generated value is a
    Python identifier that is not a reserved keyword.

    Returns:
      A hypothesis strategy that generates ``str`` field names.
    """
    @st.composite
    def field_name_candidates(draw):
        """Draw one candidate: a single leading character plus a remainder."""
        # Unicode categories kept out of generated names. The isidentifier()
        # filter applied below is what finally guarantees validity; this
        # denylist only keeps the rejection rate of that filter low.
        identifier_denylist = (
            'Lo', 'Lm', 'C', 'P', 'Sm', 'Sc', 'So', 'Sk', 'M', 'No', 'Z')

        # First character can't be numeric (Nd).
        # It also can't start with '_' in a NamedTuple.
        field_first_character = draw(
            st.text(
                alphabet=st.characters(
                    blacklist_categories=('Nd', ) + identifier_denylist,
                    blacklist_characters=('_')),
                min_size=1,
                max_size=1))
        # The remainder may be empty, and may contain digits and underscores.
        field_remainder = draw(
            st.text(
                alphabet=st.characters(
                    blacklist_categories=identifier_denylist)))

        return field_first_character + field_remainder

    return field_name_candidates().filter(
        lambda s: s.isidentifier() and not keyword.iskeyword(s))
def _named_fields_from_types(types):
    """Strategy to produce a non-empty list of uniquely named fields.

    Args:
      types: A hypothesis strategy used to draw the type of each field.

    Returns:
      A hypothesis strategy that generates lists of ``(name, type)`` tuples
      with at least one element, where names come from ``field_names()``.
    """
    return st.lists(
        st.tuples(field_names(), types),
        min_size=1,
        # Python identifiers are normalized with form NFKC (see
        # https://docs.python.org/3/reference/lexical_analysis.html#identifiers).
        # We use the same normalization here to avoid name collisions.
        unique_by=lambda name_and_type: unicodedata.normalize(
            'NFKC', name_and_type[0]),
    )
def types():
    """Strategy to produce types that are convertible to Beam schema FieldType
    instances.

    Leaves are sampled from ``PRIMITIVES``; composite types are built by
    recursively wrapping already-generated types in ``Optional``,
    ``Sequence``, ``Mapping`` or a row type constraint.

    Returns:
      A hypothesis strategy that generates type hints.
    """
    def _extend_types(children):
        """Build the composite-type strategies on top of ``children``."""
        optionals = children.map(lambda typ: Optional[typ])
        sequences = children.map(lambda typ: Sequence[typ])
        mappings = st.tuples(children, children).map(
            lambda typs: Mapping[typs[0], typs[1]])
        rows = _named_fields_from_types(children).map(
            row_type.RowTypeConstraint.from_fields)

        return st.one_of(optionals, sequences, mappings, rows)

    # TODO: Currently this will only draw from the primitive types that can be
    # roundtripped faithfully (e.g. np.int64, not int). We should add support
    # for other types:
    # - Logical Types (e.g. Timestamp)
    # - Shunted primitive types (e.g. int)
    # We'll need to provide support for limiting the types that are drawn. This
    # could be similar to the allowlist[_categories]/denylist[_categories]
    # pattern used in st.characters.
    return st.recursive(st.sampled_from(PRIMITIVES), _extend_types)
def named_fields():
    """Strategy to produce a set of named fields (type
    ``List[Tuple[str, type]]``).

    Field types are drawn from ``types()``; the list is non-empty and names
    are unique after NFKC normalization (see ``_named_fields_from_types``).
    """
    return _named_fields_from_types(types())