Source code for apache_beam.testing.benchmarks.nexmark.nexmark_util

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Utilities for the Nexmark suite.

The Nexmark suite is a series of queries (streaming pipelines) performed
on a simulation of auction events. This util includes:

  - A Command class used to terminate the streaming jobs
    launched in nexmark_launcher.py by the DirectRunner.
  - A ParseEventFn DoFn to parse events received from PubSub.

Usage:

To run a process for a certain duration, define in the code:
  command = Command(process_to_terminate, args)
  command.run(timeout=duration)

"""

# pytype: skip-file

import json
import logging
import threading

import apache_beam as beam
from apache_beam.metrics import MetricsFilter
from apache_beam.runners.runner import PipelineResult  # pylint: disable=unused-import
from apache_beam.testing.benchmarks.nexmark.models import auction_bid
from apache_beam.testing.benchmarks.nexmark.models import nexmark_model
from apache_beam.testing.benchmarks.nexmark.models.field_name import FieldNames
from apache_beam.transforms import window
from apache_beam.utils.timestamp import Timestamp

_LOGGER = logging.getLogger(__name__)


class Command(object):
    """Runs a callable in a daemon thread and waits on it for a bounded time.

    Attributes:
      cmd: the callable to invoke.
      args: positional arguments, unpacked as ``cmd(*args)``.
    """
    def __init__(self, cmd, args):
        self.cmd = cmd
        self.args = args

    def run(self, timeout):
        """Start ``cmd(*args)`` in a daemon thread and wait up to ``timeout``.

        Note that the thread is never stopped here: ``join(timeout)`` merely
        stops waiting once the timeout has passed. Because the thread is
        marked as a daemon it does not keep the interpreter alive afterwards.

        Args:
          timeout: maximum time to block, forwarded to ``Thread.join``. It is
            also formatted with ``%d`` in the log messages.
        """
        def thread_target():
            # Both messages go through the module logger so they share one
            # logger name and configuration.
            _LOGGER.debug(
                'Starting thread for %d seconds: %s',
                timeout,
                self.cmd.__name__)

            self.cmd(*self.args)
            # Only reached when cmd returns on its own.
            _LOGGER.info(
                '%d seconds elapsed. Thread (%s) finished.',
                timeout,
                self.cmd.__name__)

        thread = threading.Thread(target=thread_target, name='Thread-timeout')
        thread.daemon = True
        thread.start()
        thread.join(timeout)
def setup_coder():
    """Register the Nexmark model coders with Beam's global coder registry.

    Associates Auction, Person, Bid and AuctionBid with their dedicated coder
    classes, in that order.
    """
    registry = beam.coders.registry
    registry.register_coder(nexmark_model.Auction, nexmark_model.AuctionCoder)
    registry.register_coder(nexmark_model.Person, nexmark_model.PersonCoder)
    registry.register_coder(nexmark_model.Bid, nexmark_model.BidCoder)
    registry.register_coder(auction_bid.AuctionBid, auction_bid.AuctionBidCoder)
class ParseEventFn(beam.DoFn):
    """Original parser turning a comma-separated event line into a model.

    The first character of the line selects the model class: 'p' for Person,
    'a' for Auction and 'b' for Bid. The line is split on ',' and all fields
    (including the prefixed first one) are passed positionally to the model
    constructor.

    Each event line has the following format (wrapped here for readability;
    an actual event is a single line):

      person: <id starting with 'p'>,name,email,credit_card,city,
              state,timestamp,extra
      auction: <id starting with 'a'>,item_name,description,initial_bid,
               reserve_price,timestamp,expires,seller,category,extra
      bid: <auction starting with 'b'>,bidder,price,timestamp,extra

    For example:

      'p12345,maria,maria@maria.com,1234-5678-9012-3456,
       sunnyvale,CA,1528098831536'
      'a12345,car67,2012 hyundai elantra,15000,20000,
       1528098831536,20180630,maria,vehicle'
      'b12345,maria,20000,1528098831536'
    """
    def process(self, elem):
        """Parse one event line and yield the corresponding model object.

        Args:
          elem: the raw event line (a str).

        Yields:
          A nexmark_model.Person, Auction or Bid built from the split fields.

        Raises:
          ValueError: if the line is empty or its first character is not one
            of 'p', 'a' or 'b'.
        """
        model_dict = {
            'p': nexmark_model.Person,
            'a': nexmark_model.Auction,
            'b': nexmark_model.Bid,
        }
        row = elem.split(',')
        # Slice instead of index so an empty line falls through to the
        # ValueError below rather than raising IndexError.
        model = model_dict.get(elem[:1])
        if model is None:
            raise ValueError('Invalid event: %s.' % row)
        event = model(*row)
        _LOGGER.debug('Parsed event: %s', event)
        yield event
class ParseJsonEventFn(beam.DoFn):
    r"""Parses a raw JSON event into a Python model object.

    The event type is inferred from which key is present, checked in this
    order: FieldNames.NAME -> Person, FieldNames.ITEM_NAME -> Auction,
    FieldNames.AUCTION -> Bid.

    Each event has the following fields:

      person: {id,name,email,credit_card,city,state,timestamp,extra}
      auction: {id,item_name,description,initial_bid,reserve_price,
                timestamp,expires,seller,category,extra}
      bid: {auction,bidder,price,timestamp,extra}

    For example (wrapped here for readability; an actual event is a single
    JSON document):

      {"id":1000,"name":"Peter Jones","emailAddress":"nhd@xcat.com",
       "creditCard":"7241 7320 9143 4888","city":"Portland","state":"WY",
       "dateTime":1528098831026,"extra":"WN_HS_bnpVQ\\[["}

      {"id":1000,"itemName":"wkx mgee",
       "description":"eszpqxtdxrvwmmywkmogoahf",
       "initialBid":28873,"reserve":29448,"dateTime":1528098831036,
       "expires":1528098840451,"seller":1000,"category":13,
       "extra":"zcuupiz"}

      {"auction":1000,"bidder":1001,"price":32530001,
       "dateTime":1528098831066,
       "extra":"fdiysaV^]NLVsbolvyqwgticfdrwdyiyofWPYTOuwogvszlxjrcNOORM"}
    """
    def process(self, elem):
        """Decode one JSON event and yield the matching model object.

        The date-time and expiry fields may each be either a plain
        milliseconds value or a dict carrying that value under 'millis'; the
        dict form is unwrapped before conversion.

        Args:
          elem: the JSON-encoded event.

        Yields:
          A nexmark_model.Person, Auction or Bid.

        Raises:
          ValueError: if the decoded JSON is not an object, or contains none
            of the keys that identify a known event type.
          KeyError: if a recognised event lacks one of its required fields.
        """
        json_dict = json.loads(elem)
        if not isinstance(json_dict, dict):
            # Valid JSON that is not an object (list, number, ...) cannot be
            # an event.
            raise ValueError('Invalid event: %s.' % str(json_dict))

        # .get() so that an unrecognised event without a date-time reaches
        # the explicit 'Invalid event' error instead of a bare KeyError.
        date_time = json_dict.get(FieldNames.DATE_TIME)
        if isinstance(date_time, dict):
            json_dict[FieldNames.DATE_TIME] = date_time['millis']

        if FieldNames.NAME in json_dict:
            yield nexmark_model.Person(
                json_dict[FieldNames.ID],
                json_dict[FieldNames.NAME],
                json_dict[FieldNames.EMAIL_ADDRESS],
                json_dict[FieldNames.CREDIT_CARD],
                json_dict[FieldNames.CITY],
                json_dict[FieldNames.STATE],
                millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
                json_dict[FieldNames.EXTRA])
        elif FieldNames.ITEM_NAME in json_dict:
            expires = json_dict[FieldNames.EXPIRES]
            if isinstance(expires, dict):
                json_dict[FieldNames.EXPIRES] = expires['millis']
            yield nexmark_model.Auction(
                json_dict[FieldNames.ID],
                json_dict[FieldNames.ITEM_NAME],
                json_dict[FieldNames.DESCRIPTION],
                json_dict[FieldNames.INITIAL_BID],
                json_dict[FieldNames.RESERVE],
                millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
                millis_to_timestamp(json_dict[FieldNames.EXPIRES]),
                json_dict[FieldNames.SELLER],
                json_dict[FieldNames.CATEGORY],
                json_dict[FieldNames.EXTRA])
        elif FieldNames.AUCTION in json_dict:
            yield nexmark_model.Bid(
                json_dict[FieldNames.AUCTION],
                json_dict[FieldNames.BIDDER],
                json_dict[FieldNames.PRICE],
                millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
                json_dict[FieldNames.EXTRA])
        else:
            raise ValueError('Invalid event: %s.' % str(json_dict))
class CountAndLog(beam.PTransform):
    """Counts all elements of a PCollection and logs the total."""
    def expand(self, pcoll):
        """Re-window into the global window, count globally, log the count.

        Args:
          pcoll: the input PCollection.

        Returns:
          The PCollection produced by mapping log_count_info over the count.
        """
        in_global_window = (
            pcoll | 'window' >> beam.WindowInto(window.GlobalWindows()))
        total = in_global_window | "Count" >> beam.combiners.Count.Globally()
        return total | "Log" >> beam.Map(log_count_info)
def log_count_info(count):
    """Log the number of query results at INFO level and return it unchanged.

    Args:
      count: the number of results; formatted with ``%d``.

    Returns:
      The same ``count`` that was passed in.
    """
    # Use the module logger rather than the root logger, consistent with the
    # rest of this module.
    _LOGGER.info('Query resulted in %d results', count)
    return count
def display(elm):
    """Log an element at DEBUG level and return it unchanged.

    Args:
      elm: the element to log; it is passed to the logger as the message.

    Returns:
      The same ``elm`` that was passed in.
    """
    # Use the module logger rather than the root logger, consistent with the
    # rest of this module.
    _LOGGER.debug(elm)
    return elm
def model_to_json(model):
    """Serialize a model object to a compact JSON string.

    The object is first converted with construct_json_dict, then dumped
    using ',' and ':' as separators (no whitespace).
    """
    as_dict = construct_json_dict(model)
    compact_separators = (',', ':')
    return json.dumps(as_dict, separators=compact_separators)
def construct_json_dict(model):
    """Build a dict from an object's instance attributes.

    Every value found in ``model.__dict__`` is passed through unnest_to_json;
    attribute names are kept as the keys.
    """
    json_ready = {}
    for attr_name, attr_value in model.__dict__.items():
        json_ready[attr_name] = unnest_to_json(attr_value)
    return json_ready
def unnest_to_json(cand):
    """Convert a single attribute value into a JSON-friendly form.

    - A Timestamp becomes its ``micros`` value floor-divided by 1000.
    - An Auction, Bid or Person is expanded with construct_json_dict.
    - Anything else is returned as-is.
    """
    if isinstance(cand, Timestamp):
        return cand.micros // 1000

    nested_models = (
        nexmark_model.Auction, nexmark_model.Bid, nexmark_model.Person)
    if isinstance(cand, nested_models):
        return construct_json_dict(cand)

    return cand
def millis_to_timestamp(millis: int) -> Timestamp:
    """Build a Timestamp from a millisecond value.

    Args:
      millis: the time in milliseconds.

    Returns:
      A Timestamp constructed with ``micros`` set to ``millis * 1000``.
    """
    return Timestamp(micros=millis * 1000)
def get_counter_metric(
        result: PipelineResult, namespace: str, name: str) -> int:
    """Read a single counter metric from a pipeline result.

    Args:
      result: the PipelineResult whose metrics are queried
      namespace: the namespace of the wanted metric
      name: the name of the wanted metric

    Returns:
      the value of the matching counter, or -1 if no counter matches

    Raises:
      RuntimeError: if more than one counter matches the namespace and name
    """
    pipeline_metrics = result.metrics()
    metric_filter = MetricsFilter().with_namespace(namespace).with_name(name)
    counters = pipeline_metrics.query(metric_filter)['counters']

    num_matches = len(counters)
    if num_matches > 1:
        raise RuntimeError(
            '%d instead of one metric result matches name: %s in namespace %s'
            % (num_matches, name, namespace))
    if num_matches == 0:
        return -1
    return counters[0].result
def get_start_time_metric(
        result: PipelineResult, namespace: str, name: str) -> int:
    """Find the earliest time recorded by the given distribution metric.

    Args:
      result: the PipelineResult whose metrics are queried
      namespace: the namespace of the wanted metric
      name: the name of the wanted metric

    Returns:
      the smallest ``min`` over all matching distributions, or -1 if there
      are none
    """
    pipeline_metrics = result.metrics()
    metric_filter = MetricsFilter().with_namespace(namespace).with_name(name)
    matching = pipeline_metrics.query(metric_filter)['distributions']
    minimums = [dist.result.min for dist in matching]
    return min(minimums, default=-1)
def get_end_time_metric(
        result: PipelineResult, namespace: str, name: str) -> int:
    """Find the latest time recorded by the given distribution metric.

    Args:
      result: the PipelineResult whose metrics are queried
      namespace: the namespace of the wanted metric
      name: the name of the wanted metric

    Returns:
      the largest ``max`` over all matching distributions, or -1 if there
      are none
    """
    pipeline_metrics = result.metrics()
    metric_filter = MetricsFilter().with_namespace(namespace).with_name(name)
    matching = pipeline_metrics.query(metric_filter)['distributions']
    maximums = [dist.result.max for dist in matching]
    return max(maximums, default=-1)