#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Utilities for the Nexmark suite.
The Nexmark suite is a series of queries (streaming pipelines) performed
on a simulation of auction events. This util includes:
- A Command class used to terminate the streaming jobs
launched in nexmark_launcher.py by the DirectRunner.
- A ParseEventFn DoFn to parse events received from PubSub.
Usage:
To run a process for a certain duration, define in the code:
command = Command(process_to_terminate, args)
command.run(timeout=duration)
"""
# pytype: skip-file
import json
import logging
import threading
import apache_beam as beam
from apache_beam.metrics import MetricsFilter
from apache_beam.runners.runner import PipelineResult # pylint: disable=unused-import
from apache_beam.testing.benchmarks.nexmark.models import auction_bid
from apache_beam.testing.benchmarks.nexmark.models import nexmark_model
from apache_beam.testing.benchmarks.nexmark.models.field_name import FieldNames
from apache_beam.transforms import window
from apache_beam.utils.timestamp import Timestamp
# Module-level logger, named after this module via ``__name__``.
_LOGGER = logging.getLogger(__name__)
class Command(object):
  """Runs a callable in a daemon thread and waits on it for a bounded time.

  Usage::

    command = Command(process_to_terminate, args)
    command.run(timeout=duration)
  """
  def __init__(self, cmd, args):
    """Stores the callable and the arguments it will be invoked with.

    Args:
      cmd: the callable to invoke in the background thread.
      args: an iterable of positional arguments unpacked into ``cmd``.
    """
    self.cmd = cmd
    self.args = args

  def run(self, timeout):
    """Starts ``cmd(*args)`` in a daemon thread and joins it with a timeout.

    The call returns when the thread finishes or when ``timeout`` seconds
    have passed, whichever happens first. The thread itself is not stopped
    when the timeout expires; it is merely no longer waited for.

    Args:
      timeout: number of seconds handed to ``Thread.join``; ``None`` waits
        until ``cmd`` returns.
    """
    # Callables such as functools.partial objects carry no ``__name__``.
    # Resolve the label up front with a fallback so that logging can never
    # prevent the command itself from running.
    cmd_name = getattr(self.cmd, '__name__', repr(self.cmd))

    def thread_target():
      # ``%s`` rather than ``%d`` keeps the messages well-formed for ``None``
      # and fractional timeouts.
      _LOGGER.debug('Starting thread for %s seconds: %s', timeout, cmd_name)
      self.cmd(*self.args)
      _LOGGER.info(
          '%s seconds elapsed. Thread (%s) finished.', timeout, cmd_name)

    thread = threading.Thread(
        target=thread_target, name='Thread-timeout', daemon=True)
    thread.start()
    thread.join(timeout)
def setup_coder():
  """Registers a coder for each Nexmark model type in Beam's coder registry.

  Covers Auction, Person, Bid and AuctionBid, in that order.
  """
  coder_by_model = (
      (nexmark_model.Auction, nexmark_model.AuctionCoder),
      (nexmark_model.Person, nexmark_model.PersonCoder),
      (nexmark_model.Bid, nexmark_model.BidCoder),
      (auction_bid.AuctionBid, auction_bid.AuctionBidCoder),
  )
  for model_type, coder_type in coder_by_model:
    beam.coders.registry.register_coder(model_type, coder_type)
class ParseEventFn(beam.DoFn):
  """Original parser turning raw comma-separated events into Python objects.

  Each event line has the following format:

    person: <id starting with 'p'>,name,email,credit_card,city,
        state,timestamp,extra
    auction: <id starting with 'a'>,item_name,description,initial_bid,
        reserve_price,timestamp,expires,seller,category,extra
    bid: <auction starting with 'b'>,bidder,price,timestamp,extra

  For example:

    'p12345,maria,maria@maria.com,1234-5678-9012-3456,
        sunnyvale,CA,1528098831536'
    'a12345,car67,2012 hyundai elantra,15000,20000,
        1528098831536,20180630,maria,vehicle'
    'b12345,maria,20000,1528098831536'
  """
  def process(self, elem):
    """Parses one comma-separated event line into a model object.

    Args:
      elem: the raw event line; its first character selects the model.

    Yields:
      A ``nexmark_model.Person``, ``Auction`` or ``Bid`` built from the
      comma-separated fields of ``elem``, passed positionally.

    Raises:
      ValueError: if the line is empty or does not start with 'p', 'a'
        or 'b'.
    """
    model_dict = {
        'p': nexmark_model.Person,
        'a': nexmark_model.Auction,
        'b': nexmark_model.Bid,
    }
    row = elem.split(',')
    # Slice instead of index: an empty line gives '' (no model) and is
    # reported as an invalid event rather than failing with IndexError.
    model = model_dict.get(elem[:1])
    if not model:
      raise ValueError('Invalid event: %s.' % row)
    event = model(*row)
    logging.debug('Parsed event: %s', event)
    yield event
class ParseJsonEventFn(beam.DoFn):
  r"""Parses raw JSON event strings into Python model objects.

  Each event line has the following format:

    person: {id,name,email,credit_card,city,
        state,timestamp,extra}
    auction: {id,item_name,description,initial_bid,
        reserve_price,timestamp,expires,seller,category,extra}
    bid: {auction,bidder,price,timestamp,extra}

  For example:

    {"id":1000,"name":"Peter Jones","emailAddress":"nhd@xcat.com",
        "creditCard":"7241 7320 9143 4888","city":"Portland","state":"WY",
        "dateTime":1528098831026,"extra":"WN_HS_bnpVQ\\[["}
    {"id":1000,"itemName":"wkx mgee",
        "description":"eszpqxtdxrvwmmywkmogoahf",
        "initialBid":28873,"reserve":29448,"dateTime":1528098831036,
        "expires":1528098840451,"seller":1000,"category":13,
        "extra":"zcuupiz"}
    {"auction":1000,"bidder":1001,"price":32530001,
        "dateTime":1528098831066,
        "extra":"fdiysaV^]NLVsbolvyqwgticfdrwdyiyofWPYTOuwogvszlxjrcNOORM"}
  """
  def process(self, elem):
    """Decodes one JSON event and yields the matching model object.

    The event kind is chosen by which key is present, checked in this
    order: ``FieldNames.NAME`` (person), ``FieldNames.ITEM_NAME`` (auction),
    ``FieldNames.AUCTION`` (bid).

    Args:
      elem: a JSON document describing a single event.

    Yields:
      A ``nexmark_model.Person``, ``Auction`` or ``Bid`` whose time fields
      have been converted with ``millis_to_timestamp``.

    Raises:
      ValueError: if none of the three discriminating keys is present.
    """
    json_dict = json.loads(elem)
    # The time value may arrive wrapped in a dict carrying a 'millis'
    # entry; unwrap it so the rest of the method sees a plain number.
    if isinstance(json_dict[FieldNames.DATE_TIME], dict):
      json_dict[FieldNames.DATE_TIME] = json_dict[
          FieldNames.DATE_TIME]['millis']
    if FieldNames.NAME in json_dict:
      yield nexmark_model.Person(
          json_dict[FieldNames.ID],
          json_dict[FieldNames.NAME],
          json_dict[FieldNames.EMAIL_ADDRESS],
          json_dict[FieldNames.CREDIT_CARD],
          json_dict[FieldNames.CITY],
          json_dict[FieldNames.STATE],
          millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
          json_dict[FieldNames.EXTRA])
    elif FieldNames.ITEM_NAME in json_dict:
      # The expiry time gets the same unwrapping as the event time above.
      if isinstance(json_dict[FieldNames.EXPIRES], dict):
        json_dict[FieldNames.EXPIRES] = json_dict[FieldNames.EXPIRES]['millis']
      yield nexmark_model.Auction(
          json_dict[FieldNames.ID],
          json_dict[FieldNames.ITEM_NAME],
          json_dict[FieldNames.DESCRIPTION],
          json_dict[FieldNames.INITIAL_BID],
          json_dict[FieldNames.RESERVE],
          millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
          millis_to_timestamp(json_dict[FieldNames.EXPIRES]),
          json_dict[FieldNames.SELLER],
          json_dict[FieldNames.CATEGORY],
          json_dict[FieldNames.EXTRA])
    elif FieldNames.AUCTION in json_dict:
      yield nexmark_model.Bid(
          json_dict[FieldNames.AUCTION],
          json_dict[FieldNames.BIDDER],
          json_dict[FieldNames.PRICE],
          millis_to_timestamp(json_dict[FieldNames.DATE_TIME]),
          json_dict[FieldNames.EXTRA])
    else:
      raise ValueError('Invalid event: %s.' % str(json_dict))
class CountAndLog(beam.PTransform):
  """Counts every element of the input PCollection and logs the total."""
  def expand(self, pcoll):
    """Re-windows into the global window, counts globally, logs the count.

    Args:
      pcoll: the PCollection whose elements are counted.

    Returns:
      The PCollection produced by mapping ``log_count_info`` over the
      global count.
    """
    globally_windowed = (
        pcoll | 'window' >> beam.WindowInto(window.GlobalWindows()))
    total = globally_windowed | "Count" >> beam.combiners.Count.Globally()
    return total | "Log" >> beam.Map(log_count_info)
def log_count_info(count):
  """Logs the number of query results at INFO level and passes it through.

  Args:
    count: the number of results to report.

  Returns:
    ``count``, unchanged.
  """
  logging.info(
      'Query resulted in %d results',
      count,
  )
  return count
def display(elm):
  """Logs an element at DEBUG level and hands it back unchanged.

  Args:
    elm: the element to log.

  Returns:
    ``elm``, unchanged.
  """
  logging.debug(elm)
  return elm
def model_to_json(model):
  """Serializes a model object to compact JSON text.

  Args:
    model: the object to serialize; it is first converted with
      ``construct_json_dict``.

  Returns:
    A JSON string using ',' and ':' as separators, with no added spaces.
  """
  payload = construct_json_dict(model)
  return json.dumps(payload, separators=(',', ':'))
def construct_json_dict(model):
  """Builds a dict from a model's instance attributes.

  Args:
    model: an object whose ``__dict__`` holds the fields to export.

  Returns:
    A new dict mapping each attribute name to ``unnest_to_json(value)``.
  """
  json_ready = {}
  for field_name, field_value in model.__dict__.items():
    json_ready[field_name] = unnest_to_json(field_value)
  return json_ready
def unnest_to_json(cand):
  """Converts a single attribute value into a JSON-friendly form.

  Args:
    cand: the value to convert.

  Returns:
    Whole milliseconds (``micros // 1000``) for a ``Timestamp``, a dict from
    ``construct_json_dict`` for an Auction, Bid or Person, and ``cand``
    itself for anything else.
  """
  if isinstance(cand, Timestamp):
    return cand.micros // 1000

  nested_model_types = (
      nexmark_model.Auction, nexmark_model.Bid, nexmark_model.Person)
  if isinstance(cand, nested_model_types):
    return construct_json_dict(cand)

  return cand
def millis_to_timestamp(millis: int) -> Timestamp:
  """Builds a ``Timestamp`` from a millisecond value.

  Args:
    millis: a time value in milliseconds.

  Returns:
    A ``Timestamp`` constructed with ``micros=millis * 1000``.
  """
  return Timestamp(micros=millis * 1000)
def get_counter_metric(
    result: PipelineResult, namespace: str, name: str) -> int:
  """Reads a single counter metric from a pipeline result.

  Args:
    result: the PipelineResult the metrics are read from
    namespace: the namespace of the wanted metric
    name: the name of the wanted metric

  Returns:
    the value of the one matching counter, or -1 if no counter matches

  Raises:
    RuntimeError: if more than one counter matches the namespace and name
  """
  pipeline_metrics = result.metrics()
  metric_filter = MetricsFilter().with_namespace(namespace).with_name(name)
  counters = pipeline_metrics.query(metric_filter)['counters']

  match_count = len(counters)
  if match_count > 1:
    raise RuntimeError(
        '%d instead of one metric result matches name: %s in namespace %s' %
        (match_count, name, namespace))
  if match_count == 0:
    return -1
  return counters[0].result
def get_start_time_metric(
    result: PipelineResult, namespace: str, name: str) -> int:
  """Finds the earliest time recorded by the given distribution metric.

  Args:
    result: the PipelineResult the metrics are read from
    namespace: the namespace of the wanted metric
    name: the name of the wanted metric

  Returns:
    the smallest ``min`` over all matching distributions, or -1 if no
    distribution matches
  """
  pipeline_metrics = result.metrics()
  metric_filter = MetricsFilter().with_namespace(namespace).with_name(name)
  distributions = pipeline_metrics.query(metric_filter)['distributions']
  return min((dist.result.min for dist in distributions), default=-1)
def get_end_time_metric(
    result: PipelineResult, namespace: str, name: str) -> int:
  """Finds the latest time recorded by the given distribution metric.

  Args:
    result: the PipelineResult the metrics are read from
    namespace: the namespace of the wanted metric
    name: the name of the wanted metric

  Returns:
    the largest ``max`` over all matching distributions, or -1 if no
    distribution matches
  """
  pipeline_metrics = result.metrics()
  metric_filter = MetricsFilter().with_namespace(namespace).with_name(name)
  distributions = pipeline_metrics.query(metric_filter)['distributions']
  return max((dist.result.max for dist in distributions), default=-1)