#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import contextlib
import logging
import os
import socket
import tempfile
import unittest
from dataclasses import dataclass
from typing import Callable
from typing import List
from typing import Optional
from typing import cast

from apache_beam.ml.rag.types import Chunk
from apache_beam.ml.rag.utils import retry_with_backoff

# pylint: disable=ungrouped-imports
try:
  import yaml
  from pymilvus import CollectionSchema
  from pymilvus import FieldSchema
  from pymilvus import MilvusClient
  from pymilvus.exceptions import MilvusException
  from pymilvus.milvus_client import IndexParams
  from testcontainers.core.config import testcontainers_config
  from testcontainers.core.generic import DbContainer
  from testcontainers.milvus import MilvusContainer

  from apache_beam.ml.rag.enrichment.milvus_search import MilvusConnectionParameters
except ImportError as e:
  raise unittest.SkipTest(
      f'RAG test util dependencies not installed: {str(e)}')

_LOGGER = logging.getLogger(__name__)


@dataclass
class VectorDBContainerInfo:
  """Container information for vector database test instances.

  Holds connection details and container reference for testing with vector
  databases like Milvus in containerized environments.
  """
  container: DbContainer
  host: str
  port: int
  user: str = ""
  password: str = ""
  token: str = ""
  id: str = "default"

  @property
  def uri(self) -> str:
    return f"http://{self.host}:{self.port}"


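# Illustrative usage sketch (not part of the module's API surface): once a
# container is running, the `uri` property feeds directly into a pymilvus
# client. The container, host, and port below are placeholders.
#
#   info = VectorDBContainerInfo(container=db, host="localhost", port=19530)
#   client = MilvusClient(uri=info.uri)  # "http://localhost:19530"

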
class TestHelpers:
  @staticmethod
  def find_free_port():
    """Find a free port on the local machine."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
      # Bind to port 0, which asks OS to assign a free port.
      s.bind(('', 0))
      s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
      # Return the port number assigned by OS.
      return s.getsockname()[1]


class CustomMilvusContainer(MilvusContainer):
  """Custom Milvus container with configurable ports and environment setup.

  Extends MilvusContainer to provide custom port binding and environment
  configuration for testing with standalone Milvus instances.
  """
  def __init__(
      self,
      image: str,
      service_container_port,
      healthcheck_container_port,
      **kwargs,
  ) -> None:
    # Skip the parent class's constructor and go straight to
    # GenericContainer.
    super(MilvusContainer, self).__init__(image=image, **kwargs)
    self.port = service_container_port
    self.healthcheck_port = healthcheck_container_port
    self.with_exposed_ports(service_container_port, healthcheck_container_port)

    # Get free host ports.
    service_host_port = TestHelpers.find_free_port()
    healthcheck_host_port = TestHelpers.find_free_port()

    # Bind container and host ports.
    self.with_bind_ports(service_container_port, service_host_port)
    self.with_bind_ports(healthcheck_container_port, healthcheck_host_port)

    self.cmd = "milvus run standalone"

    # Set environment variables needed for Milvus.
    envs = {
        "ETCD_USE_EMBED": "true",
        "ETCD_DATA_DIR": "/var/lib/milvus/etcd",
        "COMMON_STORAGETYPE": "local",
        "METRICS_PORT": str(healthcheck_container_port)
    }
    for env, value in envs.items():
      self.with_env(env, value)


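# Illustrative sketch: starting CustomMilvusContainer directly, assuming a
# local Docker daemon is available. In practice,
# MilvusTestHelpers.start_db_container wraps this with retries and a mounted
# user.yaml.
#
#   container = CustomMilvusContainer(
#       image="milvusdb/milvus:v2.5.10",
#       service_container_port=TestHelpers.find_free_port(),
#       healthcheck_container_port=TestHelpers.find_free_port())
#   container.start()
#   host_port = container.get_exposed_port(container.port)

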
class MilvusTestHelpers:
  """Helper utilities for testing Milvus vector database operations.

  Provides static methods for managing test containers, configuration files,
  and chunk comparison utilities for Milvus-based integration tests.
  """

  # IMPORTANT: When upgrading the Milvus server version, ensure the pymilvus
  # Python SDK client in setup.py is updated to match. Refer to the Milvus
  # release notes compatibility matrix at
  # https://milvus.io/docs/release_notes.md or PyPI at
  # https://pypi.org/project/pymilvus/ for version compatibility.
  # Example: Milvus v2.6.0 requires pymilvus==2.6.0 (exact match required).

  @staticmethod
  def start_db_container(
      image="milvusdb/milvus:v2.5.10",
      max_vec_fields=5,
      vector_client_max_retries=3,
      tc_max_retries=None) -> Optional[VectorDBContainerInfo]:
    service_container_port = TestHelpers.find_free_port()
    healthcheck_container_port = TestHelpers.find_free_port()
    user_yaml_creator = MilvusTestHelpers.create_user_yaml
    with user_yaml_creator(service_container_port, max_vec_fields) as cfg:
      info = None
      original_tc_max_tries = testcontainers_config.max_tries
      if tc_max_retries is not None:
        testcontainers_config.max_tries = tc_max_retries
      for i in range(vector_client_max_retries):
        try:
          vector_db_container = CustomMilvusContainer(
              image=image,
              service_container_port=service_container_port,
              healthcheck_container_port=healthcheck_container_port)
          vector_db_container = vector_db_container.with_volume_mapping(
              cfg, "/milvus/configs/user.yaml")
          vector_db_container.start()
          host = vector_db_container.get_container_host_ip()
          port = vector_db_container.get_exposed_port(service_container_port)
          info = VectorDBContainerInfo(vector_db_container, host, port)
          _LOGGER.info(
              "milvus db container started successfully on %s.", info.uri)
          # Container is up; stop retrying.
          break
        except Exception as e:
          stdout_logs, stderr_logs = vector_db_container.get_logs()
          stdout_logs = stdout_logs.decode("utf-8")
          stderr_logs = stderr_logs.decode("utf-8")
          _LOGGER.warning(
              "Retry %d/%d: Failed to start Milvus DB container. Reason: %s. "
              "STDOUT logs:\n%s\nSTDERR logs:\n%s",
              i + 1,
              vector_client_max_retries,
              e,
              stdout_logs,
              stderr_logs)
          if i == vector_client_max_retries - 1:
            _LOGGER.error(
                "Unable to start milvus db container for I/O tests after %d "
                "retries. Tests cannot proceed. STDOUT logs:\n%s\n"
                "STDERR logs:\n%s",
                vector_client_max_retries,
                stdout_logs,
                stderr_logs)
            raise e
        finally:
          testcontainers_config.max_tries = original_tc_max_tries
      return info

  @staticmethod
  def stop_db_container(db_info: VectorDBContainerInfo):
    if db_info is None:
      _LOGGER.warning("Milvus db info is None. Skipping stop operation.")
      return
    _LOGGER.debug("Stopping milvus db container.")
    db_info.container.stop()
    _LOGGER.info("milvus db container stopped successfully.")

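  # Illustrative lifecycle sketch, assuming a local Docker daemon; the test
  # class name is hypothetical.
  #
  #   class MilvusEnrichmentIT(unittest.TestCase):
  #     @classmethod
  #     def setUpClass(cls):
  #       cls.db_info = MilvusTestHelpers.start_db_container()
  #
  #     @classmethod
  #     def tearDownClass(cls):
  #       MilvusTestHelpers.stop_db_container(cls.db_info)
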
  @staticmethod
  def initialize_db_with_data(
      connc_params: MilvusConnectionParameters, config: dict):
    # Open the connection to the milvus db with retry.
    def create_client():
      return MilvusClient(**connc_params.__dict__)

    client = retry_with_backoff(
        create_client,
        max_retries=3,
        retry_delay=1.0,
        operation_name="Test Milvus client connection",
        exception_types=(MilvusException, ))

    # Configure schema.
    field_schemas: List[FieldSchema] = cast(
        List[FieldSchema], config["fields"])
    schema = CollectionSchema(
        fields=field_schemas, functions=config["functions"])

    # Create collection with the schema.
    collection_name = config["collection_name"]
    index_function: Callable[[], IndexParams] = cast(
        Callable[[], IndexParams], config["index"])
    client.create_collection(
        collection_name=collection_name,
        schema=schema,
        index_params=index_function())

    # Assert that collection was created.
    collection_error = f"Expected collection '{collection_name}' to be created."
    assert client.has_collection(collection_name), collection_error

    # Gather the names of all schema fields; the special
    # 'sparse_embedding_bm25' field is skipped below when building rows.
    fields = list(map(lambda field: field.name, field_schemas))

    # Prep data for indexing. Currently we can't insert sparse vectors for BM25
    # sparse embedding field as it would be automatically generated by Milvus
    # through the registered BM25 function.
    data_ready_to_index = []
    for doc in config["corpus"]:
      item = {}
      for field in fields:
        if field.startswith("dense_embedding"):
          item[field] = doc["dense_embedding"]
        elif field == "sparse_embedding_inner_product":
          item[field] = doc["sparse_embedding"]
        elif field == "sparse_embedding_bm25":
          # It is automatically generated by Milvus from the content field.
          continue
        else:
          item[field] = doc[field]
      data_ready_to_index.append(item)

    # Index data.
    result = client.insert(
        collection_name=collection_name, data=data_ready_to_index)

    # Assert that the intended data has been properly indexed.
    insertion_err = f'failed to insert the {result["insert_count"]} data points'
    assert result["insert_count"] == len(data_ready_to_index), insertion_err

    # Release the collection from memory. It will be loaded lazily when the
    # enrichment handler is invoked.
    client.release_collection(collection_name)

    # Close the connection to the Milvus database, as no further preparation
    # operations are needed before executing the enrichment handler.
    client.close()

    return collection_name

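  # Illustrative sketch of the `config` dict that initialize_db_with_data
  # reads, inferred from the accesses above; the corpus field names and
  # values are examples only.
  #
  #   config = {
  #       "collection_name": "docs",
  #       "fields": [FieldSchema(...), ...],
  #       "functions": [...],  # e.g. a BM25 function for the sparse field
  #       "index": build_index_params,  # zero-arg callable -> IndexParams
  #       "corpus": [{
  #           "id": 1,
  #           "content": "some text",
  #           "dense_embedding": [0.1, 0.2],
  #           "sparse_embedding": {7: 0.5},
  #       }],
  #   }
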
  @staticmethod
  @contextlib.contextmanager
  def create_user_yaml(service_port: int, max_vector_field_num=5):
    """Creates a temporary user.yaml file for Milvus configuration.

    This user yaml file overrides Milvus default configurations. It sets the
    Milvus service port to the specified container service port. The default
    for maxVectorFieldNum is 4, but we need 5 (one unique field for each
    metric).

    Args:
      service_port: Port number for the Milvus service.
      max_vector_field_num: Max number of vec fields allowed per collection.

    Yields:
      str: Path to the created temporary yaml file.
    """
    with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml',
                                     delete=False) as temp_file:
      # Define the content for user.yaml.
      user_config = {
          'proxy': {
              'maxVectorFieldNum': max_vector_field_num,
              'port': service_port
          },
          'etcd': {
              'use': {'embed': True},
              'data': {'dir': '/var/lib/milvus/etcd'}
          }
      }

      # Write the content to the file.
      yaml.dump(user_config, temp_file, default_flow_style=False)
      path = temp_file.name

    try:
      yield path
    finally:
      if os.path.exists(path):
        os.remove(path)

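  # For reference, create_user_yaml(19530, 5) yields a file whose contents
  # look like the following (yaml.dump sorts keys alphabetically by default;
  # 19530 is just an example port):
  #
  #   etcd:
  #     data:
  #       dir: /var/lib/milvus/etcd
  #     use:
  #       embed: true
  #   proxy:
  #     maxVectorFieldNum: 5
  #     port: 19530
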
  @staticmethod
  def assert_chunks_equivalent(
      actual_chunks: List[Chunk], expected_chunks: List[Chunk]):
    """Checks that chunk lists are equivalent, testing for presence rather
    than an exact match."""
    # Sort both lists by ID to ensure consistent ordering.
    actual_sorted = sorted(actual_chunks, key=lambda c: c.id)
    expected_sorted = sorted(expected_chunks, key=lambda c: c.id)

    actual_len = len(actual_sorted)
    expected_len = len(expected_sorted)
    err_msg = (
        f"Different number of chunks, actual: {actual_len}, "
        f"expected: {expected_len}")
    assert actual_len == expected_len, err_msg

    for actual, expected in zip(actual_sorted, expected_sorted):
      # Assert that IDs match.
      assert actual.id == expected.id

      # Assert that dense embeddings match.
      err_msg = f"Dense embedding mismatch for chunk {actual.id}"
      assert actual.dense_embedding == expected.dense_embedding, err_msg

      # Assert that sparse embeddings match.
      err_msg = f"Sparse embedding mismatch for chunk {actual.id}"
      assert actual.sparse_embedding == expected.sparse_embedding, err_msg

      # Assert that text content matches.
      err_msg = f"Text content mismatch for chunk {actual.id}"
      assert actual.content.text == expected.content.text, err_msg

      # For enrichment_data, be more flexible. If "expected" has values for
      # enrichment_data but actual doesn't, that's acceptable since vector
      # search results can vary based on many factors including implementation
      # details, vector database state, and slight variations in similarity
      # calculations.

      # First ensure the enrichment data key exists.
      err_msg = f"Missing enrichment_data key in chunk {actual.id}"
      assert 'enrichment_data' in actual.metadata, err_msg

      # For enrichment_data, ensure consistent ordering of results.
      actual_data = actual.metadata['enrichment_data']
      expected_data = expected.metadata['enrichment_data']

      # If actual has enrichment data, then perform detailed validation.
      if actual_data:
        # Ensure the id key exists.
        err_msg = f"Missing id key in metadata {actual.id}"
        assert 'id' in actual_data, err_msg

        # Validate IDs have consistent ordering.
        actual_ids = sorted(actual_data['id'])
        expected_ids = sorted(expected_data['id'])
        err_msg = f"IDs in enrichment_data don't match for chunk {actual.id}"
        assert actual_ids == expected_ids, err_msg

        # Ensure the distance key exists.
        err_msg = f"Missing distance key in metadata {actual.id}"
        assert 'distance' in actual_data, err_msg

        # Validate both sides return the same number of distances.
        actual_distances = actual_data['distance']
        expected_distances = expected_data['distance']
        err_msg = f"Number of distances doesn't match for chunk {actual.id}"
        assert len(actual_distances) == len(expected_distances), err_msg

        # Ensure the fields key exists.
        err_msg = f"Missing fields key in metadata {actual.id}"
        assert 'fields' in actual_data, err_msg

        # Validate fields have consistent content. Sort fields by 'id' to
        # ensure consistent ordering.
        actual_fields_sorted = sorted(
            actual_data['fields'], key=lambda f: f.get('id', 0))
        expected_fields_sorted = sorted(
            expected_data['fields'], key=lambda f: f.get('id', 0))

        # Compare field IDs.
        actual_field_ids = [f.get('id') for f in actual_fields_sorted]
        expected_field_ids = [f.get('id') for f in expected_fields_sorted]
        err_msg = f"Field IDs don't match for chunk {actual.id}"
        assert actual_field_ids == expected_field_ids, err_msg

        # Compare field content.
        for a_f, e_f in zip(actual_fields_sorted, expected_fields_sorted):
          # Ensure the id key exists.
          err_msg = f"Missing id key in metadata.fields {actual.id}"
          assert 'id' in a_f, err_msg
          err_msg = f"Field ID mismatch for chunk {actual.id}"
          assert a_f['id'] == e_f['id'], err_msg

          # Validate field metadata.
          err_msg = f"Field metadata doesn't match for chunk {actual.id}"
          assert a_f['metadata'] == e_f['metadata'], err_msg

if __name__ == '__main__':
  unittest.main()