tanakachitsamba · tanakachitsamba · Oct 7, 2025 · Oct 7, 2025 · Copilot · Oct 7, 2025
diff --git a/add_documents.py b/add_documents.py
@@ -1,70 +1,159 @@
-import chromadb
-from chromadb.utils import embedding_functions
-from dotenv import load_dotenv
+"""Utilities for ingesting documents into a Chroma collection."""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Iterable, List, Sequence
 import os
-import sys
 
-def load_openai_key():
-    # Load variables from .env file into environment
+
+class IngestionError(ValueError):
+    """Raised when an ingestion payload is invalid; subclasses ``ValueError``."""
+
+
+def load_openai_key() -> str:
+    """Load ``.env`` and return ``OPENAI_KEY``; raise ``ValueError`` if unset or empty."""
+    # Imported inside the function so python-dotenv is only needed when this is called.
+    from dotenv import load_dotenv
+
     load_dotenv()
-    openai_key = os.environ.get('OPENAI_KEY')
+    openai_key = os.environ.get("OPENAI_KEY")
     if not openai_key:
         raise ValueError("OPENAI_KEY is not set in the .env file.")
     return openai_key
 
 def create_openai_ef(api_key):
+    from chromadb.utils import embedding_functions
+
     # Using OpenAI Embeddings. This assumes you have the openai package installed
     openai_ef = embedding_functions.OpenAIEmbeddingFunction(
         api_key=api_key,
         model_name="text-embedding-ada-002"
     )
     return openai_ef
 
-def create_or_get_collection(client):
-    # Create a new chroma collection
-    collection_name = "lake"
-    return client.get_or_create_collection(name=collection_name)
-
-def add_to_openai_collection(collection, documents, metadatas, ids):
-    try:
-        collection.add(
-            documents=documents,
-            metadatas=metadatas,
-            ids=ids
+def get_persistent_client(persist_directory: str = "db") -> Any:
+    """Return a ``chromadb.PersistentClient`` whose ``path`` is ``persist_directory``."""
+    # chromadb is imported lazily so this module can be imported without it installed.
+    import chromadb
+
+    return chromadb.PersistentClient(path=persist_directory)
+
+
+def create_or_get_collection(client: Any, name: str = "lake") -> Any:
+    """Return the result of ``client.get_or_create_collection(name=name)``."""
+
+    return client.get_or_create_collection(name=name)
+
+
+def _ensure_sequence(data: Sequence, expected_length: int, label: str) -> List:
+    if isinstance(data, (str, bytes)) or not isinstance(data, Sequence):
+        raise IngestionError(f"{label} must be a sequence of values.")
+
+    values = list(data)
+    if len(values) != expected_length:
+        raise IngestionError(
+            f"Expected {expected_length} {label}, received {len(values)}."
         )
-        print("Documents added to the collection successfully.")
-    except Exception as e:
-        print(f"Error occurred while adding documents: {e}")
-
-if __name__ == "__main__":
-    try:
-        # Check if three command-line arguments are provided
-        if len(sys.argv) != 4:
-            raise ValueError("Usage: python script.py <documents> <metadatas> <ids>")
-
-        # Extract the command-line arguments as strings
-        documents = sys.argv[1]
-        metadatas = sys.argv[2]
-        ids = sys.argv[3]
-
-        # Create a new Chroma client with persistence enabled.
-        persist_directory = "db" # this path for the db could be an arg 
-        client = chromadb.PersistentClient(path=persist_directory)
-
-        # Load the OpenAI key
-        openai_key = load_openai_key()
-
-        # Create/Open OpenAI Embedding Function
-        openai_ef = create_openai_ef(api_key=openai_key)
-
-        # Create or get the Chroma collection
-        openai_collection = create_or_get_collection(client)
-
-        # Call the function with the provided arguments
-        add_to_openai_collection(openai_collection, documents, metadatas, ids)
-    except ValueError as ve:
-        print(ve)
-    except chromadb.ChromaDBError as cde:
-        print(f"ChromaDBError: {cde}")
-    except Exception as e:
-        print(f"An unexpected error occurred: {e}")
+    return values
+
+
+def ingest_documents(
+    collection,
+    documents: Sequence[str],
+    metadatas: Sequence[dict],
+    ids: Sequence[str],
+) -> int:
+    """Validate a batch and add it to the collection.
+
+    Args:
+        collection: A Chroma collection (or any object exposing an ``add`` method).
+        documents: Non-string iterable of textual documents.
+        metadatas: Sequence of metadata dictionaries, one per document.
+        ids: Sequence of unique string identifiers, one per document.
+
+    Returns:
+        The number of documents ingested.
+
+    Raises:
+        IngestionError: If any argument has the wrong container or element
+            type, the lengths differ, or an id is repeated.
+    """
+    # A bare string would be split by list() into one-character "documents".
+    if isinstance(documents, (str, bytes)):
+        raise IngestionError("documents must be a sequence of values.")
+    document_list = list(documents)
+    metadata_list = _ensure_sequence(metadatas, len(document_list), "metadatas")
+    id_list = _ensure_sequence(ids, len(document_list), "ids")
+
+    if not all(isinstance(doc, str) for doc in document_list):
+        raise IngestionError("All documents must be strings.")
+    if not all(isinstance(meta, dict) for meta in metadata_list):
+        raise IngestionError("All metadatas must be dictionaries.")
+    if not all(isinstance(id_value, str) for id_value in id_list):
+        raise IngestionError("All ids must be strings.")
+    if len(set(id_list)) != len(id_list):
+        raise IngestionError("Duplicate ids detected in payload.")
+
+    collection.add(documents=document_list, metadatas=metadata_list, ids=id_list)
+    return len(document_list)
+
+
+def parse_ingestion_payload(payload: str | dict) -> tuple[List[str], List[dict], List[str]]:
+    """Parse a JSON string or dict payload into document, metadata, and id lists.
+
+    Raises:
+        IngestionError: On malformed JSON, a payload that is not a JSON object,
+            a missing required key, or a field that is not a list.
+    """
+
+    data = payload
+    if isinstance(payload, str):
+        try:
+            data = json.loads(payload)
+        except json.JSONDecodeError as exc:
+            raise IngestionError("Invalid JSON payload provided.") from exc
+        # Valid JSON need not be an object: "[1]" or "5" must not reach the key lookups.
+        if not isinstance(data, dict):
+            raise IngestionError("JSON payload must decode to an object.")
+    elif not isinstance(payload, dict):
+        raise IngestionError("Payload must be a JSON string or dictionary.")
+
+    missing_keys = {"documents", "metadatas", "ids"}.difference(data)
+    if missing_keys:
+        missing = ", ".join(sorted(missing_keys))
+        raise IngestionError(f"Payload is missing required keys: {missing}.")
+
+    # Only the container shape is checked here; element types are not inspected.
+    for field in ("documents", "metadatas", "ids"):
+        if not isinstance(data[field], list):
+            raise IngestionError(f"Payload field '{field}' must be a list.")
+
+    return data["documents"], data["metadatas"], data["ids"]
+
+
+def run_ingestion(
+    documents: Iterable[str],
+    metadatas: Iterable[dict],
+    ids: Iterable[str],
+    *,
+    persist_directory: str = "db",
+    collection_name: str = "lake",
+) -> int:
+    """Open the named collection in the persistent store and ingest one batch into it."""
+
+    store = get_persistent_client(persist_directory=persist_directory)
+    target = create_or_get_collection(store, name=collection_name)
+    return ingest_documents(target, documents, metadatas, ids)
+
+
+__all__ = [
+    "IngestionError",
+    "create_openai_ef",
+    "create_or_get_collection",
+    "get_persistent_client",
+    "ingest_documents",
+    "load_openai_key",
+    "parse_ingestion_payload",
+    "run_ingestion",
+]
diff --git a/add_documents_bdd_tests.py b/add_documents_bdd_tests.py
@@ -1,70 +1,120 @@
+import json
 import os
-import sys
-import chromadb
-from chromadb.utils import embedding_functions
-from dotenv import load_dotenv
 
-# Load variables from .env file into environment
-load_dotenv()
+from behave import given, when, then
+
+from add_documents import (
+    IngestionError,
+    create_openai_ef,
+    create_or_get_collection,
+    get_persistent_client,
+    ingest_documents,
+    load_openai_key,
+    parse_ingestion_payload,
+)
+
+
+# Hooks (behave only runs these when they are defined in environment.py)
 
-# Define shared context
 def before_scenario(context, scenario):
-    context.documents = None
-    context.metadatas = None
-    context.ids = None
+    context.error = None
+    context.result = None
+    context.documents = []
+    context.metadatas = []
+    context.ids = []
+    context.openai_collection = None
+
 
 def after_scenario(context, scenario):
-    if context.documents:
-        # Clean up the collection after the test
-        context.openai_collection.remove(ids=context.ids)
+    if getattr(context, "openai_collection", None) and context.result:
+        context.openai_collection.delete(ids=context.ids)  # Chroma's API is delete(), not remove()
 
+
 # Step Definitions
+
+
 @given("the OpenAI key is set in the .env file")
-def step_impl_load_openai_key(context):
-    openai_key = os.environ.get('OPENAI_KEY')
-    if not openai_key:
-        raise ValueError("OPENAI_KEY is not set in the .env file.")
-    context.openai_key = openai_key
+def step_impl_openai_key_present(context):
+    os.environ.setdefault("OPENAI_KEY", "test-key")
+    context.openai_key = load_openai_key()
+
+
+@given("the OpenAI key is not set in the environment")
+def step_impl_openai_key_missing(context):
+    os.environ.pop("OPENAI_KEY", None)  # NOTE(review): load_dotenv() may restore it from a local .env
+
 
 @given("an OpenAI Embedding Function is created")
 def step_impl_create_openai_ef(context):
-    context.openai_ef = embedding_functions.OpenAIEmbeddingFunction(
-        api_key=context.openai_key,
-        model_name="text-embedding-ada-002"
-    )
+    context.openai_ef = create_openai_ef(api_key=context.openai_key)
+
 
 @given("a Chroma client with persistence enabled is available")
 def step_impl_create_chroma_client(context):
     context.persist_directory = "db"
-    context.client = chromadb.PersistentClient(path=context.persist_directory)
+    context.client = get_persistent_client(persist_directory=context.persist_directory)
+    context.openai_collection = create_or_get_collection(context.client)
 
-@when("documents, metadatas, and ids are provided")
-def step_impl_provide_arguments(context):
-    # Check if three command-line arguments are provided
-    if len(sys.argv) != 4:
-        raise ValueError("Usage: python script.py <documents> <metadatas> <ids>")
 
-    # Extract the command-line arguments as strings
-    context.documents = sys.argv[1]
-    context.metadatas = sys.argv[2]
-    context.ids = sys.argv[3]
+@when("I load the OpenAI key")
+def step_impl_load_openai_key(context):
+    try:
+        context.openai_key = load_openai_key()
+    except Exception as exc:  # pragma: no cover - stored for the "then" steps to inspect
+        context.error = exc
+
+
+@when("the following documents are ingested")
+def step_impl_ingest_documents(context):
+    """Ingest the rows of the step table, recording the result or the error."""
+    texts, metas, keys = [], [], []
+    for row in context.table:
+        texts.append(row["document"])
+        raw_meta = row["metadata"]
+        try:
+            parsed = json.loads(raw_meta) if raw_meta else {}
+        except json.JSONDecodeError:
+            # Stop before touching the collection or the context lists.
+            context.error = IngestionError("Invalid metadata JSON provided.")
+            return
+        metas.append(parsed)
+        keys.append(row["id"])
+
+    context.documents, context.metadatas, context.ids = texts, metas, keys
+    try:
+        context.result = ingest_documents(
+            context.openai_collection, texts, metas, keys
+        )
+    except Exception as exc:  # pragma: no cover - stored for the "then" steps to inspect
+        context.error = exc
 
-    # Create or get the Chroma collection
-    context.openai_collection = context.client.get_or_create_collection(name="lake")
 
-    # Add documents to the collection
+@when("the payload is ingested")
+def step_impl_ingest_payload(context):
     try:
-        context.openai_collection.add(
-            documents=context.documents.split(","),
-            metadatas=context.metadatas.split(","),
-            ids=context.ids.split(",")
+        documents, metadatas, ids = parse_ingestion_payload(context.text)
+        context.documents = documents
+        context.metadatas = metadatas
+        context.ids = ids
+        context.result = ingest_documents(
+            context.openai_collection, documents, metadatas, ids
         )
-    except Exception as e:
-        context.error = e
+    except Exception as exc:  # pragma: no cover - behave captures the exception
+        context.error = exc
+
 
 @then("the documents should be added to the collection successfully")
 def step_impl_verify_success(context):
     assert context.error is None, f"Error occurred while adding documents: {context.error}"
-    assert len(context.openai_collection) == len(context.documents.split(",")), "Number of documents added is incorrect."
+    assert context.result == len(context.documents), "Number of documents added is incorrect."
+
+
+@then('an error should be raised containing "{message}"')
+def step_impl_verify_error_message(context, message):
+    assert context.error is not None, "Expected an error but none was raised."
+    assert message in str(context.error), f"{message!r} not found in {str(context.error)!r}"
+
 
-    # Additional assertions if required
+@then("an ingestion error should be raised")
+def step_impl_verify_ingestion_error(context):
+    assert isinstance(context.error, IngestionError), f"Expected IngestionError, got {context.error!r}"