Remove k_endog & k_exog parameters in SSM (#599)

Dekermanjian · web-flow · commit 3531d298b771 · 2025-11-02T23:29:53.000+01:00
* removed k_endog argument from BayesianETS making endog_names required and updated tests accordingly

* removed k_exog from SARIMAX preferring exog_state_names for defining exogenous variables and updated tests accordingly

* Updated VARMAX model by removing k_endog & k_exog arguments, making endog_names required and exog_state_names required for exogenous variables, and updated tests accordingly

* updated DFM by removing k_endog & k_exog arguments, making endog_names required and exog_names required if exogenous variables are requested, and updated tests accordingly

* removed k_exog from STS regression component and updated tests accordingly

* updated docstrings of VARMAX and DFM to reflect the removal of the k_endog and k_exog parameters

* moved endog_names validation into a standalone utility

* removed commented code and tests, updated validate names test to be reused in both endog and some exog cases

* updated docstring in regression component

* updated validate_names to always return None, removed _handle_input_data, reverted test_SARIMA_with_exogenous to use stationary initialization
diff --git a/pymc_extras/statespace/models/DFM.py b/pymc_extras/statespace/models/DFM.py
@@ -5,7 +5,7 @@
 import pytensor.tensor as pt
 
 from pymc_extras.statespace.core.statespace import PyMCStateSpace
-from pymc_extras.statespace.models.utilities import make_default_coords
+from pymc_extras.statespace.models.utilities import make_default_coords, validate_names
 from pymc_extras.statespace.utils.constants import (
     ALL_STATE_AUX_DIM,
     ALL_STATE_DIM,
@@ -224,9 +224,7 @@ def __init__(
         self,
         k_factors: int,
         factor_order: int,
-        k_endog: int | None = None,
         endog_names: Sequence[str] | None = None,
-        k_exog: int | None = None,
         exog_names: Sequence[str] | None = None,
         shared_exog_states: bool = False,
         exog_innovations: bool = False,
@@ -249,19 +247,11 @@ def __init__(
             and are modeled as a white noise process, i.e., :math:`f_t = \varepsilon_{f,t}`.
             Therefore, the state vector will include one state per factor and "factor_ar" will not exist.
 
-        k_endog : int, optional
-            Number of observed time series. If not provided, the number of observed series will be inferred from `endog_names`.
-            At least one of `k_endog` or `endog_names` must be provided.
-
         endog_names : list of str, optional
-            Names of the observed time series. If not provided, default names will be generated as `endog_1`, `endog_2`, ..., `endog_k` based on `k_endog`.
-            At least one of `k_endog` or `endog_names` must be provided.
-
-        k_exog : int, optional
-            Number of exogenous variables. If not provided, the model will not have exogenous variables.
+            Names of the observed time series.
 
         exog_names : Sequence[str], optional
-            Names of the exogenous variables. If not provided, but `k_exog` is specified, default names will be generated as `exog_1`, `exog_2`, ..., `exog_k`.
+            Names of the exogenous variables.
 
         shared_exog_states: bool, optional
             Whether exogenous latent states are shared across the observed states. If True, there will be only one set of exogenous latent
@@ -289,13 +279,8 @@ def __init__(
 
         """
 
-        if k_endog is None and endog_names is None:
-            raise ValueError("Either k_endog or endog_names must be provided.")
-        if k_endog is None:
-            k_endog = len(endog_names)
-        if endog_names is None:
-            endog_names = [f"endog_{i}" for i in range(k_endog)]
-
+        validate_names(endog_names, var_name="endog_names", optional=False)
+        k_endog = len(endog_names)
         self.endog_names = endog_names
         self.k_endog = k_endog
         self.k_factors = k_factors
@@ -304,17 +289,17 @@ def __init__(
         self.error_var = error_var
         self.error_cov_type = error_cov_type
 
-        if k_exog is None and exog_names is None:
-            self.k_exog = 0
-        else:
+        if exog_names is not None:
             self.shared_exog_states = shared_exog_states
             self.exog_innovations = exog_innovations
-            if k_exog is None:
-                k_exog = len(exog_names) if exog_names is not None else 0
-            elif exog_names is None:
-                exog_names = [f"exog_{i}" for i in range(k_exog)] if k_exog > 0 else None
+            validate_names(
+                exog_names, var_name="exog_names", optional=True
+            )  # Not sure if this adds anything
+            k_exog = len(exog_names)
             self.k_exog = k_exog
             self.exog_names = exog_names
+        else:
+            self.k_exog = 0
 
         self.k_exog_states = self.k_exog * self.k_endog if not shared_exog_states else self.k_exog
         self.exog_flag = self.k_exog > 0
diff --git a/pymc_extras/statespace/models/ETS.py b/pymc_extras/statespace/models/ETS.py
@@ -9,7 +9,7 @@
 from pytensor.tensor.slinalg import solve_discrete_lyapunov
 
 from pymc_extras.statespace.core.statespace import PyMCStateSpace, floatX
-from pymc_extras.statespace.models.utilities import make_default_coords
+from pymc_extras.statespace.models.utilities import make_default_coords, validate_names
 from pymc_extras.statespace.utils.constants import (
     ALL_STATE_AUX_DIM,
     ALL_STATE_DIM,
@@ -138,12 +138,9 @@ class BayesianETS(PyMCStateSpace):
         or 'N'.
         If provided, the model will be initialized from the given order, and the `trend`, `damped_trend`, and `seasonal`
         arguments will be ignored.
-    endog_names: str or list of str, Optional
+    endog_names: str or list of str
         Names associated with observed states. If a list, the length should be equal to the number of time series
         to be estimated.
-    k_endog: int, Optional
-        Number of time series to estimate. If endog_names are provided, this is ignored and len(endog_names) is
-        used instead.
     trend: bool
         Whether to include a trend component. Setting ``trend=True`` is equivalent to ``order[1] == 'A'``.
     damped_trend: bool
@@ -213,7 +210,6 @@ def __init__(
         self,
         order: tuple[str, str, str] | None = None,
         endog_names: str | list[str] | None = None,
-        k_endog: int = 1,
         trend: bool = True,
         damped_trend: bool = False,
         seasonal: bool = False,
@@ -265,13 +261,9 @@ def __init__(
         if self.seasonal and self.seasonal_periods is None:
             raise ValueError("If seasonal is True, seasonal_periods must be provided.")
 
-        if endog_names is not None:
-            endog_names = list(endog_names)
-            k_endog = len(endog_names)
-        else:
-            endog_names = [f"data_{i}" for i in range(k_endog)] if k_endog > 1 else ["data"]
-
-        self.endog_names = endog_names
+        validate_names(endog_names, var_name="endog_names", optional=False)
+        k_endog = len(endog_names)
+        self.endog_names = list(endog_names)
 
         if dense_innovation_covariance and k_endog == 1:
             dense_innovation_covariance = False
diff --git a/pymc_extras/statespace/models/SARIMAX.py b/pymc_extras/statespace/models/SARIMAX.py
@@ -12,6 +12,7 @@
     make_default_coords,
     make_harvey_state_names,
     make_SARIMA_transition_matrix,
+    validate_names,
 )
 from pymc_extras.statespace.utils.constants import (
     ALL_STATE_AUX_DIM,
@@ -132,7 +133,6 @@ def __init__(
         order: tuple[int, int, int],
         seasonal_order: tuple[int, int, int, int] | None = None,
         exog_state_names: list[str] | None = None,
-        k_exog: int | None = None,
         stationary_initialization: bool = True,
         filter_type: str = "standard",
         state_structure: str = "fast",
@@ -166,10 +166,6 @@ def __init__(
         exog_state_names : list[str], optional
             Names of the exogenous state variables.
 
-        k_exog : int, optional
-            Number of exogenous variables. If provided, must match the length of
-            `exog_state_names`.
-
         stationary_initialization : bool, default True
             If true, the initial state and initial state covariance will not be assigned priors. Instead, their steady
             state values will be used.
@@ -212,18 +208,10 @@ def __init__(
         if seasonal_order is None:
             seasonal_order = (0, 0, 0, 0)
 
-        if exog_state_names is None and k_exog is not None:
-            exog_state_names = [f"exogenous_{i}" for i in range(k_exog)]
-        elif exog_state_names is not None and k_exog is None:
-            k_exog = len(exog_state_names)
-        elif exog_state_names is not None and k_exog is not None:
-            if len(exog_state_names) != k_exog:
-                raise ValueError(
-                    f"Based on provided inputs, expected exog_state_names to have {k_exog} elements, but "
-                    f"found {len(exog_state_names)}"
-                )
-        else:
-            k_exog = 0
+        validate_names(
+            exog_state_names, var_name="exog_state_names", optional=True
+        )  # Not sure if this adds anything
+        k_exog = len(exog_state_names) if exog_state_names is not None else 0
 
         self.exog_state_names = exog_state_names
         self.k_exog = k_exog
diff --git a/pymc_extras/statespace/models/VARMAX.py b/pymc_extras/statespace/models/VARMAX.py
@@ -9,7 +9,7 @@
 from pytensor.tensor.slinalg import solve_discrete_lyapunov
 
 from pymc_extras.statespace.core.statespace import PyMCStateSpace
-from pymc_extras.statespace.models.utilities import make_default_coords
+from pymc_extras.statespace.models.utilities import make_default_coords, validate_names
 from pymc_extras.statespace.utils.constants import (
     ALL_STATE_AUX_DIM,
     ALL_STATE_DIM,
@@ -99,9 +99,7 @@ def __init__(
         self,
         order: tuple[int, int],
         endog_names: list[str] | None = None,
-        k_endog: int | None = None,
         exog_state_names: list[str] | dict[str, list[str]] | None = None,
-        k_exog: int | dict[str, int] | None = None,
         stationary_initialization: bool = False,
         filter_type: str = "standard",
         measurement_error: bool = False,
@@ -118,23 +116,14 @@ def __init__(
             specified order are included. For restricted models, set zeros directly on the priors.
 
         endog_names: list of str, optional
-            Names of the endogenous variables being modeled. Used to generate names for the state and shock coords. If
-            None, the state names will simply be numbered.
-
-            Exactly one of either ``endog_names`` or ``k_endog`` must be specified.
+            Names of the endogenous variables being modeled. Used to generate names for the state and shock coords.
 
         exog_state_names : list[str] or dict[str, list[str]], optional
             Names of the exogenous state variables. If a list, all endogenous variables will share the same exogenous
             variables. If a dict, keys should be the names of the endogenous variables, and values should be lists of the
             exogenous variable names for that endogenous variable. Endogenous variables not included in the dict will
             be assumed to have no exogenous variables. If None, no exogenous variables will be included.
 
-        k_exog : int or dict[str, int], optional
-            Number of exogenous variables. If an int, all endogenous variables will share the same number of exogenous
-            variables. If a dict, keys should be the names of the endogenous variables, and values should be the number of
-            exogenous variables for that endogenous variable. Endogenous variables not included in the dict will be
-            assumed to have no exogenous variables. If None, no exogenous variables will be included.
-
         stationary_initialization: bool, default False
             If true, the initial state and initial state covariance will not be assigned priors. Instead, their steady
             state values will be used. If False, the user is responsible for setting priors on the initial state and
@@ -162,62 +151,23 @@ def __init__(
             to all sampling methods.
 
         """
-        if (endog_names is None) and (k_endog is None):
-            raise ValueError("Must specify either endog_names or k_endog")
-        if (endog_names is not None) and (k_endog is None):
-            k_endog = len(endog_names)
-        if (endog_names is None) and (k_endog is not None):
-            endog_names = [f"observed_{i}" for i in range(k_endog)]
-        if (endog_names is not None) and (k_endog is not None):
-            if len(endog_names) != k_endog:
-                raise ValueError("Length of provided endog_names does not match provided k_endog")
+
+        validate_names(endog_names, var_name="endog_names", optional=False)
+        k_endog = len(endog_names)
 
         needs_exog_data = False
 
-        if k_exog is not None and not isinstance(k_exog, int | dict):
-            raise ValueError("If not None, k_exog must be either an int or a dict")
         if exog_state_names is not None and not isinstance(exog_state_names, list | dict):
             raise ValueError("If not None, exog_state_names must be either a list or a dict")
 
-        if k_exog is not None and exog_state_names is not None:
-            if isinstance(k_exog, int) and isinstance(exog_state_names, list):
-                if len(exog_state_names) != k_exog:
-                    raise ValueError("Length of exog_state_names does not match provided k_exog")
-            elif isinstance(k_exog, int) and isinstance(exog_state_names, dict):
-                raise ValueError(
-                    "If k_exog is an int, exog_state_names must be a list of the same length (or None)"
-                )
-            elif isinstance(k_exog, dict) and isinstance(exog_state_names, list):
-                raise ValueError(
-                    "If k_exog is a dict, exog_state_names must be a dict as well (or None)"
-                )
-            elif isinstance(k_exog, dict) and isinstance(exog_state_names, dict):
-                if set(k_exog.keys()) != set(exog_state_names.keys()):
-                    raise ValueError("Keys of k_exog and exog_state_names dicts must match")
-                if not all(
-                    len(names) == k for names, k in zip(exog_state_names.values(), k_exog.values())
-                ):
-                    raise ValueError(
-                        "If both k_endog and exog_state_names are provided, lengths of exog_state_names "
-                        "lists must match corresponding values in k_exog"
-                    )
-            needs_exog_data = True
-
-        if k_exog is not None and exog_state_names is None:
-            if isinstance(k_exog, int):
-                exog_state_names = [f"exogenous_{i}" for i in range(k_exog)]
-            elif isinstance(k_exog, dict):
-                exog_state_names = {
-                    name: [f"{name}_exogenous_{i}" for i in range(k)] for name, k in k_exog.items()
-                }
-            needs_exog_data = True
-
-        if k_exog is None and exog_state_names is not None:
+        if exog_state_names is not None:
             if isinstance(exog_state_names, list):
                 k_exog = len(exog_state_names)
             elif isinstance(exog_state_names, dict):
                 k_exog = {name: len(names) for name, names in exog_state_names.items()}
             needs_exog_data = True
+        else:
+            k_exog = None
 
         # If exog_state_names is a dict but 1) all endog variables are among the keys, and 2) all values are the same
         # then we can drop back to the list case.
diff --git a/pymc_extras/statespace/models/structural/components/regression.py b/pymc_extras/statespace/models/structural/components/regression.py
@@ -3,6 +3,7 @@
 from pytensor import tensor as pt
 
 from pymc_extras.statespace.models.structural.core import Component
+from pymc_extras.statespace.models.utilities import validate_names
 from pymc_extras.statespace.utils.constants import TIME_DIM
 
 
@@ -12,10 +13,6 @@ class RegressionComponent(Component):
 
     Parameters
     ----------
-    k_exog : int | None, default None
-        Number of exogenous variables to include in the regression. Must be specified if
-        state_names is not provided.
-
     name : str | None, default "regression"
         A name for this regression component. Used to label dimensions and coordinates.
 
@@ -107,7 +104,6 @@ class RegressionComponent(Component):
 
     def __init__(
         self,
-        k_exog: int | None = None,
         name: str | None = "regression",
         state_names: list[str] | None = None,
         observed_state_names: list[str] | None = None,
@@ -120,7 +116,9 @@ def __init__(
             observed_state_names = ["data"]
 
         self.innovations = innovations
-        k_exog = self._handle_input_data(k_exog, state_names, name)
+        validate_names(state_names, var_name="state_names", optional=False)
+        k_exog = len(state_names)
+        self.state_names = state_names
 
         k_states = k_exog
         k_endog = len(observed_state_names)
@@ -140,26 +138,6 @@ def __init__(
             obs_state_idxs=np.ones(k_states),
         )
 
-    @staticmethod
-    def _get_state_names(k_exog: int | None, state_names: list[str] | None, name: str):
-        if k_exog is None and state_names is None:
-            raise ValueError("Must specify at least one of k_exog or state_names")
-        if state_names is not None and k_exog is not None:
-            if len(state_names) != k_exog:
-                raise ValueError(f"Expected {k_exog} state names, found {len(state_names)}")
-        elif k_exog is None:
-            k_exog = len(state_names)
-        else:
-            state_names = [f"{name}_{i + 1}" for i in range(k_exog)]
-
-        return k_exog, state_names
-
-    def _handle_input_data(self, k_exog: int, state_names: list[str] | None, name) -> int:
-        k_exog, state_names = self._get_state_names(k_exog, state_names, name)
-        self.state_names = state_names
-
-        return k_exog
-
     def make_symbolic_graph(self) -> None:
         k_endog = self.k_endog
         k_endog_effective = 1 if self.share_states else k_endog
diff --git a/pymc_extras/statespace/models/utilities.py b/pymc_extras/statespace/models/utilities.py
@@ -670,3 +670,10 @@ def get_exog_dims_from_idata(exog_name, idata):
         exog_dims = None
 
     return exog_dims
+
+
+def validate_names(names: list[str], var_name: str, optional: bool = True) -> None:
+    if names is None:
+        if optional:
+            return None
+        raise ValueError(f"Must specify {var_name}")
diff --git a/tests/statespace/core/test_statespace.py b/tests/statespace/core/test_statespace.py
@@ -182,9 +182,8 @@ def exog_ss_mod(exog_data):
     level_trend = st.LevelTrendComponent(name="trend", order=1, innovations_order=[0])
     exog = st.RegressionComponent(
         name="exog",  # Name of this exogenous variable component
-        k_exog=1,  # Only one exogenous variable now
         innovations=False,  # Typically fixed effect (no stochastic evolution)
-        state_names=exog_data[["x1"]].columns.tolist(),
+        state_names=exog_data[["x1"]].columns.tolist(),  # Only one exogenous variable now
     )
 
     combined_model = level_trend + exog
@@ -198,9 +197,8 @@ def exog_ss_mod_mv(exog_data_mv):
     )
     exog = st.RegressionComponent(
         name="exog",  # Name of this exogenous variable component
-        k_exog=1,  # Only one exogenous variable now
         innovations=False,  # Typically fixed effect (no stochastic evolution)
-        state_names=exog_data_mv[["x1"]].columns.tolist(),
+        state_names=exog_data_mv[["x1"]].columns.tolist(),  # Only one exogenous variable now
         observed_state_names=["y1", "y2"],
     )
 
diff --git a/tests/statespace/filters/test_distributions.py b/tests/statespace/filters/test_distributions.py
@@ -193,7 +193,7 @@ def test_lgss_distribution_with_dims(output_name, ss_mod_me, pymc_model_2):
 def test_lgss_with_time_varying_inputs(output_name, rng):
     X = rng.random(size=(10, 3), dtype=floatX)
     ss_mod = structural.LevelTrendComponent() + structural.RegressionComponent(
-        name="exog", k_exog=3
+        name="exog", state_names=["exog_0", "exog_1", "exog_2"]
     )
     mod = ss_mod.build("data", verbose=False)
 
diff --git a/tests/statespace/models/test_DFM.py b/tests/statespace/models/test_DFM.py
diff --git a/tests/statespace/models/test_ETS.py b/tests/statespace/models/test_ETS.py
diff --git a/tests/statespace/models/test_SARIMAX.py b/tests/statespace/models/test_SARIMAX.py
diff --git a/tests/statespace/models/test_VARMAX.py b/tests/statespace/models/test_VARMAX.py

Original file line number	Diff line number	Diff line change
`@@ -182,9 +182,8 @@ def exog_ss_mod(exog_data):`
`182`	`182`	`level_trend = st.LevelTrendComponent(name="trend", order=1, innovations_order=[0])`
`183`	`183`	`exog = st.RegressionComponent(`
`184`	`184`	`name="exog", # Name of this exogenous variable component`
`185`		`- k_exog=1, # Only one exogenous variable now`
`186`	`185`	`innovations=False, # Typically fixed effect (no stochastic evolution)`
`187`		`- state_names=exog_data[["x1"]].columns.tolist(),`
	`186`	`+ state_names=exog_data[["x1"]].columns.tolist(), # Only one exogenous variable now`
`188`	`187`	`)`
`189`	`188`
`190`	`189`	`combined_model = level_trend + exog`
`@@ -198,9 +197,8 @@ def exog_ss_mod_mv(exog_data_mv):`
`198`	`197`	`)`
`199`	`198`	`exog = st.RegressionComponent(`
`200`	`199`	`name="exog", # Name of this exogenous variable component`
`201`		`- k_exog=1, # Only one exogenous variable now`
`202`	`200`	`innovations=False, # Typically fixed effect (no stochastic evolution)`
`203`		`- state_names=exog_data_mv[["x1"]].columns.tolist(),`
	`201`	`+ state_names=exog_data_mv[["x1"]].columns.tolist(), # Only one exogenous variable now`
`204`	`202`	`observed_state_names=["y1", "y2"],`
`205`	`203`	`)`
`206`	`204`
Original file line number	Diff line number	Diff line change
`@@ -193,7 +193,7 @@ def test_lgss_distribution_with_dims(output_name, ss_mod_me, pymc_model_2):`
`193`	`193`	`def test_lgss_with_time_varying_inputs(output_name, rng):`
`194`	`194`	`X = rng.random(size=(10, 3), dtype=floatX)`
`195`	`195`	`ss_mod = structural.LevelTrendComponent() + structural.RegressionComponent(`
`196`		`- name="exog", k_exog=3`
	`196`	`+ name="exog", state_names=["exog_0", "exog_1", "exog_2"]`
`197`	`197`	`)`
`198`	`198`	`mod = ss_mod.build("data", verbose=False)`
`199`	`199`