andrlik · andrlik · Apr 29, 2024 · Apr 29, 2024 · Apr 29, 2024 · Apr 29, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## Unreleased
+
+[Compare the full difference](https://github.com/andrlik/django-markov/compare/0.2.4...HEAD)
+
+- Implements new feature: `MarkovTextModel.add_new_corpus_data_to_model` and its async counterpart. This allows you to add new corpus entries to an existing model without having to regenerate over the whole corpus. This will not work if the stored model is in a compiled state.
+
 ## 0.2.4
 
 [Compare the full difference](https://github.com/andrlik/django-markov/compare/0.2.3...0.2.4)

diff --git a/README.md b/README.md
@@ -112,6 +112,20 @@ async def create_my_text_model() -> MarkovTextModel:
     return text_model
 ```
 
+You can also later add to that model with new entries, as long as you haven't stored it in a compiled state.
+
+```python
+from django_markov.models import MarkovTextModel
+
+my_markov_model_instance = MarkovTextModel.objects.first()
+my_markov_model_instance.add_new_corpus_data_to_model(
+    corpus_entries=[
+        "I like burgers and fries.",
+        "I once ate a pickle larger than my hand.",
+    ]
+)
+```
+
 Once you have a model initialized, you can have it generate a sentence. For example,
 say that you have a text model in your database already, and you want a sentence generated.
 

diff --git a/src/django_markov/models.py b/src/django_markov/models.py
@@ -61,6 +61,15 @@ def _get_default_state_size() -> int:
 STATE_SIZE = _get_default_state_size()
 
 
+def _get_default_compile_setting() -> bool:
+    """Return the MARKOV_STORE_COMPILED_MODELS setting, or False if unusable."""
+    configured = getattr(settings, "MARKOV_STORE_COMPILED_MODELS", False)
+    # A missing setting or any non-bool value falls back to False.
+    if isinstance(configured, bool):
+        return configured
+    return False
+
+
 class MarkovTextModel(models.Model):
     """Stores a compiled markov text model.
 
@@ -148,15 +157,99 @@ def _compiled_model(self) -> POSifiedText | None:
             return text_model
         return text_model.compile(inplace=True)  # type: ignore
 
+    async def aadd_new_corpus_data_to_model(
+        self,
+        corpus_entries: list[str],
+        *,
+        char_limit: int | None = None,
+        weights: list[float] | None = None,
+    ) -> None:
+        """Additively merge new corpus entries into the stored model and save it.
+
+        Unlike `aupdate_model_from_corpus`, existing data is kept: a text model is
+        built from the new entries and merged in via `markovify.combine`. If no
+        model is stored yet, this delegates to `aupdate_model_from_corpus`.
+
+        Args:
+            corpus_entries (list[str]): A list of text sentences to add.
+            char_limit (int | None): Maximum characters allowed in the new corpus.
+                Use `0` for no limit; `None` uses the configured default.
+            weights (list[float] | None): Weights for the combine operation: the
+                first applies to the saved model, the second to the new entries.
+
+        Raises:
+            MarkovCombineError: If the stored model is compiled or combining fails.
+            MarkovEmptyError: If the new entries contain no text.
+            ValueError: If weights is not length two or the corpus is over the limit.
+        """
+        saved_model = self._as_text_model()
+        if self.data is None or self.data == "" or saved_model is None:
+            # There's no existing model, use update instead.
+            return await self.aupdate_model_from_corpus(
+                corpus_entries=corpus_entries, char_limit=char_limit
+            )
+        if char_limit is None:
+            char_limit = _get_corpus_char_limit()
+        if weights is not None and len(weights) != 2:  # noqa: PLR2004
+            msg = "If provided, weights must have exactly two entries!"
+            raise ValueError(msg)
+        corpus = " ".join(corpus_entries)
+        if not corpus.strip():  # Covers no entries and whitespace-only entries.
+            msg = "There are no corpus entries to add!"
+            raise MarkovEmptyError(msg)
+        if saved_model.chain.compiled:
+            msg = "Saved model is compiled, cannot combine!"
+            raise MarkovCombineError(msg)
+        if char_limit != 0 and char_limit < len(corpus):
+            msg = f"Supplied corpus is over the maximum character limit: {char_limit}"
+            raise ValueError(msg)
+        new_model = POSifiedText(corpus, state_size=saved_model.state_size)
+        try:
+            combined = markovify.combine([saved_model, new_model], weights=weights)
+        except ValueError as ve:  # no cov
+            msg = f"The following error occurred while combining: {ve}"
+            raise MarkovCombineError(msg) from ve
+        if not isinstance(combined, POSifiedText):  # no cov
+            msg = "Combining did not produce a usable text model!"
+            raise MarkovCombineError(msg)
+        self.data = combined.to_json()
+        await self.asave()
+
+    def add_new_corpus_data_to_model(
+        self,
+        corpus_entries: list[str],
+        *,
+        char_limit: int | None = None,
+        weights: list[float] | None = None,
+    ) -> None:
+        """Blocking counterpart of `aadd_new_corpus_data_to_model`.
+
+        Args:
+            corpus_entries (list[str]): Text sentences to add to the model.
+            char_limit (int | None): Character limit for the new corpus, where
+                `0` means no limit.
+            weights (list[float] | None): Two weights for the combine operation:
+                the saved model first, then the new entries.
+
+        Raises:
+            MarkovCombineError: If the stored model is already compiled.
+            MarkovEmptyError: If the new models are empty.
+            ValueError: If weights are supplied without exactly two entries.
+        """
+        sync_add = async_to_sync(self.aadd_new_corpus_data_to_model)
+        return sync_add(
+            corpus_entries=corpus_entries, char_limit=char_limit, weights=weights
+        )
+
     async def aupdate_model_from_corpus(
         self,
         corpus_entries: list[str],
         *,
         char_limit: int | None = None,
         store_compiled: bool | None = None,
     ) -> None:
-        """Takes the corpus and updates the model, saving it.
-        The corpus must not exceed the char_limit.
+        """Takes a list of entries as the new full corpus and recreates the model,
+        saving it. The corpus must not exceed the char_limit.
 
         Args:
             corpus_entries (list[str]): The corpus as a list of text sentences.
@@ -165,17 +258,14 @@ async def aupdate_model_from_corpus(
             store_compiled (bool | None): Whether to store the model in it's compiled
                 state. If None, defaults to settings.MARKOV_STORE_COMPILED_MODELS or
                 False.
+
+        Raises:
+            ValueError: If the corpus is beyond the maximum character limit.
         """
         if not char_limit:
             char_limit = _get_corpus_char_limit()
-        if (
-            store_compiled is None
-            and hasattr(settings, "MARKOV_STORE_COMPILED_MODELS")
-            and isinstance(settings.MARKOV_STORE_COMPILED_MODELS, bool)
-        ):
-            store_compiled = settings.MARKOV_STORE_COMPILED_MODELS
-        else:
-            store_compiled = False
+        if store_compiled is None:
+            store_compiled = _get_default_compile_setting()
         corpus = " ".join(corpus_entries)
         if char_limit != 0 and char_limit < len(corpus):
             msg = f"Supplied corpus is over the maximum character limit: {char_limit}"
@@ -193,7 +283,21 @@ def update_model_from_corpus(
         char_limit: int | None = None,
         store_compiled: bool | None = None,
     ) -> None:
-        """Sync wrapper for the async version"""
+        """Sync wrapper for `aupdate_model_from_corpus`.
+        Takes a list of entries as the new full corpus and recreates the model,
+        saving it. The corpus must not exceed the char_limit.
+
+        Args:
+            corpus_entries (list[str]): The corpus as a list of text sentences.
+            char_limit (int | None): The maximum number of characters
+                to allow in the corpus.
+            store_compiled (bool | None): Whether to store the model in its compiled
+                state. If None, defaults to settings.MARKOV_STORE_COMPILED_MODELS or
+                False.
+
+        Raises:
+            ValueError: If the corpus is beyond the maximum character limit.
+        """
         async_to_sync(self.aupdate_model_from_corpus)(  # no cov
             corpus_entries=corpus_entries,
             char_limit=char_limit,
@@ -274,6 +378,11 @@ async def acombine_models(
             Either a new MarkovTextModel instance
                 persisted to the database or a POSifiedText object to manipulate at a
                 low level, and the total number of models combined.
+
+        Raises:
+            ValueError: If any of the parameter combinations is invalid
+            MarkovCombineError: If models are incompatible for combining or a markovify
+                error is raised.
         """
         # First we check to ensure that the models are combinable.
         empty_models = []
@@ -348,6 +457,38 @@ def combine_models(
     ) -> tuple["MarkovTextModel | POSifiedText", int]:
         """
         Sync wrapper of acombine_models.
+
+        Combine multiple MarkovTextModels into a single model.
+
+        Models cannot be combined if any of the following is true:
+            - They are empty of data.
+            - They are stored in compiled state.
+            - The state size between models is not the same.
+            - The underlying text models are not the same type (if you subclass).
+            - You supply weights, but not the same number as the models to combine
+                or if you use permissive mode.
+
+        Args:
+            models (list[MarkovTextModel]): A list of MarkovTextModel instances to
+                combine.
+            return_type (Literal["model_instance", "text_model"]): The desired result
+                 type.
+            mode (Literal["strict", "permissive"]): strict indicates that an exception
+                should be raised if any of the candidate models are incompatible, or
+                if those specific instances should simply be dropped from the operation.
+            weights (list[float] | None): A list of floats representing the relative
+                weights to put on each source. Optional, but can only be used with
+                mode='strict'.
+
+        Returns:
+            Either a new MarkovTextModel instance
+                persisted to the database or a POSifiedText object to manipulate at a
+                low level, and the total number of models combined.
+
+        Raises:
+            ValueError: If any of the parameter combinations is invalid
+            MarkovCombineError: If models are incompatible for combining or a markovify
+                error is raised.
         """
         return async_to_sync(cls.acombine_models)(  # no cov
             models=models, return_type=return_type, mode=mode, weights=weights

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -17,6 +17,7 @@
     MarkovEmptyError,
     MarkovTextModel,
     _get_corpus_char_limit,
+    _get_default_compile_setting,
     _get_default_state_size,
 )
 from django_markov.text_models import POSifiedText
@@ -43,6 +44,17 @@ def test_get_char_limit_missing_settings(settings):
     assert _get_corpus_char_limit() == 0  # Setting was not present
 
 
+@pytest.mark.parametrize("override_value", [False, True])
+def test_get_compile_default_setting(settings, override_value):
+    settings.MARKOV_STORE_COMPILED_MODELS = override_value
+    assert _get_default_compile_setting() == override_value  # bool is passed through
+
+
+def test_get_compile_default_missing_settings(settings):
+    del settings.MARKOV_STORE_COMPILED_MODELS  # simulate the setting being absent
+    assert not _get_default_compile_setting()  # falls back to False
+
+
 @pytest.mark.parametrize(
     "override_value,expected_result",
     [
@@ -284,3 +296,82 @@ async def test_acombine_successful(
     )
     assert isinstance(result, expected_result_type)
     assert total_combined == num_clean
+
+
+def test_add_data_to_compiled_model_raises_exception(
+    compiled_model, sample_corpus
+) -> None:
+    """Adding to a compiled model raises and leaves the stored row untouched."""
+    modified_before, data_before = compiled_model.modified, compiled_model.data
+    new_entries = [sample_corpus, "This is not going to work."]
+    with pytest.raises(MarkovCombineError):
+        compiled_model.add_new_corpus_data_to_model(new_entries)
+    compiled_model.refresh_from_db()
+    # Nothing may have been persisted by the failed call.
+    assert compiled_model.modified == modified_before
+    assert compiled_model.data == data_before
+
+
+@pytest.mark.parametrize(
+    "corpus_entries,char_limit,weights,expected_exception",
+    [
+        ([], None, None, MarkovEmptyError),
+        ([], None, [1.0, 1.0], MarkovEmptyError),
+        (["I like springtime.", "Does this bring joy?"], None, [1.0], ValueError),
+        (["I like springtime.", "Does this bring joy?"], 0, [], ValueError),
+        (
+            ["I like springtime.", "Does this bring joy?"],
+            None,
+            [1.0, 1.3, 1.0],
+            ValueError,
+        ),
+    ],
+)
+def test_add_data_to_model_invocation_failures(
+    text_model, sample_corpus, corpus_entries, char_limit, weights, expected_exception
+):
+    """Invalid arguments raise the expected error and leave the model unchanged."""
+    text_model.update_model_from_corpus([sample_corpus], store_compiled=False)
+    text_model.refresh_from_db()
+    old_data, old_modify = text_model.data, text_model.modified
+    with pytest.raises(expected_exception):
+        text_model.add_new_corpus_data_to_model(
+            corpus_entries=corpus_entries, char_limit=char_limit, weights=weights
+        )
+    text_model.refresh_from_db()
+    assert text_model.modified == old_modify
+    assert text_model.data == old_data
+
+
+@pytest.mark.parametrize(
+    "corpus_entries,char_limit,weights",
+    [
+        (["I like springtime.", "Does this bring joy?"], None, None),
+        (["I like springtime.", "Does this bring joy?"], 0, None),
+        (["I like springtime.", "Does this bring joy?"], None, [1.0, 1.0]),
+    ],
+)
+def test_add_data_to_model_success(
+    text_model, sample_corpus, corpus_entries, char_limit, weights
+):
+    """A valid addition changes the stored data and advances `modified`."""
+    text_model.update_model_from_corpus([sample_corpus], store_compiled=False)
+    text_model.refresh_from_db()
+    data_before, modified_before = text_model.data, text_model.modified
+    text_model.add_new_corpus_data_to_model(
+        corpus_entries=corpus_entries, char_limit=char_limit, weights=weights
+    )
+    text_model.refresh_from_db()
+    assert text_model.data != data_before
+    assert text_model.modified > modified_before
+
+
+def test_add_data_to_empty_model_falls_back_to_update(text_model):
+    """Adding to a model with no data still stores data and advances `modified`."""
+    assert not text_model.data
+    modified_before = text_model.modified
+    new_entries = ["I like springtime.", "Does this bring joy?"]
+    text_model.add_new_corpus_data_to_model(corpus_entries=new_entries)
+    text_model.refresh_from_db()
+    assert text_model.data is not None
+    assert text_model.modified > modified_before