hyperopt · bjkomer · Aug 14, 2016 · Oct 6, 2014 · Jun 29, 2016 · Jun 30, 2016
diff --git a/hpsklearn/components.py b/hpsklearn/components.py
diff --git a/hpsklearn/demo_support.py b/hpsklearn/demo_support.py
@@ -23,6 +23,7 @@ def plot_minvalid_vs_time(estimator, ax, ylim=None):
 
 
 class PlotHelper(object):
+
     def __init__(self, estimator, mintodate_ylim):
         self.estimator = estimator
         self.fig, self.axs = plt.subplots(1, 2)
@@ -41,4 +42,3 @@ def post_iter(self):
 
     def post_loop(self):
         display.clear_output()
-
diff --git a/hpsklearn/estimator.py b/hpsklearn/estimator.py
diff --git a/hpsklearn/lagselectors.py b/hpsklearn/lagselectors.py
@@ -0,0 +1,81 @@
+"""Lag selectors that subset time series predictors
+
+This module defines lag selectors with specified lag sizes for endogenous and 
+exogenous predictors, using the same style as the sklearn transformers. They 
+can be used in hpsklearn as preprocessors. The module is well suited for time 
+series data.
+
+When the lag size is a positive integer, it is assumed that the lag=1, 2, ...
+predictors are located at the 1st, 2nd, ... columns. When it is a negative
+integer, the predictors are located at the N-th, (N - 1)th, ... columns.
+
+"""
+from sklearn.base import BaseEstimator, TransformerMixin
+import numpy as np
+
+
+class LagSelector(BaseEstimator, TransformerMixin):
+    """Subset time series features by choosing the most recent lags
+
+    Parameters
+    ----------
+    lag_size : int, None by default
+        If None, use all features. If positive integer, use features by
+        subsetting the X as [:, :lag_size]. If negative integer, use features
+        by subsetting the X as [:, lag_size:]. If 0, discard the features
+        from this dataset.
+
+    Attributes
+    ----------
+    max_lag_size_ : int
+        The largest allowed lag size inferred from input.
+    """
+
+    def __init__(self, lag_size=None):
+        self.lag_size = lag_size
+
+    def _reset(self):
+        """Reset internal data-dependent state of the selector, if necessary.
+
+        __init__ parameters are not touched.
+        """
+        if hasattr(self, 'max_lag_size_'):
+            del self.max_lag_size_
+
+    def fit(self, X, y=None):
+        """Infer the maximum lag size.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape [n_samples, n_features]
+            The input time series data with lagged predictors as features.
+
+        y: Passthrough for ``Pipeline`` compatibility.
+
+        Returns
+        -------
+        self : LagSelector
+            The fitted selector, so ``fit(X).transform(X)`` can be chained.
+        """
+
+        # Reset internal state before fitting
+        self._reset()
+        self.max_lag_size_ = X.shape[1]
+        return self
+
+    def transform(self, X, y=None):
+        """Select the columns of X given by ``lag_size``.
+
+        Parameters
+        ----------
+        X : array-like, shape [n_samples, n_features]
+            The input time series data with lagged predictors as features.
+        """
+        if self.lag_size is None:
+            # Documented default: keep every feature (abs(None) would raise)
+            return X
+        proofed_lag_size = min(self.max_lag_size_, abs(self.lag_size))
+        if self.lag_size >= 0:
+            return X[:, :proofed_lag_size]
+        else:
+            return X[:, -proofed_lag_size:]
diff --git a/hpsklearn/tests/test_demo.py b/hpsklearn/tests/test_demo.py
@@ -1,37 +1,88 @@
+from __future__ import print_function
+# import numpy as np
+from sklearn import datasets
+from sklearn.cross_validation import train_test_split
+from hyperopt import tpe
+import hpsklearn
+import sys
 
 def test_demo_iris():
-    import numpy as np
-    import skdata.iris.view
-    import hyperopt.tpe
-    import hpsklearn
 
-    data_view = skdata.iris.view.KfoldClassification(4)
+    iris = datasets.load_iris()
+    X_train, X_test, y_train, y_test = train_test_split(
+        iris.data, iris.target, test_size=.25, random_state=1)
 
     estimator = hpsklearn.HyperoptEstimator(
         preprocessing=hpsklearn.components.any_preprocessing('pp'),
         classifier=hpsklearn.components.any_classifier('clf'),
-        algo=hyperopt.tpe,
-        trial_timeout=15.0, # seconds
-        max_evals=100,
-        )
+        algo=tpe.suggest,
+        trial_timeout=15.0,  # seconds
+        max_evals=10,
+        seed=1
+    )
 
     # /BEGIN `Demo version of estimator.fit()`
+    print('', file=sys.stderr)
+    print('====Demo classification on Iris dataset====', file=sys.stderr)
 
-    iterator = estimator.fit_iter(
-        data_view.split[0].train.X,
-        data_view.split[0].train.y)
+    iterator = estimator.fit_iter(X_train, y_train)
     next(iterator)
 
+    n_trial = 0
     while len(estimator.trials.trials) < estimator.max_evals:
-        iterator.send(1) # -- try one more model
-        hpsklearn.demo_support.scatter_error_vs_time(estimator)
-        hpsklearn.demo_support.bar_classifier_choice(estimator)
+        iterator.send(1)  # -- try one more model
+        n_trial += 1
+        print('Trial', n_trial, 'loss:', estimator.trials.losses()[-1], 
+              file=sys.stderr)
+        # hpsklearn.demo_support.scatter_error_vs_time(estimator)
+        # hpsklearn.demo_support.bar_classifier_choice(estimator)
 
-    estimator.retrain_best_model_on_full_data(
-        data_view.split[0].train.X,
-        data_view.split[0].train.y)
+    estimator.retrain_best_model_on_full_data(X_train, y_train)
 
     # /END Demo version of `estimator.fit()`
 
-    test_predictions = estimator.predict(data_view.split[0].test.X)
-    print(np.mean(test_predictions == data_view.split[0].test.y))
+    print('Test accuracy:', estimator.score(X_test, y_test), file=sys.stderr)
+    print('====End of demo====', file=sys.stderr)
+
+
+def test_demo_boston():
+    """Run a 10-trial regressor search on Boston data, logging to stderr."""
+    boston = datasets.load_boston()
+    X_train, X_test, y_train, y_test = train_test_split(
+        boston.data, boston.target, test_size=.25, random_state=1)
+
+    estimator = hpsklearn.HyperoptEstimator(
+        preprocessing=hpsklearn.components.any_preprocessing('pp'),
+        regressor=hpsklearn.components.any_regressor('reg'),
+        algo=tpe.suggest,
+        trial_timeout=15.0,  # seconds
+        max_evals=10,
+        seed=1
+    )
+
+    # /BEGIN `Demo version of estimator.fit()`
+    print('', file=sys.stderr)
+    print('====Demo regression on Boston dataset====', file=sys.stderr)
+
+    # next() primes the generator so that send() can be used in the loop
+    iterator = estimator.fit_iter(X_train, y_train)
+    next(iterator)
+
+    n_trial = 0
+    while len(estimator.trials.trials) < estimator.max_evals:
+        iterator.send(1)  # -- try one more model
+        n_trial += 1
+        print('Trial', n_trial, 'loss:', estimator.trials.losses()[-1], 
+              file=sys.stderr)
+        # hpsklearn.demo_support.scatter_error_vs_time(estimator)
+        # hpsklearn.demo_support.bar_classifier_choice(estimator)
+
+    estimator.retrain_best_model_on_full_data(X_train, y_train)
+
+    # /END Demo version of `estimator.fit()`
+
+    print('Test R2:', estimator.score(X_test, y_test), file=sys.stderr)
+    print('====End of demo====', file=sys.stderr)
+
+
+# -- flake8 eof
diff --git a/hpsklearn/tests/test_estimator.py b/hpsklearn/tests/test_estimator.py
@@ -10,43 +10,50 @@
 
 
 class TestIter(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.X = np.random.randn(1000, 2)
         self.Y = (self.X[:, 0] > 0).astype('int')
 
     def test_fit_iter_basic(self):
-        model = hyperopt_estimator(verbose=1, trial_timeout=5.0)
+        model = hyperopt_estimator(
+            classifier=components.any_classifier('classifier'), 
+            verbose=1, trial_timeout=5.0)
         for ii, trials in enumerate(model.fit_iter(self.X, self.Y)):
             assert trials is model.trials
             assert len(trials.trials) == ii
             if ii == 10:
                 break
 
     def test_fit(self):
-        model = hyperopt_estimator(verbose=1, max_evals=5, trial_timeout=5.0)
+        model = hyperopt_estimator(
+            classifier=components.any_classifier('classifier'), 
+            verbose=1, max_evals=5, trial_timeout=5.0)
         model.fit(self.X, self.Y)
         assert len(model.trials.trials) == 5
 
     def test_fit_biginc(self):
-        model = hyperopt_estimator(verbose=1, max_evals=5, trial_timeout=5.0,
-                                   fit_increment=20)
+        model = hyperopt_estimator(
+            classifier=components.any_classifier('classifier'),
+            verbose=1, max_evals=5, trial_timeout=5.0, fit_increment=20)
         model.fit(self.X, self.Y)
         # -- make sure we only get 5 even with big fit_increment
         assert len(model.trials.trials) == 5
 
 
-class TestSpace(unittest.TestCase):
-    def setUp(self):
-        np.random.seed(123)
-        self.X = np.random.randn(1000, 2)
-        self.Y = (self.X[:, 0] > 0).astype('int')
+# class TestSpace(unittest.TestCase):
 
-    def test_smoke(self):
-        # -- verify the space argument is accepted and runs
-        space = components.generic_space()
-        model = hyperopt_estimator(
-            verbose=1, max_evals=10, trial_timeout=5, space=space)
-        model.fit(self.X, self.Y)
+#     def setUp(self):
+#         np.random.seed(123)
+#         self.X = np.random.randn(1000, 2)
+#         self.Y = (self.X[:, 0] > 0).astype('int')
+
+#     def test_smoke(self):
+#         # -- verify the space argument is accepted and runs
+#         space = components.generic_space()
+#         model = hyperopt_estimator(
+#             verbose=1, max_evals=10, trial_timeout=5, space=space)
+#         model.fit(self.X, self.Y)
 
 # -- flake8 eof