Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,537 changes: 973 additions & 564 deletions hpsklearn/components.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion hpsklearn/demo_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def plot_minvalid_vs_time(estimator, ax, ylim=None):


class PlotHelper(object):

def __init__(self, estimator, mintodate_ylim):
self.estimator = estimator
self.fig, self.axs = plt.subplots(1, 2)
Expand All @@ -41,4 +42,3 @@ def post_iter(self):

def post_loop(self):
display.clear_output()

601 changes: 440 additions & 161 deletions hpsklearn/estimator.py

Large diffs are not rendered by default.

81 changes: 81 additions & 0 deletions hpsklearn/lagselectors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""Lag selectors that subset time series predictors

This module defines lag selectors with specified lag sizes for endogenous and
exogenous predictors, using the same style as the sklearn transformers. They
can be used in hpsklearn as preprocessors. The module is well suited for time
series data.

When the lag size is a positive integer, the lag=1, 2, ... predictors are
assumed to be located at the 1st, 2nd, ... columns. When it is a negative
integer, they are assumed to be located at the N-th, (N - 1)th, ... columns,
where N is the number of columns.

"""
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np


class LagSelector(BaseEstimator, TransformerMixin):
    """Subset time series features by choosing the most recent lags.

    Parameters
    ----------
    lag_size : int, None by default
        If None, use all features. If positive integer, use features by
        subsetting the X as [:, :lag_size]. If negative integer, use features
        by subsetting the X as [:, lag_size:]. If 0, discard the features
        from this dataset.

    Attributes
    ----------
    max_lag_size_ : int
        The largest allowed lag size inferred from input, i.e. the number of
        columns of the X passed to ``fit``.
    """

    def __init__(self, lag_size=None):
        self.lag_size = lag_size

    def _reset(self):
        """Reset internal data-dependent state of the selector, if necessary.

        __init__ parameters are not touched.
        """
        if hasattr(self, 'max_lag_size_'):
            del self.max_lag_size_

    def fit(self, X, y=None):
        """Infer the maximum lag size.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, n_features]
            The input time series data with lagged predictors as features.

        y : Passthrough for ``Pipeline`` compatibility.

        Returns
        -------
        self : LagSelector
            The fitted selector, so that ``fit(X).transform(X)`` and the
            inherited ``fit_transform`` work.
        """
        # Reset internal state before fitting
        self._reset()
        self.max_lag_size_ = X.shape[1]
        return self

    def transform(self, X, y=None):
        """Select the lagged predictor columns given by ``lag_size``.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The input time series data with lagged predictors as features.

        y : Ignored; accepted for API compatibility.

        Returns
        -------
        X_subset : same container type as X
            X itself when ``lag_size`` is None; otherwise the first
            ``lag_size`` columns (non-negative ``lag_size``) or the last
            ``abs(lag_size)`` columns (negative ``lag_size``). The number of
            columns kept never exceeds ``max_lag_size_``.
        """
        # None is documented as "use all features"; abs(None) would raise.
        if self.lag_size is None:
            return X

        # Clamp the requested size to what was seen during fit.
        proofed_lag_size = min(self.max_lag_size_, abs(self.lag_size))
        if self.lag_size >= 0:
            return X[:, :proofed_lag_size]
        return X[:, -proofed_lag_size:]









91 changes: 71 additions & 20 deletions hpsklearn/tests/test_demo.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,88 @@
from __future__ import print_function
# import numpy as np
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from hyperopt import tpe
import hpsklearn
import sys

def test_demo_iris():
import numpy as np
import skdata.iris.view
import hyperopt.tpe
import hpsklearn

data_view = skdata.iris.view.KfoldClassification(4)
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(
iris.data, iris.target, test_size=.25, random_state=1)

estimator = hpsklearn.HyperoptEstimator(
preprocessing=hpsklearn.components.any_preprocessing('pp'),
classifier=hpsklearn.components.any_classifier('clf'),
algo=hyperopt.tpe,
trial_timeout=15.0, # seconds
max_evals=100,
)
algo=tpe.suggest,
trial_timeout=15.0, # seconds
max_evals=10,
seed=1
)

# /BEGIN `Demo version of estimator.fit()`
print('', file=sys.stderr)
print('====Demo classification on Iris dataset====', file=sys.stderr)

iterator = estimator.fit_iter(
data_view.split[0].train.X,
data_view.split[0].train.y)
iterator = estimator.fit_iter(X_train, y_train)
next(iterator)

n_trial = 0
while len(estimator.trials.trials) < estimator.max_evals:
iterator.send(1) # -- try one more model
hpsklearn.demo_support.scatter_error_vs_time(estimator)
hpsklearn.demo_support.bar_classifier_choice(estimator)
iterator.send(1) # -- try one more model
n_trial += 1
print('Trial', n_trial, 'loss:', estimator.trials.losses()[-1],
file=sys.stderr)
# hpsklearn.demo_support.scatter_error_vs_time(estimator)
# hpsklearn.demo_support.bar_classifier_choice(estimator)

estimator.retrain_best_model_on_full_data(
data_view.split[0].train.X,
data_view.split[0].train.y)
estimator.retrain_best_model_on_full_data(X_train, y_train)

# /END Demo version of `estimator.fit()`

test_predictions = estimator.predict(data_view.split[0].test.X)
print(np.mean(test_predictions == data_view.split[0].test.y))
print('Test accuracy:', estimator.score(X_test, y_test), file=sys.stderr)
print('====End of demo====', file=sys.stderr)


def test_demo_boston():
    """Demo: search for a regressor on the Boston dataset, trial by trial.

    Holds out 25% of the data, drives ``estimator.fit_iter`` by hand until
    ``max_evals`` trials have been recorded (logging each trial's loss to
    stderr), retrains the best model on the training split and reports the
    score on the held-out split.
    """
    dataset = datasets.load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=.25, random_state=1)

    search_settings = dict(
        preprocessing=hpsklearn.components.any_preprocessing('pp'),
        regressor=hpsklearn.components.any_regressor('reg'),
        algo=tpe.suggest,
        trial_timeout=15.0,  # seconds
        max_evals=10,
        seed=1,
    )
    estimator = hpsklearn.HyperoptEstimator(**search_settings)

    # -- hand-rolled equivalent of `estimator.fit()` starts here
    print('', file=sys.stderr)
    print('====Demo regression on Boston dataset====', file=sys.stderr)

    fit_steps = estimator.fit_iter(X_train, y_train)
    next(fit_steps)  # advance once before the first send()

    n_trial = 0
    while True:
        if len(estimator.trials.trials) >= estimator.max_evals:
            break
        fit_steps.send(1)  # -- try one more model
        n_trial += 1
        latest_loss = estimator.trials.losses()[-1]
        print('Trial', n_trial, 'loss:', latest_loss, file=sys.stderr)

    estimator.retrain_best_model_on_full_data(X_train, y_train)
    # -- hand-rolled equivalent of `estimator.fit()` ends here

    held_out_score = estimator.score(X_test, y_test)
    print('Test R2:', held_out_score, file=sys.stderr)
    print('====End of demo====', file=sys.stderr)


# -- flake8 eof
37 changes: 22 additions & 15 deletions hpsklearn/tests/test_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,43 +10,50 @@


class TestIter(unittest.TestCase):

def setUp(self):
np.random.seed(123)
self.X = np.random.randn(1000, 2)
self.Y = (self.X[:, 0] > 0).astype('int')

def test_fit_iter_basic(self):
model = hyperopt_estimator(verbose=1, trial_timeout=5.0)
model = hyperopt_estimator(
classifier=components.any_classifier('classifier'),
verbose=1, trial_timeout=5.0)
for ii, trials in enumerate(model.fit_iter(self.X, self.Y)):
assert trials is model.trials
assert len(trials.trials) == ii
if ii == 10:
break

def test_fit(self):
model = hyperopt_estimator(verbose=1, max_evals=5, trial_timeout=5.0)
model = hyperopt_estimator(
classifier=components.any_classifier('classifier'),
verbose=1, max_evals=5, trial_timeout=5.0)
model.fit(self.X, self.Y)
assert len(model.trials.trials) == 5

def test_fit_biginc(self):
model = hyperopt_estimator(verbose=1, max_evals=5, trial_timeout=5.0,
fit_increment=20)
model = hyperopt_estimator(
classifier=components.any_classifier('classifier'),
verbose=1, max_evals=5, trial_timeout=5.0, fit_increment=20)
model.fit(self.X, self.Y)
# -- make sure we only get 5 even with big fit_increment
assert len(model.trials.trials) == 5


class TestSpace(unittest.TestCase):
def setUp(self):
np.random.seed(123)
self.X = np.random.randn(1000, 2)
self.Y = (self.X[:, 0] > 0).astype('int')
# class TestSpace(unittest.TestCase):

def test_smoke(self):
# -- verify the space argument is accepted and runs
space = components.generic_space()
model = hyperopt_estimator(
verbose=1, max_evals=10, trial_timeout=5, space=space)
model.fit(self.X, self.Y)
# def setUp(self):
# np.random.seed(123)
# self.X = np.random.randn(1000, 2)
# self.Y = (self.X[:, 0] > 0).astype('int')

# def test_smoke(self):
# # -- verify the space argument is accepted and runs
# space = components.generic_space()
# model = hyperopt_estimator(
# verbose=1, max_evals=10, trial_timeout=5, space=space)
# model.fit(self.X, self.Y)

# -- flake8 eof
Loading