|
| 1 | +# Copyright (c) Microsoft Corporation. |
| 2 | +# Licensed under the MIT License. |
| 3 | + |
| 4 | +import inspect |
| 5 | +import numpy as np |
| 6 | +import pandas as pd |
| 7 | +from typing import Union |
| 8 | + |
| 9 | +from qlib.model.base import BaseModel |
| 10 | + |
| 11 | + |
class RiskModel(BaseModel):
    """Risk Model

    A risk model is used to estimate the covariance matrix of stock returns.

    `predict` normalizes the input (pandas -> 2D array, optional price -> return
    conversion, optional scaling, NaN handling and centering) and then delegates
    the estimation itself to `_predict`. Child classes are expected to override
    `_predict`; the default implementation is the empirical covariance.
    """

    # Supported values for `nan_option`.
    MASK_NAN = "mask"
    FILL_NAN = "fill"
    IGNORE_NAN = "ignore"

    def __init__(self, nan_option: str = "ignore", assume_centered: bool = False, scale_return: bool = True):
        """
        Args:
            nan_option (str): nan handling option (`ignore`/`mask`/`fill`).
            assume_centered (bool): whether the data is assumed to be centered.
            scale_return (bool): whether scale returns as percentage.

        Raises:
            AssertionError: if `nan_option` is not one of the supported options.
        """
        # nan
        assert nan_option in [
            self.MASK_NAN,
            self.FILL_NAN,
            self.IGNORE_NAN,
        ], f"`nan_option={nan_option}` is not supported"
        self.nan_option = nan_option

        self.assume_centered = assume_centered
        self.scale_return = scale_return

    def predict(
        self,
        X: Union[pd.Series, pd.DataFrame, np.ndarray],
        return_corr: bool = False,
        is_price: bool = True,
        return_decomposed_components=False,
    ) -> Union[pd.DataFrame, np.ndarray, tuple]:
        """
        Args:
            X (pd.Series, pd.DataFrame or np.ndarray): data from which to estimate the covariance,
                with variables as columns and observations as rows. Pandas input with a
                `MultiIndex` is unstacked on the `instrument` level (for a DataFrame only the
                first column is used).
            return_corr (bool): whether return the correlation matrix.
            is_price (bool): whether `X` contains price (if not assume stock returns).
            return_decomposed_components (bool): whether return decomposed components of the covariance matrix.

        Returns:
            pd.DataFrame or np.ndarray: estimated covariance (or correlation). A DataFrame
                (indexed by the input columns on both axes) is returned for pandas input,
                a plain array otherwise.
            tuple: when `return_decomposed_components=True`, the three values `(F, cov_b, var_u)`
                exactly as produced by `_predict`.

        Raises:
            AssertionError: if both `return_corr` and `return_decomposed_components` are set, or
                if decomposed components are requested but `_predict` has no
                `return_decomposed_components` parameter.
        """
        assert (
            not return_corr or not return_decomposed_components
        ), "Can only return either correlation matrix or decomposed components."

        # transform input into 2D array
        columns = None
        if isinstance(X, (pd.Series, pd.DataFrame)):
            if isinstance(X.index, pd.MultiIndex):
                if isinstance(X, pd.DataFrame):
                    X = X.iloc[:, 0].unstack(level="instrument")  # always use the first column
                else:
                    X = X.unstack(level="instrument")
            # otherwise X is already a 2D DataFrame
            columns = X.columns  # will be used to restore dataframe
            X = X.values

        # calculate pct_change
        if is_price:
            X = X[1:] / X[:-1] - 1  # NOTE: resulting `n - 1` rows

        # scale return
        if self.scale_return:
            # Build a new array instead of `X *= 100`: when `is_price=False`, `X` is still
            # the caller's own buffer and an in-place multiply would modify their data.
            X = X * 100

        # handle nan and centered
        X = self._preprocess(X)

        # return decomposed components if needed
        if return_decomposed_components:
            # `inspect.signature` also sees keyword-only parameters, which
            # `getfullargspec(...).args` does not list.
            assert (
                "return_decomposed_components" in inspect.signature(self._predict).parameters
            ), "This risk model does not support return decomposed components of the covariance matrix "

            F, cov_b, var_u = self._predict(X, return_decomposed_components=True)
            return F, cov_b, var_u

        # estimate covariance
        S = self._predict(X)

        # return correlation if needed
        if return_corr:
            # normalize each entry by the product of the two standard deviations
            vola = np.sqrt(np.diag(S))
            corr = S / np.outer(vola, vola)
            if columns is None:
                return corr
            return pd.DataFrame(corr, index=columns, columns=columns)

        # return covariance
        if columns is None:
            return S
        return pd.DataFrame(S, index=columns, columns=columns)

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """covariance estimation implementation

        This method should be overridden by child classes.

        By default, this method implements the empirical covariance estimation.

        Args:
            X (np.ndarray): data matrix containing multiple variables (columns) and observations (rows).
                May be a `np.ma.MaskedArray`, in which case every pair of variables is
                normalized by the number of rows where both are unmasked.

        Returns:
            np.ndarray: covariance matrix.
        """
        xTx = np.asarray(X.T.dot(X))
        N = len(X)
        if isinstance(X, np.ma.MaskedArray):
            # `getmaskarray` always returns a full boolean array; reading `X.mask` directly
            # can give the scalar `nomask`, which would turn the pair counts into a bare 1.
            M = (~np.ma.getmaskarray(X)).astype(int)
            N = M.T.dot(M)  # each pair has distinct number of samples
        return xTx / N

    def _preprocess(self, X: np.ndarray) -> Union[np.ndarray, np.ma.MaskedArray]:
        """handle nan and centerize data

        Args:
            X (np.ndarray): data matrix with variables as columns and observations as rows.

        Returns:
            np.ndarray or np.ma.MaskedArray: the processed data.

        Note:
            if `nan_option='mask'` then the returned array will be `np.ma.MaskedArray`.
            if `nan_option='ignore'` the values are left untouched.
        """
        # handle nan
        if self.nan_option == self.FILL_NAN:
            X = np.nan_to_num(X)
        elif self.nan_option == self.MASK_NAN:
            X = np.ma.masked_invalid(X)
        # centralize
        if not self.assume_centered:
            # column-wise mean that skips NaN entries
            X = X - np.nanmean(X, axis=0)
        return X
0 commit comments