diff --git a/CHANGELOG.md b/CHANGELOG.md index 26ff6f939c..7a2514616a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - MNIST dataset for SSL banchmark ([#1368](https://github.com/catalyst-team/catalyst/pull/1368)) +- MovieLens 20M dataset ([#1336](https://github.com/catalyst-team/catalyst/pull/1336)) - logger property for logging customization ([#1372](https://github.com/catalyst-team/catalyst/pull/1372)) ### Changed diff --git a/catalyst/contrib/datasets/__init__.py b/catalyst/contrib/datasets/__init__.py index e55e4a3881..7741f8c266 100644 --- a/catalyst/contrib/datasets/__init__.py +++ b/catalyst/contrib/datasets/__init__.py @@ -37,3 +37,6 @@ if SETTINGS.ml_required: from catalyst.contrib.datasets.movielens import MovieLens + + if SETTINGS.is_torch_1_7_0: + from catalyst.contrib.datasets.movielens import MovieLens20M diff --git a/catalyst/contrib/datasets/movielens.py b/catalyst/contrib/datasets/movielens.py index 08a48455bf..e5454d52ee 100644 --- a/catalyst/contrib/datasets/movielens.py +++ b/catalyst/contrib/datasets/movielens.py @@ -2,6 +2,7 @@ import os import numpy as np +import pandas as pd import scipy.sparse as sp import torch @@ -91,7 +92,7 @@ def __init__(self, root, train=True, download=False, min_rating=0.0): self._fetch_movies() if not self._check_exists(): - raise RuntimeError("Dataset not found. You can use download=True to download it") + raise RuntimeError("Dataset not found. 
Set `download=True`") if self.train: data_file = self.training_file @@ -117,16 +118,29 @@ def __len__(self): @property def raw_folder(self): - """Create raw folder for data download""" + """Create raw folder for data download + + Returns: + raw_path (path): raw folder path + """ return os.path.join(self.root, self.__class__.__name__, "raw") @property def processed_folder(self): - """Create the folder for the processed files""" + """Create the folder for the processed files + + Returns: + raw_path (path): processed folder path + """ return os.path.join(self.root, self.__class__.__name__, "processed") def _check_exists(self): - """Check if the path for tarining and testing data exists in processed folder.""" + """Check if the path for tarining and testing data + exists in processed folder. + + Returns: + raw_path (path): processed folder path + """ return os.path.exists( os.path.join(self.processed_folder, self.training_file) ) and os.path.exists(os.path.join(self.processed_folder, self.test_file)) @@ -247,9 +261,14 @@ def _fetch_movies(self): 3. Parse test data 4. Save in the .pt with torch.save """ - (train_raw, test_raw, item_metadata_raw, genres_raw) = self._read_raw_movielens_data() + data = self._read_raw_movielens_data() + train_raw = data[0] + test_raw = data[1] - num_users, num_items = self._get_dimensions(self._parse(train_raw), self._parse(test_raw)) + train_parsed = self._parse(train_raw) + test_parsed = self._parse(test_raw) + + num_users, num_items = self._get_dimensions(train_parsed, test_parsed) train = self._build_interaction_matrix(num_users, num_items, self._parse(train_raw)) test = self._build_interaction_matrix(num_users, num_items, self._parse(test_raw)) @@ -262,4 +281,514 @@ def _fetch_movies(self): torch.save(test, f) -__all__ = ["MovieLens"] +class MovieLens20M(Dataset): + """ + MovieLens data sets (ml-20m) were collected by + the GroupLens Research Project at the University of Minnesota. 
+ + This data set consists of: + * 20,000,263 ratings (1-5) + and 465,564 tag applications from 138,493 users on 27,278 movies. + * Each user has rated at least 20 movies. + * Simple demographic info for the users + (age, gender, occupation, zip) + + Users were selected at random for inclusion. + All selected users had rated at least 20 movies. + No demographic information is included. + Each user is represented by an id, and no other information is provided. + + More details about the contents and use of all these files follows. + This and other GroupLens data sets are publicly available for download + at http://grouplens.org/datasets/. + + The data was collected through the MovieLens web site. + (movielens.umn.edu) between January 09, 1995 and March 31, 2015. + This dataset was generated on October 17, 2016. + + Neither the University of Minnesota nor any of the researchers involved + can guarantee the correctness of the data, its suitability + for any particular purpose, or the validity of + results based on the use of the data set. + + The data set may be used for any research purposes + under the following conditions: + + The user may not state or imply any endorsement + from the University of Minnesota or the GroupLens Research Group. + + The user must acknowledge the use of the data set in + publications resulting from the use of the data set + (see below for citation information). + + The user may not redistribute the data without separate permission. + + The user may not use this information for any + commercial or revenue-bearing purposes + without first obtaining permission from a faculty member + of the GroupLens Research Project at the University of Minnesota. + + The executable software scripts are provided "as is" + without warranty of any kind, either expressed or implied, including, + but not limited to, the implied warranties of merchantability + and fitness for a particular purpose. 
+ + The entire risk as to the quality and performance of them is with you. + Should the program prove defective, + you assume the cost of all necessary servicing, repair or correction. + + In no event shall the University of Minnesota, + its affiliates or employees be liable to you for any damages + arising out of the use or inability to use these programs (including + but not limited to loss of data or data being rendered inaccurate). + + The data are contained in six files: + 1. genome-scores.csv + 2. genome-tags.csv + 3. links.csv + 4. movies.csv + 5. ratings.csv + 6. tags.csv + + Ratings Data File Structure (ratings.csv) + All ratings are contained in the file ratings.csv. + Each line of this file after the header row represents + one rating of one movie by one user,and has the following format: + + 1. userId, + 2. movieId, + 3. rating, + 4. timestamp + + Tags Data File Structure (tags.csv) + + 1. userId, + 2. movieId, + 3. tag, + 4. timestamp + + Movies Data File Structure (movies.csv) + + 1. movieId, + 2. title, + 3. genres + + Movie titles are entered manually or + imported from https://www.themoviedb.org/, and include the year + of release in parentheses. + Errors and inconsistencies may exist in these titles. + + Links Data File Structure (links.csv) + + 1. movieId, + 2. imdbId, + 3. tmdbId + + Tag Genome (genome-scores.csv and genome-tags.csv) + + 1. movieId, + 2. tagId, + 3. relevance + + + If you have any further questions or comments, please contact GroupLens + . 
+ https://files.grouplens.org/datasets/movielens/ml-20m-README.html + """ + + resources = ( + "https://files.grouplens.org/datasets/movielens/ml-20m.zip", + " cd245b17a1ae2cc31bb14903e1204af3", + ) + filename = "ml-20m.zip" + training_file = "training.pt" + test_file = "test.pt" + + def __init__( + self, + root, + train=True, + download=False, + min_rating=0.0, + min_items_per_user=1.0, + min_users_per_item=2.0, + test_prop=0.2, + split="users", + sample=False, + n_rows=1000, + ): + """ + Args: + root (string): Root directory of dataset where + ``MovieLens/processed/training.pt`` + and ``MovieLens/processed/test.pt`` exist. + train (bool, optional): If True, creates dataset from + ``training.pt``, otherwise from ``test.pt``. + download (bool, optional): If true, downloads the dataset from + the internet and puts it in root directory. If dataset + is already downloaded, it is not downloaded again. + min_rating (float, optional): Minimum rating to include in + the interaction matrix + min_items_per_user (float, optional): + Minimum number of items per user + to include in the interaction matrix + min_users_per_item (float, optional): + Minimum rating to users per itemrs + to include in the interaction matrix + test_prop (float, optional): train-test split + split (string, optional): the splittage method. + `users` – split by users + `ts` - split by timestamp + sample (bool, optional): + If true, then use the sample of the dataset. + If true the `n_rows` shold be provide + n_rows (int, optional): number of rows to retrieve. + Availbale only with `sample = True` + + Raises: + RuntimeError: If ``download = False`` and the dataset not found. 
+ RuntimeError: If torch version < `1.7.0`" + """ + if isinstance(root, torch._six.string_classes): + root = os.path.expanduser(root) + + self.root = root + self.train = train + self.min_rating = min_rating + self.min_items_per_user = min_items_per_user + self.min_users_per_item = min_users_per_item + self.test_prop = test_prop + self.nrows = n_rows + self.sample = sample + self.split = split + + if download: + self._download() + + self._fetch_movies(split_by=split) + + if not self._check_exists(): + raise RuntimeError("Dataset not found. Set `download=True`") + + if self.train: + data_file = self.training_file + else: + data_file = self.test_file + + self.data = torch.load(os.path.join(self.processed_folder, data_file)) + + def __getitem__(self, user_index): + """Get item. + + Args: + user_index (int): User index + + Returns: + tensor: (items) item's ranking for the user with index user_index + """ + return self.data[user_index] + + def __len__(self): + """The length of the loader""" + return self.dimensions[0] + + @property + def raw_folder(self): + """Create raw folder for data download + + Returns: + raw_path (path): raw folder path + """ + return os.path.join(self.root, self.__class__.__name__, "raw") + + @property + def processed_folder(self): + """Create the folder for the processed files + + Returns: + raw_path (path): processed folder path + """ + return os.path.join(self.root, self.__class__.__name__, "processed") + + def _check_exists(self): + """Check if the path for tarining and testing data exists in + processed folder. 
+ + Returns: + raw_path (path): processed folder path + """ + return os.path.exists( + os.path.join(self.processed_folder, self.training_file) + ) and os.path.exists(os.path.join(self.processed_folder, self.test_file)) + + def _download(self): + """Download and extract files""" + if self._check_exists(): + return + + os.makedirs(self.raw_folder, exist_ok=True) + os.makedirs(self.processed_folder, exist_ok=True) + url = self.resources[0] + + download_and_extract_archive( + url=url, + download_root=self.raw_folder, + filename=self.filename, + remove_finished=True, + ) + + def _read_raw_movielens_data(self): + """Read the csv files with pandas. + + Returns: + (movies, ratings, genome_scores, genome_tags, tags): + (pd.DataFrame, pd.DataFrame, pd.DataFrame, + pd.DataFrame, pd.DataFrame) + """ + path = self.raw_folder + + if self.sample: + movies = pd.read_csv(path + "/ml-20m/movies.csv", nrows=self.nrows) + ratings = pd.read_csv(path + "/ml-20m/ratings.csv", nrows=self.nrows) + genome_scores = pd.read_csv(path + "/ml-20m/genome-scores.csv", nrows=self.nrows) + genome_tags = pd.read_csv(path + "/ml-20m/genome-tags.csv", nrows=self.nrows) + tags = pd.read_csv(path + "/ml-20m/tags.csv", nrows=self.nrows) + else: + movies = pd.read_csv(path + "/ml-20m/movies.csv") + ratings = pd.read_csv(path + "/ml-20m/ratings.csv") + genome_scores = pd.read_csv(path + "/ml-20m/genome-scores.csv") + genome_tags = pd.read_csv(path + "/ml-20m/genome-tags.csv") + tags = pd.read_csv(path + "/ml-20m/tags.csv") + + return (movies, ratings, genome_scores, genome_tags, tags) + + def _build_interaction_matrix(self, ratings): + """Builds interaction matrix. 
+ + Args: + ratings (pd.Dataframe): pandas DataFrame of the following format + userId movieId rating + 20 1 924 3.5 + 19 1 919 3.5 + 86 1 2683 3.5 + 61 1 1584 3.5 + 23 1 1079 4.0 + + Returns: + interaction_matrix (torch.sparse.Float): + sparse user2item interaction matrix + """ + csr_matrix = sp.coo_matrix( + (ratings["rating"].astype(np.float32), (ratings["movieId"], ratings["userId"])) + ) + + interaction_matrix = torch.sparse.LongTensor( + torch.LongTensor([csr_matrix.row.tolist(), csr_matrix.col.tolist()]), + torch.LongTensor(csr_matrix.data.astype(np.int32)), + ) + + return interaction_matrix + + def _parse( + self, + ratings, + rating_cut=True, + user_per_item_cut=True, + item_per_user_cut=True, + ts_cut=False, + ): + """Parses and pre-process the raw data. + Substract one to shift to zero based indexing + To-do add timestamp cut + + Args: + ratings (pd.Dataframe): pandas DataFrame of the following format + userId movieId rating timestamp + 20 1 924 3.5 1094785598 + 19 1 919 3.5 1094785621 + 86 1 2683 3.5 1094785650 + 61 1 1584 3.5 1094785656 + 23 1 1079 4.0 1094785665 + rating_cut (bool, optional): + If true, filter datafreame on the `min_rating` value + user_per_item_cut (bool, optional): + If true, filter datafreame on the `min_users_per_item` value + item_per_user_cut (bool, optional): + If true, filter datafreame on the `min_items_per_user` value + ts_cut (bool, optional): + If true, filter datafreame on the `min_ts` value [TO-DO] + + Returns: + ratings (pd.Dataframe): filtered `ratings` pandas DataFrame + users_activity (pd.DataFrame): + Number of items each user interacted with + items_activity (pd.DataFrame): + Number of users interacted with each item. 
+ """ + if rating_cut: + ratings = ratings[ratings["rating"] >= self.min_rating].sort_values( + ["userId", "timestamp"] + ) + + movie_id = "movieId" + user_cnt_df = ( + ratings[[movie_id]] + .groupby(movie_id, as_index=False) + .size() + .rename(columns={"size": "user_cnt"}) + ) + user_id = "userId" + item_cnt_df = ( + ratings[[user_id]] + .groupby(user_id, as_index=False) + .size() + .rename(columns={"size": "item_cnt"}) + ) + + user_not_filtered = True + item_not_filtered = True + + while user_not_filtered or item_not_filtered: + ratings = ratings[ + ratings[movie_id].isin( + user_cnt_df.index[user_cnt_df["user_cnt"] >= self.min_users_per_item] + ) + ] + ratings = ratings[ + ratings[user_id].isin( + item_cnt_df.index[item_cnt_df["item_cnt"] >= self.min_items_per_user] + ) + ] + + user_cnt_df = ( + ratings[[movie_id]] + .groupby(movie_id, as_index=False) + .size() + .rename(columns={"size": "user_cnt"}) + ) + item_cnt_df = ( + ratings[[user_id]] + .groupby(user_id, as_index=False) + .size() + .rename(columns={"size": "item_cnt"}) + ) + + user_not_filtered = (user_cnt_df["user_cnt"] < self.min_users_per_item).any() + item_not_filtered = (item_cnt_df["item_cnt"] < self.min_items_per_user).any() + + users_activity = ( + ratings[["userId"]] + .groupby("userId", as_index=False) + .size() + .rename(columns={"size": "user_cnt"}) + ) + items_activity = ( + ratings[["movieId"]] + .groupby("movieId", as_index=False) + .size() + .rename(columns={"size": "item_cnt"}) + ) + return ratings, users_activity, items_activity + + def _split_by_users(self, ratings, users_activity): + """Split the ratings DataFrame into train and test + Randomly shuffle users and split + + Args: + ratings (pd.Dataframe): pandas DataFrame of the following format + userId movieId rating timestamp + 20 1 924 3.5 1094785598 + 19 1 919 3.5 1094785621 + 86 1 2683 3.5 1094785650 + 61 1 1584 3.5 1094785656 + 23 1 1079 4.0 1094785665 + users_activity (pd.DataFrame): + Number of items each user interacted with 
+ + Returns: + train_events (pd.Dataframe): pandas DataFrame for training data + test_events (pd.Dataframe): pandas DataFrame for training data + """ + idx_perm = np.random.permutation(users_activity.index.size) + unique_uid = users_activity.index[idx_perm] + n_users = unique_uid.size + + test_users = unique_uid[: int(n_users * self.test_prop)] + train_users = unique_uid[int(n_users * self.test_prop) :] + + train_events = ratings.loc[ratings["userId"].isin(train_users)] + test_events = ratings.loc[ratings["userId"].isin(test_users)] + + return (train_events, test_events) + + def _split_by_time(self, ratings): + """Split the ratings DataFrame into train and test by timestamp + Ratings[timestamp] extreme values used for the filtering interval + + Args: + ratings (pd.Dataframe): pandas DataFrame of the following format + userId movieId rating timestamp + 20 1 924 3.5 1094785598 + 19 1 919 3.5 1094785621 + 86 1 2683 3.5 1094785650 + 61 1 1584 3.5 1094785656 + 23 1 1079 4.0 1094785665 + + Returns: + train_events (pd.Dataframe): pandas DataFrame for training data + test_events (pd.Dataframe): pandas DataFrame for training data + """ + ts = ratings["timestamp"].sort_values() + ts_max = ts.max() + ts_min = ts.min() + ts_split = ts_min + (ts_max - ts_min) * self.test_prop + + train_events = ratings[ratings["timestamp"] > ts_split] + test_events = ratings[ratings["timestamp"] <= ts_split] + + return (train_events, test_events) + + def _fetch_movies(self, split_by="users"): + """ + Fetch data and save in the pytorch format + 1. Read the MovieLens20 data from raw archive + 2. Parse the rating dataset + 3. Split dataset into train and test + 4. Build user-item matrix interaction + 5. Save in the .pt with torch.save + + Args: + split_by (string, optional): the splittage method. 
+ `users` – split by users + `ts` - split by timestamp + + Raises: + ValueError: If `split_by` argument is not equal `users` or `ts` + """ + raw_data = self._read_raw_movielens_data() + + ratings = raw_data[1] + + # TO-DO: add error handling + ratings, users_activity, items_activity = self._parse(ratings) + self.users_activity = users_activity + self.items_activity = items_activity + + if split_by == "users": + train_raw, test_raw = self._split_by_users(ratings, users_activity) + if split_by == "ts": + train_raw, test_raw = self._split_by_time(ratings) + if split_by != "users" and split_by != "ts": + raise ValueError("Only splitting by users and ts supported") + + train = self._build_interaction_matrix(train_raw) + test = self._build_interaction_matrix(test_raw) + + with open(os.path.join(self.processed_folder, self.training_file), "wb") as f: + torch.save(train, f) + + with open(os.path.join(self.processed_folder, self.test_file), "wb") as f: + torch.save(test, f) + + +__all__ = ["MovieLens", "MovieLens20M"] diff --git a/catalyst/settings.py b/catalyst/settings.py index 64487caa51..3e85ed11f8 100644 --- a/catalyst/settings.py +++ b/catalyst/settings.py @@ -3,7 +3,8 @@ import logging import os -# from packaging.version import parse, Version +from packaging.version import Version + import torch from catalyst.extras.frozen_class import FrozenClass @@ -14,6 +15,10 @@ NUM_CUDA_DEVICES = torch.cuda.device_count() +def _is_torch_1_7_0(): + return Version(torch.__version__) >= Version("1.7.0") + + def _is_apex_avalilable(): try: import apex # noqa: F401 @@ -232,6 +237,8 @@ def __init__( # noqa: D107 log_batch_metrics: Optional[bool] = None, log_epoch_metrics: Optional[bool] = None, compute_per_class_metrics: Optional[bool] = None, + # [versions] + is_torch_1_7_0: Optional[bool] = None, # [to remove] nifti_required: Optional[bool] = None, albu_required: Optional[bool] = None, @@ -368,6 +375,11 @@ def __init__( # noqa: D107 or 
import os
import shutil

import pytest

from catalyst.settings import SETTINGS

if SETTINGS.ml_required and SETTINGS.is_torch_1_7_0:
    from catalyst.contrib.datasets import MovieLens20M

# root folder used by every test of this module
DATA_ROOT = "./tmp_data"

minversion = pytest.mark.skipif(
    not (SETTINGS.is_torch_1_7_0), reason="torch >= 1.7.0 is required"
)
requires_ml = pytest.mark.skipif(not (SETTINGS.ml_required), reason="No catalyst[ml] required")


def _remove_tmp_data():
    """Best-effort removal of the temp folder; a missing folder is fine."""
    try:
        shutil.rmtree(DATA_ROOT)
    except FileNotFoundError:
        pass
    except OSError as e:
        print("Error! Code: {c}, Message, {m}".format(c=type(e).__name__, m=str(e)))


def _assert_dataset_layout():
    """Check the folders and one raw file created by the dataset."""
    filename = "ml-20m"

    # check if data folder exists
    assert os.path.isdir("./tmp_data") is True

    # check if class folder exists
    assert os.path.isdir("./tmp_data/MovieLens20M") is True

    # check if raw folder exists
    assert os.path.isdir("./tmp_data/MovieLens20M/raw") is True

    # check if processed folder exists
    assert os.path.isdir("./tmp_data/MovieLens20M/processed") is True

    # check some file from the MovieLens archive
    raw_file = "./tmp_data/MovieLens20M/raw/{}/genome-scores.csv".format(filename)
    assert os.path.isfile(raw_file) is True

    # check that the data file is not empty
    assert os.path.getsize(raw_file) > 0


def setup_module():
    """
    Remove the temp folder if exists
    """
    _remove_tmp_data()


@minversion
@requires_ml
def test_download_split_by_user():
    """
    Test the MovieLens20M download with the split by users
    """
    MovieLens20M(DATA_ROOT, download=True, sample=True)

    _assert_dataset_layout()


@minversion
@requires_ml
def test_download_split_by_ts():
    """
    Test the MovieLens20M download with the split by timestamp
    """
    MovieLens20M(DATA_ROOT, download=True, split="ts", sample=True)

    _assert_dataset_layout()


@minversion
@requires_ml
def test_minimal_ranking():
    """
    Test that ratings below ``min_rating`` are filtered out
    """
    min_rating = 2.0
    movielens_20m_min_two = MovieLens20M(
        DATA_ROOT, download=True, min_rating=min_rating, sample=True, n_rows=1000000
    )

    for row_index in (1, 3, 7):
        values = movielens_20m_min_two[row_index]._values()
        assert 1 not in values.unique()
        # every stored rating passes the threshold (true for an empty row)
        assert bool((values >= min_rating).all())


@minversion
@requires_ml
def test_users_per_item_filtering():
    """
    Test that every kept item has at least ``min_users_per_item`` users
    """
    min_users_per_item = 2.0

    movielens_20m_min_users = MovieLens20M(
        DATA_ROOT,
        download=True,
        min_users_per_item=min_users_per_item,
        sample=True,
        n_rows=1000000,
    )

    # ``items_activity`` holds the number of users per item
    assert (movielens_20m_min_users.items_activity["item_cnt"] >= min_users_per_item).all()


@minversion
@requires_ml
def test_items_per_user_filtering():
    """
    Test that every kept user has at least ``min_items_per_user`` items
    """
    min_items_per_user = 2.0
    min_users_per_item = 1.0
    movielens_20m_min_users = MovieLens20M(
        DATA_ROOT,
        download=True,
        min_items_per_user=min_items_per_user,
        min_users_per_item=min_users_per_item,
        sample=True,
        n_rows=1000000,
    )

    # ``users_activity`` holds the number of items per user
    assert (movielens_20m_min_users.users_activity["user_cnt"] >= min_items_per_user).all()


def teardown_module():
    """
    Remove temporary files after test execution
    """
    _remove_tmp_data()