-
Notifications
You must be signed in to change notification settings - Fork 6.6k
Open
Description
Hello,
I am getting a PermissionError while running trainer.fit(). I am running the code on a personal laptop.
Code:
# Train loop config
train_loop_config = {
"dropout_p": 0.5,
"lr": 1e-2,
"lr_factor": 0.8,
"lr_patience": 3,
"num_epochs": 5,
"batch_size": 32,
"num_classes": num_classes,
}
# Scaling config
scaling_config = ScalingConfig(
num_workers=num_workers,
use_gpu=bool(resources_per_worker["GPU"]),
resources_per_worker=resources_per_worker
)
# Run config
checkpoint_config = CheckpointConfig(num_to_keep=1, checkpoint_score_attribute="val_loss", checkpoint_score_order="min")
run_config = RunConfig(name="llm", checkpoint_config=checkpoint_config, storage_path=Path('./').resolve())
# Dataset
ds = load_data()
train_ds, val_ds = stratify_split(ds, stratify="tag", test_size=test_size)
# Preprocess
preprocessor = CustomPreprocessor()
preprocessor = preprocessor.fit(train_ds)
train_ds = preprocessor.transform(train_ds)
val_ds = preprocessor.transform(val_ds)
train_ds = train_ds.materialize()
val_ds = val_ds.materialize()
# Dataset config
options = ray.data.ExecutionOptions(preserve_order=True)
dataset_config = DataConfig(
datasets_to_split=["train"],
execution_options=options)
# Trainer
trainer = TorchTrainer(
train_loop_per_worker=train_loop_per_worker,
train_loop_config=train_loop_config,
scaling_config=scaling_config,
run_config=run_config,
datasets={"train": train_ds, "val": val_ds},
dataset_config=dataset_config,
metadata={"class_to_index": preprocessor.class_to_index}
)
%%time
# Train
results = trainer.fit()
Error Log:
2024-01-18 11:37:34,621 ERROR tune_controller.py:1374 -- Trial task failed for trial TorchTrainer_10734_00000
Traceback (most recent call last):
File "d:\MLOps\mlenv\lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
result = ray.get(future)
File "d:\MLOps\mlenv\lib\site-packages\ray\_private\auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "d:\MLOps\mlenv\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "d:\MLOps\mlenv\lib\site-packages\ray\_private\worker.py", line 2624, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(PermissionError): ray::_Inner.train() (pid=16812, ip=127.0.0.1, actor_id=9fe2423e0cd57b52c144a08a01000000, repr=TorchTrainer)
File "python\ray\_raylet.pyx", line 1813, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 1754, in ray._raylet.execute_task.function_executor
File "d:\MLOps\mlenv\lib\site-packages\ray\_private\function_manager.py", line 726, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "d:\MLOps\mlenv\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "d:\MLOps\mlenv\lib\site-packages\ray\tune\trainable\trainable.py", line 342, in train
raise skipped from exception_cause(skipped)
File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\utils.py", line 43, in check_for_failure
ray.get(object_ref)
File "d:\MLOps\mlenv\lib\site-packages\ray\_private\auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
File "d:\MLOps\mlenv\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "d:\MLOps\mlenv\lib\site-packages\ray\_private\worker.py", line 2624, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(PermissionError): ray::_RayTrainWorker__execute.get_next() (pid=9492, ip=127.0.0.1, actor_id=bc7d4a1c3843a673dd0e184f01000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x0000023D4BB79090>)
File "python\ray\_raylet.pyx", line 1813, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 1754, in ray._raylet.execute_task.function_executor
File "d:\MLOps\mlenv\lib\site-packages\ray\_private\function_manager.py", line 726, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "d:\MLOps\mlenv\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\worker_group.py", line 33, in __execute
raise skipped from exception_cause(skipped)
File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\utils.py", line 118, in discard_return_wrapper
train_func(*args, **kwargs)
File "C:\Users\YASH\AppData\Local\Temp\ipykernel_10888\3384315726.py", line 44, in train_loop_per_worker
File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\session.py", line 644, in wrapper
return fn(*args, **kwargs)
File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\session.py", line 706, in report
_get_session().report(metrics, checkpoint=checkpoint)
File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\session.py", line 417, in report
persisted_checkpoint = self.storage.persist_current_checkpoint(checkpoint)
File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\storage.py", line 558, in persist_current_checkpoint
_pyarrow_fs_copy_files(
File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\storage.py", line 110, in _pyarrow_fs_copy_files
return pyarrow.fs.copy_files(
File "d:\MLOps\mlenv\lib\site-packages\pyarrow\fs.py", line 244, in copy_files
_copy_files_selector(source_fs, source_sel,
File "pyarrow\_fs.pyx", line 1229, in pyarrow._fs._copy_files_selector
File "pyarrow\error.pxi", line 110, in pyarrow.lib.check_status
PermissionError: [WinError 32] Failed copying 'C:/Users/YASH/AppData/Local/Temp/tmpsm8_7sn1/model.pt' to 'D:/MLOps/Made-With-ML/notebooks/llm/TorchTrainer_10734_00000_0_2024-01-18_11-31-47/checkpoint_000000/model.pt'. Detail: [Windows error 32] The process cannot access the file because it is being used by another process.
2024-01-18 11:37:38,459 ERROR tune.py:1038 -- Trials did not complete: [TorchTrainer_10734_00000]
2024-01-18 11:37:38,466 INFO tune.py:1042 -- Total run time: 351.06 seconds (347.16 seconds for the tuning loop).
Metadata
Metadata
Assignees
Labels
No labels