Ray Train failed - PermissionError on Windows #252

@yashpaneliya

Description

Hello,

I am getting a PermissionError while running trainer.fit(). I am running the code on a personal Windows laptop.

Code:

# Imports (load_data, stratify_split, CustomPreprocessor, train_loop_per_worker,
# num_classes, num_workers, resources_per_worker, and test_size are defined in
# earlier notebook cells)
from pathlib import Path

import ray
from ray.train import CheckpointConfig, DataConfig, RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer

# Train loop config
train_loop_config = {
    "dropout_p": 0.5,
    "lr": 1e-2,
    "lr_factor": 0.8,
    "lr_patience": 3,
    "num_epochs": 5,
    "batch_size": 32,
    "num_classes": num_classes,
}

# Scaling config
scaling_config = ScalingConfig(
    num_workers=num_workers,
    use_gpu=bool(resources_per_worker["GPU"]),
    resources_per_worker=resources_per_worker
)

# Run config
checkpoint_config = CheckpointConfig(num_to_keep=1, checkpoint_score_attribute="val_loss", checkpoint_score_order="min")
run_config = RunConfig(name="llm", checkpoint_config=checkpoint_config, storage_path=Path('./').resolve())

# Dataset
ds = load_data()
train_ds, val_ds = stratify_split(ds, stratify="tag", test_size=test_size)

# Preprocess
preprocessor = CustomPreprocessor()
preprocessor = preprocessor.fit(train_ds)
train_ds = preprocessor.transform(train_ds)
val_ds = preprocessor.transform(val_ds)
train_ds = train_ds.materialize()
val_ds = val_ds.materialize()

# Dataset config
options = ray.data.ExecutionOptions(preserve_order=True)
dataset_config = DataConfig(
    datasets_to_split=["train"],
    execution_options=options)

# Trainer
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=train_loop_config,
    scaling_config=scaling_config,
    run_config=run_config,
    datasets={"train": train_ds, "val": val_ds},
    dataset_config=dataset_config,
    metadata={"class_to_index": preprocessor.class_to_index}
)

%%time
# Train
results = trainer.fit()
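
For context: train_loop_per_worker is defined in an earlier cell and is not shown here; the traceback below points at its train.report call. A minimal sketch of the checkpoint-reporting pattern that call implies (the model and loss below are placeholders, not the original code):

import os
import tempfile

import torch
import torch.nn as nn
from ray import train
from ray.train import Checkpoint

def train_loop_per_worker(config):
    # Placeholder model and loss so the sketch runs; the real loop trains
    # the model and computes a validation loss each epoch.
    model = nn.Linear(8, config["num_classes"])
    for epoch in range(config["num_epochs"]):
        val_loss = 0.0  # stand-in for the real validation loss

        # Save the epoch checkpoint into a fresh temp dir, then report it.
        # torch.save closes its file handle before train.report copies the
        # directory into storage_path; on Windows, an open handle on
        # model.pt would make that copy fail with WinError 32.
        with tempfile.TemporaryDirectory() as tmp_dir:
            torch.save(model.state_dict(), os.path.join(tmp_dir, "model.pt"))
            train.report(
                {"val_loss": val_loss},
                checkpoint=Checkpoint.from_directory(tmp_dir),
            )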

Error Log:

2024-01-18 11:37:34,621	ERROR tune_controller.py:1374 -- Trial task failed for trial TorchTrainer_10734_00000
Traceback (most recent call last):
  File "d:\MLOps\mlenv\lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "d:\MLOps\mlenv\lib\site-packages\ray\_private\auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "d:\MLOps\mlenv\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "d:\MLOps\mlenv\lib\site-packages\ray\_private\worker.py", line 2624, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(PermissionError): ray::_Inner.train() (pid=16812, ip=127.0.0.1, actor_id=9fe2423e0cd57b52c144a08a01000000, repr=TorchTrainer)
  File "python\ray\_raylet.pyx", line 1813, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1754, in ray._raylet.execute_task.function_executor
  File "d:\MLOps\mlenv\lib\site-packages\ray\_private\function_manager.py", line 726, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "d:\MLOps\mlenv\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
    return method(self, *_args, **_kwargs)
  File "d:\MLOps\mlenv\lib\site-packages\ray\tune\trainable\trainable.py", line 342, in train
    raise skipped from exception_cause(skipped)
  File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\utils.py", line 43, in check_for_failure
    ray.get(object_ref)
  File "d:\MLOps\mlenv\lib\site-packages\ray\_private\auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "d:\MLOps\mlenv\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "d:\MLOps\mlenv\lib\site-packages\ray\_private\worker.py", line 2624, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(PermissionError): ray::_RayTrainWorker__execute.get_next() (pid=9492, ip=127.0.0.1, actor_id=bc7d4a1c3843a673dd0e184f01000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x0000023D4BB79090>)
  File "python\ray\_raylet.pyx", line 1813, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1754, in ray._raylet.execute_task.function_executor
  File "d:\MLOps\mlenv\lib\site-packages\ray\_private\function_manager.py", line 726, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "d:\MLOps\mlenv\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
    return method(self, *_args, **_kwargs)
  File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\worker_group.py", line 33, in __execute
    raise skipped from exception_cause(skipped)
  File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\utils.py", line 118, in discard_return_wrapper
    train_func(*args, **kwargs)
  File "C:\Users\YASH\AppData\Local\Temp\ipykernel_10888\3384315726.py", line 44, in train_loop_per_worker
  File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\session.py", line 644, in wrapper
    return fn(*args, **kwargs)
  File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\session.py", line 706, in report
    _get_session().report(metrics, checkpoint=checkpoint)
  File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\session.py", line 417, in report
    persisted_checkpoint = self.storage.persist_current_checkpoint(checkpoint)
  File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\storage.py", line 558, in persist_current_checkpoint
    _pyarrow_fs_copy_files(
  File "d:\MLOps\mlenv\lib\site-packages\ray\train\_internal\storage.py", line 110, in _pyarrow_fs_copy_files
    return pyarrow.fs.copy_files(
  File "d:\MLOps\mlenv\lib\site-packages\pyarrow\fs.py", line 244, in copy_files
    _copy_files_selector(source_fs, source_sel,
  File "pyarrow\_fs.pyx", line 1229, in pyarrow._fs._copy_files_selector
  File "pyarrow\error.pxi", line 110, in pyarrow.lib.check_status
PermissionError: [WinError 32] Failed copying 'C:/Users/YASH/AppData/Local/Temp/tmpsm8_7sn1/model.pt' to 'D:/MLOps/Made-With-ML/notebooks/llm/TorchTrainer_10734_00000_0_2024-01-18_11-31-47/checkpoint_000000/model.pt'. Detail: [Windows error 32] The process cannot access the file because it is being used by another process.
2024-01-18 11:37:38,459	ERROR tune.py:1038 -- Trials did not complete: [TorchTrainer_10734_00000]
2024-01-18 11:37:38,466	INFO tune.py:1042 -- Total run time: 351.06 seconds (347.16 seconds for the tuning loop).
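
The failing operation is pyarrow.fs.copy_files moving model.pt from the temporary checkpoint directory into storage_path. WinError 32 is a Windows sharing violation: some other handle still has model.pt open when the copy runs (antivirus or cloud-sync tools scanning the temp directory are common culprits). The same failure mode can be reproduced outside Ray; a minimal Windows-only demonstration, with an illustrative file name:

import shutil
import tempfile

# NamedTemporaryFile keeps an exclusive handle on Windows, so any attempt
# to open the file again -- including shutil.copy reading it -- raises
# PermissionError: [WinError 32] while the handle is live.
with tempfile.NamedTemporaryFile(suffix=".pt") as f:
    f.write(b"weights")
    f.flush()
    shutil.copy(f.name, "model_copy.pt")  # fails on Windows with WinError 32

If the lock is transient, excluding the storage path from real-time antivirus scanning may help; Ray's Windows support has also long been flagged as experimental, so running under WSL is a commonly suggested fallback.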

Directory: [screenshot of the run's output directory attached in the original issue]
