python dask dask-distributed rapids

RuntimeError: Cluster failed to start with the Dask LocalCUDACluster example setup


I am new to Dask and I ran into problems when executing the following example code:

from dask.distributed import Client
from dask_cuda import LocalCUDACluster
cluster = LocalCUDACluster()
client = Client(cluster)

I get the following error:

AttributeError                            Traceback (most recent call last)
File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/deploy/spec.py:319, in SpecCluster._start(self)
    318     cls = import_term(cls)
--> 319 self.scheduler = cls(**self.scheduler_spec.get("options", {}))
    320 self.scheduler = await self.scheduler

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/scheduler.py:3481, in Scheduler.__init__(self, loop, delete_interval, synchronize_worker_interval, services, service_kwargs, allowed_failures, extensions, validate, scheduler_file, security, worker_ttl, idle_timeout, interface, host, port, protocol, dashboard_address, dashboard, http_prefix, preload, preload_argv, plugins, contact_address, transition_counter_max, jupyter, **kwargs)
   3480 if show_dashboard:
-> 3481     distributed.dashboard.scheduler.connect(
   3482         self.http_application, self.http_server, self, prefix=http_prefix
   3483     )
   3484 self.jupyter = jupyter

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/dashboard/scheduler.py:158, in connect(application, http_server, scheduler, prefix)
    156 def connect(application, http_server, scheduler, prefix=""):
    157     bokeh_app = BokehApplication(
--> 158         applications, scheduler, prefix=prefix, template_variables=template_variables()
    159     )
    160     application.add_application(bokeh_app)

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/cytoolz/functoolz.pyx:475, in cytoolz.functoolz._memoize.__call__()

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/dashboard/scheduler.py:131, in template_variables()
    123 from distributed.diagnostics.nvml import device_get_count
    125 template_variables = {
    126     "pages": [
    127         "status",
    128         "workers",
    129         "tasks",
    130         "system",
--> 131         *(["gpu"] if device_get_count() > 0 else []),
    132         "profile",
    133         "graph",
    134         "groups",
    135         "info",
    136     ],
    137     "plots": [
    138         {
    139             "url": x.strip("/"),
    140             "name": " ".join(x.strip("/").split("-")[1:])
    141             .title()
    142             .replace("Cpu", "CPU")
    143             .replace("Gpu", "GPU"),
    144         }
    145         for x in applications
    146         if "individual" in x
    147     ]
    148     + [{"url": "hardware", "name": "Hardware"}],
    149 }
    150 template_variables["plots"] = sorted(
    151     template_variables["plots"], key=lambda d: d["name"]
    152 )

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/diagnostics/nvml.py:126, in device_get_count()
    125 def device_get_count():
--> 126     init_once()
    127     if not is_initialized():

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/diagnostics/nvml.py:108, in init_once()
    105     return
    107 if _in_wsl() and parse_version(
--> 108     pynvml.nvmlSystemGetDriverVersion().decode()
    109 ) < parse_version(MINIMUM_WSL_VERSION):
    110     NVML_STATE = NVMLState.DISABLED_WSL_INSUFFICIENT_DRIVER

AttributeError: 'str' object has no attribute 'decode'

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
Cell In[22], line 3
      1 from dask_cuda import LocalCUDACluster
----> 3 cluster = LocalCUDACluster()
      4 client = Client(cluster)

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/dask_cuda/local_cuda_cluster.py:336, in LocalCUDACluster.__init__(self, CUDA_VISIBLE_DEVICES, n_workers, threads_per_worker, memory_limit, device_memory_limit, data, local_directory, shared_filesystem, protocol, enable_tcp_over_ucx, enable_infiniband, enable_nvlink, enable_rdmacm, rmm_pool_size, rmm_maximum_pool_size, rmm_managed_memory, rmm_async, rmm_log_directory, rmm_track_allocations, jit_unspill, log_spilling, worker_class, pre_import, **kwargs)
    329     worker_class = partial(
    330         LoggedNanny if log_spilling is True else Nanny,
    331         worker_class=worker_class,
    332     )
    334 self.pre_import = pre_import
--> 336 super().__init__(
    337     n_workers=0,
    338     threads_per_worker=threads_per_worker,
    339     memory_limit=self.memory_limit,
    340     processes=True,
    341     data=data,
    342     local_directory=local_directory,
    343     protocol=protocol,
    344     worker_class=worker_class,
    345     config={
    346         "distributed.comm.ucx": get_ucx_config(
    347             enable_tcp_over_ucx=enable_tcp_over_ucx,
    348             enable_nvlink=enable_nvlink,
    349             enable_infiniband=enable_infiniband,
    350             enable_rdmacm=enable_rdmacm,
    351         )
    352     },
    353     **kwargs,
    354 )
    356 self.new_spec["options"]["preload"] = self.new_spec["options"].get(
    357     "preload", []
    358 ) + ["dask_cuda.initialize"]
    359 self.new_spec["options"]["preload_argv"] = self.new_spec["options"].get(
    360     "preload_argv", []
    361 ) + ["--create-cuda-context"]

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/deploy/local.py:253, in LocalCluster.__init__(self, name, n_workers, threads_per_worker, processes, loop, start, host, ip, scheduler_port, silence_logs, dashboard_address, worker_dashboard_address, diagnostics_port, services, worker_services, service_kwargs, asynchronous, security, protocol, blocked_handlers, interface, worker_class, scheduler_kwargs, scheduler_sync_interval, **worker_kwargs)
    250 worker = {"cls": worker_class, "options": worker_kwargs}
    251 workers = {i: worker for i in range(n_workers)}
--> 253 super().__init__(
    254     name=name,
    255     scheduler=scheduler,
    256     workers=workers,
    257     worker=worker,
    258     loop=loop,
    259     asynchronous=asynchronous,
    260     silence_logs=silence_logs,
    261     security=security,
    262     scheduler_sync_interval=scheduler_sync_interval,
    263 )

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/deploy/spec.py:286, in SpecCluster.__init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name, shutdown_on_close, scheduler_sync_interval)
    284 if not called_from_running_loop:
    285     self._loop_runner.start()
--> 286     self.sync(self._start)
    287     try:
    288         self.sync(self._correct_state)

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/utils.py:338, in SyncMethodMixin.sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    336     return future
    337 else:
--> 338     return sync(
    339         self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    340     )

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/utils.py:405, in sync(loop, func, callback_timeout, *args, **kwargs)
    403 if error:
    404     typ, exc, tb = error
--> 405     raise exc.with_traceback(tb)
    406 else:
    407     return result

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/utils.py:378, in sync.<locals>.f()
    376         future = asyncio.wait_for(future, callback_timeout)
    377     future = asyncio.ensure_future(future)
--> 378     result = yield future
    379 except Exception:
    380     error = sys.exc_info()

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/tornado/gen.py:769, in Runner.run(self)
    766 exc_info = None
    768 try:
--> 769     value = future.result()
    770 except Exception:
    771     exc_info = sys.exc_info()

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/deploy/spec.py:330, in SpecCluster._start(self)
    328 self.status = Status.failed
    329 await self._close()
--> 330 raise RuntimeError(f"Cluster failed to start: {e}") from e

RuntimeError: Cluster failed to start: 'str' object has no attribute 'decode'

My Dask version is 2023.2.0.

I tried reinstalling RAPIDS, downgrading my Python version from 3.10 to 3.8, and passing different parameters to LocalCUDACluster(), but none of these worked.


Solution

  • There was an unexpected breaking change in pynvml that impacted dask-cuda. dask-cuda has issued a hotfix release (23.02.01) to address this in the stable release. (A quick check for the underlying pynvml behaviour is sketched at the end of this answer.)

    I see you're using the nightly packages, where this should already be resolved by this PR. I'm not able to reproduce your issue in the following environment: mamba create -n rapids-23.04 -c rapidsai-nightly -c nvidia -c conda-forge rapids=23.04 python=3.8 cudatoolkit=11.5 jupyterlab strings_udf.

    If you still experience this problem in a fresh environment, please file a dask-cuda GitHub issue.
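
    To make the root cause concrete: the traceback ends in distributed/diagnostics/nvml.py calling pynvml.nvmlSystemGetDriverVersion().decode(), which only works when pynvml returns bytes. The breaking pynvml change made that call return a str, hence the AttributeError. Below is a minimal sketch (assuming pynvml is importable and an NVIDIA driver is available) to check which behaviour your environment has:

    # Check whether the installed pynvml returns str or bytes from
    # nvmlSystemGetDriverVersion(); a str here is what makes the extra
    # .decode() in distributed fail with the AttributeError shown above.
    import pynvml

    pynvml.nvmlInit()
    try:
        version = pynvml.nvmlSystemGetDriverVersion()
        print(type(version), version)  # <class 'str'> on affected pynvml versions
    finally:
        pynvml.nvmlShutdown()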