I am trying to use cudf.pandas in a notebook on Kaggle and am running into a long error message when enabling the GPU with GridSearchCV. The main issue is an AttributeError on a DataFrame.
The code works fine if I remove the `%load_ext cudf.pandas` directive.
!nvidia-smi
Tue Jun 18 17:31:56 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |
| N/A 46C P8 10W / 70W | 0MiB / 15360MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 1 Tesla T4 Off | 00000000:00:05.0 Off | 0 |
| N/A 40C P8 9W / 70W | 0MiB / 15360MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
import cudf
print("cuDF version: ", cudf.__version__)
cuDF version: 24.04.01
%load_ext cudf.pandas
import os
import gc
import numpy as np
import pandas as pd
import joblib
# import rdkit
# from rdkit.Chem import (AllChem, MolFromSmiles, rdMolDescriptors,
# Descriptors, rdmolfiles, MolFromPDBFile)
import matplotlib.pyplot as plt
import seaborn as sns
# import lightgbm as lgb
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import average_precision_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import (accuracy_score, precision_score,
recall_score, f1_score, classification_report)
from xgboost import XGBClassifier
import shap
# Parameter grid for the cross-validation exercise.
# 6 n_estimators x 5 learning_rate x 6 max_depth = 180 candidate combinations,
# which matches the "180 candidates" reported in the grid-search output.
hyperparam_grid = {
'n_estimators': [400, 600, 800, 1000, 1200, 1500],
'learning_rate': [0.05, 0.07, 0.1, 0.13, 0.15],
'max_depth': [3, 5, 7, 9, 10, 11],
}
# Create one model per protein and run a grid-search cross-validation for each.
# NOTE(review): `proteins`, `model` and `spw` are not defined in this snippet;
# presumably `model` and `spw` are dicts keyed by protein name -- confirm
# against the rest of the notebook.
for p in proteins:
# Instantiate the XGBoost classifier; device='cuda' together with
# tree_method='hist' requests GPU training from XGBoost.
model[p] = XGBClassifier(scale_pos_weight=spw[p],
random_state=13,
tree_method='hist',
device='cuda'
)
print('Model', p)
print('Running grid search cross validation....')
# Set up the grid-search object with 4-fold cross-validation
# (180 candidates x 4 folds = 720 fits, as reported in the output).
# n_jobs=-1 makes scikit-learn dispatch the fits to joblib worker processes;
# the traceback shows the AttributeError being raised inside a loky worker.
# NOTE(review): the snippet ends before the fit call; the traceback points at
# BaseSearchCV.fit, so presumably gs_cv.fit(...) follows -- not shown here.
gs_cv = GridSearchCV(estimator=model[p],
param_grid=hyperparam_grid,
scoring='average_precision',
cv=4,
return_train_score=True,
n_jobs=-1,
verbose=1)
The Error Message:
Model sEH
Running grid search cross validation....
Fitting 4 folds for each of 180 candidates, totalling 720 fits
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
r = call_item()
File "/opt/conda/lib/python3.10/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__
return self.fn(*self.args, **self.kwargs)
File "/opt/conda/lib/python3.10/site-packages/joblib/parallel.py", line 598, in __call__
return [func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/joblib/parallel.py", line 598, in <listcomp>
return [func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py", line 123, in __call__
return self.function(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 678, in _fit_and_score
X_train, y_train = _safe_split(estimator, X, y, train)
File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/metaestimators.py", line 233, in _safe_split
X_subset = _safe_indexing(X, indices)
File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/__init__.py", line 354, in _safe_indexing
return _pandas_indexing(X, indices, indices_dtype, axis=axis)
File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/__init__.py", line 196, in _pandas_indexing
return X.take(key, axis=axis)
File "/opt/conda/lib/python3.10/site-packages/pandas/core/generic.py", line 4133, in take
new_data = self._mgr.take(
File "/opt/conda/lib/python3.10/site-packages/pandas/core/generic.py", line 6299, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute '_mgr'
"""
The above exception was the direct cause of the following exception:
AttributeError Traceback (most recent call last)
File <timed exec>:20
File /opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_search.py:874, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
868 results = self._format_results(
869 all_candidate_params, n_splits, all_out, all_more_results
870 )
872 return results
--> 874 self._run_search(evaluate_candidates)
876 # multimetric is determined here because in the case of a callable
877 # self.scoring the return type is only known after calling
878 first_test_score = all_out[0]["test_scores"]
File /opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_search.py:1388, in GridSearchCV._run_search(self, evaluate_candidates)
1386 def _run_search(self, evaluate_candidates):
1387 """Search all candidates in param_grid"""
-> 1388 evaluate_candidates(ParameterGrid(self.param_grid))
File /opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_search.py:821, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
813 if self.verbose > 0:
814 print(
815 "Fitting {0} folds for each of {1} candidates,"
816 " totalling {2} fits".format(
817 n_splits, n_candidates, n_candidates * n_splits
818 )
819 )
--> 821 out = parallel(
822 delayed(_fit_and_score)(
823 clone(base_estimator),
824 X,
825 y,
826 train=train,
827 test=test,
828 parameters=parameters,
829 split_progress=(split_idx, n_splits),
830 candidate_progress=(cand_idx, n_candidates),
831 **fit_and_score_kwargs,
832 )
833 for (cand_idx, parameters), (split_idx, (train, test)) in product(
834 enumerate(candidate_params), enumerate(cv.split(X, y, groups))
835 )
836 )
838 if len(out) < 1:
839 raise ValueError(
840 "No fits were performed. "
841 "Was the CV iterator empty? "
842 "Were there no candidates?"
843 )
File /opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py:63, in Parallel.__call__(self, iterable)
58 config = get_config()
59 iterable_with_config = (
60 (_with_config(delayed_func, config), args, kwargs)
61 for delayed_func, args, kwargs in iterable
62 )
---> 63 return super().__call__(iterable_with_config)
File /opt/conda/lib/python3.10/site-packages/joblib/parallel.py:2007, in Parallel.__call__(self, iterable)
2001 # The first item from the output is blank, but it makes the interpreter
2002 # progress until it enters the Try/Except block of the generator and
2003 # reaches the first `yield` statement. This starts the asynchronous
2004 # dispatch of the tasks to the workers.
2005 next(output)
-> 2007 return output if self.return_generator else list(output)
File /opt/conda/lib/python3.10/site-packages/joblib/parallel.py:1650, in Parallel._get_outputs(self, iterator, pre_dispatch)
1647 yield
1649 with self._backend.retrieval_context():
-> 1650 yield from self._retrieve()
1652 except GeneratorExit:
1653 # The generator has been garbage collected before being fully
1654 # consumed. This aborts the remaining tasks if possible and warn
1655 # the user if necessary.
1656 self._exception = True
File /opt/conda/lib/python3.10/site-packages/joblib/parallel.py:1754, in Parallel._retrieve(self)
1747 while self._wait_retrieval():
1748
1749 # If the callback thread of a worker has signaled that its task
1750 # triggered an exception, or if the retrieval loop has raised an
1751 # exception (e.g. `GeneratorExit`), exit the loop and surface the
1752 # worker traceback.
1753 if self._aborting:
-> 1754 self._raise_error_fast()
1755 break
1757 # If the next job is not ready for retrieval yet, we just wait for
1758 # async callbacks to progress.
File /opt/conda/lib/python3.10/site-packages/joblib/parallel.py:1789, in Parallel._raise_error_fast(self)
1785 # If this error job exists, immediately raise the error by
1786 # calling get_result. This job might not exists if abort has been
1787 # called directly or if the generator is gc'ed.
1788 if error_job is not None:
-> 1789 error_job.get_result(self.timeout)
File /opt/conda/lib/python3.10/site-packages/joblib/parallel.py:745, in BatchCompletionCallBack.get_result(self, timeout)
739 backend = self.parallel._backend
741 if backend.supports_retrieve_callback:
742 # We assume that the result has already been retrieved by the
743 # callback thread, and is stored internally. It's just waiting to
744 # be returned.
--> 745 return self._return_or_raise()
747 # For other backends, the main thread needs to run the retrieval step.
748 try:
File /opt/conda/lib/python3.10/site-packages/joblib/parallel.py:763, in BatchCompletionCallBack._return_or_raise(self)
761 try:
762 if self.status == TASK_ERROR:
--> 763 raise self._result
764 return self._result
765 finally:
AttributeError: 'DataFrame' object has no attribute '_mgr'
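As of now (June 2024), joblib does not support cudf.pandas because joblib's loky backend does not respect the current process's `sys.meta_path` when spawning new processes (similar issues exist for the multiprocessing backend, if I recall correctly). The worker processes therefore start without the import hooks that cudf.pandas installs, so the proxy DataFrame passed to them is handled by plain pandas code that expects internals such as `_mgr`, which is the AttributeError you are seeing. Here is a link with further discussion: https://github.com/rapidsai/cudf-pandas-integration/pull/82
This PR should help fix the problem in joblib, but it has stalled: https://github.com/joblib/joblib/pull/1525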
Try changing `n_jobs=-1` to `n_jobs=None` so that the grid search runs as a single job in the current process. I believe this will work around the joblib failures.