[SOLVED] Throw exception 'cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED' in training ONNX's pretrained model Emotion FerPlus

I am testing to train Emotion FerPlus emotion recognition model. Training has cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED error. I am using Nvidia GPU TitanRTX 24G. Then change the minibatch_size from 32 to 1. But still have error. I am using CNTK-GPU docker. The complete error messages are
About to throw exception 'cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED ; GPU=0 ; hostname=d9150da5d531 ; expr=cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out))'
cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED ; GPU=0 ; hostname=d9150da5d531 ; expr=cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out))
Traceback (most recent call last):
  File "train.py", line 193, in <module>
    main(args.base_folder, args.training_mode)
  File "train.py", line 124, in main
    trainer.train_minibatch({input_var : images, label_var : labels})
  File "/root/anaconda3/envs/cntk-py35/lib/python3.5/site-packages/cntk/train/trainer.py", line 184, in train_minibatch
    device)
  File "/root/anaconda3/envs/cntk-py35/lib/python3.5/site-packages/cntk/cntk_py.py", line 3065, in train_minibatch
    return _cntk_py.Trainer_train_minibatch(self, *args)
RuntimeError: cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED ; GPU=0 ; hostname=d9150da5d531 ; expr=cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out))

[CALL STACK]
[0x7fc04da7ce89]                                                       + 0x732e89
[0x7fc045a71aaf]                                                       + 0xeabaaf
[0x7fc045a7b613]    Microsoft::MSR::CNTK::CuDnnConvolutionEngine<float>::  ForwardCore  (Microsoft::MSR::CNTK::Matrix<float> const&,  Microsoft::MSR::CNTK::Matrix<float> const&,  Microsoft::MSR::CNTK::Matrix<float>&,  Microsoft::MSR::CNTK::Matrix<float>&) + 0x1a3
[0x7fc04dd4f8d3]    Microsoft::MSR::CNTK::ConvolutionNode<float>::  ForwardProp  (Microsoft::MSR::CNTK::FrameRange const&) + 0xa3
[0x7fc04dfba654]    Microsoft::MSR::CNTK::ComputationNetwork::PARTraversalFlowControlNode::  ForwardProp  (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&,  Microsoft::MSR::CNTK::FrameRange const&) + 0xf4
[0x7fc04dcb6e33]    std::_Function_handler<void (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&),void Microsoft::MSR::CNTK::ComputationNetwork::ForwardProp<std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>>>(std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>> const&)::{lambda(std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&)#1}>::  _M_invoke  (std::_Any_data const&,  std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&) + 0x63
[0x7fc04dd04ed9]    void Microsoft::MSR::CNTK::ComputationNetwork::  TravserseInSortedGlobalEvalOrder  <std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>>>(std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>> const&,  std::function<void (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&)> const&) + 0x5b9
[0x7fc04dca64da]    CNTK::CompositeFunction::  Forward  (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&,  std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&,  CNTK::DeviceDescriptor const&,  std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&,  std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&) + 0x15da
[0x7fc04dc3d603]    CNTK::Function::  Forward  (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&,  std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&,  CNTK::DeviceDescriptor const&,  std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&,  std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&) + 0x93
[0x7fc04ddbf91b]    CNTK::Trainer::  ExecuteForwardBackward  (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&,  std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&,  CNTK::DeviceDescriptor const&,  std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&) + 0x36b
[0x7fc04ddc06e4]    CNTK::Trainer::  TrainLocalMinibatch  (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&,  std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&,  bool,  CNTK::DeviceDescriptor const&) + 0x94
[0x7fc04ddc178a]    CNTK::Trainer::  TrainMinibatch  (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&,  bool,  std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&,  CNTK::DeviceDescriptor const&) + 0x5a
[0x7fc04ddc1852]    CNTK::Trainer::  TrainMinibatch  (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&,  bool,  CNTK::DeviceDescriptor const&) + 0x52
[0x7fc04eb2db22]                                                       + 0x229b22
[0x7fc057ea15e9]    PyCFunction_Call                                   + 0xf9
[0x7fc057f267c0]    PyEval_EvalFrameEx                                 + 0x6ba0
[0x7fc057f29b49]                                                       + 0x144b49
[0x7fc057f28df5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fc057f29b49]                                                       + 0x144b49
[0x7fc057f28df5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fc057f29b49]                                                       + 0x144b49
[0x7fc057f28df5]    PyEval_EvalFrameEx                                 + 0x91d5
[0x7fc057f29b49]                                                       + 0x144b49
[0x7fc057f29cd8]    PyEval_EvalCodeEx                                  + 0x48
[0x7fc057f29d1b]    PyEval_EvalCode                                    + 0x3b
[0x7fc057f4f020]    PyRun_FileExFlags                                  + 0x130
[0x7fc057f50623]    PyRun_SimpleFileExFlags                            + 0x173
[0x7fc057f6b8c7]    Py_Main                                            + 0xca7
[0x400add]          main                                               + 0x15d
[0x7fc056f06830]    __libc_start_main                                  + 0xf0
[0x4008b9]