c++for-loopdeep-learningcaffematcaffe

caffe forward net in a for loop not working


I am currently trying to write a c++ wrapper for PSPNet's prediction (originally in Matlab). PSPNet runs on Caffe.

Situation: I have a trained caffe model, and would like to implement this wrapper to run the segmentation result when given an input. In this case, my crop_size is smaller than it's original size. Thus, it is being cropped manually to multiple 425x425 "frames" and fed forward into caffe net after the pre-processes in a for-loop.

Problem: However, net seems to only be running forward once despite being in a for loop. Supported by its processing time and output, refer below.

This is the incomplete code I am currently trying to work on:

#define USE_OPENCV 1
#define trimapSize 1
#define Debug 0
#include <caffe/caffe.hpp>

#include "Header.h"
#include "caffe/data_reader.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/blob.hpp"


#ifdef USE_OPENCV
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif  // USE_OPENCV

#include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <chrono> //Just for time measurement
#include <cmath>
#include <array>

#include <iostream>
#include <fstream>

#ifdef USE_OPENCV
using namespace caffe;  // NOLINT(build/namespaces)
using std::string;


class Classifier {
 public:
  Classifier(const string& model_file,
             const string& trained_file);

  cv::Mat Predict(const cv::Mat& img);

 private:
  void SetMean(int weight, int heigh);

  void WrapInputLayer(std::vector<cv::Mat>* input_channels);

  cv::Mat Visualization(Blob<float>* output_layer);
  cv::Mat Preprocess(const cv::Mat& img_scale, int ori_rows, int ori_cols, std::vector<cv::Mat>* input_channels);

 private:
  shared_ptr<Net<float> > net_;
  cv::Size input_geometry_;
  int num_channels_;
  cv::Mat mean_;
};

Classifier::Classifier(const string& model_file,
                       const string& trained_file) {


  Caffe::set_mode(Caffe::GPU);

  /* Load the network. */
  net_.reset(new Net<float>(model_file, TEST));
  net_->CopyTrainedLayersFrom(trained_file);

  CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input.";
  CHECK_EQ(net_->num_outputs(), 2) << "Network should have exactly one output.";

  Blob<float>* input_layer = net_->input_blobs()[0];
  num_channels_ = input_layer->channels();
  CHECK(num_channels_ == 3 || num_channels_ == 1)
    << "Input layer should have 1 or 3 channels.";
  input_geometry_ = cv::Size(input_layer->width(), input_layer->height());
}

/* Create the mean file in binaryproto format. */
void Classifier::SetMean(int weight, int heigh) {

  mean_ = cv::Mat(heigh, weight, CV_32FC3);

  mean_ = cv::Scalar(94.6744, 88.8887, 100.5404);//RGB

}

cv::Mat Classifier::Predict(const cv::Mat& img) {

  cv::Mat originalTmp = img.clone();
  Blob<float>* input_layer = net_->input_blobs()[0];
  input_layer->Reshape(1, num_channels_,
                       input_geometry_.height, input_geometry_.width);

  std::cout << "input_geometry_.height = " << input_geometry_.height << "input_geometry_.width = "<< input_geometry_.width << std::endl;

  /* Forward dimension change to all layers. */
  net_->Reshape();

  std::vector<cv::Mat> input_channels;
  WrapInputLayer(&input_channels);

   /*-----------------------------FOR MULTI-SCALE PROCESSING--------------------------*/
  int base_size = 0;
  int ori_rows = img.rows;
  int ori_cols = img.cols;
  float scale_array [1] = {1};
  //  float scale_array = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
  std::cout << "ori_rows = " << ori_rows << "\t ori_cols = " << ori_cols << std::endl;
  cv::Mat data_all = cv::Mat::zeros(cv::Size(425, 425), CV_32FC3);

  if (ori_rows > ori_cols) {
       base_size = ori_rows;
  }
  else base_size =  ori_cols;

  std::cout << "base_size = " << base_size << std::endl;
  std::cout << "size of array = " << (sizeof(scale_array)/sizeof(*scale_array)) << std::endl;

  for (int i=0; i < (sizeof(scale_array)/sizeof(*scale_array)); i++){
    int long_size = base_size * scale_array[i] + 1;
    int new_rows = long_size;
    int new_cols = long_size;

     std::cout << "BEFORE new rows = " << new_rows << "\t new cols = " << new_cols << std::endl;

    if (ori_rows > ori_cols){
      new_cols = round(long_size/ori_rows*ori_cols);
    }
    else {new_rows = round(long_size/ori_cols*ori_rows);}

    std::cout << "AFTER new rows = " << new_rows << "\t new cols = " << new_cols << std::endl;

    cv::Mat img_scale;
    cv::resize(img, img_scale, cv::Size(new_cols, new_rows), 0, 0, CV_INTER_LINEAR);

    std::cout << "img_scale height: " << img_scale.rows << "\t width = " << img_scale.cols << std::endl;
    cv::imshow("img_scale",img_scale);
    cv::waitKey(0);

    data_all = data_all + Preprocess(img_scale, ori_rows, ori_cols, &input_channels);
    std::cout << "ok! DONE PREPROCESS!" << std::endl;
  }

  return data_all;
}

cv::Mat Classifier::Preprocess(const cv::Mat& img_scale, int ori_rows, int ori_cols, std::vector<cv::Mat>* input_channels)
{
  int crop_size = 425;
  int new_rows = img_scale.rows;
  int new_cols = img_scale.cols;
  cv::Mat data_output = cv::Mat::zeros(cv::Size(ori_cols, ori_rows), CV_32FC3);
  int long_size = new_rows;
  cv::Mat img_processed;

  if (new_cols > new_rows){
    long_size = new_cols;
  }

  if (long_size <= crop_size){
    // img_processed = Preprocess(img_scale, &input_channels);
    //RUN CAFFE --- NOT YET DONE ---
    std::cout << "OK!" << std::endl;
  }
  else {
    float stride_rate = 2.0/3.0;
    std::cout << "stride_rate = " << stride_rate << std::endl;
    int stride = ceil(crop_size*stride_rate);
    std::cout << "stride = " << stride << std::endl;

    cv::Mat img_pad = img_scale;

    int pad_rows = img_pad.rows;
    int pad_cols = img_pad.cols;
    int h_grid = ceil((pad_rows - crop_size)/stride) + 1;
    int w_grid = ceil((pad_cols - crop_size)/stride) + 1;
    cv::Mat img_sub;

    cv::Mat data_scale = cv::Mat::zeros(cv::Size(pad_cols, pad_cols), CV_32FC3);

    for(int grid_yidx = 1; grid_yidx <= h_grid; grid_yidx++){
      for (int grid_xidx = 1; grid_xidx <= w_grid; grid_xidx++){
        int s_x = (grid_xidx-1)*stride+1;
        int s_y = (grid_yidx-1)*stride+1;
        int e_x = std::min(s_x + crop_size -1, pad_cols);
        int e_y = std::min(s_y + crop_size -1, pad_rows);
        s_x = e_x - crop_size + 1;
        s_y = e_y - crop_size + 1;

    /* Cropping image */
        img_pad(cv::Rect(s_x,s_y,crop_size,crop_size)).copyTo(img_sub);

        cv::Mat sample;
       if (img_sub.channels() == 3 && num_channels_ == 1)
          cv::cvtColor(img_sub, sample, cv::COLOR_BGR2GRAY);
        else if (img_sub.channels() == 4 && num_channels_ == 1)
          cv::cvtColor(img_sub, sample, cv::COLOR_BGRA2GRAY);
        else if (img_sub.channels() == 4 && num_channels_ == 3)
          cv::cvtColor(img_sub, sample, cv::COLOR_BGRA2BGR);
        else if (img_sub.channels() == 1 && num_channels_ == 3)
          cv::cvtColor(img_sub, sample, cv::COLOR_GRAY2BGR);
        else
          sample = img_sub;

        cv::Mat sample_float;

        if (num_channels_ == 3)
          sample.convertTo(sample_float, CV_32FC3);
        else
          sample.convertTo(sample_float, CV_32FC1);

        SetMean(sample.rows, sample.cols);

        cv::imshow("sample_float", sample_float);

        cv::cvtColor(sample_float, sample_float, cv::COLOR_BGRA2RGB);
        sample_float =  sample_float.t();


        cv::Mat sample_normalized(sample_float.size(),sample_float.type());

        cv::subtract(sample_float.clone(), mean_, sample_normalized); 

  cv::Mat sample_temp;
  sample_normalized.convertTo(sample_temp, CV_32FC3, 255);
  cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/sample_normalized.png", sample_temp);
  cv::imshow("sample_normalized", sample_normalized);
  cv::waitKey(0);


        /* This operation will write the separate BGR planes directly to the
         * input layer of the network because it is wrapped by the cv::Mat
         * objects in input_channels. */
        img_processed = sample_normalized.t();

        cv::split(img_processed, *input_channels);

        CHECK(reinterpret_cast<float*>(input_channels->at(0).data)
              == net_->input_blobs()[0]->cpu_data())
          << "Input channels are not wrapping the input layer of the network.";

        img_processed.convertTo(sample_temp, CV_32FC3, 255);
        cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/img_processed.png", sample_temp);
        cv::imshow("img_normalised",img_processed);
        cv::waitKey();

        std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); //Just for time measurement

        // float loss = 0.0;
        // net_->Forward(&loss);
        net_->Forward();

        std::chrono::steady_clock::time_point end= std::chrono::steady_clock::now();
        std::cout << "Processing time = " << (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count())/1000000.0 << " sec" <<std::endl; //Just for time measurement

        /* Copy the output layer to a std::vector */
        Blob<float>* output_layer = net_->output_blobs()[0];

        cv::Mat segment = Visualization(output_layer);
        cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/segment.png", segment);
      }
    }
  }
  return (img_processed);
}

struct RGB {
  int R;
  int G;
  int B;
};

vector<RGB> get_palette(int nClass)
{
  vector<RGB> listPlalette;
  RGB rgb0;
  rgb0.R = 0;
  rgb0.G = 0;
  rgb0.B = 0;
  listPlalette.push_back(rgb0);
  for (int i = 1; i < nClass; i++)
  {
    RGB rgb;

    rgb.R = i*50;
    rgb.G = i*50 + i;
    rgb.B = 255-i*20;

    listPlalette.push_back(rgb); 
  }
  return listPlalette;
}

cv::Mat Classifier::Visualization(Blob<float>* output_layer) {

  std::vector<cv::Mat> input_channels;

  int H = output_layer->height();
  int W = output_layer->width();
  // int N = output_layer->num();      //Batch Size 
  int C = output_layer->channels(); //Number of classes

  int index = 0;

#ifdef CPU_ONLY
  const float* output_data = output_layer->cpu_data();
#else
  const float* output_data = output_layer->cpu_data();
#endif // !CPU_ONLY


  cv::Mat class_each_row(C, W*H, CV_32F);
  for (int i = 0; i < C; i++) {
    for (int j = 0; j < (W*H); j++) {
      class_each_row.at<float>(i, j) = output_data[index];
      index = index + 1;
    }
  }

  class_each_row = class_each_row.t();

//==================================CONVERT INTO LABELS==================================//
  float maxValue = 0;

  int* labelIndex = (int*)malloc(W*H * sizeof(int));
  int indexX = 0;
  for (int i = 0; i < class_each_row.rows; i++) {

    maxValue = -999999999999;
    indexX = 0;
    for (int k = 0; k < C; k++)
    {
      float dataM = class_each_row.at<float>(i, k);
      if (dataM > maxValue) {
        maxValue = dataM;
        indexX = k;
      }
    }

    labelIndex[i] = indexX;
  }

  cv::Mat labelTmp(W, H, CV_8UC3);
  uchar* dataLabelTmp = labelTmp.data;
  vector<RGB> listPalette = get_palette(21);

  for (int i = 0; i < H; i++)
  {
    for (int j = 0; j < W; j++)
    {
      RGB rgb = listPalette[labelIndex[(i*W + j)]];
      dataLabelTmp[3 * (i*W + j)] = rgb.B;
      dataLabelTmp[3 * (i*W + j) + 1] = rgb.G;
      dataLabelTmp[3 * (i*W + j) + 2] = rgb.R;
    }

  }

   cv::imshow( "Display window", labelTmp);
   cv::waitKey(0);

  free(labelIndex);
  labelIndex = NULL;

  return labelTmp;
}


/* Wrap the input layer of the network in separate cv::Mat objects
 * (one per channel). This way we save one memcpy operation and we
 * don't need to rely on cudaMemcpy2D. The last preprocessing
 * operation will write the separate channels directly to the input
 * layer. */
void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels) {
  Blob<float>* input_layer = net_->input_blobs()[0];

  int width = input_layer->width();
  int height = input_layer->height();
  float* input_data = input_layer->mutable_cpu_data();
  for (int i = 0; i < input_layer->channels(); ++i) {
    cv::Mat channel(height, width, CV_32FC1, input_data);
    input_channels->push_back(channel);
    input_data += width * height;
  }
}


int main(int argc, char** argv) {
  if (argc != 4) {
    std::cerr << "Usage: " << argv[0]
              << " \ndeploy.prototxt \nnetwork.caffemodel"
              << " \nimg.jpg" << " \ncamvid12.png (for example: /SegNet-Tutorial/Scripts/camvid12.png)" << std::endl;
    return 1;
  }

  ::google::InitGoogleLogging(argv[0]);

  string model_file   = argv[1];
  string trained_file = argv[2]; //for visualization


  Classifier classifier(model_file, trained_file);

  string file = argv[3];

  std::cout << "---------- Semantic Segmentation for "
            << file << " ----------" << std::endl;

  cv::Mat img = cv::imread(file, 1);
  CHECK(!img.empty()) << "Unable to decode image " << file;
  cv::Mat prediction;

  classifier.Predict(img);
}
#else
int main(int argc, char** argv) {
  LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV.";
}
#endif //USE_OPENCV

To clarify: The for-loop refers to the one in pre-process: specifically this portion:

 for(int grid_yidx = 1; grid_yidx <= h_grid; grid_yidx++){
  for (int grid_xidx = 1; grid_xidx <= w_grid; grid_xidx++){
    int s_x = (grid_xidx-1)*stride+1;
    int s_y = (grid_yidx-1)*stride+1;
    int e_x = std::min(s_x + crop_size -1, pad_cols);
    int e_y = std::min(s_y + crop_size -1, pad_rows);
    s_x = e_x - crop_size + 1;
    s_y = e_y - crop_size + 1;

/* Cropping image */
    img_pad(cv::Rect(s_x,s_y,crop_size,crop_size)).copyTo(img_sub);

    cv::Mat sample;
   if (img_sub.channels() == 3 && num_channels_ == 1)
      cv::cvtColor(img_sub, sample, cv::COLOR_BGR2GRAY);
    else if (img_sub.channels() == 4 && num_channels_ == 1)
      cv::cvtColor(img_sub, sample, cv::COLOR_BGRA2GRAY);
    else if (img_sub.channels() == 4 && num_channels_ == 3)
      cv::cvtColor(img_sub, sample, cv::COLOR_BGRA2BGR);
    else if (img_sub.channels() == 1 && num_channels_ == 3)
      cv::cvtColor(img_sub, sample, cv::COLOR_GRAY2BGR);
    else
      sample = img_sub;

    cv::Mat sample_float;

    if (num_channels_ == 3)
      sample.convertTo(sample_float, CV_32FC3);
    else
      sample.convertTo(sample_float, CV_32FC1);

    SetMean(sample.rows, sample.cols);

    cv::imshow("sample_float", sample_float);

    cv::cvtColor(sample_float, sample_float, cv::COLOR_BGRA2RGB);
    sample_float =  sample_float.t();


    cv::Mat sample_normalized(sample_float.size(),sample_float.type());

    cv::subtract(sample_float.clone(), mean_, sample_normalized); 

  cv::Mat sample_temp;
  sample_normalized.convertTo(sample_temp, CV_32FC3, 255);
  cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/sample_normalized.png", sample_temp);
  cv::imshow("sample_normalized", sample_normalized);
  cv::waitKey(0);


    /* This operation will write the separate BGR planes directly to the
     * input layer of the network because it is wrapped by the cv::Mat
     * objects in input_channels. */
    img_processed = sample_normalized.t();

    cv::split(img_processed, *input_channels);

    CHECK(reinterpret_cast<float*>(input_channels->at(0).data)
          == net_->input_blobs()[0]->cpu_data())
      << "Input channels are not wrapping the input layer of the network.";

    img_processed.convertTo(sample_temp, CV_32FC3, 255);
    cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/img_processed.png", sample_temp);
    cv::imshow("img_normalised",img_processed);
    cv::waitKey();

    std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); //Just for time measurement

    // float loss = 0.0;
    // net_->Forward(&loss);
    net_->Forward();

    std::chrono::steady_clock::time_point end= std::chrono::steady_clock::now();
    std::cout << "Processing time = " << (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count())/1000000.0 << " sec" <<std::endl; //Just for time measurement

    /* Copy the output layer to a std::vector */
    Blob<float>* output_layer = net_->output_blobs()[0];

    cv::Mat segment = Visualization(output_layer);
    cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/segment.png", segment);
  }
}

Original Image:Original Image (Without pre-processing)

Input: Input (first cropped frame)

Output: Output of the first cropped frame

Time taken for forwarding: Time taken

Following cropped frame gives the same output through out.

P/s: If i shift the code below to the end of predict function and return segment instead, it will work well. But only the last cropped frame will be segmented.

 std::chrono::steady_clock::time_point begin = 
 std::chrono::steady_clock::now(); //Just for time measurement

 // float loss = 0.0;
 // net_->Forward(&loss);
 net_->Forward();

 std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
 std::cout << "Processing time = " << (std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count())/1000000.0 << " sec" <<std::endl; //Just for time measurement

 /* Copy the output layer to a std::vector */
 Blob<float>* output_layer = net_->output_blobs()[0];

 cv::Mat segment = Visualization(output_layer);
 cv::imwrite("/home/sgp1053c/Desktop/PSPNET-cudnn5_wrapper/wrapper/segment.png", segment);`

input: Input (Last cropped frame of pre-processed image)

output: Output of the last cropped frame

Any help will be appreciated, thank youuuuu!!!


Solution

  • This issue is solved by wrapping the input channel each time it is changed so that the input will be fed forward correctly.

    Thus the function:

    WrapInputLayer(input_channels);
    

    should be called in the double for loop.