I am writing to request guidance on optimizing my method "CalculateConvolutionOutputTensor__im2col". I would like help determining the best strategy for moving beyond my naive approach; intuition about any relevant GPU processes and how they apply here (e.g., bank conflicts); and help interpreting the included profile in terms of what I can tweak.
A first run of the method takes 0.774 seconds on a GeForce RTX 2080 Ti. I have included a screenshot of the Nsight Compute profile of the only CUDA C++ kernel I have written: im2col.
Some ideas: I could have each GPU thread read from shared memory instead of global memory. I could copy GPU "heap" variables to the kernel "stack" instead of dereferencing them for every thread and every in-kernel for-loop iteration. I could put small parameters into arrays in GPU memory and pass single pointers to those arrays. I could use a more sophisticated version of im2col.
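For the second idea, here is a minimal sketch of what I mean, with hypothetical names (scaleAndShift, scale, and shift are illustrations, not code from my project):

// Hypothetical sketch: small scalars passed by value instead of through
// pointers to single ints or floats in global memory. Kernel arguments are
// delivered via constant memory, so threads never dereference a global
// pointer to read them.
#include <cuda_runtime.h>
__global__
void scaleAndShift(float* data, int elements, float scale, float shift)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < elements) {
// scale and shift are kernel arguments, not global-memory loads.
data[index] = data[index] * scale + shift;
}
}

Launching this as scaleAndShift<<<blocks, threadsPerBlock>>>(data_GPU, elements, 2.0f, 0.5f) would avoid the cudaMalloc and cudaMemcpy pair that each pointer parameter currently requires.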
I would prefer not to use cuDNN 7.6.5; when I use cuDNN 7.6.5 and merely call "cudnnCreate(&cudnnHandle);", Nsight Compute indicates that cuModuleGetFunction returns CUDA_ERROR_NOT_FOUND.
The procedure I used to create this project:
1. Create a new CUDA 10.2 Runtime project using Visual Studio Community 2019.
2. Rename the default source file to "main.cu" and replace its contents with the first code block below.
3. Add "CalculateConvolutionOutputTensor__im2col.h" to the project and paste in the second code block below.
4. Add "CalculateConvolutionOutputTensor__im2col.cu" to the project and paste in the third code block below.
5. Add "cublas.lib;" to Project Properties -> Linker -> Input -> Additional Dependencies.
// Allow use of cudaMalloc.
#include <cuda_runtime.h>
// Allow use of time(NULL) as a seed.
#include <ctime>
// Allow use of sqrt.
#include <cmath>
// Allow use of printf.
#include <cstdio>
// Allow construction of a default_random_engine.
#include <random>
// Allow use of CalculateConvolutionOutputTensor__im2col.
#include "CalculateConvolutionOutputTensor__im2col.h"
int main()
{
// --------------------------------------------------------------------------
// Declare and define parameters of CalculateConvolutionOutputTensor__im2col.
// --------------------------------------------------------------------------
float* convolutionOutputTensor;
cudaMalloc(&convolutionOutputTensor, 6 * 3 * 19 * 19 * 4 * sizeof(float));
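// One float per (filter, output row, output column, image):
// 6 * 3 filters, 19 x 19 output positions, 4 images per subdivision.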
int elementsInFilter = 3 * 590 * 590;
int imagesInSubdivision = 4;
int channelsInFilter_host = 3;
int* channelsInFilter_GPU;
cudaMalloc(&channelsInFilter_GPU, sizeof(int));
cudaMemcpy(channelsInFilter_GPU, &channelsInFilter_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfFilter_host = 590;
int* widthOfFilter_GPU;
cudaMalloc(&widthOfFilter_GPU, sizeof(int));
cudaMemcpy(widthOfFilter_GPU, &widthOfFilter_host, sizeof(int), cudaMemcpyHostToDevice);
int heightOfOutputTensor_host = 19;
int* heightOfOutputTensor_GPU;
cudaMalloc(&heightOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(heightOfOutputTensor_GPU, &heightOfOutputTensor_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfOutputTensor_host = 19;
int* widthOfOutputTensor_GPU;
cudaMalloc(&widthOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(widthOfOutputTensor_GPU, &widthOfOutputTensor_host, sizeof(int), cudaMemcpyHostToDevice);
int elementsInChannelOfOutputTensor_host = 19 * 19;
int* elementsInChannelOfOutputTensor_GPU;
cudaMalloc(&elementsInChannelOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(
elementsInChannelOfOutputTensor_GPU,
&elementsInChannelOfOutputTensor_host,
sizeof(int),
cudaMemcpyHostToDevice);
int channelsInFilter_times_elementsInChannelOfOutputTensor_host = 3 * 19 * 19;
int* channelsInFilter_times_elementsInChannelOfOutputTensor_GPU;
cudaMalloc(&channelsInFilter_times_elementsInChannelOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(
channelsInFilter_times_elementsInChannelOfOutputTensor_GPU,
&channelsInFilter_times_elementsInChannelOfOutputTensor_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_host = 3 * 590 * 19 * 19;
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU;
cudaMalloc(&elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(
elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU,
&elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInInputTensor = 3 * 608 * 608 * 4;
float* inputTensor_host = new float[elementsInInputTensor];
for (int i = 0; i < elementsInInputTensor; ++i) {
inputTensor_host[i] = ((float)(i % 255)) / 255.0;
}
float* inputTensor_GPU;
cudaMalloc(&inputTensor_GPU, elementsInInputTensor * sizeof(float));
cudaMemcpy(
inputTensor_GPU,
inputTensor_host,
elementsInInputTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] inputTensor_host;
int horizontalFilterStride_host = 1;
int* horizontalFilterStride_GPU;
cudaMalloc(&horizontalFilterStride_GPU, sizeof(int));
cudaMemcpy(
horizontalFilterStride_GPU,
&horizontalFilterStride_host,
sizeof(int),
cudaMemcpyHostToDevice);
int channelsInImage_host = 3;
int* channelsInImage_GPU;
cudaMalloc(&channelsInImage_GPU, sizeof(int));
cudaMemcpy(channelsInImage_GPU, &channelsInImage_host, sizeof(int), cudaMemcpyHostToDevice);
int verticalFilterStride_host = 1;
int* verticalFilterStride_GPU;
cudaMalloc(&verticalFilterStride_GPU, sizeof(int));
cudaMemcpy(
verticalFilterStride_GPU,
&verticalFilterStride_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInCrossSectionOfImage_host = 3 * 608;
int* elementsInCrossSectionOfImage_GPU;
cudaMalloc(&elementsInCrossSectionOfImage_GPU, sizeof(int));
cudaMemcpy(
elementsInCrossSectionOfImage_GPU,
&elementsInCrossSectionOfImage_host,
sizeof(int),
cudaMemcpyHostToDevice);
int elementsInImage_host = 3 * 608 * 608;
int* elementsInImage_GPU;
cudaMalloc(&elementsInImage_GPU, sizeof(int));
cudaMemcpy(elementsInImage_GPU, &elementsInImage_host, sizeof(int), cudaMemcpyHostToDevice);
int filters = 6 * 3;
int elementsInFilterTensor = 6 * 3 * 3 * 590 * 590;
float* filterTensor_host = new float[elementsInFilterTensor];
std::default_random_engine randomNumberGenerator(time(NULL));
std::normal_distribution<float> normalDistribution(0.0, 1.0);
for (int i = 0; i < elementsInFilterTensor; ++i) {
filterTensor_host[i] = normalDistribution(randomNumberGenerator) / sqrt((float)elementsInFilterTensor);
}
float* filterTensor_GPU;
cudaMalloc(&filterTensor_GPU, elementsInFilterTensor * sizeof(float));
cudaMemcpy(
filterTensor_GPU,
filterTensor_host,
elementsInFilterTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] filterTensor_host;
int elementsInOutputSubtensor = 6 * 3 * 19 * 19;
// -------------------------------------------------
// Execute CalculateConvolutionOutputTensor__im2col.
// -------------------------------------------------
CalculateConvolutionOutputTensor__im2col(
convolutionOutputTensor,
elementsInFilter,
elementsInChannelOfOutputTensor_host,
imagesInSubdivision,
channelsInFilter_GPU,
widthOfFilter_GPU,
heightOfOutputTensor_GPU,
widthOfOutputTensor_GPU,
elementsInChannelOfOutputTensor_GPU,
channelsInFilter_times_elementsInChannelOfOutputTensor_GPU,
elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU,
inputTensor_GPU,
horizontalFilterStride_GPU,
channelsInImage_GPU,
verticalFilterStride_GPU,
elementsInCrossSectionOfImage_GPU,
elementsInImage_GPU,
filters,
filterTensor_GPU,
elementsInOutputSubtensor);
cudaFree(channelsInFilter_GPU);
cudaFree(widthOfFilter_GPU);
cudaFree(heightOfOutputTensor_GPU);
cudaFree(widthOfOutputTensor_GPU);
cudaFree(elementsInChannelOfOutputTensor_GPU);
cudaFree(channelsInFilter_times_elementsInChannelOfOutputTensor_GPU);
cudaFree(elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_GPU);
cudaFree(inputTensor_GPU);
cudaFree(horizontalFilterStride_GPU);
cudaFree(channelsInImage_GPU);
cudaFree(verticalFilterStride_GPU);
cudaFree(elementsInCrossSectionOfImage_GPU);
cudaFree(elementsInImage_GPU);
cudaFree(filterTensor_GPU);
// --------------------------------------------------
// Make sure that convolutionOutputTensor is correct.
// --------------------------------------------------
float* convolutionOutputTensor_test = new float[6 * 3 * 19 * 19 * 4];
cudaMemcpy(
convolutionOutputTensor_test,
convolutionOutputTensor,
6 * 3 * 19 * 19 * 4 * sizeof(float),
cudaMemcpyDeviceToHost);
printf("convolutionOutputTensor_test: {");
for (int i = 0; i < 18; ++i) {
printf("%f, ", convolutionOutputTensor_test[i]);
}
printf("...}\n");
delete[] convolutionOutputTensor_test;
cudaFree(convolutionOutputTensor);
return 0;
}
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
int* channelsInFilter,
int* widthOfFilter,
int* heightOfOutputTensor,
int* widthOfOutputTensor,
int* elementsInChannelOfOutputTensor_GPU_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
float* inputTensor_child,
int* horizontalFilterStride,
int* channelsInImage,
int* verticalFilterStride,
int* elementsInCrossSectionOfImage,
int* elementsInImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child);
// Allow use of __global__.
#include <cuda_runtime.h>
// Allow declaration of cublasHandle.
#include "cublas_v2.h"
// Allow use of blockIdx.x, blockDim.x, and threadIdx.x.
#include <device_launch_parameters.h>
__global__
void im2col(
float* col_child,
int* channelsInFilter_child,
int* widthOfFilter_child,
int* heightOfOutputTensor_child,
int* widthOfOutputTensor_child,
int* elementsInChannelOfOutputTensor_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor_child,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child,
float* inputTensor_child_child,
int* horizontalFilterStride_child,
int* channelsInImage_child,
int* verticalFilterStride_child,
int* elementsInCrossSectionOfImage_child,
int* image_child,
int* elementsInImage_child);
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
int* channelsInFilter,
int* widthOfFilter,
int* heightOfOutputTensor,
int* widthOfOutputTensor,
int* elementsInChannelOfOutputTensor_GPU_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
float* inputTensor_child,
int* horizontalFilterStride,
int* channelsInImage,
int* verticalFilterStride,
int* elementsInCrossSectionOfImage,
int* elementsInImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child)
{
// -----------------------------------------
// Define and declare parameters for im2col.
// -----------------------------------------
// Define parameters for the execution configuration of im2col.
int threads_per_block_for_im2col = 885;
int blocks_for_im2col =
(elementsInFilter_child + threads_per_block_for_im2col - 1) / threads_per_block_for_im2col;
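// 885 threads per block divides elementsInFilter (3 * 590 * 590 = 1,044,300)
// exactly, so these 1,180 blocks cover every filter element and the kernel
// needs no bounds check.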
// Declare col.
float* col;
// Compute this product once as an int; at (3 * 590 * 590) * (19 * 19) =
// 376,992,300 it still fits in a signed 32-bit integer.
int elementsInFilter_times_elementsInChannelOfOutputTensor =
elementsInFilter_child * elementsInChannelOfOutputTensor_host_child;
cudaMalloc(&col, elementsInFilter_times_elementsInChannelOfOutputTensor * sizeof(float));
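// col holds elementsInFilter * (19 * 19) = 1,044,300 * 361 floats,
// roughly 1.5 GB of device memory.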
// -----------------------------------------------------------------------------
// Define parameters for calculating the matrix product of filterTensor and col.
// -----------------------------------------------------------------------------
// Define a cublasHandle_t object called cublasHandle.
// Declaring cublasHandle requires '#include "cublas_v2.h"'.
// Defining cublasHandle requires adding "cublas.lib" to
// Properties -> Linker -> Input -> Additional Dependencies.
cublasHandle_t cublasHandle;
cublasCreate(&cublasHandle);
// Define parameters for (not) including
// a portion of a third matrix in product_filterTensor_and_col.
float one = 1.0;
float zero = 0.0;
// ------------------------------------------------------------
// For each image in subdivision,
// sculpt image into matrix col.
// Calculate the matrix product of filterTensor and col and
// store the product as a subtensor of convolutionOutputTensor.
// ------------------------------------------------------------
// Holds the offset of each image's output subtensor, computed below.
int image_times_elementsInOutputSubtensor;
int* image_GPU;
cudaMalloc(&image_GPU, sizeof(int));
for (int image_host = 0; image_host < imagesInSubdivision_child; ++image_host) {
cudaMemcpy(image_GPU, &image_host, sizeof(int), cudaMemcpyHostToDevice);
im2col<<<blocks_for_im2col, threads_per_block_for_im2col>>>
(col,
channelsInFilter,
widthOfFilter,
heightOfOutputTensor,
widthOfOutputTensor,
elementsInChannelOfOutputTensor_GPU_child,
channelsInFilter_times_elementsInChannelOfOutputTensor,
elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor,
inputTensor_child,
horizontalFilterStride,
channelsInImage,
verticalFilterStride,
elementsInCrossSectionOfImage,
image_GPU,
elementsInImage);
cudaDeviceSynchronize();
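// im2col and the cublasSgemm below are issued to the same default stream,
// so this synchronize is not needed for ordering; it mainly makes the kernel
// time visible separately when profiling.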
// Compute the offset of this image's output subtensor.
image_times_elementsInOutputSubtensor = image_host * elementsInOutputSubtensor_child;
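// cuBLAS is column-major: this computes the (19 * 19) x 18 product
// C = col * filterTensor, where col is (19 * 19) x elementsInFilter with
// leading dimension 19 * 19 and filterTensor is elementsInFilter x 18 with
// leading dimension elementsInFilter.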
cublasSgemm(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
elementsInChannelOfOutputTensor_host_child,
filters_child,
elementsInFilter_child,
&one,
col,
elementsInChannelOfOutputTensor_host_child,
filterTensor,
elementsInFilter_child,
&zero,
convolutionOutputTensor_child + image_times_elementsInOutputSubtensor,
elementsInChannelOfOutputTensor_host_child);
}
cublasDestroy(cublasHandle);
cudaFree(col);
cudaFree(image_GPU);
}
__global__
void im2col(
float* col_child,
int* channelsInFilter_child,
int* widthOfFilter_child,
int* heightOfOutputTensor_child,
int* widthOfOutputTensor_child,
int* elementsInChannelOfOutputTensor_child,
int* channelsInFilter_times_elementsInChannelOfOutputTensor_child,
int* elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child,
float* inputTensor_child_child,
int* horizontalFilterStride_child,
int* channelsInImage_child,
int* verticalFilterStride_child,
int* elementsInCrossSectionOfImage_child,
int* image,
int* elementsInImage_child)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
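// Decompose the flat thread index into filter coordinates:
// index = c_prime + channelsInFilter * (w_prime + widthOfFilter * h_prime).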
int c_prime = index % (*channelsInFilter_child);
int temp = (index - c_prime) / (*channelsInFilter_child);
int w_prime = temp % (*widthOfFilter_child);
int h_prime = temp / (*widthOfFilter_child);
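// Each thread copies its filter element into col for every output position.
// Adjacent threads (adjacent c_prime) load adjacent input elements but store
// to col with a stride of elementsInChannelOfOutputTensor.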
for (int h = 0; h < (*heightOfOutputTensor_child); ++h) {
for (int w = 0; w < (*widthOfOutputTensor_child); ++w) {
col_child[
w +
h * (*widthOfOutputTensor_child) +
c_prime * (*elementsInChannelOfOutputTensor_child) +
w_prime * (*channelsInFilter_times_elementsInChannelOfOutputTensor_child) +
h_prime * (*elementsInCrossSectionOfFilter_times_elementsInChannelOfOutputTensor_child)] =
inputTensor_child_child[
c_prime +
(w * (*horizontalFilterStride_child) + w_prime) * (*channelsInImage_child) +
(h * (*verticalFilterStride_child) + h_prime) * (*elementsInCrossSectionOfImage_child) +
(*image) * (*elementsInImage_child)];
}
}
}
After reading through the NVIDIA articles that Robert Crovella provided, I rewrote my solution "CalculateConvolutionOutputTensor__im2col" so that the threads in each block load from contiguous global memory. I used less indexing arithmetic and fewer parameters. I saw a method speed-up of (1 method / 0.445 s) / (1 method / 0.774 s) = 1.7x, and an im2col kernel speed-up of (1 kernel / 35.27 ms) / (1 kernel / 128.15 ms) = 3.6x. Thanks for pointing me to useful, specific reading.
im2col used to take 128.15 ms; in the latest profile it takes only 32.12 ms. Sgemm now takes 6.34 ms, and probably took about the same before. Their total is 38.46 ms, and the pair runs four times, for a total of 153.84 ms. I wonder how to speed up im2col further, and how to reduce the 274.16 ms of "overhead".
To sculpt an image into matrix col, I had the 3 * 590 / 2 = 885 threads in each of 2 * 590 * 19 * 19 blocks transfer one half of a cross section of a filter-shaped portion of an image to col. I believe that each thread loaded from global memory physically adjacent to the memory read by the previous thread, and stored to global memory physically adjacent to the memory written by the previous thread. I did notice that 11 threads in the last warp of each block go unused.
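As a sanity check on this geometry: each block writes 885 contiguous floats, so the grid writes (2 * 590 * 19 * 19) * 885 = 376,992,300 floats, which equals elementsInFilter * elementsInChannelOfOutputTensor = (3 * 590 * 590) * (19 * 19); every element of col is written exactly once. Also, 885 threads fill 27 full warps plus 885 - 27 * 32 = 21 threads of a 28th warp, which accounts for the 11 idle threads.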
I think I might take th31 up on their suggestion and move this optimization thread to Code Review.
// Allow use of cudaMalloc.
#include <cuda_runtime.h>
// Allow use of time(NULL) as a seed.
#include <ctime>
// Allow use of sqrt.
#include <cmath>
// Allow use of printf.
#include <cstdio>
// Allow construction of a default_random_engine.
#include <random>
// Allow use of CalculateConvolutionOutputTensor__im2col.
#include "CalculateConvolutionOutputTensor__im2col.h"
int main()
{
// --------------------------------------------------------------------------
// Declare and define parameters of CalculateConvolutionOutputTensor__im2col.
// --------------------------------------------------------------------------
float* convolutionOutputTensor;
cudaMalloc(&convolutionOutputTensor, 6 * 3 * 19 * 19 * 4 * sizeof(float));
int elementsInFilter = 3 * 590 * 590;
int elementsInChannelOfOutputTensor = 19 * 19;
int imagesInSubdivision = 4;
int elementsInInputTensor = 3 * 608 * 608 * 4;
float* inputTensor_host = new float[elementsInInputTensor];
for (int i = 0; i < elementsInInputTensor; ++i) {
inputTensor_host[i] = ((float)(i % 255)) / 255.0;
}
float* inputTensor_GPU;
cudaMalloc(&inputTensor_GPU, elementsInInputTensor * sizeof(float));
cudaMemcpy(
inputTensor_GPU,
inputTensor_host,
elementsInInputTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] inputTensor_host;
int heightOfFilter_host = 590;
int* heightOfFilter_GPU;
cudaMalloc(&heightOfFilter_GPU, sizeof(int));
cudaMemcpy(heightOfFilter_GPU, &heightOfFilter_host, sizeof(int), cudaMemcpyHostToDevice);
int channelsInImage_host = 3;
int* channelsInImage_GPU;
cudaMalloc(&channelsInImage_GPU, sizeof(int));
cudaMemcpy(channelsInImage_GPU, &channelsInImage_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfImage_host = 608;
int* widthOfImage_GPU;
cudaMalloc(&widthOfImage_GPU, sizeof(int));
cudaMemcpy(widthOfImage_GPU, &widthOfImage_host, sizeof(int), cudaMemcpyHostToDevice);
int widthOfOutputTensor_host = 19;
int* widthOfOutputTensor_GPU;
cudaMalloc(&widthOfOutputTensor_GPU, sizeof(int));
cudaMemcpy(widthOfOutputTensor_GPU, &widthOfOutputTensor_host, sizeof(int), cudaMemcpyHostToDevice);
int heightOfImage_host = 608;
int* heightOfImage_GPU;
cudaMalloc(&heightOfImage_GPU, sizeof(int));
cudaMemcpy(heightOfImage_GPU, &heightOfImage_host, sizeof(int), cudaMemcpyHostToDevice);
int filters = 6 * 3;
int elementsInFilterTensor = 6 * 3 * 3 * 590 * 590;
float* filterTensor_host = new float[elementsInFilterTensor];
std::default_random_engine randomNumberGenerator(time(NULL));
std::normal_distribution<float> normalDistribution(0.0, 1.0);
for (int i = 0; i < elementsInFilterTensor; ++i) {
filterTensor_host[i] = normalDistribution(randomNumberGenerator) / sqrt((float)elementsInFilterTensor);
}
float* filterTensor_GPU;
cudaMalloc(&filterTensor_GPU, elementsInFilterTensor * sizeof(float));
cudaMemcpy(
filterTensor_GPU,
filterTensor_host,
elementsInFilterTensor * sizeof(float),
cudaMemcpyHostToDevice);
delete[] filterTensor_host;
int elementsInOutputSubtensor = 6 * 3 * 19 * 19;
// -------------------------------------------------
// Execute CalculateConvolutionOutputTensor__im2col.
// -------------------------------------------------
CalculateConvolutionOutputTensor__im2col(
convolutionOutputTensor,
elementsInFilter,
elementsInChannelOfOutputTensor,
imagesInSubdivision,
inputTensor_GPU,
heightOfFilter_GPU,
channelsInImage_GPU,
widthOfImage_GPU,
widthOfOutputTensor_GPU,
heightOfImage_GPU,
filters,
filterTensor_GPU,
elementsInOutputSubtensor);
cudaFree(inputTensor_GPU);
cudaFree(heightOfFilter_GPU);
cudaFree(channelsInImage_GPU);
cudaFree(widthOfImage_GPU);
cudaFree(widthOfOutputTensor_GPU);
cudaFree(heightOfImage_GPU);
cudaFree(filterTensor_GPU);
// --------------------------------------------------
// Make sure that convolutionOutputTensor is correct.
// --------------------------------------------------
float* convolutionOutputTensor_test = new float[6 * 3 * 19 * 19 * 4];
cudaMemcpy(
convolutionOutputTensor_test,
convolutionOutputTensor,
6 * 3 * 19 * 19 * 4 * sizeof(float),
cudaMemcpyDeviceToHost);
printf("convolutionOutputTensor_test: {");
for (int i = 0; i < 18; ++i) {
printf("%f, ", convolutionOutputTensor_test[i]);
}
printf("...}\n");
delete[] convolutionOutputTensor_test;
cudaFree(convolutionOutputTensor);
return 0;
}
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
float* inputTensor_child,
int* heightOfFilter,
int* channelsInImage,
int* widthOfImage,
int* widthOfOutputTensor,
int* heightOfImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child);
// Allow use of __global__.
#include <cuda_runtime.h>
// Allow declaration of cublasHandle.
#include "cublas_v2.h"
// Allow use of blockIdx.x, blockDim.x, and threadIdx.x.
#include <device_launch_parameters.h>
__global__
void im2col(
float* col_child,
float* inputTensor_child_child,
int* heightOfFilter_child,
int* channelsInImage_child,
int* widthOfImage_child,
int* widthOfOutputTensor_child,
int* image,
int* heightOfImage_child);
void CalculateConvolutionOutputTensor__im2col(
float* convolutionOutputTensor_child,
int elementsInFilter_child,
int elementsInChannelOfOutputTensor_host_child,
int imagesInSubdivision_child,
float* inputTensor_child,
int* heightOfFilter,
int* channelsInImage,
int* widthOfImage,
int* widthOfOutputTensor,
int* heightOfImage,
int filters_child,
float* filterTensor,
int elementsInOutputSubtensor_child)
{
// -----------------------------------------
// Define and declare parameters for im2col.
// -----------------------------------------
// Define parameters for the execution configuration of im2col.
int threads_per_block_for_im2col = 3 * 590 / 2;
int blocks_for_im2col = 2 * 590 * 19 * 19;
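// Each block copies one half of one channel-interleaved cross-section row of
// a filter window (3 * 590 / 2 = 885 contiguous floats), so consecutive
// threads read from and write to consecutive global-memory addresses.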
// Declare col.
float* col;
// Compute this product once as an int; at (3 * 590 * 590) * (19 * 19) =
// 376,992,300 it still fits in a signed 32-bit integer.
int elementsInFilter_times_elementsInChannelOfOutputTensor =
elementsInFilter_child * elementsInChannelOfOutputTensor_host_child;
cudaMalloc(&col, elementsInFilter_times_elementsInChannelOfOutputTensor * sizeof(float));
// -----------------------------------------------------------------------------
// Define parameters for calculating the matrix product of filterTensor and col.
// -----------------------------------------------------------------------------
// Define a cublasHandle_t object called cublasHandle.
// Declaring cublasHandle requires '#include "cublas_v2.h"'.
// Defining cublasHandle requires adding "cublas.lib" to
// Properties -> Linker -> Input -> Additional Dependencies.
cublasHandle_t cublasHandle;
cublasCreate(&cublasHandle);
// Define parameters for (not) including
// a portion of a third matrix in product_filterTensor_and_col.
float one = 1.0;
float zero = 0.0;
// ------------------------------------------------------------
// For each image in subdivision,
// sculpt image into matrix col.
// Calculate the matrix product of filterTensor and col and
// store the product as a subtensor of convolutionOutputTensor.
// ------------------------------------------------------------
// Holds the offset of each image's output subtensor, computed below.
int image_times_elementsInOutputSubtensor;
int* image_GPU;
cudaMalloc(&image_GPU, sizeof(int));
for (int image_host = 0; image_host < imagesInSubdivision_child; ++image_host) {
cudaMemcpy(image_GPU, &image_host, sizeof(int), cudaMemcpyHostToDevice);
im2col
<<<blocks_for_im2col,
threads_per_block_for_im2col>>>
(col,
inputTensor_child,
heightOfFilter,
channelsInImage,
widthOfImage,
widthOfOutputTensor,
image_GPU,
heightOfImage);
cudaDeviceSynchronize();
// Compute the offset of this image's output subtensor.
image_times_elementsInOutputSubtensor = image_host * elementsInOutputSubtensor_child;
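// Column-major again, but with the factors swapped relative to the first
// version: this computes the 18 x (19 * 19) product C = filterTensor * col,
// where filterTensor is 18 x elementsInFilter with leading dimension 18 and
// col is elementsInFilter x (19 * 19). The output subtensor is therefore
// stored filter-index-fastest here.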
cublasSgemm(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
filters_child,
elementsInChannelOfOutputTensor_host_child,
elementsInFilter_child,
&one,
filterTensor,
filters_child,
col,
elementsInFilter_child,
&zero,
convolutionOutputTensor_child + image_times_elementsInOutputSubtensor,
filters_child);
}
cublasDestroy(cublasHandle);
cudaFree(col);
cudaFree(image_GPU);
}
__global__
void im2col(
float* col_child,
float* inputTensor_child_child,
int* heightOfFilter_child,
int* channelsInImage_child,
int* widthOfImage_child,
int* widthOfOutputTensor_child,
int* image,
int* heightOfImage_child)
{
// Each block writes 885 contiguous elements of col, so consecutive threads
// store to consecutive global-memory addresses.
col_child[blockIdx.x * blockDim.x + threadIdx.x] =
inputTensor_child_child[
// Offset within one channel-interleaved cross-section row of the filter
// window; blockIdx.x % 2 selects which half of the row this block copies.
threadIdx.x +
(blockIdx.x % 2) * blockDim.x +
// Row of the filter window within the image.
((blockIdx.x % (2 * (*heightOfFilter_child))) / 2) * (*channelsInImage_child) * (*widthOfImage_child) +
// Horizontal position of the filter window; the % widthOfOutputTensor keeps
// this shift within one row of output positions.
((blockIdx.x / (2 * (*heightOfFilter_child))) % (*widthOfOutputTensor_child)) * (*channelsInImage_child) +
// Vertical position of the filter window.
(blockIdx.x / (2 * (*heightOfFilter_child) * (*widthOfOutputTensor_child))) * (*channelsInImage_child) * (*widthOfImage_child) +
// Offset of the image within the input tensor.
(*image) * (*channelsInImage_child) * (*widthOfImage_child) * (*heightOfImage_child)];
}