azure azure-storage azure-sdk

Azure-sdk-for-cpp parallelism


I've integrated the new azure-sdk-for-cpp into my project, and read/write times are very slow. I'm using the BlockBlobClient StageBlock and CommitBlockList calls, with an implementation that is almost one-to-one with the SDK examples.

Is there a way to turn on some kind of flag to enable parallelism or multithreading in the SDK to improve upload speed?

 // Fragment of an upload routine: tops `buffer` up to `buffer_size`, stages each
 // full buffer as one block, and finally commits the list of staged block IDs.
 // `buffer`, `buffer_size`, `data`, `length`, `m_block_count`, `blob_client`,
 // `client` and `uncomitted_blocks_ids` are declared outside this excerpt.
 int offset= 0;
// Keep going while the free space left in `buffer` is smaller than the number
// of input bytes still to be consumed.
while(buffer_size - buffer.size()  < length) {
    // Number of bytes that still fit into the buffer.
    int remaining= buffer_size - buffer.size();

    // Append exactly `remaining` bytes of input, starting at the current offset.
    buffer.insert(buffer.end(), data + offset, data + offset + remaining);

    // Derive the block ID from a running counter.
    auto block_id= GetBlockId(std::to_string(m_block_count++));

    // Wrap the buffer's bytes in a body stream for the StageBlock call.
    auto block_content= ::Azure::Core::IO::MemoryBodyStream(buffer.data(), buffer.size());

    // Remember the ID so it can be passed to CommitBlockList after the loop.
    uncomitted_blocks_ids.push_back(block_id);

    // One StageBlock call per iteration; the next chunk is not prepared until
    // this call has returned.
    blob_client.AsBlockBlobClient().StageBlock(block_id, block_content); 

    // Empty the buffer so the next iteration starts filling it from scratch.
    buffer.clear();

    // Advance past the bytes that were just consumed.
    offset += remaining;
    length -= remaining;
}
// NOTE(review): the final `length` bytes (those that fit in the buffer) are not
// appended in this excerpt — presumably handled by code that is not shown.

// NOTE(review): the loop uses `blob_client` while the commit uses `client`;
// presumably both refer to the same blob — confirm against the full source.
client.CommitBlockList(uncomitted_blocks_ids);

return 0;

For downloads I saw that there is a Concurrency option in the blob options, but in my testing it increased speed by only about 10-20%.

Thanks in advance!

UPDATE:

I'm looking for a way to do what upload_block_async did in the old SDK.


Solution

  • Is there a way to turn on some kind of flag to enable parallelism or multithreading in the SDK to improve upload speed?

    The Azure C++ SDK provides basic single-threaded calls for uploading blocks, so it will not stage blocks in parallel on its own. You can, however, improve upload speed by explicitly using multiple threads to parallelize the upload, as in the code below.

    To achieve this, divide the data into multiple chunks and use asynchronous operations to run several uploads at the same time.

    Code:

    #include <algorithm>
    #include <chrono>
    #include <cstddef>
    #include <cstdint>
    #include <fstream>
    #include <functional>
    #include <future>
    #include <iomanip>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    #include <azure/storage/blobs.hpp>
    
    using namespace Azure::Storage::Blobs;
    
    // Size in bytes of each block read from the file and staged (4 MiB).
    constexpr int BlockSize = 4 << 20;
    // Number of upload tasks launched before the uploader waits for the batch.
    constexpr int MaxConcurrency = 8;
    
    /// Builds a block ID by left-padding `blockNumber` with '0' to a width of 8.
    ///
    /// Inputs that are already 8 characters or longer are returned unchanged.
    /// NOTE(review): Azure requires block IDs to be Base64 strings of equal
    /// length within a blob; an 8-character all-digit ID appears to satisfy
    /// this for decimal inputs of up to 8 digits — confirm against the Put
    /// Block documentation.
    ///
    /// @param blockNumber  Textual block index, e.g. the result of std::to_string.
    /// @return The padded identifier.
    std::string GetBlockId(const std::string& blockNumber) {
        const std::string::size_type kWidth = 8;
        if (blockNumber.size() >= kWidth) {
            return blockNumber;
        }
        std::string padded(kWidth - blockNumber.size(), '0');
        padded += blockNumber;
        return padded;
    }
    
    void UploadBlockAsync(const std::string& block_id, const std::vector<uint8_t>& data,
        BlockBlobClient& blockBlobClient) {
        auto block_content = Azure::Core::IO::MemoryBodyStream(data.data(), data.size());
        blockBlobClient.StageBlock(block_id, block_content);
    }
    
    /// Uploads the file at `file_path` as a block blob, staging blocks concurrently.
    ///
    /// The file is read sequentially in chunks of at most `BlockSize` bytes. Each
    /// chunk is handed to `UploadBlockAsync` on its own `std::async` task; once
    /// `MaxConcurrency` tasks have been launched the function waits for the whole
    /// batch before reading further. After every block has been staged, the block
    /// list is committed in file order and the elapsed time and average
    /// throughput are printed to stdout.
    ///
    /// Offsets and sizes are kept in `std::streamsize` so files of 2 GiB and
    /// larger do not overflow a 32-bit `int`.
    ///
    /// @param file_path        Path of the local file to upload.
    /// @param blockBlobClient  Client for the destination blob; the upload tasks
    ///                         hold a reference to it for the duration of the call.
    /// @return 0 on success; 1 if the file cannot be opened, sized, or read.
    ///
    /// An exception thrown inside an upload task is rethrown by `future.get()`
    /// and propagates to the caller.
    int UploadFileWithParallelism(const std::string& file_path, BlockBlobClient& blockBlobClient) {
        std::ifstream file(file_path, std::ios::binary | std::ios::ate);
        if (!file) {
            std::cerr << "Failed to open file: " << file_path << std::endl;
            return 1;
        }

        // The stream was opened with ios::ate, so the current position is the file size.
        const std::streamsize file_size = file.tellg();
        if (file_size < 0) {
            std::cerr << "Failed to determine size of file: " << file_path << std::endl;
            return 1;
        }
        file.seekg(0, std::ios::beg);

        std::vector<std::string> uncommitted_block_ids;
        std::vector<std::future<void>> futures;
        futures.reserve(static_cast<std::size_t>(MaxConcurrency));
        int block_count = 0;
        std::streamsize offset = 0;

        // Waits for every in-flight upload of the current batch, then resets the batch.
        const auto wait_for_batch = [&futures] {
            for (auto& future : futures) future.get();
            futures.clear();
        };

        const auto start = std::chrono::high_resolution_clock::now();

        while (offset < file_size) {
            // The last block may be shorter than BlockSize.
            const std::streamsize current_block_size =
                std::min<std::streamsize>(BlockSize, file_size - offset);
            std::vector<uint8_t> block_data(static_cast<std::size_t>(current_block_size));
            if (!file.read(reinterpret_cast<char*>(block_data.data()), current_block_size)) {
                std::cerr << "Failed to read block from file." << std::endl;
                return 1;
            }

            auto block_id = GetBlockId(std::to_string(block_count++));
            // Record the ID first: the commit order must match the file order.
            uncommitted_block_ids.push_back(block_id);
            // Move the buffer into the task instead of copying it; this iteration
            // no longer needs it, and the task then owns its own data.
            futures.push_back(std::async(std::launch::async, UploadBlockAsync, std::move(block_id),
                std::move(block_data), std::ref(blockBlobClient)));

            offset += current_block_size;
            if (futures.size() >= static_cast<std::size_t>(MaxConcurrency)) {
                wait_for_batch();
            }
        }

        // Drain the final, possibly partial, batch before committing.
        wait_for_batch();
        blockBlobClient.CommitBlockList(uncommitted_block_ids);

        const auto end = std::chrono::high_resolution_clock::now();
        const std::chrono::duration<double> elapsed = end - start;
        const double time_taken = elapsed.count(); // Time in seconds

        // Calculate upload speed in MB/s
        const double upload_speed = (file_size / (1024.0 * 1024.0)) / time_taken;

        std::cout << "Upload completed successfully." << std::endl;
        std::cout << "Time taken: " << time_taken << " seconds" << std::endl;
        std::cout << "Average upload speed: " << upload_speed << " MB/s" << std::endl;

        return 0;
    }
    
    int main() {
        // Define your connection string, container/blob name, and file path.
        std::string connection_string = "xxx";
        std::string container_name = "result";
        std::string blob_name = "test.mp4";
        std::string file_path = "xxx";
    
        // Create a BlockBlobClient using the connection string.
        BlobContainerClient container_client = BlobContainerClient::CreateFromConnectionString(connection_string, container_name);
        BlockBlobClient block_blob_client = container_client.GetBlockBlobClient(blob_name);
    
        // Perform the upload with parallelism.
        int result = UploadFileWithParallelism(file_path, block_blob_client);
    
        return result;
    }
    

    The above code divides the file into chunks (blocks), uploads them concurrently using async tasks, and finally commits the block list to complete the upload. The upload time and speed are measured and displayed after the process is finished.

    Output:

    Upload completed successfully.
    Time taken: 15.3139 seconds
    Average upload speed: 4.3364 MB/s
    

    enter image description here

    The above code uploads the 66 MB file to Azure Blob Storage in about 15 seconds.

    Portal: enter image description here