Below is a simple C++ program that reads an Apache ORC file and tries to filter its rows with a search argument:
#include <iostream>
#include <list>
#include <memory>
#include <chrono>
// Orc specific headers.
#include <orc/Reader.hh>
#include <orc/ColumnPrinter.hh>
#include <orc/Exceptions.hh>
#include <orc/OrcFile.hh>
int main(int argc, char const *argv[])
{
auto begin = std::chrono::steady_clock::now();
orc::RowReaderOptions m_RowReaderOpts;
orc::ReaderOptions m_ReaderOpts;
std::unique_ptr<orc::Reader> m_Reader;
std::unique_ptr<orc::RowReader> m_RowReader;
auto builder = orc::SearchArgumentFactory::newBuilder();
std::string required_symbol("FILTERME");
/// THIS LINE SHOULD FILTER DATA BASED ON COLUMNS.
/// INSTEAD OF FILTERING IT TRAVERSE EACH ROW OF ORC FILE.
builder->equals("column_name", orc::PredicateDataType::STRING, orc::Literal(required_symbol.c_str(), required_symbol.size()));
std::string file_path("/orc/file/path.orc");
m_Reader = orc::createReader(orc::readFile(file_path.c_str()), m_ReaderOpts);
m_RowReader = m_Reader->createRowReader(m_RowReaderOpts);
m_RowReaderOpts.searchArgument(builder->build());
auto batch = m_RowReader->createRowBatch(5000);
try
{
std::cout << builder->build()->toString() << std::endl;
while(m_RowReader->next(*batch))
{
const auto &struct_batch = dynamic_cast<const orc::StructVectorBatch&>(*batch.get());
/** DO CALCULATIONS */
}
}
catch(const std::exception& e)
{
std::cerr << e.what() << '\n';
}
auto end = std::chrono::steady_clock::now();
std::cout << "Total Time taken to read ORC file: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() << " ms.\n";
return 0;
}
I searched on Google for almost a week and tried converting every Java example I could find into C++ to make my code work.
I also tried the example from the linked Stack Overflow question, which describes a similar issue, but it did not work for me.
Finally, after trying multiple scenarios, I have resolved the above issue with ORC data filtering.
It was caused by using an incorrect column number. I am not sure why there is a difference between the column ID
used to select the columns to fetch and the column ID used to filter.
In the example above I tried to filter the data by column name,
and the issue with filtering ORC data by column name is still there. Fortunately, however, filtering works fine with a column number.
New code:
#include <iostream>
#include <list>
#include <memory>
#include <chrono>
// Orc specific headers.
#include <orc/Reader.hh>
#include <orc/ColumnPrinter.hh>
#include <orc/Exceptions.hh>
#include <orc/OrcFile.hh>
int main(int argc, char const *argv[])
{
auto begin = std::chrono::steady_clock::now();
orc::RowReaderOptions m_RowReaderOpts;
orc::ReaderOptions m_ReaderOpts;
std::unique_ptr<orc::Reader> m_Reader;
std::unique_ptr<orc::RowReader> m_RowReader;
auto builder = orc::SearchArgumentFactory::newBuilder();
std::string required_symbol("FILTERME");
// <-- HERE COLUMN IDS ARE STARTING FROM 0-N. -->
std::list<uint64_t> cols = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
m_RowReaderOpts.include(cols);
int column_id = 7; // IN cols ABOVE, THIS COLUMN_ID 7 IS ACTUALLY 6. WHICH MEANS COLUMN_ID TO FILTER COLUMN IS +1 OF COLUMN ID PROVIDED IN DATA FETCH.
builder->equals(column_id, orc::PredicateDataType::STRING, orc::Literal(required_symbol.c_str(), required_symbol.size()));
std::string file_path("/orc/file/path.orc");
m_Reader = orc::createReader(orc::readFile(file_path.c_str()), m_ReaderOpts);
m_RowReader = m_Reader->createRowReader(m_RowReaderOpts);
m_RowReaderOpts.searchArgument(builder->build());
auto batch = m_RowReader->createRowBatch(5000);
try
{
std::cout << builder->build()->toString() << std::endl;
while(m_RowReader->next(*batch))
{
const auto &struct_batch = dynamic_cast<const orc::StructVectorBatch&>(*batch.get());
/** DO CALCULATIONS */
}
}
catch(const std::exception& e)
{
std::cerr << e.what() << '\n';
}
auto end = std::chrono::steady_clock::now();
std::cout << "Total Time taken to read ORC file: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() << " ms.\n";
return 0;
}
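As far as I understand from resolving the above issue, the column IDs used for fetching data run from 0 to N, whereas the column IDs used for filtering run from 1 to N (presumably because ID 0 in the filter numbering refers to the root struct of the schema). This is why you should pass 1 when you need to filter on column 0.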