csegmentation-faultduckdb

How to alter the datatype of a Column?


I want to change the data type of a column of a table in a DuckDB Database.

With

query2_c= ALTER TABLE populationShort ALTER Year SET DATA TYPE DATE;

(C Language Binding) I get Segmentation fault (core dumped) error.

The description of the segmentation fault error, obtained with the gdbdebugger, is :

    duckdb::DeprecatedMaterializeResult(duckdb_result*)

How to correctly modify via query the data type of the column "Year"?

I upgraded duckdb version from 1.2.0 to the latest 1.3.2. And the problem persists. So it must be something in the code used

This is the complete code:

        duckdb_database CSVDuckDB = NULL;
        duckdb_connection CSVDuckDBConnection = NULL;
        duckdb_result CSVDuckDBResult;
        duckdb_state CSVDuckDBState;

        const char* CSVDuckDBCompletePath = CSVDuckDBCompletePath_s.c_str();
        if (duckdb_open(CSVDuckDBCompletePath, &CSVDuckDB) == DuckDBError)
        {
            fprintf(stderr, "Failed to open CSVDuckDB\n");
            // Clean-up
            duckdb_destroy_result(&CSVDuckDBResult);
            duckdb_disconnect(&CSVDuckDBConnection);
            duckdb_close(&CSVDuckDB);
        }

        if (duckdb_connect(CSVDuckDB, &CSVDuckDBConnection) == DuckDBError)
        {
            fprintf(stderr, "Failed to open connection to CSVDuckDB\n");
            // Clean-up
            duckdb_destroy_result(&CSVDuckDBResult);
            duckdb_disconnect(&CSVDuckDBConnection);
            duckdb_close(&CSVDuckDB);
        }


        std::string justname_s_pure = justname_s_splitted[0];
        std::string gettablename = std::format("{}", justname_s_pure);
        std::cout << "gettablename= " << gettablename << std::endl;
        std::string query = "CREATE OR REPLACE TABLE ";
        query.append(gettablename); 



        query.append(" AS SELECT * FROM read_csv('");
        std::string getfilename = std::format("{}", fileName_s);
        query.append(getfilename);
        //query.append("', sample_size = -1);");
        query.append("', strict_mode = false, ignore_errors = true);");
        const char* query_c = query.c_str();

        std::cout << "query_c= " << query_c << std::endl;

        CSVDuckDBState = duckdb_query(CSVDuckDBConnection, query_c, &CSVDuckDBResult);


        if (CSVDuckDBState == DuckDBError)
        {
            std::cout << "duckdb_query produced DuckDBError" << std::endl;
            // Clean-up
            duckdb_destroy_result(&CSVDuckDBResult);
            duckdb_disconnect(&CSVDuckDBConnection);
            duckdb_close(&CSVDuckDB);
        }
        else
        {
            std::cout << "duckdb_query is ok" << std::endl;
            duckdb_result_type CSVDuckDBReturnType = duckdb_result_return_type(CSVDuckDBResult);
            std::cout << "CSVDuckDBReturnType= " << CSVDuckDBReturnType << std::endl;
        }

        query = "SELECT column_name, ordinal_position, data_type FROM information_schema.columns;";
        query_c = query.c_str();

        std::cout << "query_c= " << query_c << std::endl;

        CSVDuckDBState = duckdb_query(CSVDuckDBConnection, query_c, &CSVDuckDBResult);
        if (CSVDuckDBState == DuckDBError)
        {
            std::cout << "duckdb_query of information_schema.columns produced DuckDBError" << std::endl;
            // Clean-up
            duckdb_destroy_result(&CSVDuckDBResult);
            duckdb_disconnect(&CSVDuckDBConnection);
            duckdb_close(&CSVDuckDB);
        }        idx_t col_count = duckdb_column_count(&CSVDuckDBResult);
        idx_t row_count = duckdb_row_count(&CSVDuckDBResult);

        for (size_t row_idx = 0; row_idx < row_count; row_idx++)
        {
            std::cout << "row:" << row_idx << std::endl;
            char* val_idx_0 = duckdb_value_varchar(&CSVDuckDBResult, 0, row_idx);
            std::string val_idx_0_s (val_idx_0);
            printf("%s ", val_idx_0);
            char* val_idx_2 = duckdb_value_varchar(&CSVDuckDBResult, 2, row_idx);
            printf("%s ", val_idx_2);
            std::string val_idx_2_s (val_idx_2);
            if (
                 (Grasp::IsStringIntoVect(date_csv_fields, Grasp::StringToLower(val_idx_0_s)))
                 &&
                 (Grasp::IsStringIntoVect(duckdbNumericalDataTypes, val_idx_2_s))
               )
            {
                std::cout << "IT HAS TO BE CHANGED TO DATE TYPE" << std::endl;
                // https://duckdb.org/docs/stable/sql/statements/alter_table.html
                std::string query2 = "ALTER TABLE ";
                query2.append(gettablename);
                query2.append(" ALTER '");
                std::string getValIdx0s = std::format("{}", val_idx_0_s);
                query2.append(getValIdx0s);
                query2.append("' SET DATA TYPE USING MAKE_DATE('year',1,1);");
                const char* query2_c = query2.c_str();
                std::cout << "query2_c= " << query2_c << std::endl;;

                CSVDuckDBState = duckdb_query(CSVDuckDBConnection, query2_c, &CSVDuckDBResult);
                if (CSVDuckDBState == DuckDBError)
                {
                    std::cout << "duckdb_query of altering the data type produced DuckDBError" << std::endl;
                    // Clean-up
                    duckdb_destroy_result(&CSVDuckDBResult);
                    duckdb_disconnect(&CSVDuckDBConnection);
                    duckdb_close(&CSVDuckDB);
                }

            }
            printf("\n");


        }

        // Clean-up
        duckdb_destroy_result(&CSVDuckDBResult);
        duckdb_disconnect(&CSVDuckDBConnection);
        duckdb_close(&CSVDuckDB);

From the complete output:

gettablename= populationShort
query_c= CREATE OR REPLACE TABLE populationShort AS SELECT * FROM  
read_csv('/home/raphy/Downloads/CSVFiles/populationShort.csv',  
strict_mode = false, ignore_errors = true);
duckdb_query is ok
CSVDuckDBReturnType= 2
query_c= SELECT column_name, ordinal_position, data_type FROM 
information_schema.columns;
row:0
Country Name VARCHAR 
row:1
Country Code VARCHAR 
row:2
Year BIGINT IT HAS TO BE CHANGED TO DATE TYPE
query2_c= ALTER TABLE populationShort ALTER 'Year' SET DATA TYPE 
DATE;
duckdb_query of altering the data type produced DuckDBError

row:3
Segmentation fault (core dumped)

(base) raphy@raohy:/var/crash$ gdb /home/raphy/MyPrj/builddir/
MyPrj ./core_MyPrj.27982

Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x00007652aed97360 in    
duckdb::DeprecatedMaterializeResult(duckdb_result*) ()
   from /home/raphy/MyPrj/./src/DuckDB/lib/libduckdb.so

it is clear that the code causing the segmentation fault is the following part:

                // https://duckdb.org/docs/stable/sql/statements/alter_table.html
                std::string query2 = "ALTER TABLE ";
                query2.append(gettablename);
                query2.append(" ALTER '");
                std::string getValIdx0s = std::format("{}", val_idx_0_s);
                query2.append(getValIdx0s);
                query2.append("' SET DATA TYPE USING MAKE_DATE('year',1,1);"); // N.B.: or with the alternative presented in Update 01, see below in the question.
                const char* query2_c = query2.c_str();
                std::cout << "query2_c= " << query2_c << std::endl;;

                CSVDuckDBState = duckdb_query(CSVDuckDBConnection, query2_c, &CSVDuckDBResult);
                if (CSVDuckDBState == DuckDBError)
                {
                    std::cout << "duckdb_query of altering the data type produced DuckDBError" << std::endl;
                    // Clean-up
                    duckdb_destroy_result(&CSVDuckDBResult);
                    duckdb_disconnect(&CSVDuckDBConnection);
                    duckdb_close(&CSVDuckDB);
                }

This is the content of populationShort.csv file :

Country Name,Country Code,Year,Value
Aruba,ABW,1960,54922
Aruba,ABW,1961,55578
Aruba,ABW,1962,56320      

And these are the date_csv_fields and duckdbNumericalDataTypes vectors:

        std::vector<std::string> date_csv_fields = {
            "month",
            "year",
            "day"
        };
    
        std::vector<std::string> duckdbNumericalDataTypes = {
            "BIGINT",
            "DECIMAL",
            "FLOAT",
            "DOUBLE",
            "HUGEINT",
            "SMALLINT",
            "UBIGINT",
            "UHUGEINT",
            "UINTEGER",
            "USMALLINT",
            "UTINYINT"
        };            

I get

Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x000076efd0d97360 in     
duckdb::DeprecatedMaterializeResult(duckdb_result*) ()

also if the query executed is:

query2_c= ALTER TABLE populationShort ALTER 'Year' SET DATA TYPE DATE

Update 01

I modified query2 as follows to avoid pointed out SQL syntax problem, and separate concerns (SQL code vs C SIGSEGV):

                std::string query2 = "ALTER TABLE ";
                query2.append(gettablename);
                query2.append(" ALTER \"");
                std::string getValIdx0s = std::format("{}", val_idx_0_s);
                query2.append(getValIdx0s);
                //query2.append("' SET DATA TYPE USING MAKE_DATE('year',1,1);");
                query2.append("\" SET DATA TYPE DATE;");
                const char* query2_c = query2.c_str();
                std::cout << "query2_c= " << query2_c << std::endl;;

resulting in:

query2_c= ALTER TABLE populationShort ALTER "Year" SET DATA TYPE DATE;

But still get

Segmentation fault (core dumped)
Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x000072894dd97360 in duckdb::DeprecatedMaterializeResult(duckdb_result*) ()

Solution

  • CSVDuckDBResult overwriting

    Understanding

    Simplifying your code around uses of CSVDuckDBResult, we have:

        duckdb_result CSVDuckDBResult;
        […]
        CSVDuckDBState = duckdb_query(CSVDuckDBConnection, query_c, &CSVDuckDBResult);
        […]
        for (size_t row_idx = 0; row_idx < row_count; row_idx++)
        {
            char* val_idx_0 = duckdb_value_varchar(&CSVDuckDBResult, 0, row_idx);
            […]
            if ([…])
            {
                […]
                CSVDuckDBState = duckdb_query(CSVDuckDBConnection, query2_c, &CSVDuckDBResult);
                […]
            }
        }
        duckdb_destroy_result(&CSVDuckDBResult);
    

    If you look closely, results for both query_c and query2_c are written to the same CSVDuckDBResult variable, which is still in use in the loop while getting reused in the if([…]).

    Thus unrolling your loop over your fourth columns makes:

    Fixing

    So, as you are using two results concurrently, you just have to declare another result dedicated to the ALTER:

        duckdb_result CSVDuckDBResult;
        […]
        CSVDuckDBState = duckdb_query(CSVDuckDBConnection, query_c, &CSVDuckDBResult);
        […]
        for (size_t row_idx = 0; row_idx < row_count; row_idx++)
        {
            char* val_idx_0 = duckdb_value_varchar(&CSVDuckDBResult, 0, row_idx);
            […]
            if ([…])
            {
                duckdb_result CSVDuckDBResult2; // ← Declare here
                […]
                CSVDuckDBState = duckdb_query(CSVDuckDBConnection, query2_c, &CSVDuckDBResult2); // ← Change to 2
                […]
                duckdb_destroy_result(&CSVDuckDBResult2); // ← Get out of the "if (CSVDuckDBState == DuckDBError)", and change to 2
            }
        }
        duckdb_destroy_result(&CSVDuckDBResult);