c

Issue with interpreting chemical formulae containing multiple levels of parentheses


I am working on a C program to interpret chemical formulae that may include multiple levels of parentheses, and I am facing difficulties with correctly interpreting these formulae. The goal is to associate each atom with a variable based on a provided formula.
I am using the code below to process chemical formulae with varying levels of complexity. For example, for the formula {"2F2(SO4)3", 'A'}, the processing is correct; however, for {"Na(H2(SO3)4)5", 'B'}, the interpretation is incorrect. The expected result for {"Na(H2(SO3)4)5", 'B'} should be Na + H10 + S20 + O60, but it is resulting in Na + H10 + S10 + O15, indicating that the processing of nested parentheses is not functioning as expected.
From what I observed, the parser applies only the outermost parenthesis's multiplier to atoms inside a nested group. For example, in (H2(SO3)4)5 it multiplies 'O3' by 5 alone, instead of first multiplying by 4 (the inner group) and then by 5 (the outer group).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* A chemical element, identified solely by its symbol. */
typedef struct Atom {
    char symbol[3];  /* NUL-terminated symbol of at most two letters, e.g. "H" or "Na" */
} Atom;

/* Pairs one formula term with the single-letter variable that labels it. */
typedef struct Association {
    char term[50];    /* NUL-terminated formula text, e.g. "2H2O" or "3CO2" */
    char variable;    /* letter printed next to this term's atom counts */
} Association;

/*
 * Prints a table showing, for every atom in `atoms`, the count that each
 * term in `terms` contributes, labelled with the term's variable letter.
 *
 * atoms, numAtoms  - symbols to tabulate (one output row per atom)
 * terms, numTerms  - formula strings and their variable letters (one column per term)
 *
 * Writes to stdout and calls exit(1) if the count table cannot be allocated.
 *
 * Limitations of the parsing below (these cause the wrong counts for nested
 * groups such as "Na(H2(SO3)4)5"):
 *  - only the outermost parenthesised group is given a multiplier; anything
 *    nested inside it is scanned as a flat list of atoms;
 *  - innerCoefficient is not reset per atom, so an atom without its own count
 *    reuses the count of the previous atom in the group;
 *  - top-level counts are read as a single digit, and any digit met at top
 *    level (including the one following a group) overwrites `multiplier`.
 */
void printVariableAtomTable(Atom *atoms, int numAtoms, Association *terms, int numTerms) {
    // Print table header: one column label per term variable
    printf("\nTable of Association between Variables and Elements:\n");
    printf("Variable: ");
    for (int i = 0; i < numTerms; i++) {
        printf("%c  ", terms[i].variable);
    }
    printf("\n");

    // Create matrix to store the quantity of each atom associated with each variable
    // (table[atom][term]); rows are zero-initialised by calloc below
    int **table = (int **)malloc(numAtoms * sizeof(int *));
    if (table == NULL) {
        printf("Error: Failed to allocate memory for table.\n");
        exit(1);
    }

    for (int i = 0; i < numAtoms; i++) {
        table[i] = (int *)calloc(numTerms, sizeof(int));
        if (table[i] == NULL) {
            printf("Error: Failed to allocate memory for table.\n");
            exit(1);
        }
    }

    // Fill the table with the quantity of each atom associated with each variable
    for (int j = 0; j < numTerms; j++) {
        char *term = terms[j].term;
        int termCoefficient = 1;
        int multiplier = 1;

        // Tries to read an integer from the text right after the first '('.
        // termCoefficient stays 1 unless that text starts with a number, so a
        // leading coefficient such as the "2" in "2F2(SO4)3" is NOT read here;
        // it is picked up by the digit branch of the loop below instead.
        char *coeffEnd = strchr(term, '(');
        if (coeffEnd != NULL) {
            sscanf(coeffEnd + 1, "%d", &termCoefficient);
        }

        int k = 0;
        while (term[k] != '\0') {
            // Any digit reached at top level replaces `multiplier` (single digit only).
            // This also fires for the digit that follows a ')' because the group
            // branch leaves k pointing at it.
            if (isdigit(term[k])) {
                multiplier = term[k] - '0'; // Convert the numeric char to integer
                k++;
                continue;
            }

            if (isupper(term[k])) {
                // Top-level atom: uppercase letter followed by lowercase letters.
                // symbol has room for two letters plus NUL; the strncat loop is
                // not bounded, so a symbol with more lowercase letters would overflow it.
                char symbol[3] = { term[k], '\0' };
                int m = k + 1;
                while (term[m] != '\0' && islower(term[m])) {
                    strncat(symbol, &term[m], 1);
                    m++;
                }

                // Optional single-digit count directly after the symbol
                int elementCoefficient = 1;
                if (term[m] != '\0' && isdigit(term[m])) {
                    elementCoefficient = term[m] - '0';
                    m++;
                }

                // Linear search for the symbol's row; unknown symbols are ignored
                int elementIndex = -1;
                for (int n = 0; n < numAtoms; n++) {
                    if (strcmp(atoms[n].symbol, symbol) == 0) {
                        elementIndex = n;
                        break;
                    }
                }

                if (elementIndex != -1) {
                    table[elementIndex][j] += elementCoefficient * multiplier * termCoefficient;
                }

                k = m;
            } else if (term[k] == '(') {
                // Start of a group within parentheses
                int start = k + 1;
                int depth = 1;
                int end = start;

                // Find the end of the group within parentheses; on exit `end`
                // is the index just past the ')' matching this '('
                while (term[end] != '\0' && depth > 0) {
                    if (term[end] == '(') {
                        depth++;
                    } else if (term[end] == ')') {
                        depth--;
                    }
                    end++;
                }

                // Multiplier written after the outermost ')' (multi-digit via sscanf)
                int groupCoefficient = 1;
                if (term[end] != '\0' && isdigit(term[end])) {
                    sscanf(&term[end], "%d", &groupCoefficient);
                }

                // Flat scan of everything between the outer parentheses. Nested
                // '(' / ')' and the digits after a nested ')' fall into the else
                // branch and are skipped, so nested multipliers are never applied.
                // innerCoefficient lives outside the loop: an atom with no count
                // of its own keeps the value left by the previous atom.
                int innerCoefficient = 1;
                int n = start;
                while (n < end) {
                    if (isupper(term[n])) {
                        char groupSymbol[3] = { term[n], '\0' };
                        int m = n + 1;
                        while (term[m] != '\0' && islower(term[m])) {
                            strncat(groupSymbol, &term[m], 1);
                            m++;
                        }

                        int groupIndex = -1;
                        for (int a = 0; a < numAtoms; a++) {
                            if (strcmp(atoms[a].symbol, groupSymbol) == 0) {
                                groupIndex = a;
                                break;
                            }
                        }

                        // The atom's count is only parsed (and skipped) when the
                        // symbol is known; otherwise its digits are skipped by the
                        // else branch on later iterations
                        if (groupIndex != -1) {
                            if (term[m] != '\0' && isdigit(term[m])) {
                                sscanf(&term[m], "%d", &innerCoefficient);
                                while (term[m] != '\0' && isdigit(term[m])) {
                                    m++;
                                }
                            }
                            table[groupIndex][j] += termCoefficient * innerCoefficient * groupCoefficient * multiplier;
                        }

                        n = m;
                    } else {
                        n++;
                    }
                }

                // Resume after the closing ')'; the group's multiplier digit, if
                // any, is then consumed by the top-level digit branch
                k = end;
            } else {
                k++;
            }
        }
    }

    // Print the table of association between variables and elements:
    // a count of 1 is printed as the bare variable, 0 as "0<variable>"
    for (int i = 0; i < numAtoms; i++) {
        printf("%s: ", atoms[i].symbol);
        for (int j = 0; j < numTerms; j++) {
            if (table[i][j] != 0) {
                if (table[i][j] == 1) {
                    printf("%c  ", terms[j].variable);
                } else {
                    printf("%d%c  ", table[i][j], terms[j].variable);
                }
            } else {
                printf("0%c  ", terms[j].variable);
            }
        }
        printf("\n");
    }

    // Free allocated memory for the table
    for (int i = 0; i < numAtoms; i++) {
        free(table[i]);
    }
    free(table);
}

/* Demo driver: tabulates two sample formulae against a fixed list of elements. */
int main(void) {
    Atom elementList[] = { {"F"}, {"O"}, {"S"}, {"H"}, {"Na"} };
    Association termList[] = { {"2F2(SO4)3", 'A'}, {"Na(H2(SO3)4)5", 'B'} };

    /* Element counts are derived from the array sizes so the lists can be edited freely. */
    const int elementCount = (int)(sizeof elementList / sizeof elementList[0]);
    const int termCount = (int)(sizeof termList / sizeof termList[0]);

    printVariableAtomTable(elementList, elementCount, termList, termCount);
    return EXIT_SUCCESS;
}

Result

Table of Association between Variables and Elements:
Variable: A  B  
F: 4A  0B  
O: 24A  15B  
S: 6A  10B  
H: 0A  10B  
Na: 0A  B  

How can I modify my code to correct the interpretation of formulae with multiple levels of parentheses?
Is there a more efficient way to handle the analysis of chemical formulae with varying complexity, including nested parentheses?
I appreciate any help or suggestions to solve this formula interpretation problem. Thank you!


Solution

  • To handle nested parentheses, I rewrote the parser recursively: each parenthesised group is processed by a recursive call that receives the product of all enclosing coefficients as its multiplier. Below is the modified version of the code, which yields the expected results for examples like "Na(H2(SO3)4)5".

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <ctype.h>
    
    // One chemical element, stored as its NUL-terminated symbol.
    typedef struct Atom {
        char symbol[3];  // one or two letters plus the terminator (e.g., "H", "Na")
    } Atom;
    
    // Links a formula term to the single-letter unknown that labels it in the output.
    typedef struct Association {
        char term[50];    // NUL-terminated formula text (e.g., "2H2O", "3CO2")
        char set;         // letter of the unknown printed beside this term's counts
    } Association;
    
    // Forward declaration: printUnknownsTable calls this function before its definition.
    // It adds the atom counts found in `formula`, scaled by `coefficient`, to column
    // `termIndex` of `table` (one row per entry of `atoms`).
    void processChemicalFormula(char *formula, int coefficient, int table[][50], Atom *atoms, int numAtoms, int termIndex);
    
    /*
     * Prints, for every atom in `atoms`, the number of that atom contributed by
     * each term in `terms`, labelled with the term's unknown letter.
     *
     * atoms, numAtoms  - symbols to tabulate (one output row per atom)
     * terms, numTerms  - formula strings and their unknown letters (one column each)
     *
     * At most 50 terms are supported because processChemicalFormula takes a table
     * with 50 columns. Invalid counts or an allocation failure are reported on
     * stderr and the function returns without printing a table body.
     */
    void printUnknownsTable(Atom *atoms, int numAtoms, Association *terms, int numTerms) {
        enum { MAX_TERMS = 50 };  // must match the table[][50] parameter of processChemicalFormula

        if (numAtoms < 0 || numTerms < 0 || numTerms > MAX_TERMS) {
            fprintf(stderr, "Error: invalid table size (at most %d terms are supported).\n", MAX_TERMS);
            return;
        }

        printf("\nAssociation Table between Unknowns and Elements:\n");
        printf("X: ");

        // Print header with unknowns
        for (int i = 0; i < numTerms; i++) {
            printf("%c  ", terms[i].set);
        }
        printf("\n");

        if (numAtoms == 0) {
            return;  // no rows to print; also avoids a zero-sized allocation
        }

        // Heap-allocated, zero-initialised count matrix: table[atom][term].
        // calloc replaces the former variable-length array, which was undefined
        // for numAtoms == 0 and could exhaust the stack for large inputs.
        int (*table)[MAX_TERMS] = calloc((size_t)numAtoms, sizeof *table);
        if (table == NULL) {
            fprintf(stderr, "Error: Failed to allocate memory for table.\n");
            return;
        }

        // Fill the table with quantities of each atom associated with each unknown
        for (int j = 0; j < numTerms; j++) {
            processChemicalFormula(terms[j].term, 1, table, atoms, numAtoms, j);
        }

        // One row per atom; "%d" already prints 0 for atoms absent from a term
        for (int i = 0; i < numAtoms; i++) {
            printf("%s: ", atoms[i].symbol);
            for (int j = 0; j < numTerms; j++) {
                printf("%d%c  ", table[i][j], terms[j].set);
            }
            printf("\n");
        }

        free(table);
    }
    
    // Recursive function to process the chemical formula and fill the association table
    void processChemicalFormula(char *formula, int coefficient, int table[][50], Atom *atoms, int numAtoms, int termIndex) {
        int len = strlen(formula);
        int i = 0;
        int termCoefficient = 1;  // Initialize term coefficient as 1 by default
    
        // Check if the first character of the formula is a digit
        if (isdigit(formula[i])) {
            // Use sscanf to read the coefficient from the current position (i)
            sscanf(&formula[i], "%d", &termCoefficient);
    
            // Update index (i) to move past the read coefficient
            while (isdigit(formula[i]) && i < len) {
                i++;
            }
    
            // Multiply termCoefficient by the total coefficient of the term
            termCoefficient *= coefficient;
        }
    
        while (i < len) {
            if (isalpha(formula[i])) {
                // Start of an atom symbol
                char symbol[3] = { formula[i], '\0' };
                i++;
    
                while (islower(formula[i]) && i < len) {
                    strncat(symbol, &formula[i], 1);
                    i++;
                }
    
                // Check if there's a numeric coefficient associated with the atom
                int atomCoefficient = 1;
                if (isdigit(formula[i])) {
                    sscanf(&formula[i], "%d", &atomCoefficient);
                    while (isdigit(formula[i]) && i < len) {
                        i++;
                    }
                }
    
                // Find the index of the atom in the list of atoms
                int atomIndex = -1;
                for (int k = 0; k < numAtoms; k++) {
                    if (strcmp(atoms[k].symbol, symbol) == 0) {
                        atomIndex = k;
                        break;
                    }
                }
    
                // Fill the table with the quantity of atoms associated with the term and unknown
                if (atomIndex != -1) {
                    table[atomIndex][termIndex] += coefficient * termCoefficient * atomCoefficient;
                }
            } else if (formula[i] == '(') {
                // Start of a group within parentheses
                int j = i + 1;
                int depth = 1;
    
                // Find the end of the group within parentheses
                while (j < len && depth > 0) {
                    if (formula[j] == '(') {
                        depth++;
                    } else if (formula[j] == ')') {
                        depth--;
                    }
                    j++;
                }
    
                // Process the coefficient of the group within parentheses, if any
                int groupCoefficient = 1;
                if (j < len && isdigit(formula[j])) {
                    sscanf(&formula[j], "%d", &groupCoefficient);
                    while (isdigit(formula[j]) && j < len) {
                        j++;
                    }
                }
    
                // Recursively process the group within parentheses
                processChemicalFormula(&formula[i+1], coefficient * termCoefficient * groupCoefficient, table, atoms, numAtoms, termIndex);
                i = j; // Update index to after the processed group
            } else {
                i++;
            }
        }
    }
    
    // Demo driver: builds the sample element and term lists and prints their table.
    int main(void) {
        Atom sampleAtoms[] = { {"F"}, {"O"}, {"S"}, {"H"}, {"Na"} };
        Association sampleTerms[] = { {"2F2(SO4)3", 'A'}, {"Na(H2(SO3)4)5", 'B'} };

        // Counts follow the initialiser lists above
        int atomCount = (int)(sizeof sampleAtoms / sizeof sampleAtoms[0]);
        int termCount = (int)(sizeof sampleTerms / sizeof sampleTerms[0]);

        printUnknownsTable(sampleAtoms, atomCount, sampleTerms, termCount);
        return EXIT_SUCCESS;
    }
    

    Result:

    Association Table between Unknowns and Elements:
    X: A  B  
    F: 4A  0B  
    O: 24A  60B  
    S: 6A  20B  
    H: 0A  10B  
    Na: 0A  1B