cstringparsingflex3bison

What is the proper way to use string return value for Bison?


What is the proper way to get the resulting string from the one or more tokens that make up the rule that Bison has found?

/*
 * timer: a TILDE followed by an optional name (WORD or MULTIWORD) and an
 * amount.  The semantic value is a string.
 *
 * Without a name, the amount string is passed through unchanged.  With a
 * name, a new "<name> <amount>" string is allocated with malloc() and both
 * component strings are released.
 *
 * NOTE(review): the free() calls below are only valid if the lexer hands out
 * heap-allocated copies of token text (e.g. strdup(yytext)) -- confirm.
 */
timer:
    TILDE amount  {
        printf("Timer: Amount:%s\n", $2);
        /* No copy: $$ refers to the very same buffer as $2. */
        $$ = $2;
      }

  | TILDE WORD amount {
        printf("Timer: %s Amount:%s\n", $2, $3);
        /* Both parts, one separating space, and the terminator. */
        size_t len = strlen($2) + strlen($3) + 2;
        $$ = malloc(len);
        if ($$ == NULL) {
            YYABORT;
        }
        snprintf($$, len, "%s %s", $2, $3);
        /* The parts have been copied into $$; release them. */
        free($2);
        free($3);
      }

  | TILDE MULTIWORD amount  {
        printf("Timer: %s Amount:%s\n", $2, $3);
        /* Both parts, one separating space, and the terminator. */
        size_t len = strlen($2) + strlen($3) + 2;
        $$ = malloc(len);
        if ($$ == NULL) {
            YYABORT;
        }
        snprintf($$, len, "%s %s", $2, $3);
        /* Release both parts, matching the WORD alternative above
           (the amount string was previously leaked here). */
        free($2);
        free($3);
      }
  ;

timer, amount, WORD, and MULTIWORD are all of type string.

For example, if the first rule matches, meaning there is a TILDE "~" and an amount (usually a number like "3" in string format), what is the best way to get the result value, i.e. to give 'timer' a value of just the amount?

If the second rule were to match, how would I go about adding WORD and amount into one string and putting them in the $$ value. I couldn't find anything in the Bison documentation about this.

Edit: Forgot to mention the problem. The string that is printed out has more than just what I wanted to be passed on. For example, with an amount defined by:

/*
 * amount: the LCURL ... RCURL group that carries a quantity.  The semantic
 * value is a string.
 *
 * Alternatives that format or concatenate allocate a fresh buffer with
 * malloc(); the bare WORD / MULTIWORD alternatives pass the token's string
 * through unchanged.
 *
 * NOTE(review): the alternatives that build a new string do not release $2
 * or $3.  If the lexer heap-allocates token text, those copies leak here --
 * confirm ownership before adding free() calls.
 */
amount:
    /* an empty amount - for one word timers */
    LCURL RCURL {
      /* A single blank stands in for the missing amount. */
      $$ = malloc(2);
      if ($$ == NULL) {
        YYABORT;
      }
      $$[0] = ' ';
      $$[1] = '\0';
    }

  | LCURL NUMBER RCURL  {
      /* Measure first: a large double can print far more than 100
         characters with %.3lf, so a fixed-size buffer is not safe. */
      int len = snprintf(NULL, 0, "%.3lf", $2);
      if (len < 0) {
        YYABORT;
      }
      $$ = malloc((size_t)len + 1);
      if ($$ == NULL) {
        YYABORT;
      }
      snprintf($$, (size_t)len + 1, "%.3lf", $2);
    }

  | LCURL NUMBER UNIT RCURL {
      /* The unit text is used as delivered; the leading % is expected to
         have been dropped by the lexer already -- confirm. */
      int len = snprintf(NULL, 0, "%.3lf %s", $2, $3);
      if (len < 0) {
        YYABORT;
      }
      $$ = malloc((size_t)len + 1);
      if ($$ == NULL) {
        YYABORT;
      }
      snprintf($$, (size_t)len + 1, "%.3lf %s", $2, $3);
    }

  | LCURL WORD RCURL  {
      /* No copy: $$ refers to the same buffer as the WORD token. */
      $$ = $2;
    }

  | LCURL WORD UNIT RCURL {
      /* Word and unit are joined with no separator. */
      size_t len = strlen($2) + strlen($3) + 1;
      $$ = malloc(len);
      if ($$ == NULL) {
        YYABORT;
      }
      snprintf($$, len, "%s%s", $2, $3);
    }

  | LCURL MULTIWORD RCURL {
      /* No copy: $$ refers to the same buffer as the MULTIWORD token. */
      $$ = $2;
    }

  | LCURL MULTIWORD UNIT RCURL {
      /* Words and unit are joined with no separator. */
      size_t len = strlen($2) + strlen($3) + 1;
      $$ = malloc(len);
      if ($$ == NULL) {
        YYABORT;
      }
      snprintf($$, len, "%s%s", $2, $3);
    }
  ;

When I put $2 in the string it will also include the RCURL behind it. This leads to lots of random strings and incorrect parsing.

The full lexer file:

%{
  #include "Cooklang.tab.h"
  #include <stdlib.h>
  void showError();

%}


/* Named patterns (Flex definitions) from which the token rules are built. */

/* Single symbol and punctuation characters, line break, and blank/tab. */
SYMB_CHAR             "$"|"="|"+"|"-"|"_"|"*"|"`"
PUNC_CHAR             "!"|"?"|","|"."|"/"|"&"|"("|")"|":"
NEW_LINE              "\n"
WHITE_SPACE           " "|"\t"
ALPHA_CHAR            [a-zA-Z]
/* NOTE(review): COOKLANG_CHAR is not referenced by any other definition or
   rule shown in this lexer -- confirm whether it is still needed. */
COOKLANG_CHAR         ">"|"|"|"~"|"@"|"#"|":"|"{"|"}"|"%"



ZERO                  "0" 
/* NOTE(review): despite its name this class is [0-9] and therefore also
   matches 0 -- confirm whether [1-9] was intended. */
NON_ZERO_DIGIT        [0-9]


/* Numeric patterns.  Each one also absorbs any blanks/tabs that follow it:
   DIGIT      - one digit
   INTEGER    - a lone zero, or a digit followed by more DIGITs
   DECIMAL    - INTEGER "." INTEGER
   FRACTIONAL - INTEGER "/" INTEGER, with optional whitespace around the slash */
DIGIT                 ({ZERO}|{NON_ZERO_DIGIT}){WHITE_SPACE}*
INTEGER               ({ZERO}|({NON_ZERO_DIGIT}{DIGIT}*)){WHITE_SPACE}*
DECIMAL               {INTEGER}"."{INTEGER}{WHITE_SPACE}*
FRACTIONAL            {INTEGER}{WHITE_SPACE}*"/"{WHITE_SPACE}*{INTEGER}{WHITE_SPACE}*


/* One or more letters, digits or symbol characters, then optional trailing
   whitespace.  Because DIGIT carries its own trailing whitespace, blanks may
   also follow any digit inside a WORD. */
WORD                  ({ALPHA_CHAR}|{DIGIT}|{SYMB_CHAR})+{WHITE_SPACE}*

/* A WORD prefixed with # or @ respectively. */
HWORD                 "#"{WORD}
ATWORD                "@"{WORD}

/* Two or more WORDs in a row. */
MULTIWORD             {WORD}{2,}


/* A percent sign, optional whitespace, then optionally either a run of
   MULTIWORDs or a run of punctuation characters. */
UNIT                  "%"{WHITE_SPACE}*({MULTIWORD}*|{PUNC_CHAR}*)?



%%

                      /* Blanks and tabs between tokens are matched and dropped (no action). */
[ \t]

                      /* Single-character structural tokens. */
"{"                   {return LCURL;}
"}"                   {return RCURL;}
"~"                   {return TILDE;}

{ATWORD}              { yytext++;
                        /* The increment above steps past the leading @ character. */
                        /* NOTE: yylval.string points into the scanner's own
                           buffer (yytext); no copy of the text is made. */
                        yylval.string = yytext;
                        return ATWORD;
}

{HWORD}               { yytext++;
                        /* The increment above steps past the leading # character. */
                        /* NOTE: yylval.string points into the scanner's own
                           buffer (yytext); no copy of the text is made. */
                        yylval.string = yytext;
                        return HWORD;
                      }

                      
{UNIT}                { yytext++;
                        /* The increment above steps past the leading % character. */
                        /* NOTE: yylval.string points into the scanner's own
                           buffer (yytext); no copy of the text is made. */
                        yylval.string = yytext;
                        return UNIT;}

                      /* Numeric tokens: the matched text is converted with strtod. */
{DIGIT}               { yylval.number = strtod(yytext, NULL); return NUMBER;}
{INTEGER}             { yylval.number = strtod(yytext, NULL); return NUMBER;}
{DECIMAL}             { yylval.number = strtod(yytext, NULL); return NUMBER;}
{FRACTIONAL}          { char * tok = strtok(yytext, "/");
                        /* strtok splits the matched text at the slash,
                           modifying yytext in place. */

                        double first = strtod(tok, NULL);

                        tok = strtok(NULL, "/");

                        double second = strtod(tok, NULL);

                        /* NOTE(review): a zero denominator is not checked. */
                        double final = first/second;

                        yylval.number = final;

                        return NUMBER;}

{WORD}                { yylval.string = yytext;
                        /* NOTE: the value points into the scanner's own
                           buffer (yytext); no copy of the text is made. */
                        return WORD;
                      }
{MULTIWORD}           { yylval.string = yytext;
                        /* NOTE: the value points into the scanner's own
                           buffer (yytext); no copy of the text is made. */
                        return MULTIWORD;
                      }



{PUNC_CHAR}           { yylval.character = yytext[0];
                        /* Prints the matched character before returning the token. */
                        printf("char: |%c|", yytext[0]);
                        return PUNC_CHAR; }

{NEW_LINE}            {return NL;}




%%

/*
 * Program entry point.
 *
 * Parses the file named by the first command-line argument, or standard
 * input when no argument is given.
 *
 * Returns 0 once parsing has run, or EXIT_FAILURE if the named file cannot
 * be opened.
 */
int main( int argc, char ** argv ){
  /* Step past the program name so argv[0] is the first real argument. */
  ++argv;
  --argc;

  if( argc > 0 ){
    yyin = fopen(argv[0], "r");
    if( yyin == NULL ){
      /* Report the failure instead of handing the scanner a NULL stream,
         which would make it read standard input without any warning. */
      perror(argv[0]);
      return EXIT_FAILURE;
    }
  } else {
    yyin  = stdin;
  }

  yyparse();

  /* NOTE(review): one extra scanner call after the parser has returned;
     kept from the original -- confirm whether it is still wanted. */
  yylex();

  printf("\n");

  /* Close the input only if this function opened it. */
  if( yyin != NULL && yyin != stdin ){
    fclose(yyin);
  }

  return 0;
}

Solution

  • This lexer action is incorrect:

    yylval.string = yytext;  /* NEVER do this */
    

    yytext is a pointer to a temporary storage buffer owned by the lexer. Its contents are only valid until the next call to the lexer; after that, the pointer may no longer be valid (if the buffer was reallocated) or the contents will have changed.

    So you must copy the string pointed to by yytext if you need it to outlast the lexer action (which you certainly need if you want to pass the string to a parser).

    You can just use yylval.string = strdup(yytext) if you have strdup (or include an implementation), or you can dynamically allocate the storage yourself; if you choose the latter solution, recall that yyleng contains the length of the text so there is no need for strlen:

        /* +1 for the null terminator */
    yylval.string = malloc(yyleng + 1);  
        /* should check for allocation failure */
    memcpy(yylval.string, yytext, yyleng + 1);
    

    Don't forget to free() the copy when you no longer need it.