compiler-construction bison flex-lexer yacc lex

Why does Bison (Yacc) print new lines for apparently no reason?


I am working towards building a compiler for a simple formatting language using Flex and Bison.

At this stage, I still have not printed anything to yyout anywhere. I have some error cases where something is printed to the output file, but that is irrelevant to the example below. All my other print statements print to the console. Therefore, I expect the output file to be empty. However, with the following input file:

\begin {document}
    
\tabsize(5)
\title{"Why I Love Compiler Design"}
\author{"COMP421 Student"}
\date{29/12/2016}
\pagesetup{30,100}
    
\end{document}

The output file generated is:

output file screenshot

There are 9 empty lines, corresponding to the 9 lines I had in my input file. The output I expect, however, is only 1 empty line.

This is my .l file:

%{
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "y.tab.h"

    void yyerror(const char *);
    int yylex(void);

    extern FILE *yyout;
    extern int  yyparse();
%}

%option yylineno

%%

^\\ { printf("LEX returned token BSLASH\n"); return BSLASH; }  /* '^' anchors the pattern: only a backslash at the start of a line is a BSLASH */
\{  { printf("LEX returned token LBRACE\n"); return LBRACE; }
\}  { printf("LEX returned token RBRACE\n"); return RBRACE; }
\(  { printf("LEX returned token LPAREN\n"); return LPAREN; }
\)  { printf("LEX returned token RPAREN\n"); return RPAREN; }
,   { printf("LEX returned token COMMA\n"); return COMMA; }

    /* environment keywords; every action traces the token name to stdout before returning it */
begin    { printf("LEX returned token BEGIN_\n"); return BEGIN_; }
end      { printf("LEX returned token END\n"); return END; }
document { printf("LEX returned token DOCUMENT\n"); return DOCUMENT; }

    /* document property names */
pagesetup { printf("LEX returned token PAGESETUP\n"); return PAGESETUP; }
tabsize   { printf("LEX returned token TABSIZE\n"); return TABSIZE; }
title     { printf("LEX returned token TITLE\n"); return TITLE; }
author    { printf("LEX returned token AUTHOR\n"); return AUTHOR; }
date      { printf("LEX returned token DATE\n"); return DATE; }

    /* value-carrying tokens: a date pattern, a digit run containing at least one
       non-zero digit (converted with atoi), and a double-quoted string */
    /* NOTE(review): sValue is assigned yytext itself, not a copy -- confirm the
       parser never reads it after a later yylex() call has reused the buffer */
    /* NOTE(review): ".*" is greedy, so two strings on one line would be returned
       as a single STRING -- confirm that is acceptable */
(((0[1-9]|[12][0-9]|30)[-/ ]?(0[13-9]|1[012])|31[-/ ]?(0[13578]|1[02])|(0[1-9]|1[0-9]|2[0-8])[-/ ]?02)[-/ ]?[0-9]{4}|29[-/ ]?02[-/ ]?([0-9]{2}(([2468][048]|[02468][48])|[13579][26])|([13579][26]|[02468][048]|0[0-9]|1[0-6])00))  { printf("LEX returned token DDMMYYYYDATE\n"); yylval.sValue = yytext; return DDMMYYYYDATE; }
[0-9]*[1-9][0-9]*   { printf("LEX returned token INTEGER\n"); yylval.iValue = atoi(yytext); return INTEGER; }
\".*\"              { printf("LEX returned token STRING\n"); yylval.sValue = yytext; return STRING; }

    /* skip whitespace which is not part of a string */
[ \t] ;

    /* any other single character is reported through yyerror and no token is
       returned; in flex '.' matches every character except a newline */
. yyerror("invalid character");

%%

/*
 * Scanner/parser driver.
 *
 * Usage: <program> inputFileName outputFileName
 *
 * Opens argv[1] for reading as yyin and argv[2] for writing as yyout, runs
 * yyparse(), then closes both streams.
 *
 * Returns EXIT_SUCCESS when the arguments are valid, both files open, the
 * parse succeeds (yyparse() returns 0) and the output file closes cleanly;
 * EXIT_FAILURE otherwise.
 */
int main(int argc, char *argv[]) {
    int status;

    if (argc != 3) {
        /* A usage problem is not a syntax error, so it is reported directly
         * instead of through yyerror(). */
        fprintf(stderr, "ERROR You need 2 args: inputFileName outputFileName\n");
        return EXIT_FAILURE;
    }

    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    yyout = fopen(argv[2], "w");
    if (yyout == NULL) {
        perror(argv[2]);
        fclose(yyin);
        return EXIT_FAILURE;
    }

    status = (yyparse() == 0) ? EXIT_SUCCESS : EXIT_FAILURE;

    fclose(yyin);
    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(yyout) != 0) {
        perror(argv[2]);
        status = EXIT_FAILURE;
    }

    return status;
}

This is my .y file:

%{
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "y.tab.h"

    void yyerror(const char *);
    int yylex(void);

    extern FILE *yyout;
    extern int  yylineno;

    int docPropertyCounters[5];

    /* Document properties tracked by the parser. The enumerator values are
     * used as indices into docPropertyCounters and into the name table below,
     * so their order must match that table. */
    typedef enum {PAGE_SETUP, TAB_SIZE, DOC_TITLE, DOC_AUTHOR, DOC_DATE} document_property;

    /*
     * Return the source-level spelling of a document property, for use in
     * diagnostics.
     *
     * indexOfProperty: the property to name.
     *
     * Returns a pointer to a static string that must not be modified or
     * freed. A value outside the enumeration yields "(unknown property)"
     * instead of reading past the end of the table.
     */
    static inline char *stringFromDocPropertyEnum(document_property indexOfProperty) {
        static char *const strings[] = { "\\pagesetup{}", "\\tabsize()", "\\title{}", "\\author{}", "\\date{}"};
        const size_t count = sizeof strings / sizeof strings[0];

        /* The cast to size_t also rejects negative values, which wrap to a
         * large unsigned number. */
        if ((size_t)indexOfProperty >= count) {
            return "(unknown property)";
        }
        return strings[indexOfProperty];
    }
%}

%union { 
    int iValue;
    char* sValue;
}; 

%start file

%token BSLASH LBRACE RBRACE LPAREN RPAREN COMMA

%token BEGIN_ END DOCUMENT

%token PAGESETUP TABSIZE TITLE AUTHOR DATE

%token <iValue> INTEGER

%token <sValue> DDMMYYYYDATE STRING

%%

/* Start symbol: a complete document, or nothing at all. */
file: beginDocument docProperties endDocument
            {
                /* Once the whole document has been reduced, check that every
                   document property was seen exactly once. Each diagnostic
                   ends in a newline so consecutive messages stay on
                   separate lines. */
                const size_t propertyCount = sizeof(docPropertyCounters) / sizeof(docPropertyCounters[0]);

                for (size_t i = 0; i < propertyCount; i++) {
                    const char *propertyName = stringFromDocPropertyEnum((document_property)i);

                    if (docPropertyCounters[i] < 1) {
                        fprintf(stderr, "SYNTAX ERROR: Your source file does not contain the required document property %s\n", propertyName);
                    } else if (docPropertyCounters[i] > 1) {
                        fprintf(stderr, "SYNTAX ERROR: Your source file contains more than one instance of the document property %s\n", propertyName);
                    }
                }
            }
          | /* empty: an input that yields no tokens is accepted without any checks */
          ;

/* Opening marker: the token sequence for \begin{document}. */
beginDocument: BSLASH BEGIN_ LBRACE DOCUMENT RBRACE;

/* Zero or more document properties, in any order (left-recursive list). */
docProperties: docProperties docProperty
               |
               ;                

/* One property of any kind. Each alternative bumps that property's counter so
   the action on the `file` rule can check how many times it appeared. */
docProperty:    pageSetupProperty { docPropertyCounters[PAGE_SETUP]++; }
                | tabSizeProperty { docPropertyCounters[TAB_SIZE]++; }
                | titleProperty   { docPropertyCounters[DOC_TITLE]++; }
                | authorProperty  { docPropertyCounters[DOC_AUTHOR]++; }
                | dateProperty    { docPropertyCounters[DOC_DATE]++; }
                ;   

/* \pagesetup{INTEGER,INTEGER} */
pageSetupProperty: BSLASH PAGESETUP LBRACE INTEGER COMMA INTEGER RBRACE;

/* \tabsize(INTEGER) -- note parentheses here, braces everywhere else */
tabSizeProperty: BSLASH TABSIZE LPAREN INTEGER RPAREN;

/* \title{STRING} */
titleProperty: BSLASH TITLE LBRACE STRING RBRACE;

/* \author{STRING} */
authorProperty: BSLASH AUTHOR LBRACE STRING RBRACE;

/* \date{DDMMYYYYDATE} */
dateProperty: BSLASH DATE LBRACE DDMMYYYYDATE RBRACE;

/* Closing marker: the token sequence for \end{document}. */
endDocument: BSLASH END LBRACE DOCUMENT RBRACE;

%%

/*
 * End-of-input hook called by the scanner.
 * Always returns 1, which tells the scanner there is no further input.
 */
int yywrap(void)
{
    const int no_more_input = 1;

    return no_more_input;
}

/*
 * Error reporter used by the parser and the scanner.
 *
 * str: the message to report.
 *
 * Writes one line to stderr holding the current value of yylineno followed
 * by the message.
 */
void yyerror(const char* str)
{
    FILE *const sink = stderr;
    const int line = yylineno;

    fprintf(sink, "SYNTAX ERROR near line [%d]: %s\n", line, str);
}

Solution

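  • The empty lines are the line endings of your input file (\n, or \r\n on Windows). No pattern in your lexer matches them: the whitespace pattern only covers space and tab, and . does not match a newline. Flex therefore falls back to its default rule, which copies any unmatched text to yyout.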

    Perhaps you should have:

    [ \t\r\n]      ;
    

    You should also be careful about using C-style comments in the specification. A comment that is not inside C code can be treated as a pattern. I always advise students to put C-style comments only in actual C code. For example, it is better to do this:

    [ \t\r\n]      ;  /* skip whitespace which is not part of a string */
    

    and never put comments elsewhere. Others may disagree, but I find it avoids an awful lot of grief in flex and bison.

    PS: I haven't tested my suggestion on your code....