I'm developing a compiler for a C-like language using Bison and Flex. The compiler, at the moment, is able to recognize a language with declaration, assignment and print statements and arithmetic and logic expressions (using only int variables). It generates a 3AC (and some instruction for managing memory). This is my Bison code:
%{
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include "list.h"
int yylex();
void yyerror(char *s);
TList list = NULL;
int i=0;
char* tmp() {
char* t = (char*)malloc(sizeof(char*));
sprintf(t, "t%d", i);
i++;
return t;
}
%}
%union {
int number;
char* identifier;
}
%token <number> NUM
%token <identifier> ID
%token PRINT INT ENDFILE
%left '+' '-'
%left '*' '/'
%right UMINUS
%left OR
%left AND
%right NOT
%nonassoc EQ LT GT LE GE NE
%type <identifier> expr
%type <identifier> comp
%type <identifier> bexpr
%%
program : lstmt ENDFILE { return 0; }
;
lstmt : lstmt stmt ';'
| stmt ';'
| lstmt openb lstmt closeb
| openb lstmt closeb
;
openb : '{' { printf("list = insertElement(list);\n"); }
;
closeb : '}' { printf("list = removeElement(list);\n"); }
;
stmt : INT ID { printf("addVar(\"%s\", &list->table);\n", $2); }
| INT ID '=' NUM {
printf("addVar(\"%s\", &list->table);\n", $2);
printf("setVarList(\"%s\", %d, list);\n", $2, $4);
}
| ID '=' expr { printf("setVarList(\"%s\", %s, list);\n", $1, $3); }
| PRINT '(' ID ')' { printf("printf(\"%%s: %%d\\n\", \"%s\", getVarList(\"%s\", list));\n", $3, $3); }
| ID '=' bexpr { printf("setVarList(\"%s\", %s, list);\n", $1, $3); }
;
bexpr : bexpr OR bexpr {
$$ = tmp();
printf("%s = %s || %s;\n", $$, $1, $3);
}
| bexpr AND bexpr {
$$ = tmp();
printf("%s = %s && %s;\n", $$, $1, $3);
}
| expr OR bexpr {
$$ = tmp();
printf("%s = %s || %s;\n", $$, $1, $3);
}
| expr AND bexpr {
$$ = tmp();
printf("%s = %s && %s;\n", $$, $1, $3);
}
| bexpr OR expr {
$$ = tmp();
printf("%s = %s || %s;\n", $$, $1, $3);
}
| bexpr AND expr {
$$ = tmp();
printf("%s = %s && %s;\n", $$, $1, $3);
}
| NOT bexpr {
$$ = tmp();
printf("%s = !%s;\n", $$, $2);
}
| '(' bexpr ')' { $$ = $2; }
| comp { $$ = $1; }
;
comp : expr LT expr {
$$ = tmp();
printf("%s = %s < %s;\n", $$, $1, $3);
}
| expr LE expr {
$$ = tmp();
printf("%s = %s <= %s;\n", $$, $1, $3);
}
| expr GT expr {
$$ = tmp();
printf("%s = %s > %s;\n", $$, $1, $3);
}
| expr GE expr {
$$ = tmp();
printf("%s = %s >= %s;\n", $$, $1, $3);
}
| expr EQ expr {
$$ = tmp();
printf("%s = %s == %s;\n", $$, $1, $3);
}
| expr NE expr {
$$ = tmp();
printf("%s = %s != %s;\n", $$, $1, $3);
}
| expr AND expr {
$$ = tmp();
printf("%s = %s && %s;\n", $$, $1, $3);
}
| expr OR expr {
$$ = tmp();
printf("%s = %s || %s;\n", $$, $1, $3);
}
| NOT expr {
$$ = tmp();
printf("%s = !%s;\n", $$, $2);
}
;
expr : expr '+' expr {
$$ = tmp();
printf("%s = %s + %s;\n", $$, $1, $3);
}
| expr '-' expr {
$$ = tmp();
printf("%s = %s - %s;\n", $$, $1, $3);
}
| expr '*' expr {
$$ = tmp();
printf("%s = %s * %s;\n", $$, $1, $3);
}
| expr '/' expr {
$$ = tmp();
printf("%s = %s / %s;\n", $$, $1, $3);
}
| '(' expr ')' { $$ = $2; }
| '-' expr %prec UMINUS {
$$ = tmp();
printf("%s = -%s;\n", $$, $2);
}
| ID {
$$ = tmp();
printf("%s = getVarList(\"%s\", list);\n", $$, $1);
}
| NUM {
$$ = tmp();
printf("%s = %d;\n", $$, $1);
}
;
%%
int main () {
list = insertElement(list);
if(yyparse() !=0)
fprintf(stderr, "Abonormal exit\n");
fprintf(fopen("temp.h", "w"), "#include \"list.h\"\n\nTList list = NULL;\nint t" );
for(int j=0; j<i-1; j++) {
fprintf(fopen("temp.h", "a"), "%d, t", j);
}
fprintf(fopen("temp.h", "a"), "%d;", i-1);
return 0;
}
void yyerror (char *s) {
fprintf(stderr, "Error: %s\n", s);
}
As you can see the grammar for logic expression is a little bit complex, but the compiler does what it should. The behavior is C-like so integer values can be used in AND/OR/NOT.
My idea for the grammar was this:
bexpr : bexpr OR bexpr
| bexpr AND bexpr
| NOT bexpr
| '(' bexpr ')'
| comp
| expr
;
comp : expr LT expr
| expr LE expr
| expr GT expr
| expr GE expr
| expr EQ expr
| expr NE expr
;
But in this way I get two conflicts, 1 shift/reduce and 1 reduce/reduce. There's a way to simplify the grammar?
My advice is to not try to distinguish grammatically between bexpr
and expr
. It cannot really be done accurately because you allow variables to be used as boolean values. Your current grammar is a valiant effort, to be sure, but when you add conditional statements to your grammar, you will find that
if ((a)) ...
will be flagged as a syntax error (assuming the syntax of a conditional statement is C-like: "if" '(' bexpr ')' stmt | "if" '(' bexpr ')' stmt "else" stmt
). And the attempt to ban use of arithmetic expressions as arguments to boolean operators is easily circumvented, because a AND (1 + -1)
is a valid bexpr
. One might also ask why (a < b) == (b < c)
should not be valid syntax. Granted, it's a bit obfuscated, but it's a convenient way to write "b
is between a
and c
".
If you really want to disallow the use of arithmetic operations as arguments to the boolean operators AND
, OR
and NOT
, you can improve your grammar by creating two parallel hierarchies, or by simply marking the type of the expression as part of its semantic value, and doing the check in each semantic action. The advantage of the second option is two-fold: it simplifies the grammar, eliminating duplication, and it provides the possibility of much more precise error messages ("attempt to use arithmetic expression as boolean value" instead of "syntax error").