I am new to racket and am building a Lexer using the parser-tools/lex module and want to be able to tokenize a number that comes after a 'gosub' statement as a line number token. I am having trouble trying to figure out how to identify it as a line number, and not a regular number. I am reading in .txt files that look like this:
10 read A 20 read B 30 gosub 400 40 if C = 400 then write C 400 C = A + B : return $$
the "400" in "gosub 400" gets read as a number token.
I want this exepected output for my gosub token and line num token: ... token-GOSUB: GOSUB token-line-num: 400 ...
`#lang racket
;;; IMPORT
;; Import the lexer tools
(require parser-tools/lex
(prefix-in : parser-tools/lex-sre) ; names from lex-sre are prefixed with :
; to avoid name collisions
)
;;; REGULAR EXPRESSIONS
;; Names for regular expressions matching letters and digits.
;; Note that :or are prefixed with a : due to (prefix-in : ...) above
(define-lex-abbrevs
[read "read"]
[write "write"]
[goto "goto"]
[gosub "gosub"]
[line-num (:/ #\1 #\9)]
[letter (:or (:/ "a" "z") (:/ #\A #\Z) "?" "!")]
[digit (:/ #\0 #\9)]
[mult-op (or "*" "/")]
[add-op (or "+" "-")]
[end-of-file "$$"]
[paren-start "("]
[paren-end ")"]
)
;;; TOKENS
;; Tokens such as numbers (and identifiers and strings) carry a value
;; In the example only the NUMBER token is used, but you may need more.
(define-tokens value-tokens (NUMBER END-OF-PROGRAM READ WRITE GOTO GOSUB LINE-NUM IDENTIFIER MULT-OP ADD-OP PAREN-START PAREN-END))
;; Tokens that don't carry a value.
(define-empty-tokens op-tokens (newline := = < > ^ \( \) EOF))
;;; LEXER
;; The construct lexer-src-pos evaluates to a function which scans an input port
;; returning one position-token at a time.
;; A position token contains besides the actual token also source location information
;; (i.e. you can see where in the file the token was read)
(define lex
(lexer-src-pos
[(eof) ; input: eof of file
'EOF] ; output: the symbol EOF
[(:+ end-of-file)
(token-END-OF-PROGRAM (string->symbol lexeme))]
[(:or #\tab #\space #\newline) ; input: whitespace
(return-without-pos (lex input-port))] ; output: the next token
; (i.e. skip the whitespace)
["\r" ; input: newline
(token-newline)] ; ouput: a newline-token
; ; note: (token-newline) returns 'newline
[(:or ":" ":=" "^" "<" ">" "=") ; input: an operator
(string->symbol lexeme)] ; output: corresponding symbol
[(:or "+" "-") ; input: "+" or "-"
(token-ADD-OP (string->symbol lexeme))] ; ouput: an ADD-OP token
[(:or "*" "/") ; input: "*" or "/"
(token-MULT-OP (string->symbol lexeme))] ; output: a MULT-OP token
[(:+ digit) ; input: digits
(token-NUMBER (string->number lexeme))] ; outout: a NUMBER token whose value is the number
[(:+ read) ; input: the string "read"
(token-READ lexeme)] ; output: READ token
[(:+ write) ; input: the string "write"
(token-WRITE lexeme)] ; output: WRITE token
[(:+ goto)
(token-GOTO lexeme)]
[(:+ gosub) ; match "gosub" followed by one or more digits
(token-GOSUB lexeme)]
** [(:+ line-num) ; match "gosub" followed by one or more digits
(token-LINE-NUM lexeme)] **
[(:+ letter) ; input: Alphabetic letter
(token-IDENTIFIER lexeme)] ; output: IDENTIFIER token whose value is the word
[(:+ paren-start) ; input: (
(token-PAREN-START lexeme)] ; output: PAREN-START token
[(:+ paren-end) ; input: )
(token-PAREN-END lexeme)] ; output: PAREN-END token
))
(define (string->tokens s)
(port->tokens (open-input-file s)))
(define (port->tokens in)
(define token (lex in))
(if (eq? (position-token-token token) 'EOF)
'()
(cons token (port->tokens in))))
(provide string->tokens)`
I have tried used regexp, but am not really sure how it properly use it when working with tokens from the parser-tools/lex in racket. It did not result in anything and simply returned as a number again.
I added a new abbreviation:
[gosub+ (concatenation "gosub " (repetition 0 +inf.0 digit))]
If this string is found, is parsed with its own function:
(define (gosub-tokens in)
(let ((token (gosub-lex in)))
(if (eq? (position-token-token token) 'EOF) '()
(cons token (gosub-tokens in)))))
and its own lex
:
(define gosub-lex
(lexer-src-pos
[(eof)
'EOF]
[(:+ gosub)
(token-GOSUB lexeme)]
[(:or #\tab #\space #\newline)
(return-without-pos (gosub-lex input-port))]
[(:+ digit)
(token-LINE-NUM (string->number lexeme))]))
The full code:
#lang racket
(require parser-tools/lex
(prefix-in : parser-tools/lex-sre))
;;; REGULAR EXPRESSIONS
;; Names for regular expressions matching letters and digits.
;; Note that :or are prefixed with a : due to (prefix-in : ...) above
(define-lex-abbrevs
[read "read"]
[write "write"]
[goto "goto"]
[digit (:/ #\0 #\9)]
[gosub "gosub"]
[gosub+ (concatenation "gosub " (repetition 0 +inf.0 digit))]
[letter (:or (:/ "a" "z") (:/ #\A #\Z) "?" "!")]
[mult-op (or "*" "/")]
[add-op (or "+" "-")]
[end-of-file "$$"]
[paren-start "("]
[paren-end ")"])
;;; TOKENS
(define-tokens value-tokens (NUMBER END-OF-PROGRAM READ WRITE GOTO GOSUB GOSUB+ LINE-NUM IDENTIFIER MULT-OP ADD-OP PAREN-START PAREN-END))
(define-empty-tokens op-tokens (newline := = < > ^ \( \) EOF))
;;; LEXER
(define gosub-lex
(lexer-src-pos
[(eof)
'EOF]
[(:+ gosub)
(token-GOSUB lexeme)]
[(:or #\tab #\space #\newline)
(return-without-pos (gosub-lex input-port))]
[(:+ digit)
(token-LINE-NUM (string->number lexeme))]))
(define lex
(lexer-src-pos
[(eof)
'EOF]
[(:+ end-of-file)
(token-END-OF-PROGRAM (string->symbol lexeme))]
[(:or #\tab #\space #\newline)
(return-without-pos (lex input-port))]
["\r"
(token-newline)]
[(:or ":" ":=" "^" "<" ">" "=")
(string->symbol lexeme)]
[(:or "+" "-")
(token-ADD-OP (string->symbol lexeme))]
[(:or "*" "/")
(token-MULT-OP (string->symbol lexeme))]
[(:+ digit)
(token-NUMBER (string->number lexeme))]
[(:+ read)
(token-READ lexeme)]
[(:+ write)
(token-WRITE lexeme)]
[(:+ goto)
(token-GOTO lexeme)]
[(:+ gosub+)
(token-GOSUB+ lexeme)]
[(:+ letter)
(token-IDENTIFIER lexeme)]
[(:+ paren-start)
(token-PAREN-START lexeme)]
[(:+ paren-end)
(token-PAREN-END lexeme)]))
(define (string->tokens s)
(port->tokens (open-input-file s)))
(define (gosub-tokens in)
(let ((token (gosub-lex in)))
(if (eq? (position-token-token token) 'EOF) '()
(cons token (gosub-tokens in)))))
(define (port->tokens in)
(let ((token (lex in)))
(cond ((eq? (position-token-token token) 'EOF) '())
((eq? (token-name (position-token-token token)) 'GOSUB+)
(append (gosub-tokens (open-input-string (token-value (position-token-token token))))
(port->tokens in)))
(else (cons token (port->tokens in))))))
(provide string->tokens)