Racket: Lexer and Parser
This article ports lexer from Go implementation and ports parser from Rust implementation to Racket.
1 Lexer
First, we need some structures that record metadata for us.
(struct pos (line column) #:transparent) (struct lexer (name input ; list char state ; for report position as format: (line, column) line column ; lexing helpers offset start tokens) #:mutable #:transparent) (struct token (typ val pos) #:transparent)
Then we need some helpers to handle some char utilities.
(define (alpha-numeric? c) (and (char? c) (or (char-ci=? c #\_) (char-alphabetic? c) (char-numeric? c)))) (define (end-of-line? c) (match c [#\newline #t] [else #f]))
Core is state-function, which using function as state to run up a state machine.
(define (run lexer) (when (lexer-state lexer) (set-lexer-state! lexer ((lexer-state lexer) lexer)) (run lexer)))
Some utilities about update input char flow and metadata.
(define (next l) (set-lexer-offset! l (add1 (lexer-offset l))) (set-lexer-column! l (add1 (lexer-column l))) (define c (peek-char (lexer-input l) (- (lexer-offset l) (lexer-start l)))) (if (eof-object? c) c (when (end-of-line? c) (set-lexer-line! l (add1 (lexer-line l))) (set-lexer-column! l 0)))) (define (peek l) (peek-char (lexer-input l) (- (lexer-offset l) (lexer-start l)))) (define (ignore l) (read-string (- (lexer-offset l) (lexer-start l)) (lexer-input l)) (set-lexer-start! l (lexer-offset l))) (define (new-item l ty value) (channel-put (lexer-tokens l) (token ty value (pos (lexer-line l) (lexer-column l))))) (define (emit l ty) (define value (read-string (- (lexer-offset l) (lexer-start l)) (lexer-input l))) (match value [(? eof-object?) (new-item l 'EOF value)] ["true" (new-item l 'true value)] ["false" (new-item l 'false value)] ["and" (new-item l 'and value)] ["or" (new-item l 'or value)] [else (new-item l ty value)]) (set-lexer-start! l (lexer-offset l))) (define (accept? l valid) (cond [(and (char? (peek l)) (string-contains? valid (string (peek l)))) (next l) #t] [else #f])) (define (accept-run l valid) (let loop ([c (peek l)]) (when (and (char? c) (string-contains? valid (string c))) (next l) (loop (peek l))))) (define (scan-number? l) (define digits "0123456789") (when (and (accept? l "0") (accept? l "xX")) (set! digits "0123456789abcdefABCDEF")) (accept-run l digits) (when (accept? l ".") (accept-run l digits)) (when (accept? l "eE") (accept? l "+-") (accept-run l "0123456789")) ; Next thing mustn't be alphanumeric. (cond [(alpha-numeric? (peek l)) (next l) #f] [else #t]))
Our state functions.
(define (lex-white-space l) (let loop ([c (peek l)]) (when (and (not (eof-object? c)) (or (char-whitespace? c) (end-of-line? c))) (next l) (loop (peek l)))) (ignore l) (match (peek l) [(? eof-object?) (emit l 'EOF) #f] [#\+ (next l) (emit l 'add) lex-white-space] [#\- (next l) (emit l 'sub) lex-white-space] [#\* (next l) (emit l 'mul) lex-white-space] [#\/ (next l) (emit l 'div) lex-white-space] [#\^ (next l) (emit l '^) lex-white-space] [#\= (next l) (emit l 'eq) lex-white-space] [(? char-numeric?) lex-number] [(? alpha-numeric?) lex-identifier] [c (error 'unknown "don't know what to do with: `~a`" c)])) (define (lex-identifier l) (let loop ([c (peek l)]) (when (alpha-numeric? c) (next l) (loop (peek l)))) (emit l 'identifier) lex-white-space) (define (lex-number l) (when (not (scan-number? l)) (error 'bad-number-syntax "bad number syntax: `~a`" (peek-string (- (lexer-offset l) (lexer-start l)) 0 (lexer-input l)))) (emit l 'number) lex-white-space)
Finally, put them together.
(define (lex name input-port) (define l (lexer name input-port ; state lex-white-space ; line column 1 0 ; start offset 0 0 (make-channel))) (thread (λ () (run l))) l)
2 Parser
Utilities
(struct parser (name lexer tokens offset) #:mutable #:transparent) (define (parse name input) (define lexer (lex name input)) (define p (parser name lexer (vector) 0)) (parse-expr p #f 1)) (define (peek p [n 0]) (get-token p (+ (parser-offset p) n))) (define (take p) (define origin (parser-offset p)) (set-parser-offset! p (add1 origin)) (get-token p origin)) (define (consume p . wants) (predict p wants)) (define (predict p . wants) (for ([i (length wants)] [want wants]) (define tok (peek p i)) (unless (eq? (token-typ tok) want) (error 'unexpected-token "want ~a, got ~a" want (token-typ tok))))) (define (get-token p fixed-offset) (when (vector-empty? (parser-tokens p)) (increase-token-stream p)) (define tokens (parser-tokens p)) (if (>= fixed-offset (vector-length tokens)) (let ([last-token (vector-ref tokens (sub1 (vector-length tokens)))]) (case (token-typ last-token) [(EOF) last-token] [else (increase-token-stream p) (get-token p fixed-offset)])) (vector-ref tokens fixed-offset))) (define (increase-token-stream p) (define l (parser-lexer p)) (define new-last-token (channel-get (lexer-tokens l))) (set-parser-tokens! p (vector-append (parser-tokens p) (vector new-last-token))))
Operators utilities.
(define (right-assoc? token) (case (token-typ token) [(^) #t] [else #f])) (define (precedence token) (define op** '((eq) (and or) (add sub) (mul div ^))) (define m (make-hash)) (for ([i (length op**)] [op* op**]) (for ([op op*]) (hash-set! m op (+ 2 i)))) (hash-ref m (token-typ token) 0))
Ast
(struct expr () #:transparent) (struct binary expr (op left right) #:transparent)
Parsers
(define (parse-expr p left-hand-side previous-primary) (define lhs (if left-hand-side left-hand-side (parse-unary p))) (let loop ([lookahead (peek p)]) (when (>= (precedence lookahead) previous-primary) (define operator lookahead) (take p) (define rhs (parse-unary p)) (set! lookahead (peek p)) (let loop () (when (or (> (precedence lookahead) (precedence operator)) (and (right-assoc? lookahead) (= (precedence lookahead) (precedence operator)))) (set! rhs (parse-expr p rhs (precedence lookahead))) (set! lookahead (peek p)) (loop))) (set! lhs (binary (token-typ operator) lhs rhs)) (loop lookahead))) lhs) (define (parse-unary p) (define tok (peek p)) (case (token-typ tok) [(number) (take p) (string->number (token-val tok))] [(true) (take p) 'true] [(false) (take p) 'false] [(identifier) (take p) (token-val tok)] [else (error 'unknown "~a" tok)]))