UP | HOME

Racket: Lexer and Parser

This article ports lexer from Go implementation and ports parser from Rust implementation to Racket.

1. Lexer

First, we need some structures that record metadata for us.

(struct pos (line column) #:transparent)

(struct lexer
  (name
   input ; list char
   state
   ; for report position as format: (line, column)
   line column
   ; lexing helpers
   offset start
   tokens)
  #:mutable
  #:transparent)

(struct token (typ val pos) #:transparent)

Then we need some helpers to handle some char utilities.

(define (alpha-numeric? c)
  (and (char? c)
       (or (char-ci=? c #\_)
           (char-alphabetic? c)
           (char-numeric? c))))

(define (end-of-line? c)
  (match c
    [#\newline #t]
    [else #f]))

Core is state-function, which using function as state to run up a state machine.

(define (run lexer)
  (when (lexer-state lexer)
    (set-lexer-state! lexer
                      ((lexer-state lexer) lexer))
    (run lexer)))

Some utilities about update input char flow and metadata.

(define (next l)
  (set-lexer-offset! l (add1 (lexer-offset l)))
  (set-lexer-column! l (add1 (lexer-column l)))
  (define c (peek-char (lexer-input l)
                       (- (lexer-offset l) (lexer-start l))))
  (if (eof-object? c)
      c
      (when (end-of-line? c)
        (set-lexer-line! l (add1 (lexer-line l)))
        (set-lexer-column! l 0))))

(define (peek l) (peek-char (lexer-input l)
                            (- (lexer-offset l) (lexer-start l))))

(define (ignore l)
  (read-string (- (lexer-offset l) (lexer-start l))
               (lexer-input l))
  (set-lexer-start! l (lexer-offset l)))

(define (new-item l ty value)
  (channel-put (lexer-tokens l) (token ty value (pos (lexer-line l) (lexer-column l)))))

(define (emit l ty)
  (define value
    (read-string (- (lexer-offset l) (lexer-start l))
                 (lexer-input l)))
  (match value
    [(? eof-object?) (new-item l 'EOF value)]
    ["true" (new-item l 'true value)]
    ["false" (new-item l 'false value)]
    ["and" (new-item l 'and value)]
    ["or" (new-item l 'or value)]
    [else (new-item l ty value)])
  (set-lexer-start! l (lexer-offset l)))

(define (accept? l valid)
  (cond
    [(and (char? (peek l))
          (string-contains? valid (string (peek l))))
     (next l)
     #t]
    [else #f]))

(define (accept-run l valid)
  (let loop ([c (peek l)])
    (when (and (char? c) (string-contains? valid (string c)))
      (next l)
      (loop (peek l)))))

(define (scan-number? l)
  (define digits "0123456789")
  (when (and (accept? l "0") (accept? l "xX"))
    (set! digits "0123456789abcdefABCDEF"))
  (accept-run l digits)
  (when (accept? l ".")
    (accept-run l digits))
  (when (accept? l "eE")
    (accept? l "+-")
    (accept-run l "0123456789"))
  ; Next thing mustn't be alphanumeric.
  (cond
    [(alpha-numeric? (peek l))
     (next l)
     #f]
    [else #t]))

Our state functions.

(define (lex-white-space l)
  (let loop ([c (peek l)])
    (when (and (not (eof-object? c))
               (or (char-whitespace? c)
                   (end-of-line? c)))
      (next l)
      (loop (peek l))))
  (ignore l)

  (match (peek l)
    [(? eof-object?) (emit l 'EOF)
                     #f]
    [#\+ (next l)
         (emit l 'add)
         lex-white-space]
    [#\- (next l)
         (emit l 'sub)
         lex-white-space]
    [#\* (next l)
         (emit l 'mul)
         lex-white-space]
    [#\/ (next l)
         (emit l 'div)
         lex-white-space]
    [#\^ (next l)
         (emit l '^)
         lex-white-space]
    [#\= (next l)
         (emit l 'eq)
         lex-white-space]
    [(? char-numeric?) lex-number]
    [(? alpha-numeric?) lex-identifier]
    [c (error 'unknown "don't know what to do with: `~a`" c)]))

(define (lex-identifier l)
  (let loop ([c (peek l)])
    (when (alpha-numeric? c)
      (next l)
      (loop (peek l))))

  (emit l 'identifier)
  lex-white-space)

(define (lex-number l)
  (when (not (scan-number? l))
    (error 'bad-number-syntax "bad number syntax: `~a`"
           (peek-string (- (lexer-offset l) (lexer-start l))
                        0 (lexer-input l))))
  (emit l 'number)
  lex-white-space)

Finally, put them together.

(define (lex name input-port)
  (define l (lexer name
                   input-port
                   ; state
                   lex-white-space
                   ; line column
                   1 0
                   ; start offset
                   0 0
                   (make-channel)))
  (thread (λ () (run l)))
  l)

2. Parser

Utilities

(struct parser (name lexer tokens offset)
  #:mutable
  #:transparent)

(define (parse name input)
  (define lexer (lex name input))
  (define p (parser name lexer (vector) 0))
  (parse-expr p #f 1))

(define (peek p [n 0])
  (get-token p (+ (parser-offset p) n)))
(define (take p)
  (define origin (parser-offset p))
  (set-parser-offset! p (add1 origin))
  (get-token p origin))
(define (consume p . wants)
  (predict p wants))
(define (predict p . wants)
  (for ([i (length wants)]
        [want wants])
    (define tok (peek p i))
    (unless (eq? (token-typ tok) want)
      (error 'unexpected-token "want ~a, got ~a" want (token-typ tok)))))

(define (get-token p fixed-offset)
  (when (vector-empty? (parser-tokens p))
    (increase-token-stream p))
  (define tokens (parser-tokens p))
  (if (>= fixed-offset (vector-length tokens))
      (let ([last-token (vector-ref tokens (sub1 (vector-length tokens)))])
        (case (token-typ last-token)
          [(EOF) last-token]
          [else (increase-token-stream p)
                (get-token p fixed-offset)]))
      (vector-ref tokens fixed-offset)))
(define (increase-token-stream p)
  (define l (parser-lexer p))
  (define new-last-token (channel-get (lexer-tokens l)))
  (set-parser-tokens! p
                      (vector-append (parser-tokens p) (vector new-last-token))))

Operators utilities.

(define (right-assoc? token)
  (case (token-typ token)
    [(^) #t]
    [else #f]))
(define (precedence token)
  (define op** '((eq)
                 (and or)
                 (add sub)
                 (mul div ^)))
  (define m (make-hash))
  (for ([i (length op**)]
        [op* op**])
    (for ([op op*])
      (hash-set! m op (+ 2 i))))
  (hash-ref m (token-typ token) 0))

Ast

(struct expr () #:transparent)
(struct binary expr (op left right) #:transparent)

Parsers

(define (parse-expr p left-hand-side previous-primary)
  (define lhs (if left-hand-side
                  left-hand-side
                  (parse-unary p)))

  (let loop ([lookahead (peek p)])
    (when (>= (precedence lookahead) previous-primary)
      (define operator lookahead)
      (take p)
      (define rhs (parse-unary p))
      (set! lookahead (peek p))
      (let loop ()
        (when (or (> (precedence lookahead) (precedence operator))
                  (and (right-assoc? lookahead)
                       (= (precedence lookahead) (precedence operator))))
          (set! rhs (parse-expr p rhs (precedence lookahead)))
          (set! lookahead (peek p))
          (loop)))
      (set! lhs (binary (token-typ operator)
                        lhs rhs))
      (loop lookahead)))

  lhs)

(define (parse-unary p)
  (define tok (peek p))
  (case (token-typ tok)
    [(number) (take p)
              (string->number (token-val tok))]
    [(true) (take p)
            'true]
    [(false) (take p)
             'false]
    [(identifier) (take p)
                  (token-val tok)]
    [else (error 'unknown "~a" tok)]))
Date: 2021-06-04 Fri 00:00
Author: Lîm Tsú-thuàn