;;; -*- mode: lisp -*- ;;; This is a short example of using CL-PPCRE to generate tokens ;;; for CL-YACC. (defpackage "EXAMPLE-PARSER" (:use "COMMON-LISP" "CL-PPCRE" "YACC")) (in-package "EXAMPLE-PARSER") ;;;; Some utilities (defmacro defvar-for-macro (name &optional (value nil value-p) (doc nil doc-p)) `(eval-when (:compile-toplevel :load-toplevel :execute) (defvar ,name ,@(when value-p `(,value)) ,@(when doc-p `(,doc))))) (defmacro defun-for-macro (name args &body body) `(eval-when (:compile-toplevel :load-toplevel :execute) (defun ,name ,args ,@body))) ;;; Dynamic extent used for lexical analysis. (defvar *text-to-parse*) (defvar *start-of-token*) (defvar *end-of-token*) (defvar *token-kind*) (defvar *token-value*) (defun initialize-tokenizer (string) (setf *text-to-parse* string *start-of-token* 0)) ;;; Set of tokens, each has a ppcre parse-tree-synonym. (defvar-for-macro *token-kinds* ()) ;;; Defining Tokens using CL-YACC (defmacro deftoken (kind (pattern) &body body) (let ((pattern (if (stringp pattern) (cl-ppcre::parse-string pattern) pattern)) (handler (gensym))) `(progn (defun ,handler (pos) (setf *token-kind* ,kind) (let ((*end-of-token* pos)) (setf *token-value* (progn ,@body) *start-of-token* *end-of-token*))) (define-parse-tree-synonym ,kind (:sequence ,pattern (:filter ,handler))) (pushnew ,kind *token-kinds*)))) (defun tokenizer-of-string (string) ;; this is not thread safe :) (initialize-tokenizer string) (let ((scanner (create-scanner (list :sequence :start-anchor (list* :alternation (reverse *token-kinds*)))))) #'(lambda () (setf *token-kind* nil) (scan scanner *text-to-parse* :start *start-of-token*) (values *token-kind* *token-value*)))) (defun token-text () (subseq *text-to-parse* *start-of-token* *end-of-token*)) (defun test-tokenizer (string) (loop with f = (tokenizer-of-string string) do (funcall f) (unless *token-kind* (return)) (format t "~&~S ~S" *token-kind* *token-value*))) ;;;; Define our various tokens. ;;; These are ordered, i.e. early ones match before later ones. (deftoken :if ("if") :if) (deftoken :then ("then") :then) (deftoken :else ("else") :else) (deftoken :fi ("fi") :fi) (deftoken :begin ("begin") :begin) (deftoken :end ("end") :end) (deftoken :semicolon (#\;) :semicolon) (deftoken :+ (#\+) '+) (deftoken :- ("-") '-) (deftoken :* (#\*) '*) (deftoken :/ ("/") '/) (deftoken :identifier ("[a-z]\\\w*") (intern (token-text))) (deftoken :constant ("\\\d+") (parse-integer (token-text))) (deftoken :whitespace (" +") :whitespace) ;;;; Function used for reduction when parseing (defun-for-macro reduce-abc2progn (a b c) (declare (ignore a c)) `(progn ,@b)) (defun-for-macro reduce-abc2ac* (a b c) (declare (ignore b)) `(,a ,@c)) (defun-for-macro reduce-abcde2if (a b c d e) (declare (ignore a c e)) `(when ,b ,@d)) (defun-for-macro reduce-abcdefg2if (a b c d e f g) (declare (ignore a c e g)) `(if ,b (progn ,@d) (progn ,@f))) (defun-for-macro reduce-abc2bac (a b c) `(,b ,a ,c)) (defun-for-macro reduce-ab2a (a b) (declare (ignore b)) a) ;;;; Define the grammar (define-parser *parser-tables* (:start-symbol program) (:terminals (:if :then :else :fi :begin :end :+ :- :* :/ :semicolon :identifier :constant)) (:precedence ((:left :* :/) (:left :+ :-))) (program (:begin statements :end #'reduce-abc2progn)) (statements () (statement :semicolon statements #'reduce-abc2ac*)) (statement (expression #'identity) (:if expression :then statements :fi #'reduce-abcde2if) (:if expression :then statements :else statements :fi #'reduce-abcdefg2if)) (expression (:constant #'identity) (:identifier #'identity) (expression :+ expression #'reduce-abc2bac) (expression :- expression #'reduce-abc2bac) (expression :* expression #'reduce-abc2bac) (expression :/ expression #'reduce-abc2bac))) ;;; Main (defun tokenizer-without-whitespace-of-string (text) (let ((inner-tokenizer (tokenizer-of-string text))) (labels ((tokenizer () (multiple-value-bind (kind value) (funcall inner-tokenizer) (if (eq *token-kind* :whitespace) (tokenizer) (values kind value))))) #'tokenizer))) (defun parse-program (program-text) (parse-with-lexer (tokenizer-without-whitespace-of-string program-text) *parser-tables*)) #| (parse-program "begin 1 ; 3+2; if 3 then 4 ; fi; if 3 then 4 ; else 7; fi; end") --> (PROGN 1 (+ 3 2) (WHEN 3 4) (IF 3 (PROGN 4) (PROGN 7))) |#