revamp PEG grammar

This commit is contained in:
Rutger van Beusekom 2018-11-03 11:54:20 +01:00
parent 1e81a66926
commit 2334e6ebde
9 changed files with 1220 additions and 398 deletions

View File

@ -8,7 +8,6 @@
#:use-module (ice-9 getopt-long)
#:use-module (ice-9 local-eval)
#:use-module (ice-9 match)
#:use-module (ice-9 rdelim)
#:use-module (ice-9 pretty-print)
#:use-module (ice-9 receive)
#:use-module (ice-9 regex)
@ -40,12 +39,12 @@
(define (parse-string string)
(let ((parser (cond (%geesh-parser? (@ (gash geesh) parse-string))
(else (@ (gash peg) parse-string)))))
(else (@ (gash grammar) parse-string)))))
(parser string)))
(define (parse port)
(let ((parser (cond (%geesh-parser? (@ (gash geesh) parse))
(else (@ (gash peg) parse)))))
(else (@ (gash grammar) parse)))))
(parser port)))
(define (file-to-ast file-name)

249
gash/grammar.scm Normal file
View File

@ -0,0 +1,249 @@
(define-module (gash grammar)
#:use-module (ice-9 match)
#:use-module (ice-9 pretty-print)
#:use-module (ice-9 rdelim)
#:use-module (srfi srfi-8)
#:use-module (peg)
#:use-module (peg codegen)
#:export (parse
parse-string))
;; (define-syntax define-unwrapped-sexp-parser
;; (lambda (x)
;; (syntax-case x ()
;; ((_ sym accum pat)
;; (let* ((matchf (compile-peg-pattern #'pat (syntax->datum #'accum))))
;; #`(define sym #,matchf))))))
;; (define-unwrapped-sexp-parser eol none (or "\f" "\n" "\r"))
;; (add-peg-compiler! 'eol eol)
;; (define-unwrapped-sexp-parser ws none (or " " "\t" "\v"))
;; (add-peg-compiler! 'ws ws)
;; (define-unwrapped-sexp-parser line none (and "#" (* (and (not-followed-by eol) peg-any))))
;; (add-peg-compiler! 'line line)
;; (define-unwrapped-sexp-parser skip none (* (or ws eol line)))
;; (add-peg-compiler! 'skip skip)
;; (define (wrap-skip-parser-for-users for-syntax parser accumsym s-syn)
;; (display "wrap\n")
;; #`(lambda (str strlen pos)
;; (when #t
;; (format (current-error-port) "~a ~a : ~s\n"
;; (make-string (- pos (or (string-rindex str #\newline 0 pos) 0)) #\space)
;; '#,s-syn
;; (substring str pos (min (+ pos 40) strlen))))
;; (let* ((res (skip str strlen pos))
;; (pos (or (and res (car res)) pos))
;; (res (#,parser str strlen pos)))
;; ;; Try to match the nonterminal.
;; (if res
;; ;; If we matched, do some post-processing to figure out
;; ;; what data to propagate upward.
;; (let* ((at (car res))
;; (body (cadr res)))
;; #,(cond
;; ((eq? accumsym 'name)
;; #``(,at ,'#,s-syn))
;; ((eq? accumsym 'all)
;; #`(list at
;; (cond
;; ((not (list? body))
;; `(,'#,s-syn ,body))
;; ((null? body) `(,'#,s-syn))
;; ((symbol? (car body))
;; `(,'#,s-syn ,body))
;; (else (cons '#,s-syn body)))))
;; ((eq? accumsym 'none) #``(,at ()))
;; (else #``(,at ,body))))
;; ;; If we didn't match, just return false.
;; #f))))
;; (module-set! (resolve-module '(peg codegen)) 'wrap-parser-for-users wrap-skip-parser-for-users)
(define (parse port)
(parse-string (read-string port)))
(define (parse-string input)
(define io-label "")
(define (io-label-name str len pos)
(let ((at (string-skip str char-alphabetic? pos len)))
(set! io-label (substring str pos at))
(if (< at len) (list at '())
#f)))
(define (io-label-match str len pos)
(if (string-prefix? io-label (substring str pos))
(list (+ pos (string-length io-label)) '())
#f))
(define-peg-pattern io-here-label none io-label-name)
(define-peg-pattern io-here-delim none io-label-match)
(define-peg-pattern io-here-document all
(and (+ (and (not-followed-by io-here-delim)
peg-any))
io-here-delim))
(define-peg-string-patterns
"script <-- ws* compound
ws < sp / eol
sp < '\\\n'? (comment / [ \t\v])
comment < [#] (!eol .)*
eol < [\n\r\f]
compound <-- (term (&rpar / sep#))*
sep <- sp* (amp ws* / semi ws* / eof) / ws+
amp <- '&'
semi < ';'!';'
eof < !.
term <- and / or / pipeline
and <-- pipeline and-op ws* term
or <-- pipeline or-op ws* term
and-op < '&&'
or-op < '||'
pipeline <-- '!'? sp* (command (&sep / &or-op / &and-op / &rpar / eof / pipe#))+
and-or <- '&&' / '||'
exclamation <- '!'
pipe < sp* '|' !'|' ws*
command <-- function-def / compound-command / simple-command
compound-command <- (subshell / brace-group / for-clause / case-clause /
if-clause / while-clause / until-clause) (sp* io-redirect)*
simple-command <- ((io-redirect / nonreserved) sp*)+
io-redirect <-- [0-9]* (io-here / io-file)
io-file <-- io-op ([0-9]+ / word)
io-op <- '<&' / '>&' / '>>' / '>' / '<>'/ '<' / '>|'
io-here <-- io-here-op io-here-label sp* eol io-here-document
io-here-op <- '<<-' / '<<'
function-def <-- name sp* lpar rpar# ws* function-body
name <-- !reserved identifier
function-body <-- brace-group (sp* io-redirect)*
subshell <-- lpar compound rpar#
brace-group <-- lbrace ws* compound rbrace#
case-clause <-- 'case' sp* word sp* 'in'# ws* case-item+ ws* 'esac'#
case-item <-- pattern sp* colon? ws* compound? case-sep?
colon < ':'
case-sep < ';;' ws*
pattern <-- (word (!rpar '|'# / !'|' &rpar))+ rpar#
for-clause <-- 'for' sp+ identifier ws+ ('in' sp+ expression)? sep# do-group
expression <-- command
do-group <-- 'do' ws+ compound 'done'#
if-clause <-- 'if' sp+ compound 'then'# ws+ compound else-part? 'fi'#
else-part <-- 'else' ws+ compound /
'elif' ws+ compound 'then'# ws+ compound else-part?
while-clause <-- 'while' compound do-group
until-clause <-- 'until' compound do-group
reserved < ('case' / 'esac' / 'in' / 'if' / 'fi' / 'then' / 'else' /
'elif' / 'for' / 'done' / 'do' / 'until' / 'while') &ws
nonreserved <- !reserved word
word <-- test / substitution / assignment / number / variable /
delim / literal
test <-- ltest sp+ (word sp+)+ rtest#
ltest < '['
rtest < ']'
literal <- !reserved (!']' ![ \t\v\f\n`'\")};|&\\] .)+
identifier <- [_a-zA-Z][_a-zA-Z0-9]*
dollar < '$'
number <-- [0-9]+
substitution <-- dollar lpar compound rpar# / bt ([\\] bt / !bt .)+ bt#
lpar < '('
rpar < ')'
bt < [`]
assignment <-- name assign word?
assign < '='
variable <-- dollar ('*' / '@' / [0-9] / name /
lbrace name (variable-or / variable-and / variable-word / variable-literal / &rbrace) rbrace)
variable-or <-- min variable-word
variable-and <-- plus variable-word
variable-word <- (variable-regex / substitution / variable / variable-literal)+
variable-regex <-- ('%%' / '%' / '##' / '#' / '^^' / '^' /',,' / ',' / '*' / '@' / '?')+ variable-word
variable-literal <- (!rbrace .)+
min < '-'
plus < '+'
lbrace < '{'
rbrace < '}'
delim <-- singlequotes / doublequotes / substitution
sq < [']
dq < [\"]
singlequotes <- sq (!['] .)* sq#
doublequotes <- dq (substitution / variable / (![\"] .))* dq#")
(catch 'syntax-error
(lambda ()
(let* ((match (match-pattern script input))
(end (peg:end match))
(tree (peg:tree match)))
(if (eq? (string-length input) end)
tree
(if match
(begin
(format (current-error-port) "parse error: at offset: ~a\n" end)
(pretty-print tree)
#f)
(begin
(format (current-error-port) "parse error: no match\n")
#f)))))
(lambda (key . args)
(define (line-column input pos)
(let ((length (string-length input)))
(let loop ((lines (string-split input #\newline)) (ln 1) (p 0))
(if (null? lines) (values #f #f input)
(let* ((line (car lines))
(length (string-length line))
(end (+ p length 1))
(last? (null? (cdr lines))))
(if (<= pos end) (values ln (+ (if last? 0 1) (- pos p))
(if last? line
(string-append line "\\n" (cadr lines))))
(loop (cdr lines) (1+ ln) end)))))))
(define (format-peg o)
(match o
(('or l ...) (string-join (map format-peg l) ", or "))
(('and l ...) (string-join (map format-peg l) " "))
((? symbol?) (symbol->string o))
((? string?) o)))
(receive (ln col line) (line-column input (caar args))
(let* ((col (- col 1))
(indent (make-string col #\space)))
(format #t "~a:~a:~a: syntax-error:\n~a\n~a^\n~aexpected: ~a\n"
""
ln col line
indent
indent
(format-peg (cadar args)))
(exit 1))))))

View File

@ -1,395 +0,0 @@
(define-module (gash peg)
#:use-module (ice-9 local-eval)
#:use-module (ice-9 match)
#:use-module (ice-9 pretty-print)
#:use-module (ice-9 peg)
#:use-module (ice-9 peg codegen)
#:use-module (ice-9 rdelim)
#:use-module (ice-9 regex)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-26)
#:use-module (gash builtins)
#:use-module (gash environment)
#:use-module (gash gash)
#:use-module (gash io)
#:use-module (gash script)
#:export (
parse
parse-string
peg-trace?
))
(define (wrap-parser-for-users for-syntax parser accumsym s-syn)
#`(lambda (str strlen pos)
(when (> (@ (gash gash) %debug-level) 2)
(format (current-error-port) "~a ~a : ~s\n"
(make-string (- pos (or (string-rindex str #\newline 0 pos) 0)) #\space)
'#,s-syn
(substring str pos (min (+ pos 40) strlen))))
(let* ((res (#,parser str strlen pos)))
;; Try to match the nonterminal.
(if res
;; If we matched, do some post-processing to figure out
;; what data to propagate upward.
(let ((at (car res))
(body (cadr res)))
#,(cond
((eq? accumsym 'name)
#`(list at '#,s-syn))
((eq? accumsym 'all)
#`(list (car res)
(cond
((not (list? body))
(list '#,s-syn body))
((null? body) '#,s-syn)
((symbol? (car body))
(list '#,s-syn body))
(else (cons '#,s-syn body)))))
((eq? accumsym 'none) #`(list (car res) '()))
(else #`(begin res))))
;; If we didn't match, just return false.
#f))))
(module-define! (resolve-module '(ice-9 peg codegen))
'wrap-parser-for-users
wrap-parser-for-users)
(define (error? x)
(let loop ((x x))
(if (null? x) #f
(if (not (pair? x))
(eq? 'error x)
(or (loop (car x))
(loop (cdr x)))))))
;;;;;;;;;;;;;;;;;;;;;;;;;; WIP
(define (expand identifier o) ;;identifier-string -> symbol
(define (expand- o)
(let ((dollar-identifier (string-append "$" identifier)))
(match o
((? symbol?) o)
((? string?) (if (string=? o dollar-identifier) (string->symbol identifier) o))
((? list?) (map expand- o))
(_ o))))
(map expand- o))
(define (tostring . args)
(with-output-to-string (cut map display args)))
;; transform ast -> list of expr
;; such that (map eval expr)
(define (DEAD-transform ast)
(format (current-error-port) "transform=~s\n" ast)
(match ast
(('script term "&") (list (background (transform term))))
(('script term) `(,(transform term)))
(('script terms ...) (transform terms))
(('substitution "$(" script ")") (local-eval (cons 'substitute (cddr (car (transform script)))) (the-environment)))
(('substitution "`" script "`") (local-eval (cons 'substitute (cddr (car (transform script)))) (the-environment)))
((('term command)) `(,(transform command)))
((('term command) ...) (map transform command))
((('term command) (('term commands) ...)) (map transform (cons command commands)))
(('compound-list terms ...) (transform terms))
(('if-clause "if" (expression "then" consequent "fi"))
`(if (equal? 0 (status:exit-val ,@(transform expression)))
(begin ,@(transform consequent))))
(('if-clause "if" (expression "then" consequent ('else-part "else" alternative) "fi"))
`(if (equal? 0 (status:exit-val ,@(transform expression)))
(begin ,@(transform consequent))
(begin ,@(transform alternative))))
(('for-clause ("for" identifier sep do-group)) #t)
(('for-clause "for" ((identifier "in" lst sep) do-group))
`(for-each (lambda (,(string->symbol identifier))
(begin ,@(expand identifier (transform do-group))))
(glob ,(transform lst))))
(('do-group "do" (command "done")) (transform command))
(('pipeline command) (pk 1) (let* ((command (transform command))) (or (builtin command) `(pipeline #t ,@command))))
(('pipeline command piped-commands) (pk 2) `(pipeline #t ,@(transform command) ,@(transform piped-commands)))
(('simple-command ('word (assignment name value))) `((lambda _ (let ((name ,(tostring (transform name)))
(value ,(tostring (transform value))))
(stderr "assignment: " name "=" value)
(set! global-variables (assoc-set! global-variables name (glob value)))))))
(('simple-command ('word s)) `((glob ,(transform s))))
(('simple-command ('word s1) ('io-redirect "<<" ('here-document s2))) `((append (glob "echo") (cons "-n" (glob ,s2))) (glob ,(transform s1))))
(('simple-command ('word s1) ('word s2)) `((append (glob ,(transform s1)) (glob ,(transform s2)))))
(('simple-command ('word s1) (('word s2) ...)) `((append (glob ,(transform s1)) (append-map glob (list ,@(map transform s2))))))
(('variable s) s)
(('literal s) (transform s))
(('singlequotes s) (string-concatenate `("'" ,s "'")))
(('doublequotes s) (string-concatenate `("\"" ,s "\"")))
(('backticks s) (string-concatenate `("`" ,s "`")))
(('delim ('singlequotes s ...)) (string-concatenate (map transform s)))
(('delim ('doublequotes s ...)) (string-concatenate (map transform s)))
(('delim ('backticks s ...)) (string-concatenate (map transform s)))
((('pipe _) command) (transform command))
(((('pipe _) command) ...) (map (compose car transform) command))
((_ o) (transform o)) ;; peel the onion: (symbol (...)) -> (...)
(_ ast))) ;; done
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (parse- input)
(define label "")
(define (label-name str len pos)
(let ((at (string-skip str char-alphabetic? pos len)))
(set! label (substring str pos at))
(if (< at len) (list at '())
#f)))
(define (label-match str len pos)
(if (string-prefix? label (substring str pos)) (list (+ pos (string-length label)) '())
#f))
(define-peg-pattern here-label none label-name)
(define-peg-pattern here-delim none label-match)
(define-peg-pattern here-document all (and (+ (and (not-followed-by here-delim) peg-any)) here-delim))
(define-peg-string-patterns
"script <-- ws* (term (separator term)* separator?)?
term <- (and / or / pipeline) (sp* (and / or /pipeline))*
and <-- pipeline sp* amp-amp ws* pipeline
or <-- pipeline sp* pipe-pipe ws* pipeline
pipe < '|'
pipeline <-- negate? pipeline-head pipeline-tail*
pipeline-head <- sp* command
pipeline-tail <- sp* pipe ws* command
negate <-- '!'
command <-- function / (compound-command (sp+ io-redirect)*) / simple-command (sp+ io-redirect)*
compound-command <- brace-group / subshell / for-clause / case-clause / if-clause / while-clause / until-clause
simple-command <- (sp* (io-redirect sp+)* nonreserved)+
nonreserved <- &(reserved word) word / !reserved word
reserved < 'case' / 'esac' / 'if' / 'fi' / 'then' / 'else' / 'elif' / 'for' / 'done' / 'do' / 'until' / 'while'
function <-- identifier sp* lpar sp* rpar ws* (function-body / error)
function-body <- compound-command io-redirect*
io-redirect <-- [0-9]* sp* (io-here / io-file)
io-file <-- ('<&' / '>&' / '>>' / '>' / '<>'/ '<' / '>|') sp* ([0-9]+ / file-name)
io-here <- ('<<' / '<<-') io-suffix here-document
io-op < '<<-' / '<<' / '<&' / '>&' / '>>' / '>' / '<>'/ '<' / '>|'
io-suffix <- sp* here-label sp* nl
brace-group <-- lbrace (ws* (compound-list / error) ws* rbrace / error)
subshell <-- lpar compound-list separator rpar
compound-list <- term (separator term)*
case-keyword < 'case'
case-clause <-- case-keyword sp+ word ws+ 'in' ws+ case-item* 'esac'
case-item <-- pattern ((compound-list separator)? case-sep ws* / error)
case-sep < ';;'
pattern <-- sp* word (sp* '|' sp* word)* sp* ')' sp*
for-keyword < 'for'
in-keyword < 'in'
for-clause <-- for-keyword sp+ name (ws+ in-keyword sequence)? sp* sequential-sep do-group
sequence <-- (sp+ word)+
do-keyword < 'do'
done-keyword < 'done'
do-group <- do-keyword ws* compound-list separator done-keyword
if-keyword < 'if'
fi-keyword < 'fi'
if-clause <-- if-keyword pipeline separator then-part elif-part* else-part? fi-keyword
then-keyword < 'then'
then-part <-- then-keyword ws* compound-list separator
elif-keyword < 'elif'
elif-part <-- elif-keyword ws* compound-list separator then-keyword ws* compound-list separator else-part?
else-keyword < 'else'
else-part <-- else-keyword ws* compound-list separator
while-keyword < 'while'
while-clause <-- while-keyword ws* compound-list separator do-group
until-keyword < 'until'
until-clause <-- until-keyword ws* compound-list separator do-group
file-name <-- word
name <-- identifier
identifier <- [_a-zA-Z][_a-zA-Z0-9]*
word <-- assignment / (delim / number / variable / variable-and-or / literal)+
number <-- [0-9]+
lsubst < '$('
rsubst < ')'
tick < '`'
substitution <-- lsubst script rsubst / tick script tick
assignment <-- name assign rhs
rhs <- (substitution / word)*
assign < '='
dollar < '$'
literal <-- backslash? (!ws !amp !tick !dollar !pipe !semi !par !nl !sp !rbrace !io-op !dq !sq .)+
variable <-- dollar ('$' / '#' / '*' / '?' / '@' / [0-9] / identifier / lbrace identifier rbrace)
variable-and-or <- dollar lbrace (variable-or / variable-and / variable-hash-hash / variable-hash / variable-percent-percent / variable-percent / variable-slash ) rbrace
variable-and <-- identifier plus (pat / rhs / str)
variable-or <-- identifier minus (pat / rhs / str)
variable-hash <-- identifier hash (pat / rhs / str)
variable-hash-hash <-- identifier hash hash (pat / rhs / str)
variable-percent <-- identifier percent (pat / rhs / str)
variable-percent-percent <-- identifier percent percent (pat / rhs /str)
variable-slash <-- (identifier slash pat slash str) / (identifier slash pat slash) / (identifier slash pat)
pat <-- (!dollar !rbrace !slash .)+
str <-- (!rbrace .)+
delim <- singlequotes / doublequotes / substitution
sq < [']
dq < [\"]
bt < [`]
singlequotes <-- sq (doublequotes / (!sq .))* sq
doublequotes <-- dq (singlequotes / substitution / number / variable / variable-and-or / literal / (!dq .))* dq
break <- amp / semi !semi
separator <- (sp* break ws*) / ws+
sequential-sep <- (semi !semi ws*) / ws+
amp <- '&'
amp-amp < '&&'
pipe-pipe < '||'
backslash <- '\\'
semi < ';'
lpar < '('
rpar < ')'
lbrace < [{]
rbrace < [}]
plus < [+]
minus < '-'
hash < '#'
percent < '%'
slash < '/'
par < lpar / rpar
nl < '\n'
sp < '\t' / ' ' / (escaped-nl sp*)
ws < sp / nl
escaped-nl < (backslash nl)
error <-- .*")
(when (> %debug-level 1)
(format (current-error-port) "input:~s\n" input))
(let* ((match (match-pattern script input))
(end (peg:end match))
(pt (peg:tree match)))
(if (eq? (string-length input) end)
pt
(if match
(begin
(format (current-error-port) "parse error: at offset: ~a\n" end)
(pretty-print pt (current-error-port))
#f)
(begin
(format (current-error-port) "parse error: no match\n")
#f)))))
(define (flatten o)
(keyword-flatten '(and assignent command doublequotes for-clause literal name or pipeline singlequotes substitution word) o))
(define (unspecified? o)
(eq? o *unspecified*))
(define (transform ast)
(when (> %debug-level 1)
(pretty-print ast (current-error-port)))
(match ast
;; FIXME: flatten?
((('assignent _ ...) _ ...) (map transform (flatten ast)))
((('command _ ...) _ ...) (map transform (flatten ast)))
((('doublequotes _ ...) _ ...) (map transform (flatten ast)))
((('for-clause _ ...) _ ...) (map transform (flatten ast)))
((('literal _ ...) _ ...) (map transform (flatten ast)))
((('pipeline _ ...) _ ...) (map transform (flatten ast)))
((('singlequotes _ ...) _ ...) (map transform (flatten ast)))
((('word _ ...) ('word _ ...)) (transform (cons 'word ast)))
((('word _ ...) _ ...) (map transform (flatten ast)))
(('script ('pipeline ('command command ... (word (literal "&")))))
(background `(pipeline ',(map transform command))))
(('script terms ...) `(script ,@(map transform terms)))
(('pipeline ('command command ('io-redirect ('io-file ">" file-name))))
(transform `(pipeline (command ,@(transform command)) (lambda _ (with-output-to-file ,(transform file-name) (lambda _ (display (read-string))))))))
(('pipeline ('command command ('io-redirect "<<" ('here-document here-document))))
(transform `(pipeline (lambda _ (display ,here-document)) (command ,(transform command)))))
(('pipeline o ...)
(let ((commands (map transform o)))
`(pipeline ,@(cons (trace commands) commands))))
(('command o ...) `(command ,@(map transform o)))
(('literal o) (transform o))
(('name o) o)
(('number o) o)
;;(('assignment a b) `(assignment ,(transform a) ',(transform b)))
;; FIXME: to quote or not?
(('assignment a) `(substitution (variable ,(transform a))))
(('assignment a b) `(assignment ,(transform a) ,(transform b)))
;; (('assignment a (and b ('literal _ ...))) `(assignment ,(transform a) ,(transform b)))
;; (('assignment a b)
;; `(assignment ,(transform a) ,(map transform b)))
(('for-clause name sequence (and body ('pipeline _ ...)))
`(for ,(transform name) (lambda _ ,(transform sequence)) (lambda _ ,(transform body))))
(('for-clause name expr body)
`(for ,(transform name) (lambda _ ,(transform expr)) (lambda _ ,@(map transform body))))
(('sequence o)
`(sequence (string-split ,(transform o) #\space)))
(('sequence o ...)
`(sequence (quote ,(map transform o))))
(('and l r) `(and-terms ,(transform l) ,(transform r)))
(('or l r) `(or-terms ,(transform l) ,(transform r)))
(('substitution o) `(substitution ,(transform o)))
(('if-clause expr then) `(if-clause ,(transform expr) ,(transform then)))
(('if-clause expr then else) `(if-clause ,(transform expr) ,(transform then) ,(transform else)))
(('elif-part expr then) `(if-clause ,(transform expr) ,(transform then)))
(('elif-part expr then else) `(if-clause ,(transform expr) ,(transform then) ,(transform else)))
(('then-part o ...) `(begin ,@(map transform o)))
(('else-part o ...) `(begin ,@(map transform o)))
(('word 'singlequotes) "")
(('word o) (transform o))
(('word o ...) `(string-append ,@(map transform o)))
(('function name body)
`(function ,name (lambda ( . args) ,(transform body))))
(('brace-group o) `(brace-group ,(transform o)))
(('file-name o) `(file-name ,(transform o)))
('doublequotes "")
(_ ast)))
(define (remove-line-comments s)
(string-join (map
(lambda (s)
(let ((n (string-index s #\#)))
(if (and n (zero? n)) (string-pad-right s (string-length s) #\space 0 n)
s)))
(string-split s #\newline)) "\n"))
(define (parse-string string)
(let* ((pt ((compose parse- remove-line-comments) string))
(foo (when (> %debug-level 1) (display "tree:\n") (pretty-print pt)))
(flat (flatten pt))
(foo (when (> %debug-level 0) (display "flat:\n") (pretty-print flat)))
(ast (transform flat))
(foo (when (> %debug-level 0) (display "ast:\n") (pretty-print ast))))
(cond ((error? ast)
(stderr "error:") (pretty-print ast (current-error-port)) #f)
((eq? ast 'script)
#t)
(else ast))))
(define (parse port)
(parse-string (read-string port)))

41
peg.scm Normal file
View File

@ -0,0 +1,41 @@
;;;; peg.scm --- Parsing Expression Grammar (PEG) parser generator
;;;;
;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;;;
(define-module (peg)
#:use-module (peg codegen)
#:use-module (peg string-peg)
;; Note: the most important effect of using string-peg is not whatever
;; functions it exports, but the fact that it adds a new handler to
;; peg-sexp-compile.
#:use-module (peg simplify-tree)
#:use-module (peg using-parsers)
#:use-module (peg cache)
#:re-export (define-peg-pattern
define-peg-string-patterns
match-pattern
search-for-pattern
compile-peg-pattern
keyword-flatten
context-flatten
peg:start
peg:end
peg:string
peg:tree
peg:substring
peg-record?))

45
peg/cache.scm Normal file
View File

@ -0,0 +1,45 @@
;;;; cache.scm --- cache the results of parsing
;;;;
;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;;;
(define-module (peg cache)
#:export (cg-cached-parser))
;; The results of parsing using a nonterminal are cached. Think of it like a
;; hash with no conflict resolution. Process for deciding on the cache size
;; wasn't very scientific; just ran the benchmarks and stopped a little after
;; the point of diminishing returns on my box.
(define *cache-size* 512)
(define (make-cache)
(make-vector *cache-size* #f))
;; given a syntax object which is a parser function, returns syntax
;; which, if evaluated, will become a parser function that uses a cache.
(define (cg-cached-parser parser)
#`(let ((cache (make-cache)))
(lambda (str strlen at)
(let* ((vref (vector-ref cache (modulo at *cache-size*))))
;; Check to see whether the value is cached.
(if (and vref (eq? (car vref) str) (= (cadr vref) at))
(caddr vref);; If it is return it.
(let ((fres ;; Else calculate it and cache it.
(#,parser str strlen at)))
(vector-set! cache (modulo at *cache-size*)
(list str at fres))
fres))))))

390
peg/codegen.scm Normal file
View File

@ -0,0 +1,390 @@
;;;; codegen.scm --- code generation for composable parsers
;;;;
;;;; Copyright (C) 2011 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;;;
(define-module (peg codegen)
#:export (compile-peg-pattern wrap-parser-for-users add-peg-compiler!)
#:use-module (ice-9 pretty-print)
#:use-module (system base pmatch))
(define-syntax single?
(syntax-rules ()
"Return #t if X is a list of one element."
((_ x)
(pmatch x
((_) #t)
(else #f)))))
(define-syntax single-filter
(syntax-rules ()
"If EXP is a list of one element, return the element. Otherwise
return EXP."
((_ exp)
(pmatch exp
((,elt) elt)
(,elts elts)))))
(define-syntax push-not-null!
(syntax-rules ()
"If OBJ is non-null, push it onto LST, otherwise do nothing."
((_ lst obj)
(if (not (null? obj))
(push! lst obj)))))
(define-syntax push!
(syntax-rules ()
"Push an object onto a list."
((_ lst obj)
(set! lst (cons obj lst)))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; CODE GENERATORS
;; These functions generate scheme code for parsing PEGs.
;; Conventions:
;; accum: (all name body none)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Code we generate will have a certain return structure depending on how we're
;; accumulating (the ACCUM variable).
(define (cg-generic-ret accum name body-uneval at)
;; name, body-uneval and at are syntax
#`(let ((body #,body-uneval))
#,(cond
((and (eq? accum 'all) name)
#`(list #,at
(cond
((not (list? body)) (list '#,name body))
((null? body) '#,name)
((symbol? (car body)) (list '#,name body))
(else (cons '#,name body)))))
((eq? accum 'name)
#`(list #,at '#,name))
((eq? accum 'body)
#`(list #,at
(cond
((single? body) (car body))
(else body))))
((eq? accum 'none)
#`(list #,at '()))
(else
(begin
(pretty-print `(cg-generic-ret-error ,accum ,name ,body-uneval ,at))
(pretty-print "Defaulting to accum of none.\n")
#`(list #,at '()))))))
;; The short name makes the formatting below much easier to read.
(define cggr cg-generic-ret)
;; Generates code that matches a particular string.
;; E.g.: (cg-string syntax "abc" 'body)
(define (cg-string pat accum)
(let ((plen (string-length pat)))
#`(lambda (str len pos)
(let ((end (+ pos #,plen)))
(and (<= end len)
(string= str #,pat pos end)
#,(case accum
((all) #`(list end (list 'cg-string #,pat)))
((name) #`(list end 'cg-string))
((body) #`(list end #,pat))
((none) #`(list end '()))
(else (error "bad accum" accum))))))))
;; Generates code for matching any character.
;; E.g.: (cg-peg-any syntax 'body)
(define (cg-peg-any accum)
#`(lambda (str len pos)
(and (< pos len)
#,(case accum
((all) #`(list (1+ pos)
(list 'cg-peg-any (substring str pos (1+ pos)))))
((name) #`(list (1+ pos) 'cg-peg-any))
((body) #`(list (1+ pos) (substring str pos (1+ pos))))
((none) #`(list (1+ pos) '()))
(else (error "bad accum" accum))))))
;; Generates code for matching a range of characters between start and end.
;; E.g.: (cg-range syntax #\a #\z 'body)
(define (cg-range pat accum)
(syntax-case pat ()
((start end)
(if (not (and (char? (syntax->datum #'start))
(char? (syntax->datum #'end))))
(error "range PEG should have characters after it; instead got"
#'start #'end))
#`(lambda (str len pos)
(and (< pos len)
(let ((c (string-ref str pos)))
(and (char>=? c start)
(char<=? c end)
#,(case accum
((all) #`(list (1+ pos) (list 'cg-range (string c))))
((name) #`(list (1+ pos) 'cg-range))
((body) #`(list (1+ pos) (string c)))
((none) #`(list (1+ pos) '()))
(else (error "bad accum" accum))))))))))
;; Generate code to match a pattern and do nothing with the result
(define (cg-ignore pat accum)
(syntax-case pat ()
((inner)
(compile-peg-pattern #'inner 'none))))
(define (cg-capture pat accum)
(syntax-case pat ()
((inner)
(compile-peg-pattern #'inner 'body))))
;; Filters the accum argument to compile-peg-pattern for buildings like string
;; literals (since we don't want to tag them with their name if we're doing an
;; "all" accum).
(define (builtin-accum-filter accum)
(cond
((eq? accum 'all) 'body)
((eq? accum 'name) 'name)
((eq? accum 'body) 'body)
((eq? accum 'none) 'none)))
(define baf builtin-accum-filter)
;; Top-level function builder for AND. Reduces to a call to CG-AND-INT.
(define (cg-and clauses accum)
#`(lambda (str len pos)
(let ((body '()))
#,(cg-and-int clauses (baf accum) #'str #'len #'pos #'body))))
;; Internal function builder for AND (calls itself).
(define (cg-and-int clauses accum str strlen at body)
(syntax-case clauses ()
(()
(cggr accum 'cg-and #`(reverse #,body) at))
((first rest ...)
#`(let ((res (#,(compile-peg-pattern #'first accum) #,str #,strlen #,at)))
(and res
;; update AT and BODY then recurse
(let ((newat (car res))
(newbody (cadr res)))
(set! #,at newat)
(push-not-null! #,body (single-filter newbody))
#,(cg-and-int #'(rest ...) accum str strlen at body)))))))
;; Top-level function builder for OR. Reduces to a call to CG-OR-INT.
(define (cg-or clauses accum)
#`(lambda (str len pos)
#,(cg-or-int clauses (baf accum) #'str #'len #'pos)))
;; Internal function builder for OR (calls itself).
(define (cg-or-int clauses accum str strlen at)
(syntax-case clauses ()
(()
#f)
((first rest ...)
#`(or (#,(compile-peg-pattern #'first accum) #,str #,strlen #,at)
#,(cg-or-int #'(rest ...) accum str strlen at)))))
(define (cg-* args accum)
(syntax-case args ()
((pat)
#`(lambda (str strlen at)
(let ((body '()))
(let lp ((end at) (count 0))
(let* ((match (#,(compile-peg-pattern #'pat (baf accum))
str strlen end))
(new-end (if match (car match) end))
(count (if (> new-end end) (1+ count) count)))
(if (> new-end end)
(push-not-null! body (single-filter (cadr match))))
(if (and (> new-end end)
#,#t)
(lp new-end count)
(let ((success #,#t))
#,#`(and success
#,(cggr (baf accum) 'cg-body
#'(reverse body) #'new-end)))))))))))
(define (cg-+ args accum)
(syntax-case args ()
((pat)
#`(lambda (str strlen at)
(let ((body '()))
(let lp ((end at) (count 0))
(let* ((match (#,(compile-peg-pattern #'pat (baf accum))
str strlen end))
(new-end (if match (car match) end))
(count (if (> new-end end) (1+ count) count)))
(if (> new-end end)
(push-not-null! body (single-filter (cadr match))))
(if (and (> new-end end)
#,#t)
(lp new-end count)
(let ((success #,#'(>= count 1)))
#,#`(and success
#,(cggr (baf accum) 'cg-body
#'(reverse body) #'new-end)))))))))))
(define (cg-? args accum)
(syntax-case args ()
((pat)
#`(lambda (str strlen at)
(let ((body '()))
(let lp ((end at) (count 0))
(let* ((match (#,(compile-peg-pattern #'pat (baf accum))
str strlen end))
(new-end (if match (car match) end))
(count (if (> new-end end) (1+ count) count)))
(if (> new-end end)
(push-not-null! body (single-filter (cadr match))))
(if (and (> new-end end)
#,#'(< count 1))
(lp new-end count)
(let ((success #,#t))
#,#`(and success
#,(cggr (baf accum) 'cg-body
#'(reverse body) #'new-end)))))))))))
(define (cg-followed-by args accum)
(syntax-case args ()
((pat)
#`(lambda (str strlen at)
(let ((body '()))
(let lp ((end at) (count 0))
(let* ((match (#,(compile-peg-pattern #'pat (baf accum))
str strlen end))
(new-end (if match (car match) end))
(count (if (> new-end end) (1+ count) count)))
(if (> new-end end)
(push-not-null! body (single-filter (cadr match))))
(if (and (> new-end end)
#,#'(< count 1))
(lp new-end count)
(let ((success #,#'(= count 1)))
#,#`(and success
#,(cggr (baf accum) 'cg-body #''() #'at)))))))))))
(define (cg-not-followed-by args accum)
(syntax-case args ()
((pat)
#`(lambda (str strlen at)
(let ((body '()))
(let lp ((end at) (count 0))
(let* ((match (#,(compile-peg-pattern #'pat (baf accum))
str strlen end))
(new-end (if match (car match) end))
(count (if (> new-end end) (1+ count) count)))
(if (> new-end end)
(push-not-null! body (single-filter (cadr match))))
(if (and (> new-end end)
#,#'(< count 1))
(lp new-end count)
(let ((success #,#'(= count 1)))
#,#`(if success
#f
#,(cggr (baf accum) 'cg-body #''() #'at)))))))))))
(define (cg-expect-int clauses accum str strlen at)
(syntax-case clauses ()
((pat)
#`(or (#,(compile-peg-pattern #'pat accum) #,str #,strlen #,at)
(throw 'syntax-error (list #,at (syntax->datum #'pat))))))) ;;TODO throw partial match
(define (cg-expect clauses accum)
#`(lambda (str len pos)
#,(cg-expect-int clauses ((@@ (ice-9 peg codegen) baf) accum) #'str #'len #'pos)))
;; Association list of functions to handle different expressions as PEGs
(define peg-compiler-alist '())
(define (add-peg-compiler! symbol function)
(set! peg-compiler-alist
(assq-set! peg-compiler-alist symbol function)))
(add-peg-compiler! 'range cg-range)
(add-peg-compiler! 'ignore cg-ignore)
(add-peg-compiler! 'capture cg-capture)
(add-peg-compiler! 'and cg-and)
(add-peg-compiler! 'or cg-or)
(add-peg-compiler! '* cg-*)
(add-peg-compiler! '+ cg-+)
(add-peg-compiler! '? cg-?)
(add-peg-compiler! 'followed-by cg-followed-by)
(add-peg-compiler! 'not-followed-by cg-not-followed-by)
(add-peg-compiler! 'expect cg-expect)
;; Takes an arbitrary expressions and accumulation variable, then parses it.
;; E.g.: (compile-peg-pattern syntax '(and "abc" (or "-" (range #\a #\z))) 'all)
(define (compile-peg-pattern pat accum)
(syntax-case pat (peg-any)
(peg-any
(cg-peg-any (baf accum)))
(sym (identifier? #'sym) ;; nonterminal
#'sym)
(str (string? (syntax->datum #'str)) ;; literal string
(cg-string (syntax->datum #'str) (baf accum)))
((name . args) (let* ((nm (syntax->datum #'name))
(entry (assq-ref peg-compiler-alist nm)))
(if entry
(entry #'args accum)
(error "Bad peg form" nm #'args
"Not one of" (map car peg-compiler-alist)))))))
;; Packages the results of a parser
(define indent 0)
(define (trace? symbol)
(and #f (not (memq symbol '()))))
(define (wrap-parser-for-users for-syntax parser accumsym s-syn)
#`(lambda (str strlen at)
(when (trace? '#,s-syn)
(format (current-error-port) "~a~a\n"
(make-string indent #\space)
'#,s-syn))
(set! indent (+ indent 4))
(let ((res (#,parser str strlen at)))
(set! indent (- indent 4))
;; Try to match the nonterminal.
(let ((pos (or (and res (car res)) 0)))
(when (and (trace? '#,s-syn) (< at pos))
(format (current-error-port) "~a~a := ~s\tnext: ~s\n"
(make-string indent #\space)
'#,s-syn
(substring str at pos)
(substring str pos (min strlen (+ pos 10))))))
(if res
;; If we matched, do some post-processing to figure out
;; what data to propagate upward.
(let ((at (car res))
(body (cadr res)))
#,(cond
((eq? accumsym 'name)
#`(list at '#,s-syn))
((eq? accumsym 'all)
#`(list (car res)
(cond
((not (list? body))
(list '#,s-syn body))
((null? body) '#,s-syn)
((symbol? (car body))
(list '#,s-syn body))
(else (cons '#,s-syn body)))))
((eq? accumsym 'none) #`(list (car res) '()))
(else #`(begin res))))
;; If we didn't match, just return false.
#f))))

97
peg/simplify-tree.scm Normal file
View File

@ -0,0 +1,97 @@
;;;; simplify-tree.scm --- utility functions for the PEG parser
;;;;
;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;;;
(define-module (peg simplify-tree)
#:export (keyword-flatten context-flatten string-collapse)
#:use-module (system base pmatch))
(define-syntax single?
(syntax-rules ()
"Return #t if X is a list of one element."
((_ x)
(pmatch x
((_) #t)
(else #f)))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; POST-PROCESSING FUNCTIONS (TO CANONICALIZE MATCH TREES)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Is everything in LST true?
(define (andlst lst)
(or (null? lst)
(and (car lst) (andlst (cdr lst)))))
;; Is LST a list of strings?
(define (string-list? lst)
(and (list? lst) (not (null? lst))
(andlst (map string? lst))))
;; Groups all strings that are next to each other in LST. Used in
;; STRING-COLLAPSE.
(define (string-group lst)
(if (not (list? lst))
lst
(if (null? lst)
'()
(let ((next (string-group (cdr lst))))
(if (not (string? (car lst)))
(cons (car lst) next)
(if (and (not (null? next))
(list? (car next))
(string? (caar next)))
(cons (cons (car lst) (car next)) (cdr next))
(cons (list (car lst)) next)))))))
;; Collapses all the string in LST.
;; ("a" "b" (c d) "e" "f") -> ("ab" (c d) "ef")
(define (string-collapse lst)
(if (list? lst)
(let ((res (map (lambda (x) (if (string-list? x)
(apply string-append x)
x))
(string-group (map string-collapse lst)))))
(if (single? res) (car res) res))
lst))
;; If LST is an atom, return (list LST), else return LST.
(define (mklst lst)
(if (not (list? lst)) (list lst) lst))
;; Takes a list and "flattens" it, using the predicate TST to know when to stop
;; instead of terminating on atoms (see tutorial).
(define (context-flatten tst lst)
(if (or (not (list? lst)) (null? lst))
lst
(if (tst lst)
(list lst)
(apply append
(map (lambda (x) (mklst (context-flatten tst x)))
lst)))))
;; Takes a list and "flattens" it, using the list of keywords KEYWORD-LST to
;; know when to stop at (see tutorial).
(define (keyword-flatten keyword-lst lst)
(context-flatten
(lambda (x)
(if (or (not (list? x)) (null? x))
#t
(member (car x) keyword-lst)))
lst))

280
peg/string-peg.scm Normal file
View File

@ -0,0 +1,280 @@
;;;; string-peg.scm --- representing PEG grammars as strings
;;;;
;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;;;
(define-module (peg string-peg)
#:export (peg-as-peg
define-peg-string-patterns
peg-grammar)
#:use-module (peg using-parsers)
#:use-module (peg codegen)
#:use-module (peg simplify-tree))
;; Gets the left-hand depth of a list.
(define (depth lst)
(if (or (not (list? lst)) (null? lst))
0
(+ 1 (depth (car lst)))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; Parse string PEGs using sexp PEGs.
;; See the variable PEG-AS-PEG for an easier-to-read syntax.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Grammar for PEGs in PEG grammar.
(define peg-as-peg
"grammar <-- (nonterminal ('<--' / '<-' / '<') sp pattern)+
pattern <-- alternative (SLASH sp alternative)*
alternative <-- ([!&]? sp suffix)+
suffix <-- primary ([*+?] sp)*
primary <-- secondary ([#] sp)?
secondary <-- '(' sp pattern ')' sp / '.' sp / literal / charclass / nonterminal !'<'
literal <-- ['] (!['] .)* ['] sp
charclass <-- LB (!']' (CCrange / CCsingle))* RB sp
CCrange <-- . '-' .
CCsingle <-- .
nonterminal <-- [a-zA-Z0-9-]+ sp
sp < [ \t\n]*
SLASH < '/'
LB < '['
RB < ']'
")
(define-syntax define-sexp-parser
(lambda (x)
(syntax-case x ()
((_ sym accum pat)
(let* ((matchf (compile-peg-pattern #'pat (syntax->datum #'accum)))
(accumsym (syntax->datum #'accum))
(syn (wrap-parser-for-users x matchf accumsym #'sym)))
#`(define sym #,syn))))))
(define-sexp-parser peg-grammar all
(+ (and peg-nonterminal (or "<--" "<-" "<") peg-sp peg-pattern)))
(define-sexp-parser peg-pattern all
(and peg-alternative
(* (and (ignore "/") peg-sp peg-alternative))))
(define-sexp-parser peg-alternative all
(+ (and (? (or "!" "&")) peg-sp peg-suffix)))
(define-sexp-parser peg-suffix all
(and peg-primary (* (and (or "*" "+" "?") peg-sp))))
(define-sexp-parser peg-primary all
(and peg-secondary (? (and "#" peg-sp))))
(define-sexp-parser peg-secondary all
(or (and "(" peg-sp peg-pattern ")" peg-sp)
(and "." peg-sp)
peg-literal
peg-charclass
(and peg-nonterminal (not-followed-by "<"))))
(define-sexp-parser peg-literal all
(and "'" (* (and (not-followed-by "'") peg-any)) "'" peg-sp))
(define-sexp-parser peg-charclass all
(and (ignore "[")
(* (and (not-followed-by "]")
(or charclass-range charclass-single)))
(ignore "]")
peg-sp))
(define-sexp-parser charclass-range all (and peg-any "-" peg-any))
(define-sexp-parser charclass-single all peg-any)
(define-sexp-parser peg-nonterminal all
(and (+ (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9) "-")) peg-sp))
(define-sexp-parser peg-sp none
(* (or " " "\t" "\n")))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; PARSE STRING PEGS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Takes a string representing a PEG grammar and returns syntax that
;; will define all of the nonterminals in the grammar with equivalent
;; PEG s-expressions.
(define (peg-parser str for-syntax)
(let ((parsed (match-pattern peg-grammar str)))
(if (not parsed)
(begin
;; (display "Invalid PEG grammar!\n")
#f)
(let ((lst (peg:tree parsed)))
(cond
((or (not (list? lst)) (null? lst))
lst)
((eq? (car lst) 'peg-grammar)
#`(begin
#,@(map (lambda (x) (peg-nonterm->defn x for-syntax))
(context-flatten (lambda (lst) (<= (depth lst) 2))
(cdr lst))))))))))
;; Macro wrapper for PEG-PARSER. Parses PEG grammars expressed as strings and
;; defines all the appropriate nonterminals.
(define-syntax define-peg-string-patterns
(lambda (x)
(syntax-case x ()
((_ str)
(peg-parser (syntax->datum #'str) x)))))
;; lst has format (nonterm grabber pattern), where
;; nonterm is a symbol (the name of the nonterminal),
;; grabber is a string (either "<", "<-" or "<--"), and
;; pattern is the parse of a PEG pattern expressed as as string.
(define (peg-nonterm->defn lst for-syntax)
(let* ((nonterm (car lst))
(grabber (cadr lst))
(pattern (caddr lst))
(nonterm-name (datum->syntax for-syntax
(string->symbol (cadr nonterm)))))
#`(define-peg-pattern #,nonterm-name
#,(cond
((string=? grabber "<--") (datum->syntax for-syntax 'all))
((string=? grabber "<-") (datum->syntax for-syntax 'body))
(else (datum->syntax for-syntax 'none)))
#,(compressor (peg-pattern->defn pattern for-syntax) for-syntax))))
;; lst has format ('peg-pattern ...).
;; After the context-flatten, (cdr lst) has format
;; (('peg-alternative ...) ...), where the outer list is a collection
;; of elements from a '/' alternative.
(define (peg-pattern->defn lst for-syntax)
#`(or #,@(map (lambda (x) (peg-alternative->defn x for-syntax))
(context-flatten (lambda (x) (eq? (car x) 'peg-alternative))
(cdr lst)))))
;; lst has format ('peg-alternative ...).
;; After the context-flatten, (cdr lst) has the format
;; (item ...), where each item has format either ("!" ...), ("&" ...),
;; or ('peg-suffix ...).
(define (peg-alternative->defn lst for-syntax)
#`(and #,@(map (lambda (x) (peg-body->defn x for-syntax))
(context-flatten (lambda (x) (or (string? (car x))
(eq? (car x) 'peg-suffix)))
(cdr lst)))))
;; lst has the format either
;; ("!" ('peg-suffix ...)), ("&" ('peg-suffix ...)), or
;; ('peg-suffix ...).
(define (peg-body->defn lst for-syntax)
(cond
((equal? (car lst) "&")
#`(followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
((equal? (car lst) "!")
#`(not-followed-by #,(peg-suffix->defn (cadr lst) for-syntax)))
((eq? (car lst) 'peg-suffix)
(peg-suffix->defn lst for-syntax))
(else `(peg-parse-body-fail ,lst))))
;; lst has format ('peg-suffix <peg-primary> (? (/ "*" "?" "+")))
(define (peg-suffix->defn lst for-syntax)
(let ((inner-defn (peg-primary->defn (cadr lst) for-syntax)))
(cond
((null? (cddr lst))
inner-defn)
((equal? (caddr lst) "*")
#`(* #,inner-defn))
((equal? (caddr lst) "?")
#`(? #,inner-defn))
((equal? (caddr lst) "+")
#`(+ #,inner-defn)))))
;; Parse a primary.
(define (peg-primary->defn lst for-syntax)
(let ((inner-defn (peg-secondary->defn (cadr lst) for-syntax)))
(if (and (pair? (cddr lst)) (equal? (caddr lst) "#")) #`(expect #,inner-defn)
inner-defn)))
(define (peg-secondary->defn lst for-syntax)
(let ((el (cadr lst)))
(cond
((list? el)
(cond
((eq? (car el) 'peg-literal)
(peg-literal->defn el for-syntax))
((eq? (car el) 'peg-charclass)
(peg-charclass->defn el for-syntax))
((eq? (car el) 'peg-nonterminal)
(datum->syntax for-syntax (string->symbol (cadr el))))))
((string? el)
(cond
((equal? el "(")
(peg-pattern->defn (caddr lst) for-syntax))
((equal? el ".")
(datum->syntax for-syntax 'peg-any))
(else (datum->syntax for-syntax
`(peg-parse-any unknown-string ,lst)))))
(else (datum->syntax for-syntax
`(peg-parse-any unknown-el ,lst))))))
;; Trims characters off the front and end of STR.
;; (trim-1chars "'ab'") -> "ab"
(define (trim-1chars str) (substring str 1 (- (string-length str) 1)))
;; Parses a literal.
(define (peg-literal->defn lst for-syntax)
(datum->syntax for-syntax (trim-1chars (cadr lst))))
;; Parses a charclass.
(define (peg-charclass->defn lst for-syntax)
#`(or
#,@(map
(lambda (cc)
(cond
((eq? (car cc) 'charclass-range)
#`(range #,(datum->syntax
for-syntax
(string-ref (cadr cc) 0))
#,(datum->syntax
for-syntax
(string-ref (cadr cc) 2))))
((eq? (car cc) 'charclass-single)
(datum->syntax for-syntax (cadr cc)))))
(context-flatten
(lambda (x) (or (eq? (car x) 'charclass-range)
(eq? (car x) 'charclass-single)))
(cdr lst)))))
;; Compresses a list to save the optimizer work.
;; e.g. (or (and a)) -> a
(define (compressor-core lst)
(if (or (not (list? lst)) (null? lst))
lst
(cond
((and (or (eq? (car lst) 'or) (eq? (car lst) 'and))
(null? (cddr lst)))
(compressor-core (cadr lst)))
((and (eq? (car lst) 'body)
(eq? (cadr lst) 'lit)
(eq? (cadddr lst) 1))
(compressor-core (caddr lst)))
(else (map compressor-core lst)))))
(define (compressor syn for-syntax)
(datum->syntax for-syntax
(compressor-core (syntax->datum syn))))
;; Builds a lambda-expressions for the pattern STR using accum.
(define (peg-string-compile args accum)
(syntax-case args ()
((str-stx) (string? (syntax->datum #'str-stx))
(let ((string (syntax->datum #'str-stx)))
(compile-peg-pattern
(compressor
(peg-pattern->defn
(peg:tree (match-pattern peg-pattern string)) #'str-stx)
#'str-stx)
(if (eq? accum 'all) 'body accum))))
(else (error "Bad embedded PEG string" args))))
(add-peg-compiler! 'peg peg-string-compile)

116
peg/using-parsers.scm Normal file
View File

@ -0,0 +1,116 @@
;;;; using-parsers.scm --- utilities to make using parsers easier
;;;;
;;;; Copyright (C) 2010, 2011 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;;;;
(define-module (peg using-parsers)
#:use-module (peg simplify-tree)
#:use-module (peg codegen)
#:use-module (peg cache)
#:export (match-pattern define-peg-pattern search-for-pattern
prec make-prec peg:start peg:end peg:string
peg:tree peg:substring peg-record?))
;;;
;;; Helper Macros
;;;
(define-syntax until
(syntax-rules ()
"Evaluate TEST. If it is true, return its value. Otherwise,
execute the STMTs and try again."
((_ test stmt stmt* ...)
(let lp ()
(or test
(begin stmt stmt* ... (lp)))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; FOR DEFINING AND USING NONTERMINALS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Parses STRING using NONTERM
(define (match-pattern nonterm string)
;; We copy the string before using it because it might have been modified
;; in-place since the last time it was parsed, which would invalidate the
;; cache. Guile uses copy-on-write for strings, so this is fast.
(let ((res (nonterm (string-copy string) (string-length string) 0)))
(if (not res)
#f
(make-prec 0 (car res) string (string-collapse (cadr res))))))
;; Defines a new nonterminal symbol accumulating with ACCUM.
(define-syntax define-peg-pattern
(lambda (x)
(syntax-case x ()
((_ sym accum pat)
(let ((matchf (compile-peg-pattern #'pat (syntax->datum #'accum)))
(accumsym (syntax->datum #'accum)))
;; CODE is the code to parse the string if the result isn't cached.
(let ((syn (wrap-parser-for-users x matchf accumsym #'sym)))
#`(define sym #,(cg-cached-parser syn))))))))
(define (peg-like->peg pat)
(syntax-case pat ()
(str (string? (syntax->datum #'str)) #'(peg str))
(else pat)))
;; Searches through STRING for something that parses to PEG-MATCHER. Think
;; regexp search.
(define-syntax search-for-pattern
(lambda (x)
(syntax-case x ()
((_ pattern string-uncopied)
(let ((pmsym (syntax->datum #'pattern)))
(let ((matcher (compile-peg-pattern (peg-like->peg #'pattern) 'body)))
;; We copy the string before using it because it might have been
;; modified in-place since the last time it was parsed, which would
;; invalidate the cache. Guile uses copy-on-write for strings, so
;; this is fast.
#`(let ((string (string-copy string-uncopied))
(strlen (string-length string-uncopied))
(at 0))
(let ((ret (until (or (>= at strlen)
(#,matcher string strlen at))
(set! at (+ at 1)))))
(if (eq? ret #t) ;; (>= at strlen) succeeded
#f
(let ((end (car ret))
(match (cadr ret)))
(make-prec
at end string
(string-collapse match))))))))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;; PMATCH STRUCTURE MUNGING
;; Pretty self-explanatory.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define prec
(make-record-type "peg" '(start end string tree)))
(define make-prec
(record-constructor prec '(start end string tree)))
(define (peg:start pm)
(if pm ((record-accessor prec 'start) pm) #f))
(define (peg:end pm)
(if pm ((record-accessor prec 'end) pm) #f))
(define (peg:string pm)
(if pm ((record-accessor prec 'string) pm) #f))
(define (peg:tree pm)
(if pm ((record-accessor prec 'tree) pm) #f))
(define (peg:substring pm)
(if pm (substring (peg:string pm) (peg:start pm) (peg:end pm)) #f))
(define peg-record? (record-predicate prec))