diff --git a/Makefile.am b/Makefile.am index e061889..3108122 100644 --- a/Makefile.am +++ b/Makefile.am @@ -50,6 +50,7 @@ MODULES = \ geesh/eval.scm \ geesh/lexer.scm \ geesh/parser.scm \ + geesh/pattern.scm \ geesh/repl.scm \ geesh/shell.scm \ geesh/word.scm @@ -61,6 +62,7 @@ TESTS = \ tests/environment.scm \ tests/lexer.scm \ tests/parser.scm \ + tests/pattern.scm \ tests/shell.scm \ tests/word.scm diff --git a/geesh/pattern.scm b/geesh/pattern.scm new file mode 100644 index 0000000..d0df559 --- /dev/null +++ b/geesh/pattern.scm @@ -0,0 +1,233 @@ +(define-module (geesh pattern) + #:use-module (ice-9 match) + #:use-module (ice-9 receive) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-26) + #:export (parse-pattern + pattern-quote + pattern-plain? + pattern-match?)) + +(define* (parse-rdelim s1 s2 #:optional (start 0) (end (string-length s1))) + + (define (not-backslash? chr) (not (char=? chr #\\))) + + (define (escaped? index) + (even? (- index (or (string-rindex s1 not-backslash? start index) 0)))) + + (let loop ((index (string-contains s1 s2 start end))) + (match index + (#f (values #f 0)) + (_ (if (escaped? index) + (loop (string-contains s1 s2 (1+ index) end)) + (values (substring s1 start index) (+ (- index start) 2))))))) + +(define* (parse-collating-symbol s #:optional (start 0) + (end (string-length s))) + (parse-rdelim s ".]" start end)) + +(define* (parse-equivalence-class s #:optional (start 0) + (end (string-length s))) + (parse-rdelim s "=]" start end)) + +(define* (parse-character-class s #:optional (start 0) + (end (string-length s))) + (parse-rdelim s ":]" start end)) + +(define ascii-lower-case + (char-set->list (char-set-intersection char-set:ascii + char-set:lower-case))) + +(define ascii-upper-case + (char-set->list (char-set-intersection char-set:ascii + char-set:upper-case))) + +(define ascii-digits + (char-set->list (char-set-intersection char-set:ascii + char-set:digit))) + +(define* (parse-matching-bracket-expression s #:optional (start 0) + (end (string-length s))) + (let loop ((i start) (acc '())) + (match (and (< i end) (string-ref s i)) + (#f (values #f 0)) + (#\] (if (= i start) + (loop (1+ i) (cons #\] acc)) + (values (list->char-set acc) (1+ (- i start))))) + (#\[ (match (and (< (1+ i) end) (string-ref s (1+ i))) + (#\. (receive (result length) + (parse-collating-symbol s (+ i 2) end) + (if result + (throw 'pattern-collating-symbol) + (loop (1+ i) (cons #\[ acc))))) + (#\= (receive (result length) + (parse-equivalence-class s (+ i 2) end) + (if result + (throw 'pattern-equivalence-class) + (loop (1+ i) (cons #\[ acc))))) + (#\: (receive (result length) + (parse-character-class s (+ i 2) end) + (if result + (throw 'pattern-character-class) + (loop (1+ i) (cons #\[ acc))))) + (_ (loop (1+ i) (cons #\[ acc))))) + (#\- (if (or (= i start) + (and (< (1+ i) end) (char=? (string-ref s (1+ i)) #\]))) + (loop (1+ i) (cons #\- acc)) + (let ((alpha (and (pair? acc) (car acc))) + (omega (and (< (1+ i) end) (string-ref s (1+ i))))) + (match `(,alpha . ,omega) + ((#\a . #\z) + (loop (+ i 2) (append ascii-lower-case acc))) + ((#\A . #\Z) + (loop (+ i 2) (append ascii-upper-case acc))) + ((#\0 . #\9) + (loop (+ i 2) (append ascii-digits acc))) + (_ (throw 'pattern-range-expression)))))) + (#\\ (if (< (1+ i) end) + (loop (+ i 2) (cons (string-ref s (1+ i)) acc)) + (loop (1+ i) acc))) + (chr (loop (1+ i) (cons chr acc)))))) + +(define* (parse-bracket-expression s #:optional (start 0) + (end (string-length s))) + (let* ((matching? (not (char=? (string-ref s start) #\!))) + (start* (if matching? start (1+ start)))) + (receive (result length) + (parse-matching-bracket-expression s start* end) + (if (or (not result) matching?) + (values result length) + (values (char-set-complement! result) (1+ length)))))) + +(define* (parse-pattern s #:optional (start 0) (end (string-length s))) + "Parse the string @var{s} as a pattern." + (let loop ((i start) (parts '()) (acc '())) + (match (and (< i end) (string-ref s i)) + (#f (if (null? acc) + (reverse! parts) + (reverse! (cons (reverse! acc) parts)))) + (#\* (if (null? acc) + (loop (1+ i) (cons '* parts) '()) + (loop (1+ i) (cons* '* (reverse! acc) parts) '()))) + (#\? (loop (1+ i) parts (cons '? acc))) + (#\[ (if (< (1+ i) end) + (receive (result length) + (parse-bracket-expression s (1+ i) end) + (if result + (loop (+ i length 1) parts (cons result acc)) + (loop (1+ i) parts (cons #\[ acc)))) + (loop (1+ i) parts (cons #\[ acc)))) + (#\\ (if (< (1+ i) end) + (loop (+ i 2) parts (cons (string-ref s (1+ i)) acc)) + (loop (1+ i) parts acc))) + (chr (loop (1+ i) parts (cons chr acc)))))) + +(define pattern-quote + (let ((specials '(#\\ #\* #\? #\[ #\] #\! #\-))) + (lambda (s) + "Quote all the special characters in @var{s} so that none of +them are treated specially when @var{s} is interpreted as a pattern." + (reverse-list->string + (string-fold (lambda (chr acc) + (if (member chr specials) + (cons* chr #\\ acc) + (cons chr acc))) + '() + s))))) + +(define (pattern-plain? pattern) + "Check if @var{pattern} free of special pattern constructions like +asterisks and bracket expressions. If a pattern is ``plain'' its +source string is the only string that will match it." + (every (match-lambda + ('* #f) + (parts (every (match-lambda + ((or (? char?) (? string?)) #t) + (_ #f)) + parts))) + pattern)) + +(define* (string-starts-with-part s part #:optional (start 0) + (end (string-length s))) + (let loop ((part part) (i start)) + (match (and (< i end) part) + (#f (match part + (() (- i start)) + (((? string? s2) . rest) + (and (string-null? s2) + (loop rest i))) + (_ #f))) + (() (- i start)) + (('? . rest) (loop rest (1+ i))) + (((? string? s2) . rest) + (and (string-prefix? s2 s 0 (string-length s2) i end) + (loop rest (+ i (string-length s2))))) + (((? char? chr) . rest) + (and (char=? (string-ref s i) chr) + (loop rest (1+ i)))) + (((? char-set? cs) . rest) + (and (char-set-contains? cs (string-ref s i)) + (loop rest (1+ i))))))) + +(define (pattern-part-length part) + (let loop ((part part) (length 0)) + (match part + (() length) + (((? string? str) . rest) (loop rest (+ length (string-length str)))) + ((first . rest) (loop rest (1+ length)))))) + +(define* (string-ends-with-part s part #:optional (start 0) + (end (string-length s))) + (let ((start* (- end (pattern-part-length part)))) + (and (>= start* start) + (string-starts-with-part s part start* end)))) + +(define* (string-contains-part s part #:optional (start 0) + (end (string-length s))) + (let loop ((part part) (i start)) + (match part + (() (cons start (- i start))) + (('? . rest) (loop rest (1+ i))) + (((? string? s2) . rest) + (and=> (string-contains s s2 i end) + (lambda (index) + (or (and=> (string-starts-with-part s part index end) + (cut cons index <>)) + (loop part (1+ index)))))) + (((or (? char? cp) (? char-set? cp)) . rest) + (and=> (string-index s cp i end) + (lambda (index) + (or (and=> (string-starts-with-part s part index end) + (cut cons index <>)) + (loop part (1+ index))))))))) + +(define* (pattern-match? pattern str #:optional (start 0) + (end (string-length str)) + #:key explicit-initial-period?) + "Check if @var{str} matches @var{pattern}." + (if (and explicit-initial-period? + (< start end) + (char=? (string-ref str start) #\.)) + (match pattern + (((#\. . _) . _) + (pattern-match? pattern str start end + #:explicit-initial-period? #f)) + ((((? string? s) . _) ._) + (and (string-prefix? "." s) + (pattern-match? pattern str start end + #:explicit-initial-period? #f))) + (_ #f)) + (let loop ((pattern pattern) (i start)) + (match pattern + (() (= i end)) + (('*) #t) + (('* (? pair? part)) (string-ends-with-part str part i end)) + (('* (? pair? part) . rest) + (and=> (string-contains-part str part i end) + (match-lambda + ((match-index . match-length) + (loop rest (+ match-index match-length)))))) + (((? pair? part) . rest) + (and=> (string-starts-with-part str part i end) + (lambda (length) + (loop rest (+ i length))))))))) diff --git a/tests/pattern.scm b/tests/pattern.scm new file mode 100644 index 0000000..44a7b27 --- /dev/null +++ b/tests/pattern.scm @@ -0,0 +1,171 @@ +;;; The Geesh Shell Interpreter +;;; Copyright 2018 Timothy Sample +;;; +;;; This file is part of Geesh. +;;; +;;; Geesh is free software: you can redistribute it and/or modify +;;; it under the terms of the GNU General Public License as published by +;;; the Free Software Foundation, either version 3 of the License, or +;;; (at your option) any later version. +;;; +;;; Geesh is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with Geesh. If not, see . + +(define-module (test-pattern) + #:use-module (geesh pattern) + #:use-module (srfi srfi-64) + #:use-module (tests automake)) + +;;; Commentary: +;;; +;;; Tests for the pattern module. +;;; +;;; Code: + +(test-begin "pattern") + + +;;; Basic matching + +(test-assert "Matches single characters" + (pattern-match? (parse-pattern "a") "a")) + +(test-assert "Matches a sequence of characters" + (pattern-match? (parse-pattern "abc") "abc")) + +(test-assert "Matches an empty string" + (pattern-match? (parse-pattern "") "")) + +(test-assert "Fails on pattern too long" + (not (pattern-match? (parse-pattern "ab") "a"))) + +(test-assert "Fails on pattern too short" + (not (pattern-match? (parse-pattern "a") "ab"))) + + +;;; Question marks + +(test-assert "Matches a question mark with anything" + (pattern-match? (parse-pattern "???") "abc")) + +(test-assert "Does not match question mark with empty string" + (not (pattern-match? (parse-pattern "?") ""))) + + +;;; Bracket expressions + +(test-assert "Matches with bracket expressions" + (pattern-match? (parse-pattern "[abc][def][ghi]") "aei")) + +(test-assert "Fails on non-matching bracket expressions" + (not (pattern-match? (parse-pattern "[abc][def][ghi]") "aex"))) + +(test-assert "Matches unterminated bracket expression normally" + (pattern-match? (parse-pattern "a[bc") "a[bc")) + +(test-assert "Matches with bracket expressions with left brackets" + (pattern-match? (parse-pattern "a[[]c") "a[c")) + +(test-assert "Matches with bracket expressions with right brackets" + (pattern-match? (parse-pattern "a[]]c") "a]c")) + +(test-assert "Matches with bracket expressions starting with hyphen" + (pattern-match? (parse-pattern "a[-b]c") "a-c")) + +(test-assert "Matches with bracket expressions ending with hyphen" + (pattern-match? (parse-pattern "a[b-]c") "a-c")) + +(test-assert "Matches with bracket expressions containing bang" + (pattern-match? (parse-pattern "a[b!]c") "a!c")) + +(test-assert "Matches with negated bracket expressions" + (pattern-match? (parse-pattern "a[!x]c") "abc")) + +(test-assert "Fails on non-matching negated bracket expressions" + (not (pattern-match? (parse-pattern "a[!x]c") "axc"))) + +;; We do not fully support the following features. However, rather +;; than do something strange, we raise explicit errors when they are +;; encountered. + +(test-error "Does not allow collating symbols" + (parse-pattern "[[.ch.]]")) + +(test-assert "Allows unterminated collating symbols" + (parse-pattern "[[.ch]")) + +(test-error "Does not allow equivalence classes" + (parse-pattern "[[=a=]]")) + +(test-assert "Allows unterminated equivalence classes" + (parse-pattern "[[=a]")) + +(test-error "Does not allow character classes" + (parse-pattern "[[:space:]]")) + +(test-assert "Allows unterminated character classes" + (parse-pattern "[[:space]")) + +(test-error "Does not allow general character ranges" + (parse-pattern "[<->]")) + +(test-assert "Allows [a-z]" + (parse-pattern "[a-z]")) + +(test-assert "Allows [A-Z]" + (parse-pattern "[A-Z]")) + +(test-assert "Allows [0-9]" + (parse-pattern "[0-9]")) + +(test-assert "Matches with allowed character ranges" + (pattern-match? (parse-pattern "[a-z][A-Z][0-9]") "mJ2")) + + +;;; Asterisks + +(test-assert "Matches with asterisk" + (pattern-match? (parse-pattern "*") "abc")) + +(test-assert "Matches empty string with asterisk" + (pattern-match? (parse-pattern "*") "")) + +(test-assert "Matches with trailing asterisk" + (pattern-match? (parse-pattern "foo*") "foobar")) + +(test-assert "Fails on non-matching trailing asterisk" + (not (pattern-match? (parse-pattern "foo*") "goobar"))) + +(test-assert "Matches with leading asterisk" + (pattern-match? (parse-pattern "*bar") "foobar")) + +(test-assert "Fails on non-matching leading asterisk" + (not (pattern-match? (parse-pattern "*bar") "foobaz"))) + +(test-assert "Fails on non-matching leading asterisk (internal match)" + (not (pattern-match? (parse-pattern "*bar") "foobarbaz"))) + +(test-assert "Matches with internal asterisk" + (pattern-match? (parse-pattern "foo*bar") "foo boo! bar")) + +(test-assert "Fails on non-matching internal asterisk (start)" + (not (pattern-match? (parse-pattern "foo*bar") "goo boo! bar"))) + +(test-assert "Fails on non-matching internal asterisk (end)" + (not (pattern-match? (parse-pattern "foo*bar") "foo boo! baz"))) + + +;;; Quoting + +;; TODO: Test quoting. + +(test-end) + +;; Local Variables: +;; eval: (put 'test-error 'scheme-indent-function 1) +;; End: