From 4b7d2404f3a03d980995be00ecf4b0a13bcd2a02 Mon Sep 17 00:00:00 2001 From: Timothy Sample Date: Sat, 10 Nov 2018 21:12:45 -0500 Subject: [PATCH] Fix handling of non-whitespace IFS separators * geesh/word.scm (string-tokenize*): New function. (split-fields): Use it to handle non-whitespace IFS separators. * tests/word.scm: Add tests. --- geesh/word.scm | 58 +++++++++++++++++++++++++++++++++++++++++++------- tests/word.scm | 42 ++++++++++++++++++++++++++++++++++-- 2 files changed, 90 insertions(+), 10 deletions(-) diff --git a/geesh/word.scm b/geesh/word.scm index 0290f06..d027879 100644 --- a/geesh/word.scm +++ b/geesh/word.scm @@ -64,11 +64,40 @@ guaranteed to be a list." (else (loop (cdr xs) (cons (car xs) small-acc) big-acc))))) +(define (string-tokenize* s token-set) + "Split the string @var{s} into a list of substrings, where each +substring is a maximal non-empty contiguous sequence of characters +from the character set @var{token-set} or its compliment." + + (define token-set-complement + (let ((token-set* (char-set-complement token-set))) + (lambda (cs) + (if (eq? cs token-set) token-set* token-set)))) + + (let loop ((index 0) (start 0) (cs token-set) (acc '())) + (cond + ((>= index (string-length s)) + (reverse! (if (> index start) + (cons (substring s start index) acc) + acc))) + ((char-set-contains? cs (string-ref s index)) + (loop (1+ index) start cs acc)) + (else + (loop index index + (token-set-complement cs) + (if (> index start) + (cons (substring s start index) acc) + acc)))))) + (define (split-fields qword ifs) "Split @var{qword} into a list of qwords delimited by any character in the string @var{ifs}." - (define ifs? (cut string-index ifs <>)) + (define char-set:ifs + (string->char-set ifs)) + + (define char-set:ifs/nw + (char-set-difference char-set:ifs char-set:whitespace)) (define (wedge-apart-quote qword) (let loop ((qword (normalize-word qword)) (acc '())) @@ -87,14 +116,27 @@ in the string @var{ifs}." (match qword ((' quote) (wedge-apart-quote quote)) ((' vals) (apply append (infix '(wedge) (map wedge-apart vals)))) - ("" '()) ((? string? str) - (let ((str-parts (string-split str ifs?))) - (if (every string-null? str-parts) - '(wedge) - (filter (lambda (x) - (or (eq? x 'wedge) (not (string-null? x)))) - (infix 'wedge str-parts))))) + (let ((tokens (string-tokenize* str char-set:ifs))) + (append-map (lambda (token) + (if (string-any char-set:ifs token) + ;; Every occurrence of a non-whitespace + ;; separator must delimit a field. This + ;; means that we have to add a blank field + ;; for every non-whitespace separator in + ;; 'token' beyond the first. + (let ((count (string-count token char-set:ifs/nw))) + (cons 'wedge + (append-map (const '("" wedge)) + (iota (max 0 (- count 1)))))) + (list token))) + ;; When a word starts with a non-whitespace + ;; separator, it still delimits two fields, the + ;; one on the left being empty. + (match tokens + (((? (cut string-any char-set:ifs/nw <>)) . rest) + (cons "" tokens)) + (_ tokens))))) (_ (append-map wedge-apart qword)))) (let ((wedged (wedge-apart qword))) diff --git a/tests/word.scm b/tests/word.scm index 7e115e5..e80aba2 100644 --- a/tests/word.scm +++ b/tests/word.scm @@ -359,7 +359,45 @@ the `set' built-in for details on these options.)" ;;; Field splitting (IFS) -;;; -;;; FIXME: Test that field splitting respects the IFS variable. + +(test-equal "Respects IFS value" + '("foo" "bar") + (let ((env (make-test-env '(("IFS" . "-"))))) + (expand-word env '("foo-bar")))) + +(test-equal "Combines multiple whitespace separators" + '("foo" "bar") + (let ((env (make-test-env '(("IFS" . " "))))) + (expand-word env '("foo bar")))) + +(test-equal "Keeps multiple non-whitespace separators" + '("foo" "" "bar") + (let ((env (make-test-env '(("IFS" . "-"))))) + (expand-word env '("foo--bar")))) + +(test-equal "Combines whitespace separators with a non-whitespace separator" + '("foo" "bar") + (let ((env (make-test-env '(("IFS" . "- "))))) + (expand-word env '("foo - bar")))) + +(test-equal "Keeps multiple non-whitespace separators with whitespace" + '("foo" "" "bar") + (let ((env (make-test-env '(("IFS" . "- "))))) + (expand-word env '("foo - - bar")))) + +(test-equal "Splits on leading non-whitespace separator" + '("" "foo") + (let ((env (make-test-env '(("IFS" . "-"))))) + (expand-word env '("-foo")))) + +(test-equal "Does not split on trailing non-whitespace separator" + '("foo") + (let ((env (make-test-env '(("IFS" . "-"))))) + (expand-word env '("foo-")))) + +(test-equal "Makes one field for single non-whitespace separator" + '("") + (let ((env (make-test-env '(("IFS" . "-"))))) + (expand-word env '("-")))) (test-end)