From 194c098ab78f10153028abe11a89fcc2e8f2e7da Mon Sep 17 00:00:00 2001 From: Timothy Sample Date: Sun, 9 Dec 2018 13:30:54 -0500 Subject: [PATCH] sed: Fully parse commands before processing them. --- build-aux/build-guile.sh | 2 + gash/commands/sed.scm | 84 ++++++--- gash/commands/sed/reader.scm | 322 +++++++++++++++++++++++++++++++++++ 3 files changed, 383 insertions(+), 25 deletions(-) create mode 100644 gash/commands/sed/reader.scm diff --git a/build-aux/build-guile.sh b/build-aux/build-guile.sh index 91e1a0c..6817a3c 100755 --- a/build-aux/build-guile.sh +++ b/build-aux/build-guile.sh @@ -77,6 +77,8 @@ ${srcdest}gash/commands/tr.scm ${srcdest}gash/commands/wc.scm ${srcdest}gash/commands/which.scm +${srcdest}gash/commands/sed/reader.scm + " SCRIPTS=" diff --git a/gash/commands/sed.scm b/gash/commands/sed.scm index 1d55117..fc78574 100644 --- a/gash/commands/sed.scm +++ b/gash/commands/sed.scm @@ -1,5 +1,6 @@ ;;; Gash --- Guile As SHell ;;; Copyright © 2018 Jan (janneke) Nieuwenhuizen +;;; Copyright © 2018 Timothy Sample ;;; ;;; This file is part of Gash. ;;; @@ -25,7 +26,9 @@ #:use-module (ice-9 match) #:use-module (ice-9 receive) #:use-module (ice-9 regex) + #:use-module (srfi srfi-26) + #:use-module (gash commands sed reader) #:use-module (gash config) #:use-module (gash guix-utils) #:use-module (gash shell-utils) @@ -35,7 +38,7 @@ sed )) -(define (replace->lambda string modifiers) +(define (replace->lambda string global?) (define (replace->string m s) (list->string (let loop ((lst (string->list string))) @@ -73,9 +76,51 @@ (let* ((refs (- (vector-length m) 2)) (replace (replace->string m string)) (replace (cons* replace (substring l o (match:start m)) r))) - (if (memq #\g modifiers) (loop rest (match:end m) replace) + (if global? (loop rest (match:end m) replace) (loop '() (match:end m) replace)))))))) +(define (replace-escapes str) + (let* ((str (string-replace-string str "\\n" "\n")) + (str (string-replace-string str "\\r" "\r")) + (str (string-replace-string str "\\t" "\t"))) + str)) + +(define extended? (make-parameter #f)) + +(define (substitute str pattern replacement flags) + (let* ((global? (memq 'g flags)) + (flags (cons (if (extended?) regexp/extended regexp/basic) + (if (memq 'i flags) `(,regexp/icase) '()))) + (regexp (apply make-regexp (replace-escapes pattern) flags)) + (proc (replace->lambda (replace-escapes replacement) global?))) + (match (list-matches regexp str) + ((and m+ (_ _ ...)) (proc str m+)) + (_ str)))) + +(define (execute-function function str) + (match function + (('s pattern replacement flags) + (substitute str pattern replacement flags)) + (_ (error "SED: unsupported function" function)))) + +(define (execute-commands commands str) + (match commands + (() str) + ((('always function) . rest) + (execute-commands rest (execute-function function str))) + ((cmd . rest) (error "SED: could not process command" cmd)))) + +(define* (edit-stream commands #:optional + (in (current-input-port)) + (out (current-output-port))) + (let loop ((pattern-space (read-line in))) + (unless (eof-object? pattern-space) + (let ((result (execute-commands commands pattern-space))) + (display result out) + (newline out) + (loop (read-line in)))) + #t)) + (define (sed . args) (let* ((option-spec '((expression (single-char #\e) (value #t)) @@ -87,12 +132,13 @@ (version (single-char #\V)))) (options (getopt-long args option-spec)) (files (option-ref options '() '())) - (extended? (or (option-ref options 'extended #f) - (option-ref options 'posix-extended #f))) (help? (option-ref options 'help #f)) (in-place? (option-ref options 'in-place #f)) (usage? (and (not help?) (or (and (null? files) (isatty? (current-input-port)))))) (version? (option-ref options 'version #f))) + (when (or (option-ref options 'extended #f) + (option-ref options 'posix-extended #f)) + (extended? #t)) (cond (version? (format #t "sed (GASH) ~a\n" %version) (exit 0)) ((or help? usage?) (format (if usage? (current-error-port) #t) "\ @@ -111,33 +157,21 @@ Usage: sed [OPTION]... [SCRIPT] [FILE]... (receive (scripts files) (if (pair? (append script-files scripts)) (values scripts files) (values (list-head files 1) (cdr files))) - (define (script->command o) - (cond ((string-prefix? "s" o) - (let* ((command (substring o 1)) - (string (substring command 1)) - (string (string-replace-string string "\\n" "\n")) - (string (string-replace-string string "\\r" "\r")) - (string (string-replace-string string "\\t" "\t")) - (separator (string-ref command 0))) - (receive (search replace modifier-string) - (apply values (string-split string separator)) - (let* ((modifiers (string->list modifier-string)) - (flags (if extended? (list regexp/extended) (list regexp/basic))) - (flags (if (memq #\i modifiers) (cons regexp/icase flags) - flags))) - `((,search . ,flags) . ,(replace->lambda replace modifiers)))))) - (else (error (format #f "SED: command not supported: ~s\n" o))))) (when (pair? script-files) (error "SED: script files not supported")) - (let ((commands (map script->command scripts))) + (let* ((script (string-join scripts "\n")) + (commands + (call-with-input-string script + (cut read-sed-all <> #:extended? (extended?))))) (cond ((and in-place? (pair? files)) - (for-each (lambda (file) (substitute* file commands)) files)) + (with-atomic-file-replacement + (cut edit-stream commands <> <>))) ((pair? files) (for-each (lambda (file) - (with-input-from-file file - (lambda _ (substitute-port commands)))) + (call-with-input-file file + (cut edit-stream commands <>))) files)) - (else (substitute-port commands)))))))))) + (else (edit-stream commands)))))))))) (use-modules (ice-9 rdelim)) (define main sed) diff --git a/gash/commands/sed/reader.scm b/gash/commands/sed/reader.scm new file mode 100644 index 0000000..8e9387d --- /dev/null +++ b/gash/commands/sed/reader.scm @@ -0,0 +1,322 @@ +;;; Gash --- Guile As SHell +;;; Copyright © 2018 Timothy Sample +;;; +;;; This file is part of Gash. +;;; +;;; Gash is free software; you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License as published by +;;; the Free Software Foundation; either version 3 of the License, or (at +;;; your option) any later version. +;;; +;;; Gash is distributed in the hope that it will be useful, but WITHOUT ANY +;;; WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +;;; FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +;;; details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with Gash. If not, see . + +(define-module (gash commands sed reader) + #:use-module (ice-9 match) + #:use-module (ice-9 rdelim) + #:use-module (rnrs io ports) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-26) + #:export (read-sed + read-sed-all)) + +;;; Commentary: +;;; +;;; This module provides a reader for the `sed' stream editing +;;; language. +;;; +;;; Code: + +(define (next-char port) + "Discard one character from PORT, and return the next character to +be read." + (get-char port) + (lookahead-char port)) + +(define (get-char-while cs port) + "Read text from PORT until a character is found that does not belong +to the character set CS." + (let loop ((chr (lookahead-char port)) (acc '())) + (if (or (eof-object? chr) + (not (char-set-contains? cs chr))) + (reverse-list->string acc) + (loop (next-char port) (cons chr acc))))) + +(define (read-number port) + "Read a nonnegative integer from PORT." + (let* ((str (get-char-while char-set:digit port)) + (n (string->number str))) + (unless n + (error "Expected a number")) + n)) + +(define (read-bracket-expression port) + "Read a regular expression bracket expression from PORT, +assuming that it is positioned just after the initial open +bracket (`['). Return as a string the complete bracket expression, +including both brackets. + +This procedure takes into account all the ways that a close +bracket (`]') may occur in a bracket expression without terminating +it, such as named character classes and backslash escapes." + + (define (read-until-pair chr1 chr2 port) + (let loop ((chunk (read-delimited chr1 port 'concat)) (acc '())) + (unless (and (not (string-null? chunk)) + (char=? (string-ref chunk (1- (string-length chunk))) + chr1)) + (error "Unterminated bracket expression")) + (if (char=? (lookahead-char port) chr2) + (string->list (string-concatenate (reverse! acc))) + (loop (read-delimited chr1 port 'concat) (cons chunk acc))))) + + (define (read-rest) + (let loop ((chr (get-char port)) (acc '())) + (match chr + ((? eof-object?) (error "Unterminated bracket expression")) + (#\] (reverse-list->string (cons #\] acc))) + (#\[ (match (get-char port) + ((? eof-object?) (error "Unterminated bracket expression")) + ((and cc (or #\= #\. #\:)) + (let ((class (read-until-pair cc #\] port))) + (loop (get-char port) (append-reverse class acc)))) + (chr (loop (get-char port) (cons* chr #\[ acc))))) + (#\\ (match (get-char port) + ((? eof-object?) (error "Unterminated bracket expression")) + (chr (loop (get-char port) (cons* chr #\\ acc))))) + (chr (loop (get-char port) (cons chr acc)))))) + + (match (lookahead-char port) + (#\^ (match (next-char port) + (#\] (get-char port) (string-append "[^]" (read-rest))) + (_ (string-append "[^" (read-rest))))) + (#\] (get-char port) (string-append "[]" (read-rest))) + (_ (string-append "[" (read-rest))))) + +(define %extended? (make-parameter #f)) + +(define (read-re-until delim port) + "Read text from PORT as a regular expression until encountering the +delimiting character DELIM. Return the text of the regular expression +with the trailing delimiter discarded. + +This procedure takes into account the ways that the delimiter could +appear in the regular expression without ending it, such as in a +bracket expression or capture group. It order to determine what +constitutes a capture group, it uses the `%extended?' parameter." + (let loop ((chr (lookahead-char port)) (depth 0) (acc '())) + (cond + ((eof-object? chr) + (error "Unterminated regular expression")) + ((char=? chr #\[) + (get-char port) + (let* ((be (read-bracket-expression port)) + (be-chars (string->list be))) + (loop (lookahead-char port) depth (append-reverse! be-chars acc)))) + ((and (%extended?) (char=? chr #\()) + (loop (next-char port) (1+ depth) (cons #\( acc))) + ((and (%extended?) (char=? chr #\))) + (loop (next-char port) (1- depth) (cons #\) acc))) + ((char=? chr #\\) + (if (%extended?) + (match (next-char port) + ((? eof-object?) (error "Unterminated regular expression")) + (nchr (loop (next-char port) depth (cons* nchr chr acc)))) + (match (next-char port) + ((? eof-object?) (error "Unterminated regular expression")) + (#\( (loop (next-char port) (1+ depth) (cons* #\( chr acc))) + (#\) (loop (next-char port) (1- depth) (cons* #\) chr acc))) + (nchr (loop (next-char port) depth (cons* nchr chr acc)))))) + ((and (= depth 0) + (char=? chr delim)) + (get-char port) + (reverse-list->string acc)) + (else (loop (next-char port) depth (cons chr acc)))))) + +(define (read-string-until delim port) + "Read text from PORT until encountering the character DELIM, +taking into account escaping with backslashes (`\\')." + (let loop ((chr (lookahead-char port)) (acc '())) + (cond + ((eof-object? chr) (error "Unterminated string")) + ((char=? chr #\\) + (let ((next-chr (next-char port))) + (if (eof-object? next-chr) + (error "Unterminated string") + (loop (next-char port) (cons* next-chr chr acc))))) + ((and (char=? chr delim)) + (get-char port) + (reverse-list->string acc)) + (else (loop (next-char port) (cons chr acc)))))) + +(define (read-re port) + "Read a delimited regular expression from PORT." + (let ((delim (get-char port))) + (if (eof-object? delim) + (error "Expected regular expression") + (read-re-until delim port)))) + +(define (read-re+string port) + "Read a delimited regular expression and a replacement string from +PORT." + (let ((delim (get-char port))) + (if (eof-object? delim) + (error "Expected regular expression and replacement") + (let* ((re (read-re-until delim port)) + (str (read-string-until delim port))) + `(,re . ,str))))) + +(define (read-string+string port) + "Read two delimited strings from PORT." + (let ((delim (get-char port))) + (if (eof-object? delim) + (error "Expected characters and their replacements") + (let* ((str1 (read-string-until delim port)) + (str2 (read-string-until delim port))) + `(,str1 . ,str2))))) + +(define (read-text port) + "Read text from PORT until either an unescaped newline or end of +file is encountered." + (get-char-while char-set:whitespace port) + (let loop ((chr (get-char port)) (acc '())) + (match chr + ((or (? eof-object?) + #\newline) + (reverse-list->string acc)) + (#\\ + (let ((next-chr (get-char port))) + (if (eof-object? next-chr) + (error "Unterminated text") + (loop (get-char port) (cons next-chr acc))))) + (_ (loop (get-char port) (cons chr acc)))))) + +(define char-set:label + (string->char-set + (string-append "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789._-"))) + +(define label-char? (cut char-set-contains? char-set:label <>)) + +(define (read-label port) + "Read a label from PORT." + (get-char-while char-set:whitespace port) + (get-char-while char-set:label port)) + +(define (read-flags port) + "Read flags (for the `s' command) from PORT." + (let loop ((chr (lookahead-char port)) (acc '())) + (match chr + ((? eof-object?) (reverse! acc)) + (#\g (loop (next-char port) (cons 'g acc))) + (#\i (loop (next-char port) (cons 'i acc))) + (#\p (loop (next-char port) (cons 'p acc))) + ((? (cut char-set-contains? char-set:digit <>)) + (let ((n (read-number port))) + (loop (lookahead-char port) (cons n acc)))) + (#\w + (get-char port) + (let ((filename (read-text port))) + (reverse! (cons `(w ,filename) acc)))) + (_ (reverse! acc))))) + +(define (read-address port) + "Read an address from PORT." + (match (lookahead-char port) + (#\$ '$) + ((? (cut char-set-contains? char-set:digit <>)) (read-number port)) + (_ (read-re port)))) + +(define* (read-function port #:key (depth 0)) + "Read a function and its arguments from PORT." + (get-char-while char-set:whitespace port) + (match (get-char port) + (#\{ `(begin ,@(%read-sed-all port #:depth (1+ depth)))) + (#\a `(a ,(read-text port))) + (#\b `(b ,(read-label port))) + (#\c `(c ,(read-text port))) + (#\d '(d)) + (#\D '(D)) + (#\g '(g)) + (#\G '(G)) + (#\h '(h)) + (#\H '(H)) + (#\i `(i ,(read-text port))) + (#\l '(l)) + (#\n '(n)) + (#\N '(N)) + (#\p '(p)) + (#\P '(P)) + (#\q '(q)) + (#\r `(r ,(read-text port))) + (#\s (match-let (((re . str) (read-re+string port))) + `(s ,re ,str ,(read-flags port)))) + (#\t `(t ,(read-label port))) + (#\w `(w ,(read-text port))) + (#\x '(x)) + (#\y (match-let (((str1 . str2) (read-string+string port))) + `(y ,str1 ,str2))) + (#\: `(: ,(read-label port))) + (#\= `(= ,(1+ (port-line port)))) + (#\# `(comment ,(read-line port))))) + +(define char-set:function + (string->char-set "abcdDgGhHilnNpPqrstwxy:=#")) + +(define function-char? (cut char-set-contains? char-set:function <>)) + +(define (read-addresses port) + "Read zero, one, or two address from PORT, separated by a +comma (`,') and delimited by a function name." + (match (lookahead-char port) + ((? function-char?) '()) + (_ (let ((address1 (read-address port))) + (match (lookahead-char port) + (#\, (let* ((_ (get-char port)) + (address2 (read-address port))) + `(,address1 ,address2))) + (_ `(,address1))))))) + +(define char-set:whitespace+semi (char-set-adjoin char-set:whitespace #\;)) + +(define* (%read-sed port #:key (depth 0)) + "Read a sed command from PORT." + (get-char-while char-set:whitespace+semi port) + (match (lookahead-char port) + ((? eof-object?) (eof-object)) + (#\} + (get-char port) + (if (> depth 0) + (eof-object) + (error "Unmatched close brace"))) + (_ (let* ((addresses (read-addresses port)) + (function (read-function port #:depth depth))) + (match addresses + (() `(always ,function)) + ((address) `(at ,address ,function)) + ((address1 address2) `(in (,address1 . ,address2) ,function))))))) + +(define* (%read-sed-all port #:key (depth 0)) + "Read a sequence of sed commands from PORT." + (let loop ((cmd (%read-sed port #:depth depth)) (acc '())) + (match cmd + ((? eof-object?) (reverse! acc)) + (_ (loop (%read-sed port #:depth depth) (cons cmd acc)))))) + +(define* (read-sed port #:key (extended? #f)) + "Read a sed command from PORT. If EXTENDED? is set, treat regular +expressions as extended rather than basic." + (parameterize ((%extended? extended?)) + (%read-sed port))) + +(define* (read-sed-all port #:key (extended? #f)) + "Read a sequence of sed commands from PORT. If EXTENDED? is set, +treat regular expressions as extended rather than basic." + (parameterize ((%extended? extended?)) + (%read-sed-all port)))