-
Notifications
You must be signed in to change notification settings - Fork 6
/
script.go
973 lines (881 loc) · 29.6 KB
/
script.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
// This file lets users define and execute AWK-like scripts within Go.
package awk
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"regexp"
"strings"
"unicode/utf8"
)
// A scriptAborter is the panic value raised by abortScript. Run recovers it
// and returns it to the caller as an ordinary error, so the current script
// stops but the rest of the program keeps running.
type scriptAborter struct{ error }
// A recordStopper is the panic value raised by Next. Run recovers it while
// executing a record's rules and then continues with the next record.
type recordStopper struct{ error }
// A parseState records which phase of Run (if any) a Script is currently
// in.
type parseState int
// The following are the possible values of a parseState. notRunning is the
// zero value.
const (
notRunning parseState = iota // Before/after Run was called
atBegin // Executing the Begin action, before any records are read
inMiddle // While records are being read
atEnd // Executing the End action, after all records are read
)
// A stopState describes a pending request to stop processing prematurely.
type stopState int
// The following are the possible values of a stopState. dontStop is the
// zero value.
const (
dontStop stopState = iota // Normal execution
stopRec // Abort the current record (set by Next)
stopScript // Abort the entire script (set by Exit and abortScript)
)
// Arbitrary initial sizes, in bytes, of the bufio.Scanner buffers used for
// splitting records into fields and input into records. The scanners may
// grow these up to MaxFieldSize and MaxRecordSize, respectively.
const (
initialFieldSize = 4096
initialRecordSize = 4096
)
// A Script encapsulates all of the internal state for an AWK-like script.
// Use NewScript to obtain one with the default separators, limits, and
// internal maps initialized.
type Script struct {
State interface{} // Arbitrary, user-supplied data
Output io.Writer // Output stream (defaults to os.Stdout)
Begin ActionFunc // Action to perform before any input is read
End ActionFunc // Action to perform after all input is read
ConvFmt string // Conversion format for numbers, "%.6g" by default (Run resets it to "%.6g")
SubSep string // Separator for simulated multidimensional arrays
NR int // Number of input records seen so far
NF int // Number of fields in the current input record
RT string // Actual string terminating the current record
RStart int // 1-based index of the previous regexp match (Value.Match)
RLength int // Length of the previous regexp match (Value.Match)
MaxRecordSize int // Maximum size of a record (the limit handed to the record scanner's buffer)
MaxFieldSize int // Maximum size of a field (the limit handed to the field scanner's buffer)
nf0 int // Value of NF for which F(0) was computed; -1 forces F(0) to be rebuilt
rs string // Input record separator, newline by default
fs string // Input field separator, space by default
fieldWidths []int // Fixed-width column sizes; non-nil selects fixed-width splitting
fPat string // Input field regular expression; non-empty selects field matching
ors string // Output record separator, newline by default
ofs string // Output field separator, space by default
ignCase bool // true: REs are case-insensitive; false: case-sensitive
rules []statement // List of pattern-action pairs to execute
fields []*Value // Fields in the current record; fields[0] is the entire record
regexps map[string]*regexp.Regexp // Map from a regular-expression string to a compiled regular expression
getlineState map[io.Reader]*Script // Parsing state needed to invoke GetLine repeatedly on a given io.Reader
rsScanner *bufio.Scanner // Scanner associated with RS
input io.Reader // Script input stream
state parseState // What we're currently parsing
stop stopState // What we should stop doing
}
// NewScript allocates a Script and fills in its defaults: output goes to
// os.Stdout, records are terminated by newlines, fields are separated by
// whitespace on input and a single space on output, and record and field
// sizes are capped at bufio.MaxScanTokenSize. Counters, flags, and the
// parse state are left at their zero values.
func NewScript() *Script {
	s := new(Script)

	// User-visible settings.
	s.Output = os.Stdout
	s.ConvFmt = "%.6g"
	s.SubSep = "\034"
	s.MaxRecordSize = bufio.MaxScanTokenSize
	s.MaxFieldSize = bufio.MaxScanTokenSize

	// Input and output separators.
	s.rs, s.ors = "\n", "\n"
	s.fs, s.ofs = " ", " "

	// Internal containers.
	s.rules = make([]statement, 0, 10)
	s.fields = make([]*Value, 0)
	s.regexps = make(map[string]*regexp.Regexp, 10)
	s.getlineState = make(map[io.Reader]*Script)

	return s
}
// abortScript marks the script as stopped and panics with a scriptAborter
// wrapping the formatted message. The format and arguments are interpreted
// as by fmt.Errorf. This method never returns normally.
func (s *Script) abortScript(format string, a ...interface{}) {
	s.stop = stopScript
	reason := fmt.Errorf(format, a...)
	panic(scriptAborter{error: reason})
}
// Copy returns a copy of a Script. The rule list, field-width list, field
// list, regular-expression cache, and GetLine state map are duplicated so
// that appending to or reassigning entries in the copy does not affect the
// original. Everything those containers point to (Values, compiled regular
// expressions, nested Scripts) and all remaining struct fields, including
// State, Output, and the scanner, are shared with the original.
func (s *Script) Copy() *Script {
	sc := *s
	sc.rules = make([]statement, len(s.rules))
	copy(sc.rules, s.rules)
	// Preserve nil-ness: makeFieldSplitter treats a non-nil fieldWidths as
	// a request for fixed-width splitting, so a nil slice must stay nil.
	if s.fieldWidths != nil {
		sc.fieldWidths = make([]int, len(s.fieldWidths))
		copy(sc.fieldWidths, s.fieldWidths)
	}
	sc.fields = make([]*Value, len(s.fields))
	copy(sc.fields, s.fields)
	sc.regexps = make(map[string]*regexp.Regexp, len(s.regexps))
	for k, v := range s.regexps {
		sc.regexps[k] = v
	}
	sc.getlineState = make(map[io.Reader]*Script, len(s.getlineState))
	for k, v := range s.getlineState {
		sc.getlineState[k] = v
	}
	return &sc
}
// SetRS sets the input record separator (really, a record terminator). It
// aborts the script if called while records are being read; calling it
// beforehand, including from a Begin action, is fine. As in AWK, a
// single-character separator is used literally; a multi-character separator
// is treated as a regular expression (subject to the current setting of
// Script.IgnoreCase); and an empty separator means that records are
// separated by blank lines. That last case implicitly causes newlines to be
// accepted as a field separator in addition to whatever was specified by
// SetFS.
func (s *Script) SetRS(rs string) {
	readingRecords := s.state == inMiddle
	if readingRecords {
		s.abortScript("SetRS was called from a running script")
	}
	s.rs = rs
}
// SetFS sets the input field separator and cancels any fixed field widths
// or field pattern previously set. As in AWK, a single space (the default)
// separates fields by runs of whitespace; any other single character is
// used literally; an empty string makes each individual character a
// separate field; and a multi-character separator is treated as a regular
// expression (subject to the current setting of Script.IgnoreCase).
func (s *Script) SetFS(fs string) {
	s.fs, s.fieldWidths, s.fPat = fs, nil, ""
}
// SetFieldWidths indicates that each record is composed of fixed-width
// columns and specifies the width in characters of each column. It aborts
// the script if passed a nil slice or a non-positive field width, leaving
// the previous settings untouched. The slice is stored as is, not copied.
func (s *Script) SetFieldWidths(fw []int) {
	// Sanity-check the argument.
	if fw == nil {
		s.abortScript("SetFieldWidths was passed a nil slice")
	}
	for _, w := range fw {
		if w <= 0 {
			// abortScript formats its arguments itself, so pass the
			// width through rather than pre-formatting the message.
			s.abortScript("SetFieldWidths was passed an invalid field width (%d)", w)
		}
	}
	// Assign the field widths and reset the field separator and field
	// matcher (not strictly necessary but consistent with the SetFS
	// method).
	s.fs = " "
	s.fieldWidths = fw
	s.fPat = ""
}
// SetFPat defines a "field pattern", a regular expression that matches the
// fields themselves. This is the opposite of passing a regular expression
// to SetFS, which describes the separation between fields. Setting a field
// pattern resets the field separator to a single space and cancels any
// fixed field widths.
func (s *Script) SetFPat(fp string) {
	s.fs, s.fieldWidths, s.fPat = " ", nil, fp
}
// recomputeF0 rebuilds F(0) as F(1)...F(NF) joined by OFS and records that
// F(0) is now consistent with the current NF. If there is no field list at
// all, only the bookkeeping is updated.
func (s *Script) recomputeF0() {
	if len(s.fields) > 0 {
		joined := strings.Join(s.FStrings(), s.ofs)
		s.fields[0] = s.NewValue(joined)
	}
	s.nf0 = s.NF
}
// SetORS sets the output record separator, the string that Println and the
// default print action write after each record.
func (s *Script) SetORS(ors string) {
	s.ors = ors
}
// SetOFS sets the output field separator and immediately rebuilds F(0) by
// joining the current fields with the new separator.
func (s *Script) SetOFS(ofs string) {
s.ofs = ofs
s.recomputeF0()
}
// F returns a specified field of the current record. Field numbers are
// 1-based. Field 0 refers to the entire record and is rebuilt from the
// individual fields first if it is out of date. Requesting a field beyond
// the last one returns a new empty Value. Requesting a negative field
// number panics with an out-of-bounds error.
func (s *Script) F(i int) *Value {
	stale := s.NF != s.nf0
	if i == 0 && stale {
		s.recomputeF0()
	}
	if i >= len(s.fields) {
		return s.NewValue("")
	}
	return s.fields[i]
}
// SetF sets a field of the current record to the given Value. Field numbers
// are 1-based. Field 0 refers to the entire record; setting it causes the
// entire line to be reparsed (and NF recomputed). Setting a field numbered
// larger than NF pads the record with empty fields and extends NF to that
// number. Setting a negative field number panics with an out-of-bounds
// error.
func (s *Script) SetF(i int, v *Value) {
	if i == 0 {
		// Replace the whole record and split it afresh.
		// NOTE(review): any error from splitRecord is discarded here.
		_ = s.splitRecord(v.String())
		return
	}

	// Pad with empty fields until index i exists.
	if missing := i - len(s.fields) + 1; missing > 0 {
		for ; missing > 0; missing-- {
			s.fields = append(s.fields, s.NewValue(""))
		}
		s.NF = len(s.fields) - 1
	}

	s.fields[i] = v

	// Invalidate F(0) so that it is rebuilt on its next access.
	s.nf0 = -1
}
// FStrings returns fields 1 through NF of the current record, converted to
// strings, as a []string of length NF.
func (s *Script) FStrings() []string {
	out := make([]string, s.NF)
	for idx := range out {
		out[idx] = s.F(idx + 1).String()
	}
	return out
}
// FInts returns fields 1 through NF of the current record, converted to
// ints, as a []int of length NF.
func (s *Script) FInts() []int {
	out := make([]int, s.NF)
	for idx := range out {
		out[idx] = s.F(idx + 1).Int()
	}
	return out
}
// FFloat64s returns fields 1 through NF of the current record, converted to
// float64s, as a []float64 of length NF.
func (s *Script) FFloat64s() []float64 {
	out := make([]float64, s.NF)
	for idx := range out {
		out[idx] = s.F(idx + 1).Float64()
	}
	return out
}
// IgnoreCase specifies whether regular-expression and string comparisons
// should be performed in a case-insensitive manner. Within this file the
// flag is consulted by compileRegexp, which prepends "(?i)" to expressions
// while it is set.
func (s *Script) IgnoreCase(ign bool) {
s.ignCase = ign
}
// Println is like fmt.Println but honors the current output stream, output
// field separator, and output record separator. If called with no
// arguments (including an empty slice expanded with "..."), Println outputs
// fields 1 through NF of the current record; if NF is zero in that case,
// nothing is written. Write errors on the output stream are not reported.
func (s *Script) Println(args ...interface{}) {
	// No arguments: Output all fields of the current record.
	if len(args) == 0 {
		for i := 1; i <= s.NF; i++ {
			sep := s.ofs
			if i == s.NF {
				sep = s.ors // The last field is followed by ORS, not OFS.
			}
			fmt.Fprintf(s.Output, "%v%s", s.F(i), sep)
		}
		return
	}

	// One or more arguments: Output them.
	last := len(args) - 1
	for i, arg := range args {
		sep := s.ofs
		if i == last {
			sep = s.ors
		}
		fmt.Fprintf(s.Output, "%v%s", arg, sep)
	}
}
// A PatternFunc represents a pattern to match against. It is expected to
// examine the state of the given Script then return either true or false.
// Run evaluates each rule's PatternFunc once per record and executes the
// corresponding ActionFunc only when the PatternFunc returns true.
type PatternFunc func(*Script) bool
// An ActionFunc represents an action to perform when the corresponding
// PatternFunc returns true. The same type is used for the Begin and End
// actions of a Script.
type ActionFunc func(*Script)
// A statement represents a single pattern-action pair (a rule). AppendStmt
// fills in both members, so neither is nil in a rule it has stored.
type statement struct {
Pattern PatternFunc
Action ActionFunc
}
// The matchAny pattern is true only in the middle of a script, when a record
// is available for parsing. It is the pattern substituted for a nil pattern
// by AppendStmt and returned by Auto when called with no arguments.
func matchAny(s *Script) bool {
return s.state == inMiddle
}
// The printRecord statement outputs the current record, followed by the
// output record separator, to the current output stream. An unmodified
// record is written verbatim. The record is obtained through F(0), so
// fields changed by an earlier rule are reflected in the output and a
// Script with no current record prints an empty record instead of
// panicking.
func printRecord(s *Script) {
	fmt.Fprintf(s.Output, "%v%s", s.F(0), s.ors)
}
// Next stops processing the current record and proceeds with the next record.
// It works by panicking with a recordStopper, which Run recovers while it is
// executing a record's rules. Calling Next anywhere that panic is not
// recovered lets it propagate to the caller.
func (s *Script) Next() {
if s.stop == dontStop {
s.stop = stopRec
}
panic(recordStopper{errors.New("Unexpected Next invocation")}) // Unexpected if we don't catch it
}
// Exit stops processing the entire script, causing the Run method to return.
// Exit itself only records the request and returns normally, so the calling
// action runs to completion; Run acts on the request once that action has
// returned. A stop request that is already pending is left unchanged.
func (s *Script) Exit() {
if s.stop == dontStop {
s.stop = stopScript
}
}
// Range combines two patterns into a single stateful pattern. The result
// starts returning true on the record for which p1 is true and keeps
// returning true up to and including the first later record for which p2
// is true. While a range is open only p2 is evaluated; while it is closed
// only p1 is evaluated. Consequently p2 is not consulted on the record that
// opens a range, so a range always extends past its opening record.
func Range(p1, p2 PatternFunc) PatternFunc {
	open := false // Whether we are currently inside a range
	return func(s *Script) bool {
		if !open {
			// Outside a range: see if this record opens one.
			open = p1(s)
			return open
		}
		// Inside a range: this record matches regardless, but it may
		// also be the one that closes the range.
		open = !p2(s)
		return true
	}
}
// Auto provides a simplified mechanism for creating various common-case
// PatternFunc functions. It accepts zero, one, or an even number of
// arguments. If given no arguments, it matches every record. If given a
// single argument, its behavior depends on that argument's type:
//
// • A PatternFunc (or a plain func(*Script) bool) is returned as is.
//
// • A *regexp.Regexp returns a function that matches that regular expression
// against the entire record.
//
// • A string is treated as a regular expression and behaves likewise.
//
// • An int returns a function that matches that int against NR.
//
// • Any other type causes a run-time panic.
//
// If given an even number of arguments, pairs of arguments are treated as
// ranges (cf. the Range function). The PatternFunc returns true if the record
// lies within any of the ranges. Any other number of arguments causes a
// run-time panic.
//
// Regular expressions are compiled when the returned pattern is evaluated,
// not when Auto is called, so that they honor the setting of
// Script.IgnoreCase in effect at that time. A regular expression that fails
// to compile aborts the script.
func Auto(v ...interface{}) PatternFunc {
	// matchRE returns a pattern that matches expr against F(0).
	matchRE := func(expr string) PatternFunc {
		return func(s *Script) bool {
			r, err := s.compileRegexp(expr)
			if err != nil {
				// Pass the message as an argument, not as the format:
				// it quotes the expression, which may contain '%'.
				s.abortScript("%s", err.Error())
			}
			return r.MatchString(s.F(0).String())
		}
	}

	switch {
	case len(v) == 0:
		// No arguments: Match anything.
		return matchAny

	case len(v) == 1:
		// Single argument: Decide what to do based on its type.
		switch x := v[0].(type) {
		case PatternFunc:
			// Already a PatternFunc: Return it unmodified.
			return x
		case func(*Script) bool:
			// A function literal that was never converted to a
			// PatternFunc: Accept it, too.
			return x
		case string:
			// String: Treat as a regular expression that matches
			// against F(0).
			return matchRE(x)
		case *regexp.Regexp:
			// Regular expression: Convert to a string then,
			// dynamically, back to a regular expression. This
			// enables dynamic toggling of case sensitivity.
			return matchRE(x.String())
		case int:
			// Integer: Match against NR.
			return func(s *Script) bool {
				return s.NR == x
			}
		default:
			panic(fmt.Sprintf("Auto does not accept arguments of type %T", x))
		}

	case len(v)%2 == 0:
		// Even number of arguments other than 0: Return a disjunction
		// of ranges.
		fList := make([]PatternFunc, len(v)/2)
		for i := range fList {
			fList[i] = Range(Auto(v[2*i]), Auto(v[2*i+1]))
		}
		return func(s *Script) bool {
			// Return true iff any range is true. Note that we
			// always evaluate every range to avoid confusing
			// results because of statefulness.
			m := false
			for _, f := range fList {
				if f(s) {
					m = true
				}
			}
			return m
		}
	}

	// Three, five, seven, ... arguments.
	panic("Auto expects 0, 1, or an even number of arguments")
}
// AppendStmt appends a pattern-action pair to a Script. A nil pattern is
// replaced by one that matches every record, and a nil action by one that
// prints the record to the script's output stream. AppendStmt aborts the
// script if called while the script is running.
func (s *Script) AppendStmt(p PatternFunc, a ActionFunc) {
	// Rules may be added only while the script is idle.
	if s.state != notRunning {
		s.abortScript("AppendStmt was called from a running script")
	}

	// Substitute the defaults for whichever half was omitted.
	if p == nil {
		p = matchAny
	}
	if a == nil {
		a = printRecord
	}

	s.rules = append(s.rules, statement{Pattern: p, Action: a})
}
// compileRegexp returns the compiled form of expr, consulting and updating
// the script's cache of compiled expressions. When the script is set to
// case-insensitive matching, "(?i)" is prepended to the expression before it
// is looked up or compiled. Expressions that fail to compile are not cached.
func (s *Script) compileRegexp(expr string) (*regexp.Regexp, error) {
	key := expr
	if s.ignCase {
		key = "(?i)" + expr
	}
	if cached, ok := s.regexps[key]; ok {
		return cached, nil
	}
	compiled, err := regexp.Compile(key)
	if err != nil {
		return nil, err
	}
	s.regexps[key] = compiled
	return compiled, nil
}
// makeSingleCharFieldSplitter returns a bufio.SplitFunc-compatible function
// that yields the next field by splitting on the first rune of the field
// separator. (The single-space separator is a special case handled
// elsewhere.) If that rune decodes as utf8.RuneError, the returned function
// reports an error on every call.
func (s *Script) makeSingleCharFieldSplitter() func([]byte, bool) (int, []byte, error) {
	sep, _ := utf8.DecodeRuneInString(s.fs)
	if sep == utf8.RuneError {
		return func([]byte, bool) (int, []byte, error) {
			return 0, nil, errors.New("Invalid rune in separator")
		}
	}

	// finalSent records whether the trailing, unterminated field has been
	// handed out already, so that it is returned exactly once.
	finalSent := false
	return func(data []byte, atEOF bool) (int, []byte, error) {
		// Walk the data one rune at a time looking for the separator.
		pos := 0
		for pos < len(data) {
			r, size := utf8.DecodeRune(data[pos:])
			if r == utf8.RuneError && pos+size >= len(data) && !atEOF {
				// Undecodable rune at the very end of the data: it
				// may merely be incomplete, so ask for more.
				return 0, nil, nil
			}
			if r == sep {
				return pos + size, data[:pos], nil
			}
			pos += size
		}

		// No separator found. Unless this is the end of the input and
		// the final field is still owed, ask for more data.
		if !atEOF || finalSent {
			return 0, nil, nil
		}
		finalSent = true
		return len(data), data, nil
	}
}
// makeREFieldSplitter returns a bufio.SplitFunc-compatible function that
// yields the next field by splitting on the field separator interpreted as a
// regular expression. If the expression does not compile, the returned
// function reports the compilation error on every call.
func (s *Script) makeREFieldSplitter() func([]byte, bool) (int, []byte, error) {
	// A special case in AWK is that if the record terminator is empty
	// (implying a blank line) then newlines are accepted as a field
	// separator in addition to whatever is specified for FS.
	pattern := s.fs
	if s.rs == "" {
		pattern = `(` + s.fs + `)|(\r?\n)`
	}
	sepRE, compileErr := s.compileRegexp(pattern)
	if compileErr != nil {
		return func([]byte, bool) (int, []byte, error) {
			return 0, nil, compileErr
		}
	}

	// finalSent records whether the trailing, unterminated field has been
	// handed out already, so that it is returned exactly once.
	finalSent := false
	return func(data []byte, atEOF bool) (int, []byte, error) {
		// A separator match ends the field: return what precedes it and
		// skip past the separator.
		if m := sepRE.FindIndex(data); m != nil {
			return m[1], data[:m[0]], nil
		}

		// No separator found. Unless this is the end of the input and
		// the final field is still owed, ask for more data.
		if !atEOF || finalSent {
			return 0, nil, nil
		}
		finalSent = true
		return len(data), data, nil
	}
}
// makeFixedFieldSplitter returns a bufio.SplitFunc-compatible function that
// yields the next field by cutting a record into the fixed-width columns
// given by s.fieldWidths. Widths are counted in characters (UTF-8 encoded
// runes, with each undecodable byte counting as one character), so a
// multi-byte character is never cut in half. Once every width has been
// used, the rest of the record is ignored. If the record ends partway
// through a column, the partial column is returned once as a final field.
func (s *Script) makeFixedFieldSplitter() func([]byte, bool) (int, []byte, error) {
	f := 0                      // Index into s.fieldWidths
	returnedFinalToken := false // true=already returned a final, non-terminated token; false=didn't
	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		// If we've exhausted s.fieldWidths, return empty-handed.
		if f >= len(s.fieldWidths) {
			return 0, nil, nil
		}

		// Measure out the current column one character at a time.
		fw := s.fieldWidths[f]
		end, got := 0, 0 // Bytes and characters consumed so far
		for got < fw && end < len(data) {
			if !atEOF && !utf8.FullRune(data[end:]) {
				// The data stops in the middle of a character.
				// Request more data and try again.
				return 0, nil, nil
			}
			_, width := utf8.DecodeRune(data[end:])
			end += width
			got++
		}

		// If we have enough characters for the current field, return a
		// token and advance to the next field.
		if got == fw {
			f++
			return end, data[:end], nil
		}

		// If we don't have enough characters for the current field but
		// we're at EOF, return whatever we have (unless we already
		// did).
		if atEOF && !returnedFinalToken {
			returnedFinalToken = true
			return len(data), data, nil
		}

		// If we don't have enough characters for the current field and
		// we're not at EOF, request more data.
		return 0, nil, nil
	}
}
// makeREFieldMatcher returns a bufio.SplitFunc-compatible function that
// yields the next field by finding the next match of the field pattern;
// text between matches is skipped. If the pattern does not compile, the
// returned function reports the compilation error on every call.
//
// NOTE(review): a pattern that can match the empty string produces a token
// with a zero advance; confirm that such patterns are never supplied.
func (s *Script) makeREFieldMatcher() func([]byte, bool) (int, []byte, error) {
	fieldRE, compileErr := s.compileRegexp(s.fPat)
	if compileErr != nil {
		return func([]byte, bool) (int, []byte, error) {
			return 0, nil, compileErr
		}
	}

	return func(data []byte, atEOF bool) (int, []byte, error) {
		m := fieldRE.FindIndex(data)
		if m == nil {
			// Nothing matches (yet): request more data.
			return 0, nil, nil
		}
		// The match itself is the field; consume everything through it.
		start, end := m[0], m[1]
		return end, data[start:end], nil
	}
}
// makeFieldSplitter returns the bufio.SplitFunc-compatible function that
// matches the script's current field-splitting settings. The first
// applicable rule wins:
//
// 1. Fixed field widths, if any were given.
//
// 2. A field-matching regular expression, if one was given.
//
// 3. An empty separator: each rune is a separate field.
//
// 4. A single-space separator: fields are whitespace-separated words.
//
// 5. Any other single-character separator, provided the record terminator
// is not empty (a special case in AWK): split on that character.
//
// 6. Otherwise (a multi-character separator or an empty record terminator):
// treat the separator as a regular expression.
func (s *Script) makeFieldSplitter() func([]byte, bool) (int, []byte, error) {
	switch {
	case s.fieldWidths != nil:
		return s.makeFixedFieldSplitter()
	case s.fPat != "":
		return s.makeREFieldMatcher()
	case s.fs == "":
		return bufio.ScanRunes
	case s.fs == " ":
		return bufio.ScanWords
	case utf8.RuneCountInString(s.fs) == 1 && s.rs != "":
		return s.makeSingleCharFieldSplitter()
	}
	return s.makeREFieldSplitter()
}
// makeRecordSplitter returns a bufio.SplitFunc-compatible function that
// yields the next record. Although all the AWK documentation I've read
// define RS as a record separator, as far as I can tell, AWK in fact treats
// it as a record *terminator* so we do, too.
//
// Each time the returned function produces a record it also sets s.RT to
// the text that terminated that record, or to the empty string for a final
// record that is not followed by a terminator.
func (s *Script) makeRecordSplitter() func([]byte, bool) (int, []byte, error) {
	// If the terminator is a single character, scan based on that. This
	// code is derived from the bufio.ScanWords source.
	if utf8.RuneCountInString(s.rs) == 1 {
		// Ensure the terminator character is valid.
		firstRune, _ := utf8.DecodeRuneInString(s.rs)
		if firstRune == utf8.RuneError {
			return func(data []byte, atEOF bool) (int, []byte, error) {
				return 0, nil, errors.New("Invalid rune in terminator")
			}
		}

		// The terminator is valid. Return a splitter customized to
		// that terminator.
		terminator := string(firstRune)
		return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
			// Scan until we see a terminator or run out of data.
			for width, i := 0, 0; i < len(data); i += width {
				var r rune
				r, width = utf8.DecodeRune(data[i:])
				if r == utf8.RuneError && i+width >= len(data) && !atEOF {
					// Invalid rune at the end of the data.
					// Request more data and try again.
					return 0, nil, nil
				}
				if r == firstRune {
					s.RT = terminator
					return i + width, data[:i], nil
				}
			}

			// We didn't see a terminator. If we're at EOF, we
			// have a final, non-terminated token. Return it if
			// it's nonempty, noting that nothing terminated it.
			if atEOF && len(data) > 0 {
				s.RT = ""
				return len(data), data, nil
			}

			// Request more data.
			return 0, nil, nil
		}
	}

	// If the terminator is multiple characters, treat it as a regular
	// expression, and scan based on that. Or, as a special case, if the
	// terminator is empty, we treat it as a regular expression
	// representing one or more blank lines.
	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		// Generate a regular expression based on the current RS and
		// IgnoreCase. (compileRegexp caches the result, so this is
		// cheap after the first call.)
		var termRegexp *regexp.Regexp
		if s.rs == "" {
			termRegexp, err = s.compileRegexp(`\r?\n(\r?\n)+`)
		} else {
			termRegexp, err = s.compileRegexp(s.rs)
		}
		if err != nil {
			return 0, nil, err
		}

		// If we match the regular expression, return everything up to
		// the match.
		loc := termRegexp.FindIndex(data)
		if loc != nil {
			s.RT = string(data[loc[0]:loc[1]])
			return loc[1], data[:loc[0]], nil
		}

		// We didn't see a terminator. If we're at EOF, we have a
		// final, non-terminated token. Return it if it's nonempty.
		if atEOF && len(data) > 0 {
			s.RT = ""
			return len(data), data, nil
		}

		// Request more data.
		return 0, nil, nil
	}
}
// readRecord reads the next record from the script's record scanner and
// returns it. It returns the scanner's error if scanning failed and io.EOF
// if the input is simply exhausted.
func (s *Script) readRecord() (string, error) {
	if !s.rsScanner.Scan() {
		// Distinguish a genuine failure from the end of the input.
		if err := s.rsScanner.Err(); err != nil {
			return "", err
		}
		return "", io.EOF
	}
	return s.rsScanner.Text(), nil
}
// splitRecord splits a record into fields according to the script's current
// field-splitting settings. On success it replaces the script's field list
// and updates NF; as in real AWK, field 0 is the entire record. On failure
// it returns the scanner's error and leaves the previous fields in place.
func (s *Script) splitRecord(rec string) error {
	// Scan the record with a field-level splitter.
	scanner := bufio.NewScanner(strings.NewReader(rec))
	scanner.Buffer(make([]byte, initialFieldSize), s.MaxFieldSize)
	scanner.Split(s.makeFieldSplitter())

	// Field 0 is the whole record; the scanned fields follow it.
	parsed := append(make([]*Value, 0, 100), s.NewValue(rec))
	for scanner.Scan() {
		parsed = append(parsed, s.NewValue(scanner.Text()))
	}
	if err := scanner.Err(); err != nil {
		return err
	}

	// Commit the result and mark F(0) as up to date.
	s.fields = parsed
	s.NF = len(parsed) - 1
	s.nf0 = s.NF
	return nil
}
// GetLine reads the next record from an input stream and returns it. If the
// argument to GetLine is nil, GetLine reads from the current input stream
// and increments NR. Otherwise, it reads from the given io.Reader and does
// not increment NR; the reading position for each such io.Reader is
// remembered between calls. Errors from reading, including io.EOF at the
// end of the input, are returned with a nil Value. Call SetF(0, ...) on the
// Value returned by GetLine to perform the equivalent of AWK's getline with
// no variable argument.
func (s *Script) GetLine(r io.Reader) (*Value, error) {
	// A nil reader means "the script's own input".
	if r == nil {
		rec, err := s.readRecord()
		if err != nil {
			return nil, err
		}
		s.NR++
		return s.NewValue(rec), nil
	}

	// Look up the per-reader state, creating it on first use. The state
	// lives in a copy of the script so that reading from r does not alter
	// any of the original script's own state.
	sub := s.getlineState[r]
	if sub == nil {
		sub = s.Copy()
		sub.input = r
		sub.rsScanner = bufio.NewScanner(sub.input)
		sub.rsScanner.Buffer(make([]byte, initialRecordSize), sub.MaxRecordSize)
		sub.rsScanner.Split(sub.makeRecordSplitter())
		s.getlineState[r] = sub
	}

	// Read a record from the given reader.
	rec, err := sub.readRecord()
	if err != nil {
		return nil, err
	}
	return sub.NewValue(rec), nil
}
// Run executes a script against a given input stream. It is perfectly valid
// to run the same script on multiple input streams.
//
// Run resets NR, NF, and ConvFmt, executes the Begin action (if any), then
// reads the input one record at a time, splitting each record into fields
// and executing the action of every rule whose pattern matches. After the
// last record it executes the End action (if any).
//
// A call to Exit, whether from Begin or from a rule's action, makes Run
// return nil without reading further records or executing End. Run returns
// a non-nil error if reading or splitting a record fails or if the script
// is aborted (e.g., because of an invalid regular expression). Any other
// panic raised by an action is passed through to the caller. In every case
// the script is left in its not-running state, so it can be modified and
// run again.
func (s *Script) Run(r io.Reader) (err error) {
	// On the way out, always mark the script as no longer running. Catch
	// scriptAborter panics and return them as errors. Re-throw all other
	// panics.
	defer func() {
		s.state = notRunning
		if p := recover(); p != nil {
			aborted, ok := p.(scriptAborter)
			if !ok {
				panic(p)
			}
			err = aborted
		}
	}()

	// Reinitialize most of our state.
	s.input = r
	s.ConvFmt = "%.6g"
	s.NF = 0
	s.NR = 0
	s.stop = dontStop

	// Process the Begin action, if any. Honor a call to Exit made from
	// within it.
	if s.Begin != nil {
		s.state = atBegin
		s.Begin(s)
		if s.stop == stopScript {
			return nil
		}
	}

	// Create (and store) a new scanner based on the record terminator.
	// This happens after Begin so that Begin can still call SetRS.
	s.rsScanner = bufio.NewScanner(s.input)
	s.rsScanner.Buffer(make([]byte, initialRecordSize), s.MaxRecordSize)
	s.rsScanner.Split(s.makeRecordSplitter())

	// Process each record in turn.
	s.state = inMiddle
	for {
		// Read a record.
		s.stop = dontStop
		rec, readErr := s.readRecord()
		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			return readErr
		}
		s.NR++

		// Split the record into its constituent fields.
		if splitErr := s.splitRecord(rec); splitErr != nil {
			return splitErr
		}

		// Process all applicable actions.
		func() {
			// An action is able to break out of the
			// action-processing loop by calling Next, which throws
			// a recordStopper. We catch that and continue
			// with the next record.
			defer func() {
				if p := recover(); p != nil {
					if _, ok := p.(recordStopper); !ok {
						panic(p)
					}
				}
			}()

			// Perform each action whose pattern matches the
			// current record.
			for _, rule := range s.rules {
				if rule.Pattern(s) {
					rule.Action(s)
					if s.stop != dontStop {
						break
					}
				}
			}
		}()

		// Stop the script if an action called Exit.
		if s.stop == stopScript {
			return nil
		}
	}

	// Process the End action, if any.
	if s.End != nil {
		s.state = atEnd
		s.End(s)
	}
	return nil
}
// RunPipeline chains together a set of scripts into a pipeline, with each
// script sending its output to the next. (Implication: Script.Output will be
// overwritten in all but the last script.) The first script reads from r and
// the last script writes to its own Output. Each script runs in its own
// goroutine.
//
// RunPipeline returns nil once every script has finished successfully. If
// any script in the pipeline fails, RunPipeline closes the connections
// between scripts and returns the first error it receives without waiting
// for the remaining scripts. Given no scripts at all, it does nothing and
// returns nil.
func RunPipeline(r io.Reader, ss ...*Script) error {
	// An empty pipeline has nothing to run.
	if len(ss) == 0 {
		return nil
	}

	// Connect the output of each script to the input of the next. Keep our
	// own references to both ends of every pipe rather than recovering
	// them from Script.Output later.
	writers := make([]*io.PipeWriter, len(ss)-1) // writers[i] is the output of ss[i]
	readers := make([]*io.PipeReader, len(ss))   // readers[i] is the input of ss[i] (nil for ss[0])
	for i := range writers {
		pr, pw := io.Pipe()
		ss[i].Output = pw
		writers[i] = pw
		readers[i+1] = pr
	}

	// Spawn scripts in reverse order so they begin blocked on input.
	eChan := make(chan error, len(ss))
	for i := len(ss) - 1; i >= 0; i-- {
		var in io.Reader = r
		if i > 0 {
			in = readers[i]
		}
		go func(i int, in io.Reader) {
			err := ss[i].Run(in)
			// Stop accepting input so that an upstream script that is
			// still writing (e.g., because this script exited early)
			// is not blocked forever.
			if i > 0 {
				readers[i].Close()
			}
			// Signal end of input to the downstream script.
			if i < len(writers) {
				writers[i].Close()
			}
			eChan <- err
		}(i, in)
	}

	// Wait for all scripts to finish.
	for range ss {
		if err := <-eChan; err != nil {
			// Error -- close all output pipes then return.
			for _, pw := range writers {
				pw.Close()
			}
			return err
		}
	}
	return nil
}