Newer
Older
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package regexp implements a simple regular expression library.
//
// The syntax of the regular expressions accepted is:
//
// regexp:
// concatenation { '|' concatenation }
// concatenation:
// { closure }
// closure:
// term [ '*' | '+' | '?' ]
// term:
// '^'
// '$'
// '.'
// character
// '[' [ '^' ] character-ranges ']'
// '(' regexp ')'
//
var (
ErrInternal = os.NewError("internal error");
ErrUnmatchedLpar = os.NewError("unmatched '('");
ErrUnmatchedRpar = os.NewError("unmatched ')'");
ErrUnmatchedLbkt = os.NewError("unmatched '['");
ErrUnmatchedRbkt = os.NewError("unmatched ']'");
ErrBadRange = os.NewError("bad range in character class");
ErrExtraneousBackslash = os.NewError("extraneous backslash");
ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)");
ErrBareClosure = os.NewError("closure applies to nothing");
ErrBadBackslash = os.NewError("illegal backslash escape");
)
// An instruction executed by the NFA
kind() int; // the type of this instruction: _CHAR, _ANY, etc.
next() instr; // the instruction to execute after this one
setNext(i instr);
index() int;
setIndex(i int);
print();
// Fields and methods common to all instructions
func (c *common) next() instr { return c._next }
func (c *common) setNext(i instr) { c._next = i }
func (c *common) index() int { return c._index }
func (c *common) setIndex(i int) { c._index = i }
// The representation of a compiled regular expression.
// The public interface is entirely through methods.
type Regexp struct {
expr string; // the original expression
ch chan<- *Regexp; // reply channel when we're done
Rob Pike
committed
error os.Error; // compile- or run-time error; nil if OK
nbra int; // number of brackets in expression, for subexpressions
_START // beginning of program
_END; // end of program: success
_BOT; // '^' beginning of text
_EOT; // '$' end of text
_CHAR; // 'a' regular character
_CHARCLASS; // [a-z] character class
_ANY; // '.' any character including newline
_NOTNL; // [^\n] special case: any character but newline
_BRA; // '(' parenthesized expression
_EBRA; // ')'; end of '(' parenthesized expression
_ALT; // '|' alternation
_NOP; // do nothing; makes it easy to link without patching
)
// --- START start of program
type _Start struct {
func (start *_Start) kind() int { return _START }
func (start *_Start) print() { print("start") }
type _End struct {
func (end *_End) kind() int { return _END }
func (end *_End) print() { print("end") }
type _Bot struct {
func (bot *_Bot) kind() int { return _BOT }
func (bot *_Bot) print() { print("bot") }
type _Eot struct {
func (eot *_Eot) kind() int { return _EOT }
func (eot *_Eot) print() { print("eot") }
// --- CHAR a regular character
type _Char struct {
func (char *_Char) kind() int { return _CHAR }
func (char *_Char) print() { print("char ", string(char.char)) }
func newChar(char int) *_Char {
c := new(_Char);
c.char = char;
return c;
}
type _CharClass struct {
char int;
negate bool; // is character class negated? ([^a-z])
// vector of int, stored pairwise: [a-z] is (a,z); x is (x,x):
ranges *vector.IntVector;
func (cclass *_CharClass) kind() int { return _CHARCLASS }
print("charclass");
if cclass.negate {
print(" (negated)");
}
for i := 0; i < cclass.ranges.Len(); i += 2 {
l := cclass.ranges.At(i);
r := cclass.ranges.At(i+1);
if l == r {
print(" [", string(l), "]");
} else {
print(" [", string(l), "-", string(r), "]");
}
}
}
func (cclass *_CharClass) addRange(a, b int) {
cclass.ranges.Push(a);
cclass.ranges.Push(b);
func (cclass *_CharClass) matches(c int) bool {
min := cclass.ranges.At(i);
max := cclass.ranges.At(i+1);
if min <= c && c <= max {
return !cclass.negate
}
}
return cclass.negate
}
func newCharClass() *_CharClass {
c := new(_CharClass);
type _Any struct {
func (any *_Any) kind() int { return _ANY }
func (any *_Any) print() { print("any") }
// --- NOTNL any character but newline
type _NotNl struct {
common
}
func (notnl *_NotNl) kind() int { return _NOTNL }
func (notnl *_NotNl) print() { print("notnl") }
// --- BRA parenthesized expression
type _Bra struct {
n int; // subexpression number
func (bra *_Bra) kind() int { return _BRA }
func (bra *_Bra) print() { print("bra", bra.n); }
// --- EBRA end of parenthesized expression
type _Ebra struct {
n int; // subexpression number
}
func (ebra *_Ebra) kind() int { return _EBRA }
func (ebra *_Ebra) print() { print("ebra ", ebra.n); }
type _Alt struct {
func (alt *_Alt) kind() int { return _ALT }
func (alt *_Alt) print() { print("alt(", alt.left.index(), ")"); }
type _Nop struct {
func (nop *_Nop) kind() int { return _NOP }
func (nop *_Nop) print() { print("nop") }
// report error and exit compiling/executing goroutine
Rob Pike
committed
func (re *Regexp) setError(err os.Error) {
func (re *Regexp) add(i instr) instr {
i.setIndex(re.inst.Len());
re.inst.Push(i);
nlpar int; // number of unclosed lpars
c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:len(p.re.expr)]);
p.ch = c;
p.pos += w;
}
return p.ch;
}
p := new(parser);
p.re = re;
p.nextc(); // load p.ch
return p;
func (p *parser) regexp() (start, end instr)
for i := 0; i < len(s); i++ {
if c == int(s[i]) {
return true
}
}
return false
}
func specialcclass(c int) bool {
s := `\-[]`;
for i := 0; i < len(s); i++ {
if c == int(s[i]) {
return true
}
}
return false
}
if p.c() == '^' {
cc.negate = true;
p.nextc();
}
left := -1;
for {
switch c := p.c(); c {
// Is it [^\n]?
if cc.negate && cc.ranges.Len() == 2 &&
cc.ranges.At(0) == '\n' && cc.ranges.At(1) == '\n' {
nl := new(_NotNl);
p.re.add(nl);
return nl;
}
p.re.add(cc);
return cc;
case '-': // do this before backslash processing
p.re.setError(ErrExtraneousBackslash);
case c == 'n':
c = '\n';
case specialcclass(c):
// c is as delivered
default:
}
fallthrough;
default:
p.nextc();
switch {
case left < 0: // first of pair
if p.c() == '-' { // range
p.nextc();
left = c;
} else { // single char
func (p *parser) term() (start, end instr) {
return start, start;
case '(':
p.nextc();
p.re.nbra++; // increment first so first subexpr is \1
nbra := p.re.nbra;
bra := new(_Bra);
ebra := new(_Ebra);
if end == iNULL {
p.re.setError(ErrInternal)
}
case '\\':
c = p.nextc();
switch {
p.re.setError(ErrExtraneousBackslash);
case c == 'n':
c = '\n';
case special(c):
// c is as delivered
default:
}
fallthrough;
default:
p.nextc();
return start, start
}
panic("unreachable");
}
func (p *parser) closure() (start, end instr) {
start, end = p.term();
}
switch p.c() {
case '*':
// (start,end)*:
alt := new(_Alt);
p.re.add(alt);
end.setNext(alt); // after end, do alt
alt.left = start; // alternate brach: return to start
start = alt; // alt becomes new (start, end)
end = alt;
alt := new(_Alt);
p.re.add(alt);
end.setNext(alt); // after end, do alt
alt.left = start; // alternate brach: return to start
end = alt; // start is unchanged; end is alt
alt := new(_Alt);
nop := new(_Nop);
alt.left = start; // alternate branch is start
alt.setNext(nop); // follow on to nop
end.setNext(nop); // after end, go to nop
start = alt; // start is now alt
end = nop; // end is nop pointed to by both branches
}
switch p.nextc() {
case '*', '+', '?':
func (p *parser) concatenation() (start, end instr) {
case nstart == iNULL: // end of this concatenation
if start == iNULL { // this is the empty string
case start == iNULL: // this is first element of concatenation
start, end = nstart, nend;
default:
end = nend;
}
}
panic("unreachable");
}
func (p *parser) regexp() (start, end instr) {
start, end = p.concatenation();
for {
switch p.c() {
default:
alt := new(_Alt);
nop := new(_Nop);
p.re.add(nop);
end.setNext(nop);
nend.setNext(nop);
}
}
panic("unreachable");
}
func unNop(i instr) instr {
for i.kind() == _NOP {
i = i.next()
inst.setNext(unNop(inst.next()));
if inst.kind() == _ALT {
alt := inst.(*_Alt);
alt.left = unNop(alt.left);
print(inst.index(), ": ");
inst.print();
if inst.kind() != _END {
print(" -> ", inst.next().index())
start := new(_Start);
re.add(start);
s, e := p.regexp();
start.setNext(s);
func compiler(str string, ch chan *Regexp) {
re := new(Regexp);
// Compile parses a regular expression and returns, if successful, a Regexp
// object that can be used to match against text.
Rob Pike
committed
func Compile(str string) (regexp *Regexp, error os.Error) {
// Compile in a separate goroutine and wait for the result.
ch := make(chan *Regexp);
go compiler(str, ch);
re := <-ch;
return re, re.error
}
type state struct {
inst instr; // next instruction to execute
match []int; // pairs of bracketing submatches. 0th is start,end
}
// Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active.
func addState(s []state, inst instr, match []int) []state {
l := len(s);
pos := match[0];
// TODO: Once the state is a vector and we can do insert, have inputs always
// go in order correctly and this "earlier" test is never necessary,
for i := 0; i < l; i++ {
if s[i].inst.index() == index && // same instruction
s[i].match[0] < pos { // earlier match already going; lefmost wins
return s
}
}
if l == cap(s) {
for i := 0; i < l; i++ {
s1[i] = s[i];
}
s = s1;
}
s = s[0:l+1];
s[l].inst = inst;
s[l].match = match;
return s;
}
// Accepts either string or bytes - the logic is identical either way.
// If bytes == nil, scan str.
func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements
s[0] = make([]state, 10)[0:0];
s[1] = make([]state, 10)[0:0];
end := len(str);
if bytes != nil {
end = len(bytes)
}
for pos <= end {
if !found {
// prime the pump if we haven't seen a match yet
for i := 0; i < len(match); i++ {
match[i] = -1; // no match seen; catches cases like "a(b)?c" on "ac"
}
s[out] = addState(s[out], re.start.next(), match);
}
in, out = out, in; // old out state is new in state
s[out] = s[out][0:0]; // clear out state
if len(s[in]) == 0 {
// machine has completed
break;
}
if pos < end {
if bytes == nil {
c, charwidth = utf8.DecodeRuneInString(str[pos:end]);
} else {
c, charwidth = utf8.DecodeRune(bytes[pos:end]);
}
s[in] = addState(s[in], st.inst.next(), st.match)
if pos == end {
s[in] = addState(s[in], st.inst.next(), st.match)
case _CHAR:
if c == st.inst.(*_Char).char {
s[out] = addState(s[out], st.inst.next(), st.match)
case _CHARCLASS:
if st.inst.(*_CharClass).matches(c) {
s[out] = addState(s[out], st.inst.next(), st.match)
s[out] = addState(s[out], st.inst.next(), st.match)
case _NOTNL:
if c != endOfFile && c != '\n' {
s[out] = addState(s[out], st.inst.next(), st.match)
}
case _BRA:
n := st.inst.(*_Bra).n;
s[in] = addState(s[in], st.inst.next(), st.match);
case _EBRA:
n := st.inst.(*_Ebra).n;
s[in] = addState(s[in], st.inst.next(), st.match);
case _ALT:
s[in] = addState(s[in], st.inst.(*_Alt).left, st.match);
s[in] = addState(s[in], st.inst.next(), s1);
(st.match[0] == final.match[0] && pos > final.match[1]) { // longest
final.match[1] = pos;
}
found = true;
default:
// ExecuteString matches the Regexp against the string s.
// The return value is an array of integers, in pairs, identifying the positions of
// substrings matched by the expression.
// s[a[0]:a[1]] is the substring matched by the entire expression.
// s[a[2*i]:a[2*i+1]] for i > 0 is the substring matched by the ith parenthesized subexpression.
// A negative value means the subexpression did not match any element of the string.
func (re *Regexp) ExecuteString(s string) (a []int) {
return re.doExecute(s, nil, 0)
Rob Pike
committed
// Execute matches the Regexp against the byte slice b.
// The return value is an array of integers, in pairs, identifying the positions of
// subslices matched by the expression.
// b[a[0]:a[1]] is the subslice matched by the entire expression.
// b[a[2*i]:a[2*i+1]] for i > 0 is the subslice matched by the ith parenthesized subexpression.
// A negative value means the subexpression did not match any element of the slice.
// An empty array means "no match".
func (re *Regexp) Execute(b []byte) (a []int) {
return re.doExecute("", b, 0)
}
// MatchString returns whether the Regexp matches the string s.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) MatchString(s string) bool {
return len(re.doExecute(s, nil, 0)) > 0
}
// Match returns whether the Regexp matches the byte slice b.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) Match(b []byte) bool {
return len(re.doExecute("", b, 0)) > 0
Rob Pike
committed
}
// MatchStrings matches the Regexp against the string s.
// The return value is an array of strings matched by the expression.
// a[0] is the substring matched by the entire expression.
// a[i] for i > 0 is the substring matched by the ith parenthesized subexpression.
// An empty array means ``no match''.
func (re *Regexp) MatchStrings(s string) (a []string) {
r := re.doExecute(s, nil, 0);
Rob Pike
committed
if r == nil {
return nil
}
Rob Pike
committed
for i := 0; i < len(r); i += 2 {
if r[i] != -1 { // -1 means no match for this subexpression
a[i/2] = s[r[i] : r[i+1]]
}
Rob Pike
committed
}
Rob Pike
committed
}
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
// MatchSlices matches the Regexp against the byte slice b.
// The return value is an array of subslices matched by the expression.
// a[0] is the subslice matched by the entire expression.
// a[i] for i > 0 is the subslice matched by the ith parenthesized subexpression.
// An empty array means ``no match''.
func (re *Regexp) MatchSlices(b []byte) (a [][]byte) {
r := re.doExecute("", b, 0);
if r == nil {
return nil
}
a = make([][]byte, len(r)/2);
for i := 0; i < len(r); i += 2 {
if r[i] != -1 { // -1 means no match for this subexpression
a[i/2] = b[r[i] : r[i+1]]
}
}
return
}
// MatchString checks whether a textual regular expression
// matches a string. More complicated queries need
// to use Compile and the full Regexp interface.
func MatchString(pattern string, s string) (matched bool, error os.Error) {
re, err := Compile(pattern);
if err != nil {
return false, err
}
return re.MatchString(s), nil
}
// Match checks whether a textual regular expression
// matches a byte slice. More complicated queries need
// to use Compile and the full Regexp interface.
func Match(pattern string, b []byte) (matched bool, error os.Error) {
Rob Pike
committed
re, err := Compile(pattern);
if err != nil {
return false, err
}
return re.Match(b), nil
Rob Pike
committed
}
// ReplaceAllString returns a copy of src in which all matches for the Regexp
// have been replaced by repl. No support is provided for expressions
// (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllString(src, repl string) string {
lastMatchEnd := 0; // end position of the most recent match
searchPos := 0; // position where we next look for a match
a := re.doExecute(src, nil, searchPos);
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
if len(a) == 0 {
break; // no more matches
}
// Copy the unmatched characters before this match.
io.WriteString(buf, src[lastMatchEnd:a[0]]);
// Now insert a copy of the replacement string, but not for a
// match of the empty string immediately after another match.
// (Otherwise, we get double replacement for patterns that
// match both empty and nonempty strings.)
if a[1] > lastMatchEnd || a[0] == 0 {
io.WriteString(buf, repl);
}
lastMatchEnd = a[1];
// Advance past this match; always advance at least one character.
rune, width := utf8.DecodeRuneInString(src[searchPos:len(src)]);
if searchPos + width > a[1] {
searchPos += width;
} else if searchPos + 1 > a[1] {
// This clause is only needed at the end of the input
// string. In that case, DecodeRuneInString returns width=0.
searchPos++;
} else {
searchPos = a[1];
}
}
// Copy the unmatched characters after the last match.
io.WriteString(buf, src[lastMatchEnd:len(src)]);
return string(buf.Data());
}
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
// ReplaceAll returns a copy of src in which all matches for the Regexp
// have been replaced by repl. No support is provided for expressions
// (e.g. \1 or $1) in the replacement text.
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
lastMatchEnd := 0; // end position of the most recent match
searchPos := 0; // position where we next look for a match
buf := new(bytes.Buffer);
for searchPos <= len(src) {
a := re.doExecute("", src, searchPos);
if len(a) == 0 {
break; // no more matches
}
// Copy the unmatched characters before this match.
buf.Write(src[lastMatchEnd:a[0]]);
// Now insert a copy of the replacement string, but not for a
// match of the empty string immediately after another match.
// (Otherwise, we get double replacement for patterns that
// match both empty and nonempty strings.)
if a[1] > lastMatchEnd || a[0] == 0 {
buf.Write(repl);
}
lastMatchEnd = a[1];
// Advance past this match; always advance at least one character.
rune, width := utf8.DecodeRune(src[searchPos:len(src)]);
if searchPos + width > a[1] {
searchPos += width;
} else if searchPos + 1 > a[1] {
// This clause is only needed at the end of the input
// string. In that case, DecodeRuneInString returns width=0.
searchPos++;
} else {
searchPos = a[1];
}
}
// Copy the unmatched characters after the last match.
buf.Write(src[lastMatchEnd:len(src)]);
return buf.Data();
}
// QuoteMeta returns a string that quotes all regular expression metacharacters
// inside the argument text; the returned string is a regular expression matching
// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
func QuoteMeta(s string) string {
b := make([]byte, 2 * len(s));
// A byte loop is correct because all metacharacters are ASCII.
j := 0;
for i := 0; i < len(s); i++ {
if special(int(s[i])) {
b[j] = '\\';
j++;
}
b[j] = s[i];
j++;
}
return string(b[0:j]);
}