Newer
Older
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package regexp implements a simple regular expression library.
//
// The syntax of the regular expressions accepted is:
//
// regexp:
// concatenation { '|' concatenation }
// concatenation:
// { closure }
// closure:
// term [ '*' | '+' | '?' ]
// term:
// '^'
// '$'
// '.'
// character
// '[' [ '^' ] character-ranges ']'
// '(' regexp ')'
//
ErrInternal = os.NewError("internal error");
ErrUnmatchedLpar = os.NewError("unmatched '('");
ErrUnmatchedRpar = os.NewError("unmatched ')'");
ErrUnmatchedLbkt = os.NewError("unmatched '['");
ErrUnmatchedRbkt = os.NewError("unmatched ']'");
ErrBadRange = os.NewError("bad range in character class");
ErrExtraneousBackslash = os.NewError("extraneous backslash");
ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)");
ErrBareClosure = os.NewError("closure applies to nothing");
ErrBadBackslash = os.NewError("illegal backslash escape");
// An instruction executed by the NFA
kind() int; // the type of this instruction: _CHAR, _ANY, etc.
next() instr; // the instruction to execute after this one
// Fields and methods common to all instructions
func (c *common) next() instr { return c._next }
func (c *common) setNext(i instr) { c._next = i }
func (c *common) index() int { return c._index }
func (c *common) setIndex(i int) { c._index = i }
// Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods.
type Regexp struct {
expr string; // the original expression
prefix string; // initial plain text string
prefixBytes []byte; // initial plain text bytes
inst *vector.Vector;
start instr;
nbra int; // number of brackets in expression, for subexpressions
_END; // end of program: success
_BOT; // '^' beginning of text
_EOT; // '$' end of text
_CHARCLASS; // [a-z] character class
_ANY; // '.' any character including newline
_NOTNL; // [^\n] special case: any character but newline
_BRA; // '(' parenthesized expression
_EBRA; // ')'; end of '(' parenthesized expression
_ALT; // '|' alternation
_NOP; // do nothing; makes it easy to link without patching
)
// --- START start of program
type _Start struct {
func (start *_Start) kind() int { return _START }
func (start *_Start) print() { print("start") }
type _End struct {
func (end *_End) kind() int { return _END }
func (end *_End) print() { print("end") }
type _Bot struct {
func (bot *_Bot) kind() int { return _BOT }
func (bot *_Bot) print() { print("bot") }
type _Eot struct {
func (eot *_Eot) kind() int { return _EOT }
func (eot *_Eot) print() { print("eot") }
// --- CHAR a regular character
type _Char struct {
func (char *_Char) kind() int { return _CHAR }
func (char *_Char) print() { print("char ", string(char.char)) }
func newChar(char int) *_Char {
c := new(_Char);
c.char = char;
return c;
}
type _CharClass struct {
char int;
negate bool; // is character class negated? ([^a-z])
// vector of int, stored pairwise: [a-z] is (a,z); x is (x,x):
ranges *vector.IntVector;
func (cclass *_CharClass) kind() int { return _CHARCLASS }
print(" (negated)")
l := cclass.ranges.At(i);
r := cclass.ranges.At(i + 1);
print(" [", string(l), "]")
print(" [", string(l), "-", string(r), "]")
func (cclass *_CharClass) addRange(a, b int) {
cclass.ranges.Push(a);
cclass.ranges.Push(b);
func (cclass *_CharClass) matches(c int) bool {
for i := 0; i < cclass.ranges.Len(); i = i + 2 {
min := cclass.ranges.At(i);
max := cclass.ranges.At(i + 1);
return !cclass.negate
func newCharClass() *_CharClass {
c := new(_CharClass);
c.ranges = new(vector.IntVector);
type _Any struct {
func (any *_Any) kind() int { return _ANY }
func (any *_Any) print() { print("any") }
// --- NOTNL any character but newline
type _NotNl struct {
func (notnl *_NotNl) kind() int { return _NOTNL }
func (notnl *_NotNl) print() { print("notnl") }
// --- BRA parenthesized expression
type _Bra struct {
n int; // subexpression number
func (bra *_Bra) kind() int { return _BRA }
func (bra *_Bra) print() { print("bra", bra.n) }
// --- EBRA end of parenthesized expression
type _Ebra struct {
n int; // subexpression number
}
func (ebra *_Ebra) kind() int { return _EBRA }
func (ebra *_Ebra) print() { print("ebra ", ebra.n) }
type _Alt struct {
func (alt *_Alt) kind() int { return _ALT }
func (alt *_Alt) print() { print("alt(", alt.left.index(), ")") }
type _Nop struct {
func (nop *_Nop) kind() int { return _NOP }
func (nop *_Nop) print() { print("nop") }
func (re *Regexp) add(i instr) instr {
i.setIndex(re.inst.Len());
re.inst.Push(i);
nlpar int; // number of unclosed lpars
func (p *parser) c() int { return p.ch }
p.ch = endOfFile
c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:]);
p.ch = c;
p.pos += w;
}
return p.ch;
}
p := new(parser);
p.re = re;
p.nextc(); // load p.ch
return p;
}
func special(c int) bool {
return true
return true
if p.c() == '^' {
cc.negate = true;
p.nextc();
}
left := -1;
for {
switch c := p.c(); c {
p.error = ErrBadRange;
return nil;
// Is it [^\n]?
if cc.negate && cc.ranges.Len() == 2 &&
cc.ranges.At(0) == '\n' && cc.ranges.At(1) == '\n' {
nl := new(_NotNl);
p.re.add(nl);
return nl;
}
// Special common case: "[a]" -> "a"
if !cc.negate && cc.ranges.Len() == 2 && cc.ranges.At(0) == cc.ranges.At(1) {
c := newChar(cc.ranges.At(0));
p.re.add(c);
return c;
}
return cc;
case '-': // do this before backslash processing
p.error = ErrBadRange;
return nil;
p.error = ErrExtraneousBackslash;
return nil;
c = '\n'
case specialcclass(c):
// c is as delivered
default:
p.error = ErrBadBackslash;
return nil;
}
fallthrough;
default:
p.nextc();
switch {
case left < 0: // first of pair
if p.c() == '-' { // range
p.nextc();
left = c;
} else { // single char
cc.addRange(c, c)
p.error = ErrBadRange;
return nil;
func (p *parser) term() (start, end instr) {
// term() is the leaf of the recursion, so it's sufficient to pick off the
// error state here for early exit.
// The other functions (closure(), concatenation() etc.) assume
// it's safe to recur to here.
if p.error != nil {
return
return nil, nil
p.error = ErrBareClosure;
p.error = ErrUnmatchedRpar;
return;
p.error = ErrUnmatchedRbkt;
return;
return
p.error = ErrUnmatchedLbkt;
return;
return start, start;
case '(':
p.nextc();
p.re.nbra++; // increment first so first subexpr is \1
nbra := p.re.nbra;
p.error = ErrUnmatchedLpar;
return;
bra := new(_Bra);
ebra := new(_Ebra);
if start == nil {
if end == nil {
p.error = ErrInternal;
return;
end.setNext(ebra)
case '\\':
c = p.nextc();
switch {
p.error = ErrExtraneousBackslash;
return;
c = '\n'
case special(c):
// c is as delivered
default:
p.error = ErrBadBackslash;
return;
}
fallthrough;
default:
p.nextc();
}
panic("unreachable");
}
func (p *parser) closure() (start, end instr) {
start, end = p.term();
if start == nil || p.error != nil {
return
}
switch p.c() {
case '*':
// (start,end)*:
alt := new(_Alt);
p.re.add(alt);
end.setNext(alt); // after end, do alt
alt.left = start; // alternate brach: return to start
start = alt; // alt becomes new (start, end)
alt := new(_Alt);
p.re.add(alt);
end.setNext(alt); // after end, do alt
alt.left = start; // alternate brach: return to start
end = alt; // start is unchanged; end is alt
alt := new(_Alt);
nop := new(_Nop);
alt.left = start; // alternate branch is start
alt.setNext(nop); // follow on to nop
end.setNext(nop); // after end, go to nop
start = alt; // start is now alt
end = nop; // end is nop pointed to by both branches
return
}
switch p.nextc() {
case '*', '+', '?':
p.error = ErrBadClosure
func (p *parser) concatenation() (start, end instr) {
return
case nstart == nil: // end of this concatenation
if start == nil { // this is the empty string
case start == nil: // this is first element of concatenation
start, end = nstart, nend
end = nend;
}
}
panic("unreachable");
}
func (p *parser) regexp() (start, end instr) {
start, end = p.concatenation();
return
for {
switch p.c() {
default:
return
return
alt := new(_Alt);
nop := new(_Nop);
p.re.add(nop);
end.setNext(nop);
nend.setNext(nop);
}
}
panic("unreachable");
}
func unNop(i instr) instr {
i = i.next()
continue
inst.setNext(unNop(inst.next()));
if inst.kind() == _ALT {
alt := inst.(*_Alt);
alt.left = unNop(alt.left);
print(inst.index(), ": ");
inst.print();
if inst.kind() != _END {
print(" -> ", inst.next().index())
func (re *Regexp) doParse() os.Error {
start := new(_Start);
re.add(start);
s, e := p.regexp();
return p.error
if p.error == nil {
re.setPrefix();
if debug {
re.dump();
println();
}
}
// Extract regular text from the beginning of the pattern.
// That text can be used by doExecute to speed up matching.
func (re *Regexp) setPrefix() {
var b []byte;
var utf = make([]byte, utf8.UTFMax);
// First instruction is start; skip that.
i := re.inst.At(0).(instr).next().index();
Rob Pike
committed
Loop:
for i < re.inst.Len() {
inst := re.inst.At(i).(instr);
// stop if this is not a char
if inst.kind() != _CHAR {
break
}
Rob Pike
committed
// stop if this char can be followed by a match for an empty string,
// which includes closures, ^, and $.
switch re.inst.At(inst.next().index()).(instr).kind() {
case _BOT, _EOT, _ALT:
break Loop
}
n := utf8.EncodeRune(inst.(*_Char).char, utf);
b = bytes.Add(b, utf[0:n]);
i = inst.next().index();
}
// point start instruction to first non-CHAR
re.inst.At(0).(instr).setNext(re.inst.At(i).(instr));
re.prefixBytes = b;
re.prefix = string(b);
}
// Compile parses a regular expression and returns, if successful, a Regexp
// object that can be used to match against text.
Rob Pike
committed
func Compile(str string) (regexp *Regexp, error os.Error) {
regexp = new(Regexp);
regexp.expr = str;
regexp.inst = new(vector.Vector);
error = regexp.doParse();
return;
}
// MustCompile is like Compile but panics if the expression cannot be parsed.
// It simplifies safe initialization of global variables holding compiled regular
// expressions.
func MustCompile(str string) *Regexp {
regexp, error := Compile(str);
if error != nil {
panicln(`regexp: compiling "`, str, `": `, error.String())
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
// The match arena allows us to reduce the garbage generated by tossing
// match vectors away as we execute. Matches are ref counted and returned
// to a free list when no longer active. Increases a simple benchmark by 22X.
type matchArena struct {
head *matchVec;
len int; // length of match vector
}
type matchVec struct {
m []int; // pairs of bracketing submatches. 0th is start,end
ref int;
next *matchVec;
}
func (a *matchArena) new() *matchVec {
if a.head == nil {
const N = 10;
block := make([]matchVec, N);
for i := 0; i < N; i++ {
b := &block[i];
b.next = a.head;
a.head = b;
}
}
m := a.head;
a.head = m.next;
m.ref = 0;
if m.m == nil {
m.m = make([]int, a.len)
}
return m;
}
func (a *matchArena) free(m *matchVec) {
m.ref--;
if m.ref == 0 {
m.next = a.head;
a.head = m;
}
}
func (a *matchArena) copy(m *matchVec) *matchVec {
m1 := a.new();
copy(m1.m, m.m);
return m1;
}
func (a *matchArena) noMatch() *matchVec {
m := a.new();
for i := range m.m {
m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
}
m.ref = 1;
return m;
}
type state struct {
inst instr; // next instruction to execute
}
// Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active. The matchVec will be inc-ref'ed
// if it is assigned to a state.
func (a *matchArena) addState(s []state, inst instr, match *matchVec, pos, end int) []state {
switch inst.kind() {
case _BOT:
if pos == 0 {
s = a.addState(s, inst.next(), match, pos, end)
}
return s;
case _EOT:
if pos == end {
s = a.addState(s, inst.next(), match, pos, end)
}
return s;
case _BRA:
n := inst.(*_Bra).n;
match.m[2*n] = pos;
s = a.addState(s, inst.next(), match, pos, end);
return s;
case _EBRA:
n := inst.(*_Ebra).n;
match.m[2*n+1] = pos;
s = a.addState(s, inst.next(), match, pos, end);
return s;
}
// States are inserted in order so it's sufficient to see if we have the same
// instruction; no need to see if existing match is earlier (it is).
if s[i].inst.index() == index {
return s
if inst.kind() == _ALT {
s = a.addState(s, inst.(*_Alt).left, a.copy(match), pos, end);
// give other branch a copy of this match vector
s = a.addState(s, inst.next(), a.copy(match), pos, end);
// Accepts either string or bytes - the logic is identical either way.
// If bytes == nil, scan str.
func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
s[0] = make([]state, 10)[0:0];
s[1] = make([]state, 10)[0:0];
end := len(str);
if bytestr != nil {
end = len(bytestr)
}
// fast check for initial plain substring
if re.prefix != "" {
var advance int;
if bytestr == nil {
advance = strings.Index(str[pos:], re.prefix)
advance = bytes.Index(bytestr[pos:], re.prefixBytes)
}
if advance == -1 {
return []int{}
}
pos += advance + len(re.prefix);
arena := &matchArena{nil, 2 * (re.nbra + 1)};
for pos <= end {
if !found {
// prime the pump if we haven't seen a match yet
match := arena.noMatch();
match.m[0] = pos;
s[out] = arena.addState(s[out], re.start.next(), match, pos, end);
arena.free(match); // if addState saved it, ref was incremented
// clear out old state
old := s[out];
for _, state := range old {
arena.free(state.match)
}
s[out] = old[0:0]; // truncate state vector
if found && len(s[in]) == 0 {
if pos < end {
c, charwidth = utf8.DecodeRuneInString(str[pos:end])
} else {
c, charwidth = utf8.DecodeRune(bytestr[pos:end])
pos += charwidth;
for _, st := range s[in] {
switch st.inst.kind() {
case _BOT:
case _EOT:
case _CHAR:
if c == st.inst.(*_Char).char {
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
case _CHARCLASS:
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
case _NOTNL:
if c != endOfFile && c != '\n' {
s[out] = arena.addState(s[out], st.inst.next(), st.match, pos, end)
case _BRA:
case _EBRA:
case _ALT:
case _END:
st.match.m[0] < final.match.m[0] || // leftmost
(st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest
if final.match != nil {
arena.free(final.match)
}
final.match.ref++;
final.match.m[1] = pos - charwidth;
panic("unknown instruction in execute");
}
}
}
if final.match == nil {
return nil
}
// if match found, back up start of match by width of prefix.
if re.prefix != "" && len(final.match.m) > 0 {
final.match.m[0] -= len(re.prefix)
return final.match.m;
// ExecuteString matches the Regexp against the string s.
// The return value is an array of integers, in pairs, identifying the positions of
// substrings matched by the expression.
// s[a[0]:a[1]] is the substring matched by the entire expression.
// s[a[2*i]:a[2*i+1]] for i > 0 is the substring matched by the ith parenthesized subexpression.
// A negative value means the subexpression did not match any element of the string.
func (re *Regexp) ExecuteString(s string) (a []int) {
return re.doExecute(s, nil, 0)
Rob Pike
committed
// Execute matches the Regexp against the byte slice b.
// The return value is an array of integers, in pairs, identifying the positions of
// subslices matched by the expression.
// b[a[0]:a[1]] is the subslice matched by the entire expression.
// b[a[2*i]:a[2*i+1]] for i > 0 is the subslice matched by the ith parenthesized subexpression.
// A negative value means the subexpression did not match any element of the slice.
// An empty array means "no match".
func (re *Regexp) Execute(b []byte) (a []int) { return re.doExecute("", b, 0) }
// MatchString returns whether the Regexp matches the string s.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 }
// Match returns whether the Regexp matches the byte slice b.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 }
Rob Pike
committed
// MatchStrings matches the Regexp against the string s.
// The return value is an array of strings matched by the expression.
// a[0] is the substring matched by the entire expression.
// a[i] for i > 0 is the substring matched by the ith parenthesized subexpression.
// An empty array means ``no match''.
func (re *Regexp) MatchStrings(s string) (a []string) {
r := re.doExecute(s, nil, 0);
Rob Pike
committed
if r == nil {
return nil
Rob Pike
committed
}
Rob Pike
committed
for i := 0; i < len(r); i += 2 {
if r[i] != -1 { // -1 means no match for this subexpression
a[i/2] = s[r[i]:r[i+1]]
Rob Pike
committed
}
Rob Pike
committed
}
// MatchSlices matches the Regexp against the byte slice b.
// The return value is an array of subslices matched by the expression.
// a[0] is the subslice matched by the entire expression.
// a[i] for i > 0 is the subslice matched by the ith parenthesized subexpression.
// An empty array means ``no match''.
func (re *Regexp) MatchSlices(b []byte) (a [][]byte) {
r := re.doExecute("", b, 0);
if r == nil {
return nil
}
a = make([][]byte, len(r)/2);
for i := 0; i < len(r); i += 2 {
if r[i] != -1 { // -1 means no match for this subexpression
a[i/2] = b[r[i]:r[i+1]]
}
// MatchString checks whether a textual regular expression
// matches a string. More complicated queries need
// to use Compile and the full Regexp interface.
func MatchString(pattern string, s string) (matched bool, error os.Error) {
re, err := Compile(pattern);
if err != nil {
return false, err
// Match checks whether a textual regular expression
// matches a byte slice. More complicated queries need
// to use Compile and the full Regexp interface.
func Match(pattern string, b []byte) (matched bool, error os.Error) {
Rob Pike
committed
re, err := Compile(pattern);
if err != nil {
return false, err
Rob Pike
committed
}
Rob Pike
committed
}
// ReplaceAllString returns a copy of src in which all matches for the Regexp
// have been replaced by repl. No support is provided for expressions
// (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllString(src, repl string) string {
lastMatchEnd := 0; // end position of the most recent match
searchPos := 0; // position where we next look for a match