Skip to content
Snippets Groups Projects
regexp.go 33.6 KiB
Newer Older
  • Learn to ignore specific revisions
  • // Use of this source code is governed by a BSD-style
    // license that can be found in the LICENSE file.
    
    
    // Package regexp implements a simple regular expression library.
    //
    // The syntax of the regular expressions accepted is:
    //
    //	regexp:
    //		concatenation { '|' concatenation }
    //	concatenation:
    //		{ closure }
    //	closure:
    //		term [ '*' | '+' | '?' ]
    //	term:
    //		'^'
    //		'$'
    //		'.'
    //		character
    
    //		'[' [ '^' ] { character-range } ']'
    
    //		'(' regexp ')'
    
    //		character [ '-' character ]
    
    // All characters are UTF-8-encoded code points.  Backslashes escape special
    
    // characters, including inside character classes.  The standard Go character
    // escapes are also recognized: \a \b \f \n \r \t \v.
    
    //
    // There are 16 methods of Regexp that match a regular expression and identify
    // the matched text.  Their names are matched by this regular expression:
    //
    //	Find(All)?(String)?(Submatch)?(Index)?
    //
    // If 'All' is present, the routine matches successive non-overlapping
    // matches of the entire expression.  Empty matches abutting a preceding
    // match are ignored.  The return value is a slice containing the successive
    // return values of the corresponding non-'All' routine.  These routines take
    // an extra integer argument, n; if n >= 0, the function returns at most n
    // matches/submatches.
    //
    // If 'String' is present, the argument is a string; otherwise it is a slice
    // of bytes; return values are adjusted as appropriate.
    //
    // If 'Submatch' is present, the return value is a slice identifying the
    // successive submatches of the expression.  Submatches are matches of
    // parenthesized subexpressions within the regular expression, numbered from
    // left to right in order of opening parenthesis.  Submatch 0 is the match of
    // the entire expression, submatch 1 the match of the first parenthesized
    // subexpression, and so on.
    //
    // If 'Index' is present, matches and submatches are identified by byte index
    // pairs within the input string: result[2*n:2*n+1] identifies the indexes of
    // the nth submatch.  The pair for n==0 identifies the match of the entire
    // expression.  If 'Index' is not present, the match is identified by the
    // text of the match/submatch.  If an index is negative, it means that
    // subexpression did not match any string in the input.
    //
    // (There are a few other methods that do not match this pattern.)
    
    package regexp
    
    
    Rob Pike's avatar
    Rob Pike committed
    import (
    
    	"bytes"
    	"io"
    	"os"
    	"strings"
    	"utf8"
    
    Rob Pike's avatar
    Rob Pike committed
    )
    
    
    var debug = false
    
    Rob Pike's avatar
    Rob Pike committed
    
    
    // Error is the local type for a parsing error.
    type Error string
    
    func (e Error) String() string {
    	return string(e)
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    // Error codes returned by failures to parse an expression.
    
    Russ Cox's avatar
    Russ Cox committed
    var (
    
    	ErrInternal            = Error("internal error")
    	ErrUnmatchedLpar       = Error("unmatched '('")
    	ErrUnmatchedRpar       = Error("unmatched ')'")
    	ErrUnmatchedLbkt       = Error("unmatched '['")
    	ErrUnmatchedRbkt       = Error("unmatched ']'")
    	ErrBadRange            = Error("bad range in character class")
    	ErrExtraneousBackslash = Error("extraneous backslash")
    	ErrBadClosure          = Error("repeated closure (**, ++, etc.)")
    	ErrBareClosure         = Error("closure applies to nothing")
    	ErrBadBackslash        = Error("illegal backslash escape")
    
    Russ Cox's avatar
    Russ Cox committed
    )
    
    Rob Pike's avatar
    Rob Pike committed
    const (
    	iStart     = iota // beginning of program
    	iEnd              // end of program: success
    	iBOT              // '^' beginning of text
    	iEOT              // '$' end of text
    	iChar             // 'a' regular character
    	iCharClass        // [a-z] character class
    	iAny              // '.' any character including newline
    	iNotNL            // [^\n] special case: any character but newline
    
    	iBra              // '(' parenthesized expression: 2*braNum for left, 2*braNum+1 for right
    
    Rob Pike's avatar
    Rob Pike committed
    	iAlt              // '|' alternation
    	iNop              // do nothing; makes it easy to link without patching
    )
    
    Rob Pike's avatar
    Rob Pike committed
    // An instruction executed by the NFA
    type instr struct {
    	kind  int    // the type of this instruction: iChar, iAny, etc.
    	index int    // used only in debugging; could be eliminated
    	next  *instr // the instruction to execute after this one
    	// Special fields valid only for some items.
    	char   int        // iChar
    	braNum int        // iBra, iEbra
    	cclass *charClass // iCharClass
    	left   *instr     // iAlt, other branch
    }
    
    func (i *instr) print() {
    	switch i.kind {
    	case iStart:
    		print("start")
    	case iEnd:
    		print("end")
    	case iBOT:
    		print("bot")
    	case iEOT:
    		print("eot")
    	case iChar:
    		print("char ", string(i.char))
    	case iCharClass:
    		i.cclass.print()
    	case iAny:
    		print("any")
    	case iNotNL:
    		print("notnl")
    	case iBra:
    
    		if i.braNum&1 == 0 {
    			print("bra", i.braNum/2)
    		} else {
    			print("ebra", i.braNum/2)
    		}
    
    Rob Pike's avatar
    Rob Pike committed
    	case iAlt:
    		print("alt(", i.left.index, ")")
    	case iNop:
    		print("nop")
    	}
    
    // Regexp is the representation of a compiled regular expression.
    
    // The public interface is entirely through methods.
    type Regexp struct {
    
    	expr        string // the original expression
    	prefix      string // initial plain text string
    	prefixBytes []byte // initial plain text bytes
    
    Rob Pike's avatar
    Rob Pike committed
    	inst        []*instr
    	start       *instr // first instruction of machine
    	prefixStart *instr // where to start if there is a prefix
    	nbra        int    // number of brackets in expression, for subexpressions
    
    Rob Pike's avatar
    Rob Pike committed
    type charClass struct {
    
    	negate bool // is character class negated? ([^a-z])
    
    	// slice of int, stored pairwise: [a-z] is (a,z); x is (x,x):
    	ranges     []int
    
    Rob Pike's avatar
    Rob Pike committed
    func (cclass *charClass) print() {
    
    	print("charclass")
    
    Rob Pike's avatar
    Rob Pike committed
    	if cclass.negate {
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    
    	for i := 0; i < len(cclass.ranges); i += 2 {
    		l := cclass.ranges[i]
    		r := cclass.ranges[i+1]
    
    Rob Pike's avatar
    Rob Pike committed
    		if l == r {
    
    Rob Pike's avatar
    Rob Pike committed
    		} else {
    
    			print(" [", string(l), "-", string(r), "]")
    
    Rob Pike's avatar
    Rob Pike committed
    func (cclass *charClass) addRange(a, b int) {
    
    Rob Pike's avatar
    Rob Pike committed
    	// range is a through b inclusive
    
    	cclass.ranges = append(cclass.ranges, a, b)
    
    	if a < cclass.cmin {
    		cclass.cmin = a
    	}
    	if b > cclass.cmax {
    		cclass.cmax = b
    	}
    
    Rob Pike's avatar
    Rob Pike committed
    func (cclass *charClass) matches(c int) bool {
    
    	if c < cclass.cmin || c > cclass.cmax {
    		return cclass.negate
    	}
    
    	ranges := cclass.ranges
    
    	for i := 0; i < len(ranges); i = i + 2 {
    		if ranges[i] <= c && c <= ranges[i+1] {
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    	}
    
    	return cclass.negate
    
    Rob Pike's avatar
    Rob Pike committed
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    func newCharClass() *instr {
    	i := &instr{kind: iCharClass}
    	i.cclass = new(charClass)
    	i.cclass.ranges = make([]int, 0, 4)
    	i.cclass.cmin = 0x10FFFF + 1 // MaxRune + 1
    	i.cclass.cmax = -1
    	return i
    
    Rob Pike's avatar
    Rob Pike committed
    func (re *Regexp) add(i *instr) *instr {
    	i.index = len(re.inst)
    
    	re.inst = append(re.inst, i)
    
    Rob Pike's avatar
    Rob Pike committed
    type parser struct {
    
    	re    *Regexp
    	nlpar int // number of unclosed lpars
    	pos   int
    	ch    int
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) error(err Error) {
    
    Rob Pike's avatar
    Rob Pike committed
    const endOfFile = -1
    
    func (p *parser) c() int { return p.ch }
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) nextc() int {
    
    	if p.pos >= len(p.re.expr) {
    
    		c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:])
    		p.ch = c
    		p.pos += w
    
    func newParser(re *Regexp) *parser {
    
    	p := new(parser)
    	p.re = re
    	p.nextc() // load p.ch
    	return p
    
    }
    
    func special(c int) bool {
    
    Rob Pike's avatar
    Rob Pike committed
    	for _, r := range `\.+*?()|[]^$` {
    		if c == r {
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    	}
    
    func ispunct(c int) bool {
    	for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" {
    
    Rob Pike's avatar
    Rob Pike committed
    		if c == r {
    
    var escapes = []byte("abfnrtv")
    var escaped = []byte("\a\b\f\n\r\t\v")
    
    func escape(c int) int {
    	for i, b := range escapes {
    		if int(b) == c {
    			return i
    		}
    	}
    	return -1
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) charClass() *instr {
    	i := newCharClass()
    	cc := i.cclass
    
    Rob Pike's avatar
    Rob Pike committed
    	if p.c() == '^' {
    
    		cc.negate = true
    		p.nextc()
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    
    Rob Pike's avatar
    Rob Pike committed
    	for {
    		switch c := p.c(); c {
    
    Rob Pike's avatar
    Rob Pike committed
    		case ']', endOfFile:
    
    Rob Pike's avatar
    Rob Pike committed
    			if left >= 0 {
    
    				p.error(ErrBadRange)
    
    Rob Pike's avatar
    Rob Pike committed
    			}
    
    			if cc.negate && len(cc.ranges) == 2 &&
    				cc.ranges[0] == '\n' && cc.ranges[1] == '\n' {
    
    Rob Pike's avatar
    Rob Pike committed
    				nl := &instr{kind: iNotNL}
    
    				p.re.add(nl)
    				return nl
    
    			// Special common case: "[a]" -> "a"
    
    			if !cc.negate && len(cc.ranges) == 2 && cc.ranges[0] == cc.ranges[1] {
    
    Rob Pike's avatar
    Rob Pike committed
    				c := &instr{kind: iChar, char: cc.ranges[0]}
    
    Rob Pike's avatar
    Rob Pike committed
    			p.re.add(i)
    			return i
    
    		case '-': // do this before backslash processing
    
    			p.error(ErrBadRange)
    
    Rob Pike's avatar
    Rob Pike committed
    		case '\\':
    
    Rob Pike's avatar
    Rob Pike committed
    			switch {
    
    Rob Pike's avatar
    Rob Pike committed
    			case c == endOfFile:
    
    				p.error(ErrExtraneousBackslash)
    
    			case ispunct(c):
    
    Rob Pike's avatar
    Rob Pike committed
    				// c is as delivered
    
    			case escape(c) >= 0:
    				c = int(escaped[escape(c)])
    
    Rob Pike's avatar
    Rob Pike committed
    			default:
    
    				p.error(ErrBadBackslash)
    
    Rob Pike's avatar
    Rob Pike committed
    			}
    
    Rob Pike's avatar
    Rob Pike committed
    		default:
    
    Rob Pike's avatar
    Rob Pike committed
    			switch {
    
    			case left < 0: // first of pair
    				if p.c() == '-' { // range
    					p.nextc()
    					left = c
    				} else { // single char
    
    Rob Pike's avatar
    Rob Pike committed
    				}
    
    			case left <= c: // second of pair
    				cc.addRange(left, c)
    				left = -1
    
    Rob Pike's avatar
    Rob Pike committed
    			default:
    
    				p.error(ErrBadRange)
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) term() (start, end *instr) {
    
    	switch c := p.c(); c {
    
    Rob Pike's avatar
    Rob Pike committed
    	case '|', endOfFile:
    
    		p.error(ErrBareClosure)
    
    Rob Pike's avatar
    Rob Pike committed
    	case ')':
    
    			p.error(ErrUnmatchedRpar)
    
    Rob Pike's avatar
    Rob Pike committed
    	case ']':
    
    		p.error(ErrUnmatchedRbkt)
    
    Rob Pike's avatar
    Rob Pike committed
    	case '^':
    
    Rob Pike's avatar
    Rob Pike committed
    		start = p.re.add(&instr{kind: iBOT})
    
    		return start, start
    
    Rob Pike's avatar
    Rob Pike committed
    	case '$':
    
    Rob Pike's avatar
    Rob Pike committed
    		start = p.re.add(&instr{kind: iEOT})
    
    		return start, start
    
    Rob Pike's avatar
    Rob Pike committed
    		start = p.re.add(&instr{kind: iAny})
    
    		return start, start
    
    Rob Pike's avatar
    Rob Pike committed
    	case '[':
    
    		p.nextc()
    		start = p.charClass()
    
    Rob Pike's avatar
    Rob Pike committed
    		if p.c() != ']' {
    
    			p.error(ErrUnmatchedLbkt)
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    
    		p.nextc()
    		return start, start
    
    		p.nextc()
    		p.nlpar++
    		p.re.nbra++ // increment first so first subexpr is \1
    		nbra := p.re.nbra
    		start, end = p.regexp()
    
    		if p.c() != ')' {
    
    			p.error(ErrUnmatchedLpar)
    
    		bra := &instr{kind: iBra, braNum: 2 * nbra}
    
    		ebra := &instr{kind: iBra, braNum: 2*nbra + 1}
    
    		if start == nil {
    			if end == nil {
    
    				p.error(ErrInternal)
    
    Rob Pike's avatar
    Rob Pike committed
    			end.next = ebra
    
    Rob Pike's avatar
    Rob Pike committed
    		bra.next = start
    
    	case '\\':
    
    Rob Pike's avatar
    Rob Pike committed
    		case c == endOfFile:
    
    			p.error(ErrExtraneousBackslash)
    
    		case ispunct(c):
    
    			// c is as delivered
    
    		case escape(c) >= 0:
    			c = int(escaped[escape(c)])
    
    			p.error(ErrBadBackslash)
    
    Rob Pike's avatar
    Rob Pike committed
    		start = &instr{kind: iChar, char: c}
    
    		p.re.add(start)
    		return start, start
    
    	panic("unreachable")
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) closure() (start, end *instr) {
    
    	start, end = p.term()
    
    	if start == nil {
    
    	}
    	switch p.c() {
    	case '*':
    		// (start,end)*:
    
    Rob Pike's avatar
    Rob Pike committed
    		alt := &instr{kind: iAlt}
    
    Rob Pike's avatar
    Rob Pike committed
    		end.next = alt   // after end, do alt
    
    		alt.left = start // alternate brach: return to start
    		start = alt      // alt becomes new (start, end)
    		end = alt
    
    	case '+':
    		// (start,end)+:
    
    Rob Pike's avatar
    Rob Pike committed
    		alt := &instr{kind: iAlt}
    
    Rob Pike's avatar
    Rob Pike committed
    		end.next = alt   // after end, do alt
    
    		alt.left = start // alternate brach: return to start
    		end = alt        // start is unchanged; end is alt
    
    	case '?':
    		// (start,end)?:
    
    Rob Pike's avatar
    Rob Pike committed
    		alt := &instr{kind: iAlt}
    
    Rob Pike's avatar
    Rob Pike committed
    		nop := &instr{kind: iNop}
    
    		p.re.add(nop)
    		alt.left = start // alternate branch is start
    
    Rob Pike's avatar
    Rob Pike committed
    		alt.next = nop   // follow on to nop
    		end.next = nop   // after end, go to nop
    
    		start = alt      // start is now alt
    		end = nop        // end is nop pointed to by both branches
    
    	}
    	switch p.nextc() {
    	case '*', '+', '?':
    
    		p.error(ErrBadClosure)
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) concatenation() (start, end *instr) {
    
    		nstart, nend := p.closure()
    
    		case nstart == nil: // end of this concatenation
    			if start == nil { // this is the empty string
    
    Rob Pike's avatar
    Rob Pike committed
    				nop := p.re.add(&instr{kind: iNop})
    
    Rob Pike's avatar
    Rob Pike committed
    			}
    
    			return
    		case start == nil: // this is first element of concatenation
    
    Rob Pike's avatar
    Rob Pike committed
    			end.next = nstart
    
    	panic("unreachable")
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) regexp() (start, end *instr) {
    
    	start, end = p.concatenation()
    
    	for {
    		switch p.c() {
    		default:
    
    			p.nextc()
    			nstart, nend := p.concatenation()
    
    Rob Pike's avatar
    Rob Pike committed
    			alt := &instr{kind: iAlt}
    
    			p.re.add(alt)
    			alt.left = start
    
    Rob Pike's avatar
    Rob Pike committed
    			alt.next = nstart
    			nop := &instr{kind: iNop}
    
    Rob Pike's avatar
    Rob Pike committed
    			end.next = nop
    			nend.next = nop
    
    			start, end = alt, nop
    
    	panic("unreachable")
    
    Rob Pike's avatar
    Rob Pike committed
    func unNop(i *instr) *instr {
    	for i.kind == iNop {
    		i = i.next
    
    func (re *Regexp) eliminateNops() {
    
    	for _, inst := range re.inst {
    
    Rob Pike's avatar
    Rob Pike committed
    		if inst.kind == iEnd {
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    
    Rob Pike's avatar
    Rob Pike committed
    		inst.next = unNop(inst.next)
    		if inst.kind == iAlt {
    			inst.left = unNop(inst.left)
    
    func (re *Regexp) dump() {
    
    	print("prefix <", re.prefix, ">\n")
    
    	for _, inst := range re.inst {
    
    Rob Pike's avatar
    Rob Pike committed
    		print(inst.index, ": ")
    
    Rob Pike's avatar
    Rob Pike committed
    		if inst.kind != iEnd {
    			print(" -> ", inst.next.index)
    
    func (re *Regexp) doParse() {
    
    	p := newParser(re)
    
    Rob Pike's avatar
    Rob Pike committed
    	start := &instr{kind: iStart}
    
    	re.add(start)
    	s, e := p.regexp()
    
    Rob Pike's avatar
    Rob Pike committed
    	start.next = s
    
    Rob Pike's avatar
    Rob Pike committed
    	e.next = re.add(&instr{kind: iEnd})
    
    Rob Pike's avatar
    Rob Pike committed
    	if debug {
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    
    	re.eliminateNops()
    
    Rob Pike's avatar
    Rob Pike committed
    	if debug {
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    
    	re.setPrefix()
    	if debug {
    		re.dump()
    		println()
    
    // Extract regular text from the beginning of the pattern.
    // That text can be used by doExecute to speed up matching.
    
    func (re *Regexp) setPrefix() {
    
    	var b []byte
    	var utf = make([]byte, utf8.UTFMax)
    
    Rob Pike's avatar
    Rob Pike committed
    	var inst *instr
    
    	// First instruction is start; skip that.
    
    Rob Pike's avatar
    Rob Pike committed
    	for inst = re.inst[0].next; inst.kind != iEnd; inst = inst.next {
    
    		// stop if this is not a char
    
    Rob Pike's avatar
    Rob Pike committed
    		if inst.kind != iChar {
    
    		// stop if this char can be followed by a match for an empty string,
    		// which includes closures, ^, and $.
    
    Rob Pike's avatar
    Rob Pike committed
    		switch inst.next.kind {
    		case iBOT, iEOT, iAlt:
    
    Rob Pike's avatar
    Rob Pike committed
    		n := utf8.EncodeRune(utf, inst.char)
    
    		b = append(b, utf[0:n]...)
    
    	// point prefixStart instruction to first non-CHAR after prefix
    
    Rob Pike's avatar
    Rob Pike committed
    	re.prefixStart = inst
    
    	re.prefixBytes = b
    	re.prefix = string(b)
    
    // Compile parses a regular expression and returns, if successful, a Regexp
    // object that can be used to match against text.
    
    func Compile(str string) (regexp *Regexp, error os.Error) {
    
    	regexp = new(Regexp)
    
    	// doParse will panic if there is a parse error.
    	defer func() {
    		if e := recover(); e != nil {
    			regexp = nil
    			error = e.(Error) // Will re-panic if error was not an Error, e.g. nil-pointer exception
    		}
    	}()
    
    Rob Pike's avatar
    Rob Pike committed
    	regexp.inst = make([]*instr, 0, 10)
    
    	regexp.doParse()
    
    }
    
    // MustCompile is like Compile but panics if the expression cannot be parsed.
    // It simplifies safe initialization of global variables holding compiled regular
    // expressions.
    func MustCompile(str string) *Regexp {
    
    	regexp, error := Compile(str)
    
    		panic(`regexp: compiling "` + str + `": ` + error.String())
    
    Rob Pike's avatar
    Rob Pike committed
    
    
    // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
    func (re *Regexp) NumSubexp() int { return re.nbra }
    
    
    // The match arena allows us to reduce the garbage generated by tossing
    // match vectors away as we execute.  Matches are ref counted and returned
    // to a free list when no longer active.  Increases a simple benchmark by 22X.
    type matchArena struct {
    
    	head *matchVec
    	len  int // length of match vector
    
    	m    []int // pairs of bracketing submatches. 0th is start,end
    	ref  int
    	next *matchVec
    
    }
    
    func (a *matchArena) new() *matchVec {
    	if a.head == nil {
    
    		const N = 10
    		block := make([]matchVec, N)
    
    		for i := 0; i < N; i++ {
    
    			b := &block[i]
    			b.next = a.head
    			a.head = b
    
    	m := a.head
    	a.head = m.next
    	m.ref = 0
    
    	if m.m == nil {
    		m.m = make([]int, a.len)
    	}
    
    }
    
    func (a *matchArena) free(m *matchVec) {
    
    		m.next = a.head
    		a.head = m
    
    	}
    }
    
    func (a *matchArena) copy(m *matchVec) *matchVec {
    
    	m1 := a.new()
    	copy(m1.m, m.m)
    	return m1
    
    }
    
    func (a *matchArena) noMatch() *matchVec {
    
    		m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
    
    Rob Pike's avatar
    Rob Pike committed
    type state struct {
    
    Rob Pike's avatar
    Rob Pike committed
    	inst     *instr // next instruction to execute
    	prefixed bool   // this match began with a fixed prefix
    
    Rob Pike's avatar
    Rob Pike committed
    }
    
    // Append new state to to-do list.  Leftmost-longest wins so avoid
    
    // adding a state that's already active.  The matchVec will be inc-ref'ed
    // if it is assigned to a state.
    
    Rob Pike's avatar
    Rob Pike committed
    func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec, pos, end int) []state {
    	switch inst.kind {
    	case iBOT:
    
    Rob Pike's avatar
    Rob Pike committed
    			s = a.addState(s, inst.next, prefixed, match, pos, end)
    
    Rob Pike's avatar
    Rob Pike committed
    	case iEOT:
    
    Rob Pike's avatar
    Rob Pike committed
    			s = a.addState(s, inst.next, prefixed, match, pos, end)
    
    Rob Pike's avatar
    Rob Pike committed
    	case iBra:
    
    		match.m[inst.braNum] = pos
    
    Rob Pike's avatar
    Rob Pike committed
    		s = a.addState(s, inst.next, prefixed, match, pos, end)
    
    	// States are inserted in order so it's sufficient to see if we have the same
    	// instruction; no need to see if existing match is earlier (it is).
    
    Rob Pike's avatar
    Rob Pike committed
    	for i := 0; i < l; i++ {
    
    Rob Pike's avatar
    Rob Pike committed
    		if s[i].inst == inst {
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    
    Russ Cox's avatar
    Russ Cox committed
    	s = append(s, state{inst, prefixed, match})
    
    Rob Pike's avatar
    Rob Pike committed
    	if inst.kind == iAlt {
    		s = a.addState(s, inst.left, prefixed, a.copy(match), pos, end)
    
    		// give other branch a copy of this match vector
    
    Rob Pike's avatar
    Rob Pike committed
    		s = a.addState(s, inst.next, prefixed, a.copy(match), pos, end)
    
    Rob Pike's avatar
    Rob Pike committed
    }
    
    
    // Accepts either string or bytes - the logic is identical either way.
    // If bytes == nil, scan str.
    
    func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
    
    	s[0] = make([]state, 0, 10)
    	s[1] = make([]state, 0, 10)
    
    	in, out := 0, 1
    	var final state
    	found := false
    	end := len(str)
    
    	if bytestr != nil {
    		end = len(bytestr)
    	}
    	// fast check for initial plain substring
    
    	prefixed := false // has this iteration begun by skipping a prefix?
    
    	if re.prefix != "" {
    
    		if bytestr == nil {
    
    			advance = strings.Index(str[pos:], re.prefix)
    
    		} else {
    
    			advance = bytes.Index(bytestr[pos:], re.prefixBytes)
    
    		}
    		if advance == -1 {
    
    		pos += advance + len(re.prefix)
    
    	arena := &matchArena{nil, 2 * (re.nbra + 1)}
    
    Rob Pike's avatar
    Rob Pike committed
    		if !found {
    			// prime the pump if we haven't seen a match yet
    
    			match := arena.noMatch()
    			match.m[0] = pos
    
    				s[out] = arena.addState(s[out], re.prefixStart, true, match, pos, end)
    
    				prefixed = false // next iteration should start at beginning of machine.
    			} else {
    
    Rob Pike's avatar
    Rob Pike committed
    				s[out] = arena.addState(s[out], re.start.next, false, match, pos, end)
    
    			arena.free(match) // if addState saved it, ref was incremented
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    
    		in, out = out, in // old out state is new in state
    
    		// clear out old state
    
    		for _, state := range old {
    			arena.free(state.match)
    		}
    
    		s[out] = old[0:0] // truncate state vector
    
    		if found && len(s[in]) == 0 {
    
    Rob Pike's avatar
    Rob Pike committed
    			// machine has completed
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    
    		charwidth := 1
    		c := endOfFile
    
    			if bytestr == nil {
    
    				c, charwidth = utf8.DecodeRuneInString(str[pos:end])
    
    				c, charwidth = utf8.DecodeRune(bytestr[pos:end])
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    
    		for _, st := range s[in] {
    
    Rob Pike's avatar
    Rob Pike committed
    			switch st.inst.kind {
    			case iBOT:
    			case iEOT:
    			case iChar:
    				if c == st.inst.char {
    					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
    
    Rob Pike's avatar
    Rob Pike committed
    				}
    
    Rob Pike's avatar
    Rob Pike committed
    			case iCharClass:
    				if st.inst.cclass.matches(c) {
    					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
    
    Rob Pike's avatar
    Rob Pike committed
    				}
    
    Rob Pike's avatar
    Rob Pike committed
    			case iAny:
    
    Rob Pike's avatar
    Rob Pike committed
    				if c != endOfFile {
    
    Rob Pike's avatar
    Rob Pike committed
    					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
    
    Rob Pike's avatar
    Rob Pike committed
    				}
    
    Rob Pike's avatar
    Rob Pike committed
    			case iNotNL:
    
    				if c != endOfFile && c != '\n' {
    
    Rob Pike's avatar
    Rob Pike committed
    					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
    
    Rob Pike's avatar
    Rob Pike committed
    			case iBra:
    			case iAlt:
    			case iEnd:
    
    Rob Pike's avatar
    Rob Pike committed
    				// choose leftmost longest
    
    				if !found || // first
    					st.match.m[0] < final.match.m[0] || // leftmost
    					(st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest
    
    					if final.match != nil {
    						arena.free(final.match)
    					}
    
    					final = st
    					final.match.ref++
    					final.match.m[1] = pos - charwidth
    
    Rob Pike's avatar
    Rob Pike committed
    				}
    
    Rob Pike's avatar
    Rob Pike committed
    			default:
    
    				st.inst.print()
    				panic("unknown instruction in execute")
    
    Rob Pike's avatar
    Rob Pike committed
    			}
    		}
    	}
    
    	if final.match == nil {
    		return nil
    	}
    
    	// if match found, back up start of match by width of prefix.
    
    	if final.prefixed && len(final.match.m) > 0 {
    
    		final.match.m[0] -= len(re.prefix)
    
    	return final.match.m
    
    Rob Pike's avatar
    Rob Pike committed
    }
    
    
    // MatchString returns whether the Regexp matches the string s.
    
    // The return value is a boolean: true for match, false for no match.
    
    func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 }
    
    
    // Match returns whether the Regexp matches the byte slice b.
    // The return value is a boolean: true for match, false for no match.
    
    func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 }
    
    // MatchString checks whether a textual regular expression
    // matches a string.  More complicated queries need
    // to use Compile and the full Regexp interface.
    func MatchString(pattern string, s string) (matched bool, error os.Error) {
    
    	re, err := Compile(pattern)
    
    	return re.MatchString(s), nil
    
    // Match checks whether a textual regular expression
    
    // matches a byte slice.  More complicated queries need
    
    // to use Compile and the full Regexp interface.
    
    func Match(pattern string, b []byte) (matched bool, error os.Error) {
    
    	re, err := Compile(pattern)
    
    	return re.Match(b), nil
    
    // ReplaceAllString returns a copy of src in which all matches for the Regexp
    
    // have been replaced by repl.  No support is provided for expressions
    // (e.g. \1 or $1) in the replacement string.
    
    func (re *Regexp) ReplaceAllString(src, repl string) string {
    
    	return re.ReplaceAllStringFunc(src, func(string) string { return repl })
    }
    
    // ReplaceAllStringFunc returns a copy of src in which all matches for the
    // Regexp have been replaced by the return value of of function repl (whose
    // first argument is the matched string).  No support is provided for
    // expressions (e.g. \1 or $1) in the replacement string.
    func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
    
    	lastMatchEnd := 0 // end position of the most recent match
    	searchPos := 0    // position where we next look for a match
    	buf := new(bytes.Buffer)
    
    	for searchPos <= len(src) {
    
    		a := re.doExecute(src, nil, searchPos)
    
    		if len(a) == 0 {
    
    			break // no more matches
    
    		}
    
    		// Copy the unmatched characters before this match.
    
    		io.WriteString(buf, src[lastMatchEnd:a[0]])
    
    
    		// Now insert a copy of the replacement string, but not for a
    		// match of the empty string immediately after another match.
    		// (Otherwise, we get double replacement for patterns that
    		// match both empty and nonempty strings.)
    		if a[1] > lastMatchEnd || a[0] == 0 {
    
    			io.WriteString(buf, repl(src[a[0]:a[1]]))
    
    		lastMatchEnd = a[1]
    
    
    		// Advance past this match; always advance at least one character.
    
    		_, width := utf8.DecodeRuneInString(src[searchPos:])
    
    		if searchPos+width > a[1] {
    
    		} else if searchPos+1 > a[1] {
    
    			// This clause is only needed at the end of the input
    			// string.  In that case, DecodeRuneInString returns width=0.
    
    		}
    	}
    
    	// Copy the unmatched characters after the last match.
    
    	io.WriteString(buf, src[lastMatchEnd:])
    
    	return buf.String()
    
    // ReplaceAll returns a copy of src in which all matches for the Regexp
    // have been replaced by repl.  No support is provided for expressions
    // (e.g. \1 or $1) in the replacement text.
    func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
    
    	return re.ReplaceAllFunc(src, func([]byte) []byte { return repl })
    }
    
    // ReplaceAllFunc returns a copy of src in which all matches for the
    // Regexp have been replaced by the return value of of function repl (whose
    // first argument is the matched []byte).  No support is provided for
    // expressions (e.g. \1 or $1) in the replacement string.
    func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
    
    	lastMatchEnd := 0 // end position of the most recent match
    	searchPos := 0    // position where we next look for a match
    	buf := new(bytes.Buffer)
    
    		a := re.doExecute("", src, searchPos)
    
    			break // no more matches
    
    		}
    
    		// Copy the unmatched characters before this match.
    
    		buf.Write(src[lastMatchEnd:a[0]])
    
    
    		// Now insert a copy of the replacement string, but not for a
    		// match of the empty string immediately after another match.
    		// (Otherwise, we get double replacement for patterns that
    		// match both empty and nonempty strings.)
    		if a[1] > lastMatchEnd || a[0] == 0 {
    
    			buf.Write(repl(src[a[0]:a[1]]))
    
    		lastMatchEnd = a[1]
    
    
    		// Advance past this match; always advance at least one character.
    
    		_, width := utf8.DecodeRune(src[searchPos:])
    
    		if searchPos+width > a[1] {
    
    		} else if searchPos+1 > a[1] {
    
    			// This clause is only needed at the end of the input
    			// string.  In that case, DecodeRuneInString returns width=0.
    
    		}
    	}
    
    	// Copy the unmatched characters after the last match.
    
    	buf.Write(src[lastMatchEnd:])
    
    	return buf.Bytes()
    
    // QuoteMeta returns a string that quotes all regular expression metacharacters
    // inside the argument text; the returned string is a regular expression matching
    // the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
    func QuoteMeta(s string) string {
    
    	b := make([]byte, 2*len(s))
    
    
    	// A byte loop is correct because all metacharacters are ASCII.
    
    	for i := 0; i < len(s); i++ {
    		if special(int(s[i])) {
    
    	return string(b[0:j])