Skip to content
Snippets Groups Projects
regexp.go 38.4 KiB
Newer Older
  • Learn to ignore specific revisions
  • // Use of this source code is governed by a BSD-style
    // license that can be found in the LICENSE file.
    
    
    // Package regexp implements a simple regular expression library.
    //
    // The syntax of the regular expressions accepted is:
    //
    //	regexp:
    //		concatenation { '|' concatenation }
    //	concatenation:
    //		{ closure }
    //	closure:
    //		term [ '*' | '+' | '?' ]
    //	term:
    //		'^'
    //		'$'
    //		'.'
    //		character
    
    //		'[' [ '^' ] { character-range } ']'
    
    //		'(' regexp ')'
    
    //		character [ '-' character ]
    
    // All characters are UTF-8-encoded code points.  Backslashes escape special
    
    // characters, including inside character classes.  The standard Go character
    // escapes are also recognized: \a \b \f \n \r \t \v.
    
    //
    // There are 16 methods of Regexp that match a regular expression and identify
    // the matched text.  Their names are matched by this regular expression:
    //
    //	Find(All)?(String)?(Submatch)?(Index)?
    //
    // If 'All' is present, the routine matches successive non-overlapping
    // matches of the entire expression.  Empty matches abutting a preceding
    // match are ignored.  The return value is a slice containing the successive
    // return values of the corresponding non-'All' routine.  These routines take
    // an extra integer argument, n; if n >= 0, the function returns at most n
    // matches/submatches.
    //
    // If 'String' is present, the argument is a string; otherwise it is a slice
    // of bytes; return values are adjusted as appropriate.
    //
    // If 'Submatch' is present, the return value is a slice identifying the
    // successive submatches of the expression.  Submatches are matches of
    // parenthesized subexpressions within the regular expression, numbered from
    // left to right in order of opening parenthesis.  Submatch 0 is the match of
    // the entire expression, submatch 1 the match of the first parenthesized
    // subexpression, and so on.
    //
    // If 'Index' is present, matches and submatches are identified by byte index
    // pairs within the input string: result[2*n:2*n+1] identifies the indexes of
    // the nth submatch.  The pair for n==0 identifies the match of the entire
    // expression.  If 'Index' is not present, the match is identified by the
    // text of the match/submatch.  If an index is negative, it means that
    // subexpression did not match any string in the input.
    //
    
    // There is also a subset of the methods that can be applied to text read
    // from a RuneReader:
    //
    //	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
    //
    // This set may grow.  Note that regular expression matches may need to
    // examine text beyond the text returned by a match, so the methods that
    // match text from a RuneReader may read arbitrarily far into the input
    // before returning.
    //
    
    // (There are a few other methods that do not match this pattern.)
    
    package regexp
    
    
    Rob Pike's avatar
    Rob Pike committed
    import (
    
    	"bytes"
    	"io"
    	"os"
    	"strings"
    	"utf8"
    
    Rob Pike's avatar
    Rob Pike committed
    )
    
    
    var debug = false
    
    Rob Pike's avatar
    Rob Pike committed
    
    
    // Error is the local type for a parsing error.
    type Error string
    
    func (e Error) String() string {
    	return string(e)
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    // Error codes returned by failures to parse an expression.
    
    Russ Cox's avatar
    Russ Cox committed
    var (
    
    	ErrInternal            = Error("regexp: internal error")
    	ErrUnmatchedLpar       = Error("regexp: unmatched '('")
    	ErrUnmatchedRpar       = Error("regexp: unmatched ')'")
    	ErrUnmatchedLbkt       = Error("regexp: unmatched '['")
    	ErrUnmatchedRbkt       = Error("regexp: unmatched ']'")
    	ErrBadRange            = Error("regexp: bad range in character class")
    	ErrExtraneousBackslash = Error("regexp: extraneous backslash")
    	ErrBadClosure          = Error("regexp: repeated closure (**, ++, etc.)")
    	ErrBareClosure         = Error("regexp: closure applies to nothing")
    	ErrBadBackslash        = Error("regexp: illegal backslash escape")
    
    Russ Cox's avatar
    Russ Cox committed
    )
    
    Rob Pike's avatar
    Rob Pike committed
    const (
    	iStart     = iota // beginning of program
    	iEnd              // end of program: success
    	iBOT              // '^' beginning of text
    	iEOT              // '$' end of text
    	iChar             // 'a' regular character
    	iCharClass        // [a-z] character class
    	iAny              // '.' any character including newline
    	iNotNL            // [^\n] special case: any character but newline
    
    	iBra              // '(' parenthesized expression: 2*braNum for left, 2*braNum+1 for right
    
    Rob Pike's avatar
    Rob Pike committed
    	iAlt              // '|' alternation
    	iNop              // do nothing; makes it easy to link without patching
    )
    
    Rob Pike's avatar
    Rob Pike committed
    // An instruction executed by the NFA
    type instr struct {
    	kind  int    // the type of this instruction: iChar, iAny, etc.
    	index int    // used only in debugging; could be eliminated
    	next  *instr // the instruction to execute after this one
    	// Special fields valid only for some items.
    	char   int        // iChar
    	braNum int        // iBra, iEbra
    	cclass *charClass // iCharClass
    	left   *instr     // iAlt, other branch
    }
    
    func (i *instr) print() {
    	switch i.kind {
    	case iStart:
    		print("start")
    	case iEnd:
    		print("end")
    	case iBOT:
    		print("bot")
    	case iEOT:
    		print("eot")
    	case iChar:
    		print("char ", string(i.char))
    	case iCharClass:
    		i.cclass.print()
    	case iAny:
    		print("any")
    	case iNotNL:
    		print("notnl")
    	case iBra:
    
    		if i.braNum&1 == 0 {
    			print("bra", i.braNum/2)
    		} else {
    			print("ebra", i.braNum/2)
    		}
    
    Rob Pike's avatar
    Rob Pike committed
    	case iAlt:
    		print("alt(", i.left.index, ")")
    	case iNop:
    		print("nop")
    	}
    
    // Regexp is the representation of a compiled regular expression.
    
    // The public interface is entirely through methods.
    
    // A Regexp is safe for concurrent use by multiple goroutines.
    
    type Regexp struct {
    
    	expr        string // the original expression
    	prefix      string // initial plain text string
    	prefixBytes []byte // initial plain text bytes
    
    Rob Pike's avatar
    Rob Pike committed
    	inst        []*instr
    	start       *instr // first instruction of machine
    	prefixStart *instr // where to start if there is a prefix
    	nbra        int    // number of brackets in expression, for subexpressions
    
    Rob Pike's avatar
    Rob Pike committed
    type charClass struct {
    
    	negate bool // is character class negated? ([^a-z])
    
    	// slice of int, stored pairwise: [a-z] is (a,z); x is (x,x):
    	ranges     []int
    
    Rob Pike's avatar
    Rob Pike committed
    func (cclass *charClass) print() {
    
    	print("charclass")
    
    Rob Pike's avatar
    Rob Pike committed
    	if cclass.negate {
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    
    	for i := 0; i < len(cclass.ranges); i += 2 {
    		l := cclass.ranges[i]
    		r := cclass.ranges[i+1]
    
    Rob Pike's avatar
    Rob Pike committed
    		if l == r {
    
    Rob Pike's avatar
    Rob Pike committed
    		} else {
    
    			print(" [", string(l), "-", string(r), "]")
    
    Rob Pike's avatar
    Rob Pike committed
    func (cclass *charClass) addRange(a, b int) {
    
    Rob Pike's avatar
    Rob Pike committed
    	// range is a through b inclusive
    
    	cclass.ranges = append(cclass.ranges, a, b)
    
    	if a < cclass.cmin {
    		cclass.cmin = a
    	}
    	if b > cclass.cmax {
    		cclass.cmax = b
    	}
    
    Rob Pike's avatar
    Rob Pike committed
    func (cclass *charClass) matches(c int) bool {
    
    	if c < cclass.cmin || c > cclass.cmax {
    		return cclass.negate
    	}
    
    	ranges := cclass.ranges
    
    	for i := 0; i < len(ranges); i = i + 2 {
    		if ranges[i] <= c && c <= ranges[i+1] {
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    	}
    
    	return cclass.negate
    
    Rob Pike's avatar
    Rob Pike committed
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    func newCharClass() *instr {
    	i := &instr{kind: iCharClass}
    	i.cclass = new(charClass)
    	i.cclass.ranges = make([]int, 0, 4)
    	i.cclass.cmin = 0x10FFFF + 1 // MaxRune + 1
    	i.cclass.cmax = -1
    	return i
    
    Rob Pike's avatar
    Rob Pike committed
    func (re *Regexp) add(i *instr) *instr {
    	i.index = len(re.inst)
    
    	re.inst = append(re.inst, i)
    
    Rob Pike's avatar
    Rob Pike committed
    type parser struct {
    
    	re    *Regexp
    	nlpar int // number of unclosed lpars
    	pos   int
    	ch    int
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) error(err Error) {
    
    func (p *parser) c() int { return p.ch }
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) nextc() int {
    
    	if p.pos >= len(p.re.expr) {
    
    		c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:])
    		p.ch = c
    		p.pos += w
    
    func newParser(re *Regexp) *parser {
    
    	p := new(parser)
    	p.re = re
    	p.nextc() // load p.ch
    	return p
    
    }
    
    func special(c int) bool {
    
    Rob Pike's avatar
    Rob Pike committed
    	for _, r := range `\.+*?()|[]^$` {
    		if c == r {
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    	}
    
    func ispunct(c int) bool {
    	for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" {
    
    Rob Pike's avatar
    Rob Pike committed
    		if c == r {
    
    var escapes = []byte("abfnrtv")
    var escaped = []byte("\a\b\f\n\r\t\v")
    
    func escape(c int) int {
    	for i, b := range escapes {
    		if int(b) == c {
    			return i
    		}
    	}
    	return -1
    }
    
    
    Ben Lynn's avatar
    Ben Lynn committed
    func (p *parser) checkBackslash() int {
    	c := p.c()
    	if c == '\\' {
    		c = p.nextc()
    		switch {
    
    Ben Lynn's avatar
    Ben Lynn committed
    			p.error(ErrExtraneousBackslash)
    		case ispunct(c):
    			// c is as delivered
    		case escape(c) >= 0:
    			c = int(escaped[escape(c)])
    		default:
    			p.error(ErrBadBackslash)
    		}
    	}
    	return c
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) charClass() *instr {
    	i := newCharClass()
    	cc := i.cclass
    
    Rob Pike's avatar
    Rob Pike committed
    	if p.c() == '^' {
    
    		cc.negate = true
    		p.nextc()
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    
    Rob Pike's avatar
    Rob Pike committed
    	for {
    		switch c := p.c(); c {
    
    Rob Pike's avatar
    Rob Pike committed
    			if left >= 0 {
    
    				p.error(ErrBadRange)
    
    Rob Pike's avatar
    Rob Pike committed
    			}
    
    			if cc.negate && len(cc.ranges) == 2 &&
    				cc.ranges[0] == '\n' && cc.ranges[1] == '\n' {
    
    Rob Pike's avatar
    Rob Pike committed
    				nl := &instr{kind: iNotNL}
    
    				p.re.add(nl)
    				return nl
    
    			// Special common case: "[a]" -> "a"
    
    			if !cc.negate && len(cc.ranges) == 2 && cc.ranges[0] == cc.ranges[1] {
    
    Rob Pike's avatar
    Rob Pike committed
    				c := &instr{kind: iChar, char: cc.ranges[0]}
    
    Rob Pike's avatar
    Rob Pike committed
    			p.re.add(i)
    			return i
    
    		case '-': // do this before backslash processing
    
    			p.error(ErrBadRange)
    
    Rob Pike's avatar
    Rob Pike committed
    		default:
    
    Ben Lynn's avatar
    Ben Lynn committed
    			c = p.checkBackslash()
    
    Rob Pike's avatar
    Rob Pike committed
    			switch {
    
    			case left < 0: // first of pair
    				if p.c() == '-' { // range
    					p.nextc()
    					left = c
    				} else { // single char
    
    Rob Pike's avatar
    Rob Pike committed
    				}
    
    			case left <= c: // second of pair
    				cc.addRange(left, c)
    				left = -1
    
    Rob Pike's avatar
    Rob Pike committed
    			default:
    
    				p.error(ErrBadRange)
    
    Ben Lynn's avatar
    Ben Lynn committed
    	panic("unreachable")
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) term() (start, end *instr) {
    
    	switch c := p.c(); c {
    
    Ben Lynn's avatar
    Ben Lynn committed
    	case '*', '+', '?':
    
    		p.error(ErrBareClosure)
    
    Rob Pike's avatar
    Rob Pike committed
    	case ')':
    
    			p.error(ErrUnmatchedRpar)
    
    Rob Pike's avatar
    Rob Pike committed
    	case ']':
    
    		p.error(ErrUnmatchedRbkt)
    
    Rob Pike's avatar
    Rob Pike committed
    	case '^':
    
    Rob Pike's avatar
    Rob Pike committed
    		start = p.re.add(&instr{kind: iBOT})
    
    		return start, start
    
    Rob Pike's avatar
    Rob Pike committed
    	case '$':
    
    Rob Pike's avatar
    Rob Pike committed
    		start = p.re.add(&instr{kind: iEOT})
    
    		return start, start
    
    Rob Pike's avatar
    Rob Pike committed
    		start = p.re.add(&instr{kind: iAny})
    
    		return start, start
    
    Rob Pike's avatar
    Rob Pike committed
    	case '[':
    
    		p.nextc()
    		start = p.charClass()
    
    Rob Pike's avatar
    Rob Pike committed
    		if p.c() != ']' {
    
    			p.error(ErrUnmatchedLbkt)
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    
    		p.nextc()
    		return start, start
    
    		p.nextc()
    		p.nlpar++
    		p.re.nbra++ // increment first so first subexpr is \1
    		nbra := p.re.nbra
    		start, end = p.regexp()
    
    		if p.c() != ')' {
    
    			p.error(ErrUnmatchedLpar)
    
    		bra := &instr{kind: iBra, braNum: 2 * nbra}
    
    		ebra := &instr{kind: iBra, braNum: 2*nbra + 1}
    
    		if start == nil {
    			if end == nil {
    
    				p.error(ErrInternal)
    
    Rob Pike's avatar
    Rob Pike committed
    			end.next = ebra
    
    Rob Pike's avatar
    Rob Pike committed
    		bra.next = start
    
    Ben Lynn's avatar
    Ben Lynn committed
    		c = p.checkBackslash()
    
    Rob Pike's avatar
    Rob Pike committed
    		start = &instr{kind: iChar, char: c}
    
    		p.re.add(start)
    		return start, start
    
    	panic("unreachable")
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) closure() (start, end *instr) {
    
    	start, end = p.term()
    
    	if start == nil {
    
    	}
    	switch p.c() {
    	case '*':
    		// (start,end)*:
    
    Rob Pike's avatar
    Rob Pike committed
    		alt := &instr{kind: iAlt}
    
    Rob Pike's avatar
    Rob Pike committed
    		end.next = alt   // after end, do alt
    
    		alt.left = start // alternate brach: return to start
    		start = alt      // alt becomes new (start, end)
    		end = alt
    
    	case '+':
    		// (start,end)+:
    
    Rob Pike's avatar
    Rob Pike committed
    		alt := &instr{kind: iAlt}
    
    Rob Pike's avatar
    Rob Pike committed
    		end.next = alt   // after end, do alt
    
    		alt.left = start // alternate brach: return to start
    		end = alt        // start is unchanged; end is alt
    
    	case '?':
    		// (start,end)?:
    
    Rob Pike's avatar
    Rob Pike committed
    		alt := &instr{kind: iAlt}
    
    Rob Pike's avatar
    Rob Pike committed
    		nop := &instr{kind: iNop}
    
    		p.re.add(nop)
    		alt.left = start // alternate branch is start
    
    Rob Pike's avatar
    Rob Pike committed
    		alt.next = nop   // follow on to nop
    		end.next = nop   // after end, go to nop
    
    		start = alt      // start is now alt
    		end = nop        // end is nop pointed to by both branches
    
    	}
    	switch p.nextc() {
    	case '*', '+', '?':
    
    		p.error(ErrBadClosure)
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) concatenation() (start, end *instr) {
    
    		nstart, nend := p.closure()
    
    		case nstart == nil: // end of this concatenation
    			if start == nil { // this is the empty string
    
    Rob Pike's avatar
    Rob Pike committed
    				nop := p.re.add(&instr{kind: iNop})
    
    Rob Pike's avatar
    Rob Pike committed
    			}
    
    			return
    		case start == nil: // this is first element of concatenation
    
    Rob Pike's avatar
    Rob Pike committed
    			end.next = nstart
    
    	panic("unreachable")
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *parser) regexp() (start, end *instr) {
    
    	start, end = p.concatenation()
    
    	for {
    		switch p.c() {
    		default:
    
    			p.nextc()
    			nstart, nend := p.concatenation()
    
    Rob Pike's avatar
    Rob Pike committed
    			alt := &instr{kind: iAlt}
    
    			p.re.add(alt)
    			alt.left = start
    
    Rob Pike's avatar
    Rob Pike committed
    			alt.next = nstart
    			nop := &instr{kind: iNop}
    
    Rob Pike's avatar
    Rob Pike committed
    			end.next = nop
    			nend.next = nop
    
    			start, end = alt, nop
    
    	panic("unreachable")
    
    Rob Pike's avatar
    Rob Pike committed
    func unNop(i *instr) *instr {
    	for i.kind == iNop {
    		i = i.next
    
    func (re *Regexp) eliminateNops() {
    
    	for _, inst := range re.inst {
    
    Rob Pike's avatar
    Rob Pike committed
    		if inst.kind == iEnd {
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    
    Rob Pike's avatar
    Rob Pike committed
    		inst.next = unNop(inst.next)
    		if inst.kind == iAlt {
    			inst.left = unNop(inst.left)
    
    func (re *Regexp) dump() {
    
    	print("prefix <", re.prefix, ">\n")
    
    	for _, inst := range re.inst {
    
    Rob Pike's avatar
    Rob Pike committed
    		print(inst.index, ": ")
    
    Rob Pike's avatar
    Rob Pike committed
    		if inst.kind != iEnd {
    			print(" -> ", inst.next.index)
    
    func (re *Regexp) doParse() {
    
    	p := newParser(re)
    
    Rob Pike's avatar
    Rob Pike committed
    	start := &instr{kind: iStart}
    
    	re.add(start)
    	s, e := p.regexp()
    
    Rob Pike's avatar
    Rob Pike committed
    	start.next = s
    
    Rob Pike's avatar
    Rob Pike committed
    	e.next = re.add(&instr{kind: iEnd})
    
    Rob Pike's avatar
    Rob Pike committed
    	if debug {
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    
    	re.eliminateNops()
    
    Rob Pike's avatar
    Rob Pike committed
    	if debug {
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    
    	re.setPrefix()
    	if debug {
    		re.dump()
    		println()
    
    // Extract regular text from the beginning of the pattern,
    // possibly after a leading iBOT.
    
    // That text can be used by doExecute to speed up matching.
    
    func (re *Regexp) setPrefix() {
    
    	var b []byte
    	var utf = make([]byte, utf8.UTFMax)
    
    Rob Pike's avatar
    Rob Pike committed
    	var inst *instr
    
    	// First instruction is start; skip that.  Also skip any initial iBOT.
    	inst = re.inst[0].next
    	for inst.kind == iBOT {
    		inst = inst.next
    	}
    
    	for ; inst.kind != iEnd; inst = inst.next {
    
    		// stop if this is not a char
    
    Rob Pike's avatar
    Rob Pike committed
    		if inst.kind != iChar {
    
    		// stop if this char can be followed by a match for an empty string,
    		// which includes closures, ^, and $.
    
    Rob Pike's avatar
    Rob Pike committed
    		switch inst.next.kind {
    		case iBOT, iEOT, iAlt:
    
    Rob Pike's avatar
    Rob Pike committed
    		n := utf8.EncodeRune(utf, inst.char)
    
    		b = append(b, utf[0:n]...)
    
    	// point prefixStart instruction to first non-CHAR after prefix
    
    Rob Pike's avatar
    Rob Pike committed
    	re.prefixStart = inst
    
    	re.prefixBytes = b
    	re.prefix = string(b)
    
    // String returns the source text used to compile the regular expression.
    func (re *Regexp) String() string {
    
    // Compile parses a regular expression and returns, if successful, a Regexp
    // object that can be used to match against text.
    
    func Compile(str string) (regexp *Regexp, error os.Error) {
    
    	regexp = new(Regexp)
    
    	// doParse will panic if there is a parse error.
    	defer func() {
    		if e := recover(); e != nil {
    			regexp = nil
    			error = e.(Error) // Will re-panic if error was not an Error, e.g. nil-pointer exception
    		}
    	}()
    
    Rob Pike's avatar
    Rob Pike committed
    	regexp.inst = make([]*instr, 0, 10)
    
    	regexp.doParse()
    
    }
    
    // MustCompile is like Compile but panics if the expression cannot be parsed.
    // It simplifies safe initialization of global variables holding compiled regular
    // expressions.
    func MustCompile(str string) *Regexp {
    
    	regexp, error := Compile(str)
    
    		panic(`regexp: compiling "` + str + `": ` + error.String())
    
    Rob Pike's avatar
    Rob Pike committed
    
    
    // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
    func (re *Regexp) NumSubexp() int { return re.nbra }
    
    
    // The match arena allows us to reduce the garbage generated by tossing
    // match vectors away as we execute.  Matches are ref counted and returned
    // to a free list when no longer active.  Increases a simple benchmark by 22X.
    type matchArena struct {
    
    	head  *matchVec
    	len   int // length of match vector
    	pos   int
    	atBOT bool // whether we're at beginning of text
    	atEOT bool // whether we're at end of text
    
    	m    []int // pairs of bracketing submatches. 0th is start,end
    	ref  int
    	next *matchVec
    
    }
    
    func (a *matchArena) new() *matchVec {
    	if a.head == nil {
    
    		const N = 10
    		block := make([]matchVec, N)
    
    		for i := 0; i < N; i++ {
    
    			b := &block[i]
    			b.next = a.head
    			a.head = b
    
    	m := a.head
    	a.head = m.next
    	m.ref = 0
    
    	if m.m == nil {
    		m.m = make([]int, a.len)
    	}
    
    }
    
    func (a *matchArena) free(m *matchVec) {
    
    		m.next = a.head
    		a.head = m
    
    	}
    }
    
    func (a *matchArena) copy(m *matchVec) *matchVec {
    
    	m1 := a.new()
    	copy(m1.m, m.m)
    	return m1
    
    }
    
    func (a *matchArena) noMatch() *matchVec {
    
    		m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
    
    Rob Pike's avatar
    Rob Pike committed
    type state struct {
    
    Rob Pike's avatar
    Rob Pike committed
    	inst     *instr // next instruction to execute
    	prefixed bool   // this match began with a fixed prefix
    
    Rob Pike's avatar
    Rob Pike committed
    }
    
    // Append new state to to-do list.  Leftmost-longest wins so avoid
    
    // adding a state that's already active.  The matchVec will be inc-ref'ed
    // if it is assigned to a state.
    
    func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec) []state {
    
    Rob Pike's avatar
    Rob Pike committed
    	switch inst.kind {
    	case iBOT:
    
    		if a.atBOT {
    			s = a.addState(s, inst.next, prefixed, match)
    
    Rob Pike's avatar
    Rob Pike committed
    	case iEOT:
    
    		if a.atEOT {
    			s = a.addState(s, inst.next, prefixed, match)
    
    Rob Pike's avatar
    Rob Pike committed
    	case iBra:
    
    		match.m[inst.braNum] = a.pos
    		s = a.addState(s, inst.next, prefixed, match)
    
    	// States are inserted in order so it's sufficient to see if we have the same
    	// instruction; no need to see if existing match is earlier (it is).
    
    Rob Pike's avatar
    Rob Pike committed
    	for i := 0; i < l; i++ {
    
    Rob Pike's avatar
    Rob Pike committed
    		if s[i].inst == inst {
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    
    Russ Cox's avatar
    Russ Cox committed
    	s = append(s, state{inst, prefixed, match})
    
    Rob Pike's avatar
    Rob Pike committed
    	if inst.kind == iAlt {
    
    		s = a.addState(s, inst.left, prefixed, a.copy(match))
    
    		// give other branch a copy of this match vector
    
    		s = a.addState(s, inst.next, prefixed, a.copy(match))
    
    Rob Pike's avatar
    Rob Pike committed
    }
    
    
    // input abstracts different representations of the input text. It provides
    // one-character lookahead.
    type input interface {
    	step(pos int) (rune int, width int) // advance one rune
    	canCheckPrefix() bool               // can we look ahead without losing info?
    	hasPrefix(re *Regexp) bool
    	index(re *Regexp, pos int) int
    }
    
    // inputString scans a string.
    type inputString struct {
    	str string
    }
    
    func newInputString(str string) *inputString {
    	return &inputString{str: str}
    }
    
    func (i *inputString) step(pos int) (int, int) {
    	if pos < len(i.str) {
    		return utf8.DecodeRuneInString(i.str[pos:len(i.str)])
    	}
    	return endOfText, 0
    }
    
    func (i *inputString) canCheckPrefix() bool {
    	return true
    }
    
    func (i *inputString) hasPrefix(re *Regexp) bool {
    	return strings.HasPrefix(i.str, re.prefix)
    }
    
    func (i *inputString) index(re *Regexp, pos int) int {
    	return strings.Index(i.str[pos:], re.prefix)
    }
    
    // inputBytes scans a byte slice.
    type inputBytes struct {
    	str []byte
    }
    
    func newInputBytes(str []byte) *inputBytes {
    	return &inputBytes{str: str}
    }
    
    func (i *inputBytes) step(pos int) (int, int) {
    	if pos < len(i.str) {
    		return utf8.DecodeRune(i.str[pos:len(i.str)])
    	}
    	return endOfText, 0
    }
    
    func (i *inputBytes) canCheckPrefix() bool {
    	return true
    }
    
    func (i *inputBytes) hasPrefix(re *Regexp) bool {
    	return bytes.HasPrefix(i.str, re.prefixBytes)
    }
    
    func (i *inputBytes) index(re *Regexp, pos int) int {
    	return bytes.Index(i.str[pos:], re.prefixBytes)
    }
    
    // inputReader scans a RuneReader.
    type inputReader struct {
    	r     io.RuneReader
    	atEOT bool
    	pos   int
    }
    
    func newInputReader(r io.RuneReader) *inputReader {
    	return &inputReader{r: r}
    }
    
    func (i *inputReader) step(pos int) (int, int) {
    	if !i.atEOT && pos != i.pos {
    		return endOfText, 0
    
    	}
    	r, w, err := i.r.ReadRune()
    	if err != nil {
    		i.atEOT = true
    		return endOfText, 0
    	}
    	i.pos += w
    	return r, w
    }
    
    func (i *inputReader) canCheckPrefix() bool {
    	return false
    }
    
    func (i *inputReader) hasPrefix(re *Regexp) bool {
    	return false
    }
    
    func (i *inputReader) index(re *Regexp, pos int) int {
    	return -1
    }
    
    // Search match starting from pos bytes into the input.
    func (re *Regexp) doExecute(i input, pos int) []int {
    
    	s[0] = make([]state, 0, 10)
    	s[1] = make([]state, 0, 10)
    
    	in, out := 0, 1
    	var final state
    	found := false
    
    	anchored := re.inst[0].next.kind == iBOT
    	if anchored && pos > 0 {
    		return nil
    	}
    
    	// fast check for initial plain substring
    
    	if i.canCheckPrefix() && re.prefix != "" {
    
    		} else {
    
    			advance = i.index(re, pos)
    			if advance == -1 {
    				return nil
    
    Rob Pike's avatar
    Rob Pike committed
    		pos += advance
    
    	// We look one character ahead so we can match $, which checks whether
    	// we are at EOT.
    	nextChar, nextWidth := i.step(pos)
    	arena := &matchArena{
    		len:   2 * (re.nbra + 1),
    		pos:   pos,
    		atBOT: pos == 0,
    		atEOT: nextChar == endOfText,
    	}
    	for c, startPos := 0, pos; c != endOfText; {
    
    		if !found && (pos == startPos || !anchored) {
    
    Rob Pike's avatar
    Rob Pike committed
    			// prime the pump if we haven't seen a match yet
    
    			match := arena.noMatch()
    			match.m[0] = pos
    
    			s[out] = arena.addState(s[out], re.start.next, false, match)
    
    			arena.free(match) // if addState saved it, ref was incremented
    
    		} else if len(s[out]) == 0 {
    			// machine has completed
    			break
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    
    		in, out = out, in // old out state is new in state
    
    		// clear out old state
    
    		for _, state := range old {
    			arena.free(state.match)
    		}
    
    		s[out] = old[0:0] // truncate state vector
    
    		c = nextChar
    		thisPos := pos
    		pos += nextWidth
    		nextChar, nextWidth = i.step(pos)
    		arena.atEOT = nextChar == endOfText
    		arena.atBOT = false
    		arena.pos = pos
    
    		for _, st := range s[in] {
    
    Rob Pike's avatar
    Rob Pike committed
    			switch st.inst.kind {
    			case iBOT:
    			case iEOT:
    			case iChar:
    				if c == st.inst.char {
    
    					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
    
    Rob Pike's avatar
    Rob Pike committed
    				}
    
    Rob Pike's avatar
    Rob Pike committed
    			case iCharClass:
    				if st.inst.cclass.matches(c) {
    
    					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
    
    Rob Pike's avatar
    Rob Pike committed
    				}
    
    Rob Pike's avatar
    Rob Pike committed
    			case iAny:
    
    				if c != endOfText {
    					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
    
    Rob Pike's avatar
    Rob Pike committed
    				}
    
    Rob Pike's avatar
    Rob Pike committed
    			case iNotNL:
    
    				if c != endOfText && c != '\n' {
    					s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
    
    Rob Pike's avatar
    Rob Pike committed
    			case iBra:
    			case iAlt:
    			case iEnd:
    
    Rob Pike's avatar
    Rob Pike committed
    				// choose leftmost longest
    
    				if !found || // first
    					st.match.m[0] < final.match.m[0] || // leftmost
    
    					(st.match.m[0] == final.match.m[0] && thisPos > final.match.m[1]) { // longest
    
    					if final.match != nil {
    						arena.free(final.match)
    					}
    
    					final = st
    					final.match.ref++
    
    Rob Pike's avatar
    Rob Pike committed
    				}
    
    Rob Pike's avatar
    Rob Pike committed
    			default:
    
    				st.inst.print()
    				panic("unknown instruction in execute")
    
    Rob Pike's avatar
    Rob Pike committed
    			}
    		}
    	}
    
    	if final.match == nil {
    		return nil
    	}
    
    	// if match found, back up start of match by width of prefix.
    
    	if final.prefixed && len(final.match.m) > 0 {
    
    		final.match.m[0] -= len(re.prefix)
    
    	return final.match.m
    
    Rob Pike's avatar
    Rob Pike committed
    }
    
    
    // LiteralPrefix returns a literal string that must begin any match
    // of the regular expression re.  It returns the boolean true if the
    // literal string comprises the entire regular expression.
    func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
    	c := make([]int, len(re.inst)-2) // minus start and end.
    	// First instruction is start; skip that.
    	i := 0
    	for inst := re.inst[0].next; inst.kind != iEnd; inst = inst.next {
    		// stop if this is not a char
    		if inst.kind != iChar {
    			return string(c[:i]), false
    		}
    		c[i] = inst.char
    		i++
    	}
    	return string(c[:i]), true
    }
    
    
    // MatchReader returns whether the Regexp matches the text read by the
    // RuneReader.  The return value is a boolean: true for match, false for no
    // match.
    func (re *Regexp) MatchReader(r io.RuneReader) bool {
    	return len(re.doExecute(newInputReader(r), 0)) > 0
    }
    
    
    // MatchString returns whether the Regexp matches the string s.
    
    // The return value is a boolean: true for match, false for no match.
    
    func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(newInputString(s), 0)) > 0 }
    
    
    // Match returns whether the Regexp matches the byte slice b.
    // The return value is a boolean: true for match, false for no match.
    
    func (re *Regexp) Match(b []byte) bool { return len(re.doExecute(newInputBytes(b), 0)) > 0 }
    
    // MatchReader checks whether a textual regular expression matches the text
    // read by the RuneReader.  More complicated queries need to use Compile and
    // the full Regexp interface.