Newer
Older
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package regexp implements a simple regular expression library.
//
// The syntax of the regular expressions accepted is:
//
// regexp:
// concatenation { '|' concatenation }
// concatenation:
// { closure }
// closure:
// term [ '*' | '+' | '?' ]
// term:
// '^'
// '$'
// '.'
// character
// '[' [ '^' ] { character-range } ']'
// character-range:
// All characters are UTF-8-encoded code points. Backslashes escape special
// characters, including inside character classes. The standard Go character
// escapes are also recognized: \a \b \f \n \r \t \v.
//
// There are 16 methods of Regexp that match a regular expression and identify
// the matched text. Their names are matched by this regular expression:
//
// Find(All)?(String)?(Submatch)?(Index)?
//
// If 'All' is present, the routine matches successive non-overlapping
// matches of the entire expression. Empty matches abutting a preceding
// match are ignored. The return value is a slice containing the successive
// return values of the corresponding non-'All' routine. These routines take
// an extra integer argument, n; if n >= 0, the function returns at most n
// matches/submatches.
//
// If 'String' is present, the argument is a string; otherwise it is a slice
// of bytes; return values are adjusted as appropriate.
//
// If 'Submatch' is present, the return value is a slice identifying the
// successive submatches of the expression. Submatches are matches of
// parenthesized subexpressions within the regular expression, numbered from
// left to right in order of opening parenthesis. Submatch 0 is the match of
// the entire expression, submatch 1 the match of the first parenthesized
// subexpression, and so on.
//
// If 'Index' is present, matches and submatches are identified by byte index
// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
// the nth submatch. The pair for n==0 identifies the match of the entire
// expression. If 'Index' is not present, the match is identified by the
// text of the match/submatch. If an index is negative, it means that
// subexpression did not match any string in the input.
//
// There is also a subset of the methods that can be applied to text read
// from a RuneReader:
//
// MatchReader, FindReaderIndex, FindReaderSubmatchIndex
//
// This set may grow. Note that regular expression matches may need to
// examine text beyond the text returned by a match, so the methods that
// match text from a RuneReader may read arbitrarily far into the input
// before returning.
//
// (There are a few other methods that do not match this pattern.)
"bytes"
"io"
"os"
"strings"
"utf8"
// Error is the local type for a parsing error.
type Error string
func (e Error) String() string {
return string(e)
}
ErrInternal = Error("regexp: internal error")
ErrUnmatchedLpar = Error("regexp: unmatched '('")
ErrUnmatchedRpar = Error("regexp: unmatched ')'")
ErrUnmatchedLbkt = Error("regexp: unmatched '['")
ErrUnmatchedRbkt = Error("regexp: unmatched ']'")
ErrBadRange = Error("regexp: bad range in character class")
ErrExtraneousBackslash = Error("regexp: extraneous backslash")
ErrBadClosure = Error("regexp: repeated closure (**, ++, etc.)")
ErrBareClosure = Error("regexp: closure applies to nothing")
ErrBadBackslash = Error("regexp: illegal backslash escape")
const (
iStart = iota // beginning of program
iEnd // end of program: success
iBOT // '^' beginning of text
iEOT // '$' end of text
iChar // 'a' regular character
iCharClass // [a-z] character class
iAny // '.' any character including newline
iNotNL // [^\n] special case: any character but newline
iBra // '(' parenthesized expression: 2*braNum for left, 2*braNum+1 for right
iAlt // '|' alternation
iNop // do nothing; makes it easy to link without patching
)
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// An instruction executed by the NFA
type instr struct {
kind int // the type of this instruction: iChar, iAny, etc.
index int // used only in debugging; could be eliminated
next *instr // the instruction to execute after this one
// Special fields valid only for some items.
char int // iChar
braNum int // iBra, iEbra
cclass *charClass // iCharClass
left *instr // iAlt, other branch
}
func (i *instr) print() {
switch i.kind {
case iStart:
print("start")
case iEnd:
print("end")
case iBOT:
print("bot")
case iEOT:
print("eot")
case iChar:
print("char ", string(i.char))
case iCharClass:
i.cclass.print()
case iAny:
print("any")
case iNotNL:
print("notnl")
case iBra:
if i.braNum&1 == 0 {
print("bra", i.braNum/2)
} else {
print("ebra", i.braNum/2)
}
case iAlt:
print("alt(", i.left.index, ")")
case iNop:
print("nop")
}
// Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods.
// A Regexp is safe for concurrent use by multiple goroutines.
expr string // the original expression
prefix string // initial plain text string
prefixBytes []byte // initial plain text bytes
inst []*instr
start *instr // first instruction of machine
prefixStart *instr // where to start if there is a prefix
nbra int // number of brackets in expression, for subexpressions
negate bool // is character class negated? ([^a-z])
// slice of int, stored pairwise: [a-z] is (a,z); x is (x,x):
ranges []int
cmin, cmax int
print(" (negated)")
for i := 0; i < len(cclass.ranges); i += 2 {
l := cclass.ranges[i]
r := cclass.ranges[i+1]
print(" [", string(l), "]")
print(" [", string(l), "-", string(r), "]")
cclass.ranges = append(cclass.ranges, a, b)
if a < cclass.cmin {
cclass.cmin = a
}
if b > cclass.cmax {
cclass.cmax = b
}
func (cclass *charClass) matches(c int) bool {
if c < cclass.cmin || c > cclass.cmax {
return cclass.negate
}
for i := 0; i < len(ranges); i = i + 2 {
if ranges[i] <= c && c <= ranges[i+1] {
return !cclass.negate
func newCharClass() *instr {
i := &instr{kind: iCharClass}
i.cclass = new(charClass)
i.cclass.ranges = make([]int, 0, 4)
i.cclass.cmin = 0x10FFFF + 1 // MaxRune + 1
i.cclass.cmax = -1
return i
func (re *Regexp) add(i *instr) *instr {
i.index = len(re.inst)
re *Regexp
nlpar int // number of unclosed lpars
pos int
ch int
const endOfText = -1
func (p *parser) c() int { return p.ch }
p.ch = endOfText
c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:])
p.ch = c
p.pos += w
p := new(parser)
p.re = re
p.nextc() // load p.ch
return p
}
func special(c int) bool {
return true
func ispunct(c int) bool {
for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" {
return true
var escapes = []byte("abfnrtv")
var escaped = []byte("\a\b\f\n\r\t\v")
func escape(c int) int {
for i, b := range escapes {
if int(b) == c {
return i
}
}
return -1
}
func (p *parser) checkBackslash() int {
c := p.c()
if c == '\\' {
c = p.nextc()
switch {
case c == endOfText:
p.error(ErrExtraneousBackslash)
case ispunct(c):
// c is as delivered
case escape(c) >= 0:
c = int(escaped[escape(c)])
default:
p.error(ErrBadBackslash)
}
}
return c
}
func (p *parser) charClass() *instr {
i := newCharClass()
cc := i.cclass
cc.negate = true
p.nextc()
case ']', endOfText:
if cc.negate && len(cc.ranges) == 2 &&
cc.ranges[0] == '\n' && cc.ranges[1] == '\n' {
p.re.add(nl)
return nl
if !cc.negate && len(cc.ranges) == 2 && cc.ranges[0] == cc.ranges[1] {
p.re.add(c)
return c
case '-': // do this before backslash processing
case left < 0: // first of pair
if p.c() == '-' { // range
p.nextc()
left = c
} else { // single char
cc.addRange(c, c)
case left <= c: // second of pair
cc.addRange(left, c)
left = -1
case '|', endOfText:
return nil, nil
p.nextc()
start = p.charClass()
p.nextc()
return start, start
p.nextc()
p.nlpar++
p.re.nbra++ // increment first so first subexpr is \1
nbra := p.re.nbra
start, end = p.regexp()
bra := &instr{kind: iBra, braNum: 2 * nbra}
ebra := &instr{kind: iBra, braNum: 2*nbra + 1}
if start == nil {
if end == nil {
p.re.add(start)
return start, start
func (p *parser) closure() (start, end *instr) {
return
}
switch p.c() {
case '*':
// (start,end)*:
alt.left = start // alternate brach: return to start
start = alt // alt becomes new (start, end)
end = alt
alt.left = start // alternate brach: return to start
end = alt // start is unchanged; end is alt
p.re.add(nop)
alt.left = start // alternate branch is start
alt.next = nop // follow on to nop
end.next = nop // after end, go to nop
start = alt // start is now alt
end = nop // end is nop pointed to by both branches
return
}
switch p.nextc() {
case '*', '+', '?':
func (p *parser) concatenation() (start, end *instr) {
nstart, nend := p.closure()
case nstart == nil: // end of this concatenation
if start == nil { // this is the empty string
return
case start == nil: // this is first element of concatenation
start, end = nstart, nend
func (p *parser) regexp() (start, end *instr) {
start, end = p.concatenation()
for {
switch p.c() {
default:
return
p.nextc()
nstart, nend := p.concatenation()
p.re.add(alt)
alt.left = start
alt.next = nstart
nop := &instr{kind: iNop}
func unNop(i *instr) *instr {
for i.kind == iNop {
i = i.next
continue
inst.next = unNop(inst.next)
if inst.kind == iAlt {
inst.left = unNop(inst.left)
print("prefix <", re.prefix, ">\n")
if inst.kind != iEnd {
print(" -> ", inst.next.index)
re.add(start)
s, e := p.regexp()
re.setPrefix()
if debug {
re.dump()
println()
// Extract regular text from the beginning of the pattern,
// possibly after a leading iBOT.
// That text can be used by doExecute to speed up matching.
var b []byte
var utf = make([]byte, utf8.UTFMax)
// First instruction is start; skip that. Also skip any initial iBOT.
inst = re.inst[0].next
for inst.kind == iBOT {
inst = inst.next
}
Rob Pike
committed
Loop:
for ; inst.kind != iEnd; inst = inst.next {
Rob Pike
committed
// stop if this char can be followed by a match for an empty string,
// which includes closures, ^, and $.
switch inst.next.kind {
case iBOT, iEOT, iAlt:
Rob Pike
committed
break Loop
b = append(b, utf[0:n]...)
// point prefixStart instruction to first non-CHAR after prefix
re.prefixBytes = b
re.prefix = string(b)
// String returns the source text used to compile the regular expression.
func (re *Regexp) String() string {
// Compile parses a regular expression and returns, if successful, a Regexp
// object that can be used to match against text.
Rob Pike
committed
func Compile(str string) (regexp *Regexp, error os.Error) {
// doParse will panic if there is a parse error.
defer func() {
if e := recover(); e != nil {
regexp = nil
error = e.(Error) // Will re-panic if error was not an Error, e.g. nil-pointer exception
}
}()
}
// MustCompile is like Compile but panics if the expression cannot be parsed.
// It simplifies safe initialization of global variables holding compiled regular
// expressions.
func MustCompile(str string) *Regexp {
regexp, error := Compile(str)
panic(`regexp: compiling "` + str + `": ` + error.String())
// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
func (re *Regexp) NumSubexp() int { return re.nbra }
// The match arena allows us to reduce the garbage generated by tossing
// match vectors away as we execute. Matches are ref counted and returned
// to a free list when no longer active. Increases a simple benchmark by 22X.
type matchArena struct {
head *matchVec
len int // length of match vector
pos int
atBOT bool // whether we're at beginning of text
atEOT bool // whether we're at end of text
}
type matchVec struct {
m []int // pairs of bracketing submatches. 0th is start,end
ref int
next *matchVec
}
func (a *matchArena) new() *matchVec {
if a.head == nil {
const N = 10
block := make([]matchVec, N)
for i := 0; i < N; i++ {
b := &block[i]
b.next = a.head
a.head = b
m := a.head
a.head = m.next
m.ref = 0
if m.m == nil {
m.m = make([]int, a.len)
}
}
func (a *matchArena) free(m *matchVec) {
m.next = a.head
a.head = m
}
}
func (a *matchArena) copy(m *matchVec) *matchVec {
m1 := a.new()
copy(m1.m, m.m)
return m1
}
func (a *matchArena) noMatch() *matchVec {
for i := range m.m {
m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
inst *instr // next instruction to execute
prefixed bool // this match began with a fixed prefix
match *matchVec
}
// Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active. The matchVec will be inc-ref'ed
// if it is assigned to a state.
func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec) []state {
if a.atBOT {
s = a.addState(s, inst.next, prefixed, match)
if a.atEOT {
s = a.addState(s, inst.next, prefixed, match)
match.m[inst.braNum] = a.pos
s = a.addState(s, inst.next, prefixed, match)
// States are inserted in order so it's sufficient to see if we have the same
// instruction; no need to see if existing match is earlier (it is).
return s
s = a.addState(s, inst.left, prefixed, a.copy(match))
// give other branch a copy of this match vector
s = a.addState(s, inst.next, prefixed, a.copy(match))
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
// input abstracts different representations of the input text. It provides
// one-character lookahead.
type input interface {
step(pos int) (rune int, width int) // advance one rune
canCheckPrefix() bool // can we look ahead without losing info?
hasPrefix(re *Regexp) bool
index(re *Regexp, pos int) int
}
// inputString scans a string.
type inputString struct {
str string
}
func newInputString(str string) *inputString {
return &inputString{str: str}
}
func (i *inputString) step(pos int) (int, int) {
if pos < len(i.str) {
return utf8.DecodeRuneInString(i.str[pos:len(i.str)])
}
return endOfText, 0
}
func (i *inputString) canCheckPrefix() bool {
return true
}
func (i *inputString) hasPrefix(re *Regexp) bool {
return strings.HasPrefix(i.str, re.prefix)
}
func (i *inputString) index(re *Regexp, pos int) int {
return strings.Index(i.str[pos:], re.prefix)
}
// inputBytes scans a byte slice.
type inputBytes struct {
str []byte
}
func newInputBytes(str []byte) *inputBytes {
return &inputBytes{str: str}
}
func (i *inputBytes) step(pos int) (int, int) {
if pos < len(i.str) {
return utf8.DecodeRune(i.str[pos:len(i.str)])
}
return endOfText, 0
}
func (i *inputBytes) canCheckPrefix() bool {
return true
}
func (i *inputBytes) hasPrefix(re *Regexp) bool {
return bytes.HasPrefix(i.str, re.prefixBytes)
}
func (i *inputBytes) index(re *Regexp, pos int) int {
return bytes.Index(i.str[pos:], re.prefixBytes)
}
// inputReader scans a RuneReader.
type inputReader struct {
r io.RuneReader
atEOT bool
pos int
}
func newInputReader(r io.RuneReader) *inputReader {
return &inputReader{r: r}
}
func (i *inputReader) step(pos int) (int, int) {
if !i.atEOT && pos != i.pos {
return endOfText, 0
}
r, w, err := i.r.ReadRune()
if err != nil {
i.atEOT = true
return endOfText, 0
}
i.pos += w
return r, w
}
func (i *inputReader) canCheckPrefix() bool {
return false
}
func (i *inputReader) hasPrefix(re *Regexp) bool {
return false
}
func (i *inputReader) index(re *Regexp, pos int) int {
return -1
}
// Search match starting from pos bytes into the input.
func (re *Regexp) doExecute(i input, pos int) []int {
s[0] = make([]state, 0, 10)
s[1] = make([]state, 0, 10)
in, out := 0, 1
var final state
found := false
anchored := re.inst[0].next.kind == iBOT
if anchored && pos > 0 {
return nil
}
if i.canCheckPrefix() && re.prefix != "" {
advance := 0
if anchored {
if !i.hasPrefix(re) {
return nil
advance = i.index(re, pos)
if advance == -1 {
return nil
// We look one character ahead so we can match $, which checks whether
// we are at EOT.
nextChar, nextWidth := i.step(pos)
arena := &matchArena{
len: 2 * (re.nbra + 1),
pos: pos,
atBOT: pos == 0,
atEOT: nextChar == endOfText,
}
for c, startPos := 0, pos; c != endOfText; {
if !found && (pos == startPos || !anchored) {
match := arena.noMatch()
match.m[0] = pos
s[out] = arena.addState(s[out], re.start.next, false, match)
arena.free(match) // if addState saved it, ref was incremented
} else if len(s[out]) == 0 {
// machine has completed
break
in, out = out, in // old out state is new in state
// clear out old state
for _, state := range old {
arena.free(state.match)
}
s[out] = old[0:0] // truncate state vector
c = nextChar
thisPos := pos
pos += nextWidth
nextChar, nextWidth = i.step(pos)
arena.atEOT = nextChar == endOfText
arena.atBOT = false
arena.pos = pos
for _, st := range s[in] {
switch st.inst.kind {
case iBOT:
case iEOT:
case iChar:
if c == st.inst.char {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
case iCharClass:
if st.inst.cclass.matches(c) {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
if c != endOfText {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
if c != endOfText && c != '\n' {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match)
if !found || // first
st.match.m[0] < final.match.m[0] || // leftmost
(st.match.m[0] == final.match.m[0] && thisPos > final.match.m[1]) { // longest
if final.match != nil {
arena.free(final.match)
}
final = st
final.match.ref++
final.match.m[1] = thisPos
st.inst.print()
panic("unknown instruction in execute")
if final.match == nil {
return nil
}
// if match found, back up start of match by width of prefix.
if final.prefixed && len(final.match.m) > 0 {
final.match.m[0] -= len(re.prefix)
// LiteralPrefix returns a literal string that must begin any match
// of the regular expression re. It returns the boolean true if the
// literal string comprises the entire regular expression.
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
c := make([]int, len(re.inst)-2) // minus start and end.
// First instruction is start; skip that.
i := 0
for inst := re.inst[0].next; inst.kind != iEnd; inst = inst.next {
// stop if this is not a char
if inst.kind != iChar {
return string(c[:i]), false
}
c[i] = inst.char
i++
}
return string(c[:i]), true
}
// MatchReader returns whether the Regexp matches the text read by the
// RuneReader. The return value is a boolean: true for match, false for no
// match.
func (re *Regexp) MatchReader(r io.RuneReader) bool {
return len(re.doExecute(newInputReader(r), 0)) > 0
}
// MatchString returns whether the Regexp matches the string s.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(newInputString(s), 0)) > 0 }
// Match returns whether the Regexp matches the byte slice b.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) Match(b []byte) bool { return len(re.doExecute(newInputBytes(b), 0)) > 0 }
Rob Pike
committed
// MatchReader checks whether a textual regular expression matches the text
// read by the RuneReader. More complicated queries need to use Compile and
// the full Regexp interface.