Newer
Older
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package regexp implements a simple regular expression library.
//
// The syntax of the regular expressions accepted is:
//
// regexp:
// concatenation { '|' concatenation }
// concatenation:
// { closure }
// closure:
// term [ '*' | '+' | '?' ]
// term:
// '^'
// '$'
// '.'
// character
// '[' [ '^' ] { character-range } ']'
// character-range:
// All characters are UTF-8-encoded code points. Backslashes escape special
// characters, including inside character classes. The standard Go character
// escapes are also recognized: \a \b \f \n \r \t \v.
//
// There are 16 methods of Regexp that match a regular expression and identify
// the matched text. Their names are matched by this regular expression:
//
// Find(All)?(String)?(Submatch)?(Index)?
//
// If 'All' is present, the routine matches successive non-overlapping
// matches of the entire expression. Empty matches abutting a preceding
// match are ignored. The return value is a slice containing the successive
// return values of the corresponding non-'All' routine. These routines take
// an extra integer argument, n; if n >= 0, the function returns at most n
// matches/submatches.
//
// If 'String' is present, the argument is a string; otherwise it is a slice
// of bytes; return values are adjusted as appropriate.
//
// If 'Submatch' is present, the return value is a slice identifying the
// successive submatches of the expression. Submatches are matches of
// parenthesized subexpressions within the regular expression, numbered from
// left to right in order of opening parenthesis. Submatch 0 is the match of
// the entire expression, submatch 1 the match of the first parenthesized
// subexpression, and so on.
//
// If 'Index' is present, matches and submatches are identified by byte index
// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
// the nth submatch. The pair for n==0 identifies the match of the entire
// expression. If 'Index' is not present, the match is identified by the
// text of the match/submatch. If an index is negative, it means that
// subexpression did not match any string in the input.
//
// (There are a few other methods that do not match this pattern.)
"bytes"
"io"
"os"
"strings"
"utf8"
// Error is the local type for a parsing error.
type Error string
func (e Error) String() string {
return string(e)
}
ErrInternal = Error("internal error")
ErrUnmatchedLpar = Error("unmatched '('")
ErrUnmatchedRpar = Error("unmatched ')'")
ErrUnmatchedLbkt = Error("unmatched '['")
ErrUnmatchedRbkt = Error("unmatched ']'")
ErrBadRange = Error("bad range in character class")
ErrExtraneousBackslash = Error("extraneous backslash")
ErrBadClosure = Error("repeated closure (**, ++, etc.)")
ErrBareClosure = Error("closure applies to nothing")
ErrBadBackslash = Error("illegal backslash escape")
const (
iStart = iota // beginning of program
iEnd // end of program: success
iBOT // '^' beginning of text
iEOT // '$' end of text
iChar // 'a' regular character
iCharClass // [a-z] character class
iAny // '.' any character including newline
iNotNL // [^\n] special case: any character but newline
iBra // '(' parenthesized expression: 2*braNum for left, 2*braNum+1 for right
iAlt // '|' alternation
iNop // do nothing; makes it easy to link without patching
)
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
// An instruction executed by the NFA
type instr struct {
kind int // the type of this instruction: iChar, iAny, etc.
index int // used only in debugging; could be eliminated
next *instr // the instruction to execute after this one
// Special fields valid only for some items.
char int // iChar
braNum int // iBra, iEbra
cclass *charClass // iCharClass
left *instr // iAlt, other branch
}
func (i *instr) print() {
switch i.kind {
case iStart:
print("start")
case iEnd:
print("end")
case iBOT:
print("bot")
case iEOT:
print("eot")
case iChar:
print("char ", string(i.char))
case iCharClass:
i.cclass.print()
case iAny:
print("any")
case iNotNL:
print("notnl")
case iBra:
if i.braNum&1 == 0 {
print("bra", i.braNum/2)
} else {
print("ebra", i.braNum/2)
}
case iAlt:
print("alt(", i.left.index, ")")
case iNop:
print("nop")
}
// Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods.
type Regexp struct {
expr string // the original expression
prefix string // initial plain text string
prefixBytes []byte // initial plain text bytes
inst []*instr
start *instr // first instruction of machine
prefixStart *instr // where to start if there is a prefix
nbra int // number of brackets in expression, for subexpressions
negate bool // is character class negated? ([^a-z])
// slice of int, stored pairwise: [a-z] is (a,z); x is (x,x):
ranges []int
cmin, cmax int
print(" (negated)")
for i := 0; i < len(cclass.ranges); i += 2 {
l := cclass.ranges[i]
r := cclass.ranges[i+1]
print(" [", string(l), "]")
print(" [", string(l), "-", string(r), "]")
cclass.ranges = append(cclass.ranges, a, b)
if a < cclass.cmin {
cclass.cmin = a
}
if b > cclass.cmax {
cclass.cmax = b
}
func (cclass *charClass) matches(c int) bool {
if c < cclass.cmin || c > cclass.cmax {
return cclass.negate
}
for i := 0; i < len(ranges); i = i + 2 {
if ranges[i] <= c && c <= ranges[i+1] {
return !cclass.negate
func newCharClass() *instr {
i := &instr{kind: iCharClass}
i.cclass = new(charClass)
i.cclass.ranges = make([]int, 0, 4)
i.cclass.cmin = 0x10FFFF + 1 // MaxRune + 1
i.cclass.cmax = -1
return i
func (re *Regexp) add(i *instr) *instr {
i.index = len(re.inst)
re *Regexp
nlpar int // number of unclosed lpars
pos int
ch int
func (p *parser) c() int { return p.ch }
p.ch = endOfFile
c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:])
p.ch = c
p.pos += w
p := new(parser)
p.re = re
p.nextc() // load p.ch
return p
}
func special(c int) bool {
return true
func ispunct(c int) bool {
for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" {
return true
var escapes = []byte("abfnrtv")
var escaped = []byte("\a\b\f\n\r\t\v")
func escape(c int) int {
for i, b := range escapes {
if int(b) == c {
return i
}
}
return -1
}
func (p *parser) charClass() *instr {
i := newCharClass()
cc := i.cclass
cc.negate = true
p.nextc()
if cc.negate && len(cc.ranges) == 2 &&
cc.ranges[0] == '\n' && cc.ranges[1] == '\n' {
p.re.add(nl)
return nl
if !cc.negate && len(cc.ranges) == 2 && cc.ranges[0] == cc.ranges[1] {
p.re.add(c)
return c
case '-': // do this before backslash processing
case escape(c) >= 0:
c = int(escaped[escape(c)])
case left < 0: // first of pair
if p.c() == '-' { // range
p.nextc()
left = c
} else { // single char
cc.addRange(c, c)
case left <= c: // second of pair
cc.addRange(left, c)
left = -1
return nil, nil
p.nextc()
start = p.charClass()
p.nextc()
return start, start
p.nextc()
p.nlpar++
p.re.nbra++ // increment first so first subexpr is \1
nbra := p.re.nbra
start, end = p.regexp()
bra := &instr{kind: iBra, braNum: 2 * nbra}
ebra := &instr{kind: iBra, braNum: 2*nbra + 1}
if start == nil {
if end == nil {
case escape(c) >= 0:
c = int(escaped[escape(c)])
p.re.add(start)
return start, start
func (p *parser) closure() (start, end *instr) {
return
}
switch p.c() {
case '*':
// (start,end)*:
alt.left = start // alternate brach: return to start
start = alt // alt becomes new (start, end)
end = alt
alt.left = start // alternate brach: return to start
end = alt // start is unchanged; end is alt
p.re.add(nop)
alt.left = start // alternate branch is start
alt.next = nop // follow on to nop
end.next = nop // after end, go to nop
start = alt // start is now alt
end = nop // end is nop pointed to by both branches
return
}
switch p.nextc() {
case '*', '+', '?':
func (p *parser) concatenation() (start, end *instr) {
nstart, nend := p.closure()
case nstart == nil: // end of this concatenation
if start == nil { // this is the empty string
return
case start == nil: // this is first element of concatenation
start, end = nstart, nend
func (p *parser) regexp() (start, end *instr) {
start, end = p.concatenation()
for {
switch p.c() {
default:
return
p.nextc()
nstart, nend := p.concatenation()
p.re.add(alt)
alt.left = start
alt.next = nstart
nop := &instr{kind: iNop}
func unNop(i *instr) *instr {
for i.kind == iNop {
i = i.next
continue
inst.next = unNop(inst.next)
if inst.kind == iAlt {
inst.left = unNop(inst.left)
print("prefix <", re.prefix, ">\n")
if inst.kind != iEnd {
print(" -> ", inst.next.index)
re.add(start)
s, e := p.regexp()
re.setPrefix()
if debug {
re.dump()
println()
// Extract regular text from the beginning of the pattern.
// That text can be used by doExecute to speed up matching.
var b []byte
var utf = make([]byte, utf8.UTFMax)
Rob Pike
committed
Loop:
for inst = re.inst[0].next; inst.kind != iEnd; inst = inst.next {
Rob Pike
committed
// stop if this char can be followed by a match for an empty string,
// which includes closures, ^, and $.
switch inst.next.kind {
case iBOT, iEOT, iAlt:
Rob Pike
committed
break Loop
b = append(b, utf[0:n]...)
// point prefixStart instruction to first non-CHAR after prefix
re.prefixBytes = b
re.prefix = string(b)
// Compile parses a regular expression and returns, if successful, a Regexp
// object that can be used to match against text.
Rob Pike
committed
func Compile(str string) (regexp *Regexp, error os.Error) {
// doParse will panic if there is a parse error.
defer func() {
if e := recover(); e != nil {
regexp = nil
error = e.(Error) // Will re-panic if error was not an Error, e.g. nil-pointer exception
}
}()
}
// MustCompile is like Compile but panics if the expression cannot be parsed.
// It simplifies safe initialization of global variables holding compiled regular
// expressions.
func MustCompile(str string) *Regexp {
regexp, error := Compile(str)
panic(`regexp: compiling "` + str + `": ` + error.String())
// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
func (re *Regexp) NumSubexp() int { return re.nbra }
// The match arena allows us to reduce the garbage generated by tossing
// match vectors away as we execute. Matches are ref counted and returned
// to a free list when no longer active. Increases a simple benchmark by 22X.
type matchArena struct {
head *matchVec
len int // length of match vector
}
type matchVec struct {
m []int // pairs of bracketing submatches. 0th is start,end
ref int
next *matchVec
}
func (a *matchArena) new() *matchVec {
if a.head == nil {
const N = 10
block := make([]matchVec, N)
for i := 0; i < N; i++ {
b := &block[i]
b.next = a.head
a.head = b
m := a.head
a.head = m.next
m.ref = 0
if m.m == nil {
m.m = make([]int, a.len)
}
}
func (a *matchArena) free(m *matchVec) {
m.next = a.head
a.head = m
}
}
func (a *matchArena) copy(m *matchVec) *matchVec {
m1 := a.new()
copy(m1.m, m.m)
return m1
}
func (a *matchArena) noMatch() *matchVec {
for i := range m.m {
m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
inst *instr // next instruction to execute
prefixed bool // this match began with a fixed prefix
match *matchVec
}
// Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active. The matchVec will be inc-ref'ed
// if it is assigned to a state.
func (a *matchArena) addState(s []state, inst *instr, prefixed bool, match *matchVec, pos, end int) []state {
switch inst.kind {
case iBOT:
if pos == 0 {
s = a.addState(s, inst.next, prefixed, match, pos, end)
if pos == end {
s = a.addState(s, inst.next, prefixed, match, pos, end)
match.m[inst.braNum] = pos
s = a.addState(s, inst.next, prefixed, match, pos, end)
// States are inserted in order so it's sufficient to see if we have the same
// instruction; no need to see if existing match is earlier (it is).
return s
if inst.kind == iAlt {
s = a.addState(s, inst.left, prefixed, a.copy(match), pos, end)
// give other branch a copy of this match vector
s = a.addState(s, inst.next, prefixed, a.copy(match), pos, end)
// Accepts either string or bytes - the logic is identical either way.
// If bytes == nil, scan str.
func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
s[0] = make([]state, 0, 10)
s[1] = make([]state, 0, 10)
in, out := 0, 1
var final state
found := false
end := len(str)
if bytestr != nil {
end = len(bytestr)
}
// fast check for initial plain substring
prefixed := false // has this iteration begun by skipping a prefix?
advance = strings.Index(str[pos:], re.prefix)
advance = bytes.Index(bytestr[pos:], re.prefixBytes)
pos += advance + len(re.prefix)
arena := &matchArena{nil, 2 * (re.nbra + 1)}
for pos <= end {
if !found {
// prime the pump if we haven't seen a match yet
match := arena.noMatch()
match.m[0] = pos
s[out] = arena.addState(s[out], re.prefixStart, true, match, pos, end)
prefixed = false // next iteration should start at beginning of machine.
} else {
s[out] = arena.addState(s[out], re.start.next, false, match, pos, end)
arena.free(match) // if addState saved it, ref was incremented
in, out = out, in // old out state is new in state
// clear out old state
for _, state := range old {
arena.free(state.match)
}
s[out] = old[0:0] // truncate state vector
if found && len(s[in]) == 0 {
charwidth := 1
c := endOfFile
if pos < end {
c, charwidth = utf8.DecodeRuneInString(str[pos:end])
} else {
c, charwidth = utf8.DecodeRune(bytestr[pos:end])
for _, st := range s[in] {
switch st.inst.kind {
case iBOT:
case iEOT:
case iChar:
if c == st.inst.char {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
case iCharClass:
if st.inst.cclass.matches(c) {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
if c != endOfFile && c != '\n' {
s[out] = arena.addState(s[out], st.inst.next, st.prefixed, st.match, pos, end)
if !found || // first
st.match.m[0] < final.match.m[0] || // leftmost
(st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest
if final.match != nil {
arena.free(final.match)
}
final = st
final.match.ref++
final.match.m[1] = pos - charwidth
st.inst.print()
panic("unknown instruction in execute")
if final.match == nil {
return nil
}
// if match found, back up start of match by width of prefix.
if final.prefixed && len(final.match.m) > 0 {
final.match.m[0] -= len(re.prefix)
// MatchString returns whether the Regexp matches the string s.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 }
// Match returns whether the Regexp matches the byte slice b.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 }
Rob Pike
committed
// MatchString checks whether a textual regular expression
// matches a string. More complicated queries need
// to use Compile and the full Regexp interface.
func MatchString(pattern string, s string) (matched bool, error os.Error) {
re, err := Compile(pattern)
if err != nil {
return false, err
return re.MatchString(s), nil
// Match checks whether a textual regular expression
// matches a byte slice. More complicated queries need
// to use Compile and the full Regexp interface.
func Match(pattern string, b []byte) (matched bool, error os.Error) {
re, err := Compile(pattern)
Rob Pike
committed
if err != nil {
return false, err
Rob Pike
committed
}
Rob Pike
committed
}
// ReplaceAllString returns a copy of src in which all matches for the Regexp
// have been replaced by repl. No support is provided for expressions
// (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllString(src, repl string) string {
return re.ReplaceAllStringFunc(src, func(string) string { return repl })
}
// ReplaceAllStringFunc returns a copy of src in which all matches for the
// Regexp have been replaced by the return value of of function repl (whose
// first argument is the matched string). No support is provided for
// expressions (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
lastMatchEnd := 0 // end position of the most recent match
searchPos := 0 // position where we next look for a match
buf := new(bytes.Buffer)
a := re.doExecute(src, nil, searchPos)
break // no more matches
}
// Copy the unmatched characters before this match.
io.WriteString(buf, src[lastMatchEnd:a[0]])
// Now insert a copy of the replacement string, but not for a
// match of the empty string immediately after another match.
// (Otherwise, we get double replacement for patterns that
// match both empty and nonempty strings.)
if a[1] > lastMatchEnd || a[0] == 0 {
io.WriteString(buf, repl(src[a[0]:a[1]]))
// Advance past this match; always advance at least one character.
_, width := utf8.DecodeRuneInString(src[searchPos:])
if searchPos+width > a[1] {
searchPos += width
} else if searchPos+1 > a[1] {
// This clause is only needed at the end of the input
// string. In that case, DecodeRuneInString returns width=0.
searchPos++
searchPos = a[1]
}
}
// Copy the unmatched characters after the last match.
io.WriteString(buf, src[lastMatchEnd:])
// ReplaceAll returns a copy of src in which all matches for the Regexp
// have been replaced by repl. No support is provided for expressions
// (e.g. \1 or $1) in the replacement text.
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
return re.ReplaceAllFunc(src, func([]byte) []byte { return repl })
}
// ReplaceAllFunc returns a copy of src in which all matches for the
// Regexp have been replaced by the return value of of function repl (whose
// first argument is the matched []byte). No support is provided for
// expressions (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
lastMatchEnd := 0 // end position of the most recent match
searchPos := 0 // position where we next look for a match
buf := new(bytes.Buffer)
for searchPos <= len(src) {
a := re.doExecute("", src, searchPos)
if len(a) == 0 {
break // no more matches
}
// Copy the unmatched characters before this match.
buf.Write(src[lastMatchEnd:a[0]])
// Now insert a copy of the replacement string, but not for a
// match of the empty string immediately after another match.
// (Otherwise, we get double replacement for patterns that
// match both empty and nonempty strings.)
if a[1] > lastMatchEnd || a[0] == 0 {
buf.Write(repl(src[a[0]:a[1]]))
// Advance past this match; always advance at least one character.
_, width := utf8.DecodeRune(src[searchPos:])
if searchPos+width > a[1] {
searchPos += width
} else if searchPos+1 > a[1] {
// This clause is only needed at the end of the input
// string. In that case, DecodeRuneInString returns width=0.
searchPos++
} else {
searchPos = a[1]
}
}
// Copy the unmatched characters after the last match.
buf.Write(src[lastMatchEnd:])
// QuoteMeta returns a string that quotes all regular expression metacharacters
// inside the argument text; the returned string is a regular expression matching
// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
func QuoteMeta(s string) string {
b := make([]byte, 2*len(s))
// A byte loop is correct because all metacharacters are ASCII.
for i := 0; i < len(s); i++ {
if special(int(s[i])) {