Newer
Older
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression library.
package regexp
export var ErrInternal = os.NewError("internal error");
export var ErrUnmatchedLpar = os.NewError("unmatched '('");
export var ErrUnmatchedRpar = os.NewError("unmatched ')'");
export var ErrUnmatchedLbkt = os.NewError("unmatched '['");
export var ErrUnmatchedRbkt = os.NewError("unmatched ']'");
export var ErrBadRange = os.NewError("bad range in character class");
export var ErrExtraneousBackslash = os.NewError("extraneous backslash");
export var ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)");
export var ErrBareClosure = os.NewError("closure applies to nothing");
export var ErrBadBackslash = os.NewError("illegal backslash escape");
// An instruction executed by the NFA
type Inst interface {
Type() int; // the type of this instruction: CHAR, ANY, etc.
Next() Inst; // the instruction to execute after this one
SetNext(i Inst);
// Fields and methods common to all instructions
type Common struct {
next Inst;
index int;
}
func (c *Common) Next() Inst { return c.next }
func (c *Common) SetNext(i Inst) { c.next = i }
func (c *Common) Index() int { return c.index }
func (c *Common) SetIndex(i int) { c.index = i }
type RE struct {
expr string; // the original expression
ch chan<- *RE; // reply channel when we're done
error *os.Error; // compile- or run-time error; nil if OK
inst *array.Array;
nbra int; // number of brackets in expression, for subexpressions
= iota;
END; // end of program: success
BOT; // '^' beginning of text
EOT; // '$' end of text
CHAR; // 'a' regular character
ANY; // '.' any character
BRA; // '(' parenthesized expression
EBRA; // ')'; end of '(' parenthesized expression
ALT; // '|' alternation
NOP; // do nothing; makes it easy to link without patching
)
// --- START start of program
type Start struct {
}
func (start *Start) Type() int { return START }
// --- END end of program
type End struct {
}
func (end *End) Type() int { return END }
// --- BOT beginning of text
type Bot struct {
}
func (bot *Bot) Type() int { return BOT }
// --- EOT end of text
type Eot struct {
}
func (eot *Eot) Type() int { return EOT }
// --- CHAR a regular character
type Char struct {
}
func (char *Char) Type() int { return CHAR }
func (char *Char) Print() { print("char ", string(char.char)) }
c.char = char;
return c;
}
// --- CHARCLASS [a-z]
type CharClass struct {
char int;
negate bool; // is character class negated? ([^a-z])
// array of int, stored pairwise: [a-z] is (a,z); x is (x,x):
ranges *array.IntArray;
func (cclass *CharClass) Print() {
print("charclass");
if cclass.negate {
print(" (negated)");
}
for i := 0; i < cclass.ranges.Len(); i += 2 {
l := cclass.ranges.At(i);
r := cclass.ranges.At(i+1);
if l == r {
print(" [", string(l), "]");
} else {
print(" [", string(l), "-", string(r), "]");
}
}
}
func (cclass *CharClass) AddRange(a, b int) {
cclass.ranges.Push(a);
cclass.ranges.Push(b);
func (cclass *CharClass) Matches(c int) bool {
for i := 0; i < cclass.ranges.Len(); i = i+2 {
min := cclass.ranges.At(i);
max := cclass.ranges.At(i+1);
if min <= c && c <= max {
return !cclass.negate
}
}
return cclass.negate
}
c.ranges = array.NewIntArray(0);
// --- ANY any character
type Any struct {
}
func (any *Any) Type() int { return ANY }
// --- BRA parenthesized expression
type Bra struct {
n int; // subexpression number
}
func (bra *Bra) Type() int { return BRA }
func (bra *Bra) Print() { print("bra", bra.n); }
// --- EBRA end of parenthesized expression
type Ebra struct {
n int; // subexpression number
}
// --- ALT alternation
type Alt struct {
left Inst; // other branch
}
func (alt *Alt) Type() int { return ALT }
func (alt *Alt) Print() { print("alt(", alt.left.Index(), ")"); }
// --- NOP no operation
type Nop struct {
}
func (nop *Nop) Type() int { return NOP }
// report error and exit compiling/executing goroutine
func (re *RE) Error(err *os.Error) {
re.error = err;
re.ch <- re;
sys.goexit();
}
func (re *RE) Add(i Inst) Inst {
i.SetIndex(re.inst.Len());
re.inst.Push(i);
nlpar int; // number of unclosed lpars
pos int;
ch int;
}
const EOF = -1
func (p *Parser) c() int {
return p.ch;
}
func (p *Parser) nextc() int {
if p.pos >= len(p.re.expr) {
p.ch = EOF
} else {
p.ch = c;
p.pos += w;
}
return p.ch;
}
func NewParser(re *RE) *Parser {
parser.re = re;
parser.nextc(); // load p.ch
return parser;
}
/*
Grammar:
regexp:
concatenation { '|' concatenation }
concatenation:
{ closure }
closure:
'[' [ '^' ] character-ranges ']'
func (p *Parser) Regexp() (start, end Inst)
var NULL Inst
s := `\.+*?()|[]`;
for i := 0; i < len(s); i++ {
if c == int(s[i]) {
return true
}
}
return false
}
func specialcclass(c int) bool {
s := `\-[]`;
for i := 0; i < len(s); i++ {
if c == int(s[i]) {
return true
}
}
return false
}
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
func (p *Parser) CharClass() Inst {
cc := NewCharClass();
p.re.Add(cc);
if p.c() == '^' {
cc.negate = true;
p.nextc();
}
left := -1;
for {
switch c := p.c(); c {
case ']', EOF:
if left >= 0 {
p.re.Error(ErrBadRange);
}
return cc;
case '-': // do this before backslash processing
p.re.Error(ErrBadRange);
case '\\':
c = p.nextc();
switch {
case c == EOF:
p.re.Error(ErrExtraneousBackslash);
case c == 'n':
c = '\n';
case specialcclass(c):
// c is as delivered
default:
p.re.Error(ErrBadBackslash);
}
fallthrough;
default:
p.nextc();
switch {
case left < 0: // first of pair
if p.c() == '-' { // range
p.nextc();
left = c;
} else { // single char
cc.AddRange(c, c);
}
case left <= c: // second of pair
cc.AddRange(left, c);
left = -1;
default:
p.re.Error(ErrBadRange);
}
}
}
return NULL
}
func (p *Parser) Term() (start, end Inst) {
if p.nlpar == 0 {
p.re.Error(ErrUnmatchedRpar);
}
return NULL, NULL;
case ']':
p.re.Error(ErrUnmatchedRbkt);
case '^':
p.nextc();
start = p.re.Add(new(*Bot));
start = p.re.Add(new(*Eot));
start = p.re.Add(new(*Any));
return start, start;
case '[':
p.nextc();
start = p.CharClass();
if p.c() != ']' {
p.re.Error(ErrUnmatchedLbkt);
}
p.nextc();
return start, start;
case '(':
p.nextc();
p.re.nbra++; // increment first so first subexpr is \1
nbra := p.re.nbra;
start, end = p.Regexp();
if p.c() != ')' {
p.re.Error(ErrUnmatchedLpar);
}
if start == NULL {
if end == NULL { p.re.Error(ErrInternal) }
case '\\':
c = p.nextc();
switch {
case c == EOF:
p.re.Error(ErrExtraneousBackslash);
case c == 'n':
c = '\n';
case special(c):
// c is as delivered
default:
p.re.Error(ErrBadBackslash);
}
fallthrough;
default:
p.nextc();
return start, start
}
panic("unreachable");
}
func (p *Parser) Closure() (start, end Inst) {
}
switch p.c() {
case '*':
// (start,end)*:
end.SetNext(alt); // after end, do alt
alt.left = start; // alternate brach: return to start
start = alt; // alt becomes new (start, end)
end = alt;
end.SetNext(alt); // after end, do alt
alt.left = start; // alternate brach: return to start
end = alt; // start is unchanged; end is alt
alt.left = start; // alternate branch is start
alt.next = nop; // follow on to nop
end.SetNext(nop); // after end, go to nop
start = alt; // start is now alt
end = nop; // end is nop pointed to by both branches
}
switch p.nextc() {
case '*', '+', '?':
p.re.Error(ErrBadClosure);
}
func (p *Parser) Concatenation() (start, end Inst) {
start, end = NULL, NULL;
for {
nstart, nend := p.Closure();
switch {
case nstart == NULL: // end of this concatenation
if start == NULL { // this is the empty string
nop := p.re.Add(new(*Nop));
case start == NULL: // this is first element of concatenation
start, end = nstart, nend;
default:
end = nend;
}
}
panic("unreachable");
}
func (p *Parser) Regexp() (start, end Inst) {
start, end = p.Concatenation();
for {
switch p.c() {
default:
case '|':
p.nextc();
nstart, nend := p.Concatenation();
alt.left = start;
alt.next = nstart;
end.SetNext(nop);
nend.SetNext(nop);
start, end = alt, nop;
}
}
panic("unreachable");
}
func UnNop(i Inst) Inst {
for i.Type() == NOP {
i = i.Next()
func (re *RE) EliminateNops() {
for i := 0; i < re.inst.Len(); i++ {
inst := re.inst.At(i).(Inst);
if inst.Type() == END {
continue
}
inst.SetNext(UnNop(inst.Next()));
if inst.Type() == ALT {
alt := inst.(*Alt);
alt.left = UnNop(alt.left);
func (re *RE) Dump() {
for i := 0; i < re.inst.Len(); i++ {
inst := re.inst.At(i).(Inst);
print(inst.Index(), ": ");
inst.Print();
if inst.Type() != END {
print(" -> ", inst.Next().Index())
func (re *RE) DoParse() {
parser := NewParser(re);
s, e := parser.Regexp();
start.next = s;
e.SetNext(re.Add(new(*End)));
func Compiler(str string, ch chan *RE) {
re := new(*RE);
re.inst = array.New(0);
re.ch = ch;
re.DoParse();
ch <- re;
}
// Public interface has only execute functionality (not yet implemented)
export type Regexp interface {
export func Compile(str string) (regexp Regexp, error *os.Error) {
ch := new(chan *RE);
go Compiler(str, ch);
re := <-ch;
return re, re.error
}
type State struct {
inst Inst; // next instruction to execute
match []int; // pairs of bracketing submatches. 0th is start,end
}
// Append new state to to-do list. Leftmost-longest wins so avoid
// adding a state that's already active.
func AddState(s []State, inst Inst, match []int) []State {
index := inst.Index();
l := len(s);
pos := match[0];
// TODO: Once the state is a vector and we can do insert, have inputs always
// go in order correctly and this "earlier" test is never necessary,
for i := 0; i < l; i++ {
if s[i].inst.Index() == index && // same instruction
s[i].match[0] < pos { // earlier match already going; lefmost wins
return s
}
}
if l == cap(s) {
s1 := new([]State, 2*l)[0:l];
for i := 0; i < l; i++ {
s1[i] = s[i];
}
s = s1;
}
s = s[0:l+1];
s[l].inst = inst;
s[l].match = match;
return s;
}
func (re *RE) DoExecute(str string, pos int) []int {
var s [2][]State; // TODO: use a vector when State values (not ptrs) can be vector elements
s[0] = new([]State, 10)[0:0];
s[1] = new([]State, 10)[0:0];
in, out := 0, 1;
var final State;
found := false;
for pos <= len(str) {
if !found {
// prime the pump if we haven't seen a match yet
match := new([]int, 2*(re.nbra+1));
for i := 0; i < len(match); i++ {
match[i] = -1; // no match seen; catches cases like "a(b)?c" on "ac"
}
match[0] = pos;
s[out] = AddState(s[out], re.start.Next(), match);
}
in, out = out, in; // old out state is new in state
s[out] = s[out][0:0]; // clear out state
if len(s[in]) == 0 {
// machine has completed
break;
}
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
}
for i := 0; i < len(s[in]); i++ {
state := s[in][i];
switch s[in][i].inst.Type() {
case BOT:
if pos == 0 {
s[in] = AddState(s[in], state.inst.Next(), state.match)
}
case EOT:
if pos == len(str) {
s[in] = AddState(s[in], state.inst.Next(), state.match)
}
case CHAR:
if c == state.inst.(*Char).char {
s[out] = AddState(s[out], state.inst.Next(), state.match)
}
case CHARCLASS:
if state.inst.(*CharClass).Matches(c) {
s[out] = AddState(s[out], state.inst.Next(), state.match)
}
case ANY:
if c != EOF {
s[out] = AddState(s[out], state.inst.Next(), state.match)
}
case BRA:
n := state.inst.(*Bra).n;
state.match[2*n] = pos;
s[in] = AddState(s[in], state.inst.Next(), state.match);
case EBRA:
n := state.inst.(*Ebra).n;
state.match[2*n+1] = pos;
s[in] = AddState(s[in], state.inst.Next(), state.match);
case ALT:
s[in] = AddState(s[in], state.inst.(*Alt).left, state.match);
// give other branch a copy of this match vector
s1 := new([]int, 2*(re.nbra+1));
for i := 0; i < len(s1); i++ {
s1[i] = state.match[i]
}
s[in] = AddState(s[in], state.inst.Next(), s1);
case END:
// choose leftmost longest
if !found || // first
state.match[0] < final.match[0] || // leftmost
(state.match[0] == final.match[0] && pos > final.match[1]) { // longest
final = state;
final.match[1] = pos;
}
found = true;
default:
state.inst.Print();
panic("unknown instruction in execute");
}
}