Skip to content
Snippets Groups Projects
regexp.go 14.8 KiB
Newer Older
  • Learn to ignore specific revisions
  • // Copyright 2009 The Go Authors. All rights reserved.
    // Use of this source code is governed by a BSD-style
    // license that can be found in the LICENSE file.
    
    // Regular expression library.
    
    package regexp
    
    
    Rob Pike's avatar
    Rob Pike committed
    import (
    	"os";
    
    Rob Pike's avatar
    Rob Pike committed
    )
    
    
    Rob Pike's avatar
    Rob Pike committed
    export var debug = false;
    
    
    
    export var ErrInternal = os.NewError("internal error");
    export var ErrUnmatchedLpar = os.NewError("unmatched '('");
    export var ErrUnmatchedRpar = os.NewError("unmatched ')'");
    
    Rob Pike's avatar
    Rob Pike committed
    export var ErrUnmatchedLbkt = os.NewError("unmatched '['");
    export var ErrUnmatchedRbkt = os.NewError("unmatched ']'");
    export var ErrBadRange = os.NewError("bad range in character class");
    
    export var ErrExtraneousBackslash = os.NewError("extraneous backslash");
    export var ErrBadClosure = os.NewError("repeated closure (**, ++, etc.)");
    export var ErrBareClosure = os.NewError("closure applies to nothing");
    export var ErrBadBackslash = os.NewError("illegal backslash escape");
    
    // An instruction executed by the NFA
    type Inst interface {
    	Type()	int;	// the type of this instruction: CHAR, ANY, etc.
    
    	Next()	Inst;	// the instruction to execute after this one
    	SetNext(i Inst);
    
    Rob Pike's avatar
    Rob Pike committed
    	Index()	int;
    	SetIndex(i int);
    	Print();
    
    // Fields and methods common to all instructions
    type Common struct {
    	next	Inst;
    	index	int;
    }
    
    func (c *Common) Next() Inst { return c.next }
    func (c *Common) SetNext(i Inst) { c.next = i }
    func (c *Common) Index() int { return c.index }
    func (c *Common) SetIndex(i int) { c.index = i }
    
    
    type RE struct {
    	expr	string;	// the original expression
    
    	ch	chan<- *RE;	// reply channel when we're done
    
    Rob Pike's avatar
    Rob Pike committed
    	error	*os.Error;	// compile- or run-time error; nil if OK
    
    	start	Inst;
    
    Rob Pike's avatar
    Rob Pike committed
    	nbra	int;	// number of brackets in expression, for subexpressions
    
    Rob Pike's avatar
    Rob Pike committed
    	START	// beginning of program
    
    		= iota;
    	END;		// end of program: success
    	BOT;		// '^' beginning of text
    	EOT;		// '$' end of text
    	CHAR;	// 'a' regular character
    
    Rob Pike's avatar
    Rob Pike committed
    	CHARCLASS;	// [a-z] character class
    
    	ANY;		// '.' any character
    	BRA;		// '(' parenthesized expression
    	EBRA;	// ')'; end of '(' parenthesized expression
    	ALT;		// '|' alternation
    	NOP;		// do nothing; makes it easy to link without patching
    )
    
    // --- START start of program
    type Start struct {
    
    }
    
    func (start *Start) Type() int { return START }
    
    Rob Pike's avatar
    Rob Pike committed
    func (start *Start) Print() { print("start") }
    
    
    // --- END end of program
    type End struct {
    
    }
    
    func (end *End) Type() int { return END }
    
    Rob Pike's avatar
    Rob Pike committed
    func (end *End) Print() { print("end") }
    
    
    // --- BOT beginning of text
    type Bot struct {
    
    }
    
    func (bot *Bot) Type() int { return BOT }
    
    Rob Pike's avatar
    Rob Pike committed
    func (bot *Bot) Print() { print("bot") }
    
    
    // --- EOT end of text
    type Eot struct {
    
    }
    
    func (eot *Eot) Type() int { return EOT }
    
    Rob Pike's avatar
    Rob Pike committed
    func (eot *Eot) Print() { print("eot") }
    
    
    // --- CHAR a regular character
    type Char struct {
    
    Rob Pike's avatar
    Rob Pike committed
    	char	int;
    
    }
    
    func (char *Char) Type() int { return CHAR }
    
    Rob Pike's avatar
    Rob Pike committed
    func (char *Char) Print() { print("char ", string(char.char)) }
    
    
    func NewChar(char int) *Char {
    
    	c.char = char;
    	return c;
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    // --- CHARCLASS [a-z]
    
    type CharClass struct {
    
    Rob Pike's avatar
    Rob Pike committed
    	char	int;
    	negate	bool;	// is character class negated? ([^a-z])
    
    	// array of int, stored pairwise: [a-z] is (a,z); x is (x,x):
    	ranges	*array.IntArray;
    
    Rob Pike's avatar
    Rob Pike committed
    func (cclass *CharClass) Type() int { return CHARCLASS }
    
    Rob Pike's avatar
    Rob Pike committed
    func (cclass *CharClass) Print() {
    	print("charclass");
    	if cclass.negate {
    		print(" (negated)");
    	}
    	for i := 0; i < cclass.ranges.Len(); i += 2 {
    
    		l := cclass.ranges.At(i);
    		r := cclass.ranges.At(i+1);
    
    Rob Pike's avatar
    Rob Pike committed
    		if l == r {
    			print(" [", string(l), "]");
    		} else {
    			print(" [", string(l), "-", string(r), "]");
    		}
    	}
    }
    
    
    func (cclass *CharClass) AddRange(a, b int) {
    
    Rob Pike's avatar
    Rob Pike committed
    	// range is a through b inclusive
    
    	cclass.ranges.Push(a);
    	cclass.ranges.Push(b);
    
    Rob Pike's avatar
    Rob Pike committed
    func (cclass *CharClass) Matches(c int) bool {
    	for i := 0; i < cclass.ranges.Len(); i = i+2 {
    
    		min := cclass.ranges.At(i);
    		max := cclass.ranges.At(i+1);
    
    Rob Pike's avatar
    Rob Pike committed
    		if min <= c && c <= max {
    			return !cclass.negate
    		}
    	}
    	return cclass.negate
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    func NewCharClass() *CharClass {
    
    	c := new(*CharClass);
    
    	c.ranges = array.NewIntArray(0);
    
    Rob Pike's avatar
    Rob Pike committed
    	return c;
    }
    
    
    // --- ANY any character
    type Any struct {
    
    }
    
    func (any *Any) Type() int { return ANY }
    
    Rob Pike's avatar
    Rob Pike committed
    func (any *Any) Print() { print("any") }
    
    
    // --- BRA parenthesized expression
    type Bra struct {
    
    	n	int;	// subexpression number
    
    }
    
    func (bra *Bra) Type() int { return BRA }
    
    func (bra *Bra) Print() { print("bra", bra.n); }
    
    
    // --- EBRA end of parenthesized expression
    type Ebra struct {
    
    	n	int;	// subexpression number
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    func (ebra *Ebra) Type() int { return EBRA }
    
    Rob Pike's avatar
    Rob Pike committed
    func (ebra *Ebra) Print() { print("ebra ", ebra.n); }
    
    
    // --- ALT alternation
    type Alt struct {
    
    	left	Inst;	// other branch
    
    }
    
    func (alt *Alt) Type() int { return ALT }
    
    Rob Pike's avatar
    Rob Pike committed
    func (alt *Alt) Print() { print("alt(", alt.left.Index(), ")"); }
    
    
    // --- NOP no operation
    type Nop struct {
    
    }
    
    func (nop *Nop) Type() int { return NOP }
    
    Rob Pike's avatar
    Rob Pike committed
    func (nop *Nop) Print() { print("nop") }
    
    
    // report error and exit compiling/executing goroutine
    func (re *RE) Error(err *os.Error) {
    	re.error = err;
    	re.ch <- re;
    	sys.goexit();
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    func (re *RE) Add(i Inst) Inst {
    	i.SetIndex(re.inst.Len());
    
    Rob Pike's avatar
    Rob Pike committed
    	return i;
    }
    
    
    type Parser struct {
    	re	*RE;
    
    	nlpar	int;	// number of unclosed lpars
    
    	pos	int;
    	ch	int;
    }
    
    const EOF = -1
    
    func (p *Parser) c() int {
    	return p.ch;
    }
    
    func (p *Parser) nextc() int {
    	if p.pos >= len(p.re.expr) {
    		p.ch = EOF
    	} else {
    
    Rob Pike's avatar
    Rob Pike committed
    		c, w := sys.stringtorune(p.re.expr, p.pos);
    
    		p.ch = c;
    		p.pos += w;
    	}
    	return p.ch;
    }
    
    func NewParser(re *RE) *Parser {
    
    	parser := new(*Parser);
    
    	parser.re = re;
    	parser.nextc();	// load p.ch
    	return parser;
    }
    
    /*
    
    Grammar:
    	regexp:
    		concatenation { '|' concatenation }
    	concatenation:
    		{ closure }
    	closure:
    
    Rob Pike's avatar
    Rob Pike committed
    		term [ '*' | '+' | '?' ]
    
    Rob Pike's avatar
    Rob Pike committed
    		'^'
    		'$'
    
    		'[' [ '^' ] character-ranges ']'
    
    func (p *Parser) Regexp() (start, end Inst)
    
    var NULL Inst
    
    
    func special(c int) bool {
    
    Rob Pike's avatar
    Rob Pike committed
    	s := `\.+*?()|[]`;
    	for i := 0; i < len(s); i++ {
    		if c == int(s[i]) {
    			return true
    		}
    	}
    	return false
    }
    
    func specialcclass(c int) bool {
    	s := `\-[]`;
    
    	for i := 0; i < len(s); i++ {
    		if c == int(s[i]) {
    			return true
    		}
    	}
    	return false
    }
    
    
    Rob Pike's avatar
    Rob Pike committed
    func (p *Parser) CharClass() Inst {
    	cc := NewCharClass();
    	p.re.Add(cc);
    	if p.c() == '^' {
    		cc.negate = true;
    		p.nextc();
    	}
    	left := -1;
    	for {
    		switch c := p.c(); c {
    		case ']', EOF:
    			if left >= 0 {
    				p.re.Error(ErrBadRange);
    			}
    			return cc;
    		case '-':	// do this before backslash processing
    			p.re.Error(ErrBadRange);
    		case '\\':
    			c = p.nextc();
    			switch {
    			case c == EOF:
    				p.re.Error(ErrExtraneousBackslash);
    			case c == 'n':
    				c = '\n';
    			case specialcclass(c):
    				// c is as delivered
    			default:
    				p.re.Error(ErrBadBackslash);
    			}
    			fallthrough;
    		default:
    			p.nextc();
    			switch {
    			case left < 0:	// first of pair
    				if p.c() == '-' {	// range
    					p.nextc();
    					left = c;
    				} else {	// single char
    					cc.AddRange(c, c);
    				}
    			case left <= c:	// second of pair
    				cc.AddRange(left, c);
    				left = -1;
    			default:
    				p.re.Error(ErrBadRange);
    			}
    		}
    	}
    	return NULL
    }
    
    
    func (p *Parser) Term() (start, end Inst) {
    
    	switch c := p.c(); c {
    
    Rob Pike's avatar
    Rob Pike committed
    	case '|', EOF:
    
    		return NULL, NULL;
    
    		p.re.Error(ErrBareClosure);
    
    Rob Pike's avatar
    Rob Pike committed
    	case ')':
    
    		if p.nlpar == 0 {
    			p.re.Error(ErrUnmatchedRpar);
    		}
    		return NULL, NULL;
    
    Rob Pike's avatar
    Rob Pike committed
    	case ']':
    		p.re.Error(ErrUnmatchedRbkt);
    	case '^':
    		p.nextc();
    
    		start = p.re.Add(new(*Bot));
    
    Rob Pike's avatar
    Rob Pike committed
    		return start, start;
    	case '$':
    		p.nextc();
    
    		start = p.re.Add(new(*Eot));
    
    Rob Pike's avatar
    Rob Pike committed
    		return start, start;
    
    	case '.':
    		p.nextc();
    
    		start = p.re.Add(new(*Any));
    
    Rob Pike's avatar
    Rob Pike committed
    		return start, start;
    	case '[':
    		p.nextc();
    		start = p.CharClass();
    		if p.c() != ']' {
    			p.re.Error(ErrUnmatchedLbkt);
    		}
    		p.nextc();
    
    		return start, start;
    	case '(':
    		p.nextc();
    
    Rob Pike's avatar
    Rob Pike committed
    		p.re.nbra++;	// increment first so first subexpr is \1
    		nbra := p.re.nbra;
    
    		start, end = p.Regexp();
    		if p.c() != ')' {
    			p.re.Error(ErrUnmatchedLpar);
    		}
    
    		p.nextc();
    
    Rob Pike's avatar
    Rob Pike committed
    		p.re.Add(bra);
    
    		ebra := new(*Ebra);
    
    Rob Pike's avatar
    Rob Pike committed
    		p.re.Add(ebra);
    
    Rob Pike's avatar
    Rob Pike committed
    		bra.n = nbra;
    		ebra.n = nbra;
    
    Rob Pike's avatar
    Rob Pike committed
    		if start == NULL {
    			if end == NULL { p.re.Error(ErrInternal) }
    
    			end.SetNext(ebra);
    
    		}
    		bra.SetNext(start);
    
    	case '\\':
    		c = p.nextc();
    		switch {
    		case c == EOF:
    			p.re.Error(ErrExtraneousBackslash);
    		case c == 'n':
    			c = '\n';
    		case special(c):
    			// c is as delivered
    		default:
    			p.re.Error(ErrBadBackslash);
    		}
    		fallthrough;
    	default:
    		p.nextc();
    
    		start = NewChar(c);
    
    Rob Pike's avatar
    Rob Pike committed
    		p.re.Add(start);
    
    		return start, start
    	}
    	panic("unreachable");
    }
    
    
    func (p *Parser) Closure() (start, end Inst) {
    
    	start, end = p.Term();
    
    Rob Pike's avatar
    Rob Pike committed
    	if start == NULL {
    
    Rob Pike's avatar
    Rob Pike committed
    		return
    
    	}
    	switch p.c() {
    	case '*':
    		// (start,end)*:
    
    Rob Pike's avatar
    Rob Pike committed
    		p.re.Add(alt);
    
    		end.SetNext(alt);	// after end, do alt
    
    		alt.left = start;	// alternate brach: return to start
    
    		start = alt;	// alt becomes new (start, end)
    		end = alt;
    
    	case '+':
    		// (start,end)+:
    
    Rob Pike's avatar
    Rob Pike committed
    		p.re.Add(alt);
    
    		end.SetNext(alt);	// after end, do alt
    
    		alt.left = start;	// alternate brach: return to start
    
    		end = alt;	// start is unchanged; end is alt
    
    	case '?':
    		// (start,end)?:
    
    Rob Pike's avatar
    Rob Pike committed
    		p.re.Add(alt);
    
    Rob Pike's avatar
    Rob Pike committed
    		p.re.Add(nop);
    
    		alt.left = start;	// alternate branch is start
    
    		alt.next = nop;	// follow on to nop
    		end.SetNext(nop);	// after end, go to nop
    		start = alt;	// start is now alt
    		end = nop;	// end is nop pointed to by both branches
    
    Rob Pike's avatar
    Rob Pike committed
    		return
    
    	}
    	switch p.nextc() {
    	case '*', '+', '?':
    		p.re.Error(ErrBadClosure);
    	}
    
    Rob Pike's avatar
    Rob Pike committed
    	return
    
    func (p *Parser) Concatenation() (start, end Inst) {
    
    	start, end = NULL, NULL;
    	for {
    		nstart, nend := p.Closure();
    		switch {
    
    Rob Pike's avatar
    Rob Pike committed
    		case nstart == NULL:	// end of this concatenation
    			if start == NULL {	// this is the empty string
    
    				nop := p.re.Add(new(*Nop));
    
    Rob Pike's avatar
    Rob Pike committed
    				return nop, nop;
    			}
    
    Rob Pike's avatar
    Rob Pike committed
    			return;
    
    Rob Pike's avatar
    Rob Pike committed
    		case start == NULL:	// this is first element of concatenation
    
    			start, end = nstart, nend;
    		default:
    
    			end.SetNext(nstart);
    
    			end = nend;
    		}
    	}
    	panic("unreachable");
    }
    
    
    func (p *Parser) Regexp() (start, end Inst) {
    
    	start, end = p.Concatenation();
    	for {
    		switch p.c() {
    		default:
    
    Rob Pike's avatar
    Rob Pike committed
    			return;
    
    		case '|':
    			p.nextc();
    			nstart, nend := p.Concatenation();
    
    Rob Pike's avatar
    Rob Pike committed
    			p.re.Add(alt);
    
    			alt.left = start;
    			alt.next = nstart;
    
    Rob Pike's avatar
    Rob Pike committed
    			p.re.Add(nop);
    
    			end.SetNext(nop);
    			nend.SetNext(nop);
    			start, end = alt, nop;
    
    func UnNop(i Inst) Inst {
    	for i.Type() == NOP {
    		i = i.Next()
    
    Rob Pike's avatar
    Rob Pike committed
    func (re *RE) EliminateNops() {
    	for i := 0; i < re.inst.Len(); i++ {
    		inst := re.inst.At(i).(Inst);
    		if inst.Type() == END {
    			continue
    		}
    		inst.SetNext(UnNop(inst.Next()));
    		if inst.Type() == ALT {
    			alt := inst.(*Alt);
    
    			alt.left = UnNop(alt.left);
    
    Rob Pike's avatar
    Rob Pike committed
    func (re *RE) Dump() {
    	for i := 0; i < re.inst.Len(); i++ {
    		inst := re.inst.At(i).(Inst);
    		print(inst.Index(), ": ");
    		inst.Print();
    		if inst.Type() != END {
    			print(" -> ", inst.Next().Index())
    
    Rob Pike's avatar
    Rob Pike committed
    		print("\n");
    
    func (re *RE) DoParse() {
    	parser := NewParser(re);
    
    	start := new(*Start);
    
    Rob Pike's avatar
    Rob Pike committed
    	re.Add(start);
    
    	s, e := parser.Regexp();
    	start.next = s;
    
    	e.SetNext(re.Add(new(*End)));
    
    Rob Pike's avatar
    Rob Pike committed
    	if debug {
    		re.Dump();
    		println();
    	}
    
    Rob Pike's avatar
    Rob Pike committed
    	re.EliminateNops();
    
    Rob Pike's avatar
    Rob Pike committed
    	if debug {
    		re.Dump();
    		println();
    	}
    
    Rob Pike's avatar
    Rob Pike committed
    
    
    func Compiler(str string, ch chan *RE) {
    	re := new(*RE);
    
    	re.expr = str;
    
    	re.ch = ch;
    	re.DoParse();
    	ch <- re;
    }
    
    // Public interface has only execute functionality (not yet implemented)
    export type Regexp interface {
    
    Russ Cox's avatar
    Russ Cox committed
    	Execute(s string) []int
    
    Rob Pike's avatar
    Rob Pike committed
    // Compile in separate goroutine; wait for result
    
    export func Compile(str string) (regexp Regexp, error *os.Error) {
    	ch := new(chan *RE);
    	go Compiler(str, ch);
    	re := <-ch;
    	return re, re.error
    }
    
    Rob Pike's avatar
    Rob Pike committed
    
    type State struct {
    	inst	Inst;	// next instruction to execute
    
    Russ Cox's avatar
    Russ Cox committed
    	match	[]int;	// pairs of bracketing submatches. 0th is start,end
    
    Rob Pike's avatar
    Rob Pike committed
    }
    
    // Append new state to to-do list.  Leftmost-longest wins so avoid
    // adding a state that's already active.
    
    Russ Cox's avatar
    Russ Cox committed
    func AddState(s []State, inst Inst, match []int) []State {
    
    Rob Pike's avatar
    Rob Pike committed
    	index := inst.Index();
    	l := len(s);
    	pos := match[0];
    	// TODO: Once the state is a vector and we can do insert, have inputs always
    	// go in order correctly and this "earlier" test is never necessary,
    	for i := 0; i < l; i++ {
    		if s[i].inst.Index() == index && // same instruction
    		   s[i].match[0] < pos {	// earlier match already going; lefmost wins
    		   	return s
    		 }
    	}
    	if l == cap(s) {
    		s1 := new([]State, 2*l)[0:l];
    		for i := 0; i < l; i++ {
    			s1[i] = s[i];
    		}
    		s = s1;
    	}
    	s = s[0:l+1];
    	s[l].inst = inst;
    	s[l].match = match;
    	return s;
    }
    
    
    Russ Cox's avatar
    Russ Cox committed
    func (re *RE) DoExecute(str string, pos int) []int {
    	var s [2][]State;	// TODO: use a vector when State values (not ptrs) can be vector elements
    
    Rob Pike's avatar
    Rob Pike committed
    	s[0] = new([]State, 10)[0:0];
    	s[1] = new([]State, 10)[0:0];
    	in, out := 0, 1;
    	var final State;
    	found := false;
    	for pos <= len(str) {
    		if !found {
    			// prime the pump if we haven't seen a match yet
    			match := new([]int, 2*(re.nbra+1));
    
    Rob Pike's avatar
    Rob Pike committed
    			for i := 0; i < len(match); i++ {
    				match[i] = -1;	// no match seen; catches cases like "a(b)?c" on "ac"
    			}
    
    Rob Pike's avatar
    Rob Pike committed
    			match[0]  = pos;
    			s[out] = AddState(s[out], re.start.Next(), match);
    		}
    		in, out = out, in;	// old out state is new in state
    		s[out] = s[out][0:0];	// clear out state
    		if len(s[in]) == 0 {
    			// machine has completed
    			break;
    		}
    
    Rob Pike's avatar
    Rob Pike committed
    		charwidth := 1;
    
    Rob Pike's avatar
    Rob Pike committed
    		c := EOF;
    		if pos < len(str) {
    
    Rob Pike's avatar
    Rob Pike committed
    			c, charwidth = sys.stringtorune(str, pos);
    
    Rob Pike's avatar
    Rob Pike committed
    		}
    		for i := 0; i < len(s[in]); i++ {
    			state := s[in][i];
    			switch s[in][i].inst.Type() {
    			case BOT:
    				if pos == 0 {
    					s[in] = AddState(s[in], state.inst.Next(), state.match)
    				}
    			case EOT:
    				if pos == len(str) {
    					s[in] = AddState(s[in], state.inst.Next(), state.match)
    				}
    			case CHAR:
    				if c == state.inst.(*Char).char {
    					s[out] = AddState(s[out], state.inst.Next(), state.match)
    				}
    			case CHARCLASS:
    				if state.inst.(*CharClass).Matches(c) {
    					s[out] = AddState(s[out], state.inst.Next(), state.match)
    				}
    			case ANY:
    				if c != EOF {
    					s[out] = AddState(s[out], state.inst.Next(), state.match)
    				}
    			case BRA:
    				n := state.inst.(*Bra).n;
    				state.match[2*n] = pos;
    				s[in] = AddState(s[in], state.inst.Next(), state.match);
    			case EBRA:
    				n := state.inst.(*Ebra).n;
    				state.match[2*n+1] = pos;
    				s[in] = AddState(s[in], state.inst.Next(), state.match);
    			case ALT:
    				s[in] = AddState(s[in], state.inst.(*Alt).left, state.match);
    				// give other branch a copy of this match vector
    				s1 := new([]int, 2*(re.nbra+1));
    				for i := 0; i < len(s1); i++ {
    					s1[i] = state.match[i]
    				}
    				s[in] = AddState(s[in], state.inst.Next(), s1);
    			case END:
    				// choose leftmost longest
    				if !found ||	// first
    				   state.match[0] < final.match[0] ||	// leftmost
    				   (state.match[0] == final.match[0] && pos > final.match[1])  {	// longest
    					final = state;
    					final.match[1] = pos;
    				}
    				found = true;
    			default:
    				state.inst.Print();
    				panic("unknown instruction in execute");
    			}
    		}
    
    Rob Pike's avatar
    Rob Pike committed
    		pos += charwidth;
    
    Rob Pike's avatar
    Rob Pike committed
    	}
    	return final.match;
    }
    
    
    
    Russ Cox's avatar
    Russ Cox committed
    func (re *RE) Execute(s string) []int {
    
    Rob Pike's avatar
    Rob Pike committed
    	return re.DoExecute(s, 0)
    }