src/pkg/regexp/syntax/compile.go - The Go Programming Language

Golang

Source file src/pkg/regexp/syntax/compile.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package syntax
     6	
     7	import "unicode"
     8	
     9	// A patchList is a list of instruction pointers that need to be filled in (patched).
    10	// Because the pointers haven't been filled in yet, we can reuse their storage
    11	// to hold the list.  It's kind of sleazy, but works well in practice.
    12	// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
    13	// 
    14	// These aren't really pointers: they're integers, so we can reinterpret them
    15	// this way without using package unsafe.  A value l denotes
    16	// p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1). 
    17	// l == 0 denotes the empty list, okay because we start every program
    18	// with a fail instruction, so we'll never want to point at its output link.
    19	type patchList uint32
    20	
    21	func (l patchList) next(p *Prog) patchList {
    22		i := &p.Inst[l>>1]
    23		if l&1 == 0 {
    24			return patchList(i.Out)
    25		}
    26		return patchList(i.Arg)
    27	}
    28	
    29	func (l patchList) patch(p *Prog, val uint32) {
    30		for l != 0 {
    31			i := &p.Inst[l>>1]
    32			if l&1 == 0 {
    33				l = patchList(i.Out)
    34				i.Out = val
    35			} else {
    36				l = patchList(i.Arg)
    37				i.Arg = val
    38			}
    39		}
    40	}
    41	
    42	func (l1 patchList) append(p *Prog, l2 patchList) patchList {
    43		if l1 == 0 {
    44			return l2
    45		}
    46		if l2 == 0 {
    47			return l1
    48		}
    49	
    50		last := l1
    51		for {
    52			next := last.next(p)
    53			if next == 0 {
    54				break
    55			}
    56			last = next
    57		}
    58	
    59		i := &p.Inst[last>>1]
    60		if last&1 == 0 {
    61			i.Out = uint32(l2)
    62		} else {
    63			i.Arg = uint32(l2)
    64		}
    65		return l1
    66	}
    67	
    68	// A frag represents a compiled program fragment.
    69	type frag struct {
    70		i   uint32    // index of first instruction
    71		out patchList // where to record end instruction
    72	}
    73	
    74	type compiler struct {
    75		p *Prog
    76	}
    77	
    78	// Compile compiles the regexp into a program to be executed.
    79	// The regexp should have been simplified already (returned from re.Simplify).
    80	func Compile(re *Regexp) (*Prog, error) {
    81		var c compiler
    82		c.init()
    83		f := c.compile(re)
    84		f.out.patch(c.p, c.inst(InstMatch).i)
    85		c.p.Start = int(f.i)
    86		return c.p, nil
    87	}
    88	
    89	func (c *compiler) init() {
    90		c.p = new(Prog)
    91		c.p.NumCap = 2 // implicit ( and ) for whole match $0
    92		c.inst(InstFail)
    93	}
    94	
    95	var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
    96	var anyRune = []rune{0, unicode.MaxRune}
    97	
    98	func (c *compiler) compile(re *Regexp) frag {
    99		switch re.Op {
   100		case OpNoMatch:
   101			return c.fail()
   102		case OpEmptyMatch:
   103			return c.nop()
   104		case OpLiteral:
   105			if len(re.Rune) == 0 {
   106				return c.nop()
   107			}
   108			var f frag
   109			for j := range re.Rune {
   110				f1 := c.rune(re.Rune[j:j+1], re.Flags)
   111				if j == 0 {
   112					f = f1
   113				} else {
   114					f = c.cat(f, f1)
   115				}
   116			}
   117			return f
   118		case OpCharClass:
   119			return c.rune(re.Rune, re.Flags)
   120		case OpAnyCharNotNL:
   121			return c.rune(anyRuneNotNL, 0)
   122		case OpAnyChar:
   123			return c.rune(anyRune, 0)
   124		case OpBeginLine:
   125			return c.empty(EmptyBeginLine)
   126		case OpEndLine:
   127			return c.empty(EmptyEndLine)
   128		case OpBeginText:
   129			return c.empty(EmptyBeginText)
   130		case OpEndText:
   131			return c.empty(EmptyEndText)
   132		case OpWordBoundary:
   133			return c.empty(EmptyWordBoundary)
   134		case OpNoWordBoundary:
   135			return c.empty(EmptyNoWordBoundary)
   136		case OpCapture:
   137			bra := c.cap(uint32(re.Cap << 1))
   138			sub := c.compile(re.Sub[0])
   139			ket := c.cap(uint32(re.Cap<<1 | 1))
   140			return c.cat(c.cat(bra, sub), ket)
   141		case OpStar:
   142			return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
   143		case OpPlus:
   144			return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
   145		case OpQuest:
   146			return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
   147		case OpConcat:
   148			if len(re.Sub) == 0 {
   149				return c.nop()
   150			}
   151			var f frag
   152			for i, sub := range re.Sub {
   153				if i == 0 {
   154					f = c.compile(sub)
   155				} else {
   156					f = c.cat(f, c.compile(sub))
   157				}
   158			}
   159			return f
   160		case OpAlternate:
   161			var f frag
   162			for _, sub := range re.Sub {
   163				f = c.alt(f, c.compile(sub))
   164			}
   165			return f
   166		}
   167		panic("regexp: unhandled case in compile")
   168	}
   169	
   170	func (c *compiler) inst(op InstOp) frag {
   171		// TODO: impose length limit
   172		f := frag{i: uint32(len(c.p.Inst))}
   173		c.p.Inst = append(c.p.Inst, Inst{Op: op})
   174		return f
   175	}
   176	
   177	func (c *compiler) nop() frag {
   178		f := c.inst(InstNop)
   179		f.out = patchList(f.i << 1)
   180		return f
   181	}
   182	
   183	func (c *compiler) fail() frag {
   184		return frag{}
   185	}
   186	
   187	func (c *compiler) cap(arg uint32) frag {
   188		f := c.inst(InstCapture)
   189		f.out = patchList(f.i << 1)
   190		c.p.Inst[f.i].Arg = arg
   191	
   192		if c.p.NumCap < int(arg)+1 {
   193			c.p.NumCap = int(arg) + 1
   194		}
   195		return f
   196	}
   197	
   198	func (c *compiler) cat(f1, f2 frag) frag {
   199		// concat of failure is failure
   200		if f1.i == 0 || f2.i == 0 {
   201			return frag{}
   202		}
   203	
   204		// TODO: elide nop
   205	
   206		f1.out.patch(c.p, f2.i)
   207		return frag{f1.i, f2.out}
   208	}
   209	
   210	func (c *compiler) alt(f1, f2 frag) frag {
   211		// alt of failure is other
   212		if f1.i == 0 {
   213			return f2
   214		}
   215		if f2.i == 0 {
   216			return f1
   217		}
   218	
   219		f := c.inst(InstAlt)
   220		i := &c.p.Inst[f.i]
   221		i.Out = f1.i
   222		i.Arg = f2.i
   223		f.out = f1.out.append(c.p, f2.out)
   224		return f
   225	}
   226	
   227	func (c *compiler) quest(f1 frag, nongreedy bool) frag {
   228		f := c.inst(InstAlt)
   229		i := &c.p.Inst[f.i]
   230		if nongreedy {
   231			i.Arg = f1.i
   232			f.out = patchList(f.i << 1)
   233		} else {
   234			i.Out = f1.i
   235			f.out = patchList(f.i<<1 | 1)
   236		}
   237		f.out = f.out.append(c.p, f1.out)
   238		return f
   239	}
   240	
   241	func (c *compiler) star(f1 frag, nongreedy bool) frag {
   242		f := c.inst(InstAlt)
   243		i := &c.p.Inst[f.i]
   244		if nongreedy {
   245			i.Arg = f1.i
   246			f.out = patchList(f.i << 1)
   247		} else {
   248			i.Out = f1.i
   249			f.out = patchList(f.i<<1 | 1)
   250		}
   251		f1.out.patch(c.p, f.i)
   252		return f
   253	}
   254	
   255	func (c *compiler) plus(f1 frag, nongreedy bool) frag {
   256		return frag{f1.i, c.star(f1, nongreedy).out}
   257	}
   258	
   259	func (c *compiler) empty(op EmptyOp) frag {
   260		f := c.inst(InstEmptyWidth)
   261		c.p.Inst[f.i].Arg = uint32(op)
   262		f.out = patchList(f.i << 1)
   263		return f
   264	}
   265	
   266	func (c *compiler) rune(r []rune, flags Flags) frag {
   267		f := c.inst(InstRune)
   268		i := &c.p.Inst[f.i]
   269		i.Rune = r
   270		flags &= FoldCase // only relevant flag is FoldCase
   271		if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
   272			// and sometimes not even that
   273			flags &^= FoldCase
   274		}
   275		i.Arg = uint32(flags)
   276		f.out = patchList(f.i << 1)
   277	
   278		// Special cases for exec machine.
   279		switch {
   280		case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
   281			i.Op = InstRune1
   282		case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
   283			i.Op = InstRuneAny
   284		case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
   285			i.Op = InstRuneAnyNotNL
   286		}
   287	
   288		return f
   289	}