src/pkg/regexp/syntax/prog.go - The Go Programming Language

Golang

Source file src/pkg/regexp/syntax/prog.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package syntax
     6	
     7	import (
     8		"bytes"
     9		"strconv"
    10		"unicode"
    11	)
    12	
    13	// Compiled program.
    14	// May not belong in this package, but convenient for now.
    15	
    16	// A Prog is a compiled regular expression program.
    17	type Prog struct {
    18		Inst   []Inst
    19		Start  int // index of start instruction
    20		NumCap int // number of InstCapture insts in re
    21	}
    22	
    23	// An InstOp is an instruction opcode.
    24	type InstOp uint8
    25	
    26	const (
    27		InstAlt InstOp = iota
    28		InstAltMatch
    29		InstCapture
    30		InstEmptyWidth
    31		InstMatch
    32		InstFail
    33		InstNop
    34		InstRune
    35		InstRune1
    36		InstRuneAny
    37		InstRuneAnyNotNL
    38	)
    39	
    40	// An EmptyOp specifies a kind or mixture of zero-width assertions.
    41	type EmptyOp uint8
    42	
    43	const (
    44		EmptyBeginLine EmptyOp = 1 << iota
    45		EmptyEndLine
    46		EmptyBeginText
    47		EmptyEndText
    48		EmptyWordBoundary
    49		EmptyNoWordBoundary
    50	)
    51	
    52	// EmptyOpContext returns the zero-width assertions
    53	// satisfied at the position between the runes r1 and r2.
    54	// Passing r1 == -1 indicates that the position is
    55	// at the beginning of the text.
    56	// Passing r2 == -1 indicates that the position is
    57	// at the end of the text.
    58	func EmptyOpContext(r1, r2 rune) EmptyOp {
    59		var op EmptyOp
    60		if r1 < 0 {
    61			op |= EmptyBeginText | EmptyBeginLine
    62		}
    63		if r1 == '\n' {
    64			op |= EmptyBeginLine
    65		}
    66		if r2 < 0 {
    67			op |= EmptyEndText | EmptyEndLine
    68		}
    69		if r2 == '\n' {
    70			op |= EmptyEndLine
    71		}
    72		if IsWordChar(r1) != IsWordChar(r2) {
    73			op |= EmptyWordBoundary
    74		} else {
    75			op |= EmptyNoWordBoundary
    76		}
    77		return op
    78	}
    79	
    80	// IsWordChar reports whether r is consider a ``word character''
    81	// during the evaluation of the \b and \B zero-width assertions.
    82	// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
    83	func IsWordChar(r rune) bool {
    84		return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
    85	}
    86	
    87	// An Inst is a single instruction in a regular expression program.
    88	type Inst struct {
    89		Op   InstOp
    90		Out  uint32 // all but InstMatch, InstFail
    91		Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
    92		Rune []rune
    93	}
    94	
    95	func (p *Prog) String() string {
    96		var b bytes.Buffer
    97		dumpProg(&b, p)
    98		return b.String()
    99	}
   100	
   101	// skipNop follows any no-op or capturing instructions
   102	// and returns the resulting pc.
   103	func (p *Prog) skipNop(pc uint32) *Inst {
   104		i := &p.Inst[pc]
   105		for i.Op == InstNop || i.Op == InstCapture {
   106			pc = i.Out
   107			i = &p.Inst[pc]
   108		}
   109		return i
   110	}
   111	
   112	// op returns i.Op but merges all the Rune special cases into InstRune
   113	func (i *Inst) op() InstOp {
   114		op := i.Op
   115		switch op {
   116		case InstRune1, InstRuneAny, InstRuneAnyNotNL:
   117			op = InstRune
   118		}
   119		return op
   120	}
   121	
   122	// Prefix returns a literal string that all matches for the
   123	// regexp must start with.  Complete is true if the prefix
   124	// is the entire match.
   125	func (p *Prog) Prefix() (prefix string, complete bool) {
   126		i := p.skipNop(uint32(p.Start))
   127	
   128		// Avoid allocation of buffer if prefix is empty.
   129		if i.op() != InstRune || len(i.Rune) != 1 {
   130			return "", i.Op == InstMatch
   131		}
   132	
   133		// Have prefix; gather characters.
   134		var buf bytes.Buffer
   135		for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
   136			buf.WriteRune(i.Rune[0])
   137			i = p.skipNop(i.Out)
   138		}
   139		return buf.String(), i.Op == InstMatch
   140	}
   141	
   142	// StartCond returns the leading empty-width conditions that must
   143	// be true in any match.  It returns ^EmptyOp(0) if no matches are possible.
   144	func (p *Prog) StartCond() EmptyOp {
   145		var flag EmptyOp
   146		pc := uint32(p.Start)
   147		i := &p.Inst[pc]
   148	Loop:
   149		for {
   150			switch i.Op {
   151			case InstEmptyWidth:
   152				flag |= EmptyOp(i.Arg)
   153			case InstFail:
   154				return ^EmptyOp(0)
   155			case InstCapture, InstNop:
   156				// skip
   157			default:
   158				break Loop
   159			}
   160			pc = i.Out
   161			i = &p.Inst[pc]
   162		}
   163		return flag
   164	}
   165	
   166	// MatchRune returns true if the instruction matches (and consumes) r.
   167	// It should only be called when i.Op == InstRune.
   168	func (i *Inst) MatchRune(r rune) bool {
   169		rune := i.Rune
   170	
   171		// Special case: single-rune slice is from literal string, not char class.
   172		if len(rune) == 1 {
   173			r0 := rune[0]
   174			if r == r0 {
   175				return true
   176			}
   177			if Flags(i.Arg)&FoldCase != 0 {
   178				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   179					if r == r1 {
   180						return true
   181					}
   182				}
   183			}
   184			return false
   185		}
   186	
   187		// Peek at the first few pairs.
   188		// Should handle ASCII well.
   189		for j := 0; j < len(rune) && j <= 8; j += 2 {
   190			if r < rune[j] {
   191				return false
   192			}
   193			if r <= rune[j+1] {
   194				return true
   195			}
   196		}
   197	
   198		// Otherwise binary search.
   199		lo := 0
   200		hi := len(rune) / 2
   201		for lo < hi {
   202			m := lo + (hi-lo)/2
   203			if c := rune[2*m]; c <= r {
   204				if r <= rune[2*m+1] {
   205					return true
   206				}
   207				lo = m + 1
   208			} else {
   209				hi = m
   210			}
   211		}
   212		return false
   213	}
   214	
   215	// As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
   216	// Since we act on runes, it would be easy to support Unicode here.
   217	func wordRune(r rune) bool {
   218		return r == '_' ||
   219			('A' <= r && r <= 'Z') ||
   220			('a' <= r && r <= 'z') ||
   221			('0' <= r && r <= '9')
   222	}
   223	
   224	// MatchEmptyWidth returns true if the instruction matches
   225	// an empty string between the runes before and after.
   226	// It should only be called when i.Op == InstEmptyWidth.
   227	func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
   228		switch EmptyOp(i.Arg) {
   229		case EmptyBeginLine:
   230			return before == '\n' || before == -1
   231		case EmptyEndLine:
   232			return after == '\n' || after == -1
   233		case EmptyBeginText:
   234			return before == -1
   235		case EmptyEndText:
   236			return after == -1
   237		case EmptyWordBoundary:
   238			return wordRune(before) != wordRune(after)
   239		case EmptyNoWordBoundary:
   240			return wordRune(before) == wordRune(after)
   241		}
   242		panic("unknown empty width arg")
   243	}
   244	
   245	func (i *Inst) String() string {
   246		var b bytes.Buffer
   247		dumpInst(&b, i)
   248		return b.String()
   249	}
   250	
   251	func bw(b *bytes.Buffer, args ...string) {
   252		for _, s := range args {
   253			b.WriteString(s)
   254		}
   255	}
   256	
   257	func dumpProg(b *bytes.Buffer, p *Prog) {
   258		for j := range p.Inst {
   259			i := &p.Inst[j]
   260			pc := strconv.Itoa(j)
   261			if len(pc) < 3 {
   262				b.WriteString("   "[len(pc):])
   263			}
   264			if j == p.Start {
   265				pc += "*"
   266			}
   267			bw(b, pc, "\t")
   268			dumpInst(b, i)
   269			bw(b, "\n")
   270		}
   271	}
   272	
   273	func u32(i uint32) string {
   274		return strconv.FormatUint(uint64(i), 10)
   275	}
   276	
   277	func dumpInst(b *bytes.Buffer, i *Inst) {
   278		switch i.Op {
   279		case InstAlt:
   280			bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
   281		case InstAltMatch:
   282			bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
   283		case InstCapture:
   284			bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
   285		case InstEmptyWidth:
   286			bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
   287		case InstMatch:
   288			bw(b, "match")
   289		case InstFail:
   290			bw(b, "fail")
   291		case InstNop:
   292			bw(b, "nop -> ", u32(i.Out))
   293		case InstRune:
   294			if i.Rune == nil {
   295				// shouldn't happen
   296				bw(b, "rune <nil>")
   297			}
   298			bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
   299			if Flags(i.Arg)&FoldCase != 0 {
   300				bw(b, "/i")
   301			}
   302			bw(b, " -> ", u32(i.Out))
   303		case InstRune1:
   304			bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
   305		case InstRuneAny:
   306			bw(b, "any -> ", u32(i.Out))
   307		case InstRuneAnyNotNL:
   308			bw(b, "anynotnl -> ", u32(i.Out))
   309		}
   310	}