
Source Code for Module pywbem.lex

   1  # ----------------------------------------------------------------------------- 
   2  # ply: lex.py 
   3  # 
   4  # Author: David M. Beazley (dave@dabeaz.com) 
   5  # 
   6  # Copyright (C) 2001-2009, David M. Beazley 
   7  # 
   8  # This library is free software; you can redistribute it and/or 
   9  # modify it under the terms of the GNU Lesser General Public 
  10  # License as published by the Free Software Foundation; either 
  11  # version 2.1 of the License, or (at your option) any later version. 
  12  # 
  13  # This library is distributed in the hope that it will be useful, 
  14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
  15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
  16  # Lesser General Public License for more details. 
  17  # 
  18  # You should have received a copy of the GNU Lesser General Public 
  19  # License along with this library; if not, write to the Free Software 
  20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
  21  # 
  22  # See the file COPYING for a complete copy of the LGPL. 
  23  # ----------------------------------------------------------------------------- 
  24   
  25  __version__    = "3.0" 
  26  __tabversion__ = "3.0"       # Version of table file used 
  27   
  28  import re, sys, types, copy, os 
  29   
  30  # This tuple contains known string types 
  31  try: 
  32      # Python 2.6 
  33      StringTypes = (types.StringType, types.UnicodeType) 
  34  except AttributeError: 
  35      # Python 3.0 
  36      StringTypes = (str, bytes) 
  37   
  38  # Extract the code attribute of a function. Different implementations 
  39  # are for Python 2/3 compatibility. 
  40   
  41  if sys.version_info[0] < 3: 
  42      def func_code(f):
  43          return f.func_code
  44  else:
  45      def func_code(f):
  46          return f.__code__
  47
  48  # This regular expression is used to match valid token names
  49  _is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')
  50
  51  # Exception thrown when invalid token encountered and no default error
  52  # handler is defined.
  53
  54  class LexError(Exception):
  55      def __init__(self,message,s):
  56          self.args = (message,)
  57          self.text = s
  58
  59  # Token class. This class is used to represent the tokens produced.
  60  class LexToken(object):
  61      def __str__(self):
  62          return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
  63      def __repr__(self):
  64          return str(self)
  65
  66  # This object is a stand-in for a logging object created by the
  67  # logging module.
  68
  69  class PlyLogger(object):
  70      def __init__(self,f):
  71          self.f = f
  72      def critical(self,msg,*args,**kwargs):
  73          self.f.write((msg % args) + "\n")
  74
  75      def warning(self,msg,*args,**kwargs):
  76          self.f.write("WARNING: "+ (msg % args) + "\n")
  77
  78      def error(self,msg,*args,**kwargs):
  79          self.f.write("ERROR: " + (msg % args) + "\n")
  80
  81      info = critical
  82      debug = critical
  83
  84  # Null logger is used when no output is generated. Does nothing.
  85  class NullLogger(object):
  86      def __getattribute__(self,name):
  87          return self
  88      def __call__(self,*args,**kwargs):
  89          return self
  90
  91  # -----------------------------------------------------------------------------
  92  #                           === Lexing Engine ===
  93  #
  94  # The following Lexer class implements the lexer runtime. There are only
  95  # a few public methods and attributes:
  96  #
  97  #    input()  - Store a new string in the lexer
  98  #    token()  - Get the next token
  99  #    clone()  - Clone the lexer
 100  #
 101  #    lineno   - Current line number
 102  #    lexpos   - Current position in the input string
 103  # -----------------------------------------------------------------------------
 104
 105  class Lexer:
 106      def __init__(self):
 107          self.lexre = None             # Master regular expression. This is a list of
 108                                        # tuples (re,findex) where re is a compiled
 109                                        # regular expression and findex is a list
 110                                        # mapping regex group numbers to rules
 111          self.lexretext = None         # Current regular expression strings
 112          self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
 113          self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
 114          self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
 115          self.lexstate = "INITIAL"     # Current lexer state
 116          self.lexstatestack = []       # Stack of lexer states
 117          self.lexstateinfo = None      # State information
 118          self.lexstateignore = {}      # Dictionary of ignored characters for each state
 119          self.lexstateerrorf = {}      # Dictionary of error functions for each state
 120          self.lexreflags = 0           # Optional re compile flags
 121          self.lexdata = None           # Actual input data (as a string)
 122          self.lexpos = 0               # Current position in input text
 123          self.lexlen = 0               # Length of the input text
 124          self.lexerrorf = None         # Error rule (if any)
 125          self.lextokens = None         # List of valid tokens
 126          self.lexignore = ""           # Ignored characters
 127          self.lexliterals = ""         # Literal characters that can be passed through
 128          self.lexmodule = None         # Module
 129          self.lineno = 1               # Current line number
 130          self.lexoptimize = 0          # Optimized mode
 131
 132      def clone(self,object=None):
 133          c = copy.copy(self)
 134
 135          # If the object parameter has been supplied, it means we are attaching the
 136          # lexer to a new object. In this case, we have to rebind all methods in
 137          # the lexstatere and lexstateerrorf tables.
 138
 139          if object:
 140              newtab = { }
 141              for key, ritem in self.lexstatere.items():
 142                  newre = []
 143                  for cre, findex in ritem:
 144                      newfindex = []
 145                      for f in findex:
 146                          if not f or not f[0]:
 147                              newfindex.append(f)
 148                              continue
 149                          newfindex.append((getattr(object,f[0].__name__),f[1]))
 150                      newre.append((cre,newfindex))
 151                  newtab[key] = newre
 152              c.lexstatere = newtab
 153              c.lexstateerrorf = { }
 154              for key, ef in self.lexstateerrorf.items():
 155                  c.lexstateerrorf[key] = getattr(object,ef.__name__)
 156              c.lexmodule = object
 157          return c
 158
 159      # ------------------------------------------------------------
 160      # writetab() - Write lexer information to a table file
 161      # ------------------------------------------------------------
 162      def writetab(self,tabfile,outputdir=""):
 163          if isinstance(tabfile,types.ModuleType):
 164              return
 165          basetabfilename = tabfile.split(".")[-1]
 166          filename = os.path.join(outputdir,basetabfilename)+".py"
 167          tf = open(filename,"w")
 168          tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
 169          tf.write("_tabversion = %s\n" % repr(__version__))
 170          tf.write("_lextokens = %s\n" % repr(self.lextokens))
 171          tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
 172          tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
 173          tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))
 174
 175          tabre = { }
 176          # Collect all functions in the initial state
 177          initial = self.lexstatere["INITIAL"]
 178          initialfuncs = []
 179          for part in initial:
 180              for f in part[1]:
 181                  if f and f[0]:
 182                      initialfuncs.append(f)
 183
 184          for key, lre in self.lexstatere.items():
 185              titem = []
 186              for i in range(len(lre)):
 187                  titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1],self.lexstaterenames[key][i])))
 188              tabre[key] = titem
 189
 190          tf.write("_lexstatere = %s\n" % repr(tabre))
 191          tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))
 192
 193          taberr = { }
 194          for key, ef in self.lexstateerrorf.items():
 195              if ef:
 196                  taberr[key] = ef.__name__
 197              else:
 198                  taberr[key] = None
 199          tf.write("_lexstateerrorf = %s\n" % repr(taberr))
 200          tf.close()
 201
 202      # ------------------------------------------------------------
 203      # readtab() - Read lexer information from a tab file
 204      # ------------------------------------------------------------
 205      def readtab(self,tabfile,fdict):
 206          if isinstance(tabfile,types.ModuleType):
 207              lextab = tabfile
 208          else:
 209              if sys.version_info[0] < 3:
 210                  exec("import %s as lextab" % tabfile)
 211              else:
 212                  env = { }
 213                  exec("import %s as lextab" % tabfile, env,env)
 214                  lextab = env['lextab']
 215
 216          if getattr(lextab,"_tabversion","0.0") != __version__:
 217              raise ImportError("Inconsistent PLY version")
 218
 219          self.lextokens = lextab._lextokens
 220          self.lexreflags = lextab._lexreflags
 221          self.lexliterals = lextab._lexliterals
 222          self.lexstateinfo = lextab._lexstateinfo
 223          self.lexstateignore = lextab._lexstateignore
 224          self.lexstatere = { }
 225          self.lexstateretext = { }
 226          for key,lre in lextab._lexstatere.items():
 227              titem = []
 228              txtitem = []
 229              for i in range(len(lre)):
 230                  titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict)))
 231                  txtitem.append(lre[i][0])
 232              self.lexstatere[key] = titem
 233              self.lexstateretext[key] = txtitem
 234          self.lexstateerrorf = { }
 235          for key,ef in lextab._lexstateerrorf.items():
 236              self.lexstateerrorf[key] = fdict[ef]
 237          self.begin('INITIAL')
 238
 239      # ------------------------------------------------------------
 240      # input() - Push a new string into the lexer
 241      # ------------------------------------------------------------
 242      def input(self,s):
 243          # Pull off the first character to see if s looks like a string
 244          c = s[:1]
 245          if not isinstance(c,StringTypes):
 246              raise ValueError("Expected a string")
 247          self.lexdata = s
 248          self.lexpos = 0
 249          self.lexlen = len(s)
 250
 251      # ------------------------------------------------------------
 252      # begin() - Changes the lexing state
 253      # ------------------------------------------------------------
 254      def begin(self,state):
 255          if not state in self.lexstatere:
 256              raise ValueError("Undefined state")
 257          self.lexre = self.lexstatere[state]
 258          self.lexretext = self.lexstateretext[state]
 259          self.lexignore = self.lexstateignore.get(state,"")
 260          self.lexerrorf = self.lexstateerrorf.get(state,None)
 261          self.lexstate = state
 262
 263      # ------------------------------------------------------------
 264      # push_state() - Changes the lexing state and saves old on stack
 265      # ------------------------------------------------------------
 266      def push_state(self,state):
 267          self.lexstatestack.append(self.lexstate)
 268          self.begin(state)
 269
 270      # ------------------------------------------------------------
 271      # pop_state() - Restores the previous state
 272      # ------------------------------------------------------------
 273      def pop_state(self):
 274          self.begin(self.lexstatestack.pop())
 275
 276      # ------------------------------------------------------------
 277      # current_state() - Returns the current lexing state
 278      # ------------------------------------------------------------
 279      def current_state(self):
 280          return self.lexstate
 281
 282      # ------------------------------------------------------------
 283      # skip() - Skip ahead n characters
 284      # ------------------------------------------------------------
 285      def skip(self,n):
 286          self.lexpos += n
 287
 288      # ------------------------------------------------------------
 289      # opttoken() - Return the next token from the Lexer
 290      #
 291      # Note: This function has been carefully implemented to be as fast
 292      # as possible. Don't make changes unless you really know what
 293      # you are doing
 294      # ------------------------------------------------------------
 295      def token(self):
 296          # Make local copies of frequently referenced attributes
 297          lexpos = self.lexpos
 298          lexlen = self.lexlen
 299          lexignore = self.lexignore
 300          lexdata = self.lexdata
 301
 302          while lexpos < lexlen:
 303              # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
 304              if lexdata[lexpos] in lexignore:
 305                  lexpos += 1
 306                  continue
 307
 308              # Look for a regular expression match
 309              for lexre, lexindexfunc in self.lexre:
 310                  m = lexre.match(lexdata,lexpos)
 311                  if not m: continue
 312
 313                  # Create a token for return
 314                  tok = LexToken()
 315                  tok.value = m.group()
 316                  tok.lineno = self.lineno
 317                  tok.lexpos = lexpos
 318
 319                  i = m.lastindex
 320                  func,tok.type = lexindexfunc[i]
 321
 322                  if not func:
 323                      # If no token type was set, it's an ignored token
 324                      if tok.type:
 325                          self.lexpos = m.end()
 326                          return tok
 327                      else:
 328                          lexpos = m.end()
 329                          break
 330
 331                  lexpos = m.end()
 332
 333                  # If token is processed by a function, call it
 334
 335                  tok.lexer = self      # Set additional attributes useful in token rules
 336                  self.lexmatch = m
 337                  self.lexpos = lexpos
 338
 339                  newtok = func(tok)
 340
 341                  # Every function must return a token, if nothing, we just move to next token
 342                  if not newtok:
 343                      lexpos = self.lexpos        # This is here in case user has updated lexpos.
 344                      lexignore = self.lexignore  # This is here in case there was a state change
 345                      break
 346
 347                  # Verify type of the token. If not in the token map, raise an error
 348                  if not self.lexoptimize:
 349                      if not newtok.type in self.lextokens:
 350                          raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
 351                              func_code(func).co_filename, func_code(func).co_firstlineno,
 352                              func.__name__, newtok.type),lexdata[lexpos:])
 353
 354                  return newtok
 355              else:
 356                  # No match, see if in literals
 357                  if lexdata[lexpos] in self.lexliterals:
 358                      tok = LexToken()
 359                      tok.value = lexdata[lexpos]
 360                      tok.lineno = self.lineno
 361                      tok.type = tok.value
 362                      tok.lexpos = lexpos
 363                      self.lexpos = lexpos + 1
 364                      return tok
 365
 366                  # No match. Call t_error() if defined.
 367                  if self.lexerrorf:
 368                      tok = LexToken()
 369                      tok.value = self.lexdata[lexpos:]
 370                      tok.lineno = self.lineno
 371                      tok.type = "error"
 372                      tok.lexer = self
 373                      tok.lexpos = lexpos
 374                      self.lexpos = lexpos
 375                      newtok = self.lexerrorf(tok)
 376                      if lexpos == self.lexpos:
 377                          # Error method didn't change text position at all. This is an error.
 378                          raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
 379                      lexpos = self.lexpos
 380                      if not newtok: continue
 381                      return newtok
 382
 383                  self.lexpos = lexpos
 384                  raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])
 385
 386          self.lexpos = lexpos + 1
 387          if self.lexdata is None:
 388              raise RuntimeError("No input string given with input()")
 389          return None
 390
 391      # Iterator interface
 392      def __iter__(self):
 393          return self
 394
 395      def next(self):
 396          t = self.token()
 397          if t is None:
 398              raise StopIteration
 399          return t
 400
 401      __next__ = next
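The comment block ahead of the Lexer class above describes its public runtime interface: input(), token(), clone(), plus the lineno and lexpos attributes. The fragment below is not part of the pywbem.lex source; it is a minimal sketch, assuming the module is importable as pywbem.lex and used in the usual PLY fashion, of how that interface is typically driven from a small set of token rules.

# Illustrative sketch only -- not part of the module listed above.
from pywbem import lex

tokens = ('NUMBER', 'PLUS')            # token names the rules below may return

t_PLUS = r'\+'                         # simple rules are plain regex strings
t_ignore = ' \t'                       # characters skipped between tokens

def t_NUMBER(t):
    r'\d+'
    t.value = int(t.value)             # rule functions may rewrite t.value
    return t

def t_error(t):
    t.lexer.skip(1)                    # skip the offending character

lexer = lex.lex()                      # build a Lexer from the rules above
lexer.input("3 + 14")                  # store the string to tokenize
for tok in lexer:                      # __iter__/next() call token() until it returns None
    print("%s %r %d %d" % (tok.type, tok.value, tok.lineno, tok.lexpos))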
 402
 403  # -----------------------------------------------------------------------------
 404  #                          ==== Lex Builder ===
 405  #
 406  # The functions and classes below are used to collect lexing information
 407  # and build a Lexer object from it.
 408  # -----------------------------------------------------------------------------
 409
 410  # -----------------------------------------------------------------------------
 411  # get_caller_module_dict()
 412  #
 413  # This function returns a dictionary containing all of the symbols defined within
 414  # a caller further down the call stack. This is used to get the environment
 415  # associated with the yacc() call if none was provided.
 416  # -----------------------------------------------------------------------------
 417
 418  def get_caller_module_dict(levels):
 419      try:
 420          raise RuntimeError
 421      except RuntimeError:
 422          e,b,t = sys.exc_info()
 423          f = t.tb_frame
 424          while levels > 0:
 425              f = f.f_back
 426              levels -= 1
 427          ldict = f.f_globals.copy()
 428          if f.f_globals != f.f_locals:
 429              ldict.update(f.f_locals)
 430
 431          return ldict
 432
 433  # -----------------------------------------------------------------------------
 434  # _funcs_to_names()
 435  #
 436  # Given a list of regular expression functions, this converts it to a list
 437  # suitable for output to a table file
 438  # -----------------------------------------------------------------------------
 439
 440  def _funcs_to_names(funclist,namelist):
 441      result = []
 442      for f,name in zip(funclist,namelist):
 443          if f and f[0]:
 444              result.append((name, f[1]))
 445          else:
 446              result.append(f)
 447      return result
 448
 449  # -----------------------------------------------------------------------------
 450  # _names_to_funcs()
 451  #
 452  # Given a list of regular expression function names, this converts it back to
 453  # functions.
 454  # -----------------------------------------------------------------------------
 455
 456  def _names_to_funcs(namelist,fdict):
 457      result = []
 458      for n in namelist:
 459          if n and n[0]:
 460              result.append((fdict[n[0]],n[1]))
 461          else:
 462              result.append(n)
 463      return result
 464
 465  # -----------------------------------------------------------------------------
 466  # _form_master_re()
 467  #
 468  # This function takes a list of all of the regex components and attempts to
 469  # form the master regular expression. Given limitations in the Python re
 470  # module, it may be necessary to break the master regex into separate expressions.
 471  # -----------------------------------------------------------------------------
 472
 473  def _form_master_re(relist,reflags,ldict,toknames):
 474      if not relist: return []
 475      regex = "|".join(relist)
 476      try:
 477          lexre = re.compile(regex,re.VERBOSE | reflags)
 478
 479          # Build the index to function map for the matching engine
 480          lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)
 481          lexindexnames = lexindexfunc[:]
 482
 483          for f,i in lexre.groupindex.items():
 484              handle = ldict.get(f,None)
 485              if type(handle) in (types.FunctionType, types.MethodType):
 486                  lexindexfunc[i] = (handle,toknames[f])
 487                  lexindexnames[i] = f
 488              elif handle is not None:
 489                  lexindexnames[i] = f
 490                  if f.find("ignore_") > 0:
 491                      lexindexfunc[i] = (None,None)
 492                  else:
 493                      lexindexfunc[i] = (None, toknames[f])
 494
 495          return [(lexre,lexindexfunc)],[regex],[lexindexnames]
 496      except Exception:
 497          m = int(len(relist)/2)
 498          if m == 0: m = 1
 499          llist, lre, lnames = _form_master_re(relist[:m],reflags,ldict,toknames)
 500          rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames)
 501          return llist+rlist, lre+rre, lnames+rnames
 502
 503  # -----------------------------------------------------------------------------
 504  # def _statetoken(s,names)
 505  #
 506  # Given a declaration name s of the form "t_" and a dictionary whose keys are
 507  # state names, this function returns a tuple (states,tokenname) where states
 508  # is a tuple of state names and tokenname is the name of the token. For example,
 509  # calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
 510  # -----------------------------------------------------------------------------
 511
 512  def _statetoken(s,names):
 513      nonstate = 1
 514      parts = s.split("_")
 515      for i in range(1,len(parts)):
 516          if not parts[i] in names and parts[i] != 'ANY': break
 517      if i > 1:
 518          states = tuple(parts[1:i])
 519      else:
 520          states = ('INITIAL',)
 521
 522      if 'ANY' in states:
 523          states = tuple(names)
 524
 525      tokenname = "_".join(parts[i:])
 526      return (states,tokenname)
 527
 528
 529  # -----------------------------------------------------------------------------
 530  # LexerReflect()
 531  #
 532  # This class represents information needed to build a lexer as extracted from a
 533  # user's input file.
 534  # -----------------------------------------------------------------------------
 535  class LexerReflect(object):
 536      def __init__(self,ldict,log=None,reflags=0):
 537          self.ldict = ldict
 538          self.error_func = None
 539          self.tokens = []
 540          self.reflags = reflags
 541          self.stateinfo = { 'INITIAL' : 'inclusive'}
 542          self.files = {}
 543          self.error = 0
 544
 545          if log is None:
 546              self.log = PlyLogger(sys.stderr)
 547          else:
 548              self.log = log
 549
 550      # Get all of the basic information
 551      def get_all(self):
 552          self.get_tokens()
 553          self.get_literals()
 554          self.get_states()
 555          self.get_rules()
 556
 557      # Validate all of the information
 558      def validate_all(self):
 559          self.validate_tokens()
 560          self.validate_literals()
 561          self.validate_rules()
 562          return self.error
 563
 564      # Get the tokens map
 565      def get_tokens(self):
 566          tokens = self.ldict.get("tokens",None)
 567          if not tokens:
 568              self.log.error("No token list is defined")
 569              self.error = 1
 570              return
 571
 572          if not isinstance(tokens,(list, tuple)):
 573              self.log.error("tokens must be a list or tuple")
 574              self.error = 1
 575              return
 576
 577          if not tokens:
 578              self.log.error("tokens is empty")
 579              self.error = 1
 580              return
 581
 582          self.tokens = tokens
 583
 584      # Validate the tokens
 585      def validate_tokens(self):
 586          terminals = {}
 587          for n in self.tokens:
 588              if not _is_identifier.match(n):
 589                  self.log.error("Bad token name '%s'",n)
 590                  self.error = 1
 591              if n in terminals:
 592                  self.log.warning("Token '%s' multiply defined", n)
 593              terminals[n] = 1
 594
 595      # Get the literals specifier
 596      def get_literals(self):
 597          self.literals = self.ldict.get("literals","")
 598
 599      # Validate literals
 600      def validate_literals(self):
 601          try:
 602              for c in self.literals:
 603                  if not isinstance(c,StringTypes) or len(c) > 1:
 604                      self.log.error("Invalid literal %s. Must be a single character", repr(c))
 605                      self.error = 1
 606                      continue
 607
 608          except TypeError:
 609              self.log.error("Invalid literals specification. literals must be a sequence of characters")
 610              self.error = 1
 611
 612      def get_states(self):
 613          self.states = self.ldict.get("states",None)
 614          # Build statemap
 615          if self.states:
 616              if not isinstance(self.states,(tuple,list)):
 617                  self.log.error("states must be defined as a tuple or list")
 618                  self.error = 1
 619              else:
 620                  for s in self.states:
 621                      if not isinstance(s,tuple) or len(s) != 2:
 622                          self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')",repr(s))
 623                          self.error = 1
 624                          continue
 625                      name, statetype = s
 626                      if not isinstance(name,StringTypes):
 627                          self.log.error("State name %s must be a string", repr(name))
 628                          self.error = 1
 629                          continue
 630                      if not (statetype == 'inclusive' or statetype == 'exclusive'):
 631                          self.log.error("State type for state %s must be 'inclusive' or 'exclusive'",name)
 632                          self.error = 1
 633                          continue
 634                      if name in self.stateinfo:
 635                          self.log.error("State '%s' already defined",name)
 636                          self.error = 1
 637                          continue
 638                      self.stateinfo[name] = statetype
 639
 640      # Get all of the symbols with a t_ prefix and sort them into various
 641      # categories (functions, strings, error functions, and ignore characters)
 642
 643      def get_rules(self):
 644          tsymbols = [f for f in self.ldict if f[:2] == 't_' ]
 645
 646          # Now build up a list of functions and a list of strings
 647
 648          self.toknames = { }        # Mapping of symbols to token names
 649          self.funcsym = { }         # Symbols defined as functions
 650          self.strsym = { }          # Symbols defined as strings
 651          self.ignore = { }          # Ignore strings by state
 652          self.errorf = { }          # Error functions by state
 653
 654          for s in self.stateinfo:
 655              self.funcsym[s] = []
 656              self.strsym[s] = []
 657
 658          if len(tsymbols) == 0:
 659              self.log.error("No rules of the form t_rulename are defined")
 660              self.error = 1
 661              return
 662
 663          for f in tsymbols:
 664              t = self.ldict[f]
 665              states, tokname = _statetoken(f,self.stateinfo)
 666              self.toknames[f] = tokname
 667
 668              if hasattr(t,"__call__"):
 669                  if tokname == 'error':
 670                      for s in states:
 671                          self.errorf[s] = t
 672                  elif tokname == 'ignore':
 673                      line = func_code(t).co_firstlineno
 674                      file = func_code(t).co_filename
 675                      self.log.error("%s:%d: Rule '%s' must be defined as a string",file,line,t.__name__)
 676                      self.error = 1
 677                  else:
 678                      for s in states:
 679                          self.funcsym[s].append((f,t))
 680              elif isinstance(t, StringTypes):
 681                  if tokname == 'ignore':
 682                      for s in states:
 683                          self.ignore[s] = t
 684                      if "\\" in t:
 685                          self.log.warning("%s contains a literal backslash '\\'",f)
 686
 687                  elif tokname == 'error':
 688                      self.log.error("Rule '%s' must be defined as a function", f)
 689                      self.error = 1
 690                  else:
 691                      for s in states:
 692                          self.strsym[s].append((f,t))
 693              else:
 694                  self.log.error("%s not defined as a function or string", f)
 695                  self.error = 1
 696
 697          # Sort the functions by line number
 698          for f in self.funcsym.values():
 699              if sys.version_info[0] < 3:
 700                  f.sort(lambda x,y: cmp(func_code(x[1]).co_firstlineno,func_code(y[1]).co_firstlineno))
 701              else:
 702                  # Python 3.0
 703                  f.sort(key=lambda x: func_code(x[1]).co_firstlineno)
 704
 705          # Sort the strings by regular expression length
 706          for s in self.strsym.values():
 707              if sys.version_info[0] < 3:
 708                  s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
 709              else:
 710                  # Python 3.0
 711                  s.sort(key=lambda x: len(x[1]),reverse=True)
 712
 713      # Validate all of the t_rules collected
 714      def validate_rules(self):
 715          for state in self.stateinfo:
 716              # Validate all rules defined by functions
 717
 718
 719
 720              for fname, f in self.funcsym[state]:
 721                  line = func_code(f).co_firstlineno
 722                  file = func_code(f).co_filename
 723                  self.files[file] = 1
 724
 725                  tokname = self.toknames[fname]
 726                  if isinstance(f, types.MethodType):
 727                      reqargs = 2
 728                  else:
 729                      reqargs = 1
 730                  nargs = func_code(f).co_argcount
 731                  if nargs > reqargs:
 732                      self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
 733                      self.error = 1
 734                      continue
 735
 736                  if nargs < reqargs:
 737                      self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
 738                      self.error = 1
 739                      continue
 740
 741                  if not f.__doc__:
 742                      self.log.error("%s:%d: No regular expression defined for rule '%s'",file,line,f.__name__)
 743                      self.error = 1
 744                      continue
 745
 746                  try:
 747                      c = re.compile("(?P<%s>%s)" % (fname,f.__doc__), re.VERBOSE | self.reflags)
 748                      if c.match(""):
 749                          self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file,line,f.__name__)
 750                          self.error = 1
 751                  except re.error:
 752                      _etype, e, _etrace = sys.exc_info()
 753                      self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file,line,f.__name__,e)
 754                      if '#' in f.__doc__:
 755                          self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'",file,line, f.__name__)
 756                      self.error = 1
 757
 758              # Validate all rules defined by strings
 759              for name,r in self.strsym[state]:
 760                  tokname = self.toknames[name]
 761                  if tokname == 'error':
 762                      self.log.error("Rule '%s' must be defined as a function", name)
 763                      self.error = 1
 764                      continue
 765
 766                  if not tokname in self.tokens and tokname.find("ignore_") < 0:
 767                      self.log.error("Rule '%s' defined for an unspecified token %s",name,tokname)
 768                      self.error = 1
 769                      continue
 770
 771                  try:
 772                      c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | self.reflags)
 773                      if (c.match("")):
 774                          self.log.error("Regular expression for rule '%s' matches empty string",name)
 775                          self.error = 1
 776                  except re.error:
 777                      _etype, e, _etrace = sys.exc_info()
 778                      self.log.error("Invalid regular expression for rule '%s'. %s",name,e)
 779                      if '#' in r:
 780                          self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'",name)
 781                      self.error = 1
 782
 783              if not self.funcsym[state] and not self.strsym[state]:
 784                  self.log.error("No rules defined for state '%s'",state)
 785                  self.error = 1
 786
 787              # Validate the error function
 788              efunc = self.errorf.get(state,None)
 789              if efunc:
 790                  f = efunc
 791                  line = func_code(f).co_firstlineno
 792                  file = func_code(f).co_filename
 793                  self.files[file] = 1
 794
 795                  if isinstance(f, types.MethodType):
 796                      reqargs = 2
 797                  else:
 798                      reqargs = 1
 799                  nargs = func_code(f).co_argcount
 800                  if nargs > reqargs:
 801                      self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
 802                      self.error = 1
 803
 804                  if nargs < reqargs:
 805                      self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
 806                      self.error = 1
 807
 808          for f in self.files:
 809              self.validate_file(f)
 810
 811
 812      # -----------------------------------------------------------------------------
 813      # validate_file()
 814      #
 815      # This checks to see if there are duplicated t_rulename() functions or strings
 816      # in the parser input file. This is done using a simple regular expression
 817      # match on each line in the given file.
 818      # -----------------------------------------------------------------------------
 819
 820      def validate_file(self,filename):
 821          import os.path
 822          base,ext = os.path.splitext(filename)
 823          if ext != '.py': return         # No idea what the file is. Return OK
 824
 825          try:
 826              f = open(filename)
 827              lines = f.readlines()
 828              f.close()
 829          except IOError:
 830              return                      # Couldn't find the file. Don't worry about it
 831
 832          fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
 833          sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
 834
 835          counthash = { }
 836          linen = 1
 837          for l in lines:
 838              m = fre.match(l)
 839              if not m:
 840                  m = sre.match(l)
 841              if m:
 842                  name = m.group(1)
 843                  prev = counthash.get(name)
 844                  if not prev:
 845                      counthash[name] = linen
 846                  else:
 847                      self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d",filename,linen,name,prev)
 848                      self.error = 1
 849              linen += 1
 850
 851  # -----------------------------------------------------------------------------
 852  # lex(module)
 853  #
 854  # Build all of the regular expression rules from definitions in the supplied module
 855  # -----------------------------------------------------------------------------
 856  def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0,outputdir="", debuglog=None, errorlog=None):
 857      global lexer
 858      ldict = None
 859      stateinfo = { 'INITIAL' : 'inclusive'}
 860      lexobj = Lexer()
 861      lexobj.lexoptimize = optimize
 862      global token,input
 863
 864      if errorlog is None:
 865          errorlog = PlyLogger(sys.stderr)
 866
 867      if debug:
 868          if debuglog is None:
 869              debuglog = PlyLogger(sys.stderr)
 870
 871      # Get the module dictionary used for the lexer
 872      if object: module = object
 873
 874      if module:
 875          _items = [(k,getattr(module,k)) for k in dir(module)]
 876          ldict = dict(_items)
 877      else:
 878          ldict = get_caller_module_dict(2)
 879
 880      # Collect parser information from the dictionary
 881      linfo = LexerReflect(ldict,log=errorlog,reflags=reflags)
 882      linfo.get_all()
 883      if not optimize:
 884          if linfo.validate_all():
 885              raise SyntaxError("Can't build lexer")
 886
 887      if optimize and lextab:
 888          try:
 889              lexobj.readtab(lextab,ldict)
 890              token = lexobj.token
 891              input = lexobj.input
 892              lexer = lexobj
 893              return lexobj
 894
 895          except ImportError:
 896              pass
 897
 898      # Dump some basic debugging information
 899      if debug:
 900          debuglog.info("lex: tokens = %r", linfo.tokens)
 901          debuglog.info("lex: literals = %r", linfo.literals)
 902          debuglog.info("lex: states = %r", linfo.stateinfo)
 903
 904      # Build a dictionary of valid token names
 905      lexobj.lextokens = { }
 906      for n in linfo.tokens:
 907          lexobj.lextokens[n] = 1
 908
 909      # Get literals specification
 910      if isinstance(linfo.literals,(list,tuple)):
 911          lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
 912      else:
 913          lexobj.lexliterals = linfo.literals
 914
 915      # Get the stateinfo dictionary
 916      stateinfo = linfo.stateinfo
 917
 918      regexs = { }
 919      # Build the master regular expressions
 920      for state in stateinfo:
 921          regex_list = []
 922
 923          # Add rules defined by functions first
 924          for fname, f in linfo.funcsym[state]:
 925              line = func_code(f).co_firstlineno
 926              file = func_code(f).co_filename
 927              regex_list.append("(?P<%s>%s)" % (fname,f.__doc__))
 928              if debug:
 929                  debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",fname,f.__doc__, state)
 930
 931          # Now add all of the simple rules
 932          for name,r in linfo.strsym[state]:
 933              regex_list.append("(?P<%s>%s)" % (name,r))
 934              if debug:
 935                  debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",name,r, state)
 936
 937          regexs[state] = regex_list
 938
 939      # Build the master regular expressions
 940
 941      if debug:
 942          debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")
 943
 944      for state in regexs:
 945          lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,linfo.toknames)
 946          lexobj.lexstatere[state] = lexre
 947          lexobj.lexstateretext[state] = re_text
 948          lexobj.lexstaterenames[state] = re_names
 949          if debug:
 950              for i in range(len(re_text)):
 951                  debuglog.info("lex: state '%s' : regex[%d] = '%s'",state, i, re_text[i])
 952
 953      # For inclusive states, we need to add the regular expressions from the INITIAL state
 954      for state,stype in stateinfo.items():
 955          if state != "INITIAL" and stype == 'inclusive':
 956              lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
 957              lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
 958              lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])
 959
 960      lexobj.lexstateinfo = stateinfo
 961      lexobj.lexre = lexobj.lexstatere["INITIAL"]
 962      lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
 963
 964      # Set up ignore variables
 965      lexobj.lexstateignore = linfo.ignore
 966      lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")
 967
 968      # Set up error functions
 969      lexobj.lexstateerrorf = linfo.errorf
 970      lexobj.lexerrorf = linfo.errorf.get("INITIAL",None)
 971      if not lexobj.lexerrorf:
 972          errorlog.warning("No t_error rule is defined")
 973
 974      # Check state information for ignore and error rules
 975      for s,stype in stateinfo.items():
 976          if stype == 'exclusive':
 977              if not s in linfo.errorf:
 978                  errorlog.warning("No error rule is defined for exclusive state '%s'", s)
 979              if not s in linfo.ignore and lexobj.lexignore:
 980                  errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
 981          elif stype == 'inclusive':
 982              if not s in linfo.errorf:
 983                  linfo.errorf[s] = linfo.errorf.get("INITIAL",None)
 984              if not s in linfo.ignore:
 985                  linfo.ignore[s] = linfo.ignore.get("INITIAL","")
 986
 987      # Create global versions of the token() and input() functions
 988      token = lexobj.token
 989      input = lexobj.input
 990      lexer = lexobj
 991
 992      # If in optimize mode, we write the lextab
 993      if lextab and optimize:
 994          lexobj.writetab(lextab,outputdir)
 995
 996      return lexobj
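Besides building the master regular expressions, lex() above wires up the optional states declaration collected by LexerReflect, which the Lexer then drives through begin(), push_state(), and pop_state(). The fragment below is not part of the pywbem.lex source; it is a hedged sketch of how an exclusive state could be declared and used, with every rule, state, and token name invented for illustration.

# Illustrative sketch only -- the 'quoted' state and all rules here are examples.
from pywbem import lex

tokens = ('WORD',)

states = (('quoted', 'exclusive'),)    # (statename, 'exclusive'|'inclusive') pairs

t_ignore = ' \t'
t_WORD = r'[A-Za-z]+'

def t_begin_quoted(t):
    r'"'
    t.lexer.push_state('quoted')       # save the current state, then begin('quoted')

def t_quoted_end(t):
    r'"'
    t.lexer.pop_state()                # restore the previous state

t_quoted_WORD = r'[^"]+'               # string rule active only in the 'quoted' state
t_quoted_ignore = ''                   # nothing is implicitly skipped in 'quoted'

def t_quoted_error(t):
    t.lexer.skip(1)

def t_error(t):
    t.lexer.skip(1)

lexer = lex.lex()
lexer.input('hello "quoted text" world')
print([(tok.type, tok.value) for tok in lexer])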
 997
 998  # -----------------------------------------------------------------------------
 999  # runmain()
1000  #
1001  # This runs the lexer as a main program
1002  # -----------------------------------------------------------------------------
1003
1004  def runmain(lexer=None,data=None):
1005      if not data:
1006          try:
1007              filename = sys.argv[1]
1008              f = open(filename)
1009              data = f.read()
1010              f.close()
1011          except IndexError:
1012              sys.stdout.write("Reading from standard input (type EOF to end):\n")
1013              data = sys.stdin.read()
1014
1015      if lexer:
1016          _input = lexer.input
1017      else:
1018          _input = input
1019      _input(data)
1020      if lexer:
1021          _token = lexer.token
1022      else:
1023          _token = token
1024
1025      while 1:
1026          tok = _token()
1027          if not tok: break
1028          sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos))
1029
1030  # -----------------------------------------------------------------------------
1031  # @TOKEN(regex)
1032  #
1033  # This decorator function can be used to set the regex expression on a function
1034  # when its docstring might need to be set in an alternative way
1035  # -----------------------------------------------------------------------------
1036
1037  def TOKEN(r):
1038      def set_doc(f):
1039          if hasattr(r,"__call__"):
1040              f.__doc__ = r.__doc__
1041          else:
1042              f.__doc__ = r
1043          return f
1044      return set_doc
1045
1046  # Alternative spelling of the TOKEN decorator
1047  Token = TOKEN
1048
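The TOKEN decorator at the end of the listing simply copies a regular expression (or another callable's docstring) onto a rule function's __doc__, so the pattern can be composed at runtime rather than written literally in the docstring. A brief sketch follows; it is not part of the module, and the identifier pattern and t_ID rule are invented for this example.

# Illustrative sketch only -- 'identifier' and t_ID are hypothetical names.
from pywbem.lex import TOKEN

identifier = r'[A-Za-z_][A-Za-z0-9_]*'

@TOKEN(identifier)
def t_ID(t):
    # equivalent to giving t_ID the docstring r'[A-Za-z_][A-Za-z0-9_]*'
    return t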