Analysis Software
Documentation for sPHENIX simulation software
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tokenize.py
Go to the documentation of this file, or view the newest version of tokenize.py in the sPHENIX GitHub repository.
1 #!/usr/bin/env python
2 #
3 # Copyright 2007 Neal Norwitz
4 # Portions Copyright 2007 Google Inc.
5 #
6 # Licensed under the Apache License, Version 2.0 (the "License");
7 # you may not use this file except in compliance with the License.
8 # You may obtain a copy of the License at
9 #
10 # http://www.apache.org/licenses/LICENSE-2.0
11 #
12 # Unless required by applicable law or agreed to in writing, software
13 # distributed under the License is distributed on an "AS IS" BASIS,
14 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 # See the License for the specific language governing permissions and
16 # limitations under the License.
17 
18 """Tokenize C++ source code."""
19 
20 __author__ = 'nnorwitz@google.com (Neal Norwitz)'
21 
22 
23 try:
24  # Python 3.x
25  import builtins
26 except ImportError:
27  # Python 2.x
28  import __builtin__ as builtins
29 
30 
31 import sys
32 
33 from cpp import utils
34 
35 
36 if not hasattr(builtins, 'set'):
37  # Nominal support for Python 2.3.
38  from sets import Set as set
39 
40 
# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
# Characters that may appear anywhere in an identifier ('$' included as a
# common vendor extension seen in real-world code).
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
# Characters that may continue an integer or float literal (exponent markers
# and signs included).  The duplicated '0' is harmless in a set.
INT_OR_FLOAT_DIGITS = set('01234567890eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))


# Token types (values of Token.token_type).
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from. This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)
62 
63 
class Token(object):
    """A single lexical element of C++ source.

    A token can be an identifier, one or more syntax characters, a
    constant, or a pre-processor directive.

    start and end delimit the token's extent in the source string:
    start is the index of its first character, end the index used as
    the exclusive slice bound by the tokenizer.
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        # Always WHENCE_STREAM here; WHENCE_QUEUE exists for backtracking.
        self.whence = WHENCE_STREAM

    def __str__(self):
        if utils.DEBUG:
            # Verbose form with source positions for debugging.
            return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
        return 'Token(%r)' % self.name

    __repr__ = __str__
87 
88 
89 def _GetString(source, start, i):
90  i = source.find('"', i+1)
91  while source[i-1] == '\\':
92  # Count the trailing backslashes.
93  backslash_count = 1
94  j = i - 2
95  while source[j] == '\\':
96  backslash_count += 1
97  j -= 1
98  # When trailing backslashes are even, they escape each other.
99  if (backslash_count % 2) == 0:
100  break
101  i = source.find('"', i+1)
102  return i + 1
103 
104 
105 def _GetChar(source, start, i):
106  # NOTE(nnorwitz): may not be quite correct, should be good enough.
107  i = source.find("'", i+1)
108  while source[i-1] == '\\':
109  # Need to special case '\\'.
110  if (i - 2) > start and source[i-2] == '\\':
111  break
112  i = source.find("'", i+1)
113  # Try to handle unterminated single quotes (in a #if 0 block).
114  if i < 0:
115  i = start
116  return i + 1
117 
118 
def GetTokens(source):
    """Returns a sequence of Tokens covering the entire source.

    Whitespace and comments produce no tokens; everything else is
    yielded as a NAME, SYNTAX, CONSTANT, or PREPROCESSOR Token.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed (local lookups are
    # cheaper than module globals in this hot loop).
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == "'" and source[start:i] in _STR_PREFIXES:
                # NOTE(review): this tests for "'" but then calls
                # _GetString (which scans for '"'); it looks like the
                # intent was to test for '"' after a string prefix such
                # as u8 or LR -- confirm against upstream before fixing.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF (comment on the last line).
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            # An unterminated /* yields find() == -1, so i becomes 1 and
            # scanning restarts near the top of the source.
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            # Two-char operators (::, ++, --, <<, &&, ||, ->, +=, ...).
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                # A leading '.' followed by a digit is a float constant.
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.  Longest first so
            # e.g. 'ull' is not consumed as just 'u'.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    # Leaving the outermost #if: stop ignoring errors.
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                # NOTE(review): if i == end here (directive at EOF with no
                # newline), source[i] raises IndexError -- confirm inputs
                # always end with a newline.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        # "#if 0" / "#if (0)" starts a block whose bogus
                        # contents should not raise tokenizer errors.
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape. This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled. Since we will ignore
            # it anyways, this is probably fine. So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            # A helper returned a nonsense index (e.g. unterminated
            # literal); bail out rather than loop forever.
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
271 
272 
if __name__ == '__main__':
    def main(argv):
        """Tokenize each file named on the command line and dump its tokens.

        Driver mostly for testing purposes.
        """
        for path in argv[1:]:
            contents = utils.ReadFile(path)
            if contents is None:
                # Unreadable file; ReadFile already reported it.
                continue

            for tok in GetTokens(contents):
                print('%-12s: %s' % (tok.token_type, tok.name))
            # Blank line between files.
            sys.stdout.write('\n')

    main(sys.argv)