18 """Tokenize C++ source code."""
20 __author__ =
'nnorwitz@google.com (Neal Norwitz)'
28 import __builtin__
as builtins
36 if not hasattr(builtins,
'set'):
38 from sets
import Set
as set
# Character classes used when scanning identifiers and numeric literals.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('01234567890eE-+')

# Prefixes that may introduce a C++ string/char literal (raw and
# unicode-encoding variants).
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))

# Token-type tag for pre-processor directives.
PREPROCESSOR = 'PREPROCESSOR'

# Where a token was drawn from: the input stream, or a pushed-back queue.
WHENCE_STREAM, WHENCE_QUEUE = range(2)
# NOTE(review): fragment of the Token class; the `class` header line, the
# attribute assignments in __init__, and the condition separating the two
# `return` statements are not visible in this chunk -- only comments added.
65 """Data container to represent a C++ token.
67 Tokens can be identifiers, syntax char(s), constants, or
68 pre-processor directives.
70 start contains the index of the first char of the token in the source
71 end contains the index of the last char of the token in the source
# token_type: category tag; name: the token's text;
# start/end: character offsets of the token in the source string.
74 def __init__(self, token_type, name, start, end):
# Compact representation: token text only.
83 return 'Token(%r)' % self.
name
# Verbose representation including the source offsets; presumably selected
# by a debug flag on a line not visible here -- TODO confirm.
84 return 'Token(%r, %s, %s)' % (self.
name, self.
start, self.
end)
# NOTE(review): fragments of the string-literal and char-literal scanners;
# the enclosing `def` lines and their `return` statements are not visible in
# this chunk -- only comments added.
# Advance to the next double quote.
90 i = source.find(
'"', i+1)
# Keep scanning while the quote just found is preceded by a backslash.
91 while source[i-1] ==
'\\':
# Count the run of consecutive backslashes before the quote.
95 while source[j] ==
'\\':
# An even backslash count means the quote itself is NOT escaped.
99 if (backslash_count % 2) == 0:
101 i = source.find(
'"', i+1)
# Char-literal scanner: advance to the next single quote.
107 i = source.find(
"'", i+1)
# Skip quotes escaped by a backslash...
108 while source[i-1] ==
'\\':
# ...unless the backslash is itself escaped (e.g. '\\').
110 if (i - 2) > start
and source[i-2] ==
'\\':
112 i = source.find(
"'", i+1)
# NOTE(review): fragment of the tokenizer generator; the `def` line and many
# statements are missing from this chunk, and the visible statements are
# wrapped mid-expression -- only comments added between statements.
120 """Returns a sequence of Tokens.
123 source: string of C++ source code.
126 Token that represents the next token in the source.
# Bind module-level lookup tables to locals (hot-loop aliasing).
129 valid_identifier_chars = VALID_IDENTIFIER_CHARS
130 hex_digits = HEX_DIGITS
131 int_or_float_digits = INT_OR_FLOAT_DIGITS
# Float literals may additionally contain '.'.
132 int_or_float_digits2 = int_or_float_digits |
set(
'.')
# Set while inside a `#if 0`-style region (see the #endif handling below).
135 ignore_errors =
False
# Skip whitespace between tokens.
142 while i < end
and source[i].isspace():
# Identifier: leading letter or underscore, then identifier characters.
150 if c.isalpha()
or c ==
'_':
152 while source[i]
in valid_identifier_chars:
# A single u/U/L prefix immediately followed by a quote starts a literal.
156 if (source[i] ==
"'" and (i - start) == 1
and
157 source[start:i]
in 'uUL'):
159 token_type = CONSTANT
# Multi-char literal prefix (R, u8, uR, ...) followed by a quote.
161 elif source[i] ==
"'" and source[start:i]
in _STR_PREFIXES:
162 token_type = CONSTANT
# Line comment: consume to the end of line.
164 elif c ==
'/' and source[i+1] ==
'/':
165 i = source.find(
'\n', i)
# Block comment: consume through the closing */.
169 elif c ==
'/' and source[i+1] ==
'*':
170 i = source.find(
'*/', i) + 2
# One- or two-character operators.
172 elif c
in ':+-<>&|*=':
# Doubled operator (e.g. ++, <<) except for '>' (avoids >> in templates
# -- presumably; confirm against the missing surrounding lines).
176 if new_ch == c
and c !=
'>':
# Arrow operator '->'.
178 elif c ==
'-' and new_ch ==
'>':
# Single-character syntax tokens.
182 elif c
in '()[]{}~!?^%;/.,':
# A '.' followed by a digit begins a float constant.
185 if c ==
'.' and source[i].isdigit():
186 token_type = CONSTANT
188 while source[i]
in int_or_float_digits:
# Optional single-character numeric suffix.
191 for suffix
in (
'l',
'f'):
192 if suffix == source[i:i+1].
lower():
# Numeric constant: hex (0x...) or decimal/float form.
196 token_type = CONSTANT
197 if c ==
'0' and source[i+1]
in 'xX':
200 while source[i]
in hex_digits:
203 while source[i]
in int_or_float_digits2:
# Optional integer/float suffixes, longest first.
206 for suffix
in (
'ull',
'll',
'ul',
'l',
'f',
'u'):
208 if suffix == source[i:i+size].
lower():
# String literal.
212 token_type = CONSTANT
# Char literal.
215 token_type = CONSTANT
# Pre-processor directive; track #if/#endif to skip dead regions.
218 token_type = PREPROCESSOR
219 got_if = source[i:i+3] ==
'#if' and source[i+3:i+4].isspace()
222 elif source[i:i+6] ==
'#endif':
# Leaving the conditional region: stop ignoring errors.
225 ignore_errors =
False
# Find the directive's end: next newline, comment start, or string start.
229 i1 = source.find(
'\n', i)
230 i2 = source.find(
'//', i)
231 i3 = source.find(
'/*', i)
232 i4 = source.find(
'"', i)
235 i =
min([x
for x
in (i1, i2, i3, i4, end)
if x != -1])
# Skip over a string literal embedded in the directive.
239 i = source.find(
'"', i+1) + 1
# A trailing backslash continues the directive on the next line.
243 if not (i == i1
and source[i-1] ==
'\\'):
# `#if 0` / `#if (0)`: errors inside the dead block are tolerated.
245 condition = source[start+4:i].lstrip()
246 if (condition.startswith(
'0')
or
247 condition.startswith(
'(0)')):
# Unrecognized character: report surrounding context and fail.
263 sys.stderr.write(
'Got invalid token in %s @ %d token:%s: %r\n' %
264 (
'?', i, c, source[i-10:i+10]))
265 raise RuntimeError(
'unexpected token')
268 print(
'Invalid index, exiting now.')
# Emit the token covering source[start:i].
270 yield Token(token_type, source[start:i], start, i)
# NOTE(review): fragment of the self-test driver; the bindings of `argv` and
# `utils`, and the inner token-loop header, are on lines not visible in this
# chunk -- confirm before relying on it. Only comments added.
273 if __name__ ==
'__main__':
275 """Driver mostly for testing purposes."""
# Tokenize each file named on the command line.
276 for filename
in argv[1:]:
277 source = utils.ReadFile(filename)
# Print one "token_type: name" line per token.
282 print(
'%-12s: %s' % (token.token_type, token.name))
284 sys.stdout.write(
'\n')