path: root/venv/lib/python3.9/site-packages/pycparser/
diff options
Diffstat (limited to 'venv/lib/python3.9/site-packages/pycparser/')
1 files changed, 554 insertions, 0 deletions
diff --git a/venv/lib/python3.9/site-packages/pycparser/ b/venv/lib/python3.9/site-packages/pycparser/
new file mode 100644
index 00000000..d68d8ebf
--- /dev/null
+++ b/venv/lib/python3.9/site-packages/pycparser/
@@ -0,0 +1,554 @@
+# pycparser:
+# CLexer class: lexer for the C language
+# Eli Bendersky []
+# License: BSD
+import re
+from .ply import lex
+from .ply.lex import TOKEN
class CLexer(object):
    """Lexer for the C language, built on PLY.

    Typical use: create the object, call build(), feed text with
    input(), then call token() repeatedly to obtain tokens.

    The public attribute ``filename`` can be set to an initial file
    name; the lexer overwrites it whenever a #line directive that
    carries a file name is processed.
    """

    def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
                 type_lookup_func):
        """Create a new lexer.

        error_func:
            Called with an error message, line and column whenever
            a lexing error is detected.

        on_lbrace_func, on_rbrace_func:
            Called (without arguments) when an LBRACE or RBRACE is
            encountered (likely to push/pop type_lookup_func's scope).

        type_lookup_func:
            Given a string, must return True IFF this string is the
            name of a type that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Keeps track of the last token returned from self.token()
        self.last_token = None

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
        self.pragma_pattern = re.compile(r'[ \t]*pragma\W')

    def build(self, **kwargs):
        """Build the underlying PLY lexer from this specification.

        Must be called after the object is created. It is a separate
        method because the PLY manual warns against calling lex.lex
        inside __init__. Keyword arguments are forwarded to lex.lex.
        """
        self.lexer = lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """Reset the internal line number counter of the lexer to 1."""
        self.lexer.lineno = 1

    def input(self, text):
        """Set the text that subsequent token() calls will scan."""
        self.lexer.input(text)

    def token(self):
        """Return the next token and remember it in ``last_token``."""
        self.last_token = self.lexer.token()
        return self.last_token

    def find_tok_column(self, token):
        """Return the column of ``token`` within its line.

        The column is the distance from the last newline that
        precedes the token. When there is no preceding newline,
        rfind yields -1, so the first character of the input is
        reported as column 1.
        """
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        # Report the problem through the user-supplied callback, then
        # skip one character so that lexing can continue if the
        # callback returns.
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        # (line, column) pair for the given token
        return (token.lineno, self.find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    # NOTE(review): this list holds only part of the C reserved words
    # (for instance there is no entry for 'char' or 'return') --
    # confirm that it is complete for the grammar that consumes it.
    keywords = (
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
        'VOLATILE', 'WHILE', '__INT128',
    )

    keywords_new = (
        '_BOOL', '_COMPLEX',
    )

    # Maps the spelling found in source code to the token name:
    # plain keywords are lower-cased ('int' -> 'INT'); the newer
    # ones keep an upper-case second character ('_Bool' -> '_BOOL').
    keyword_map = {}

    for keyword in keywords:
        keyword_map[keyword.lower()] = keyword

    for keyword in keywords_new:
        keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    # Every token type produced by a rule of this class has to be
    # listed here; the rules only return None (errors, newlines) for
    # the names that are intentionally absent.
    tokens = keywords + keywords_new + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX',
        'INT_CONST_BIN', 'INT_CONST_CHAR',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',
        'U8CHAR_CONST',
        'U16CHAR_CONST',
        'U32CHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',
        'U8STRING_LITERAL',
        'U16STRING_LITERAL',
        'U32STRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL', 'RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # . ,
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
        'PPPRAGMA',     # 'pragma'
        'PPPRAGMASTR',  # text following 'pragma'
    )

    ##
    ## Regexes for use in tokens
    ##
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'
    bin_prefix = '0[bB]'
    bin_digits = '[01]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt
    bin_constant = bin_prefix+bin_digits+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want to
    # parse all correct code, even if it means to sometimes parse incorrect
    # code.
    #
    # The original regexes were taken verbatim from the C syntax definition,
    # and were later modified to avoid worst-case exponential running time.
    #
    #   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    #   decimal_escape = r"""(\d+)"""
    #   hex_escape = r"""(x[0-9a-fA-F]+)"""
    #   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
    #
    # The following modifications were made to avoid the ambiguity that
    # allowed backtracking:
    #
    # - \x was removed from simple_escape, unless it was not followed by a
    #   hex digit, to avoid ambiguity with hex_escape.
    # - hex_escape allows one or more hex characters, but requires that the
    #   next character (if any) is not hex.
    # - decimal_escape allows one or more decimal characters, but requires
    #   that the next character (if any) is not a decimal.
    # - bad_escape does not allow any decimals (8-9), to avoid conflicting
    #   with the permissive decimal_escape.
    #
    # Without this change, python's `re` module would recursively try parsing
    # each ambiguous escape sequence in multiple ways.
    # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.

    simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
    decimal_escape = r"""(\d+)(?!\d)"""
    hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

    # This complicated regex with lookahead might be slow for strings, so
    # because all of the valid escapes (including \x) allowed 0 or more
    # non-escaped characters after the first character,
    # simple_escape+decimal_escape+hex_escape got simplified to
    escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    u8char_const = 'u8'+char_const
    u16char_const = 'u'+char_const
    u32char_const = 'U'+char_const
    multicharacter_constant = "'"+cconst_char+"{2,4}'"
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    u8string_literal = 'u8'+string_literal
    u16string_literal = 'u'+string_literal
    u32string_literal = 'U'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'

    ##
    ## Lexer states: used for preprocessor \n-terminated directives
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),

        # pppragma: pragma
        #
        ('pppragma', 'exclusive'),
    )

    # NOTE: the string right below each "def t_...(self, t):" line is the
    # regex of that rule, not documentation -- it must not be edited as text.
    def t_PPHASH(self, t):
        r'[ \t]*\#'
        # Peek at what follows the '#': a line directive and a pragma are
        # handled in their own states and produce no PPHASH token.
        if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('pppragma')
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            # The match always starts and ends with a double quote; drop
            # exactly those two delimiters so that quotes belonging to
            # the name itself (e.g. a trailing \") are preserved.
            self.pp_filename = t.value[1:-1]

    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        # Only the first number is the line number. Later ones are
        # ignored: GCC's cpp sometimes inserts a numeric flag after the
        # file name.
        if self.pp_line is None:
            self.pp_line = t.value

    def t_ppline_NEWLINE(self, t):
        r'\n'
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        # The directive ends with its line, whether it was valid or not
        t.lexer.begin('INITIAL')

    def t_ppline_PPLINE(self, t):
        r'line'
        # The 'line' keyword itself carries no information
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        self._error('invalid #line directive', t)

    ##
    ## Rules for the pppragma state
    ##
    def t_pppragma_NEWLINE(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.begin('INITIAL')

    def t_pppragma_PPPRAGMA(self, t):
        r'pragma'
        return t

    t_pppragma_ignore = ' \t'

    def t_pppragma_STR(self, t):
        '.+'
        # Everything after 'pragma' up to the end of the line
        t.type = 'PPPRAGMASTR'
        return t

    def t_pppragma_error(self, t):
        self._error('invalid #pragma directive', t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS              = r'\+'
    t_MINUS             = r'-'
    t_TIMES             = r'\*'
    t_DIVIDE            = r'/'
    t_MOD               = r'%'
    t_OR                = r'\|'
    t_AND               = r'&'
    t_NOT               = r'~'
    t_XOR               = r'\^'
    t_LSHIFT            = r'<<'
    t_RSHIFT            = r'>>'
    t_LOR               = r'\|\|'
    t_LAND              = r'&&'
    t_LNOT              = r'!'
    t_LT                = r'<'
    t_GT                = r'>'
    t_LE                = r'<='
    t_GE                = r'>='
    t_EQ                = r'=='
    t_NE                = r'!='

    # Assignment operators
    t_EQUALS            = r'='
    t_TIMESEQUAL        = r'\*='
    t_DIVEQUAL          = r'/='
    t_MODEQUAL          = r'%='
    t_PLUSEQUAL         = r'\+='
    t_MINUSEQUAL        = r'-='
    t_LSHIFTEQUAL       = r'<<='
    t_RSHIFTEQUAL       = r'>>='
    t_ANDEQUAL          = r'&='
    t_OREQUAL           = r'\|='
    t_XOREQUAL          = r'\^='

    # Increment/decrement
    t_PLUSPLUS          = r'\+\+'
    t_MINUSMINUS        = r'--'

    # ->
    t_ARROW             = r'->'

    # ?
    t_CONDOP            = r'\?'

    # Delimiters
    t_LPAREN            = r'\('
    t_RPAREN            = r'\)'
    t_LBRACKET          = r'\['
    t_RBRACKET          = r'\]'
    t_COMMA             = r','
    t_PERIOD            = r'\.'
    t_SEMI              = r';'
    t_COLON             = r':'
    t_ELLIPSIS          = r'\.\.\.'

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter.  The trouble begins with yacc's
    # lookahead token.  If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID.  So, we need
    # to open and close scopes from within the lexer.
    # Similar for the TT immediately outside the end of the function.
    #
    @TOKEN(r'\{')
    def t_LBRACE(self, t):
        self.on_lbrace_func()
        return t

    @TOKEN(r'\}')
    def t_RBRACE(self, t):
        self.on_rbrace_func()
        return t

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bin_constant)
    def t_INT_CONST_BIN(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(multicharacter_constant)
    def t_INT_CONST_CHAR(self, t):
        return t

    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(u8char_const)
    def t_U8CHAR_CONST(self, t):
        return t

    @TOKEN(u16char_const)
    def t_U16CHAR_CONST(self, t):
        return t

    @TOKEN(u32char_const)
    def t_U32CHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    @TOKEN(u8string_literal)
    def t_U8STRING_LITERAL(self, t):
        return t

    @TOKEN(u16string_literal)
    def t_U16STRING_LITERAL(self, t):
        return t

    @TOKEN(u32string_literal)
    def t_U32STRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(identifier)
    def t_ID(self, t):
        # Keywords win over identifiers; a remaining identifier becomes
        # a TYPEID when the lookup callback reports it as a typedef name.
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)