|
|
| Is the interpretation of backslash-newline capable of splitting commenting sequences on other implementations? |
|
|
|
|
|
|
strings s = "/*//*/"; tags a comment inside of the string literal. | ;/* multi-line comment */ includes the semicolon in the comment. | #define // first line does not tag preprocessor directives as the first line. | /* newline then: #define */ tags the preprocessor directive, even inside the block comment. |
|
|
|
# Define the lexical elements of C/C++ source as regex fragments.
#
# Each fragment is a plain string so that it can be concatenated into a
# larger pattern.  The tokenizer compiles the combined pattern with /s and
# /x, therefore every fragment writes '#' as '\#' and only uses whitespace
# inside bracketed character classes.
my $identifier = '\b[_[:alpha:]]\w*\b';
# identifiers are enclosed by word boundaries, contain
# alphanumeric text plus the underscore, but cannot
# begin with a number
my $relative = '"[^"\n]*"';
my $absolute = '<[^>\n]*>';
# header names are limited to a single line: with /s in effect a '.*?'
# would let an unterminated '#include <foo' run on to the next '>' found
# anywhere later in the input
my $include = '(?:^|(?<=\n))\s*\#[ \t]*include[ \t]*?(?:'.$relative.'|'.$absolute.')';
# include directives begin at a newline and then may contain zero or
# more whitespace characters, '#' sign, zero or more whitespace
# characters, the text 'include', and then a path/filename surrounded by
# either angle brackets or double quotes
#
# the separation of include directives from other preprocessor directives
# is to support an additional requirement to hyperlink header files
my $directive = '(?:^|(?<=\n))\s*\#[ \t]*'.$identifier;
# preprocessor directives begin at a newline and then
# may contain zero or more whitespace characters,
# a '#' sign, zero or more whitespace characters,
# and finally an identifier
my $comment = '(?:/\*.*?\*/|//.*?(?=(?:\n|$)))';
# comments either begin with "/*" and end at the first
# "*/" or begin with "//" and end at the first newline
my $string = q{[lL]?"[^"\\\\]*(?:\\\\.[^"\\\\]*)*"};
# string literals may begin with an L, for wide characters, then a " and
# continue until the first " that is not part of an escape sequence.
# the body is consumed as runs of ordinary characters separated by
# backslash-plus-one-character pairs, so an escaped backslash followed by
# an escaped quote (\\\") no longer terminates the literal early.
# with /s the pair also covers a backslash-newline continuation.
# (inside q{} every '\\\\' yields the two characters '\\' in the regex)
my $literal = q{[lL]?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'};
# character literals are specified with the same pattern
# as strings except for using single quotation marks,
# although syntactically it should contain at least one
# character
my $keyword = '\b(?:and_eq|and|asm|auto|bitand|bitor|bool|break|case'.
    '|catch|char|class|compl|const_cast|const|continue'.
    '|default|delete|do|double|dynamic_cast|else|enum'.
    '|explicit|export|extern|false|float|for|friend'.
    '|goto|if|inline|int|long|mutable|namespace|new'.
    '|not_eq|not|operator|or_eq|or|private|protected'.
    '|public|register|reinterpret_cast|return|short'.
    '|signed|sizeof|static|static_cast|struct|switch'.
    '|template|this|throw|true|try|typedef|typeid'.
    '|typename|union|unsigned|using|virtual|void'.
    '|volatile|wchar_t|while|xor_eq|xor)\b';
# keywords are enclosed by word boundaries and must match
# one of the listed alternatives
my $number = '\b\d[xX]?[\daAbBcCdDeEfF]*'.
    '(?:[uU](?:ll|LL|[lL])?|(?:ll|LL|[lL])[uU]?|[dDfF])?\b';
# numbers are enclosed by word boundaries, may begin with
# 0X, contain numeric digits or hexadecimal characters,
# and may be followed by a type determination: an unsigned
# and/or long/long long suffix in either order (u, l, ul,
# lu, ll, ull, llu) or a single d/f
my $symbol = '[~!%\^&\*\(\)\+={\[}\]:/;,<\.>\?\|\-]+';
# symbols must match one or more of the listed
# alternatives
my $leftover = '[\#\\\]+';
# 'leftovers' refer to remaining '#' signs or backslashes
# that would be removed by the preprocessor. they must
# match one or more of the listed alternatives

# the following pattern is used to split the input string into tokens.
# alternatives are tried in the order listed at each position, so include
# lines win over generic directives, and comments win over strings and
# symbols.  $keyword, $identifier and $leftover are deliberately not
# delimiters: the zero-width '\b' alternative separates words, and whatever
# lies between two delimiters is returned by split as an ordinary field.
my $delim = '(?:'.$include.'|'.$directive.'|'.$comment.
    '|'.$string.  '|'.$literal.
    '|'.$number.  '|'.$symbol.
    '|'.'\n+'.    '|'.'\b'    .')';
#...
# tokenize
# Break $input apart on the delimiter pattern.  The parentheses around
# $delim are a capturing group, which makes split() hand back each matched
# delimiter as well as the text found between delimiters.
#   /s  '.' inside the pattern may also match a newline
#   /o  interpolate and compile the pattern only once
#   /x  whitespace used to lay out the pattern is not part of it
my @tokens = split m{ ( $delim ) }sox, $input;
#...
|