|
|
Is the interpretation of backslash-newline capable of splitting commenting sequences on other implementations? |
|
|
|
|
|
|
strings s = "/*//*/";
tags a comment inside of the string literal.
;/* multi-line comment */
includes the semicolon in the comment.
#define // first line
does not tag preprocessor directives as the first line.
/* newline then: #define */
tags the preprocessor directive, even inside the block comment.
|
|
|
|
# Define the lexical elements of the C/C++ input as regex source strings.
#
# The patterns are kept as plain strings (not qr// objects) because they are
# concatenated into $delim and finally interpolated into a single match that
# carries the /s and /x modifiers.  Consequently every '#' inside a pattern
# is either escaped or inside a character class, and no pattern contains
# significant whitespace outside a character class.

# Identifiers are enclosed by word boundaries, contain alphanumeric text
# plus the underscore, but cannot begin with a number.
my $identifier = '\b[_[:alpha:]]\w*\b';

# Path/filename operands of an include directive: "relative" or <absolute>.
my $relative = '".*?"';
my $absolute = '<.*?>';

# Include directives begin at a newline and then may contain zero or more
# whitespace characters, a '#' sign, zero or more whitespace characters,
# the text 'include', and then a path/filename surrounded by either angle
# brackets or double quotes.
#
# Include directives are kept separate from other preprocessor directives
# to support an additional requirement to hyperlink header files.
my $include = '(?:^|(?<=\n))\s*\#[ \t]*include[ \t]*?(?:'
            . $relative . '|' . $absolute . ')';

# Preprocessor directives begin at a newline and then may contain zero or
# more whitespace characters, a '#' sign, zero or more whitespace
# characters, and finally an identifier.
my $directive = '(?:^|(?<=\n))\s*\#[ \t]*' . $identifier;

# Comments either begin with "/*" and end at the first "*/", or begin with
# "//" and end at the first newline (or the end of the input).
my $comment = '(?:/\*.*?\*/|//.*?(?=(?:\n|$)))';

# String literals may begin with an L (wide characters), then a '"', and
# continue until the first '"' that is not part of an escape sequence.
#
# The body is consumed one unit at a time: either a character that is
# neither a quote nor a backslash, or a backslash together with whatever
# character follows it.  Pairing backslashes this way is what makes an
# escaped backslash followed by an escaped quote ("a\\\"b") stay inside the
# literal; testing only the one or two characters before the quote cannot
# tell an odd run of backslashes from an even one.
my $string = '[lL]?"(?:[^"\\\\]|\\\\.)*"';

# Character literals are specified with the same pattern as strings except
# for using single quotation marks, although syntactically a character
# literal should contain at least one character.
my $literal = '[lL]?\'(?:[^\'\\\\]|\\\\.)*\'';

# Keywords are enclosed by word boundaries and must match one of the listed
# alternatives.
my $keyword = '\b(?:and_eq|and|asm|auto|bitand|bitor|bool|break|case'
            . '|catch|char|class|compl|const_cast|const|continue'
            . '|default|delete|do|double|dynamic_cast|else|enum'
            . '|explicit|export|extern|false|float|for|friend'
            . '|goto|if|inline|int|long|mutable|namespace|new'
            . '|not_eq|not|operator|or_eq|or|private|protected'
            . '|public|register|reinterpret_cast|return|short'
            . '|signed|sizeof|static|static_cast|struct|switch'
            . '|template|this|throw|true|try|typedef|typeid'
            . '|typename|union|unsigned|using|virtual|void'
            . '|volatile|wchar_t|while|xor_eq|xor)\b';

# Numbers are enclosed by word boundaries, may begin with 0X, contain
# numeric digits or hexadecimal characters, and may be followed by a single
# type-determination letter.
# NOTE(review): floating-point forms (1.5) and multi-letter suffixes (10UL)
# are not matched as one number by this pattern.
my $number = '\b\d[xX]?[\daAbBcCdDeEfF]*[lLdDfFuU]?\b';

# Symbols must match one or more of the listed punctuation characters.
my $symbol = '[~!%\^&\*\(\)\+={\[}\]:/;,<\.>\?\|\-]+';

# 'Leftovers' refer to remaining '#' signs or backslashes that would be
# removed by the preprocessor; one or more of them form a token.
my $leftover = '[\#\\\]+';

# The following pattern is used to split the input string into tokens.
# Alternatives are tried in the order listed, so at any given position an
# include/directive/comment/string/literal wins over number, symbol, a run
# of newlines, and finally a bare word boundary.  $identifier (on its own),
# $keyword and $leftover are not part of the delimiter.
my $delim = '(?:'
          . join( '|',
                $include, $directive, $comment,
                $string,  $literal,   $number,
                $symbol,  '\n+',      '\b' )
          . ')';

#...

# Tokenize.  $input is expected to be set by code elided from this excerpt.
# The /o modifier of the original is dropped: $delim never changes once it
# has been built, so there is nothing for /o to save.
my @tokens = split(
    /(          # this grouping causes split to also return delimiters
        $delim  # match by delimiter pattern
    )/sx,
    $input
);

#...