123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222 |
- /**
- * Java tokenizer for codemirror
- *
- * @author Patrick Wied
- * @version 2010-10-07
- */
- var tokenizeJava = (function() {
- // Advance the stream until the given character (not preceded by a
- // backslash) is encountered, or the end of the line is reached.
- function nextUntilUnescaped(source, end) {
- var escaped = false;
- var next;
- while (!source.endOfLine()) {
- var next = source.next();
- if (next == end && !escaped)
- return false;
- escaped = !escaped && next == "\\";
- }
- return escaped;
- }
- // A map of Java's keywords. The a/b/c keyword distinction is
- // very rough, but it gives the parser enough information to parse
- // correct code correctly (we don't care that much how we parse
- // incorrect code). The style information included in these objects
- // is used by the highlighter to pick the correct CSS style for a
- // token.
- var keywords = function(){
- function result(type, style){
- return {type: type, style: "java-" + style};
- }
- // keywords that take a parenthised expression, and then a
- // statement (if)
- var keywordA = result("keyword a", "keyword");
- // keywords that take just a statement (else)
- var keywordB = result("keyword b", "keyword");
- // keywords that optionally take an expression, and form a
- // statement (return)
- var keywordC = result("keyword c", "keyword");
- var operator = result("operator", "keyword");
- var atom = result("atom", "atom");
- return {
- "if": keywordA, "while": keywordA, "with": keywordA,
- "else": keywordB, "do": keywordB, "try": keywordB, "finally": keywordB,
- "return": keywordC, "break": keywordC, "continue": keywordC, "new": keywordC, "throw": keywordC, "throws": keywordB,
- "in": operator, "typeof": operator, "instanceof": operator,
- "catch": result("catch", "keyword"), "for": result("for", "keyword"), "switch": result("switch", "keyword"),
- "case": result("case", "keyword"), "default": result("default", "keyword"),
- "true": atom, "false": atom, "null": atom,
- "class": result("class", "keyword"), "interface": result("interface", "keyword"), "package": keywordC, "import": keywordC,
- "implements": keywordC, "extends": keywordC, "super": keywordC,
- "public": keywordC, "private": keywordC, "protected": keywordC, "transient": keywordC, "this": keywordC,
- "static": keywordC, "final": keywordC, "const": keywordC, "abstract": keywordC, "static": keywordC,
- "int": keywordC, "double": keywordC, "long": keywordC, "boolean": keywordC, "char": keywordC,
- "void": keywordC, "byte": keywordC, "float": keywordC, "short": keywordC
- };
- }();
- // Some helper regexps
- var isOperatorChar = /[+\-*&%=<>!?|]/;
- var isHexDigit = /[0-9A-Fa-f]/;
- var isWordChar = /[\w\$_]/;
- // Wrapper around javaToken that helps maintain parser state (whether
- // we are inside of a multi-line comment and whether the next token
- // could be a regular expression).
- function javaTokenState(inside, regexp) {
- return function(source, setState) {
- var newInside = inside;
- var type = javaToken(inside, regexp, source, function(c) {newInside = c;});
- var newRegexp = type.type == "operator" || type.type == "keyword c" || type.type.match(/^[\[{}\(,;:]$/);
- if (newRegexp != regexp || newInside != inside)
- setState(javaTokenState(newInside, newRegexp));
- return type;
- };
- }
- // The token reader, inteded to be used by the tokenizer from
- // tokenize.js (through jsTokenState). Advances the source stream
- // over a token, and returns an object containing the type and style
- // of that token.
- function javaToken(inside, regexp, source, setInside) {
- function readHexNumber(){
- source.next(); // skip the 'x'
- source.nextWhileMatches(isHexDigit);
- return {type: "number", style: "java-atom"};
- }
- function readNumber() {
- source.nextWhileMatches(/[0-9]/);
- if (source.equals(".")){
- source.next();
- source.nextWhileMatches(/[0-9]/);
- }
- if (source.equals("e") || source.equals("E")){
- source.next();
- if (source.equals("-"))
- source.next();
- source.nextWhileMatches(/[0-9]/);
- }
- return {type: "number", style: "java-atom"};
- }
- // Read a word, look it up in keywords. If not found, it is a
- // variable, otherwise it is a keyword of the type found.
- function readWord() {
- source.nextWhileMatches(isWordChar);
- var word = source.get();
- var known = keywords.hasOwnProperty(word) && keywords.propertyIsEnumerable(word) && keywords[word];
- return known ? {type: known.type, style: known.style, content: word} :
- {type: "variable", style: "java-variable", content: word};
- }
- function readRegexp() {
- nextUntilUnescaped(source, "/");
- source.nextWhileMatches(/[gi]/);
- return {type: "regexp", style: "java-string"};
- }
- // Mutli-line comments are tricky. We want to return the newlines
- // embedded in them as regular newline tokens, and then continue
- // returning a comment token for every line of the comment. So
- // some state has to be saved (inside) to indicate whether we are
- // inside a /* */ sequence.
- function readMultilineComment(start){
- var newInside = "/*";
- var maybeEnd = (start == "*");
- while (true) {
- if (source.endOfLine())
- break;
- var next = source.next();
- if (next == "/" && maybeEnd){
- newInside = null;
- break;
- }
- maybeEnd = (next == "*");
- }
- setInside(newInside);
- return {type: "comment", style: "java-comment"};
- }
- // for reading javadoc
- function readJavaDocComment(start){
- var newInside = "/**";
- var maybeEnd = (start == "*");
- while (true) {
- if (source.endOfLine())
- break;
- var next = source.next();
- if (next == "/" && maybeEnd){
- newInside = null;
- break;
- }
- maybeEnd = (next == "*");
- }
- setInside(newInside);
- return {type: "javadoc", style: "javadoc-comment"};
- }
- // for reading annotations (word based)
- function readAnnotation(){
- source.nextWhileMatches(isWordChar);
- var word = source.get();
- return {type: "annotation", style: "java-annotation", content:word};
- }
- function readOperator() {
- source.nextWhileMatches(isOperatorChar);
- return {type: "operator", style: "java-operator"};
- }
- function readString(quote) {
- var endBackSlash = nextUntilUnescaped(source, quote);
- setInside(endBackSlash ? quote : null);
- return {type: "string", style: "java-string"};
- }
- // Fetch the next token. Dispatches on first character in the
- // stream, or first two characters when the first is a slash.
- if (inside == "\"" || inside == "'")
- return readString(inside);
- var ch = source.next();
- if (inside == "/*")
- return readMultilineComment(ch);
- else if(inside == "/**")
- return readJavaDocComment(ch);
- else if (ch == "\"" || ch == "'")
- return readString(ch);
- // with punctuation, the type of the token is the symbol itself
- else if (/[\[\]{}\(\),;\:\.]/.test(ch))
- return {type: ch, style: "java-punctuation"};
- else if (ch == "0" && (source.equals("x") || source.equals("X")))
- return readHexNumber();
- else if (/[0-9]/.test(ch))
- return readNumber();
- else if (ch == "@"){
- return readAnnotation();
- }else if (ch == "/"){
- if (source.equals("*")){
- source.next();
- if(source.equals("*"))
- return readJavaDocComment(ch);
- return readMultilineComment(ch);
- }
- else if (source.equals("/"))
- { nextUntilUnescaped(source, null); return {type: "comment", style: "java-comment"};}
- else if (regexp)
- return readRegexp();
- else
- return readOperator();
- }
- else if (isOperatorChar.test(ch))
- return readOperator();
- else
- return readWord();
- }
- // The external interface to the tokenizer.
- return function(source, startState) {
- return tokenizer(source, startState || javaTokenState(false, true));
- };
- })();
|