/** * Java tokenizer for codemirror * * @author Patrick Wied * @version 2010-10-07 */ var tokenizeJava = (function() { // Advance the stream until the given character (not preceded by a // backslash) is encountered, or the end of the line is reached. function nextUntilUnescaped(source, end) { var escaped = false; var next; while (!source.endOfLine()) { var next = source.next(); if (next == end && !escaped) return false; escaped = !escaped && next == "\\"; } return escaped; } // A map of Java's keywords. The a/b/c keyword distinction is // very rough, but it gives the parser enough information to parse // correct code correctly (we don't care that much how we parse // incorrect code). The style information included in these objects // is used by the highlighter to pick the correct CSS style for a // token. var keywords = function(){ function result(type, style){ return {type: type, style: "java-" + style}; } // keywords that take a parenthised expression, and then a // statement (if) var keywordA = result("keyword a", "keyword"); // keywords that take just a statement (else) var keywordB = result("keyword b", "keyword"); // keywords that optionally take an expression, and form a // statement (return) var keywordC = result("keyword c", "keyword"); var operator = result("operator", "keyword"); var atom = result("atom", "atom"); return { "if": keywordA, "while": keywordA, "with": keywordA, "else": keywordB, "do": keywordB, "try": keywordB, "finally": keywordB, "return": keywordC, "break": keywordC, "continue": keywordC, "new": keywordC, "throw": keywordC, "throws": keywordB, "in": operator, "typeof": operator, "instanceof": operator, "catch": result("catch", "keyword"), "for": result("for", "keyword"), "switch": result("switch", "keyword"), "case": result("case", "keyword"), "default": result("default", "keyword"), "true": atom, "false": atom, "null": atom, "class": result("class", "keyword"), "interface": result("interface", "keyword"), "package": keywordC, "import": keywordC, "implements": keywordC, "extends": keywordC, "super": keywordC, "public": keywordC, "private": keywordC, "protected": keywordC, "transient": keywordC, "this": keywordC, "static": keywordC, "final": keywordC, "const": keywordC, "abstract": keywordC, "static": keywordC, "int": keywordC, "double": keywordC, "long": keywordC, "boolean": keywordC, "char": keywordC, "void": keywordC, "byte": keywordC, "float": keywordC, "short": keywordC }; }(); // Some helper regexps var isOperatorChar = /[+\-*&%=<>!?|]/; var isHexDigit = /[0-9A-Fa-f]/; var isWordChar = /[\w\$_]/; // Wrapper around javaToken that helps maintain parser state (whether // we are inside of a multi-line comment and whether the next token // could be a regular expression). function javaTokenState(inside, regexp) { return function(source, setState) { var newInside = inside; var type = javaToken(inside, regexp, source, function(c) {newInside = c;}); var newRegexp = type.type == "operator" || type.type == "keyword c" || type.type.match(/^[\[{}\(,;:]$/); if (newRegexp != regexp || newInside != inside) setState(javaTokenState(newInside, newRegexp)); return type; }; } // The token reader, inteded to be used by the tokenizer from // tokenize.js (through jsTokenState). Advances the source stream // over a token, and returns an object containing the type and style // of that token. function javaToken(inside, regexp, source, setInside) { function readHexNumber(){ source.next(); // skip the 'x' source.nextWhileMatches(isHexDigit); return {type: "number", style: "java-atom"}; } function readNumber() { source.nextWhileMatches(/[0-9]/); if (source.equals(".")){ source.next(); source.nextWhileMatches(/[0-9]/); } if (source.equals("e") || source.equals("E")){ source.next(); if (source.equals("-")) source.next(); source.nextWhileMatches(/[0-9]/); } return {type: "number", style: "java-atom"}; } // Read a word, look it up in keywords. If not found, it is a // variable, otherwise it is a keyword of the type found. function readWord() { source.nextWhileMatches(isWordChar); var word = source.get(); var known = keywords.hasOwnProperty(word) && keywords.propertyIsEnumerable(word) && keywords[word]; return known ? {type: known.type, style: known.style, content: word} : {type: "variable", style: "java-variable", content: word}; } function readRegexp() { nextUntilUnescaped(source, "/"); source.nextWhileMatches(/[gi]/); return {type: "regexp", style: "java-string"}; } // Mutli-line comments are tricky. We want to return the newlines // embedded in them as regular newline tokens, and then continue // returning a comment token for every line of the comment. So // some state has to be saved (inside) to indicate whether we are // inside a /* */ sequence. function readMultilineComment(start){ var newInside = "/*"; var maybeEnd = (start == "*"); while (true) { if (source.endOfLine()) break; var next = source.next(); if (next == "/" && maybeEnd){ newInside = null; break; } maybeEnd = (next == "*"); } setInside(newInside); return {type: "comment", style: "java-comment"}; } // for reading javadoc function readJavaDocComment(start){ var newInside = "/**"; var maybeEnd = (start == "*"); while (true) { if (source.endOfLine()) break; var next = source.next(); if (next == "/" && maybeEnd){ newInside = null; break; } maybeEnd = (next == "*"); } setInside(newInside); return {type: "javadoc", style: "javadoc-comment"}; } // for reading annotations (word based) function readAnnotation(){ source.nextWhileMatches(isWordChar); var word = source.get(); return {type: "annotation", style: "java-annotation", content:word}; } function readOperator() { source.nextWhileMatches(isOperatorChar); return {type: "operator", style: "java-operator"}; } function readString(quote) { var endBackSlash = nextUntilUnescaped(source, quote); setInside(endBackSlash ? quote : null); return {type: "string", style: "java-string"}; } // Fetch the next token. Dispatches on first character in the // stream, or first two characters when the first is a slash. if (inside == "\"" || inside == "'") return readString(inside); var ch = source.next(); if (inside == "/*") return readMultilineComment(ch); else if(inside == "/**") return readJavaDocComment(ch); else if (ch == "\"" || ch == "'") return readString(ch); // with punctuation, the type of the token is the symbol itself else if (/[\[\]{}\(\),;\:\.]/.test(ch)) return {type: ch, style: "java-punctuation"}; else if (ch == "0" && (source.equals("x") || source.equals("X"))) return readHexNumber(); else if (/[0-9]/.test(ch)) return readNumber(); else if (ch == "@"){ return readAnnotation(); }else if (ch == "/"){ if (source.equals("*")){ source.next(); if(source.equals("*")) return readJavaDocComment(ch); return readMultilineComment(ch); } else if (source.equals("/")) { nextUntilUnescaped(source, null); return {type: "comment", style: "java-comment"};} else if (regexp) return readRegexp(); else return readOperator(); } else if (isOperatorChar.test(ch)) return readOperator(); else return readWord(); } // The external interface to the tokenizer. return function(source, startState) { return tokenizer(source, startState || javaTokenState(false, true)); }; })();