// tokenizejava.js
/**
 * Java tokenizer for codemirror
 *
 * @author Patrick Wied
 * @version 2010-10-07
 */
  7. var tokenizeJava = (function() {
  8. // Advance the stream until the given character (not preceded by a
  9. // backslash) is encountered, or the end of the line is reached.
  10. function nextUntilUnescaped(source, end) {
  11. var escaped = false;
  12. var next;
  13. while (!source.endOfLine()) {
  14. var next = source.next();
  15. if (next == end && !escaped)
  16. return false;
  17. escaped = !escaped && next == "\\";
  18. }
  19. return escaped;
  20. }
  21. // A map of Java's keywords. The a/b/c keyword distinction is
  22. // very rough, but it gives the parser enough information to parse
  23. // correct code correctly (we don't care that much how we parse
  24. // incorrect code). The style information included in these objects
  25. // is used by the highlighter to pick the correct CSS style for a
  26. // token.
  27. var keywords = function(){
  28. function result(type, style){
  29. return {type: type, style: "java-" + style};
  30. }
  31. // keywords that take a parenthised expression, and then a
  32. // statement (if)
  33. var keywordA = result("keyword a", "keyword");
  34. // keywords that take just a statement (else)
  35. var keywordB = result("keyword b", "keyword");
  36. // keywords that optionally take an expression, and form a
  37. // statement (return)
  38. var keywordC = result("keyword c", "keyword");
  39. var operator = result("operator", "keyword");
  40. var atom = result("atom", "atom");
  41. return {
  42. "if": keywordA, "while": keywordA, "with": keywordA,
  43. "else": keywordB, "do": keywordB, "try": keywordB, "finally": keywordB,
  44. "return": keywordC, "break": keywordC, "continue": keywordC, "new": keywordC, "throw": keywordC, "throws": keywordB,
  45. "in": operator, "typeof": operator, "instanceof": operator,
  46. "catch": result("catch", "keyword"), "for": result("for", "keyword"), "switch": result("switch", "keyword"),
  47. "case": result("case", "keyword"), "default": result("default", "keyword"),
  48. "true": atom, "false": atom, "null": atom,
  49. "class": result("class", "keyword"), "interface": result("interface", "keyword"), "package": keywordC, "import": keywordC,
  50. "implements": keywordC, "extends": keywordC, "super": keywordC,
  51. "public": keywordC, "private": keywordC, "protected": keywordC, "transient": keywordC, "this": keywordC,
  52. "static": keywordC, "final": keywordC, "const": keywordC, "abstract": keywordC, "static": keywordC,
  53. "int": keywordC, "double": keywordC, "long": keywordC, "boolean": keywordC, "char": keywordC,
  54. "void": keywordC, "byte": keywordC, "float": keywordC, "short": keywordC
  55. };
  56. }();
  57. // Some helper regexps
  58. var isOperatorChar = /[+\-*&%=<>!?|]/;
  59. var isHexDigit = /[0-9A-Fa-f]/;
  60. var isWordChar = /[\w\$_]/;
  61. // Wrapper around javaToken that helps maintain parser state (whether
  62. // we are inside of a multi-line comment and whether the next token
  63. // could be a regular expression).
  64. function javaTokenState(inside, regexp) {
  65. return function(source, setState) {
  66. var newInside = inside;
  67. var type = javaToken(inside, regexp, source, function(c) {newInside = c;});
  68. var newRegexp = type.type == "operator" || type.type == "keyword c" || type.type.match(/^[\[{}\(,;:]$/);
  69. if (newRegexp != regexp || newInside != inside)
  70. setState(javaTokenState(newInside, newRegexp));
  71. return type;
  72. };
  73. }
  74. // The token reader, inteded to be used by the tokenizer from
  75. // tokenize.js (through jsTokenState). Advances the source stream
  76. // over a token, and returns an object containing the type and style
  77. // of that token.
  78. function javaToken(inside, regexp, source, setInside) {
  79. function readHexNumber(){
  80. source.next(); // skip the 'x'
  81. source.nextWhileMatches(isHexDigit);
  82. return {type: "number", style: "java-atom"};
  83. }
  84. function readNumber() {
  85. source.nextWhileMatches(/[0-9]/);
  86. if (source.equals(".")){
  87. source.next();
  88. source.nextWhileMatches(/[0-9]/);
  89. }
  90. if (source.equals("e") || source.equals("E")){
  91. source.next();
  92. if (source.equals("-"))
  93. source.next();
  94. source.nextWhileMatches(/[0-9]/);
  95. }
  96. return {type: "number", style: "java-atom"};
  97. }
  98. // Read a word, look it up in keywords. If not found, it is a
  99. // variable, otherwise it is a keyword of the type found.
  100. function readWord() {
  101. source.nextWhileMatches(isWordChar);
  102. var word = source.get();
  103. var known = keywords.hasOwnProperty(word) && keywords.propertyIsEnumerable(word) && keywords[word];
  104. return known ? {type: known.type, style: known.style, content: word} :
  105. {type: "variable", style: "java-variable", content: word};
  106. }
  107. function readRegexp() {
  108. nextUntilUnescaped(source, "/");
  109. source.nextWhileMatches(/[gi]/);
  110. return {type: "regexp", style: "java-string"};
  111. }
  112. // Mutli-line comments are tricky. We want to return the newlines
  113. // embedded in them as regular newline tokens, and then continue
  114. // returning a comment token for every line of the comment. So
  115. // some state has to be saved (inside) to indicate whether we are
  116. // inside a /* */ sequence.
  117. function readMultilineComment(start){
  118. var newInside = "/*";
  119. var maybeEnd = (start == "*");
  120. while (true) {
  121. if (source.endOfLine())
  122. break;
  123. var next = source.next();
  124. if (next == "/" && maybeEnd){
  125. newInside = null;
  126. break;
  127. }
  128. maybeEnd = (next == "*");
  129. }
  130. setInside(newInside);
  131. return {type: "comment", style: "java-comment"};
  132. }
  133. // for reading javadoc
  134. function readJavaDocComment(start){
  135. var newInside = "/**";
  136. var maybeEnd = (start == "*");
  137. while (true) {
  138. if (source.endOfLine())
  139. break;
  140. var next = source.next();
  141. if (next == "/" && maybeEnd){
  142. newInside = null;
  143. break;
  144. }
  145. maybeEnd = (next == "*");
  146. }
  147. setInside(newInside);
  148. return {type: "javadoc", style: "javadoc-comment"};
  149. }
  150. // for reading annotations (word based)
  151. function readAnnotation(){
  152. source.nextWhileMatches(isWordChar);
  153. var word = source.get();
  154. return {type: "annotation", style: "java-annotation", content:word};
  155. }
  156. function readOperator() {
  157. source.nextWhileMatches(isOperatorChar);
  158. return {type: "operator", style: "java-operator"};
  159. }
  160. function readString(quote) {
  161. var endBackSlash = nextUntilUnescaped(source, quote);
  162. setInside(endBackSlash ? quote : null);
  163. return {type: "string", style: "java-string"};
  164. }
  165. // Fetch the next token. Dispatches on first character in the
  166. // stream, or first two characters when the first is a slash.
  167. if (inside == "\"" || inside == "'")
  168. return readString(inside);
  169. var ch = source.next();
  170. if (inside == "/*")
  171. return readMultilineComment(ch);
  172. else if(inside == "/**")
  173. return readJavaDocComment(ch);
  174. else if (ch == "\"" || ch == "'")
  175. return readString(ch);
  176. // with punctuation, the type of the token is the symbol itself
  177. else if (/[\[\]{}\(\),;\:\.]/.test(ch))
  178. return {type: ch, style: "java-punctuation"};
  179. else if (ch == "0" && (source.equals("x") || source.equals("X")))
  180. return readHexNumber();
  181. else if (/[0-9]/.test(ch))
  182. return readNumber();
  183. else if (ch == "@"){
  184. return readAnnotation();
  185. }else if (ch == "/"){
  186. if (source.equals("*")){
  187. source.next();
  188. if(source.equals("*"))
  189. return readJavaDocComment(ch);
  190. return readMultilineComment(ch);
  191. }
  192. else if (source.equals("/"))
  193. { nextUntilUnescaped(source, null); return {type: "comment", style: "java-comment"};}
  194. else if (regexp)
  195. return readRegexp();
  196. else
  197. return readOperator();
  198. }
  199. else if (isOperatorChar.test(ch))
  200. return readOperator();
  201. else
  202. return readWord();
  203. }
  204. // The external interface to the tokenizer.
  205. return function(source, startState) {
  206. return tokenizer(source, startState || javaTokenState(false, true));
  207. };
  208. })();