// tokenize.js

// A framework for simple tokenizers. Takes care of newlines and
// whitespace, and of getting the text from the source stream into
// the token object. A state is a function of two arguments -- a
// string stream and a setState function. The second can be used to
// change the tokenizer's state, and can be ignored for stateless
// tokenizers. This function should advance the stream over a token
// and return a string or object containing information about the next
// token, or null to pass and have the (new) state be called to finish
// the token. When a string is given, it is wrapped in a {style, type}
// object. In the resulting object, the characters consumed are stored
// under the content property. Any whitespace following them is also
// automatically consumed, and added to the value property. (Thus,
// content is the actual meaningful part of the token, while value
// contains all the text it spans.)
  15. function tokenizer(source, state) {
  16. // Newlines are always a separate token.
  17. function isWhiteSpace(ch) {
  18. // The messy regexp is because IE's regexp matcher is of the
  19. // opinion that non-breaking spaces are no whitespace.
  20. return ch != "\n" && /^[\s\u00a0]*$/.test(ch);
  21. }
  22. var tokenizer = {
  23. state: state,
  24. take: function(type) {
  25. if (typeof(type) == "string")
  26. type = {style: type, type: type};
  27. type.content = (type.content || "") + source.get();
  28. if (!/\n$/.test(type.content))
  29. source.nextWhile(isWhiteSpace);
  30. type.value = type.content + source.get();
  31. return type;
  32. },
  33. next: function () {
  34. if (!source.more()) throw StopIteration;
  35. var type;
  36. if (source.equals("\n")) {
  37. source.next();
  38. return this.take("whitespace");
  39. }
  40. if (source.applies(isWhiteSpace))
  41. type = "whitespace";
  42. else
  43. while (!type)
  44. type = this.state(source, function(s) {tokenizer.state = s;});
  45. return this.take(type);
  46. }
  47. };
  48. return tokenizer;
  49. }