123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
// A framework for simple tokenizers. Takes care of newlines and
// white-space, and of getting the text from the source stream into
// the token object.
//
// Parameters:
//   source - a string stream. This code calls its more(), next(),
//            get(), equals(ch), applies(test) and nextWhile(test)
//            methods; get() is presumably "the text consumed since
//            the previous get()" -- confirm against the stream
//            implementation.
//   state  - the initial state function.
//
// A state is a function of two arguments -- the string stream and a
// setState function. The second can be used to change the
// tokenizer's state, and can be ignored for stateless tokenizers. The
// state function should advance the stream over a token and return a
// string or object containing information about that token, or a
// falsy value (null) to pass and have the (new) state be called to
// finish the token. When a string is given, it is wrapped in a
// {style, type} object.
//
// In the resulting token object, the characters consumed are stored
// under the content property (appended to any content the state
// function already put there). Any whitespace following them, up to
// but not including a newline, is also automatically consumed, and
// added to the value property. (Thus, content is the actual
// meaningful part of the token, while value contains all the text it
// spans.)
//
// Returns an object with:
//   state - the current state function,
//   take  - turns a token type into a finished token object,
//   next  - produces the next token, or throws the global
//           StopIteration (defined outside this function) when the
//           source is exhausted.
function tokenizer(source, state) {
  // Newlines are always a separate token, so they never count as
  // ordinary whitespace here.
  function isWhiteSpace(ch) {
    // The messy regexp is because IE's regexp matcher is of the
    // opinion that non-breaking spaces are no whitespace.
    return ch !== "\n" && /^[\s\u00a0]*$/.test(ch);
  }

  var tokenizer = {
    state: state,

    // Finish a token: collect what the stream has consumed so far
    // into content, swallow trailing whitespace, and store the whole
    // span in value. Object arguments are modified in place and
    // returned.
    take: function(type) {
      if (typeof type === "string")
        type = {style: type, type: type};

      type.content = (type.content || "") + source.get();
      // A token that ends in a newline must not swallow the
      // whitespace at the start of the next line.
      if (!/\n$/.test(type.content))
        source.nextWhile(isWhiteSpace);
      type.value = type.content + source.get();
      return type;
    },

    // Read the next token from the source.
    next: function() {
      if (!source.more()) throw StopIteration;

      var type;
      if (source.equals("\n")) {
        source.next();
        return this.take("whitespace");
      }

      if (source.applies(isWhiteSpace)) {
        // Nothing is consumed here; take() skips over the whitespace
        // run itself.
        type = "whitespace";
      }
      else {
        // A state may return a falsy value after switching states;
        // keep asking the (possibly new) state until one of them
        // yields a token.
        while (!type)
          type = this.state(source, function(s) {tokenizer.state = s;});
      }
      return this.take(type);
    }
  };
  return tokenizer;
}
|