1
2
3
4
5
6
7
8
9
10
11 package psiprobe.tokenizer;
12
13 import java.io.IOException;
14 import java.io.Reader;
15 import java.util.Collections;
16 import java.util.List;
17
18 import org.slf4j.Logger;
19 import org.slf4j.LoggerFactory;
20
21
22
23
24 public class Tokenizer {
25
26
27 private static final Logger logger = LoggerFactory.getLogger(Tokenizer.class);
28
29
30 public static final int TT_TOKEN = 0;
31
32
33 public static final int TT_SYMBOL = 1;
34
35
36 public static final int TT_BLOCK = 2;
37
38
39 public static final int TT_ERROR = 3;
40
41
42 private Reader reader;
43
44
45 private final List<TokenizerSymbol> symbols;
46
47
48 private int pushCount;
49
50
51 private final TokenizerToken token;
52
53
54 private final TokenizerToken upcomingToken;
55
56
57 private int cachePosition;
58
59
60 private int cacheSize;
61
62
63 private final char[] cacheBuffer;
64
65
66 private int cachePinPosition;
67
68
69
70
/**
 * Creates a tokenizer without a reader and with the default cache size of 4096 characters.
 * A reader has to be supplied through setReader before any input can be read.
 */
public Tokenizer() {
  // The single-argument constructor supplies the same default cache size.
  this(null);
}
74
75
76
77
78
79
/**
 * Creates a tokenizer reading from the given reader with the default cache size.
 *
 * @param reader the character source to tokenize
 */
public Tokenizer(Reader reader) {
  // Default cache capacity: 4096 characters.
  this(reader, 4 * 1024);
}
83
84
85
86
87
88
89
/**
 * Creates a tokenizer reading from the given reader through a look-ahead cache of the given
 * size.
 *
 * @param reader the character source to tokenize; may be replaced later through setReader
 * @param cacheBufferSize the capacity, in characters, of the internal cache
 */
public Tokenizer(Reader reader, int cacheBufferSize) {
  this.cacheBuffer = new char[cacheBufferSize];
  this.token = new TokenizerToken();
  this.upcomingToken = new TokenizerToken();
  this.symbols = new UniqueList<>();
  // Must run last: it resets the state of the token holders created above.
  setReader(reader);
}
97
98
99
100
101
102
103
104
105 private void loadCache(int count) throws IOException {
106 int charToRead = count == 0 ? 0 : count - 1;
107 if (cachePosition + charToRead >= cacheSize) {
108 if (cacheSize == 0) {
109 cacheSize = reader.read(cacheBuffer, 0, cacheBuffer.length);
110 cachePosition = 0;
111 } else if (cacheSize == cacheBuffer.length) {
112
113 int halfCacheSize = cacheSize / 2;
114
115 System.arraycopy(cacheBuffer, halfCacheSize, cacheBuffer, 0, halfCacheSize);
116 cachePosition -= halfCacheSize;
117 if (cachePinPosition != -1) {
118 cachePinPosition -= halfCacheSize;
119 }
120
121 int charsRead = reader.read(cacheBuffer, halfCacheSize, cacheSize - halfCacheSize);
122 if (charsRead == -1) {
123 cacheSize = halfCacheSize;
124 } else {
125 cacheSize = charsRead + halfCacheSize;
126 }
127 }
128 }
129 }
130
131
132
133
134
135
136
137
138 public Token getToken() throws IOException {
139 if (token.type == Tokenizer.TT_ERROR) {
140 return nextToken();
141 }
142 return token;
143 }
144
145
146
147
148
149
150
151
152 public Token nextToken() throws IOException {
153 if (pushCount > 0) {
154 pushCount--;
155 return token;
156 }
157 if (upcomingToken.type != Tokenizer.TT_ERROR) {
158 token.assign(upcomingToken);
159 upcomingToken.type = Tokenizer.TT_ERROR;
160 return token;
161 }
162
163 token.init();
164 char[] chr = new char[1];
165 while (hasMore()) {
166 read(chr, 1);
167
168 int symbolIndex = lookupSymbol(chr[0]);
169
170 if (symbolIndex != -1) {
171
172 TokenizerToken workToken =
173 token.type == Tokenizer.TT_TOKEN && token.text.length() > 0 ? upcomingToken : token;
174 TokenizerSymbol symbol = symbols.get(symbolIndex);
175 boolean hideSymbol = symbol.hidden;
176
177 if (!hideSymbol) {
178 workToken.init();
179 workToken.text.append(symbol.startText);
180 workToken.type = Tokenizer.TT_SYMBOL;
181 workToken.name = symbol.name;
182 }
183
184 if (symbol.tailText != null) {
185
186 while (hasMore() && !compare(symbol.tailText.toCharArray(), 0)) {
187 read(chr, 1);
188 if (!hideSymbol) {
189 workToken.text.append(chr);
190 workToken.innerText.append(chr);
191 }
192 }
193
194 if (!hideSymbol) {
195 workToken.text.append(symbol.tailText);
196 }
197 workToken.type = Tokenizer.TT_BLOCK;
198 }
199
200 if (token.text.length() > 0) {
201 break;
202 }
203 } else {
204 token.text.append(chr);
205 token.type = Tokenizer.TT_TOKEN;
206 }
207 }
208 return token;
209 }
210
211
212
213
214 public void pushBack() {
215 pushCount++;
216 }
217
218
219
220
221
222
223 public void setReader(Reader reader) {
224 this.reader = reader;
225 cachePosition = 0;
226 cachePinPosition = -1;
227 cacheSize = 0;
228 token.type = TT_ERROR;
229 upcomingToken.type = TT_ERROR;
230 }
231
232
233
234
235
236
237
238
239
240
241
242 private boolean compare(char[] chars, int offs) throws IOException {
243 char[] subStr = new char[chars.length - offs];
244 cachePinPosition = cachePosition;
245 read(subStr, subStr.length);
246 for (int i = 0; i < subStr.length; i++) {
247 if (subStr[i] != chars[i + offs]) {
248 cachePosition = cachePinPosition;
249 cachePinPosition = -1;
250 return false;
251 }
252 }
253 return true;
254 }
255
256
257
258
259
260
261
262
263
264
265 private int lookupSymbol(char chr) throws IOException {
266 int result = -1;
267
268 Character chrObj = chr;
269 int index = Collections.binarySearch(symbols, chrObj);
270
271 if (index >= 0) {
272
273
274 while (index > 0 && symbols.get(index - 1).compareTo(chrObj) == 0) {
275 index--;
276 }
277 while (index < symbols.size()) {
278 TokenizerSymbol symbol = symbols.get(index);
279 if (symbol.compareTo(chrObj) != 0) {
280 break;
281 }
282 if (compare(symbol.startText.toCharArray(), 1)) {
283 result = index;
284 break;
285 }
286 index++;
287 }
288 }
289 return result;
290 }
291
292
293
294
295
296
297
298
299
300 private void read(char[] chrs, int count) throws IOException {
301 loadCache(count);
302 int endPoint = cachePosition + count - 1 >= cacheSize ? cacheSize : cachePosition + count - 1;
303 if (cachePosition <= endPoint) {
304 System.arraycopy(cacheBuffer, cachePosition, chrs, 0, endPoint - cachePosition + 1);
305 }
306 cachePosition = endPoint + 1;
307 }
308
309
310
311
312
313
314
315
316 public boolean hasMore() throws IOException {
317 loadCache(0);
318 return cachePosition < cacheSize || upcomingToken.type != Tokenizer.TT_ERROR || pushCount > 0;
319 }
320
321
322
323
324
325
326 public void addSymbol(String text) {
327 symbols.add(new TokenizerSymbol(null, text, null, false, false, true, false));
328 }
329
330
331
332
333
334
335
336 public void addSymbol(String text, boolean hidden) {
337 symbols.add(new TokenizerSymbol(null, text, null, hidden, false, true, false));
338 }
339
340
341
342
343
344
345
346
347 public void addSymbol(String startText, String endText, boolean hidden) {
348 symbols.add(new TokenizerSymbol(null, startText, endText, hidden, false, true, false));
349 }
350
351
352
353
354
355
356 public void addSymbol(TokenizerSymbol symbol) {
357 symbols.add(symbol);
358 }
359
360
361
362
363
364
365
366
367
368
369 public String getNextString(String defaultValue) throws IOException {
370 return hasMore() ? nextToken().getInnerText() : defaultValue;
371 }
372
373
374
375
376
377
378
379
380
381
382
383 public boolean getNextBoolean(String trueValue, boolean defaultValue) throws IOException {
384 return hasMore() ? trueValue.equalsIgnoreCase(nextToken().getInnerText()) : defaultValue;
385 }
386
387
388
389
390
391
392
393
394
395
396 public long getNextLong(long defaultValue) throws IOException {
397 String stval = getNextString(null);
398
399 if (stval == null) {
400 return defaultValue;
401 }
402
403 try {
404 return Long.parseLong(stval);
405 } catch (NumberFormatException e) {
406 logger.trace("", e);
407 return defaultValue;
408 }
409 }
410
411
412
413
414 private static class TokenizerToken implements Token {
415
416
417 final StringBuilder text = new StringBuilder();
418
419
420 final StringBuilder innerText = new StringBuilder();
421
422
423 String name = "";
424
425
426 int type = Tokenizer.TT_ERROR;
427
428
429 int line;
430
431
432 int col;
433
434
435
436
437 public TokenizerToken() {
438 type = Tokenizer.TT_ERROR;
439 }
440
441 @Override
442 public String getText() {
443 return text.toString();
444 }
445
446 @Override
447 public String getInnerText() {
448 return type == Tokenizer.TT_BLOCK ? innerText.toString() : getText();
449 }
450
451 @Override
452 public String getName() {
453 return name;
454 }
455
456 @Override
457 public int getType() {
458 return type;
459 }
460
461 @Override
462 public int getLine() {
463 return line;
464 }
465
466 @Override
467 public int getCol() {
468 return col;
469 }
470
471 @Override
472 public String toString() {
473 return getText();
474 }
475
476
477
478
479
480
481 public void assign(TokenizerToken token) {
482 this.text.setLength(0);
483 this.text.append(token.text);
484 this.innerText.setLength(0);
485 this.innerText.append(token.innerText);
486 this.name = token.name;
487 this.type = token.type;
488 this.col = token.col;
489 this.line = token.line;
490 }
491
492
493
494
495 public void init() {
496 text.setLength(0);
497 innerText.setLength(0);
498 name = "";
499 }
500
501 }
502
503 }