View Javadoc
1   /*
2    * Licensed under the GPL License. You may not use this file except in compliance with the License.
3    * You may obtain a copy of the License at
4    *
5    *   https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
6    *
7    * THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
8    * WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
9    * PURPOSE.
10   */
11  package psiprobe.tokenizer;
12  
13  import java.io.IOException;
14  import java.io.Reader;
15  import java.util.Collections;
16  import java.util.List;
17  
18  import org.slf4j.Logger;
19  import org.slf4j.LoggerFactory;
20  
21  /**
22   * The Class Tokenizer.
23   */
24  public class Tokenizer {
25  
26    /** The Constant logger. */
27    private static final Logger logger = LoggerFactory.getLogger(Tokenizer.class);
28  
29    /** The Constant TT_TOKEN. */
30    public static final int TT_TOKEN = 0;
31  
32    /** The Constant TT_SYMBOL. */
33    public static final int TT_SYMBOL = 1;
34  
35    /** The Constant TT_BLOCK. */
36    public static final int TT_BLOCK = 2;
37  
38    /** The Constant TT_ERROR. */
39    public static final int TT_ERROR = 3;
40  
41    /** The reader. */
42    private Reader reader;
43  
44    /** The symbols. */
45    private final List<TokenizerSymbol> symbols;
46  
47    /** The push count. */
48    private int pushCount;
49  
50    /** The token. */
51    private final TokenizerToken token;
52  
53    /** The upcoming token. */
54    private final TokenizerToken upcomingToken;
55  
56    /** The cache position. */
57    private int cachePosition;
58  
59    /** The cache size. */
60    private int cacheSize;
61  
62    /** The cache buffer. */
63    private final char[] cacheBuffer;
64  
65    /** The cache pin position. */
66    private int cachePinPosition;
67  
68    /**
69     * Instantiates a new tokenizer.
70     */
71    public Tokenizer() {
72      this(null, 4096);
73    }
74  
75    /**
76     * Instantiates a new tokenizer.
77     *
78     * @param reader the reader
79     */
80    public Tokenizer(Reader reader) {
81      this(reader, 4096);
82    }
83  
84    /**
85     * Instantiates a new tokenizer.
86     *
87     * @param reader the reader
88     * @param cacheBufferSize the cache buffer size
89     */
90    public Tokenizer(Reader reader, int cacheBufferSize) {
91      symbols = new UniqueList<>();
92      token = new TokenizerToken();
93      upcomingToken = new TokenizerToken();
94      cacheBuffer = new char[cacheBufferSize];
95      setReader(reader);
96    }
97  
98    /**
99     * Load cache.
100    *
101    * @param count the count
102    *
103    * @throws IOException Signals that an I/O exception has occurred.
104    */
105   private void loadCache(int count) throws IOException {
106     int charToRead = count == 0 ? 0 : count - 1;
107     if (cachePosition + charToRead >= cacheSize) {
108       if (cacheSize == 0) {
109         cacheSize = reader.read(cacheBuffer, 0, cacheBuffer.length);
110         cachePosition = 0;
111       } else if (cacheSize == cacheBuffer.length) {
112         // make sure we do not read beyond the stream
113         int halfCacheSize = cacheSize / 2;
114         // copy the lower half into the upper half
115         System.arraycopy(cacheBuffer, halfCacheSize, cacheBuffer, 0, halfCacheSize);
116         cachePosition -= halfCacheSize;
117         if (cachePinPosition != -1) {
118           cachePinPosition -= halfCacheSize;
119         }
120 
121         int charsRead = reader.read(cacheBuffer, halfCacheSize, cacheSize - halfCacheSize);
122         if (charsRead == -1) {
123           cacheSize = halfCacheSize;
124         } else {
125           cacheSize = charsRead + halfCacheSize;
126         }
127       }
128     }
129   }
130 
131   /**
132    * Gets the token.
133    *
134    * @return the token
135    *
136    * @throws IOException Signals that an I/O exception has occurred.
137    */
138   public Token getToken() throws IOException {
139     if (token.type == Tokenizer.TT_ERROR) {
140       return nextToken();
141     }
142     return token;
143   }
144 
145   /**
146    * Next token.
147    *
148    * @return the token
149    *
150    * @throws IOException Signals that an I/O exception has occurred.
151    */
152   public Token nextToken() throws IOException {
153     if (pushCount > 0) {
154       pushCount--;
155       return token;
156     }
157     if (upcomingToken.type != Tokenizer.TT_ERROR) {
158       token.assign(upcomingToken);
159       upcomingToken.type = Tokenizer.TT_ERROR;
160       return token;
161     }
162 
163     token.init();
164     char[] chr = new char[1];
165     while (hasMore()) {
166       read(chr, 1);
167 
168       int symbolIndex = lookupSymbol(chr[0]);
169 
170       if (symbolIndex != -1) {
171         // we have found a symbol
172         TokenizerToken workToken =
173             token.type == Tokenizer.TT_TOKEN && token.text.length() > 0 ? upcomingToken : token;
174         TokenizerSymbol symbol = symbols.get(symbolIndex);
175         boolean hideSymbol = symbol.hidden;
176 
177         if (!hideSymbol) {
178           workToken.init();
179           workToken.text.append(symbol.startText);
180           workToken.type = Tokenizer.TT_SYMBOL;
181           workToken.name = symbol.name;
182         }
183 
184         if (symbol.tailText != null) {
185           // the symbol is a block, look for the tailText
186           while (hasMore() && !compare(symbol.tailText.toCharArray(), 0)) {
187             read(chr, 1);
188             if (!hideSymbol) {
189               workToken.text.append(chr);
190               workToken.innerText.append(chr);
191             }
192           }
193 
194           if (!hideSymbol) {
195             workToken.text.append(symbol.tailText);
196           }
197           workToken.type = Tokenizer.TT_BLOCK;
198         }
199 
200         if (token.text.length() > 0) {
201           break;
202         }
203       } else {
204         token.text.append(chr);
205         token.type = Tokenizer.TT_TOKEN;
206       }
207     }
208     return token;
209   }
210 
211   /**
212    * Push back.
213    */
214   public void pushBack() {
215     pushCount++;
216   }
217 
218   /**
219    * Sets the reader.
220    *
221    * @param reader the new reader
222    */
223   public void setReader(Reader reader) {
224     this.reader = reader;
225     cachePosition = 0;
226     cachePinPosition = -1;
227     cacheSize = 0;
228     token.type = TT_ERROR;
229     upcomingToken.type = TT_ERROR;
230   }
231 
232   /**
233    * Compare.
234    *
235    * @param chars the chars
236    * @param offs the offs
237    *
238    * @return true, if successful
239    *
240    * @throws IOException Signals that an I/O exception has occurred.
241    */
242   private boolean compare(char[] chars, int offs) throws IOException {
243     char[] subStr = new char[chars.length - offs];
244     cachePinPosition = cachePosition;
245     read(subStr, subStr.length);
246     for (int i = 0; i < subStr.length; i++) {
247       if (subStr[i] != chars[i + offs]) {
248         cachePosition = cachePinPosition;
249         cachePinPosition = -1;
250         return false;
251       }
252     }
253     return true;
254   }
255 
256   /**
257    * Lookup symbol.
258    *
259    * @param chr the chr
260    *
261    * @return the int
262    *
263    * @throws IOException Signals that an I/O exception has occurred.
264    */
265   private int lookupSymbol(char chr) throws IOException {
266     int result = -1;
267 
268     Character chrObj = chr;
269     int index = Collections.binarySearch(symbols, chrObj);
270 
271     if (index >= 0) {
272       // the index could be anywhere within a group of symbols with the same first letter
273       // so we need to scroll up the group to make sure we start test from the beginning
274       while (index > 0 && symbols.get(index - 1).compareTo(chrObj) == 0) {
275         index--;
276       }
277       while (index < symbols.size()) {
278         TokenizerSymbol symbol = symbols.get(index);
279         if (symbol.compareTo(chrObj) != 0) {
280           break;
281         }
282         if (compare(symbol.startText.toCharArray(), 1)) {
283           result = index;
284           break;
285         }
286         index++;
287       }
288     }
289     return result;
290   }
291 
292   /**
293    * Read.
294    *
295    * @param chrs the chrs
296    * @param count the count
297    *
298    * @throws IOException Signals that an I/O exception has occurred.
299    */
300   private void read(char[] chrs, int count) throws IOException {
301     loadCache(count);
302     int endPoint = cachePosition + count - 1 >= cacheSize ? cacheSize : cachePosition + count - 1;
303     if (cachePosition <= endPoint) {
304       System.arraycopy(cacheBuffer, cachePosition, chrs, 0, endPoint - cachePosition + 1);
305     }
306     cachePosition = endPoint + 1;
307   }
308 
309   /**
310    * Checks for more.
311    *
312    * @return true, if successful
313    *
314    * @throws IOException Signals that an I/O exception has occurred.
315    */
316   public boolean hasMore() throws IOException {
317     loadCache(0);
318     return cachePosition < cacheSize || upcomingToken.type != Tokenizer.TT_ERROR || pushCount > 0;
319   }
320 
321   /**
322    * Adds the symbol.
323    *
324    * @param text the text
325    */
326   public void addSymbol(String text) {
327     symbols.add(new TokenizerSymbol(null, text, null, false, false, true, false));
328   }
329 
330   /**
331    * Adds the symbol.
332    *
333    * @param text the text
334    * @param hidden the hidden
335    */
336   public void addSymbol(String text, boolean hidden) {
337     symbols.add(new TokenizerSymbol(null, text, null, hidden, false, true, false));
338   }
339 
340   /**
341    * Adds the symbol.
342    *
343    * @param startText the start text
344    * @param endText the end text
345    * @param hidden the hidden
346    */
347   public void addSymbol(String startText, String endText, boolean hidden) {
348     symbols.add(new TokenizerSymbol(null, startText, endText, hidden, false, true, false));
349   }
350 
351   /**
352    * Adds the symbol.
353    *
354    * @param symbol the symbol
355    */
356   public void addSymbol(TokenizerSymbol symbol) {
357     symbols.add(symbol);
358   }
359 
360   /**
361    * Gets the next string.
362    *
363    * @param defaultValue the default value
364    *
365    * @return the next string
366    *
367    * @throws IOException Signals that an I/O exception has occurred.
368    */
369   public String getNextString(String defaultValue) throws IOException {
370     return hasMore() ? nextToken().getInnerText() : defaultValue;
371   }
372 
373   /**
374    * Gets the next boolean.
375    *
376    * @param trueValue the true value
377    * @param defaultValue the default value
378    *
379    * @return the next boolean
380    *
381    * @throws IOException Signals that an I/O exception has occurred.
382    */
383   public boolean getNextBoolean(String trueValue, boolean defaultValue) throws IOException {
384     return hasMore() ? trueValue.equalsIgnoreCase(nextToken().getInnerText()) : defaultValue;
385   }
386 
387   /**
388    * Gets the next long.
389    *
390    * @param defaultValue the default value
391    *
392    * @return the next long
393    *
394    * @throws IOException Signals that an I/O exception has occurred.
395    */
396   public long getNextLong(long defaultValue) throws IOException {
397     String stval = getNextString(null);
398 
399     if (stval == null) {
400       return defaultValue;
401     }
402 
403     try {
404       return Long.parseLong(stval);
405     } catch (NumberFormatException e) {
406       logger.trace("", e);
407       return defaultValue;
408     }
409   }
410 
411   /**
412    * The Class TokenizerToken.
413    */
414   private static class TokenizerToken implements Token {
415 
416     /** The text. */
417     final StringBuilder text = new StringBuilder();
418 
419     /** The inner text. */
420     final StringBuilder innerText = new StringBuilder();
421 
422     /** The name. */
423     String name = "";
424 
425     /** The type. */
426     int type = Tokenizer.TT_ERROR;
427 
428     /** The line. */
429     int line;
430 
431     /** The col. */
432     int col;
433 
434     /**
435      * Instantiates a new tokenizer token.
436      */
437     public TokenizerToken() {
438       type = Tokenizer.TT_ERROR;
439     }
440 
441     @Override
442     public String getText() {
443       return text.toString();
444     }
445 
446     @Override
447     public String getInnerText() {
448       return type == Tokenizer.TT_BLOCK ? innerText.toString() : getText();
449     }
450 
451     @Override
452     public String getName() {
453       return name;
454     }
455 
456     @Override
457     public int getType() {
458       return type;
459     }
460 
461     @Override
462     public int getLine() {
463       return line;
464     }
465 
466     @Override
467     public int getCol() {
468       return col;
469     }
470 
471     @Override
472     public String toString() {
473       return getText();
474     }
475 
476     /**
477      * Assign.
478      *
479      * @param token the token
480      */
481     public void assign(TokenizerToken token) {
482       this.text.setLength(0);
483       this.text.append(token.text);
484       this.innerText.setLength(0);
485       this.innerText.append(token.innerText);
486       this.name = token.name;
487       this.type = token.type;
488       this.col = token.col;
489       this.line = token.line;
490     }
491 
492     /**
493      * Inits the.
494      */
495     public void init() {
496       text.setLength(0);
497       innerText.setLength(0);
498       name = "";
499     }
500 
501   }
502 
503 }