GNU Classpath (0.95) | |
Frames | No Frames |
1: /* StreamTokenizer.java -- parses streams of characters into tokens 2: Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Free Software Foundation 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: package java.io; 39: 40: /** 41: * This class parses streams of characters into tokens. There are a 42: * million-zillion flags that can be set to control the parsing, as 43: * described under the various method headings. 44: * 45: * @author Warren Levy (warrenl@cygnus.com) 46: * @date October 25, 1998. 47: */ 48: /* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3 49: * "The Java Language Specification", ISBN 0-201-63451-1 50: * plus online API docs for JDK 1.2 beta from http://www.javasoft.com. 51: * Status: Believed complete and correct. 52: */ 53: 54: public class StreamTokenizer 55: { 56: /** A constant indicating that the end of the stream has been read. */ 57: public static final int TT_EOF = -1; 58: 59: /** A constant indicating that the end of the line has been read. */ 60: public static final int TT_EOL = '\n'; 61: 62: /** A constant indicating that a number token has been read. */ 63: public static final int TT_NUMBER = -2; 64: 65: /** A constant indicating that a word token has been read. */ 66: public static final int TT_WORD = -3; 67: 68: /** A constant indicating that no tokens have been read yet. */ 69: private static final int TT_NONE = -4; 70: 71: /** 72: * Contains the type of the token read resulting from a call to nextToken 73: * The rules are as follows: 74: * <ul> 75: * <li>For a token consisting of a single ordinary character, this is the 76: * value of that character.</li> 77: * <li>For a quoted string, this is the value of the quote character</li> 78: * <li>For a word, this is TT_WORD</li> 79: * <li>For a number, this is TT_NUMBER</li> 80: * <li>For the end of the line, this is TT_EOL</li> 81: * <li>For the end of the stream, this is TT_EOF</li> 82: * </ul> 83: */ 84: public int ttype = TT_NONE; 85: 86: /** The String associated with word and string tokens. */ 87: public String sval; 88: 89: /** The numeric value associated with number tokens. */ 90: public double nval; 91: 92: /* Indicates whether end-of-line is recognized as a token. */ 93: private boolean eolSignificant = false; 94: 95: /* Indicates whether word tokens are automatically made lower case. */ 96: private boolean lowerCase = false; 97: 98: /* Indicates whether C++ style comments are recognized and skipped. */ 99: private boolean slashSlash = false; 100: 101: /* Indicates whether C style comments are recognized and skipped. */ 102: private boolean slashStar = false; 103: 104: /* Attribute tables of each byte from 0x00 to 0xFF. */ 105: private boolean[] whitespace = new boolean[256]; 106: private boolean[] alphabetic = new boolean[256]; 107: private boolean[] numeric = new boolean[256]; 108: private boolean[] quote = new boolean[256]; 109: private boolean[] comment = new boolean[256]; 110: 111: /* The Reader associated with this class. */ 112: private PushbackReader in; 113: 114: /* Indicates if a token has been pushed back. */ 115: private boolean pushedBack = false; 116: 117: /* Contains the current line number of the reader. */ 118: private int lineNumber = 1; 119: 120: /** 121: * This method reads bytes from an <code>InputStream</code> and tokenizes 122: * them. For details on how this method operates by default, see 123: * <code>StreamTokenizer(Reader)</code>. 124: * 125: * @param is The <code>InputStream</code> to read from 126: * 127: * @deprecated Since JDK 1.1. 128: */ 129: public StreamTokenizer(InputStream is) 130: { 131: this(new InputStreamReader(is)); 132: } 133: 134: /** 135: * This method initializes a new <code>StreamTokenizer</code> to read 136: * characters from a <code>Reader</code> and parse them. The char values 137: * have their hight bits masked so that the value is treated a character 138: * in the range of 0x0000 to 0x00FF. 139: * <p> 140: * This constructor sets up the parsing table to parse the stream in the 141: * following manner: 142: * <ul> 143: * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF 144: * are initialized as alphabetic</li> 145: * <li>The values 0x00 through 0x20 are initialized as whitespace</li> 146: * <li>The values '\'' and '"' are initialized as quote characters</li> 147: * <li>'/' is a comment character</li> 148: * <li>Numbers will be parsed</li> 149: * <li>EOL is not treated as significant</li> 150: * <li>C and C++ (//) comments are not recognized</li> 151: * </ul> 152: * 153: * @param r The <code>Reader</code> to read chars from 154: */ 155: public StreamTokenizer(Reader r) 156: { 157: in = new PushbackReader(r); 158: 159: whitespaceChars(0x00, 0x20); 160: wordChars('A', 'Z'); 161: wordChars('a', 'z'); 162: wordChars(0xA0, 0xFF); 163: commentChar('/'); 164: quoteChar('\''); 165: quoteChar('"'); 166: parseNumbers(); 167: } 168: 169: /** 170: * This method sets the comment attribute on the specified 171: * character. Other attributes for the character are cleared. 172: * 173: * @param ch The character to set the comment attribute for, passed as an int 174: */ 175: public void commentChar(int ch) 176: { 177: if (ch >= 0 && ch <= 255) 178: { 179: comment[ch] = true; 180: whitespace[ch] = false; 181: alphabetic[ch] = false; 182: numeric[ch] = false; 183: quote[ch] = false; 184: } 185: } 186: 187: /** 188: * This method sets a flag that indicates whether or not the end of line 189: * sequence terminates and is a token. The defaults to <code>false</code> 190: * 191: * @param flag <code>true</code> if EOF is significant, <code>false</code> 192: * otherwise 193: */ 194: public void eolIsSignificant(boolean flag) 195: { 196: eolSignificant = flag; 197: } 198: 199: /** 200: * This method returns the current line number. Note that if the 201: * <code>pushBack()</code> method is called, it has no effect on the 202: * line number returned by this method. 203: * 204: * @return The current line number 205: */ 206: public int lineno() 207: { 208: return lineNumber; 209: } 210: 211: /** 212: * This method sets a flag that indicates whether or not alphabetic 213: * tokens that are returned should be converted to lower case. 214: * 215: * @param flag <code>true</code> to convert to lower case, 216: * <code>false</code> otherwise 217: */ 218: public void lowerCaseMode(boolean flag) 219: { 220: lowerCase = flag; 221: } 222: 223: private boolean isWhitespace(int ch) 224: { 225: return (ch >= 0 && ch <= 255 && whitespace[ch]); 226: } 227: 228: private boolean isAlphabetic(int ch) 229: { 230: return ((ch > 255) || (ch >= 0 && alphabetic[ch])); 231: } 232: 233: private boolean isNumeric(int ch) 234: { 235: return (ch >= 0 && ch <= 255 && numeric[ch]); 236: } 237: 238: private boolean isQuote(int ch) 239: { 240: return (ch >= 0 && ch <= 255 && quote[ch]); 241: } 242: 243: private boolean isComment(int ch) 244: { 245: return (ch >= 0 && ch <= 255 && comment[ch]); 246: } 247: 248: /** 249: * This method reads the next token from the stream. It sets the 250: * <code>ttype</code> variable to the appropriate token type and 251: * returns it. It also can set <code>sval</code> or <code>nval</code> 252: * as described below. The parsing strategy is as follows: 253: * <ul> 254: * <li>Skip any whitespace characters.</li> 255: * <li>If a numeric character is encountered, attempt to parse a numeric 256: * value. Leading '-' characters indicate a numeric only if followed by 257: * another non-'-' numeric. The value of the numeric token is terminated 258: * by either the first non-numeric encountered, or the second occurrence of 259: * '-' or '.'. The token type returned is TT_NUMBER and <code>nval</code> 260: * is set to the value parsed.</li> 261: * <li>If an alphabetic character is parsed, all subsequent characters 262: * are read until the first non-alphabetic or non-numeric character is 263: * encountered. The token type returned is TT_WORD and the value parsed 264: * is stored in <code>sval</code>. If lower case mode is set, the token 265: * stored in <code>sval</code> is converted to lower case. The end of line 266: * sequence terminates a word only if EOL signficance has been turned on. 267: * The start of a comment also terminates a word. Any character with a 268: * non-alphabetic and non-numeric attribute (such as white space, a quote, 269: * or a commet) are treated as non-alphabetic and terminate the word.</li> 270: * <li>If a comment character is parsed, then all remaining characters on 271: * the current line are skipped and another token is parsed. Any EOL or 272: * EOF's encountered are not discarded, but rather terminate the comment.</li> 273: * <li>If a quote character is parsed, then all characters up to the 274: * second occurrence of the same quote character are parsed into a 275: * <code>String</code>. This <code>String</code> is stored as 276: * <code>sval</code>, but is not converted to lower case, even if lower case 277: * mode is enabled. The token type returned is the value of the quote 278: * character encountered. Any escape sequences 279: * (\b (backspace), \t (HTAB), \n (linefeed), \f (form feed), \r 280: * (carriage return), \" (double quote), \' (single quote), \\ 281: * (backslash), \XXX (octal esacpe)) are converted to the appropriate 282: * char values. Invalid esacape sequences are left in untranslated. 283: * Unicode characters like ('\ u0000') are not recognized. </li> 284: * <li>If the C++ comment sequence "//" is encountered, and the parser 285: * is configured to handle that sequence, then the remainder of the line 286: * is skipped and another token is read exactly as if a character with 287: * the comment attribute was encountered.</li> 288: * <li>If the C comment sequence "/*" is encountered, and the parser 289: * is configured to handle that sequence, then all characters up to and 290: * including the comment terminator sequence are discarded and another 291: * token is parsed.</li> 292: * <li>If all cases above are not met, then the character is an ordinary 293: * character that is parsed as a token by itself. The char encountered 294: * is returned as the token type.</li> 295: * </ul> 296: * 297: * @return The token type 298: * @exception IOException If an I/O error occurs 299: */ 300: public int nextToken() throws IOException 301: { 302: if (pushedBack) 303: { 304: pushedBack = false; 305: if (ttype != TT_NONE) 306: return ttype; 307: } 308: 309: sval = null; 310: int ch; 311: 312: // Skip whitespace. Deal with EOL along the way. 313: while (isWhitespace(ch = in.read())) 314: if (ch == '\n' || ch == '\r') 315: { 316: lineNumber++; 317: 318: // Throw away \n if in combination with \r. 319: if (ch == '\r' && (ch = in.read()) != '\n') 320: { 321: if (ch != TT_EOF) 322: in.unread(ch); 323: } 324: if (eolSignificant) 325: return (ttype = TT_EOL); 326: } 327: 328: if (ch == '/') 329: if ((ch = in.read()) == '/' && slashSlash) 330: { 331: while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF) 332: ; 333: 334: if (ch != TT_EOF) 335: in.unread(ch); 336: return nextToken(); // Recursive, but not too deep in normal cases 337: } 338: else if (ch == '*' && slashStar) 339: { 340: while (true) 341: { 342: ch = in.read(); 343: if (ch == '*') 344: { 345: if ((ch = in.read()) == '/') 346: break; 347: else if (ch != TT_EOF) 348: in.unread(ch); 349: } 350: else if (ch == '\n' || ch == '\r') 351: { 352: lineNumber++; 353: if (ch == '\r' && (ch = in.read()) != '\n') 354: { 355: if (ch != TT_EOF) 356: in.unread(ch); 357: } 358: } 359: else if (ch == TT_EOF) 360: { 361: break; 362: } 363: } 364: return nextToken(); // Recursive, but not too deep in normal cases 365: } 366: else 367: { 368: if (ch != TT_EOF) 369: in.unread(ch); 370: ch = '/'; 371: } 372: 373: if (ch == TT_EOF) 374: ttype = TT_EOF; 375: else if (isNumeric(ch)) 376: { 377: boolean isNegative = false; 378: if (ch == '-') 379: { 380: // Read ahead to see if this is an ordinary '-' rather than numeric. 381: ch = in.read(); 382: if (isNumeric(ch) && ch != '-') 383: { 384: isNegative = true; 385: } 386: else 387: { 388: if (ch != TT_EOF) 389: in.unread(ch); 390: return (ttype = '-'); 391: } 392: } 393: 394: StringBuffer tokbuf = new StringBuffer(); 395: tokbuf.append((char) ch); 396: 397: int decCount = 0; 398: while (isNumeric(ch = in.read()) && ch != '-') 399: if (ch == '.' && decCount++ > 0) 400: break; 401: else 402: tokbuf.append((char) ch); 403: 404: if (ch != TT_EOF) 405: in.unread(ch); 406: ttype = TT_NUMBER; 407: try 408: { 409: nval = Double.valueOf(tokbuf.toString()).doubleValue(); 410: } 411: catch (NumberFormatException _) 412: { 413: nval = 0.0; 414: } 415: if (isNegative) 416: nval = -nval; 417: } 418: else if (isAlphabetic(ch)) 419: { 420: StringBuffer tokbuf = new StringBuffer(); 421: tokbuf.append((char) ch); 422: while (isAlphabetic(ch = in.read()) || isNumeric(ch)) 423: tokbuf.append((char) ch); 424: if (ch != TT_EOF) 425: in.unread(ch); 426: ttype = TT_WORD; 427: sval = tokbuf.toString(); 428: if (lowerCase) 429: sval = sval.toLowerCase(); 430: } 431: else if (isComment(ch)) 432: { 433: while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF) 434: ; 435: 436: if (ch != TT_EOF) 437: in.unread(ch); 438: return nextToken(); // Recursive, but not too deep in normal cases. 439: } 440: else if (isQuote(ch)) 441: { 442: ttype = ch; 443: StringBuffer tokbuf = new StringBuffer(); 444: while ((ch = in.read()) != ttype && ch != '\n' && ch != '\r' && 445: ch != TT_EOF) 446: { 447: if (ch == '\\') 448: switch (ch = in.read()) 449: { 450: case 'a': ch = 0x7; 451: break; 452: case 'b': ch = '\b'; 453: break; 454: case 'f': ch = 0xC; 455: break; 456: case 'n': ch = '\n'; 457: break; 458: case 'r': ch = '\r'; 459: break; 460: case 't': ch = '\t'; 461: break; 462: case 'v': ch = 0xB; 463: break; 464: case '\n': ch = '\n'; 465: break; 466: case '\r': ch = '\r'; 467: break; 468: case '\"': 469: case '\'': 470: case '\\': 471: break; 472: default: 473: int ch1, nextch; 474: if ((nextch = ch1 = ch) >= '0' && ch <= '7') 475: { 476: ch -= '0'; 477: if ((nextch = in.read()) >= '0' && nextch <= '7') 478: { 479: ch = ch * 8 + nextch - '0'; 480: if ((nextch = in.read()) >= '0' && nextch <= '7' && 481: ch1 >= '0' && ch1 <= '3') 482: { 483: ch = ch * 8 + nextch - '0'; 484: nextch = in.read(); 485: } 486: } 487: } 488: 489: if (nextch != TT_EOF) 490: in.unread(nextch); 491: } 492: 493: tokbuf.append((char) ch); 494: } 495: 496: // Throw away matching quote char. 497: if (ch != ttype && ch != TT_EOF) 498: in.unread(ch); 499: 500: sval = tokbuf.toString(); 501: } 502: else 503: { 504: ttype = ch; 505: } 506: 507: return ttype; 508: } 509: 510: private void resetChar(int ch) 511: { 512: whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] = 513: false; 514: } 515: 516: /** 517: * This method makes the specified character an ordinary character. This 518: * means that none of the attributes (whitespace, alphabetic, numeric, 519: * quote, or comment) will be set on this character. This character will 520: * parse as its own token. 521: * 522: * @param ch The character to make ordinary, passed as an int 523: */ 524: public void ordinaryChar(int ch) 525: { 526: if (ch >= 0 && ch <= 255) 527: resetChar(ch); 528: } 529: 530: /** 531: * This method makes all the characters in the specified range, range 532: * terminators included, ordinary. This means the none of the attributes 533: * (whitespace, alphabetic, numeric, quote, or comment) will be set on 534: * any of the characters in the range. This makes each character in this 535: * range parse as its own token. 536: * 537: * @param low The low end of the range of values to set the whitespace 538: * attribute for 539: * @param hi The high end of the range of values to set the whitespace 540: * attribute for 541: */ 542: public void ordinaryChars(int low, int hi) 543: { 544: if (low < 0) 545: low = 0; 546: if (hi > 255) 547: hi = 255; 548: for (int i = low; i <= hi; i++) 549: resetChar(i); 550: } 551: 552: /** 553: * This method sets the numeric attribute on the characters '0' - '9' and 554: * the characters '.' and '-'. 555: * When this method is used, the result of giving other attributes 556: * (whitespace, quote, or comment) to the numeric characters may 557: * vary depending on the implementation. For example, if 558: * parseNumbers() and then whitespaceChars('1', '1') are called, 559: * this implementation reads "121" as 2, while some other implementation 560: * will read it as 21. 561: */ 562: public void parseNumbers() 563: { 564: for (int i = 0; i <= 9; i++) 565: numeric['0' + i] = true; 566: 567: numeric['.'] = true; 568: numeric['-'] = true; 569: } 570: 571: /** 572: * Puts the current token back into the StreamTokenizer so 573: * <code>nextToken</code> will return the same value on the next call. 574: * May cause the lineno method to return an incorrect value 575: * if lineno is called before the next call to nextToken. 576: */ 577: public void pushBack() 578: { 579: pushedBack = true; 580: } 581: 582: /** 583: * This method sets the quote attribute on the specified character. 584: * Other attributes for the character are cleared. 585: * 586: * @param ch The character to set the quote attribute for, passed as an int. 587: */ 588: public void quoteChar(int ch) 589: { 590: if (ch >= 0 && ch <= 255) 591: { 592: quote[ch] = true; 593: comment[ch] = false; 594: whitespace[ch] = false; 595: alphabetic[ch] = false; 596: numeric[ch] = false; 597: } 598: } 599: 600: /** 601: * This method removes all attributes (whitespace, alphabetic, numeric, 602: * quote, and comment) from all characters. It is equivalent to calling 603: * <code>ordinaryChars(0x00, 0xFF)</code>. 604: * 605: * @see #ordinaryChars(int, int) 606: */ 607: public void resetSyntax() 608: { 609: ordinaryChars(0x00, 0xFF); 610: } 611: 612: /** 613: * This method sets a flag that indicates whether or not "C++" language style 614: * comments ("//" comments through EOL ) are handled by the parser. 615: * If this is <code>true</code> commented out sequences are skipped and 616: * ignored by the parser. This defaults to <code>false</code>. 617: * 618: * @param flag <code>true</code> to recognized and handle "C++" style 619: * comments, <code>false</code> otherwise 620: */ 621: public void slashSlashComments(boolean flag) 622: { 623: slashSlash = flag; 624: } 625: 626: /** 627: * This method sets a flag that indicates whether or not "C" language style 628: * comments (with nesting not allowed) are handled by the parser. 629: * If this is <code>true</code> commented out sequences are skipped and 630: * ignored by the parser. This defaults to <code>false</code>. 631: * 632: * @param flag <code>true</code> to recognized and handle "C" style comments, 633: * <code>false</code> otherwise 634: */ 635: public void slashStarComments(boolean flag) 636: { 637: slashStar = flag; 638: } 639: 640: /** 641: * This method returns the current token value as a <code>String</code> in 642: * the form "Token[x], line n", where 'n' is the current line numbers and 643: * 'x' is determined as follows. 644: * <p> 645: * <ul> 646: * <li>If no token has been read, then 'x' is "NOTHING" and 'n' is 0</li> 647: * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"</li> 648: * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"</li> 649: * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code></li> 650: * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where 651: * 'strnval' is <code>String.valueOf(nval)</code>.</li> 652: * <li>If <code>ttype</code> is a quote character, then 'x' is 653: * <code>sval</code></li> 654: * <li>For all other cases, 'x' is <code>ttype</code></li> 655: * </ul> 656: */ 657: public String toString() 658: { 659: String tempstr; 660: if (ttype == TT_EOF) 661: tempstr = "EOF"; 662: else if (ttype == TT_EOL) 663: tempstr = "EOL"; 664: else if (ttype == TT_WORD) 665: tempstr = sval; 666: else if (ttype == TT_NUMBER) 667: tempstr = "n=" + nval; 668: else if (ttype == TT_NONE) 669: tempstr = "NOTHING"; 670: else // must be an ordinary char. 671: tempstr = "\'" + (char) ttype + "\'"; 672: 673: return "Token[" + tempstr + "], line " + lineno(); 674: } 675: 676: /** 677: * This method sets the whitespace attribute for all characters in the 678: * specified range, range terminators included. 679: * 680: * @param low The low end of the range of values to set the whitespace 681: * attribute for 682: * @param hi The high end of the range of values to set the whitespace 683: * attribute for 684: */ 685: public void whitespaceChars(int low, int hi) 686: { 687: if (low < 0) 688: low = 0; 689: if (hi > 255) 690: hi = 255; 691: for (int i = low; i <= hi; i++) 692: { 693: resetChar(i); 694: whitespace[i] = true; 695: } 696: } 697: 698: /** 699: * This method sets the alphabetic attribute for all characters in the 700: * specified range, range terminators included. 701: * 702: * @param low The low end of the range of values to set the alphabetic 703: * attribute for 704: * @param hi The high end of the range of values to set the alphabetic 705: * attribute for 706: */ 707: public void wordChars(int low, int hi) 708: { 709: if (low < 0) 710: low = 0; 711: if (hi > 255) 712: hi = 255; 713: for (int i = low; i <= hi; i++) 714: alphabetic[i] = true; 715: } 716: }
GNU Classpath (0.95) |