GNU Classpath (0.95) | |
Frames | No Frames |
1: /* Parser.java -- HTML parser 2: Copyright (C) 2005 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: 39: package javax.swing.text.html.parser; 40: 41: import java.io.IOException; 42: import java.io.Reader; 43: 44: import javax.swing.text.ChangedCharSetException; 45: import javax.swing.text.SimpleAttributeSet; 46: 47: /* 48: * FOR DEVELOPERS: To avoid regression, please run the package test 49: * textsuite/javax.swing.text.html.parser/AllParserTests after your 50: * modifications. 51: */ 52: 53: /** 54: * <p>A simple error-tolerant HTML parser that uses a DTD document 55: * to access data on the possible tokens, arguments and syntax.</p> 56: * <p> The parser reads an HTML content from a Reader and calls various 57: * notifying methods (which should be overridden in a subclass) 58: * when tags or data are encountered.</p> 59: * <p>Some HTML elements need no opening or closing tags. The 60: * task of this parser is to invoke the tag handling methods also when 61: * the tags are not explicitly specified and must be supposed using 62: * information, stored in the DTD. 63: * For example, parsing the document 64: * <p><table><tr><td>a<td>b<td>c</tr> <br> 65: * will invoke exactly the handling methods exactly in the same order 66: * (and with the same parameters) as if parsing the document: <br> 67: * <em><html><head></head><body><table>< 68: * tbody></em><tr><td>a<em></td></em><td>b<em> 69: * </td></em><td>c<em></td></tr></em>< 70: * <em>/tbody></table></body></html></em></p> 71: * (supposed tags are given in italics). The parser also supports 72: * obsolete elements of HTML syntax.<p> 73: * </p> 74: * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) 75: */ 76: public class Parser 77: implements DTDConstants 78: { 79: /** 80: * The document template description that will be used to parse the documents. 81: */ 82: protected DTD dtd; 83: 84: /** 85: * The value of this field determines whether or not the Parser will be 86: * strict in enforcing SGML compatibility. The default value is false, 87: * stating that the parser should do everything to parse and get at least 88: * some information even from the incorrectly written HTML input. 89: */ 90: protected boolean strict; 91: 92: /** 93: * The package level reference to the working HTML parser in this 94: * implementation. 95: */ 96: final gnu.javax.swing.text.html.parser.support.Parser gnu; 97: 98: /** 99: * Creates a new parser that uses the given DTD to access data on the 100: * possible tokens, arguments and syntax. There is no single - step way 101: * to get a default DTD; you must either refer to the implementation - 102: * specific packages, write your own DTD or obtain the working instance 103: * of parser in other way, for example, by calling 104: * {@link javax.swing.text.html.HTMLEditorKit#getParser() }. 105: * @param a_dtd A DTD to use. 106: */ 107: public Parser(DTD a_dtd) 108: { 109: dtd = a_dtd; 110: 111: final Parser j = this; 112: 113: gnu = 114: new gnu.javax.swing.text.html.parser.support.Parser(dtd) 115: { 116: protected final void handleComment(char[] comment) 117: { 118: j.handleComment(comment); 119: } 120: 121: protected final void handleEOFInComment() 122: { 123: j.handleEOFInComment(); 124: } 125: 126: protected final void handleEmptyTag(TagElement tag) 127: throws javax.swing.text.ChangedCharSetException 128: { 129: j.handleEmptyTag(tag); 130: } 131: 132: protected final void handleStartTag(TagElement tag) 133: { 134: j.handleStartTag(tag); 135: } 136: 137: protected final void handleEndTag(TagElement tag) 138: { 139: j.handleEndTag(tag); 140: } 141: 142: protected final void handleError(int line, String message) 143: { 144: j.handleError(line, message); 145: } 146: 147: protected final void handleText(char[] text) 148: { 149: j.handleText(text); 150: } 151: 152: protected final void handleTitle(char[] title) 153: { 154: j.handleTitle(title); 155: } 156: 157: protected final void markFirstTime(Element element) 158: { 159: j.markFirstTime(element); 160: } 161: 162: protected final void startTag(TagElement tag) 163: throws ChangedCharSetException 164: { 165: j.startTag(tag); 166: } 167: 168: protected final void endTag(boolean omitted) 169: { 170: j.endTag(omitted); 171: } 172: 173: protected TagElement makeTag(Element element) 174: { 175: return j.makeTag(element); 176: } 177: 178: protected TagElement makeTag(Element element, boolean isSupposed) 179: { 180: return j.makeTag(element, isSupposed); 181: } 182: }; 183: } 184: 185: /** 186: * Parse the HTML text, calling various methods in response to the 187: * occurence of the corresponding HTML constructions. 188: * @param reader The reader to read the source HTML from. 189: * @throws IOException If the reader throws one. 190: */ 191: public synchronized void parse(Reader reader) 192: throws IOException 193: { 194: gnu.parse(reader); 195: } 196: 197: /** 198: * Parses DTD markup declaration. Currently returns without action. 199: * @return null. 200: * @throws java.io.IOException 201: */ 202: public String parseDTDMarkup() 203: throws IOException 204: { 205: return gnu.parseDTDMarkup(); 206: } 207: 208: /** 209: * Parse DTD document declarations. Currently only parses the document 210: * type declaration markup. 211: * @param strBuff 212: * @return true if this is a valid DTD markup declaration. 213: * @throws IOException 214: */ 215: protected boolean parseMarkupDeclarations(StringBuffer strBuff) 216: throws IOException 217: { 218: return gnu.parseMarkupDeclarations(strBuff); 219: } 220: 221: /** 222: * Get the attributes of the current tag. 223: * @return The attribute set, representing the attributes of the current tag. 224: */ 225: protected SimpleAttributeSet getAttributes() 226: { 227: return gnu.getAttributes(); 228: } 229: 230: /** 231: * Get the number of the document line being parsed. 232: * @return The current line. 233: */ 234: protected int getCurrentLine() 235: { 236: return gnu.hTag.where.beginLine; 237: } 238: 239: /** 240: * Get the current position in the document being parsed. 241: * @return The current position. 242: */ 243: protected int getCurrentPos() 244: { 245: return gnu.hTag.where.startPosition; 246: } 247: 248: /** 249: * The method is called when the HTML end (closing) tag is found or if 250: * the parser concludes that the one should be present in the 251: * current position. The method is called immediatly 252: * before calling the handleEndTag(). 253: * @param omitted True if the tag is no actually present in the document, 254: * but is supposed by the parser (like </html> at the end of the 255: * document). 256: */ 257: protected void endTag(boolean omitted) 258: { 259: // This default implementation does nothing. 260: } 261: 262: /** 263: * Invokes the error handler. The default method in this implementation 264: * finally delegates the call to handleError, also providing the number of the 265: * current line. 266: */ 267: protected void error(String msg) 268: { 269: gnu.error(msg); 270: } 271: 272: /** 273: * Invokes the error handler. The default method in this implementation 274: * finally delegates the call to error (msg+": '"+invalid+"'"). 275: */ 276: protected void error(String msg, String invalid) 277: { 278: gnu.error(msg, invalid); 279: } 280: 281: /** 282: * Invokes the error handler. The default method in this implementation 283: * finally delegates the call to error (parm1+" "+ parm2+" "+ parm3). 284: */ 285: protected void error(String parm1, String parm2, String parm3) 286: { 287: gnu.error(parm1, parm2, parm3); 288: } 289: 290: /** 291: * Invokes the error handler. The default method in this implementation 292: * finally delegates the call to error 293: * (parm1+" "+ parm2+" "+ parm3+" "+ parm4). 294: */ 295: protected void error(String parm1, String parm2, String parm3, String parm4) 296: { 297: gnu.error(parm1, parm2, parm3, parm4); 298: } 299: 300: /** 301: * In this implementation, this is never called and returns without action. 302: */ 303: protected void flushAttributes() 304: { 305: gnu.flushAttributes(); 306: } 307: 308: /** 309: * Handle HTML comment. The default method returns without action. 310: * @param comment The comment being handled 311: */ 312: protected void handleComment(char[] comment) 313: { 314: // This default implementation does nothing. 315: } 316: 317: /** 318: * This is additionally called in when the HTML content terminates 319: * without closing the HTML comment. This can only happen if the 320: * HTML document contains errors (for example, the closing --;gt is 321: * missing. The default method calls the error handler. 322: */ 323: protected void handleEOFInComment() 324: { 325: gnu.error("Unclosed comment"); 326: } 327: 328: /** 329: * Handle the tag with no content, like <br>. The method is 330: * called for the elements that, in accordance with the current DTD, 331: * has an empty content. 332: * @param tag The tag being handled. 333: * @throws javax.swing.text.ChangedCharSetException 334: */ 335: protected void handleEmptyTag(TagElement tag) 336: throws ChangedCharSetException 337: { 338: // This default implementation does nothing. 339: } 340: 341: /** 342: * The method is called when the HTML closing tag ((like </table>) 343: * is found or if the parser concludes that the one should be present 344: * in the current position. 345: * @param tag The tag being handled 346: */ 347: protected void handleEndTag(TagElement tag) 348: { 349: // This default implementation does nothing. 350: } 351: 352: /* Handle error that has occured in the given line. */ 353: protected void handleError(int line, String message) 354: { 355: // This default implementation does nothing. 356: } 357: 358: /** 359: * The method is called when the HTML opening tag ((like <table>) 360: * is found or if the parser concludes that the one should be present 361: * in the current position. 362: * @param tag The tag being handled 363: */ 364: protected void handleStartTag(TagElement tag) 365: { 366: // This default implementation does nothing. 367: } 368: 369: /** 370: * Handle the text section. 371: * <p> For non-preformatted section, the parser replaces 372: * \t, \r and \n by spaces and then multiple spaces 373: * by a single space. Additionaly, all whitespace around 374: * tags is discarded. 375: * </p> 376: * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves 377: * all tabs and spaces, but removes <b>one</b> bounding \r, \n or \r\n, 378: * if it is present. Additionally, it replaces each occurence of \r or \r\n 379: * by a single \n.</p> 380: * 381: * @param text A section text. 382: */ 383: protected void handleText(char[] text) 384: { 385: // This default implementation does nothing. 386: } 387: 388: /** 389: * Handle HTML <title> tag. This method is invoked when 390: * both title starting and closing tags are already behind. 391: * The passed argument contains the concatenation of all 392: * title text sections. 393: * @param title The title text. 394: */ 395: protected void handleTitle(char[] title) 396: { 397: // This default implementation does nothing. 398: } 399: 400: /** 401: * Constructs the tag from the given element. In this implementation, 402: * this is defined, but never called. 403: * @param element the base element of the tag. 404: * @return the tag 405: */ 406: protected TagElement makeTag(Element element) 407: { 408: return makeTag(element, false); 409: } 410: 411: /** 412: * Constructs the tag from the given element. 413: * @param element the tag base {@link javax.swing.text.html.parser.Element} 414: * @param isSupposed true if the tag is not actually present in the 415: * html input, but the parser supposes that it should to occur in 416: * the current location. 417: * @return the tag 418: */ 419: protected TagElement makeTag(Element element, boolean isSupposed) 420: { 421: return new TagElement(element, isSupposed); 422: } 423: 424: /** 425: * This is called when the tag, representing the given element, 426: * occurs first time in the document. 427: * @param element 428: */ 429: protected void markFirstTime(Element element) 430: { 431: // This default implementation does nothing. 432: } 433: 434: /** 435: * The method is called when the HTML opening tag ((like <table>) 436: * is found or if the parser concludes that the one should be present 437: * in the current position. The method is called immediately before 438: * calling the handleStartTag. 439: * @param tag The tag 440: */ 441: protected void startTag(TagElement tag) 442: throws ChangedCharSetException 443: { 444: // This default implementation does nothing. 445: } 446: }
GNU Classpath (0.95) |