Source for javax.swing.text.html.parser.Parser

   1: /* Parser.java -- HTML parser
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package javax.swing.text.html.parser;
  40: 
  41: import java.io.IOException;
  42: import java.io.Reader;
  43: 
  44: import javax.swing.text.ChangedCharSetException;
  45: import javax.swing.text.SimpleAttributeSet;
  46: 
  47: /*
  48:  * FOR DEVELOPERS: To avoid regression, please run the package test
  49:  * textsuite/javax.swing.text.html.parser/AllParserTests after your
  50:  * modifications.
  51:  */
  52: 
  53: /**
  54:  * <p>A simple error-tolerant HTML parser that uses a DTD document
  55:  * to access data on the possible tokens, arguments and syntax.</p>
  56:  * <p> The parser reads an HTML content from a Reader and calls various
  57:  * notifying methods (which should be overridden in a subclass)
  58:  * when tags or data are encountered.</p>
  59:  * <p>Some HTML elements need no opening or closing tags. The
  60:  * task of this parser is to invoke the tag handling methods also when
  61:  * the tags are not explicitly specified and must be supposed using
  62:  * information, stored in the DTD.
  63:  * For  example, parsing the document
  64:  * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
  65:  * will invoke the handling methods in exactly the same order
  66:  * (and with the same parameters) as if parsing the document: <br>
  67:  * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
  68:  * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
  69:  * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
  70:  * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
  71:  * (supposed tags are given in italics). The parser also supports
  72:  * obsolete elements of HTML syntax.<p>
  73:  * </p>
  74:  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
  75:  */
  76: public class Parser 
  77:   implements DTDConstants
  78: {
  79:   /**
  80:    * The DTD (document type definition) that will be used to parse the documents.
  81:    */
  82:   protected DTD dtd;
  83: 
  84:   /**
  85:    * The value of this field determines whether or not the Parser will be
  86:    * strict in enforcing SGML compatibility. The default value is false,
  87:    * stating that the parser should do everything to parse and get at least
  88:    * some information even from the incorrectly written HTML input.
  89:    */
  90:   protected boolean strict;
  91: 
  92:   /**
  93:    * The package level reference to the working HTML parser in this
  94:    * implementation.
  95:    */
  96:   final gnu.javax.swing.text.html.parser.support.Parser gnu;
  97: 
  98:   /**
  99:    * Creates a new parser that uses the given DTD to access data on the
 100:    * possible tokens, arguments and syntax. There is no single-step way
 101:    * to get a default DTD; you must either refer to implementation-specific
 102:    * packages, write your own DTD or obtain the working instance
 103:    * of the parser in another way, for example, by calling
 104:    * {@link javax.swing.text.html.HTMLEditorKit#getParser() }.
 105:    * @param a_dtd A DTD to use.
 106:    */
 107:   public Parser(DTD a_dtd)
 108:   {
 109:     dtd = a_dtd;
 110: 
 111:     final Parser j = this; // alias used by the anonymous subclass below to call back into this instance
 112: 
 113:     gnu =
 114:       new gnu.javax.swing.text.html.parser.support.Parser(dtd)
 115:         {
 116:           protected final void handleComment(char[] comment)
 117:           {
 118:             j.handleComment(comment);
 119:           }
 120: 
 121:           protected final void handleEOFInComment()
 122:           {
 123:             j.handleEOFInComment();
 124:           }
 125: 
 126:           protected final void handleEmptyTag(TagElement tag)
 127:             throws javax.swing.text.ChangedCharSetException
 128:           {
 129:             j.handleEmptyTag(tag);
 130:           }
 131: 
 132:           protected final void handleStartTag(TagElement tag)
 133:           {
 134:             j.handleStartTag(tag);
 135:           }
 136: 
 137:           protected final void handleEndTag(TagElement tag)
 138:           {
 139:             j.handleEndTag(tag);
 140:           }
 141: 
 142:           protected final void handleError(int line, String message)
 143:           {
 144:             j.handleError(line, message);
 145:           }
 146: 
 147:           protected final void handleText(char[] text)
 148:           {
 149:             j.handleText(text);
 150:           }
 151: 
 152:           protected final void handleTitle(char[] title)
 153:           {
 154:             j.handleTitle(title);
 155:           }
 156: 
 157:           protected final void markFirstTime(Element element)
 158:           {
 159:             j.markFirstTime(element);
 160:           }
 161: 
 162:           protected final void startTag(TagElement tag)
 163:             throws ChangedCharSetException
 164:           {
 165:             j.startTag(tag);
 166:           }
 167: 
 168:           protected final void endTag(boolean omitted)
 169:           {
 170:             j.endTag(omitted);
 171:           }
 172: 
 173:           protected TagElement makeTag(Element element)
 174:           {
 175:             return j.makeTag(element);
 176:           }
 177: 
 178:           protected TagElement makeTag(Element element, boolean isSupposed)
 179:           {
 180:             return j.makeTag(element, isSupposed);
 181:           }
 182:         };
 183:   }
 184: 
 185:   /**
 186:    * Parse the HTML text, calling various methods in response to the
 187:    * occurrence of the corresponding HTML constructions.
 188:    * @param reader The reader to read the source HTML from.
 189:    * @throws IOException If the reader throws one.
 190:    */
 191:   public synchronized void parse(Reader reader)
 192:     throws IOException
 193:   {
 194:     gnu.parse(reader);
 195:   }
 196: 
 197:   /**
 198:    * Parses DTD markup declaration. Currently returns without action.
 199:    * @return null.
 200:    * @throws java.io.IOException
 201:    */
 202:   public String parseDTDMarkup()
 203:     throws IOException
 204:   {
 205:     return gnu.parseDTDMarkup();
 206:   }
 207: 
 208:   /**
 209:    * Parse DTD document declarations. Currently only parses the document
 210:    * type declaration markup.
 211:    * @param strBuff
 212:    * @return true if this is a valid DTD markup declaration.
 213:    * @throws IOException
 214:    */
 215:   protected boolean parseMarkupDeclarations(StringBuffer strBuff)
 216:     throws IOException
 217:   {
 218:     return gnu.parseMarkupDeclarations(strBuff);
 219:   }
 220: 
 221:   /**
 222:    * Get the attributes of the current tag.
 223:    * @return The attribute set, representing the attributes of the current tag.
 224:    */
 225:   protected SimpleAttributeSet getAttributes()
 226:   {
 227:     return gnu.getAttributes();
 228:   }
 229: 
 230:   /**
 231:    * Get the number of the document line being parsed.
 232:    * @return The current line.
 233:    */
 234:   protected int getCurrentLine()
 235:   {
 236:     return gnu.hTag.where.beginLine;
 237:   }
 238: 
 239:   /**
 240:    * Get the current position in the document being parsed.
 241:    * @return The current position.
 242:    */
 243:   protected int getCurrentPos()
 244:   {
 245:     return gnu.hTag.where.startPosition;
 246:   }
 247: 
 248:   /**
 249:    * The method is called when the HTML end (closing) tag is found or if
 250:    * the parser concludes that the one should be present in the
 251:    * current position. The method is called immediately
 252:    * before calling the handleEndTag().
 253:    * @param omitted True if the tag is not actually present in the document,
 254:    * but is supposed by the parser (like &lt;/html&gt; at the end of the
 255:    * document).
 256:    */
 257:   protected void endTag(boolean omitted)
 258:   {
 259:     // This default implementation does nothing.
 260:   }
 261: 
 262:   /**
 263:    * Invokes the error handler. The default method in this implementation
 264:    * finally delegates the call to handleError, also providing the number of the
 265:    * current line.
 266:    */
 267:   protected void error(String msg)
 268:   {
 269:     gnu.error(msg);
 270:   }
 271: 
 272:   /**
 273:    * Invokes the error handler. The default method in this implementation
 274:    * finally delegates the call to error (msg+": '"+invalid+"'").
 275:    */
 276:   protected void error(String msg, String invalid)
 277:   {
 278:     gnu.error(msg, invalid);
 279:   }
 280: 
 281:   /**
 282:    * Invokes the error handler. The default method in this implementation
 283:    * finally delegates the call to error (parm1+" "+ parm2+" "+ parm3).
 284:    */
 285:   protected void error(String parm1, String parm2, String parm3)
 286:   {
 287:     gnu.error(parm1, parm2, parm3);
 288:   }
 289: 
 290:   /**
 291:    * Invokes the error handler. The default method in this implementation
 292:    * finally delegates the call to error
 293:    * (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
 294:    */
 295:   protected void error(String parm1, String parm2, String parm3, String parm4)
 296:   {
 297:     gnu.error(parm1, parm2, parm3, parm4);
 298:   }
 299: 
 300:   /**
 301:    * In this implementation, this is never called and returns without action.
 302:    */
 303:   protected void flushAttributes()
 304:   {
 305:     gnu.flushAttributes();
 306:   }
 307: 
 308:   /**
 309:    * Handle HTML comment. The default method returns without action.
 310:    * @param comment The comment being handled
 311:    */
 312:   protected void handleComment(char[] comment)
 313:   {
 314:     // This default implementation does nothing.
 315:   }
 316: 
 317:   /**
 318:    * This is additionally called when the HTML content terminates
 319:    * without closing the HTML comment. This can only happen if the
 320:    * HTML document contains errors (for example, the closing --&gt; is
 321:    * missing). The default method calls the error handler.
 322:    */
 323:   protected void handleEOFInComment()
 324:   {
 325:     gnu.error("Unclosed comment");
 326:   }
 327: 
 328:   /**
 329:    * Handle the tag with no content, like &lt;br&gt;. The method is
 330:    * called for the elements that, in accordance with the current DTD,
 331:    * have empty content.
 332:    * @param tag The tag being handled.
 333:    * @throws javax.swing.text.ChangedCharSetException
 334:    */
 335:   protected void handleEmptyTag(TagElement tag)
 336:     throws ChangedCharSetException
 337:   {
 338:     // This default implementation does nothing.
 339:   }
 340: 
 341:   /**
 342:    * The method is called when the HTML closing tag (like &lt;/table&gt;)
 343:    * is found or if the parser concludes that the one should be present
 344:    * in the current position.
 345:    * @param tag The tag being handled
 346:    */
 347:   protected void handleEndTag(TagElement tag)
 348:   {
 349:     // This default implementation does nothing.
 350:   }
 351: 
 352:   /** Handle an error that has occurred in the given line. */
 353:   protected void handleError(int line, String message)
 354:   {
 355:     // This default implementation does nothing.
 356:   }
 357: 
 358:   /**
 359:    * The method is called when the HTML opening tag (like &lt;table&gt;)
 360:    * is found or if the parser concludes that the one should be present
 361:    * in the current position.
 362:    * @param tag The tag being handled
 363:    */
 364:   protected void handleStartTag(TagElement tag)
 365:   {
 366:     // This default implementation does nothing.
 367:   }
 368: 
 369:   /**
 370:    * Handle the text section.
 371:    * <p> For non-preformatted section, the parser replaces
 372:    * \t, \r and \n by spaces and then multiple spaces
 373:    * by a single space. Additionally, all whitespace around
 374:    * tags is discarded.
 375:    * </p>
 376:    * <p> For pre-formatted text (inside TEXTAREA and PRE), the parser preserves
 377:    * all tabs and spaces, but removes <b>one</b>  bounding \r, \n or \r\n,
 378:    * if it is present. Additionally, it replaces each occurrence of \r or \r\n
 379:    * by a single \n.</p>
 380:    *
 381:    * @param text A section text.
 382:    */
 383:   protected void handleText(char[] text)
 384:   {
 385:     // This default implementation does nothing.
 386:   }
 387: 
 388:   /**
 389:    * Handle HTML &lt;title&gt; tag. This method is invoked when
 390:    * both title starting and closing tags are already behind.
 391:    * The passed argument contains the concatenation of all
 392:    * title text sections.
 393:    * @param title The title text.
 394:    */
 395:   protected void handleTitle(char[] title)
 396:   {
 397:     // This default implementation does nothing.
 398:   }
 399: 
 400:   /**
 401:    * Constructs the tag from the given element. In this implementation,
 402:    * this is defined, but never called.
 403:    * @param element the base element of the tag.
 404:    * @return the tag
 405:    */
 406:   protected TagElement makeTag(Element element)
 407:   {
 408:     return makeTag(element, false);
 409:   }
 410: 
 411:   /**
 412:    * Constructs the tag from the given element.
 413:    * @param element the tag base {@link javax.swing.text.html.parser.Element}
 414:    * @param isSupposed true if the tag is not actually present in the
 415:    * html input, but the parser supposes that it should occur at
 416:    * the current location.
 417:    * @return the tag
 418:    */
 419:   protected TagElement makeTag(Element element, boolean isSupposed)
 420:   {
 421:     return new TagElement(element, isSupposed);
 422:   }
 423: 
 424:   /**
 425:    * This is called when the tag, representing the given element,
 426:    * occurs for the first time in the document.
 427:    * @param element The element whose tag was encountered.
 428:    */
 429:   protected void markFirstTime(Element element)
 430:   {
 431:     // This default implementation does nothing.
 432:   }
 433: 
 434:   /**
 435:    * The method is called when the HTML opening tag (like &lt;table&gt;)
 436:    * is found or if the parser concludes that the one should be present
 437:    * in the current position. The method is called immediately before
 438:    * calling the handleStartTag.
 439:    * @param tag The tag
 440:    */
 441:   protected void startTag(TagElement tag)
 442:     throws ChangedCharSetException
 443:   {
 444:     // This default implementation does nothing.
 445:   }
 446: }