Source for javax.swing.text.html.parser.DocumentParser

   1: /* DocumentParser.java -- A parser for HTML documents.
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package javax.swing.text.html.parser;
  40: 
  41: import javax.swing.text.html.parser.Parser;
  42: 
  43: import java.io.IOException;
  44: import java.io.Reader;
  45: 
  46: import javax.swing.text.BadLocationException;
  47: import javax.swing.text.SimpleAttributeSet;
  48: import javax.swing.text.html.HTMLEditorKit;
  49: 
  50: /**
  51:  * <p>A simple error-tolerant HTML parser that uses a DTD document
  52:  * to access data on the possible tokens, arguments and syntax.</p>
  53:  * <p> The parser reads an HTML content from a Reader and calls various
  54:  * notifying methods (which should be overridden in a subclass)
  55:  * when tags or data are encountered.</p>
  56:  * <p>Some HTML elements need no opening or closing tags. The
  57:  * task of this parser is to invoke the tag handling methods also when
  58:  * the tags are not explicitly specified and must be supposed using
  59:  * information, stored in the DTD.
  60:  * For  example, parsing the document
  61:  * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
  62:  * will invoke exactly the handling methods exactly in the same order
  63:  * (and with the same parameters) as if parsing the document: <br>
  64:  * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
  65:  * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
  66:  * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
  67:  * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
  68:  * (supposed tags are given in italics). The parser also supports
  69:  * obsolete elements of HTML syntax.<p>
  70:  * </p>
  71:  * In this implementation, DocumentParser is directly derived from its
  72:  * ancestor without changes of functionality.
  73:  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
  74:  */
  75: public class DocumentParser
  76:   extends Parser
  77:   implements DTDConstants
  78: {
  79:   /**
  80:    * The enclosed working parser class.
  81:    */
  82:   private class gnuParser
  83:     extends gnu.javax.swing.text.html.parser.support.Parser
  84:   {
  85:     private gnuParser(DTD d)
  86:     {
  87:       super(d);
  88:     }
  89: 
  90:     protected final void handleComment(char[] comment)
  91:     {
  92:       parser.handleComment(comment);
  93:       callBack.handleComment(comment, hTag.where.startPosition);
  94:     }
  95: 
  96:     protected final void handleEmptyTag(TagElement tag)
  97:       throws javax.swing.text.ChangedCharSetException
  98:     {
  99:       parser.handleEmptyTag(tag);
 100:       callBack.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
 101:                                hTag.where.startPosition
 102:                               );
 103:     }
 104: 
 105:     protected final void handleEndTag(TagElement tag)
 106:     {
 107:       parser.handleEndTag(tag);
 108:       callBack.handleEndTag(tag.getHTMLTag(), hTag.where.startPosition);
 109:     }
 110: 
 111:     protected final void handleError(int line, String message)
 112:     {
 113:       parser.handleError(line, message);
 114:       callBack.handleError(message, hTag.where.startPosition);
 115:     }
 116: 
 117:     protected final void handleStartTag(TagElement tag)
 118:     {
 119:       parser.handleStartTag(tag);
 120:       SimpleAttributeSet attributes = gnu.getAttributes();
 121: 
 122:       if (tag.fictional())
 123:         attributes.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
 124:                                 Boolean.TRUE
 125:                                );
 126: 
 127:       callBack.handleStartTag(tag.getHTMLTag(), attributes,
 128:                               hTag.where.startPosition
 129:                              );
 130:     }
 131: 
 132:     protected final void handleText(char[] text)
 133:     {
 134:       parser.handleText(text);
 135:       callBack.handleText(text, hTag.where.startPosition);
 136:     }
 137: 
 138:     DTD getDTD()
 139:     {
 140:       return dtd;
 141:     }
 142:   }
 143: 
 144:   /**
 145:    * This field is used to access the identically named
 146:    * methods of the outer class.
 147:    * This is package-private to avoid an accessor method.
 148:    */
 149:   DocumentParser parser = this;
 150: 
 151:   /**
 152:    * The callback.
 153:    * This is package-private to avoid an accessor method.
 154:    */
 155:   HTMLEditorKit.ParserCallback callBack;
 156: 
 157:   /**
 158:    * The reference to the working class of HTML parser that is
 159:    * actually used to parse the document.
 160:    * This is package-private to avoid an accessor method.
 161:    */
 162:   gnuParser gnu;
 163: 
 164:   /**
 165:    * Creates a new parser that uses the given DTD to access data on the
 166:    * possible tokens, arguments and syntax. There is no single - step way
 167:    * to get a default DTD; you must either refer to the implementation -
 168:    * specific packages, write your own DTD or obtain the working instance
 169:    * of parser in other way, for example, by calling
 170:    * {@link javax.swing.text.html.HTMLEditorKit#getParser()}.
 171:    *
 172:    * @param a_dtd a DTD to use.
 173:    */
 174:   public DocumentParser(DTD a_dtd)
 175:   {
 176:     super(a_dtd);
 177:     gnu = new gnuParser(a_dtd);
 178:   }
 179: 
 180:   /**
 181:    * Parses the HTML document, calling methods of the provided
 182:    * callback. This method must be multithread - safe.
 183:    * @param reader The reader to read the HTML document from
 184:    * @param aCallback The callback that is notifyed about the presence
 185:    * of HTML elements in the document.
 186:    * @param ignoreCharSet If thrue, any charset changes during parsing
 187:    * are ignored.
 188:    * @throws java.io.IOException
 189:    */
 190:   public void parse(Reader reader, HTMLEditorKit.ParserCallback aCallback,
 191:                     boolean ignoreCharSet
 192:                    )
 193:              throws IOException
 194:   {
 195:     callBack = aCallback;
 196:     gnu.parse(reader);
 197: 
 198:     callBack.handleEndOfLineString(gnu.getEndOfLineSequence());
 199:     try
 200:       {
 201:         callBack.flush();
 202:       }
 203:     catch (BadLocationException ex)
 204:       {
 205:         // Convert this into the supported type of exception.
 206:         throw new IOException(ex.getMessage());
 207:       }
 208:   }
 209: 
 210:   /**
 211:    * Handle HTML comment. The default method returns without action.
 212:    * @param comment the comment being handled
 213:    */
 214:   protected void handleComment(char[] comment)
 215:   {
 216:     // This default implementation does nothing.
 217:   }
 218: 
 219:   /**
 220:    * Handle the tag with no content, like &lt;br&gt;. The method is
 221:    * called for the elements that, in accordance with the current DTD,
 222:    * has an empty content.
 223:    * @param tag the tag being handled.
 224:    * @throws javax.swing.text.ChangedCharSetException
 225:    */
 226:   protected void handleEmptyTag(TagElement tag)
 227:                          throws javax.swing.text.ChangedCharSetException
 228:   {
 229:     // This default implementation does nothing.
 230:   }
 231: 
 232:   /**
 233:    * The method is called when the HTML closing tag ((like &lt;/table&gt;)
 234:    * is found or if the parser concludes that the one should be present
 235:    * in the current position.
 236:    * @param tag The tag being handled
 237:    */
 238:   protected void handleEndTag(TagElement tag)
 239:   {
 240:     // This default implementation does nothing.
 241:   }
 242: 
 243:   /* Handle error that has occured in the given line. */
 244:   protected void handleError(int line, String message)
 245:   {
 246:     // This default implementation does nothing.
 247:   }
 248: 
 249:   /**
 250:    * The method is called when the HTML opening tag ((like &lt;table&gt;)
 251:    * is found or if the parser concludes that the one should be present
 252:    * in the current position.
 253:    * @param tag The tag being handled
 254:    */
 255:   protected void handleStartTag(TagElement tag)
 256:   {
 257:     // This default implementation does nothing.
 258:   }
 259: 
 260:   /**
 261:    * Handle the text section.
 262:    * @param text a section text.
 263:    */
 264:   protected void handleText(char[] text)
 265:   {
 266:     // This default implementation does nothing.
 267:   }
 268: }