// HTMLTokenizer.java

/*
 * HTML Parser
 * Copyright (C) 1997 David McNicol
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * file COPYING for more details.
 */

package cvu.html;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.Vector;

/**
 * This class tokenizes a stream of HTML tags and blocks of text. Everything
 * between a '&lt;' and the following '&gt;' (both exclusive) becomes a
 * TagToken; every non-empty run of characters outside of tags becomes a
 * TextToken. After the input has been tokenized an Enumeration of tokens
 * can be accessed.
 * @see TagToken
 * @see TextToken
 * @see java.util.Enumeration
 * @author <a href="http://www.strath.ac.uk/~ras97108/">David McNicol</a>
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public class HTMLTokenizer {

	/** Size of the read buffer used when tokenizing a stream. */
	private static final int BUF_LEN = 256;

	/** Store for finished tokens, in the order they were found. */
	private final Vector tokens = new Vector();

	/**
	 * The character that ends the token currently being collected:
	 * '&lt;' while reading text, '&gt;' while reading a tag. It is kept
	 * as a field because a token may span several processBuffer calls.
	 */
	private char separator = '<';

	/**
	 * Constructs a new HTMLTokenizer using the given filename
	 * to create the input stream. The stream is always closed again.
	 * An IOException while opening, reading or closing the file is
	 * logged and not rethrown; tokens found before the failure are kept.
	 * @param file the name of the file to open.
	 */
	public HTMLTokenizer (String file) {

		// try-with-resources guarantees the file handle is released
		// even when parsing fails half way through.
		try (InputStream is = new FileInputStream(file)) {
			parseInputStream(is);
		}
		catch (IOException ioe) {
			sk.iway.iwcm.Logger.error(ioe);
		}
	}

	/**
	 * Constructs a new HTMLTokenizer which tokenizes the complete
	 * contents of the given character array.
	 * @param charbuf the characters to tokenize.
	 * @throws NullPointerException if charbuf is null.
	 */
	public HTMLTokenizer (char[] charbuf) {

		separator = '<';

		// Whatever follows the last separator is handed back by
		// processBuffer and must still be turned into a token.
		flushTrailingText(processBuffer(charbuf, null, charbuf.length));
	}

	/**
	 * Returns an enumeration of the tokens which have been
	 * created by the HTMLTokenizer.
	 */
	public Enumeration getTokens () {
		return tokens.elements();
	}

	/**
	 * Returns the vector in which the tokens are stored. This is the
	 * internal vector itself, not a copy.
	 */
	public Vector getTokenVector () {
		return tokens;
	}

	/**
	 * Parses the input stream given into tokens. Each byte is mapped to
	 * the character with the same unsigned value (i.e. the input is
	 * treated as ISO-8859-1); multi-byte encodings are not decoded.
	 * @param is the input stream to parse.
	 * @throws IOException if reading from the stream fails.
	 */
	private void parseInputStream (InputStream is) throws IOException {

		byte[] readbuf = new byte[BUF_LEN]; // Raw bytes of the current chunk.
		char[] charbuf = new char[BUF_LEN]; // Same chunk as characters.
		StringBuffer unused = null;         // Unfinished token carried over.
		int length;                         // Length of last chunk of read data.

		// Set the separator initially.
		separator = '<';

		// Loop round while the end-of-file has not been reached.
		while ((length = is.read(readbuf)) >= 0) {

			// Mask with 0xFF: a plain (char) cast sign-extends
			// bytes >= 0x80 into the range U+FF80..U+FFFF.
			for (int i = 0; i < length; i++)
				charbuf[i] = (char) (readbuf[i] & 0xFF);

			unused = processBuffer(charbuf, unused, length);
		}

		// Text after the last tag has no closing separator, so it
		// is still sitting in the carry-over buffer.
		flushTrailingText(unused);
	}

	/**
	 * Stores the leftovers of the last processBuffer call as a final
	 * TextToken. Nothing is stored if there are no leftovers, or if the
	 * input ended inside a tag (the unterminated tag is discarded).
	 * @param leftover the unprocessed characters, may be null.
	 */
	private void flushTrailingText (StringBuffer leftover) {

		if (leftover == null || leftover.length() == 0) return;

		// separator == '>' means we were still waiting for a tag to end.
		if (separator != '<') return;

		addTextToken(leftover);
	}

	/**
	 * Wraps the given characters in a new TextToken and stores it.
	 * @param text the text of the token.
	 */
	private void addTextToken (StringBuffer text) {
		TextToken tt = new TextToken();
		tt.setText(text);
		tokens.addElement(tt);
	}

	/**
	 * Processes the given character array. The token buffer will be
	 * updated to start with the contents of the given StringBuffer.
	 * Any leftover parts of the buffer that have not been processed
	 * are returned in a StringBuffer. The next call to processBuffer
	 * will start where the last one left off by putting the returned
	 * StringBuffer in the argument list of the next call.
	 * @param charbuf the character array to be processed.
	 * @param old the leftovers from the last call, may be null.
	 * @param len the maximum length of the array to process.
	 * @return the characters of the unfinished token, or null if
	 *         there are none.
	 */
	private StringBuffer processBuffer (char[] charbuf, StringBuffer old,
	  int len) {

		// Continue the token left unfinished by the previous call.
		StringBuffer data = (old != null) ? old : new StringBuffer(80);

		int idx = -1; // The index of the last separator found.

		while (true) {

			// The next token starts right after the last separator.
			int start = idx + 1;

			// Get the index of the separator.
			idx = indexOf(separator, charbuf, start, len);

			// No further separator: keep the rest for the next call.
			if (idx < 0) {

				if (len - start > 0)
					data.append(charbuf, start, len - start);

				return (data.length() > 0) ? data : null;
			}

			// Complete the current token with the characters in
			// front of the separator.
			data.append(charbuf, start, idx - start);

			if (separator == '<') {

				// A tag opens here, so what we collected is text.
				// Empty text (e.g. between adjacent tags) is skipped.
				if (data.length() > 0)
					addTextToken(data);

				separator = '>';
			} else {

				// A tag closes here, so what we collected is the
				// tag's content without the angle brackets.
				tokens.addElement(new TagToken(data.toString()));

				separator = '<';
			}

			// Create a new, empty data buffer.
			data = new StringBuffer(BUF_LEN);
		}
	}

	/**
	 * Returns the index of the given character in the given character
	 * array or -1 if the character does not appear there.
	 * @param c the test character.
	 * @param array the character array to search.
	 * @param start the first index to search.
	 * @param len the index at which the search stops (exclusive).
	 */
	private int indexOf (char c, char[] array, int start, int len) {
		for (int i = start; i < len; i++)
			if (array[i] == c) return i;

		return -1;
	}
}