HTMLTree.java
/*
* HTML Parser
* Copyright (C) 1997 David McNicol
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* file COPYING for more details.
*/
package cvu.html;
import java.util.Enumeration;
/**
* This class stores an HTML file in tree format. It can be constructed
* from an HTMLTokenizer or a file name, in which case it will create its
* own tokenizer. <p>
* Once the HTML file has been parsed a number of search operations can
* be performed. The nature of the searches are described below, but some
* of their uses are highlighted here: <p>
* <ul>
* <li>Subtree - Finding all of the FORM elements within a BODY element.
* <li>Sibling - Finding all the LI elements within the same UL element.
* <li>All - Finding every occurence of the A element.
* </ul>
* There is also a context search, which performs a subtree search on the
* specified element's parent. This can be thought of as a combination
* between as sibling search and a subtree search.
* @see HTMLTokenizer
* @author <a href="http://www.strath.ac.uk/~ras97108/">David McNicol</a>
*/
@SuppressWarnings({"rawtypes"})
public class HTMLTree {
private HTMLNode root; // The root of the HTML tree.
/**
* Constructs a new HTMLTree using the tokens from the
* specified Enumeration.
*/
public HTMLTree (Enumeration e) {
// Create the root element from the enumeration of tokens.
root = new HTMLNode(null, null, e);
}
/**
* Constructs a new HTMLTree using the tokens from the
* specified HTMLTokenizer.
* @param ht the source of the HTML tokens.
*/
public HTMLTree (HTMLTokenizer ht) {
// Create the root element from the tokens.
root = new HTMLNode(null, null, ht.getTokens());
}
/**
* Constructs a new HTMLTree from the specified HTML file.
* @param filename the name of the HTML file.
*/
public HTMLTree (String filename) {
// Tokenize the HTML file.
HTMLTokenizer ht = new HTMLTokenizer(filename);
// Create the root element from the tokens.
root = new HTMLNode(null, null, ht.getTokens());
}
/**
* Finds the first element with the specified name in the specified
* subtree.
* @param name the name of the element to search for.
* @param tree the subtree to search.
*/
public HTMLNode findInSubtree (String name, HTMLNode tree) {
return find(name, tree, null, true, false);
}
/**
* Finds the next element after the specified one in the subtree.
* If the previous element is not in the subtree then nothing will
* be found.
* @param tree the subtree to search.
* @param prev a previously found element.
*/
public HTMLNode findNextInSubtree (HTMLNode tree,
HTMLNode prev) {
// Return nothing if there is no previous element.
if (prev == null) return null;
// Search the subtree for the next element with the same name.
return find(prev.getName(), tree, prev, true, false);
}
/**
* Finds the first element with the specified name in the entire
* tree.
* @param name the name of the element to search for.
*/
public HTMLNode findInAll (String name) {
return find(name, root, null, true, false);
}
/**
* Finds the next element with the same name as the one specified
* in the entire tree.
* @param prev the previously found element.
*/
public HTMLNode findNextInAll (HTMLNode prev) {
// Return nothing if there is no previous element.
if (prev == null) return null;
// Search for the next element in the entire tree.
return find(prev.getName(), prev.getParent(), prev, true, true);
}
/**
* Find the first element with the specified name in the specified
* element's context (that is, the elements parent's subtree).
* @param name the name of the element to search for.
* @param el the element whose context is to be searched.
*/
public HTMLNode findInContext (String name, HTMLNode el) {
// Return nothing if the arguments are invalid.
if (el == null) return null;
// Search the elements parent's subtree.
return find(name, el.getParent(), null, true, false);
}
/**
* Find the next element with the same name as the specified one
* in the first element's context (that is, the first elements
* parent's subtree).
* If the previous element is not in the subtree then nothing
* will be found.
* @param el the element whose context is to be searched.
* @param the previously found element.
*/
public HTMLNode findNextInContext (HTMLNode el,
HTMLNode prev) {
// Return nothing if the arguments are invalid.
if (el == null) return null;
// Search the elements parent's subtree.
return find(el.getName(), el.getParent(), prev, true, false);
}
/**
* Finds the next element with the same name as the specified
* one amongst that elements siblings (that is, the elements
* parent's children).
* @param el the element whose siblings are to be searched.
*/
public HTMLNode findSibling (HTMLNode el) {
// Return nothing if the element is invalid.
if (el == null) return null;
// Search for a sibling in the elements parent's subtree.
return find(el.getName(), el.getParent(), el, false, false);
}
/**
* Prints a string representation of the HTMLTree.
*/
@Override
public String toString () {
return "HTMLTree[" + root + "]";
}
/**
* Generic find method which searches for a string in the given
* tree's children. However, the search will not start until the
* start element has been passed. The tree's grandchildren will
* be searched recursively if the <code>recursive</code> argument
* is true. The whole tree after the element will be searched
* if the <code>searchParent</code> argument is true. In this
* case the method will recurse back towards the root element.
*/
private HTMLNode find (String name, HTMLNode tree,
HTMLNode start, boolean recursive, boolean searchParent) {
Enumeration children; // The immediate children of the subtree.
Object next; // The next object from the enumeration.
boolean searching; // True if we are actively searching.
HTMLNode child; // One of the subtree's children.
HTMLNode found; // Result of a subtree search.
// Return nothing if the arguments are invalid.
if (name == null || tree == null) return null;
// Check if we should delay the search until we find the
// specified start element.
searching = (start == null);
// Get the subtree's children.
children = tree.getChildren();
// Return nothing if the subtree has no children.
if (children == null) return null;
// Loop through the subtree's children.
while (children.hasMoreElements()) {
// Get the next child from the enumeration.
next = children.nextElement();
// Check if this child is an HTMLNode.
if (! (next instanceof HTMLNode)) continue;
// Cast the child into type HTMLNode.
child = (HTMLNode) next;
if (searching) {
// Check if we have found the element.
if (name.equalsIgnoreCase(child.getName()))
return child;
// Check if we should search grandchildren.
if (recursive) {
// Search the child's subtree.
found = find(name, child, null, true,
false);
// Return the element if we found one.
if (found != null) return found;
}
} else {
// Check if this element is the start element.
if (child == start) searching = true;
}
}
// Check if we should search the subtree's parent tree.
if (searchParent) {
HTMLNode parent = tree.getParent();
// Check if the subtree has a parent.
if (parent == null) return null;
// Otherwise search it, starting after the subtree.
return find(name, parent, tree, true, true);
}
return null;
}
}