My need here was to extract the content of a given tag, identifying the tag by its `id` attribute. For instance, I want to extract the text "some text two" from the page below:
<html><body><div id='one'> some text one </div> <div id='two'> some text two </div></body></html>
Here's the code sample to accomplish this:
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.util.NodeList;
....
Parser parse = new Parser("[[url here]]");
// If you already have the HTML as a string, you can alternatively use the
// parse.setInputHTML method instead of passing a URL to the constructor.
//
// Match only <div> tags whose id attribute equals "two". The two-argument
// HasAttributeFilter compares the attribute value as well as its presence,
// so no manual id comparison is needed afterwards.
NodeList lstNodes = parse.extractAllNodesThatMatch (
    new AndFilter (new NodeClassFilter (Div.class), new HasAttributeFilter ("id", "two")));
if (lstNodes != null)
{
    // Loop rather than take the first element: malformed pages may repeat an id.
    for (int itr = 0; itr < lstNodes.size(); itr++)
    {
        // The NodeClassFilter above only lets Div nodes through, so the cast is safe.
        Div tag = (Div) lstNodes.elementAt(itr);
        // this will print the div html <div id='two'> some text two </div>
        System.out.println(tag.toHtml());
        // now extract the text from this div tag: re-parse just its HTML and
        // let a StringBean collect the text while visiting the nodes
        Parser tagParser = new Parser();
        tagParser.setInputHTML(tag.toHtml());
        StringBean sb = new StringBean ();
        tagParser.visitAllNodesWith (sb);
        System.out.println(sb.getStrings ()); // this will print the content "some text two"
    }
}