package extractor;
import java.io.File;
import java.io.IOException;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import sap.FileInputReader;
/**
 * Prints the text associated with selected HTML tags (links, italic and bold
 * runs) found in an HTML document, using the HTML Parser library.
 */
public class HTMLElementExtractor {

    /**
     * Finds every occurrence of {@code tag} in {@code content} and prints the
     * text associated with it to standard output, one line per occurrence.
     *
     * <p>Despite its name, this method is not limited to bold text. The
     * supported tag names are:
     * <ul>
     * <li>{@code "a"} - prints the plain text of the link's first child, when
     * the link has one;</li>
     * <li>{@code "i"} and {@code "b"} - prints the plain text of the node that
     * immediately follows the opening tag, or of that node's first child when
     * the following node is itself a tag.</li>
     * </ul>
     * Any other tag name is parsed and matched but produces no output.
     * {@code "em"} is deliberately not handled: judging from the input
     * documents, {@code <em>} does not mark the text this tool is after.
     *
     * <p>Tag names are compared ignoring case, in line with the
     * case-insensitive matching performed by {@link TagNameFilter}.
     *
     * @param content the HTML markup to scan
     * @param tag     the name of the tag to look for, e.g. {@code "b"}
     * @throws ParserException if the HTML parser fails to process
     *                         {@code content}
     */
    public void extractBoldFont(String content, String tag)
            throws ParserException {
        // A regular-expression approach (matching ">[A-Za-z ]+</i>") was tried
        // and abandoned: it is incomplete, e.g. for
        // <i><span>blahblah...</span></i> the text "blahblah..." is missed.
        // The markup is therefore walked with a real HTML parser instead.
        Parser parser = new Parser(content);
        NodeFilter filter = new TagNameFilter(tag);
        NodeList matches = parser.extractAllNodesThatMatch(filter);

        // Compare by value, never with ==: reference comparison only works
        // for interned literals and silently fails for tag names built at
        // runtime. The checks do not depend on the node, so do them once.
        boolean isLink = "a".equalsIgnoreCase(tag);
        boolean isItalicOrBold = "i".equalsIgnoreCase(tag)
                || "b".equalsIgnoreCase(tag);

        NodeIterator iterator = matches.elements();
        while (iterator.hasMoreNodes()) {
            TagNode node = (TagNode) iterator.nextNode();
            if (isLink) {
                // <a href="../../7d/a109d5efcc4644a9f2da2ab27e50dd/content.htm"
                // title="Go to specified document">Search Task Panel for BI
                // Data Services</a>
                Node linkText = node.getFirstChild();
                if (linkText != null) {
                    System.out.println(linkText.toPlainTextString());
                }
            } else if (isItalicOrBold) {
                // The text is looked up on the next sibling rather than on a
                // child - presumably the parser does not treat <i>/<b> as
                // container tags, so their content follows them as siblings.
                Node following = node.getNextSibling();
                if (following instanceof TextNode) {
                    // <i>blah</i>
                    System.out.println(following.toPlainTextString());
                } else if (following instanceof TagNode) {
                    // <i><span class="SAPXDPNavigationPath "
                    // title="Navigation path">blah</span></i>
                    Node nested = following.getFirstChild();
                    // A childless tag (e.g. <b><br>) has nothing to print;
                    // skip it instead of failing with a NullPointerException.
                    if (nested != null) {
                        System.out.println(nested.toPlainTextString());
                    }
                }
            }
        }
    }

    /**
     * Reads {@code test/input/content.htm} and prints the text of its
     * {@code <b>} elements. I/O and parser failures are reported by printing
     * their stack trace.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        HTMLElementExtractor extractor = new HTMLElementExtractor();
        String content;
        try {
            content = new FileInputReader(new File("test/input/content.htm"))
                    .getStringContent();
            extractor.extractBoldFont(content, "b");
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }
}