package chapter9;
import java.io.*;
import org.htmlparser.util.*;
import org.htmlparser.Parser;
import org.htmlparser.filters.*;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.ParserException;
/**
 * Collects result links from Baidu search pages using htmlparser.
 *
 * <p>Keywords are read from a word-list file (one keyword per line). For each
 * keyword a Baidu search URL is built, the result page is parsed, and the
 * links found on it are written to {@code <wordlist>_dsturl.txt}.
 */
public class RadarSpecialSearchEngine {

    /** Word list used when no path is given on the command line. */
    private static final String DEFAULT_WORD_LIST = "D:\\workshop\\docs\\wordlist.txt";

    /** Charset announced in the query string ({@code ie=gb2312}) and used for parsing. */
    private static final String PAGE_ENCODING = "GB2312";

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} is the path of the word list.
     *             When absent, {@link #DEFAULT_WORD_LIST} is used.
     * @throws ParserException declared for compatibility; failures are caught
     *                         and printed here
     */
    public static void main(String[] args) throws ParserException {
        String wordList = (args != null && args.length > 0) ? args[0] : DEFAULT_WORD_LIST;
        try {
            TravelWordTable(wordList);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs one Baidu search per keyword in {@code filename} and writes the
     * collected links to {@code filename + "_dsturl.txt"}.
     *
     * @param filename path of the word list, one keyword per line
     * @throws IOException if the word list cannot be read or the output file
     *                     cannot be opened
     */
    public static void TravelWordTable(String filename) throws IOException {
        String dstfile = filename + "_dsturl.txt";
        // try-with-resources closes the reader and the writer on every path,
        // including when an exception is thrown in the middle of the loop.
        // FileWriter creates the output file itself if it does not exist.
        try (BufferedReader reader = new BufferedReader(new FileReader(filename));
                PrintWriter out = new PrintWriter(new FileWriter(dstfile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (word.isEmpty()) {
                    continue; // a blank line is not a keyword
                }
                // The keyword must be percent-encoded in the charset declared
                // by "ie=gb2312"; otherwise spaces, '&' or non-ASCII
                // characters corrupt the query string.
                String url = "http://www.baidu.com/s?lm=0&si=&rn=10&ie=gb2312&ct=0&wd="
                        + java.net.URLEncoder.encode(word, PAGE_ENCODING)
                        + "&pn=0&ver=0&cl=3";
                getBaiduUrls(url, PAGE_ENCODING, out);
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the page at {@code url} and writes the links found on it.
     *
     * <p>Only tags carrying both a {@code target} and an {@code href}
     * attribute are considered, and of those only links whose address starts
     * with {@code "bnu"} or {@code "http"} are printed and written.
     *
     * @param url          address of the search result page
     * @param pageEncoding encoding to parse the page with
     * @param writer       destination that receives one link per line
     * @throws ParserException declared for compatibility; parse failures are
     *                         caught and printed, and the page is skipped
     */
    public static void getBaiduUrls(String url, String pageEncoding,
            PrintWriter writer) throws ParserException {
        NodeList nodeList;
        try {
            Parser parser = new Parser(url);
            parser.setEncoding(pageEncoding); // set the encoding used for parsing
            // URL links and titles of the Baidu search results
            nodeList = parser.parse(new AndFilter(
                    new HasAttributeFilter("target"),
                    new HasAttributeFilter("href")));
        } catch (ParserException e) {
            // Best effort: report the failure and skip this page.
            e.printStackTrace();
            return;
        }
        if (nodeList == null) {
            return;
        }
        // Walk every matched node.
        for (int i = 0; i < nodeList.size(); i++) {
            Object node = nodeList.elementAt(i);
            // The attribute filter can match tags other than anchors; casting
            // those to LinkTag would throw ClassCastException.
            if (!(node instanceof LinkTag)) {
                continue;
            }
            LinkTag link = (LinkTag) node;
            String urlLink = link.extractLink();
            String linkName = link.getLinkText();
            // Braces matter here: without them only the title line was
            // conditional and every link was written regardless of its prefix.
            if (urlLink != null
                    && (urlLink.startsWith("bnu") || urlLink.startsWith("http"))) {
                System.out.println("结果 " + i + " 标题:" + linkName);
                System.out.println(" 链接:" + urlLink);
                writer.println(urlLink);
            }
        }
    }
}
// 使用htmlparser抓取网页链接
// 最新推荐文章于 2021-06-17 10:19:50 发布