This post extracts keywords from web pages with HtmlParser and HttpClient, pulling out each page's title, description, and keywords. HttpClient fetches the page and HtmlParser parses the returned HTML. While parsing, the text has to be converted to the page's actual encoding, otherwise the result is mojibake. Most pages declare their encoding in a <meta> tag, so we also extract the declared charset and then perform the conversion.
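To give a quick picture of the idea before the full listing: pages usually declare their encoding either as <meta http-equiv="Content-Type" content="text/html; charset=gbk"> or as <meta charset="utf-8">. The sketch below pulls the charset out with a plain regular expression; it is only an illustration with made-up class and method names, while the actual code further down does the same thing through HtmlParser's MetaTag filter.
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//Illustrative sketch only: a regex-based charset sniffer (the real code below uses HtmlParser's MetaTag filter instead)
public class CharsetSniffSketch {
    private static final Pattern CHARSET =
            Pattern.compile("charset\\s*=\\s*[\"']?([\\w-]+)", Pattern.CASE_INSENSITIVE);
    //returns the declared charset, or the fallback when the page declares none
    public static String sniffCharset(String html, String fallback) {
        Matcher m = CHARSET.matcher(html);
        return m.find() ? m.group(1) : fallback;
    }
}
The complete code is below.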
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import util.Utility;
/**
 * Parse pages with the open-source HtmlParser package and fetch them with the open-source HttpClient package
 */
public class HtmlParser {
public static void main(String[] args) throws Exception {
getWebAddress();
//getContent("http://www.sogou.com/sogou","");
}
/**
 * Read each user's page addresses from the database, extract their fields and insert them
 * @throws Exception
 */
public static void getWebAddress() throws Exception {
Connection con = Utility.getConnection();
Statement stmt = con.createStatement();
PreparedStatement pstmt = con.prepareStatement("insert into content(userid,webaddress,title,keywords,description) values(?,?,?,?,?)");
List<String> list = getUserIds();
for(int i=162;i<list.size();i++) {
String userid = list.get(i);
System.out.println("第"+i+"个用户"+userid);
List<String> address = new ArrayList<String>();
String sql = "select distinct U from behavior where userid='"+userid+"'";
ResultSet rs = stmt.executeQuery(sql);
while(rs.next()) {
String webaddress = rs.getString("U");
if(webaddress != null) {
if(!webaddress.contains("https")&&!webaddress.endsWith(".zip")&&!webaddress.endsWith(".jar")&&!webaddress.endsWith(".exe")) {
address.add(webaddress);
}
}
}
System.out.println("There are " + address.size() + " records in total.");
for(int j=0;j<address.size();j++) {
String webaddress = address.get(j);
try {
String t[] = getContent(webaddress,userid);
pstmt.setString(1, userid);
pstmt.setString(2, webaddress);
pstmt.setString(3, t[0]);
pstmt.setString(4, t[1]);
pstmt.setString(5, t[2]);
pstmt.addBatch();
} catch (Throwable e) {
System.out.println(e.toString());
writeToFile(userid,webaddress);//log the address that triggered the error to a file
continue;//keep the program running: on error, move on to the next address
}
System.out.println(webaddress);
System.out.println("第"+j+"条记录!");
}
pstmt.executeBatch();
System.out.println("数据插入完成!!!!!");
}
}
/**
 * Extract the required fields (title, keywords, description) from the given page
 * @throws Exception
 */
@SuppressWarnings("serial")
public static String[] getContent(String webAddress, String userid) throws Exception {
String address = "";
if (webAddress.contains("http")) {//prepend "http://" below if the address does not already contain it
address = webAddress;
} else {
address = "http://" + webAddress;
}
String html = visitWeb(address);
String charSet = getCharSet(html);
Parser parse = new Parser();
parse.setInputHTML(html);
//parse.setEncoding(charSet);
Map<String, String> map = new HashMap<String, String>();
NodeFilter newFilter = new NodeClassFilter() {//build a filter that keeps the <title> tag and the named <meta> tags (keywords/description)
public boolean accept(Node node) {
if (node instanceof MetaTag) {
MetaTag mt = (MetaTag) node;
return mt.getMetaTagName() != null;//keep only <meta> tags that carry a name attribute
} else if(node instanceof TitleTag ) {
return true;
}
return false;
}
};
NodeList keywords = parse.extractAllNodesThatMatch(newFilter);
for (int i = 0; i < keywords.size(); i++) {
if (keywords.elementAt(i) instanceof TitleTag) {
TitleTag tt = (TitleTag) keywords.elementAt(i);
map.put("title", tt.getTitle());
} else {
MetaTag mt = (MetaTag) keywords.elementAt(i);
if (mt.getMetaTagName().equalsIgnoreCase("keywords")) {
map.put("keywords", mt.getMetaContent());
} else if(mt.getMetaTagName().equalsIgnoreCase("description")) {
map.put("description", mt.getMetaContent());
}
}
}
String title = "";
String keyword = "";
String description = "";
//avoid mojibake: convert the extracted strings using the page's declared encoding
if(map.containsKey("title")) {
title = Utility.getCharset(map.get("title"),charSet);
//System.out.println(title);
}
if (map.containsKey("keywords")) {
keyword = Utility.getCharset(map.get("keywords"),charSet);
//System.out.println(keyword);
}
if (map.containsKey("description")) {
description = Utility.getCharset(map.get("description"),charSet);
//System.out.println(description);
}
String t[] = {title,keyword,description};
System.out.println(charSet);
keywords.removeAll();
return t;
}
/**
 * Fetch all user ids
 * @throws Exception
 */
public static List<String> getUserIds() throws Exception {
List<String> list = new ArrayList<String>();
String sql = "select userid from demographic";
ResultSet rs = Utility.getResultSet(sql);
while(rs.next()) {
String userid = rs.getString("userid");
list.add(userid);
}
return list;
}
/**
 * Detect the page's declared character encoding
 * @param html
 * @return
 * @throws Exception
 */
@SuppressWarnings("serial")
public static String getCharSet(String html) throws Exception {
String charSet = "utf-8";
Parser parser = new Parser();
parser.setInputHTML(html);
NodeFilter newFilter = new NodeClassFilter() {//build a filter that keeps <meta> tags declaring a charset
public boolean accept(Node node) {
if (node instanceof MetaTag) {
MetaTag mt = (MetaTag) node;
if(mt.getAttribute("http-equiv")!= null) {
if(mt.getAttribute("content").contains("charset")) {
return true;
}
} else if(mt.getAttribute("charset")!= null) {
return true;
} else {
return false;
}
}
return false;
}
};
NodeList keywords = parser.extractAllNodesThatMatch(newFilter);
for(int i = 0;i < keywords.size(); i++) {
MetaTag mt = (MetaTag) keywords.elementAt(i);
if(mt.getAttribute("content") != null) {
String charset = mt.getAttribute("content").toLowerCase().split("charset=")[1];
charSet = charset;
} else {
String charset = mt.getAttribute("charset");
charSet = charset;
}
}
return charSet;
}
/**
 * Fetch a page with HttpClient
 * @param address
 * @return
 * @throws Exception
 */
public static String visitWeb(String address) throws Exception{
HttpClient client = new HttpClient();
client.getHttpConnectionManager().getParams().setConnectionTimeout(10000);//connection timeout: 10000 ms
client.getHttpConnectionManager().getParams().setSoTimeout(10000);//socket read timeout: 10000 ms
GetMethod method = new GetMethod(address);
System.out.print("Connect The Web...");
client.executeMethod(method);
System.out.println("Connect Successful!");
System.out.print("Begin To Read The Source...");
InputStream is = method.getResponseBodyAsStream();
InputStreamReader isr = new InputStreamReader(is,"ISO-8859-1"); //read everything as ISO-8859-1 first; the real charset is applied later
BufferedReader br = new BufferedReader(isr);
StringBuffer resBuffer = new StringBuffer();
String resTemp = "";
while((resTemp = br.readLine()) != null){
resBuffer.append(resTemp);
}
String html = resBuffer.toString();
System.out.println("Source Read Successful!");
br.close();
isr.close();
is.close();
return html;
}
/**
 * Log the userid and address of a failed fetch
 * @param userid
 * @param address
 * @throws Exception
 */
public static void writeToFile(String userid,String address) throws Exception {
FileWriter fw = new FileWriter("E:\\HtmlParser\\log.txt",true);
BufferedWriter bw = new BufferedWriter(fw);
String content = userid + " " +address;
bw.write(content);
bw.newLine();
bw.close();
fw.close();
System.out.println("日志记录!");
}
}
Some pages cannot be fetched at all, so to keep the program running each fetch is wrapped in a try/catch and the unreachable addresses are written to a log file. I originally used HtmlParser for both fetching and parsing, but the program kept hanging at some point for reasons I never pinned down. After looking it up online I switched to HttpClient for fetching, and it has run smoothly ever since without stalling.
Below is the character-conversion helper:
public static String getCharset(String str, String charSet) {
    String newStr = "";
    try {
        newStr = new String(str.getBytes("ISO-8859-1"), charSet);
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }
    return newStr;
}
Here charSet is the encoding that the page itself declares. With this conversion applied, the mojibake is essentially gone.
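To see why reading with ISO-8859-1 first is safe, here is a small self-contained demo: ISO-8859-1 maps every byte value to exactly one char, so str.getBytes("ISO-8859-1") recovers the original response bytes, which can then be decoded with the page's real charset. The "GBK" charset and the class name here are just for illustration, not taken from any particular page.
import java.io.UnsupportedEncodingException;
//Minimal demo of the ISO-8859-1 round trip used by visitWeb() and Utility.getCharset()
public class CharsetRoundTripDemo {
    public static void main(String[] args) throws UnsupportedEncodingException {
        byte[] raw = "中文标题".getBytes("GBK");                 //the bytes a GBK-encoded page would send
        String garbled = new String(raw, "ISO-8859-1");          //what visitWeb() returns before conversion
        String fixed = new String(garbled.getBytes("ISO-8859-1"), "GBK"); //the conversion getCharset() performs
        System.out.println(garbled); //mojibake
        System.out.println(fixed);   //中文标题
    }
}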