This post extracts keywords from web pages with HtmlParser and HttpClient, pulling out each page's title, description, and keywords. HttpClient fetches the page and HtmlParser parses the returned HTML. While parsing, the text has to be converted to the page's actual encoding, otherwise the result is mojibake. Most pages declare their encoding in a <meta> tag, so we also extract the declared charset and then perform the conversion.
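To give a quick picture of the idea before the full listing: pages usually declare their encoding either as <meta http-equiv="Content-Type" content="text/html; charset=gbk"> or as <meta charset="utf-8">. The sketch below pulls the charset out with a plain regular expression; it is only an illustration with made-up class and method names, while the actual code further down does the same thing through HtmlParser's MetaTag filter.
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//Illustrative sketch only: a regex-based charset sniffer (the real code below uses HtmlParser's MetaTag filter instead)
public class CharsetSniffSketch {
    private static final Pattern CHARSET =
            Pattern.compile("charset\\s*=\\s*[\"']?([\\w-]+)", Pattern.CASE_INSENSITIVE);
    //returns the declared charset, or the fallback when the page declares none
    public static String sniffCharset(String html, String fallback) {
        Matcher m = CHARSET.matcher(html);
        return m.find() ? m.group(1) : fallback;
    }
}
The complete code is below.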
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import util.Utility;
/**
 * Parse pages with the open-source HtmlParser package and fetch them with the open-source HttpClient package
 */
public class HtmlParser {
public static void main(String[] args) throws Exception {
getWebAddress();
//getContent("http://www.sogou.com/sogou","");
}
/**
 * Read each user's page addresses from the database, extract their fields and insert them
 * @throws Exception
 */
public static void getWebAddress() throws Exception {
Connection con = Utility.getConnection();
Statement stmt = con.createStatement();
PreparedStatement pstmt = con.prepareStatement("insert into content(userid,webaddress,title,keywords,description) values(?,?,?,?,?)");
List<String> list = getUserIds();
for(int i=162;i<list.size();i++) {
String userid = list.get(i);
System.out.println("第"+i+"个用户"+userid);
List<String> address = new ArrayList<String>();
String sql = "select distinct U from behavior where userid='"+userid+"'";
ResultSet rs = stmt.executeQuery(sql);
while(rs.next()) {
String webaddress = rs.getString("U");
if(webaddress != null) {
if(!webaddress.contains("https")&&!webaddress.endsWith(".zip")&&!webaddress.endsWith(".jar")&&!webaddress.endsWith(".exe")) {
address.add(webaddress);
}
}
}
System.out.println("There are " + address.size() + " records in total.");
for(int j=0;j<address.size();j++) {
String webaddress = address.get(j);
try {
String t[] = getContent(webaddress,userid);
pstmt.setString(1, userid);
pstmt.setString(2, webaddress);
pstmt.setString(3, t[0]);
pstmt.setString(4, t[1]);
pstmt.setString(5, t[2]);
pstmt.addBatch();
} catch (Throwable e) {
System.out.println(e.toString());
writeToFile(userid,webaddress);//log the address that triggered the error to a file
continue;//keep the program running: on error, move on to the next address
}
System.out.println(webaddress);
System.out.println("第"+j+"条记录!");
}
pstmt.executeBatch();
System.out.println("数据插入完成!!!!!");
}
}
/**
 * Extract the required fields (title, keywords, description) from the given page
 * @throws Exception
 */
@SuppressWarnings("serial")
public static String[] getContent(String webAddress, String userid) throws Exception {
String address = "";
if (webAddress.contains("http")) {//prepend "http://" below if the address does not already contain it
address = webAddress;
} else {
address = "http://" + webAddress;
}
String html = visitWeb(address);
String charSet = getCharSet(html);
Parser parse = new Parser();
parse.setInputHTML(html);
//parse.setEncoding(charSet);
Map<String, String> map = new HashMap<String, String>();
NodeFilter newFilter = new NodeClassFilter() {//build a filter that keeps the <title> tag and the named <meta> tags (keywords/description)
public boolean accept(Node node) {
if (node instanceof MetaTag) {
MetaTag mt = (MetaTag) node;
return mt.getMetaTagName() != null;//keep only <meta> tags that carry a name attribute
} else if(node instanceof TitleTag ) {
return true;
}
return false;
}
};
NodeList keywords = parse.extractAllNodesThatMatch(newFilter);
for (int i = 0; i < keywords.size(); i++) {
if (keywords.elementAt(i) instanceof TitleTag) {
TitleTag tt = (TitleTag) keywords.elementAt(i);
map.put("title", tt.getTitle());
} else {
MetaTag mt = (MetaTag) keywords.elementAt(i);
if (mt.getMetaTagName().equalsIgnoreCase("keywords")) {
map.put("keywords", mt.getMetaContent());
} else if(mt.getMetaTagName().equalsIgnoreCase("description")) {
map.put("description", mt.getMetaContent());
}
}
}
String title = "";
String keyword = "";
String description = "";
//avoid mojibake: convert the extracted strings using the page's declared encoding
if(map.containsKey("title")) {
title = Utility.getCharset(map.get("title"),charSet);
//System.out.println(title);
}
if (map.containsKey("keywords")) {
keyword = Utility.getCharset(map.get("keywords"),charSet);
//System.out.println(keyword);
}
if (map.containsKey("description")) {
description = Utility.getCharset(map.get("description"),charSet);
//System.out.println(description);
}
String t[] = {title,keyword,description};
System.out.println(charSet);
keywords.removeAll();
return t;
}
/**
 * Fetch all user ids
 * @throws Exception
 */
public static List<String> getUserIds() throws Exception {
List<String> list = new ArrayList<String>();
String sql = "select userid from demographic";
ResultSet rs = Utility.getResultSet(sql);
while(rs.next()) {
String userid = rs.getString("userid");
list.add(userid);
}
return list;
}
/**
 * Detect the page's declared character encoding
 * @param html
 * @return
 * @throws Exception
 */
@SuppressWarnings("serial")
public static String getCharSet(String html) throws Exception {
String charSet = "utf-8";
Parser parser = new Parser();
parser.setInputHTML(html);
NodeFilter newFilter = new NodeClassFilter() {//build a filter that keeps <meta> tags declaring a charset
public boolean accept(Node node) {
if (node instanceof MetaTag) {
MetaTag mt = (MetaTag) node;
if(mt.getAttribute("http-equiv")!= null) {
if(mt.getAttribute("content").contains("charset")) {
return true;
}
} else if(mt.getAttribute("charset")!= null) {
return true;
} else {
return false;
}
}
return false;
}
};
NodeList keywords = parser.extractAllNodesThatMatch(newFilter);
for(int i = 0;i < keywords.size(); i++) {
MetaTag mt = (MetaTag) keywords.elementAt(i);
if(mt.getAttribute("content") != null) {
String charset = mt.getAttribute("content").toLowerCase().split("charset=")[1];
charSet = charset;
} else {
String charset = mt.getAttribute("charset");
charSet = charset;
}
}
return charSet;
}
/**
 * Fetch a page with HttpClient
 * @param address
 * @return
 * @throws Exception
 */
public static String visitWeb(String address) throws Exception{
HttpClient client = new HttpClient();
client.getHttpConnectionManager().getParams().setConnectionTimeout(10000);//connection timeout: 10000 ms
client.getHttpConnectionManager().getParams().setSoTimeout(10000);//socket read timeout: 10000 ms
GetMethod method = new GetMethod(address);
System.out.print("Connect The Web...");
client.executeMethod(method);
System.out.println("Connect Successful!");
System.out.print("Begin To Read The Source...");
InputStream is = method.getResponseBodyAsStream();
InputStreamReader isr = new InputStreamReader(is,"ISO-8859-1"); //read everything as ISO-8859-1 first; the real charset is applied later
BufferedReader br = new BufferedReader(isr);
StringBuffer resBuffer = new StringBuffer();
String resTemp = "";
while((resTemp = br.readLine()) != null){
resBuffer.append(resTemp);
}
String html = resBuffer.toString();
System.out.println("Source Read Successful!");
br.close();
isr.close();
is.close();
return html;
}
/**
 * Log the userid and address of a failed fetch
 * @param userid
 * @param address
 * @throws Exception
 */
public static void writeToFile(String userid,String address) throws Exception {
FileWriter fw = new FileWriter("E:\\HtmlParser\\log.txt",true);
BufferedWriter bw = new BufferedWriter(fw);
String content = userid + " " +address;
bw.write(content);
bw.newLine();
bw.close();
fw.close();
System.out.println("日志记录!");
}
}
Some pages cannot be fetched at all, so to keep the program running each fetch is wrapped in a try/catch and the unreachable addresses are written to a log file. I originally used HtmlParser for both fetching and parsing, but the program kept hanging at some point for reasons I never pinned down. After looking it up online I switched to HttpClient for fetching, and it has run smoothly ever since without stalling.
Below is the character-conversion helper:
public static String getCharset(String str, String charSet) {
    String newStr = "";
    try {
        newStr = new String(str.getBytes("ISO-8859-1"), charSet);
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }
    return newStr;
}
Here charSet is the encoding that the page itself declares. With this conversion applied, the mojibake is essentially gone.
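To see why reading with ISO-8859-1 first is safe, here is a small self-contained demo: ISO-8859-1 maps every byte value to exactly one char, so str.getBytes("ISO-8859-1") recovers the original response bytes, which can then be decoded with the page's real charset. The "GBK" charset and the class name here are just for illustration, not taken from any particular page.
import java.io.UnsupportedEncodingException;
//Minimal demo of the ISO-8859-1 round trip used by visitWeb() and Utility.getCharset()
public class CharsetRoundTripDemo {
    public static void main(String[] args) throws UnsupportedEncodingException {
        byte[] raw = "中文标题".getBytes("GBK");                 //the bytes a GBK-encoded page would send
        String garbled = new String(raw, "ISO-8859-1");          //what visitWeb() returns before conversion
        String fixed = new String(garbled.getBytes("ISO-8859-1"), "GBK"); //the conversion getCharset() performs
        System.out.println(garbled); //mojibake
        System.out.println(fixed);   //中文标题
    }
}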