java-jsoup自适应爬取网页表格的内容


  在爬取数据的过程中,我们有时候需要爬取页面中的表格 但表格的样式千变万化  下面的类和方法可以解析大部分的表格  得到 属性名 和 对应值.


  需要的包链接:

  http://download.csdn.net/detail/q383965374/5960953


类如下:


TestCrawTable  -----测试类

package com;

import java.io.IOException;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class TestCrawTable {
	public static void main(String[] args) {
		try {
			Document document = Jsoup.connect(
					"http://www.landnj.cn/LandInfo.aspx?flag=gongshi&Id=172")
					.get();
			if (document != null) {
				List<TableElement> tableElemts = DataTableUtil
						.getFitElement(document);
				if (tableElemts != null && tableElemts.size() > 0) {
					List<PropertyInfo> propertyInfos = TableUtil
							.extractPropertyInfos(tableElemts);
					if (propertyInfos != null && propertyInfos.size() > 0) {
						for (PropertyInfo propertyInfo : propertyInfos) {
							System.out.println(propertyInfo.getName() + "  "
									+ propertyInfo.getValue());
						}
						System.out.println("-----------------------------------");
					}
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}


DataTableUtil --------验证是否是我们需要的table

package com;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;



public class DataTableUtil {

	private final static int NUM = 5;
	
	//要抓取的表格可能出现的属性名
	static String[] Propertys={"地块编号","宗地编号","地块位置","用地性质","规划面积","出让面积","发布时间","挂牌起始价","位置","交易时间","面积","规划用途","容积率","起价","成交价","交易方式","竞得人"};

	/**
	 * 找到适合的装数据的TABle
	 * 
	 * @param location
	 * @param document
	 * @return
	 */
	//取最里面的table进入isValueElement方法检测是不是我们需要的table
	public static List<TableElement> getFitElement(Document document) {	
		if (Propertys != null) {
			Element element = document.getElementsByTag("body").get(0);
			List<TableElement> fitElments = new ArrayList<TableElement>();
			Elements tableElements = element.getElementsByTag("table");
			if (tableElements != null && tableElements.size() > 0) {
				for (int i = 0; i < tableElements.size(); i++) {
					Element tableElement = tableElements.get(i);
					Elements ces = tableElement.getElementsByTag("table");
					if (ces != null && ces.size() > 1) {
					} else {
						TableElement te;
						if ((te = isValueElement(Propertys,tableElement)) != null) {
							fitElments.add(te);
						}
					}
				}
			} else {
				return null;
			}
			return fitElments;
		}
		return null;
	}
	//找出属性名所在的行,把属性名所在的行之上的信息去掉
	//ri为属性名所在行的结尾索引
	//row为属性名所在行占的行数
	//ri-row为属性名所在行的开始索引,这里取属性名所在行到表格的结尾行
	private static Element removeRedundance(String[] Propertys,
			Element element) {
		Elements tres = element.getElementsByTag("tr");
		Element trElement = tres.get(0);
		Elements tde = trElement.getElementsByTag("td");
		int row = 1;
		for (Element tdElement : tde) {
			String attribute = tdElement.attr("rowspan");
			if (attribute != null && !attribute.equals("")) {
				int rowSpan = Integer.valueOf(attribute);
				if (rowSpan > row) {
					row = rowSpan;
				}
			}
		}
		List<Element> elements = new ArrayList<Element>();
		for (int i = 0; i < row; i++) {
			elements.add(tres.get(i));
		}
		int ri = 0;
		while (!isValueElements(Propertys, elements)) {
			elements = new ArrayList<Element>();
			row = 1;
			Elements tdes = tres.get(ri).getElementsByTag("td");
			for (Element tdElement : tdes) {
				String attribute = tdElement.attr("rowspan");
				if (attribute != null && !attribute.equals("")) {
					int rowSpan = Integer.valueOf(attribute);
					if (rowSpan > row) {
						row = rowSpan;
					}
				}
			}
			for (int i = 0; i < row; i++) {
				elements.add(tres.get(ri + i));
			}
			ri = ri + row;
		}
		if (ri > 0) {
			Elements trs = element.getElementsByTag("tr");
			int size = trs.size();
			Element newElement = new Element(Tag.valueOf("table"), "table");
			for (int i = ri-row; i < size; i++) {
				newElement.appendChild(trs.get(i));
			}
			return newElement;
		}
		return element;
	}

	private static boolean isValueElements(
			String[] Propertys, List<Element> trElements) {
		int index = 0;
		int size = trElements.size();
		for (int i = 0; i < size; i++) {
			List<Element> propertyElements = new ArrayList<Element>();
			Element element = trElements.get(i);
			Elements tdElements = element.getElementsByTag("td");
		    Elements thElements = element.getElementsByTag("th");
			if(thElements!=null&&thElements.size()>0){
				for(Element thelement : thElements){
					propertyElements.add(thelement);
				}
			}
			if(tdElements!=null&&tdElements.size()>0){
				for(Element tdelement : tdElements){
					propertyElements.add(tdelement);
				}
			}
			for (Element tdElement : propertyElements) {
				String text = tdElement.text();
				if (!text.trim().equals("")) {
					String value = adjuestmentParm(text);
					if (value != null) {
						value = StringUtil.parseString(value);
						double max = 0.0d;
						for (int j = 0; j < Propertys.length; j++) {
							double temp = SimFeatureUtil.sim(Propertys[j], value);
							if (temp > max) {
								max = temp;
							}
						}
						if (max >= 0.6) {
							index++;
						}
					}
				}
			}
		}
		if (index >= NUM) {
			return true;
		} else {
			return false;
		}
	}

	private static boolean regual(String reg, String string) {
		Pattern pattern = Pattern.compile(reg);
		Matcher ma = pattern.matcher(string);
		if (ma.find()) {
			return true;
		}
		return false;
	}

	//清除获取的信息标题上的单位及一些字符提高匹配的相似度
	public static String adjuestmentParm(String parm) {
		if (regual("\\(", parm)) {
			parm = parm.substring(0, parm.indexOf("("));
		} else if (regual("(", parm)) {
			parm = parm.substring(0, parm.indexOf("("));
		} else if (regual("万元", parm)) {
			parm = parm.substring(0, parm.indexOf("万元"));
		} else if (regual("亩", parm)) {
			parm = parm.substring(0, parm.indexOf("亩"));
		} else if (regual("公顷", parm)) {
			parm = parm.substring(0, parm.indexOf("公顷"));
		} else if (regual("米", parm)) {
			parm = parm.substring(0, parm.indexOf("米"));
		}
		return parm;
	}

	//取出表格中的td或者th单元格的值进行相似度比较 
	//index记录相似值大于0.6的值个数  
	//consist记录连续不相似个数   
	//当连续不相似的个数达到10个时 就break 跳出循环
	//consistFlag记录连续相似个数
	//当index个数大于等于我们设定的个数时说明该table是包含我们要获取数据的table
	//当consistFlag大于2时说明表格内容是竖向的 setCross为false  
	//进入removeRedundance方法检查是否table中的前几行(tr)不是以标题行开始,把这几行去掉
	//例如:http://kmland.km.gov.cn/view.asp?id=7089&frist_lanmu=173
	private static TableElement isValueElement(
			String[] Propertys, Element element) {
		TableElement tableElement = new TableElement();
		List<Element> propertyElements = new ArrayList<Element>();
		Elements tdElements = element.getElementsByTag("td");
	    Elements thElements = element.getElementsByTag("th");
		if(thElements!=null&&thElements.size()>0){
			for(Element thelement : thElements){
				propertyElements.add(thelement);
			}
		}
		if(tdElements!=null&&tdElements.size()>0){
			for(Element tdelement : tdElements){
				propertyElements.add(tdelement);
			}
		}		
		int index = 0;
		int consist = 0;
		int size = propertyElements.size();
		int consistFlag = 0;
		for (int i = 0; i < size; i++) {
			consist++;
			Element tdElement = propertyElements.get(i);
			String text = tdElement.text();
			if (!text.trim().equals("")) {
				String value = adjuestmentParm(text);
				if (value != null) {
					value = StringUtil.parseString(value);
					double max = 0.0d;
					for (int j = 0; j < Propertys.length; j++) {
						double temp = SimFeatureUtil.sim(
								Propertys[j], value);
						if (temp > max) {
							max = temp;
						}
					}
					if (max >= 0.6) {
						index++;
						if (consist == 1) {
							consist = 0;
							consistFlag++;
						} else {
							consist = 0;
						}
					} else {
						if (consist >= 10) {
							break;
						}
					}
				}
			}
		}
		if (index >= NUM) {
			tableElement.setWordNum(index);
			if (consistFlag >= 2) {
				tableElement.setElement(removeRedundance(Propertys, element));
				tableElement.setCross(false);
			} else {
				tableElement.setElement(element);
			}
			return tableElement;
		} else {
			return null;
		}
	}
}

TableUtil -----解析表格
package com;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;




public class TableUtil {

	//横向的表格进入getAcrossTableValue方法解析 竖向的表格进入getTableValue方法解析
	public static List<PropertyInfo> extractPropertyInfos(List<TableElement> tableElemts)  {
			List<PropertyInfo> propertyInfos = new ArrayList<PropertyInfo>();
			if(tableElemts!=null&&tableElemts.size()>0){
				for (TableElement element : tableElemts) {
					if(element.isCross()){
						propertyInfos.addAll(TableUtil.getAcrossTableValue(element.getElement()));	
					}else{
						propertyInfos.addAll(TableUtil.getTableValue(element.getElement()));	
					}
				}
				return propertyInfos;
			}
			return null;
		}
	
	
	//对于竖向内容的表格先取出属性名占的最大行数存为row
		public static List<PropertyInfo> getTableValue(Element tableElement) {
			Elements trElements = tableElement.getElementsByTag("tr");
			int row = getRowElement(trElements.get(0));
			List<String> propertys = parseTablePropertys(row,trElements);
			return  parsePropertyValue(row, propertys, trElements);
		}

		/**
		 * 提取TABLE属性
		 * 
		 * @param tableElement
		 */
		//把每行的元素存入ptdELementsList列表中,把每行的列数存入length 然后进入parsePropertyString函数对第一行内容提取得到属性名
		private static List<String> parseTablePropertys(int row,Elements trElements) {
			List<String> propertys = new ArrayList<String>();
			List<Elements> ptdELementsList = new ArrayList<Elements>();
			int[] lengths = new int[row]; // 储存每行的长度(列数-单元格个数)
			int[] index = new int[row]; // 每行位置
			for (int i = 0; i < row; i++) {
				Element trElement = trElements.get(i);
				Elements elements = new Elements();
				Elements tdElements = trElement.getElementsByTag("td");
				Elements thElements = trElement.getElementsByTag("th");
				if(tdElements!=null&&tdElements.size()>0){
					elements.addAll(tdElements);
				}
				if(thElements!=null&&thElements.size()>0){
					elements.addAll(thElements);
				}
				ptdELementsList.add(elements);
				lengths[i] = elements.size();
			}
			parsePropertyString(propertys, index, lengths, 0, ptdELementsList);
			return propertys;
		}

		//提取内容的值---遍历除属性行之外的行 依次提取td元素进入parseValues提取内容的值
		//flags记录行的状态
		//valueFlags记录每个属性一个值对多个单元格的状态
		//vtdElements记录每行单元格的元素td
		//size值的总行数
		//length记录每行的单元格td数
		private static List<PropertyInfo> parsePropertyValue(int row, List<String> propertys,
				Elements trElements) {
			int propertysSize = propertys.size();
			List<Elements> vtdElements = new ArrayList<Elements>();
			int trsize = trElements.size();
			int size = trsize - row;
			int[] lengths = new int[size];
			boolean[] flags = new boolean[size];
			boolean[][] valueFlags = new boolean[size][propertysSize];
			
			for (int i = row; i < trsize; i++) {
				Element trElement = trElements.get(i);
				Elements velements = new Elements();
				Elements tdElements = trElement.getElementsByTag("td");
				Elements thElements = trElement.getElementsByTag("th");
				if(thElements!=null&&thElements.size()>0){
					velements.addAll(thElements);
				}
				if(tdElements!=null&&tdElements.size()>0){
					velements.addAll(tdElements);
				}
//				Elements tdElements = trElement.getElementsByTag("td");
				if (velements != null && velements.size() > 0) {
					lengths[i-row] = velements.size();
					vtdElements.add(velements);
					for(int j = 0 ; j < propertysSize;j++){
						valueFlags[i-row][j]= false;
					}
					flags[i-row] = true;
				} else {
					size--;
				}
			}
			return parseValues(propertys, vtdElements, size, lengths, flags,valueFlags);
		}

		/**
		 * 取TABLE里面的数据 包含 ROWSPAN 表明该元素对应多个PROPERTY 包含COLSPAN表明该元素对应多条数据
		 * 
		 * @param propertysSize
		 * @param vtdElements
		 * @param size
		 * @param lengths
		 * @param flags
		 * @param valueFlags
		 */
		//一个单元格的内容对应多条记录的 例子:http://kmland.km.gov.cn/view.asp?id=7089&frist_lanmu=173
		//遇到一对多的值 则取出它的rowspan和colspan的,把对应位置的值赋值,并修改该位置的valueFlag为true
		private static List<PropertyInfo> parseValues(List<String> propertys,
				List<Elements> vtdElements, int size, int[] lengths,
				boolean[] flags, boolean[][] valueFlags) {
			int propertysSize = propertys.size();
			String[][] pValueStrs = new String[size][propertysSize];
			for (int i = 0; i < size; i++) {
				Elements tdValueElements = vtdElements.get(i);
				int k = 0;
				for (int j = 0; j < lengths[i]; j++) {
					Element tdValueElement = null;
					try {
						tdValueElement = tdValueElements.get(j);
					} catch (Exception e) {
						flags[i] = false;
						break;
					}
					if(valueFlags[i][k]){
						while (valueFlags[i][k]) {
							k++;
						}
					}
					if (tdValueElement.hasAttr("rowspan")) {
						int rowspan = Integer.valueOf(tdValueElement.attr("rowspan"));
						int colspan = 1;
						if (tdValueElement.hasAttr("colspan")) {
							colspan = Integer.valueOf(tdValueElement.attr("colspan"));
						}
						if (rowspan > 1) {
							System.out.println(rowspan);
							for (int m = 0; m < rowspan; m++) {
								for (int n = 0; n < colspan; n++) {
									System.out.println("i = " + i +"m=" +m);
									try {
										valueFlags[i + m][k + n] = true;
										pValueStrs[i + m][k + n] = tdValueElement.text();
									} catch (Exception e) {
										System.out.println("i = " + i +"m=" +m +"k+n = "+ (k+n)+ "length");
									}
								}
							}
						}
					}else if (tdValueElement.hasAttr("colspan")) {
						int colspan = Integer.valueOf(tdValueElement
								.attr("colspan"));
						if (colspan > 1) {
							if (colspan >= size - 1) {
								flags[i] = false;
								break;
							}
							for (int m = 0; m < colspan; m++) {
								valueFlags[i][k] = true;
								pValueStrs[i+m][k] = tdValueElement.text();
							}
						}
					}else{
						if(tdValueElement!=null){
							pValueStrs[i][k] = tdValueElement.text();
						}
					}
					k++;
				}
			}
			List<PropertyInfo> propertyInfos = new ArrayList<PropertyInfo>();
			for (int i = 0; i < size; i++) {				
				if (flags[i]) {
					for (int j = 0; j < propertysSize; j++) {						
						if(propertys.get(j)!=null){
							PropertyInfo propertyValue = new PropertyInfo();
							propertyValue.setName(propertys.get(j));
							propertyValue.setValue(pValueStrs[i][j]);
							propertyInfos.add(propertyValue);
						}
					}
				}
			}
			return propertyInfos;
		}
		
		public static List<PropertyInfo> getAcrossTableValue(Element element)
				{	
//			Elements tdElements = element.getElementsByTag("td");
			Elements trElements = element.getElementsByTag("tr");
			List<PropertyInfo> propertyValues = new ArrayList<PropertyInfo>();
			for(Element trElement:trElements){
				Elements tdElements = trElement.getElementsByTag("td");
				int size = tdElements.size();
				if(size % 2 ==0){
					PropertyInfo propertyValue = null;
					for (int i = 0; i < size; i++) {
						Element tdElement = tdElements.get(i);
						String value = tdElement.text();
						if (i % 2 == 0) {
							propertyValue = new PropertyInfo();						
							propertyValue.setName(StringUtil.parseString(value));						
						} else {	
							propertyValue.setValue(value);
							propertyValues.add(propertyValue);
						}
					}
				}
			}	
			return propertyValues;
		}

		/**
		 * 提取属性名
		 * 
		 * @param propertyStrs
		 * @param index
		 * @param lengths
		 * @param ind
		 * @param tdElementsList
		 */
		//遍历属性行中的td,取它的colspan,如果它横跨的大于1则说明它是包含多个小标题的大标题,则进入第二行取值
		private static void parsePropertyString(List<String> propertyStrs,
				int[] index, int[] lengths, int ind, List<Elements> tdElementsList) {
			Elements tdElements = tdElementsList.get(ind);
			for (int i = index[ind]; i < lengths[ind]; i++) {
				index[ind] = index[ind] + 1;
				Element tdElement = tdElements.get(i);
				String attribute = tdElement.attr("colspan");
				String value = StringUtil.parseString(tdElement.text());
//				if (!value.trim().equals("")) {
					if (attribute != null && !attribute.equals("")) {
						int col = Integer.valueOf(attribute);
						if (col > 1) {
							parsePropertyString(propertyStrs, index, lengths,
									ind + 1, tdElementsList);
						} else {
							propertyStrs.add(value);
						}
					} else {
						propertyStrs.add(value);
					}
//				}
			}
		}

		/**
		 * 提取属性占的ROW数 取出属性中含有的最大ROWSPAN数
		 * 
		 * @param trElement
		 * @return
		 */
		private static int getRowElement(Element trElement) {
			Elements tdElements = trElement.getElementsByTag("td");
			int row = 1;
			for (Element tdElement : tdElements) {
				String attribute = tdElement.attr("rowspan");
				if (attribute != null && !attribute.equals("")) {
					int rowSpan = Integer.valueOf(attribute);
					if (rowSpan > row) {
						row = rowSpan;
					}
				}
			}
			return row;
		}
}

下面是一些工具类和元素类

PropertyInfo

package com;

public class PropertyInfo {
	private String name;
	private String value;
	public String getName() {
		return name;
	}
	public void setName(String name) {
		this.name = name;
	}
	public String getValue() {
		return value;
	}
	public void setValue(String value) {
		this.value = value;
	}
}

SimFeatureUtil 相似度对比

package com;


public class SimFeatureUtil {


	private static int min(int one, int two, int three) {
		int min = one;
		if (two < min) {
			min = two;
		}
		if (three < min) {
			min = three;
		}
		return min;
	}


	public static int ld(String str1, String str2) {
		int d[][]; // 矩阵
		int n = str1.length();
		int m = str2.length();
		int i; // 遍历str1的
		int j; // 遍历str2的
		char ch1; // str1的
		char ch2; // str2的
		int temp; // 记录相同字符,在某个矩阵位置值的增量,不是0就是1
		if (n == 0) {
			return m;
		}
		if (m == 0) {
			return n;
		}
		d = new int[n + 1][m + 1];
		for (i = 0; i <= n; i++) { // 初始化第一列
			d[i][0] = i;
		}
		for (j = 0; j <= m; j++) { // 初始化第一行
			d[0][j] = j;
		}
		for (i = 1; i <= n; i++) { // 遍历str1
			ch1 = str1.charAt(i - 1);
			// 去匹配str2
			for (j = 1; j <= m; j++) {
				ch2 = str2.charAt(j - 1);
				if (ch1 == ch2) {
					temp = 0;
				} else {
					temp = 1;
				}
				// 左边+1,上边+1, 左上角+temp取最小
				d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]+ temp);
			}
		}
		return d[n][m];
	}
	public static double sim(String str1, String str2) {
		try {
			double ld = (double)ld(str1, str2);
			return (1-ld/(double)Math.max(str1.length(), str2.length()));
//			double length = (double)str1.length() /(double) str2.length();
//			return (1 - (double) ld / Math.max(str1.length(), str2.length()))* length;
		} catch (Exception e) {
			return 0.1;
		}
	}


	public static void main(String[] args) {
		String str1 = "地块编号";
		String str2 = "地块编号";
		System.out.println("ld=" + ld(str1, str2));
		System.out.println("sim=" + sim(str1, str2));
	}
}


StringUtil 去除多余字符

package com;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class StringUtil {

	private static Pattern pattern = Pattern.compile("[\\u4e00-\\u9fa5a-zA-Z\\d\\../:]");

	/**
	 * 去掉多余字符
	 * 
	 * @return
	 */
	public static String parseString(String str) {
		StringBuilder sb = new StringBuilder();
		Matcher matcher = pattern.matcher(str);
		while (matcher.find()) {
			sb.append(matcher.group());
		}
		return sb.toString();
	}

}

TableElement
package com;

import org.jsoup.nodes.Element;

public class TableElement {
	
	private Element element;
	private boolean isCross;
	private int wordNum;
	
	public TableElement(){
		isCross = true;
	}
	
	public Element getElement() {
		return element;
	}
	public void setElement(Element element) {
		this.element = element;
	}
	public boolean isCross() {
		return isCross;
	}
	public void setCross(boolean isCross) {
		this.isCross = isCross;
	}
	public int getWordNum() {
		return wordNum;
	}
	public void setWordNum(int wordNum) {
		this.wordNum = wordNum;
	}

}

提取的表格:




运行结果:



完整的例子下载:

java爬取网页表格的例子(运行环境myeclipse)




  • 4
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

张小凡vip

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值