使用jsoup爬取数据并导出excel文件保存

最新推荐文章于 2022-01-10 13:01:17 发布

DFDHZ

最新推荐文章于 2022-01-10 13:01:17 发布

阅读量1k

点赞数

分类专栏： java

本文链接：https://blog.csdn.net/qq_30762453/article/details/72469444

版权

java 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

/**
 * Describes one extraction request: where to fetch the page from, which
 * request parameters to use, and how to locate the interesting part of the
 * returned HTML.
 */
public class Rule {
	/** Request type: HTTP GET. */
	public final static int GET = 0;
	/** Request type: HTTP POST. */
	public final static int POST = 1;

	/** {@link #resultTagName} holds a CSS class name. */
	public final static int CLASS = 0;
	/** {@link #resultTagName} holds an element id. */
	public final static int ID = 1;
	/** {@link #resultTagName} holds a selector expression. */
	public final static int SELECTION = 2;

	/** Address of the page to fetch. */
	private String url;

	/** Request parameter names. */
	private String[] params;

	/** Request parameter values, matched to {@link #params} by index. */
	private String[] values;

	/**
	 * Tag, class, id or selector used for the first filtering pass over the
	 * returned HTML; how it is interpreted depends on {@link #type}, so set
	 * the type first.
	 */
	private String resultTagName;

	/** One of {@link #CLASS}, {@link #ID} or {@link #SELECTION}; {@link #ID} unless changed. */
	private int type = ID;

	/** One of {@link #GET} or {@link #POST}; {@link #GET} unless changed. */
	private int requestMoethod = GET;

	/** Creates a rule with no URL or parameters, type {@link #ID} and request type {@link #GET}. */
	public Rule() {
	}

	/**
	 * Creates a fully specified rule. The arrays are stored as given, not
	 * copied.
	 *
	 * @param url            address of the page to fetch
	 * @param params         request parameter names
	 * @param values         request parameter values, matched to {@code params} by index
	 * @param resultTagName  tag, class, id or selector for the first filtering pass
	 * @param type           how {@code resultTagName} is interpreted
	 * @param requestMoethod {@link #GET} or {@link #POST}
	 */
	public Rule(String url, String[] params, String[] values, String resultTagName, int type, int requestMoethod) {
		this.requestMoethod = requestMoethod;
		this.type = type;
		this.resultTagName = resultTagName;
		this.values = values;
		this.params = params;
		this.url = url;
	}

	/** @return address of the page to fetch */
	public String getUrl() {
		return this.url;
	}

	/** @param url address of the page to fetch */
	public void setUrl(String url) {
		this.url = url;
	}

	/** @return the stored parameter-name array itself (no copy is made) */
	public String[] getParams() {
		return this.params;
	}

	/** @param params request parameter names; stored as given */
	public void setParams(String[] params) {
		this.params = params;
	}

	/** @return the stored parameter-value array itself (no copy is made) */
	public String[] getValues() {
		return this.values;
	}

	/** @param values request parameter values; stored as given */
	public void setValues(String[] values) {
		this.values = values;
	}

	/** @return tag, class, id or selector used for the first filtering pass */
	public String getResultTagName() {
		return this.resultTagName;
	}

	/** @param resultTagName tag, class, id or selector used for the first filtering pass */
	public void setResultTagName(String resultTagName) {
		this.resultTagName = resultTagName;
	}

	/** @return how {@link #getResultTagName()} is interpreted */
	public int getType() {
		return this.type;
	}

	/** @param type one of {@link #CLASS}, {@link #ID} or {@link #SELECTION} */
	public void setType(int type) {
		this.type = type;
	}

	/** @return the request type, {@link #GET} or {@link #POST} */
	public int getRequestMoethod() {
		return this.requestMoethod;
	}

	/** @param requestMoethod the request type, {@link #GET} or {@link #POST} */
	public void setRequestMoethod(int requestMoethod) {
		this.requestMoethod = requestMoethod;
	}

}

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.zhy.spider.rule.Rule;
import com.zhy.spider.rule.RuleException;
import com.zhy.spider.util.TextUtil;

public class ExtractService {
	/**
	 * Charset used both to decode the response body and to percent-encode the
	 * request parameters (form data is conventionally submitted in the page's
	 * own encoding).
	 */
	private static final String PAGE_CHARSET = "gb2312";

	/**
	 * Fetches the page described by {@code rule} and extracts rows of text
	 * from it.
	 * <p>
	 * The rule's parameters are sent with the request: appended to the URL as
	 * a query string for GET, written as a form-encoded body for POST. The
	 * response is parsed with jsoup, narrowed down according to the rule's
	 * type and result tag name, and every {@code div.Bg} element found yields
	 * one row made of the space-separated words of its {@code h4} headings.
	 * When that produces no rows, the text of each {@code title} element is
	 * returned instead, one row per element.
	 *
	 * @param rule    what to fetch and how to filter it; its URL must start
	 *                with {@code http://} or {@code https://}
	 * @param urlName not used by this implementation
	 * @return the extracted rows, or {@code null} when the request failed
	 *         with an {@link IOException} (the stack trace is printed)
	 * @throws RuleException if the rule fails validation
	 */
	public static List<List<String>> extract(Rule rule, String urlName) {

		// Reject unusable rules before any network activity.
		validateRule(rule);

		List<List<String>> datas = null;
		HttpURLConnection connection = null;
		try {
			String resultTagName = rule.getResultTagName();
			boolean post = rule.getRequestMoethod() == Rule.POST;
			String query = buildQuery(rule.getParams(), rule.getValues());

			// For GET the parameters travel in the URL itself.
			String requestUrl = rule.getUrl();
			if (!post && query.length() > 0) {
				requestUrl = requestUrl + (requestUrl.indexOf('?') < 0 ? "?" : "&") + query;
			}

			connection = (HttpURLConnection) new URL(requestUrl).openConnection();
			connection.setUseCaches(false);
			connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
			// Both timeouts are in milliseconds.
			connection.setConnectTimeout(8000);
			connection.setReadTimeout(8000);

			// Any request type other than POST is sent as GET.
			if (post) {
				connection.setRequestMethod("POST");
				if (query.length() > 0) {
					// For POST the parameters travel in the request body.
					connection.setDoOutput(true);
					connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
					java.io.OutputStream body = connection.getOutputStream();
					try {
						body.write(query.getBytes("US-ASCII"));
					} finally {
						body.close();
					}
				}
			} else {
				connection.setRequestMethod("GET");
			}

			// Send the request and parse the response; release the stream
			// as soon as the document has been built.
			Document doc;
			java.io.InputStream in = connection.getInputStream();
			try {
				doc = Jsoup.parse(in, PAGE_CHARSET, requestUrl);
			} finally {
				in.close();
			}

			datas = LinkTypeData9998(selectResults(doc, rule.getType(), resultTagName));
			if (datas.isEmpty()) {
				// Nothing matched: fall back to the page title(s).
				for (Element element : doc.getElementsByTag("title")) {
					List<String> data = new ArrayList<String>();
					data.add(element.text());
					datas.add(data);
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (connection != null) {
				connection.disconnect();
			}
		}

		return datas;
	}

	/**
	 * Builds a percent-encoded {@code name=value&name=value} string from the
	 * parallel parameter arrays. Pairs whose name is {@code null} are skipped
	 * and a {@code null} value is sent as an empty string.
	 *
	 * @return the encoded pairs, or an empty string when there are none
	 */
	private static String buildQuery(String[] params, String[] values) throws IOException {
		StringBuilder query = new StringBuilder();
		if (params == null || values == null) {
			return query.toString();
		}
		for (int i = 0; i < params.length && i < values.length; i++) {
			if (params[i] == null) {
				continue;
			}
			if (query.length() > 0) {
				query.append('&');
			}
			String value = values[i] == null ? "" : values[i];
			query.append(java.net.URLEncoder.encode(params[i], PAGE_CHARSET));
			query.append('=');
			query.append(java.net.URLEncoder.encode(value, PAGE_CHARSET));
		}
		return query.toString();
	}

	/**
	 * Narrows the document down to the elements selected by the rule's type
	 * and result tag name.
	 */
	private static Elements selectResults(Document doc, int type, String resultTagName) {
		Elements results = new Elements();
		switch (type) {
		case Rule.CLASS:
			results = doc.getElementsByClass(resultTagName);
			break;
		case Rule.ID:
			// No element with that id means "no results", not a null entry.
			Element result = doc.getElementById(resultTagName);
			if (result != null) {
				results.add(result);
			}
			break;
		case Rule.SELECTION:
			results = doc.select(resultTagName);
			break;
		default:
			// Unknown type and no tag name given: scan every div of the page.
			if (TextUtil.isEmpty(resultTagName)) {
				results = doc.getElementsByTag("div");
			}
		}
		return results;
	}

	/**
	 * Turns every {@code div.Bg} element below the given results into one
	 * row holding the space-separated words of all its {@code h4} headings.
	 * A {@code div.Bg} without headings still contributes an empty row.
	 */
	private static List<List<String>> LinkTypeData9998(Elements results) {
		List<List<String>> datas = new ArrayList<List<String>>();
		for (Element result : results) {
			for (Element block : result.select("div.Bg")) {
				List<String> data = new ArrayList<String>();
				for (Element heading : block.getElementsByTag("h4")) {
					for (String word : heading.text().split(" ")) {
						data.add(word);
					}
				}
				datas.add(data);
			}
		}
		return datas;
	}

	/**
	 * Checks that the rule can be executed: the URL must be non-empty and
	 * start with {@code http://} or {@code https://}, and when parameter
	 * names are given there must be exactly as many values. An accepted URL
	 * is echoed to standard output.
	 *
	 * @throws RuleException if any of these checks fails
	 */
	private static void validateRule(Rule rule) {
		String url = rule.getUrl();
		if (TextUtil.isEmpty(url)) {
			throw new RuleException("url不能为空！");
		}
		if (url.startsWith("http://") || url.startsWith("https://")) {
			System.out.println(url);
		} else {
			throw new RuleException("url的格式不正确！");
		}

		String[] params = rule.getParams();
		if (params != null) {
			// Missing values count as zero values, so names without values
			// are rejected here instead of failing later with an NPE.
			String[] values = rule.getValues();
			int valueCount = values == null ? 0 : values.length;
			if (params.length != valueCount) {
				throw new RuleException("参数的键值对个数不匹配！");
			}
		}

	}
}

import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import javax.swing.JOptionPane;

import com.zhy.spider.bean.LinkTypeData;
import com.zhy.spider.core.ExtractService;
import com.zhy.spider.core.ExtractService2;
import com.zhy.spider.rule.Rule;
import com.zhy.spider.util.ExcelService;
import com.zhy.spider.util.WebContent;

/**
 * Command-line driver: extracts a list of links, follows each one, and writes
 * what was found to an Excel file, one row per followed link.
 */
public class Test2 {

	/**
	 * Runs the two-stage extraction and exports the result to
	 * {@code D:\excel\04.xls}. Success or failure is reported in a dialog.
	 *
	 * @param args not used
	 */
	public static void main(String[] args) {
		String fileName = "D:\\excel\\04.xls";
		FileOutputStream fos = null;
		ExcelService pd = new ExcelService();
		// No header row is written. To add one, call
		// ExcelService.createTableHeader(title, columnNames) here; an earlier
		// draft used the columns: ad link, ad name, hotline, WeChat,
		// company website, contact address.

		// NOTE(review): the URL is an empty placeholder and must be filled in
		// before running. This also relies on the
		// com.zhy.spider.core.ExtractService.extract overload returning
		// LinkTypeData items - confirm against that class.
		Rule rule = new Rule("", null, null, null, -1, Rule.GET);
		List<LinkTypeData> extracts = ExtractService.extract(rule,"9998");
		// Row 0 is left free for an optional header row.
		int rowIndex = 1;
		try {
			for (LinkTypeData data : extracts) {
				System.out.println(data.getLinkHref());
				if (data.getLinkHref() != null && !"".equals(data.getLinkHref())){
					// Second stage: fetch the linked page and flatten
					// everything extracted from it into a single row.
					Rule rule2 = new Rule(data.getLinkHref(), null, null, null, -1, Rule.GET);
					List<List<String>> extracts2 = ExtractService2.extract(rule2,"9998");
					if (extracts2 != null && extracts2.size()>0){
						List<String> list = new ArrayList<String>();
						// First cell is the link itself.
						list.add(data.getLinkHref());
						for (List<String> list2 : extracts2) {
							for (int i = 0; i < list2.size(); i++) {
								System.out.println(list2.get(i));
								list.add(list2.get(i));
							}
						}
						ExcelService.createTableRow(list, (short) rowIndex);
						rowIndex++;
					}
				}

				System.out.println("***********************************");
			}
			fos = new FileOutputStream(fileName);
			pd.exportExcel(ExcelService.demoSheet, fos);
			JOptionPane.showMessageDialog(null, "表格已成功导出到 : " + fileName);
		} catch (Exception e) {
			JOptionPane.showMessageDialog(null, "表格导出出错，错误信息 ：" + e + "\n错误原因可能是表格已经打开。");
			e.printStackTrace();
		} finally {
			// The stream is still null when the failure happened before the
			// file was opened; closing it then would only add a second,
			// misleading NullPointerException to the output.
			if (fos != null) {
				try {
					fos.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

}

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.JOptionPane;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFFooter;
import org.apache.poi.hssf.usermodel.HSSFHeader;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
/**
 * Builds a single-sheet Excel (HSSF / .xls) workbook row by row and writes it
 * to an output stream.
 * <p>
 * The workbook and its sheet are shared static state: every call to the
 * static row-creating methods adds to the same sheet.
 */
public class ExcelService {
	/** The one workbook all rows are written into. */
	public static HSSFWorkbook demoWorkBook = new HSSFWorkbook();
	/** The only sheet of {@link #demoWorkBook}. */
	public static HSSFSheet demoSheet = demoWorkBook.createSheet();
	/**
	 * Column count of the database table; not referenced by any active code
	 * in this class.
	 */
	public static final int columNumber = 2;

	/**
	 * Sets the sheet's centered page header and fills row 0 with the given
	 * column titles.
	 *
	 * @param str         text for the center section of the page header
	 * @param tableHeader column titles, written left to right starting at column 0
	 */
	@SuppressWarnings("deprecation")
	public static void createTableHeader(String str, String[] tableHeader) {
		// The page header belongs to the sheet, not to a row.
		HSSFHeader header = demoSheet.getHeader();
		header.setCenter(str);
		HSSFRow headerRow = demoSheet.createRow((short) 0);
		for (int i = 0; i < tableHeader.length; i++) {
			HSSFCell headerCell = headerRow.createCell((short) i);
			headerCell.setCellValue(tableHeader[i]);
		}
	}

	/**
	 * Creates row {@code rowIndex} and writes one cell per entry of
	 * {@code cells}, starting at column 0. When {@code cells} is
	 * {@code null} or empty, a dashed separator is written into column 1
	 * instead.
	 *
	 * @param cells    cell values for the row, left to right
	 * @param rowIndex zero-based index of the row to create
	 */
	@SuppressWarnings("deprecation")
	public static void createTableRow(List<String> cells, short rowIndex) {
		HSSFRow row = demoSheet.createRow((short) rowIndex);
		if (cells!= null && cells.size() >0){
			for (short i = 0; i < cells.size(); i++) {
				HSSFCell cell = row.createCell((short) i);
				cell.setCellValue(cells.get(i));
			}
		} else {
			// No data for this row: mark it with a separator in the second column.
			HSSFCell cell = row.createCell((short) 1);
			cell.setCellValue("-----------------------");
		}
	}

	/**
	 * Manual test entry point: exports the (currently empty) sheet to
	 * {@code D:\excel\11206.xls} and reports the outcome in a dialog.
	 *
	 * @param args not used
	 */
	public static void main(String[] args) {
		String fileName = "D:\\excel\\11206.xls";
		FileOutputStream fos = null;
		try {
			ExcelService pd = new ExcelService();
			ExcelService.createExcelSheeet();
			fos = new FileOutputStream(fileName);
			pd.exportExcel(demoSheet, fos);
			JOptionPane.showMessageDialog(null, "表格已成功导出到 : " + fileName);
		} catch (Exception e) {
			JOptionPane.showMessageDialog(null, "表格导出出错，错误信息 ：" + e + "\n错误原因可能是表格已经打开。");
			e.printStackTrace();
		} finally {
			// The stream is still null when the failure happened before the
			// file was opened; closing it then would only add a second,
			// misleading NullPointerException to the output.
			if (fos != null) {
				try {
					fos.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Placeholder for filling the whole sheet; currently does nothing.
	 * <p>
	 * An earlier, disabled draft iterated a JDBC result set and called
	 * {@link #createTableRow(List, short)} once per record.
	 *
	 * @throws Exception declared for future implementations; never thrown by
	 *                   the current empty body
	 */
	public static void createExcelSheeet() throws Exception {
		// Intentionally empty.
	}

	/**
	 * Enables printed grid lines and a "Page x of y" footer on {@code sheet},
	 * then writes the shared {@link #demoWorkBook} to {@code os}.
	 * <p>
	 * Note that the workbook written is always {@link #demoWorkBook},
	 * whichever sheet is passed in. The stream is not closed here.
	 *
	 * @param sheet sheet whose print settings are adjusted
	 * @param os    destination for the workbook bytes
	 * @throws IOException if writing the workbook fails
	 */
	public void exportExcel(HSSFSheet sheet, OutputStream os) throws IOException {
		sheet.setGridsPrinted(true);
		HSSFFooter footer = sheet.getFooter();
		footer.setRight("Page " + HSSFFooter.page() + " of " + HSSFFooter.numPages());
		demoWorkBook.write(os);
	}
}

/** Small string helpers. */
public class TextUtil {
	/**
	 * Tells whether a string carries no visible content.
	 *
	 * @param str the string to test, may be {@code null}
	 * @return {@code true} when {@code str} is {@code null} or is empty once
	 *         trimmed; {@code false} otherwise
	 */
	public static boolean isEmpty(String str) {
		return str == null || str.trim().length() == 0;
	}
}

import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Works out the character encoding of a web page, trying in order: the HTTP
 * {@code Content-Type} header, a {@code charset} inside a {@code <meta>} tag,
 * and finally byte-level detection through cpdetector.
 */
public class WebEncoding {
	/** Shared detector used for the byte-level fallback. */
	private static CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
	static {
		// Delegates are registered in this order.
		detector.add(new ParsingDetector(false));
		detector.add(JChardetFacade.getInstance());
	}

	/**
	 * Manual test entry point.
	 * NOTE(review): the URL is an empty placeholder, so as written this only
	 * prints the resulting exception's stack trace.
	 *
	 * @param args not used
	 */
	public static void main(String[] args) {
		try {
			System.out.println(getCharset(""));
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Determines the character encoding of the page at {@code strurl}.
	 * <p>
	 * A charset found in the response headers or in a {@code <meta>} tag is
	 * returned in lower case. Otherwise the name reported by the byte-level
	 * detector is returned, or {@code "GBK"} when detection yields nothing.
	 *
	 * @param strurl absolute page address including the protocol, e.g.
	 *               {@code http://www.pujia.com}
	 * @return the charset name, never {@code null}
	 * @throws IOException if the URL is malformed or connecting to it fails
	 */
	public static String getCharset(String strurl) throws IOException {
		URL url = new URL(strurl);

		// 1. Response headers.
		HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
		try {
			urlConnection.connect();
			String fromHeader = charsetFromHeaders(urlConnection.getHeaderFields());
			if (fromHeader != null) {
				return fromHeader;
			}
		} finally {
			urlConnection.disconnect();
		}

		// 2. <meta> tags in the page source.
		String fromMeta = charsetFromMeta(readPage(url));
		if (fromMeta != null) {
			return fromMeta;
		}

		// 3. Byte-level detection, with a fixed default as the last resort.
		String strencoding = getFileEncoding(url);
		if (strencoding == null) {
			strencoding = "GBK";
		}

		return strencoding;
	}

	/**
	 * Looks for a {@code charset=} parameter in the {@code Content-Type}
	 * response header. Header names are compared case-insensitively.
	 *
	 * @return the lower-cased charset, or {@code null} when none is declared
	 */
	private static String charsetFromHeaders(Map<String, List<String>> headers) {
		for (Map.Entry<String, List<String>> header : headers.entrySet()) {
			// The status line is stored under a null key.
			String name = header.getKey();
			if (name == null || !name.equalsIgnoreCase("Content-Type") || header.getValue() == null) {
				continue;
			}
			for (String value : header.getValue()) {
				if (value == null) {
					continue;
				}
				String lower = value.toLowerCase(java.util.Locale.ROOT);
				int at = lower.indexOf("charset=");
				if (at != -1) {
					String token = charsetToken(lower.substring(at + 8));
					if (token.length() > 0) {
						return token;
					}
				}
			}
		}
		return null;
	}

	/**
	 * Downloads the page source as one string without line breaks. Failures
	 * are reported on standard error and whatever was read so far is
	 * returned.
	 * <p>
	 * The bytes are decoded as ISO-8859-1: every byte maps to a character and
	 * ASCII is preserved, which is all the {@code <meta>} scan needs.
	 */
	private static String readPage(URL url) {
		StringBuilder sb = new StringBuilder();
		BufferedReader in = null;
		try {
			in = new BufferedReader(new InputStreamReader(url.openStream(), "ISO-8859-1"));
			String line;
			while ((line = in.readLine()) != null) {
				sb.append(line);
			}
		} catch (Exception e) {
			// Best effort: the caller falls back to byte-level detection.
			System.err.println(e);
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Nothing useful can be done about a failed close here.
				}
			}
		}
		return sb.toString();
	}

	/**
	 * Scans the {@code <meta ...>} tags of the page for a {@code charset}
	 * declaration.
	 *
	 * @return the lower-cased charset of the first tag that declares one, or
	 *         {@code null} when no tag does
	 */
	private static String charsetFromMeta(String htmlcode) {
		String lower = htmlcode.toLowerCase(java.util.Locale.ROOT);
		int begin = lower.indexOf("<meta");
		while (begin > -1) {
			int end = lower.indexOf('>', begin);
			if (end < 0) {
				// Unterminated tag: nothing more can be parsed.
				break;
			}
			String tag = lower.substring(begin, end);
			int at = tag.indexOf("charset");
			if (at > -1) {
				String token = charsetToken(tag.substring(at + 7));
				if (token.length() > 0) {
					return token;
				}
			}
			// Continue strictly after this tag so the scan always advances.
			begin = lower.indexOf("<meta", end + 1);
		}
		return null;
	}

	/**
	 * Extracts the charset name from the text following the word
	 * {@code charset}: leading {@code =}, quotes and whitespace are skipped,
	 * and the name ends at the next quote, whitespace, {@code ;}, {@code ,},
	 * {@code /} or {@code >}.
	 */
	private static String charsetToken(String rest) {
		int length = rest.length();
		int start = 0;
		while (start < length) {
			char c = rest.charAt(start);
			if (c != '=' && c != '"' && c != '\'' && !Character.isWhitespace(c)) {
				break;
			}
			start++;
		}
		int stop = start;
		while (stop < length) {
			char c = rest.charAt(stop);
			if (c == '"' || c == '\'' || c == ';' || c == ',' || c == '/' || c == '>'
					|| Character.isWhitespace(c)) {
				break;
			}
			stop++;
		}
		return rest.substring(start, stop);
	}

	/**
	 * Asks the shared detector to identify the encoding of the content
	 * behind {@code url}. A detection failure is reported on standard output
	 * and treated as "unknown".
	 *
	 * @param url address of the content to analyse
	 * @return the detected charset name, or {@code null} when detection
	 *         failed or returned nothing
	 */
	public static String getFileEncoding(URL url) {

		java.nio.charset.Charset charset = null;
		try {
			charset = detector.detectCodepage(url);
		} catch (Exception e) {
			System.out.println(e.getClass() + "分析" + "编码失败");
		}

		if (charset != null) {
			return charset.name();
		}
		return null;

	}
}