/**
 * Describes one extraction request: the page to fetch, the request parameters,
 * and how the first filtering step locates elements in the returned HTML.
 */
public class Rule {

    /** Request method constant: HTTP GET. */
    public final static int GET = 0;
    /** Request method constant: HTTP POST. */
    public final static int POST = 1;

    /** Selector type constant: {@code resultTagName} is a CSS class name. */
    public final static int CLASS = 0;
    /** Selector type constant: {@code resultTagName} is an element id. */
    public final static int ID = 1;
    /** Selector type constant: {@code resultTagName} is a selector expression. */
    public final static int SELECTION = 2;

    /** The link (URL) of the page. */
    private String url;

    /** Parameter names. */
    private String[] params;

    /** Values corresponding to {@link #params}, position by position. */
    private String[] values;

    /**
     * Tag/selector used for the first filtering pass over the returned HTML.
     * Set {@link #type} first so it is clear how this value is interpreted.
     */
    private String resultTagName;

    /** One of {@link #CLASS}, {@link #ID}, {@link #SELECTION}; defaults to {@link #ID}. */
    private int type = ID;

    /** One of {@link #GET}, {@link #POST}; defaults to {@link #GET}. */
    private int requestMoethod = GET;

    /** Creates a rule with every reference field unset and the default type and request method. */
    public Rule() {
    }

    /**
     * Creates a fully populated rule.
     *
     * @param url            page link
     * @param params         parameter names, may be {@code null}
     * @param values         parameter values, may be {@code null}
     * @param resultTagName  value interpreted according to {@code type}
     * @param type           {@link #CLASS}, {@link #ID} or {@link #SELECTION}
     * @param requestMoethod {@link #GET} or {@link #POST}
     */
    public Rule(String url, String[] params, String[] values, String resultTagName, int type, int requestMoethod) {
        this.requestMoethod = requestMoethod;
        this.type = type;
        this.resultTagName = resultTagName;
        this.values = values;
        this.params = params;
        this.url = url;
    }

    /** @return the page link */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page link */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the parameter names (the stored array itself, not a copy) */
    public String[] getParams() {
        return this.params;
    }

    /** @param params the parameter names (stored as given, not copied) */
    public void setParams(String[] params) {
        this.params = params;
    }

    /** @return the parameter values (the stored array itself, not a copy) */
    public String[] getValues() {
        return this.values;
    }

    /** @param values the parameter values (stored as given, not copied) */
    public void setValues(String[] values) {
        this.values = values;
    }

    /** @return the tag/selector used for the first filtering pass */
    public String getResultTagName() {
        return this.resultTagName;
    }

    /** @param resultTagName the tag/selector used for the first filtering pass */
    public void setResultTagName(String resultTagName) {
        this.resultTagName = resultTagName;
    }

    /** @return the selector type constant */
    public int getType() {
        return this.type;
    }

    /** @param type the selector type constant */
    public void setType(int type) {
        this.type = type;
    }

    /** @return the request method constant */
    public int getRequestMoethod() {
        return this.requestMoethod;
    }

    /** @param requestMoethod the request method constant */
    public void setRequestMoethod(int requestMoethod) {
        this.requestMoethod = requestMoethod;
    }
}
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.zhy.spider.rule.Rule;
import com.zhy.spider.rule.RuleException;
import com.zhy.spider.util.TextUtil;
/**
 * Fetches the page described by a {@link Rule} over HTTP, parses it with jsoup
 * and turns the matched elements into rows of text.
 */
public class ExtractService {

    /** Charset name used to decode the response and to encode a POST body. */
    private static final String PAGE_CHARSET = "gb2312";

    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 8000;

    /**
     * Fetches {@code rule.getUrl()} and extracts rows of text from the response.
     *
     * <p>The rule's parameters are sent with the request: appended to the URL as a
     * query string for GET, written as a form body for POST. Names and values are
     * sent exactly as supplied, so callers must pass them already URL-encoded.
     *
     * <p>If the configured selection yields no rows, the text of each
     * {@code <title>} element is returned instead, one row per element.
     *
     * @param rule    the request and selection description; validated first
     * @param urlName not used by this method
     * @return the extracted rows, or {@code null} if an {@link IOException} occurred
     *         while fetching or parsing (the exception is printed to stderr)
     * @throws RuleException if the rule fails validation
     */
    public static List<List<String>> extract(Rule rule, String urlName) {
        // Validate the rule before doing any network work.
        validateRule(rule);

        List<List<String>> datas = null;
        HttpURLConnection connection = null;
        java.io.InputStream body = null;
        try {
            String url = rule.getUrl();
            String query = buildQuery(rule.getParams(), rule.getValues());
            boolean post = rule.getRequestMoethod() == Rule.POST;

            // For GET the parameters travel in the URL; for POST they go in the body.
            String requestUrl = post ? url : appendQuery(url, query);

            connection = (HttpURLConnection) new URL(requestUrl).openConnection();
            connection.setUseCaches(false);
            connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            if (post) {
                connection.setRequestMethod("POST");
                if (query.length() > 0) {
                    connection.setDoOutput(true);
                    connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
                    java.io.OutputStream out = connection.getOutputStream();
                    try {
                        out.write(query.getBytes(PAGE_CHARSET));
                    } finally {
                        out.close();
                    }
                }
            } else {
                // Any request type other than POST is sent as GET.
                connection.setRequestMethod("GET");
            }

            // Issue the request and parse the response.
            body = connection.getInputStream();
            Document doc = Jsoup.parse(body, PAGE_CHARSET, requestUrl);

            Elements results = selectResults(doc, rule.getType(), rule.getResultTagName());
            datas = LinkTypeData9998(results);

            // Nothing matched: fall back to the page title(s).
            if (datas.isEmpty()) {
                for (Element title : doc.getElementsByTag("title")) {
                    List<String> data = new ArrayList<String>();
                    data.add(title.text());
                    datas.add(data);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (body != null) {
                try {
                    body.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the response stream fails.
                }
            }
            if (connection != null) {
                connection.disconnect();
            }
        }
        return datas;
    }

    /** Joins parameter names and values into {@code name=value&name=value}; empty when there are no params. */
    private static String buildQuery(String[] params, String[] values) {
        if (params == null) {
            return "";
        }
        StringBuilder query = new StringBuilder();
        for (int i = 0; i < params.length; i++) {
            if (i > 0) {
                query.append('&');
            }
            query.append(params[i]).append('=').append(values[i]);
        }
        return query.toString();
    }

    /** Appends a non-empty query string to a URL using {@code ?} or {@code &} as appropriate. */
    private static String appendQuery(String url, String query) {
        if (query.length() == 0) {
            return url;
        }
        if (url.indexOf('?') < 0) {
            return url + "?" + query;
        }
        char last = url.charAt(url.length() - 1);
        return (last == '?' || last == '&') ? url + query : url + "&" + query;
    }

    /** Picks the elements to process according to the rule's selector type. */
    private static Elements selectResults(Document doc, int type, String resultTagName) {
        switch (type) {
        case Rule.CLASS:
            return doc.getElementsByClass(resultTagName);
        case Rule.ID: {
            Elements byId = new Elements();
            Element match = doc.getElementById(resultTagName);
            // getElementById returns null when no element has that id.
            if (match != null) {
                byId.add(match);
            }
            return byId;
        }
        case Rule.SELECTION:
            return doc.select(resultTagName);
        default:
            // Unknown type: with no tag name given, fall back to every div element.
            if (TextUtil.isEmpty(resultTagName)) {
                return doc.getElementsByTag("div");
            }
            return new Elements();
        }
    }

    /**
     * Builds one row per {@code div.Bg} element found under the given elements.
     * Each row holds the text of that div's {@code h4} elements, split on single
     * spaces; a div without {@code h4} elements contributes an empty row.
     */
    private static List<List<String>> LinkTypeData9998(Elements results) {
        List<List<String>> datas = new ArrayList<List<String>>();
        for (Element result : results) {
            for (Element block : result.select("div.Bg")) {
                List<String> data = new ArrayList<String>();
                for (Element heading : block.getElementsByTag("h4")) {
                    for (String part : heading.text().split(" ")) {
                        data.add(part);
                    }
                }
                datas.add(data);
            }
        }
        return datas;
    }

    /**
     * Performs the necessary checks on the rule: the URL must be non-empty and start
     * with {@code http://} or {@code https://}, and when parameter names are given
     * there must be exactly one value per name. A valid URL is printed to stdout.
     *
     * @throws RuleException if any check fails
     */
    private static void validateRule(Rule rule) {
        String url = rule.getUrl();
        if (TextUtil.isEmpty(url)) {
            throw new RuleException("url不能为空!");
        }
        if (url.startsWith("http://") || url.startsWith("https://")) {
            System.out.println(url);
        } else {
            throw new RuleException("url的格式不正确!");
        }
        String[] params = rule.getParams();
        String[] values = rule.getValues();
        // Names without values would otherwise fail later with a NullPointerException.
        if (params != null && (values == null || params.length != values.length)) {
            throw new RuleException("参数的键值对个数不匹配!");
        }
    }
}
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.swing.JOptionPane;
import com.zhy.spider.bean.LinkTypeData;
import com.zhy.spider.core.ExtractService;
import com.zhy.spider.core.ExtractService2;
import com.zhy.spider.rule.Rule;
import com.zhy.spider.util.ExcelService;
import com.zhy.spider.util.WebContent;
/**
 * Manual driver: extracts a list of links, extracts details from each linked page,
 * writes one spreadsheet row per link and exports the sheet to a fixed file path.
 */
public class Test2 {

    /**
     * Runs the crawl-and-export flow and reports the outcome in a dialog.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        String fileName = "D:\\excel\\04.xls";
        FileOutputStream fos = null;
        ExcelService pd = new ExcelService();
        // A header row (ad link, ad name, hotline, WeChat, company site, address) can be
        // created with ExcelService.createTableHeader before the data rows are written.

        // NOTE(review): the start URL is empty here; supply a real URL before running.
        // NOTE(review): the result is typed List<LinkTypeData> — confirm it matches the
        // return type of the imported com.zhy.spider.core.ExtractService.extract.
        Rule rule = new Rule("", null, null, null, -1, Rule.GET);
        List<LinkTypeData> extracts = ExtractService.extract(rule,"9998");
        int rowIndex = 1;
        try {
            for (LinkTypeData data : extracts) {
                System.out.println(data.getLinkHref());
                if (data.getLinkHref() != null && !"".equals(data.getLinkHref())){
                    Rule rule2 = new Rule(data.getLinkHref(), null, null, null, -1, Rule.GET);
                    List<List<String>> extracts2 = ExtractService2.extract(rule2,"9998");
                    if (extracts2 != null && extracts2.size()>0){
                        // One spreadsheet row: the link followed by every extracted value.
                        List<String> list = new ArrayList<String>();
                        list.add(data.getLinkHref());
                        for (List<String> list2 : extracts2) {
                            for (int i = 0; i < list2.size(); i++) {
                                System.out.println(list2.get(i));
                                list.add(list2.get(i));
                            }
                        }
                        ExcelService.createTableRow(list, (short) rowIndex);
                        rowIndex++;
                    }
                }
                System.out.println("***********************************");
            }
            fos = new FileOutputStream(fileName);
            pd.exportExcel(ExcelService.demoSheet, fos);
            JOptionPane.showMessageDialog(null, "表格已成功导出到 : " + fileName);
        } catch (Exception e) {
            JOptionPane.showMessageDialog(null, "表格导出出错,错误信息 :" + e + "\n错误原因可能是表格已经打开。");
            e.printStackTrace();
        } finally {
            // The stream is still null when anything failed before it was opened.
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.JOptionPane;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFFooter;
import org.apache.poi.hssf.usermodel.HSSFHeader;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
/**
 * Excel helper built around one shared, statically created HSSF workbook and sheet.
 * Rows are added through the static methods and the workbook is written out with
 * {@link #exportExcel(HSSFSheet, OutputStream)}.
 */
public class ExcelService {

    /** The workbook shared by all methods of this class. */
    public static HSSFWorkbook demoWorkBook = new HSSFWorkbook();

    /** The sheet, created in {@link #demoWorkBook}, that header and data rows are written to. */
    public static HSSFSheet demoSheet = demoWorkBook.createSheet();

    /** Number of columns of the database table. */
    public static final int columNumber = 2;

    /**
     * Creates the header: sets the sheet's centred page header text and fills row 0
     * with one cell per entry of {@code tableHeader}.
     *
     * @param str         text for the centre section of the sheet's page header
     * @param tableHeader column titles written to row 0, left to right
     */
    @SuppressWarnings("deprecation")
    public static void createTableHeader(String str, String[] tableHeader) {
        // The page header object is obtained from the sheet.
        HSSFHeader header = demoSheet.getHeader();
        header.setCenter(str);

        HSSFRow headerRow = demoSheet.createRow((short) 0);
        for (int i = 0; i < tableHeader.length; i++) {
            HSSFCell headerCell = headerRow.createCell((short) i);
            headerCell.setCellValue(tableHeader[i]);
        }
    }

    /**
     * Creates row {@code rowIndex} and fills it with the given values, one per cell
     * starting at column 0. When {@code cells} is {@code null} or empty, a single
     * placeholder cell of dashes is written at column index 1 instead.
     *
     * @param cells    the cell values for the row
     * @param rowIndex index of the row to create
     */
    @SuppressWarnings("deprecation")
    public static void createTableRow(List<String> cells, short rowIndex) {
        HSSFRow row = demoSheet.createRow((short) rowIndex);
        if (cells != null && cells.size() > 0) {
            // An int counter cannot wrap around the way a short counter would on long lists.
            for (int i = 0; i < cells.size(); i++) {
                HSSFCell cell = row.createCell((short) i);
                cell.setCellValue(cells.get(i));
            }
        } else {
            HSSFCell cell = row.createCell((short) 1);
            cell.setCellValue("-----------------------");
        }
    }

    /**
     * Manual entry point: builds the sheet, exports it to a fixed path and reports the
     * outcome in a dialog.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        String fileName = "D:\\excel\\11206.xls";
        FileOutputStream fos = null;
        try {
            ExcelService pd = new ExcelService();
            ExcelService.createExcelSheeet();
            fos = new FileOutputStream(fileName);
            pd.exportExcel(demoSheet, fos);
            JOptionPane.showMessageDialog(null, "表格已成功导出到 : " + fileName);
        } catch (Exception e) {
            JOptionPane.showMessageDialog(null, "表格导出出错,错误信息 :" + e + "\n错误原因可能是表格已经打开。");
            e.printStackTrace();
        } finally {
            // The stream is still null when anything failed before it was opened.
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Creates the whole Excel sheet. This method currently writes nothing: it has no
     * active statements, so callers get only the rows they add themselves through
     * {@link #createTableHeader(String, String[])} and {@link #createTableRow(List, short)}.
     *
     * @throws Exception declared by the signature; the current body throws nothing
     */
    public static void createExcelSheeet() throws Exception {
        // Intentionally empty.
    }

    /**
     * Exports the table: turns on grid-line printing for {@code sheet}, sets its
     * right footer to "Page x of y", then writes {@link #demoWorkBook} to {@code os}.
     * The stream is not closed here.
     *
     * @param sheet the sheet whose print settings and footer are updated
     * @param os    destination for the workbook bytes
     * @throws IOException if writing the workbook fails
     */
    public void exportExcel(HSSFSheet sheet, OutputStream os) throws IOException {
        sheet.setGridsPrinted(true);
        HSSFFooter footer = sheet.getFooter();
        footer.setRight("Page " + HSSFFooter.page() + " of " + HSSFFooter.numPages());
        demoWorkBook.write(os);
    }
}
/** Small string helpers. */
public class TextUtil {

    /**
     * Tells whether a string carries no content.
     *
     * @param str the string to test, may be {@code null}
     * @return {@code true} if {@code str} is {@code null} or has length zero after
     *         {@link String#trim()}; {@code false} otherwise
     */
    public static boolean isEmpty(String str) {
        return str == null || str.trim().length() == 0;
    }
}
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Determines the character encoding of a web page by checking, in order, the
 * HTTP {@code Content-Type} header, the page's {@code <meta>} tags and finally
 * byte-level detection through cpdetector.
 */
public class WebEncoding {

    /** Encoding name returned when no detection step produces a result. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Keyword that introduces the encoding name in headers and meta tags. */
    private static final String CHARSET_KEYWORD = "charset";

    /** Lower-case opening of a meta tag. */
    private static final String META_OPEN = "<meta";

    private static CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
    static {
        detector.add(new ParsingDetector(false));
        detector.add(JChardetFacade.getInstance());
    }

    /**
     * Test driver.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        try {
            System.out.println(getCharset(""));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the encoding name of the page at the given address.
     *
     * <p>Names found in the header or in a meta tag are returned in lower case.
     * When neither is present the detector's result is returned, and if that is
     * also unavailable, {@code "GBK"}.
     *
     * @param strurl page address; must start with a protocol, e.g. http://www.pujia.com
     * @return the encoding name, never {@code null}
     * @throws IOException if the address is malformed or the connection cannot be
     *                     opened; a failure while reading the page body is only
     *                     reported on stderr
     */
    public static String getCharset(String strurl) throws IOException {
        URL url = new URL(strurl);
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
        String html;
        try {
            urlConnection.connect();

            // Step 1: the Content-Type response header.
            String fromHeader = charsetFromHeaders(urlConnection.getHeaderFields());
            if (fromHeader != null) {
                return fromHeader;
            }

            // Step 2 needs the page source; read it from the connection already open.
            html = readBody(urlConnection, url);
        } finally {
            urlConnection.disconnect();
        }

        String fromMeta = charsetFromMeta(html);
        if (fromMeta != null) {
            return fromMeta;
        }

        // Step 3: byte analysis, then the default.
        String detected = getFileEncoding(url);
        return detected != null ? detected : DEFAULT_CHARSET;
    }

    /** Looks for a charset parameter in the Content-Type header; header names are matched ignoring case. */
    private static String charsetFromHeaders(Map<String, List<String>> headers) {
        for (Map.Entry<String, List<String>> header : headers.entrySet()) {
            String name = header.getKey();
            // The status line is stored under a null key.
            if (name == null || !name.equalsIgnoreCase("Content-Type")) {
                continue;
            }
            for (String value : header.getValue()) {
                if (value == null) {
                    continue;
                }
                String found = findCharset(value.toLowerCase(java.util.Locale.ROOT));
                if (found != null) {
                    return found;
                }
            }
        }
        return null;
    }

    /** Reads the response body as text with line breaks removed; on failure returns what was read so far. */
    private static String readBody(HttpURLConnection connection, URL url) {
        StringBuilder html = new StringBuilder();
        BufferedReader in = null;
        try {
            // ISO-8859-1 maps every byte to one character, so ASCII markup such as
            // the meta tags stays intact whatever the page's real encoding is.
            in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "ISO-8859-1"));
            String line;
            while ((line = in.readLine()) != null) {
                html.append(line);
            }
        } catch (IOException e) {
            System.err.println(e);
            System.err.println("Failed to read page source for charset detection: " + url);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the reader fails.
                }
            }
        }
        return html.toString();
    }

    /** Scans every meta tag of the page source for a charset declaration. */
    private static String charsetFromMeta(String html) {
        String lower = html.toLowerCase(java.util.Locale.ROOT);
        int begin = lower.indexOf(META_OPEN);
        while (begin > -1) {
            int end = lower.indexOf('>', begin);
            if (end == -1) {
                // Unterminated tag: nothing more to scan.
                break;
            }
            String found = findCharset(lower.substring(begin, end));
            if (found != null) {
                return found;
            }
            // Continue after this tag so the scan always moves forward.
            begin = lower.indexOf(META_OPEN, end);
        }
        return null;
    }

    /** Returns the value of the first {@code charset=value} pair in the lower-case text, or null. */
    private static String findCharset(String lowerText) {
        int pos = lowerText.indexOf(CHARSET_KEYWORD);
        while (pos > -1) {
            int after = pos + CHARSET_KEYWORD.length();
            String value = parseCharsetValue(lowerText, after);
            if (value != null) {
                return value;
            }
            pos = lowerText.indexOf(CHARSET_KEYWORD, after);
        }
        return null;
    }

    /** Parses {@code = value} starting at {@code from}, tolerating spaces and quotes; null if absent. */
    private static String parseCharsetValue(String text, int from) {
        int length = text.length();
        int i = from;
        while (i < length && Character.isWhitespace(text.charAt(i))) {
            i++;
        }
        if (i >= length || text.charAt(i) != '=') {
            return null;
        }
        i++;
        while (i < length && (Character.isWhitespace(text.charAt(i))
                || text.charAt(i) == '"' || text.charAt(i) == '\'')) {
            i++;
        }
        int start = i;
        while (i < length && !isValueTerminator(text.charAt(i))) {
            i++;
        }
        return i > start ? text.substring(start, i) : null;
    }

    /** Tells whether the character ends an encoding name inside a header value or a tag. */
    private static boolean isValueTerminator(char c) {
        return Character.isWhitespace(c) || c == '"' || c == '\'' || c == ';'
                || c == ',' || c == '/' || c == '>';
    }

    /**
     * Identifies the page encoding from the page content using the configured
     * cpdetector detectors.
     *
     * @param url the page address
     * @return the detected encoding name, or {@code null} if detection failed or
     *         produced no result (a failure is reported on stdout)
     */
    public static String getFileEncoding(URL url) {
        java.nio.charset.Charset charset = null;
        try {
            charset = detector.detectCodepage(url);
        } catch (Exception e) {
            System.out.println(e.getClass() + "分析" + "编码失败");
        }
        if (charset != null) {
            return charset.name();
        }
        return null;
    }
}