一个抓取网页并解析其内容的程序。

Crawler.java

package com.web.crawler;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;

/**
 * Downloads a contiguous range of puzzle pages and records every numbered
 * grid cell found on them in a shared {@link PuzzleList}.
 *
 * <p>One instance is meant to be run on its own thread. When {@link #run()}
 * finishes, for any reason, it increments {@code PuzzleList.finishtask} once.
 */
public class Crawler implements Runnable {
	/** Address of a puzzle page; the puzzle number is appended to it. */
	private static final String PAGE_URL_PREFIX =
			"http://www.menneske.no/arukone/5x5/eng/?number=";

	/** Captures the difficulty text shown on a puzzle page. */
	private static final Pattern DIFFICULTY_PATTERN =
			Pattern.compile("Difficulty: ([A-Za-z ]+)<br/><a href=");

	/** Captures the (possibly empty) content of one white grid cell. */
	private static final Pattern CELL_PATTERN =
			Pattern.compile("<td class=\"white\">([1-9]*)</td>");

	/** Number of cells per grid row, used to turn a cell counter into row/column. */
	private static final int GRID_WIDTH = 5;

	/** Pause after each matched cell, in milliseconds. */
	private static final long CELL_DELAY_MILLIS = 100L;

	private int startIndex;
	private int endIndex;
	// NOTE(review): stored but never used to build a request; pages are always
	// fetched from PAGE_URL_PREFIX. Confirm whether the configured URL should win.
	private String url;
	private PuzzleList locatList;

	/**
	 * @param startIndex first puzzle number to download (inclusive)
	 * @param endIndex   last puzzle number to download (inclusive)
	 * @param url        configured base URL (currently not used for requests)
	 * @param locatList  shared collector that receives the parsed cells
	 */
	public Crawler(int startIndex, int endIndex, String url,
			PuzzleList locatList) {
		this.startIndex = startIndex;
		this.endIndex = endIndex;
		this.url = url;
		this.locatList = locatList;
	}

	/**
	 * Processes every puzzle number in the configured range. A page that cannot
	 * be downloaded is skipped instead of aborting the remaining range.
	 */
	public void run() {
		System.out.println("begin run!");
		try {
			for (int urlIndex = this.startIndex; urlIndex <= this.endIndex; urlIndex++) {
				System.out.println("URL_INDEX::" + urlIndex);
				String html = fetchPage(urlIndex);
				if (html == null) {
					System.err.println("Skipping puzzle " + urlIndex
							+ ": page could not be downloaded");
					continue;
				}
				collectCells(urlIndex, html);
			}
		} catch (InterruptedException e) {
			// Restore the flag so whoever owns this thread can see the interruption.
			Thread.currentThread().interrupt();
			System.err.println("Crawler interrupted, stopping early");
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			System.out.println("end run!");
			// Always signal completion, otherwise a waiting caller never wakes up.
			locatList.finishtask.incrementAndGet();
		}
	}

	/**
	 * Downloads one puzzle page.
	 *
	 * @param urlIndex puzzle number appended to the page address
	 * @return the response body, or {@code null} when the request failed, the
	 *         status was not 200, or the response had no entity
	 */
	private String fetchPage(int urlIndex) {
		HttpClient httpClient = new DefaultHttpClient();
		try {
			HttpResponse response = httpClient.execute(new HttpGet(PAGE_URL_PREFIX + urlIndex));
			if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
				return null;
			}
			HttpEntity entity = response.getEntity();
			return (entity == null) ? null : EntityUtils.toString(entity);
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		} finally {
			httpClient.getConnectionManager().shutdown();
		}
	}

	/**
	 * Extracts the difficulty and all numbered cells from one page and adds a
	 * {@link PuzzleLocation} per numbered cell to the shared list.
	 *
	 * @param urlIndex puzzle number the page belongs to
	 * @param html     page content, never {@code null}
	 * @throws InterruptedException if the thread is interrupted during the
	 *                              per-cell pause
	 */
	private void collectCells(int urlIndex, String html) throws InterruptedException {
		// Resolved per page so a page without difficulty text does not inherit
		// the value of the previously processed page.
		String descr = findDifficulty(html);

		Matcher cells = CELL_PATTERN.matcher(html);
		int tdIndex = 1;
		while (cells.find()) {
			String numberStr = cells.group(1);
			if (numberStr.length() > 0) {
				int row = getRow(tdIndex);
				int column = getColumn(tdIndex);
				int number = Integer.parseInt(numberStr);
				System.out.println(urlIndex + "," + descr + "," + row + "," + column + "," + number);
				locatList.addLocation(new PuzzleLocation(urlIndex, descr, row, column, number));
			}
			// Empty cells still advance the counter: it encodes the grid position.
			tdIndex++;
			Thread.sleep(CELL_DELAY_MILLIS);
		}
	}

	/**
	 * @return the difficulty text of the last match on the page, or
	 *         {@code null} when the page contains none
	 */
	private static String findDifficulty(String html) {
		String difficulty = null;
		Matcher m = DIFFICULTY_PATTERN.matcher(html);
		while (m.find()) {
			difficulty = m.group(1);
		}
		return difficulty;
	}

	/** Maps a 1-based cell counter to its 1-based row. */
	private static int getRow(int tdIndex) {
		return ((tdIndex - 1) / GRID_WIDTH) + 1;
	}

	/** Maps a 1-based cell counter to its 1-based column. */
	private static int getColumn(int tdIndex) {
		return ((tdIndex - 1) % GRID_WIDTH) + 1;
	}
}


PuzzleList.java

package com.web.crawler;

import java.lang.*;
import java.util.*;
import com.web.crawler.PuzzleLocation;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Collector for the {@link PuzzleLocation} entries produced by the crawlers,
 * together with a counter of finished crawler tasks.
 */
public class PuzzleList {
	/** Backing storage for all collected locations. */
	private ArrayList<PuzzleLocation> locatList = null;
	/** Number of tasks that have reported completion. */
	public AtomicInteger finishtask = null;

	/** Creates an empty list with room for 1500 entries and a zeroed counter. */
	public PuzzleList() {
		this.finishtask = new AtomicInteger(0);
		this.locatList = new ArrayList<PuzzleLocation>(1500);
	}

	/**
	 * Appends one location; calls are serialized on this instance.
	 *
	 * @param locat the location to store
	 */
	public synchronized void addLocation(PuzzleLocation locat) {
		this.locatList.add(locat);
	}

	/** Sorts the stored locations by their natural ordering. */
	public void sortResult() {
		Collections.sort(this.locatList);
	}

	/**
	 * @return the backing list itself, not a copy
	 */
	public List<PuzzleLocation> getPuzzleList() {
		return locatList;
	}

	/**
	 * Renders one line per location in the form
	 * {@code index,descr,row,column,number}.
	 */
	public String toString() {
		StringBuilder text = new StringBuilder(102400);
		for (Iterator<PuzzleLocation> it = locatList.iterator(); it.hasNext();) {
			PuzzleLocation entry = it.next();
			text.append(entry.getIndex()).append(",")
					.append(entry.getDescr()).append(",")
					.append(entry.getRow()).append(",")
					.append(entry.getColumn()).append(",")
					.append(entry.getNumber()).append("\n");
		}
		return text.toString();
	}
}

PuzzleLocation.java


package com.web.crawler;

import java.io.Serializable;

/**
 * One numbered cell of a puzzle: the puzzle index, its difficulty
 * description, the cell's row and column, and the number in the cell.
 *
 * <p>Instances are immutable. The natural ordering compares index, then
 * number, then row, then column; the description does not take part in it.
 */
public class PuzzleLocation implements Comparable<PuzzleLocation>, Serializable {
	private static final long serialVersionUID = 823498623L;
	private final int index;
	private final String descrp;
	private final int row;
	private final int column;
	private final int number;

	/**
	 * @param index  puzzle index
	 * @param descrp difficulty description, may be {@code null}
	 * @param row    row of the cell
	 * @param column column of the cell
	 * @param number number shown in the cell
	 */
	public PuzzleLocation(int index, String descrp, int row, int column,
			int number) {
		this.index = index;
		this.descrp = descrp;
		this.row = row;
		this.column = column;
		this.number = number;
	}

	/** @return the puzzle index */
	public int getIndex() {
		return this.index;
	}

	/** @return the number shown in the cell */
	public int getNumber() {
		return this.number;
	}

	/** @return the row of the cell */
	public int getRow() {
		return this.row;
	}

	/** @return the column of the cell */
	public int getColumn() {
		return this.column;
	}

	/** @return the difficulty description */
	public String getDescr() {
		return this.descrp;
	}

	/**
	 * Orders by index, then number, then row, then column.
	 *
	 * @param dest the location to compare with
	 * @return -1, 0 or 1 when this location sorts before, equal to, or after
	 *         {@code dest}
	 */
	@Override
	public int compareTo(PuzzleLocation dest) {
		// The first differing key decides; explicit comparison avoids the
		// overflow that subtracting the two ints could cause.
		if (this.index != dest.index) {
			return (this.index < dest.index) ? -1 : 1;
		}
		if (this.number != dest.number) {
			return (this.number < dest.number) ? -1 : 1;
		}
		if (this.row != dest.row) {
			return (this.row < dest.row) ? -1 : 1;
		}
		if (this.column != dest.column) {
			return (this.column < dest.column) ? -1 : 1;
		}
		return 0;
	}
}

ExcelUtil.java


package com.web.crawler;

import java.util.*;
import java.io.*;

/**
 * Writes collected puzzle locations to a tab-separated text file.
 */
public class ExcelUtil {
	/**
	 * Writes every location of the given list to {@code C:/puzzle.xls}, one
	 * line per location in the form
	 * {@code index<TAB>descr<TAB>row<TAB>column<TAB>number}, encoded as UTF-8.
	 * An existing file is replaced. Failures are printed to standard error and
	 * not rethrown.
	 *
	 * @param locatList source of the locations to write
	 */
	public static void exportExcel(PuzzleList locatList){
		OutputStream out = null;
		try{
			File file = new File("C:/puzzle.xls");
			// Opening without append creates a missing file and truncates an
			// existing one, so no separate delete/create step is needed.
			out = new BufferedOutputStream(new FileOutputStream(file, false));
			for(PuzzleLocation locat : locatList.getPuzzleList()){
				StringBuilder line = new StringBuilder();
				line.append(locat.getIndex()).append("\t");
				line.append(locat.getDescr()).append("\t");
				line.append(locat.getRow()).append("\t");
				line.append(locat.getColumn()).append("\t");
				line.append(locat.getNumber()).append("\n");
				out.write(line.toString().getBytes("utf-8"));
			}
			out.flush();
		}
		catch(Exception exp){
			exp.printStackTrace();
		}
		finally{
			// Close on every path so a failed write does not leak the file handle.
			if(out != null){
				try{
					out.close();
				}
				catch(IOException closeExp){
					closeExp.printStackTrace();
				}
			}
		}
	}
}

WebCrawler.java

package com.web.crawler;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.*;

/**
 * Entry point: reads the start URL from a properties file, splits the puzzle
 * range across several {@link Crawler} threads, waits for them to finish, then
 * sorts and exports the collected result.
 */
public class WebCrawler {
	/** Properties file that holds the {@code URL} entry. */
	private static final String CONFIG_PATH = "resource/puzzle.property";
	/** Highest puzzle number that is crawled. */
	private static final int LAST_PUZZLE_INDEX = 1434;
	/** Number of crawler threads the range is divided between. */
	private static final int THREAD_COUNT = 5;

	/**
	 * Runs the crawl. The {@code URL} property must have the form
	 * {@code <address>=<first puzzle number>}; when it is missing or malformed
	 * an error is printed and the program returns without crawling.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		System.out.println("Main start!");

		String cfgUrl = loadConfiguredUrl();
		if (cfgUrl == null) {
			System.err.println("Property URL could not be read from " + CONFIG_PATH);
			return;
		}
		int separator = cfgUrl.indexOf("=");
		if (separator < 0) {
			System.err.println("Property URL has no '=' followed by a start number: " + cfgUrl);
			return;
		}
		String paramUrl = cfgUrl.substring(0, separator);
		int paramIndex;
		try {
			paramIndex = Integer.parseInt(cfgUrl.substring(separator + 1).trim());
		} catch (NumberFormatException exp) {
			System.err.println("Property URL does not end in a number: " + cfgUrl);
			return;
		}

		System.out.println(paramUrl);
		System.out.println(paramIndex);

		PuzzleList locatList = new PuzzleList();
		int span = LAST_PUZZLE_INDEX - paramIndex;
		List<Thread> workers = new ArrayList<Thread>(THREAD_COUNT);

		for (int threadIndex = 1; threadIndex <= THREAD_COUNT; threadIndex++) {
			int startIndex = (int) (paramIndex + (threadIndex - 1) * span / (double) THREAD_COUNT);
			int endIndex = (int) (paramIndex + threadIndex * span / (double) THREAD_COUNT);
			// Every slice after the first begins one past the previous slice's end.
			if (1 != threadIndex) {
				startIndex += 1;
			}
			Thread worker = new Thread(new Crawler(startIndex, endIndex, paramUrl, locatList));
			workers.add(worker);
			worker.start();
		}

		// join() returns as soon as a worker terminates, however it terminates,
		// so there is no polling delay and no risk of waiting on a dead thread.
		for (Thread worker : workers) {
			try {
				worker.join();
			} catch (InterruptedException exp) {
				Thread.currentThread().interrupt();
				System.err.println("Interrupted while waiting for crawlers; nothing exported");
				return;
			}
		}

		locatList.sortResult();
		ExcelUtil.exportExcel(locatList);
		System.out.println("main end!");
	}

	/**
	 * Reads the {@code URL} property from {@link #CONFIG_PATH}.
	 *
	 * @return the property value, or {@code null} when the file cannot be read
	 *         or has no such property
	 */
	private static String loadConfiguredUrl() {
		InputStream inStream = null;
		try {
			inStream = new BufferedInputStream(new FileInputStream(CONFIG_PATH));
			Properties config = new Properties();
			config.load(inStream);
			return config.getProperty("URL");
		} catch (Exception exp) {
			exp.printStackTrace();
			return null;
		} finally {
			if (inStream != null) {
				try {
					inStream.close();
				} catch (java.io.IOException closeExp) {
					closeExp.printStackTrace();
				}
			}
		}
	}
}





  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值