Crawler.java
package com.web.crawler;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
/**
 * Worker that downloads a contiguous range of puzzle pages and records every
 * filled-in cell it finds in a shared {@link PuzzleList}.
 *
 * <p>Each page is fetched from a hard-coded base address with the puzzle number
 * appended. For every non-empty {@code <td class="white">} cell on the page a
 * {@link PuzzleLocation} (puzzle number, difficulty label, row, column, cell
 * value) is added to the shared list. When the worker ends, for whatever
 * reason, it increments {@code PuzzleList.finishtask} exactly once.
 */
public class Crawler implements Runnable {
    /** Address every request is sent to; the puzzle number is appended. */
    private static final String PAGE_URL = "http://www.menneske.no/arukone/5x5/eng/?number=";

    private static final String DIFFICULTY_PREFIX = "Difficulty: ";
    private static final String DIFFICULTY_SUFFIX = "<br/><a href=";
    private static final String CELL_PREFIX = "<td class=\"white\">";
    private static final String CELL_SUFFIX = "</td>";

    // Compiled once: Pattern instances are immutable and may be shared by all workers.
    private static final Pattern DIFFICULTY_PATTERN =
            Pattern.compile("Difficulty: [A-Za-z ]+<br/><a href=");
    private static final Pattern CELL_PATTERN =
            Pattern.compile("<td class=\"white\">[1-9]*</td>");

    /** Pause after each matched cell, in milliseconds (presumably a throttle; confirm). */
    private static final long CELL_DELAY_MILLIS = 100L;

    private int startIndex;
    private int endIndex;
    // NOTE(review): stored but never read here; requests always go to PAGE_URL.
    private String url;
    private PuzzleList locatList;

    /**
     * @param startIndex first puzzle number to fetch (inclusive)
     * @param endIndex   last puzzle number to fetch (inclusive)
     * @param url        configured base address (currently unused, see field note)
     * @param locatList  shared sink that receives the parsed cells
     */
    public Crawler(int startIndex, int endIndex, String url,
            PuzzleList locatList) {
        this.startIndex = startIndex;
        this.endIndex = endIndex;
        this.url = url;
        this.locatList = locatList;
    }

    /**
     * Fetches and parses every page in {@code [startIndex, endIndex]}.
     *
     * <p>A page that cannot be downloaded is skipped so that one failed request
     * does not abandon the rest of the range. An interrupt stops the worker
     * early and leaves the thread's interrupt flag set.
     */
    public void run() {
        System.out.println("begin run!");
        try {
            for (int urlIndex = this.startIndex; urlIndex <= this.endIndex; urlIndex++) {
                System.out.println("URL_INDEX::" + urlIndex);
                String html = fetchHtml(urlIndex);
                if (html == null) {
                    System.err.println("No content for puzzle " + urlIndex + ", skipping");
                    continue;
                }
                parsePage(urlIndex, html);
            }
        } catch (InterruptedException e) {
            // Restore the flag so whoever owns this thread can observe the interrupt.
            Thread.currentThread().interrupt();
        } catch (RuntimeException e) {
            e.printStackTrace();
        } finally {
            System.out.println("end run!");
            // Always signal completion, even after an unexpected failure.
            locatList.finishtask.incrementAndGet();
        }
    }

    /**
     * Downloads one puzzle page.
     *
     * @param urlIndex puzzle number appended to the page address
     * @return the response body, or {@code null} when the request failed, the
     *         status was not 200, or the response carried no entity
     */
    private String fetchHtml(int urlIndex) {
        HttpClient httpClient = new DefaultHttpClient();
        try {
            HttpResponse response = httpClient.execute(new HttpGet(PAGE_URL + urlIndex));
            if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
                return null;
            }
            HttpEntity entity = response.getEntity();
            return (entity == null) ? null : EntityUtils.toString(entity);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Extracts the difficulty label and all filled cells from one page and adds
     * a {@link PuzzleLocation} per filled cell to the shared list.
     *
     * @param urlIndex puzzle number the page belongs to
     * @param html     page body, never {@code null}
     * @throws InterruptedException if the thread is interrupted during a pause
     */
    private void parsePage(int urlIndex, String html) throws InterruptedException {
        // Local to the page so a page without a label never inherits a previous one.
        String descr = null;
        Matcher matcher = DIFFICULTY_PATTERN.matcher(html);
        while (matcher.find()) {
            // If the label occurs more than once the last occurrence wins.
            String hit = matcher.group();
            descr = hit.substring(DIFFICULTY_PREFIX.length(), hit.indexOf(DIFFICULTY_SUFFIX));
        }

        matcher = CELL_PATTERN.matcher(html);
        int tdIndex = 1; // 1-based position of the cell in document order
        while (matcher.find()) {
            String hit = matcher.group();
            String numberStr = hit.substring(CELL_PREFIX.length(), hit.indexOf(CELL_SUFFIX));
            if (numberStr.length() > 0) {
                int row = getRow(tdIndex);
                int column = getColumn(tdIndex);
                int number = Integer.parseInt(numberStr);
                System.out.println(urlIndex + "," + descr + "," + row + "," + column + "," + number);
                locatList.addLocation(new PuzzleLocation(urlIndex, descr, row, column, number));
            }
            tdIndex++;
            Thread.sleep(CELL_DELAY_MILLIS);
        }
    }

    /** Maps a 1-based cell position to its 1-based row in a 5-column grid. */
    private static int getRow(int tdIndex) {
        if (0 == (tdIndex % 5)) {
            return (tdIndex / 5);
        }
        return ((tdIndex / 5) + 1);
    }

    /** Maps a 1-based cell position to its 1-based column in a 5-column grid. */
    private static int getColumn(int tdIndex) {
        if (0 == (tdIndex % 5)) {
            return 5;
        }
        return (tdIndex % 5);
    }
}
PuzzleList.java
package com.web.crawler;
import java.lang.*;
import java.util.*;
import com.web.crawler.PuzzleLocation;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Holder for the {@link PuzzleLocation} entries collected so far, together
 * with a publicly visible counter that starts at zero.
 */
public class PuzzleList {
    /** Backing list; created with room for 1500 entries. */
    private ArrayList<PuzzleLocation> locatList = new ArrayList<PuzzleLocation>(1500);

    /** Counter exposed to callers; this class only creates it. */
    public AtomicInteger finishtask = new AtomicInteger(0);

    /** Creates an empty list with the counter at zero. */
    public PuzzleList() {
    }

    /**
     * Appends one entry. Synchronized on this instance.
     *
     * @param locat entry to append
     */
    public synchronized void addLocation(PuzzleLocation locat) {
        this.locatList.add(locat);
    }

    /** Sorts the stored entries in place by their natural ordering. */
    public void sortResult() {
        Collections.sort(this.locatList);
    }

    /**
     * @return the backing list itself (not a copy)
     */
    public List<PuzzleLocation> getPuzzleList() {
        return locatList;
    }

    /**
     * Renders one line per entry: index, description, row, column and number
     * separated by commas, each line terminated by a newline.
     */
    public String toString() {
        StringBuilder text = new StringBuilder(102400);
        Iterator<PuzzleLocation> entries = locatList.iterator();
        while (entries.hasNext()) {
            PuzzleLocation entry = entries.next();
            text.append(entry.getIndex()).append(",")
                .append(entry.getDescr()).append(",")
                .append(entry.getRow()).append(",")
                .append(entry.getColumn()).append(",")
                .append(entry.getNumber()).append("\n");
        }
        return text.toString();
    }
}
PuzzleLocation.java
package com.web.crawler;
import java.io.Serializable;
/**
 * Value holder for one cell of one puzzle: the puzzle index, a description
 * string, the cell's row and column, and the number in the cell.
 *
 * <p>Natural ordering compares index, then number, then row, then column.
 * The description does not take part in the ordering.
 */
public class PuzzleLocation implements Comparable<PuzzleLocation>, Serializable {
    private static final long serialVersionUID = 823498623L;

    private int index;
    private String descrp;
    private int row;
    private int column;
    private int number;

    /**
     * @param index  puzzle index
     * @param descrp description string stored as-is
     * @param row    row of the cell
     * @param column column of the cell
     * @param number number in the cell
     */
    public PuzzleLocation(int index, String descrp, int row, int column,
            int number) {
        this.index = index;
        this.descrp = descrp;
        this.row = row;
        this.column = column;
        this.number = number;
    }

    public int getIndex() {
        return index;
    }

    public String getDescr() {
        return descrp;
    }

    public int getRow() {
        return row;
    }

    public int getColumn() {
        return column;
    }

    public int getNumber() {
        return number;
    }

    /**
     * Orders by index, then number, then row, then column.
     *
     * @param dest the location to compare against
     * @return -1, 0 or 1 when this location sorts before, equal to or after
     *         {@code dest}
     */
    public int compareTo(PuzzleLocation dest) {
        int verdict = order(index, dest.index);
        if (verdict == 0) {
            verdict = order(number, dest.number);
        }
        if (verdict == 0) {
            verdict = order(row, dest.row);
        }
        if (verdict == 0) {
            verdict = order(column, dest.column);
        }
        return verdict;
    }

    /** Three-way comparison of two ints yielding exactly -1, 0 or 1. */
    private static int order(int left, int right) {
        if (left > right) {
            return 1;
        }
        return (left < right) ? -1 : 0;
    }
}
ExcelUtil.java
package com.web.crawler;
import java.util.*;
import java.io.*;
/**
 * Writes collected puzzle locations to a file on disk.
 */
public class ExcelUtil {
    /**
     * Writes every entry of {@code locatList} to {@code C:/puzzle.xls},
     * replacing any existing file.
     *
     * <p>The output is tab-separated text encoded as UTF-8, one line per entry
     * in the order index, description, row, column, number. Failures are
     * printed to standard error and not rethrown. The stream is closed even
     * when a write fails part-way through.
     *
     * @param locatList source of the entries to write
     */
    public static void exportExcel(PuzzleList locatList){
        FileOutputStream out = null;
        try {
            File file = new File("C:/puzzle.xls");
            if (file.exists()) {
                // Start from an empty file rather than appending to old output.
                file.delete();
            }
            file.createNewFile();
            out = new FileOutputStream(file, true);
            for (PuzzleLocation locat : locatList.getPuzzleList()) {
                StringBuilder line = new StringBuilder();
                line.append(locat.getIndex()).append("\t");
                line.append(locat.getDescr()).append("\t");
                line.append(locat.getRow()).append("\t");
                line.append(locat.getColumn()).append("\t");
                line.append(locat.getNumber()).append("\n");
                out.write(line.toString().getBytes("utf-8"));
            }
        } catch (Exception exp) {
            exp.printStackTrace();
        } finally {
            if (out != null) {
                try {
                    out.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
}
WebCrawler.java
package com.web.crawler;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.*;
/**
 * Entry point: reads the start address from a properties file, splits the
 * puzzle range across several {@link Crawler} threads, waits for them to
 * finish, then sorts and exports the collected results.
 */
public class WebCrawler {
    /** Properties file holding the {@code URL} entry. */
    private static final String CONFIG_PATH = "resource/puzzle.property";

    /** Number of crawler threads the range is divided between. */
    private static final int THREAD_COUNT = 5;

    // NOTE(review): presumably the highest puzzle number on the site; confirm.
    private static final int LAST_INDEX = 1434;

    /**
     * Runs the crawl.
     *
     * <p>The {@code URL} property must have the form {@code <address>=<number>};
     * the text after the first {@code =} is parsed as the first puzzle number.
     *
     * @param args ignored
     * @throws IllegalStateException if the property is missing or has no
     *         {@code =}
     * @throws NumberFormatException if the part after {@code =} is not an
     *         integer
     */
    public static void main(String[] args) {
        System.out.println("Main start!");
        String cfgUrl = loadConfiguredUrl();
        if (cfgUrl == null) {
            throw new IllegalStateException(
                    "Property URL could not be read from " + CONFIG_PATH);
        }
        int separator = cfgUrl.indexOf("=");
        if (separator < 0) {
            throw new IllegalStateException(
                    "Property URL must contain '=' followed by the start index: " + cfgUrl);
        }
        String paramUrl = cfgUrl.substring(0, separator);
        // trim(): Properties keeps trailing whitespace on values.
        int paramIndex = Integer.parseInt(cfgUrl.substring(separator + 1).trim());
        System.out.println(paramUrl);
        System.out.println(paramIndex);

        PuzzleList locatList = new PuzzleList();
        List<Thread> workers = new ArrayList<Thread>(THREAD_COUNT);
        for (int threadIndex = 1; threadIndex <= THREAD_COUNT; threadIndex++) {
            int startIndex = (int) (paramIndex
                    + (threadIndex - 1) * (LAST_INDEX - paramIndex) / (double) THREAD_COUNT);
            int endIndex = (int) (paramIndex
                    + threadIndex * (LAST_INDEX - paramIndex) / (double) THREAD_COUNT);
            if (1 != threadIndex) {
                // The previous slice already covers its own end index.
                startIndex += 1;
            }
            Thread worker = new Thread(new Crawler(startIndex, endIndex, paramUrl, locatList));
            workers.add(worker);
            worker.start();
        }

        // join() returns as soon as a worker ends, however it ends, so the
        // main thread cannot wait forever on a worker that died early.
        for (Thread worker : workers) {
            try {
                worker.join();
            } catch (InterruptedException exp) {
                Thread.currentThread().interrupt();
                System.err.println("Interrupted while waiting for crawlers; nothing exported");
                return;
            }
        }

        locatList.sortResult();
        ExcelUtil.exportExcel(locatList);
        System.out.println("main end!");
    }

    /**
     * Reads the {@code URL} property from {@link #CONFIG_PATH}.
     *
     * @return the property value, or {@code null} when the file cannot be read
     *         or has no such entry; read failures are printed to standard error
     */
    private static String loadConfiguredUrl() {
        InputStream inStream = null;
        try {
            inStream = new BufferedInputStream(new FileInputStream(CONFIG_PATH));
            Properties config = new Properties();
            config.load(inStream);
            return config.getProperty("URL");
        } catch (java.io.IOException exp) {
            exp.printStackTrace();
            return null;
        } finally {
            if (inStream != null) {
                try {
                    inStream.close();
                } catch (java.io.IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
}