【题记】最近在“猪八戒网”上看到一个任务,采集“袖扣网”的所有商品的编号、价格和图片, 图片保存在以商品编号为名称的文件夹下,编号、价格信息写入excel中保存。
目标网站数据量: 商品列表页面559个(截止2011-12-22),每页商品20个,每个商品平均4个图片。总共需要的网络下载次数:559(商品列表页面--parent)+559*20(具体商品页面--leaf)+559*20*4(商品图片)=56459次。
测试下载一个商品页面平均需要14s,单线程情况下预计完成任务52小时,自己使用了100个线程,完成任务耗时8小时。
下面先贴出源代码:供大家来批判,同时提供好的改进优化意见。
DataCollector001.java
//=============================================
package org.study.app.crawler;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import java.util.concurrent.CountDownLatch;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.log4j.Logger;
import org.study.util.excel.ExcelUtil;
import org.study.util.net.URLGetter;
/**
 * Crawls xiukouwang.com and collects, for every product, its item number,
 * its (wholesale) price and all of its pictures.
 *
 * <p>Listing ("parent") pages are pulled from a shared work list by
 * {@link #THREAD_NUM} worker threads. Each listing page yields links to more
 * listing pages (queued) and to product ("leaf") pages (processed right away
 * by the same worker). When all workers are done the collected rows are
 * written to an Excel file.
 *
 * @author Walker Jong
 * @version 1.0, 2011-12-22
 */
public class DataCollector001 extends Thread {
    public static final int THREAD_NUM = 100;
    /** Start of a product-page link in a listing page, including the opening single quote. */
    private static final String leafToken = "'http://www.xiukouwang.com/product";
    /** Start of a listing-page link. */
    private static final String parentToken = "http://www.xiukouwang.com/gallery--b%2C_ANY__t%2C_ANY_-10--";
    /** Markers that precede, in order: the item number, the price, a picture URL. */
    private static final String[] infoKeys = new String[] { "编 号:</span>",
            "class=\"price1\">", "http://www.biilii.com/pic/products/" };
    private static final String endToken = "<";
    /** Only listing links ending with this suffix are followed. */
    private static final String otherToken = "grid.html";
    private static final String defaultDir = "F:/data";
    /** Listing pages waiting to be processed; also the monitor guarding {@link #active}. */
    private static final List<URL> toDo = new Vector<URL>(800);
    /**
     * External form of every listing page already claimed by some thread.
     * Strings are stored instead of URL objects because URL.equals/hashCode
     * may perform host name resolution, which must not happen under a lock.
     */
    private static final Set<String> finished = new HashSet<String>(1000);
    /** Collected rows: {item number, price}. Guarded by its own monitor. */
    private static final List<Object[]> infos = new ArrayList<Object[]>(13000);
    private static final Logger logger = Logger
            .getLogger(DataCollector001.class);
    private static final CountDownLatch cdl = new CountDownLatch(THREAD_NUM);
    /** Number of workers currently processing a listing page; guarded by {@code toDo}. */
    private static int active = 0;

    /**
     * Processes one listing page: extracts the product links and the links to
     * other listing pages, queues the latter and processes the former.
     * A page that was already claimed by any thread is skipped.
     *
     * @param parent URL of the listing page
     */
    public static void processParent(URL parent) {
        // Claim the page atomically so that two threads never process it twice.
        synchronized (finished) {
            if (!finished.add(parent.toExternalForm())) {
                logger.debug("url [" + parent + "] has processed.");
                return;
            }
        }
        logger.info("process parent url [" + parent + "]");

        final List<URL> leafs = new ArrayList<URL>();
        final Set<String> seenLeafs = new HashSet<String>();
        final Set<String> parents = new HashSet<String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(parent.openStream(),
                    "UTF-8"));
            String str;
            boolean leafEnd = false, urlEnd = false;
            while (!urlEnd && (str = br.readLine()) != null) {
                for (int index = 0; index < str.length();) {
                    if (!leafEnd) {
                        // Phase 1: product links.
                        int start = str.indexOf(leafToken, index);
                        if (start != -1) {
                            int end = str.indexOf("'", start + leafToken.length());
                            if (end == -1) {
                                // Link is not terminated on this line; ignore it.
                                break;
                            }
                            // +1 skips the opening quote that is part of leafToken.
                            String urlStr = str.substring(start + 1, end);
                            if (seenLeafs.add(urlStr)) {
                                leafs.add(new URL(urlStr));
                                logger.info("add leaf url[" + urlStr + "]");
                            }
                            index = end + 1;
                        } else if (!leafs.isEmpty()) {
                            // After the product links, the pager marker switches to phase 2.
                            // NOTE(review): this looks for "pagernum" while phase 2 looks
                            // for "pagenum"; one of the two may be a typo - confirm
                            // against the page markup.
                            index = str.indexOf("pagernum");
                            if (index == -1) {
                                break;
                            }
                            leafEnd = true;
                        } else {
                            break;
                        }
                    } else {
                        // Phase 2: links to other listing pages.
                        int start = str.indexOf(parentToken, index);
                        if (start == -1) {
                            if (!parents.isEmpty()
                                    && str.indexOf("pagenum") != -1) {
                                urlEnd = true;
                            }
                            break;
                        }
                        int end = str.indexOf("\"", start + parentToken.length());
                        if (end == -1) {
                            break;
                        }
                        String urlStr = str.substring(start, end);
                        if (urlStr.endsWith(otherToken) && parents.add(urlStr)) {
                            logger.info("add parent url[" + urlStr + "]");
                        }
                        index = end + 1;
                    }
                }// end of for
            }
        } catch (MalformedURLException e) {
            logger.error("malformed link in parent url [" + parent + "]", e);
        } catch (IOException e) {
            logger.error("failed to read parent url [" + parent + "]", e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.error("failed to close parent url [" + parent + "]", e);
                }
            }
        }
        logger.info("leafs.size=" + leafs.size() + ", parents.size="+ parents.size());

        // Queue the listing pages that nobody has claimed yet.
        final List<URL> next = new ArrayList<URL>();
        for (String pStr : parents) {
            try {
                URL url = new URL(pStr);
                boolean done;
                synchronized (finished) {
                    done = finished.contains(url.toExternalForm());
                }
                if (!done) {
                    next.add(url);
                }
            } catch (MalformedURLException e1) {
                logger.error("skip malformed parent url[" + pStr + "]", e1);
            }
        }
        if (!next.isEmpty()) {
            synchronized (toDo) {
                toDo.addAll(next);
                toDo.notifyAll(); // wake idle workers
            }
        }

        // Process the product pages; one broken page must not abort the rest.
        for (URL url : leafs) {
            try {
                processLeaf(url);
            } catch (RuntimeException e) {
                logger.error("failed to process leaf url [" + url + "]", e);
            }
        }
    }

    /**
     * Processes one product page: extracts the item number, the price and the
     * picture URLs, records the {item number, price} row and downloads the
     * pictures into {@code defaultDir/<item number>/}.
     * Pages on which no item number can be found are skipped.
     *
     * @param leaf URL of the product page
     */
    public static void processLeaf(URL leaf) {
        logger.info("process leaf url [" + leaf + "]");
        List<URL> picUrls = new ArrayList<URL>();
        String str = null, id = null, price = null;
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(leaf.openStream(),
                    "UTF-8"));
            int index;
            // Position of the closing quote of the most recently extracted picture URL.
            int lastPicEnd = -1;
            boolean picExist = false, picEnd = false;
            while (!picEnd && (str = br.readLine()) != null) {
                for (index = 0; index < str.length();) {
                    if (id == null) {
                        index = str.indexOf(infoKeys[0]);
                        if (index == -1) {
                            break;
                        }
                        index += infoKeys[0].length();
                        int end = str.indexOf(endToken, index);
                        if (end == -1) {
                            break;
                        }
                        id = str.substring(index, end);
                        index = end + 1;
                    }
                    if (price == null) {
                        index = str.indexOf(infoKeys[1], index);
                        if (index == -1) {
                            break;
                        }
                        index += infoKeys[1].length();
                        // Skip one more opening tag, if any, before the price text.
                        int gt = str.indexOf(">", index);
                        if (gt != -1) {
                            index = gt + 1;
                        }
                        int end = str.indexOf(endToken, index);
                        if (end == -1) {
                            break;
                        }
                        price = str.substring(index, end);
                        index = end + 1;
                    }
                    // Picture URLs.
                    index = str.indexOf(infoKeys[2], index);
                    if (index == -1) {
                        // An unrelated image after the product pictures ends the scan.
                        if (picExist
                                && str.indexOf("img src=", lastPicEnd + 1) != -1) {
                            picEnd = true;
                        }
                        break;
                    }
                    int quote = str.indexOf("\"", index + infoKeys[2].length());
                    if (quote == -1) {
                        break;
                    }
                    String picUrl = str.substring(index, quote);
                    picUrls.add(new URL(picUrl));
                    logger.info("picture url [" + picUrl + "]");
                    picExist = true;
                    lastPicEnd = quote;
                    index = quote + 1;
                }// end of for
            }
        } catch (MalformedURLException e) {
            logger.error("malformed picture url in leaf url [" + leaf + "]", e);
        } catch (IOException e) {
            logger.error("failed to read leaf url [" + leaf + "]", e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    logger.error("failed to close leaf url [" + leaf + "]", e);
                }
            }
        }

        if (id == null) {
            // Without an item number there is neither a row nor a target directory.
            logger.warn("no item number found in leaf url [" + leaf + "], skipped");
            return;
        }
        // Record the row even if the end-of-pictures marker was never seen.
        Object[] objects = new Object[2];
        objects[0] = id;
        objects[1] = price == null ? "" : price;
        synchronized (infos) {
            infos.add(objects);
        }
        logger.info("id = " + id + ", price=" + price);

        // Download the pictures as <id>a.jpg, <id>b.jpg, ...
        char postName = 'a';
        for (URL url : picUrls) {
            URLGetter.download(url, new File(defaultDir + "/" + id, id
                    + postName + ".jpg"));
            postName++;
        }
    }

    /**
     * Writes all collected {item number, price} rows to the Excel file.
     */
    public static void saveDataToExcel() {
        File file = new File("data/袖扣编号价格表.xls");
        String sheetName = "袖扣编号价格表";
        String[] titleNames = new String[] { "编号", "价格" };
        List<Object[]> snapshot;
        synchronized (infos) {
            snapshot = new ArrayList<Object[]>(infos);
        }
        try {
            ExcelUtil.write(file, sheetName, titleNames, snapshot);
        } catch (IOException e) {
            logger.error("failed to write [" + file + "]", e);
        }
    }

    /**
     * Worker loop: takes listing pages from the work list until the list is
     * empty and no other worker is still processing a page (a busy worker may
     * still add new pages). The latch is always counted down on exit.
     */
    @Override
    public void run() {
        try {
            while (true) {
                URL url;
                synchronized (toDo) {
                    while (toDo.isEmpty() && active > 0) {
                        toDo.wait(1000);
                    }
                    if (toDo.isEmpty()) {
                        // Nothing queued and nobody can queue anything: done.
                        return;
                    }
                    url = toDo.remove(toDo.size() - 1);
                    active++;
                }
                try {
                    processParent(url);
                } catch (RuntimeException e) {
                    logger.error("failed to process parent url [" + url + "]", e);
                } finally {
                    synchronized (toDo) {
                        active--;
                        toDo.notifyAll();
                    }
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } finally {
            cdl.countDown();
        }
    }

    /**
     * Processes the first listing page, starts the workers, waits for them and
     * saves the collected data.
     */
    public static void main(String[] args) {
        String parent = "http://www.xiukouwang.com/gallery--b%2C_ANY__t%2C_ANY_-10--1--grid.html";
        try {
            processParent(new URL(parent));
        } catch (MalformedURLException e) {
            logger.error("malformed start url [" + parent + "]", e);
        }
        for (int i = 0; i < THREAD_NUM; i++) {
            new DataCollector001().start();
        }
        // Wait for all workers before writing the data.
        try {
            cdl.await();
        } catch (InterruptedException e) {
            logger.error("interrupted while waiting for the workers", e);
            Thread.currentThread().interrupt();
        }
        saveDataToExcel();
    }
}
URLGetter.java
//==========================================
package org.study.util.net;
import java.io.*;
import java.net.*;
import org.apache.log4j.Logger;
/**
 * Helper for downloading the content of a URL into a local file.
 */
public class URLGetter
{
    private static final Logger logger = Logger.getLogger(URLGetter.class);
    private static final int BUFFER_SIZE = 2048;
    /** Connect and read timeout, so a stalled server cannot block the caller forever. */
    private static final int TIMEOUT_MILLIS = 60 * 1000;
    /** Number used for the next generated default file name; guarded by the class monitor. */
    private static long defaultFileNo = 1;
    private static final String defaultPostName = "txt";

    /**
     * Downloads the given URL into the given file. Missing parent directories
     * are created. I/O errors are logged, not thrown; a file that was only
     * partially written is deleted so that a later run does not mistake it
     * for a finished download.
     *
     * @param url      the URL to download
     * @param file     the target file (may include a path); if {@code null}
     *                 the name is derived from the URL via {@link #getFile(URL)}
     * @param override whether an already existing file is overwritten
     */
    public static void download(URL url, File file, boolean override)
    {
        if (file == null) {
            file = getFile(url);
        }
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        if (file.exists()) {
            logger.info("file["+file+"] has existed.");
            if (!override) {
                return;
            }
            logger.info("overwrite the file["+file+"]");
        }

        BufferedInputStream bis = null;
        BufferedOutputStream bos = null;
        boolean opened = false;   // target file has been opened for writing
        boolean complete = false; // all bytes were copied and flushed
        try {
            URLConnection conn = url.openConnection();
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            bis = new BufferedInputStream(conn.getInputStream());
            bos = new BufferedOutputStream(new FileOutputStream(file));
            opened = true;
            byte[] buffer = new byte[BUFFER_SIZE];
            int len;
            while ((len = bis.read(buffer)) != -1) {
                bos.write(buffer, 0, len);
            }
            bos.flush();
            complete = true;
            logger.info("finished download url["+url+"], file="+file);
        } catch (IOException e) {
            logger.error("failed to download url["+url+"]", e);
        } finally {
            if (bis != null) {
                try {
                    bis.close();
                } catch (IOException e) {
                    logger.error(e);
                }
            }
            if (bos != null) {
                try {
                    bos.close();
                } catch (IOException e) {
                    logger.error(e);
                }
            }
            // Only remove what this call created; never touch the file when
            // the failure happened before it was opened.
            if (opened && !complete && file.exists() && !file.delete()) {
                logger.warn("could not delete incomplete file["+file+"]");
            }
        }
    }

    /**
     * Downloads the given URL into the given file; an existing file is kept.
     *
     * @param url  the URL to download
     * @param file the target file (may include a path)
     */
    public static void download(URL url, File file)
    {
        download(url, file, false);
    }

    /**
     * Downloads the given URL into a file named after the URL's path (see
     * {@link #getFile(URL)}); an existing file is kept.
     *
     * @param url the URL to download
     */
    public static void download(URL url)
    {
        download(url, getFile(url), false);
    }

    /**
     * Derives a local file from the path of the URL. When the path is empty
     * or ends with "/", a generated name "&lt;number&gt;.txt" is appended; the
     * number is incremented on every use so generated names do not collide
     * within one run.
     *
     * @param url the URL whose path is used
     * @return the file corresponding to the URL's path
     */
    public static synchronized File getFile(URL url)
    {
        String path = url.getPath();
        logger.debug("path="+path+" isNull="+(path==null));
        if (path == null) {
            path = "";
        }
        if (path.equals("") || path.endsWith("/")) {
            path = path + defaultFileNo + "." + defaultPostName;
            defaultFileNo++;
        }
        return new File(path);
    }

    /** Small manual check: downloads the Baidu home page. */
    public static void main(String[] args)
    {
        try {
            URL url = new URL("http://www.baidu.com/");
            URLGetter.download(url, null, false);
        } catch (MalformedURLException e1) {
            logger.error(e1);
            return;
        }
    }
}
ExcelUtil.java
//===========================================
package org.study.util.excel;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import org.apache.log4j.Logger;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.*;
/**
 * Helper for writing tabular data into an .xls workbook with one sheet:
 * a title row followed by one row per data entry, all cells as text.
 */
public class ExcelUtil {
    private static final Logger logger = Logger.getLogger(ExcelUtil.class);
    /** Used when no sheet name is given; an empty name is not a valid Excel sheet name. */
    private static final String DEFAULT_SHEET_NAME = "Sheet1";

    /**
     * Writes the data to "data.xls" using the default sheet name.
     *
     * @param titleNames column titles
     * @param data       rows; each array holds the cell values of one row
     * @throws IOException if the file cannot be written
     */
    public static void write(String[] titleNames, List<Object[]> data) throws IOException{
        write("data.xls", DEFAULT_SHEET_NAME, titleNames, data);
    }

    /**
     * Writes the data to the file with the given name.
     *
     * @param fileName   name (may include a path) of the target file
     * @param sheetName  name of the sheet
     * @param titleNames column titles
     * @param data       rows; each array holds the cell values of one row
     * @throws IOException if the file cannot be written
     */
    public static void write(String fileName, String sheetName,
            String[] titleNames, List<Object[]> data) throws IOException{
        write(new File(fileName), sheetName, titleNames, data);
    }

    /**
     * Writes the data to the given file. Missing parent directories are
     * created. Nothing is done when {@code file} is {@code null}.
     *
     * @param file       target file
     * @param sheetName  name of the sheet; {@code null} or empty selects a default name
     * @param titleNames column titles, written to row 0
     * @param data       rows, written from row 1 on; a {@code null} row yields
     *                   an empty row and a {@code null} value an empty cell
     * @throws IOException if the file cannot be written
     */
    public static void write(File file, String sheetName, String[] titleNames, List<Object[]> data) throws IOException{
        if(file == null){
            return;
        }
        File dir = file.getParentFile();
        if(dir != null){
            dir.mkdirs();
        }
        if(sheetName == null || sheetName.length() == 0){
            sheetName = DEFAULT_SHEET_NAME;
        }
        Workbook wb = new HSSFWorkbook();
        Sheet s = wb.createSheet(sheetName);
        for(int i=0; i<titleNames.length; i++){
            s.setColumnWidth(i, 5000);
        }
        CreationHelper createHelper = wb.getCreationHelper();
        // title font and cell font
        Font tf = wb.createFont();
        Font cf = wb.createFont();
        tf.setFontHeightInPoints((short)16);
        tf.setColor(IndexedColors.BLUE.getIndex());
        cf.setFontHeightInPoints((short) 12);
        cf.setColor(IndexedColors.BLACK.getIndex());
        cf.setBoldweight(Font.BOLDWEIGHT_NORMAL);
        // title style and cell style, both formatted as text
        CellStyle cs = wb.createCellStyle();
        CellStyle ts = wb.createCellStyle();
        DataFormat df = wb.createDataFormat();
        ts.setDataFormat(df.getFormat("text"));
        ts.setFont(tf);
        cs.setDataFormat(df.getFormat("text"));
        cs.setFont(cf);
        Row r = null;
        Cell c = null;
        // title row
        r = s.createRow(0);
        for(int i=0; i<titleNames.length; i++){
            c = r.createCell(i);
            c.setCellStyle(ts);
            c.setCellValue(createHelper.createRichTextString(titleNames[i]));
        }
        logger.debug("data size="+data.size());
        // data rows
        for(int i=1; i<=data.size(); i++){
            Object[] objects = data.get(i-1);
            r = s.createRow(i);
            if(objects == null){
                continue;
            }
            for(int j=0; j<objects.length; j++){
                c = r.createCell(j);
                c.setCellStyle(cs);
                String text = objects[j] == null ? "" : objects[j].toString();
                c.setCellValue(createHelper.createRichTextString(text));
            }
        }
        // save; the stream is closed even when writing fails
        OutputStream os = new FileOutputStream(file);
        try {
            wb.write(os);
        } finally {
            os.close();
        }
    }
}
用到的jar包,包括commons-logging.jar, log4j.jar, poi.jar,都是apache的东西。 commons-logging、log4j都是用来记录日志的,poi是用来保存excel用到的组件。
下载地址:
commons-logging: http://commons.apache.org/logging/
log4j.jar: http://logging.apache.org/log4j/
poi.jar: http://poi.apache.org/
自己觉得待改进的地方:
1、多线程的调度和任务分配,以及同步互斥。
2、已经下载链接的去重问题。
3、代码的设计模式,更加清晰易懂,可重用性。
希望有研究的人,给些意见和建议。
注:本文仅供学习研究使用,并无攻击和冒犯“袖扣网”之意,给贵网站带来不便,敬请谅解。