The sites used by the examples in the Lucene 2.0 + Heritrix book (http://mobile.pconline.com.cn/, http://mobile.163.com/) have been redesigned, so the book's examples no longer run. I found another site, http://mobile.younet.com/, built the example against it, and got it working. If you are interested, try it soon; if this site gets redesigned as well, the extractor will have to be rewritten yet again, haha!
The Extractor code for the search example follows (note that it is not identical to the book's version); the complete code is in the attachment:
package com.luceneheritrixbook.extractor.younet;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;

import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;

/**
 * Extractor for product pages mirrored from http://mobile.younet.com/.
 * @author cnyqiao@hotmail.com
 * @date Feb 6, 2009
 */
public class ExtractYounetMoblie extends Extractor {

    @Override
    public void extract() {
        BufferedWriter bw = null;
        // The title lives in <div class="mo_tit">
        NodeFilter title_filter = new AndFilter(new TagNameFilter("div"),
                new HasAttributeFilter("class", "mo_tit"));
        // Each spec row is a <p> containing a <span class="gn_sp1 blue1">
        NodeFilter attribute_filter = new AndFilter(new TagNameFilter("p"),
                new HasChildFilter(new AndFilter(new TagNameFilter("span"),
                        new HasAttributeFilter("class", "gn_sp1 blue1"))));
        // Product images sit inside a <span> wrapping an <img>
        NodeFilter img_filter = new AndFilter(new TagNameFilter("span"),
                new HasChildFilter(new TagNameFilter("img")));

        // Extract the title
        try {
            // The Parser returns every node matching the filter
            NodeList nodeList = this.getParser().parse(title_filter);
            NodeIterator it = nodeList.elements();
            while (it.hasMoreNodes()) {
                Node node = (Node) it.nextNode();
                // Build a unique file name from the title words plus a timestamp
                // (rebuilt per matched node so file names do not accumulate)
                StringBuffer title = new StringBuffer();
                String[] names = node.toPlainTextString().split(" ");
                for (int i = 0; i < names.length; i++)
                    title.append(names[i]).append("-");
                title.append(new Date().getTime());
                // Create the output file
                bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath() + title + ".txt")));
                // Rebuild the page's full URL from its path in the mirror directory
                int startPos = this.getInuputFilePath().indexOf("mirror") + 6;
                String url_seg = this.getInuputFilePath().substring(startPos);
                url_seg = url_seg.replaceAll("\\\\", "/");
                String url = "http:/" + url_seg;
                // Write the full URL of the page being extracted
                bw.write(url + NEWLINE);
                bw.write(names[0] + NEWLINE);
                bw.write(names[1] + NEWLINE);
            }

            // Reset the Parser before reusing it
            this.getParser().reset();
            Parser attNameParser = null;
            Parser attValueParser = null;
            NodeFilter attributeName_filter = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("class", "gn_sp1 blue1"));
            NodeFilter attributeValue_filter = new AndFilter(new TagNameFilter("span"),
                    new HasAttributeFilter("class", "gn_sp2"));
            String attName = "";
            String attValue = "";
            // Walk every spec row
            nodeList = this.getParser().parse(attribute_filter);
            it = nodeList.elements();
            while (it.hasMoreNodes()) {
                Node node = (Node) it.nextNode();
                // Re-parse the row's HTML to pick out the name span and the value span
                attNameParser = new Parser();
                attNameParser.setEncoding("GB2312");
                attNameParser.setInputHTML(node.toHtml());
                NodeList attNameNodeList = attNameParser.parse(attributeName_filter);
                attName = attNameNodeList.elements().nextNode().toPlainTextString();
                attValueParser = new Parser();
                attValueParser.setEncoding("GB2312");
                attValueParser.setInputHTML(node.toHtml());
                NodeList attValueNodeList = attValueParser.parse(attributeValue_filter);
                attValue = attValueNodeList.elements().nextNode().toPlainTextString();
                bw.write(attName.trim() + attValue.trim());
                bw.newLine();
            }

            // Reset the Parser again
            this.getParser().reset();
            String imgUrl = "";
            String fileType = "";
            // Walk every image span
            nodeList = this.getParser().parse(img_filter);
            it = nodeList.elements();
            while (it.hasMoreNodes()) {
                Node node = (Node) it.nextNode();
                ImageTag imgNode = (ImageTag) node.getChildren().elements().nextNode();
                imgUrl = imgNode.getAttribute("src");
                fileType = imgUrl.trim().substring(imgUrl.lastIndexOf(".") + 1);
                // Name the copied image after a hash of its URL
                String new_image_file = StringUtils.encodePassword(imgUrl, HASH_ALGORITHM) + "." + fileType;
                // Copy the image out of the mirror directory under the new name
                this.copyImage(imgUrl, new_image_file);
                bw.write(SEPARATOR + NEWLINE);
                bw.write(new_image_file + NEWLINE);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (bw != null)
                    bw.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
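The three NodeFilter definitions above are the only part tied to the site's markup: the title comes from a div with class mo_tit, each spec row from a p whose child span has class gn_sp1 blue1, and the images from spans wrapping an img tag. The following self-contained sketch shows how such a filter behaves; the HTML fragment is made up to imitate the younet.com title markup:

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;

public class FilterDemo {
    public static void main(String[] args) throws Exception {
        // Made-up fragment imitating the product-title markup on mobile.younet.com
        String html = "<div class=\"mo_tit\">Nokia N85</div>"
                    + "<div class=\"other\">ignored</div>";
        Parser parser = new Parser();
        parser.setInputHTML(html);
        // Same construction as title_filter in the extractor above
        NodeFilter titleFilter = new AndFilter(
                new TagNameFilter("div"),
                new HasAttributeFilter("class", "mo_tit"));
        NodeList nodes = parser.parse(titleFilter);
        for (NodeIterator it = nodes.elements(); it.hasMoreNodes(); )
            System.out.println(it.nextNode().toPlainTextString()); // prints: Nokia N85
    }
}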
Run the Heritrix example from the book with its default settings and crawl the following URIs (analyze and organize the results yourself; a minimal driver sketch follows the list):
http://mobile.younet.com/files/list_1.html
http://mobile.younet.com/files/list_2.html
http://mobile.younet.com/files/list_3.html
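Once the crawl has filled the job's mirror directory, each saved page is fed to the extractor. A minimal driver sketch follows; the setInuputFilePath/setOutputPath setters are hypothetical names paired with the getters used in the code above, and both paths are made up, so check the attachment for the actual base-class API:

public class YounetExtractDriver {
    public static void main(String[] args) {
        ExtractYounetMoblie extractor = new ExtractYounetMoblie();
        // A page saved by Heritrix under the job's mirror directory (made-up path);
        // everything after "mirror" is what the extractor turns back into a URL
        extractor.setInuputFilePath("D:\\heritrix\\jobs\\younet\\mirror\\mobile.younet.com\\files\\list_1.html");
        // Directory for the generated .txt files and copied images (made-up path)
        extractor.setOutputPath("D:\\younet\\result\\");
        extractor.extract();
    }
}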