对比一下以前的代码,基本格式都是相同的
http://blog.csdn.net/caoxu1987728/archive/2008/05/06/2402204.aspx
package
my.extractor;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Logger;
import java.util.logging.Level;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.extractor.Extractor;
import org.archive.crawler.extractor.ExtractorHTML;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.io.ReplayCharSequence;
import org.archive.util.HttpRecorder;
import org.archive.util.TextUtils;
public class Mobile163Extractor extends Extractor
{
protected boolean ignoreUnexceptionHTML = true ;
private static final String url163 = " http://mobile.163.com/0011/product/0011000B/special/1/left.html " ;
private static Logger logger = Logger.getLogger(Mobile163Extractor. class .getName());
public Mobile163Extractor(String name)
{
this (name, " Mobile163 extractor.Extracts links from HTML documents " );
}
public Mobile163Extractor(String name,String description)
{
super (name,description);
}
protected void extract(CrawlURI curi)
{
String url = curi.toString();
if (url.equals(url163))
{
ReplayCharSequence cs = null ;
try
{
HttpRecorder hr = curi.getHttpRecorder();
if (hr == null )
{
throw new IOException( " why is recorder null here " );
}
cs = hr.getReplayCharSequence();
}
catch (IOException e)
{
curi.addLocalizedError( this .getName(),e, " failed get of replay char sequence " + curi.toString()
+ "" + e.getMessage());
logger.log(Level.SEVERE, " failed get of replay char sequenece in "
+ Thread.currentThread().getName(),e);
}
if (cs == null )
{
return ;
}
String content = cs.toString();
try
{
BufferedReader reader = new BufferedReader( new StringReader(content));
String line = reader.readLine();
while (line != null )
{
if (line.endsWith( " .html" " ))
{
String fullstr = null ;
fullstr = " http://mobile.163.com " + line.substring(line.indexOf( " url: " ) + 4 ,line.length() - 1 );
addLinkFromString(curi,fullstr, "" ,Link.NAVLINK_HOP);
System.out.println(fullstr);
}
line = reader.readLine();
}
}
catch (Exception e)
{
e.printStackTrace();
}
}
}
private void addLinkFromString(CrawlURI curi,String url,CharSequence context, char hopType)
{
try
{
curi.createAndAddLinkRelativeToBase(url, context.toString(), hopType);
}
catch (URIException e)
{
if (getController() != null )
{
getController().logUriError(e,curi.getUURI(),url);
}
else
{
logger.info( " failed creatAndAddLinkRelativeToBase " + curi + " , " + url + " , " + context + " , " + hopType + " : " + e);
}
}
}
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Logger;
import java.util.logging.Level;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.extractor.Extractor;
import org.archive.crawler.extractor.ExtractorHTML;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.io.ReplayCharSequence;
import org.archive.util.HttpRecorder;
import org.archive.util.TextUtils;
public class Mobile163Extractor extends Extractor
{
protected boolean ignoreUnexceptionHTML = true ;
private static final String url163 = " http://mobile.163.com/0011/product/0011000B/special/1/left.html " ;
private static Logger logger = Logger.getLogger(Mobile163Extractor. class .getName());
public Mobile163Extractor(String name)
{
this (name, " Mobile163 extractor.Extracts links from HTML documents " );
}
public Mobile163Extractor(String name,String description)
{
super (name,description);
}
protected void extract(CrawlURI curi)
{
String url = curi.toString();
if (url.equals(url163))
{
ReplayCharSequence cs = null ;
try
{
HttpRecorder hr = curi.getHttpRecorder();
if (hr == null )
{
throw new IOException( " why is recorder null here " );
}
cs = hr.getReplayCharSequence();
}
catch (IOException e)
{
curi.addLocalizedError( this .getName(),e, " failed get of replay char sequence " + curi.toString()
+ "" + e.getMessage());
logger.log(Level.SEVERE, " failed get of replay char sequenece in "
+ Thread.currentThread().getName(),e);
}
if (cs == null )
{
return ;
}
String content = cs.toString();
try
{
BufferedReader reader = new BufferedReader( new StringReader(content));
String line = reader.readLine();
while (line != null )
{
if (line.endsWith( " .html" " ))
{
String fullstr = null ;
fullstr = " http://mobile.163.com " + line.substring(line.indexOf( " url: " ) + 4 ,line.length() - 1 );
addLinkFromString(curi,fullstr, "" ,Link.NAVLINK_HOP);
System.out.println(fullstr);
}
line = reader.readLine();
}
}
catch (Exception e)
{
e.printStackTrace();
}
}
}
private void addLinkFromString(CrawlURI curi,String url,CharSequence context, char hopType)
{
try
{
curi.createAndAddLinkRelativeToBase(url, context.toString(), hopType);
}
catch (URIException e)
{
if (getController() != null )
{
getController().logUriError(e,curi.getUURI(),url);
}
else
{
logger.info( " failed creatAndAddLinkRelativeToBase " + curi + " , " + url + " , " + context + " , " + hopType + " : " + e);
}
}
}
}
再扩展一下善后工作吧
package
my.postprocessor;
import java.util.logging.Logger;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.postprocessor.FrontierScheduler;
public class FrontierSchedulerFor163Mobile extends FrontierScheduler
{
private static final Logger LOGGER = Logger.getLogger(FrontierSchedulerFor163Mobile. class .getName());
public FrontierSchedulerFor163Mobile(String name)
{
super (name);
}
protected void schedule(CandidateURI caUri)
{
String url = caUri.toString();
try
{
if (url.indexOf( " mobile.163.com/0011/product/0011000B/product " ) !=- 1
|| url.indexOf( " mobile.163.com/0011/product/0011000B/mark " ) !=- 1
|| url.endsWith( " .gif " )
|| url.endsWith( " .jpg " )
|| url.endsWith( " .jpeg " )
|| url.indexOf( " robots.txt " ) !=- 1
|| url.indexOf( " dns " ) !=- 1 )
{
if (url.indexOf( " # " ) !=- 1 )
getController().getFrontier().schedule(caUri)
}
else
{
return ;
}
}
catch (Exception e)
{
e.printStackTrace();
}
}
}
import java.util.logging.Logger;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.postprocessor.FrontierScheduler;
public class FrontierSchedulerFor163Mobile extends FrontierScheduler
{
private static final Logger LOGGER = Logger.getLogger(FrontierSchedulerFor163Mobile. class .getName());
public FrontierSchedulerFor163Mobile(String name)
{
super (name);
}
protected void schedule(CandidateURI caUri)
{
String url = caUri.toString();
try
{
if (url.indexOf( " mobile.163.com/0011/product/0011000B/product " ) !=- 1
|| url.indexOf( " mobile.163.com/0011/product/0011000B/mark " ) !=- 1
|| url.endsWith( " .gif " )
|| url.endsWith( " .jpg " )
|| url.endsWith( " .jpeg " )
|| url.indexOf( " robots.txt " ) !=- 1
|| url.indexOf( " dns " ) !=- 1 )
{
if (url.indexOf( " # " ) !=- 1 )
getController().getFrontier().schedule(caUri)
}
else
{
return ;
}
}
catch (Exception e)
{
e.printStackTrace();
}
}
}
这个就不多说了,与前一篇一个模型。