package xstreamTest;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.xml.DomDriver;
/**
 * Small XStream demo: serializes a sample {@link FilterConfig} to
 * {@code xpathConfig.xml} and then reads it back and prints a few fields.
 *
 * NOTE(review): recent XStream releases restrict which types may be
 * deserialized by default; if {@link #read()} fails with a security-related
 * exception, the config classes probably need to be explicitly allowed on the
 * XStream instance — confirm against the XStream version in use.
 */
public class ExtractorXpathConfig {
    /** File (relative to the working directory) used by both write() and read(). */
    private static final String CONFIG_FILE = "xpathConfig.xml";

    /** Character set used for both writing and reading the XML file. */
    private static final Charset CONFIG_CHARSET = Charset.forName("utf-8");

    /**
     * Writes the sample configuration and immediately reads it back.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ExtractorXpathConfig.write();
        ExtractorXpathConfig.read();
    }

    /**
     * Serializes the sample configuration to {@code xpathConfig.xml} as UTF-8.
     * Failures are reported on stderr via the stack trace and otherwise
     * ignored, so this method never throws.
     */
    public static void write() {
        XStream sm = new XStream(new DomDriver());
        FilterConfig fc = buildSampleConfig();
        try {
            OutputStreamWriter writer = new OutputStreamWriter(
                    new FileOutputStream(new File(CONFIG_FILE)), CONFIG_CHARSET);
            try {
                // A real line break, not the two characters "/n": any
                // non-whitespace text ahead of the root element makes the
                // document ill-formed and unreadable by read().
                writer.write("\n");
                sm.toXML(fc, writer);
            } finally {
                // Closing the writer flushes its buffer and closes the file
                // stream underneath, on success and on failure alike.
                writer.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads {@code xpathConfig.xml} back into a {@link FilterConfig} and prints
     * its charset followed by the three URL settings of every extractor entry.
     * Failures are reported on stderr via the stack trace and otherwise
     * ignored, so this method never throws.
     */
    public static void read() {
        XStream sm = new XStream(new DomDriver());
        try {
            InputStreamReader reader = new InputStreamReader(
                    new FileInputStream(new File(CONFIG_FILE)), CONFIG_CHARSET);
            try {
                // Parse through the reader so the bytes are decoded with the
                // same charset that write() used.
                FilterConfig fc = (FilterConfig) sm.fromXML(reader);
                System.out.println(fc.getCharSet());
                for (ExtractorConfig ec : fc.getExtratorConfig()) {
                    System.out.println(ec.getBothUrls());
                    System.out.println(ec.getFetchUrls());
                    System.out.println(ec.getWriteUrls());
                }
            } finally {
                reader.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Builds the hard-coded sample configuration that write() serializes. */
    private static FilterConfig buildSampleConfig() {
        List<SeedConfig> seedConfig = new ArrayList<SeedConfig>();
        seedConfig.add(new SeedConfig("http://www.qq.com/"));
        seedConfig.add(new SeedConfig("http://www.sina.com/"));

        List<ExtractorConfig> extratorConfig = new ArrayList<ExtractorConfig>();
        extratorConfig.add(new ExtractorConfig(
                "只抓取配置div/a/@href|div/h1/a/@href", "只解析配置div/a/@href|",
                "即抓取又解析功能配置div/a/@href"));

        List<WriterConfig> writerConfig = new ArrayList<WriterConfig>();
        writerConfig.add(new WriterConfig("singerName",
                "div/a/@href|div/h1/a/"));
        writerConfig.add(new WriterConfig("singerGender",
                "div/a/@href|div/h1/gender"));
        writerConfig.add(new WriterConfig("singerAge",
                "div/a/@href|div/h1/age"));
        // NOTE(review): same XPath as singerAge — possibly a copy-paste slip;
        // left unchanged, confirm the intended value.
        WriterConfig country = new WriterConfig("singerCountry",
                "div/a/@href|div/h1/age");
        country.setOtherConfig("福建省地方第三方");
        writerConfig.add(country);

        FilterConfig fc = new FilterConfig();
        fc.setExtratorConfig(extratorConfig);
        fc.setSeedConfig(seedConfig);
        fc.setWriterConfig(writerConfig);
        return fc;
    }
}
/**
 * Configuration template class: the root object holding the seed, extractor
 * and writer settings plus a character set name.
 *
 * @author ffshi
 */
class FilterConfig {
    // Field declaration order is kept as-is; it matches the element order of
    // the serialized XML.
    private List<SeedConfig> seedConfig = new ArrayList<SeedConfig>();
    private List<ExtractorConfig> extratorConfig = new ArrayList<ExtractorConfig>();
    private List<WriterConfig> writerConfig = new ArrayList<WriterConfig>();
    private String charSet = "UTF-8";

    /** Creates a configuration with three empty lists and charset "UTF-8". */
    public FilterConfig() {
    }

    /** @return the list of seed page settings */
    public List<SeedConfig> getSeedConfig() {
        return this.seedConfig;
    }

    /** @param seedConfig the list of seed page settings to store */
    public void setSeedConfig(List<SeedConfig> seedConfig) {
        this.seedConfig = seedConfig;
    }

    /** @return the list of extractor (URL filter) settings */
    public List<ExtractorConfig> getExtratorConfig() {
        return this.extratorConfig;
    }

    /** @param extratorConfig the list of extractor settings to store */
    public void setExtratorConfig(List<ExtractorConfig> extratorConfig) {
        this.extratorConfig = extratorConfig;
    }

    /** @return the list of writer (field extraction) settings */
    public List<WriterConfig> getWriterConfig() {
        return this.writerConfig;
    }

    /** @param writerConfig the list of writer settings to store */
    public void setWriterConfig(List<WriterConfig> writerConfig) {
        this.writerConfig = writerConfig;
    }

    /** @return the character set name */
    public String getCharSet() {
        return this.charSet;
    }

    /** @param charSet the character set name to store */
    public void setCharSet(String charSet) {
        this.charSet = charSet;
    }
}
/**
 * Seed page settings bean: wraps a single seed string.
 *
 * @author ffshi
 */
class SeedConfig {
    private String seed;

    /** Creates a bean whose seed is {@code null}. */
    public SeedConfig() {
        this(null);
    }

    /**
     * Creates a bean holding the given seed.
     *
     * @param seed the seed value to store
     */
    public SeedConfig(String seed) {
        this.seed = seed;
    }

    /** @return the stored seed value */
    public String getSeed() {
        return this.seed;
    }

    /** @param seed the seed value to store */
    public void setSeed(String seed) {
        this.seed = seed;
    }
}
/**
 * URL filter configuration class: holds three independent string settings
 * (fetch, write and both).
 *
 * @author ffshi
 */
class ExtractorConfig {
    // Declaration order kept as-is; it matches the element order of the
    // serialized XML.
    private String fetchUrls;
    private String writeUrls;
    private String bothUrls;

    /** Creates a bean with all three settings {@code null}. */
    public ExtractorConfig() {
    }

    /**
     * Creates a bean with all three settings supplied.
     *
     * @param fetchUrls value for the fetch setting
     * @param writeUrls value for the write setting
     * @param bothUrls  value for the both setting
     */
    public ExtractorConfig(String fetchUrls, String writeUrls, String bothUrls) {
        this.fetchUrls = fetchUrls;
        this.writeUrls = writeUrls;
        this.bothUrls = bothUrls;
    }

    /** @return the fetch setting */
    public String getFetchUrls() {
        return this.fetchUrls;
    }

    /** @param fetchUrls the fetch setting to store */
    public void setFetchUrls(String fetchUrls) {
        this.fetchUrls = fetchUrls;
    }

    /** @return the write setting */
    public String getWriteUrls() {
        return this.writeUrls;
    }

    /** @param writeUrls the write setting to store */
    public void setWriteUrls(String writeUrls) {
        this.writeUrls = writeUrls;
    }

    /** @return the both setting */
    public String getBothUrls() {
        return this.bothUrls;
    }

    /** @param bothUrls the both setting to store */
    public void setBothUrls(String bothUrls) {
        this.bothUrls = bothUrls;
    }
}
/**
 * Configuration class for structured extraction: a field name, its XPath
 * expression, an optional extra setting and a boolean flag.
 *
 * @author ffshi
 */
class WriterConfig {
    // Declaration order kept as-is; it matches the element order of the
    // serialized XML.
    private String fieldName;
    private String fieldXpath;
    private String otherConfig;
    private boolean bool;

    /** Creates a bean with all strings {@code null} and the flag {@code false}. */
    public WriterConfig() {
    }

    /**
     * Creates a bean for one field; the extra setting stays {@code null} and
     * the flag stays {@code false}.
     *
     * @param fieldName  name of the field
     * @param fieldXpath XPath expression associated with the field
     */
    public WriterConfig(String fieldName, String fieldXpath) {
        this.fieldName = fieldName;
        this.fieldXpath = fieldXpath;
    }

    /** @return the field name */
    public String getFieldName() {
        return this.fieldName;
    }

    /** @param fieldName the field name to store */
    public void setFieldName(String fieldName) {
        this.fieldName = fieldName;
    }

    /** @return the XPath expression of the field */
    public String getFieldXpath() {
        return this.fieldXpath;
    }

    /** @param fieldXpath the XPath expression to store */
    public void setFieldXpath(String fieldXpath) {
        this.fieldXpath = fieldXpath;
    }

    /** @return the extra setting, or {@code null} if none was set */
    public String getOtherConfig() {
        return this.otherConfig;
    }

    /** @param otherConfig the extra setting to store */
    public void setOtherConfig(String otherConfig) {
        this.otherConfig = otherConfig;
    }

    /** @return the boolean flag */
    public boolean isBool() {
        return this.bool;
    }

    /** @param bool the boolean flag to store */
    public void setBool(boolean bool) {
        this.bool = bool;
    }
}
生成的xml格式如下:
<xstreamTest.FilterConfig>
<seedConfig>
<xstreamTest.SeedConfig>
<seed>http://www.qq.com/</seed>
</xstreamTest.SeedConfig>
<xstreamTest.SeedConfig>
<seed>http://www.sina.com/</seed>
</xstreamTest.SeedConfig>
</seedConfig>
<extratorConfig>
<xstreamTest.ExtractorConfig>
<fetchUrls>只抓取配置div/a/@href|div/h1/a/@href</fetchUrls>
<writeUrls>只解析配置div/a/@href|</writeUrls>
<bothUrls>即抓取又解析功能配置div/a/@href</bothUrls>
</xstreamTest.ExtractorConfig>
</extratorConfig>
<writerConfig>
<xstreamTest.WriterConfig>
<fieldName>singerName</fieldName>
<fieldXpath>div/a/@href|div/h1/a/</fieldXpath>
<bool>false</bool>
</xstreamTest.WriterConfig>
<xstreamTest.WriterConfig>
<fieldName>singerGender</fieldName>
<fieldXpath>div/a/@href|div/h1/gender</fieldXpath>
<bool>false</bool>
</xstreamTest.WriterConfig>
<xstreamTest.WriterConfig>
<fieldName>singerAge</fieldName>
<fieldXpath>div/a/@href|div/h1/age</fieldXpath>
<bool>false</bool>
</xstreamTest.WriterConfig>
<xstreamTest.WriterConfig>
<fieldName>singerCountry</fieldName>
<fieldXpath>div/a/@href|div/h1/age</fieldXpath>
<otherConfig>福建省地方第三方</otherConfig>
<bool>false</bool>
</xstreamTest.WriterConfig>
</writerConfig>
<charSet>UTF-8</charSet>
</xstreamTest.FilterConfig>