xml文件读写实例

这是我目前在做的一个项目中用到的 XML 文件读写实现,记录下来以备日后查阅,也供有需要的同学学习参考。

xml文件读写类:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.lt.cj.config.entities.ConfigModel;
import org.lt.cj.config.entities.TMallConfigModel;
import org.lt.cj.core.Seed;

/**
 * Writes a {@link TMallConfigModel} crawl-task configuration to an XML file and
 * reads such a file back into a model, using JDOM.
 *
 * <p>Document layout produced by {@link #buildUpMallDocument}: a {@code website}
 * root (attributes {@code name}, {@code url}) containing, in order,
 * {@code taskName}, {@code seeds}, {@code fitUrls}, {@code workingThreads},
 * {@code pageEncoding}, {@code dwdPhoFlag}, {@code orien_lan}, {@code trans_lan},
 * {@code pageUrlRegs} and {@code pathElements}.
 */
public class XMLConfigWriter {

    /** Shared logger for all read/write failures reported by this class. */
    private static final Logger LOGGER = Logger.getLogger(XMLConfigWriter.class.getName());

    /**
     * Builds the XML document for a Tmall crawl-task configuration.
     *
     * @param missionConfig the configuration to serialize
     * @return the document, or {@code null} when the configuration has no seeds
     * @throws MissionConfigException if {@code missionConfig} is {@code null}
     * @throws EnterUrlsException declared for interface compatibility; not thrown
     *         directly by this method
     */
    public Document buildUpMallDocument(TMallConfigModel missionConfig) throws MissionConfigException, EnterUrlsException {

        if (missionConfig == null) {
            throw new MissionConfigException();
        }
        List<Seed> seedList = missionConfig.getSeeds();
        // A task without entry seeds is not written at all (a null seed list is treated as empty).
        if (seedList == null || seedList.isEmpty()) {
            return null;
        }

        // Root element carrying the website name and address as attributes.
        Element rootElement = new Element("website");
        rootElement.setAttribute("name", missionConfig.getWebsiteName());
        rootElement.setAttribute("url", missionConfig.getWebsiteUrl());

        // Task name.
        rootElement.addContent(textElement("taskName", missionConfig.getTaskName()));

        // Seed list: each seed is written as sortName, seedName, seedUrl.
        Element seeds = new Element("seeds");
        for (Seed seed : seedList) {
            Element seedElement = new Element("seed");
            seedElement.addContent(textElement("sortName", seed.getSortName()));
            seedElement.addContent(textElement("seedName", seed.getSeedName()));
            seedElement.addContent(textElement("seedUrl", seed.getUrl()));
            seeds.addContent(seedElement);
        }
        rootElement.addContent(seeds);

        // Patterns of the URLs that should be followed.
        rootElement.addContent(listElement("fitUrls", "fit_url", missionConfig.getFitUrlRegs()));

        // Number of concurrent worker threads.
        rootElement.addContent(textElement("workingThreads", String.valueOf(missionConfig.getWorkingThreads())));

        // Page encoding.
        rootElement.addContent(textElement("pageEncoding", missionConfig.getPageEncoding()));

        // Flag controlling whether photos are downloaded.
        rootElement.addContent(textElement("dwdPhoFlag", missionConfig.getDwdPhoFlag()));

        // Original language and translation target language.
        rootElement.addContent(textElement("orien_lan", missionConfig.getOrigLanguage()));
        rootElement.addContent(textElement("trans_lan", missionConfig.getTranLanguage()));

        // Patterns of the product pages to extract data from. These come from the
        // page-pattern list itself; the earlier code indexed the fit-URL list here,
        // writing wrong values and failing when the two lists differed in size.
        rootElement.addContent(listElement("pageUrlRegs", "pageUrl", missionConfig.getPageReg()));

        // Extraction paths: one child element per map key, holding one <path> per entry.
        Element pathElements = new Element("pathElements");
        Map<String, List<String>> entityReg = missionConfig.getEntityReg();
        if (entityReg != null) {
            for (Map.Entry<String, List<String>> entry : entityReg.entrySet()) {
                pathElements.addContent(listElement(entry.getKey(), "path", entry.getValue()));
            }
        }
        rootElement.addContent(pathElements);

        return new Document(rootElement);
    }

    /**
     * Writes the document to the given path in JDOM's pretty format, creating or
     * overwriting the file. I/O failures are logged, not propagated.
     *
     * @param document the document to write
     * @param filepath the destination file path
     */
    public void createConfigFile(Document document, String filepath) {

        XMLOutputter xmlOutputter = new XMLOutputter();
        xmlOutputter.setFormat(Format.getPrettyFormat());
        try {
            // FileOutputStream creates the file when it does not exist, so no
            // separate createNewFile() step is needed.
            FileOutputStream fileOutputStream = new FileOutputStream(filepath);
            try {
                xmlOutputter.output(document, fileOutputStream);
            } finally {
                // Always release the file handle, even when output fails.
                fileOutputStream.close();
            }
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, null, e);
        }
    }

    /**
     * Rebuilds the configuration document from the model and rewrites the file.
     * Nothing is written when the model yields no document (no seeds).
     * Configuration errors are logged, not propagated.
     *
     * @param filePath the destination file path
     * @param configModel the configuration; cast to {@link TMallConfigModel}
     */
    public void saveTask(String filePath, ConfigModel configModel) {
        try {
            TMallConfigModel tMallConfigModel = (TMallConfigModel) configModel;
            Document document = buildUpMallDocument(tMallConfigModel);
            if (document != null) {
                createConfigFile(document, filePath);
            }
        } catch (MissionConfigException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } catch (EnterUrlsException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Reads a configuration file into a new model.
     *
     * <p>Elements missing from the file are tolerated: absent scalar elements leave
     * the corresponding model property untouched and absent list containers yield
     * empty lists. Parse and I/O failures are logged and the model populated so far
     * is returned.
     *
     * @param filePath location of the XML file, passed to {@code SAXBuilder.build(String)}
     * @return the populated model, never {@code null}
     */
    public TMallConfigModel readMallDocument(String filePath) {

        TMallConfigModel model = new TMallConfigModel();
        SAXBuilder saxBuilder = new SAXBuilder();
        try {
            // Basic settings stored on the root element.
            Document doc = saxBuilder.build(filePath);
            Element root = doc.getRootElement();
            model.setWebsiteName(root.getAttributeValue("name"));
            model.setWebsiteUrl(root.getAttributeValue("url"));

            String taskName = root.getChildText("taskName");
            if (taskName != null) {
                model.setTaskName(taskName);
            }

            // Entry seeds.
            List<Seed> seedList = new ArrayList<Seed>();
            Element seedsElement = root.getChild("seeds");
            if (seedsElement != null) {
                for (Object child : seedsElement.getChildren()) {
                    seedList.add(readSeed((Element) child));
                }
            }
            model.setSeeds(seedList);

            // Page fragments to extract, and URL patterns to follow.
            model.setExtractHtmlReg(readChildTexts(root, "extractHtml"));
            model.setFitUrlRegs(readChildTexts(root, "fitUrls"));

            // Worker thread count; a malformed number is logged and skipped.
            String workingThreads = root.getChildTextTrim("workingThreads");
            if (workingThreads != null) {
                try {
                    model.setWorkingThreads(Integer.valueOf(workingThreads));
                } catch (NumberFormatException ex) {
                    LOGGER.log(Level.WARNING, "Invalid workingThreads value: " + workingThreads, ex);
                }
            }

            // Page encoding.
            String pageEncoding = root.getChildText("pageEncoding");
            if (pageEncoding != null) {
                model.setPageEncoding(pageEncoding);
            }

            // Photo-download flag; optional, since existing files may not contain it.
            String dwdPhoFlag = root.getChildText("dwdPhoFlag");
            if (dwdPhoFlag != null) {
                model.setDwdPhoFlag(dwdPhoFlag);
            }

            // Original and translation languages.
            String origLanguage = root.getChildText("orien_lan");
            if (origLanguage != null) {
                model.setOrigLanguage(origLanguage);
            }
            String tranLanguage = root.getChildText("trans_lan");
            if (tranLanguage != null) {
                model.setTranLanguage(tranLanguage);
            }

            // Product-page URL patterns.
            model.setPageReg(readChildTexts(root, "pageUrlRegs"));

            // Remaining extraction rules: element name -> texts of its children.
            Map<String, List<String>> entityReg = new HashMap<String, List<String>>();
            Element pathElements = root.getChild("pathElements");
            if (pathElements != null) {
                for (Object child : pathElements.getChildren()) {
                    Element ruleElement = (Element) child;
                    List<String> pathList = new ArrayList<String>();
                    for (Object pathChild : ruleElement.getChildren()) {
                        pathList.add(((Element) pathChild).getText());
                    }
                    entityReg.put(ruleElement.getName(), pathList);
                }
            }
            model.setEntityReg(entityReg);
        } catch (JDOMException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
        return model;
    }

    /** Creates an element with the given name whose content is the given text. */
    private static Element textElement(String name, String text) {
        Element element = new Element(name);
        element.addContent(text);
        return element;
    }

    /** Creates a container element holding one text child per value; null means no children. */
    private static Element listElement(String containerName, String itemName, List<String> values) {
        Element container = new Element(containerName);
        if (values != null) {
            for (String value : values) {
                container.addContent(textElement(itemName, value));
            }
        }
        return container;
    }

    /** Returns the texts of all children of the named container; empty when the container is absent. */
    private static List<String> readChildTexts(Element parent, String containerName) {
        List<String> texts = new ArrayList<String>();
        Element container = parent.getChild(containerName);
        if (container != null) {
            for (Object child : container.getChildren()) {
                texts.add(((Element) child).getText());
            }
        }
        return texts;
    }

    /** Builds a seed from the trimmed seedName, seedUrl and sortName children of a seed element. */
    private static Seed readSeed(Element seedElement) {
        Seed seed = new Seed();
        seed.setSeedName(seedElement.getChildTextTrim("seedName"));
        seed.setUrl(seedElement.getChildTextTrim("seedUrl"));
        seed.setSortName(seedElement.getChildTextTrim("sortName"));
        // NOTE(review): a <parentSeed> child is not mapped. The earlier code parsed it
        // into a Seed that was never attached to anything; how a parent should be
        // linked to its seed is not visible from this class - confirm against Seed.
        return seed;
    }
}


xml文件内容:

<?xml version="1.0" encoding="UTF-8"?>
<website name="taobao_mall" url="http://www.tmall.com/?ver=2011b">
<taskName>caiji_tmall_精品男装_T恤</taskName>
<seeds>
<seed>
<sortName>精品男装/T恤</sortName>
<seedName>精品男装/T恤</seedName>
<seedUrl>http://item.tmall.com/item.htm?id=9351702393</seedUrl>
</seed>
</seeds>
<extractHtml>
<path>div class="list item-view item-miniView"</path>
</extractHtml>
<fitUrls>
<fit_url>http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.*</fit_url>
<fit_url>http://list\.tmall\.com/.*</fit_url>
<fit_url>http://item\.tmall\.com/item\.htm.*</fit_url>
</fitUrls>
<workingThreads>1</workingThreads>
<pageEncoding>UTF-8</pageEncoding>
<orien_lan>zh</orien_lan>
<trans_lan>en</trans_lan>
<pageUrlRegs>
<pageUrl>http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.*</pageUrl>
</pageUrlRegs>
<pathElements>
<commnents>
<path>div class="tb-box tshop-psm tshop-psm-bdetailtabl" id="J_Detail"</path>
<path>div id="reviews" class="J_DetailSection" data-reviewApi</path>
</commnents>
<shopAddr>
<path>div class="clearfix tb-header-nav"</path>
<path>div class="nav"</path>
<path>a href</path>
</shopAddr>
<productDetail>
<path>div id="attributes" class="attributes</path>
<path>ul class="attributes-list</path>
<path>li</path>
</productDetail>
<photosPath>
<path>div class="tb-detail-bd tb-clear"</path>
<path>div class="tb-gallery"</path>
<path>div class="tb-booth tb-pic tb-s310"</path>
<path>img id="J_ImgBooth" src</path>
</photosPath>
<category>
<path>ul class="mallCrumbs-nav" id="J_crumbs"</path>
<path>li class="mallCrumbs-nav-item"</path>
</category>
<countSold>
<path>div class="tb-detail-bd tb-clear"</path>
<path>ul class="tb-meta"</path>
<path>li class="tb-sold-out tb-clear"</path>
</countSold>
<shopInfo>
<path>div class="shop-intro"</path>
<path>div class="extend"</path>
<path>li</path>
</shopInfo>
<despPhos>
<path>script</path>
</despPhos>
<thumbPhosPath>
<path>div class="tb-detail-bd tb-clear"</path>
<path>div class="tb-gallery"</path>
<path>ul id="J_UlThumb" class="tb-thumb tb-clearfix"</path>
<path>img src=</path>
</thumbPhosPath>
<productName>
<path>div class="layout grid-s5m0 "</path>
<path>div class="tb-detail-hd"</path>
<path>a target="_blank" href=</path>
</productName>
<productPrice>
<path>div class="tb-detail-bd tb-clear"</path>
<path>ul class="tb-meta"</path>
<path>li id="J_StrPriceModBox" class="tb-detail-price tb-clearfix"</path>
</productPrice>
</pathElements>
</website>
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值