问题
用 SAX 解析 content:encoded 时,如果其中含有 HTML 标签,很容易丢失内容。在这里记录下我的解决方案,以防忘记。
内容容易丢失的原因,主要是 SAX 解析器每解析到一个 HTML 标签,就会重新回调 startElement 和 endElement,标签中的文本也会分多次通过 characters 回调传入;如果每次回调都直接覆盖变量,最后只会留下其中一段。
解决方案的关键是:在成员变量中设置一个保存字符串的 content,并在 content:encoded 标签关闭之前持续拼接标签中的内容。
package com.joosee.app.logic;
import java.util.ArrayList;
import java.util.List;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import android.content.Context;
import android.util.Log;
import com.joosee.app.dao.DataFactory;
import com.joosee.app.pojo.Blog;
import com.joosee.app.pojo.Category;
import com.joosee.app.system.util.JooseeRegexUtil;
import com.joosee.app.system.util.MD5;
/**
 * SAX handler that walks an RSS feed and passes every completed {@code <item>}
 * to {@link DataFactory#handleFeed}.
 *
 * <p>A SAX parser is free to deliver the text of one element through several
 * {@link #characters} callbacks. The text of {@code content:encoded},
 * {@code description} and the channel title is therefore accumulated chunk by
 * chunk and only read when the element closes.
 *
 * <p>Parsing is not aborted when the item limit is exceeded or when
 * {@code handleFeed} asks to stop; {@link #endElement} simply ignores every
 * further closing tag.
 */
public class JooseeFeedHandler extends DefaultHandler {

    private Context context;

    // Element-scope flags, raised in startElement and lowered in endElement.
    private boolean in_item = false;
    private boolean in_title = false;
    private boolean in_link = false;
    private boolean in_content = false;
    private boolean in_date = false;
    private boolean in_category = false;
    private boolean in_mainTitle = false;
    private boolean in_description = false;
    private boolean in_url = false;

    private String item_title;
    private List<Blog> li;
    private Blog blog;
    private String title = "";
    // Collects the text of a <title> that is not inside an <item>.
    private StringBuffer buf = new StringBuffer();
    // qName of the most recently opened element; it is NOT reset when that element closes.
    private String currentElement;
    private String pubtime;
    // Text gathered for the current content:encoded element; null until the first chunk arrives.
    private StringBuilder content;
    // Text gathered for the current description element; null until the first chunk arrives.
    private StringBuilder description;
    private Category category;
    private Integer count = 0;
    private Integer Max;
    private String link;
    private DataFactory df;
    // Raw type kept on purpose: the parameter type of DataFactory.handleFeed is not visible here.
    private ArrayList cateList;
    private Boolean isStop = false;

    /**
     * @param c   context used to obtain the {@link DataFactory} instance
     * @param max item limit; once more than {@code max} item titles have been
     *            seen, closing tags are no longer processed
     */
    public JooseeFeedHandler(Context c, Integer max) {
        super();
        this.context = c;
        Max = max;
        df = DataFactory.getDaoFactoryInstance(c);
    }

    /** @return the number of item titles seen so far */
    public int getProgress() {
        return count;
    }

    /**
     * @return the list created in {@link #startDocument()}; this handler never
     *         adds to it, because items are passed to the {@link DataFactory}
     *         instead
     */
    public List<Blog> getParsedData() {
        return li;
    }

    /** @return the trimmed text of the last {@code <title>} closed outside an item */
    public String getRssTitle() {
        return title;
    }

    @Override
    public void startDocument() throws SAXException {
        li = new ArrayList<Blog>();
    }

    @Override
    public void endDocument() throws SAXException {
    }

    /**
     * Records which element is open and prepares the per-element state.
     */
    @Override
    public void startElement(String namespaceURI, String localName,
            String qName, Attributes atts) throws SAXException {
        currentElement = qName;
        if (localName.equals("item")) {
            this.in_item = true;
            blog = new Blog();
            cateList = new ArrayList();
        } else if (localName.equals("title")) {
            if (this.in_item) {
                this.in_title = true;
                count++;
            } else {
                this.in_mainTitle = true;
                // Start from a clean buffer even if an earlier title was never closed out.
                buf.setLength(0);
            }
        } else if (localName.equals("link")) {
            if (this.in_item) {
                this.in_link = true;
            }
        } else if (localName.equals("encoded")) {
            // Marks that we are inside the encoded element.
            if (this.in_item) {
                this.in_content = true;
                // Forget whatever the previous item's content was.
                content = null;
            }
        } else if (localName.equals("pubDate")) {
            if (this.in_item) {
                this.in_date = true;
            }
        } else if (localName.equals("enclosure")) {
            if (this.in_item) {
                this.in_url = true;
            }
        } else if (localName.equals("category")) {
            if (this.in_item) {
                this.in_category = true;
            }
        } else if (localName.equals("description")) {
            if (this.in_item) {
                this.in_description = true;
                // Without this reset every item would inherit the descriptions
                // of all items parsed before it.
                description = null;
            }
        }
    }

    /**
     * Copies the values gathered for the closing element into the current
     * {@link Blog}, and hands the blog to the {@link DataFactory} when the
     * item itself closes.
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName)
            throws SAXException {
        if (count > Max) {
            return;
        }
        if (isStop) {
            return;
        }
        if (localName.equals("item")) {
            this.in_item = false;
            // Insert into the db.
            isStop = df.handleFeed(blog, cateList);
        } else if (localName.equals("title")) {
            if (this.in_item) {
                // NOTE(review): pubtime still holds the previous item's value
                // when <pubDate> follows <title> in the feed — confirm that the
                // hash is meant to be built this way before changing it.
                blog.setTitle(item_title);
                String hashSource = item_title + pubtime;
                blog.setHashCode(MD5.enCode(hashSource));
                this.in_title = false;
            } else {
                title = buf.toString().trim();
                buf.setLength(0);
                this.in_mainTitle = false;
            }
        } else if (localName.equals("encoded")) {
            if (in_item) {
                String text = textOrNull(content);
                if (text != null) {
                    Log.d("blog", text);
                    String imageUrl = JooseeRegexUtil
                            .getImageUrlFromString(text);
                    blog.setCoverImage(imageUrl);
                    blog.setContent(text);
                }
                this.in_content = false;
            }
        } else if (localName.equals("pubDate")) {
            if (in_item) {
                blog.setUpdateTime(pubtime);
                this.in_date = false;
            }
        } else if (localName.equals("category")) {
            if (in_item) {
                cateList.add(category);
                this.in_category = false;
            }
        } else if (localName.equals("link")) {
            if (in_item) {
                blog.setLink(link);
                this.in_link = false;
            }
        } else if (localName.equals("description")) {
            if (in_item) {
                // The description is only a fallback for items that have no content yet.
                String existing = blog.getContent();
                if (existing == null || existing.equals("")) {
                    String text = textOrNull(description);
                    blog.setContent(text);
                    if (text != null) {
                        String imageUrl = JooseeRegexUtil
                                .getImageUrlFromString(text);
                        blog.setCoverImage(imageUrl);
                    }
                }
                this.in_description = false;
            }
        }
    }

    /**
     * Receives one chunk of character data for the element named by
     * {@code currentElement}.
     *
     * <p>Accumulated fields (content, description, channel title) take every
     * chunk, whatever its size. Fields that are overwritten per chunk (item
     * title, pubDate, link) ignore chunks of five characters or fewer, so that
     * short whitespace runs reported after a closing tag do not replace the
     * real value.
     */
    @Override
    public void characters(char ch[], int start, int length)
            throws SAXException {
        if (in_item && "category".equals(currentElement)) {
            if (length > 2) {
                String cateName = new String(ch, start, length);
                category = df.insertCategory(cateName);
            }
        }

        // Keep appending until the encoded element closes.
        // NOTE(review): text inside child elements of content:encoded is still
        // skipped because currentElement then names the child — confirm whether
        // un-escaped markup has to be supported.
        if (in_content && "content:encoded".equals(currentElement)) {
            content = append(content, ch, start, length);
        }
        if (in_description && "description".equals(currentElement)) {
            description = append(description, ch, start, length);
        }
        if (this.in_mainTitle) {
            buf.append(ch, start, length);
        }

        // Short chunks are ignored for the fields below (see method comment).
        // NOTE(review): this also drops genuine values of five characters or
        // fewer; kept as is because the item hash is derived from the title.
        if (length <= 5) {
            return;
        }
        if ("title".equals(currentElement)) {
            item_title = new String(ch, start, length);
        }
        if ("pubDate".equals(currentElement)) {
            pubtime = new String(ch, start, length);
        }
        if ("link".equals(currentElement)) {
            link = new String(ch, start, length);
        }
        super.characters(ch, start, length);
    }

    /** Appends one chunk to {@code target}, creating the builder on first use. */
    private static StringBuilder append(StringBuilder target, char[] ch,
            int start, int length) {
        StringBuilder out = (target != null) ? target : new StringBuilder();
        out.append(ch, start, length);
        return out;
    }

    /** Returns the accumulated text, or null when nothing but whitespace was collected. */
    private static String textOrNull(StringBuilder collected) {
        if (collected == null) {
            return null;
        }
        String text = collected.toString();
        return text.trim().length() == 0 ? null : text;
    }
}