iOS解析HTML

xml,json都有大量的库来解析,我们如何解析html呢?


TFHpple是一个小型的封装,可以用来解析html,它是对libxml的封装,语法是xpath。
今天我看到一篇直接用libxml来解析html的文章,参看:http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090 ,文中那张图画得一目了然,很值得收藏。不过这篇文章中的源码不能遍历整个html文档,我做了一点修改,可以将html完整地遍历并打印出来:

// Inputs expected to be in scope:
//   NSData *data               - the raw bytes of the HTML document
//   NSStringEncoding encoding  - the NSStringEncoding of the data
//   NSURL *baseURL             - the document's base URL, i.e. its location
//
// Parses the document with libxml2, logs every node in document order
// (opening tag with attributes, text, comments, closing tag) and frees the
// document once the traversal is finished.

// Translate the NSStringEncoding into the IANA charset name libxml2 expects.
// CFStringGetCStringPtr() is allowed to return NULL, so the name is copied into
// a local buffer instead. A NULL `enc` lets libxml2 detect the encoding itself.
CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);
char encBuffer[64];
const char *enc = NULL;

if (cfencstr && CFStringGetCString(cfencstr, encBuffer, sizeof(encBuffer), kCFStringEncodingASCII))
{
    enc = encBuffer;
}

// NSData is not NUL-terminated, so hand libxml2 an explicit length
// (htmlReadMemory) rather than a C string (htmlReadDoc).
htmlDocPtr _htmlDocument = htmlReadMemory((const char *)[data bytes],
                                          (int)[data length],
                                          [[baseURL absoluteString] UTF8String],
                                          enc,
                                          HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

// The document node shares the common node header (type, children, next,
// parent), so it can serve as the root of the traversal.
xmlNodePtr rootNode = (xmlNodePtr)_htmlDocument;
xmlNodePtr currentNode = rootNode;

while (currentNode)
{
    // output the node on the way down
    if (currentNode->type == XML_ELEMENT_NODE)
    {
        NSMutableArray *attrArray = [NSMutableArray array];

        for (xmlAttrPtr attrNode = currentNode->properties; attrNode; attrNode = attrNode->next)
        {
            xmlNodePtr contents = attrNode->children;

            if (contents && contents->content)
            {
                [attrArray addObject:[NSString stringWithFormat:@"%s='%s'", attrNode->name, contents->content]];
            }
            else
            {
                // value-less attribute such as <input disabled>: it has no child node
                [attrArray addObject:[NSString stringWithFormat:@"%s", attrNode->name]];
            }
        }

        NSString *attrString = [attrArray componentsJoinedByString:@" "];

        if ([attrString length])
        {
            attrString = [@" " stringByAppendingString:attrString];
        }

        NSLog(@"<%s%@>", currentNode->name, attrString);
    }
    else if (currentNode->type == XML_TEXT_NODE)
    {
        if (currentNode->content)
        {
            NSLog(@"%@", [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding]);
        }
    }
    else if (currentNode->type == XML_COMMENT_NODE)
    {
        // the comment text lives in ->content; ->name is just the node-type label
        NSLog(@"/* %s */", currentNode->content ? (const char *)currentNode->content : "");
    }

    // depth first: descend into the children before anything else
    if (currentNode->children)
    {
        currentNode = currentNode->children;
        continue;
    }

    // No children left: close this node, then move to the next sibling. If
    // there is none, climb to the parent, close it and try again, until a
    // sibling is found or the climb arrives back at the root.
    while (currentNode)
    {
        // close node
        if (currentNode->type == XML_ELEMENT_NODE)
        {
            NSLog(@"</%s>", currentNode->name);
        }

        if (currentNode == rootNode)
        {
            // back at the starting node: traversal is complete
            currentNode = NULL;
            break;
        }

        if (currentNode->next)
        {
            currentNode = currentNode->next;
            break;
        }

        currentNode = currentNode->parent;
    }
}

// Free the document only after the traversal; every node visited above is
// owned by it.
if (_htmlDocument)
{
    xmlFreeDoc(_htmlDocument);
}


不过我还是喜欢用TFHpple,因为它很简单,也好用,但是它的功能不是很完善。比如,不能获取children node,我就写了两个方法,一个是获取children node,一个是获取所有的contents。还有,node属性内容(attribute content)的key与node自身内容(node content)的key一样,都是@"nodeContent",而正确情况下属性内容的key应该是@"attributeContent"。
所以我写了下面这个方法,同时修改了node属性的content key:
/**
 * Recursively converts a libxml2 node into a dictionary.
 *
 * Keys written into the returned dictionary:
 *   @"nodeName"            - the node's name
 *   @"nodeAttributeArray"  - array of attribute dictionaries, each holding
 *                            @"attributeName" and @"attributeContent"
 *   @"nodeChildArray"      - array of dictionaries for the child nodes
 *
 * A text node is not returned as a dictionary of its own. Its text is stored
 * into parentResult instead: under @"nodeContent" when the parent is an
 * element, or (whitespace-trimmed) under @"attributeContent" when the parent
 * is an attribute. The function returns nil in both cases.
 *
 * @param currentNode   The node to convert; NULL yields nil.
 * @param parentResult  The dictionary of the parent node or attribute, which
 *                      receives the text of text nodes.
 * @return The dictionary for currentNode, or nil if the node was folded into
 *         parentResult.
 */
NSDictionary *DictionaryForNode2(xmlNodePtr currentNode, NSMutableDictionary *parentResult)
{
    if (currentNode == NULL)
    {
        return nil;
    }

    NSMutableDictionary *resultForNode = [NSMutableDictionary dictionary];

    if (currentNode->name)
    {
        NSString *currentNodeName =
            [NSString stringWithCString:(const char *)currentNode->name encoding:NSUTF8StringEncoding];

        // stringWithCString:encoding: returns nil for bytes that are not valid
        // UTF-8, and setObject:forKey: raises on a nil object.
        if (currentNodeName)
        {
            [resultForNode setObject:currentNodeName forKey:@"nodeName"];
        }
    }

    // Text nodes are folded into the parent's dictionary. The parent is checked
    // for NULL so a detached text node cannot cause a NULL dereference.
    if (currentNode->content && currentNode->type == XML_TEXT_NODE && currentNode->parent)
    {
        NSString *currentNodeContent =
            [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding];

        if (currentNode->parent->type == XML_ELEMENT_NODE)
        {
            if (currentNodeContent)
            {
                [parentResult setObject:currentNodeContent forKey:@"nodeContent"];
            }
            return nil;
        }

        if (currentNode->parent->type == XML_ATTRIBUTE_NODE)
        {
            NSString *trimmedContent =
                [currentNodeContent stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];

            if (trimmedContent)
            {
                [parentResult setObject:trimmedContent forKey:@"attributeContent"];
            }
            return nil;
        }
    }

    xmlAttr *attribute = currentNode->properties;
    if (attribute)
    {
        NSMutableArray *attributeArray = [NSMutableArray array];
        while (attribute)
        {
            NSMutableDictionary *attributeDictionary = [NSMutableDictionary dictionary];

            if (attribute->name)
            {
                NSString *attributeName =
                    [NSString stringWithCString:(const char *)attribute->name encoding:NSUTF8StringEncoding];
                if (attributeName)
                {
                    [attributeDictionary setObject:attributeName forKey:@"attributeName"];
                }
            }

            if (attribute->children)
            {
                // A text child stores its value straight into attributeDictionary
                // and yields nil; any other kind of child comes back as a dictionary.
                NSDictionary *childDictionary = DictionaryForNode2(attribute->children, attributeDictionary);
                if (childDictionary)
                {
                    [attributeDictionary setObject:childDictionary forKey:@"attributeContent"];
                }
            }

            if ([attributeDictionary count] > 0)
            {
                [attributeArray addObject:attributeDictionary];
            }
            attribute = attribute->next;
        }

        if ([attributeArray count] > 0)
        {
            [resultForNode setObject:attributeArray forKey:@"nodeAttributeArray"];
        }
    }

    xmlNodePtr childNode = currentNode->children;
    if (childNode)
    {
        NSMutableArray *childContentArray = [NSMutableArray array];
        while (childNode)
        {
            // Text children write into resultForNode and return nil, so only
            // non-text children end up in the child array.
            NSDictionary *childDictionary = DictionaryForNode2(childNode, resultForNode);
            if (childDictionary)
            {
                [childContentArray addObject:childDictionary];
            }
            childNode = childNode->next;
        }

        if ([childContentArray count] > 0)
        {
            [resultForNode setObject:childContentArray forKey:@"nodeChildArray"];
        }
    }

    return resultForNode;
}

TFHppleElement.m里加了两个key 常量
// Key under which an attribute dictionary stores the attribute's value; matches the
// @"attributeContent" literal written by DictionaryForNode2 and is read by -attributes.
NSString * const TFHppleNodeAttributeContentKey = @"attributeContent";  
// Key under which a node dictionary stores the array of its child node dictionaries;
// matches the @"nodeChildArray" literal written by DictionaryForNode2.
NSString * const TFHppleNodeChildArrayKey = @"nodeChildArray";

并修改获取属性方法为:
/**
 * Returns the attributes of this element as a dictionary mapping each
 * attribute name to its content.
 *
 * Attribute dictionaries without a name are skipped. An attribute without
 * content (for example a value-less attribute such as <input disabled>) is
 * mapped to the empty string, because -setObject:forKey: raises when given nil.
 *
 * @return A dictionary of attribute name -> attribute content; empty when the
 *         node has no attributes.
 */
- (NSDictionary *) attributes
{
    NSMutableDictionary *translatedAttributes = [NSMutableDictionary dictionary];

    for (NSDictionary *attributeDict in [node objectForKey:TFHppleNodeAttributeArrayKey])
    {
        id attributeName = [attributeDict objectForKey:TFHppleNodeAttributeNameKey];
        if (!attributeName)
        {
            continue;
        }

        id attributeContent = [attributeDict objectForKey:TFHppleNodeAttributeContentKey];
        [translatedAttributes setObject:(attributeContent ? attributeContent : @"")
                                 forKey:attributeName];
    }

    return translatedAttributes;
}

并添加获取children node 方法:
/**
 * Tells whether the node dictionary holds an entry for the child array key.
 *
 * @return YES if an object is stored under TFHppleNodeChildArrayKey, NO otherwise.
 */
- (BOOL) hasChildren
{
    return ([node objectForKey:TFHppleNodeChildArrayKey] != nil) ? YES : NO;
}

/**
 * Returns the object stored under TFHppleNodeChildArrayKey.
 *
 * @return The stored child array when -hasChildren reports YES, otherwise nil.
 */
- (NSArray *) children
{
    NSArray *childNodes = nil;

    if ([self hasChildren])
    {
        childNodes = [node objectForKey:TFHppleNodeChildArrayKey];
    }

    return childNodes;
}

 

参看:http://giles-wang.blogspot.com/2011/08/iphoneansi.html

原文:http://blog.csdn.net/favormm/article/details/6794487

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值