HTML解析最重要的是先看清节点结构:是按标签(如div)取,还是按class属性取。搞清楚结构之后,解析规范的网页都不是什么问题。
如果网页不规范,则要视具体情况而定。
把NSData转成NSString类型的数据
// Decode the bytes held in `data` as a UTF-8 string.
NSString * str = [[NSString alloc]initWithData:data encoding:NSUTF8StringEncoding];
把NSString类型的数据转成DocumentRoot类型的对象(DocumentRoot是第三方类库提供的类,把数据转成这种类型才能用第三方类库进一步解析)
// Parse the HTML string into a DocumentRoot via the parser library's Element class.
DocumentRoot * document = [Element parseHTML:str];
取出所有的DIV:
// Select every div element in the parsed document.
NSArray * elements = [document selectElements:@"div"];
// Walk the divs and keep only those whose class attribute is exactly "item".
for (Element* element in elements) {
    if ([[element attribute:@"class"] isEqualToString:@"item"]) {
        // Handle the matching element here.
    }
}
该块DIV为
<div class="item">
<div class="pic"><a href="http://www.weiphone.com/iPhone/news/2013-08-02/Come_the_Blackberry_company_to_send_BBM_for_iOS_beta_invites_560440.shtml"><img src="http://resource.weiphone.com/resource/h027/h73/img201308021306430.jpg" alt="" height="100" width="158" /></a></div>
<div class="head">
<h3><a href="http://www.weiphone.com/iPhone/news/2013-08-02/Come_the_Blackberry_company_to_send_BBM_for_iOS_beta_invites_560440.shtml">快来了 黑莓公司发送BBM for iOS测试邀请</a></h3>
<div class="meta">
<span class="timer" title="发表时间">2013/08/02 13:05</span>
<span class="line">|</span>
<a href="http://bbs.weiphone.com/u.php?uid=798198" class="author" title="作者"> 黄晓闷</a> <span class="line">|</span>
<a href="javascript:void(0);" class="link" title="文章来源">weiphone</a>
<div class="funs">
<span class="view" title="浏览次数">2689</span>
<span class="line">|</span>
<a href="http://www.weiphone.com/iPhone/news/2013-08-02/Come_the_Blackberry_company_to_send_BBM_for_iOS_beta_invites_560440.shtml#comment" class="cmt" title="评论次数">5</a>
</div>
</div>
</div>
<div class="desc">
<p>威锋网 8 月 2 日消息,黑莓公司日前向 iOS 用户发送了 BBM 的测试邀请,暗示着该服务的正式登陆已经进入最后阶段。</p>
</div>
</div>
把class为item的DIV的下级DIV全部取出来放入childSecond
// Select div elements from childEl and keep them in childSecond
// (presumably its descendant divs -- confirm against the parser library).
NSArray * childSecond = [childEl selectElements:@"div"];
取出标签对应的元素(标签包围的内容可再通过contentsSource取得)
// Select an <a> element from secondEl
// (presumably the first match -- confirm against the parser library).
Element * child = [secondEl selectElement:@"a"];
取标签尖括号里的属性值(例如href)
// Read the href attribute of the selected element and store it in new.detailURL.
new.detailURL = [child.attributes objectForKey:@"href"];
以下是一个完整的解析方法,解析的网页为http://www.weiphone.com/iPhone/news/index_0.shtml的class为item的DIV
这个方法是在下载完成后调用的,传递一个NSData类型的参数进去。
数据模型:
//
// News.h
// LookNewsProject
//
// Created by ibokan on 13-08-01.
// Copyright (c) 2013年 laomaoshiba. All rights reserved.
//
#import <Foundation/Foundation.h>
/// Model object holding the fields parsed for a single news entry.
/// Every field is a copied NSString.
@interface News : NSObject

/// Headline of the entry.
@property (nonatomic, copy) NSString *title;
/// Publication time.
@property (nonatomic, copy) NSString *publishTime;
/// Link to the detail page.
@property (nonatomic, copy) NSString *detailURL;
/// Link to the picture.
@property (nonatomic, copy) NSString *imgURL;
/// View count.
@property (nonatomic, copy) NSString *viewTimes;
/// Comment / evaluation count.
@property (nonatomic, copy) NSString *evaluateTimes;
/// Category.
@property (nonatomic, copy) NSString *category;
/// Author.
@property (nonatomic, copy) NSString *author;
/// Short introduction.
@property (nonatomic, copy) NSString *intro;
/// Source of the article.
@property (nonatomic, copy) NSString *origin;

@end
解析方法:
/**
 * Parses a downloaded news-list page and builds one News object for every
 * div whose class attribute is "item".
 *
 * The code uses manual reference counting (retain/release).
 *
 * @param data Raw page bytes; they are decoded as UTF-8 before parsing.
 *             If decoding fails the method returns without doing anything.
 *
 * NOTE(review): the method returns void and releases the collected array
 * before returning, so the parsed News objects are not handed to any caller.
 * Store or deliver `newArr` before the final release if the results are needed.
 */
-(void) analyNews:(NSData *)data
{
    // Decode the bytes as UTF-8; initWithData:encoding: yields nil on failure,
    // in which case there is nothing to parse.
    NSString * str = [[NSString alloc]initWithData:data encoding:NSUTF8StringEncoding];
    if (str == nil) {
        return;
    }

    // Parse the HTML and select every div in the document.
    DocumentRoot * document = [Element parseHTML:str];
    NSArray * elements = [document selectElements:@"div"];

    // Collects one News object per "item" div.
    NSMutableArray * newArr = [[NSMutableArray alloc]init];

    for (Element * element in elements) {
        if (![[element attribute:@"class"] isEqualToString:@"item"]) {
            continue;
        }

        // One news entity per item block.
        News * news = [[News alloc]init];

        // NOTE(review): this inspects the divs selected from each child of the
        // item element; whether selectElements: can also match the receiver
        // itself is not visible here -- confirm against the parser library.
        for (Element * childEl in [element childElements]) {
            for (Element * secondEl in [childEl selectElements:@"div"]) {
                NSString * cssClass = [secondEl attribute:@"class"];

                if ([cssClass isEqualToString:@"pic"]) {
                    // The anchor carries the detail link; its img carries the picture link.
                    Element * anchor = [secondEl selectElement:@"a"];
                    news.detailURL = [anchor.attributes objectForKey:@"href"];
                    news.imgURL = [[anchor selectElement:@"img"].attributes objectForKey:@"src"];
                }
                else if ([cssClass isEqualToString:@"head"]) {
                    // Title comes from the anchor selected in the head block.
                    news.title = [[secondEl selectElement:@"a"] contentsSource];
                }
                else if ([cssClass isEqualToString:@"meta"]) {
                    // Author comes from the selected anchor, publish time from the selected span.
                    news.author = [[secondEl selectElement:@"a"] contentsSource];
                    news.publishTime = [[secondEl selectElement:@"span"] contentsSource];

                    // The source is taken from the second anchor; guard the index so a
                    // meta block with fewer anchors does not raise NSRangeException.
                    NSArray * anchors = [secondEl selectElements:@"a"];
                    if ([anchors count] > 1) {
                        Element * originEl = [anchors objectAtIndex:1];
                        news.origin = [originEl contentsSource];
                    }
                }
                else if ([cssClass isEqualToString:@"funs"]) {
                    // View count from the span, comment count from the anchor.
                    news.viewTimes = [[secondEl selectElement:@"span"] contentsSource];
                    news.evaluateTimes = [[secondEl selectElement:@"a"] contentsSource];
                }
                else if ([cssClass isEqualToString:@"desc"]) {
                    // Short introduction paragraph.
                    news.intro = [[secondEl selectElement:@"p"] contentsSource];
                }
            }
        }

        [newArr addObject:news];
        // The array retains the object; balance the alloc/init above so it does not leak.
        [news release];
    }

    [str release];
    [newArr release];
}