今天做了个 IMDB 简介抓取功能,用的是 HTML 解析,而不是官方提供的 API;以前用 JavaScript 做过一个。这是第一次用 HTML 解析,可以说是从一窍不通到了略知一二。用了开源的解析库 hpple。新版和旧版有不少出入,绕了很多弯路,才找到了点门道。
效果图:
全部实现就一个方法:
- (IBAction)searchIMDB:(id)sender {
[[self plot]setString:@" "]; //重置简介区的NSTextField
NSString *str=[NSString stringWithFormat:@"http://207.171.166.140/title/%@/",[sender stringValue]]; //形成imdb网址
NSData *siteData = [[NSData alloc] initWithContentsOfURL:[NSURL URLWithString:str]];
if (siteData) {
TFHpple *xpathParser = [[TFHpple alloc] initWithHTMLData:siteData];
NSArray *elementName = [xpathParser searchWithXPathQuery:@"//h1//text()"];
NSArray *elementStoryline1 = [xpathParser searchWithXPathQuery:@"//p[@itemprop='description']//text()"];
NSArray *elementAddPlot = [xpathParser searchWithXPathQuery:@"id('maindetails_center_bottom')/div[5]/span[2]/span[1]/a//text()"];
TFHppleElement *elementN = [elementName objectAtIndex:0];
TFHppleElement *elementS = [elementStoryline1 objectAtIndex:0];
NSString *strName = [[elementN content]stringByTrimmingCharactersInSet: [NSCharacterSet newlineCharacterSet]]; //去除电影名前后的换行符
[[self movie]setStringValue:strName]; //设置电影名,由label显示出来
if ([elementAddPlot count]) { //如果该表达式不为0,表示抓取到了“Add Full Plot”,意味着没有其他版本简介,就继续在本页面提取简介
[[self plot]insertText:@"\n\t◉"]; //格式修饰
NSString *plotShort = [[NSString alloc]init];
plotShort = [[elementS content]stringByTrimmingCharactersInSet: [NSCharacterSet newlineCharacterSet]];;
[[self plot]insertText:plotShort];
} else{ //表示还有其他版本,也就意味着该页面存在“Plot Summary”,去新页面抓取详细简介
NSString *str2=[NSString stringWithFormat:@"http://207.171.166.140/title/%@/plotsummary",[sender stringValue]];
NSData *siteData2 = [[NSData alloc] initWithContentsOfURL:[NSURL URLWithString:str2]];
if (siteData2) {
TFHpple *xpathParser2 = [[TFHpple alloc] initWithHTMLData:siteData2];
NSArray *elementStoryline = [xpathParser2 searchWithXPathQuery:@"id('tn15content')/p//text()"];
[[self plot]insertText:@"\n\t◉"];
if (elementStoryline) {
for (TFHppleElement *element in elementStoryline) {
NSString *plotDetail = [element content];
NSString *plotd=[[NSString alloc]init];
//NSLog(@"t is %@",element);
if ([plotDetail isEqualToString:@"\n"]) { //处理单个的换行符
[[self plot]insertText:@"\n\n\t◉"];
}
else if ([plotDetail hasSuffix:@"\n "]){ //处理“\nWritten by\n ”成“Written by ”。
plotd=[NSString stringWithFormat:@"%@ ",[plotDetail stringByTrimmingCharactersInSet: [NSCharacterSet whitespaceAndNewlineCharacterSet]]];
}else{ //其他情况去掉前后的换行符,两个换行符的字符串被清除
plotd=[plotDetail stringByTrimmingCharactersInSet: [NSCharacterSet newlineCharacterSet]];}
[[self plot]insertText:plotd];
}
}
}
}
}