Like the last one, this is official example code from the Cascading distribution. It is a bit more complex, but I stuck with it and read it through to the end. It chains four flows into one cascade: import the raw crawl data, count words per URL and across all pages, then export both result sets from HDFS back to local text files.
package com.wyf.cascade;

import java.util.Map;
import java.util.Properties;

import cascading.cascade.Cascade;
import cascading.cascade.CascadeConnector;
import cascading.cascade.Cascades;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.Identity;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexGenerator;
import cascading.operation.regex.RegexReplace;
import cascading.operation.regex.RegexSplitter;
import cascading.operation.xml.TagSoupParser;
import cascading.operation.xml.XPathGenerator;
import cascading.operation.xml.XPathOperation;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Lfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;

/**
 * @author: wyf
 * @version: Jul 15, 2013 11:33:04 AM
 */
public class WordCount {

    /**
     * Assembly that imports the raw crawl data.
     *
     * @author: wyf
     * @version: Jul 15, 2013 11:38:02 AM
     */
    @SuppressWarnings("serial")
    private static class ImportCrawlDataAssembly extends SubAssembly {
        public ImportCrawlDataAssembly(String name) {
            // Split each text line into the "url" and "raw" fields
            RegexSplitter regexSplitter = new RegexSplitter(new Fields("url", "raw"));
            Pipe importPipe = new Each(name, new Fields("line"), regexSplitter);
            // Drop all PDF documents
            importPipe = new Each(importPipe, new Fields("url"), new RegexFilter(".*\\.pdf$", true));
            // Replace every ":nl:" with "\n"
            RegexReplace regexReplace = new RegexReplace(new Fields("page"), ":nl:", "\n");
            importPipe = new Each(importPipe, new Fields("raw"), regexReplace, new Fields("url", "page"));

            // This call is mandatory: it registers the tail of the assembly
            setTails(importPipe);
        }
    }

    /**
     * Assembly that splits the stream into the two word counts.
     *
     * @author: wyf
     * @version: Jul 15, 2013 11:38:35 AM
     */
    @SuppressWarnings("serial")
    private static class WordCountSplitAssembly extends SubAssembly {
        public WordCountSplitAssembly(String sourceName, String sinkUrlName, String sinkWordName) {
            // Create a new assembly that counts words across all pages, and per page
            Pipe pipe = new Pipe(sourceName);

            // Convert HTML to XHTML with TagSoup, keeping only the "url" and "xml" fields
            pipe = new Each(pipe, new Fields("page"), new TagSoupParser(new Fields("xml")), new Fields("url", "xml"));

            // Apply an XPath (XML Path Language) expression to the "xml" field to extract the "body" element
            XPathGenerator bodyExtractor = new XPathGenerator(new Fields("body"), XPathOperation.NAMESPACE_XHTML, "//xhtml:body");
            pipe = new Each(pipe, new Fields("xml"), bodyExtractor, new Fields("url", "body"));

            // Apply another XPath expression that strips all element markup, keeping only
            // text nodes and discarding text nodes inside "script" elements
            String elementXPath = "//text()[ name(parent::node()) != 'script']";
            XPathGenerator elementRemover = new XPathGenerator(new Fields("words"), XPathOperation.NAMESPACE_XHTML, elementXPath);
            pipe = new Each(pipe, new Fields("body"), elementRemover, new Fields("url", "words"));

            // Use a regular expression to break the text into individual words, emitting one
            // new tuple per word into the stream with the "url" and "word" fields
            RegexGenerator wordGenerator = new RegexGenerator(new Fields("word"), "(?<!\\pL)(?=\\pL)[^ ]*(?<=\\pL)(?!\\pL)");
            pipe = new Each(pipe, new Fields("words"), wordGenerator, new Fields("url", "word"));

            // Group by "url" and "word" to count each word per page
            Pipe urlCountPipe = new GroupBy(sinkUrlName, pipe, new Fields("url", "word"));
            urlCountPipe = new Every(urlCountPipe, new Fields("url", "word"), new Count(), new Fields("url", "word", "count"));

            // Group by "word" to count each word across all pages
            Pipe wordCountPipe = new GroupBy(sinkWordName, pipe, new Fields("word"));
            wordCountPipe = new Every(wordCountPipe, new Fields("word"), new Count(), new Fields("word", "count"));

            setTails(urlCountPipe, wordCountPipe);
        }
    }

    /**
     * Main entry point.
     *
     * @author: wyf
     * @version: Jul 15, 2013 11:41:30 AM
     */
    public static void main(String[] args) {
        // Register the application jar for the job
        Properties properties = new Properties();
        FlowConnector.setApplicationJarClass(properties, WordCount.class);
        FlowConnector flowConnector = new FlowConnector(properties);

        String inputPath = "/home/wyf/workspace/HadoopCascading/data/url+page.200.txt";
        String tempPath = "/home/wyf/workspace/HadoopCascading/data";
        String tempPath2 = "/home/wyf/workspace/HadoopCascading/data/local";
        String pagesPath = tempPath + "/pages/";
        String urlsPath = tempPath + "/urls/";
        String wordsPath = tempPath + "/words/";
        String localUrlsPath = tempPath2 + "/urls/";
        String localWordsPath = tempPath2 + "/words/";

        // Initialize the pipe assembly that imports the crawl data
        Pipe importPipe = new ImportCrawlDataAssembly("import pipe");

        // Create the tap instances
        Tap localPagesSource = new Lfs(new TextLine(), inputPath);
        Tap importedPages = new Hfs(new SequenceFile(new Fields("url", "page")), pagesPath);

        // Connect the pipe assembly to the tap instances
        Flow importPagesFlow = flowConnector.connect("import pages", localPagesSource, importedPages, importPipe);

        // Split the stream into two pipes: per-url counts and per-word counts
        SubAssembly wordCountPipe = new WordCountSplitAssembly("wordcount pipe", "url pipe", "word pipe");

        // Create Hadoop SequenceFile taps to store the count results
        Tap sinkUrl = new Hfs(new SequenceFile(new Fields("url", "word", "count")), urlsPath);
        Tap sinkWord = new Hfs(new SequenceFile(new Fields("word", "count")), wordsPath);

        // Bind multiple pipes to taps; the map keys are the tail pipe names
        Map<String, Tap> sinks = Cascades.tapsMap(new String[] { "url pipe", "word pipe" }, Tap.taps(sinkUrl, sinkWord));
        Flow count = flowConnector.connect(importedPages, sinks, wordCountPipe);

        // Create an assembly that exports the Hadoop SequenceFiles to local text files
        Pipe exportPipe = new Each("export pipe", new Identity());
        Tap localSinkUrl = new Lfs(new TextLine(), localUrlsPath);
        Tap localSinkWord = new Lfs(new TextLine(), localWordsPath);

        Flow exportFromUrl = flowConnector.connect("export url", sinkUrl, localSinkUrl, exportPipe);
        Flow exportFromWord = flowConnector.connect("export word", sinkWord, localSinkWord, exportPipe);

        // Assemble the flows into a cascade and run it
        Cascade cascade = new CascadeConnector().connect(importPagesFlow, count, exportFromUrl, exportFromWord);
        cascade.complete();
    }
}