Cascading (Part 3): Word Counting

This is again official example code, and a bit more involved than the previous ones, but worth working through to the end: it imports crawled pages, counts words both per URL and across all pages, exports the results to local text files, and wires the four flows into a single cascade.

package com.wyf.cascade;

import java.util.Map;
import java.util.Properties;

import cascading.cascade.Cascade;
import cascading.cascade.CascadeConnector;
import cascading.cascade.Cascades;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.Identity;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexGenerator;
import cascading.operation.regex.RegexReplace;
import cascading.operation.regex.RegexSplitter;
import cascading.operation.xml.TagSoupParser;
import cascading.operation.xml.XPathGenerator;
import cascading.operation.xml.XPathOperation;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Lfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;

/**
 * @author: wyf
 * @version: Jul 15, 2013 11:33:04 AM
 */
public class WordCount {

    /**
     * Assembly that imports the crawl data.
     * 
     * @author: wyf
     * @version: Jul 15, 2013 11:38:02 AM
     */
    @SuppressWarnings("serial")
    private static class ImportCrawlDataAssembly extends SubAssembly {
        public ImportCrawlDataAssembly(String name) {
            // split each text line into "url" and "raw" fields
            RegexSplitter regexSplitter = new RegexSplitter(new Fields("url", "raw"));
            Pipe importPipe = new Each(name, new Fields("line"), regexSplitter);
            // discard all PDF documents
            importPipe = new Each(importPipe, new Fields("url"), new RegexFilter(".*\\.pdf$", true));
            // replace the ":nl:" markers with real newlines ("\n")
            RegexReplace regexReplace = new RegexReplace(new Fields("page"), ":nl:", "\n");
            importPipe = new Each(importPipe, new Fields("raw"), regexReplace, new Fields("url", "page"));

            // required: registers the tail pipe of this sub-assembly
            setTails(importPipe);
        }
    }
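
    // For intuition (the sample line below is an assumption, not taken from
    // the real data set): an input line such as
    //   "http://example.com/a.html<TAB><html>:nl:<body>hi</body>:nl:</html>"
    // is split on the tab (RegexSplitter's default delimiter) into "url" and
    // "raw", non-PDF URLs pass the filter, and RegexReplace turns the ":nl:"
    // markers back into real newlines, leaving tuples with fields ("url", "page").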

    /**
     * Assembly that splits the stream into the two word counts.
     * 
     * @author: wyf
     * @version: Jul 15, 2013 11:38:35 AM
     */
    @SuppressWarnings("serial")
    private static class WordCountSplitAssembly extends SubAssembly {
        public WordCountSplitAssembly(String sourceName, String sinkUrlName, String sinkWordName) {
            // create a new assembly that counts words across all pages and within each page
            Pipe pipe = new Pipe(sourceName);

            // use TagSoup to convert the HTML into XHTML; keep only the "url" and "xml" fields and drop the rest
            pipe = new Each(pipe, new Fields("page"), new TagSoupParser(new Fields("xml")), new Fields("url", "xml"));

            // apply an XPath (XML Path Language) expression to the "xml" field to extract the "body" element
            XPathGenerator bodyExtractor = new XPathGenerator(new Fields("body"), XPathOperation.NAMESPACE_XHTML, "//xhtml:body");
            pipe = new Each(pipe, new Fields("xml"), bodyExtractor, new Fields("url", "body"));

            // apply another XPath expression that strips all elements and keeps only text nodes,
            // discarding text nodes that sit inside "script" elements
            String elementXPath = "//text()[ name(parent::node()) != 'script']";
            XPathGenerator elementRemover = new XPathGenerator(new Fields("words"), XPathOperation.NAMESPACE_XHTML, elementXPath);
            pipe = new Each(pipe, new Fields("body"), elementRemover, new Fields("url", "words"));

            // use a regular expression to break the text into individual words,
            // emitting one new tuple per word into the stream with "url" and "word" fields
            RegexGenerator wordGenerator = new RegexGenerator(new Fields("word"), "(?<!\\pL)(?=\\pL)[^ ]*(?<=\\pL)(?!\\pL)");
            pipe = new Each(pipe, new Fields("words"), wordGenerator, new Fields("url", "word"));

            // count words per URL: group on "url" and "word", then count each group
            Pipe urlCountPipe = new GroupBy(sinkUrlName, pipe, new Fields("url", "word"));
            urlCountPipe = new Every(urlCountPipe, new Fields("url", "word"), new Count(), new Fields("url", "word", "count"));

            // count words globally: group on "word", then count each group
            Pipe wordCountPipe = new GroupBy(sinkWordName, pipe, new Fields("word"));
            wordCountPipe = new Every(wordCountPipe, new Fields("word"), new Count(), new Fields("word", "count"));

            setTails(urlCountPipe, wordCountPipe);
        }
    }
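
    // Note: the names passed in as sinkUrlName and sinkWordName become the names
    // of the two GroupBy tails; main() binds a sink tap to each of exactly these
    // names via Cascades.tapsMap(), which is how one assembly feeds two sinks.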

    /**
     * Main entry point.
     * 
     * @author: wyf
     * @version: Jul 15, 2013 11:41:30 AM
     */
    public static void main(String[] args) {
        // set the application jar this job runs from
        Properties properties = new Properties();
        FlowConnector.setApplicationJarClass(properties, WordCount.class);
        FlowConnector flowConnector = new FlowConnector(properties);

        String inputPath = "/home/wyf/workspace/HadoopCascading/data/url+page.200.txt";
        String tempPath = "/home/wyf/workspace/HadoopCascading/data";
        String tempPath2 = "/home/wyf/workspace/HadoopCascading/data/local";
        String pagesPath = tempPath + "/pages/";
        String urlsPath = tempPath + "/urls/";
        String wordsPath = tempPath + "/words/";
        String localUrlsPath = tempPath2 + "/urls/";
        String localWordsPath = tempPath2 + "/words/";

        // instantiate the pipe assembly that imports the crawl data
        Pipe importPipe = new ImportCrawlDataAssembly("import pipe");

        // create the tap instances
        Tap localPagesSource = new Lfs(new TextLine(), inputPath);
        Tap importedPages = new Hfs(new SequenceFile(new Fields("url", "page")), pagesPath);

        // connect the pipe assembly to the tap instances, forming a flow
        Flow importPagesFlow = flowConnector.connect("import pages", localPagesSource, importedPages, importPipe);

        // the assembly defined above splits one pipe into two
        SubAssembly wordCountPipe = new WordCountSplitAssembly("wordcount pipe", "url pipe", "word pipe");

        // create Hadoop SequenceFile taps to store the count results
        Tap sinkUrl = new Hfs(new SequenceFile(new Fields("url", "word", "count")), urlsPath);
        Tap sinkWord = new Hfs(new SequenceFile(new Fields("word", "count")), wordsPath);

        // bind multiple pipes to taps; the keys here are the pipe names
        Map<String, Tap> sinks = Cascades.tapsMap(new String[] { "url pipe", "word pipe" }, Tap.taps(sinkUrl, sinkWord));
        Flow count = flowConnector.connect(importedPages, sinks, wordCountPipe);

        // create an assembly that exports the Hadoop SequenceFiles to local text files
        Pipe exportPipe = new Each("export pipe", new Identity());
        Tap localSinkUrl = new Lfs(new TextLine(), localUrlsPath);
        Tap localSinkWord = new Lfs(new TextLine(), localWordsPath);

        Flow exportFromUrl = flowConnector.connect("export url", sinkUrl, localSinkUrl, exportPipe);
        Flow exportFromWord = flowConnector.connect("export word", sinkWord, localSinkWord, exportPipe);

        // assemble the flows into a cascade and run it
        Cascade cascade = new CascadeConnector().connect(importPagesFlow, count, exportFromUrl, exportFromWord);
        cascade.complete();
    }
}
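
The trickiest piece above is the word-extraction pattern handed to RegexGenerator. Below is a minimal sketch, using only the JDK's java.util.regex and no Cascading dependency, of what that pattern matches; the class name and sample sentence are assumptions for illustration only.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Minimal sketch: exercises the same regex RegexGenerator uses above.
// The lookarounds force each match to start and end on a letter (\pL) with
// non-letters on either side, so surrounding punctuation and digits are trimmed.
public class WordPatternDemo {
    public static void main(String[] args) {
        Pattern word = Pattern.compile("(?<!\\pL)(?=\\pL)[^ ]*(?<=\\pL)(?!\\pL)");
        Matcher m = word.matcher("Hello, Cascading world! 42 pages");
        while (m.find()) {
            System.out.println(m.group()); // prints Hello, Cascading, world, pages
        }
    }
}

One more thing worth noting: the order in which the four flows are passed to CascadeConnector.connect() does not matter. The Cascade schedules flows by their source and sink dependencies, so "import pages" runs first, the count flow second, and the two exports last.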

Reposted from: https://www.cnblogs.com/geopanda/p/3192931.html
