Baidu's C++ Extension on Hadoop

Hadoop Map/Reduce

Hadoop C++ Extension

Created: 07/Dec/09 08:09 AM   Updated: Thursday 07:38 PM
Component/s: task
Affects Version/s: 0.20.1
Fix Version/s: None

 

Time Tracking:
Not Specified

 

<script> /* This table sortting is copy from http://www.js-vault.us hle added the ts_sort_jira_date to sort the jira date in the form YYYY-MM-DD HH:mm a z For example 2008-6-15 11:15 AM PST Look at the sortables_init function to customize the default sortting (column, order) */ addEvent(window, "load", sortables_init); var SORT_COLUMN_INDEX; function sortables_init() { // Find all tables with class sortable and make them sortable var sorted = 1; // 1 to sort table when first loaded, 0 otherwise. var col = 2; // column to sort by default (0,1,2...) with 0 being the first column var order = "DES" // "ASC" for ascending order, "DES" for descending order if (!document.getElementsByTagName) return; tbls = document.getElementsByTagName("table"); for (ti=0;ti<tbls.length;ti++) { thisTbl = tbls[ti]; if (((' '+thisTbl.className+' ').indexOf("sortable") != -1) && (thisTbl.id)) { //initTable(thisTbl.id); ts_makeSortable(thisTbl,sorted,col,order); } } } function ts_makeSortable(table,sorted,col,order) { if (table.rows && table.rows.length > 0) { var firstRow = table.rows[0]; } if (!firstRow) return; // We have a first row: assume it's the header, and make its contents clickable links // We don't want to sort the first and last column File Type and File Size for (var i=0;i<firstRow.cells.length;i++) { var cell = firstRow.cells[i]; var txt = ts_getInnerText(cell); if(cell.className !== "sorttable_nosort") cell.innerHTML = '<a href="#" class="sortheader" οnclick="ts_resortTable(this);return false;">'+txt+'<span class="sortarrow">&nbsp;&nbsp;&nbsp;</span></a>'; if(i==col) lnk = cell.childNodes[0]; } if(sorted==1) ts_resortTable(lnk,sorted,col,order); } function ts_getInnerText(el) { if (typeof el == "string") return el; if (typeof el == "undefined") { return el }; if (el.innerText) return el.innerText; //Not needed but it is faster var str = ""; var cs = el.childNodes; var l = cs.length; for (var i = 0; i < l; i++) { switch (cs[i].nodeType) { case 1: //ELEMENT_NODE str += 
ts_getInnerText(cs[i]); break; case 3: //TEXT_NODE str += cs[i].nodeValue; break; } } return str; } function ts_resortTable(lnk,sorted,col,order) { // get the span var span; for (var ci=0;ci<lnk.childNodes.length;ci++) { if (lnk.childNodes[ci].tagName && lnk.childNodes[ci].tagName.toLowerCase() == 'span') span = lnk.childNodes[ci]; } var spantext = ts_getInnerText(span); var td = lnk.parentNode; var table = getParent(td,'TABLE'); if(sorted==1) { var column = col } else { var column = td.cellIndex; } // Work out a type for the column if (table.rows.length <= 1) return; var itm = ts_getInnerText(table.rows[1].cells[column]); sortfn = ts_sort_caseinsensitive; if (itm.match(/^/d+[//-]/d+[//-]/d+/s+/d+:/d+/s+/D+/s+/D+$/)) sortfn = ts_sort_jira_date; if (itm.match(/^/d+[//-]/d+[//-]/d+/s+/d+:/d+/s+/D+$/)) sortfn = ts_sort_jira_date; if (itm.match(/^/d+[//-]/d+[//-]/d+/s+/d+:/d+$/)) sortfn = ts_sort_jira_date; if (itm.match(/^/d/d[//-]/d/d[//-]/d/d/d/d$/)) sortfn = ts_sort_date; if (itm.match(/^/d/d[//-]/d/d[//-]/d/d$/)) sortfn = ts_sort_date; if (itm.match(/^[£$]/)) sortfn = ts_sort_currency; if (itm.match(/^[/d/.]+$/)) sortfn = ts_sort_numeric; SORT_COLUMN_INDEX = column; var firstRow = new Array(); var newRows = new Array(); for (i=0;i<table.rows[0].length;i++) { firstRow[i] = table.rows[0][i]; } for (j=1;j<table.rows.length;j++) { newRows[j-1] = table.rows[j]; } newRows.sort(sortfn); if(order=="ASC") { if (span.getAttribute("sortdir") == 'down') { ARROW = '&nbsp;&nbsp;&uarr;'; newRows.reverse(); span.setAttribute('sortdir','up'); } else { ARROW = '&nbsp;&nbsp;&darr;'; span.setAttribute('sortdir','down'); } } else { if (span.getAttribute("sortdir") == 'up') { ARROW = '&nbsp;&nbsp;&darr;'; span.setAttribute('sortdir','down'); } else { ARROW = '&nbsp;&nbsp;&uarr;'; newRows.reverse(); span.setAttribute('sortdir','up'); } } // We appendChild rows that already exist to the tbody, so it moves them rather than creating new ones // don't do sortbottom rows for 
(i=0;i<newRows.length;i++) { if (!newRows[i].className || (newRows[i].className && (newRows[i].className.indexOf('sortbottom') == -1))) table.tBodies[0].appendChild(newRows[i]);} // do sortbottom rows only for (i=0;i<newRows.length;i++) { if (newRows[i].className && (newRows[i].className.indexOf('sortbottom') != -1)) table.tBodies[0].appendChild(newRows[i]);} // Delete any other arrows there may be showing var allspans = document.getElementsByTagName("span"); for (var ci=0;ci<allspans.length;ci++) { if (allspans[ci].className == 'sortarrow') { if (getParent(allspans[ci],"table") == getParent(lnk,"table")) { // in the same table as us? allspans[ci].innerHTML = '&nbsp;&nbsp;&nbsp;'; } } } span.innerHTML = ARROW; } function getParent(el, pTagName) { if (el == null) return null; else if (el.nodeType == 1 && el.tagName.toLowerCase() == pTagName.toLowerCase()) // Gecko bug, supposed to be uppercase return el; else return getParent(el.parentNode, pTagName); } function ts_sort_date(a,b) { // y2k notes: two digit years less than 50 are treated as 20XX, greater than 50 are treated as 19XX aa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]); bb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]); if (aa.length == 10) { dt1 = aa.substr(6,4)+aa.substr(3,2)+aa.substr(0,2); } else { yr = aa.substr(6,2); if (parseInt(yr) < 50) { yr = '20'+yr; } else { yr = '19'+yr; } dt1 = yr+aa.substr(3,2)+aa.substr(0,2); } if (bb.length == 10) { dt2 = bb.substr(6,4)+bb.substr(3,2)+bb.substr(0,2); } else { yr = bb.substr(6,2); if (parseInt(yr) < 50) { yr = '20'+yr; } else { yr = '19'+yr; } dt2 = yr+bb.substr(3,2)+bb.substr(0,2); } if (dt1==dt2) return 0; if (dt1<dt2) return -1; return 1; } function ts_sort_jira_date(a,b) { // The date format of a and b come straight from "jira outlookDate/formatIso8601Date". 
// Make sure that in the 'look and feel' setup the "Time Format" is: hh:mm a z (For example 04:21 PM PST) // If you see any other format for date and time, you have to change the sort routine to suit your need aa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]); bb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]); var tDate,tTime,tHour; tAA = aa.split(//s+/); tDateAA = tAA[0]; tTimeAA = tAA[1].split(/:/); tBB = bb.split(//s+/); tDateBB = tBB[0]; tTimeBB = tBB[1].split(/:/); if(aa.match(//D+/)&&aa.match(/[AP]M/)) { if(aa.match(/PM/)&&parseInt(tTimeAA[0])<12) { tTimeAA[0]= parseInt(tTimeAA[0]) + 12; } if(bb.match(/PM/)&&parseInt(tTimeBB[0])<12) { tTimeBB[0]= parseInt(tTimeBB[0]) + 12; } tHourAA = tAA[2]; tHourBB = tBB[2]; } else tHourAA = ""; if(tTimeAA[0].length<2) { hAA = '0'+tTimeAA[0]; } else { hAA = tTimeAA[0];} if(tTimeAA[1].length<2) { mnAA = '0'+tTimeAA[1]; } else { mnAA = tTimeAA[1];} if(tTimeBB[0].length<2) { hBB = '0'+tTimeBB[0]; } else { hBB = tTimeBB[0];} if(tTimeBB[1].length<2) { mnBB = '0'+tTimeBB[1]; } else { mnBB = tTimeBB[1];} if(tHourAA !== "") { dt1=tDateAA+tHourAA+hAA+mnAA; dt2=tDateBB+tHourBB+hBB+mnBB; } else { dt1=tDateAA+hAA+mnAA; dt2=tDateBB+hBB+mnBB; } if(dt1==dt2) { return 0; } else { if (dt1<dt2) { return -1; } else { return 1; } } } function ts_sort_currency(a,b) { aa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]).replace(/[^0-9.]/g,''); bb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]).replace(/[^0-9.]/g,''); return parseFloat(aa) - parseFloat(bb); } function ts_sort_numeric(a,b) { aa = parseFloat(ts_getInnerText(a.cells[SORT_COLUMN_INDEX])); if (isNaN(aa)) aa = 0; bb = parseFloat(ts_getInnerText(b.cells[SORT_COLUMN_INDEX])); if (isNaN(bb)) bb = 0; return aa-bb; } function ts_sort_caseinsensitive(a,b) { aa = ts_getInnerText(a.cells[SORT_COLUMN_INDEX]).toLowerCase(); bb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]).toLowerCase(); if (aa==bb) return 0; if (aa<bb) return -1; return 1; } function ts_sort_default(a,b) { aa = 
ts_getInnerText(a.cells[SORT_COLUMN_INDEX]); bb = ts_getInnerText(b.cells[SORT_COLUMN_INDEX]); if (aa==bb) return 0; if (aa<bb) return -1; return 1; } function addEvent(elm, evType, fn, useCapture) // addEvent and removeEvent // cross-browser event handling for IE5+, NS6 and Mozilla // By Scott Andrew { if (elm.addEventListener){ elm.addEventListener(evType, fn, useCapture); return true; } else if (elm.attachEvent){ var r = elm.attachEvent("on"+evType, fn); return r; } else { alert("Handler could not be removed"); } } </script> <!-- TD.colHeaderLink_a { font-family: Arial, Helvetica, sans-serif; font-size: 12px; background-color: #f0f0f0; font-weight: bold; } .colHeaderLink_a a { text-decoration: none; } .colHeaderOver_a { background-color: #aaaaaa; color: #ffffff; font-family: Arial, Helvetica, sans-serif; font-size: 12px; cursor: pointer; cursor: hand; } .colHeaderOver_a a { text-decoration: none; } .colHeaderHighlight_a { background-color: #aaaaaa; color: #ffffff; font-family: Arial, Helvetica, sans-serif; font-size: 12px; } .colHeaderHighlight_a a { text-decoration: none; } .sorttable_nosort { font-family: Arial, Helvetica, sans-serif; font-size: 12px; background-color: #f0f0f0; font-weight: bold; } -->
Environment: hadoop linux

 

Hadoop Flags: Incompatible change
Tags: PIPES C++
Labels:


 Description   « Hide
Hadoop C++ Extension is an internal project at Baidu. We started it for these reasons:
1 To provide a C++ API. We mostly used Streaming before, and we also tried PIPES, but we did not find PIPES to be more efficient than Streaming. So we think a new C++ extension is needed.
2 Even when using PIPES or Streaming, it is hard to control the memory of the Hadoop map/reduce child JVM.
3 It costs too much to read, write, and sort TB/PB-scale data in Java. When using PIPES or Streaming, a pipe or socket is not efficient enough to carry such huge volumes of data.

What we want to do:
1 We do not use the map/reduce child JVM for any data processing; it just prepares the environment, starts the C++ mapper, tells the mapper which split it should process, and reads reports from the mapper until it finishes. The mapper reads records, invokes the user-defined map, partitions, writes spills, combines, and merges into file.out. We think these operations can be done in C++ code.
2 The reducer is similar to the mapper: it is started after the sort has finished, reads from the sorted files, invokes the user-defined reduce, and writes to the user-defined record writer.
3 We also intend to rewrite shuffle and sort in C++, for efficiency and memory control.
We will do 1 and 2 first, then 3.

What's the difference from PIPES:
1 Yes, we will reuse most of the PIPES code.
2 However, we will do it more completely: nothing changes in scheduling and management, but everything changes in execution.

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值