行文件分组统计

有些情况下,对于一个结构化的、以行为记录的文本文件,需要按列分组统计。如果数据量小,可以直接导入数据库;但当文件很大时,导入数据库不太现实。本程序实现了在不使用数据库的条件下,按任意列分组统计行数的功能:文件只需读取一次,之后即可按任意分组方式查询。
基本思路:
1.根据指定的列名,构建一棵多叉树,树的高度即为可以分组的条件列数
2.树的各层按列名的字典顺序升序排列(第 i 层对应排序后的第 i 个列名)
3.查询时,根据指定的列名,找到对应的树节点,将其中value值累加返回
以下为第一个初级版本,欢迎指点!!

树节点:
[code="java"]
package org.jf.sta;

import java.util.ArrayList;
import java.util.List;

/**
 * One node of the grouping tree.
 *
 * A node carries the column name it belongs to ({@code name}), the field
 * value it stands for ({@code id}), a counter ({@code value}) and the list
 * of its child nodes.
 */
public class SegNode
{
    private String name;
    private String id;
    private int value;
    private List<SegNode> childList;

    /** Creates a node without name or id and with an empty child list. */
    public SegNode()
    {
        childList = new ArrayList<SegNode>();
    }

    /**
     * Creates a node with an empty child list.
     *
     * @param name column name, used as the XML element name
     * @param id   field value represented by this node
     */
    public SegNode(String name,String id)
    {
        this();
        this.name = name;
        this.id = id;
    }

    public String getName()
    {
        return name;
    }

    public String getId()
    {
        return id;
    }

    public int getValue()
    {
        return value;
    }

    public void setValue(int value)
    {
        this.value = value;
    }

    /** Adds {@code increment} to the counter. */
    public void addValue(int increment)
    {
        this.value += increment;
    }

    /** Returns the live (mutable) list of children. */
    public List<SegNode> getChildList()
    {
        return this.childList;
    }

    /** Appends {@code node} to the children of this node. */
    public void addChild(SegNode node)
    {
        this.childList.add(node);
    }

    /**
     * Renders this node as the top-level XML element. An opening and a
     * closing tag are always written, even when there are no children;
     * children are indented by one space per level.
     *
     * @return the XML text, terminated by a newline
     */
    public String toXml()
    {
        StringBuilder out = new StringBuilder();
        appendOpenTag(out, "", false);
        for (SegNode child : childList)
        {
            child.appendXml(out, " ");
        }
        out.append("</").append(name).append(">\n");
        return out.toString();
    }

    /**
     * Renders this node and its subtree as XML, with every line of this
     * node prefixed by {@code blank} and each deeper level indented by one
     * additional space. A node without children is written as a
     * self-closing element.
     *
     * @param blank indentation prefix for this node
     * @return the XML text, terminated by a newline
     */
    public String toXml(String blank)
    {
        StringBuilder out = new StringBuilder();
        appendXml(out, blank);
        return out.toString();
    }

    /** Appends this subtree to {@code out}; shared by both toXml variants. */
    private void appendXml(StringBuilder out, String indent)
    {
        if (childList.isEmpty())
        {
            appendOpenTag(out, indent, true);
            return;
        }
        appendOpenTag(out, indent, false);
        // The child indents itself, so only the deeper prefix is passed down;
        // adding the parent's indent a second time made the nesting drift.
        String childIndent = indent + " ";
        for (SegNode child : childList)
        {
            child.appendXml(out, childIndent);
        }
        out.append(indent).append("</").append(name).append(">\n");
    }

    /** Appends the opening (or self-closing) tag line of this node. */
    private void appendOpenTag(StringBuilder out, String indent, boolean selfClosing)
    {
        out.append(indent).append('<').append(name)
           .append(" id=\"").append(escapeAttribute(id))
           .append("\" value=\"").append(value).append('"')
           .append(selfClosing ? "/>\n" : ">\n");
    }

    /**
     * Escapes the characters that would break a double-quoted XML attribute.
     * A null input is rendered as the text "null", as string concatenation
     * would do.
     */
    private static String escapeAttribute(String raw)
    {
        // '&' must be replaced first so the entities added below stay intact.
        return String.valueOf(raw)
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\"", "&quot;");
    }
}


[/code]

统计树:
[code="java"]

package org.jf.sta;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;


/**
 * Statistics helper for line-oriented text files.
 *
 * A line file holds one record per line, with fields separated by a
 * delimiter (by default one or more whitespace characters).
 *
 * The input is read once into a count tree; afterwards the line count can
 * be grouped by any subset of the configured columns. Tree level i
 * corresponds to the i-th column name in ascending lexicographic order.
 *
 * @author junfeng.chen
 */
public class LineStaTree {

    /** Root of the count tree; its value is the number of accepted lines. */
    private SegNode root;
    /** Column name -> tree level (position of the name in columnNames). */
    private Map<String,Integer> internalColumnMap;
    /** Column names in ascending order; entry i labels tree level i. */
    private String[] columnNames;
    /** Column name -> zero-based field index inside a split line. */
    private Map<String,Integer> columnMap;
    /** Regular expression used to split a line into fields. */
    private String seprator="\\s+";
    /** Minimum number of fields a line needs in order to be counted. */
    private int minFieldCount;

    /**
     * Creates an empty tree for the given columns.
     *
     * @param column_map column name -> zero-based index of the field in a
     *                   split line; copied, so later changes to the map do
     *                   not affect this tree
     * @throws IllegalArgumentException if a column has a null or negative index
     */
    public LineStaTree(Map<String,Integer> column_map)
    {
        this.columnMap = new HashMap<String,Integer>(column_map);
        this.columnNames = columnMap.keySet().toArray(new String[columnMap.size()]);
        Arrays.sort(columnNames);
        internalColumnMap = new HashMap<String,Integer>();
        int required = columnNames.length;
        for (int level = 0; level < columnNames.length; level++)
        {
            String columnName = columnNames[level];
            Integer fieldIndex = columnMap.get(columnName);
            if (fieldIndex == null || fieldIndex.intValue() < 0)
            {
                throw new IllegalArgumentException(
                        "Invalid field index for column " + columnName + ": " + fieldIndex);
            }
            internalColumnMap.put(columnName, Integer.valueOf(level));
            // A line must reach the highest configured field index.
            required = Math.max(required, fieldIndex.intValue() + 1);
        }
        minFieldCount = required;
        root = new SegNode();
    }

    /**
     * Reads every line of the stream and adds it to the tree. The stream is
     * decoded with the platform default charset and is not closed here.
     *
     * @param is source of the records
     * @throws RuntimeException if reading fails; the I/O error is the cause
     */
    public void load(InputStream is)
    {
        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
        try
        {
            String line;
            // Read the next line on every pass; reading only once before the
            // loop never terminated on non-empty input.
            while ((line = reader.readLine()) != null)
            {
                addLine(line);
            }
        }
        catch (IOException e)
        {
            throw new RuntimeException("Failed to read records from input stream", e);
        }
    }

    /**
     * Loads all lines of a file into the tree.
     *
     * @param file an existing regular file
     * @throws IllegalArgumentException if the file is null, missing or a directory
     * @throws RuntimeException if the file cannot be opened or read
     */
    public void load(File file)
    {
        if (file == null || !file.exists() || file.isDirectory())
        {
            throw new IllegalArgumentException("Not an existing regular file: " + file);
        }
        InputStream is = null;
        try
        {
            is = new FileInputStream(file);
            this.load(is);
        }
        catch (IOException e)
        {
            throw new RuntimeException("Failed to open " + file, e);
        }
        finally
        {
            if (is != null)
            {
                try
                {
                    is.close();
                }
                catch (IOException ignored)
                {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /**
     * Sets the regular expression used to split lines into fields.
     *
     * @param sep a pattern accepted by {@link String#split(String)}
     */
    public void setSeprator(String sep)
    {
        seprator = sep;
    }

    /**
     * Adds one record to the tree. Lines with too few fields to cover every
     * configured column are skipped without being counted.
     *
     * @param line one record
     */
    public void addLine(String line)
    {
        String[] fields = line.split(seprator);
        if (fields.length < minFieldCount)
        {
            return;
        }
        root.addValue(1);
        this.add(root, fields, 0);
    }

    /**
     * Walks down from {@code parent}, one tree level per column starting at
     * {@code column_index}, creating missing nodes and incrementing the
     * counter of every node on the path.
     */
    private void add(SegNode parent,String []ss,int column_index)
    {
        SegNode current = parent;
        for (int level = column_index; level < columnNames.length; level++)
        {
            String columnName = columnNames[level];
            String id = ss[columnMap.get(columnName).intValue()];
            SegNode child = findChild(current, id);
            if (child == null)
            {
                child = new SegNode(columnName, id);
                current.addChild(child);
            }
            child.addValue(1);
            current = child;
        }
    }

    /** Returns the child of {@code parent} whose id equals {@code id}, or null. */
    private static SegNode findChild(SegNode parent, String id)
    {
        for (SegNode child : parent.getChildList())
        {
            if (id.equals(child.getId()))
            {
                return child;
            }
        }
        return null;
    }

    /**
     * Collects the nodes found {@code count + 1} levels below {@code parent}
     * ({@code count <= 0} yields the direct children).
     *
     * @param list accumulator; a new list is created when null
     * @return the accumulator
     */
    private List<SegNode> getChildList(SegNode parent,int count,List<SegNode> list)
    {
        if (list == null)
        {
            list = new ArrayList<SegNode>();
        }
        List<SegNode> children = parent.getChildList();
        if (count <= 0)
        {
            list.addAll(children);
        }
        else
        {
            for (SegNode child : children)
            {
                getChildList(child, count - 1, list);
            }
        }
        return list;
    }

    /**
     * Sums the counters for tree levels {@code begin..end}.
     *
     * When both levels are equal the key is the node id. Otherwise the key
     * is the path of ids from level {@code begin} to level {@code end}
     * joined with "_", where levels marked "*" in {@code columns}
     * contribute "*" instead of their id. Counters of equal keys are added.
     *
     * @param columns one entry per level in {@code begin..end}: the column
     *                name, or "*" for a level that is not grouped on
     */
    private Map<String,Integer> getCount(int begin,int end,String []columns)
    {
        Map<String,Integer> result = new HashMap<String,Integer>();
        for (SegNode beginNode : this.getChildList(root, begin, null))
        {
            if (begin == end)
            {
                accumulate(result, beginNode.getId(), beginNode.getValue());
            }
            else
            {
                traverse(beginNode, end - begin, beginNode.getId(), result, columns);
            }
        }
        return result;
    }

    /**
     * Counts lines grouped by the given columns.
     *
     * For a single column the result keys are its distinct values. For
     * several columns the keys are the values of every tree level between
     * the first and last requested column (in sorted column-name order),
     * joined with "_", with "*" standing in for levels that were not
     * requested. A null or empty argument yields the single entry
     * "*" -> total number of counted lines. The argument is not modified.
     *
     * @param columns names of the columns to group by
     * @return group key -> number of lines
     * @throws IllegalArgumentException if a name is not a configured column
     */
    public Map<String,Integer> groupBy(String[] columns)
    {
        if (columns == null || columns.length == 0)
        {
            Map<String,Integer> total = new HashMap<String,Integer>();
            total.put("*", Integer.valueOf(root.getValue()));
            return total;
        }

        String[] queryColumns = new String[this.columnNames.length];
        Arrays.fill(queryColumns, "*");

        // Levels follow sorted column-name order, so the smallest and largest
        // level equal the first and last name of the sorted request.
        int startIndex = Integer.MAX_VALUE;
        int endIndex = -1;
        for (String column : columns)
        {
            Integer level = this.internalColumnMap.get(column);
            if (level == null)
            {
                throw new IllegalArgumentException("Unknown column: " + column);
            }
            queryColumns[level.intValue()] = column;
            startIndex = Math.min(startIndex, level.intValue());
            endIndex = Math.max(endIndex, level.intValue());
        }

        String[] levelColumns = new String[endIndex - startIndex + 1];
        System.arraycopy(queryColumns, startIndex, levelColumns, 0, levelColumns.length);
        return this.getCount(startIndex, endIndex, levelColumns);
    }

    /**
     * Descends {@code steps} levels below {@code beginNode}, extending the
     * path with "_" plus the child id (or "*" for a wildcard level), and
     * adds the counter of every node reached at the last level to
     * {@code map} under its path.
     *
     * @param beginNode  node whose children are visited
     * @param steps      number of levels still to descend
     * @param parentpath key prefix accumulated so far; null is treated as ""
     * @param map        accumulator; a new map is created when null
     * @param columns    per-level column name or "*", first entry being the
     *                   level of the original start node
     * @return the accumulator
     */
    private Map<String,Integer> traverse(
            SegNode beginNode,
            int steps,
            String parentpath,
            Map<String,Integer> map,
            String [] columns)
    {
        if (parentpath == null)
        {
            parentpath = "";
        }
        if (map == null)
        {
            map = new HashMap<String,Integer>();
        }
        // The children of beginNode sit at this position of the level list.
        boolean wildcard = "*".equals(columns[columns.length - steps]);
        for (SegNode child : beginNode.getChildList())
        {
            String path = parentpath + "_" + (wildcard ? "*" : child.getId());
            if (steps <= 1)
            {
                accumulate(map, path, child.getValue());
            }
            else
            {
                traverse(child, steps - 1, path, map, columns);
            }
        }
        return map;
    }

    /** Adds {@code amount} to the count stored under {@code key}. */
    private static void accumulate(Map<String,Integer> counts, String key, int amount)
    {
        Integer current = counts.get(key);
        int updated = (current == null) ? amount : current.intValue() + amount;
        counts.put(key, Integer.valueOf(updated));
    }

    /**
     * Renders the whole tree as XML under a {@code records} element whose
     * value attribute is the number of counted lines.
     */
    public String toXml()
    {
        StringBuilder out = new StringBuilder();
        out.append("<records id=\"*\" value=\"").append(root.getValue()).append("\">\n");
        for (SegNode child : root.getChildList())
        {
            out.append(child.toXml(" "));
        }
        out.append("</records>\n");
        return out.toString();
    }

    /** Small demonstration on six in-memory records. */
    public static void main(String args[])
    {
        String ss[] = new String[]{
            "abc 123 234",
            "bcd 123 234",
            "abc 123 345",
            "abc 123 456",
            "bcd 123 345",
            "bcdd 1d23 3s45",
        };
        Map<String,Integer> map = new HashMap<String,Integer>();
        map.put("tag1", 0);
        map.put("tag2", 1);
        map.put("tag3", 2);
        LineStaTree tree = new LineStaTree(map);
        for (String s : ss)
        {
            tree.addLine(s);
        }
        Map<String,Integer> map1 = tree.groupBy(new String[]{"tag1","tag2"});
        System.out.println(map1.size());
        Set<String> keys = map1.keySet();
        for (String key : keys)
        {
            System.out.println(key+": "+map1.get(key));
        }
        System.out.println(tree.toXml());
        System.out.println(tree.getChildList(tree.root, 2, null).size());
    }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值