Alibaba Cloud ODPS Usage Notes

Download the ODPS client odpscmd_public.zip from the Alibaba Cloud website; the client works in both Linux and Windows environments.

Open the client's configuration file odps_client/conf/odps_config.ini and fill it in; the first three entries are the main ones to set:

project_name=XXXXXX
access_id=XXXXXX
access_key=XXXXXX
end_point=http://service.odps.aliyun.com/api
tunnel_endpoint=http://dt.odps.aliyun.com
log_view_host=http://logview.odps.aliyun.com
https_check=true
# confirm threshold for query input size(unit: GB)
data_size_confirm=100.0

After that, run odps_client/bin/odpscmd to enter the command-line interface.


A summary of some of the commands I have used:

Show all tables:

show tables;

List all partitions of a table:

ls partitions tablename;

Query the first 5 rows of a given partition of a table:

select * from tablename where partitionname='xxxxxx' limit 5;


The ODPS dship client supports upload/download operations on tables; it was later merged into the main ODPS client. For the standalone dship client, see the separate write-up on using the odps dship client.

What follows is the integrated form that I used in my projects:

The following shell snippet downloads partition xxxxxx of table tablename into the local file localtablename, using "##" as the field delimiter (if -fd is omitted, "," is the default):

cd /home/username/odpsclient/bin
./odpscmd -e "tunnel download -fd '##' tablename/partitionname='xxxxxx' /home/username/localtablename"

Note: ODPS does not yet support downloading to a standard .csv file. If a field value itself contains ",", specify some character other than "," as the delimiter; otherwise, parsing the output as .csv will treat the "," inside fields as separators and fail.
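The tunnel upload command works in the reverse direction with a symmetric syntax. A hedged sketch (the paths, table, and partition are the same placeholders as above):

cd /home/username/odpsclient/bin
./odpscmd -e "tunnel upload -fd '##' /home/username/localtablename tablename/partitionname='xxxxxx'"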


Java API notes:

ODPS also exposes a Java API; see the Alibaba Cloud website for the full reference. Here I record the Spark-integration API (OdpsOps) that I used in a project:

OdpsOps odpsOps = new OdpsOps(sc, ODPS_ACCESS_ID, ODPS_ACCESS_KEY, ODPS_END_POINT, ODPS_TUNNEL_URL);

(The first argument, sc, is the Spark entry point SparkContext.)

JavaRDD<T> tRDD = odpsOps.readTableWithJava(ODPS_PROJECT_NAME, tName, partitionSpec,
        new Function2<Record, TableSchema, T>() {
            private static final long serialVersionUID = 1L;
            @Override
            public T call(Record record, TableSchema schema) throws Exception {
                return cpFromRecord(record, schema);
            }
        }, numPartitions);

(The second argument is the table name; the third is the partition name and value, e.g. data_date='20160701'; the fourth is the record-mapping function; the fifth, numPartitions, is the number of Spark tasks to launch, i.e. the number of partitions of the resulting RDD.)
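Note that cpFromRecord above is a user-supplied mapper, not part of the SDK. A minimal sketch of what it might look like, assuming a table with a string column mac and a bigint column cnt, and a hypothetical MacCount POJO (both names are assumptions for illustration):

// Hypothetical POJO and mapper; the column names "mac"/"cnt" and the
// MacCount class are assumptions, not part of the SDK.
public class MacCount implements java.io.Serializable {
    private String mac;
    private Long cnt;
    public void setMac(String mac) { this.mac = mac; }
    public void setCnt(Long cnt) { this.cnt = cnt; }
}

public static MacCount cpFromRecord(Record record, TableSchema schema) {
    MacCount mc = new MacCount();
    mc.setMac(record.getString("mac"));  // string column
    mc.setCnt(record.getBigint("cnt"));  // bigint column (returns Long)
    return mc;
}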


Below is a summary of the regular ODPS Java API:

ODPS connection parameters (the project's XML configuration):

<odpses>
    <odps name="default">
        <projectName>XXX</projectName>
        <accessKeyId>XXX</accessKeyId>
        <accessKeySecret>XXX</accessKeySecret>
        <odpsEndpoint>http://service.odps.aliyun.com/api</odpsEndpoint>
        <tunnelEndpoint>http://dt.odps.aliyun.com</tunnelEndpoint>
        <dshipDir>/home/jianfeitech/odps_statistic/odps/dship/dship</dshipDir>
        <binDir>/home/jianfeitech/odps_statistic/odps/odps-cli/bin/odps</binDir>
        <classpath>../rep/res-db-odps-0.0.1-SNAPSHOT.jar:../rep/dmp-odps-0.0.1-SNAPSHOT.jar:../rep/log4j-1.2.17.jar:../rep/slf4j-api-1.6.6.jar:../rep/slf4j-log4j12-1.6.6.jar</classpath>
    </odps>
</odpses>

And the API wrapper class:

package com.jianfeitech.bd.res.db.odps.access;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.lang.StringUtils;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;

import com.aliyun.odps.Column;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.Partition;
import com.aliyun.odps.PartitionSpec;
import com.aliyun.odps.Resources;
import com.aliyun.odps.Table;
import com.aliyun.odps.TableSchema;
import com.aliyun.odps.Tables;
import com.aliyun.odps.account.Account;
import com.aliyun.odps.account.AliyunAccount;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.tunnel.TableTunnel;
import com.aliyun.odps.tunnel.TableTunnel.DownloadSession;
import com.aliyun.odps.tunnel.TableTunnel.UploadSession;
import com.aliyun.odps.tunnel.TunnelException;
import com.jianfeitech.bd.common.conf.access.ResConfAccess;
import com.jianfeitech.bd.common.conf.model.db.Odps;

public abstract class BaseOdpsAccess implements OdpsAccess, Closeable {

    protected Logger logger = LoggerFactory.getLogger(BaseOdpsAccess.class);
    protected Odps odps;
    protected Account account;
    protected com.aliyun.odps.Odps conn;
    protected TableTunnel tunnel;
    protected Tables tables;
    protected UploadSession usession;
    protected DownloadSession dsession;

    public BaseOdpsAccess(Odps odps) {
        this.odps = odps; // keep the config object; getUploadSession() reads the project name from it
        this.account = new AliyunAccount(odps.getAccessKeyId(), odps.getAccessKeySecret());
        this.conn = new com.aliyun.odps.Odps(account);
        this.conn.setEndpoint(odps.getOdpsEndpoint());
        this.conn.setDefaultProject(odps.getProjectName());
        this.tables = this.conn.tables();
        this.tunnel = new TableTunnel(conn);
        this.tunnel.setEndpoint(odps.getTunnelEndpoint());
    }


    public BaseOdpsAccess(ResConfAccess rca) {
        this(rca, "default");
    }

    public BaseOdpsAccess(ResConfAccess rca, String odpsName) {
        this(rca.getDbOdps(odpsName));
    }

    public com.aliyun.odps.Odps getConn() {
        return this.conn;
    }

    public TableTunnel getTunnel() {
        return this.tunnel;
    }


    // partitionPair example: "data_date=20161227"
    public UploadSession getUploadSession(String tableName, String partitionPair) {
        String projectName = this.odps.getProjectName();
        try {
            if (StringUtils.isEmpty(partitionPair)) {
                this.usession = this.tunnel.createUploadSession(projectName, tableName);
            } else {
                PartitionSpec partitionSpec = new PartitionSpec(partitionPair);
                this.usession = this.tunnel.createUploadSession(projectName, tableName, partitionSpec);
            }
            logger.info("upload session state: " + this.usession.getStatus().toString());
        } catch (TunnelException e) {
            logger.error("upload session creation failed.", e);
        } catch (IOException e) {
            logger.error("please check network for upload session.", e);
        }
        return this.usession;
    }


    // partitionPair example: "data_date=20161227"
    public DownloadSession getDownloadSession(String tableName, String partitionPair) {
        String projectName = this.getConn().getDefaultProject();
        try {
            if (StringUtils.isEmpty(partitionPair)) {
                this.dsession = this.tunnel.createDownloadSession(projectName, tableName);
            } else {
                PartitionSpec partitionSpec = new PartitionSpec(partitionPair);
                this.dsession = this.tunnel.createDownloadSession(projectName, tableName, partitionSpec);
            }
            logger.info("download session state: " + this.dsession.getStatus().toString());
        } catch (TunnelException e) {
            logger.error("download session creation failed.", e);
        } catch (IOException e) {
            logger.error("please check network for download session.", e);
        }
        return this.dsession;
    }


    public List<Map<String, String>> findAllTableName() {
        List<Map<String, String>> odpsTables = new ArrayList<>();
        for (Table table : this.tables) {
            Map<String, String> tableMap = new HashMap<>();
            tableMap.put("tableCode", table.getName());
            // prefer the table comment as the display name, falling back to the table name
            if (StringUtils.isNotBlank(table.getComment())) {
                tableMap.put("tableName", table.getComment());
            } else {
                tableMap.put("tableName", table.getName());
            }
            odpsTables.add(tableMap);
        }
        return odpsTables;
    }

    public List<Map<String, String>> findAllColumnName(String tableName) {
        List<Map<String, String>> tableColumns = new ArrayList<>();
        Table table = this.tables.get(tableName);
        TableSchema tableSchema = table.getSchema();
        List<Column> columnList = tableSchema.getColumns();
        for (Column column : columnList) {
            Map<String, String> columnMap = new HashMap<String, String>();
            columnMap.put("columnCode", column.getName());
            if (StringUtils.isNotBlank(column.getComment())) {
                columnMap.put("columnName", column.getComment());
            } else {
                columnMap.put("columnName", column.getName());
            }
            tableColumns.add(columnMap);
        }
        return tableColumns;
    }


    public boolean findPartition(String tableName, String partitionName, String partitionValue) {
        boolean res = true;
        if (null == tableName || null == partitionName) {
            return false;
        }
        Table table = this.tables.get(tableName);
        PartitionSpec ps = new PartitionSpec();
        ps.set(partitionName, partitionValue);
        try {
            res = table.hasPartition(ps);
        } catch (OdpsException e) {
            logger.warn("partition lookup failed.", e);
            res = false;
        }
        return res;
    }


    public Set<String> findAllPartition(String tName, String pName) {
        Set<String> set = new TreeSet<String>();
        Table table = this.tables.get(tName);
        List<Partition> partitions = table.getPartitions();
        for (Partition partition : partitions) {
            PartitionSpec partitionSpec = partition.getPartitionSpec();
            String pValue = partitionSpec.get(pName);
            set.add(pValue);
        }
        return set;
    }

    public boolean upsertPartition(String tableName, String partitionName, String partitionValue) {
        try {
            if (!findPartition(tableName, partitionName, partitionValue)) {
                Table table = this.tables.get(tableName);
                PartitionSpec ps = new PartitionSpec();
                ps.set(partitionName, partitionValue);
                table.createPartition(ps);
            }
            return true;
        } catch (OdpsException e) {
            logger.warn("upsert partition failed", e);
            return false;
        }
    }


    public long getRecordCount(String tableName, String partitionPair) {
        return getDownloadSession(tableName, partitionPair).getRecordCount();
    }

    public void close() throws IOException {
    }

    public Object unserializeFromFile(String resourceName) {
        Resources resources = this.conn.resources();
        Object readObject = null;
        // try-with-resources closes the streams, which the original version leaked
        try (InputStream resourceAsStream = resources.getResourceAsStream(resourceName);
             ObjectInputStream objectInputStream = new ObjectInputStream(resourceAsStream)) {
            readObject = objectInputStream.readObject();
        } catch (Exception e) {
            logger.error("read odps resource " + resourceName + " error.", e);
        }
        return readObject;
    }


    public static Map<String, Object> getRecordData(Record record) {
        Map<String, Object> res = new HashMap<String, Object>();
        if (null == record) {
            return res;
        }
        Column[] columns = record.getColumns();
        Object[] values = record.toArray();
        for (int i = 0; i < columns.length; i++) {
            res.put(columns[i].getName(), values[i]);
        }
        return res;
    }

}
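A short usage sketch of the class above, reading the first rows of a partition through the tunnel. ConcreteOdpsAccess (the caller must pass some concrete subclass of BaseOdpsAccess), and the table/partition names, are assumptions for illustration:

    // assumes: import com.aliyun.odps.data.Record;
    //          import com.aliyun.odps.data.RecordReader;
    //          import com.aliyun.odps.tunnel.TableTunnel.DownloadSession;
    public static void printFirstRows(BaseOdpsAccess access) throws Exception {
        DownloadSession session = access.getDownloadSession("http_standard", "data_date=20160701");
        long total = session.getRecordCount();
        // openRecordReader(start, count) streams records from the tunnel service
        RecordReader reader = session.openRecordReader(0, Math.min(total, 10));
        Record record;
        while ((record = reader.read()) != null) {
            System.out.println(BaseOdpsAccess.getRecordData(record));
        }
        reader.close();
    }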




Addendum:

ODPS SQL is similar to standard SQL, e.g.:

select mac,count(1) as c from http_standard where data_date='20160701' group by mac order by c desc limit 5;
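The same query can also be submitted from Java through SQLTask. A minimal sketch, assuming conn is the com.aliyun.odps.Odps connection built as in the class above:

    // assumes: import com.aliyun.odps.Instance;
    //          import com.aliyun.odps.data.Record;
    //          import com.aliyun.odps.task.SQLTask;
    public static void topMacs(com.aliyun.odps.Odps conn) throws Exception {
        String sql = "select mac, count(1) as c from http_standard"
                + " where data_date='20160701' group by mac order by c desc limit 5;";
        Instance instance = SQLTask.run(conn, sql);  // submit the query
        instance.waitForSuccess();                   // block until the job finishes
        for (Record r : SQLTask.getResult(instance)) {
            System.out.println(r.getString("mac") + "\t" + r.getString("c"));
        }
    }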








