首先说明:这个需求本身比较奇怪(通过 HTTP 拉取数据再写入 Hive),但这里只关注实现方式。
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>1.14.4</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_2.11</artifactId>
<version>1.14.4</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_2.11</artifactId>
<version>1.14.4</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-hive_2.11</artifactId>
<version>1.14.4</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.11</artifactId>
<version>1.14.4</version>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.3.7</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.1.1-cdh6.3.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.0.0-cdh6.3.2</version>
<!-- Hadoop client matching the CDH 6.3.2 cluster; requires the Cloudera repository -->
</dependency>
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import java.util.List;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Pulls card-payment records from an HTTP API, exposes them as a Flink table,
 * previews them, and writes them into a Hive table ({@code test.card_payment})
 * through a {@link HiveCatalog}.
 *
 * <p>Requires Kerberos authentication and local copies of the CDH
 * hive-site.xml / core-site.xml configuration files (paths below).
 */
public class GetPayCardInfoJob {
    public static void main(String[] args) throws Exception {
        // Kerberos login must happen before any Hive metastore access.
        KerberosUtils.do_auth_hive();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Fetch the payment records over HTTP and turn them into a bounded DataStream.
        List<PayMentCard> cardPayMents = new HttpUtil2().doCall("http://xxxxxx/api/GetShuakaListByTime?startTime=2022-08-30&endTime=2022-08-31");
        DataStreamSource<PayMentCard> dataStreamSource = env.fromCollection(cardPayMents);

        // Table environment layered on top of the streaming environment.
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        String name = "s2cluster";
        String defaultDatabase = "odsiadata";
        // Directory holding hive-site.xml, core-site.xml, etc. downloaded from CDH.
        String hiveConfDir = "D:\\install\\code\\github\\dw_kpi2\\pay_card\\src\\main\\resources\\hive";
        // Hadoop conf dir (same folder here). Without it you may hit "unknown host"
        // errors: SHOW DATABASES/TABLES/FUNCTIONS works but SELECT fails.
        String hadoopConfDir = "D:\\install\\code\\github\\dw_kpi2\\pay_card\\src\\main\\resources\\hive";
        String version = "2.1.1";
        HiveCatalog hive = new HiveCatalog(name, defaultDatabase, hiveConfDir, hadoopConfDir, version);
        tableEnv.registerCatalog("s2cluster", hive);
        tableEnv.useCatalog("s2cluster");
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // Field names must be listed explicitly; otherwise the stream maps to a
        // single RAW column with no addressable fields.
        Table table = tableEnv.fromDataStream(dataStreamSource,
                $("staffName"),
                $("routeName"),
                $("city"),
                $("bancheType"),
                $("plateNumber"),
                $("shuakaType"),
                $("shuakaTime"),
                $("tradeType"),
                $("vendorName")
        );

        // Register the table under a stable name and preview it by that name,
        // instead of relying on implicit registration via string-concatenating
        // the Table object into the SQL text.
        tableEnv.createTemporaryView("httpTable", table);
        tableEnv.executeSql("select * from httpTable limit 10").print();

        // FIX: the original discarded this TableResult, so the SELECT was never
        // executed or shown; print() materializes and displays the count.
        tableEnv.executeSql("select count(1) from test.card_payment").print();

        // Insert into the Hive table; await() blocks until the job finishes so
        // the JVM does not exit before the write completes.
        table.executeInsert("test.card_payment").await();
    }
}
这种做法并不算特别好:HTTP 接口返回的数据量一般很小。我这里处理的是班车刷卡数据,一天差不多有 3 万多条。