基于 Java 使用 Flink 读取 CSV 文件:面向批处理场景,分别通过 Table API(SQL)和 DataSet Join 算子两种方式实现多表联合的数据处理,最后将结果再写回 CSV 文件。

Maven依赖

源头

<dependencies>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.8</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner_2.11</artifactId>
        <version>1.8.0</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_2.11</artifactId>
        <version>1.8.0</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-scala_2.11</artifactId>
        <version>1.8.0</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-common</artifactId>
        <version>1.8.0</version>
    </dependency>
</dependencies>

改版(精简后的依赖;注意其中 flink-table 为 1.7.2 而 flink-streaming-scala 为 1.8.0,版本不一致,建议统一到同一版本)

    <dependencies>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.8</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table_2.11</artifactId>
            <version>1.7.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>1.8.0</version>
        </dependency>
    </dependencies>

SQL语句

-- Row-count sanity checks for the five source tables; the comment under each
-- SELECT lists that table's column layout.
SELECT COUNT(*) FROM T13_REF_AIRPORT_SAT;-- 11008 rows
-- columns: HUB_ID IATA_CD NAME_CN NAME_EN
SELECT COUNT(*) FROM T13_REF_AIRPORT_CITY_LINK;-- 9676 rows
-- columns: LINK_ID AIRPORT_HUB_ID CITY_HUB_ID (LINK_ID not used by the joins)
SELECT COUNT(*) FROM T13_REF_CITY_SAT;-- 9624 rows
-- columns: HUB_ID CITY_CD NAME_CN NAME_EN
SELECT COUNT(*) FROM T13_REF_CITY_COUNTRY_LINK;-- 9062 rows
-- columns: LINK_ID COUNTRY_HUB_ID CITY_HUB_ID (LINK_ID not used by the joins)
SELECT COUNT(*) FROM T13_REF_COUNTRY_SAT;-- 356 rows
-- columns: HUB_ID COUNTRY_CD NAME_CN NAME_EN

-- Five-way join: every airport together with its city and country.
SELECT * 
    FROM T13_REF_AIRPORT_SAT X1,T13_REF_AIRPORT_CITY_LINK X2,T13_REF_CITY_SAT X3,T13_REF_CITY_COUNTRY_LINK X4,T13_REF_COUNTRY_SAT X5
    WHERE X1.HUB_ID=X2.AIRPORT_HUB_ID
        AND X2.CITY_HUB_ID=X3.HUB_ID
        AND X3.HUB_ID=X4.CITY_HUB_ID
        AND X4.COUNTRY_HUB_ID=X5.HUB_ID;

-- Row count of the five-way join result.
SELECT COUNT(*) 
    FROM T13_REF_AIRPORT_SAT X1,T13_REF_AIRPORT_CITY_LINK X2,T13_REF_CITY_SAT X3,T13_REF_CITY_COUNTRY_LINK X4,T13_REF_COUNTRY_SAT X5
    WHERE X1.HUB_ID=X2.AIRPORT_HUB_ID
        AND X2.CITY_HUB_ID=X3.HUB_ID
        AND X3.HUB_ID=X4.CITY_HUB_ID
        AND X4.COUNTRY_HUB_ID=X5.HUB_ID;-- 16759 rows

-- Number of airports per country, countries with most airports first.
SELECT X5.NAME_CN COUNTRY_CN_NAME,COUNT(X1.HUB_ID) COUNT_AIRPORT
    FROM T13_REF_AIRPORT_SAT X1,T13_REF_AIRPORT_CITY_LINK X2,T13_REF_CITY_SAT X3,T13_REF_CITY_COUNTRY_LINK X4,T13_REF_COUNTRY_SAT X5
    WHERE X1.HUB_ID=X2.AIRPORT_HUB_ID
        AND X2.CITY_HUB_ID=X3.HUB_ID
        AND X3.HUB_ID=X4.CITY_HUB_ID
        AND X4.COUNTRY_HUB_ID=X5.HUB_ID
    GROUP BY X5.NAME_CN
    ORDER BY COUNT_AIRPORT DESC;-- 254 rows

-- Number of airports per (country, city) pair, largest counts first.
SELECT 
    X5.COUNTRY_CD,
    X5.NAME_CN COUNTRY_NAME_CN,
    X5.NAME_EN COUNTRY_NAME_EN,
    X3.CITY_CD,
    X3.NAME_CN CITY_CN_NAME,
    X3.NAME_EN CITY_EN_NAME,
COUNT(X1.HUB_ID) COUNT_AIRPORT
    FROM T13_REF_AIRPORT_SAT X1,T13_REF_AIRPORT_CITY_LINK X2,T13_REF_CITY_SAT X3,T13_REF_CITY_COUNTRY_LINK X4,T13_REF_COUNTRY_SAT X5
    WHERE X1.HUB_ID=X2.AIRPORT_HUB_ID
        AND X2.CITY_HUB_ID=X3.HUB_ID
        AND X3.HUB_ID=X4.CITY_HUB_ID
        AND X4.COUNTRY_HUB_ID=X5.HUB_ID
    GROUP BY X5.COUNTRY_CD,X5.NAME_CN,X5.NAME_EN,X3.CITY_CD,X3.NAME_CN,X3.NAME_EN
    ORDER BY COUNT_AIRPORT DESC;-- 13030 rows

-- Same grouping, restricted to cities whose English name is missing.
SELECT 
        X5.COUNTRY_CD,
        X5.NAME_CN COUNTRY_NAME_CN,
        X5.NAME_EN COUNTRY_NAME_EN,
        X3.CITY_CD,
        X3.NAME_CN CITY_CN_NAME,
        X3.NAME_EN CITY_EN_NAME,
        COUNT(X1.HUB_ID) COUNT_AIRPORT
    FROM T13_REF_AIRPORT_SAT X1,T13_REF_AIRPORT_CITY_LINK X2,T13_REF_CITY_SAT X3,T13_REF_CITY_COUNTRY_LINK X4,T13_REF_COUNTRY_SAT X5
    WHERE X1.HUB_ID=X2.AIRPORT_HUB_ID
        AND X2.CITY_HUB_ID=X3.HUB_ID
        AND X3.HUB_ID=X4.CITY_HUB_ID
        AND X4.COUNTRY_HUB_ID=X5.HUB_ID
        AND X3.NAME_EN IS NULL
    GROUP BY X5.COUNTRY_CD,X5.NAME_CN,X5.NAME_EN,X3.CITY_CD,X3.NAME_CN,X3.NAME_EN
    ORDER BY COUNT_AIRPORT DESC;

-- NULL statistics observed in the joined result:
-- COUNTRY_NAME_EN is NULL in 19 rows
-- CITY_CN_NAME  is NULL in 1 row
-- CITY_EN_NAME  is NULL in 1501 rows

Airport_Sat

import lombok.Data;

/**
 * Row POJO for the T13_REF_AIRPORT_SAT CSV file (airport satellite table).
 * Lombok {@code @Data} generates the getters/setters, equals/hashCode and
 * toString. The snake_case field name must match the name passed to
 * {@code CsvReader.pojoType(AirportSat.class, "hub_id")}.
 */
@Data
public class AirportSat
{
    // HUB_ID column — the airport's surrogate key used in the joins.
    private String hub_id;
}
import lombok.Data;

/**
 * Row POJO for the T13_REF_AIRPORT_CITY_LINK CSV file, linking an airport
 * HUB_ID to a city HUB_ID (the file's leading LINK_ID column is skipped when
 * reading). Lombok {@code @Data} generates accessors; snake_case field names
 * must match the names passed to {@code CsvReader.pojoType(...)}.
 */
@Data
public class AirportCityLink
{
    // AIRPORT_HUB_ID column — joined against AirportSat.hub_id.
    private String airport_hub_id;
    // CITY_HUB_ID column — joined against CitySat.hub_id.
    private String city_hub_id;
}

City_Sat

import lombok.Data;

/**
 * Row POJO for the T13_REF_CITY_SAT CSV file (city satellite table).
 * Lombok {@code @Data} generates accessors; snake_case field names must match
 * the names passed to {@code CsvReader.pojoType(...)}.
 */
@Data
public class CitySat
{
    // HUB_ID column — the city's surrogate key used in the joins.
    private String hub_id;
    // CITY_CD column — city code.
    private String city_cd;
    // NAME_CN column — Chinese name.
    private String name_cn;
    // NAME_EN column — English name (may be NULL in the source data).
    private String name_en;
}
import lombok.Data;

/**
 * Row POJO for the T13_REF_CITY_COUNTRY_LINK CSV file, linking a city HUB_ID
 * to a country HUB_ID (the file's leading LINK_ID column is skipped when
 * reading). Lombok {@code @Data} generates accessors; snake_case field names
 * must match the names passed to {@code CsvReader.pojoType(...)}.
 */
@Data
public class CityCountryLink
{
    // COUNTRY_HUB_ID column — joined against CountrySat.hub_id.
    private String country_hub_id;
    // CITY_HUB_ID column — joined against CitySat.hub_id.
    private String city_hub_id;
}

Country_Sat

import lombok.Data;

/**
 * Row POJO for the T13_REF_COUNTRY_SAT CSV file (country satellite table).
 * Only four of the file's six columns are read. Lombok {@code @Data}
 * generates accessors; snake_case field names must match the names passed to
 * {@code CsvReader.pojoType(...)}.
 */
@Data
public class CountrySat
{
    // HUB_ID column — the country's surrogate key used in the joins.
    private String hub_id;
    // COUNTRY_CD column — country code.
    private String country_cd;
    // NAME_CN column — Chinese name.
    private String name_cn;
    // NAME_EN column — English name (may be NULL in the source data).
    private String name_en;
}


点击查看Flink_Csv代码

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.operators.MapOperator;
import org.apache.flink.api.java.operators.SortPartitionOperator;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple7;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.BatchTableEnvironment;

import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;

/**
 * Batch demo that joins five CSV reference tables (airports, cities,
 * countries and their link tables) in two equivalent ways:
 * <ul>
 *   <li>{@link #t4()} — DataSet API: four chained joins plus groupBy/sum;</li>
 *   <li>{@link #t5()} — Table API: the tables are registered and queried with SQL.</li>
 * </ul>
 * Each variant prints the aggregated results and writes them to timestamped
 * CSV files under {@code D:\Flink_CSV\}.
 */
public class FlinkCsv
{
    // Thread-safe, immutable formatter (unlike SimpleDateFormat); used to make
    // output file names unique per run.
    private static final DateTimeFormatter TS_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH-mm-ss SSS");

    public static void main(String[] args) throws Exception
    {
        // Run the DataSet/join variant first, then the Table/SQL variant,
        // printing the wall-clock time of each (original "u"/"d" suffixes kept).
        long s4 = System.currentTimeMillis();
        t4();
        System.out.println((System.currentTimeMillis() - s4) + "u");
        long s5 = System.currentTimeMillis();
        t5();
        System.out.println((System.currentTimeMillis() - s5) + "d");
    }

    /** @return the current timestamp rendered with {@link #TS_FORMAT}. */
    private static String timestamp()
    {
        return LocalDateTime.now().format(TS_FORMAT);
    }

    // -----------------------------------------------------------------------
    // Shared CSV sources — t4() and t5() read exactly the same five files, so
    // the reader pipelines live in one place instead of being duplicated.
    // -----------------------------------------------------------------------

    /** Airport satellite table; only the leading HUB_ID column is read. */
    private static DataSet<AirportSat> readAirportSat(ExecutionEnvironment env)
    {
        return env.readCsvFile("D:\\T13_REF_AIRPORT_SAT.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(true/*, true, false, true, true*/)
                .pojoType(AirportSat.class, "hub_id"/*, "iata_cd", "name_cn", "name_en"*/);
    }

    /** Airport-to-city link table; the leading LINK_ID column is skipped. */
    private static DataSet<AirportCityLink> readAirportCityLink(ExecutionEnvironment env)
    {
        return env.readCsvFile("D:\\T13_REF_AIRPORT_CITY_LINK.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
                .pojoType(AirportCityLink.class, "airport_hub_id", "city_hub_id");
    }

    /** City satellite table; all four columns are read. */
    private static DataSet<CitySat> readCitySat(ExecutionEnvironment env)
    {
        return env.readCsvFile("D:\\T13_REF_CITY_SAT.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, true, true)
                .pojoType(CitySat.class, "hub_id", "city_cd", "name_cn", "name_en");
    }

    /** City-to-country link table; the leading LINK_ID column is skipped. */
    private static DataSet<CityCountryLink> readCityCountryLink(ExecutionEnvironment env)
    {
        return env.readCsvFile("D:\\T13_REF_CITY_COUNTRY_LINK.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(false, true, true)
                .pojoType(CityCountryLink.class, "country_hub_id", "city_hub_id");
    }

    /** Country satellite table; the third and fourth file columns are skipped. */
    private static DataSet<CountrySat> readCountrySat(ExecutionEnvironment env)
    {
        return env.readCsvFile("D:\\T13_REF_COUNTRY_SAT.csv")
                .fieldDelimiter(",").ignoreFirstLine().includeFields(true, true, false, false, true, true)
                .pojoType(CountrySat.class, "hub_id", "country_cd", "name_cn", "name_en");
    }

    /**
     * Table/SQL variant: registers the five CSV sources as tables, then runs
     * three queries — total join-row count, airports per country, airports
     * per (country, city) — printing each result and writing it to its own
     * timestamped CSV file.
     *
     * <p>Note: in the batch API, {@code print()} and {@code count()} each
     * trigger an immediate execution; the final {@code env.execute(...)} runs
     * the three file sinks.
     *
     * @throws Exception if reading, query execution or writing fails
     */
    private static void t5() throws Exception
    {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        BatchTableEnvironment table_env = BatchTableEnvironment.getTableEnvironment(env);

        table_env.registerTable("t13_ref_airport_sat", table_env.fromDataSet(readAirportSat(env)));
        table_env.registerTable("t13_ref_airport_city_link", table_env.fromDataSet(readAirportCityLink(env)));
        table_env.registerTable("t13_ref_city_sat", table_env.fromDataSet(readCitySat(env)));
        table_env.registerTable("t13_ref_city_country_link", table_env.fromDataSet(readCityCountryLink(env)));
        table_env.registerTable("t13_ref_country_sat", table_env.fromDataSet(readCountrySat(env)));

        // Total number of rows produced by the five-way join.
        String sql = "select count(*) \n" +
                "\tfrom t13_ref_airport_sat x1,t13_ref_airport_city_link x2,\n" +
                "\tt13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5\n" +
                "\twhere x1.hub_id=x2.airport_hub_id\n" +
                "\t\tand x2.city_hub_id=x3.hub_id\n" +
                "\t\tand x3.hub_id=x4.city_hub_id\n" +
                "\t\tand x4.country_hub_id=x5.hub_id";

        // Number of airports per country, largest first.
        String sql_country = "select x5.name_cn country_cn_name,count(x1.hub_id) count_airport\n" +
                "\tfrom t13_ref_airport_sat x1,t13_ref_airport_city_link x2,\n" +
                "\tt13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5\n" +
                "\twhere x1.hub_id=x2.airport_hub_id\n" +
                "\t\tand x2.city_hub_id=x3.hub_id\n" +
                "\t\tand x3.hub_id=x4.city_hub_id\n" +
                "\t\tand x4.country_hub_id=x5.hub_id\n" +
                "\tgroup by x5.name_cn\n" +
                "\torder by count_airport desc";

        // Number of airports per (country, city) pair, largest first.
        String sql_all = "select \n" +
                "\tx5.country_cd,\n" +
                "\tx5.name_cn country_name_cn,\n" +
                "\tx5.name_en country_name_en,\n" +
                "\tx3.city_cd,\n" +
                "\tx3.name_cn city_cn_name,\n" +
                "\tx3.name_en city_en_name,\n" +
                "count(x1.hub_id) count_airport\n" +
                "\tfrom t13_ref_airport_sat x1,t13_ref_airport_city_link x2,t13_ref_city_sat x3,t13_ref_city_country_link x4,t13_ref_country_sat x5\n" +
                "\twhere x1.hub_id=x2.airport_hub_id\n" +
                "\t\tand x2.city_hub_id=x3.hub_id\n" +
                "\t\tand x3.hub_id=x4.city_hub_id\n" +
                "\t\tand x4.country_hub_id=x5.hub_id\n" +
                "\tgroup by x5.country_cd,x5.name_cn,x5.name_en,x3.city_cd,x3.name_cn,x3.name_en\n" +
                "\torder by count_airport desc";

        DataSet<Tuple1<Long>> map = table_env.toDataSet(table_env.sqlQuery(sql),
                TypeInformation.of(new TypeHint<Tuple1<Long>>()
                {
                }));
        map.print();

        DataSet<Tuple2<String, Long>> map_country = table_env.toDataSet(table_env.sqlQuery(sql_country),
                TypeInformation.of(new TypeHint<Tuple2<String, Long>>()
                {
                }));
        System.out.println(map_country.count());
        map_country.print();

        DataSet<Tuple7<String, String, String, String, String, String, Long>> map_all = table_env.toDataSet(table_env.sqlQuery(sql_all),
                TypeInformation.of(new TypeHint<Tuple7<String, String, String, String, String, String, Long>>()
                {
                }));
        System.out.println(map_all.count());
        map_all.print();

        // Each sink gets its own timestamped file; parallelism 1 keeps the
        // output in a single file instead of a directory of part files.
        map.writeAsCsv("D:\\Flink_CSV\\" + timestamp() + "______map.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("T打印完成______map...");
        map_country.writeAsCsv("D:\\Flink_CSV\\" + timestamp() + "______map_country.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("T打印完成______map_country...");
        map_all.writeAsCsv("D:\\Flink_CSV\\" + timestamp() + "______map_all.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("T打印完成______map_all...");

        env.execute("FlinkCsv t5 (Table/SQL)");
    }

    /**
     * DataSet/join variant: chains four joins (airport → airport/city link →
     * city → city/country link → country), flattens each joined row to a
     * Tuple7 carrying a count of 1, then aggregates per country and per
     * (country, city), printing counts and writing the three result sets to
     * timestamped CSV files.
     *
     * @throws Exception if reading, execution or writing fails
     */
    private static void t4() throws Exception
    {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataSet<AirportSat> data_airportsat = readAirportSat(env);
        DataSet<AirportCityLink> data_airportcitylink = readAirportCityLink(env);
        DataSet<CitySat> data_citysat = readCitySat(env);
        DataSet<CityCountryLink> data_citycountrylink = readCityCountryLink(env);
        DataSet<CountrySat> data_countrysat = readCountrySat(env);

        // Join result tuples nest left-to-right, so the KeySelectors below dig
        // into t.f1 (the most recently joined element) for the next join key.
        // The final map flattens the nesting into
        // (country_cd, country_cn, country_en, city_cd, city_cn, city_en, 1L).
        DataSet<Tuple7<String, String, String, String, String, String, Long>> map = data_airportsat
                .join(data_airportcitylink).where("hub_id").equalTo("airport_hub_id")
                .join(data_citysat).where(new KeySelector<Tuple2<AirportSat, AirportCityLink>, String>()
                {
                    @Override
                    public String getKey(Tuple2<AirportSat, AirportCityLink> t) throws Exception
                    {
                        return t.f1.getCity_hub_id();
                    }
                }).equalTo("hub_id")
                .join(data_citycountrylink).where(new KeySelector<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, String>()
                {
                    @Override
                    public String getKey(Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat> t) throws Exception
                    {
                        return t.f1.getHub_id();
                    }
                }).equalTo("city_hub_id")
                .join(data_countrysat).where(new KeySelector<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, String>()
                {
                    @Override
                    public String getKey(Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink> t) throws Exception
                    {
                        return t.f1.getCountry_hub_id();
                    }
                }).equalTo("hub_id")
                .map(new MapFunction<Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat>,
                        Tuple7<String, String, String, String, String, String, Long>>()
                {
                    @Override
                    public Tuple7<String, String, String, String, String, String, Long> map(
                            Tuple2<Tuple2<Tuple2<Tuple2<AirportSat, AirportCityLink>, CitySat>, CityCountryLink>, CountrySat> t) throws Exception
                    {
                        String country_cd = t.f1.getCountry_cd();
                        String country_cn_name = t.f1.getName_cn();
                        String country_en_name = t.f1.getName_en();
                        String city_cd = t.f0.f0.f1.getCity_cd();
                        String city_cn_name = t.f0.f0.f1.getName_cn();
                        String city_en_name = t.f0.f0.f1.getName_en();
                        long airport = 1L;  // each joined row represents exactly one airport
                        return new Tuple7<>(country_cd, country_cn_name, country_en_name, city_cd, city_cn_name, city_en_name, airport);
                    }
                });
        //--------------------------------------------------------------------------------------------------------------
        System.out.println("总数量: " + map.count());

        // Airports per country: key on the Chinese country name (field 1), sum
        // the per-row count (field 6 of the Tuple7 → field 1 of the Tuple2).
        DataSet<Tuple2<String, Long>> map_country = map
                .map(new MapFunction<Tuple7<String, String, String, String, String, String, Long>, Tuple2<String, Long>>()
                {
                    @Override
                    public Tuple2<String, Long> map(Tuple7<String, String, String, String, String, String, Long> t) throws Exception
                    {
                        return new Tuple2<>(t.f1, t.f6);
                    }
                }).groupBy(0).sum(1).sortPartition(1, Order.DESCENDING);
        System.out.println("国家分总数量: " + map_country.count());
        //map_country.print();

        // Airports per (country, city): group on all six name/code fields.
        DataSet<Tuple7<String, String, String, String, String, String, Long>> map_all = map
                .groupBy(0, 1, 2, 3, 4, 5).sum(6).sortPartition(6, Order.DESCENDING);
        System.out.println("全分总数量: " + map_all.count());
        //map_all.print();

        map.writeAsCsv("D:\\Flink_CSV\\" + timestamp() + "______map.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("打印完成______map...");
        map_country.writeAsCsv("D:\\Flink_CSV\\" + timestamp() + "______map_country.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("打印完成______map_country...");
        map_all.writeAsCsv("D:\\Flink_CSV\\" + timestamp() + "______map_all.csv",
                "\n", ",", FileSystem.WriteMode.OVERWRITE).setParallelism(1);
        System.out.println("打印完成______map_all...");

        env.execute("FlinkCsv t4 (DataSet join)");
    }
}
Flink是一个开源的流处理框架,用于处理大规模的数据流。从Flink 1.12版本开始,Table API和SQL成为Flink的核心API之一,提供了一种声明式编程接口。在Flink 1.17版本中,可以使用Table API或SQL来读取CSV文件。

以下是使用Flink Table API读取CSV文件的基本步骤:

1. 首先,需要将CSV文件作为数据源加入到Flink环境中。使用`readTextFile`方法可以读取文本文件,但这不是处理CSV的最佳方式,因为CSV文件可能包含复杂的分隔符和引号等。
2. 更好的选择是使用CSV格式描述符(`Csv`)。这要求首先定义一个CSV格式的schema,以便Flink能够正确解析CSV文件中的数据。这个schema定义了CSV中各列的数据类型。
3. 使用`FileSystem`连接器的`path`方法来指定CSV文件的路径,然后注册为临时表并查询数据。

以下是一个简单的代码示例:

```java
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.Types;
import org.apache.flink.table.api.bridge.java.BatchTableEnvironment;
import org.apache.flink.table.descriptors.Csv;
import org.apache.flink.table.descriptors.FileSystem;
import org.apache.flink.table.descriptors.Schema;
import org.apache.flink.types.Row;

public class FlinkReadCSVExample {
    public static void main(String[] args) throws Exception {
        // 创建一个执行环境
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = BatchTableEnvironment.create(env);

        // 定义输入的schema,指定各个字段的名称和类型
        Schema schema = new Schema()
                .field("id", Types.INT())
                .field("name", Types.STRING())
                .field("age", Types.INT());

        // 注册表,并指定数据源路径、格式和schema
        tableEnv.connect(new FileSystem().path("path/to/csvfile.csv"))
                .withFormat(new Csv())
                .withSchema(schema)
                .createTemporaryTable("csvInput");

        // 执行SQL查询
        Table result = tableEnv.sqlQuery("SELECT id, name, age FROM csvInput");

        // 将Table转换为DataSet
        DataSet<Row> resultDataSet = tableEnv.toDataSet(result, Row.class);

        // 执行Flink程序
        resultDataSet.print();
    }
}
```

注意:上述代码是Flink 1.17版本之前的写法,新版本可能有所不同,具体请参考Flink官方文档和版本更新说明。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值