参考链接:https://clickhouse.com/docs/en/getting-started/example-datasets/ontime
ClickHouse航班测试数据导入测试
1、下载测试数据
# 下载2014年-2022年的航班数据
wget --no-check-certificate --continue https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{1914..2022}_{1..12}.zip
2、创建数据库表
CREATE TABLE `ontime` \
( \
`Year` UInt16, \
`Quarter` UInt8, \
`Month` UInt8, \
`DayofMonth` UInt8, \
`DayOfWeek` UInt8, \
`FlightDate` Date, \
`Reporting_Airline` LowCardinality(String), \
`DOT_ID_Reporting_Airline` Int32, \
`IATA_CODE_Reporting_Airline` LowCardinality(String), \
`Tail_Number` LowCardinality(String), \
`Flight_Number_Reporting_Airline` LowCardinality(String), \
`OriginAirportID` Int32, \
`OriginAirportSeqID` Int32, \
`OriginCityMarketID` Int32, \
`Origin` FixedString(5), \
`OriginCityName` LowCardinality(String), \
`OriginState` FixedString(2), \
`OriginStateFips` FixedString(2), \
`OriginStateName` LowCardinality(String), \
`OriginWac` Int32, \
`DestAirportID` Int32, \
`DestAirportSeqID` Int32, \
`DestCityMarketID` Int32, \
`Dest` FixedString(5), \
`DestCityName` LowCardinality(String), \
`DestState` FixedString(2), \
`DestStateFips` FixedString(2), \
`DestStateName` LowCardinality(String), \
`DestWac` Int32, \
`CRSDepTime` Int32, \
`DepTime` Int32, \
`DepDelay` Int32, \
`DepDelayMinutes` Int32, \
`DepDel15` Int32, \
`DepartureDelayGroups` LowCardinality(String), \
`DepTimeBlk` LowCardinality(String), \
`TaxiOut` Int32, \
`WheelsOff` LowCardinality(String), \
`WheelsOn` LowCardinality(String), \
`TaxiIn` Int32, \
`CRSArrTime` Int32, \
`ArrTime` Int32, \
`ArrDelay` Int32, \
`ArrDelayMinutes` Int32, \
`ArrDel15` Int32, \
`ArrivalDelayGroups` LowCardinality(String), \
`ArrTimeBlk` LowCardinality(String), \
`Cancelled` Int8, \
`CancellationCode` FixedString(1), \
`Diverted` Int8, \
`CRSElapsedTime` Int32, \
`ActualElapsedTime` Int32, \
`AirTime` Int32, \
`Flights` Int32, \
`Distance` Int32, \
`DistanceGroup` Int8, \
`CarrierDelay` Int32, \
`WeatherDelay` Int32, \
`NASDelay` Int32, \
`SecurityDelay` Int32, \
`LateAircraftDelay` Int32, \
`FirstDepTime` Int16, \
`TotalAddGTime` Int16, \
`LongestAddGTime` Int16, \
`DivAirportLandings` Int8, \
`DivReachedDest` Int8, \
`DivActualElapsedTime` Int16, \
`DivArrDelay` Int16, \
`DivDistance` Int16, \
`Div1Airport` LowCardinality(String), \
`Div1AirportID` Int32, \
`Div1AirportSeqID` Int32, \
`Div1WheelsOn` Int16, \
`Div1TotalGTime` Int16, \
`Div1LongestGTime` Int16, \
`Div1WheelsOff` Int16, \
`Div1TailNum` LowCardinality(String), \
`Div2Airport` LowCardinality(String), \
`Div2AirportID` Int32, \
`Div2AirportSeqID` Int32, \
`Div2WheelsOn` Int16, \
`Div2TotalGTime` Int16, \
`Div2LongestGTime` Int16, \
`Div2WheelsOff` Int16, \
`Div2TailNum` LowCardinality(String), \
`Div3Airport` LowCardinality(String), \
`Div3AirportID` Int32, \
`Div3AirportSeqID` Int32, \
`Div3WheelsOn` Int16, \
`Div3TotalGTime` Int16, \
`Div3LongestGTime` Int16, \
`Div3WheelsOff` Int16, \
`Div3TailNum` LowCardinality(String), \
`Div4Airport` LowCardinality(String), \
`Div4AirportID` Int32, \
`Div4AirportSeqID` Int32, \
`Div4WheelsOn` Int16, \
`Div4TotalGTime` Int16, \
`Div4LongestGTime` Int16, \
`Div4WheelsOff` Int16, \
`Div4TailNum` LowCardinality(String), \
`Div5Airport` LowCardinality(String), \
`Div5AirportID` Int32, \
`Div5AirportSeqID` Int32, \
`Div5WheelsOn` Int16, \
`Div5TotalGTime` Int16, \
`Div5LongestGTime` Int16, \
`Div5WheelsOff` Int16, \
`Div5TailNum` LowCardinality(String) \
) ENGINE = MergeTree \
ORDER BY (Year, Quarter, Month, DayofMonth, FlightDate, IATA_CODE_Reporting_Airline);
3、导入测试数据到表中
此处需要注意,增加--input_format_skip_unknown_fields 1参数,跳过多余数据的插入。
input_format_skip_unknown_fields参数说明:
0-禁用(默认),1-启用
ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --password 123456 --input_format_skip_unknown_fields 1 --input_format_csv_empty_as_default 1 --query='INSERT INTO ontime FORMAT CSVWithNames'"
4、查询测试
① 查询2016年到2020年每个机场延误超过10分钟以上的次数top10
SELECT Origin, count(*) AS c \
FROM ontime \
WHERE DepDelay>10 AND Year>=2016 AND Year<=2020 \
GROUP BY Origin \
ORDER BY c DESC \
LIMIT 10;