本文介绍通过ClickHouse官方客户端导入数据及查询的方法。

说明 本文内容仅限于客户端导入数据,更多导入方式,请参见从MySQL导入数据

云数据库ClickHouse支持如下多种导入方式。

  • 通过ClickHouse官方客户端导入数据。
  • 通过阿里云数据库RDS MySQL导入数据。
  • 通过阿里云消息队列Kafka导入数据。

通过官方客户端导入数据

本部分将结合开源数据集,介绍通过官方客户端clickhouse-client导入数据并查询。

  1. 连接集群。请参考连接集群
  2. 下载数据集。单击下载链接下载开源ontime数据集。
  3. 在ClickHouse中建表:
    $ ./clickhouse-client --multiline --host=<host> --port=<port> --user=<user> --password=<password> 
    
    --建立本地表
    CREATE TABLE ontime_local ON CLUSTER default
    (
        `Year`                            UInt16,
        `Quarter`                         UInt8,
        `Month`                           UInt8,
        `DayofMonth`                      UInt8,
        `DayOfWeek`                       UInt8,
        `FlightDate`                      Date,
        `Reporting_Airline`               String,
        `DOT_ID_Reporting_Airline`        Int32,
        `IATA_CODE_Reporting_Airline`     String,
        `Tail_Number`                     String,
        `Flight_Number_Reporting_Airline` String,
        `OriginAirportID`                 Int32,
        `OriginAirportSeqID`              Int32,
        `OriginCityMarketID`              Int32,
        `Origin`                          FixedString(5),
        `OriginCityName`                  String,
        `OriginState`                     FixedString(2),
        `OriginStateFips`                 String,
        `OriginStateName`                 String,
        `OriginWac`                       Int32,
        `DestAirportID`                   Int32,
        `DestAirportSeqID`                Int32,
        `DestCityMarketID`                Int32,
        `Dest`                            FixedString(5),
        `DestCityName`                    String,
        `DestState`                       FixedString(2),
        `DestStateFips`                   String,
        `DestStateName`                   String,
        `DestWac`                         Int32,
        `CRSDepTime`                      Int32,
        `DepTime`                         Int32,
        `DepDelay`                        Int32,
        `DepDelayMinutes`                 Int32,
        `DepDel15`                        Int32,
        `DepartureDelayGroups`            String,
        `DepTimeBlk`                      String,
        `TaxiOut`                         Int32,
        `WheelsOff`                       Int32,
        `WheelsOn`                        Int32,
        `TaxiIn`                          Int32,
        `CRSArrTime`                      Int32,
        `ArrTime`                         Int32,
        `ArrDelay`                        Int32,
        `ArrDelayMinutes`                 Int32,
        `ArrDel15`                        Int32,
        `ArrivalDelayGroups`              Int32,
        `ArrTimeBlk`                      String,
        `Cancelled`                       UInt8,
        `CancellationCode`                FixedString(1),
        `Diverted`                        UInt8,
        `CRSElapsedTime`                  Int32,
        `ActualElapsedTime`               Int32,
        `AirTime`                         Nullable(Int32),
        `Flights`                         Int32,
        `Distance`                        Int32,
        `DistanceGroup`                   UInt8,
        `CarrierDelay`                    Int32,
        `WeatherDelay`                    Int32,
        `NASDelay`                        Int32,
        `SecurityDelay`                   Int32,
        `LateAircraftDelay`               Int32,
        `FirstDepTime`                    String,
        `TotalAddGTime`                   String,
        `LongestAddGTime`                 String,
        `DivAirportLandings`              String,
        `DivReachedDest`                  String,
        `DivActualElapsedTime`            String,
        `DivArrDelay`                     String,
        `DivDistance`                     String,
        `Div1Airport`                     String,
        `Div1AirportID`                   Int32,
        `Div1AirportSeqID`                Int32,
        `Div1WheelsOn`                    String,
        `Div1TotalGTime`                  String,
        `Div1LongestGTime`                String,
        `Div1WheelsOff`                   String,
        `Div1TailNum`                     String,
        `Div2Airport`                     String,
        `Div2AirportID`                   Int32,
        `Div2AirportSeqID`                Int32,
        `Div2WheelsOn`                    String,
        `Div2TotalGTime`                  String,
        `Div2LongestGTime`                String,
        `Div2WheelsOff`                   String,
        `Div2TailNum`                     String,
        `Div3Airport`                     String,
        `Div3AirportID`                   Int32,
        `Div3AirportSeqID`                Int32,
        `Div3WheelsOn`                    String,
        `Div3TotalGTime`                  String,
        `Div3LongestGTime`                String,
        `Div3WheelsOff`                   String,
        `Div3TailNum`                     String,
        `Div4Airport`                     String,
        `Div4AirportID`                   Int32,
        `Div4AirportSeqID`                Int32,
        `Div4WheelsOn`                    String,
        `Div4TotalGTime`                  String,
        `Div4LongestGTime`                String,
        `Div4WheelsOff`                   String,
        `Div4TailNum`                     String,
        `Div5Airport`                     String,
        `Div5AirportID`                   Int32,
        `Div5AirportSeqID`                Int32,
        `Div5WheelsOn`                    String,
        `Div5TotalGTime`                  String,
        `Div5LongestGTime`                String,
        `Div5WheelsOff`                   String,
        `Div5TailNum`                     String
    )ENGINE = MergeTree(
        '/clickhouse/tables/{database}/{table}/{shard}',
        '{replica}')
     PARTITION BY toYYYYMM(FlightDate)
     PRIMARY KEY (intHash32(FlightDate))
     ORDER BY (intHash32(FlightDate))
     SAMPLE BY intHash32(FlightDate)
    SETTINGS index_granularity= 8192 ;
    
    --建立分布式表
    CREATE TABLE ontime_distributed ON CLUSTER default
     AS ontime_local 
    ENGINE = Distributed(default, default, ontime_local, rand());
    
    --退出
    exit;
  4. 解压下载的开源数据集,导入数据 :
    unzip ontime-data.zip;
    ./clickhouse-client --host=<host> --port=<port> --user=<user> --password=<password> --query="INSERT INTO ontime_distributed FORMAT CSVWithNames" < ontime-data.csv;
  5. 支持使用SQL语句进行数据查询。示例如下:
    $ ./clickhouse-client --multiline --host=<host> --port=<port> --user=<user> --password=<password> 
    
    :) SELECT
        OriginCityName,
        DestCityName,
        count(*) AS flights,
        bar(flights, 0, 20000, 40)
    FROM ontime_distributed 
    WHERE Year = 1988
    GROUP BY OriginCityName, DestCityName 
    ORDER BY flights DESC 
    LIMIT 20;
    
    :) SELECT
        OriginCityName < DestCityName ? OriginCityName : DestCityName AS a,
        OriginCityName < DestCityName ? DestCityName : OriginCityName AS b,
        count(*) AS flights,
        bar(flights, 0, 40000, 40)
    FROM ontime_distributed 
    WHERE Year = 1988 
    GROUP BY a, b 
    ORDER BY flights DESC 
    LIMIT 20;
    
    :) SELECT OriginCityName, count(*) AS flights
    FROM ontime_distributed 
    GROUP BY OriginCityName 
    ORDER BY flights DESC 
    LIMIT 20;