本次测试针对开源自建的Presto与阿里云云原生数据湖分析DLA Presto在OSS数据源上执行TPC-H查询的性能做了对比分析。您可以按照本文介绍自行测试对比,快速了解云原生数据湖分析(DLA)Presto引擎的性能。

  1. 准备TPC-H测试数据。关于如何生成TPC-H测试数据,具体请参考TPC-H官方文档

    TPC-H测试数据生成后,需要分别转换成Parquet和ORC格式,上传到OSS。

  2. 分别在开源Presto和DLA Presto中建表。示例如下:
    对于开源Presto集群,可以在presto中建表(需要修改配置,默认不支持),也可以在hive中建表。以下是hive的建表语句。
    
    CREATE SCHEMA if not exists tpch_1000x_orc
    location  'oss://path/to/tpch/tpch_1000x_orc/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_orc.lineitem (
        l_orderkey bigint,
        l_partkey bigint,
        l_suppkey bigint,
        l_linenumber int,
        l_quantity double,
        l_extendedprice double,
        l_discount double,
        l_tax double,
        l_returnflag string,
        l_linestatus string,
        l_shipdate string,
        l_commitdate string,
        l_receiptdate string,
        l_shipinstruct string,
        l_shipmode string,
        l_comment string
    )
    STORED AS ORC
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/lineitem_orc/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_orc.customer (
        c_custkey bigint,
        c_name string,
        c_address string,
        c_nationkey int,
        c_phone string,
        c_acctbal double,
        c_mktsegment string,
        c_comment string
    )
    STORED AS ORC
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/customer_orc/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_orc.nation (
        n_nationkey int,
        n_name string,
        n_regionkey int,
        n_comment string
    )
    STORED AS ORC
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/nation_orc/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_orc.orders (
        o_orderkey bigint,
        o_custkey bigint,
        o_orderstatus string,
        o_totalprice double,
        o_orderdate string,
        o_orderpriority string,
        o_clerk string,
        o_shippriority int,
        o_comment string
    )
    STORED AS ORC
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/orders_orc/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_orc.part (
        p_partkey bigint,
        p_name string,
        p_mfgr string,
        p_brand string,
        p_type string,
        p_size int,
        p_container string,
        p_retailprice double,
        p_comment string
    )
    STORED AS ORC
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/part_orc/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_orc.partsupp (
        ps_partkey bigint,
        ps_suppkey bigint,
        ps_availqty int,
        ps_supplycost double,
        ps_comment string
    )
    STORED AS ORC
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/partsupp_orc/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_orc.region (
        r_regionkey int,
        r_name string,
        r_comment string
    )
    STORED AS ORC
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/region_orc/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_orc.supplier (
        s_suppkey bigint,
        s_name string,
        s_address string,
        s_nationkey int,
        s_phone string,
        s_acctbal double,
        s_comment string
    )
    STORED AS ORC
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/supplier_orc/'
    ;
    
    CREATE SCHEMA if not exists tpch_1000x_parquet
    location  'oss://path/to/tpch/tpch_1000x_parquet/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_parquet.lineitem (
        l_orderkey bigint,
        l_partkey bigint,
        l_suppkey bigint,
        l_linenumber int,
        l_quantity double,
        l_extendedprice double,
        l_discount double,
        l_tax double,
        l_returnflag string,
        l_linestatus string,
        l_shipdate string,
        l_commitdate string,
        l_receiptdate string,
        l_shipinstruct string,
        l_shipmode string,
        l_comment string
    )
    STORED AS PARQUET
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/lineitem_parquet/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_parquet.customer (
        c_custkey bigint,
        c_name string,
        c_address string,
        c_nationkey int,
        c_phone string,
        c_acctbal double,
        c_mktsegment string,
        c_comment string
    )
    STORED AS PARQUET
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/customer_parquet/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_parquet.nation (
        n_nationkey int,
        n_name string,
        n_regionkey int,
        n_comment string
    )
    STORED AS PARQUET
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/nation_parquet/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_parquet.orders (
        o_orderkey bigint,
        o_custkey bigint,
        o_orderstatus string,
        o_totalprice double,
        o_orderdate string,
        o_orderpriority string,
        o_clerk string,
        o_shippriority int,
        o_comment string
    )
    STORED AS PARQUET
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/orders_parquet/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_parquet.part (
        p_partkey bigint,
        p_name string,
        p_mfgr string,
        p_brand string,
        p_type string,
        p_size int,
        p_container string,
        p_retailprice double,
        p_comment string
    )
    STORED AS PARQUET
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/part_parquet/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_parquet.partsupp (
        ps_partkey bigint,
        ps_suppkey bigint,
        ps_availqty int,
        ps_supplycost double,
        ps_comment string
    )
    STORED AS PARQUET
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/partsupp_parquet/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_parquet.region (
        r_regionkey int,
        r_name string,
        r_comment string
    )
    STORED AS PARQUET
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/region_parquet/'
    ;
    
    CREATE EXTERNAL TABLE tpch_1000x_parquet.supplier (
        s_suppkey bigint,
        s_name string,
        s_address string,
        s_nationkey int,
        s_phone string,
        s_acctbal double,
        s_comment string
    )
    STORED AS PARQUET
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/supplier_parquet/'
    ;
    DLA Presto中的建表语句如下:
    create external table if not exists `tpch_1000x_orc`.`customer` (
        `c_custkey` bigint,
        `c_name` string,
        `c_address` string,
        `c_nationkey` int,
        `c_phone` string,
        `c_acctbal` double,
        `c_mktsegment` string,
        `c_comment` string
    )
    STORED AS `ORC`
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/customer_orc/';
    
    CREATE EXTERNAL TABLE if not exists `tpch_1000x_orc`.`lineitem` (
        `l_orderkey` bigint,
        `l_partkey` bigint,
        `l_suppkey` bigint,
        `l_linenumber` int,
        `l_quantity` double,
        `l_extendedprice` double,
        `l_discount` double,
        `l_tax` double,
        `l_returnflag` string,
        `l_linestatus` string,
        `l_shipdate` string,
        `l_commitdate` string,
        `l_receiptdate` string,
        `l_shipinstruct` string,
        `l_shipmode` string,
        `l_comment` string
    )
    STORED AS `ORC`
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/lineitem_orc/';
    
    create external table if not exists `tpch_1000x_orc`.`nation` (
        `n_nationkey` int,
        `n_name` string,
        `n_regionkey` int,
        `n_comment` string
    )
    STORED AS `ORC`
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/nation_orc/';
    
    create external table if not exists `tpch_1000x_orc`.`orders` (
        `o_orderkey` bigint,
        `o_custkey` bigint,
        `o_orderstatus` string,
        `o_totalprice` double,
        `o_orderdate` string,
        `o_orderpriority` string,
        `o_clerk` string,
        `o_shippriority` int,
        `o_comment` string
    )
    STORED AS `ORC`
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/orders_orc/';
    
    create external table if not exists `tpch_1000x_orc`.`part` (
        `p_partkey` bigint,
        `p_name` string,
        `p_mfgr` string,
        `p_brand` string,
        `p_type` string,
        `p_size` int,
        `p_container` string,
        `p_retailprice` double,
        `p_comment` string
    )
    STORED AS `ORC`
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/part_orc/';
    
    create external table if not exists `tpch_1000x_orc`.`partsupp` (
        `ps_partkey` bigint,
        `ps_suppkey` bigint,
        `ps_availqty` int,
        `ps_supplycost` double,
        `ps_comment` string
    )
    STORED AS `ORC`
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/partsupp_orc/';
    
    create external table if not exists `tpch_1000x_orc`.`region` (
        `r_regionkey` int,
        `r_name` string,
        `r_comment` string
    )
    STORED AS `ORC`
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/region_orc/';
    
    create external table if not exists `tpch_1000x_orc`.`supplier` (
        `s_suppkey` bigint,
        `s_name` string,
        `s_address` string,
        `s_nationkey` int,
        `s_phone` string,
        `s_acctbal` double,
        `s_comment` string
    )
    STORED AS `ORC`
    LOCATION 'oss://path/to/tpch/tpch_1000x_orc/supplier_orc/';
    
    create database if not exists`tpch_1000x_parquet`
    WITH DBPROPERTIES (
        catalog = 'oss',
        location = 'oss://path/to/tpch/tpch_1000x_parquet/'
    );
    
    create external table if not exists `tpch_1000x_parquet`.`customer` (
        `c_custkey` bigint,
        `c_name` string,
        `c_address` string,
        `c_nationkey` int,
        `c_phone` string,
        `c_acctbal` double,
        `c_mktsegment` string,
        `c_comment` string
    )
    STORED AS `PARQUET`
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/customer_parquet/';
    
    CREATE EXTERNAL TABLE if not exists `tpch_1000x_parquet`.`lineitem` (
        `l_orderkey` bigint,
        `l_partkey` bigint,
        `l_suppkey` bigint,
        `l_linenumber` int,
        `l_quantity` double,
        `l_extendedprice` double,
        `l_discount` double,
        `l_tax` double,
        `l_returnflag` string,
        `l_linestatus` string,
        `l_shipdate` string,
        `l_commitdate` string,
        `l_receiptdate` string,
        `l_shipinstruct` string,
        `l_shipmode` string,
        `l_comment` string
    )
    STORED AS `PARQUET`
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/lineitem_parquet/';
    
    create external table if not exists `tpch_1000x_parquet`.`nation` (
        `n_nationkey` int,
        `n_name` string,
        `n_regionkey` int,
        `n_comment` string
    )
    STORED AS `PARQUET`
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/nation_parquet/';
    
    create external table if not exists `tpch_1000x_parquet`.`orders` (
        `o_orderkey` bigint,
        `o_custkey` bigint,
        `o_orderstatus` string,
        `o_totalprice` double,
        `o_orderdate` string,
        `o_orderpriority` string,
        `o_clerk` string,
        `o_shippriority` int,
        `o_comment` string
    )
    STORED AS `PARQUET`
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/orders_parquet/';
    
    create external table if not exists `tpch_1000x_parquet`.`part` (
        `p_partkey` bigint,
        `p_name` string,
        `p_mfgr` string,
        `p_brand` string,
        `p_type` string,
        `p_size` int,
        `p_container` string,
        `p_retailprice` double,
        `p_comment` string
    )
    STORED AS `PARQUET`
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/part_parquet/';
    
    create external table if not exists `tpch_1000x_parquet`.`partsupp` (
        `ps_partkey` bigint,
        `ps_suppkey` bigint,
        `ps_availqty` int,
        `ps_supplycost` double,
        `ps_comment` string
    )
    STORED AS `PARQUET`
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/partsupp_parquet/';
    
    create external table if not exists `tpch_1000x_parquet`.`region` (
        `r_regionkey` int,
        `r_name` string,
        `r_comment` string
    )
    STORED AS `PARQUET`
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/region_parquet/';
    
    create external table if not exists `tpch_1000x_parquet`.`supplier` (
        `s_suppkey` bigint,
        `s_name` string,
        `s_address` string,
        `s_nationkey` int,
        `s_phone` string,
        `s_acctbal` double,
        `s_comment` string
    )
    STORED AS `PARQUET`
    LOCATION 'oss://path/to/tpch/tpch_1000x_parquet/supplier_parquet/';
  3. 分别在开源Presto集群和DLA Presto上面执行TPC-H查询,并记录每条查询的执行时间。
    说明
    • 不要同时在开源Presto集群和DLA Presto上执行查询,避免因为OSS带宽的争抢造成测试结果不准确。
    • DLA Presto需要添加以下hint指定查询在特定的集群中执行。
      /*+cluster=your-cu-name*/