Broker Load of a large table is slow

Cluster info:
Version: doris-2.1.6
Configuration: BE * 5; each machine: 96 cores, 256 GB memory, 3.5 TB (SSD) + 21 TB (HDD) * 11

Problem description:
The Hive table contains many map columns. A single daily partition has 25 million rows, about 100 GB of ORC files. Loading it into variant columns of a Doris table via Broker Load is very slow, taking about 100 minutes.
During the load the cluster shows no significant pressure: BE memory usage is only about 10 GB above idle, and CPU barely moves. ScannedRows advances slowly. Adjusting max_broker_concurrency, min_bytes_per_broker_scanner, and load_parallelism had no effect. The source code (BrokerConfig.java) also has a parameter hdfs_read_buffer_size_kb with a default of 1024 that has not been tried yet.
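For reference, the adjustments attempted above would look roughly like the sketch below (values are illustrative, not the exact ones that were tried): max_broker_concurrency and min_bytes_per_broker_scanner are FE configs, load_parallelism is a per-job load property, and hdfs_read_buffer_size_kb is read by the broker process itself, so it would have to be changed in the broker's own conf file and the broker restarted.

-- FE configs, adjustable at runtime (illustrative values only):
admin set frontend config ("max_broker_concurrency" = "10");
admin set frontend config ("min_bytes_per_broker_scanner" = "536870912");  -- 512 MB per scanner

-- load_parallelism is passed per job in the load statement's properties, e.g.:
-- properties("timeout" = "7200", "max_filter_ratio" = "0", "load_parallelism" = "8")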

The approximate table definitions and load statement are below; please take a look and see whether there is any room for optimization.

-- Hive source table: 25 million rows per daily partition, ORC files ~100 GB
create table tmp.hive_source(
    uid string,
    m0 map<string,string>,
    m1 map<string,string>,
    m2 map<string,string>,
    m3 map<string,string>,
    m4 map<string,string>,
    m5 map<string,string>,
    m6 map<string,string>,
    m7 map<string,string>,
    m8 map<string,string>,
    m9 map<string,string>
)
partitioned by (dt string)
stored as orc;


-- Doris target table; after loading, SHOW PARTITIONS shows roughly 500 GB for a single daily partition
create table tmp.doris_target(
    uid string,
    dt bigint,
    m0 variant,
    m1 variant,
    m2 variant,
    m3 variant,
    m4 variant,
    m5 variant,
    m6 variant,
    m7 variant,
    m8 variant,
    m9 variant
) engine=olap
duplicate key(uid)
partition by range(dt)
()
distributed by hash(uid) buckets 50
properties(
"replication_num"="3".
-- ...
"dynamic_partition.hot_partition_num" = "4",
-- ...
);


-- Broker Load statement
-- takes about 100 minutes
load label tmp.doris_target_label01 (
    data infile ("hdfs://user/hive/warehouse/tmp.db/hive_source/dt=20240101/*")
    into table `tmp.doris_target` partition(P_20240101)
    columns terminated by '\x01' format as 'orc'
    (_col0,_col1,_col2,_col3,_col4,_col5,_col6,_col7,_col8,_col9,_col10)
    set (
        `dt` = 20240101,
        `uid` = `_col0`,
        `m0` = map2json(`_col1`),
        `m1` = map2json(`_col2`),
        `m2` = map2json(`_col3`),
        `m3` = map2json(`_col4`),
        `m4` = map2json(`_col5`),
        `m5` = map2json(`_col6`),
        `m6` = map2json(`_col7`),
        `m7` = map2json(`_col8`),
        `m8` = map2json(`_col9`),
        `m9` = map2json(`_col10`)
    )
) with broker my_hdfs_broker (...)
properties(
    "timeout" = "7200",
    "max_filter_ratio" = "0"
);

// map2json UDF: serializes a map column to a JSON string using Jackson
public String evaluate(HashMap<String, Object> map) throws JsonProcessingException {
    return new ObjectMapper().writeValueAsString(map);
}
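For context, the ScannedRows progress mentioned above was read from the load job status; a command of roughly this shape shows it for the label used in the statement above:

show load from tmp where label = "doris_target_label01";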