Cluster info:
Version: doris-2.1.6
Hardware: 5 BEs, each with 96 cores, 256GB memory, 3.5TB (SSD) + 21TB (HDD) * 11
Symptoms:
The Hive table has many map columns; a single daily partition holds 25 million rows (~100GB of ORC files). Loading it into variant columns of a Doris table via broker load is very slow, taking about 100 minutes.
During the load the cluster shows no significant spike: BE memory is only ~10GB above idle and CPU barely moves. ScannedRows advances slowly. I tried tuning max_broker_concurrency, min_bytes_per_broker_scanner, and load_parallelism, but none of them had any effect. The broker source (BrokerConfig.java) also has a parameter hdfs_read_buffer_size_kb (default 1024) that I have not tried adjusting yet.
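For reference, here is where each of those knobs is set (a sketch only; the values below are illustrative placeholders, not tested recommendations):
-- max_broker_concurrency and min_bytes_per_broker_scanner are FE configs,
-- adjustable at runtime:
ADMIN SET FRONTEND CONFIG ("max_broker_concurrency" = "10");
ADMIN SET FRONTEND CONFIG ("min_bytes_per_broker_scanner" = "536870912"); -- 512MB
-- load_parallelism is a per-job property, passed in the load statement's PROPERTIES:
-- PROPERTIES ("load_parallelism" = "8", ...)
-- hdfs_read_buffer_size_kb is a broker-process config (BrokerConfig.java), so it is set
-- in the broker's conf file and needs a broker restart rather than a SQL command.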
The rough table definitions and load statement are below; please help check whether there is room for optimization.
-- Hive source table: 25 million rows per daily partition, ~100GB of ORC files
create table tmp.hive_source(
uid string,
m0 map<string,string>,
m1 map<string,string>,
m2 map<string,string>,
m3 map<string,string>,
m4 map<string,string>,
m5 map<string,string>,
m6 map<string,string>,
m7 map<string,string>,
m8 map<string,string>,
m9 map<string,string>
)
partitioned by (dt string)
stored as orc;
-- Doris target table: after loading, show partitions reports ~500GB for the daily partition
create table tmp.doris_target(
uid string,
dt bigint,
m0 variant,
m1 variant,
m2 variant,
m3 variant,
m4 variant,
m5 variant,
m6 variant,
m7 variant,
m8 variant,
m9 variant
) engine=olap
duplicate key(uid)
partition by range(dt)
()
distributed by hash(uid) buckets 50
properties(
"replication_num" = "3",
-- ...
"dynamic_partition.hot_partition_num" = "4"
-- ...
);
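The ~500GB figure above is the partition DataSize reported by show partitions, e.g.:
-- size of one daily partition (partition name matches the load statement below)
show partitions from tmp.doris_target where PartitionName = "P_20240101";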
-- broker load statement
-- takes ~100 minutes
load label tmp.doris_target_label01 (
data infile ("hdfs://user/hive/warehouse/tmp.db/hive_source/dt=20240101/*")
into table `doris_target` partition(P_20240101)
columns terminated by '\x01' format as 'orc'
(_col0,_col1,_col2,_col3,_col4,_col5,_col6,_col7,_col8,_col9,_col10)
set (
`dt` = 20240101,
`uid` = `_col0`,
`m0` = map2json(`_col1`),
`m1` = map2json(`_col2`),
`m2` = map2json(`_col3`),
`m3` = map2json(`_col4`),
`m4` = map2json(`_col5`),
`m5` = map2json(`_col6`),
`m6` = map2json(`_col7`),
`m7` = map2json(`_col8`),
`m8` = map2json(`_col9`),
`m9` = map2json(`_col10`)
)
) with broker my_hdfs_broker (...)
properties(
"timeout" = "7200",
"max_filter_ratio" = "0"
);
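The slow ScannedRows progress mentioned above was read from show load (the counter sits in the JobDetails column), e.g.:
-- monitor the running job; JobDetails contains ScannedRows / FileNumber / FileSize
show load from tmp where label = "doris_target_label01";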
// map2json UDF: serializes one map column to a JSON string.
// Note: writeValueAsString throws the checked JsonProcessingException, and a fresh
// ObjectMapper is constructed on every call (the UDF fires 10 times per row here).
public String evaluate(HashMap<String, Object> map) throws JsonProcessingException {
    return new ObjectMapper().writeValueAsString(map);
}