perf(executor-task): 减少数据流转

通过设置相同的并行度，让数据读取和数据过滤合并到一个算子里，避免大表 base 文件扫描失败
This commit is contained in:
v-zhangjc9
2024-05-13 08:44:40 +08:00
parent 80fae0be38
commit e5f945c74b
4 changed files with 223 additions and 6 deletions

View File

@@ -0,0 +1,56 @@
package com.lanyuanxiaoyao.service.command.pro;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.util.BaseFileUtils;
import org.apache.hudi.common.util.ParquetUtils;
import org.apache.hudi.org.apache.avro.Schema;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.eclipse.collections.api.factory.Lists;
import org.eclipse.collections.api.list.MutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static com.lanyuanxiaoyao.service.command.pro.commands.HudiCommand.reader1;
import static com.lanyuanxiaoyao.service.command.pro.commands.HudiCommand.reader2;
import static com.lanyuanxiaoyao.service.command.pro.commands.HudiCommand.reader3;
import static com.lanyuanxiaoyao.service.command.pro.commands.HudiCommand.time;
/**
 * Ad-hoc benchmark harness comparing three Parquet reader strategies against a
 * single local Hudi base file.
 *
 * <p>Readers 1 and 2 run with the default (full) schema; before reader 3 runs,
 * a column projection containing only the {@code CODE} field is pushed down via
 * {@link AvroReadSupport#setRequestedProjection}, so reader 3 measures the
 * benefit of reading a single column. Timing/counting is delegated to the
 * {@code time}/{@code readerN} helpers statically imported from
 * {@code HudiCommand}.
 */
public class TestParquetReader {

    private static final Logger logger = LoggerFactory.getLogger(TestParquetReader.class);

    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        // Hard-coded local Hudi base file used for all three reader runs.
        Path root = new Path("/Users/lanyuanxiaoyao/Downloads/00000007-ecf6-445e-a6ed-43805f3ef27a_5-10-0_20240511083819386.parquet");

        // Baseline runs: full-schema reads, no projection configured yet.
        time("reader 1", counter -> reader1(counter, configuration, root));
        time("reader 2", counter -> reader2(counter, configuration, root));

        // Read the file's Avro schema and build a projection containing only
        // the CODE column.
        ParquetUtils baseFileUtils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
        Schema schema = baseFileUtils.readAvroSchema(configuration, root);
        MutableList<Schema.Field> fields = Lists.mutable.ofAll(schema.getFields())
                .select(field -> field.name().equals("CODE"))
                // Avro forbids reusing a Field instance in a second record
                // schema, so each selected field must be copied.
                .collect(field -> new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal()))
                .toList();
        Schema readSchema = Schema.createRecord(schema.getName(), null, null, false, fields);

        // Push the single-column projection down into the Hadoop configuration
        // so the Avro read support only materializes CODE.
        AvroReadSupport.setRequestedProjection(configuration, readSchema);
        time("reader 3", counter -> reader3(counter, configuration, root));
    }
}

View File

@@ -0,0 +1,17 @@
<configuration>
    <!-- Spring Boot color/whitespace converters used by the console pattern below. -->
    <conversionRule conversionWord="clr" converterClass="org.springframework.boot.logging.logback.ColorConverter" />
    <conversionRule conversionWord="wex" converterClass="org.springframework.boot.logging.logback.WhitespaceThrowableProxyConverter" />
    <conversionRule conversionWord="wEx" converterClass="org.springframework.boot.logging.logback.ExtendedWhitespaceThrowableProxyConverter" />
    <!-- Single console appender; the "#@#" token in the pattern appears to be a
         custom field separator for downstream log parsing - confirm with consumers. -->
    <appender name="Console" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} %clr(%5p) %clr([%t]){magenta} %clr(%logger{40}){cyan} #@# %m%n%wEx</pattern>
        </encoder>
    </appender>
    <!-- Everything is ERROR by default; only the application's own packages log at INFO. -->
    <root level="ERROR">
        <appender-ref ref="Console"/>
    </root>
    <logger name="com.lanyuanxiaoyao.service" level="INFO"/>
</configuration>