feat(executor-task): parquet文件读取增加选择列

指定列名可以提高检索速度,默认选择所有列
This commit is contained in:
v-zhangjc9
2024-05-12 17:41:09 +08:00
parent b51176e5c2
commit a1e0b20e87
9 changed files with 172 additions and 83 deletions

View File

@@ -36,9 +36,10 @@ public class ExecutorTaskController {
@RequestParam(value = "scan_queue", defaultValue = "false") Boolean scanQueue, @RequestParam(value = "scan_queue", defaultValue = "false") Boolean scanQueue,
@RequestParam(value = "scan_log", defaultValue = "false") Boolean scanLog, @RequestParam(value = "scan_log", defaultValue = "false") Boolean scanLog,
@RequestParam(value = "scan_base", defaultValue = "false") Boolean scanBase, @RequestParam(value = "scan_base", defaultValue = "false") Boolean scanBase,
@RequestParam(value = "scan_target", defaultValue = "false") Boolean scanTarget @RequestParam(value = "scan_target", defaultValue = "false") Boolean scanTarget,
@RequestParam(value = "filter_fields", required = false) String filterFields
) throws Exception { ) throws Exception {
logger.info("Enter method: scan[key, hdfs, pulsar, pulsarTopic, scanSource, scanQueue, scanLog, scanBase, scanTarget]. " + "key:" + key + "," + "hdfs:" + hdfs + "," + "pulsar:" + pulsar + "," + "pulsarTopic:" + pulsarTopic + "," + "scanSource:" + scanSource + "," + "scanQueue:" + scanQueue + "," + "scanLog:" + scanLog + "," + "scanBase:" + scanBase + "," + "scanTarget:" + scanTarget); logger.info("Enter method: scan[key, hdfs, pulsar, pulsarTopic, scanSource, scanQueue, scanLog, scanBase, scanTarget, filter_fields]. " + "key:" + key + "," + "hdfs:" + hdfs + "," + "pulsar:" + pulsar + "," + "pulsarTopic:" + pulsarTopic + "," + "scanSource:" + scanSource + "," + "scanQueue:" + scanQueue + "," + "scanLog:" + scanLog + "," + "scanBase:" + scanBase + "," + "scanTarget:" + scanTarget + "," + "filter_fields:" + filterFields);
if (!scanSource && !scanQueue && !scanLog && !scanBase && !scanTarget) { if (!scanSource && !scanQueue && !scanLog && !scanBase && !scanTarget) {
throw new RuntimeException("Must choose one mode"); throw new RuntimeException("Must choose one mode");
} }
@@ -48,7 +49,7 @@ public class ExecutorTaskController {
if ((scanLog || scanBase) && StrUtil.isBlank(hdfs)) { if ((scanLog || scanBase) && StrUtil.isBlank(hdfs)) {
throw new RuntimeException("Hdfs path cannot be empty"); throw new RuntimeException("Hdfs path cannot be empty");
} }
return executorTaskService.scanAvro(key, hdfs, pulsar, pulsarTopic, scanSource, scanQueue, scanLog, scanBase, scanTarget); return executorTaskService.scanAvro(key, hdfs, pulsar, pulsarTopic, scanSource, scanQueue, scanLog, scanBase, scanTarget, filterFields);
} }
@GetMapping("latest_op_ts") @GetMapping("latest_op_ts")

View File

@@ -21,7 +21,18 @@ import java.util.Optional;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.flink.client.cli.ClientOptions; import org.apache.flink.client.cli.ClientOptions;
import org.apache.flink.configuration.*; import org.apache.flink.configuration.AkkaOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.CoreOptions;
import org.apache.flink.configuration.DeploymentOptions;
import org.apache.flink.configuration.HeartbeatManagerOptions;
import org.apache.flink.configuration.JobManagerOptions;
import org.apache.flink.configuration.MemorySize;
import org.apache.flink.configuration.PipelineOptions;
import org.apache.flink.configuration.ResourceManagerOptions;
import org.apache.flink.configuration.RestOptions;
import org.apache.flink.configuration.SecurityOptions;
import org.apache.flink.configuration.TaskManagerOptions;
import org.apache.flink.yarn.configuration.YarnConfigOptions; import org.apache.flink.yarn.configuration.YarnConfigOptions;
import org.apache.flink.yarn.configuration.YarnDeploymentTarget; import org.apache.flink.yarn.configuration.YarnDeploymentTarget;
import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataInputStream;
@@ -141,10 +152,23 @@ public class ExecutorTaskService {
Boolean scanQueue, Boolean scanQueue,
Boolean scanLog, Boolean scanLog,
Boolean scanBase, Boolean scanBase,
Boolean scanTarget Boolean scanTarget,
String filterFields
) throws Exception { ) throws Exception {
String taskId = taskId(); String taskId = taskId();
Configuration configuration = generateConfiguration(taskId, "scan " + key);
MutableList<String> types = Lists.mutable.empty();
if (scanSource)
types.add("source");
if (scanQueue)
types.add("queue");
if (scanLog)
types.add("log");
if (scanBase)
types.add("base");
if (scanTarget)
types.add("target");
Configuration configuration = generateConfiguration(taskId, StrUtil.format("scan {} {}", types.makeString(","), key));
MapBuilder<String, Object> builder = MapUtil.builder(); MapBuilder<String, Object> builder = MapUtil.builder();
setEnvironment(configuration, "key", key); setEnvironment(configuration, "key", key);
@@ -164,6 +188,10 @@ public class ExecutorTaskService {
builder.put("pulsar", pulsar); builder.put("pulsar", pulsar);
builder.put("pulsar_topic", pulsarTopic); builder.put("pulsar_topic", pulsarTopic);
} }
if (StrUtil.isNotBlank(filterFields)) {
builder.put("filter_fields", filterFields);
}
ApplicationId applicationId = Runner.run( ApplicationId applicationId = Runner.run(
configuration, configuration,
"com.lanyuanxiaoyao.service.executor.task.DataScanner", "com.lanyuanxiaoyao.service.executor.task.DataScanner",

View File

@@ -9,13 +9,20 @@ import com.lanyuanxiaoyao.service.executor.task.functions.pulsar.ReadPulsarSourc
import com.lanyuanxiaoyao.service.executor.task.helper.ArgumentsHelper; import com.lanyuanxiaoyao.service.executor.task.helper.ArgumentsHelper;
import com.lanyuanxiaoyao.service.executor.task.helper.FlinkHelper; import com.lanyuanxiaoyao.service.executor.task.helper.FlinkHelper;
import com.lanyuanxiaoyao.service.executor.task.helper.HdfsHelper; import com.lanyuanxiaoyao.service.executor.task.helper.HdfsHelper;
import java.util.Arrays;
import java.util.Map; import java.util.Map;
import java.util.function.BiFunction;
import java.util.function.Function;
import org.apache.flink.api.common.eventtime.WatermarkStrategy; import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.eclipse.collections.api.factory.Sets;
import org.eclipse.collections.api.list.ImmutableList; import org.eclipse.collections.api.list.ImmutableList;
import org.eclipse.collections.api.set.ImmutableSet;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@@ -35,6 +42,7 @@ public class DataScanner {
Map<String, Object> metadata = taskContext.getMetadata(); Map<String, Object> metadata = taskContext.getMetadata();
ArgumentsHelper.checkMetadata(taskContext, "key"); ArgumentsHelper.checkMetadata(taskContext, "key");
String key = (String) metadata.get("key"); String key = (String) metadata.get("key");
String[] keys = key.split(",");
Boolean scanQueue = (Boolean) metadata.getOrDefault("scan_queue", false); Boolean scanQueue = (Boolean) metadata.getOrDefault("scan_queue", false);
Boolean scanLog = (Boolean) metadata.getOrDefault("scan_log", false); Boolean scanLog = (Boolean) metadata.getOrDefault("scan_log", false);
Boolean scanBase = (Boolean) metadata.getOrDefault("scan_base", false); Boolean scanBase = (Boolean) metadata.getOrDefault("scan_base", false);
@@ -43,9 +51,23 @@ public class DataScanner {
throw new RuntimeException("Must choose mode scan_queue or scan_log or scan_data"); throw new RuntimeException("Must choose mode scan_queue or scan_log or scan_data");
} }
ImmutableSet<String> filterFields = Sets.immutable.empty();
String fieldText = (String) metadata.get("filter_fields");
if (StrUtil.isNotBlank(fieldText)) {
filterFields = Sets.immutable.of(fieldText.split(",")).collect(StrUtil::trim);
}
StreamExecutionEnvironment environment = FlinkHelper.getBatchEnvironment(); StreamExecutionEnvironment environment = FlinkHelper.getBatchEnvironment();
DataStream<RecordView> source = null; DataStream<String> source = null;
BiFunction<DataStream<RecordView>, Integer, DataStream<String>> filterKeys = (stream, parallelism) -> stream
.map(RecordView::toString)
.setParallelism(parallelism)
.name("Convert record to string")
.filter(line -> StrUtil.containsAny(line, keys))
.setParallelism(parallelism)
.name("Filter target key");
Function<Integer, Integer> parallelismPredict = parallelism -> Math.max(50, Math.min(parallelism / 2, 200));
int totalParallelism = 30; int totalParallelism = 30;
if (scanQueue) { if (scanQueue) {
ArgumentsHelper.checkMetadata(taskContext, "pulsar"); ArgumentsHelper.checkMetadata(taskContext, "pulsar");
@@ -53,10 +75,13 @@ public class DataScanner {
ArgumentsHelper.checkMetadata(taskContext, "pulsar_topic"); ArgumentsHelper.checkMetadata(taskContext, "pulsar_topic");
String pulsarTopic = (String) metadata.get("pulsar_topic"); String pulsarTopic = (String) metadata.get("pulsar_topic");
logger.info("Scan queue topic: {} url: {}", pulsarTopic, pulsarUrl); logger.info("Scan queue topic: {} url: {}", pulsarTopic, pulsarUrl);
DataStream<RecordView> stream = environment DataStream<String> stream = filterKeys.apply(
.fromSource(new ReadPulsarSource(taskContext, pulsarUrl, pulsarTopic), WatermarkStrategy.noWatermarks(), "Read pulsar") environment
.setParallelism(totalParallelism) .fromSource(new ReadPulsarSource(taskContext, pulsarUrl, pulsarTopic), WatermarkStrategy.noWatermarks(), "Read pulsar")
.disableChaining(); .setParallelism(totalParallelism)
.disableChaining(),
totalParallelism
);
if (ObjectUtil.isNull(source)) { if (ObjectUtil.isNull(source)) {
source = stream; source = stream;
} else { } else {
@@ -69,17 +94,24 @@ public class DataScanner {
FileSystem fileSystem = FileSystem.get(new Configuration()); FileSystem fileSystem = FileSystem.get(new Configuration());
HdfsHelper.checkHdfsPath(fileSystem, hdfs); HdfsHelper.checkHdfsPath(fileSystem, hdfs);
ImmutableList<FileStatus> paths = HdfsHelper.hdfsPaths(fileSystem, hdfs);
if (scanLog) { if (scanLog) {
logger.info("Scan log hdfs: {}", hdfs); logger.info("Scan log hdfs: {}", hdfs);
ImmutableList<String> logPaths = HdfsHelper.logPaths(fileSystem, hdfs); ImmutableList<String> logPaths = HdfsHelper.logPaths(paths)
.collect(FileStatus::getPath)
.collect(Path::toString);
int parallelism = HdfsHelper.logScanParallelismPredict(logPaths); int parallelism = HdfsHelper.logScanParallelismPredict(logPaths);
totalParallelism = Math.max(totalParallelism, parallelism); totalParallelism = Math.max(totalParallelism, parallelism);
DataStream<RecordView> stream = environment DataStream<String> stream = filterKeys.apply(
.fromCollection(logPaths.toList()) environment
.name("Read log paths") .fromCollection(logPaths.toList())
.flatMap(new ReadHudiFile()) .name("Read log paths")
.name("Read hudi file") .flatMap(new ReadHudiFile(filterFields))
.setParallelism(parallelism); .name("Read log file")
.setParallelism(parallelism),
parallelismPredict.apply(totalParallelism)
);
if (ObjectUtil.isNull(source)) { if (ObjectUtil.isNull(source)) {
source = stream; source = stream;
} else { } else {
@@ -88,15 +120,18 @@ public class DataScanner {
} }
if (scanBase) { if (scanBase) {
logger.info("Scan base hdfs: {}", hdfs); logger.info("Scan base hdfs: {}", hdfs);
ImmutableList<String> basePaths = HdfsHelper.latestBasePaths(fileSystem, hdfs); ImmutableList<String> basePaths = HdfsHelper.latestBasePaths(paths);
int parallelism = HdfsHelper.baseScanParallelismPredict(basePaths); int parallelism = HdfsHelper.baseScanParallelismPredict(basePaths);
totalParallelism = Math.max(totalParallelism, parallelism); totalParallelism = Math.max(totalParallelism, parallelism);
DataStream<RecordView> stream = environment DataStream<String> stream = filterKeys.apply(
.fromCollection(basePaths.toList()) environment
.name("Read base paths") .fromCollection(basePaths.toList())
.flatMap(new ReadHudiFile()) .name("Read base paths")
.name("Read hudi file") .flatMap(new ReadHudiFile(filterFields))
.setParallelism(parallelism); .name("Read base file")
.setParallelism(parallelism),
parallelismPredict.apply(totalParallelism)
);
if (ObjectUtil.isNull(source)) { if (ObjectUtil.isNull(source)) {
source = stream; source = stream;
} else { } else {
@@ -108,16 +143,10 @@ public class DataScanner {
throw new RuntimeException("Source cannot be null"); throw new RuntimeException("Source cannot be null");
} }
environment.setParallelism(Math.max(50, Math.min(totalParallelism / 2, 200)));
source source
.map(RecordView::toString)
.name("Covert record to string")
.filter(line -> StrUtil.contains(line, key))
.name("Filter target key")
.sinkTo(FlinkHelper.createFileSink(taskContext)) .sinkTo(FlinkHelper.createFileSink(taskContext))
.setParallelism(10) .setParallelism(10)
.name("Output results"); .name("Output results");
environment.execute(StrUtil.format("Search {}", key)); environment.execute(StrUtil.format("Search {}", Arrays.toString(keys)));
} }
} }

View File

@@ -38,7 +38,7 @@ public class LatestOperationTimeScan {
FileSystem fileSystem = FileSystem.get(new Configuration()); FileSystem fileSystem = FileSystem.get(new Configuration());
HdfsHelper.checkHdfsPath(fileSystem, hdfs); HdfsHelper.checkHdfsPath(fileSystem, hdfs);
ImmutableList<String> basePaths = HdfsHelper.latestBasePaths(fileSystem, hdfs); ImmutableList<String> basePaths = HdfsHelper.latestBasePaths(HdfsHelper.hdfsPaths(fileSystem, hdfs));
int parallelism = HdfsHelper.baseScanParallelismPredict(basePaths); int parallelism = HdfsHelper.baseScanParallelismPredict(basePaths);

View File

@@ -7,6 +7,7 @@ import com.lanyuanxiaoyao.service.executor.task.entity.RecordView;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.flink.api.common.functions.FlatMapFunction; import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.util.Collector; import org.apache.flink.util.Collector;
@@ -14,6 +15,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat;
@@ -22,11 +24,22 @@ import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; import org.apache.hudi.common.table.log.block.HoodieDeleteBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock;
import org.apache.hudi.common.util.BaseFileUtils;
import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.ClosableIterator;
import org.apache.hudi.io.storage.HoodieParquetReader; import org.apache.hudi.common.util.ParquetUtils;
import org.apache.hudi.org.apache.avro.Schema; import org.apache.hudi.org.apache.avro.Schema;
import org.apache.hudi.org.apache.avro.generic.GenericRecord;
import org.apache.hudi.org.apache.avro.generic.IndexedRecord; import org.apache.hudi.org.apache.avro.generic.IndexedRecord;
import org.apache.hudi.org.apache.avro.util.Utf8; import org.apache.hudi.org.apache.avro.util.Utf8;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.eclipse.collections.api.factory.Lists;
import org.eclipse.collections.api.factory.Sets;
import org.eclipse.collections.api.list.MutableList;
import org.eclipse.collections.api.set.ImmutableSet;
import org.eclipse.collections.api.set.MutableSet;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@@ -39,6 +52,19 @@ import org.slf4j.LoggerFactory;
public class ReadHudiFile implements FlatMapFunction<String, RecordView> { public class ReadHudiFile implements FlatMapFunction<String, RecordView> {
private static final Logger logger = LoggerFactory.getLogger(ReadHudiFile.class); private static final Logger logger = LoggerFactory.getLogger(ReadHudiFile.class);
private final MutableSet<String> filterFields = Sets.mutable.of(
HoodieRecord.COMMIT_TIME_METADATA_FIELD,
Constants.LATEST_OPERATION_TIMESTAMP_KEY_NAME
);
public ReadHudiFile() {
this(Sets.immutable.empty());
}
public ReadHudiFile(ImmutableSet<String> filterFields) {
this.filterFields.addAll(filterFields.toSet());
}
private RecordView parseData(String source, RecordView.Operation operation, IndexedRecord record) { private RecordView parseData(String source, RecordView.Operation operation, IndexedRecord record) {
Schema schema = record.getSchema(); Schema schema = record.getSchema();
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
@@ -56,12 +82,12 @@ public class ReadHudiFile implements FlatMapFunction<String, RecordView> {
} }
String timestamp = null; String timestamp = null;
Schema.Field commitTimeField = schema.getField(HoodieRecord.COMMIT_TIME_METADATA_FIELD); Schema.Field commitTimeField = schema.getField(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
if (ObjectUtil.isNotNull(commitTimeField)) { if (ObjectUtil.isNotNull(commitTimeField) && ObjectUtil.isNotNull(record.get(commitTimeField.pos()))) {
timestamp = ((Utf8) record.get(commitTimeField.pos())).toString(); timestamp = ((Utf8) record.get(commitTimeField.pos())).toString();
} }
String latestOpTs = null; String latestOpTs = null;
Schema.Field latestOpTsField = schema.getField(Constants.LATEST_OPERATION_TIMESTAMP_KEY_NAME); Schema.Field latestOpTsField = schema.getField(Constants.LATEST_OPERATION_TIMESTAMP_KEY_NAME);
if (ObjectUtil.isNotNull(latestOpTsField)) { if (ObjectUtil.isNotNull(latestOpTsField) || ObjectUtil.isNotNull(record.get(latestOpTsField.pos()))) {
latestOpTs = ((Utf8) record.get(latestOpTsField.pos())).toString(); latestOpTs = ((Utf8) record.get(latestOpTsField.pos())).toString();
} }
@@ -80,26 +106,35 @@ public class ReadHudiFile implements FlatMapFunction<String, RecordView> {
FileSystem readerFilesystem = FileSystem.get(readerConfiguration); FileSystem readerFilesystem = FileSystem.get(readerConfiguration);
Path filePath = new Path(value); Path filePath = new Path(value);
if (FSUtils.isLogFile(filePath)) { if (FSUtils.isLogFile(filePath)) {
readLogFile(readerFilesystem, filePath, out); readLogFile(readerFilesystem, filePath, out, filterFields.toImmutable());
} else if (FSUtils.isDataFile(filePath)) { } else if (FSUtils.isDataFile(filePath)) {
readDataFile(readerFilesystem, filePath, out); readDataFile(readerFilesystem, filePath, out, filterFields.toImmutable());
} else { } else {
logger.warn("Cannot read file format: {}", filePath); logger.warn("Cannot read file format: {}", filePath);
} }
} }
private void readDataFile(FileSystem readerFilesystem, Path dataFilePath, Collector<RecordView> out) throws IOException {
try(HoodieParquetReader<IndexedRecord> reader = new HoodieParquetReader<>(readerFilesystem.getConf(), dataFilePath)) { private void readDataFile(FileSystem readerFilesystem, Path dataFilePath, Collector<RecordView> out, ImmutableSet<String> filterFields) throws IOException {
try(ClosableIterator<IndexedRecord> recordIterator = reader.getRecordIterator()) { Configuration configuration = readerFilesystem.getConf();
while (recordIterator.hasNext()) { ParquetUtils baseFileUtils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
RecordView recordView = parseData(dataFilePath.toString(), RecordView.Operation.RESULT, recordIterator.next()); Schema schema = baseFileUtils.readAvroSchema(configuration, dataFilePath);
out.collect(recordView); MutableList<Schema.Field> fields = Lists.mutable.ofAll(schema.getFields())
} .select(field -> filterFields.isEmpty() || filterFields.anySatisfy(name -> StrUtil.equals(name, field.name())))
.collect(field -> new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal()))
.toList();
Schema readSchema = Schema.createRecord(schema.getName(), null, null, false, fields);
AvroReadSupport.setRequestedProjection(configuration, readSchema);
try (ParquetReader<GenericRecord> reader = AvroParquetReader.genericRecordReader(HadoopInputFile.fromPath(dataFilePath, configuration))) {
GenericRecord record;
while ((record = reader.read()) != null) {
RecordView recordView = parseData(dataFilePath.toString(), RecordView.Operation.RESULT, record);
out.collect(recordView);
} }
} }
} }
private void readLogFile(FileSystem readerFilesystem, Path logFilePath, Collector<RecordView> out) throws IOException { private void readLogFile(FileSystem readerFilesystem, Path logFilePath, Collector<RecordView> out, ImmutableSet<String> filterFields) throws IOException {
try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(readerFilesystem, new HoodieLogFile(logFilePath), null)) { try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(readerFilesystem, new HoodieLogFile(logFilePath), null)) {
while (reader.hasNext()) { while (reader.hasNext()) {
HoodieLogBlock block = reader.next(); HoodieLogBlock block = reader.next();

View File

@@ -6,12 +6,12 @@ import com.lanyuanxiaoyao.service.configuration.ExecutorProvider;
import com.lanyuanxiaoyao.service.executor.core.TaskContext; import com.lanyuanxiaoyao.service.executor.core.TaskContext;
import java.io.IOException; import java.io.IOException;
import java.util.Optional; import java.util.Optional;
import java.util.function.Predicate;
import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.eclipse.collections.api.factory.Lists; import org.eclipse.collections.api.factory.Lists;
import org.eclipse.collections.api.list.ImmutableList; import org.eclipse.collections.api.list.ImmutableList;
import org.slf4j.Logger; import org.slf4j.Logger;
@@ -49,25 +49,14 @@ public class HdfsHelper {
} }
public static Integer baseScanParallelismPredict(Integer pathNum) { public static Integer baseScanParallelismPredict(Integer pathNum) {
return Math.max(1, Math.min(pathNum / 2, 500)); return Math.max(1, Math.min(pathNum, 500));
} }
public static ImmutableList<String> latestBasePaths(FileSystem fileSystem, String root) throws IOException { public static ImmutableList<String> latestBasePaths(ImmutableList<FileStatus> paths) {
return latestBasePaths(fileSystem, new Path(root)); return paths
}
public static ImmutableList<String> latestBasePaths(FileSystem fileSystem, Path root) throws IOException {
return basePaths(fileSystem, root)
.asParallel(ExecutorProvider.EXECUTORS, 1) .asParallel(ExecutorProvider.EXECUTORS, 1)
.collect(Path::new) .reject(status -> status.getLen() < 1)
.reject(path -> { .collect(FileStatus::getPath)
try {
return FSUtils.getFileSize(fileSystem, path) < 1;
} catch (IOException e) {
logger.error("Get file size error", e);
}
return true;
})
.groupBy(FSUtils::getFileIdFromFilePath) .groupBy(FSUtils::getFileIdFromFilePath)
.multiValuesView() .multiValuesView()
.collect(pathList -> pathList .collect(pathList -> pathList
@@ -87,25 +76,24 @@ public class HdfsHelper {
.toImmutable(); .toImmutable();
} }
public static ImmutableList<String> basePaths(FileSystem fileSystem, String root) throws IOException { public static ImmutableList<FileStatus> basePaths(ImmutableList<FileStatus> paths) throws IOException {
return basePaths(fileSystem, new Path(root)); return paths.select(status -> FSUtils.isBaseFile(status.getPath()));
} }
public static ImmutableList<String> basePaths(FileSystem fileSystem, Path root) throws IOException { public static ImmutableList<FileStatus> logPaths(ImmutableList<FileStatus> paths) {
return hdfsPaths(fileSystem, root, FSUtils::isBaseFile); return paths.select(status -> FSUtils.isLogFile(status.getPath()));
} }
public static ImmutableList<String> logPaths(FileSystem fileSystem, String root) throws IOException { public static ImmutableList<FileStatus> hdfsPaths(FileSystem fileSystem, String root) throws IOException {
return logPaths(fileSystem, new Path(root)); return hdfsPaths(fileSystem, new Path(root));
} }
public static ImmutableList<String> logPaths(FileSystem fileSystem, Path root) throws IOException { /**
return hdfsPaths(fileSystem, root, FSUtils::isLogFile); * 获取hdfs文件列表这个方法是专门给hudi表查询base或log文件使用所以只扫描一层
} */
public static ImmutableList<FileStatus> hdfsPaths(FileSystem fileSystem, Path root) throws IOException {
public static ImmutableList<String> hdfsPaths(FileSystem fileSystem, Path root, Predicate<Path> check) throws IOException {
return Lists.immutable.of(fileSystem.listStatus(root)) return Lists.immutable.of(fileSystem.listStatus(root))
.reject(status -> StrUtil.equals(".hoodie", status.getPath().getName())) .reject(status -> StrUtil.equals(HoodieTableMetaClient.METAFOLDER_NAME, status.getPath().getName()))
.flatCollect(status -> { .flatCollect(status -> {
try { try {
if (status.isDirectory()) { if (status.isDirectory()) {
@@ -116,15 +104,12 @@ public class HdfsHelper {
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
}) });
.collect(FileStatus::getPath)
.select(check::test)
.collect(Path::toString);
} }
public static void createResult(FileSystem fileSystem, TaskContext context, String result) throws IOException { public static void createResult(FileSystem fileSystem, TaskContext context, String result) throws IOException {
Path resultPath = new Path(context.getResultPath() + "/" + context.getTaskId() + "/task-result"); Path resultPath = new Path(context.getResultPath() + "/" + context.getTaskId() + "/task-result");
try(FSDataOutputStream outputStream = fileSystem.create(resultPath)) { try (FSDataOutputStream outputStream = fileSystem.create(resultPath)) {
outputStream.writeUTF(result); outputStream.writeUTF(result);
} }
} }

View File

@@ -21,7 +21,8 @@ public interface TaskService {
@Query("pulsar_topic") String pulsarTopic, @Query("pulsar_topic") String pulsarTopic,
@Query("scan_queue") Boolean scanQueue, @Query("scan_queue") Boolean scanQueue,
@Query("scan_log") Boolean scanLog, @Query("scan_log") Boolean scanLog,
@Query("scan_base") Boolean scanBase @Query("scan_base") Boolean scanBase,
@Query("filter_fields") String filterFields
); );
@Get(value = "/task/latest_op_ts", readTimeout = 2 * 60 * 1000) @Get(value = "/task/latest_op_ts", readTimeout = 2 * 60 * 1000)

View File

@@ -39,7 +39,8 @@ public class TaskController {
@RequestParam(value = "hdfs", required = false) String hdfs, @RequestParam(value = "hdfs", required = false) String hdfs,
@RequestParam(value = "pulsar", required = false) String pulsar, @RequestParam(value = "pulsar", required = false) String pulsar,
@RequestParam(value = "topic", required = false) String topic, @RequestParam(value = "topic", required = false) String topic,
@RequestParam(value = "mode", defaultValue = "") String mode @RequestParam(value = "mode", defaultValue = "") String mode,
@RequestParam(value = "fields", required = false) String fields
) { ) {
if (StrUtil.isBlank(key)) { if (StrUtil.isBlank(key)) {
throw new RuntimeException("Key cannot be blank"); throw new RuntimeException("Key cannot be blank");
@@ -60,7 +61,7 @@ public class TaskController {
throw new RuntimeException("Hdfs path cannot be empty"); throw new RuntimeException("Hdfs path cannot be empty");
} }
ExecutorProvider.EXECUTORS.submit(() -> { ExecutorProvider.EXECUTORS.submit(() -> {
String applicationId = taskService.scan(key, hdfs, pulsar, topic, scanQueue, scanLog, scanBase); String applicationId = taskService.scan(key, hdfs, pulsar, topic, scanQueue, scanLog, scanBase, fields);
logger.info("Task: {}", applicationId); logger.info("Task: {}", applicationId);
}); });
return AmisResponse.responseSuccess(); return AmisResponse.responseSuccess();

View File

@@ -20,6 +20,7 @@ function taskTab() {
pulsar: '${pulsar|default:undefined}', pulsar: '${pulsar|default:undefined}',
topic: '${topic|default:undefined}', topic: '${topic|default:undefined}',
mode: '${scan_mode|default:undefined}', mode: '${scan_mode|default:undefined}',
fields: '${fields|default:undefined}',
} }
} }
}, },
@@ -56,6 +57,14 @@ function taskTab() {
description: '输入表HDFS路径', description: '输入表HDFS路径',
autoComplete: '${base}/table/all_hdfs?key=$term', autoComplete: '${base}/table/all_hdfs?key=$term',
}, },
{
type: 'input-text',
name: 'fields',
label: '指定字段',
visibleOn: '${CONTAINS(scan_mode, \'base\')}',
clearable: true,
description: '逗号分隔可以大幅提高parquet文件检索速度但无法获取指定字段外的字段内容',
},
{ {
type: 'group', type: 'group',
body: [ body: [