refactor(executor-task): 优化日志文件和数据文件读取模式
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
package com.lanyuanxiaoyao.service.executor.task;
|
package com.lanyuanxiaoyao.service.executor.task;
|
||||||
|
|
||||||
|
import cn.hutool.core.util.ObjectUtil;
|
||||||
import cn.hutool.core.util.StrUtil;
|
import cn.hutool.core.util.StrUtil;
|
||||||
import com.lanyuanxiaoyao.service.executor.core.TaskContext;
|
import com.lanyuanxiaoyao.service.executor.core.TaskContext;
|
||||||
import com.lanyuanxiaoyao.service.executor.task.entity.RecordView;
|
import com.lanyuanxiaoyao.service.executor.task.entity.RecordView;
|
||||||
@@ -8,6 +9,7 @@ import com.lanyuanxiaoyao.service.executor.task.helper.ArgumentsHelper;
|
|||||||
import com.lanyuanxiaoyao.service.executor.task.helper.FlinkHelper;
|
import com.lanyuanxiaoyao.service.executor.task.helper.FlinkHelper;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import org.apache.flink.streaming.api.datastream.DataStream;
|
||||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
@@ -49,7 +51,7 @@ public class DataScanner {
|
|||||||
throw new RuntimeException(StrUtil.format("HDFS {} is not exists", hdfs));
|
throw new RuntimeException(StrUtil.format("HDFS {} is not exists", hdfs));
|
||||||
}
|
}
|
||||||
|
|
||||||
ImmutableList<String> paths = Lists.immutable.of(fileSystem.listStatus(new Path(hdfs)))
|
ImmutableList<Path> paths = Lists.immutable.of(fileSystem.listStatus(new Path(hdfs)))
|
||||||
.reject(status -> StrUtil.equals(".hoodie", status.getPath().getName()))
|
.reject(status -> StrUtil.equals(".hoodie", status.getPath().getName()))
|
||||||
.flatCollect(status -> {
|
.flatCollect(status -> {
|
||||||
try {
|
try {
|
||||||
@@ -62,18 +64,32 @@ public class DataScanner {
|
|||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect(FileStatus::getPath)
|
.collect(FileStatus::getPath);
|
||||||
.select(path -> (FSUtils.isLogFile(path) && scanLog) || (FSUtils.isDataFile(path) && scanData))
|
|
||||||
.collect(Path::toString);
|
|
||||||
|
|
||||||
StreamExecutionEnvironment environment = FlinkHelper.getBatchEnvironment();
|
StreamExecutionEnvironment environment = FlinkHelper.getBatchEnvironment();
|
||||||
environment.setParallelism(Math.max(paths.size() / 5, 1));
|
environment.setParallelism(20);
|
||||||
environment.fromCollection(paths.toList())
|
|
||||||
.shuffle()
|
DataStream<RecordView> source = null;
|
||||||
.flatMap(new ReadHudiFile())
|
if (scanLog) {
|
||||||
.map(RecordView::toString)
|
ImmutableList<String> logPaths = paths.select(FSUtils::isLogFile).collect(Path::toString);
|
||||||
|
source = environment.fromCollection(logPaths.toList())
|
||||||
|
.flatMap(new ReadHudiFile())
|
||||||
|
.setParallelism(Math.max(1, logPaths.size() / 20));
|
||||||
|
}
|
||||||
|
if (scanData) {
|
||||||
|
ImmutableList<String> dataPaths = paths.select(FSUtils::isDataFile).collect(Path::toString);
|
||||||
|
int parallelism = Math.max(1, dataPaths.size() / 2);
|
||||||
|
if (ObjectUtil.isNull(source)) {
|
||||||
|
source = environment.fromCollection(dataPaths.toList()).flatMap(new ReadHudiFile()).setParallelism(parallelism);
|
||||||
|
} else {
|
||||||
|
source = source.union(environment.fromCollection(dataPaths.toList()).flatMap(new ReadHudiFile()).setParallelism(parallelism));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (ObjectUtil.isNull(source)) {
|
||||||
|
throw new RuntimeException("Source cannot be null");
|
||||||
|
}
|
||||||
|
source.map(RecordView::toString)
|
||||||
.filter(line -> StrUtil.contains(line, key))
|
.filter(line -> StrUtil.contains(line, key))
|
||||||
.disableChaining()
|
|
||||||
.sinkTo(FlinkHelper.createFileSink(taskContext));
|
.sinkTo(FlinkHelper.createFileSink(taskContext));
|
||||||
environment.execute(StrUtil.format("Search {} in {}", key, hdfs));
|
environment.execute(StrUtil.format("Search {} in {}", key, hdfs));
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user