feat(executor-task): parquet文件读取增加选择列

指定列名可以提高检索速度，默认选择所有列
This commit is contained in:
v-zhangjc9
2024-05-12 17:41:09 +08:00
parent b51176e5c2
commit a1e0b20e87
9 changed files with 172 additions and 83 deletions

View File

@@ -36,9 +36,10 @@ public class ExecutorTaskController {
@RequestParam(value = "scan_queue", defaultValue = "false") Boolean scanQueue,
@RequestParam(value = "scan_log", defaultValue = "false") Boolean scanLog,
@RequestParam(value = "scan_base", defaultValue = "false") Boolean scanBase,
@RequestParam(value = "scan_target", defaultValue = "false") Boolean scanTarget
@RequestParam(value = "scan_target", defaultValue = "false") Boolean scanTarget,
@RequestParam(value = "filter_fields", required = false) String filterFields
) throws Exception {
logger.info("Enter method: scan[key, hdfs, pulsar, pulsarTopic, scanSource, scanQueue, scanLog, scanBase, scanTarget]. " + "key:" + key + "," + "hdfs:" + hdfs + "," + "pulsar:" + pulsar + "," + "pulsarTopic:" + pulsarTopic + "," + "scanSource:" + scanSource + "," + "scanQueue:" + scanQueue + "," + "scanLog:" + scanLog + "," + "scanBase:" + scanBase + "," + "scanTarget:" + scanTarget);
logger.info("Enter method: scan[key, hdfs, pulsar, pulsarTopic, scanSource, scanQueue, scanLog, scanBase, scanTarget, filter_fields]. " + "key:" + key + "," + "hdfs:" + hdfs + "," + "pulsar:" + pulsar + "," + "pulsarTopic:" + pulsarTopic + "," + "scanSource:" + scanSource + "," + "scanQueue:" + scanQueue + "," + "scanLog:" + scanLog + "," + "scanBase:" + scanBase + "," + "scanTarget:" + scanTarget + "," + "filter_fields:" + filterFields);
if (!scanSource && !scanQueue && !scanLog && !scanBase && !scanTarget) {
throw new RuntimeException("Must choose one mode");
}
@@ -48,7 +49,7 @@ public class ExecutorTaskController {
if ((scanLog || scanBase) && StrUtil.isBlank(hdfs)) {
throw new RuntimeException("Hdfs path cannot be empty");
}
return executorTaskService.scanAvro(key, hdfs, pulsar, pulsarTopic, scanSource, scanQueue, scanLog, scanBase, scanTarget);
return executorTaskService.scanAvro(key, hdfs, pulsar, pulsarTopic, scanSource, scanQueue, scanLog, scanBase, scanTarget, filterFields);
}
@GetMapping("latest_op_ts")

View File

@@ -21,7 +21,18 @@ import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.flink.client.cli.ClientOptions;
import org.apache.flink.configuration.*;
import org.apache.flink.configuration.AkkaOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.CoreOptions;
import org.apache.flink.configuration.DeploymentOptions;
import org.apache.flink.configuration.HeartbeatManagerOptions;
import org.apache.flink.configuration.JobManagerOptions;
import org.apache.flink.configuration.MemorySize;
import org.apache.flink.configuration.PipelineOptions;
import org.apache.flink.configuration.ResourceManagerOptions;
import org.apache.flink.configuration.RestOptions;
import org.apache.flink.configuration.SecurityOptions;
import org.apache.flink.configuration.TaskManagerOptions;
import org.apache.flink.yarn.configuration.YarnConfigOptions;
import org.apache.flink.yarn.configuration.YarnDeploymentTarget;
import org.apache.hadoop.fs.FSDataInputStream;
@@ -141,10 +152,23 @@ public class ExecutorTaskService {
Boolean scanQueue,
Boolean scanLog,
Boolean scanBase,
Boolean scanTarget
Boolean scanTarget,
String filterFields
) throws Exception {
String taskId = taskId();
Configuration configuration = generateConfiguration(taskId, "scan " + key);
MutableList<String> types = Lists.mutable.empty();
if (scanSource)
types.add("source");
if (scanQueue)
types.add("queue");
if (scanLog)
types.add("log");
if (scanBase)
types.add("base");
if (scanTarget)
types.add("target");
Configuration configuration = generateConfiguration(taskId, StrUtil.format("scan {} {}", types.makeString(","), key));
MapBuilder<String, Object> builder = MapUtil.builder();
setEnvironment(configuration, "key", key);
@@ -164,6 +188,10 @@ public class ExecutorTaskService {
builder.put("pulsar", pulsar);
builder.put("pulsar_topic", pulsarTopic);
}
if (StrUtil.isNotBlank(filterFields)) {
builder.put("filter_fields", filterFields);
}
ApplicationId applicationId = Runner.run(
configuration,
"com.lanyuanxiaoyao.service.executor.task.DataScanner",