[HUDI-2767] Enabling timeline-server-based marker as default (#4112)
- Changes the default config of marker type (HoodieWriteConfig.MARKERS_TYPE or hoodie.write.markers.type) from DIRECT to TIMELINE_SERVER_BASED for the Spark engine. - Adds engine-specific marker type defaults: Spark -> TIMELINE_SERVER_BASED, Flink -> DIRECT, Java -> DIRECT. - Always uses DIRECT markers for Spark structured streaming, because the timeline server is only available for the first mini-batch. - Fixes the marker creation method for non-partitioned tables in TimelineServerBasedWriteMarkers. - Adds a fallback to direct markers in WriteMarkersFactory even when TIMELINE_SERVER_BASED is configured: the fallback happens when HDFS is used or the embedded timeline server is disabled. - Fixes the closing of the timeline service. - Fixes tests that depend on markers, mainly by starting the timeline service for each test.
This commit is contained in:
@@ -45,6 +45,7 @@ import org.apache.hudi.config.metrics.HoodieMetricsDatadogConfig;
|
||||
import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig;
|
||||
import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig;
|
||||
import org.apache.hudi.config.metrics.HoodieMetricsPrometheusConfig;
|
||||
import org.apache.hudi.exception.HoodieNotSupportedException;
|
||||
import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
import org.apache.hudi.keygen.SimpleAvroKeyGenerator;
|
||||
@@ -248,14 +249,17 @@ public class HoodieWriteConfig extends HoodieConfig {
|
||||
|
||||
public static final ConfigProperty<String> MARKERS_TYPE = ConfigProperty
|
||||
.key("hoodie.write.markers.type")
|
||||
.defaultValue(MarkerType.DIRECT.toString())
|
||||
.defaultValue(MarkerType.TIMELINE_SERVER_BASED.toString())
|
||||
.sinceVersion("0.9.0")
|
||||
.withDocumentation("Marker type to use. Two modes are supported: "
|
||||
+ "- DIRECT: individual marker file corresponding to each data file is directly "
|
||||
+ "created by the writer. "
|
||||
+ "- TIMELINE_SERVER_BASED: marker operations are all handled at the timeline service "
|
||||
+ "which serves as a proxy. New marker entries are batch processed and stored "
|
||||
+ "in a limited number of underlying files for efficiency.");
|
||||
+ "in a limited number of underlying files for efficiency. If HDFS is used or "
|
||||
+ "timeline server is disabled, DIRECT markers are used as fallback even if this "
|
||||
+ "is configure. For Spark structured streaming, this configuration does not "
|
||||
+ "take effect, i.e., DIRECT markers are always used for Spark structured streaming.");
|
||||
|
||||
public static final ConfigProperty<Integer> MARKERS_TIMELINE_SERVER_BASED_BATCH_NUM_THREADS = ConfigProperty
|
||||
.key("hoodie.markers.timeline_server_based.batch.num_threads")
|
||||
@@ -2175,6 +2179,7 @@ public class HoodieWriteConfig extends HoodieConfig {
|
||||
}
|
||||
|
||||
protected void setDefaults() {
|
||||
writeConfig.setDefaultValue(MARKERS_TYPE, getDefaultMarkersType(engineType));
|
||||
// Check for mandatory properties
|
||||
writeConfig.setDefaults(HoodieWriteConfig.class.getName());
|
||||
// Make sure the props is propagated
|
||||
@@ -2229,5 +2234,18 @@ public class HoodieWriteConfig extends HoodieConfig {
|
||||
// Build WriteConfig at the end
|
||||
return new HoodieWriteConfig(engineType, writeConfig.getProps());
|
||||
}
|
||||
|
||||
private String getDefaultMarkersType(EngineType engineType) {
|
||||
switch (engineType) {
|
||||
case SPARK:
|
||||
return MarkerType.TIMELINE_SERVER_BASED.toString();
|
||||
case FLINK:
|
||||
case JAVA:
|
||||
// Timeline-server-based marker is not supported for Flink and Java engines
|
||||
return MarkerType.DIRECT.toString();
|
||||
default:
|
||||
throw new HoodieNotSupportedException("Unsupported engine " + engineType);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,9 +19,11 @@
|
||||
package org.apache.hudi.table.marker;
|
||||
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.IOType;
|
||||
import org.apache.hudi.common.util.HoodieTimer;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.exception.HoodieRemoteException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
@@ -132,18 +134,24 @@ public class TimelineServerBasedWriteMarkers extends WriteMarkers {
|
||||
|
||||
Map<String, String> paramsMap = new HashMap<>();
|
||||
paramsMap.put(MARKER_DIR_PATH_PARAM, markerDirPath.toString());
|
||||
paramsMap.put(MARKER_NAME_PARAM, partitionPath + "/" + markerFileName);
|
||||
if (StringUtils.isNullOrEmpty(partitionPath)) {
|
||||
paramsMap.put(MARKER_NAME_PARAM, markerFileName);
|
||||
} else {
|
||||
paramsMap.put(MARKER_NAME_PARAM, partitionPath + "/" + markerFileName);
|
||||
}
|
||||
|
||||
boolean success;
|
||||
try {
|
||||
success = executeRequestToTimelineServer(
|
||||
CREATE_MARKER_URL, paramsMap, new TypeReference<Boolean>() {}, RequestMethod.POST);
|
||||
CREATE_MARKER_URL, paramsMap, new TypeReference<Boolean>() {
|
||||
}, RequestMethod.POST);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieRemoteException("Failed to create marker file " + partitionPath + "/" + markerFileName, e);
|
||||
}
|
||||
LOG.info("[timeline-server-based] Created marker file " + partitionPath + "/" + markerFileName
|
||||
+ " in " + timer.endTimer() + " ms");
|
||||
if (success) {
|
||||
return Option.of(new Path(new Path(markerDirPath, partitionPath), markerFileName));
|
||||
return Option.of(new Path(FSUtils.getPartitionPath(markerDirPath, partitionPath), markerFileName));
|
||||
} else {
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
@@ -18,10 +18,13 @@
|
||||
|
||||
package org.apache.hudi.table.marker;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.fs.StorageSchemes;
|
||||
import org.apache.hudi.common.table.marker.MarkerType;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import com.esotericsoftware.minlog.Log;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -43,6 +46,18 @@ public class WriteMarkersFactory {
|
||||
case DIRECT:
|
||||
return new DirectWriteMarkers(table, instantTime);
|
||||
case TIMELINE_SERVER_BASED:
|
||||
if (!table.getConfig().isEmbeddedTimelineServerEnabled()) {
|
||||
Log.warn("Timeline-server-based markers are configured as the marker type "
|
||||
+ "but embedded timeline server is not enabled. Falling back to direct markers.");
|
||||
return new DirectWriteMarkers(table, instantTime);
|
||||
}
|
||||
String basePath = table.getMetaClient().getBasePath();
|
||||
if (StorageSchemes.HDFS.getScheme().equals(
|
||||
FSUtils.getFs(basePath, table.getContext().getHadoopConf().newCopy()).getScheme())) {
|
||||
Log.warn("Timeline-server-based markers are not supported for HDFS: "
|
||||
+ "base path " + basePath + ". Falling back to direct markers.");
|
||||
return new DirectWriteMarkers(table, instantTime);
|
||||
}
|
||||
return new TimelineServerBasedWriteMarkers(table, instantTime);
|
||||
default:
|
||||
throw new HoodieException("The marker type \"" + markerType.name() + "\" is not supported.");
|
||||
|
||||
@@ -19,8 +19,10 @@
|
||||
package org.apache.hudi.config;
|
||||
|
||||
import org.apache.hudi.common.engine.EngineType;
|
||||
import org.apache.hudi.common.table.marker.MarkerType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig.Builder;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.ValueSource;
|
||||
@@ -32,6 +34,7 @@ import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.function.Function;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
@@ -67,63 +70,38 @@ public class TestHoodieWriteConfig {
|
||||
|
||||
@Test
|
||||
public void testDefaultIndexAccordingToEngineType() {
|
||||
// default bloom
|
||||
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").build();
|
||||
assertEquals(HoodieIndex.IndexType.BLOOM, writeConfig.getIndexType());
|
||||
|
||||
// spark default bloom
|
||||
writeConfig = HoodieWriteConfig.newBuilder().withEngineType(EngineType.SPARK).withPath("/tmp").build();
|
||||
assertEquals(HoodieIndex.IndexType.BLOOM, writeConfig.getIndexType());
|
||||
|
||||
// flink default in-memory
|
||||
writeConfig = HoodieWriteConfig.newBuilder().withEngineType(EngineType.FLINK).withPath("/tmp").build();
|
||||
assertEquals(HoodieIndex.IndexType.INMEMORY, writeConfig.getIndexType());
|
||||
testEngineSpecificConfig(HoodieWriteConfig::getIndexType,
|
||||
constructConfigMap(
|
||||
EngineType.SPARK, HoodieIndex.IndexType.BLOOM,
|
||||
EngineType.FLINK, HoodieIndex.IndexType.INMEMORY,
|
||||
EngineType.JAVA, HoodieIndex.IndexType.INMEMORY));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDefaultClusteringPlanStrategyClassAccordingToEngineType() {
|
||||
// Default (as Spark)
|
||||
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").build();
|
||||
assertEquals(
|
||||
HoodieClusteringConfig.SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY,
|
||||
writeConfig.getClusteringPlanStrategyClass());
|
||||
|
||||
// Spark
|
||||
writeConfig = HoodieWriteConfig.newBuilder().withEngineType(EngineType.SPARK).withPath("/tmp").build();
|
||||
assertEquals(
|
||||
HoodieClusteringConfig.SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY,
|
||||
writeConfig.getClusteringPlanStrategyClass());
|
||||
|
||||
// Flink and Java
|
||||
for (EngineType engineType : new EngineType[] {EngineType.FLINK, EngineType.JAVA}) {
|
||||
writeConfig = HoodieWriteConfig.newBuilder().withEngineType(engineType).withPath("/tmp").build();
|
||||
assertEquals(
|
||||
HoodieClusteringConfig.JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY,
|
||||
writeConfig.getClusteringPlanStrategyClass());
|
||||
}
|
||||
testEngineSpecificConfig(HoodieWriteConfig::getClusteringPlanStrategyClass,
|
||||
constructConfigMap(
|
||||
EngineType.SPARK, HoodieClusteringConfig.SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY,
|
||||
EngineType.FLINK, HoodieClusteringConfig.JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY,
|
||||
EngineType.JAVA, HoodieClusteringConfig.JAVA_SIZED_BASED_CLUSTERING_PLAN_STRATEGY));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDefaultClusteringExecutionStrategyClassAccordingToEngineType() {
|
||||
// Default (as Spark)
|
||||
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").build();
|
||||
assertEquals(
|
||||
HoodieClusteringConfig.SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY,
|
||||
writeConfig.getClusteringExecutionStrategyClass());
|
||||
testEngineSpecificConfig(HoodieWriteConfig::getClusteringExecutionStrategyClass,
|
||||
constructConfigMap(
|
||||
EngineType.SPARK, HoodieClusteringConfig.SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY,
|
||||
EngineType.FLINK, HoodieClusteringConfig.JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY,
|
||||
EngineType.JAVA, HoodieClusteringConfig.JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY));
|
||||
}
|
||||
|
||||
// Spark
|
||||
writeConfig = HoodieWriteConfig.newBuilder().withEngineType(EngineType.SPARK).withPath("/tmp").build();
|
||||
assertEquals(
|
||||
HoodieClusteringConfig.SPARK_SORT_AND_SIZE_EXECUTION_STRATEGY,
|
||||
writeConfig.getClusteringExecutionStrategyClass());
|
||||
|
||||
// Flink and Java
|
||||
for (EngineType engineType : new EngineType[] {EngineType.FLINK, EngineType.JAVA}) {
|
||||
writeConfig = HoodieWriteConfig.newBuilder().withEngineType(engineType).withPath("/tmp").build();
|
||||
assertEquals(
|
||||
HoodieClusteringConfig.JAVA_SORT_AND_SIZE_EXECUTION_STRATEGY,
|
||||
writeConfig.getClusteringExecutionStrategyClass());
|
||||
}
|
||||
@Test
|
||||
public void testDefaultMarkersTypeAccordingToEngineType() {
|
||||
testEngineSpecificConfig(HoodieWriteConfig::getMarkersType,
|
||||
constructConfigMap(
|
||||
EngineType.SPARK, MarkerType.TIMELINE_SERVER_BASED,
|
||||
EngineType.FLINK, MarkerType.DIRECT,
|
||||
EngineType.JAVA, MarkerType.DIRECT));
|
||||
}
|
||||
|
||||
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params) throws IOException {
|
||||
@@ -133,4 +111,44 @@ public class TestHoodieWriteConfig {
|
||||
properties.store(outStream, "Saved on " + new Date(System.currentTimeMillis()));
|
||||
return outStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests the engine-specific configuration values for one configuration key .
|
||||
*
|
||||
* @param getConfigFunc Function to get the config value.
|
||||
* @param expectedConfigMap Expected config map, with key as the engine type
|
||||
* and value as the corresponding config value for the engine.
|
||||
*/
|
||||
private void testEngineSpecificConfig(Function<HoodieWriteConfig, Object> getConfigFunc,
|
||||
Map<EngineType, Object> expectedConfigMap) {
|
||||
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").build();
|
||||
assertEquals(expectedConfigMap.get(EngineType.SPARK), getConfigFunc.apply(writeConfig));
|
||||
|
||||
for (EngineType engineType : expectedConfigMap.keySet()) {
|
||||
writeConfig = HoodieWriteConfig.newBuilder()
|
||||
.withEngineType(engineType).withPath("/tmp").build();
|
||||
assertEquals(expectedConfigMap.get(engineType), getConfigFunc.apply(writeConfig));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs the map.
|
||||
*
|
||||
* @param k1 First engine type.
|
||||
* @param v1 Config value for the first engine type.
|
||||
* @param k2 Second engine type.
|
||||
* @param v2 Config value for the second engine type.
|
||||
* @param k3 Third engine type.
|
||||
* @param v3 Config value for the third engine type.
|
||||
* @return {@link Map<EngineType, Object>} instance, with key as the engine type
|
||||
* and value as the corresponding config value for the engine.
|
||||
*/
|
||||
private Map<EngineType, Object> constructConfigMap(
|
||||
EngineType k1, Object v1, EngineType k2, Object v2, EngineType k3, Object v3) {
|
||||
Map<EngineType, Object> mapping = new HashMap<>();
|
||||
mapping.put(k1, v1);
|
||||
mapping.put(k2, v2);
|
||||
mapping.put(k3, v3);
|
||||
return mapping;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,116 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.marker;
|
||||
|
||||
import org.apache.hudi.common.config.SerializableConfiguration;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.marker.MarkerType;
|
||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
|
||||
public class TestWriteMarkersFactory extends HoodieCommonTestHarness {
|
||||
private static final String NON_HDFS_BASE_PATH = "/tmp/dir";
|
||||
private static final String HDFS_BASE_PATH = "hdfs://localhost/dir";
|
||||
private final HoodieWriteConfig writeConfig = Mockito.mock(HoodieWriteConfig.class);
|
||||
private final HoodieTableMetaClient metaClient = Mockito.mock(HoodieTableMetaClient.class);
|
||||
private final HoodieWrapperFileSystem fileSystem = Mockito.mock(HoodieWrapperFileSystem.class);
|
||||
private final HoodieEngineContext context = Mockito.mock(HoodieEngineContext.class);
|
||||
private final HoodieTable table = Mockito.mock(HoodieTable.class);
|
||||
|
||||
@BeforeEach
|
||||
public void init() throws IOException {
|
||||
initMetaClient();
|
||||
}
|
||||
|
||||
public static Stream<Arguments> configParams() {
|
||||
Object[][] data = new Object[][] {
|
||||
{NON_HDFS_BASE_PATH, true}, {HDFS_BASE_PATH, false},
|
||||
{NON_HDFS_BASE_PATH, true}, {HDFS_BASE_PATH, false},
|
||||
};
|
||||
return Stream.of(data).map(Arguments::of);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("configParams")
|
||||
public void testDirectMarkers(String basePath, boolean isTimelineServerEnabled) {
|
||||
testWriteMarkersFactory(
|
||||
MarkerType.DIRECT, basePath, isTimelineServerEnabled, DirectWriteMarkers.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTimelineServerBasedMarkersWithTimelineServerEnabled() {
|
||||
testWriteMarkersFactory(
|
||||
MarkerType.TIMELINE_SERVER_BASED, NON_HDFS_BASE_PATH, true,
|
||||
TimelineServerBasedWriteMarkers.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTimelineServerBasedMarkersWithTimelineServerDisabled() {
|
||||
// Fallback to direct markers should happen
|
||||
testWriteMarkersFactory(
|
||||
MarkerType.TIMELINE_SERVER_BASED, NON_HDFS_BASE_PATH, false,
|
||||
DirectWriteMarkers.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTimelineServerBasedMarkersWithHDFS() {
|
||||
// Fallback to direct markers should happen
|
||||
testWriteMarkersFactory(
|
||||
MarkerType.TIMELINE_SERVER_BASED, HDFS_BASE_PATH, true,
|
||||
DirectWriteMarkers.class);
|
||||
}
|
||||
|
||||
private void testWriteMarkersFactory(
|
||||
MarkerType markerTypeConfig, String basePath, boolean isTimelineServerEnabled,
|
||||
Class<?> expectedWriteMarkersClass) {
|
||||
String instantTime = "001";
|
||||
Mockito.when(table.getConfig()).thenReturn(writeConfig);
|
||||
Mockito.when(writeConfig.isEmbeddedTimelineServerEnabled())
|
||||
.thenReturn(isTimelineServerEnabled);
|
||||
Mockito.when(table.getMetaClient()).thenReturn(metaClient);
|
||||
Mockito.when(metaClient.getFs()).thenReturn(fileSystem);
|
||||
Mockito.when(metaClient.getBasePath()).thenReturn(basePath);
|
||||
Mockito.when(metaClient.getMarkerFolderPath(any())).thenReturn(basePath + ".hoodie/.temp");
|
||||
Mockito.when(table.getContext()).thenReturn(context);
|
||||
Mockito.when(context.getHadoopConf()).thenReturn(new SerializableConfiguration(new Configuration()));
|
||||
Mockito.when(writeConfig.getViewStorageConfig())
|
||||
.thenReturn(FileSystemViewStorageConfig.newBuilder().build());
|
||||
assertEquals(expectedWriteMarkersClass,
|
||||
WriteMarkersFactory.get(markerTypeConfig, table, instantTime).getClass());
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user