1
0

[HUDI-2767] Enabling timeline-server-based marker as default (#4112)

- Changes the default config of marker type (HoodieWriteConfig.MARKERS_TYPE or hoodie.write.markers.type) from DIRECT to TIMELINE_SERVER_BASED for Spark Engine.
- Adds engine-specific marker type configs: Spark -> TIMELINE_SERVER_BASED, Flink -> DIRECT, Java -> DIRECT.
- Uses DIRECT markers as well for Spark structured streaming, because the timeline server is only available for the first mini-batch.
- Fixes the marker creation method for non-partitioned table in TimelineServerBasedWriteMarkers.
- Adds a fallback to direct markers in WriteMarkersFactory even when TIMELINE_SERVER_BASED is configured: when HDFS is used, or the embedded timeline server is disabled, direct markers are used instead.
- Fixes the closing of timeline service.
- Fixes tests that depend on markers, mainly by starting the timeline service for each test.
This commit is contained in:
Y Ethan Guo
2021-11-26 13:41:05 -08:00
committed by GitHub
parent f8e0176eb0
commit d1e83e4ba0
35 changed files with 529 additions and 134 deletions

View File

@@ -45,6 +45,7 @@ import org.apache.hudi.config.metrics.HoodieMetricsDatadogConfig;
import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig;
import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig;
import org.apache.hudi.config.metrics.HoodieMetricsPrometheusConfig;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.keygen.SimpleAvroKeyGenerator;
@@ -248,14 +249,17 @@ public class HoodieWriteConfig extends HoodieConfig {
public static final ConfigProperty<String> MARKERS_TYPE = ConfigProperty
.key("hoodie.write.markers.type")
.defaultValue(MarkerType.DIRECT.toString())
.defaultValue(MarkerType.TIMELINE_SERVER_BASED.toString())
.sinceVersion("0.9.0")
.withDocumentation("Marker type to use. Two modes are supported: "
+ "- DIRECT: individual marker file corresponding to each data file is directly "
+ "created by the writer. "
+ "- TIMELINE_SERVER_BASED: marker operations are all handled at the timeline service "
+ "which serves as a proxy. New marker entries are batch processed and stored "
+ "in a limited number of underlying files for efficiency.");
+ "in a limited number of underlying files for efficiency. If HDFS is used or "
+ "timeline server is disabled, DIRECT markers are used as fallback even if this "
+ "is configure. For Spark structured streaming, this configuration does not "
+ "take effect, i.e., DIRECT markers are always used for Spark structured streaming.");
public static final ConfigProperty<Integer> MARKERS_TIMELINE_SERVER_BASED_BATCH_NUM_THREADS = ConfigProperty
.key("hoodie.markers.timeline_server_based.batch.num_threads")
@@ -2175,6 +2179,7 @@ public class HoodieWriteConfig extends HoodieConfig {
}
protected void setDefaults() {
writeConfig.setDefaultValue(MARKERS_TYPE, getDefaultMarkersType(engineType));
// Check for mandatory properties
writeConfig.setDefaults(HoodieWriteConfig.class.getName());
// Make sure the props is propagated
@@ -2229,5 +2234,18 @@ public class HoodieWriteConfig extends HoodieConfig {
// Build WriteConfig at the end
return new HoodieWriteConfig(engineType, writeConfig.getProps());
}
/**
 * Resolves the default marker type for the given engine.
 *
 * @param engineType the write engine in use (Spark, Flink, or Java).
 * @return the string form of the default {@link MarkerType} for that engine.
 * @throws HoodieNotSupportedException if the engine type is not recognized.
 */
private String getDefaultMarkersType(EngineType engineType) {
  final MarkerType defaultMarkerType;
  switch (engineType) {
    case FLINK:
    case JAVA:
      // Timeline-server-based markers are not supported for the Flink and Java engines,
      // so direct markers remain the default there.
      defaultMarkerType = MarkerType.DIRECT;
      break;
    case SPARK:
      defaultMarkerType = MarkerType.TIMELINE_SERVER_BASED;
      break;
    default:
      throw new HoodieNotSupportedException("Unsupported engine " + engineType);
  }
  return defaultMarkerType.toString();
}
}
}

View File

@@ -19,9 +19,11 @@
package org.apache.hudi.table.marker;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.IOType;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.HoodieRemoteException;
import org.apache.hudi.table.HoodieTable;
@@ -132,18 +134,24 @@ public class TimelineServerBasedWriteMarkers extends WriteMarkers {
Map<String, String> paramsMap = new HashMap<>();
paramsMap.put(MARKER_DIR_PATH_PARAM, markerDirPath.toString());
paramsMap.put(MARKER_NAME_PARAM, partitionPath + "/" + markerFileName);
if (StringUtils.isNullOrEmpty(partitionPath)) {
paramsMap.put(MARKER_NAME_PARAM, markerFileName);
} else {
paramsMap.put(MARKER_NAME_PARAM, partitionPath + "/" + markerFileName);
}
boolean success;
try {
success = executeRequestToTimelineServer(
CREATE_MARKER_URL, paramsMap, new TypeReference<Boolean>() {}, RequestMethod.POST);
CREATE_MARKER_URL, paramsMap, new TypeReference<Boolean>() {
}, RequestMethod.POST);
} catch (IOException e) {
throw new HoodieRemoteException("Failed to create marker file " + partitionPath + "/" + markerFileName, e);
}
LOG.info("[timeline-server-based] Created marker file " + partitionPath + "/" + markerFileName
+ " in " + timer.endTimer() + " ms");
if (success) {
return Option.of(new Path(new Path(markerDirPath, partitionPath), markerFileName));
return Option.of(new Path(FSUtils.getPartitionPath(markerDirPath, partitionPath), markerFileName));
} else {
return Option.empty();
}

View File

@@ -18,10 +18,13 @@
package org.apache.hudi.table.marker;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.fs.StorageSchemes;
import org.apache.hudi.common.table.marker.MarkerType;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.table.HoodieTable;
import com.esotericsoftware.minlog.Log;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -43,6 +46,18 @@ public class WriteMarkersFactory {
case DIRECT:
return new DirectWriteMarkers(table, instantTime);
case TIMELINE_SERVER_BASED:
if (!table.getConfig().isEmbeddedTimelineServerEnabled()) {
Log.warn("Timeline-server-based markers are configured as the marker type "
+ "but embedded timeline server is not enabled. Falling back to direct markers.");
return new DirectWriteMarkers(table, instantTime);
}
String basePath = table.getMetaClient().getBasePath();
if (StorageSchemes.HDFS.getScheme().equals(
FSUtils.getFs(basePath, table.getContext().getHadoopConf().newCopy()).getScheme())) {
Log.warn("Timeline-server-based markers are not supported for HDFS: "
+ "base path " + basePath + ". Falling back to direct markers.");
return new DirectWriteMarkers(table, instantTime);
}
return new TimelineServerBasedWriteMarkers(table, instantTime);
default:
throw new HoodieException("The marker type \"" + markerType.name() + "\" is not supported.");