
[HUDI-2044] Integrate consumers with rocksDB and compression within External Spillable Map (#3318)

rmahindra123 authored on 2021-07-27 22:31:03 -07:00, committed by GitHub
parent 00cd35f90a
commit 8fef50e237
27 changed files with 405 additions and 109 deletions

View File

@@ -68,7 +68,8 @@ public class EmbeddedTimelineServerHelper {
Option<String> hostAddr = context.getProperty(EngineProperty.EMBEDDED_SERVER_HOST);
EmbeddedTimelineService timelineService = new EmbeddedTimelineService(
context, hostAddr.orElse(null),config.getEmbeddedTimelineServerPort(),
- config.getMetadataConfig(), config.getClientSpecifiedViewStorageConfig(), config.getBasePath(),
+ config.getMetadataConfig(), config.getCommonConfig(),
+ config.getClientSpecifiedViewStorageConfig(), config.getBasePath(),
config.getEmbeddedTimelineServerThreads(), config.getEmbeddedTimelineServerCompressOutput(),
config.getEmbeddedTimelineServerUseAsync());
timelineService.startServer();

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.client.embedded;
+ import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.config.SerializableConfiguration;
@@ -46,6 +47,7 @@ public class EmbeddedTimelineService {
private final SerializableConfiguration hadoopConf;
private final FileSystemViewStorageConfig config;
private final HoodieMetadataConfig metadataConfig;
+ private final HoodieCommonConfig commonConfig;
private final String basePath;
private final int numThreads;
@@ -55,13 +57,14 @@ public class EmbeddedTimelineService {
private transient TimelineService server;
public EmbeddedTimelineService(HoodieEngineContext context, String embeddedTimelineServiceHostAddr, int embeddedTimelineServerPort,
- HoodieMetadataConfig metadataConfig, FileSystemViewStorageConfig config, String basePath,
+ HoodieMetadataConfig metadataConfig, HoodieCommonConfig commonConfig, FileSystemViewStorageConfig config, String basePath,
int numThreads, boolean compressOutput, boolean useAsync) {
setHostAddr(embeddedTimelineServiceHostAddr);
this.context = context;
this.config = config;
this.basePath = basePath;
this.metadataConfig = metadataConfig;
+ this.commonConfig = commonConfig;
this.hadoopConf = context.getHadoopConf();
this.viewManager = createViewManager();
this.preferredPort = embeddedTimelineServerPort;
@@ -80,7 +83,7 @@ public class EmbeddedTimelineService {
// Reset to default if set to Remote
builder.withStorageType(FileSystemViewStorageType.MEMORY);
}
- return FileSystemViewManager.createViewManager(context, metadataConfig, builder.build(), basePath);
+ return FileSystemViewManager.createViewManager(context, metadataConfig, builder.build(), commonConfig, basePath);
}
public void startServer() throws IOException {

View File

@@ -24,6 +24,7 @@ import org.apache.hudi.client.transaction.ConflictResolutionStrategy;
import org.apache.hudi.common.config.ConfigClassProperty;
import org.apache.hudi.common.config.ConfigGroups;
import org.apache.hudi.common.config.ConfigProperty;
+ import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.engine.EngineType;
@@ -38,7 +39,6 @@ import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.ValidationUtils;
- import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.keygen.constant.KeyGeneratorType;
@@ -59,7 +59,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
- import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
@@ -312,18 +311,6 @@ public class HoodieWriteConfig extends HoodieConfig {
.withDocumentation("When enabled, we allow duplicate keys even if inserts are routed to merge with an existing file (for ensuring file sizing)."
+ " This is only relevant for insert operation, since upsert, delete operations will ensure unique key constraints are maintained.");
- public static final ConfigProperty<ExternalSpillableMap.DiskMapType> SPILLABLE_DISK_MAP_TYPE = ConfigProperty
- .key("hoodie.spillable.diskmap.type")
- .defaultValue(ExternalSpillableMap.DiskMapType.BITCASK)
- .withDocumentation("When handling input data that cannot be held in memory, to merge with a file on storage, a spillable diskmap is employed. "
- + "By default, we use a persistent hashmap based loosely on bitcask, that offers O(1) inserts, lookups. "
- + "Change this to `ROCKS_DB` to prefer using rocksDB, for handling the spill.");
- public static final ConfigProperty<Boolean> DISK_MAP_BITCASK_COMPRESSION_ENABLED = ConfigProperty
- .key("hoodie.diskmap.bitcask.compression.enabled")
- .defaultValue(true)
- .withDocumentation("Turn on compression for BITCASK disk map used by the External Spillable Map");
public static final ConfigProperty<Integer> CLIENT_HEARTBEAT_INTERVAL_IN_MS_PROP = ConfigProperty
.key("hoodie.client.heartbeat.interval_in_ms")
.defaultValue(60 * 1000)
@@ -388,6 +375,7 @@ public class HoodieWriteConfig extends HoodieConfig {
private FileSystemViewStorageConfig viewStorageConfig;
private HoodiePayloadConfig hoodiePayloadConfig;
private HoodieMetadataConfig metadataConfig;
+ private HoodieCommonConfig commonConfig;
private EngineType engineType;
/**
@@ -409,6 +397,7 @@ public class HoodieWriteConfig extends HoodieConfig {
this.viewStorageConfig = clientSpecifiedViewStorageConfig;
this.hoodiePayloadConfig = HoodiePayloadConfig.newBuilder().fromProperties(newProps).build();
this.metadataConfig = HoodieMetadataConfig.newBuilder().fromProperties(props).build();
+ this.commonConfig = HoodieCommonConfig.newBuilder().fromProperties(props).build();
}
public static HoodieWriteConfig.Builder newBuilder() {
@@ -597,14 +586,6 @@ public class HoodieWriteConfig extends HoodieConfig {
return getBoolean(MERGE_ALLOW_DUPLICATE_ON_INSERTS);
}
- public ExternalSpillableMap.DiskMapType getSpillableDiskMapType() {
- return ExternalSpillableMap.DiskMapType.valueOf(getString(SPILLABLE_DISK_MAP_TYPE).toUpperCase(Locale.ROOT));
- }
- public boolean isBitCaskDiskMapCompressionEnabled() {
- return getBoolean(DISK_MAP_BITCASK_COMPRESSION_ENABLED);
- }
public EngineType getEngineType() {
return engineType;
}
@@ -1159,6 +1140,10 @@ public class HoodieWriteConfig extends HoodieConfig {
return metadataConfig;
}
+ public HoodieCommonConfig getCommonConfig() {
+ return commonConfig;
+ }
/**
* Commit call back configs.
*/
@@ -1564,16 +1549,6 @@ public class HoodieWriteConfig extends HoodieConfig {
return this;
}
- public Builder withSpillableDiskMapType(ExternalSpillableMap.DiskMapType diskMapType) {
- writeConfig.setValue(SPILLABLE_DISK_MAP_TYPE, diskMapType.name());
- return this;
- }
- public Builder withBitcaskDiskMapCompressionEnabled(boolean bitcaskDiskMapCompressionEnabled) {
- writeConfig.setValue(DISK_MAP_BITCASK_COMPRESSION_ENABLED, String.valueOf(bitcaskDiskMapCompressionEnabled));
- return this;
- }
public Builder withHeartbeatIntervalInMs(Integer heartbeatIntervalInMs) {
writeConfig.setValue(CLIENT_HEARTBEAT_INTERVAL_IN_MS_PROP, String.valueOf(heartbeatIntervalInMs));
return this;
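Note on the two options removed from HoodieWriteConfig above: hoodie.spillable.diskmap.type and hoodie.diskmap.bitcask.compression.enabled now live in HoodieCommonConfig, and HoodieWriteConfig simply materializes a HoodieCommonConfig from the same property bag. A minimal sketch of how a writer could opt into RocksDB-backed spilling after this change; the HoodieCommonConfig keys and the withProperties(...) hook are taken from the test changes further down, while the table path and the withPath(...) call are illustrative assumptions, not part of this diff:

import java.util.Properties;

import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.config.HoodieWriteConfig;

public class DiskMapConfigSketch {
  public static HoodieWriteConfig rocksDbSpillConfig(String basePath) {
    Properties props = new Properties();
    // Keys that this commit moves from HoodieWriteConfig to HoodieCommonConfig.
    props.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(),
        ExternalSpillableMap.DiskMapType.ROCKS_DB.name());
    props.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(),
        String.valueOf(false));
    // The write config constructor builds its HoodieCommonConfig from these same properties.
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)          // illustrative builder call; any of the usual settings apply
        .withProperties(props)
        .build();
  }
}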

View File

@@ -211,7 +211,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
LOG.info("MaxMemoryPerPartitionMerge => " + memoryForMerge);
this.keyToNewRecords = new ExternalSpillableMap<>(memoryForMerge, config.getSpillableMapBasePath(),
new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(tableSchema),
- config.getSpillableDiskMapType(), config.isBitCaskDiskMapCompressionEnabled());
+ config.getCommonConfig().getSpillableDiskMapType(),
+ config.getCommonConfig().isBitCaskDiskMapCompressionEnabled());
} catch (IOException io) {
throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io);
}
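For context, HoodieMergeHandle now just forwards the two new arguments into the six-argument ExternalSpillableMap constructor shown above. A standalone sketch of that constructor, with placeholder values where the handle would normally read HoodieWriteConfig (memory budget, spill path) and the table schema; the import paths for the size estimators are assumed from the Hudi common utils package:

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.DefaultSizeEstimator;
import org.apache.hudi.common.util.HoodieRecordSizeEstimator;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;

public class SpillableMapSketch {
  static ExternalSpillableMap<String, HoodieRecord> newSpillableMap(Schema tableSchema) throws IOException {
    return new ExternalSpillableMap<String, HoodieRecord>(
        100 * 1024 * 1024L,                           // placeholder for the per-partition merge memory budget
        "/tmp/hoodie-spill",                          // placeholder for config.getSpillableMapBasePath()
        new DefaultSizeEstimator(),                   // key size estimator, as in the hunk above
        new HoodieRecordSizeEstimator(tableSchema),   // value size estimator, as in the hunk above
        ExternalSpillableMap.DiskMapType.ROCKS_DB,    // new argument: which on-disk map to spill into
        false);                                       // new argument: compression flag, used by the BITCASK map
  }
}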

View File

@@ -113,7 +113,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
this.metadata = HoodieTableMetadata.create(context, metadataConfig, config.getBasePath(),
FileSystemViewStorageConfig.FILESYSTEM_VIEW_SPILLABLE_DIR.defaultValue());
- this.viewManager = FileSystemViewManager.createViewManager(context, config.getMetadataConfig(), config.getViewStorageConfig(), () -> metadata);
+ this.viewManager = FileSystemViewManager.createViewManager(context, config.getMetadataConfig(), config.getViewStorageConfig(), config.getCommonConfig(), () -> metadata);
this.metaClient = metaClient;
this.index = getIndex(config, context);
this.taskContextSupplier = context.getTaskContextSupplier();
@@ -123,7 +123,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
private synchronized FileSystemViewManager getViewManager() {
if (null == viewManager) {
- viewManager = FileSystemViewManager.createViewManager(getContext(), config.getMetadataConfig(), config.getViewStorageConfig(), () -> metadata);
+ viewManager = FileSystemViewManager.createViewManager(getContext(), config.getMetadataConfig(), config.getViewStorageConfig(), config.getCommonConfig(), () -> metadata);
}
return viewManager;
}

View File

@@ -129,6 +129,8 @@ public class HoodieFlinkMergeOnReadTableCompactor<T extends HoodieRecordPayload>
.withReverseReader(config.getCompactionReverseLogReadEnabled())
.withBufferSize(config.getMaxDFSStreamBufferSize())
.withSpillableMapBasePath(config.getSpillableMapBasePath())
+ .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType())
+ .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled())
.build();
if (!scanner.iterator().hasNext()) {
return new ArrayList<>();
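The Flink compactor above, and the Spark clustering executor and Spark compactor below, all thread the same two settings into their merged log record scanner builder. A hedged sketch of that wiring, assuming the builder is HoodieMergedLogRecordScanner.Builder (the hunks only show the variable name scanner) and that each setter returns the builder; the other required scanner inputs (file system, log file paths, reader schema, and so on) are elided:

import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
import org.apache.hudi.config.HoodieWriteConfig;

public class ScannerSpillSettingsSketch {
  // Only the last two setters are new in this commit; the earlier ones are copied from the hunks.
  static HoodieMergedLogRecordScanner.Builder applySpillSettings(
      HoodieMergedLogRecordScanner.Builder builder, HoodieWriteConfig config) {
    return builder
        .withReverseReader(config.getCompactionReverseLogReadEnabled())
        .withBufferSize(config.getMaxDFSStreamBufferSize())
        .withSpillableMapBasePath(config.getSpillableMapBasePath())
        .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType())
        .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled());
  }
}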

View File

@@ -207,6 +207,8 @@ public class SparkExecuteClusteringCommitActionExecutor<T extends HoodieRecordPa
.withReverseReader(config.getCompactionReverseLogReadEnabled())
.withBufferSize(config.getMaxDFSStreamBufferSize())
.withSpillableMapBasePath(config.getSpillableMapBasePath())
+ .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType())
+ .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled())
.build();
recordIterators.add(HoodieFileSliceReader.getFileSliceReader(baseFileReader, scanner, readerSchema,

View File

@@ -151,6 +151,8 @@ public class HoodieSparkMergeOnReadTableCompactor<T extends HoodieRecordPayload>
.withReverseReader(config.getCompactionReverseLogReadEnabled())
.withBufferSize(config.getMaxDFSStreamBufferSize())
.withSpillableMapBasePath(config.getSpillableMapBasePath())
+ .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType())
+ .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled())
.build();
if (!scanner.iterator().hasNext()) {
return new ArrayList<>();

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.io;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
+ import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieWriteStat;
@@ -27,6 +28,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
+ import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieStorageConfig;
@@ -41,16 +43,21 @@ import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+ import org.junit.jupiter.params.ParameterizedTest;
+ import org.junit.jupiter.params.provider.Arguments;
+ import org.junit.jupiter.params.provider.MethodSource;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
+ import java.util.Properties;
import java.util.stream.Stream;
import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
+ import static org.junit.jupiter.params.provider.Arguments.arguments;
@SuppressWarnings("unchecked")
public class TestHoodieMergeHandle extends HoodieClientTestHarness {
@@ -69,14 +76,23 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness {
cleanupResources();
}
- @Test
- public void testUpsertsForMultipleRecordsInSameFile() throws Exception {
+ @ParameterizedTest
+ @MethodSource("testArguments")
+ public void testUpsertsForMultipleRecordsInSameFile(ExternalSpillableMap.DiskMapType diskMapType,
+ boolean isCompressionEnabled) throws Exception {
// Create records in a single partition
String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
dataGen = new HoodieTestDataGenerator(new String[] {partitionPath});
+ // Build a common config with diff configs
+ Properties properties = new Properties();
+ properties.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType.name());
+ properties.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), String.valueOf(isCompressionEnabled));
// Build a write config with bulkinsertparallelism set
- HoodieWriteConfig cfg = getConfigBuilder().build();
+ HoodieWriteConfig cfg = getConfigBuilder()
+ .withProperties(properties)
+ .build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
FileSystem fs = FSUtils.getFs(basePath, hadoopConf);
@@ -215,10 +231,19 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness {
}
}
- @Test
- public void testHoodieMergeHandleWriteStatMetrics() throws Exception {
+ @ParameterizedTest
+ @MethodSource("testArguments")
+ public void testHoodieMergeHandleWriteStatMetrics(ExternalSpillableMap.DiskMapType diskMapType,
+ boolean isCompressionEnabled) throws Exception {
// insert 100 records
- HoodieWriteConfig config = getConfigBuilder().build();
+ // Build a common config with diff configs
+ Properties properties = new Properties();
+ properties.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType.name());
+ properties.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), String.valueOf(isCompressionEnabled));
+ HoodieWriteConfig config = getConfigBuilder()
+ .withProperties(properties)
+ .build();
try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config);) {
String newCommitTime = "100";
writeClient.startCommitWithTime(newCommitTime);
@@ -317,6 +342,16 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness {
.withBulkInsertParallelism(2).withWriteStatusClass(TestWriteStatus.class);
}
+ private static Stream<Arguments> testArguments() {
+ // Arg1: ExternalSpillableMap Type, Arg2: isDiskMapCompressionEnabled
+ return Stream.of(
+ arguments(ExternalSpillableMap.DiskMapType.BITCASK, false),
+ arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, false),
+ arguments(ExternalSpillableMap.DiskMapType.BITCASK, true),
+ arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, true)
+ );
+ }
/**
* Overridden so that we can capture and inspect all success records.
*/