[HUDI-2472] Enabling metadata table for TestHoodieMergeOnReadTable and TestHoodieCompactor (#4023)
committed by GitHub
parent 459b34240b
commit c8617d9390

HoodieWriteableTestTable.java
@@ -33,8 +33,9 @@ import org.apache.hudi.common.table.log.HoodieLogFormat;
 import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.testutils.FileCreateUtils;
-import org.apache.hudi.common.testutils.HoodieTestTable;
+import org.apache.hudi.common.testutils.HoodieMetadataTestTable;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieStorageConfig;
 import org.apache.hudi.io.storage.HoodieAvroParquetConfig;
 import org.apache.hudi.io.storage.HoodieOrcConfig;
@@ -47,6 +48,7 @@ import org.apache.avro.generic.IndexedRecord;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hudi.metadata.HoodieTableMetadataWriter;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.orc.CompressionKind;
@@ -56,7 +58,7 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 
 import java.io.IOException;
 import java.nio.file.Paths;
-import java.util.Arrays;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -64,15 +66,21 @@ import java.util.stream.Collectors;
 
 import static org.apache.hudi.common.testutils.FileCreateUtils.baseFileName;
 
-public class HoodieWriteableTestTable extends HoodieTestTable {
+public class HoodieWriteableTestTable extends HoodieMetadataTestTable {
   private static final Logger LOG = LogManager.getLogger(HoodieWriteableTestTable.class);
 
   protected final Schema schema;
   protected final BloomFilter filter;
   protected final boolean populateMetaFields;
 
-  protected HoodieWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) {
-    super(basePath, fs, metaClient);
+  protected HoodieWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient,
+                                     Schema schema, BloomFilter filter) {
+    this(basePath, fs, metaClient, schema, filter, null);
+  }
+
+  protected HoodieWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema,
+                                     BloomFilter filter, HoodieTableMetadataWriter metadataWriter) {
+    super(basePath, fs, metaClient, metadataWriter);
     this.schema = schema;
     this.filter = filter;
     this.populateMetaFields = metaClient.getTableConfig().populateMetaFields();
@@ -139,19 +147,18 @@ public class HoodieWriteableTestTable extends HoodieTestTable {
     return this;
   }
 
-  public HoodieWriteableTestTable withLogAppends(HoodieRecord... records) throws Exception {
-    return withLogAppends(Arrays.asList(records));
-  }
-
-  public HoodieWriteableTestTable withLogAppends(List<HoodieRecord> records) throws Exception {
-    for (List<HoodieRecord> groupedRecords: records.stream()
+  public Map<String, List<HoodieLogFile>> withLogAppends(List<HoodieRecord> records) throws Exception {
+    Map<String, List<HoodieLogFile>> partitionToLogfilesMap = new HashMap<>();
+    for (List<HoodieRecord> groupedRecords : records.stream()
         .collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation)).values()) {
-      appendRecordsToLogFile(groupedRecords);
+      final Pair<String, HoodieLogFile> appendedLogFile = appendRecordsToLogFile(groupedRecords);
+      partitionToLogfilesMap.computeIfAbsent(
+          appendedLogFile.getKey(), k -> new ArrayList<>()).add(appendedLogFile.getValue());
     }
-    return this;
+    return partitionToLogfilesMap;
   }
 
-  private void appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
+  private Pair<String, HoodieLogFile> appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
    String partitionPath = groupedRecords.get(0).getPartitionPath();
    HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation();
    try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath))
@@ -170,6 +177,7 @@ public class HoodieWriteableTestTable extends HoodieTestTable {
           return null;
         }
       }).collect(Collectors.toList()), header));
+      return Pair.of(partitionPath, logWriter.getLogFile());
     }
   }
 }
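
Note: a hypothetical caller-side sketch of the new contract (testTable and taggedRecords here are placeholders, not part of this change). withLogAppends now hands back the partition-to-log-files mapping instead of returning this, so a test can assert on exactly which log files were appended:

    // Sketch only: append tagged records, then inspect what was written per partition.
    Map<String, List<HoodieLogFile>> partitionToLogFiles = testTable.withLogAppends(taggedRecords);
    partitionToLogFiles.forEach((partition, logFiles) ->
        // every appended HoodieLogFile is now visible to the caller, e.g. for metadata-table assertions
        assertFalse(logFiles.isEmpty(), "expected at least one log file under " + partition));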

HoodieFlinkWriteableTestTable.java
@@ -30,6 +30,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.log.HoodieLogFormat;
 import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
+import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.table.HoodieTable;
 
 import org.apache.avro.Schema;
@@ -38,7 +39,9 @@ import org.apache.hadoop.fs.Path;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
@@ -105,14 +108,18 @@ public class HoodieFlinkWriteableTestTable extends HoodieWriteableTestTable
     return (HoodieFlinkWriteableTestTable) withInserts(partition, fileId, records, new org.apache.hudi.client.FlinkTaskContextSupplier(null));
   }
 
-  public HoodieFlinkWriteableTestTable withLogAppends(List<HoodieRecord> records) throws Exception {
-    for (List<HoodieRecord> groupedRecords: records.stream().collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation)).values()) {
-      appendRecordsToLogFile(groupedRecords);
+  public Map<String, List<HoodieLogFile>> withLogAppends(List<HoodieRecord> records) throws Exception {
+    Map<String, List<HoodieLogFile>> partitionToLogfilesMap = new HashMap<>();
+    for (List<HoodieRecord> groupedRecords : records.stream().collect(
+        Collectors.groupingBy(HoodieRecord::getCurrentLocation)).values()) {
+      final Pair<String, HoodieLogFile> appendedLogFile = appendRecordsToLogFile(groupedRecords);
+      partitionToLogfilesMap.computeIfAbsent(
+          appendedLogFile.getKey(), k -> new ArrayList<>()).add(appendedLogFile.getValue());
     }
-    return this;
+    return partitionToLogfilesMap;
   }
 
-  private void appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
+  private Pair<String, HoodieLogFile> appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
    String partitionPath = groupedRecords.get(0).getPartitionPath();
    HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation();
    try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath))
@@ -131,6 +138,7 @@ public class HoodieFlinkWriteableTestTable extends HoodieWriteableTestTable {
           return null;
         }
       }).collect(Collectors.toList()), header));
+      return Pair.of(partitionPath, logWriter.getLogFile());
     }
   }
 }

TestHoodieMergeOnReadTable.java
@@ -34,13 +34,14 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieInstant.State;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
-import org.apache.hudi.common.table.view.SyncableFileSystemView;
 import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView;
 import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
 import org.apache.hudi.common.testutils.Transformations;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.index.HoodieIndex.IndexType;
+import org.apache.hudi.metadata.HoodieTableMetadataWriter;
+import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter;
 import org.apache.hudi.table.action.deltacommit.AbstractSparkDeltaCommitActionExecutor;
 import org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor;
 import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils;
@@ -63,6 +64,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;
+import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
@@ -190,11 +192,13 @@ public class TestHoodieMergeOnReadTable extends SparkClientFunctionalTestHarness
     }
   }
 
+  // TODO: Enable metadata virtual keys in this test once the feature HUDI-2593 is completed
   @ParameterizedTest
-  @ValueSource(booleans = {true, false})
+  @ValueSource(booleans = {true})
   public void testLogFileCountsAfterCompaction(boolean populateMetaFields) throws Exception {
     // insert 100 records
-    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build());
+    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true)
+        .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build());
     addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
     HoodieWriteConfig config = cfgBuilder.build();
 
@@ -208,37 +212,40 @@ public class TestHoodieMergeOnReadTable extends SparkClientFunctionalTestHarness
 
       // Update all the 100 records
       newCommitTime = "101";
-      writeClient.startCommitWithTime(newCommitTime);
 
       List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
       JavaRDD<HoodieRecord> updatedRecordsRDD = jsc().parallelize(updatedRecords, 1);
 
       HoodieReadClient readClient = new HoodieReadClient(context(), config);
-      updatedRecords = readClient.tagLocation(updatedRecordsRDD).collect();
+      JavaRDD<HoodieRecord> updatedTaggedRecordsRDD = readClient.tagLocation(updatedRecordsRDD);
+
+      writeClient.startCommitWithTime(newCommitTime);
+      writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect();
 
-      // Write them to corresponding avro logfiles
       metaClient = HoodieTableMetaClient.reload(metaClient);
-      HoodieTable table = HoodieSparkTable.create(config, context(), metaClient);
-      HoodieSparkWriteableTestTable.of(table, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS)
-          .withLogAppends(updatedRecords);
-      // In writeRecordsToLogFiles, no commit files are getting added, so resetting file-system view state
-      ((SyncableFileSystemView) (table.getSliceView())).reset();
+
+      HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(
+          writeClient.getEngineContext().getHadoopConf().get(), config, writeClient.getEngineContext());
+      HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable
+          .of(metaClient, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, metadataWriter);
+
+      Set<String> allPartitions = updatedRecords.stream()
+          .map(record -> record.getPartitionPath())
+          .collect(Collectors.groupingBy(partitionPath -> partitionPath))
+          .keySet();
+      assertEquals(allPartitions.size(), testTable.listAllBaseFiles().length);
 
       // Verify that all data file has one log file
+      HoodieTable table = HoodieSparkTable.create(config, context(), metaClient, true);
       for (String partitionPath : dataGen.getPartitionPaths()) {
         List<FileSlice> groupedLogFiles =
             table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
         for (FileSlice fileSlice : groupedLogFiles) {
-          assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for every data file");
+          assertEquals(1, fileSlice.getLogFiles().count(),
+              "There should be 1 log file written for the latest data file - " + fileSlice);
         }
       }
-
-      // Mark 2nd delta-instant as completed
-      metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(State.INFLIGHT,
-          HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime));
-      metaClient.getActiveTimeline().saveAsComplete(
-          new HoodieInstant(State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime), Option.empty());
 
       // Do a compaction
       String compactionInstantTime = writeClient.scheduleCompaction(Option.empty()).get().toString();
      JavaRDD<WriteStatus> result = (JavaRDD<WriteStatus>) writeClient.compact(compactionInstantTime);
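
Note: the prepped-upsert flow above replaces the hand-rolled log appends and manual timeline transitions. A minimal sketch of the distinction, assuming the same readClient/writeClient objects as in the test: upsert() would re-run the index lookup internally, while the prepped variant expects records that already carry their current location from tagLocation.

      // Sketch only; updatedRecordsRDD and newCommitTime come from the surrounding test.
      JavaRDD<HoodieRecord> tagged = readClient.tagLocation(updatedRecordsRDD); // records now carry a HoodieRecordLocation
      writeClient.startCommitWithTime(newCommitTime);
      List<WriteStatus> statuses = writeClient.upsertPreppedRecords(tagged, newCommitTime).collect();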

TestHoodieCompactor.java
@@ -46,7 +46,6 @@ import org.apache.hudi.index.bloom.SparkHoodieBloomIndexHelper;
 import org.apache.hudi.table.HoodieSparkTable;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.testutils.HoodieClientTestHarness;
-import org.apache.hudi.testutils.HoodieSparkWriteableTestTable;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.spark.api.java.JavaRDD;
@@ -57,9 +56,6 @@ import org.junit.jupiter.api.Test;
 import java.util.List;
 import java.util.stream.Collectors;
 
-import static org.apache.hudi.common.testutils.FileCreateUtils.createDeltaCommit;
-import static org.apache.hudi.common.testutils.FileCreateUtils.createInflightDeltaCommit;
-import static org.apache.hudi.common.testutils.FileCreateUtils.createRequestedDeltaCommit;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertThrows;
@@ -163,7 +159,7 @@ public class TestHoodieCompactor extends HoodieClientTestHarness {
     // insert 100 records
     HoodieWriteConfig config = getConfigBuilder()
         .withCompactionConfig(HoodieCompactionConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build())
-        .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build())
+        .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build())
         .build();
     try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
       String newCommitTime = "100";
@@ -176,19 +172,14 @@ public class TestHoodieCompactor extends HoodieClientTestHarness {
       // Update all the 100 records
       HoodieTable table = HoodieSparkTable.create(config, context);
       newCommitTime = "101";
-      writeClient.startCommitWithTime(newCommitTime);
 
       List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
       JavaRDD<HoodieRecord> updatedRecordsRDD = jsc.parallelize(updatedRecords, 1);
       HoodieIndex index = new HoodieBloomIndex<>(config, SparkHoodieBloomIndexHelper.getInstance());
-      updatedRecords = tagLocation(index, updatedRecordsRDD, table).collect();
+      JavaRDD<HoodieRecord> updatedTaggedRecordsRDD = tagLocation(index, updatedRecordsRDD, table);
 
-      // Write them to corresponding avro logfiles. Also, set the state transition properly.
-      HoodieSparkWriteableTestTable.of(table, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS)
-          .withLogAppends(updatedRecords);
-      metaClient.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED,
-          HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime), Option.empty());
-      writeClient.commit(newCommitTime, jsc.emptyRDD(), Option.empty());
+      writeClient.startCommitWithTime(newCommitTime);
+      writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect();
       metaClient.reloadActiveTimeline();
 
       // Verify that all data file has one log file
@@ -200,9 +191,6 @@ public class TestHoodieCompactor extends HoodieClientTestHarness {
           assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for every data file");
         }
       }
-      createDeltaCommit(basePath, newCommitTime);
-      createRequestedDeltaCommit(basePath, newCommitTime);
-      createInflightDeltaCommit(basePath, newCommitTime);
 
       // Do a compaction
       table = HoodieSparkTable.create(config, context);

HoodieSparkWriteableTestTable.java
@@ -25,6 +25,7 @@ import org.apache.hudi.common.bloom.BloomFilterFactory;
 import org.apache.hudi.common.bloom.BloomFilterTypeCode;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.metadata.HoodieTableMetadataWriter;
 import org.apache.hudi.table.HoodieTable;
 
 import org.apache.avro.Schema;
@@ -39,12 +40,20 @@ import java.util.UUID;
 public class HoodieSparkWriteableTestTable extends HoodieWriteableTestTable {
   private static final Logger LOG = LogManager.getLogger(HoodieSparkWriteableTestTable.class);
 
-  private HoodieSparkWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) {
-    super(basePath, fs, metaClient, schema, filter);
+  private HoodieSparkWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema,
+                                        BloomFilter filter, HoodieTableMetadataWriter metadataWriter) {
+    super(basePath, fs, metaClient, schema, filter, metadataWriter);
   }
 
   public static HoodieSparkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) {
-    return new HoodieSparkWriteableTestTable(metaClient.getBasePath(), metaClient.getRawFs(), metaClient, schema, filter);
+    return new HoodieSparkWriteableTestTable(metaClient.getBasePath(), metaClient.getRawFs(),
+        metaClient, schema, filter, null);
+  }
+
+  public static HoodieSparkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter,
+                                                 HoodieTableMetadataWriter metadataWriter) {
+    return new HoodieSparkWriteableTestTable(metaClient.getBasePath(), metaClient.getRawFs(),
+        metaClient, schema, filter, metadataWriter);
   }
 
   public static HoodieSparkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema) {
@@ -53,6 +62,13 @@ public class HoodieSparkWriteableTestTable extends HoodieWriteableTestTable {
     return of(metaClient, schema, filter);
   }
 
+  public static HoodieSparkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema,
+                                                 HoodieTableMetadataWriter metadataWriter) {
+    BloomFilter filter = BloomFilterFactory
+        .createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
+    return of(metaClient, schema, filter, metadataWriter);
+  }
+
   public static HoodieSparkWriteableTestTable of(HoodieTable hoodieTable, Schema schema) {
     HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
     return of(metaClient, schema);
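
Note: a usage sketch of the new metadata-writer-aware factory, mirroring the calls made in TestHoodieMergeOnReadTable above (hadoopConf, config, context, metaClient and schema are assumed to come from the surrounding test harness):

    // Sketch only: wire the test table to a metadata writer so files it creates are tracked.
    HoodieTableMetadataWriter metadataWriter =
        SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
    HoodieSparkWriteableTestTable testTable =
        HoodieSparkWriteableTestTable.of(metaClient, schema, metadataWriter);

    // Existing call sites keep compiling; they resolve to the same constructor with a null metadata writer.
    HoodieSparkWriteableTestTable plainTable = HoodieSparkWriteableTestTable.of(metaClient, schema);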

HoodieTestTable.java
@@ -131,6 +131,7 @@ public class HoodieTestTable {
     this.basePath = basePath;
     this.fs = fs;
     this.metaClient = metaClient;
+    testTableState = HoodieTestTableState.of();
   }
 
   public static HoodieTestTable of(HoodieTableMetaClient metaClient) {