1
0

[HUDI-3855] Fixing FILENAME_METADATA_FIELD not being correctly updated in HoodieMergeHandle (#5296)

Fixing FILENAME_METADATA_FIELD not being correctly updated in HoodieMergeHandle in cases when the old record is carried over from the existing file as is.

- Revisited HoodieFileWriter API to accept HoodieKey instead of HoodieRecord
- Fixed FILENAME_METADATA_FIELD not being overridden in cases when the old record is simply carried over
- Exposing the standard JVM debugger ports in the Docker setup
This commit is contained in:
Alexey Kudinkin
2022-04-12 17:42:15 -07:00
committed by GitHub
parent 2e6e302efe
commit 7b78dff45f
11 changed files with 122 additions and 78 deletions

View File

@@ -21,13 +21,15 @@ package org.apache.hudi.io;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestTable;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
@@ -36,8 +38,6 @@ import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.testutils.HoodieClientTestHarness;
import org.apache.hudi.testutils.HoodieClientTestUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
@@ -51,6 +51,8 @@ import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors;
@@ -94,7 +96,6 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness {
.withProperties(properties)
.build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
FileSystem fs = FSUtils.getFs(basePath, hadoopConf);
/**
* Write 1 (only inserts) This will do a bulk insert of 44 records of which there are 2 records repeated 21 times
@@ -202,6 +203,7 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness {
// Check the entire dataset has 47 records still
dataSet = getRecords();
assertEquals(47, dataSet.count(), "Must contain 47 records");
Row[] rows = (Row[]) dataSet.collect();
int record1Count = 0;
int record2Count = 0;
@@ -228,6 +230,22 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness {
// Assert that id2 record count which has been updated to rider-004 and driver-004 is 21, which is the total
// number of records with row_key id2
assertEquals(21, record2Count);
// Validate that all the records only reference the _latest_ base files as part of the
// FILENAME_METADATA_FIELD payload (entailing that the corresponding metadata is in sync with
// the state of the table)
HoodieTableFileSystemView tableView =
getHoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline(), HoodieTestTable.of(metaClient).listAllBaseFiles());
Set<String> latestBaseFileNames = tableView.getLatestBaseFiles()
.map(BaseFile::getFileName)
.collect(Collectors.toSet());
Set<Object> metadataFilenameFieldRefs = dataSet.collectAsList().stream()
.map(row -> row.getAs(HoodieRecord.FILENAME_METADATA_FIELD))
.collect(Collectors.toSet());
assertEquals(latestBaseFileNames, metadataFilenameFieldRefs);
}
}