1
0

[HUDI-3664] Fixing Column Stats Index composition (#5181)

Co-authored-by: Sagar Sumit <sagarsumit09@gmail.com>
This commit is contained in:
Alexey Kudinkin
2022-04-02 17:15:52 -07:00
committed by GitHub
parent 74eb09be9b
commit cc3737be50
52 changed files with 1776 additions and 749 deletions

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.avro;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.SchemaTestUtil;
import org.apache.hudi.exception.SchemaCompatibilityException;
import org.apache.avro.JsonProperties;
@@ -27,12 +28,14 @@ import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import static org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldSchemaFromWriteSchema;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
@@ -88,6 +91,12 @@ public class TestHoodieAvroUtils {
+ "{\"name\":\"decimal_col\",\"type\":[\"null\","
+ "{\"type\":\"bytes\",\"logicalType\":\"decimal\",\"precision\":8,\"scale\":4}],\"default\":null}]}";
private static String SCHEMA_WITH_NESTED_FIELD = "{\"name\":\"MyClass\",\"type\":\"record\",\"namespace\":\"com.acme.avro\",\"fields\":["
+ "{\"name\":\"firstname\",\"type\":\"string\"},"
+ "{\"name\":\"lastname\",\"type\":\"string\"},"
+ "{\"name\":\"student\",\"type\":{\"name\":\"student\",\"type\":\"record\",\"fields\":["
+ "{\"name\":\"firstname\",\"type\":[\"null\" ,\"string\"],\"default\": null},{\"name\":\"lastname\",\"type\":[\"null\" ,\"string\"],\"default\": null}]}}]}";
@Test
public void testPropsPresent() {
Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(EXAMPLE_SCHEMA));
@@ -248,7 +257,7 @@ public class TestHoodieAvroUtils {
}
@Test
public void testGetNestedFieldValWithDecimalFiled() {
public void testGetNestedFieldValWithDecimalField() {
GenericRecord rec = new GenericData.Record(new Schema.Parser().parse(SCHEMA_WITH_DECIMAL_FIELD));
rec.put("key_col", "key");
BigDecimal bigDecimal = new BigDecimal("1234.5678");
@@ -264,4 +273,36 @@ public class TestHoodieAvroUtils {
assertEquals(0, buffer.position());
}
@Test
public void testGetNestedFieldSchema() throws IOException {
  // First record on the evolved test schema.
  Schema evolvedSchema = SchemaTestUtil.getEvolvedSchema();
  GenericRecord firstRecord = new GenericData.Record(evolvedSchema);
  firstRecord.put("field1", "key1");
  firstRecord.put("field2", "val1");
  firstRecord.put("name", "val2");
  firstRecord.put("favorite_number", 2);

  // A top-level field is expected to resolve to the plain STRING schema.
  Schema field1Schema = getNestedFieldSchemaFromWriteSchema(firstRecord.getSchema(), "field1");
  assertEquals(Schema.create(Schema.Type.STRING), field1Schema);

  // Second record: identical to the first except for a larger "favorite_number".
  GenericRecord secondRecord = new GenericData.Record(evolvedSchema);
  secondRecord.put("field1", "key1");
  secondRecord.put("field2", "val1");
  secondRecord.put("name", "val2");
  secondRecord.put("favorite_number", 12);

  // Comparing 2 against 12 under the schema resolved for the non-string field must yield -1.
  Schema favoriteNumberSchema = getNestedFieldSchemaFromWriteSchema(firstRecord.getSchema(), "favorite_number");
  int comparison = GenericData.get().compare(
      firstRecord.get("favorite_number"), secondRecord.get("favorite_number"), favoriteNumberSchema);
  assertEquals(-1, comparison);

  // Nested field: build a record that embeds a "student" sub-record.
  Schema nestedSchema = new Schema.Parser().parse(SCHEMA_WITH_NESTED_FIELD);
  GenericRecord person = new GenericData.Record(nestedSchema);
  person.put("firstname", "person1");
  person.put("lastname", "person2");
  Schema studentSchema = person.getSchema().getField("student").schema();
  GenericRecord student = new GenericData.Record(studentSchema);
  student.put("firstname", "person1");
  student.put("lastname", "person2");
  person.put("student", student);

  // The dotted path is looked up both through the record's schema and through the parsed
  // schema directly; both lookups are expected to produce the plain STRING schema.
  Schema expectedNestedSchema = Schema.create(Schema.Type.STRING);
  assertEquals(expectedNestedSchema, getNestedFieldSchemaFromWriteSchema(person.getSchema(), "student.firstname"));
  assertEquals(expectedNestedSchema, getNestedFieldSchemaFromWriteSchema(nestedSchema, "student.firstname"));
}
}

View File

@@ -1749,40 +1749,39 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(),
bufferSize, readBlocksLazily, true);
HoodieLogFile logFile = new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen());
try (HoodieLogFileReader reader = new HoodieLogFileReader(fs, logFile, SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true)) {
assertTrue(reader.hasPrev(), "Last block should be available");
HoodieLogBlock prevBlock = reader.prev();
HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;
assertTrue(reader.hasPrev(), "Last block should be available");
HoodieLogBlock prevBlock = reader.prev();
HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead1 = getRecords(dataBlockRead);
assertEquals(copyOfRecords3.size(), recordsRead1.size(),
"Third records size should be equal to the written records size");
assertEquals(copyOfRecords3, recordsRead1,
"Both records lists should be the same. (ordering guaranteed)");
List<IndexedRecord> recordsRead1 = getRecords(dataBlockRead);
assertEquals(copyOfRecords3.size(), recordsRead1.size(),
"Third records size should be equal to the written records size");
assertEquals(copyOfRecords3, recordsRead1,
"Both records lists should be the same. (ordering guaranteed)");
assertTrue(reader.hasPrev(), "Second block should be available");
prevBlock = reader.prev();
dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead2 = getRecords(dataBlockRead);
assertEquals(copyOfRecords2.size(), recordsRead2.size(),
"Read records size should be equal to the written records size");
assertEquals(copyOfRecords2, recordsRead2,
"Both records lists should be the same. (ordering guaranteed)");
assertTrue(reader.hasPrev(), "Second block should be available");
prevBlock = reader.prev();
dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead2 = getRecords(dataBlockRead);
assertEquals(copyOfRecords2.size(), recordsRead2.size(),
"Read records size should be equal to the written records size");
assertEquals(copyOfRecords2, recordsRead2,
"Both records lists should be the same. (ordering guaranteed)");
assertTrue(reader.hasPrev(), "First block should be available");
prevBlock = reader.prev();
dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead3 = getRecords(dataBlockRead);
assertEquals(copyOfRecords1.size(), recordsRead3.size(),
"Read records size should be equal to the written records size");
assertEquals(copyOfRecords1, recordsRead3,
"Both records lists should be the same. (ordering guaranteed)");
assertTrue(reader.hasPrev(), "First block should be available");
prevBlock = reader.prev();
dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead3 = getRecords(dataBlockRead);
assertEquals(copyOfRecords1.size(), recordsRead3.size(),
"Read records size should be equal to the written records size");
assertEquals(copyOfRecords1, recordsRead3,
"Both records lists should be the same. (ordering guaranteed)");
assertFalse(reader.hasPrev());
reader.close();
assertFalse(reader.hasPrev());
}
}
@ParameterizedTest
@@ -1830,19 +1829,20 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
writer.close();
// First round of reads - we should be able to read the first block and then EOF
HoodieLogFileReader reader =
new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
fs.getFileStatus(writer.getLogFile().getPath()).getLen()), schema, bufferSize, readBlocksLazily, true);
HoodieLogFile logFile = new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen());
assertTrue(reader.hasPrev(), "Last block should be available");
HoodieLogBlock block = reader.prev();
assertTrue(block instanceof HoodieDataBlock, "Last block should be datablock");
try (HoodieLogFileReader reader =
new HoodieLogFileReader(fs, logFile, schema, bufferSize, readBlocksLazily, true)) {
assertTrue(reader.hasPrev(), "Last block should be available");
assertThrows(CorruptedLogFileException.class, () -> {
reader.prev();
});
reader.close();
assertTrue(reader.hasPrev(), "Last block should be available");
HoodieLogBlock block = reader.prev();
assertTrue(block instanceof HoodieDataBlock, "Last block should be datablock");
assertTrue(reader.hasPrev(), "Last block should be available");
assertThrows(CorruptedLogFileException.class, () -> {
reader.prev();
});
}
}
@ParameterizedTest
@@ -1882,28 +1882,28 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(),
fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(),
bufferSize, readBlocksLazily, true);
HoodieLogFile logFile = new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen());
try (HoodieLogFileReader reader =
new HoodieLogFileReader(fs, logFile, SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true)) {
assertTrue(reader.hasPrev(), "Third block should be available");
reader.moveToPrev();
assertTrue(reader.hasPrev(), "Third block should be available");
reader.moveToPrev();
assertTrue(reader.hasPrev(), "Second block should be available");
reader.moveToPrev();
assertTrue(reader.hasPrev(), "Second block should be available");
reader.moveToPrev();
// After moving twice, this last reader.prev() should read the First block written
assertTrue(reader.hasPrev(), "First block should be available");
HoodieLogBlock prevBlock = reader.prev();
HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
assertEquals(copyOfRecords1.size(), recordsRead.size(),
"Read records size should be equal to the written records size");
assertEquals(copyOfRecords1, recordsRead,
"Both records lists should be the same. (ordering guaranteed)");
// After moving twice, this last reader.prev() should read the First block written
assertTrue(reader.hasPrev(), "First block should be available");
HoodieLogBlock prevBlock = reader.prev();
HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
assertEquals(copyOfRecords1.size(), recordsRead.size(),
"Read records size should be equal to the written records size");
assertEquals(copyOfRecords1, recordsRead,
"Both records lists should be the same. (ordering guaranteed)");
assertFalse(reader.hasPrev());
reader.close();
assertFalse(reader.hasPrev());
}
}
@Test

View File

@@ -99,15 +99,6 @@ public class FileCreateUtils {
return String.format("%s_%s_%s%s%s.%s", fileId, WRITE_TOKEN, instantTime, fileExtension, HoodieTableMetaClient.MARKER_EXTN, ioType);
}
private static void createMetaFile(String basePath, String instantTime, String suffix) throws IOException {
Path parentPath = Paths.get(basePath, HoodieTableMetaClient.METAFOLDER_NAME);
Files.createDirectories(parentPath);
Path metaFilePath = parentPath.resolve(instantTime + suffix);
if (Files.notExists(metaFilePath)) {
Files.createFile(metaFilePath);
}
}
private static void createMetaFile(String basePath, String instantTime, String suffix, FileSystem fs) throws IOException {
org.apache.hadoop.fs.Path parentPath = new org.apache.hadoop.fs.Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME);
if (!fs.exists(parentPath)) {
@@ -119,12 +110,20 @@ public class FileCreateUtils {
}
}
/**
 * Creates a meta file with no content by delegating to the {@code byte[]} overload
 * with an empty payload.
 *
 * @param basePath    base path under which the meta folder is resolved
 * @param instantTime instant time used as the file-name prefix
 * @param suffix      suffix appended to {@code instantTime} to form the file name
 * @throws IOException if the underlying file operations fail
 */
private static void createMetaFile(String basePath, String instantTime, String suffix) throws IOException {
  byte[] emptyContent = "".getBytes();
  createMetaFile(basePath, instantTime, suffix, emptyContent);
}
/**
 * Creates the meta file {@code instantTime + suffix} inside the
 * {@code HoodieTableMetaClient.METAFOLDER_NAME} directory under {@code basePath},
 * creating that directory first if needed. Nothing is written when the file already exists.
 *
 * @param basePath    base path under which the meta folder is resolved
 * @param instantTime instant time used as the file-name prefix
 * @param suffix      suffix appended to {@code instantTime} to form the file name
 * @param content     bytes to store; an empty array produces an empty file
 * @throws IOException if creating the directory or the file fails
 */
private static void createMetaFile(String basePath, String instantTime, String suffix, byte[] content) throws IOException {
  Path parentPath = Paths.get(basePath, HoodieTableMetaClient.METAFOLDER_NAME);
  Files.createDirectories(parentPath);
  Path metaFilePath = parentPath.resolve(instantTime + suffix);
  if (Files.notExists(metaFilePath)) {
    // Exactly one creation path runs: an unconditional write ahead of this branch would
    // make Files.createFile fail with FileAlreadyExistsException for empty content.
    if (content.length == 0) {
      Files.createFile(metaFilePath);
    } else {
      Files.write(metaFilePath, content);
    }
  }
}

View File

@@ -117,8 +117,12 @@ import static org.apache.hudi.common.util.StringUtils.EMPTY_STRING;
public class HoodieTestTable {
public static final String PHONY_TABLE_SCHEMA =
"{\"namespace\": \"org.apache.hudi.avro.model\", \"type\": \"record\", \"name\": \"PhonyRecord\", \"fields\": []}";
private static final Logger LOG = LogManager.getLogger(HoodieTestTable.class);
private static final Random RANDOM = new Random();
protected static HoodieTestTableState testTableState;
private final List<String> inflightCommits = new ArrayList<>();
@@ -215,7 +219,7 @@ public class HoodieTestTable {
writeStats.addAll(generateHoodieWriteStatForPartitionLogFiles(testTableState.getPartitionToLogFileInfoMap(commitTime), commitTime, bootstrap));
}
Map<String, String> extraMetadata = createImmutableMap("test", "test");
return buildMetadata(writeStats, partitionToReplaceFileIds, Option.of(extraMetadata), operationType, EMPTY_STRING, action);
return buildMetadata(writeStats, partitionToReplaceFileIds, Option.of(extraMetadata), operationType, PHONY_TABLE_SCHEMA, action);
}
public HoodieTestTable moveInflightCommitToComplete(String instantTime, HoodieCommitMetadata metadata) throws IOException {
@@ -779,7 +783,7 @@ public class HoodieTestTable {
this.withBaseFilesInPartition(partition, testTableState.getPartitionToBaseFileInfoMap(commitTime).get(partition));
}
HoodieReplaceCommitMetadata replaceMetadata =
(HoodieReplaceCommitMetadata) buildMetadata(writeStats, partitionToReplaceFileIds, Option.empty(), CLUSTER, EMPTY_STRING,
(HoodieReplaceCommitMetadata) buildMetadata(writeStats, partitionToReplaceFileIds, Option.empty(), CLUSTER, PHONY_TABLE_SCHEMA,
REPLACE_COMMIT_ACTION);
addReplaceCommit(commitTime, Option.empty(), Option.empty(), replaceMetadata);
return replaceMetadata;