[HUDI-114]: added option to overwrite payload implementation in hoodie.properties file
This commit is contained in:
committed by
Balaji Varadarajan
parent
5af3dc6aed
commit
3c90d252cc
@@ -18,8 +18,8 @@
|
|||||||
|
|
||||||
package org.apache.hudi.config;
|
package org.apache.hudi.config;
|
||||||
|
|
||||||
import org.apache.hudi.common.model.HoodieAvroPayload;
|
|
||||||
import org.apache.hudi.common.model.HoodieCleaningPolicy;
|
import org.apache.hudi.common.model.HoodieCleaningPolicy;
|
||||||
|
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
|
||||||
import org.apache.hudi.io.compact.strategy.CompactionStrategy;
|
import org.apache.hudi.io.compact.strategy.CompactionStrategy;
|
||||||
import org.apache.hudi.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
|
import org.apache.hudi.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
|
||||||
|
|
||||||
@@ -82,7 +82,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
|
|||||||
// 200GB of target IO per compaction
|
// 200GB of target IO per compaction
|
||||||
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName();
|
public static final String DEFAULT_COMPACTION_STRATEGY = LogFileSizeBasedCompactionStrategy.class.getName();
|
||||||
// used to merge records written to log file
|
// used to merge records written to log file
|
||||||
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
|
public static final String DEFAULT_PAYLOAD_CLASS = OverwriteWithLatestAvroPayload.class.getName();
|
||||||
public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class";
|
public static final String PAYLOAD_CLASS_PROP = "hoodie.compaction.payload.class";
|
||||||
|
|
||||||
// used to choose a trade off between IO vs Memory when performing compaction process
|
// used to choose a trade off between IO vs Memory when performing compaction process
|
||||||
|
|||||||
@@ -16,7 +16,7 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.hudi;
|
package org.apache.hudi.common.model;
|
||||||
|
|
||||||
import org.apache.hudi.common.util.HoodieAvroUtils;
|
import org.apache.hudi.common.util.HoodieAvroUtils;
|
||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
@@ -34,7 +34,7 @@ public abstract class BaseAvroPayload implements Serializable {
|
|||||||
/**
|
/**
|
||||||
* Avro data extracted from the source converted to bytes.
|
* Avro data extracted from the source converted to bytes.
|
||||||
*/
|
*/
|
||||||
protected final byte[] recordBytes;
|
public final byte[] recordBytes;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For purposes of preCombining.
|
* For purposes of preCombining.
|
||||||
@@ -49,7 +49,7 @@ public abstract class BaseAvroPayload implements Serializable {
|
|||||||
*/
|
*/
|
||||||
public BaseAvroPayload(GenericRecord record, Comparable orderingVal) {
|
public BaseAvroPayload(GenericRecord record, Comparable orderingVal) {
|
||||||
try {
|
try {
|
||||||
this.recordBytes = HoodieAvroUtils.avroToBytes(record);
|
this.recordBytes = record != null ? HoodieAvroUtils.avroToBytes(record) : new byte[0];
|
||||||
} catch (IOException io) {
|
} catch (IOException io) {
|
||||||
throw new HoodieIOException("Cannot convert GenericRecord to bytes", io);
|
throw new HoodieIOException("Cannot convert GenericRecord to bytes", io);
|
||||||
}
|
}
|
||||||
@@ -16,9 +16,8 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.hudi;
|
package org.apache.hudi.common.model;
|
||||||
|
|
||||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
|
||||||
import org.apache.hudi.common.util.HoodieAvroUtils;
|
import org.apache.hudi.common.util.HoodieAvroUtils;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
|
|
||||||
@@ -45,7 +44,7 @@ public class OverwriteWithLatestAvroPayload extends BaseAvroPayload
|
|||||||
}
|
}
|
||||||
|
|
||||||
public OverwriteWithLatestAvroPayload(Option<GenericRecord> record) {
|
public OverwriteWithLatestAvroPayload(Option<GenericRecord> record) {
|
||||||
this(record.get(), (record1) -> 0); // natural order
|
this(record.isPresent() ? record.get() : null, (record1) -> 0); // natural order
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -61,7 +60,12 @@ public class OverwriteWithLatestAvroPayload extends BaseAvroPayload
|
|||||||
@Override
|
@Override
|
||||||
public Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException {
|
public Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException {
|
||||||
|
|
||||||
GenericRecord genericRecord = (GenericRecord) getInsertValue(schema).get();
|
Option<IndexedRecord> recordOption = getInsertValue(schema);
|
||||||
|
if (!recordOption.isPresent()) {
|
||||||
|
return Option.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
GenericRecord genericRecord = (GenericRecord) recordOption.get();
|
||||||
// combining strategy here trivially ignores currentValue on disk and writes this record
|
// combining strategy here trivially ignores currentValue on disk and writes this record
|
||||||
Object deleteMarker = genericRecord.get("_hoodie_is_deleted");
|
Object deleteMarker = genericRecord.get("_hoodie_is_deleted");
|
||||||
if (deleteMarker instanceof Boolean && (boolean) deleteMarker) {
|
if (deleteMarker instanceof Boolean && (boolean) deleteMarker) {
|
||||||
@@ -73,6 +77,6 @@ public class OverwriteWithLatestAvroPayload extends BaseAvroPayload
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Option<IndexedRecord> getInsertValue(Schema schema) throws IOException {
|
public Option<IndexedRecord> getInsertValue(Schema schema) throws IOException {
|
||||||
return Option.of(HoodieAvroUtils.bytesToAvro(recordBytes, schema));
|
return recordBytes.length == 0 ? Option.empty() : Option.of(HoodieAvroUtils.bytesToAvro(recordBytes, schema));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -18,9 +18,9 @@
|
|||||||
|
|
||||||
package org.apache.hudi.common.table;
|
package org.apache.hudi.common.table;
|
||||||
|
|
||||||
import org.apache.hudi.common.model.HoodieAvroPayload;
|
|
||||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||||
import org.apache.hudi.common.model.HoodieTableType;
|
import org.apache.hudi.common.model.HoodieTableType;
|
||||||
|
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
|
||||||
import org.apache.hudi.common.model.TimelineLayoutVersion;
|
import org.apache.hudi.common.model.TimelineLayoutVersion;
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
|
|
||||||
@@ -62,13 +62,12 @@ public class HoodieTableConfig implements Serializable {
|
|||||||
public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE;
|
public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE;
|
||||||
public static final HoodieFileFormat DEFAULT_RO_FILE_FORMAT = HoodieFileFormat.PARQUET;
|
public static final HoodieFileFormat DEFAULT_RO_FILE_FORMAT = HoodieFileFormat.PARQUET;
|
||||||
public static final HoodieFileFormat DEFAULT_RT_FILE_FORMAT = HoodieFileFormat.HOODIE_LOG;
|
public static final HoodieFileFormat DEFAULT_RT_FILE_FORMAT = HoodieFileFormat.HOODIE_LOG;
|
||||||
|
public static final String DEFAULT_PAYLOAD_CLASS = OverwriteWithLatestAvroPayload.class.getName();
|
||||||
public static final Integer DEFAULT_TIMELINE_LAYOUT_VERSION = TimelineLayoutVersion.VERSION_0;
|
public static final Integer DEFAULT_TIMELINE_LAYOUT_VERSION = TimelineLayoutVersion.VERSION_0;
|
||||||
|
|
||||||
public static final String DEFAULT_PAYLOAD_CLASS = HoodieAvroPayload.class.getName();
|
|
||||||
public static final String DEFAULT_ARCHIVELOG_FOLDER = "";
|
public static final String DEFAULT_ARCHIVELOG_FOLDER = "";
|
||||||
private Properties props;
|
private Properties props;
|
||||||
|
|
||||||
public HoodieTableConfig(FileSystem fs, String metaPath) {
|
public HoodieTableConfig(FileSystem fs, String metaPath, String payloadClassName) {
|
||||||
Properties props = new Properties();
|
Properties props = new Properties();
|
||||||
Path propertyPath = new Path(metaPath, HOODIE_PROPERTIES_FILE);
|
Path propertyPath = new Path(metaPath, HOODIE_PROPERTIES_FILE);
|
||||||
LOG.info("Loading table properties from " + propertyPath);
|
LOG.info("Loading table properties from " + propertyPath);
|
||||||
@@ -76,6 +75,13 @@ public class HoodieTableConfig implements Serializable {
|
|||||||
try (FSDataInputStream inputStream = fs.open(propertyPath)) {
|
try (FSDataInputStream inputStream = fs.open(propertyPath)) {
|
||||||
props.load(inputStream);
|
props.load(inputStream);
|
||||||
}
|
}
|
||||||
|
if (props.containsKey(HOODIE_PAYLOAD_CLASS_PROP_NAME) && payloadClassName != null
|
||||||
|
&& !props.getProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME).equals(payloadClassName)) {
|
||||||
|
props.setProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, payloadClassName);
|
||||||
|
try (FSDataOutputStream outputStream = fs.create(propertyPath)) {
|
||||||
|
props.store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis()));
|
||||||
|
}
|
||||||
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new HoodieIOException("Could not load Hoodie properties from " + propertyPath, e);
|
throw new HoodieIOException("Could not load Hoodie properties from " + propertyPath, e);
|
||||||
}
|
}
|
||||||
@@ -109,7 +115,7 @@ public class HoodieTableConfig implements Serializable {
|
|||||||
if (!properties.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) {
|
if (!properties.containsKey(HOODIE_TABLE_TYPE_PROP_NAME)) {
|
||||||
properties.setProperty(HOODIE_TABLE_TYPE_PROP_NAME, DEFAULT_TABLE_TYPE.name());
|
properties.setProperty(HOODIE_TABLE_TYPE_PROP_NAME, DEFAULT_TABLE_TYPE.name());
|
||||||
}
|
}
|
||||||
if (properties.getProperty(HOODIE_TABLE_TYPE_PROP_NAME) == HoodieTableType.MERGE_ON_READ.name()
|
if (properties.getProperty(HOODIE_TABLE_TYPE_PROP_NAME).equals(HoodieTableType.MERGE_ON_READ.name())
|
||||||
&& !properties.containsKey(HOODIE_PAYLOAD_CLASS_PROP_NAME)) {
|
&& !properties.containsKey(HOODIE_PAYLOAD_CLASS_PROP_NAME)) {
|
||||||
properties.setProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS);
|
properties.setProperty(HOODIE_PAYLOAD_CLASS_PROP_NAME, DEFAULT_PAYLOAD_CLASS);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -89,13 +89,22 @@ public class HoodieTableMetaClient implements Serializable {
|
|||||||
this(conf, basePath, false);
|
this(conf, basePath, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad) {
|
public HoodieTableMetaClient(Configuration conf, String basePath, String payloadClassName) {
|
||||||
this(conf, basePath, loadActiveTimelineOnLoad, ConsistencyGuardConfig.newBuilder().build(),
|
this(conf, basePath, false, ConsistencyGuardConfig.newBuilder().build(), Option.of(TimelineLayoutVersion.CURR_LAYOUT_VERSION),
|
||||||
Option.of(TimelineLayoutVersion.CURR_LAYOUT_VERSION));
|
payloadClassName);
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad,
|
public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad,
|
||||||
ConsistencyGuardConfig consistencyGuardConfig, Option<TimelineLayoutVersion> layoutVersion)
|
ConsistencyGuardConfig consistencyGuardConfig, Option<TimelineLayoutVersion> layoutVersion) {
|
||||||
|
this(conf, basePath, loadActiveTimelineOnLoad, consistencyGuardConfig, layoutVersion, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad) {
|
||||||
|
this(conf, basePath, loadActiveTimelineOnLoad, ConsistencyGuardConfig.newBuilder().build(), Option.of(TimelineLayoutVersion.CURR_LAYOUT_VERSION), null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad,
|
||||||
|
ConsistencyGuardConfig consistencyGuardConfig, Option<TimelineLayoutVersion> layoutVersion, String payloadClassName)
|
||||||
throws TableNotFoundException {
|
throws TableNotFoundException {
|
||||||
LOG.info("Loading HoodieTableMetaClient from " + basePath);
|
LOG.info("Loading HoodieTableMetaClient from " + basePath);
|
||||||
this.basePath = basePath;
|
this.basePath = basePath;
|
||||||
@@ -106,7 +115,7 @@ public class HoodieTableMetaClient implements Serializable {
|
|||||||
Path metaPathDir = new Path(this.metaPath);
|
Path metaPathDir = new Path(this.metaPath);
|
||||||
this.fs = getFs();
|
this.fs = getFs();
|
||||||
TableNotFoundException.checkTableValidity(fs, basePathDir, metaPathDir);
|
TableNotFoundException.checkTableValidity(fs, basePathDir, metaPathDir);
|
||||||
this.tableConfig = new HoodieTableConfig(fs, metaPath);
|
this.tableConfig = new HoodieTableConfig(fs, metaPath, payloadClassName);
|
||||||
this.tableType = tableConfig.getTableType();
|
this.tableType = tableConfig.getTableType();
|
||||||
this.timelineLayoutVersion = layoutVersion.orElse(tableConfig.getTimelineLayoutVersion());
|
this.timelineLayoutVersion = layoutVersion.orElse(tableConfig.getTimelineLayoutVersion());
|
||||||
this.loadActiveTimelineOnLoad = loadActiveTimelineOnLoad;
|
this.loadActiveTimelineOnLoad = loadActiveTimelineOnLoad;
|
||||||
@@ -127,7 +136,7 @@ public class HoodieTableMetaClient implements Serializable {
|
|||||||
public static HoodieTableMetaClient reload(HoodieTableMetaClient oldMetaClient) {
|
public static HoodieTableMetaClient reload(HoodieTableMetaClient oldMetaClient) {
|
||||||
return new HoodieTableMetaClient(oldMetaClient.hadoopConf.get(), oldMetaClient.basePath,
|
return new HoodieTableMetaClient(oldMetaClient.hadoopConf.get(), oldMetaClient.basePath,
|
||||||
oldMetaClient.loadActiveTimelineOnLoad, oldMetaClient.consistencyGuardConfig,
|
oldMetaClient.loadActiveTimelineOnLoad, oldMetaClient.consistencyGuardConfig,
|
||||||
Option.of(oldMetaClient.timelineLayoutVersion));
|
Option.of(oldMetaClient.timelineLayoutVersion), null);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -284,9 +293,9 @@ public class HoodieTableMetaClient implements Serializable {
|
|||||||
* Helper method to initialize a table, with given basePath, tableType, name, archiveFolder.
|
* Helper method to initialize a table, with given basePath, tableType, name, archiveFolder.
|
||||||
*/
|
*/
|
||||||
public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, String tableType,
|
public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, String tableType,
|
||||||
String tableName, String archiveLogFolder) throws IOException {
|
String tableName, String archiveLogFolder, String payloadClassName) throws IOException {
|
||||||
return initTableType(hadoopConf, basePath, HoodieTableType.valueOf(tableType), tableName,
|
return initTableType(hadoopConf, basePath, HoodieTableType.valueOf(tableType), tableName,
|
||||||
archiveLogFolder, null, null);
|
archiveLogFolder, payloadClassName, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ package org.apache.hudi;
|
|||||||
|
|
||||||
import org.apache.hudi.common.model.HoodieKey;
|
import org.apache.hudi.common.model.HoodieKey;
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
|
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
|
||||||
import org.apache.hudi.common.util.HoodieAvroUtils;
|
import org.apache.hudi.common.util.HoodieAvroUtils;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
|
|||||||
@@ -18,7 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hudi.payload;
|
package org.apache.hudi.payload;
|
||||||
|
|
||||||
import org.apache.hudi.OverwriteWithLatestAvroPayload;
|
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
|
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
package org.apache.hudi
|
package org.apache.hudi
|
||||||
|
|
||||||
import org.apache.hudi.common.model.HoodieTableType
|
import org.apache.hudi.common.model.HoodieTableType
|
||||||
|
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload
|
||||||
import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
|
import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ private[hudi] object HoodieSparkSqlWriter {
|
|||||||
// Create the table if not present
|
// Create the table if not present
|
||||||
if (!exists) {
|
if (!exists) {
|
||||||
HoodieTableMetaClient.initTableType(sparkContext.hadoopConfiguration, path.get, storageType,
|
HoodieTableMetaClient.initTableType(sparkContext.hadoopConfiguration, path.get, storageType,
|
||||||
tblName.get, "archived")
|
tblName.get, "archived", parameters(PAYLOAD_CLASS_OPT_KEY))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create a HoodieWriteClient & issue the write.
|
// Create a HoodieWriteClient & issue the write.
|
||||||
|
|||||||
@@ -16,10 +16,10 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.avro.generic.GenericRecord
|
import org.apache.avro.generic.GenericRecord
|
||||||
import org.apache.hudi.common.model.EmptyHoodieRecordPayload
|
import org.apache.hudi.common.model.{EmptyHoodieRecordPayload, OverwriteWithLatestAvroPayload}
|
||||||
import org.apache.hudi.common.util.{Option, SchemaTestUtil, TypedProperties}
|
import org.apache.hudi.common.util.{Option, SchemaTestUtil, TypedProperties}
|
||||||
import org.apache.hudi.exception.{HoodieException, HoodieKeyException}
|
import org.apache.hudi.exception.{HoodieException, HoodieKeyException}
|
||||||
import org.apache.hudi.{ComplexKeyGenerator, DataSourceWriteOptions, OverwriteWithLatestAvroPayload, SimpleKeyGenerator}
|
import org.apache.hudi.{ComplexKeyGenerator, DataSourceWriteOptions, SimpleKeyGenerator}
|
||||||
import org.junit.Assert._
|
import org.junit.Assert._
|
||||||
import org.junit.{Before, Test}
|
import org.junit.{Before, Test}
|
||||||
import org.scalatest.junit.AssertionsForJUnit
|
import org.scalatest.junit.AssertionsForJUnit
|
||||||
|
|||||||
@@ -193,12 +193,13 @@ public class DeltaSync implements Serializable {
|
|||||||
*/
|
*/
|
||||||
private void refreshTimeline() throws IOException {
|
private void refreshTimeline() throws IOException {
|
||||||
if (fs.exists(new Path(cfg.targetBasePath))) {
|
if (fs.exists(new Path(cfg.targetBasePath))) {
|
||||||
HoodieTableMetaClient meta = new HoodieTableMetaClient(new Configuration(fs.getConf()), cfg.targetBasePath);
|
HoodieTableMetaClient meta = new HoodieTableMetaClient(new Configuration(fs.getConf()), cfg.targetBasePath,
|
||||||
|
cfg.payloadClassName);
|
||||||
this.commitTimelineOpt = Option.of(meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());
|
this.commitTimelineOpt = Option.of(meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());
|
||||||
} else {
|
} else {
|
||||||
this.commitTimelineOpt = Option.empty();
|
this.commitTimelineOpt = Option.empty();
|
||||||
HoodieTableMetaClient.initTableType(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath,
|
HoodieTableMetaClient.initTableType(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath,
|
||||||
cfg.storageType, cfg.targetTableName, "archived");
|
cfg.storageType, cfg.targetTableName, "archived", cfg.payloadClassName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -260,7 +261,7 @@ public class DeltaSync implements Serializable {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
HoodieTableMetaClient.initTableType(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath,
|
HoodieTableMetaClient.initTableType(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath,
|
||||||
cfg.storageType, cfg.targetTableName, "archived");
|
cfg.storageType, cfg.targetTableName, "archived", cfg.payloadClassName);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) {
|
if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) {
|
||||||
|
|||||||
@@ -19,8 +19,8 @@
|
|||||||
package org.apache.hudi.utilities.deltastreamer;
|
package org.apache.hudi.utilities.deltastreamer;
|
||||||
|
|
||||||
import org.apache.hudi.HoodieWriteClient;
|
import org.apache.hudi.HoodieWriteClient;
|
||||||
import org.apache.hudi.OverwriteWithLatestAvroPayload;
|
|
||||||
import org.apache.hudi.common.model.HoodieTableType;
|
import org.apache.hudi.common.model.HoodieTableType;
|
||||||
|
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
import org.apache.hudi.common.table.HoodieTimeline;
|
import org.apache.hudi.common.table.HoodieTimeline;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||||
|
|||||||
@@ -22,10 +22,13 @@ import org.apache.hudi.DataSourceWriteOptions;
|
|||||||
import org.apache.hudi.SimpleKeyGenerator;
|
import org.apache.hudi.SimpleKeyGenerator;
|
||||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||||
import org.apache.hudi.common.model.HoodieTableType;
|
import org.apache.hudi.common.model.HoodieTableType;
|
||||||
|
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
|
||||||
|
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
import org.apache.hudi.common.table.HoodieTimeline;
|
import org.apache.hudi.common.table.HoodieTimeline;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||||
import org.apache.hudi.common.util.DFSPropertiesConfiguration;
|
import org.apache.hudi.common.util.DFSPropertiesConfiguration;
|
||||||
|
import org.apache.hudi.common.util.FSUtils;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.common.util.TypedProperties;
|
import org.apache.hudi.common.util.TypedProperties;
|
||||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||||
@@ -48,6 +51,7 @@ import org.apache.hudi.utilities.transform.Transformer;
|
|||||||
|
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
@@ -71,6 +75,7 @@ import org.junit.Test;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Properties;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
@@ -79,6 +84,7 @@ import java.util.function.Function;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
import static org.junit.Assert.fail;
|
import static org.junit.Assert.fail;
|
||||||
|
|
||||||
@@ -149,13 +155,11 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
@Before
|
@Before
|
||||||
public void setup() throws Exception {
|
public void setup() throws Exception {
|
||||||
super.setup();
|
super.setup();
|
||||||
TestDataSource.initDataGen();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@After
|
@After
|
||||||
public void teardown() throws Exception {
|
public void teardown() throws Exception {
|
||||||
super.teardown();
|
super.teardown();
|
||||||
TestDataSource.resetDataGen();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static class TestHelpers {
|
static class TestHelpers {
|
||||||
@@ -174,15 +178,17 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
|
|
||||||
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName,
|
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName,
|
||||||
String propsFilename, boolean enableHiveSync) {
|
String propsFilename, boolean enableHiveSync) {
|
||||||
return makeConfig(basePath, op, transformerClassName, propsFilename, enableHiveSync, true);
|
return makeConfig(basePath, op, transformerClassName, propsFilename, enableHiveSync, true,
|
||||||
|
false, null, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName,
|
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName,
|
||||||
String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass) {
|
String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass, boolean updatePayloadClass,
|
||||||
|
String payloadClassName, String storageType) {
|
||||||
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
|
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
|
||||||
cfg.targetBasePath = basePath;
|
cfg.targetBasePath = basePath;
|
||||||
cfg.targetTableName = "hoodie_trips";
|
cfg.targetTableName = "hoodie_trips";
|
||||||
cfg.storageType = "COPY_ON_WRITE";
|
cfg.storageType = storageType == null ? "COPY_ON_WRITE" : storageType;
|
||||||
cfg.sourceClassName = TestDataSource.class.getName();
|
cfg.sourceClassName = TestDataSource.class.getName();
|
||||||
cfg.transformerClassName = transformerClassName;
|
cfg.transformerClassName = transformerClassName;
|
||||||
cfg.operation = op;
|
cfg.operation = op;
|
||||||
@@ -190,6 +196,9 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
cfg.sourceOrderingField = "timestamp";
|
cfg.sourceOrderingField = "timestamp";
|
||||||
cfg.propsFilePath = dfsBasePath + "/" + propsFilename;
|
cfg.propsFilePath = dfsBasePath + "/" + propsFilename;
|
||||||
cfg.sourceLimit = 1000;
|
cfg.sourceLimit = 1000;
|
||||||
|
if (updatePayloadClass) {
|
||||||
|
cfg.payloadClassName = payloadClassName;
|
||||||
|
}
|
||||||
if (useSchemaProviderClass) {
|
if (useSchemaProviderClass) {
|
||||||
cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName();
|
cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName();
|
||||||
}
|
}
|
||||||
@@ -491,7 +500,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
String tableBasePath = dfsBasePath + "/test_table";
|
String tableBasePath = dfsBasePath + "/test_table";
|
||||||
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, Operation.BULK_INSERT,
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, Operation.BULK_INSERT,
|
||||||
SqlQueryBasedTransformer.class.getName(), PROPS_FILENAME_TEST_SOURCE, true,
|
SqlQueryBasedTransformer.class.getName(), PROPS_FILENAME_TEST_SOURCE, true,
|
||||||
false);
|
false, false, null, null);
|
||||||
try {
|
try {
|
||||||
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
||||||
fail("Should error out when schema provider is not provided");
|
fail("Should error out when schema provider is not provided");
|
||||||
@@ -501,6 +510,58 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPayloadClassUpdate() throws Exception {
|
||||||
|
String dataSetBasePath = dfsBasePath + "/test_dataset_mor";
|
||||||
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(dataSetBasePath, Operation.BULK_INSERT,
|
||||||
|
SqlQueryBasedTransformer.class.getName(), PROPS_FILENAME_TEST_SOURCE, true,
|
||||||
|
true, false, null, "MERGE_ON_READ");
|
||||||
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
||||||
|
TestHelpers.assertRecordCount(1000, dataSetBasePath + "/*/*.parquet", sqlContext);
|
||||||
|
|
||||||
|
//now create one more deltaStreamer instance and update payload class
|
||||||
|
cfg = TestHelpers.makeConfig(dataSetBasePath, Operation.BULK_INSERT,
|
||||||
|
SqlQueryBasedTransformer.class.getName(), PROPS_FILENAME_TEST_SOURCE, true,
|
||||||
|
true, true, DummyAvroPayload.class.getName(), "MERGE_ON_READ");
|
||||||
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf());
|
||||||
|
|
||||||
|
//now assert that hoodie.properties file now has updated payload class name
|
||||||
|
Properties props = new Properties();
|
||||||
|
String metaPath = dataSetBasePath + "/.hoodie/hoodie.properties";
|
||||||
|
FileSystem fs = FSUtils.getFs(cfg.targetBasePath, jsc.hadoopConfiguration());
|
||||||
|
try (FSDataInputStream inputStream = fs.open(new Path(metaPath))) {
|
||||||
|
props.load(inputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(props.getProperty(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME), DummyAvroPayload.class.getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPayloadClassUpdateWithCOWTable() throws Exception {
|
||||||
|
String dataSetBasePath = dfsBasePath + "/test_dataset_cow";
|
||||||
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(dataSetBasePath, Operation.BULK_INSERT,
|
||||||
|
SqlQueryBasedTransformer.class.getName(), PROPS_FILENAME_TEST_SOURCE, true,
|
||||||
|
true, false, null, null);
|
||||||
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
||||||
|
TestHelpers.assertRecordCount(1000, dataSetBasePath + "/*/*.parquet", sqlContext);
|
||||||
|
|
||||||
|
//now create one more deltaStreamer instance and update payload class
|
||||||
|
cfg = TestHelpers.makeConfig(dataSetBasePath, Operation.BULK_INSERT,
|
||||||
|
SqlQueryBasedTransformer.class.getName(), PROPS_FILENAME_TEST_SOURCE, true,
|
||||||
|
true, true, DummyAvroPayload.class.getName(), null);
|
||||||
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf());
|
||||||
|
|
||||||
|
//now assert that hoodie.properties file does not have payload class prop since it is a COW table
|
||||||
|
Properties props = new Properties();
|
||||||
|
String metaPath = dataSetBasePath + "/.hoodie/hoodie.properties";
|
||||||
|
FileSystem fs = FSUtils.getFs(cfg.targetBasePath, jsc.hadoopConfiguration());
|
||||||
|
try (FSDataInputStream inputStream = fs.open(new Path(metaPath))) {
|
||||||
|
props.load(inputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
assertFalse(props.containsKey(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFilterDupes() throws Exception {
|
public void testFilterDupes() throws Exception {
|
||||||
String tableBasePath = dfsBasePath + "/test_dupes_table";
|
String tableBasePath = dfsBasePath + "/test_dupes_table";
|
||||||
@@ -535,14 +596,14 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
ds2.sync();
|
ds2.sync();
|
||||||
mClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), tableBasePath, true);
|
mClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), tableBasePath, true);
|
||||||
HoodieInstant newLastFinished = mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get();
|
HoodieInstant newLastFinished = mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get();
|
||||||
Assert.assertTrue(HoodieTimeline.compareTimestamps(newLastFinished.getTimestamp(), lastFinished.getTimestamp(),
|
assertTrue(HoodieTimeline.compareTimestamps(newLastFinished.getTimestamp(), lastFinished.getTimestamp(),
|
||||||
HoodieTimeline.GREATER));
|
HoodieTimeline.GREATER));
|
||||||
|
|
||||||
// Ensure it is empty
|
// Ensure it is empty
|
||||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
||||||
.fromBytes(mClient.getActiveTimeline().getInstantDetails(newLastFinished).get(), HoodieCommitMetadata.class);
|
.fromBytes(mClient.getActiveTimeline().getInstantDetails(newLastFinished).get(), HoodieCommitMetadata.class);
|
||||||
System.out.println("New Commit Metadata=" + commitMetadata);
|
System.out.println("New Commit Metadata=" + commitMetadata);
|
||||||
Assert.assertTrue(commitMetadata.getPartitionToWriteStats().isEmpty());
|
assertTrue(commitMetadata.getPartitionToWriteStats().isEmpty());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -598,6 +659,13 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static class DummyAvroPayload extends OverwriteWithLatestAvroPayload {
|
||||||
|
|
||||||
|
public DummyAvroPayload(GenericRecord gr, Comparable orderingVal) {
|
||||||
|
super(gr, orderingVal);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return empty table.
|
* Return empty table.
|
||||||
*/
|
*/
|
||||||
|
|||||||
Reference in New Issue
Block a user