[HUDI-3730] Improve meta sync class design and hierarchies (#5854)
* [HUDI-3730] Improve meta sync class design and hierarchies (#5754) * Implements class design proposed in RFC-55 Co-authored-by: jian.feng <fengjian428@gmial.com> Co-authored-by: jian.feng <jian.feng@shopee.com>
This commit is contained in:
@@ -1,276 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.TableSchemaResolver;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.TimelineUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
public abstract class AbstractSyncHoodieClient implements AutoCloseable {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(AbstractSyncHoodieClient.class);
|
||||
|
||||
public static final String HOODIE_LAST_COMMIT_TIME_SYNC = "last_commit_time_sync";
|
||||
public static final TypeConverter TYPE_CONVERTOR = new TypeConverter() {};
|
||||
|
||||
protected final HoodieTableMetaClient metaClient;
|
||||
protected final HoodieTableType tableType;
|
||||
protected final FileSystem fs;
|
||||
private final String basePath;
|
||||
private final boolean assumeDatePartitioning;
|
||||
private final boolean useFileListingFromMetadata;
|
||||
private final boolean withOperationField;
|
||||
|
||||
@Deprecated
|
||||
public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata,
|
||||
boolean verifyMetadataFileListing, boolean withOperationField, FileSystem fs) {
|
||||
this(basePath, assumeDatePartitioning, useFileListingFromMetadata, withOperationField, fs);
|
||||
}
|
||||
|
||||
public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata,
|
||||
boolean withOperationField, FileSystem fs) {
|
||||
this.metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
|
||||
this.tableType = metaClient.getTableType();
|
||||
this.basePath = basePath;
|
||||
this.assumeDatePartitioning = assumeDatePartitioning;
|
||||
this.useFileListingFromMetadata = useFileListingFromMetadata;
|
||||
this.withOperationField = withOperationField;
|
||||
this.fs = fs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the table.
|
||||
* @param tableName The table name.
|
||||
* @param storageSchema The table schema.
|
||||
* @param inputFormatClass The input format class of this table.
|
||||
* @param outputFormatClass The output format class of this table.
|
||||
* @param serdeClass The serde class of this table.
|
||||
* @param serdeProperties The serde properties of this table.
|
||||
* @param tableProperties The table properties for this table.
|
||||
*/
|
||||
public abstract void createTable(String tableName, MessageType storageSchema,
|
||||
String inputFormatClass, String outputFormatClass,
|
||||
String serdeClass, Map<String, String> serdeProperties,
|
||||
Map<String, String> tableProperties);
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #tableExists} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public abstract boolean doesTableExist(String tableName);
|
||||
|
||||
public abstract boolean tableExists(String tableName);
|
||||
|
||||
public abstract Option<String> getLastCommitTimeSynced(String tableName);
|
||||
|
||||
public abstract void updateLastCommitTimeSynced(String tableName);
|
||||
|
||||
public abstract Option<String> getLastReplicatedTime(String tableName);
|
||||
|
||||
public abstract void updateLastReplicatedTimeStamp(String tableName, String timeStamp);
|
||||
|
||||
public abstract void deleteLastReplicatedTimeStamp(String tableName);
|
||||
|
||||
public abstract void addPartitionsToTable(String tableName, List<String> partitionsToAdd);
|
||||
|
||||
public abstract void updatePartitionsToTable(String tableName, List<String> changedPartitions);
|
||||
|
||||
public abstract void dropPartitions(String tableName, List<String> partitionsToDrop);
|
||||
|
||||
public void updateTableProperties(String tableName, Map<String, String> tableProperties) {}
|
||||
|
||||
public abstract Map<String, String> getTableSchema(String tableName);
|
||||
|
||||
public HoodieTableType getTableType() {
|
||||
return tableType;
|
||||
}
|
||||
|
||||
public String getBasePath() {
|
||||
return metaClient.getBasePath();
|
||||
}
|
||||
|
||||
public FileSystem getFs() {
|
||||
return fs;
|
||||
}
|
||||
|
||||
public boolean isBootstrap() {
|
||||
return metaClient.getTableConfig().getBootstrapBasePath().isPresent();
|
||||
}
|
||||
|
||||
public void closeQuietly(ResultSet resultSet, Statement stmt) {
|
||||
try {
|
||||
if (stmt != null) {
|
||||
stmt.close();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("Could not close the statement opened ", e);
|
||||
}
|
||||
|
||||
try {
|
||||
if (resultSet != null) {
|
||||
resultSet.close();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("Could not close the resultset opened ", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the schema for a hoodie table. Depending on the type of table, try to read schema from commit metadata if
|
||||
* present, else fallback to reading from any file written in the latest commit. We will assume that the schema has
|
||||
* not changed within a single atomic write.
|
||||
*
|
||||
* @return Parquet schema for this table
|
||||
*/
|
||||
public MessageType getDataSchema() {
|
||||
try {
|
||||
return new TableSchemaResolver(metaClient).getTableParquetSchema();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieSyncException("Failed to read data schema", e);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isDropPartition() {
|
||||
try {
|
||||
Option<HoodieCommitMetadata> hoodieCommitMetadata = HoodieTableMetadataUtil.getLatestCommitMetadata(metaClient);
|
||||
|
||||
if (hoodieCommitMetadata.isPresent()
|
||||
&& WriteOperationType.DELETE_PARTITION.equals(hoodieCommitMetadata.get().getOperationType())) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieSyncException("Failed to get commit metadata", e);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
|
||||
public List<String> getPartitionsWrittenToSince(Option<String> lastCommitTimeSynced) {
|
||||
if (!lastCommitTimeSynced.isPresent()) {
|
||||
LOG.info("Last commit time synced is not known, listing all partitions in " + basePath + ",FS :" + fs);
|
||||
HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
|
||||
return FSUtils.getAllPartitionPaths(engineContext, basePath, useFileListingFromMetadata, assumeDatePartitioning);
|
||||
} else {
|
||||
LOG.info("Last commit time synced is " + lastCommitTimeSynced.get() + ", Getting commits since then");
|
||||
return TimelineUtils.getPartitionsWritten(metaClient.getActiveTimeline().getCommitsTimeline()
|
||||
.findInstantsAfter(lastCommitTimeSynced.get(), Integer.MAX_VALUE));
|
||||
}
|
||||
}
|
||||
|
||||
public abstract static class TypeConverter implements Serializable {
|
||||
|
||||
static final String DEFAULT_TARGET_TYPE = "DECIMAL";
|
||||
|
||||
protected String targetType;
|
||||
|
||||
public TypeConverter() {
|
||||
this.targetType = DEFAULT_TARGET_TYPE;
|
||||
}
|
||||
|
||||
public TypeConverter(String targetType) {
|
||||
ValidationUtils.checkArgument(Objects.nonNull(targetType));
|
||||
this.targetType = targetType;
|
||||
}
|
||||
|
||||
public void doConvert(ResultSet resultSet, Map<String, String> schema) throws SQLException {
|
||||
schema.put(getColumnName(resultSet), targetType.equalsIgnoreCase(getColumnType(resultSet))
|
||||
? convert(resultSet) : getColumnType(resultSet));
|
||||
}
|
||||
|
||||
public String convert(ResultSet resultSet) throws SQLException {
|
||||
String columnType = getColumnType(resultSet);
|
||||
int columnSize = resultSet.getInt("COLUMN_SIZE");
|
||||
int decimalDigits = resultSet.getInt("DECIMAL_DIGITS");
|
||||
return columnType + String.format("(%s,%s)", columnSize, decimalDigits);
|
||||
}
|
||||
|
||||
public String getColumnName(ResultSet resultSet) throws SQLException {
|
||||
return resultSet.getString(4);
|
||||
}
|
||||
|
||||
public String getColumnType(ResultSet resultSet) throws SQLException {
|
||||
return resultSet.getString(6);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the schema from the log file on path.
|
||||
*/
|
||||
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
|
||||
private MessageType readSchemaFromLogFile(Option<HoodieInstant> lastCompactionCommitOpt, Path path) throws Exception {
|
||||
MessageType messageType = TableSchemaResolver.readSchemaFromLogFile(fs, path);
|
||||
// Fall back to read the schema from last compaction
|
||||
if (messageType == null) {
|
||||
LOG.info("Falling back to read the schema from last compaction " + lastCompactionCommitOpt);
|
||||
return new TableSchemaResolver(this.metaClient).readSchemaFromLastCompaction(lastCompactionCommitOpt);
|
||||
}
|
||||
return messageType;
|
||||
}
|
||||
|
||||
/**
|
||||
* Partition Event captures any partition that needs to be added or updated.
|
||||
*/
|
||||
public static class PartitionEvent {
|
||||
|
||||
public enum PartitionEventType {
|
||||
ADD, UPDATE, DROP
|
||||
}
|
||||
|
||||
public PartitionEventType eventType;
|
||||
public String storagePartition;
|
||||
|
||||
PartitionEvent(PartitionEventType eventType, String storagePartition) {
|
||||
this.eventType = eventType;
|
||||
this.storagePartition = storagePartition;
|
||||
}
|
||||
|
||||
public static PartitionEvent newPartitionAddEvent(String storagePartition) {
|
||||
return new PartitionEvent(PartitionEventType.ADD, storagePartition);
|
||||
}
|
||||
|
||||
public static PartitionEvent newPartitionUpdateEvent(String storagePartition) {
|
||||
return new PartitionEvent(PartitionEventType.UPDATE, storagePartition);
|
||||
}
|
||||
|
||||
public static PartitionEvent newPartitionDropEvent(String storagePartition) {
|
||||
return new PartitionEvent(PartitionEventType.DROP, storagePartition);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.sync.common.model.FieldSchema;
|
||||
import org.apache.hudi.sync.common.model.Partition;
|
||||
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public interface HoodieMetaSyncOperations {
|
||||
|
||||
String HOODIE_LAST_COMMIT_TIME_SYNC = "last_commit_time_sync";
|
||||
|
||||
/**
|
||||
* Create the table.
|
||||
*
|
||||
* @param tableName The table name.
|
||||
* @param storageSchema The table schema.
|
||||
* @param inputFormatClass The input format class of this table.
|
||||
* @param outputFormatClass The output format class of this table.
|
||||
* @param serdeClass The serde class of this table.
|
||||
* @param serdeProperties The serde properties of this table.
|
||||
* @param tableProperties The table properties for this table.
|
||||
*/
|
||||
default void createTable(String tableName,
|
||||
MessageType storageSchema,
|
||||
String inputFormatClass,
|
||||
String outputFormatClass,
|
||||
String serdeClass,
|
||||
Map<String, String> serdeProperties,
|
||||
Map<String, String> tableProperties) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if table exists in metastore.
|
||||
*/
|
||||
default boolean tableExists(String tableName) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Drop table from metastore.
|
||||
*/
|
||||
default void dropTable(String tableName) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Add partitions to the table in metastore.
|
||||
*/
|
||||
default void addPartitionsToTable(String tableName, List<String> partitionsToAdd) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Update partitions to the table in metastore.
|
||||
*/
|
||||
default void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Drop partitions from the table in metastore.
|
||||
*/
|
||||
default void dropPartitions(String tableName, List<String> partitionsToDrop) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all partitions for the table in the metastore.
|
||||
*/
|
||||
default List<Partition> getAllPartitions(String tableName) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a database already exists in the metastore.
|
||||
*/
|
||||
default boolean databaseExists(String databaseName) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a database in the metastore.
|
||||
*/
|
||||
default void createDatabase(String databaseName) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the schema from metastore.
|
||||
*/
|
||||
default Map<String, String> getMetastoreSchema(String tableName) {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the schema from the Hudi table on storage.
|
||||
*/
|
||||
default MessageType getStorageSchema() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update schema for the table in the metastore.
|
||||
*/
|
||||
default void updateTableSchema(String tableName, MessageType newSchema) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of field schemas from metastore.
|
||||
*/
|
||||
default List<FieldSchema> getMetastoreFieldSchemas(String tableName) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of field schema from the Hudi table on storage.
|
||||
*/
|
||||
default List<FieldSchema> getStorageFieldSchemas() {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the field comments for table in metastore, by using the ones from storage.
|
||||
*/
|
||||
default void updateTableComments(String tableName, List<FieldSchema> fromMetastore, List<FieldSchema> fromStorage) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the timestamp of last sync.
|
||||
*/
|
||||
default Option<String> getLastCommitTimeSynced(String tableName) {
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the timestamp of last sync.
|
||||
*/
|
||||
default void updateLastCommitTimeSynced(String tableName) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the table properties in metastore.
|
||||
*/
|
||||
default void updateTableProperties(String tableName, Map<String, String> tableProperties) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the timestamp of last replication.
|
||||
*/
|
||||
default Option<String> getLastReplicatedTime(String tableName) {
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the timestamp of last replication.
|
||||
*/
|
||||
default void updateLastReplicatedTimeStamp(String tableName, String timeStamp) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete the timestamp of last replication.
|
||||
*/
|
||||
default void deleteLastReplicatedTimeStamp(String tableName) {
|
||||
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,161 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
|
||||
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.TableSchemaResolver;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.table.timeline.TimelineUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
|
||||
import org.apache.hudi.sync.common.model.Partition;
|
||||
import org.apache.hudi.sync.common.model.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.model.PartitionValueExtractor;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
|
||||
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_USE_FILE_LISTING_FROM_METADATA;
|
||||
|
||||
public abstract class HoodieSyncClient implements HoodieMetaSyncOperations, AutoCloseable {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieSyncClient.class);
|
||||
|
||||
protected final HoodieSyncConfig config;
|
||||
protected final PartitionValueExtractor partitionValueExtractor;
|
||||
protected final HoodieTableMetaClient metaClient;
|
||||
|
||||
public HoodieSyncClient(HoodieSyncConfig config) {
|
||||
this.config = config;
|
||||
this.partitionValueExtractor = ReflectionUtils.loadClass(config.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS));
|
||||
this.metaClient = HoodieTableMetaClient.builder()
|
||||
.setConf(config.getHadoopConf())
|
||||
.setBasePath(config.getString(META_SYNC_BASE_PATH))
|
||||
.setLoadActiveTimelineOnLoad(true)
|
||||
.build();
|
||||
}
|
||||
|
||||
public HoodieTimeline getActiveTimeline() {
|
||||
return metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
public HoodieTableType getTableType() {
|
||||
return metaClient.getTableType();
|
||||
}
|
||||
|
||||
public String getBasePath() {
|
||||
return metaClient.getBasePathV2().toString();
|
||||
}
|
||||
|
||||
public boolean isBootstrap() {
|
||||
return metaClient.getTableConfig().getBootstrapBasePath().isPresent();
|
||||
}
|
||||
|
||||
public boolean isDropPartition() {
|
||||
try {
|
||||
Option<HoodieCommitMetadata> hoodieCommitMetadata = HoodieTableMetadataUtil.getLatestCommitMetadata(metaClient);
|
||||
|
||||
if (hoodieCommitMetadata.isPresent()
|
||||
&& WriteOperationType.DELETE_PARTITION.equals(hoodieCommitMetadata.get().getOperationType())) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieSyncException("Failed to get commit metadata", e);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MessageType getStorageSchema() {
|
||||
try {
|
||||
return new TableSchemaResolver(metaClient).getTableParquetSchema();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieSyncException("Failed to read schema from storage.", e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> getPartitionsWrittenToSince(Option<String> lastCommitTimeSynced) {
|
||||
if (!lastCommitTimeSynced.isPresent()) {
|
||||
LOG.info("Last commit time synced is not known, listing all partitions in "
|
||||
+ config.getString(META_SYNC_BASE_PATH)
|
||||
+ ",FS :" + config.getHadoopFileSystem());
|
||||
HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
|
||||
return FSUtils.getAllPartitionPaths(engineContext,
|
||||
config.getString(META_SYNC_BASE_PATH),
|
||||
config.getBoolean(META_SYNC_USE_FILE_LISTING_FROM_METADATA),
|
||||
config.getBoolean(META_SYNC_ASSUME_DATE_PARTITION));
|
||||
} else {
|
||||
LOG.info("Last commit time synced is " + lastCommitTimeSynced.get() + ", Getting commits since then");
|
||||
return TimelineUtils.getPartitionsWritten(metaClient.getActiveTimeline().getCommitsTimeline()
|
||||
.findInstantsAfter(lastCommitTimeSynced.get(), Integer.MAX_VALUE));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over the storage partitions and find if there are any new partitions that need to be added or updated.
|
||||
* Generate a list of PartitionEvent based on the changes required.
|
||||
*/
|
||||
public List<PartitionEvent> getPartitionEvents(List<Partition> tablePartitions, List<String> partitionStoragePartitions, boolean isDropPartition) {
|
||||
Map<String, String> paths = new HashMap<>();
|
||||
for (Partition tablePartition : tablePartitions) {
|
||||
List<String> hivePartitionValues = tablePartition.getValues();
|
||||
String fullTablePartitionPath =
|
||||
Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getStorageLocation())).toUri().getPath();
|
||||
paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath);
|
||||
}
|
||||
|
||||
List<PartitionEvent> events = new ArrayList<>();
|
||||
for (String storagePartition : partitionStoragePartitions) {
|
||||
Path storagePartitionPath = FSUtils.getPartitionPath(config.getString(META_SYNC_BASE_PATH), storagePartition);
|
||||
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
// Check if the partition values or if hdfs path is the same
|
||||
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
|
||||
|
||||
if (isDropPartition) {
|
||||
events.add(PartitionEvent.newPartitionDropEvent(storagePartition));
|
||||
} else {
|
||||
if (!storagePartitionValues.isEmpty()) {
|
||||
String storageValue = String.join(", ", storagePartitionValues);
|
||||
if (!paths.containsKey(storageValue)) {
|
||||
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
|
||||
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
|
||||
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
}
|
||||
@@ -22,14 +22,19 @@ import org.apache.hudi.common.config.ConfigProperty;
|
||||
import org.apache.hudi.common.config.HoodieConfig;
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
|
||||
@@ -37,41 +42,6 @@ import java.util.function.Function;
|
||||
*/
|
||||
public class HoodieSyncConfig extends HoodieConfig {
|
||||
|
||||
@Parameter(names = {"--database"}, description = "name of the target database in meta store", required = true)
|
||||
public String databaseName;
|
||||
|
||||
@Parameter(names = {"--table"}, description = "name of the target table in meta store", required = true)
|
||||
public String tableName;
|
||||
|
||||
@Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
|
||||
public String basePath;
|
||||
|
||||
@Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
|
||||
public String baseFileFormat;
|
||||
|
||||
@Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
|
||||
public List<String> partitionFields;
|
||||
|
||||
@Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
|
||||
+ "to extract the partition values from HDFS path")
|
||||
public String partitionValueExtractorClass;
|
||||
|
||||
@Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
|
||||
+ " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
|
||||
public Boolean assumeDatePartitioning;
|
||||
|
||||
@Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing")
|
||||
public Boolean decodePartition;
|
||||
|
||||
@Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
|
||||
public Boolean useFileListingFromMetadata;
|
||||
|
||||
@Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
|
||||
public Boolean isConditionalSync;
|
||||
|
||||
@Parameter(names = {"--spark-version"}, description = "The spark version")
|
||||
public String sparkVersion;
|
||||
|
||||
public static final ConfigProperty<String> META_SYNC_BASE_PATH = ConfigProperty
|
||||
.key("hoodie.datasource.meta.sync.base.path")
|
||||
.defaultValue("")
|
||||
@@ -150,6 +120,11 @@ public class HoodieSyncConfig extends HoodieConfig {
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Assume partitioning is yyyy/mm/dd");
|
||||
|
||||
public static final ConfigProperty<Boolean> META_SYNC_DECODE_PARTITION = ConfigProperty
|
||||
.key("hoodie.meta.sync.decode_partition")
|
||||
.defaultValue(false) // TODO infer from url encode option
|
||||
.withDocumentation("");
|
||||
|
||||
public static final ConfigProperty<Boolean> META_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty
|
||||
.key("hoodie.meta.sync.metadata_file_listing")
|
||||
.defaultValue(HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS)
|
||||
@@ -165,24 +140,85 @@ public class HoodieSyncConfig extends HoodieConfig {
|
||||
.defaultValue("")
|
||||
.withDocumentation("The spark version used when syncing with a metastore.");
|
||||
|
||||
public HoodieSyncConfig(TypedProperties props) {
|
||||
super(props);
|
||||
setDefaults();
|
||||
private Configuration hadoopConf;
|
||||
|
||||
this.basePath = getStringOrDefault(META_SYNC_BASE_PATH);
|
||||
this.databaseName = getStringOrDefault(META_SYNC_DATABASE_NAME);
|
||||
this.tableName = getStringOrDefault(META_SYNC_TABLE_NAME);
|
||||
this.baseFileFormat = getStringOrDefault(META_SYNC_BASE_FILE_FORMAT);
|
||||
this.partitionFields = props.getStringList(META_SYNC_PARTITION_FIELDS.key(), ",", Collections.emptyList());
|
||||
this.partitionValueExtractorClass = getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS);
|
||||
this.assumeDatePartitioning = getBooleanOrDefault(META_SYNC_ASSUME_DATE_PARTITION);
|
||||
this.decodePartition = getBooleanOrDefault(KeyGeneratorOptions.URL_ENCODE_PARTITIONING);
|
||||
this.useFileListingFromMetadata = getBooleanOrDefault(META_SYNC_USE_FILE_LISTING_FROM_METADATA);
|
||||
this.isConditionalSync = getBooleanOrDefault(META_SYNC_CONDITIONAL_SYNC);
|
||||
this.sparkVersion = getStringOrDefault(META_SYNC_SPARK_VERSION);
|
||||
public HoodieSyncConfig(Properties props) {
|
||||
this(props, ConfigUtils.createHadoopConf(props));
|
||||
}
|
||||
|
||||
protected void setDefaults() {
|
||||
this.setDefaultValue(META_SYNC_TABLE_NAME);
|
||||
public HoodieSyncConfig(Properties props, Configuration hadoopConf) {
|
||||
super(props);
|
||||
this.hadoopConf = hadoopConf;
|
||||
}
|
||||
|
||||
public void setHadoopConf(Configuration hadoopConf) {
|
||||
this.hadoopConf = hadoopConf;
|
||||
}
|
||||
|
||||
public Configuration getHadoopConf() {
|
||||
return hadoopConf;
|
||||
}
|
||||
|
||||
public FileSystem getHadoopFileSystem() {
|
||||
return FSUtils.getFs(getString(META_SYNC_BASE_PATH), getHadoopConf());
|
||||
}
|
||||
|
||||
public String getAbsoluteBasePath() {
|
||||
return getString(META_SYNC_BASE_PATH);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return props.toString();
|
||||
}
|
||||
|
||||
/**
 * Command-line parameter holder (JCommander {@code @Parameter}) mirroring the
 * META_SYNC_* configuration options. After parsing, call {@link #toProps()} to
 * convert the arguments into {@link TypedProperties} for {@code HoodieSyncConfig}.
 */
public static class HoodieSyncConfigParams {
  @Parameter(names = {"--database"}, description = "name of the target database in meta store", required = true)
  public String databaseName;
  @Parameter(names = {"--table"}, description = "name of the target table in meta store", required = true)
  public String tableName;
  @Parameter(names = {"--base-path"}, description = "Base path of the hoodie table to sync", required = true)
  public String basePath;
  @Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
  public String baseFileFormat;
  @Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
  public List<String> partitionFields;
  @Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
      + "to extract the partition values from HDFS path")
  public String partitionValueExtractorClass;
  @Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
      + " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
  public Boolean assumeDatePartitioning;
  @Parameter(names = {"--decode-partition"}, description = "Decode the partition value if the partition has encoded during writing")
  public Boolean decodePartition;
  @Parameter(names = {"--use-file-listing-from-metadata"}, description = "Fetch file listing from Hudi's metadata")
  public Boolean useFileListingFromMetadata;
  @Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
  public Boolean isConditionalSync;
  @Parameter(names = {"--spark-version"}, description = "The spark version")
  public String sparkVersion;

  @Parameter(names = {"--help", "-h"}, help = true)
  public boolean help = false;

  /** @return true when the user asked for usage help ({@code --help}/{@code -h}). */
  public boolean isHelp() {
    return help;
  }

  /**
   * Converts the parsed command-line arguments into typed properties keyed by
   * the META_SYNC_* config names. Unset (null) parameters are skipped so that
   * config defaults still apply downstream.
   */
  public TypedProperties toProps() {
    final TypedProperties props = new TypedProperties();
    props.setPropertyIfNonNull(META_SYNC_BASE_PATH.key(), basePath);
    props.setPropertyIfNonNull(META_SYNC_DATABASE_NAME.key(), databaseName);
    props.setPropertyIfNonNull(META_SYNC_TABLE_NAME.key(), tableName);
    props.setPropertyIfNonNull(META_SYNC_BASE_FILE_FORMAT.key(), baseFileFormat);
    // Partition fields are serialized back into a comma-separated string.
    props.setPropertyIfNonNull(META_SYNC_PARTITION_FIELDS.key(), StringUtils.join(",", partitionFields));
    props.setPropertyIfNonNull(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), partitionValueExtractorClass);
    props.setPropertyIfNonNull(META_SYNC_ASSUME_DATE_PARTITION.key(), assumeDatePartitioning);
    props.setPropertyIfNonNull(META_SYNC_DECODE_PARTITION.key(), decodePartition);
    props.setPropertyIfNonNull(META_SYNC_USE_FILE_LISTING_FROM_METADATA.key(), useFileListingFromMetadata);
    props.setPropertyIfNonNull(META_SYNC_CONDITIONAL_SYNC.key(), isConditionalSync);
    props.setPropertyIfNonNull(META_SYNC_SPARK_VERSION.key(), sparkVersion);
    return props;
  }
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* Base class to sync metadata with metastores to make
|
||||
* Hudi table queryable through external systems.
|
||||
*/
|
||||
public abstract class HoodieSyncTool implements AutoCloseable {
|
||||
|
||||
protected Properties props;
|
||||
protected Configuration hadoopConf;
|
||||
|
||||
public HoodieSyncTool(Properties props) {
|
||||
this(props, ConfigUtils.createHadoopConf(props));
|
||||
}
|
||||
|
||||
public HoodieSyncTool(Properties props, Configuration hadoopConf) {
|
||||
this.props = props;
|
||||
this.hadoopConf = hadoopConf;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public HoodieSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
this(props, conf);
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public HoodieSyncTool(Properties props, FileSystem fileSystem) {
|
||||
this(props, fileSystem.getConf());
|
||||
}
|
||||
|
||||
public abstract void syncHoodieTable();
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
// no op
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common.model;
|
||||
|
||||
import org.apache.hudi.common.util.Option;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public class FieldSchema {
|
||||
|
||||
private final String name;
|
||||
private String type;
|
||||
private Option<String> comment;
|
||||
|
||||
public FieldSchema(String name, String type) {
|
||||
this(name, type, Option.empty());
|
||||
}
|
||||
|
||||
public FieldSchema(String name, String type, String comment) {
|
||||
this(name, type, Option.ofNullable(comment));
|
||||
}
|
||||
|
||||
public FieldSchema(String name, String type, Option<String> comment) {
|
||||
this.name = name;
|
||||
this.type = type;
|
||||
this.comment = comment;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public Option<String> getComment() {
|
||||
return comment;
|
||||
}
|
||||
|
||||
public String getCommentOrEmpty() {
|
||||
return comment.orElse("");
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public void setComment(Option<String> comment) {
|
||||
this.comment = comment;
|
||||
}
|
||||
|
||||
public void setComment(String comment) {
|
||||
this.comment = Option.ofNullable(comment);
|
||||
}
|
||||
|
||||
public boolean updateComment(FieldSchema another) {
|
||||
if (Objects.equals(name, another.getName())
|
||||
&& !Objects.equals(getCommentOrEmpty(), another.getCommentOrEmpty())) {
|
||||
setComment(another.getComment());
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common.model;
|
||||
|
||||
/**
|
||||
* Partition Event captures any partition that needs to be added or updated.
|
||||
*/
|
||||
public class PartitionEvent {

  /** Kind of change to apply to a storage partition in the metastore. */
  public enum PartitionEventType {
    ADD, UPDATE, DROP
  }

  public PartitionEventType eventType;
  public String storagePartition;

  PartitionEvent(PartitionEventType type, String partition) {
    this.eventType = type;
    this.storagePartition = partition;
  }

  /** @return an event signalling that the given storage partition should be added. */
  public static PartitionEvent newPartitionAddEvent(String storagePartition) {
    return new PartitionEvent(PartitionEventType.ADD, storagePartition);
  }

  /** @return an event signalling that the given storage partition should be updated. */
  public static PartitionEvent newPartitionUpdateEvent(String storagePartition) {
    return new PartitionEvent(PartitionEventType.UPDATE, storagePartition);
  }

  /** @return an event signalling that the given storage partition should be dropped. */
  public static PartitionEvent newPartitionDropEvent(String storagePartition) {
    return new PartitionEvent(PartitionEventType.DROP, storagePartition);
  }
}
|
||||
@@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not straight forward and
|
||||
* requires a pluggable implementation to extract the partition value from HDFS path.
|
||||
* <p>
|
||||
* e.g. Hive table partitioned by datestr=yyyy-mm-dd and hdfs path /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd]
|
||||
*/
|
||||
public interface PartitionValueExtractor extends Serializable {

  /**
   * Extracts the partition values encoded in the given storage partition path.
   *
   * @param partitionPath the partition path to parse.
   * @return the partition values extracted from the path.
   */
  List<String> extractPartitionValuesInPath(String partitionPath);
}
|
||||
@@ -18,9 +18,13 @@
|
||||
|
||||
package org.apache.hudi.sync.common.util;
|
||||
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import java.util.Properties;
|
||||
|
||||
public class ConfigUtils {
|
||||
/**
|
||||
@@ -32,6 +36,7 @@ public class ConfigUtils {
|
||||
/**
|
||||
* Convert the key-value config to a map.The format of the config
|
||||
* is a key-value pair just like "k1=v1\nk2=v2\nk3=v3".
|
||||
*
|
||||
* @param keyValueConfig
|
||||
* @return
|
||||
*/
|
||||
@@ -49,7 +54,7 @@ public class ConfigUtils {
|
||||
tableProperties.put(key, value);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Bad key-value config: " + keyValue + ", must be the"
|
||||
+ " format 'key = value'");
|
||||
+ " format 'key = value'");
|
||||
}
|
||||
}
|
||||
return tableProperties;
|
||||
@@ -58,6 +63,7 @@ public class ConfigUtils {
|
||||
/**
|
||||
* Convert map config to key-value string.The format of the config
|
||||
* is a key-value pair just like "k1=v1\nk2=v2\nk3=v3".
|
||||
*
|
||||
* @param config
|
||||
* @return
|
||||
*/
|
||||
@@ -75,4 +81,10 @@ public class ConfigUtils {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static Configuration createHadoopConf(Properties props) {
|
||||
Configuration hadoopConf = new Configuration();
|
||||
props.stringPropertyNames().forEach(k -> hadoopConf.set(k, props.getProperty(k)));
|
||||
return hadoopConf;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
@@ -15,15 +16,10 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sync.common;
|
||||
package org.apache.hudi.sync.common.util;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.sync.common.util.ConfigUtils;
|
||||
import org.apache.hudi.sync.common.util.Parquet2SparkSchemaUtils;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.parquet.schema.GroupType;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.parquet.schema.PrimitiveType;
|
||||
@@ -33,40 +29,18 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.apache.parquet.schema.OriginalType.UTF8;
|
||||
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
|
||||
|
||||
/**
|
||||
* Base class to sync Hudi meta data with Metastores to make
|
||||
* Hudi table queryable through external systems.
|
||||
*/
|
||||
public abstract class AbstractSyncTool {
|
||||
protected final Configuration conf;
|
||||
protected final FileSystem fs;
|
||||
protected TypedProperties props;
|
||||
|
||||
public AbstractSyncTool(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
this.props = props;
|
||||
this.conf = conf;
|
||||
this.fs = fs;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public AbstractSyncTool(Properties props, FileSystem fileSystem) {
|
||||
this(new TypedProperties(props), fileSystem.getConf(), fileSystem);
|
||||
}
|
||||
|
||||
public abstract void syncHoodieTable();
|
||||
|
||||
public class SparkDataSourceTableUtils {
|
||||
/**
|
||||
* Get Spark Sql related table properties. This is used for spark datasource table.
|
||||
* @param schema The schema to write to the table.
|
||||
* @return A new parameters added the spark's table properties.
|
||||
*/
|
||||
protected Map<String, String> getSparkTableProperties(List<String> partitionNames, String sparkVersion,
|
||||
int schemaLengthThreshold, MessageType schema) {
|
||||
public static Map<String, String> getSparkTableProperties(List<String> partitionNames, String sparkVersion,
|
||||
int schemaLengthThreshold, MessageType schema) {
|
||||
// Convert the schema and partition info used by spark sql to hive table properties.
|
||||
// The following code refers to the spark code in
|
||||
// https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
|
||||
@@ -122,7 +96,7 @@ public abstract class AbstractSyncTool {
|
||||
return sparkProperties;
|
||||
}
|
||||
|
||||
protected Map<String, String> getSparkSerdeProperties(boolean readAsOptimized, String basePath) {
|
||||
public static Map<String, String> getSparkSerdeProperties(boolean readAsOptimized, String basePath) {
|
||||
Map<String, String> sparkSerdeProperties = new HashMap<>();
|
||||
sparkSerdeProperties.put("path", basePath);
|
||||
sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized));
|
||||
@@ -22,13 +22,11 @@ package org.apache.hudi.sync.common.util;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||
import org.apache.hudi.sync.common.HoodieSyncTool;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
@@ -36,13 +34,12 @@ import java.util.Properties;
|
||||
* Helper class for syncing Hudi commit data with external metastores.
|
||||
*/
|
||||
public class SyncUtilHelpers {
|
||||
private static final Logger LOG = LogManager.getLogger(SyncUtilHelpers.class);
|
||||
|
||||
/**
|
||||
* Create an instance of an implementation of {@link AbstractSyncTool} that will sync all the relevant meta information
|
||||
* Create an instance of an implementation of {@link HoodieSyncTool} that will sync all the relevant meta information
|
||||
* with an external metastore such as Hive etc. to ensure Hoodie tables can be queried or read via external systems.
|
||||
*
|
||||
* @param metaSyncFQCN The class that implements the sync of the metadata.
|
||||
* @param metaSyncFQCN The class that implements the sync of the metadata.
|
||||
* @param props property map.
|
||||
* @param hadoopConfig Hadoop confs.
|
||||
* @param fs Filesystem used.
|
||||
@@ -62,30 +59,40 @@ public class SyncUtilHelpers {
|
||||
}
|
||||
}
|
||||
|
||||
static AbstractSyncTool instantiateMetaSyncTool(String metaSyncFQCN,
|
||||
TypedProperties props,
|
||||
Configuration hadoopConfig,
|
||||
FileSystem fs,
|
||||
String targetBasePath,
|
||||
String baseFileFormat) {
|
||||
static HoodieSyncTool instantiateMetaSyncTool(String metaSyncFQCN,
|
||||
TypedProperties props,
|
||||
Configuration hadoopConfig,
|
||||
FileSystem fs,
|
||||
String targetBasePath,
|
||||
String baseFileFormat) {
|
||||
TypedProperties properties = new TypedProperties();
|
||||
properties.putAll(props);
|
||||
properties.put(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), targetBasePath);
|
||||
properties.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), baseFileFormat);
|
||||
|
||||
if (ReflectionUtils.hasConstructor(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, Configuration.class})) {
|
||||
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, Configuration.class},
|
||||
properties, hadoopConfig));
|
||||
} else if (ReflectionUtils.hasConstructor(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class})) {
|
||||
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class},
|
||||
properties));
|
||||
} else if (ReflectionUtils.hasConstructor(metaSyncFQCN,
|
||||
new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class})) {
|
||||
return ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {TypedProperties.class, Configuration.class, FileSystem.class},
|
||||
properties, hadoopConfig, fs));
|
||||
} else if (ReflectionUtils.hasConstructor(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, FileSystem.class})) {
|
||||
return ((HoodieSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, FileSystem.class},
|
||||
properties, fs));
|
||||
} else {
|
||||
LOG.warn("Falling back to deprecated constructor for class: " + metaSyncFQCN);
|
||||
try {
|
||||
return ((AbstractSyncTool) ReflectionUtils.loadClass(metaSyncFQCN,
|
||||
new Class<?>[] {Properties.class, FileSystem.class}, properties, fs));
|
||||
} catch (Throwable t) {
|
||||
throw new HoodieException("Could not load meta sync class " + metaSyncFQCN, t);
|
||||
}
|
||||
throw new HoodieException("Could not load meta sync class " + metaSyncFQCN
|
||||
+ ": no valid constructor found.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,16 +20,19 @@ package org.apache.hudi.sync.common.util;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.sync.common.HoodieSyncTool;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.ValueSource;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
@@ -46,42 +49,44 @@ public class TestSyncUtilHelpers {
|
||||
hadoopConf = fileSystem.getConf();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreateValidSyncClass() {
|
||||
AbstractSyncTool metaSyncTool = SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
ValidMetaSyncClass.class.getName(),
|
||||
@ParameterizedTest
|
||||
@ValueSource(classes = {DummySyncTool1.class, DummySyncTool2.class})
|
||||
public void testCreateValidSyncClass(Class<?> clazz) {
|
||||
HoodieSyncTool syncTool = SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
clazz.getName(),
|
||||
new TypedProperties(),
|
||||
hadoopConf,
|
||||
fileSystem,
|
||||
BASE_PATH,
|
||||
BASE_FORMAT
|
||||
);
|
||||
assertTrue(metaSyncTool instanceof ValidMetaSyncClass);
|
||||
assertTrue(clazz.isAssignableFrom(syncTool.getClass()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure it still works for the deprecated constructor of {@link AbstractSyncTool}
|
||||
* Ensure it still works for the deprecated constructor of {@link HoodieSyncTool}
|
||||
* as we implemented the fallback.
|
||||
*/
|
||||
@Test
|
||||
public void testCreateDeprecatedSyncClass() {
|
||||
@ParameterizedTest
|
||||
@ValueSource(classes = {DeprecatedSyncTool1.class, DeprecatedSyncTool2.class})
|
||||
public void testCreateDeprecatedSyncClass(Class<?> clazz) {
|
||||
Properties properties = new Properties();
|
||||
AbstractSyncTool deprecatedMetaSyncClass = SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
DeprecatedMetaSyncClass.class.getName(),
|
||||
HoodieSyncTool syncTool = SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
clazz.getName(),
|
||||
new TypedProperties(properties),
|
||||
hadoopConf,
|
||||
fileSystem,
|
||||
BASE_PATH,
|
||||
BASE_FORMAT
|
||||
);
|
||||
assertTrue(deprecatedMetaSyncClass instanceof DeprecatedMetaSyncClass);
|
||||
assertTrue(clazz.isAssignableFrom(syncTool.getClass()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreateInvalidSyncClass() {
|
||||
Exception exception = assertThrows(HoodieException.class, () -> {
|
||||
Throwable t = assertThrows(HoodieException.class, () -> {
|
||||
SyncUtilHelpers.instantiateMetaSyncTool(
|
||||
InvalidSyncClass.class.getName(),
|
||||
InvalidSyncTool.class.getName(),
|
||||
new TypedProperties(),
|
||||
hadoopConf,
|
||||
fileSystem,
|
||||
@@ -90,14 +95,14 @@ public class TestSyncUtilHelpers {
|
||||
);
|
||||
});
|
||||
|
||||
String expectedMessage = "Could not load meta sync class " + InvalidSyncClass.class.getName();
|
||||
assertTrue(exception.getMessage().contains(expectedMessage));
|
||||
|
||||
String expectedMessage = "Could not load meta sync class " + InvalidSyncTool.class.getName()
|
||||
+ ": no valid constructor found.";
|
||||
assertEquals(expectedMessage, t.getMessage());
|
||||
}
|
||||
|
||||
public static class ValidMetaSyncClass extends AbstractSyncTool {
|
||||
public ValidMetaSyncClass(TypedProperties props, Configuration conf, FileSystem fs) {
|
||||
super(props, conf, fs);
|
||||
public static class DummySyncTool1 extends HoodieSyncTool {
|
||||
public DummySyncTool1(Properties props, Configuration hadoopConf) {
|
||||
super(props, hadoopConf);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -106,9 +111,9 @@ public class TestSyncUtilHelpers {
|
||||
}
|
||||
}
|
||||
|
||||
public static class DeprecatedMetaSyncClass extends AbstractSyncTool {
|
||||
public DeprecatedMetaSyncClass(Properties props, FileSystem fileSystem) {
|
||||
super(props, fileSystem);
|
||||
public static class DummySyncTool2 extends HoodieSyncTool {
|
||||
public DummySyncTool2(Properties props, Configuration hadoopConf) {
|
||||
super(props, hadoopConf);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -117,8 +122,30 @@ public class TestSyncUtilHelpers {
|
||||
}
|
||||
}
|
||||
|
||||
public static class InvalidSyncClass {
|
||||
public InvalidSyncClass(Properties props) {
|
||||
/**
 * Test fixture exercising the deprecated
 * (TypedProperties, Configuration, FileSystem) constructor of HoodieSyncTool.
 */
public static class DeprecatedSyncTool1 extends HoodieSyncTool {
  public DeprecatedSyncTool1(TypedProperties props, Configuration hadoopConf, FileSystem fs) {
    super(props, hadoopConf, fs);
  }

  @Override
  public void syncHoodieTable() {
    // Intentionally unimplemented: instantiation paths are what is under test.
    throw new HoodieException("Method unimplemented as its a test class");
  }
}
|
||||
|
||||
/**
 * Test fixture exercising the deprecated (Properties, FileSystem)
 * constructor of HoodieSyncTool.
 */
public static class DeprecatedSyncTool2 extends HoodieSyncTool {
  public DeprecatedSyncTool2(Properties props, FileSystem fs) {
    super(props, fs);
  }

  @Override
  public void syncHoodieTable() {
    // Intentionally unimplemented: instantiation paths are what is under test.
    throw new HoodieException("Method unimplemented as its a test class");
  }
}
|
||||
|
||||
/**
 * Class whose constructor signature matches none of the ones recognized by
 * SyncUtilHelpers, used to verify the failure path of instantiateMetaSyncTool.
 */
public static class InvalidSyncTool {
  public InvalidSyncTool(Properties props, FileSystem fs, Configuration hadoopConf) {
  }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user