[HUDI-1451] Support bulk insert v2 with Spark 3.0.0 (#2328)
Co-authored-by: Wenning Ding <wenningd@amazon.com> - Added support for bulk insert v2 with datasource v2 api in Spark 3.0.0.
This commit is contained in:
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
/**
|
||||
* Base class for DefaultSource used by Spark datasource v2.
|
||||
*/
|
||||
public class BaseDefaultSource {
|
||||
|
||||
protected SparkSession sparkSession = null;
|
||||
protected Configuration configuration = null;
|
||||
|
||||
protected SparkSession getSparkSession() {
|
||||
if (sparkSession == null) {
|
||||
sparkSession = SparkSession.builder().getOrCreate();
|
||||
}
|
||||
return sparkSession;
|
||||
}
|
||||
|
||||
protected Configuration getConfiguration() {
|
||||
if (configuration == null) {
|
||||
this.configuration = getSparkSession().sparkContext().hadoopConfiguration();
|
||||
}
|
||||
return configuration;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal;
|
||||
|
||||
import org.apache.hudi.client.HoodieInternalWriteStatus;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Base class for HoodieWriterCommitMessage used by Spark datasource v2.
|
||||
*/
|
||||
public class BaseWriterCommitMessage {
|
||||
|
||||
private List<HoodieInternalWriteStatus> writeStatuses;
|
||||
|
||||
public BaseWriterCommitMessage(List<HoodieInternalWriteStatus> writeStatuses) {
|
||||
this.writeStatuses = writeStatuses;
|
||||
}
|
||||
|
||||
public List<HoodieInternalWriteStatus> getWriteStatuses() {
|
||||
return writeStatuses;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "HoodieWriterCommitMessage{" + "writeStatuses=" + Arrays.toString(writeStatuses.toArray()) + '}';
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal;
|
||||
|
||||
import org.apache.hudi.client.HoodieInternalWriteStatus;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.io.HoodieRowCreateHandle;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Helper class for HoodieBulkInsertDataInternalWriter used by Spark datasource v2.
|
||||
*/
|
||||
public class BulkInsertDataInternalWriterHelper {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(BulkInsertDataInternalWriterHelper.class);
|
||||
|
||||
private final String instantTime;
|
||||
private final int taskPartitionId;
|
||||
private final long taskId;
|
||||
private final long taskEpochId;
|
||||
private final HoodieTable hoodieTable;
|
||||
private final HoodieWriteConfig writeConfig;
|
||||
private final StructType structType;
|
||||
private final List<HoodieInternalWriteStatus> writeStatusList = new ArrayList<>();
|
||||
|
||||
private HoodieRowCreateHandle handle;
|
||||
private String lastKnownPartitionPath = null;
|
||||
private String fileIdPrefix;
|
||||
private int numFilesWritten = 0;
|
||||
|
||||
public BulkInsertDataInternalWriterHelper(HoodieTable hoodieTable, HoodieWriteConfig writeConfig,
|
||||
String instantTime, int taskPartitionId, long taskId, long taskEpochId, StructType structType) {
|
||||
this.hoodieTable = hoodieTable;
|
||||
this.writeConfig = writeConfig;
|
||||
this.instantTime = instantTime;
|
||||
this.taskPartitionId = taskPartitionId;
|
||||
this.taskId = taskId;
|
||||
this.taskEpochId = taskEpochId;
|
||||
this.structType = structType;
|
||||
this.fileIdPrefix = UUID.randomUUID().toString();
|
||||
}
|
||||
|
||||
public void write(InternalRow record) throws IOException {
|
||||
try {
|
||||
String partitionPath = record.getUTF8String(
|
||||
HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toString();
|
||||
|
||||
if ((lastKnownPartitionPath == null) || !lastKnownPartitionPath.equals(partitionPath) || !handle.canWrite()) {
|
||||
LOG.info("Creating new file for partition path " + partitionPath);
|
||||
createNewHandle(partitionPath);
|
||||
lastKnownPartitionPath = partitionPath;
|
||||
}
|
||||
handle.write(record);
|
||||
} catch (Throwable t) {
|
||||
LOG.error("Global error thrown while trying to write records in HoodieRowCreateHandle ", t);
|
||||
throw t;
|
||||
}
|
||||
}
|
||||
|
||||
public List<HoodieInternalWriteStatus> getWriteStatuses() throws IOException {
|
||||
close();
|
||||
return writeStatusList;
|
||||
}
|
||||
|
||||
public void abort() {
|
||||
}
|
||||
|
||||
private void createNewHandle(String partitionPath) throws IOException {
|
||||
if (null != handle) {
|
||||
close();
|
||||
}
|
||||
handle = new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
|
||||
instantTime, taskPartitionId, taskId, taskEpochId, structType);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
if (null != handle) {
|
||||
writeStatusList.add(handle.close());
|
||||
handle = null;
|
||||
}
|
||||
}
|
||||
|
||||
private String getNextFileId() {
|
||||
return String.format("%s-%d", fileIdPrefix, numFilesWritten++);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.internal;
|
||||
|
||||
import org.apache.hudi.DataSourceUtils;
|
||||
import org.apache.hudi.client.SparkRDDWriteClient;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Helper class for HoodieDataSourceInternalWriter used by Spark datasource v2.
|
||||
*/
|
||||
public class DataSourceInternalWriterHelper {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(DataSourceInternalWriterHelper.class);
|
||||
public static final String INSTANT_TIME_OPT_KEY = "hoodie.instant.time";
|
||||
|
||||
private final String instantTime;
|
||||
private final HoodieTableMetaClient metaClient;
|
||||
private final SparkRDDWriteClient writeClient;
|
||||
private final HoodieTable hoodieTable;
|
||||
private final WriteOperationType operationType;
|
||||
|
||||
public DataSourceInternalWriterHelper(String instantTime, HoodieWriteConfig writeConfig, StructType structType,
|
||||
SparkSession sparkSession, Configuration configuration) {
|
||||
this.instantTime = instantTime;
|
||||
this.operationType = WriteOperationType.BULK_INSERT;
|
||||
this.writeClient = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(new JavaSparkContext(sparkSession.sparkContext())), writeConfig, true);
|
||||
writeClient.setOperationType(operationType);
|
||||
writeClient.startCommitWithTime(instantTime);
|
||||
this.metaClient = new HoodieTableMetaClient(configuration, writeConfig.getBasePath());
|
||||
this.hoodieTable = HoodieSparkTable.create(writeConfig, new HoodieSparkEngineContext(new JavaSparkContext(sparkSession.sparkContext())), metaClient);
|
||||
}
|
||||
|
||||
public boolean useCommitCoordinator() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void onDataWriterCommit(String message) {
|
||||
LOG.info("Received commit of a data writer = " + message);
|
||||
}
|
||||
|
||||
public void commit(List<HoodieWriteStat> writeStatList) {
|
||||
try {
|
||||
writeClient.commitStats(instantTime, writeStatList, Option.empty(),
|
||||
DataSourceUtils.getCommitActionType(operationType, metaClient.getTableType()));
|
||||
} catch (Exception ioe) {
|
||||
throw new HoodieException(ioe.getMessage(), ioe);
|
||||
} finally {
|
||||
writeClient.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void abort() {
|
||||
LOG.error("Commit " + instantTime + " aborted ");
|
||||
writeClient.rollback(instantTime);
|
||||
writeClient.close();
|
||||
}
|
||||
|
||||
public void createInflightCommit() {
|
||||
metaClient.getActiveTimeline().transitionRequestedToInflight(
|
||||
new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, instantTime), Option.empty());
|
||||
}
|
||||
|
||||
public HoodieTable getHoodieTable() {
|
||||
return hoodieTable;
|
||||
}
|
||||
|
||||
public WriteOperationType getWriteOperationType() {
|
||||
return operationType;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user