HUDI-123 Rename code packages/constants to org.apache.hudi (#830)
- Rename com.uber.hoodie to org.apache.hudi
- Flag to pass com.uber.hoodie input formats for hoodie-sync
- Works with HUDI demo
- Also tested for backwards compatibility with datasets built by com.uber.hoodie packages
- Migration guide: https://cwiki.apache.org/confluence/display/HUDI/Migration+Guide+From+com.uber.hoodie+to+org.apache.hudi
committed by vinoth chandar
parent 722b6be04a
commit a4f9d7575f
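For context, a minimal sketch of how the backwards-compatibility flag introduced in this diff (HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY, wired into buildHiveSyncConfig in DataSourceUtils below) might be set on a datasource write. The format string, record key / partition path field names, and write flow are assumptions for illustration, not part of this commit:

import org.apache.hudi.DataSourceWriteOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public class LegacyInputFormatWriteExample {

  // Sketch: write through the renamed org.apache.hudi datasource, but ask
  // hoodie-sync to register the Hive table with the old com.uber.hoodie
  // input formats, so readers that have not yet migrated keep working.
  public static void write(Dataset<Row> df, String basePath) {
    df.write()
        .format("org.apache.hudi")
        .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "uuid")      // assumed field
        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "date") // assumed field
        .option(DataSourceWriteOptions.HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY(), "true")
        .mode(SaveMode.Append)
        .save(basePath);
  }
}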
@@ -0,0 +1,58 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi;

import java.io.IOException;
import java.io.Serializable;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.HoodieAvroUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;

/**
 * Base class for all Avro record based payloads, that can be ordered based on a field.
 */
public abstract class BaseAvroPayload implements Serializable {

  /**
   * Avro data extracted from the source, converted to bytes.
   */
  protected final byte[] recordBytes;

  /**
   * For purposes of preCombining.
   */
  protected final Comparable orderingVal;

  /**
   * @param record      Avro record to serialize into bytes
   * @param orderingVal value used to pick between two payloads for the same key in preCombine
   */
  public BaseAvroPayload(GenericRecord record, Comparable orderingVal) {
    try {
      this.recordBytes = HoodieAvroUtils.avroToBytes(record);
    } catch (IOException io) {
      throw new HoodieIOException("Cannot convert GenericRecord to bytes", io);
    }
    this.orderingVal = orderingVal;
    if (orderingVal == null) {
      throw new HoodieException("Ordering value is null for record: " + record);
    }
  }
}
@@ -0,0 +1,81 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi;

import java.util.Arrays;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.TypedProperties;
import org.apache.hudi.exception.HoodieException;

/**
 * Complex key generator, which takes names of fields to be used for recordKey and partitionPath as
 * configs.
 */
public class ComplexKeyGenerator extends KeyGenerator {

  private static final String DEFAULT_PARTITION_PATH = "default";

  private static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/";

  protected final List<String> recordKeyFields;

  protected final List<String> partitionPathFields;

  public ComplexKeyGenerator(TypedProperties props) {
    super(props);
    this.recordKeyFields = Arrays.asList(props.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY()).split(","));
    this.partitionPathFields = Arrays.asList(props
        .getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY()).split(","));
  }

  @Override
  public HoodieKey getKey(GenericRecord record) {
    if (recordKeyFields == null || partitionPathFields == null) {
      throw new HoodieException(
          "Unable to find field names for record key or partition path in cfg");
    }
    StringBuilder recordKey = new StringBuilder();
    for (String recordKeyField : recordKeyFields) {
      recordKey.append(recordKeyField).append(":")
          .append(DataSourceUtils.getNestedFieldValAsString(record, recordKeyField)).append(",");
    }
    recordKey.deleteCharAt(recordKey.length() - 1);
    StringBuilder partitionPath = new StringBuilder();
    try {
      for (String partitionPathField : partitionPathFields) {
        partitionPath.append(DataSourceUtils.getNestedFieldValAsString(record, partitionPathField));
        partitionPath.append(DEFAULT_PARTITION_PATH_SEPARATOR);
      }
      partitionPath.deleteCharAt(partitionPath.length() - 1);
    } catch (HoodieException e) {
      // if any partition path field is missing, lump the record into the default partition
      partitionPath = partitionPath.append(DEFAULT_PARTITION_PATH);
    }

    return new HoodieKey(recordKey.toString(), partitionPath.toString());
  }

  public List<String> getRecordKeyFields() {
    return recordKeyFields;
  }

  public List<String> getPartitionPathFields() {
    return partitionPathFields;
  }
}
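A hedged usage sketch of the generator above (the trip schema, field names, and values are made up for illustration): record keys come out as field:value pairs joined by commas, partition paths as values joined by "/".

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.ComplexKeyGenerator;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.TypedProperties;

public class ComplexKeyGeneratorExample {
  public static void main(String[] args) {
    // assumed flat "trip" schema with two key fields and two partition fields
    Schema schema = SchemaBuilder.record("trip").fields()
        .requiredString("driver").requiredString("rider")
        .requiredString("country").requiredString("date")
        .endRecord();
    GenericRecord rec = new GenericData.Record(schema);
    rec.put("driver", "d-1");
    rec.put("rider", "r-9");
    rec.put("country", "us");
    rec.put("date", "2019-08-01");

    TypedProperties props = new TypedProperties();
    props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "driver,rider");
    props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "country,date");

    HoodieKey key = new ComplexKeyGenerator(props).getKey(rec);
    System.out.println(key.getRecordKey());     // driver:d-1,rider:r-9
    System.out.println(key.getPartitionPath()); // us/2019-08-01
  }
}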
hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java (new normal file, 250 lines)
@@ -0,0 +1,250 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.TypedProperties;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.DatasetNotFoundException;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.PartitionValueExtractor;
import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor;
import org.apache.hudi.index.HoodieIndex;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/**
 * Utilities used throughout the data source.
 */
public class DataSourceUtils {

  /**
   * Obtain value of the provided field as a string, denoted by dot notation, e.g. a.b.c
   */
  public static String getNestedFieldValAsString(GenericRecord record, String fieldName) {
    Object obj = getNestedFieldVal(record, fieldName);
    return (obj == null) ? null : obj.toString();
  }

  /**
   * Obtain value of the provided field, denoted by dot notation, e.g. a.b.c
   */
  public static Object getNestedFieldVal(GenericRecord record, String fieldName) {
    String[] parts = fieldName.split("\\.");
    GenericRecord valueNode = record;
    int i = 0;
    for (; i < parts.length; i++) {
      String part = parts[i];
      Object val = valueNode.get(part);
      if (val == null) {
        break;
      }

      // return, if last part of name
      if (i == parts.length - 1) {
        return val;
      } else {
        // VC: Need a test here
        if (!(val instanceof GenericRecord)) {
          throw new HoodieException("Cannot find a record at part value :" + part);
        }
        valueNode = (GenericRecord) val;
      }
    }
    throw new HoodieException(fieldName + "(Part -" + parts[i] + ") field not found in record. "
        + "Acceptable fields were :" + valueNode.getSchema().getFields()
        .stream().map(Field::name).collect(Collectors.toList()));
  }

  /**
   * Create a key generator class via reflection, passing in any configs needed.
   *
   * If the class name of the key generator is configured through the properties file, i.e., {@code
   * props}, use the corresponding key generator class; otherwise, use the default key generator
   * class specified in {@code DataSourceWriteOptions}.
   */
  public static KeyGenerator createKeyGenerator(TypedProperties props) throws IOException {
    String keyGeneratorClass = props.getString(
        DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
        DataSourceWriteOptions.DEFAULT_KEYGENERATOR_CLASS_OPT_VAL()
    );
    try {
      return (KeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, props);
    } catch (Throwable e) {
      throw new IOException("Could not load key generator class " + keyGeneratorClass, e);
    }
  }

  /**
   * Create a partition value extractor class via reflection, passing in any configs needed.
   */
  public static PartitionValueExtractor createPartitionExtractor(String partitionExtractorClass) {
    try {
      return (PartitionValueExtractor) ReflectionUtils.loadClass(partitionExtractorClass);
    } catch (Throwable e) {
      throw new HoodieException("Could not load partition extractor class " + partitionExtractorClass, e);
    }
  }

  /**
   * Create a payload class via reflection, passing in an ordering/precombine value.
   */
  public static HoodieRecordPayload createPayload(String payloadClass, GenericRecord record,
      Comparable orderingVal) throws IOException {
    try {
      return (HoodieRecordPayload) ReflectionUtils
          .loadClass(payloadClass, new Class<?>[]{GenericRecord.class, Comparable.class}, record, orderingVal);
    } catch (Throwable e) {
      throw new IOException("Could not create payload for class: " + payloadClass, e);
    }
  }

  public static void checkRequiredProperties(TypedProperties props,
      List<String> checkPropNames) {
    checkPropNames.stream().forEach(prop -> {
      if (!props.containsKey(prop)) {
        throw new HoodieNotSupportedException("Required property " + prop + " is missing");
      }
    });
  }

  public static HoodieWriteClient createHoodieClient(JavaSparkContext jssc, String schemaStr,
      String basePath, String tblName, Map<String, String> parameters) throws Exception {

    // inline compaction is on by default for MOR
    boolean inlineCompact = parameters.get(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY())
        .equals(DataSourceWriteOptions.MOR_STORAGE_TYPE_OPT_VAL());

    // insert/bulk-insert combining to be true, if filtering for duplicates
    boolean combineInserts = Boolean.parseBoolean(parameters.get(
        DataSourceWriteOptions.INSERT_DROP_DUPS_OPT_KEY()));

    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath(basePath).withAutoCommit(false)
        .combineInput(combineInserts, true)
        .withSchema(schemaStr).forTable(tblName).withIndexConfig(
            HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withPayloadClass(parameters.get(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY()))
            .withInlineCompaction(inlineCompact)
            .build())
        // override above with Hoodie configs specified as options.
        .withProps(parameters).build();

    return new HoodieWriteClient<>(jssc, writeConfig, true);
  }

  public static JavaRDD<WriteStatus> doWriteOperation(HoodieWriteClient client,
      JavaRDD<HoodieRecord> hoodieRecords, String commitTime, String operation) {
    if (operation.equals(DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL())) {
      return client.bulkInsert(hoodieRecords, commitTime);
    } else if (operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())) {
      return client.insert(hoodieRecords, commitTime);
    } else {
      // default is upsert
      return client.upsert(hoodieRecords, commitTime);
    }
  }

  public static HoodieRecord createHoodieRecord(GenericRecord gr, Comparable orderingVal,
      HoodieKey hKey, String payloadClass) throws IOException {
    HoodieRecordPayload payload = DataSourceUtils.createPayload(payloadClass, gr, orderingVal);
    return new HoodieRecord<>(hKey, payload);
  }

  @SuppressWarnings("unchecked")
  public static JavaRDD<HoodieRecord> dropDuplicates(JavaSparkContext jssc,
      JavaRDD<HoodieRecord> incomingHoodieRecords,
      HoodieWriteConfig writeConfig, Option<EmbeddedTimelineService> timelineService) throws Exception {
    HoodieReadClient client = null;
    try {
      client = new HoodieReadClient<>(jssc, writeConfig, timelineService);
      return client.tagLocation(incomingHoodieRecords)
          .filter(r -> !((HoodieRecord<HoodieRecordPayload>) r).isCurrentLocationKnown());
    } catch (DatasetNotFoundException e) {
      // this will be executed when there is no hoodie dataset yet
      // so no dups to drop
      return incomingHoodieRecords;
    } finally {
      if (null != client) {
        client.close();
      }
    }
  }

  @SuppressWarnings("unchecked")
  public static JavaRDD<HoodieRecord> dropDuplicates(JavaSparkContext jssc,
      JavaRDD<HoodieRecord> incomingHoodieRecords,
      Map<String, String> parameters,
      Option<EmbeddedTimelineService> timelineService)
      throws Exception {
    HoodieWriteConfig writeConfig = HoodieWriteConfig
        .newBuilder()
        .withPath(parameters.get("path"))
        .withProps(parameters).build();
    return dropDuplicates(jssc, incomingHoodieRecords, writeConfig, timelineService);
  }

  public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String basePath) {
    checkRequiredProperties(props, Arrays.asList(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY()));
    HiveSyncConfig hiveSyncConfig = new HiveSyncConfig();
    hiveSyncConfig.basePath = basePath;
    hiveSyncConfig.usePreApacheInputFormat =
        props.getBoolean(DataSourceWriteOptions.HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY(),
            Boolean.valueOf(DataSourceWriteOptions.DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL()));
    hiveSyncConfig.assumeDatePartitioning =
        props.getBoolean(DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION_OPT_KEY(),
            Boolean.valueOf(DataSourceWriteOptions.DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL()));
    hiveSyncConfig.databaseName = props.getString(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(),
        DataSourceWriteOptions.DEFAULT_HIVE_DATABASE_OPT_VAL());
    hiveSyncConfig.tableName = props.getString(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY());
    hiveSyncConfig.hiveUser = props.getString(DataSourceWriteOptions.HIVE_USER_OPT_KEY(),
        DataSourceWriteOptions.DEFAULT_HIVE_USER_OPT_VAL());
    hiveSyncConfig.hivePass = props.getString(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(),
        DataSourceWriteOptions.DEFAULT_HIVE_PASS_OPT_VAL());
    hiveSyncConfig.jdbcUrl = props.getString(DataSourceWriteOptions.HIVE_URL_OPT_KEY(),
        DataSourceWriteOptions.DEFAULT_HIVE_URL_OPT_VAL());
    hiveSyncConfig.partitionFields =
        props.getStringList(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), ",", new ArrayList<>());
    hiveSyncConfig.partitionValueExtractorClass =
        props.getString(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
            SlashEncodedDayPartitionValueExtractor.class.getName());
    return hiveSyncConfig;
  }
}
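A small sketch (schema and values assumed, not from this commit) of the dot-notation lookup in getNestedFieldVal above, which walks nested GenericRecords one path segment at a time:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.DataSourceUtils;

public class NestedFieldExample {
  public static void main(String[] args) {
    // trip record carrying a nested "driver" record
    Schema driver = SchemaBuilder.record("driver").fields()
        .requiredString("name").endRecord();
    Schema trip = SchemaBuilder.record("trip").fields()
        .name("driver").type(driver).noDefault()
        .endRecord();

    GenericRecord d = new GenericData.Record(driver);
    d.put("name", "jane");
    GenericRecord t = new GenericData.Record(trip);
    t.put("driver", d);

    // resolves "driver" first, then "name" on the nested record
    System.out.println(DataSourceUtils.getNestedFieldValAsString(t, "driver.name")); // jane
  }
}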
@@ -0,0 +1,48 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;

/**
 * Empty payload used for deletions.
 */
public class EmptyHoodieRecordPayload implements HoodieRecordPayload<EmptyHoodieRecordPayload> {

  public EmptyHoodieRecordPayload(GenericRecord record, Comparable orderingVal) { }

  @Override
  public EmptyHoodieRecordPayload preCombine(EmptyHoodieRecordPayload another) {
    return another;
  }

  @Override
  public Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) {
    return Option.empty();
  }

  @Override
  public Option<IndexedRecord> getInsertValue(Schema schema) {
    return Option.empty();
  }
}
@@ -0,0 +1,77 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi;

import com.google.common.collect.Sets;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.HoodieTimeline;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;

/**
 * List of helpers to aid construction of instant times for read and write operations using the
 * datasource.
 */
public class HoodieDataSourceHelpers {

  /**
   * Checks if the Hoodie dataset has new data since the given timestamp. This can be subsequently
   * fed to an incremental view read, to perform incremental processing.
   */
  public static boolean hasNewCommits(FileSystem fs, String basePath, String commitTimestamp) {
    return listCommitsSince(fs, basePath, commitTimestamp).size() > 0;
  }

  /**
   * Get a list of instant times that have occurred since the given instant timestamp.
   */
  public static List<String> listCommitsSince(FileSystem fs, String basePath,
      String instantTimestamp) {
    HoodieTimeline timeline = allCompletedCommitsCompactions(fs, basePath);
    return timeline.findInstantsAfter(instantTimestamp, Integer.MAX_VALUE).getInstants()
        .map(HoodieInstant::getTimestamp).collect(Collectors.toList());
  }

  /**
   * Returns the last successful write operation's instant time.
   */
  public static String latestCommit(FileSystem fs, String basePath) {
    HoodieTimeline timeline = allCompletedCommitsCompactions(fs, basePath);
    return timeline.lastInstant().get().getTimestamp();
  }

  /**
   * Obtain all the commits, compactions that have occurred on the timeline, whose instant times
   * could be fed into the datasource options.
   */
  public static HoodieTimeline allCompletedCommitsCompactions(FileSystem fs, String basePath) {
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
    if (metaClient.getTableType().equals(HoodieTableType.MERGE_ON_READ)) {
      return metaClient.getActiveTimeline().getTimelineOfActions(
          Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
              HoodieActiveTimeline.DELTA_COMMIT_ACTION));
    } else {
      return metaClient.getCommitTimeline().filterCompletedInstants();
    }
  }
}
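A sketch of the intended use of these helpers (the base path and saved checkpoint are assumptions): check for new commits since a stored instant, then feed the window into an incremental read.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.HoodieDataSourceHelpers;

public class IncrementalPullExample {
  public static void main(String[] args) throws Exception {
    String basePath = "/tmp/hoodie/trips"; // assumed dataset location
    String lastInstant = "20190728120000"; // assumed saved checkpoint

    FileSystem fs = FileSystem.get(new Path(basePath).toUri(), new Configuration());
    if (HoodieDataSourceHelpers.hasNewCommits(fs, basePath, lastInstant)) {
      // each returned instant time can be fed into the datasource's
      // incremental view options as a begin/end commit
      for (String instant : HoodieDataSourceHelpers.listCommitsSince(fs, basePath, lastInstant)) {
        System.out.println("new commit: " + instant);
      }
      System.out.println("latest: " + HoodieDataSourceHelpers.latestCommit(fs, basePath));
    }
  }
}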
hudi-spark/src/main/java/org/apache/hudi/KeyGenerator.java (new normal file, 43 lines)
@@ -0,0 +1,43 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi;

import java.io.Serializable;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.TypedProperties;

/**
 * Abstract class to extend for plugging in extraction of {@link HoodieKey} from an Avro record.
 */
public abstract class KeyGenerator implements Serializable {

  protected transient TypedProperties config;

  protected KeyGenerator(TypedProperties config) {
    this.config = config;
  }

  /**
   * Generate a Hoodie Key out of provided generic record.
   */
  public abstract HoodieKey getKey(GenericRecord record);
}
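Since KeyGenerator is the reflection-loaded extension point (see createKeyGenerator in DataSourceUtils above), a custom implementation only needs the single-arg TypedProperties constructor and getKey. A minimal sketch; the class, its "id" field, and the fixed bucket are hypothetical:

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.KeyGenerator;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.TypedProperties;

// Hypothetical generator: keys on a fixed "id" field and lumps every record
// into one partition. Would be plugged in via KEYGENERATOR_CLASS_OPT_KEY.
public class SingleBucketKeyGenerator extends KeyGenerator {

  public SingleBucketKeyGenerator(TypedProperties props) {
    super(props);
  }

  @Override
  public HoodieKey getKey(GenericRecord record) {
    return new HoodieKey(String.valueOf(record.get("id")), "bucket-0");
  }
}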
@@ -0,0 +1,41 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi;

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.TypedProperties;

/**
 * Simple key generator for unpartitioned Hive tables.
 */
public class NonpartitionedKeyGenerator extends SimpleKeyGenerator {

  private static final String EMPTY_PARTITION = "";

  public NonpartitionedKeyGenerator(TypedProperties props) {
    super(props);
  }

  @Override
  public HoodieKey getKey(GenericRecord record) {
    String recordKey = DataSourceUtils.getNestedFieldValAsString(record, recordKeyField);
    return new HoodieKey(recordKey, EMPTY_PARTITION);
  }
}
@@ -0,0 +1,71 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi;

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.HoodieAvroUtils;
import org.apache.hudi.common.util.Option;

/**
 * Default payload used for delta streamer.
 * <p>
 * 1. preCombine - Picks the latest delta record for a key, based on an ordering field.
 * 2. combineAndGetUpdateValue/getInsertValue - Simply overwrites storage with the latest delta record.
 */
public class OverwriteWithLatestAvroPayload extends BaseAvroPayload implements
    HoodieRecordPayload<OverwriteWithLatestAvroPayload> {

  /**
   * @param record      Avro record to wrap into this payload
   * @param orderingVal value used to pick the winning payload in preCombine
   */
  public OverwriteWithLatestAvroPayload(GenericRecord record, Comparable orderingVal) {
    super(record, orderingVal);
  }

  public OverwriteWithLatestAvroPayload(Option<GenericRecord> record) {
    this(record.get(), (record1) -> 0); // natural order
  }

  @Override
  public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another) {
    // pick the payload with the greatest ordering value
    if (another.orderingVal.compareTo(orderingVal) > 0) {
      return another;
    } else {
      return this;
    }
  }

  @Override
  public Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema)
      throws IOException {
    // combining strategy here trivially ignores currentValue on disk and writes this record
    return getInsertValue(schema);
  }

  @Override
  public Option<IndexedRecord> getInsertValue(Schema schema) throws IOException {
    return Option.of(HoodieAvroUtils.bytesToAvro(recordBytes, schema));
  }
}
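A sketch of the preCombine contract above (schema and values are assumptions): between two payloads for the same key, the one with the greater ordering value wins.

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.OverwriteWithLatestAvroPayload;

public class PreCombineExample {
  public static void main(String[] args) {
    Schema schema = SchemaBuilder.record("rec").fields()
        .requiredString("id").requiredLong("ts").endRecord();

    GenericRecord older = new GenericData.Record(schema);
    older.put("id", "k1");
    older.put("ts", 1L);
    GenericRecord newer = new GenericData.Record(schema);
    newer.put("id", "k1");
    newer.put("ts", 2L);

    OverwriteWithLatestAvroPayload a = new OverwriteWithLatestAvroPayload(older, 1L);
    OverwriteWithLatestAvroPayload b = new OverwriteWithLatestAvroPayload(newer, 2L);

    // b's ordering value (2) beats a's (1), so b survives the combine
    System.out.println(a.preCombine(b) == b); // true
  }
}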
@@ -0,0 +1,63 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi;

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.TypedProperties;
import org.apache.hudi.exception.HoodieException;

/**
 * Simple key generator, which takes names of fields to be used for recordKey and partitionPath as
 * configs.
 */
public class SimpleKeyGenerator extends KeyGenerator {

  private static final String DEFAULT_PARTITION_PATH = "default";

  protected final String recordKeyField;

  protected final String partitionPathField;

  public SimpleKeyGenerator(TypedProperties props) {
    super(props);
    this.recordKeyField = props.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY());
    this.partitionPathField = props
        .getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY());
  }

  @Override
  public HoodieKey getKey(GenericRecord record) {
    if (recordKeyField == null || partitionPathField == null) {
      throw new HoodieException(
          "Unable to find field names for record key or partition path in cfg");
    }

    String recordKey = DataSourceUtils.getNestedFieldValAsString(record, recordKeyField);
    String partitionPath;
    try {
      partitionPath = DataSourceUtils.getNestedFieldValAsString(record, partitionPathField);
    } catch (HoodieException e) {
      // if field is not found, lump it into default partition
      partitionPath = DEFAULT_PARTITION_PATH;
    }

    return new HoodieKey(recordKey, partitionPath);
  }
}
hudi-spark/src/main/resources/META-INF/LICENSE.txt (new normal file, 614 lines)
@@ -0,0 +1,614 @@
[LICENSE.txt: full text of the Apache License, Version 2.0 (http://www.apache.org/licenses/), followed by bundled third-party notices: portions of the 'JQuery' project (MIT license, Copyright 2012 jQuery Foundation and other contributors, http://jquery.com/); a derivative of portions of the 'Asciidoctor' project (MIT license, Copyright (C) 2012-2015 Dan Allen, Ryan Waldron and the Asciidoctor Project); portions of the 'Protocol Buffers' project (3-clause BSD license, Copyright 2008, Google Inc.); and a derivative image for the Orca logo (Creative Commons Attribution 3.0 Unported). The capture ends truncated partway through the CC BY 3.0 terms.]
|
||||
must, to the extent practicable, remove from the Collection any credit
|
||||
as required by Section 4(b), as requested. If You create an
|
||||
Adaptation, upon notice from any Licensor You must, to the extent
|
||||
practicable, remove from the Adaptation any credit as required by
|
||||
Section 4(b), as requested.
|
||||
b. If You Distribute, or Publicly Perform the Work or any Adaptations or
|
||||
Collections, You must, unless a request has been made pursuant to
|
||||
Section 4(a), keep intact all copyright notices for the Work and
|
||||
provide, reasonable to the medium or means You are utilizing: (i) the
|
||||
name of the Original Author (or pseudonym, if applicable) if supplied,
|
||||
and/or if the Original Author and/or Licensor designate another party
|
||||
or parties (e.g., a sponsor institute, publishing entity, journal) for
|
||||
attribution ("Attribution Parties") in Licensor's copyright notice,
|
||||
terms of service or by other reasonable means, the name of such party
|
||||
or parties; (ii) the title of the Work if supplied; (iii) to the
|
||||
extent reasonably practicable, the URI, if any, that Licensor
|
||||
specifies to be associated with the Work, unless such URI does not
|
||||
refer to the copyright notice or licensing information for the Work;
|
||||
and (iv) , consistent with Section 3(b), in the case of an Adaptation,
|
||||
a credit identifying the use of the Work in the Adaptation (e.g.,
|
||||
"French translation of the Work by Original Author," or "Screenplay
|
||||
based on original Work by Original Author"). The credit required by
|
||||
this Section 4 (b) may be implemented in any reasonable manner;
|
||||
provided, however, that in the case of a Adaptation or Collection, at
|
||||
a minimum such credit will appear, if a credit for all contributing
|
||||
authors of the Adaptation or Collection appears, then as part of these
|
||||
credits and in a manner at least as prominent as the credits for the
|
||||
other contributing authors. For the avoidance of doubt, You may only
|
||||
use the credit required by this Section for the purpose of attribution
|
||||
in the manner set out above and, by exercising Your rights under this
|
||||
License, You may not implicitly or explicitly assert or imply any
|
||||
connection with, sponsorship or endorsement by the Original Author,
|
||||
Licensor and/or Attribution Parties, as appropriate, of You or Your
|
||||
use of the Work, without the separate, express prior written
|
||||
permission of the Original Author, Licensor and/or Attribution
|
||||
Parties.
|
||||
c. Except as otherwise agreed in writing by the Licensor or as may be
|
||||
otherwise permitted by applicable law, if You Reproduce, Distribute or
|
||||
Publicly Perform the Work either by itself or as part of any
|
||||
Adaptations or Collections, You must not distort, mutilate, modify or
|
||||
take other derogatory action in relation to the Work which would be
|
||||
prejudicial to the Original Author's honor or reputation. Licensor
|
||||
agrees that in those jurisdictions (e.g. Japan), in which any exercise
|
||||
of the right granted in Section 3(b) of this License (the right to
|
||||
make Adaptations) would be deemed to be a distortion, mutilation,
|
||||
modification or other derogatory action prejudicial to the Original
|
||||
Author's honor and reputation, the Licensor will waive or not assert,
|
||||
as appropriate, this Section, to the fullest extent permitted by the
|
||||
applicable national law, to enable You to reasonably exercise Your
|
||||
right under Section 3(b) of this License (right to make Adaptations)
|
||||
but not otherwise.
|
||||
|
||||
5. Representations, Warranties and Disclaimer
|
||||
|
||||
UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR
|
||||
OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY
|
||||
KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE,
|
||||
INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF
|
||||
LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS,
|
||||
WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION
|
||||
OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
|
||||
|
||||
6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE
|
||||
LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR
|
||||
ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES
|
||||
ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS
|
||||
BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
|
||||
|
||||
7. Termination
|
||||
|
||||
a. This License and the rights granted hereunder will terminate
|
||||
automatically upon any breach by You of the terms of this License.
|
||||
Individuals or entities who have received Adaptations or Collections
|
||||
from You under this License, however, will not have their licenses
|
||||
terminated provided such individuals or entities remain in full
|
||||
compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will
|
||||
survive any termination of this License.
|
||||
b. Subject to the above terms and conditions, the license granted here is
|
||||
perpetual (for the duration of the applicable copyright in the Work).
|
||||
Notwithstanding the above, Licensor reserves the right to release the
|
||||
Work under different license terms or to stop distributing the Work at
|
||||
any time; provided, however that any such election will not serve to
|
||||
withdraw this License (or any other license that has been, or is
|
||||
required to be, granted under the terms of this License), and this
|
||||
License will continue in full force and effect unless terminated as
|
||||
stated above.
|
||||
|
||||
8. Miscellaneous
|
||||
|
||||
a. Each time You Distribute or Publicly Perform the Work or a Collection,
|
||||
the Licensor offers to the recipient a license to the Work on the same
|
||||
terms and conditions as the license granted to You under this License.
|
||||
b. Each time You Distribute or Publicly Perform an Adaptation, Licensor
|
||||
offers to the recipient a license to the original Work on the same
|
||||
terms and conditions as the license granted to You under this License.
|
||||
c. If any provision of this License is invalid or unenforceable under
|
||||
applicable law, it shall not affect the validity or enforceability of
|
||||
the remainder of the terms of this License, and without further action
|
||||
by the parties to this agreement, such provision shall be reformed to
|
||||
the minimum extent necessary to make such provision valid and
|
||||
enforceable.
|
||||
d. No term or provision of this License shall be deemed waived and no
|
||||
breach consented to unless such waiver or consent shall be in writing
|
||||
and signed by the party to be charged with such waiver or consent.
|
||||
e. This License constitutes the entire agreement between the parties with
|
||||
respect to the Work licensed here. There are no understandings,
|
||||
agreements or representations with respect to the Work not specified
|
||||
here. Licensor shall not be bound by any additional provisions that
|
||||
may appear in any communication from You. This License may not be
|
||||
modified without the mutual written agreement of the Licensor and You.
|
||||
f. The rights granted under, and the subject matter referenced, in this
|
||||
License were drafted utilizing the terminology of the Berne Convention
|
||||
for the Protection of Literary and Artistic Works (as amended on
|
||||
September 28, 1979), the Rome Convention of 1961, the WIPO Copyright
|
||||
Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996
|
||||
and the Universal Copyright Convention (as revised on July 24, 1971).
|
||||
These rights and subject matter take effect in the relevant
|
||||
jurisdiction in which the License terms are sought to be enforced
|
||||
according to the corresponding provisions of the implementation of
|
||||
those treaty provisions in the applicable national law. If the
|
||||
standard suite of rights granted under applicable copyright law
|
||||
includes additional rights not granted under this License, such
|
||||
additional rights are deemed to be included in the License; this
|
||||
License is not intended to restrict the license of any rights under
|
||||
applicable law.
|
||||
|
||||
|
||||
Creative Commons Notice
|
||||
|
||||
Creative Commons is not a party to this License, and makes no warranty
|
||||
whatsoever in connection with the Work. Creative Commons will not be
|
||||
liable to You or any party on any legal theory for any damages
|
||||
whatsoever, including without limitation any general, special,
|
||||
incidental or consequential damages arising in connection to this
|
||||
license. Notwithstanding the foregoing two (2) sentences, if Creative
|
||||
Commons has expressly identified itself as the Licensor hereunder, it
|
||||
shall have all rights and obligations of Licensor.
|
||||
|
||||
Except for the limited purpose of indicating to the public that the
|
||||
Work is licensed under the CCPL, Creative Commons does not authorize
|
||||
the use by either party of the trademark "Creative Commons" or any
|
||||
related trademark or logo of Creative Commons without the prior
|
||||
written consent of Creative Commons. Any permitted use will be in
|
||||
compliance with Creative Commons' then-current trademark usage
|
||||
guidelines, as may be published on its website or otherwise made
|
||||
available upon request from time to time. For the avoidance of doubt,
|
||||
this trademark restriction does not form part of this License.
|
||||
|
||||
Creative Commons may be contacted at https://creativecommons.org/.
|
||||
317
hudi-spark/src/main/resources/META-INF/NOTICE.txt
Normal file
@@ -0,0 +1,317 @@
Apache HUDI
Copyright 2019 The Apache Software Foundation

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

This project includes:
aircompressor under Apache License 2.0
An open source Java toolkit for Amazon S3 under Apache License, Version 2.0
Annotation 1.0 under The Apache Software License, Version 2.0
ant under The Apache Software License, Version 2.0
ANTLR 3 Runtime under BSD licence
ANTLR 4 Runtime under The BSD License
ANTLR ST4 4.0.4 under BSD licence
AOP alliance under Public Domain
aopalliance version 1.0 repackaged as a module under CDDL + GPLv2 with classpath exception
Apache Ant Core under The Apache Software License, Version 2.0
Apache Ant Launcher under The Apache Software License, Version 2.0
Apache Avro under The Apache Software License, Version 2.0
Apache Avro IPC under The Apache Software License, Version 2.0
Apache Avro Mapred API under The Apache Software License, Version 2.0
Apache Calcite Avatica under Apache License, Version 2.0
Apache Calcite Avatica Metrics under Apache License, Version 2.0
Apache Commons Collections under Apache License, Version 2.0
Apache Commons Configuration under Apache License, Version 2.0
Apache Commons Crypto under Apache License, Version 2.0
Apache Commons IO under Apache License, Version 2.0
Apache Commons Lang under Apache License, Version 2.0
Apache Commons Logging under The Apache Software License, Version 2.0
Apache Curator under The Apache Software License, Version 2.0
Apache Derby Database Engine and Embedded JDBC Driver under Apache 2
Apache Directory API ASN.1 API under The Apache Software License, Version 2.0
Apache Directory LDAP API Utilities under The Apache Software License, Version 2.0
Apache Groovy under The Apache Software License, Version 2.0
Apache Hadoop Annotations under Apache License, Version 2.0
Apache Hadoop Auth under Apache License, Version 2.0
Apache Hadoop Client under Apache License, Version 2.0
Apache Hadoop Common under Apache License, Version 2.0
Apache Hadoop HDFS under Apache License, Version 2.0
Apache HBase - Annotations under Apache License, Version 2.0
Apache HBase - Client under Apache License, Version 2.0
Apache HBase - Protocol under Apache License, Version 2.0
Apache HttpClient under Apache License, Version 2.0
Apache HttpCore under Apache License, Version 2.0
Apache Ivy under The Apache Software License, Version 2.0
Apache Log4j under The Apache Software License, Version 2.0
Apache Log4j 1.x Compatibility API under The Apache Software License, Version 2.0
Apache Log4j API under The Apache Software License, Version 2.0
Apache Log4j Core under The Apache Software License, Version 2.0
Apache Log4j SLF4J Binding under The Apache Software License, Version 2.0
Apache Log4j Web under The Apache Software License, Version 2.0
Apache Parquet Avro under The Apache Software License, Version 2.0
Apache Parquet Avro (Incubating) under The Apache Software License, Version 2.0
Apache Parquet Column under The Apache Software License, Version 2.0
Apache Parquet Column (Incubating) under The Apache Software License, Version 2.0
Apache Parquet Common under The Apache Software License, Version 2.0
Apache Parquet Common (Incubating) under The Apache Software License, Version 2.0
Apache Parquet Encodings under The Apache Software License, Version 2.0
Apache Parquet Encodings (Incubating) under The Apache Software License, Version 2.0
Apache Parquet Format (Incubating) under The Apache Software License, Version 2.0
Apache Parquet Generator (Incubating) under The Apache Software License, Version 2.0
Apache Parquet Hadoop under The Apache Software License, Version 2.0
Apache Parquet Hadoop (Incubating) under The Apache Software License, Version 2.0
Apache Parquet Hadoop Bundle under The Apache Software License, Version 2.0
Apache Parquet Hadoop Bundle (Incubating) under The Apache Software License, Version 2.0
Apache Parquet Jackson under The Apache Software License, Version 2.0
Apache Parquet Jackson (Incubating) under The Apache Software License, Version 2.0
Apache Thrift under The Apache Software License, Version 2.0
Apache Twill API under The Apache Software License, Version 2.0
Apache Twill common library under The Apache Software License, Version 2.0
Apache Twill core library under The Apache Software License, Version 2.0
Apache Twill discovery service API under The Apache Software License, Version 2.0
Apache Twill discovery service implementations under The Apache Software License, Version 2.0
Apache Twill ZooKeeper client library under The Apache Software License, Version 2.0
Apache Velocity under The Apache Software License, Version 2.0
Apache XBean :: ASM 5 shaded (repackaged) under null or null
ApacheDS I18n under The Apache Software License, Version 2.0
ApacheDS Protocol Kerberos Codec under The Apache Software License, Version 2.0
ASM Commons under 3-Clause BSD License
ASM Core under 3-Clause BSD License
ASM Tree under 3-Clause BSD License
Bean Validation API under The Apache Software License, Version 2.0
BoneCP :: Core Library under Apache v2
Calcite Core under Apache License, Version 2.0
Calcite Druid under Apache License, Version 2.0
Calcite Linq4j under Apache License, Version 2.0
chill under Apache 2
chill-java under Apache 2
com.twitter.common:objectsize under Apache License, Version 2.0
Commons BeanUtils Core under The Apache Software License, Version 2.0
Commons CLI under The Apache Software License, Version 2.0
Commons Codec under The Apache Software License, Version 2.0
Commons Compiler under New BSD License
Commons Compress under The Apache Software License, Version 2.0
Commons Configuration under The Apache Software License, Version 2.0
Commons Daemon under The Apache Software License, Version 2.0
Commons DBCP under The Apache Software License, Version 2.0
Commons Lang under The Apache Software License, Version 2.0
Commons Math under The Apache Software License, Version 2.0
Commons Net under The Apache Software License, Version 2.0
Commons Pool under The Apache Software License, Version 2.0
commons-beanutils under Apache License
Compress-LZF under Apache License 2.0
Curator Client under The Apache Software License, Version 2.0
Curator Framework under The Apache Software License, Version 2.0
Curator Recipes under The Apache Software License, Version 2.0
Data Mapper for Jackson under The Apache Software License, Version 2.0
DataNucleus Core under The Apache Software License, Version 2.0
DataNucleus JDO API plugin under The Apache Software License, Version 2.0
DataNucleus RDBMS plugin under The Apache Software License, Version 2.0
Digester under The Apache Software License, Version 2.0
Disruptor Framework under The Apache Software License, Version 2.0
eigenbase-properties under Apache License, Version 2.0
EL under The Apache Software License, Version 2.0
empty under The Apache License, Version 2.0
fastutil under Apache License, Version 2.0
Findbugs Annotations under Apache License under Apache License, Version 2.0
FindBugs-jsr305 under The Apache Software License, Version 2.0
Fluent API for Apache HttpClient under Apache License, Version 2.0
Glassfish Jasper under CDDL 1.0
Glassfish Jasper API under Apache License Version 2.0
Google Guice - Core Library under The Apache Software License, Version 2.0
Google Guice - Extensions - AssistedInject under The Apache Software License, Version 2.0
Google Guice - Extensions - Servlet under The Apache Software License, Version 2.0
Graphite Integration for Metrics under Apache License 2.0
Gson under The Apache Software License, Version 2.0
Guava: Google Core Libraries for Java under The Apache Software License, Version 2.0
Hadoop Metrics2 Reporter for Dropwizard Metrics under Apache License, Version 2.0
hadoop-mapreduce-client-app under Apache License, Version 2.0
hadoop-mapreduce-client-common under Apache License, Version 2.0
hadoop-mapreduce-client-core under Apache License, Version 2.0
hadoop-mapreduce-client-jobclient under Apache License, Version 2.0
hadoop-mapreduce-client-shuffle under Apache License, Version 2.0
hadoop-yarn-api under Apache License, Version 2.0
hadoop-yarn-client under Apache License, Version 2.0
hadoop-yarn-common under Apache License, Version 2.0
hadoop-yarn-registry under Apache License, Version 2.0
hadoop-yarn-server-applicationhistoryservice under Apache License, Version 2.0
hadoop-yarn-server-common under Apache License, Version 2.0
hadoop-yarn-server-resourcemanager under Apache License, Version 2.0
hadoop-yarn-server-web-proxy under Apache License, Version 2.0
Hamcrest Core under BSD style
HBase - Common under The Apache Software License, Version 2.0
HBase - Hadoop Compatibility under The Apache Software License, Version 2.0
HBase - Hadoop Two Compatibility under The Apache Software License, Version 2.0
HBase - Prefix Tree under The Apache Software License, Version 2.0
HBase - Procedure under The Apache Software License, Version 2.0
HBase - Server under The Apache Software License, Version 2.0
HikariCP under The Apache Software License, Version 2.0
Hive Common under The Apache Software License, Version 2.0
Hive JDBC under The Apache Software License, Version 2.0
Hive Llap Client under The Apache Software License, Version 2.0
Hive Llap Common under The Apache Software License, Version 2.0
Hive Llap Server under The Apache Software License, Version 2.0
Hive Llap Tez under The Apache Software License, Version 2.0
Hive Metastore under The Apache Software License, Version 2.0
Hive Query Language under The Apache Software License, Version 2.0
Hive Serde under The Apache Software License, Version 2.0
Hive Service under The Apache Software License, Version 2.0
Hive Service RPC under The Apache Software License, Version 2.0
Hive Shims under The Apache Software License, Version 2.0
Hive Shims 0.23 under The Apache Software License, Version 2.0
Hive Shims Common under The Apache Software License, Version 2.0
Hive Shims Scheduler under The Apache Software License, Version 2.0
Hive Storage API under Apache License, Version 2.0
Hive Vector-Code-Gen Utilities under The Apache Software License, Version 2.0
HK2 API module under CDDL + GPLv2 with classpath exception
HK2 Implementation Utilities under CDDL + GPLv2 with classpath exception
hoodie-client under Apache License, Version 2.0
hoodie-common under Apache License, Version 2.0
hoodie-hadoop-mr under Apache License, Version 2.0
hoodie-hive under Apache License, Version 2.0
hoodie-spark under Apache License, Version 2.0
hoodie-timeline-service under Apache License, Version 2.0
htrace-core under The Apache Software License, Version 2.0
HttpClient under Apache License
IntelliJ IDEA Annotations under The Apache Software License, Version 2.0
Jackson under The Apache Software License, Version 2.0
Jackson Integration for Metrics under Apache License 2.0
Jackson-annotations under The Apache Software License, Version 2.0
Jackson-core under The Apache Software License, Version 2.0
jackson-databind under The Apache Software License, Version 2.0
Jackson-module-paranamer under The Apache Software License, Version 2.0
jackson-module-scala under The Apache Software License, Version 2.0
jamon-runtime under Mozilla Public License Version 1.1
Janino under New BSD License
jasper-compiler under The Apache Software License, Version 2.0
jasper-runtime under The Apache Software License, Version 2.0
Java Authentication SPI for Containers under The Apache Software License, Version 2.0
Java Servlet API under CDDL + GPLv2 with classpath exception
java-xmlbuilder under Apache License, Version 2.0
JavaBeans Activation Framework (JAF) under Common Development and Distribution License (CDDL) v1.0
Javalin under The Apache Software License, Version 2.0
JavaMail API under Common Development and Distribution License (CDDL) v1.0
Javassist under MPL 1.1 or LGPL 2.1 or Apache License 2.0
javax.annotation API under CDDL + GPLv2 with classpath exception
javax.inject under The Apache Software License, Version 2.0
javax.inject:1 as OSGi bundle under CDDL + GPLv2 with classpath exception
javax.ws.rs-api under CDDL 1.1 or GPL2 w/ CPE
Javolution under BSD License
JAX-RS provider for JSON content type under The Apache Software License, Version 2.0 or GNU Lesser General Public License (LGPL), Version 2.1
JAXB RI under CDDL 1.1 or GPL2 w/ CPE
JCL 1.1.1 implemented over SLF4J under MIT License
JCodings under MIT License
jcommander under Apache 2.0
JDO API under Apache 2
jersey-client under CDDL 1.1 or GPL2 w/ CPE
jersey-container-servlet under CDDL+GPL License
jersey-container-servlet-core under CDDL+GPL License
jersey-core under CDDL 1.1 or GPL2 w/ CPE
jersey-core-client under CDDL+GPL License
jersey-core-common under CDDL+GPL License
jersey-core-server under CDDL+GPL License
jersey-guice under CDDL 1.1 or GPL2 w/ CPE
jersey-json under CDDL 1.1 or GPL2 w/ CPE
jersey-media-jaxb under CDDL+GPL License
jersey-repackaged-guava under CDDL+GPL License
jersey-server under CDDL 1.1 or GPL2 w/ CPE
Jettison under Apache License, Version 2.0
Jetty :: Aggregate :: All core Jetty under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Asynchronous HTTP Client under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Http Utility under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: IO Utility under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Security under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Server Core under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Servlet Handling under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Utilities under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Webapp Application Support under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Websocket :: API under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Websocket :: Client under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Websocket :: Common under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Websocket :: Server under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: Websocket :: Servlet Interface under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty :: XML utilities under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty Orbit :: Servlet API under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty Server under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Jetty SSLEngine under Apache License Version 2
Jetty Utilities under Apache Software License - Version 2.0 or Eclipse Public License - Version 1.0
Joda-Time under Apache 2
Joni under MIT License
JPam under The Apache Software License, Version 2.0
JSch under BSD
json4s-ast under ASL
json4s-core under ASL
json4s-jackson under ASL
jsp-api under CDDL
JTA 1.1 under The Apache Software License, Version 2.0
JUL to SLF4J bridge under MIT License
JUnit under Common Public License Version 1.0
JVM Integration for Metrics under Apache License 2.0
Kryo Shaded under 3-Clause BSD License
leveldbjni-all under The BSD 3-Clause License
LZ4 and xxHash under The Apache Software License, Version 2.0
Metrics Core under Apache License 2.0
Metrics Core Library under Apache License 2.0
MinLog under New BSD License
Netty/All-in-One under Apache License, Version 2.0
Objenesis under Apache 2
Open JSON under The Apache Software License, Version 2.0
opencsv under Apache 2
ORC Core under Apache License, Version 2.0
org.jetbrains.kotlin:kotlin-stdlib under The Apache License, Version 2.0
org.jetbrains.kotlin:kotlin-stdlib-common under The Apache License, Version 2.0
org.jetbrains.kotlin:kotlin-stdlib-jdk7 under The Apache License, Version 2.0
org.jetbrains.kotlin:kotlin-stdlib-jdk8 under The Apache License, Version 2.0
org.pentaho:pentaho-aggdesigner-algorithm under Apache License, Version 2.0
oro under Apache License, Version 2.0
OSGi resource locator bundle - used by various API providers that rely on META-INF/services mechanism to locate providers. under CDDL + GPLv2 with classpath exception
ParaNamer Core under BSD
Protocol Buffer Java API under New BSD license
Py4J under The New BSD License
pyrolite under MIT License
RabbitMQ Java Client under ASL 2.0 or GPL v2 or MPL 1.1
RoaringBitmap under Apache 2
RocksDB JNI under Apache License 2.0 or GNU General Public License, version 2
Scala Compiler under BSD 3-Clause
Scala Library under BSD 3-Clause
scala-parser-combinators under BSD 3-clause
scala-xml under BSD 3-clause
scalactic under the Apache License, ASL Version 2.0
Scalap under BSD 3-Clause
scalatest under the Apache License, ASL Version 2.0
ServiceLocator Default Implementation under CDDL + GPLv2 with classpath exception
Servlet Specification 2.5 API under CDDL 1.0
Servlet Specification API under Apache License Version 2.0
servlet-api under CDDL
SLF4J API Module under MIT License
SLF4J LOG4J-12 Binding under MIT License
Slider Core under Apache License, Version 2.0
Snappy for Java under The Apache Software License, Version 2.0
Spark Project Catalyst under Apache 2.0 License
Spark Project Core under Apache 2.0 License
Spark Project Launcher under Apache 2.0 License
Spark Project Networking under Apache 2.0 License
Spark Project Shuffle Streaming Service under Apache 2.0 License
Spark Project Sketch under Apache 2.0 License
Spark Project SQL under Apache 2.0 License
Spark Project Tags under Apache 2.0 License
Spark Project Unsafe under Apache 2.0 License
spark-avro under Apache-2.0
StAX API under The Apache Software License, Version 2.0
stream-lib under Apache License, Version 2.0
Tephra API under The Apache Software License, Version 2.0
Tephra Core under The Apache Software License, Version 2.0
Tephra HBase 1.0 Compatibility under The Apache Software License, Version 2.0
The Netty Project under Apache License, Version 2.0
univocity-parsers under Apache 2
Xerces2 Java Parser under The Apache Software License, Version 2.0
XML Commons External Components XML APIs under The Apache Software License, Version 2.0
Xml Compatibility extensions for Jackson under The Apache Software License, Version 2.0 or GNU Lesser General Public License (LGPL), Version 2.1
xmlenc Library under The BSD License
XZ for Java under Public Domain
zookeeper under Apache License, Version 2.0

@@ -0,0 +1,353 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import java.nio.ByteBuffer
import java.sql.{Date, Timestamp}
import java.util

import com.databricks.spark.avro.SchemaConverters
import com.databricks.spark.avro.SchemaConverters.IncompatibleSchemaException
import org.apache.avro.Schema.Type._
import org.apache.avro.generic.GenericData.{Fixed, Record}
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

import scala.collection.JavaConverters._


object AvroConversionUtils {

  def createRdd(df: DataFrame, structName: String, recordNamespace: String): RDD[GenericRecord] = {
    val dataType = df.schema
    val encoder = RowEncoder.apply(dataType).resolveAndBind()
    df.queryExecution.toRdd.map(encoder.fromRow)
      .mapPartitions { records =>
        if (records.isEmpty) Iterator.empty
        else {
          val convertor = createConverterToAvro(dataType, structName, recordNamespace)
          records.map { x => convertor(x).asInstanceOf[GenericRecord] }
        }
      }
  }

  def createDataFrame(rdd: RDD[GenericRecord], schemaStr: String, ss: SparkSession): Dataset[Row] = {
    if (rdd.isEmpty()) {
      ss.emptyDataFrame
    } else {
      ss.createDataFrame(rdd.mapPartitions { records =>
        if (records.isEmpty) Iterator.empty
        else {
          val schema = Schema.parse(schemaStr)
          val dataType = convertAvroSchemaToStructType(schema)
          val convertor = createConverterToRow(schema, dataType)
          records.map { x => convertor(x).asInstanceOf[Row] }
        }
      }, convertAvroSchemaToStructType(Schema.parse(schemaStr))).asInstanceOf[Dataset[Row]]
    }
  }

  def getNewRecordNamespace(elementDataType: DataType,
                            currentRecordNamespace: String,
                            elementName: String): String = {

    elementDataType match {
      case StructType(_) => s"$currentRecordNamespace.$elementName"
      case _ => currentRecordNamespace
    }
  }

  /**
    * NOTE : This part of code is copied from com.databricks.spark.avro.SchemaConverters.scala (133:310) (spark-avro)
    *
    * Returns a converter function to convert row in avro format to GenericRow of catalyst.
    *
    * @param sourceAvroSchema Source schema before conversion inferred from avro file by passed in
    *                         by user.
    * @param targetSqlType Target catalyst sql type after the conversion.
    * @return returns a converter function to convert row in avro format to GenericRow of catalyst.
    */
  def createConverterToRow(sourceAvroSchema: Schema,
                           targetSqlType: DataType): AnyRef => AnyRef = {

    def createConverter(avroSchema: Schema,
                        sqlType: DataType, path: List[String]): AnyRef => AnyRef = {
      val avroType = avroSchema.getType
      (sqlType, avroType) match {
        // Avro strings are in Utf8, so we have to call toString on them
        case (StringType, STRING) | (StringType, ENUM) =>
          (item: AnyRef) => if (item == null) null else item.toString
        // Byte arrays are reused by avro, so we have to make a copy of them.
        case (IntegerType, INT) | (BooleanType, BOOLEAN) | (DoubleType, DOUBLE) |
             (FloatType, FLOAT) | (LongType, LONG) =>
          identity
        case (BinaryType, FIXED) =>
          (item: AnyRef) =>
            if (item == null) {
              null
            } else {
              item.asInstanceOf[Fixed].bytes().clone()
            }
        case (BinaryType, BYTES) =>
          (item: AnyRef) =>
            if (item == null) {
              null
            } else {
              val byteBuffer = item.asInstanceOf[ByteBuffer]
              val bytes = new Array[Byte](byteBuffer.remaining)
              byteBuffer.get(bytes)
              bytes
            }

        case (struct: StructType, RECORD) =>
          val length = struct.fields.length
          val converters = new Array[AnyRef => AnyRef](length)
          val avroFieldIndexes = new Array[Int](length)
          var i = 0
          while (i < length) {
            val sqlField = struct.fields(i)
            val avroField = avroSchema.getField(sqlField.name)
            if (avroField != null) {
              val converter = createConverter(avroField.schema(), sqlField.dataType,
                path :+ sqlField.name)
              converters(i) = converter
              avroFieldIndexes(i) = avroField.pos()
            } else if (!sqlField.nullable) {
              throw new IncompatibleSchemaException(
                s"Cannot find non-nullable field ${sqlField.name} at path ${path.mkString(".")} " +
                  "in Avro schema\n" +
                  s"Source Avro schema: $sourceAvroSchema.\n" +
                  s"Target Catalyst type: $targetSqlType")
            }
            i += 1
          }

          (item: AnyRef) => {
            if (item == null) {
              null
            } else {
              val record = item.asInstanceOf[GenericRecord]

              val result = new Array[Any](length)
              var i = 0
              while (i < converters.length) {
                if (converters(i) != null) {
                  val converter = converters(i)
                  result(i) = converter(record.get(avroFieldIndexes(i)))
                }
                i += 1
              }
              new GenericRow(result)
            }
          }
        case (arrayType: ArrayType, ARRAY) =>
          val elementConverter = createConverter(avroSchema.getElementType, arrayType.elementType,
            path)
          val allowsNull = arrayType.containsNull
          (item: AnyRef) => {
            if (item == null) {
              null
            } else {
              item.asInstanceOf[java.lang.Iterable[AnyRef]].asScala.map { element =>
                if (element == null && !allowsNull) {
                  throw new RuntimeException(s"Array value at path ${path.mkString(".")} is not " +
                    "allowed to be null")
                } else {
                  elementConverter(element)
                }
              }
            }
          }
        case (mapType: MapType, MAP) if mapType.keyType == StringType =>
          val valueConverter = createConverter(avroSchema.getValueType, mapType.valueType, path)
          val allowsNull = mapType.valueContainsNull
          (item: AnyRef) => {
            if (item == null) {
              null
            } else {
              item.asInstanceOf[java.util.Map[AnyRef, AnyRef]].asScala.map { x =>
                if (x._2 == null && !allowsNull) {
                  throw new RuntimeException(s"Map value at path ${path.mkString(".")} is not " +
                    "allowed to be null")
                } else {
                  (x._1.toString, valueConverter(x._2))
                }
              }.toMap
            }
          }
        case (sqlType, UNION) =>
          if (avroSchema.getTypes.asScala.exists(_.getType == NULL)) {
            val remainingUnionTypes = avroSchema.getTypes.asScala.filterNot(_.getType == NULL)
            if (remainingUnionTypes.size == 1) {
              createConverter(remainingUnionTypes.head, sqlType, path)
            } else {
              createConverter(Schema.createUnion(remainingUnionTypes.asJava), sqlType, path)
            }
          } else avroSchema.getTypes.asScala.map(_.getType) match {
            case Seq(t1) => createConverter(avroSchema.getTypes.get(0), sqlType, path)
            case Seq(a, b) if Set(a, b) == Set(INT, LONG) && sqlType == LongType =>
              (item: AnyRef) => {
                item match {
                  case null => null
                  case l: java.lang.Long => l
                  case i: java.lang.Integer => new java.lang.Long(i.longValue())
                }
              }
            case Seq(a, b) if Set(a, b) == Set(FLOAT, DOUBLE) && sqlType == DoubleType =>
              (item: AnyRef) => {
                item match {
                  case null => null
                  case d: java.lang.Double => d
                  case f: java.lang.Float => new java.lang.Double(f.doubleValue())
                }
              }
            case other =>
              sqlType match {
                case t: StructType if t.fields.length == avroSchema.getTypes.size =>
                  val fieldConverters = t.fields.zip(avroSchema.getTypes.asScala).map {
                    case (field, schema) =>
                      createConverter(schema, field.dataType, path :+ field.name)
                  }

                  (item: AnyRef) => if (item == null) {
                    null
                  } else {
                    val i = GenericData.get().resolveUnion(avroSchema, item)
                    val converted = new Array[Any](fieldConverters.length)
                    converted(i) = fieldConverters(i)(item)
                    new GenericRow(converted)
                  }
                case _ => throw new IncompatibleSchemaException(
                  s"Cannot convert Avro schema to catalyst type because schema at path " +
                    s"${path.mkString(".")} is not compatible " +
                    s"(avroType = $other, sqlType = $sqlType). \n" +
                    s"Source Avro schema: $sourceAvroSchema.\n" +
                    s"Target Catalyst type: $targetSqlType")
              }
          }
        case (left, right) =>
          throw new IncompatibleSchemaException(
            s"Cannot convert Avro schema to catalyst type because schema at path " +
              s"${path.mkString(".")} is not compatible (avroType = $left, sqlType = $right). \n" +
              s"Source Avro schema: $sourceAvroSchema.\n" +
              s"Target Catalyst type: $targetSqlType")
      }
    }
    createConverter(sourceAvroSchema, targetSqlType, List.empty[String])
  }

  def createConverterToAvro(dataType: DataType,
                            structName: String,
                            recordNamespace: String): Any => Any = {
    dataType match {
      case BinaryType => (item: Any) =>
        item match {
          case null => null
          case bytes: Array[Byte] => ByteBuffer.wrap(bytes)
        }
      case IntegerType | LongType |
           FloatType | DoubleType | StringType | BooleanType => identity
      case ByteType => (item: Any) =>
        if (item == null) null else item.asInstanceOf[Byte].intValue
      case ShortType => (item: Any) =>
        if (item == null) null else item.asInstanceOf[Short].intValue
      case _: DecimalType => (item: Any) => if (item == null) null else item.toString
      case TimestampType => (item: Any) =>
        if (item == null) null else item.asInstanceOf[Timestamp].getTime
      case DateType => (item: Any) =>
        if (item == null) null else item.asInstanceOf[Date].getTime
      case ArrayType(elementType, _) =>
        val elementConverter = createConverterToAvro(
          elementType,
          structName,
          getNewRecordNamespace(elementType, recordNamespace, structName))
        (item: Any) => {
          if (item == null) {
            null
          } else {
            val sourceArray = item.asInstanceOf[Seq[Any]]
            val sourceArraySize = sourceArray.size
            val targetList = new util.ArrayList[Any](sourceArraySize)
            var idx = 0
            while (idx < sourceArraySize) {
              targetList.add(elementConverter(sourceArray(idx)))
              idx += 1
            }
            targetList
          }
        }
      case MapType(StringType, valueType, _) =>
        val valueConverter = createConverterToAvro(
          valueType,
          structName,
          getNewRecordNamespace(valueType, recordNamespace, structName))
        (item: Any) => {
          if (item == null) {
            null
          } else {
            val javaMap = new util.HashMap[String, Any]()
            item.asInstanceOf[Map[String, Any]].foreach { case (key, value) =>
              javaMap.put(key, valueConverter(value))
            }
            javaMap
          }
        }
      case structType: StructType =>
        val builder = SchemaBuilder.record(structName).namespace(recordNamespace)
        val schema: Schema = SchemaConverters.convertStructToAvro(
          structType, builder, recordNamespace)
        val fieldConverters = structType.fields.map(field =>
          createConverterToAvro(
            field.dataType,
            field.name,
            getNewRecordNamespace(field.dataType, recordNamespace, field.name)))
        (item: Any) => {
          if (item == null) {
            null
          } else {
            val record = new Record(schema)
            val convertersIterator = fieldConverters.iterator
            val fieldNamesIterator = dataType.asInstanceOf[StructType].fieldNames.iterator
            val rowIterator = item.asInstanceOf[Row].toSeq.iterator

            while (convertersIterator.hasNext) {
              val converter = convertersIterator.next()
              record.put(fieldNamesIterator.next(), converter(rowIterator.next()))
            }
            record
          }
        }
    }
  }

  def convertStructTypeToAvroSchema(structType: StructType,
                                    structName: String,
                                    recordNamespace: String): Schema = {
    val builder = SchemaBuilder.record(structName).namespace(recordNamespace)
    SchemaConverters.convertStructToAvro(structType, builder, recordNamespace)
  }

  def convertAvroSchemaToStructType(avroSchema: Schema): StructType = {
    SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType]
  }
}
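A minimal usage sketch for the helpers above, assuming an active SparkSession named `spark`; the record name, namespace, and DataFrame are illustrative, not part of this change:

// Sketch: DataFrame -> RDD[GenericRecord] -> DataFrame via AvroConversionUtils.
import org.apache.avro.generic.GenericRecord
import org.apache.spark.rdd.RDD

val df = spark.range(0, 10).toDF("id")
// Convert Rows to Avro records under a chosen record name/namespace
val avroRdd: RDD[GenericRecord] =
  AvroConversionUtils.createRdd(df, "example_record", "hudi.example")
// Derive the Avro schema string from the Catalyst schema, then convert back
val schemaStr = AvroConversionUtils
  .convertStructTypeToAvroSchema(df.schema, "example_record", "hudi.example").toString
val roundTripped = AvroConversionUtils.createDataFrame(avroRdd, schemaStr, spark)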
@@ -0,0 +1,208 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.common.model.HoodieTableType
import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor

/**
  * List of options that can be passed to the Hoodie datasource,
  * in addition to the hoodie client configs
  */

/**
  * Options supported for reading hoodie datasets.
  */
object DataSourceReadOptions {
  /**
    * Whether data needs to be read in:
    * incremental mode (new data since an instantTime),
    * (or) Read Optimized mode (obtain latest view, based on columnar data),
    * (or) Real time mode (obtain latest view, based on row & columnar data).
    *
    * Default: READ_OPTIMIZED
    */
  val VIEW_TYPE_OPT_KEY = "hoodie.datasource.view.type"
  val VIEW_TYPE_READ_OPTIMIZED_OPT_VAL = "read_optimized"
  val VIEW_TYPE_INCREMENTAL_OPT_VAL = "incremental"
  val VIEW_TYPE_REALTIME_OPT_VAL = "realtime"
  val DEFAULT_VIEW_TYPE_OPT_VAL = VIEW_TYPE_READ_OPTIMIZED_OPT_VAL
  val DEFAULT_PUSH_DOWN_FILTERS_OPT_VAL = ""

  /**
    * Instant time to start incrementally pulling data from. The instanttime here need not
    * necessarily correspond to an instant on the timeline. New data written with an
    * `instant_time > BEGIN_INSTANTTIME` are fetched out. For e.g: '20170901080000' will get
    * all new data written after Sep 1, 2017 08:00AM.
    *
    * Default: None (Mandatory in incremental mode)
    */
  val BEGIN_INSTANTTIME_OPT_KEY = "hoodie.datasource.read.begin.instanttime"


  /**
    * Instant time to limit incrementally fetched data to. New data written with an
    * `instant_time <= END_INSTANTTIME` are fetched out.
    *
    * Default: latest instant (i.e fetches all new data since begin instant time)
    *
    */
  val END_INSTANTTIME_OPT_KEY = "hoodie.datasource.read.end.instanttime"

  /**
    * For use-cases like DeltaStreamer, which reads from a Hoodie Incremental table and applies opaque map functions,
    * filters appearing late in the sequence of transformations cannot be automatically pushed down.
    * This option allows setting such filters directly on the Hoodie Source.
    */
  val PUSH_DOWN_INCR_FILTERS_OPT_KEY = "hoodie.datasource.read.incr.filters"
}
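// Illustrative usage of the read options above (a sketch, not part of this file):
// an incremental pull, assuming an active SparkSession `spark` and a dataset base path.
//
//   val incrDf = spark.read
//     .format("org.apache.hudi")
//     .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY,
//       DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL)
//     .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20170901080000")
//     .load("/path/to/hudi/dataset")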
|
||||
/**
|
||||
* Options supported for writing hoodie datasets.
|
||||
*/
|
||||
object DataSourceWriteOptions {
|
||||
/**
|
||||
* The client operation, that this write should do
|
||||
*
|
||||
* Default: upsert()
|
||||
*/
|
||||
val OPERATION_OPT_KEY = "hoodie.datasource.write.operation"
|
||||
val BULK_INSERT_OPERATION_OPT_VAL = "bulk_insert"
|
||||
val INSERT_OPERATION_OPT_VAL = "insert"
|
||||
val UPSERT_OPERATION_OPT_VAL = "upsert"
|
||||
val DEFAULT_OPERATION_OPT_VAL = UPSERT_OPERATION_OPT_VAL
|
||||
|
||||
/**
|
||||
* The storage type for the underlying data, for this write.
|
||||
* Note that this can't change across writes.
|
||||
*
|
||||
* Default: COPY_ON_WRITE
|
||||
*/
|
||||
val STORAGE_TYPE_OPT_KEY = "hoodie.datasource.write.storage.type"
|
||||
val COW_STORAGE_TYPE_OPT_VAL = HoodieTableType.COPY_ON_WRITE.name
|
||||
val MOR_STORAGE_TYPE_OPT_VAL = HoodieTableType.MERGE_ON_READ.name
|
||||
val DEFAULT_STORAGE_TYPE_OPT_VAL = COW_STORAGE_TYPE_OPT_VAL
|
||||
|
||||
/**
|
||||
* Hive table name, to register the dataset into.
|
||||
*
|
||||
* Default: None (mandatory)
|
||||
*/
|
||||
val TABLE_NAME_OPT_KEY = "hoodie.datasource.write.table.name"
|
||||
|
||||
/**
|
||||
* Field used in preCombining before actual write. When two records have the same
|
||||
   * key value, we will pick the one with the largest value for the precombine field,
   * determined by Object.compareTo(..)
   */
  val PRECOMBINE_FIELD_OPT_KEY = "hoodie.datasource.write.precombine.field"
  val DEFAULT_PRECOMBINE_FIELD_OPT_VAL = "ts"

  /**
   * Payload class used. Override this, if you want to roll your own merge logic when upserting/inserting.
   * This will render any value set for `PRECOMBINE_FIELD_OPT_VAL` ineffective.
   */
  val PAYLOAD_CLASS_OPT_KEY = "hoodie.datasource.write.payload.class"
  val DEFAULT_PAYLOAD_OPT_VAL = classOf[OverwriteWithLatestAvroPayload].getName

  /**
   * Record key field. Value to be used as the `recordKey` component of `HoodieKey`. The actual value
   * will be obtained by invoking .toString() on the field value. Nested fields can be specified using
   * the dot notation, e.g. `a.b.c`.
   */
  val RECORDKEY_FIELD_OPT_KEY = "hoodie.datasource.write.recordkey.field"
  val DEFAULT_RECORDKEY_FIELD_OPT_VAL = "uuid"

  /**
   * Partition path field. Value to be used as the `partitionPath` component of `HoodieKey`. The actual
   * value is obtained by invoking .toString() on the field value.
   */
  val PARTITIONPATH_FIELD_OPT_KEY = "hoodie.datasource.write.partitionpath.field"
  val DEFAULT_PARTITIONPATH_FIELD_OPT_VAL = "partitionpath"

  /**
   * Key generator class, that implements how to extract the key out of an incoming record.
   */
  val KEYGENERATOR_CLASS_OPT_KEY = "hoodie.datasource.write.keygenerator.class"
  val DEFAULT_KEYGENERATOR_CLASS_OPT_VAL = classOf[SimpleKeyGenerator].getName

  /**
   * Option keys beginning with this prefix are automatically added to the commit/deltacommit metadata.
   * This is useful to store checkpointing information in a way consistent with the hoodie timeline.
   */
  val COMMIT_METADATA_KEYPREFIX_OPT_KEY = "hoodie.datasource.write.commitmeta.key.prefix"
  val DEFAULT_COMMIT_METADATA_KEYPREFIX_OPT_VAL = "_"

  /**
   * Flag to indicate whether to drop duplicates upon insert.
   * By default insert will accept duplicates, to gain extra performance.
   */
  val INSERT_DROP_DUPS_OPT_KEY = "hoodie.datasource.write.insert.drop.duplicates"
  val DEFAULT_INSERT_DROP_DUPS_OPT_VAL = "false"

  /**
   * Number of times a streaming job should retry a failed microbatch.
   * By default 3.
   */
  val STREAMING_RETRY_CNT_OPT_KEY = "hoodie.datasource.write.streaming.retry.count"
  val DEFAULT_STREAMING_RETRY_CNT_OPT_VAL = "3"

  /**
   * How long (in milliseconds) to wait before retrying a failed microbatch.
   * By default 2000; the interval is doubled on every retry.
   */
  val STREAMING_RETRY_INTERVAL_MS_OPT_KEY = "hoodie.datasource.write.streaming.retry.interval.ms"
  val DEFAULT_STREAMING_RETRY_INTERVAL_MS_OPT_VAL = "2000"

  /**
   * Flag to indicate whether to ignore any non-exception error (e.g. writestatus error)
   * within a streaming microbatch.
   * By default true (in favor of streaming progressing over data integrity).
   */
  val STREAMING_IGNORE_FAILED_BATCH_OPT_KEY = "hoodie.datasource.write.streaming.ignore.failed.batch"
  val DEFAULT_STREAMING_IGNORE_FAILED_BATCH_OPT_VAL = "true"

  // HIVE SYNC SPECIFIC CONFIGS
  // NOTE: DO NOT USE uppercase for the keys, as they are internally lower-cased. Using uppercase causes
  // unexpected issues with configs getting reset.
  val HIVE_SYNC_ENABLED_OPT_KEY = "hoodie.datasource.hive_sync.enable"
  val HIVE_DATABASE_OPT_KEY = "hoodie.datasource.hive_sync.database"
  val HIVE_TABLE_OPT_KEY = "hoodie.datasource.hive_sync.table"
  val HIVE_USER_OPT_KEY = "hoodie.datasource.hive_sync.username"
  val HIVE_PASS_OPT_KEY = "hoodie.datasource.hive_sync.password"
  val HIVE_URL_OPT_KEY = "hoodie.datasource.hive_sync.jdbcurl"
  val HIVE_PARTITION_FIELDS_OPT_KEY = "hoodie.datasource.hive_sync.partition_fields"
  val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = "hoodie.datasource.hive_sync.partition_extractor_class"
  val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = "hoodie.datasource.hive_sync.assume_date_partitioning"
  val HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY = "hoodie.datasource.hive_sync.use_pre_apache_input_format"

  // DEFAULTS FOR HIVE SYNC SPECIFIC CONFIGS
  val DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL = "false"
  val DEFAULT_HIVE_DATABASE_OPT_VAL = "default"
  val DEFAULT_HIVE_TABLE_OPT_VAL = "unknown"
  val DEFAULT_HIVE_USER_OPT_VAL = "hive"
  val DEFAULT_HIVE_PASS_OPT_VAL = "hive"
  val DEFAULT_HIVE_URL_OPT_VAL = "jdbc:hive2://localhost:10000"
  val DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL = ""
  val DEFAULT_HIVE_PARTITION_EXTRACTOR_CLASS_OPT_VAL = classOf[SlashEncodedDayPartitionValueExtractor].getCanonicalName
  val DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL = "false"
  val DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL = "false"
}
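Taken together, these options drive the write path implemented in the files below. A minimal sketch of a batch upsert through the renamed datasource follows; the DataFrame `df`, the table name, and the path are hypothetical placeholders, and spark.serializer must be set to the KryoSerializer, which the writer enforces:

import org.apache.hudi.config.HoodieWriteConfig
import org.apache.spark.sql.SaveMode

// Sketch (not from this patch): upsert a DataFrame whose rows carry
// uuid/partitionpath/ts columns matching the option values below.
df.write.format("org.apache.hudi")
  .option("hoodie.datasource.write.recordkey.field", "uuid")
  .option("hoodie.datasource.write.partitionpath.field", "partitionpath")
  .option("hoodie.datasource.write.precombine.field", "ts")
  .option(HoodieWriteConfig.TABLE_NAME, "my_table")
  .mode(SaveMode.Append)
  .save("/tmp/hudi/my_table")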
hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala
@@ -0,0 +1,108 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.hadoop.HoodieROTablePathFilter
import org.apache.log4j.LogManager
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

/**
 * Hoodie Spark Datasource, for reading and writing hoodie datasets.
 */
class DefaultSource extends RelationProvider
  with SchemaRelationProvider
  with CreatableRelationProvider
  with DataSourceRegister
  with StreamSinkProvider
  with Serializable {

  private val log = LogManager.getLogger(classOf[DefaultSource])

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): BaseRelation = {
    createRelation(sqlContext, parameters, null)
  }

  override def createRelation(sqlContext: SQLContext,
                              optParams: Map[String, String],
                              schema: StructType): BaseRelation = {
    // Add default options for unspecified read options keys.
    val parameters = Map(VIEW_TYPE_OPT_KEY -> DEFAULT_VIEW_TYPE_OPT_VAL) ++: optParams

    val path = parameters.get("path")
    if (path.isEmpty) {
      throw new HoodieException("'path' must be specified.")
    }

    if (parameters(VIEW_TYPE_OPT_KEY).equals(VIEW_TYPE_REALTIME_OPT_VAL)) {
      throw new HoodieException("Realtime view not supported yet via data source. Please use HiveContext route.")
    }

    if (parameters(VIEW_TYPE_OPT_KEY).equals(VIEW_TYPE_INCREMENTAL_OPT_VAL)) {
      new IncrementalRelation(sqlContext, path.get, optParams, schema)
    } else {
      // this is effectively the RO view only, where `path` can contain a mix of
      // non-hoodie/hoodie path files. set the path filter up
      sqlContext.sparkContext.hadoopConfiguration.setClass(
        "mapreduce.input.pathFilter.class",
        classOf[HoodieROTablePathFilter],
        classOf[org.apache.hadoop.fs.PathFilter])

      log.info("Constructing hoodie (as parquet) data source with options: " + parameters)
      // simply return as a regular parquet relation
      DataSource.apply(
        sparkSession = sqlContext.sparkSession,
        userSpecifiedSchema = Option(schema),
        className = "parquet",
        options = parameters)
        .resolveRelation()
    }
  }

  override def createRelation(sqlContext: SQLContext,
                              mode: SaveMode,
                              optParams: Map[String, String],
                              df: DataFrame): BaseRelation = {
    val parameters = HoodieSparkSqlWriter.parametersWithWriteDefaults(optParams)
    HoodieSparkSqlWriter.write(sqlContext, mode, parameters, df)
    createRelation(sqlContext, parameters, df.schema)
  }

  override def createSink(sqlContext: SQLContext,
                          optParams: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink = {
    val parameters = HoodieSparkSqlWriter.parametersWithWriteDefaults(optParams)
    new HoodieStreamingSink(
      sqlContext,
      parameters,
      partitionColumns,
      outputMode)
  }

  override def shortName(): String = "hoodie"
}
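For completeness, a sketch of the read path this class serves (assuming a SparkSession named `spark`; the globbed path is a hypothetical placeholder): with no view type set, the default applies and the relation resolves to a plain parquet scan guarded by HoodieROTablePathFilter.

// Sketch: read-optimized view, resolved as a regular parquet relation.
val roViewDf = spark.read.format("org.apache.hudi")
  .load("/tmp/hudi/my_table/*/*")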
@@ -0,0 +1,256 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import java.util

import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.util.{FSUtils, TypedProperties}
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool}
import org.apache.log4j.LogManager
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer

private[hudi] object HoodieSparkSqlWriter {

  private val log = LogManager.getLogger("HoodieSparkSQLWriter")

  def write(sqlContext: SQLContext,
            mode: SaveMode,
            parameters: Map[String, String],
            df: DataFrame): (Boolean, common.util.Option[String]) = {

    val sparkContext = sqlContext.sparkContext
    val path = parameters.get("path")
    val tblName = parameters.get(HoodieWriteConfig.TABLE_NAME)
    if (path.isEmpty || tblName.isEmpty) {
      throw new HoodieException(s"'${HoodieWriteConfig.TABLE_NAME}' and 'path' must both be set.")
    }
    sparkContext.getConf.getOption("spark.serializer") match {
      case Some(ser) if ser.equals("org.apache.spark.serializer.KryoSerializer") =>
      case _ => throw new HoodieException("hoodie only supports org.apache.spark.serializer.KryoSerializer as spark.serializer")
    }
    val storageType = parameters(STORAGE_TYPE_OPT_KEY)
    val operation =
      // It does not make sense to allow the upsert() operation if INSERT_DROP_DUPS_OPT_KEY is true.
      // Auto-correct the operation to "insert" if OPERATION_OPT_KEY is wrongly set to "upsert",
      // or is not set (in which case it defaults to "upsert" via parametersWithWriteDefaults()).
      if (parameters(INSERT_DROP_DUPS_OPT_KEY).toBoolean &&
        parameters(OPERATION_OPT_KEY) == UPSERT_OPERATION_OPT_VAL) {

        log.warn(s"$UPSERT_OPERATION_OPT_VAL is not applicable " +
          s"when $INSERT_DROP_DUPS_OPT_KEY is set to be true, " +
          s"overriding the $OPERATION_OPT_KEY to be $INSERT_OPERATION_OPT_VAL")

        INSERT_OPERATION_OPT_VAL
      } else {
        parameters(OPERATION_OPT_KEY)
      }

    // register classes & schemas
    val structName = s"${tblName.get}_record"
    val nameSpace = s"hoodie.${tblName.get}"
    sparkContext.getConf.registerKryoClasses(
      Array(classOf[org.apache.avro.generic.GenericData],
        classOf[org.apache.avro.Schema]))
    val schema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace)
    sparkContext.getConf.registerAvroSchemas(schema)
    log.info(s"Registered avro schema : ${schema.toString(true)}")

    // Convert to RDD[HoodieRecord]
    val keyGenerator = DataSourceUtils.createKeyGenerator(toProperties(parameters))
    val genericRecords: RDD[GenericRecord] = AvroConversionUtils.createRdd(df, structName, nameSpace)
    val hoodieAllIncomingRecords = genericRecords.map(gr => {
      val orderingVal = DataSourceUtils.getNestedFieldValAsString(
        gr, parameters(PRECOMBINE_FIELD_OPT_KEY)).asInstanceOf[Comparable[_]]
      DataSourceUtils.createHoodieRecord(gr,
        orderingVal, keyGenerator.getKey(gr), parameters(PAYLOAD_CLASS_OPT_KEY))
    }).toJavaRDD()

    val jsc = new JavaSparkContext(sparkContext)

    val basePath = new Path(parameters("path"))
    val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration)
    var exists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME))

    // Handle various save modes
    if (mode == SaveMode.ErrorIfExists && exists) {
      throw new HoodieException(s"hoodie dataset at $basePath already exists.")
    }
    if (mode == SaveMode.Ignore && exists) {
      log.warn(s"hoodie dataset at $basePath already exists. Ignoring & not performing actual writes.")
      return (true, common.util.Option.empty())
    }
    if (mode == SaveMode.Overwrite && exists) {
      log.warn(s"hoodie dataset at $basePath already exists. Deleting existing data & overwriting with new data.")
      fs.delete(basePath, true)
      exists = false
    }

    // Create the dataset if not present
    if (!exists) {
      HoodieTableMetaClient.initTableType(sparkContext.hadoopConfiguration, path.get, storageType,
        tblName.get, "archived")
    }

    // Create a HoodieWriteClient & issue the write.
    val client = DataSourceUtils.createHoodieClient(jsc, schema.toString, path.get, tblName.get,
      mapAsJavaMap(parameters)
    )

    val hoodieRecords =
      if (parameters(INSERT_DROP_DUPS_OPT_KEY).toBoolean) {
        DataSourceUtils.dropDuplicates(
          jsc,
          hoodieAllIncomingRecords,
          mapAsJavaMap(parameters), client.getTimelineServer)
      } else {
        hoodieAllIncomingRecords
      }

    if (hoodieRecords.isEmpty()) {
      log.info("new batch has no new records, skipping...")
      return (true, common.util.Option.empty())
    }

    val commitTime = client.startCommit()

    val writeStatuses = DataSourceUtils.doWriteOperation(client, hoodieRecords, commitTime, operation)
    // Check for errors and commit the write.
    val errorCount = writeStatuses.rdd.filter(ws => ws.hasErrors).count()
    val writeSuccessful =
      if (errorCount == 0) {
        log.info("No errors. Proceeding to commit the write.")
        val metaMap = parameters.filter(kv =>
          kv._1.startsWith(parameters(COMMIT_METADATA_KEYPREFIX_OPT_KEY)))
        val commitSuccess = if (metaMap.isEmpty) {
          client.commit(commitTime, writeStatuses)
        } else {
          client.commit(commitTime, writeStatuses,
            common.util.Option.of(new util.HashMap[String, String](mapAsJavaMap(metaMap))))
        }

        if (commitSuccess) {
          log.info("Commit " + commitTime + " successful!")
        } else {
          log.info("Commit " + commitTime + " failed!")
        }

        val hiveSyncEnabled = parameters.get(HIVE_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean)
        val syncHiveSuccess = if (hiveSyncEnabled) {
          log.info("Syncing to Hive Metastore (URL: " + parameters(HIVE_URL_OPT_KEY) + ")")
          val fs = FSUtils.getFs(basePath.toString, jsc.hadoopConfiguration)
          syncHive(basePath, fs, parameters)
        } else {
          true
        }
        client.close()
        commitSuccess && syncHiveSuccess
      } else {
        log.error(s"$operation failed with $errorCount errors:")
        if (log.isTraceEnabled) {
          log.trace("Printing out the top 100 errors")
          writeStatuses.rdd.filter(ws => ws.hasErrors)
            .take(100)
            .foreach(ws => {
              log.trace("Global error:", ws.getGlobalError)
              if (ws.getErrors.size() > 0) {
                ws.getErrors.foreach(kt =>
                  log.trace(s"Error for key: ${kt._1}", kt._2))
              }
            })
        }
        false
      }
    (writeSuccessful, common.util.Option.ofNullable(commitTime))
  }

  /**
   * Add default options for unspecified write options keys.
   *
   * @param parameters the user-supplied write options
   * @return the options map, with defaults filled in for any missing keys
   */
  def parametersWithWriteDefaults(parameters: Map[String, String]): Map[String, String] = {
    Map(OPERATION_OPT_KEY -> DEFAULT_OPERATION_OPT_VAL,
      STORAGE_TYPE_OPT_KEY -> DEFAULT_STORAGE_TYPE_OPT_VAL,
      PRECOMBINE_FIELD_OPT_KEY -> DEFAULT_PRECOMBINE_FIELD_OPT_VAL,
      PAYLOAD_CLASS_OPT_KEY -> DEFAULT_PAYLOAD_OPT_VAL,
      RECORDKEY_FIELD_OPT_KEY -> DEFAULT_RECORDKEY_FIELD_OPT_VAL,
      PARTITIONPATH_FIELD_OPT_KEY -> DEFAULT_PARTITIONPATH_FIELD_OPT_VAL,
      KEYGENERATOR_CLASS_OPT_KEY -> DEFAULT_KEYGENERATOR_CLASS_OPT_VAL,
      COMMIT_METADATA_KEYPREFIX_OPT_KEY -> DEFAULT_COMMIT_METADATA_KEYPREFIX_OPT_VAL,
      INSERT_DROP_DUPS_OPT_KEY -> DEFAULT_INSERT_DROP_DUPS_OPT_VAL,
      STREAMING_RETRY_CNT_OPT_KEY -> DEFAULT_STREAMING_RETRY_CNT_OPT_VAL,
      STREAMING_RETRY_INTERVAL_MS_OPT_KEY -> DEFAULT_STREAMING_RETRY_INTERVAL_MS_OPT_VAL,
      STREAMING_IGNORE_FAILED_BATCH_OPT_KEY -> DEFAULT_STREAMING_IGNORE_FAILED_BATCH_OPT_VAL,
      HIVE_SYNC_ENABLED_OPT_KEY -> DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL,
      HIVE_DATABASE_OPT_KEY -> DEFAULT_HIVE_DATABASE_OPT_VAL,
      HIVE_TABLE_OPT_KEY -> DEFAULT_HIVE_TABLE_OPT_VAL,
      HIVE_USER_OPT_KEY -> DEFAULT_HIVE_USER_OPT_VAL,
      HIVE_PASS_OPT_KEY -> DEFAULT_HIVE_PASS_OPT_VAL,
      HIVE_URL_OPT_KEY -> DEFAULT_HIVE_URL_OPT_VAL,
      HIVE_PARTITION_FIELDS_OPT_KEY -> DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL,
      HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY -> DEFAULT_HIVE_PARTITION_EXTRACTOR_CLASS_OPT_VAL,
      HIVE_ASSUME_DATE_PARTITION_OPT_KEY -> DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL
    ) ++: parameters
  }

  def toProperties(params: Map[String, String]): TypedProperties = {
    val props = new TypedProperties()
    params.foreach(kv => props.setProperty(kv._1, kv._2))
    props
  }

  private def syncHive(basePath: Path, fs: FileSystem, parameters: Map[String, String]): Boolean = {
    val hiveSyncConfig: HiveSyncConfig = buildSyncConfig(basePath, parameters)
    val hiveConf: HiveConf = new HiveConf()
    hiveConf.addResource(fs.getConf)
    new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable()
    true
  }

  private def buildSyncConfig(basePath: Path, parameters: Map[String, String]): HiveSyncConfig = {
    val hiveSyncConfig: HiveSyncConfig = new HiveSyncConfig()
    hiveSyncConfig.basePath = basePath.toString
    hiveSyncConfig.usePreApacheInputFormat =
      parameters.get(HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY).exists(r => r.toBoolean)
    hiveSyncConfig.assumeDatePartitioning =
      parameters.get(HIVE_ASSUME_DATE_PARTITION_OPT_KEY).exists(r => r.toBoolean)
    hiveSyncConfig.databaseName = parameters(HIVE_DATABASE_OPT_KEY)
    hiveSyncConfig.tableName = parameters(HIVE_TABLE_OPT_KEY)
    hiveSyncConfig.hiveUser = parameters(HIVE_USER_OPT_KEY)
    hiveSyncConfig.hivePass = parameters(HIVE_PASS_OPT_KEY)
    hiveSyncConfig.jdbcUrl = parameters(HIVE_URL_OPT_KEY)
    hiveSyncConfig.partitionFields =
      ListBuffer(parameters(HIVE_PARTITION_FIELDS_OPT_KEY).split(",").map(_.trim).filter(!_.isEmpty).toList: _*)
    hiveSyncConfig.partitionValueExtractorClass = parameters(HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY)
    hiveSyncConfig
  }
}
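A quick illustration of the defaulting behavior above: user-supplied options win over the defaults, because `++:` prepends the default map before `parameters` and later entries take precedence when the result map is built. A sketch (callable only from within the org.apache.hudi package, since the object is private[hudi]; the option values shown are hypothetical):

// Sketch: defaults fill gaps, user options take precedence.
val userOpts = Map("hoodie.datasource.write.recordkey.field" -> "key")
val merged = HoodieSparkSqlWriter.parametersWithWriteDefaults(userOpts)
assert(merged("hoodie.datasource.write.recordkey.field") == "key") // user value kept
assert(merged("hoodie.datasource.write.precombine.field") == "ts") // default filled in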
@@ -0,0 +1,122 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.exception.HoodieCorruptedDataException
import org.apache.log4j.LogManager
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

import scala.util.{Failure, Success, Try}

class HoodieStreamingSink(sqlContext: SQLContext,
                          options: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode)
  extends Sink
    with Serializable {
  @volatile private var latestBatchId = -1L

  private val log = LogManager.getLogger(classOf[HoodieStreamingSink])

  private val retryCnt = options(DataSourceWriteOptions.STREAMING_RETRY_CNT_OPT_KEY).toInt
  private val retryIntervalMs = options(DataSourceWriteOptions.STREAMING_RETRY_INTERVAL_MS_OPT_KEY).toLong
  private val ignoreFailedBatch = options(DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH_OPT_KEY).toBoolean

  private val mode =
    if (outputMode == OutputMode.Append()) {
      SaveMode.Append
    } else {
      SaveMode.Overwrite
    }

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    retry(retryCnt, retryIntervalMs)(
      Try(
        HoodieSparkSqlWriter.write(
          sqlContext,
          mode,
          options,
          data)
      ) match {
        case Success((true, commitOps)) =>
          log.info(s"Micro batch id=$batchId succeeded"
            + (commitOps.isPresent match {
            case true => s" for commit=${commitOps.get()}"
            case _ => s" with no new commits"
          }))
          Success((true, commitOps))
        case Failure(e) =>
          // clean up the persisted RDDs from the write process
          data.sparkSession.sparkContext.getPersistentRDDs
            .foreach {
              case (id, rdd) =>
                rdd.unpersist()
            }
          log.error(s"Micro batch id=$batchId threw the following exception: ", e)
          if (ignoreFailedBatch) {
            log.info(s"Ignoring the exception and continuing streaming as per " +
              s"${DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH_OPT_KEY} configuration")
            Success((true, None))
          } else {
            if (retryCnt > 1) log.info(s"Retrying the failed micro batch id=$batchId ...")
            Failure(e)
          }
        case Success((false, commitOps)) =>
          log.error(s"Micro batch id=$batchId ended up with errors"
            + (commitOps.isPresent match {
            case true => s" for commit=${commitOps.get()}"
            case _ => s""
          }))
          if (ignoreFailedBatch) {
            log.info(s"Ignoring the errors and continuing streaming as per " +
              s"${DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH_OPT_KEY} configuration")
            Success((true, None))
          } else {
            if (retryCnt > 1) log.info(s"Retrying the failed micro batch id=$batchId ...")
            Failure(new HoodieCorruptedDataException(s"Micro batch id=$batchId ended up with errors"))
          }
      }
    ) match {
      case Failure(e) =>
        if (!ignoreFailedBatch) {
          log.error(s"Micro batch id=$batchId threw the following exceptions, " +
            s"aborting streaming app to avoid data loss: ", e)
          // spark sometimes hangs upon exceptions and keeps hold of the executors.
          // this is to force an exit upon errors / exceptions and release all executors.
          // it will require redeployment / supervise mode to restart the streaming job.
          System.exit(1)
        }
      case Success(_) =>
        log.info(s"Micro batch id=$batchId succeeded")
    }
  }

  override def toString: String = s"HoodieStreamingSink[${options("path")}]"

  @annotation.tailrec
  private def retry[T](n: Int, waitInMillis: Long)(fn: => Try[T]): Try[T] = {
    fn match {
      case x: util.Success[T] => x
      case _ if n > 1 =>
        Thread.sleep(waitInMillis)
        retry(n - 1, waitInMillis * 2)(fn)
      case f => f
    }
  }
}
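Note the backoff shape of the retry helper: with the defaults (count 3, interval 2000 ms), a failing microbatch is attempted three times in total, waiting 2 s and then 4 s between attempts, before the failure is surfaced. A sketch of wiring this sink up from Structured Streaming follows; `inputDf`, the table name, and the paths are hypothetical placeholders, and the usual write options (record key, precombine field, etc.) still apply:

import org.apache.hudi.config.HoodieWriteConfig

// Sketch: streaming writes reach HoodieStreamingSink via DefaultSource.createSink.
val query = inputDf.writeStream
  .format("org.apache.hudi")
  .option(HoodieWriteConfig.TABLE_NAME, "my_table")
  .option("hoodie.datasource.write.streaming.retry.count", "3")
  .option("checkpointLocation", "/tmp/checkpoints/my_table")
  .outputMode("append")
  .start("/tmp/hudi/my_table")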
@@ -0,0 +1,120 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hadoop.fs.Path
import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieRecord, HoodieTableType}
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.util.ParquetUtils
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.table.HoodieTable
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext}

import scala.collection.JavaConversions._
import scala.collection.mutable

/**
 * Relation that implements the Hoodie incremental view.
 *
 * Implemented for Copy-on-Write storage.
 */
class IncrementalRelation(val sqlContext: SQLContext,
                          val basePath: String,
                          val optParams: Map[String, String],
                          val userSchema: StructType) extends BaseRelation with TableScan {

  private val log = LogManager.getLogger(classOf[IncrementalRelation])

  val fs = new Path(basePath).getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
  val metaClient = new HoodieTableMetaClient(sqlContext.sparkContext.hadoopConfiguration, basePath, true)
  // MOR datasets not supported yet
  if (metaClient.getTableType.equals(HoodieTableType.MERGE_ON_READ)) {
    throw new HoodieException("Incremental view not implemented yet, for merge-on-read datasets")
  }
  // TODO : Figure out a valid HoodieWriteConfig
  val hoodieTable = HoodieTable.getHoodieTable(metaClient, HoodieWriteConfig.newBuilder().withPath(basePath).build(),
    sqlContext.sparkContext)
  val commitTimeline = hoodieTable.getMetaClient.getCommitTimeline.filterCompletedInstants()
  if (commitTimeline.empty()) {
    throw new HoodieException("No instants to incrementally pull")
  }
  if (!optParams.contains(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY)) {
    throw new HoodieException(s"Specify the begin instant time to pull from, using " +
      s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY}")
  }

  val lastInstant = commitTimeline.lastInstant().get()

  val commitsToReturn = commitTimeline.findInstantsInRange(
    optParams(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY),
    optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, lastInstant.getTimestamp))
    .getInstants.iterator().toList

  // use the schema from a file produced in the latest instant
  val latestSchema = {
    // use the last instant if the instant range is empty
    val instant = commitsToReturn.lastOption.getOrElse(lastInstant)
    val latestMeta = HoodieCommitMetadata
      .fromBytes(commitTimeline.getInstantDetails(instant).get, classOf[HoodieCommitMetadata])
    val metaFilePath = latestMeta.getFileIdAndFullPaths(basePath).values().iterator().next()
    AvroConversionUtils.convertAvroSchemaToStructType(ParquetUtils.readAvroSchema(
      sqlContext.sparkContext.hadoopConfiguration, new Path(metaFilePath)))
  }

  val filters = {
    if (optParams.contains(DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS_OPT_KEY)) {
      val filterStr = optParams.get(DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS_OPT_KEY).getOrElse("")
      filterStr.split(",").filter(!_.isEmpty)
    } else {
      Array[String]()
    }
  }

  override def schema: StructType = latestSchema

  override def buildScan(): RDD[Row] = {
    val fileIdToFullPath = mutable.HashMap[String, String]()
    for (commit <- commitsToReturn) {
      val metadata: HoodieCommitMetadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit)
        .get, classOf[HoodieCommitMetadata])
      fileIdToFullPath ++= metadata.getFileIdAndFullPaths(basePath).toMap
    }
    // unset the path filter; otherwise, if end_instant_time is not the latest instant, the path filter
    // set for the RO view will incorrectly filter out all the files.
    sqlContext.sparkContext.hadoopConfiguration.unset("mapreduce.input.pathFilter.class")
    val sOpts = optParams.filter(p => !p._1.equalsIgnoreCase("path"))
    if (fileIdToFullPath.isEmpty) {
      sqlContext.sparkContext.emptyRDD[Row]
    } else {
      log.info("Additional filters to be applied to the incremental source are: " + filters)
      filters.foldLeft(sqlContext.read.options(sOpts)
        .schema(latestSchema)
        .parquet(fileIdToFullPath.values.toList: _*)
        .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp))
        .filter(String.format("%s <= '%s'",
          HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)))((e, f) => e.filter(f))
        .toDF().rdd
    }
  }
}
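A sketch of an incremental pull through this relation, assuming a SparkSession named `spark` (the instant times and path are hypothetical placeholders; the option constants are the ones referenced above from DataSourceReadOptions):

// Sketch: pull only the records committed between two instants.
val incDf = spark.read.format("org.apache.hudi")
  .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY, DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL)
  .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20190708000000")
  .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, "20190709000000")
  .load("/tmp/hudi/my_table")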
hudi-spark/src/main/scala/org/apache/hudi/package.scala
@@ -0,0 +1,38 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache

import org.apache.spark.sql.{DataFrame, DataFrameReader, DataFrameWriter}

package object hudi {

  /**
   * Adds a method, `avro`, to DataFrameWriter that writes through the Hudi datasource.
   */
  implicit class AvroDataFrameWriter[T](writer: DataFrameWriter[T]) {
    def avro: String => Unit = writer.format("org.apache.hudi").save
  }

  /**
   * Adds a method, `avro`, to DataFrameReader that reads through the Hudi datasource.
   */
  implicit class AvroDataFrameReader(reader: DataFrameReader) {
    def avro: String => DataFrame = reader.format("org.apache.hudi").load
  }

}
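With the package object in scope, the implicits give a terse call-through to the datasource. A sketch (`df`, `spark`, and the path are hypothetical placeholders):

import org.apache.hudi._

// Sketch: the implicits route straight to format("org.apache.hudi").
df.write.avro("/tmp/hudi/my_table")               // same as .format("org.apache.hudi").save(...)
val back = spark.read.avro("/tmp/hudi/my_table")  // same as .format("org.apache.hudi").load(...)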