1
0

[HUDI-426] Bootstrap datasource integration (#1702)

This commit is contained in:
Udit Mehrotra
2020-08-09 14:06:13 -07:00
committed by GitHub
parent c24c528fb7
commit e4a2d98f79
17 changed files with 1287 additions and 113 deletions

View File

@@ -69,6 +69,11 @@ public class HoodieAvroUtils {
// Per-thread slot holding a BinaryDecoder; the initial value for every thread is null.
private static ThreadLocal<BinaryDecoder> reuseDecoder = ThreadLocal.withInitial(() -> null);
// As per https://avro.apache.org/docs/current/spec.html#names
// Regex matching any single character other than an ASCII letter, digit or underscore.
private static final String INVALID_AVRO_CHARS_IN_NAMES = "[^A-Za-z0-9_]";
// Regex matching any single character other than an ASCII letter or underscore
// (digits are not accepted in the first position of a name).
private static final String INVALID_AVRO_FIRST_CHAR_IN_NAMES = "[^A-Za-z_]";
// Replacement text substituted for every character matched by the regexes above.
private static final String MASK_FOR_INVALID_CHARS_IN_NAMES = "__";
// All metadata fields are optional strings, i.e. a union of NULL and STRING.
public static final Schema METADATA_FIELD_SCHEMA =
    Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)));
@@ -444,4 +449,21 @@ public class HoodieAvroUtils {
}
return fieldSchema.getLogicalType() == LogicalTypes.date();
}
/**
 * Creates an Avro schema of type {@code NULL}.
 *
 * @return the schema produced by {@code Schema.create(Schema.Type.NULL)}
 */
public static Schema getNullSchema() {
  final Schema.Type nullType = Schema.Type.NULL;
  return Schema.create(nullType);
}
/**
 * Sanitizes a name according to the Avro rules for names
 * (https://avro.apache.org/docs/current/spec.html#names).
 *
 * <p>Invalid characters are not removed; each one is replaced with
 * {@code MASK_FOR_INVALID_CHARS_IN_NAMES}. The first character is checked against the stricter
 * first-character rule (letter or underscore only), every remaining character against the general
 * rule (letter, digit or underscore). With the mask {@code "__"}, {@code "1st-col"} becomes
 * {@code "__st__col"}.
 *
 * <p>An empty name is returned unchanged because there is nothing to replace; note that an empty
 * string is still not a valid Avro name, so callers must handle that case themselves.
 *
 * @param name input name, must not be {@code null}
 * @return sanitized name
 * @throws NullPointerException if {@code name} is {@code null}
 */
public static String sanitizeName(String name) {
  // Guard: substring(0, 1) on an empty string would throw StringIndexOutOfBoundsException.
  if (name.isEmpty()) {
    return name;
  }
  if (name.substring(0, 1).matches(INVALID_AVRO_FIRST_CHAR_IN_NAMES)) {
    // The first character is known to match, so replaceFirst rewrites exactly that character.
    name = name.replaceFirst(INVALID_AVRO_FIRST_CHAR_IN_NAMES, MASK_FOR_INVALID_CHARS_IN_NAMES);
  }
  return name.replaceAll(INVALID_AVRO_CHARS_IN_NAMES, MASK_FOR_INVALID_CHARS_IN_NAMES);
}
}

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.common.model;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
@@ -126,6 +127,18 @@ public class HoodieCommitMetadata implements Serializable {
return fullPaths;
}
/**
 * Maps every file group recorded in this commit metadata to the full path of its file.
 *
 * <p>For each write stat, the key is built from the stat's partition path and file id, and the
 * value is the stat's path resolved against {@code basePath}. If several stats yield an equal
 * key, the one visited last wins.
 *
 * @param basePath parent path against which each write stat's path is resolved
 * @return a new map from file group id to full file path
 */
public Map<HoodieFileGroupId, String> getFileGroupIdAndFullPaths(String basePath) {
  Map<HoodieFileGroupId, String> result = new HashMap<>();
  // The partition keys are not needed here; only the write stats themselves are.
  for (List<HoodieWriteStat> writeStats : getPartitionToWriteStats().values()) {
    for (HoodieWriteStat writeStat : writeStats) {
      HoodieFileGroupId groupId = new HoodieFileGroupId(writeStat.getPartitionPath(), writeStat.getFileId());
      result.put(groupId, new Path(basePath, writeStat.getPath()).toString());
    }
  }
  return result;
}
public String toJsonString() throws IOException {
if (partitionToWriteStats.containsKey(null)) {
LOG.info("partition path is null for " + partitionToWriteStats.get(null));