[HUDI-508] Standardizing on "Table" instead of "Dataset" across code (#1197)
- Docs were talking about storage types before; cWiki moved to "Table"
- Most of the code already has HoodieTable, HoodieTableMetaClient - correct naming
- Replacing remaining use of dataset across code/comments
- Few usages in comments and use of Spark SQL Dataset remain unscathed
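The user-visible edge of the rename is the exception type: InvalidDatasetException becomes InvalidTableException. A minimal sketch of the adjustment downstream callers need, using only names that appear in this diff (the wrapper class itself is illustrative, and package names are assumed from the imports shown below):

import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.hive.HoodieHiveClient;
import org.apache.parquet.schema.MessageType;

public class SchemaProbe {
  // Before this commit the catch below would have named
  // InvalidDatasetException; the failure mode (basePath holds no valid
  // hoodie table, or no commit to read a schema from) is unchanged.
  static MessageType schemaOrNull(HoodieHiveClient client) {
    try {
      return client.getDataSchema();
    } catch (InvalidTableException e) {
      return null;
    }
  }
}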
@@ -44,7 +44,7 @@ public class HiveSyncConfig implements Serializable {
   @Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url", required = true)
   public String jdbcUrl;

-  @Parameter(names = {"--base-path"}, description = "Basepath of hoodie dataset to sync", required = true)
+  @Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true)
   public String basePath;

   @Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
@@ -20,7 +20,7 @@ package org.apache.hudi.hive;

 import org.apache.hudi.common.util.FSUtils;
 import org.apache.hudi.common.util.Option;
-import org.apache.hudi.exception.InvalidDatasetException;
+import org.apache.hudi.exception.InvalidTableException;
 import org.apache.hudi.hadoop.HoodieParquetInputFormat;
 import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
 import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent;
@@ -43,7 +43,7 @@ import java.util.Map;
 import java.util.stream.Collectors;

 /**
- * Tool to sync a hoodie HDFS dataset with a hive metastore table. Either use it as a api
+ * Tool to sync a hoodie HDFS table with a hive metastore table. Either use it as a api
  * HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive.jar HiveSyncTool [args]
  * <p>
  * This utility will get the schema from the latest commit and will sync hive table schema Also this will sync the
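Per the javadoc above, the tool runs either programmatically or via java -cp hoodie-hive.jar HiveSyncTool [args]. A hedged sketch of the programmatic path, built from the constructor and config fields visible elsewhere in this diff (the Hadoop/Hive setup and the path are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncTool;

public class SyncExample {
  public static void main(String[] args) throws Exception {
    HiveSyncConfig cfg = new HiveSyncConfig();
    cfg.jdbcUrl = "jdbc:hive2://localhost:10000"; // --jdbc-url
    cfg.basePath = "/tmp/hoodie/sample_table";    // --base-path (hypothetical)
    // other required fields (database, table, credentials) omitted here

    FileSystem fs = FileSystem.get(new Configuration());
    // same (config, HiveConf, FileSystem) constructor the tests below use
    HiveSyncTool tool = new HiveSyncTool(cfg, new HiveConf(), fs);
    tool.syncHoodieTable(); // one-shot sync
  }
}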
@@ -80,7 +80,7 @@ public class HiveSyncTool {
           break;
         default:
           LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
-          throw new InvalidDatasetException(hoodieHiveClient.getBasePath());
+          throw new InvalidTableException(hoodieHiveClient.getBasePath());
       }
     } catch (RuntimeException re) {
       LOG.error("Got runtime exception when hive syncing", re);
@@ -95,7 +95,7 @@ public class HiveSyncTool {

     // Check if the necessary table exists
     boolean tableExists = hoodieHiveClient.doesTableExist();
-    // Get the parquet schema for this dataset looking at the latest commit
+    // Get the parquet schema for this table looking at the latest commit
     MessageType schema = hoodieHiveClient.getDataSchema();
     // Sync schema if needed
     syncSchema(tableExists, isRealTime, schema);
@@ -146,7 +146,7 @@ public class HiveSyncTool {
             ParquetHiveSerDe.class.getName());
       }
     } else {
-      // Check if the dataset schema has evolved
+      // Check if the table schema has evolved
       Map<String, String> tableSchema = hoodieHiveClient.getTableSchema();
       SchemaDifference schemaDiff = SchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields);
       if (!schemaDiff.isEmpty()) {
@@ -30,7 +30,7 @@ import org.apache.hudi.common.util.FSUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.exception.HoodieIOException;
-import org.apache.hudi.exception.InvalidDatasetException;
+import org.apache.hudi.exception.InvalidTableException;
 import org.apache.hudi.hive.util.SchemaUtil;

 import com.google.common.base.Preconditions;
@@ -337,10 +337,10 @@ public class HoodieHiveClient {
   }

   /**
-   * Gets the schema for a hoodie dataset. Depending on the type of table, read from any file written in the latest
+   * Gets the schema for a hoodie table. Depending on the type of table, read from any file written in the latest
    * commit. We will assume that the schema has not changed within a single atomic write.
    *
-   * @return Parquet schema for this dataset
+   * @return Parquet schema for this table
    */
   @SuppressWarnings("WeakerAccess")
   public MessageType getDataSchema() {
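getDataSchema() is what the sync flow compares against Hive's current view of the table. A small sketch of that comparison, built only from calls appearing in this diff (the package for SchemaDifference is assumed):

import java.util.Map;

import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveClient;
import org.apache.hudi.hive.SchemaDifference; // package assumed
import org.apache.hudi.hive.util.SchemaUtil;
import org.apache.parquet.schema.MessageType;

public class SchemaDriftProbe {
  // Sketch of the comparison the HiveSyncTool hunk above performs: the
  // storage schema read from the latest commit vs. the schema Hive
  // currently has for the table, diffed field by field.
  static void printSchemaDrift(HoodieHiveClient client, HiveSyncConfig cfg) {
    MessageType storageSchema = client.getDataSchema();       // latest commit
    Map<String, String> hiveSchema = client.getTableSchema(); // metastore view
    SchemaDifference diff = SchemaUtil.getSchemaDifference(storageSchema, hiveSchema, cfg.partitionFields);
    if (!diff.isEmpty()) {
      System.out.println("Schema drift detected: " + diff);
    }
  }
}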
@@ -350,12 +350,12 @@ public class HoodieHiveClient {
         // If this is COW, get the last commit and read the schema from a file written in the
         // last commit
         HoodieInstant lastCommit =
-            activeTimeline.lastInstant().orElseThrow(() -> new InvalidDatasetException(syncConfig.basePath));
+            activeTimeline.lastInstant().orElseThrow(() -> new InvalidTableException(syncConfig.basePath));
         HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
             .fromBytes(activeTimeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class);
         String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
             .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for commit "
-                + lastCommit + ", could not get schema for dataset " + metaClient.getBasePath() + ", Metadata :"
+                + lastCommit + ", could not get schema for table " + metaClient.getBasePath() + ", Metadata :"
                 + commitMetadata));
         return readSchemaFromDataFile(new Path(filePath));
       case MERGE_ON_READ:
@@ -390,7 +390,7 @@ public class HoodieHiveClient {
               .filter(s -> s.contains((metaClient.getTableConfig().getROFileFormat().getFileExtension())))
               .findAny().map(f -> Pair.of(f, HoodieFileFormat.PARQUET)).orElseThrow(() -> {
                 return new IllegalArgumentException("Could not find any data file written for commit "
-                    + lastDeltaInstant + ", could not get schema for dataset " + metaClient.getBasePath()
+                    + lastDeltaInstant + ", could not get schema for table " + metaClient.getBasePath()
                     + ", CommitMetadata :" + commitMetadata);
               });
         });
@@ -408,10 +408,10 @@ public class HoodieHiveClient {
         }
       default:
         LOG.error("Unknown table type " + tableType);
-        throw new InvalidDatasetException(syncConfig.basePath);
+        throw new InvalidTableException(syncConfig.basePath);
       }
     } catch (IOException e) {
-      throw new HoodieHiveSyncException("Failed to get dataset schema for " + syncConfig.tableName, e);
+      throw new HoodieHiveSyncException("Failed to get table schema for " + syncConfig.tableName, e);
     }
   }

@@ -428,7 +428,7 @@ public class HoodieHiveClient {
         .fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class);
     String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
         .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction "
-            + lastCompactionCommit + ", could not get schema for dataset " + metaClient.getBasePath()));
+            + lastCompactionCommit + ", could not get schema for table " + metaClient.getBasePath()));
     return readSchemaFromDataFile(new Path(filePath));
   }

@@ -95,7 +95,7 @@ public class SchemaUtil {
       expectedType = expectedType.replaceAll("`", "");

       if (!tableColumnType.equalsIgnoreCase(expectedType)) {
-        // check for incremental datasets, the schema type change is allowed as per evolution
+        // check for incremental queries, the schema type change is allowed as per evolution
         // rules
         if (!isSchemaTypeUpdateAllowed(tableColumnType, expectedType)) {
           throw new HoodieHiveSyncException("Could not convert field Type from " + tableColumnType + " to "
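For context on the evolution rules the comment refers to: the check permits a column's Hive type to widen rather than change arbitrarily. A hedged illustration, assuming isSchemaTypeUpdateAllowed is publicly accessible and that the classic Hive-safe promotions (int to bigint, float to double) are the allowed ones; the authoritative list is whatever SchemaUtil implements:

import org.apache.hudi.hive.util.SchemaUtil;

public class EvolutionRuleDemo {
  public static void main(String[] args) {
    // Expected outcomes under the assumed widening-only rule set:
    System.out.println(SchemaUtil.isSchemaTypeUpdateAllowed("int", "bigint"));   // widening: expected true
    System.out.println(SchemaUtil.isSchemaTypeUpdateAllowed("float", "double")); // widening: expected true
    System.out.println(SchemaUtil.isSchemaTypeUpdateAllowed("bigint", "int"));   // narrowing: expected false
    // A rejected change is what surfaces as the HoodieHiveSyncException thrown above.
  }
}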
@@ -154,7 +154,7 @@ public class TestHiveSyncTool {
   public void testBasicSync() throws Exception {
     TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
     String commitTime = "100";
-    TestUtil.createCOWDataset(commitTime, 5);
+    TestUtil.createCOWTable(commitTime, 5);
     HoodieHiveClient hiveClient =
         new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
     assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
@@ -164,7 +164,7 @@ public class TestHiveSyncTool {
     tool.syncHoodieTable();
     assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
         hiveClient.doesTableExist());
-    assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(),
+    assertEquals("Hive Schema should match the table schema + partition field", hiveClient.getTableSchema().size(),
         hiveClient.getDataSchema().getColumns().size() + 1);
     assertEquals("Table partitions should match the number of partitions we wrote", 5,
         hiveClient.scanTablePartitions().size());
@@ -176,7 +176,7 @@ public class TestHiveSyncTool {
   public void testSyncIncremental() throws Exception {
     TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
     String commitTime1 = "100";
-    TestUtil.createCOWDataset(commitTime1, 5);
+    TestUtil.createCOWTable(commitTime1, 5);
     HoodieHiveClient hiveClient =
         new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
     // Lets do the sync
@@ -214,7 +214,7 @@ public class TestHiveSyncTool {
   public void testSyncIncrementalWithSchemaEvolution() throws Exception {
     TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
     String commitTime1 = "100";
-    TestUtil.createCOWDataset(commitTime1, 5);
+    TestUtil.createCOWTable(commitTime1, 5);
     HoodieHiveClient hiveClient =
         new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
     // Lets do the sync
@@ -250,7 +250,7 @@ public class TestHiveSyncTool {
     TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
     String commitTime = "100";
     String deltaCommitTime = "101";
-    TestUtil.createMORDataset(commitTime, deltaCommitTime, 5);
+    TestUtil.createMORTable(commitTime, deltaCommitTime, 5);
     HoodieHiveClient hiveClient =
         new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
     assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
@@ -261,7 +261,7 @@ public class TestHiveSyncTool {

     assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
         hiveClient.doesTableExist());
-    assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(),
+    assertEquals("Hive Schema should match the table schema + partition field", hiveClient.getTableSchema().size(),
         SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
     assertEquals("Table partitions should match the number of partitions we wrote", 5,
         hiveClient.scanTablePartitions().size());
@@ -280,11 +280,11 @@ public class TestHiveSyncTool {
     tool.syncHoodieTable();
     hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);

-    assertEquals("Hive Schema should match the evolved dataset schema + partition field",
+    assertEquals("Hive Schema should match the evolved table schema + partition field",
         hiveClient.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
     // Sync should add the one partition
     assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClient.scanTablePartitions().size());
-    assertEquals("The last commit that was sycned should be 103", deltaCommitTime2,
+    assertEquals("The last commit that was synced should be 103", deltaCommitTime2,
         hiveClient.getLastCommitTimeSynced().get());
   }

@@ -295,7 +295,7 @@ public class TestHiveSyncTool {
     String deltaCommitTime = "101";
     String roTablename = TestUtil.hiveSyncConfig.tableName;
     TestUtil.hiveSyncConfig.tableName = TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE;
-    TestUtil.createMORDataset(commitTime, deltaCommitTime, 5);
+    TestUtil.createMORTable(commitTime, deltaCommitTime, 5);
     HoodieHiveClient hiveClientRT =
         new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);

@@ -309,11 +309,11 @@ public class TestHiveSyncTool {
     assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE
         + " should exist after sync completes", hiveClientRT.doesTableExist());

-    assertEquals("Hive Schema should match the dataset schema + partition field", hiveClientRT.getTableSchema().size(),
+    assertEquals("Hive Schema should match the table schema + partition field", hiveClientRT.getTableSchema().size(),
         SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
     assertEquals("Table partitions should match the number of partitions we wrote", 5,
         hiveClientRT.scanTablePartitions().size());
-    assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", deltaCommitTime,
+    assertEquals("The last commit that was synced should be updated in the TBLPROPERTIES", deltaCommitTime,
         hiveClientRT.getLastCommitTimeSynced().get());

     // Now lets create more parititions and these are the only ones which needs to be synced
@@ -328,7 +328,7 @@ public class TestHiveSyncTool {
     tool.syncHoodieTable();
     hiveClientRT = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);

-    assertEquals("Hive Schema should match the evolved dataset schema + partition field",
+    assertEquals("Hive Schema should match the evolved table schema + partition field",
         hiveClientRT.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
     // Sync should add the one partition
     assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClientRT.scanTablePartitions().size());
@@ -341,7 +341,7 @@ public class TestHiveSyncTool {
   public void testMultiPartitionKeySync() throws Exception {
     TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
     String commitTime = "100";
-    TestUtil.createCOWDataset(commitTime, 5);
+    TestUtil.createCOWTable(commitTime, 5);

     HiveSyncConfig hiveSyncConfig = HiveSyncConfig.copy(TestUtil.hiveSyncConfig);
     hiveSyncConfig.partitionValueExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName();
@@ -355,7 +355,7 @@ public class TestHiveSyncTool {
     HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
     tool.syncHoodieTable();
     assertTrue("Table " + hiveSyncConfig.tableName + " should exist after sync completes", hiveClient.doesTableExist());
-    assertEquals("Hive Schema should match the dataset schema + partition fields", hiveClient.getTableSchema().size(),
+    assertEquals("Hive Schema should match the table schema + partition fields", hiveClient.getTableSchema().size(),
         hiveClient.getDataSchema().getColumns().size() + 3);
     assertEquals("Table partitions should match the number of partitions we wrote", 5,
         hiveClient.scanTablePartitions().size());
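The multi-partition-key test above swaps in MultiPartKeysValueExtractor; a hedged sketch of the same configuration outside the test harness (field and class names come from this diff, the three partition columns are hypothetical, and partitionFields is assumed to be a list of strings):

import java.util.Arrays;

import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.MultiPartKeysValueExtractor;

public class MultiKeySyncConfig {
  public static void main(String[] args) {
    HiveSyncConfig cfg = new HiveSyncConfig();
    // Extractor that derives one value per partition column from the partition path.
    cfg.partitionValueExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName();
    // Hypothetical columns; with three of them the synced Hive schema is
    // data columns + 3, matching the assertion in the test above.
    cfg.partitionFields = Arrays.asList("year", "month", "day");
  }
}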
@@ -152,7 +152,7 @@ public class TestUtil {
     }
   }

-  static void createCOWDataset(String commitTime, int numberOfPartitions)
+  static void createCOWTable(String commitTime, int numberOfPartitions)
       throws IOException, InitializationError, URISyntaxException {
     Path path = new Path(hiveSyncConfig.basePath);
     FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
@@ -166,7 +166,7 @@ public class TestUtil {
     createCommitFile(commitMetadata, commitTime);
   }

-  static void createMORDataset(String commitTime, String deltaCommitTime, int numberOfPartitions)
+  static void createMORTable(String commitTime, String deltaCommitTime, int numberOfPartitions)
       throws IOException, InitializationError, URISyntaxException, InterruptedException {
     Path path = new Path(hiveSyncConfig.basePath);
     FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));