1
0

[HUDI-508] Standardizing on "Table" instead of "Dataset" across code (#1197)

- Docs were talking about storage types before, cWiki moved to "Table"
 - Most of code already has HoodieTable, HoodieTableMetaClient - correct naming
 - Replacing remaining uses of dataset across code/comments
 - Few usages in comments and use of Spark SQL DataSet remain unscathed
This commit is contained in:
vinoth chandar
2020-01-07 12:52:32 -08:00
committed by GitHub
parent 8306f749a2
commit 9706f659db
73 changed files with 298 additions and 298 deletions

View File

@@ -44,7 +44,7 @@ public class HiveSyncConfig implements Serializable {
@Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url", required = true)
public String jdbcUrl;
@Parameter(names = {"--base-path"}, description = "Basepath of hoodie dataset to sync", required = true)
@Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true)
public String basePath;
@Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")

View File

@@ -20,7 +20,7 @@ package org.apache.hudi.hive;
import org.apache.hudi.common.util.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.InvalidDatasetException;
import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.hadoop.HoodieParquetInputFormat;
import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent;
@@ -43,7 +43,7 @@ import java.util.Map;
import java.util.stream.Collectors;
/**
* Tool to sync a hoodie HDFS dataset with a hive metastore table. Either use it as a api
* Tool to sync a hoodie HDFS table with a hive metastore table. Either use it as a api
* HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive.jar HiveSyncTool [args]
* <p>
* This utility will get the schema from the latest commit and will sync hive table schema Also this will sync the
@@ -80,7 +80,7 @@ public class HiveSyncTool {
break;
default:
LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
throw new InvalidDatasetException(hoodieHiveClient.getBasePath());
throw new InvalidTableException(hoodieHiveClient.getBasePath());
}
} catch (RuntimeException re) {
LOG.error("Got runtime exception when hive syncing", re);
@@ -95,7 +95,7 @@ public class HiveSyncTool {
// Check if the necessary table exists
boolean tableExists = hoodieHiveClient.doesTableExist();
// Get the parquet schema for this dataset looking at the latest commit
// Get the parquet schema for this table looking at the latest commit
MessageType schema = hoodieHiveClient.getDataSchema();
// Sync schema if needed
syncSchema(tableExists, isRealTime, schema);
@@ -146,7 +146,7 @@ public class HiveSyncTool {
ParquetHiveSerDe.class.getName());
}
} else {
// Check if the dataset schema has evolved
// Check if the table schema has evolved
Map<String, String> tableSchema = hoodieHiveClient.getTableSchema();
SchemaDifference schemaDiff = SchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields);
if (!schemaDiff.isEmpty()) {

View File

@@ -30,7 +30,7 @@ import org.apache.hudi.common.util.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.InvalidDatasetException;
import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.hive.util.SchemaUtil;
import com.google.common.base.Preconditions;
@@ -337,10 +337,10 @@ public class HoodieHiveClient {
}
/**
* Gets the schema for a hoodie dataset. Depending on the type of table, read from any file written in the latest
* Gets the schema for a hoodie table. Depending on the type of table, read from any file written in the latest
* commit. We will assume that the schema has not changed within a single atomic write.
*
* @return Parquet schema for this dataset
* @return Parquet schema for this table
*/
@SuppressWarnings("WeakerAccess")
public MessageType getDataSchema() {
@@ -350,12 +350,12 @@ public class HoodieHiveClient {
// If this is COW, get the last commit and read the schema from a file written in the
// last commit
HoodieInstant lastCommit =
activeTimeline.lastInstant().orElseThrow(() -> new InvalidDatasetException(syncConfig.basePath));
activeTimeline.lastInstant().orElseThrow(() -> new InvalidTableException(syncConfig.basePath));
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(activeTimeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class);
String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
.orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for commit "
+ lastCommit + ", could not get schema for dataset " + metaClient.getBasePath() + ", Metadata :"
+ lastCommit + ", could not get schema for table " + metaClient.getBasePath() + ", Metadata :"
+ commitMetadata));
return readSchemaFromDataFile(new Path(filePath));
case MERGE_ON_READ:
@@ -390,7 +390,7 @@ public class HoodieHiveClient {
.filter(s -> s.contains((metaClient.getTableConfig().getROFileFormat().getFileExtension())))
.findAny().map(f -> Pair.of(f, HoodieFileFormat.PARQUET)).orElseThrow(() -> {
return new IllegalArgumentException("Could not find any data file written for commit "
+ lastDeltaInstant + ", could not get schema for dataset " + metaClient.getBasePath()
+ lastDeltaInstant + ", could not get schema for table " + metaClient.getBasePath()
+ ", CommitMetadata :" + commitMetadata);
});
});
@@ -408,10 +408,10 @@ public class HoodieHiveClient {
}
default:
LOG.error("Unknown table type " + tableType);
throw new InvalidDatasetException(syncConfig.basePath);
throw new InvalidTableException(syncConfig.basePath);
}
} catch (IOException e) {
throw new HoodieHiveSyncException("Failed to get dataset schema for " + syncConfig.tableName, e);
throw new HoodieHiveSyncException("Failed to get table schema for " + syncConfig.tableName, e);
}
}
@@ -428,7 +428,7 @@ public class HoodieHiveClient {
.fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class);
String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
.orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction "
+ lastCompactionCommit + ", could not get schema for dataset " + metaClient.getBasePath()));
+ lastCompactionCommit + ", could not get schema for table " + metaClient.getBasePath()));
return readSchemaFromDataFile(new Path(filePath));
}

View File

@@ -95,7 +95,7 @@ public class SchemaUtil {
expectedType = expectedType.replaceAll("`", "");
if (!tableColumnType.equalsIgnoreCase(expectedType)) {
// check for incremental datasets, the schema type change is allowed as per evolution
// check for incremental queries, the schema type change is allowed as per evolution
// rules
if (!isSchemaTypeUpdateAllowed(tableColumnType, expectedType)) {
throw new HoodieHiveSyncException("Could not convert field Type from " + tableColumnType + " to "

View File

@@ -154,7 +154,7 @@ public class TestHiveSyncTool {
public void testBasicSync() throws Exception {
TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
String commitTime = "100";
TestUtil.createCOWDataset(commitTime, 5);
TestUtil.createCOWTable(commitTime, 5);
HoodieHiveClient hiveClient =
new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
@@ -164,7 +164,7 @@ public class TestHiveSyncTool {
tool.syncHoodieTable();
assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
hiveClient.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(),
assertEquals("Hive Schema should match the table schema + partition field", hiveClient.getTableSchema().size(),
hiveClient.getDataSchema().getColumns().size() + 1);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClient.scanTablePartitions().size());
@@ -176,7 +176,7 @@ public class TestHiveSyncTool {
public void testSyncIncremental() throws Exception {
TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
String commitTime1 = "100";
TestUtil.createCOWDataset(commitTime1, 5);
TestUtil.createCOWTable(commitTime1, 5);
HoodieHiveClient hiveClient =
new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
// Lets do the sync
@@ -214,7 +214,7 @@ public class TestHiveSyncTool {
public void testSyncIncrementalWithSchemaEvolution() throws Exception {
TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
String commitTime1 = "100";
TestUtil.createCOWDataset(commitTime1, 5);
TestUtil.createCOWTable(commitTime1, 5);
HoodieHiveClient hiveClient =
new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
// Lets do the sync
@@ -250,7 +250,7 @@ public class TestHiveSyncTool {
TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
String commitTime = "100";
String deltaCommitTime = "101";
TestUtil.createMORDataset(commitTime, deltaCommitTime, 5);
TestUtil.createMORTable(commitTime, deltaCommitTime, 5);
HoodieHiveClient hiveClient =
new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
@@ -261,7 +261,7 @@ public class TestHiveSyncTool {
assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
hiveClient.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(),
assertEquals("Hive Schema should match the table schema + partition field", hiveClient.getTableSchema().size(),
SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClient.scanTablePartitions().size());
@@ -280,11 +280,11 @@ public class TestHiveSyncTool {
tool.syncHoodieTable();
hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
assertEquals("Hive Schema should match the evolved dataset schema + partition field",
assertEquals("Hive Schema should match the evolved table schema + partition field",
hiveClient.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
// Sync should add the one partition
assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be 103", deltaCommitTime2,
assertEquals("The last commit that was synced should be 103", deltaCommitTime2,
hiveClient.getLastCommitTimeSynced().get());
}
@@ -295,7 +295,7 @@ public class TestHiveSyncTool {
String deltaCommitTime = "101";
String roTablename = TestUtil.hiveSyncConfig.tableName;
TestUtil.hiveSyncConfig.tableName = TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE;
TestUtil.createMORDataset(commitTime, deltaCommitTime, 5);
TestUtil.createMORTable(commitTime, deltaCommitTime, 5);
HoodieHiveClient hiveClientRT =
new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
@@ -309,11 +309,11 @@ public class TestHiveSyncTool {
assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE
+ " should exist after sync completes", hiveClientRT.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition field", hiveClientRT.getTableSchema().size(),
assertEquals("Hive Schema should match the table schema + partition field", hiveClientRT.getTableSchema().size(),
SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClientRT.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", deltaCommitTime,
assertEquals("The last commit that was synced should be updated in the TBLPROPERTIES", deltaCommitTime,
hiveClientRT.getLastCommitTimeSynced().get());
// Now lets create more parititions and these are the only ones which needs to be synced
@@ -328,7 +328,7 @@ public class TestHiveSyncTool {
tool.syncHoodieTable();
hiveClientRT = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
assertEquals("Hive Schema should match the evolved dataset schema + partition field",
assertEquals("Hive Schema should match the evolved table schema + partition field",
hiveClientRT.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
// Sync should add the one partition
assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClientRT.scanTablePartitions().size());
@@ -341,7 +341,7 @@ public class TestHiveSyncTool {
public void testMultiPartitionKeySync() throws Exception {
TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
String commitTime = "100";
TestUtil.createCOWDataset(commitTime, 5);
TestUtil.createCOWTable(commitTime, 5);
HiveSyncConfig hiveSyncConfig = HiveSyncConfig.copy(TestUtil.hiveSyncConfig);
hiveSyncConfig.partitionValueExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName();
@@ -355,7 +355,7 @@ public class TestHiveSyncTool {
HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
tool.syncHoodieTable();
assertTrue("Table " + hiveSyncConfig.tableName + " should exist after sync completes", hiveClient.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition fields", hiveClient.getTableSchema().size(),
assertEquals("Hive Schema should match the table schema + partition fields", hiveClient.getTableSchema().size(),
hiveClient.getDataSchema().getColumns().size() + 3);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClient.scanTablePartitions().size());

View File

@@ -152,7 +152,7 @@ public class TestUtil {
}
}
static void createCOWDataset(String commitTime, int numberOfPartitions)
static void createCOWTable(String commitTime, int numberOfPartitions)
throws IOException, InitializationError, URISyntaxException {
Path path = new Path(hiveSyncConfig.basePath);
FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
@@ -166,7 +166,7 @@ public class TestUtil {
createCommitFile(commitMetadata, commitTime);
}
static void createMORDataset(String commitTime, String deltaCommitTime, int numberOfPartitions)
static void createMORTable(String commitTime, String deltaCommitTime, int numberOfPartitions)
throws IOException, InitializationError, URISyntaxException, InterruptedException {
Path path = new Path(hiveSyncConfig.basePath);
FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));