Refactor hoodie-hive

Prasanna Rajaperumal
2017-05-19 23:47:27 -07:00
committed by prazanna
parent c192dd60b4
commit db6150c5ef
40 changed files with 1614 additions and 2296 deletions

com/uber/hoodie/hive/DatasetSchemaTest.java

@@ -1,186 +0,0 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.hive;
import com.uber.hoodie.hive.client.SchemaUtil;
import com.uber.hoodie.hive.model.HoodieDatasetReference;
import com.uber.hoodie.hive.model.SchemaDifference;
import com.uber.hoodie.hive.util.TestUtil;
import org.joda.time.DateTime;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.runners.model.InitializationError;
import parquet.schema.MessageType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType;
import java.io.IOException;
import static org.junit.Assert.assertEquals;
public class DatasetSchemaTest {
@Before
public void setUp() throws IOException, InterruptedException {
TestUtil.setUp();
}
@Test
public void testSchemaDiff() throws IOException, InitializationError {
HoodieDatasetReference metadata = TestUtil
.createDataset("test1", "/tmp/hdfs/DatasetSchemaTest/testSchema/", 5, "/nation.schema");
HoodieHiveSchemaSyncTask schema =
HoodieHiveSchemaSyncTask.newBuilder().withReference(metadata)
.withConfiguration(TestUtil.hDroneConfiguration).build();
SchemaDifference diff = schema.getSchemaDifference();
assertEquals("There should be 4 columns to be added", 4, diff.getAddColumnTypes().size());
assertEquals("No update columns expected", 0, diff.getUpdateColumnTypes().size());
assertEquals("No delete columns expected", 0, diff.getDeleteColumns().size());
schema.sync();
schema = HoodieHiveSchemaSyncTask.newBuilder().withReference(metadata)
.withConfiguration(TestUtil.hDroneConfiguration).build();
diff = schema.getSchemaDifference();
assertEquals("After sync, there should not be any new columns to add", 0,
diff.getAddColumnTypes().size());
assertEquals("After sync, there should not be any new columns to update", 0,
diff.getUpdateColumnTypes().size());
assertEquals("After sync, there should not be any new columns to delete", 0,
diff.getDeleteColumns().size());
}
@Test
public void testSchemaEvolution() throws IOException, InitializationError {
int initialPartitionsCount = 5;
HoodieDatasetReference metadata = TestUtil
.createDataset("test1", "/tmp/hdfs/DatasetSchemaTest/testSchema/",
initialPartitionsCount, "/nation.schema");
HoodieHiveSchemaSyncTask schema =
HoodieHiveSchemaSyncTask.newBuilder().withReference(metadata)
.withConfiguration(TestUtil.hDroneConfiguration).build();
schema.sync();
schema = HoodieHiveSchemaSyncTask.newBuilder().withReference(metadata)
.withConfiguration(TestUtil.hDroneConfiguration).build();
SchemaDifference diff = schema.getSchemaDifference();
assertEquals("After sync, diff should be empty", true, diff.isEmpty());
int newSchemaVersion = 2;
int newPartitionsCount = 2;
TestUtil.evolveDataset(metadata, newPartitionsCount, "/nation_evolved.schema",
DateTime.now().getMillis(), newSchemaVersion);
schema = HoodieHiveSchemaSyncTask.newBuilder().withReference(metadata)
.withConfiguration(TestUtil.hDroneConfiguration).build();
diff = schema.getSchemaDifference();
assertEquals("Schema has evolved, there should be a diff", false, diff.isEmpty());
assertEquals("Schema has evolved, there should be 1 column to add", 1,
diff.getAddColumnTypes().size());
assertEquals("Schema has evolved, there should be 1 column to update", 1,
diff.getUpdateColumnTypes().size());
assertEquals(0, diff.getDeleteColumns().size());
}
/**
* Testing converting array types to Hive field declaration strings,
* according to the Parquet-113 spec:
* https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
*/
@Test
public void testSchemaConvertArray() throws IOException {
// Testing the 3-level annotation structure
MessageType schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().optional(PrimitiveType.PrimitiveTypeName.INT32).named("element")
.named("list").named("int_list").named("ArrayOfInts");
String schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`int_list` ARRAY< int>", schemaString);
// An array of arrays
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().requiredGroup().as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list")
.named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`int_list_list` ARRAY< ARRAY< int>>", schemaString);
// A list of integers
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeated(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("int_list")
.named("ArrayOfInts");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`int_list` ARRAY< int>", schemaString);
// A list of structs with two fields
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.required(PrimitiveType.PrimitiveTypeName.INT32).named("num").named("element")
.named("tuple_list").named("ArrayOfTuples");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`tuple_list` ARRAY< STRUCT< `str` : binary, `num` : int>>", schemaString);
// A list of structs with a single field
// For this case, since the inner group name is "array", we treat the
// element type as a one-element struct.
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.named("array").named("one_tuple_list").named("ArrayOfOneTuples");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
// A list of structs with a single field
// For this case, since the inner group name ends with "_tuple", we also treat the
// element type as a one-element struct.
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.named("one_tuple_list_tuple").named("one_tuple_list").named("ArrayOfOneTuples2");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
// A list of structs with a single field
// Unlike the above two cases, for this the element type is the type of the
// only field in the struct.
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.named("one_tuple_list").named("one_tuple_list").named("ArrayOfOneTuples3");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`one_tuple_list` ARRAY< binary>", schemaString);
// A list of maps
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().as(OriginalType.MAP).repeatedGroup().as(OriginalType.MAP_KEY_VALUE)
.required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
.named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32)
.named("int_value").named("key_value").named("array").named("map_list")
.named("ArrayOfMaps");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`map_list` ARRAY< MAP< string, int>>", schemaString);
}
}

com/uber/hoodie/hive/HDroneDatasetTest.java

@@ -1,99 +0,0 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.hive;
import com.uber.hoodie.hive.client.HoodieHiveClient;
import com.uber.hoodie.hive.model.HoodieDatasetReference;
import com.uber.hoodie.hive.util.TestUtil;
import org.joda.time.DateTime;
import org.junit.Before;
import org.junit.Test;
import org.junit.runners.model.InitializationError;
import parquet.schema.MessageType;
import java.io.IOException;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
public class HDroneDatasetTest {
private HoodieHiveClient hiveClient;
@Before
public void setUp() throws IOException, InterruptedException {
TestUtil.setUp();
hiveClient = new HoodieHiveClient(TestUtil.hDroneConfiguration);
}
@Test
public void testDatasetCreation() throws IOException, InitializationError {
HoodieDatasetReference metadata = TestUtil
.createDataset("test1", "/tmp/hdfs/DatasetSchemaTest/testSchema/", 5, "/nation.schema");
HoodieHiveDatasetSyncTask dataset =
HoodieHiveDatasetSyncTask.newBuilder().withReference(metadata)
.withConfiguration(TestUtil.hDroneConfiguration).build();
assertEquals("There should be 5 new partitions", 5, dataset.getNewPartitions().size());
assertEquals("There should not be any changed partitions", 0,
dataset.getChangedPartitions().size());
assertFalse("Table should not exist", hiveClient.checkTableExists(metadata));
dataset.sync();
dataset = HoodieHiveDatasetSyncTask.newBuilder().withReference(metadata)
.withConfiguration(TestUtil.hDroneConfiguration).build();
assertTrue("Table should exist after flush", hiveClient.checkTableExists(metadata));
assertEquals("After flush, There should not be any new partitions to flush", 0,
dataset.getNewPartitions().size());
assertEquals("After flush, There should not be any modified partitions to flush", 0,
dataset.getChangedPartitions().size());
assertEquals("Table Schema should have 5 fields", 5,
hiveClient.getTableSchema(metadata).size());
}
@Test
public void testDatasetEvolution() throws IOException, InitializationError {
int initialPartitionsCount = 5;
HoodieDatasetReference metadata = TestUtil
.createDataset("test1", "/tmp/hdfs/DatasetSchemaTest/testSchema/",
initialPartitionsCount, "/nation.schema");
HoodieHiveDatasetSyncTask dataset =
HoodieHiveDatasetSyncTask.newBuilder().withReference(metadata)
.withConfiguration(TestUtil.hDroneConfiguration).build();
dataset.sync();
dataset = HoodieHiveDatasetSyncTask.newBuilder(dataset).build();
int newSchemaVersion = 2;
int newPartitionsCount = 2;
TestUtil.evolveDataset(metadata, newPartitionsCount, "/nation_evolved.schema",
DateTime.now().getMillis(), newSchemaVersion);
dataset = HoodieHiveDatasetSyncTask.newBuilder(dataset).build();
assertEquals("There should be " + newPartitionsCount + " partitions to be added",
newPartitionsCount, dataset.getNewPartitions().size());
dataset.sync();
dataset = HoodieHiveDatasetSyncTask.newBuilder(dataset).build();
MessageType newDatasetSchema = dataset.getSchemaSyncTask().getStorageSchema();
MessageType expectedSchema = TestUtil.readSchema("/nation_evolved.schema");
assertEquals("Table schema should be evolved schema", expectedSchema, newDatasetSchema);
assertEquals("Table schema should have 6 fields", 6,
hiveClient.getTableSchema(metadata).size());
assertEquals("Valid Evolution should be reflected", "BIGINT",
hiveClient.getTableSchema(metadata).get("region_key"));
}
}

com/uber/hoodie/hive/HiveSyncToolTest.java

@@ -0,0 +1,308 @@
/*
* Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*/
package com.uber.hoodie.hive;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import com.uber.hoodie.common.util.SchemaTestUtil;
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent;
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
import com.uber.hoodie.hive.util.SchemaUtil;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Optional;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.thrift.TException;
import org.joda.time.DateTime;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.runners.model.InitializationError;
import parquet.schema.MessageType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType;
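/**
 * Exercises HiveSyncTool end to end against the embedded HDFS/Hive services managed by
 * TestUtil: creates COPY_ON_WRITE and MERGE_ON_READ datasets, syncs them, and verifies
 * the resulting Hive table schema, partitions, and the last-synced commit recorded in
 * TBLPROPERTIES.
 */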
@SuppressWarnings("ConstantConditions")
public class HiveSyncToolTest {
@Before
public void setUp() throws IOException, InterruptedException, URISyntaxException {
TestUtil.setUp();
}
@After
public void teardown() throws IOException, InterruptedException {
TestUtil.clear();
}
/**
* Testing converting array types to Hive field declaration strings,
* according to the Parquet-113 spec:
* https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
*/
@Test
public void testSchemaConvertArray() throws IOException {
// Testing the 3-level annotation structure
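// The builder below produces the canonical 3-level LIST form:
//   optional group int_list (LIST) {
//     repeated group list { optional int32 element; }
//   }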
MessageType schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().optional(PrimitiveType.PrimitiveTypeName.INT32).named("element")
.named("list").named("int_list").named("ArrayOfInts");
String schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`int_list` ARRAY< int>", schemaString);
// An array of arrays
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().requiredGroup().as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list")
.named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`int_list_list` ARRAY< ARRAY< int>>", schemaString);
// A list of integers
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeated(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("int_list")
.named("ArrayOfInts");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`int_list` ARRAY< int>", schemaString);
// A list of structs with two fields
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.required(PrimitiveType.PrimitiveTypeName.INT32).named("num").named("element")
.named("tuple_list").named("ArrayOfTuples");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`tuple_list` ARRAY< STRUCT< `str` : binary, `num` : int>>", schemaString);
// A list of structs with a single field
// For this case, since the inner group name is "array", we treat the
// element type as a one-element struct.
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.named("array").named("one_tuple_list").named("ArrayOfOneTuples");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
// A list of structs with a single field
// For this case, since the inner group name ends with "_tuple", we also treat the
// element type as a one-element struct.
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.named("one_tuple_list_tuple").named("one_tuple_list").named("ArrayOfOneTuples2");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
// A list of structs with a single field
// Unlike the above two cases, for this the element type is the type of the
// only field in the struct.
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.named("one_tuple_list").named("one_tuple_list").named("ArrayOfOneTuples3");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`one_tuple_list` ARRAY< binary>", schemaString);
// A list of maps
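// Expected shape of the schema built below:
//   optional group map_list (LIST) {
//     repeated group array (MAP) {
//       repeated group key_value (MAP_KEY_VALUE) {
//         required binary string_key (UTF8);
//         required int32 int_value;
//       }
//     }
//   }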
schema =
parquet.schema.Types.buildMessage().optionalGroup().as(parquet.schema.OriginalType.LIST)
.repeatedGroup().as(OriginalType.MAP).repeatedGroup().as(OriginalType.MAP_KEY_VALUE)
.required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
.named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32)
.named("int_value").named("key_value").named("array").named("map_list")
.named("ArrayOfMaps");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`map_list` ARRAY< MAP< string, int>>", schemaString);
}
@Test
public void testBasicSync()
throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
String commitTime = "100";
TestUtil.createCOWDataset(commitTime, 5);
HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
hiveClient.doesTableExist());
// Let's do the sync
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
tool.syncHoodieTable();
assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
hiveClient.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition field",
hiveClient.getTableSchema().size(),
hiveClient.getDataSchema().getColumns().size() + 1);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
commitTime,
hiveClient.getLastCommitTimeSynced().get());
}
@Test
public void testSyncIncremental()
throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
String commitTime1 = "100";
TestUtil.createCOWDataset(commitTime1, 5);
HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
// Let's do the sync
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
tool.syncHoodieTable();
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
commitTime1,
hiveClient.getLastCommitTimeSynced().get());
// Now let's create more partitions; these are the only ones which need to be synced
DateTime dateTime = DateTime.now().plusDays(6);
String commitTime2 = "101";
TestUtil.addCOWPartitions(1, true, dateTime, commitTime2);
// Let's do the sync
hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
List<String> writtenPartitionsSince = hiveClient
.getPartitionsWrittenToSince(Optional.of(commitTime1));
assertEquals("We should have one partition written after 100 commit", 1,
writtenPartitionsSince.size());
List<Partition> hivePartitions = hiveClient.scanTablePartitions();
List<PartitionEvent> partitionEvents = hiveClient
.getPartitionEvents(hivePartitions, writtenPartitionsSince);
assertEquals("There should be only one paritition event", 1, partitionEvents.size());
assertEquals("The one partition event must of type ADD", PartitionEventType.ADD,
partitionEvents.iterator().next().eventType);
tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
tool.syncHoodieTable();
// Sync should add the one partition
assertEquals("The one partition we wrote should be added to hive", 6,
hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be 101",
commitTime2,
hiveClient.getLastCommitTimeSynced().get());
}
@Test
public void testSyncIncrementalWithSchemaEvolution()
throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
String commitTime1 = "100";
TestUtil.createCOWDataset(commitTime1, 5);
HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
// Let's do the sync
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
tool.syncHoodieTable();
int fields = hiveClient.getTableSchema().size();
// Now let's create more partitions; these are the only ones which need to be synced
DateTime dateTime = DateTime.now().plusDays(6);
String commitTime2 = "101";
TestUtil.addCOWPartitions(1, false, dateTime, commitTime2);
// Let's do the sync
tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
tool.syncHoodieTable();
assertEquals("Hive Schema has evolved and should not be 3 more field",
fields + 3,
hiveClient.getTableSchema().size());
assertEquals("Hive Schema has evolved - Field favorite_number has evolved from int to long",
"BIGINT",
hiveClient.getTableSchema().get("favorite_number"));
assertTrue("Hive Schema has evolved - Field favorite_movie was added",
hiveClient.getTableSchema().containsKey("favorite_movie"));
// Sync should add the one partition
assertEquals("The one partition we wrote should be added to hive", 6,
hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be 101",
commitTime2,
hiveClient.getLastCommitTimeSynced().get());
}
@Test
public void testSyncMergeOnRead()
throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
String commitTime = "100";
String deltaCommitTime = "101";
TestUtil.createMORDataset(commitTime, deltaCommitTime, 5);
HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
hiveClient.doesTableExist());
// Let's do the sync
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
tool.syncHoodieTable();
assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
hiveClient.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition field",
hiveClient.getTableSchema().size(), SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
deltaCommitTime,
hiveClient.getLastCommitTimeSynced().get());
// Now let's create more partitions; these are the only ones which need to be synced
DateTime dateTime = DateTime.now().plusDays(6);
String commitTime2 = "102";
String deltaCommitTime2 = "103";
TestUtil.addCOWPartitions(1, true, dateTime, commitTime2);
TestUtil.addMORPartitions(1, true, false, dateTime, commitTime2, deltaCommitTime2);
// Let's do the sync
tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
tool.syncHoodieTable();
hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
assertEquals("Hive Schema should match the evolved dataset schema + partition field",
hiveClient.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
// Sync should add the one new partition
assertEquals("The new partition we wrote should be added to hive", 6,
hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be 103",
deltaCommitTime2,
hiveClient.getLastCommitTimeSynced().get());
}
}

com/uber/hoodie/hive/TestUtil.java

@@ -0,0 +1,353 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.hive;
import static com.uber.hoodie.common.model.HoodieTestUtils.DEFAULT_TASK_PARTITIONID;
import static org.junit.Assert.fail;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.minicluster.HdfsTestService;
import com.uber.hoodie.common.minicluster.ZookeeperTestService;
import com.uber.hoodie.common.model.CompactionWriteStat;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieDeltaWriteStat;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.model.HoodieWriteStat;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.common.table.log.HoodieLogFormat;
import com.uber.hoodie.common.table.log.HoodieLogFormat.Writer;
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.SchemaTestUtil;
import com.uber.hoodie.hive.util.HiveTestService;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hive.service.server.HiveServer2;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.zookeeper.server.ZooKeeperServer;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.junit.runners.model.InitializationError;
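/**
 * Test harness for the hive sync tests: boots mini HDFS, ZooKeeper, and HiveServer2
 * services once per JVM, and provides helpers to create COPY_ON_WRITE and MERGE_ON_READ
 * datasets, add partitions with parquet and log files, and write the matching commit
 * metadata to the timeline.
 */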
@SuppressWarnings("SameParameterValue")
public class TestUtil {
private static MiniDFSCluster dfsCluster;
private static ZooKeeperServer zkServer;
private static HiveServer2 hiveServer;
private static Configuration configuration;
static HiveSyncConfig hiveSyncConfig;
private static DateTimeFormatter dtfOut;
static FileSystem fileSystem;
private static Set<String> createdTablesSet = Sets.newHashSet();
public static void setUp() throws IOException, InterruptedException, URISyntaxException {
if (dfsCluster == null) {
HdfsTestService service = new HdfsTestService();
dfsCluster = service.start(true);
configuration = service.getHadoopConf();
}
if (zkServer == null) {
ZookeeperTestService zkService = new ZookeeperTestService(configuration);
zkServer = zkService.start();
}
if (hiveServer == null) {
HiveTestService hiveService = new HiveTestService(configuration);
hiveServer = hiveService.start();
}
fileSystem = FileSystem.get(configuration);
hiveSyncConfig = new HiveSyncConfig();
hiveSyncConfig.jdbcUrl = "jdbc:hive2://127.0.0.1:9999/";
hiveSyncConfig.databaseName = "hdrone_test";
hiveSyncConfig.hiveUser = "";
hiveSyncConfig.hivePass = "";
hiveSyncConfig.databaseName = "testdb";
hiveSyncConfig.tableName = "test1";
hiveSyncConfig.basePath = "/tmp/hdfs/HiveSyncToolTest/";
hiveSyncConfig.assumeDatePartitioning = true;
hiveSyncConfig.partitionFields = Lists.newArrayList("datestr");
dtfOut = DateTimeFormat.forPattern("yyyy/MM/dd");
clear();
}
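// Resets state between tests: wipes the base path, re-initializes a COPY_ON_WRITE table
// there, and drops any hive tables and the test database created by earlier runs.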
static void clear() throws IOException {
fileSystem.delete(new Path(hiveSyncConfig.basePath), true);
HoodieTableMetaClient
.initTableType(fileSystem, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
hiveSyncConfig.tableName);
HoodieHiveClient client = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(),
fileSystem);
for (String tableName : createdTablesSet) {
client.updateHiveSQL("drop table if exists " + tableName);
}
createdTablesSet.clear();
client.updateHiveSQL(
"drop database if exists " + hiveSyncConfig.databaseName);
client.updateHiveSQL("create database " + hiveSyncConfig.databaseName);
}
static HiveConf getHiveConf() {
return hiveServer.getHiveConf();
}
@SuppressWarnings("unused")
public static void shutdown() {
if (hiveServer != null) {
hiveServer.stop();
}
if (dfsCluster != null) {
dfsCluster.shutdown();
}
if (zkServer != null) {
zkServer.shutdown();
}
}
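// Creates a fresh COPY_ON_WRITE dataset: initializes the hoodie table, writes parquet
// files into date-based partitions, and records a single commit on the timeline.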
static void createCOWDataset(String commitTime, int numberOfPartitions)
throws IOException, InitializationError, URISyntaxException, InterruptedException {
Path path = new Path(hiveSyncConfig.basePath);
FileUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
HoodieTableMetaClient
.initTableType(fileSystem, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
hiveSyncConfig.tableName);
boolean result = fileSystem.mkdirs(path);
checkResult(result);
DateTime dateTime = DateTime.now();
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, commitTime);
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
createCommitFile(commitMetadata, commitTime);
}
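// Creates a MERGE_ON_READ dataset: the parquet files written by createPartitions are
// registered via a compaction commit, then a log file is written per data file and
// recorded as a delta commit on top of them.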
static void createMORDataset(String commitTime, String deltaCommitTime, int numberOfPartitions)
throws IOException, InitializationError, URISyntaxException, InterruptedException {
Path path = new Path(hiveSyncConfig.basePath);
FileUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
HoodieTableMetaClient
.initTableType(fileSystem, hiveSyncConfig.basePath, HoodieTableType.MERGE_ON_READ,
hiveSyncConfig.tableName);
boolean result = fileSystem.mkdirs(path);
checkResult(result);
DateTime dateTime = DateTime.now();
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, commitTime);
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
HoodieCompactionMetadata compactionMetadata = new HoodieCompactionMetadata();
commitMetadata.getPartitionToWriteStats()
.forEach((key, value) -> value.stream().map(k -> new CompactionWriteStat(k, key, 0, 0, 0))
.forEach(l -> compactionMetadata.addWriteStat(key, l)));
createCompactionCommitFile(compactionMetadata, commitTime);
// Write a delta commit
HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), true);
createDeltaCommitFile(deltaMetadata, deltaCommitTime);
}
static void addCOWPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
DateTime startFrom, String commitTime)
throws IOException, URISyntaxException, InterruptedException {
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions,
isParquetSchemaSimple, startFrom, commitTime);
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
createCommitFile(commitMetadata, commitTime);
}
static void addMORPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
boolean isLogSchemaSimple, DateTime startFrom,
String commitTime, String deltaCommitTime)
throws IOException, URISyntaxException, InterruptedException {
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions,
isParquetSchemaSimple, startFrom, commitTime);
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
HoodieCompactionMetadata compactionMetadata = new HoodieCompactionMetadata();
commitMetadata.getPartitionToWriteStats()
.forEach((key, value) -> value.stream().map(k -> new CompactionWriteStat(k, key, 0, 0, 0))
.forEach(l -> compactionMetadata.addWriteStat(key, l)));
createCompactionCommitFile(compactionMetadata, commitTime);
HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), isLogSchemaSimple);
createDeltaCommitFile(deltaMetadata, deltaCommitTime);
}
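// For every parquet file in the given write stats, generates a hoodie log file next to
// it and collects the resulting delta write stats into a new commit metadata object.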
private static HoodieCommitMetadata createLogFiles(
HashMap<String, List<HoodieWriteStat>> partitionWriteStats, boolean isLogSchemaSimple)
throws InterruptedException, IOException, URISyntaxException {
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
for (Entry<String, List<HoodieWriteStat>> wEntry : partitionWriteStats.entrySet()) {
String partitionPath = wEntry.getKey();
for (HoodieWriteStat wStat : wEntry.getValue()) {
Path path = new Path(wStat.getFullPath());
HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(path));
HoodieLogFile logFile = generateLogData(path, isLogSchemaSimple);
HoodieDeltaWriteStat writeStat = new HoodieDeltaWriteStat();
writeStat.setFileId(dataFile.getFileId());
writeStat.setFullPath(logFile.getPath().toString());
commitMetadata.addWriteStat(partitionPath, writeStat);
}
}
return commitMetadata;
}
private static HoodieCommitMetadata createPartitions(int numberOfPartitions,
boolean isParquetSchemaSimple, DateTime startFrom, String commitTime)
throws IOException, URISyntaxException, InterruptedException {
startFrom = startFrom.withTimeAtStartOfDay();
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
for (int i = 0; i < numberOfPartitions; i++) {
String partitionPath = dtfOut.print(startFrom);
Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath);
fileSystem.makeQualified(partPath);
fileSystem.mkdirs(partPath);
List<HoodieWriteStat> writeStats = createTestData(partPath, isParquetSchemaSimple, commitTime);
startFrom = startFrom.minusDays(1);
writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s));
}
return commitMetadata;
}
private static List<HoodieWriteStat> createTestData(Path partPath, boolean isParquetSchemaSimple,
String commitTime) throws IOException, URISyntaxException, InterruptedException {
List<HoodieWriteStat> writeStats = Lists.newArrayList();
for (int i = 0; i < 5; i++) {
// Create 5 files
String fileId = UUID.randomUUID().toString();
Path filePath = new Path(partPath.toString() + "/" + FSUtils
.makeDataFileName(commitTime, DEFAULT_TASK_PARTITIONID, fileId));
generateParquetData(filePath, isParquetSchemaSimple);
HoodieWriteStat writeStat = new HoodieWriteStat();
writeStat.setFileId(fileId);
writeStat.setFullPath(filePath.toString());
writeStats.add(writeStat);
}
return writeStats;
}
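// Writes 100 test records in either the simple or the evolved avro schema as a parquet
// file, going through HoodieAvroWriteSupport so a bloom filter is populated alongside,
// as a real hoodie write would do.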
@SuppressWarnings({"unchecked", "deprecation"})
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
throws IOException, URISyntaxException, InterruptedException {
Schema schema = (isParquetSchemaSimple ? SchemaTestUtil.getSimpleSchema()
: SchemaTestUtil.getEvolvedSchema());
org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
BloomFilter filter = new BloomFilter(1000, 0.0001);
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
ParquetWriter writer = new ParquetWriter(filePath,
writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE,
ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION,
fileSystem.getConf());
List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil
.generateTestRecords(0, 100)
: SchemaTestUtil.generateEvolvedTestRecords(100, 100));
testRecords.forEach(s -> {
try {
writer.write(s);
} catch (IOException e) {
fail("IOException while writing test records as parquet" + e.toString());
}
});
writer.close();
}
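// Writes a single avro data block of test records into a new hoodie log file that shares
// its file id and base commit with the given parquet file.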
private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLogSchemaSimple)
throws IOException, InterruptedException, URISyntaxException {
Schema schema = (isLogSchemaSimple ? SchemaTestUtil.getSimpleSchema()
: SchemaTestUtil.getEvolvedSchema());
HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(parquetFilePath));
// Write a log file for this parquet file
Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(parquetFilePath.getParent())
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(dataFile.getFileId())
.overBaseCommit(dataFile.getCommitTime()).withFs(fileSystem).build();
List<IndexedRecord> records = (isLogSchemaSimple ? SchemaTestUtil
.generateTestRecords(0, 100)
: SchemaTestUtil.generateEvolvedTestRecords(100, 100));
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, schema);
logWriter.appendBlock(dataBlock);
logWriter.close();
return logWriter.getLogFile();
}
private static void checkResult(boolean result) throws InitializationError {
if (!result) {
throw new InitializationError("Could not initialize");
}
}
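// The three writers below serialize commit metadata as JSON into the dataset's .hoodie
// metafolder, using the timeline's file-name conventions for commit, compaction, and
// delta-commit instants.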
private static void createCommitFile(
HoodieCommitMetadata commitMetadata, String commitTime)
throws IOException {
byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
Path fullPath = new Path(
hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeCommitFileName(commitTime));
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
fsout.write(bytes);
fsout.close();
}
private static void createCompactionCommitFile(
HoodieCompactionMetadata commitMetadata, String commitTime)
throws IOException {
byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
Path fullPath = new Path(
hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeCompactionFileName(commitTime));
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
fsout.write(bytes);
fsout.close();
}
private static void createDeltaCommitFile(
HoodieCommitMetadata deltaCommitMetadata, String deltaCommitTime)
throws IOException {
byte[] bytes = deltaCommitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
Path fullPath = new Path(
hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeDeltaFileName(deltaCommitTime));
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
fsout.write(bytes);
fsout.close();
}
}

com/uber/hoodie/hive/util/CsvParquetWriter.java

@@ -1,44 +0,0 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.hive.util;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.MessageType;
import java.io.IOException;
import java.util.List;
public class CsvParquetWriter extends ParquetWriter<List<String>> {
public CsvParquetWriter(Path file, MessageType schema) throws IOException {
this(file, schema, false);
}
public CsvParquetWriter(Path file, MessageType schema, boolean enableDictionary)
throws IOException {
this(file, schema, CompressionCodecName.UNCOMPRESSED, enableDictionary);
}
public CsvParquetWriter(Path file, MessageType schema, CompressionCodecName codecName,
boolean enableDictionary) throws IOException {
super(file, new CsvWriteSupport(schema), codecName,
DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE, enableDictionary, false);
}
}

com/uber/hoodie/hive/util/CsvWriteSupport.java

@@ -1,94 +0,0 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.hive.util;
import org.apache.hadoop.conf.Configuration;
import parquet.column.ColumnDescriptor;
import parquet.hadoop.api.WriteSupport;
import parquet.io.ParquetEncodingException;
import parquet.io.api.Binary;
import parquet.io.api.RecordConsumer;
import parquet.schema.MessageType;
import java.util.HashMap;
import java.util.List;
public class CsvWriteSupport extends WriteSupport<List<String>> {
MessageType schema;
RecordConsumer recordConsumer;
List<ColumnDescriptor> cols;
// TODO: support specifying encodings and compression
public CsvWriteSupport(MessageType schema) {
this.schema = schema;
this.cols = schema.getColumns();
}
@Override public WriteContext init(Configuration config) {
return new WriteContext(schema, new HashMap<String, String>());
}
@Override public void prepareForWrite(RecordConsumer r) {
recordConsumer = r;
}
@Override public void write(List<String> values) {
if (values.size() != cols.size()) {
throw new ParquetEncodingException("Invalid input data. Expecting " +
cols.size() + " columns. Input had " + values.size() + " columns (" + cols + ") : "
+ values);
}
recordConsumer.startMessage();
for (int i = 0; i < cols.size(); ++i) {
String val = values.get(i);
// val.length() == 0 indicates a NULL value.
if (val.length() > 0) {
recordConsumer.startField(cols.get(i).getPath()[0], i);
switch (cols.get(i).getType()) {
case BOOLEAN:
recordConsumer.addBoolean(Boolean.parseBoolean(val));
break;
case FLOAT:
recordConsumer.addFloat(Float.parseFloat(val));
break;
case DOUBLE:
recordConsumer.addDouble(Double.parseDouble(val));
break;
case INT32:
recordConsumer.addInteger(Integer.parseInt(val));
break;
case INT64:
recordConsumer.addLong(Long.parseLong(val));
break;
case BINARY:
recordConsumer.addBinary(stringToBinary(val));
break;
default:
throw new ParquetEncodingException(
"Unsupported column type: " + cols.get(i).getType());
}
recordConsumer.endField(cols.get(i).getPath()[0], i);
}
}
recordConsumer.endMessage();
}
private Binary stringToBinary(Object value) {
return Binary.fromString(value.toString());
}
}

com/uber/hoodie/hive/util/TestUtil.java

@@ -1,201 +0,0 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.hive.util;
import com.google.common.collect.Sets;
import com.uber.hoodie.common.minicluster.HdfsTestService;
import com.uber.hoodie.common.minicluster.ZookeeperTestService;
import com.uber.hoodie.hive.HoodieHiveConfiguration;
import com.uber.hoodie.hive.client.HoodieHiveClient;
import com.uber.hoodie.hive.model.HoodieDatasetReference;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hive.service.server.HiveServer2;
import org.apache.zookeeper.server.ZooKeeperServer;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.junit.runners.model.InitializationError;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Set;
import java.util.regex.Pattern;
public class TestUtil {
private static MiniDFSCluster dfsCluster;
private static ZooKeeperServer zkServer;
private static HiveServer2 hiveServer;
public static Configuration configuration;
public static HoodieHiveConfiguration hDroneConfiguration;
private static DateTimeFormatter dtfOut;
public static final String CSV_DELIMITER = "|";
private static FileSystem fileSystem;
private static Set<String> createdTablesSet = Sets.newHashSet();
public static void setUp() throws IOException, InterruptedException {
if (dfsCluster == null) {
HdfsTestService service = new HdfsTestService();
dfsCluster = service.start(true);
configuration = service.getHadoopConf();
}
if (zkServer == null) {
ZookeeperTestService zkService = new ZookeeperTestService(configuration);
zkServer = zkService.start();
}
if (hiveServer == null) {
HiveTestService hiveService = new HiveTestService(configuration);
hiveServer = hiveService.start();
}
hDroneConfiguration =
HoodieHiveConfiguration.newBuilder().hiveJdbcUrl("jdbc:hive2://127.0.0.1:9999/")
.hivedb("hdrone_test").jdbcUsername("").jdbcPassword("")
.hadoopConfiguration(hiveServer.getHiveConf()).build();
dtfOut = DateTimeFormat.forPattern("yyyy/MM/dd");
HoodieHiveClient client = new HoodieHiveClient(hDroneConfiguration);
for (String tableName : createdTablesSet) {
client.updateHiveSQL("drop table if exists " + tableName);
}
createdTablesSet.clear();
client.updateHiveSQL(
"drop database if exists " + hDroneConfiguration.getDbName());
client.updateHiveSQL("create database " + hDroneConfiguration.getDbName());
fileSystem = FileSystem.get(configuration);
}
public static void shutdown() {
if (hiveServer != null) {
hiveServer.stop();
}
if (dfsCluster != null) {
dfsCluster.shutdown();
}
if (zkServer != null) {
zkServer.shutdown();
}
}
public static HoodieDatasetReference createDataset(String tableName, String hdfsPath, int numberOfPartitions,
String schemaFile) throws IOException, InitializationError {
Path path = new Path(hdfsPath);
FileUtils.deleteDirectory(new File(hdfsPath));
boolean result = fileSystem.mkdirs(path);
checkResult(result);
HoodieDatasetReference metadata =
new HoodieDatasetReference(tableName, path.toString(),
hDroneConfiguration.getDbName());
DateTime dateTime = DateTime.now();
createPartitions(metadata, numberOfPartitions, schemaFile, dateTime, 1);
createdTablesSet.add(metadata.getDatabaseTableName());
return metadata;
}
private static void createPartitions(HoodieDatasetReference metadata, int numberOfPartitions,
String schemaFile, DateTime startFrom, int schemaVersion) throws IOException {
startFrom = startFrom.withTimeAtStartOfDay();
for (int i = 0; i < numberOfPartitions; i++) {
Path partPath = new Path(metadata.getBaseDatasetPath() + "/" + dtfOut.print(startFrom));
fileSystem.makeQualified(partPath);
fileSystem.mkdirs(partPath);
createTestData(partPath, schemaFile, schemaVersion);
startFrom = startFrom.minusDays(1);
}
}
private static void createTestData(Path partPath, String schemaFile, int schemaVersion)
throws IOException {
for (int i = 0; i < 5; i++) {
// Create 5 files
Path filePath =
new Path(partPath.toString() + "/" + getParquetFilePath(schemaVersion, i));
generateParquetData(filePath, schemaFile);
}
}
private static String getParquetFilePath(int version, int iteration) {
return "test.topic.name@sjc1@SV_" + version + "@" + iteration + ".parquet";
}
public static MessageType readSchema(String schemaFile) throws IOException {
return MessageTypeParser
.parseMessageType(IOUtils.toString(TestUtil.class.getResourceAsStream(schemaFile)));
}
public static void generateParquetData(Path filePath, String schemaFile) throws IOException {
MessageType schema = readSchema(schemaFile);
CsvParquetWriter writer = new CsvParquetWriter(filePath, schema);
BufferedReader br = new BufferedReader(
new InputStreamReader(TestUtil.class.getResourceAsStream(getDataFile(schemaFile))));
String line;
try {
while ((line = br.readLine()) != null) {
String[] fields = line.split(Pattern.quote(CSV_DELIMITER));
writer.write(Arrays.asList(fields));
}
writer.close();
} finally {
br.close();
}
InputStreamReader io = null;
FSDataOutputStream hdfsPath = null;
try {
io = new FileReader(filePath.toString());
hdfsPath = fileSystem.create(filePath);
IOUtils.copy(io, hdfsPath);
} finally {
if (io != null) {
io.close();
}
if (hdfsPath != null) {
hdfsPath.close();
}
}
}
private static String getDataFile(String schemaFile) {
return schemaFile.replaceAll(".schema", ".csv");
}
private static void checkResult(boolean result) throws InitializationError {
if (!result) {
throw new InitializationError("Could not initialize");
}
}
public static void evolveDataset(HoodieDatasetReference metadata, int newPartitionCount,
String newSchema, Long startFrom, int schemaVersion) throws IOException {
createPartitions(metadata, newPartitionCount, newSchema,
new DateTime(startFrom).plusDays(newPartitionCount + 1), schemaVersion);
}
}