[HUDI-159] Redesigning bundles for lighter-weight integrations
- Documented principles applied for the redesign in packaging/README.md
- No longer depends on commons-codec, commons-io, commons-pool, commons-dbcp, commons-lang, commons-logging, or avro-mapred, among others
- Introduce a new FileIOUtils and add a checkstyle rule that flags illegal imports of the above
- Parquet and Avro dependencies moved to provided scope so they are picked up from Hive/Spark/Presto instead
- Pick up Jackson jars for the Hive sync tool from HIVE_HOME and unbundle Jackson everywhere
- Remove the hive-jdbc standalone jar from the Spark/Hive/Utilities bundles
- 6.5x reduction in the number of classes across bundles
commit 7a973a6944
parent 0e6f078ec4
committed by Balaji Varadarajan
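One of the bullets above introduces FileIOUtils so that the commons-io FileUtils call sites in the hunks below can be dropped. The actual class lives in hudi-common and is not part of these hunks; a minimal JDK-only sketch consistent with the deleteDirectory(File) call sites replaced here (the class name FileIOUtilsSketch is hypothetical) could look like:

// Hypothetical sketch only: the real org.apache.hudi.common.util.FileIOUtils is not shown in this diff.
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.stream.Stream;

public class FileIOUtilsSketch {

  // Recursively deletes a directory with java.nio only, mirroring the
  // FileUtils.deleteDirectory(File) call sites replaced in the hunks below.
  public static void deleteDirectory(File directory) throws IOException {
    if (!directory.exists()) {
      return;
    }
    try (Stream<Path> paths = Files.walk(directory.toPath())) {
      // Children must be deleted before their parent directories.
      paths.sorted(Comparator.reverseOrder()).forEach(p -> p.toFile().delete());
    }
  }
}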
@@ -65,20 +65,6 @@
     <artifactId>joda-time</artifactId>
   </dependency>

-  <!-- Apache Commons -->
-  <dependency>
-    <groupId>commons-dbcp</groupId>
-    <artifactId>commons-dbcp</artifactId>
-  </dependency>
-  <dependency>
-    <groupId>commons-pool</groupId>
-    <artifactId>commons-pool</artifactId>
-  </dependency>
-  <dependency>
-    <groupId>commons-io</groupId>
-    <artifactId>commons-io</artifactId>
-  </dependency>
-
   <dependency>
     <groupId>com.beust</groupId>
     <artifactId>jcommander</artifactId>
@@ -44,12 +44,12 @@ fi
 HIVE_EXEC=`ls ${HIVE_HOME}/lib/hive-exec-*.jar | tr '\n' ':'`
 HIVE_SERVICE=`ls ${HIVE_HOME}/lib/hive-service-*.jar | grep -v rpc | tr '\n' ':'`
 HIVE_METASTORE=`ls ${HIVE_HOME}/lib/hive-metastore-*.jar | tr '\n' ':'`
-# Hive 1.x/CDH has standalone jdbc jar which is no longer available in 2.x
-HIVE_JDBC=`ls ${HIVE_HOME}/lib/hive-jdbc-*standalone*.jar | tr '\n' ':'`
+HIVE_JDBC=`ls ${HIVE_HOME}/lib/hive-jdbc-*.jar | tr '\n' ':'`
 if [ -z "${HIVE_JDBC}" ]; then
   HIVE_JDBC=`ls ${HIVE_HOME}/lib/hive-jdbc-*.jar | grep -v handler | tr '\n' ':'`
 fi
-HIVE_JARS=$HIVE_METASTORE:$HIVE_SERVICE:$HIVE_EXEC:$HIVE_SERVICE:$HIVE_JDBC
+HIVE_JACKSON=`ls ${HIVE_HOME}/lib/jackson-*.jar | tr '\n' ':'`
+HIVE_JARS=$HIVE_METASTORE:$HIVE_SERVICE:$HIVE_EXEC:$HIVE_SERVICE:$HIVE_JDBC:$HIVE_JACKSON

 HADOOP_HIVE_JARS=${HIVE_JARS}:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/mapreduce/*:${HADOOP_HOME}/share/hadoop/hdfs/*:${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/hdfs/lib/*
@@ -24,7 +24,7 @@ import com.google.common.collect.Maps;
 import java.io.IOException;
 import java.sql.Connection;
 import java.sql.DatabaseMetaData;
-import java.sql.Driver;
+import java.sql.DriverManager;
 import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.sql.Statement;
@@ -33,9 +33,7 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
-import org.apache.commons.dbcp.BasicDataSource;
-import org.apache.commons.dbcp.ConnectionFactory;
-import org.apache.commons.dbcp.DriverConnectionFactory;
+import jline.internal.Log;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
@@ -57,13 +55,13 @@ import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.exception.InvalidDatasetException;
 import org.apache.hudi.hive.util.SchemaUtil;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
 import org.apache.parquet.format.converter.ParquetMetadataConverter;
 import org.apache.parquet.hadoop.ParquetFileReader;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.schema.MessageType;
 import org.apache.thrift.TException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 @SuppressWarnings("ConstantConditions")
 public class HoodieHiveClient {
@@ -80,7 +78,7 @@ public class HoodieHiveClient {
     }
   }

-  private static Logger LOG = LoggerFactory.getLogger(HoodieHiveClient.class);
+  private static Logger LOG = LogManager.getLogger(HoodieHiveClient.class);
   private final HoodieTableMetaClient metaClient;
   private final HoodieTableType tableType;
   private final PartitionValueExtractor partitionValueExtractor;
@@ -473,20 +471,18 @@ public class HoodieHiveClient {
 
   private void createHiveConnection() {
     if (connection == null) {
-      BasicDataSource ds = new HiveDataSource();
-      ds.setDriverClassName(HiveDriver.class.getCanonicalName());
-      ds.setUrl(getHiveJdbcUrlWithDefaultDBName());
-      if (syncConfig.hiveUser != null) {
-        ds.setUsername(syncConfig.hiveUser);
-        ds.setPassword(syncConfig.hivePass);
-      }
-      LOG.info("Getting Hive Connection from Datasource " + ds);
       try {
-        this.connection = ds.getConnection();
-        LOG.info("Successfully got Hive Connection from Datasource " + ds);
+        Class.forName(HiveDriver.class.getCanonicalName());
+      } catch (ClassNotFoundException e) {
+        Log.error("Unable to load Hive driver class", e);
+        return;
+      }
+
+      try {
+        this.connection = DriverManager.getConnection(syncConfig.jdbcUrl, syncConfig.hiveUser, syncConfig.hivePass);
+        LOG.info("Successfully established Hive connection to " + syncConfig.jdbcUrl);
       } catch (SQLException e) {
-        throw new HoodieHiveSyncException(
-            "Cannot create hive connection " + getHiveJdbcUrlWithDefaultDBName(), e);
+        throw new HoodieHiveSyncException("Cannot create hive connection " + getHiveJdbcUrlWithDefaultDBName(), e);
       }
     }
   }
@@ -627,54 +623,4 @@ public class HoodieHiveClient {
       return new PartitionEvent(PartitionEventType.UPDATE, storagePartition);
     }
   }
-
-  /**
-   * There is a bug in BasicDataSource implementation (dbcp-1.4) which does not allow custom version of Driver (needed
-   * to talk to older version of HiveServer2 including CDH-5x). This is fixed in dbcp-2x but we are using dbcp1.4.
-   * Adding a workaround here. TODO: varadarb We need to investigate moving to dbcp-2x
-   */
-  protected class HiveDataSource extends BasicDataSource {
-
-    protected ConnectionFactory createConnectionFactory() throws SQLException {
-      try {
-        Driver driver = HiveDriver.class.newInstance();
-        // Can't test without a validationQuery
-        if (validationQuery == null) {
-          setTestOnBorrow(false);
-          setTestOnReturn(false);
-          setTestWhileIdle(false);
-        }
-
-        // Set up the driver connection factory we will use
-        String user = username;
-        if (user != null) {
-          connectionProperties.put("user", user);
-        } else {
-          log("DBCP DataSource configured without a 'username'");
-        }
-
-        String pwd = password;
-        if (pwd != null) {
-          connectionProperties.put("password", pwd);
-        } else {
-          log("DBCP DataSource configured without a 'password'");
-        }
-
-        ConnectionFactory driverConnectionFactory = new DriverConnectionFactory(driver, url, connectionProperties);
-        return driverConnectionFactory;
-      } catch (Throwable x) {
-        LOG.warn("Got exception trying to instantiate connection factory. Trying default instantiation", x);
-        return super.createConnectionFactory();
-      }
-    }
-
-    @Override
-    public String toString() {
-      return "HiveDataSource{"
-          + "driverClassName='" + driverClassName + '\''
-          + ", driverClassLoader=" + driverClassLoader
-          + ", url='" + url + '\''
-          + '}';
-    }
-  }
 }
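The removed Javadoc above documents why the dbcp-1.4 based HiveDataSource workaround existed: BasicDataSource could not take the custom Driver needed to talk to older HiveServer2/CDH-5x. With commons-dbcp gone from the bundles, the sync client now obtains its single connection straight from DriverManager, so no pool or workaround is needed. A self-contained sketch of that pattern follows; the driver class name and the jdbcUrl/user/pass parameters are placeholders, not values taken from this diff:

// Minimal sketch of the DriverManager-based connection path that replaces the dbcp workaround.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

public class HiveConnectionSketch {

  // Opens a plain JDBC connection to HiveServer2 without a connection pool.
  public static Connection connect(String jdbcUrl, String user, String pass) throws SQLException {
    try {
      // The Hive JDBC driver is expected on the classpath at runtime (e.g. from HIVE_HOME jars).
      Class.forName("org.apache.hive.jdbc.HiveDriver");
    } catch (ClassNotFoundException e) {
      throw new IllegalStateException("Hive JDBC driver not found on the classpath", e);
    }
    return DriverManager.getConnection(jdbcUrl, user, pass);
  }
}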
@@ -36,6 +36,8 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.hive.HiveSyncConfig;
 import org.apache.hudi.hive.HoodieHiveSyncException;
 import org.apache.hudi.hive.SchemaDifference;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
 import org.apache.parquet.avro.AvroSchemaConverter;
 import org.apache.parquet.schema.DecimalMetadata;
 import org.apache.parquet.schema.GroupType;
@@ -43,15 +45,13 @@ import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.OriginalType;
 import org.apache.parquet.schema.PrimitiveType;
 import org.apache.parquet.schema.Type;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 /**
  * Schema Utilities
  */
 public class SchemaUtil {

-  private static final Logger LOG = LoggerFactory.getLogger(SchemaUtil.class);
+  private static final Logger LOG = LogManager.getLogger(SchemaUtil.class);

   /**
    * Get the schema difference between the storage schema and hive table schema
@@ -34,7 +34,6 @@ import java.util.Set;
 import java.util.UUID;
 import org.apache.avro.Schema;
 import org.apache.avro.generic.IndexedRecord;
-import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
@@ -61,6 +60,7 @@ import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
 import org.apache.hudi.common.util.FSUtils;
+import org.apache.hudi.common.util.FileIOUtils;
 import org.apache.hudi.common.util.SchemaTestUtil;
 import org.apache.hudi.hive.util.HiveTestService;
 import org.apache.parquet.avro.AvroSchemaConverter;
@@ -153,7 +153,7 @@ public class TestUtil {
   static void createCOWDataset(String commitTime, int numberOfPartitions)
       throws IOException, InitializationError, URISyntaxException, InterruptedException {
     Path path = new Path(hiveSyncConfig.basePath);
-    FileUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
+    FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
     HoodieTableMetaClient
         .initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
             hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
@@ -169,7 +169,7 @@ public class TestUtil {
   static void createMORDataset(String commitTime, String deltaCommitTime, int numberOfPartitions)
       throws IOException, InitializationError, URISyntaxException, InterruptedException {
     Path path = new Path(hiveSyncConfig.basePath);
-    FileUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
+    FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
     HoodieTableMetaClient
         .initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.MERGE_ON_READ,
             hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
@@ -28,7 +28,6 @@ import java.net.SocketException;
 import java.util.Map;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
-import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
@@ -41,6 +40,7 @@ import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.thrift.TUGIContainingTransport;
 import org.apache.hive.service.server.HiveServer2;
 import org.apache.hudi.common.model.HoodieTestUtils;
+import org.apache.hudi.common.util.FileIOUtils;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.thrift.TProcessor;
@@ -95,7 +95,7 @@ public class HiveTestService {
     if (clean) {
       LOG.info("Cleaning Hive cluster data at: " + localHiveLocation + " and starting fresh.");
       File file = new File(localHiveLocation);
-      FileUtils.deleteDirectory(file);
+      FileIOUtils.deleteDirectory(file);
     }

     HiveConf serverConf = configureHive(hadoopConf, localHiveLocation);