Docker Container Build and Run setup with foundations for adding docker integration tests. Docker images built with Hadoop 2.8.4 Hive 2.3.3 and Spark 2.3.1 and published to docker-hub
Look at quickstart document for how to setup docker and run demo
This commit is contained in:
committed by
vinoth chandar
parent
9710b5a3a6
commit
f3418e4718
13
docker/build_local_docker_images.sh
Executable file
13
docker/build_local_docker_images.sh
Executable file
@@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
read -p "Docker images can be downloaded from docker hub and seamlessly mounted with latest HUDI jars. Do you still want to build docker images from scratch ?" yn
|
||||||
|
case $yn in
|
||||||
|
[Yy]* ) make install; break;;
|
||||||
|
[Nn]* ) exit;;
|
||||||
|
* ) echo "Please answer yes or no.";;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
pushd ../
|
||||||
|
mvn clean pre-integration-test -DskipTests -Ddocker.compose.skip=true -Ddocker.build.skip=false
|
||||||
|
popd
|
||||||
217
docker/compose/docker-compose_hadoop284_hive233_spark231.yml
Normal file
217
docker/compose/docker-compose_hadoop284_hive233_spark231.yml
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
version: "3.3"
|
||||||
|
|
||||||
|
services:
|
||||||
|
|
||||||
|
namenode:
|
||||||
|
image: varadarb/hudi-hadoop_2.8.4-namenode:latest
|
||||||
|
hostname: namenode
|
||||||
|
container_name: namenode
|
||||||
|
volumes:
|
||||||
|
- /tmp/hadoop_name:/hadoop/dfs/name
|
||||||
|
environment:
|
||||||
|
- CLUSTER_NAME=hudi_hadoop284_hive232_spark231
|
||||||
|
ports:
|
||||||
|
- "50070:50070"
|
||||||
|
- "8020:8020"
|
||||||
|
env_file:
|
||||||
|
- ./hadoop.env
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://namenode:50070"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
datanode1:
|
||||||
|
image: varadarb/hudi-hadoop_2.8.4-datanode:latest
|
||||||
|
container_name: datanode1
|
||||||
|
hostname: datanode1
|
||||||
|
environment:
|
||||||
|
- CLUSTER_NAME=hudi_hadoop284_hive232_spark231
|
||||||
|
env_file:
|
||||||
|
- ./hadoop.env
|
||||||
|
ports:
|
||||||
|
- "50075:50075"
|
||||||
|
- "50010:50010"
|
||||||
|
links:
|
||||||
|
- "namenode"
|
||||||
|
- "historyserver"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://datanode1:50075"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
depends_on:
|
||||||
|
- namenode
|
||||||
|
volumes:
|
||||||
|
- /tmp/hadoop_data:/hadoop/dfs/data
|
||||||
|
|
||||||
|
historyserver:
|
||||||
|
image: varadarb/hudi-hadoop_2.8.4-history:latest
|
||||||
|
hostname: historyserver
|
||||||
|
container_name: historyserver
|
||||||
|
environment:
|
||||||
|
- CLUSTER_NAME=hudi_hadoop284_hive232_spark231
|
||||||
|
depends_on:
|
||||||
|
- "namenode"
|
||||||
|
links:
|
||||||
|
- "namenode"
|
||||||
|
ports:
|
||||||
|
- "58188:8188"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://historyserver:8188"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
env_file:
|
||||||
|
- ./hadoop.env
|
||||||
|
volumes:
|
||||||
|
- historyserver:/hadoop/yarn/timeline
|
||||||
|
|
||||||
|
hive-metastore-postgresql:
|
||||||
|
image: bde2020/hive-metastore-postgresql:2.3.0
|
||||||
|
volumes:
|
||||||
|
- hive-metastore-postgresql:/var/lib/postgresql
|
||||||
|
hostname: hive-metastore-postgresql
|
||||||
|
container_name: hive-metastore-postgresql
|
||||||
|
|
||||||
|
hivemetastore:
|
||||||
|
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3:latest
|
||||||
|
hostname: hivemetastore
|
||||||
|
container_name: hivemetastore
|
||||||
|
links:
|
||||||
|
- "hive-metastore-postgresql"
|
||||||
|
- "namenode"
|
||||||
|
env_file:
|
||||||
|
- ./hadoop.env
|
||||||
|
command: /opt/hive/bin/hive --service metastore
|
||||||
|
environment:
|
||||||
|
SERVICE_PRECONDITION: "namenode:50070 hive-metastore-postgresql:5432"
|
||||||
|
ports:
|
||||||
|
- "9083:9083"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "nc", "-z", "hivemetastore", "9083"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
depends_on:
|
||||||
|
- "hive-metastore-postgresql"
|
||||||
|
- "namenode"
|
||||||
|
|
||||||
|
hiveserver:
|
||||||
|
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3:latest
|
||||||
|
hostname: hiveserver
|
||||||
|
container_name: hiveserver
|
||||||
|
env_file:
|
||||||
|
- ./hadoop.env
|
||||||
|
environment:
|
||||||
|
SERVICE_PRECONDITION: "hivemetastore:9083"
|
||||||
|
ports:
|
||||||
|
- "10000:10000"
|
||||||
|
depends_on:
|
||||||
|
- "hivemetastore"
|
||||||
|
links:
|
||||||
|
- "hivemetastore"
|
||||||
|
- "hive-metastore-postgresql"
|
||||||
|
- "namenode"
|
||||||
|
volumes:
|
||||||
|
- ${HUDI_WS}:/var/hoodie/ws
|
||||||
|
|
||||||
|
sparkmaster:
|
||||||
|
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.3.1:latest
|
||||||
|
hostname: sparkmaster
|
||||||
|
container_name: sparkmaster
|
||||||
|
env_file:
|
||||||
|
- ./hadoop.env
|
||||||
|
ports:
|
||||||
|
- "8080:8080"
|
||||||
|
- "7077:7077"
|
||||||
|
environment:
|
||||||
|
- INIT_DAEMON_STEP=setup_spark
|
||||||
|
links:
|
||||||
|
- "hivemetastore"
|
||||||
|
- "hiveserver"
|
||||||
|
- "hive-metastore-postgresql"
|
||||||
|
- "namenode"
|
||||||
|
|
||||||
|
spark-worker-1:
|
||||||
|
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.3.1:latest
|
||||||
|
hostname: spark-worker-1
|
||||||
|
container_name: spark-worker-1
|
||||||
|
env_file:
|
||||||
|
- ./hadoop.env
|
||||||
|
depends_on:
|
||||||
|
- sparkmaster
|
||||||
|
ports:
|
||||||
|
- "8081:8081"
|
||||||
|
environment:
|
||||||
|
- "SPARK_MASTER=spark://sparkmaster:7077"
|
||||||
|
links:
|
||||||
|
- "hivemetastore"
|
||||||
|
- "hiveserver"
|
||||||
|
- "hive-metastore-postgresql"
|
||||||
|
- "namenode"
|
||||||
|
|
||||||
|
zookeeper:
|
||||||
|
image: 'bitnami/zookeeper:3.4.12-r68'
|
||||||
|
hostname: zookeeper
|
||||||
|
container_name: zookeeper
|
||||||
|
ports:
|
||||||
|
- '2181:2181'
|
||||||
|
environment:
|
||||||
|
- ALLOW_ANONYMOUS_LOGIN=yes
|
||||||
|
|
||||||
|
kafka:
|
||||||
|
image: 'bitnami/kafka:2.0.0'
|
||||||
|
hostname: kafkabroker
|
||||||
|
container_name: kafkabroker
|
||||||
|
ports:
|
||||||
|
- '9092:9092'
|
||||||
|
environment:
|
||||||
|
- KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
|
||||||
|
- ALLOW_PLAINTEXT_LISTENER=yes
|
||||||
|
|
||||||
|
adhoc-1:
|
||||||
|
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.3.1:latest
|
||||||
|
hostname: adhoc-1
|
||||||
|
container_name: adhoc-1
|
||||||
|
env_file:
|
||||||
|
- ./hadoop.env
|
||||||
|
depends_on:
|
||||||
|
- sparkmaster
|
||||||
|
ports:
|
||||||
|
- '4040:4040'
|
||||||
|
environment:
|
||||||
|
- "SPARK_MASTER=spark://sparkmaster:7077"
|
||||||
|
links:
|
||||||
|
- "hivemetastore"
|
||||||
|
- "hiveserver"
|
||||||
|
- "hive-metastore-postgresql"
|
||||||
|
- "namenode"
|
||||||
|
volumes:
|
||||||
|
- ${HUDI_WS}:/var/hoodie/ws
|
||||||
|
|
||||||
|
adhoc-2:
|
||||||
|
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.3.1:latest
|
||||||
|
hostname: adhoc-2
|
||||||
|
container_name: adhoc-2
|
||||||
|
env_file:
|
||||||
|
- ./hadoop.env
|
||||||
|
depends_on:
|
||||||
|
- sparkmaster
|
||||||
|
environment:
|
||||||
|
- "SPARK_MASTER=spark://sparkmaster:7077"
|
||||||
|
links:
|
||||||
|
- "hivemetastore"
|
||||||
|
- "hiveserver"
|
||||||
|
- "hive-metastore-postgresql"
|
||||||
|
- "namenode"
|
||||||
|
volumes:
|
||||||
|
- ${HUDI_WS}:/var/hoodie/ws
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
namenode:
|
||||||
|
historyserver:
|
||||||
|
hive-metastore-postgresql:
|
||||||
|
|
||||||
|
networks:
|
||||||
|
default:
|
||||||
33
docker/compose/hadoop.env
Normal file
33
docker/compose/hadoop.env
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
|
||||||
|
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
|
||||||
|
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
|
||||||
|
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
|
||||||
|
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
|
||||||
|
HIVE_SITE_CONF_hive_metastore_uris=thrift://hivemetastore:9083
|
||||||
|
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
|
||||||
|
|
||||||
|
HDFS_CONF_dfs_webhdfs_enabled=true
|
||||||
|
HDFS_CONF_dfs_permissions_enabled=false
|
||||||
|
#HDFS_CONF_dfs_client_use_datanode_hostname=true
|
||||||
|
#HDFS_CONF_dfs_namenode_use_datanode_hostname=true
|
||||||
|
|
||||||
|
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
|
||||||
|
CORE_CONF_hadoop_http_staticuser_user=root
|
||||||
|
CORE_CONF_hadoop_proxyuser_hue_hosts=*
|
||||||
|
CORE_CONF_hadoop_proxyuser_hue_groups=*
|
||||||
|
|
||||||
|
YARN_CONF_yarn_log___aggregation___enable=true
|
||||||
|
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
|
||||||
|
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
|
||||||
|
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
|
||||||
|
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
|
||||||
|
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
|
||||||
|
YARN_CONF_yarn_timeline___service_enabled=true
|
||||||
|
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
|
||||||
|
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
|
||||||
|
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
|
||||||
|
YARN_CONF_yarn_timeline___service_hostname=historyserver
|
||||||
|
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
|
||||||
|
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
|
||||||
|
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
|
||||||
|
YARN_CONF_yarn_nodemanager_vmem___check___enabled=false
|
||||||
21
docker/demo/config/base.properties
Normal file
21
docker/demo/config/base.properties
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Common hoodie client configs
|
||||||
|
hoodie.upsert.shuffle.parallelism=2
|
||||||
|
hoodie.insert.shuffle.parallelism=2
|
||||||
|
hoodie.bulkinsert.shuffle.parallelism=2
|
||||||
29
docker/demo/config/kafka-source.properties
Normal file
29
docker/demo/config/kafka-source.properties
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
include=base.properties
|
||||||
|
# Key fields, for kafka example
|
||||||
|
hoodie.datasource.write.recordkey.field=key
|
||||||
|
hoodie.datasource.write.partitionpath.field=date
|
||||||
|
# Schema provider props (change to absolute path based on your installation)
|
||||||
|
hoodie.deltastreamer.schemaprovider.source.schema.file=/var/demo/config/schema.avsc
|
||||||
|
hoodie.deltastreamer.schemaprovider.target.schema.file=/var/demo/config/schema.avsc
|
||||||
|
# Kafka Source
|
||||||
|
hoodie.deltastreamer.source.kafka.topic=stock_ticks
|
||||||
|
#Kafka props
|
||||||
|
metadata.broker.list=kafkabroker:9092
|
||||||
|
auto.offset.reset=smallest
|
||||||
41
docker/demo/config/schema.avsc
Normal file
41
docker/demo/config/schema.avsc
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
{
|
||||||
|
"type":"record",
|
||||||
|
"name":"stock_ticks",
|
||||||
|
"fields":[{
|
||||||
|
"name": "volume",
|
||||||
|
"type": "long"
|
||||||
|
}, {
|
||||||
|
"name": "ts",
|
||||||
|
"type": "string"
|
||||||
|
}, {
|
||||||
|
"name": "symbol",
|
||||||
|
"type": "string"
|
||||||
|
},{
|
||||||
|
"name": "year",
|
||||||
|
"type": "int"
|
||||||
|
},{
|
||||||
|
"name": "month",
|
||||||
|
"type": "string"
|
||||||
|
},{
|
||||||
|
"name": "high",
|
||||||
|
"type": "double"
|
||||||
|
},{
|
||||||
|
"name": "low",
|
||||||
|
"type": "double"
|
||||||
|
},{
|
||||||
|
"name": "key",
|
||||||
|
"type": "string"
|
||||||
|
},{
|
||||||
|
"name": "date",
|
||||||
|
"type":"string"
|
||||||
|
}, {
|
||||||
|
"name": "close",
|
||||||
|
"type": "double"
|
||||||
|
}, {
|
||||||
|
"name": "open",
|
||||||
|
"type": "double"
|
||||||
|
}, {
|
||||||
|
"name": "day",
|
||||||
|
"type":"string"
|
||||||
|
}
|
||||||
|
]}
|
||||||
26
docker/demo/config/spark-defaults.conf
Normal file
26
docker/demo/config/spark-defaults.conf
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
# contributor license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright ownership.
|
||||||
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
# (the "License"); you may not use this file except in compliance with
|
||||||
|
# the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
# Default system properties included when running spark-submit.
|
||||||
|
# This is useful for setting default environmental settings.
|
||||||
|
|
||||||
|
# Example:
|
||||||
|
spark.master local[3]
|
||||||
|
spark.eventLog.dir hdfs://namenode:8020/tmp/spark-events
|
||||||
|
spark.serializer org.apache.spark.serializer.KryoSerializer
|
||||||
|
#spark.executor.memory 4g
|
||||||
|
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
|
||||||
3482
docker/demo/data/batch_1.json
Normal file
3482
docker/demo/data/batch_1.json
Normal file
File diff suppressed because it is too large
Load Diff
1668
docker/demo/data/batch_2.json
Normal file
1668
docker/demo/data/batch_2.json
Normal file
File diff suppressed because it is too large
Load Diff
6
docker/demo/setup_demo_container.sh
Executable file
6
docker/demo/setup_demo_container.sh
Executable file
@@ -0,0 +1,6 @@
|
|||||||
|
echo "Copying spark default config and setting up configs"
|
||||||
|
cp /var/hoodie/ws/docker/demo/config/spark-defaults.conf $SPARK_CONF_DIR/.
|
||||||
|
hadoop fs -mkdir -p /var/demo/
|
||||||
|
hadoop fs -mkdir -p /tmp/spark-events
|
||||||
|
hadoop fs -copyFromLocal -f /var/hoodie/ws/docker/demo/config /var/demo/.
|
||||||
|
chmod +x /var/hoodie/ws/hoodie-hive/run_sync_tool.sh
|
||||||
45
docker/hoodie/hadoop/base/Dockerfile
Normal file
45
docker/hoodie/hadoop/base/Dockerfile
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
FROM frolvlad/alpine-oraclejdk8
|
||||||
|
MAINTAINER Hoodie
|
||||||
|
USER root
|
||||||
|
|
||||||
|
# Default to UTF-8 file.encoding
|
||||||
|
ENV LANG C.UTF-8
|
||||||
|
|
||||||
|
# Updating & Installing packages
|
||||||
|
RUN apk add net-tools curl bash perl procps
|
||||||
|
|
||||||
|
ARG HADOOP_VERSION=2.8.4
|
||||||
|
ARG HADOOP_URL=https://www.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz
|
||||||
|
ENV HADOOP_VERSION ${HADOOP_VERSION}
|
||||||
|
ENV HADOOP_URL ${HADOOP_URL}
|
||||||
|
|
||||||
|
RUN set -x \
|
||||||
|
&& echo "Fetch URL2 is : ${HADOOP_URL}" \
|
||||||
|
&& curl -fSL "${HADOOP_URL}" -o /tmp/hadoop.tar.gz \
|
||||||
|
&& curl -fSL "${HADOOP_URL}.asc" -o /tmp/hadoop.tar.gz.asc \
|
||||||
|
&& mkdir -p /opt/hadoop-$HADOOP_VERSION/logs \
|
||||||
|
&& tar -xvf /tmp/hadoop.tar.gz -C /opt/ \
|
||||||
|
&& rm /tmp/hadoop.tar.gz* \
|
||||||
|
&& ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop \
|
||||||
|
&& cp /etc/hadoop/mapred-site.xml.template /etc/hadoop/mapred-site.xml \
|
||||||
|
&& mkdir /hadoop-data
|
||||||
|
|
||||||
|
ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION
|
||||||
|
ENV HADOOP_CONF_DIR=/etc/hadoop
|
||||||
|
ENV MULTIHOMED_NETWORK=1
|
||||||
|
ENV HADOOP_HOME=${HADOOP_PREFIX}
|
||||||
|
ENV HADOOP_INSTALL=${HADOOP_HOME}
|
||||||
|
ENV USER=root
|
||||||
|
ENV PATH /usr/bin:/bin:$HADOOP_PREFIX/bin/:$PATH
|
||||||
|
|
||||||
|
# Exposing a union of ports across hadoop versions
|
||||||
|
# Well known ports including ssh
|
||||||
|
EXPOSE 0-1024 4040 7000-10100 5000-5100 50000-50200 58188 58088 58042
|
||||||
|
|
||||||
|
ADD entrypoint.sh /entrypoint.sh
|
||||||
|
ADD export_container_ip.sh /usr/bin/
|
||||||
|
RUN chmod a+x /usr/bin/export_container_ip.sh \
|
||||||
|
&& chmod a+x /entrypoint.sh
|
||||||
|
|
||||||
|
ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]
|
||||||
|
|
||||||
91
docker/hoodie/hadoop/base/entrypoint.sh
Normal file
91
docker/hoodie/hadoop/base/entrypoint.sh
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
|
||||||
|
#######################################################################################
|
||||||
|
## COPIED FROM ##
|
||||||
|
## https://github.com/big-data-europe/docker-hadoop/blob/master/base/entrypoint.sh ##
|
||||||
|
# ##
|
||||||
|
#######################################################################################
|
||||||
|
|
||||||
|
# Set some sensible defaults
|
||||||
|
export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}
|
||||||
|
|
||||||
|
function addProperty() {
|
||||||
|
local path=$1
|
||||||
|
local name=$2
|
||||||
|
local value=$3
|
||||||
|
|
||||||
|
local entry="<property><name>$name</name><value>${value}</value></property>"
|
||||||
|
local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
|
||||||
|
sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
|
||||||
|
}
|
||||||
|
|
||||||
|
function configure() {
|
||||||
|
local path=$1
|
||||||
|
local module=$2
|
||||||
|
local envPrefix=$3
|
||||||
|
|
||||||
|
local var
|
||||||
|
local value
|
||||||
|
|
||||||
|
echo "Configuring $module"
|
||||||
|
for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
|
||||||
|
name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
|
||||||
|
var="${envPrefix}_${c}"
|
||||||
|
value=${!var}
|
||||||
|
echo " - Setting $name=$value"
|
||||||
|
addProperty /etc/hadoop/$module-site.xml $name "$value"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
configure /etc/hadoop/core-site.xml core CORE_CONF
|
||||||
|
configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
|
||||||
|
configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
|
||||||
|
configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF
|
||||||
|
configure /etc/hadoop/kms-site.xml kms KMS_CONF
|
||||||
|
|
||||||
|
if [ "$MULTIHOMED_NETWORK" = "1" ]; then
|
||||||
|
echo "Configuring for multihomed network"
|
||||||
|
|
||||||
|
# HDFS
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
|
||||||
|
|
||||||
|
# YARN
|
||||||
|
addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
|
||||||
|
|
||||||
|
# MAPRED
|
||||||
|
addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -n "$GANGLIA_HOST" ]; then
|
||||||
|
mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
|
||||||
|
mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig
|
||||||
|
|
||||||
|
for module in mapred jvm rpc ugi; do
|
||||||
|
echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31"
|
||||||
|
echo "$module.period=10"
|
||||||
|
echo "$module.servers=$GANGLIA_HOST:8649"
|
||||||
|
done > /etc/hadoop/hadoop-metrics.properties
|
||||||
|
|
||||||
|
for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
|
||||||
|
echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31"
|
||||||
|
echo "$module.sink.ganglia.period=10"
|
||||||
|
echo "$module.sink.ganglia.supportsparse=true"
|
||||||
|
echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both"
|
||||||
|
echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40"
|
||||||
|
echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
|
||||||
|
done > /etc/hadoop/hadoop-metrics2.properties
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Save Container IP in ENV variable
|
||||||
|
/usr/bin/export_container_ip.sh
|
||||||
|
|
||||||
|
exec "$@"
|
||||||
13
docker/hoodie/hadoop/base/export_container_ip.sh
Executable file
13
docker/hoodie/hadoop/base/export_container_ip.sh
Executable file
@@ -0,0 +1,13 @@
|
|||||||
|
interfaces=( "en0" "eth0" )
|
||||||
|
|
||||||
|
ipAddr=""
|
||||||
|
for interface in "${interfaces[@]}"
|
||||||
|
do
|
||||||
|
ipAddr=`ifconfig $interface | grep -Eo 'inet (addr:)?([0-9]+\.){3}[0-9]+' | grep -Eo '([0-9]+\.){3}[0-9]+' | grep -v '127.0.0.1' | head`
|
||||||
|
if [ -n "$ipAddr" ]; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Container IP is set to : $ipAddr"
|
||||||
|
export MY_CONTAINER_IP=$ipAddr
|
||||||
90
docker/hoodie/hadoop/base/pom.xml
Normal file
90
docker/hoodie/hadoop/base/pom.xml
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<artifactId>hoodie-hadoop-base-docker</artifactId>
|
||||||
|
|
||||||
|
<description>Base Docker Image with Hoodie</description>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<type>pom</type>
|
||||||
|
<scope>import</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<finalName>hoodie</finalName>
|
||||||
|
<plugins>
|
||||||
|
<!-- Build Docker image -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-plugin</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>tag-latest</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-base</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>latest</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>tag-version</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-base</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>${project.version}</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
14
docker/hoodie/hadoop/datanode/Dockerfile
Normal file
14
docker/hoodie/hadoop/datanode/Dockerfile
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
ARG HADOOP_VERSION=2.8.4
|
||||||
|
ARG HADOOP_DN_PORT=50075
|
||||||
|
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
|
||||||
|
|
||||||
|
ENV HADOOP_DN_PORT ${HADOOP_DN_PORT}
|
||||||
|
|
||||||
|
ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
|
||||||
|
RUN mkdir -p /hadoop/dfs/data
|
||||||
|
VOLUME /hadoop/dfs/data
|
||||||
|
|
||||||
|
ADD run_dn.sh /run_dn.sh
|
||||||
|
RUN chmod a+x /run_dn.sh
|
||||||
|
|
||||||
|
CMD ["/run_dn.sh"]
|
||||||
89
docker/hoodie/hadoop/datanode/pom.xml
Normal file
89
docker/hoodie/hadoop/datanode/pom.xml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<artifactId>hoodie-hadoop-datanode-docker</artifactId>
|
||||||
|
|
||||||
|
<description>Base Docker Image with Hoodie</description>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-hadoop-base-docker</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<type>pom</type>
|
||||||
|
<scope>import</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<!-- Build Docker image -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-plugin</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>tag-latest</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-datanode</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>latest</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>tag-version</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-datanode</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>${project.version}</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
9
docker/hoodie/hadoop/datanode/run_dn.sh
Normal file
9
docker/hoodie/hadoop/datanode/run_dn.sh
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
datadir=`echo $HDFS_CONF_dfs_datanode_data_dir | perl -pe 's#file://##'`
|
||||||
|
if [ ! -d $datadir ]; then
|
||||||
|
echo "Datanode data directory not found: $datadir"
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR datanode
|
||||||
14
docker/hoodie/hadoop/historyserver/Dockerfile
Normal file
14
docker/hoodie/hadoop/historyserver/Dockerfile
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
ARG HADOOP_VERSION=2.8.4
|
||||||
|
ARG HADOOP_HISTORY_PORT=8188
|
||||||
|
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
|
||||||
|
|
||||||
|
ENV HADOOP_HISTORY_PORT ${HADOOP_HISTORY_PORT}
|
||||||
|
|
||||||
|
ENV YARN_CONF_yarn_timeline___service_leveldb___timeline___store_path=/hadoop/yarn/timeline
|
||||||
|
RUN mkdir -p /hadoop/yarn/timeline
|
||||||
|
VOLUME /hadoop/yarn/timeline
|
||||||
|
|
||||||
|
ADD run_history.sh /run_history.sh
|
||||||
|
RUN chmod a+x /run_history.sh
|
||||||
|
|
||||||
|
CMD ["/run_history.sh"]
|
||||||
89
docker/hoodie/hadoop/historyserver/pom.xml
Normal file
89
docker/hoodie/hadoop/historyserver/pom.xml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<artifactId>hoodie-hadoop-history-docker</artifactId>
|
||||||
|
|
||||||
|
<description>Base Docker Image with Hoodie</description>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-hadoop-base-docker</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<type>pom</type>
|
||||||
|
<scope>import</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<!-- Build Docker image -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-plugin</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>tag-latest</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-history</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>latest</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>tag-version</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-history</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>${project.version}</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
3
docker/hoodie/hadoop/historyserver/run_history.sh
Normal file
3
docker/hoodie/hadoop/historyserver/run_history.sh
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
$HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR historyserver
|
||||||
51
docker/hoodie/hadoop/hive_base/Dockerfile
Normal file
51
docker/hoodie/hadoop/hive_base/Dockerfile
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
ARG HADOOP_VERSION=2.8.4
|
||||||
|
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
|
||||||
|
|
||||||
|
ENV HIVE_HOME /opt/hive
|
||||||
|
ENV PATH $HIVE_HOME/bin:$PATH
|
||||||
|
ENV HADOOP_HOME /opt/hadoop-$HADOOP_VERSION
|
||||||
|
|
||||||
|
WORKDIR /opt
|
||||||
|
|
||||||
|
ARG HIVE_VERSION=2.3.3
|
||||||
|
ARG HIVE_URL=https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz
|
||||||
|
ENV HIVE_VERSION ${HIVE_VERSION}
|
||||||
|
ENV HIVE_URL ${HIVE_URL}
|
||||||
|
|
||||||
|
#Install Hive MySQL, PostgreSQL JDBC
|
||||||
|
RUN echo "Hive URL is :${HIVE_URL}" && wget ${HIVE_URL} -O hive.tar.gz && \
|
||||||
|
tar -xzvf hive.tar.gz && mv *hive*-bin hive && \
|
||||||
|
ln -s /usr/share/java/mysql-connector-java.jar $HIVE_HOME/lib/mysql-connector-java.jar && \
|
||||||
|
wget https://jdbc.postgresql.org/download/postgresql-9.4.1212.jar -O $HIVE_HOME/lib/postgresql-jdbc.jar && \
|
||||||
|
rm hive.tar.gz && mkdir -p /var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/
|
||||||
|
|
||||||
|
#Spark should be compiled with Hive to be able to use it
|
||||||
|
#hive-site.xml should be copied to $SPARK_HOME/conf folder
|
||||||
|
|
||||||
|
#Custom configuration goes here
|
||||||
|
ADD conf/hive-site.xml $HADOOP_CONF_DIR
|
||||||
|
ADD conf/beeline-log4j2.properties $HIVE_HOME/conf
|
||||||
|
ADD conf/hive-env.sh $HIVE_HOME/conf
|
||||||
|
ADD conf/hive-exec-log4j2.properties $HIVE_HOME/conf
|
||||||
|
ADD conf/hive-log4j2.properties $HIVE_HOME/conf
|
||||||
|
ADD conf/ivysettings.xml $HIVE_HOME/conf
|
||||||
|
ADD conf/llap-daemon-log4j2.properties $HIVE_HOME/conf
|
||||||
|
|
||||||
|
# Setup Hoodie Library jars
|
||||||
|
ADD target/ /var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/
|
||||||
|
|
||||||
|
ENV HUDI_HADOOP_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-hadoop-mr-bundle.jar
|
||||||
|
ENV HUDI_HIVE_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-hive-bundle.jar
|
||||||
|
ENV HUDI_SPARK_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-spark-bundle.jar
|
||||||
|
ENV HUDI_UTILITIES_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-utilities.jar
|
||||||
|
|
||||||
|
COPY startup.sh /usr/local/bin/
|
||||||
|
RUN chmod +x /usr/local/bin/startup.sh
|
||||||
|
|
||||||
|
COPY entrypoint.sh /usr/local/bin/
|
||||||
|
RUN chmod +x /usr/local/bin/entrypoint.sh
|
||||||
|
|
||||||
|
ENV PATH $HIVE_HOME/bin/:$PATH
|
||||||
|
|
||||||
|
ENTRYPOINT ["entrypoint.sh"]
|
||||||
|
CMD startup.sh
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
status = INFO
|
||||||
|
name = BeelineLog4j2
|
||||||
|
packages = org.apache.hadoop.hive.ql.log
|
||||||
|
|
||||||
|
# list of properties
|
||||||
|
property.hive.log.level = WARN
|
||||||
|
property.hive.root.logger = console
|
||||||
|
|
||||||
|
# list of all appenders
|
||||||
|
appenders = console
|
||||||
|
|
||||||
|
# console appender
|
||||||
|
appender.console.type = Console
|
||||||
|
appender.console.name = console
|
||||||
|
appender.console.target = SYSTEM_ERR
|
||||||
|
appender.console.layout.type = PatternLayout
|
||||||
|
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
|
||||||
|
|
||||||
|
# list of all loggers
|
||||||
|
loggers = HiveConnection
|
||||||
|
|
||||||
|
# HiveConnection logs useful info for dynamic service discovery
|
||||||
|
logger.HiveConnection.name = org.apache.hive.jdbc.HiveConnection
|
||||||
|
logger.HiveConnection.level = INFO
|
||||||
|
|
||||||
|
# root logger
|
||||||
|
rootLogger.level = ${sys:hive.log.level}
|
||||||
|
rootLogger.appenderRefs = root
|
||||||
|
rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
|
||||||
54
docker/hoodie/hadoop/hive_base/conf/hive-env.sh
Normal file
54
docker/hoodie/hadoop/hive_base/conf/hive-env.sh
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# Set Hive and Hadoop environment variables here. These variables can be used
|
||||||
|
# to control the execution of Hive. It should be used by admins to configure
|
||||||
|
# the Hive installation (so that users do not have to set environment variables
|
||||||
|
# or set command line parameters to get correct behavior).
|
||||||
|
#
|
||||||
|
# The hive service being invoked (CLI/HWI etc.) is available via the environment
|
||||||
|
# variable SERVICE
|
||||||
|
|
||||||
|
|
||||||
|
# Hive Client memory usage can be an issue if a large number of clients
|
||||||
|
# are running at the same time. The flags below have been useful in
|
||||||
|
# reducing memory usage:
|
||||||
|
#
|
||||||
|
# if [ "$SERVICE" = "cli" ]; then
|
||||||
|
# if [ -z "$DEBUG" ]; then
|
||||||
|
# export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:+UseParNewGC -XX:-UseGCOverheadLimit"
|
||||||
|
# else
|
||||||
|
# export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:-UseGCOverheadLimit"
|
||||||
|
# fi
|
||||||
|
# fi
|
||||||
|
|
||||||
|
# The heap size of the jvm stared by hive shell script can be controlled via:
|
||||||
|
#
|
||||||
|
# export HADOOP_HEAPSIZE=1024
|
||||||
|
#
|
||||||
|
# Larger heap size may be required when running queries over large number of files or partitions.
|
||||||
|
# By default hive shell scripts use a heap size of 256 (MB). Larger heap size would also be
|
||||||
|
# appropriate for hive server (hwi etc).
|
||||||
|
|
||||||
|
|
||||||
|
# Set HADOOP_HOME to point to a specific hadoop install directory
|
||||||
|
# HADOOP_HOME=${bin}/../../hadoop
|
||||||
|
|
||||||
|
# Hive Configuration Directory can be controlled by:
|
||||||
|
# export HIVE_CONF_DIR=
|
||||||
|
|
||||||
|
# Folder containing extra ibraries required for hive compilation/execution can be controlled by:
|
||||||
|
# export HIVE_AUX_JARS_PATH=
|
||||||
@@ -0,0 +1,66 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
status = INFO
|
||||||
|
name = HiveExecLog4j2
|
||||||
|
packages = org.apache.hadoop.hive.ql.log
|
||||||
|
|
||||||
|
# list of properties
|
||||||
|
property.hive.log.level = INFO
|
||||||
|
property.hive.root.logger = FA
|
||||||
|
property.hive.query.id = hadoop
|
||||||
|
property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
|
||||||
|
property.hive.log.file = ${sys:hive.query.id}.log
|
||||||
|
|
||||||
|
# list of all appenders
|
||||||
|
appenders = console, FA
|
||||||
|
|
||||||
|
# console appender
|
||||||
|
appender.console.type = Console
|
||||||
|
appender.console.name = console
|
||||||
|
appender.console.target = SYSTEM_ERR
|
||||||
|
appender.console.layout.type = PatternLayout
|
||||||
|
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
|
||||||
|
|
||||||
|
# simple file appender
|
||||||
|
appender.FA.type = File
|
||||||
|
appender.FA.name = FA
|
||||||
|
appender.FA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file}
|
||||||
|
appender.FA.layout.type = PatternLayout
|
||||||
|
appender.FA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n
|
||||||
|
|
||||||
|
# list of all loggers
|
||||||
|
loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX
|
||||||
|
|
||||||
|
logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
|
||||||
|
logger.NIOServerCnxn.level = WARN
|
||||||
|
|
||||||
|
logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
|
||||||
|
logger.ClientCnxnSocketNIO.level = WARN
|
||||||
|
|
||||||
|
logger.DataNucleus.name = DataNucleus
|
||||||
|
logger.DataNucleus.level = ERROR
|
||||||
|
|
||||||
|
logger.Datastore.name = Datastore
|
||||||
|
logger.Datastore.level = ERROR
|
||||||
|
|
||||||
|
logger.JPOX.name = JPOX
|
||||||
|
logger.JPOX.level = ERROR
|
||||||
|
|
||||||
|
# root logger
|
||||||
|
rootLogger.level = ${sys:hive.log.level}
|
||||||
|
rootLogger.appenderRefs = root
|
||||||
|
rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
|
||||||
73
docker/hoodie/hadoop/hive_base/conf/hive-log4j2.properties
Normal file
73
docker/hoodie/hadoop/hive_base/conf/hive-log4j2.properties
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
status = INFO
|
||||||
|
name = HiveLog4j2
|
||||||
|
packages = org.apache.hadoop.hive.ql.log
|
||||||
|
|
||||||
|
# list of properties
|
||||||
|
property.hive.log.level = INFO
|
||||||
|
property.hive.root.logger = DRFA
|
||||||
|
property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
|
||||||
|
property.hive.log.file = hive.log
|
||||||
|
|
||||||
|
# list of all appenders
|
||||||
|
appenders = console, DRFA
|
||||||
|
|
||||||
|
# console appender
|
||||||
|
appender.console.type = Console
|
||||||
|
appender.console.name = console
|
||||||
|
appender.console.target = SYSTEM_ERR
|
||||||
|
appender.console.layout.type = PatternLayout
|
||||||
|
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
|
||||||
|
|
||||||
|
# daily rolling file appender
|
||||||
|
appender.DRFA.type = RollingFile
|
||||||
|
appender.DRFA.name = DRFA
|
||||||
|
appender.DRFA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file}
|
||||||
|
# Use %pid in the filePattern to append <process-id>@<host-name> to the filename if you want separate log files for different CLI session
|
||||||
|
appender.DRFA.filePattern = ${sys:hive.log.dir}/${sys:hive.log.file}.%d{yyyy-MM-dd}
|
||||||
|
appender.DRFA.layout.type = PatternLayout
|
||||||
|
appender.DRFA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n
|
||||||
|
appender.DRFA.policies.type = Policies
|
||||||
|
appender.DRFA.policies.time.type = TimeBasedTriggeringPolicy
|
||||||
|
appender.DRFA.policies.time.interval = 1
|
||||||
|
appender.DRFA.policies.time.modulate = true
|
||||||
|
appender.DRFA.strategy.type = DefaultRolloverStrategy
|
||||||
|
appender.DRFA.strategy.max = 30
|
||||||
|
|
||||||
|
# list of all loggers
|
||||||
|
loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX
|
||||||
|
|
||||||
|
logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
|
||||||
|
logger.NIOServerCnxn.level = WARN
|
||||||
|
|
||||||
|
logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
|
||||||
|
logger.ClientCnxnSocketNIO.level = WARN
|
||||||
|
|
||||||
|
logger.DataNucleus.name = DataNucleus
|
||||||
|
logger.DataNucleus.level = ERROR
|
||||||
|
|
||||||
|
logger.Datastore.name = Datastore
|
||||||
|
logger.Datastore.level = ERROR
|
||||||
|
|
||||||
|
logger.JPOX.name = JPOX
|
||||||
|
logger.JPOX.level = ERROR
|
||||||
|
|
||||||
|
# root logger
|
||||||
|
rootLogger.level = ${sys:hive.log.level}
|
||||||
|
rootLogger.appenderRefs = root
|
||||||
|
rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
|
||||||
18
docker/hoodie/hadoop/hive_base/conf/hive-site.xml
Normal file
18
docker/hoodie/hadoop/hive_base/conf/hive-site.xml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||||
|
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
--><configuration>
|
||||||
|
</configuration>
|
||||||
45
docker/hoodie/hadoop/hive_base/conf/ivysettings.xml
Normal file
45
docker/hoodie/hadoop/hive_base/conf/ivysettings.xml
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!--This file is used by grapes to download dependencies from a maven repository.
|
||||||
|
This is just a template and can be edited to add more repositories.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<ivysettings>
|
||||||
|
<!--name of the defaultResolver should always be 'downloadGrapes'. -->
|
||||||
|
<settings defaultResolver="downloadGrapes"/>
|
||||||
|
<!-- Only set maven.local.repository if not already set -->
|
||||||
|
<property name="maven.local.repository" value="${user.home}/.m2/repository" override="false" />
|
||||||
|
<property name="m2-pattern"
|
||||||
|
value="file:${maven.local.repository}/[organisation]/[module]/[revision]/[module]-[revision](-[classifier]).[ext]"
|
||||||
|
override="false"/>
|
||||||
|
<resolvers>
|
||||||
|
<!-- more resolvers can be added here -->
|
||||||
|
<chain name="downloadGrapes">
|
||||||
|
<!-- This resolver uses ibiblio to find artifacts, compatible with maven2 repository -->
|
||||||
|
<ibiblio name="central" m2compatible="true"/>
|
||||||
|
<url name="local-maven2" m2compatible="true">
|
||||||
|
<artifact pattern="${m2-pattern}"/>
|
||||||
|
</url>
|
||||||
|
<!-- File resolver to add jars from the local system. -->
|
||||||
|
<filesystem name="test" checkmodified="true">
|
||||||
|
<artifact pattern="/tmp/[module]-[revision](-[classifier]).jar"/>
|
||||||
|
</filesystem>
|
||||||
|
|
||||||
|
</chain>
|
||||||
|
</resolvers>
|
||||||
|
</ivysettings>
|
||||||
@@ -0,0 +1,93 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
status = INFO
|
||||||
|
name = LlapDaemonLog4j2
|
||||||
|
packages = org.apache.hadoop.hive.ql.log
|
||||||
|
|
||||||
|
# list of properties
|
||||||
|
property.llap.daemon.log.level = INFO
|
||||||
|
property.llap.daemon.root.logger = console
|
||||||
|
property.llap.daemon.log.dir = .
|
||||||
|
property.llap.daemon.log.file = llapdaemon.log
|
||||||
|
property.llap.daemon.historylog.file = llapdaemon_history.log
|
||||||
|
property.llap.daemon.log.maxfilesize = 256MB
|
||||||
|
property.llap.daemon.log.maxbackupindex = 20
|
||||||
|
|
||||||
|
# list of all appenders
|
||||||
|
appenders = console, RFA, HISTORYAPPENDER
|
||||||
|
|
||||||
|
# console appender
|
||||||
|
appender.console.type = Console
|
||||||
|
appender.console.name = console
|
||||||
|
appender.console.target = SYSTEM_ERR
|
||||||
|
appender.console.layout.type = PatternLayout
|
||||||
|
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t%x] %p %c{2} : %m%n
|
||||||
|
|
||||||
|
# rolling file appender
|
||||||
|
appender.RFA.type = RollingFile
|
||||||
|
appender.RFA.name = RFA
|
||||||
|
appender.RFA.fileName = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.log.file}
|
||||||
|
appender.RFA.filePattern = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.log.file}_%i
|
||||||
|
appender.RFA.layout.type = PatternLayout
|
||||||
|
appender.RFA.layout.pattern = %d{ISO8601} %-5p [%t%x]: %c{2} (%F:%M(%L)) - %m%n
|
||||||
|
appender.RFA.policies.type = Policies
|
||||||
|
appender.RFA.policies.size.type = SizeBasedTriggeringPolicy
|
||||||
|
appender.RFA.policies.size.size = ${sys:llap.daemon.log.maxfilesize}
|
||||||
|
appender.RFA.strategy.type = DefaultRolloverStrategy
|
||||||
|
appender.RFA.strategy.max = ${sys:llap.daemon.log.maxbackupindex}
|
||||||
|
|
||||||
|
# history file appender
|
||||||
|
appender.HISTORYAPPENDER.type = RollingFile
|
||||||
|
appender.HISTORYAPPENDER.name = HISTORYAPPENDER
|
||||||
|
appender.HISTORYAPPENDER.fileName = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.historylog.file}
|
||||||
|
appender.HISTORYAPPENDER.filePattern = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.historylog.file}_%i
|
||||||
|
appender.HISTORYAPPENDER.layout.type = PatternLayout
|
||||||
|
appender.HISTORYAPPENDER.layout.pattern = %m%n
|
||||||
|
appender.HISTORYAPPENDER.policies.type = Policies
|
||||||
|
appender.HISTORYAPPENDER.policies.size.type = SizeBasedTriggeringPolicy
|
||||||
|
appender.HISTORYAPPENDER.policies.size.size = ${sys:llap.daemon.log.maxfilesize}
|
||||||
|
appender.HISTORYAPPENDER.strategy.type = DefaultRolloverStrategy
|
||||||
|
appender.HISTORYAPPENDER.strategy.max = ${sys:llap.daemon.log.maxbackupindex}
|
||||||
|
|
||||||
|
# list of all loggers
|
||||||
|
loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX, HistoryLogger
|
||||||
|
|
||||||
|
logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
|
||||||
|
logger.NIOServerCnxn.level = WARN
|
||||||
|
|
||||||
|
logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
|
||||||
|
logger.ClientCnxnSocketNIO.level = WARN
|
||||||
|
|
||||||
|
logger.DataNucleus.name = DataNucleus
|
||||||
|
logger.DataNucleus.level = ERROR
|
||||||
|
|
||||||
|
logger.Datastore.name = Datastore
|
||||||
|
logger.Datastore.level = ERROR
|
||||||
|
|
||||||
|
logger.JPOX.name = JPOX
|
||||||
|
logger.JPOX.level = ERROR
|
||||||
|
|
||||||
|
logger.HistoryLogger.name = org.apache.hadoop.hive.llap.daemon.HistoryLogger
|
||||||
|
logger.HistoryLogger.level = INFO
|
||||||
|
logger.HistoryLogger.additivity = false
|
||||||
|
logger.HistoryLogger.appenderRefs = HistoryAppender
|
||||||
|
logger.HistoryLogger.appenderRef.HistoryAppender.ref = HISTORYAPPENDER
|
||||||
|
|
||||||
|
# root logger
|
||||||
|
rootLogger.level = ${sys:llap.daemon.log.level}
|
||||||
|
rootLogger.appenderRefs = root
|
||||||
|
rootLogger.appenderRef.root.ref = ${sys:llap.daemon.root.logger}
|
||||||
118
docker/hoodie/hadoop/hive_base/entrypoint.sh
Normal file
118
docker/hoodie/hadoop/hive_base/entrypoint.sh
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Set some sensible defaults
|
||||||
|
export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}
|
||||||
|
|
||||||
|
function addProperty() {
|
||||||
|
local path=$1
|
||||||
|
local name=$2
|
||||||
|
local value=$3
|
||||||
|
|
||||||
|
local entry="<property><name>$name</name><value>${value}</value></property>"
|
||||||
|
local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
|
||||||
|
sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
|
||||||
|
}
|
||||||
|
|
||||||
|
function configure() {
|
||||||
|
local path=$1
|
||||||
|
local module=$2
|
||||||
|
local envPrefix=$3
|
||||||
|
|
||||||
|
local var
|
||||||
|
local value
|
||||||
|
|
||||||
|
echo "Configuring $module"
|
||||||
|
for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
|
||||||
|
name=`echo ${c} | perl -pe 's/___/-/g; s/__/_/g; s/_/./g'`
|
||||||
|
var="${envPrefix}_${c}"
|
||||||
|
value=${!var}
|
||||||
|
echo " - Setting $name=$value"
|
||||||
|
addProperty $path $name "$value"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
configure /etc/hadoop/core-site.xml core CORE_CONF
|
||||||
|
configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
|
||||||
|
configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
|
||||||
|
configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF
|
||||||
|
configure /etc/hadoop/kms-site.xml kms KMS_CONF
|
||||||
|
configure /etc/hadoop/mapred-site.xml mapred MAPRED_CONF
|
||||||
|
configure /etc/hadoop/hive-site.xml hive HIVE_SITE_CONF
|
||||||
|
|
||||||
|
if [ "$MULTIHOMED_NETWORK" = "1" ]; then
|
||||||
|
echo "Configuring for multihomed network"
|
||||||
|
|
||||||
|
# HDFS
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
|
||||||
|
addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
|
||||||
|
|
||||||
|
# YARN
|
||||||
|
addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
|
||||||
|
addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
|
||||||
|
|
||||||
|
# MAPRED
|
||||||
|
addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -n "$GANGLIA_HOST" ]; then
|
||||||
|
mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
|
||||||
|
mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig
|
||||||
|
|
||||||
|
for module in mapred jvm rpc ugi; do
|
||||||
|
echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31"
|
||||||
|
echo "$module.period=10"
|
||||||
|
echo "$module.servers=$GANGLIA_HOST:8649"
|
||||||
|
done > /etc/hadoop/hadoop-metrics.properties
|
||||||
|
|
||||||
|
for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
|
||||||
|
echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31"
|
||||||
|
echo "$module.sink.ganglia.period=10"
|
||||||
|
echo "$module.sink.ganglia.supportsparse=true"
|
||||||
|
echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both"
|
||||||
|
echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40"
|
||||||
|
echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
|
||||||
|
done > /etc/hadoop/hadoop-metrics2.properties
|
||||||
|
fi
|
||||||
|
|
||||||
|
function wait_for_it()
|
||||||
|
{
|
||||||
|
local serviceport=$1
|
||||||
|
local service=${serviceport%%:*}
|
||||||
|
local port=${serviceport#*:}
|
||||||
|
local retry_seconds=5
|
||||||
|
local max_try=100
|
||||||
|
let i=1
|
||||||
|
|
||||||
|
nc -z $service $port
|
||||||
|
result=$?
|
||||||
|
|
||||||
|
until [ $result -eq 0 ]; do
|
||||||
|
echo "[$i/$max_try] check for ${service}:${port}..."
|
||||||
|
echo "[$i/$max_try] ${service}:${port} is not available yet"
|
||||||
|
if (( $i == $max_try )); then
|
||||||
|
echo "[$i/$max_try] ${service}:${port} is still not available; giving up after ${max_try} tries. :/"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[$i/$max_try] try in ${retry_seconds}s once again ..."
|
||||||
|
let "i++"
|
||||||
|
sleep $retry_seconds
|
||||||
|
|
||||||
|
nc -z $service $port
|
||||||
|
result=$?
|
||||||
|
done
|
||||||
|
echo "[$i/$max_try] $service:${port} is available."
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in ${SERVICE_PRECONDITION[@]}
|
||||||
|
do
|
||||||
|
wait_for_it ${i}
|
||||||
|
done
|
||||||
|
|
||||||
|
exec $@
|
||||||
113
docker/hoodie/hadoop/hive_base/pom.xml
Normal file
113
docker/hoodie/hadoop/hive_base/pom.xml
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<artifactId>hoodie-hadoop-hive-docker</artifactId>
|
||||||
|
|
||||||
|
<description>Base Docker Image with Hoodie</description>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-hadoop-base-docker</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<type>pom</type>
|
||||||
|
<scope>import</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<artifactId>maven-antrun-plugin</artifactId>
|
||||||
|
<version>1.7</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>package</phase>
|
||||||
|
<configuration>
|
||||||
|
<tasks>
|
||||||
|
<copy file="${project.basedir}/../../../../packaging/hoodie-hadoop-mr-bundle/target/hoodie-hadoop-mr-bundle-${project.version}.jar"
|
||||||
|
tofile="target/hoodie-hadoop-mr-bundle.jar" />
|
||||||
|
<copy file="${project.basedir}/../../../../packaging/hoodie-hive-bundle/target/hoodie-hive-bundle-${project.version}.jar"
|
||||||
|
tofile="target/hoodie-hive-bundle.jar" />
|
||||||
|
<copy file="${project.basedir}/../../../../packaging/hoodie-spark-bundle/target/hoodie-spark-bundle-${project.version}.jar"
|
||||||
|
tofile="target/hoodie-spark-bundle.jar" />
|
||||||
|
<copy file="${project.basedir}/../../../../hoodie-utilities/target/hoodie-utilities-${project.version}.jar"
|
||||||
|
tofile="target/hoodie-utilities.jar" />
|
||||||
|
</tasks>
|
||||||
|
</configuration>
|
||||||
|
<goals>
|
||||||
|
<goal>run</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<!-- Build Docker image -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-plugin</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>tag-latest</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>latest</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>tag-version</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>${project.version}</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
10
docker/hoodie/hadoop/hive_base/startup.sh
Normal file
10
docker/hoodie/hadoop/hive_base/startup.sh
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
hadoop fs -mkdir /tmp
|
||||||
|
hadoop fs -mkdir -p /user/hive/warehouse
|
||||||
|
hadoop fs -chmod g+w /tmp
|
||||||
|
hadoop fs -chmod g+w /user/hive/warehouse
|
||||||
|
|
||||||
|
cd $HIVE_HOME/bin
|
||||||
|
export AUX_CLASSPATH=file://${HUDI_HADOOP_BUNDLE}
|
||||||
|
./hiveserver2 --hiveconf hive.server2.enable.doAs=false --hiveconf hive.aux.jars.path=file://${HUDI_HADOOP_BUNDLE}
|
||||||
14
docker/hoodie/hadoop/namenode/Dockerfile
Normal file
14
docker/hoodie/hadoop/namenode/Dockerfile
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
ARG HADOOP_VERSION=2.8.4
|
||||||
|
ARG HADOOP_WEBHDFS_PORT=50070
|
||||||
|
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
|
||||||
|
|
||||||
|
ENV HADOOP_WEBHDFS_PORT ${HADOOP_WEBHDFS_PORT}
|
||||||
|
|
||||||
|
ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
|
||||||
|
RUN mkdir -p /hadoop/dfs/name
|
||||||
|
VOLUME /hadoop/dfs/name
|
||||||
|
|
||||||
|
ADD run_nn.sh /run_nn.sh
|
||||||
|
RUN chmod a+x /run_nn.sh
|
||||||
|
|
||||||
|
CMD ["/run_nn.sh"]
|
||||||
89
docker/hoodie/hadoop/namenode/pom.xml
Normal file
89
docker/hoodie/hadoop/namenode/pom.xml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<artifactId>hoodie-hadoop-namenode-docker</artifactId>
|
||||||
|
|
||||||
|
<description>Base Docker Image with Hoodie</description>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-hadoop-base-docker</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<type>pom</type>
|
||||||
|
<scope>import</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<!-- Build Docker image -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-plugin</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>tag-latest</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-namenode</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>latest</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>tag-version</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-namenode</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>${project.version}</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
19
docker/hoodie/hadoop/namenode/run_nn.sh
Normal file
19
docker/hoodie/hadoop/namenode/run_nn.sh
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
namedir=`echo $HDFS_CONF_dfs_namenode_name_dir | perl -pe 's#file://##'`
|
||||||
|
if [ ! -d $namedir ]; then
|
||||||
|
echo "Namenode name directory not found: $namedir"
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$CLUSTER_NAME" ]; then
|
||||||
|
echo "Cluster name not specified"
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "`ls -A $namedir`" == "" ]; then
|
||||||
|
echo "Formatting namenode name directory: $namedir"
|
||||||
|
$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode -format $CLUSTER_NAME
|
||||||
|
fi
|
||||||
|
|
||||||
|
$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode
|
||||||
78
docker/hoodie/hadoop/pom.xml
Normal file
78
docker/hoodie/hadoop/pom.xml
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
<relativePath>../../../pom.xml</relativePath>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<modules>
|
||||||
|
<module>base</module>
|
||||||
|
<module>namenode</module>
|
||||||
|
<module>datanode</module>
|
||||||
|
<module>historyserver</module>
|
||||||
|
<module>hive_base</module>
|
||||||
|
<module>spark_base</module>
|
||||||
|
<module>sparkmaster</module>
|
||||||
|
<module>sparkworker</module>
|
||||||
|
<module>sparkadhoc</module>
|
||||||
|
</modules>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-spark-bundle</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<skipITs>false</skipITs>
|
||||||
|
<docker.build.skip>true</docker.build.skip>
|
||||||
|
<docker.spark.version>2.3.1</docker.spark.version>
|
||||||
|
<docker.hive.version>2.3.3</docker.hive.version>
|
||||||
|
<docker.hadoop.version>2.8.4</docker.hadoop.version>
|
||||||
|
<dockerfile.maven.version>1.4.3</dockerfile.maven.version>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<extensions>
|
||||||
|
<extension>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-extension</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
</extension>
|
||||||
|
</extensions>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-plugin</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
<configuration>
|
||||||
|
<skip>true</skip>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
46
docker/hoodie/hadoop/spark_base/Dockerfile
Normal file
46
docker/hoodie/hadoop/spark_base/Dockerfile
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
ARG HADOOP_VERSION=2.8.4
|
||||||
|
ARG HIVE_VERSION=2.3.3
|
||||||
|
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}
|
||||||
|
|
||||||
|
ENV ENABLE_INIT_DAEMON true
|
||||||
|
ENV INIT_DAEMON_BASE_URI http://identifier/init-daemon
|
||||||
|
ENV INIT_DAEMON_STEP spark_master_init
|
||||||
|
|
||||||
|
ARG SPARK_VERSION=2.3.1
|
||||||
|
ARG SPARK_HADOOP_VERSION=2.7
|
||||||
|
|
||||||
|
ENV SPARK_VERSION ${SPARK_VERSION}
|
||||||
|
ENV HADOOP_VERSION ${SPARK_HADOOP_VERSION}
|
||||||
|
|
||||||
|
COPY wait-for-step.sh /
|
||||||
|
COPY execute-step.sh /
|
||||||
|
COPY finish-step.sh /
|
||||||
|
|
||||||
|
RUN echo "Installing Spark-version (${SPARK_VERSION})" \
|
||||||
|
&& wget http://apache.mirror.iphh.net/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
|
||||||
|
&& tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
|
||||||
|
&& mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} /opt/spark \
|
||||||
|
&& rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
|
||||||
|
&& cd /
|
||||||
|
|
||||||
|
#Give permission to execute scripts
|
||||||
|
RUN chmod +x /wait-for-step.sh && chmod +x /execute-step.sh && chmod +x /finish-step.sh
|
||||||
|
|
||||||
|
# Fix the value of PYTHONHASHSEED
|
||||||
|
# Note: this is needed when you use Python 3.3 or greater
|
||||||
|
ENV PYTHONHASHSEED 1
|
||||||
|
|
||||||
|
ENV SPARK_HOME /opt/spark
|
||||||
|
ENV SPARK_INSTALL ${SPARK_HOME}
|
||||||
|
ENV SPARK_CONF_DIR ${SPARK_HOME}/conf
|
||||||
|
ENV PATH $SPARK_INSTALL/bin:$PATH
|
||||||
|
|
||||||
|
ENV SPARK_DRIVER_PORT 5001
|
||||||
|
ENV SPARK_UI_PORT 5002
|
||||||
|
ENV SPARK_BLOCKMGR_PORT 5003
|
||||||
|
|
||||||
|
EXPOSE $SPARK_DRIVER_PORT $SPARK_UI_PORT $SPARK_BLOCKMGR_PORT
|
||||||
|
|
||||||
|
# Without this spark-shell fails - Download if it is not already there in $SPARK_INSTALL
|
||||||
|
RUN wget -nc -q -O "${SPARK_INSTALL}/jars/jersey-bundle-1.19.4.jar" "http://repo1.maven.org/maven2/com/sun/jersey/jersey-bundle/1.19.4/jersey-bundle-1.19.4.jar"
|
||||||
|
|
||||||
14
docker/hoodie/hadoop/spark_base/execute-step.sh
Normal file
14
docker/hoodie/hadoop/spark_base/execute-step.sh
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
if [ $ENABLE_INIT_DAEMON = "true" ]
|
||||||
|
then
|
||||||
|
echo "Execute step ${INIT_DAEMON_STEP} in pipeline"
|
||||||
|
while true; do
|
||||||
|
sleep 5
|
||||||
|
echo -n '.'
|
||||||
|
string=$(curl -sL -w "%{http_code}" -X PUT $INIT_DAEMON_BASE_URI/execute?step=$INIT_DAEMON_STEP -o /dev/null)
|
||||||
|
[ "$string" = "204" ] && break
|
||||||
|
done
|
||||||
|
echo "Notified execution of step ${INIT_DAEMON_STEP}"
|
||||||
|
fi
|
||||||
|
|
||||||
16
docker/hoodie/hadoop/spark_base/finish-step.sh
Normal file
16
docker/hoodie/hadoop/spark_base/finish-step.sh
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
if [ $ENABLE_INIT_DAEMON = "true" ]
|
||||||
|
then
|
||||||
|
echo "Finish step ${INIT_DAEMON_STEP} in pipeline"
|
||||||
|
while true; do
|
||||||
|
sleep 5
|
||||||
|
echo -n '.'
|
||||||
|
string=$(curl -sL -w "%{http_code}" -X PUT $INIT_DAEMON_BASE_URI/finish?step=$INIT_DAEMON_STEP -o /dev/null)
|
||||||
|
[ "$string" = "204" ] && break
|
||||||
|
done
|
||||||
|
echo "Notified finish of step ${INIT_DAEMON_STEP}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
89
docker/hoodie/hadoop/spark_base/pom.xml
Normal file
89
docker/hoodie/hadoop/spark_base/pom.xml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
|
||||||
|
|
||||||
|
<description>Base Docker Image with Hoodie</description>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-hadoop-hive-docker</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<type>pom</type>
|
||||||
|
<scope>import</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<!-- Build Docker image -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-plugin</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>tag-latest</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkbase_${docker.spark.version}</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>latest</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>tag-version</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkbase_${docker.spark.version}</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>${project.version}</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
13
docker/hoodie/hadoop/spark_base/wait-for-step.sh
Normal file
13
docker/hoodie/hadoop/spark_base/wait-for-step.sh
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
if [ $ENABLE_INIT_DAEMON = "true" ]
|
||||||
|
then
|
||||||
|
echo "Validating if step ${INIT_DAEMON_STEP} can start in pipeline"
|
||||||
|
while true; do
|
||||||
|
sleep 5
|
||||||
|
echo -n '.'
|
||||||
|
string=$(curl -s $INIT_DAEMON_BASE_URI/canStart?step=$INIT_DAEMON_STEP)
|
||||||
|
[ "$string" = "true" ] && break
|
||||||
|
done
|
||||||
|
echo "Can start step ${INIT_DAEMON_STEP}"
|
||||||
|
fi
|
||||||
12
docker/hoodie/hadoop/sparkadhoc/Dockerfile
Normal file
12
docker/hoodie/hadoop/sparkadhoc/Dockerfile
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
ARG HADOOP_VERSION=2.8.4
|
||||||
|
ARG HIVE_VERSION=2.3.3
|
||||||
|
ARG SPARK_VERSION=2.3.1
|
||||||
|
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}
|
||||||
|
|
||||||
|
COPY adhoc.sh /opt/spark
|
||||||
|
|
||||||
|
ENV SPARK_WORKER_WEBUI_PORT 8081
|
||||||
|
ENV SPARK_WORKER_LOG /spark/logs
|
||||||
|
ENV SPARK_MASTER "spark://spark-master:7077"
|
||||||
|
|
||||||
|
CMD ["/bin/bash", "/opt/spark/adhoc.sh"]
|
||||||
13
docker/hoodie/hadoop/sparkadhoc/adhoc.sh
Normal file
13
docker/hoodie/hadoop/sparkadhoc/adhoc.sh
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
. "/spark/sbin/spark-config.sh"
|
||||||
|
|
||||||
|
. "/spark/bin/load-spark-env.sh"
|
||||||
|
|
||||||
|
|
||||||
|
export SPARK_HOME=/opt/spark
|
||||||
|
|
||||||
|
date
|
||||||
|
echo "SPARK HOME is : $SPARK_HOME"
|
||||||
|
|
||||||
|
tail -f /dev/null
|
||||||
89
docker/hoodie/hadoop/sparkadhoc/pom.xml
Normal file
89
docker/hoodie/hadoop/sparkadhoc/pom.xml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<artifactId>hoodie-hadoop-sparkadhoc-docker</artifactId>
|
||||||
|
|
||||||
|
<description>Base Docker Image with Hoodie</description>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<type>pom</type>
|
||||||
|
<scope>import</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<!-- Build Docker image -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-plugin</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>tag-latest</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkadhoc_${docker.spark.version}</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>latest</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>tag-version</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkadhoc_${docker.spark.version}</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>${project.version}</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
14
docker/hoodie/hadoop/sparkmaster/Dockerfile
Normal file
14
docker/hoodie/hadoop/sparkmaster/Dockerfile
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
ARG HADOOP_VERSION=2.8.4
|
||||||
|
ARG HIVE_VERSION=2.3.3
|
||||||
|
ARG SPARK_VERSION=2.3.1
|
||||||
|
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}
|
||||||
|
|
||||||
|
COPY master.sh /opt/spark
|
||||||
|
|
||||||
|
ENV SPARK_MASTER_PORT 7077
|
||||||
|
ENV SPARK_MASTER_WEBUI_PORT 8080
|
||||||
|
ENV SPARK_MASTER_LOG /opt/spark/logs
|
||||||
|
|
||||||
|
EXPOSE 8080 7077 6066
|
||||||
|
|
||||||
|
CMD ["/bin/bash", "/opt/spark/master.sh"]
|
||||||
16
docker/hoodie/hadoop/sparkmaster/master.sh
Normal file
16
docker/hoodie/hadoop/sparkmaster/master.sh
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
export SPARK_MASTER_HOST=`hostname`
|
||||||
|
|
||||||
|
. "/opt/spark/sbin/spark-config.sh"
|
||||||
|
|
||||||
|
. "/opt/spark/bin/load-spark-env.sh"
|
||||||
|
|
||||||
|
mkdir -p $SPARK_MASTER_LOG
|
||||||
|
|
||||||
|
export SPARK_HOME=/opt/spark
|
||||||
|
|
||||||
|
ln -sf /dev/stdout $SPARK_MASTER_LOG/spark-master.out
|
||||||
|
|
||||||
|
cd /opt/spark/bin && /opt/spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master \
|
||||||
|
--ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out
|
||||||
89
docker/hoodie/hadoop/sparkmaster/pom.xml
Normal file
89
docker/hoodie/hadoop/sparkmaster/pom.xml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<artifactId>hoodie-hadoop-sparkmaster-docker</artifactId>
|
||||||
|
|
||||||
|
<description>Base Docker Image with Hoodie</description>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<type>pom</type>
|
||||||
|
<scope>import</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<!-- Build Docker image -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-plugin</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>tag-latest</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkmaster_${docker.spark.version}</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>latest</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>tag-version</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!-- <goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkmaster_${docker.spark.version}</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>${project.version}</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
14
docker/hoodie/hadoop/sparkworker/Dockerfile
Normal file
14
docker/hoodie/hadoop/sparkworker/Dockerfile
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
ARG HADOOP_VERSION=2.8.4
|
||||||
|
ARG HIVE_VERSION=2.3.3
|
||||||
|
ARG SPARK_VERSION=2.3.1
|
||||||
|
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}
|
||||||
|
|
||||||
|
COPY worker.sh /opt/spark
|
||||||
|
|
||||||
|
ENV SPARK_WORKER_WEBUI_PORT 8081
|
||||||
|
ENV SPARK_WORKER_LOG /spark/logs
|
||||||
|
ENV SPARK_MASTER "spark://spark-master:7077"
|
||||||
|
|
||||||
|
EXPOSE 8081
|
||||||
|
|
||||||
|
CMD ["/bin/bash", "/opt/spark/worker.sh"]
|
||||||
89
docker/hoodie/hadoop/sparkworker/pom.xml
Normal file
89
docker/hoodie/hadoop/sparkworker/pom.xml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie-hadoop-docker</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>pom</packaging>
|
||||||
|
<artifactId>hoodie-hadoop-sparkworker-docker</artifactId>
|
||||||
|
|
||||||
|
<description>Base Docker Image with Hoodie</description>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<type>pom</type>
|
||||||
|
<scope>import</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<!-- Build Docker image -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.spotify</groupId>
|
||||||
|
<artifactId>dockerfile-maven-plugin</artifactId>
|
||||||
|
<version>${dockerfile.maven.version}</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>tag-latest</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!--<goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkworker_${docker.spark.version}</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>latest</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>tag-version</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>build</goal>
|
||||||
|
<goal>tag</goal>
|
||||||
|
<!--<goal>push</goal> -->
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.build.skip}</skip>
|
||||||
|
<pullNewerImage>false</pullNewerImage>
|
||||||
|
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkworker_${docker.spark.version}</repository>
|
||||||
|
<forceTags>true</forceTags>
|
||||||
|
<tag>${project.version}</tag>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
16
docker/hoodie/hadoop/sparkworker/worker.sh
Normal file
16
docker/hoodie/hadoop/sparkworker/worker.sh
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
. "/spark/sbin/spark-config.sh"
|
||||||
|
|
||||||
|
. "/spark/bin/load-spark-env.sh"
|
||||||
|
|
||||||
|
mkdir -p $SPARK_WORKER_LOG
|
||||||
|
|
||||||
|
export SPARK_HOME=/opt/spark
|
||||||
|
|
||||||
|
ln -sf /dev/stdout $SPARK_WORKER_LOG/spark-worker.out
|
||||||
|
|
||||||
|
date
|
||||||
|
echo "SPARK HOME is : $SPARK_HOME"
|
||||||
|
/opt/spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker \
|
||||||
|
--webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out
|
||||||
16
docker/setup_demo.sh
Executable file
16
docker/setup_demo.sh
Executable file
@@ -0,0 +1,16 @@
|
|||||||
|
# Create host mount directory and copy
|
||||||
|
mkdir -p /tmp/hadoop_name
|
||||||
|
mkdir -p /tmp/hadoop_data
|
||||||
|
|
||||||
|
WS_ROOT=`dirname $PWD`
|
||||||
|
# restart cluster
|
||||||
|
HUDI_WS=${WS_ROOT} docker-compose -f compose/docker-compose_hadoop284_hive233_spark231.yml down
|
||||||
|
HUDI_WS=${WS_ROOT} docker-compose -f compose/docker-compose_hadoop284_hive233_spark231.yml pull
|
||||||
|
rm -rf /tmp/hadoop_data/*
|
||||||
|
rm -rf /tmp/hadoop_name/*
|
||||||
|
sleep 5
|
||||||
|
HUDI_WS=${WS_ROOT} docker-compose -f compose/docker-compose_hadoop284_hive233_spark231.yml up -d
|
||||||
|
sleep 15
|
||||||
|
|
||||||
|
docker exec -it adhoc-1 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
|
||||||
|
docker exec -it adhoc-2 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
|
||||||
@@ -14,11 +14,11 @@ Check out code and pull it into Intellij as a normal maven project.
|
|||||||
|
|
||||||
Normally build the maven project, from command line
|
Normally build the maven project, from command line
|
||||||
```
|
```
|
||||||
$ mvn clean install -DskipTests
|
$ mvn clean install -DskipTests -DskipITs
|
||||||
|
|
||||||
To work with older version of Hive (pre Hive-1.2.1), use
|
To work with older version of Hive (pre Hive-1.2.1), use
|
||||||
|
|
||||||
$ mvn clean install -DskipTests -Dhive11
|
$ mvn clean install -DskipTests -DskipITs -Dhive11
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -293,6 +293,947 @@ hive>
|
|||||||
{% include note.html content="This is only supported for Read-optimized tables for now." %}
|
{% include note.html content="This is only supported for Read-optimized tables for now." %}
|
||||||
|
|
||||||
|
|
||||||
|
## A Demo using docker containers
|
||||||
|
|
||||||
|
Lets use a real world example to see how hudi works end to end. For this purpose, a self contained
|
||||||
|
data infrastructure is brought up in a local docker cluster within your computer.
|
||||||
|
|
||||||
|
The steps assume you are using Mac laptop
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
* Docker Setup : For Mac, Please follow the steps as defined in [https://docs.docker.com/v17.12/docker-for-mac/install/]. For running Spark-SQL queries, please ensure atleast 6 GB and 4 CPUs are allocated to Docker (See Docker -> Preferences -> Advanced). Otherwise, spark-SQL queries could be killed because of memory issues.
|
||||||
|
* kafkacat : A command-line utility to publish/consume from kafka topics. Use `brew install kafkacat` to install kafkacat
|
||||||
|
* /etc/hosts : The demo references many services running in container by the hostname. Add the following settings to /etc/hosts
|
||||||
|
|
||||||
|
```
|
||||||
|
127.0.0.1 adhoc-1
|
||||||
|
127.0.0.1 adhoc-2
|
||||||
|
127.0.0.1 namenode
|
||||||
|
127.0.0.1 datanode1
|
||||||
|
127.0.0.1 hiveserver
|
||||||
|
127.0.0.1 hivemetastore
|
||||||
|
127.0.0.1 kafkabroker
|
||||||
|
127.0.0.1 sparkmaster
|
||||||
|
127.0.0.1 zookeeper
|
||||||
|
```
|
||||||
|
|
||||||
|
### Setting up Docker Cluster
|
||||||
|
|
||||||
|
|
||||||
|
#### Build Hoodie
|
||||||
|
|
||||||
|
The first step is to build hoodie
|
||||||
|
```
|
||||||
|
cd <HUDI_WORKSPACE>
|
||||||
|
mvn package -DskipTests
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Bringing up Demo Cluster
|
||||||
|
|
||||||
|
The next step is to run the docker compose script and setup configs for bringing up the cluster.
|
||||||
|
This should pull the docker images from docker hub and setup docker cluster.
|
||||||
|
|
||||||
|
```
|
||||||
|
cd docker
|
||||||
|
./setup_demo.sh
|
||||||
|
....
|
||||||
|
....
|
||||||
|
....
|
||||||
|
Stopping spark-worker-1 ... done
|
||||||
|
Stopping hiveserver ... done
|
||||||
|
Stopping hivemetastore ... done
|
||||||
|
Stopping historyserver ... done
|
||||||
|
.......
|
||||||
|
......
|
||||||
|
Creating network "hudi_demo" with the default driver
|
||||||
|
Creating hive-metastore-postgresql ... done
|
||||||
|
Creating namenode ... done
|
||||||
|
Creating zookeeper ... done
|
||||||
|
Creating kafkabroker ... done
|
||||||
|
Creating hivemetastore ... done
|
||||||
|
Creating historyserver ... done
|
||||||
|
Creating hiveserver ... done
|
||||||
|
Creating datanode1 ... done
|
||||||
|
Creating sparkmaster ... done
|
||||||
|
Creating adhoc-1 ... done
|
||||||
|
Creating adhoc-2 ... done
|
||||||
|
Creating spark-worker-1 ... done
|
||||||
|
Copying spark default config and setting up configs
|
||||||
|
Copying spark default config and setting up configs
|
||||||
|
Copying spark default config and setting up configs
|
||||||
|
varadarb-C02SG7Q3G8WP:docker varadarb$ docker ps
|
||||||
|
```
|
||||||
|
|
||||||
|
At this point, the docker cluster will be up and running. The demo cluster brings up the following services
|
||||||
|
|
||||||
|
* HDFS Services (NameNode, DataNode)
|
||||||
|
* Spark Master and Worker
|
||||||
|
* Hive Services (Metastore, HiveServer2 along with PostgresDB)
|
||||||
|
* Kafka Broker and a Zookeeper Node (Kakfa will be used as upstream source for the demo)
|
||||||
|
* Adhoc containers to run Hudi/Hive CLI commands
|
||||||
|
|
||||||
|
### Demo
|
||||||
|
|
||||||
|
Stock Tracker data will be used to showcase both different Hudi Views and the effects of Compaction.
|
||||||
|
|
||||||
|
Take a look at the directory `docker/demo/data`. There are 2 batches of stock data - each at 1 minute granularity.
|
||||||
|
The first batch contains stocker tracker data for some stock symbols during the first hour of trading window
|
||||||
|
(9:30 a.m to 10:30 a.m). The second batch contains tracker data for next 30 mins (10:30 - 11 a.m). Hudi will
|
||||||
|
be used to ingest these batches to a dataset which will contain the latest stock tracker data at hour level granularity.
|
||||||
|
The batches are windowed intentionally so that the second batch contains updates to some of the rows in the first batch.
|
||||||
|
|
||||||
|
#### Step 1 : Publish the first batch to Kafka
|
||||||
|
|
||||||
|
Upload the first batch to Kafka topic 'stock ticks'
|
||||||
|
|
||||||
|
```
|
||||||
|
cat docker/demo/data/batch_1.json | kafkacat -b kafkabroker -t stock_ticks -P
|
||||||
|
|
||||||
|
To check if the new topic shows up, use
|
||||||
|
kafkacat -b kafkabroker -L -J | jq .
|
||||||
|
{
|
||||||
|
"originating_broker": {
|
||||||
|
"id": 1001,
|
||||||
|
"name": "kafkabroker:9092/1001"
|
||||||
|
},
|
||||||
|
"query": {
|
||||||
|
"topic": "*"
|
||||||
|
},
|
||||||
|
"brokers": [
|
||||||
|
{
|
||||||
|
"id": 1001,
|
||||||
|
"name": "kafkabroker:9092"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"topics": [
|
||||||
|
{
|
||||||
|
"topic": "stock_ticks",
|
||||||
|
"partitions": [
|
||||||
|
{
|
||||||
|
"partition": 0,
|
||||||
|
"leader": 1001,
|
||||||
|
"replicas": [
|
||||||
|
{
|
||||||
|
"id": 1001
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"isrs": [
|
||||||
|
{
|
||||||
|
"id": 1001
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Step 2: Incrementally ingest data from Kafka topic
|
||||||
|
|
||||||
|
Hudi comes with a tool named DeltaStreamer. This tool can connect to variety of data sources (including Kafka) to
|
||||||
|
pull changes and apply to Hudi dataset using upsert/insert primitives. Here, we will use the tool to download
|
||||||
|
json data from kafka topic and ingest to both COW and MOR tables we initialized in the previous step. This tool
|
||||||
|
automatically initializes the datasets in the file-system if they do not exist yet.
|
||||||
|
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-2 /bin/bash
|
||||||
|
|
||||||
|
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow dataset in HDFS
|
||||||
|
spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type COPY_ON_WRITE --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties
|
||||||
|
....
|
||||||
|
....
|
||||||
|
2018-09-24 22:20:00 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:54 - OutputCommitCoordinator stopped!
|
||||||
|
2018-09-24 22:20:00 INFO SparkContext:54 - Successfully stopped SparkContext
|
||||||
|
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor dataset in HDFS
|
||||||
|
spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type MERGE_ON_READ --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props /var/demo/config/kafka-source.properties
|
||||||
|
....
|
||||||
|
2018-09-24 22:22:01 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:54 - OutputCommitCoordinator stopped!
|
||||||
|
2018-09-24 22:22:01 INFO SparkContext:54 - Successfully stopped SparkContext
|
||||||
|
....
|
||||||
|
|
||||||
|
# As part of the setup (Look at setup_demo.sh), the configs needed for DeltaStreamer is uploaded to HDFS. The configs
|
||||||
|
# contain mostly Kafa connectivity settings, the avro-schema to be used for ingesting along with key and partitioning fields.
|
||||||
|
|
||||||
|
exit
|
||||||
|
```
|
||||||
|
|
||||||
|
You can use HDFS web-browser to look at the datasets
|
||||||
|
`http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow`.
|
||||||
|
|
||||||
|
You can explore the new partition folder created in the dataset along with a "deltacommit"
|
||||||
|
file under .hoodie which signals a successful commit.
|
||||||
|
|
||||||
|
There will be a similar setup when you browse the MOR dataset
|
||||||
|
`http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor`
|
||||||
|
|
||||||
|
|
||||||
|
#### Step 3: Sync with Hive
|
||||||
|
|
||||||
|
At this step, the datasets are available in HDFS. We need to sync with Hive to create new Hive tables and add partitions
|
||||||
|
inorder to run Hive queries against those datasets.
|
||||||
|
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-2 /bin/bash
|
||||||
|
|
||||||
|
# THis command takes in HIveServer URL and COW Hudi Dataset location in HDFS and sync the HDFS state to Hive
|
||||||
|
/var/hoodie/ws/hoodie-hive/run_sync_tool.sh --jdbc-url jdbc:hive2://hiveserver:10000 --user hive --pass hive --partitioned-by dt --base-path /user/hive/warehouse/stock_ticks_cow --database default --table stock_ticks_cow
|
||||||
|
.....
|
||||||
|
2018-09-24 22:22:45,568 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_cow
|
||||||
|
.....
|
||||||
|
|
||||||
|
# Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR storage)
|
||||||
|
/var/hoodie/ws/hoodie-hive/run_sync_tool.sh --jdbc-url jdbc:hive2://hiveserver:10000 --user hive --pass hive --partitioned-by dt --base-path /user/hive/warehouse/stock_ticks_mor --database default --table stock_ticks_mor
|
||||||
|
...
|
||||||
|
2018-09-24 22:23:09,171 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_mor
|
||||||
|
...
|
||||||
|
2018-09-24 22:23:09,559 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_mor_rt
|
||||||
|
....
|
||||||
|
exit
|
||||||
|
```
|
||||||
|
After executing the above command, you will notice
|
||||||
|
|
||||||
|
1. A hive table named `stock_ticks_cow` created which provides Read-Optimized view for the Copy On Write dataset.
|
||||||
|
2. Two new tables `stock_ticks_mor` and `stock_ticks_mor_rt` created for the Merge On Read dataset. The former
|
||||||
|
provides the ReadOptimized view for the Hudi dataset and the later provides the realtime-view for the dataset.
|
||||||
|
|
||||||
|
|
||||||
|
#### Step 4 (a): Run Hive Queries
|
||||||
|
|
||||||
|
Run a hive query to find the latest timestamp ingested for stock symbol 'GOOG'. You will notice that both read-optimized
|
||||||
|
(for both COW and MOR dataset)and realtime views (for MOR dataset)give the same value "10:29 a.m" as Hudi create a
|
||||||
|
parquet file for the first batch of data.
|
||||||
|
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-2 /bin/bash
|
||||||
|
beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
|
||||||
|
# List Tables
|
||||||
|
0: jdbc:hive2://hiveserver:10000> show tables;
|
||||||
|
+---------------------+--+
|
||||||
|
| tab_name |
|
||||||
|
+---------------------+--+
|
||||||
|
| stock_ticks_cow |
|
||||||
|
| stock_ticks_mor |
|
||||||
|
| stock_ticks_mor_rt |
|
||||||
|
+---------------------+--+
|
||||||
|
2 rows selected (0.801 seconds)
|
||||||
|
0: jdbc:hive2://hiveserver:10000>
|
||||||
|
|
||||||
|
|
||||||
|
# Look at partitions that were added
|
||||||
|
0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt;
|
||||||
|
+----------------+--+
|
||||||
|
| partition |
|
||||||
|
+----------------+--+
|
||||||
|
| dt=2018-08-31 |
|
||||||
|
+----------------+--+
|
||||||
|
1 row selected (0.24 seconds)
|
||||||
|
|
||||||
|
|
||||||
|
# COPY-ON-WRITE Queries:
|
||||||
|
=========================
|
||||||
|
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:29:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
|
||||||
|
Now, run a projection query:
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924221953 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
|
||||||
|
# Merge-On-Read Queries:
|
||||||
|
==========================
|
||||||
|
|
||||||
|
Lets run similar queries against M-O-R dataset. Lets look at both
|
||||||
|
ReadOptimized and Realtime views supported by M-O-R dataset
|
||||||
|
|
||||||
|
# Run against ReadOptimized View. Notice that the latest timestamp is 10:29
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
|
||||||
|
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:29:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
1 row selected (6.326 seconds)
|
||||||
|
|
||||||
|
|
||||||
|
# Run against Realtime View. Notice that the latest timestamp is again 10:29
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
|
||||||
|
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:29:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
1 row selected (1.606 seconds)
|
||||||
|
|
||||||
|
|
||||||
|
# Run projection query against Read Optimized and Realtime tables
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
exit
|
||||||
|
exit
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Step 4 (b): Run Spark-SQL Queries
|
||||||
|
Hudi support Spark as query processor just like Hive. Here are the same hive queries
|
||||||
|
running in spark-sql
|
||||||
|
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-1 /bin/bash
|
||||||
|
$SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --master local[2] --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0
|
||||||
|
...
|
||||||
|
|
||||||
|
Welcome to
|
||||||
|
____ __
|
||||||
|
/ __/__ ___ _____/ /__
|
||||||
|
_\ \/ _ \/ _ `/ __/ '_/
|
||||||
|
/___/ .__/\_,_/_/ /_/\_\ version 2.3.1
|
||||||
|
/_/
|
||||||
|
|
||||||
|
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_181)
|
||||||
|
Type in expressions to have them evaluated.
|
||||||
|
Type :help for more information.
|
||||||
|
|
||||||
|
scala>
|
||||||
|
scala> spark.sql("show tables").show(100, false)
|
||||||
|
+--------+------------------+-----------+
|
||||||
|
|database|tableName |isTemporary|
|
||||||
|
+--------+------------------+-----------+
|
||||||
|
|default |stock_ticks_cow |false |
|
||||||
|
|default |stock_ticks_mor |false |
|
||||||
|
|default |stock_ticks_mor_rt|false |
|
||||||
|
+--------+------------------+-----------+
|
||||||
|
|
||||||
|
# Copy-On-Write Table
|
||||||
|
|
||||||
|
## Run max timestamp query against COW table
|
||||||
|
|
||||||
|
scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false)
|
||||||
|
[Stage 0:> (0 + 1) / 1]SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
|
||||||
|
SLF4J: Defaulting to no-operation (NOP) logger implementation
|
||||||
|
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
|
||||||
|
+------+-------------------+
|
||||||
|
|symbol|max(ts) |
|
||||||
|
+------+-------------------+
|
||||||
|
|GOOG |2018-08-31 10:29:00|
|
||||||
|
+------+-------------------+
|
||||||
|
|
||||||
|
## Projection Query
|
||||||
|
|
||||||
|
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false)
|
||||||
|
+-------------------+------+-------------------+------+---------+--------+
|
||||||
|
|_hoodie_commit_time|symbol|ts |volume|open |close |
|
||||||
|
+-------------------+------+-------------------+------+---------+--------+
|
||||||
|
|20180924221953 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |
|
||||||
|
|20180924221953 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|
|
||||||
|
+-------------------+------+-------------------+------+---------+--------+
|
||||||
|
|
||||||
|
# Merge-On-Read Queries:
|
||||||
|
==========================
|
||||||
|
|
||||||
|
Lets run similar queries against M-O-R dataset. Lets look at both
|
||||||
|
ReadOptimized and Realtime views supported by M-O-R dataset
|
||||||
|
|
||||||
|
# Run against ReadOptimized View. Notice that the latest timestamp is 10:29
|
||||||
|
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false)
|
||||||
|
+------+-------------------+
|
||||||
|
|symbol|max(ts) |
|
||||||
|
+------+-------------------+
|
||||||
|
|GOOG |2018-08-31 10:29:00|
|
||||||
|
+------+-------------------+
|
||||||
|
|
||||||
|
|
||||||
|
# Run against Realtime View. Notice that the latest timestamp is again 10:29
|
||||||
|
|
||||||
|
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
|
||||||
|
+------+-------------------+
|
||||||
|
|symbol|max(ts) |
|
||||||
|
+------+-------------------+
|
||||||
|
|GOOG |2018-08-31 10:29:00|
|
||||||
|
+------+-------------------+
|
||||||
|
|
||||||
|
# Run projection query against Read Optimized and Realtime tables
|
||||||
|
|
||||||
|
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false)
|
||||||
|
+-------------------+------+-------------------+------+---------+--------+
|
||||||
|
|_hoodie_commit_time|symbol|ts |volume|open |close |
|
||||||
|
+-------------------+------+-------------------+------+---------+--------+
|
||||||
|
|20180924222155 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |
|
||||||
|
|20180924222155 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|
|
||||||
|
+-------------------+------+-------------------+------+---------+--------+
|
||||||
|
|
||||||
|
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)
|
||||||
|
+-------------------+------+-------------------+------+---------+--------+
|
||||||
|
|_hoodie_commit_time|symbol|ts |volume|open |close |
|
||||||
|
+-------------------+------+-------------------+------+---------+--------+
|
||||||
|
|20180924222155 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |
|
||||||
|
|20180924222155 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|
|
||||||
|
+-------------------+------+-------------------+------+---------+--------+
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
#### Step 5: Upload second batch to Kafka and run DeltaStreamer to ingest
|
||||||
|
|
||||||
|
Upload the second batch of data and ingest this batch using delta-streamer. As this batch does not bring in any new
|
||||||
|
partitions, there is no need to run hive-sync
|
||||||
|
|
||||||
|
```
|
||||||
|
cat docker/demo/data/batch_2.json | kafkacat -b kafkabroker -t stock_ticks -P
|
||||||
|
|
||||||
|
# Within Docker container, run the ingestion command
|
||||||
|
docker exec -it adhoc-2 /bin/bash
|
||||||
|
|
||||||
|
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow dataset in HDFS
|
||||||
|
spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type COPY_ON_WRITE --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties
|
||||||
|
|
||||||
|
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor dataset in HDFS
|
||||||
|
spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type MERGE_ON_READ --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props /var/demo/config/kafka-source.properties
|
||||||
|
|
||||||
|
exit
|
||||||
|
```
|
||||||
|
With Copy-On-Write table, the second ingestion by DeltaStreamer resulted in a new version of Parquet file getting created.
|
||||||
|
See `http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow/2018/08/31`
|
||||||
|
|
||||||
|
With Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file.
|
||||||
|
Take a look at the HDFS filesystem to get an idea: `http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor/2018/08/31`
|
||||||
|
|
||||||
|
#### Step 6(a): Run Hive Queries
|
||||||
|
|
||||||
|
With Copy-On-Write table, the read-optimized view immediately sees the changes as part of second batch once the batch
|
||||||
|
got committed as each ingestion creates newer versions of parquet files.
|
||||||
|
|
||||||
|
With Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file.
|
||||||
|
This is the time, when ReadOptimized and Realtime views will provide different results. ReadOptimized view will still
|
||||||
|
return "10:29 am" as it will only read from the Parquet file. Realtime View will do on-the-fly merge and return
|
||||||
|
latest committed data which is "10:59 a.m".
|
||||||
|
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-2 /bin/bash
|
||||||
|
beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
|
||||||
|
|
||||||
|
# Copy On Write Table:
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
|
||||||
|
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:59:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
1 row selected (1.932 seconds)
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924224524 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.
|
||||||
|
|
||||||
|
|
||||||
|
# Merge On Read Table:
|
||||||
|
|
||||||
|
# Read Optimized View
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
|
||||||
|
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:29:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
1 row selected (1.6 seconds)
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
# Realtime View
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
|
||||||
|
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:59:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924224537 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
exit
|
||||||
|
exit
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Step 6(b): Run Spark SQL Queries
|
||||||
|
|
||||||
|
Running the same queries in Spark-SQL:
|
||||||
|
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-1 /bin/bash
|
||||||
|
bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0
|
||||||
|
|
||||||
|
# Copy On Write Table:
|
||||||
|
|
||||||
|
scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false)
|
||||||
|
+------+-------------------+
|
||||||
|
|symbol|max(ts) |
|
||||||
|
+------+-------------------+
|
||||||
|
|GOOG |2018-08-31 10:59:00|
|
||||||
|
+------+-------------------+
|
||||||
|
|
||||||
|
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false)
|
||||||
|
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924224524 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.
|
||||||
|
|
||||||
|
|
||||||
|
# Merge On Read Table:
|
||||||
|
|
||||||
|
# Read Optimized View
|
||||||
|
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false)
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:29:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
1 row selected (1.6 seconds)
|
||||||
|
|
||||||
|
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false)
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
# Realtime View
|
||||||
|
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:59:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
|
||||||
|
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924224537 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
exit
|
||||||
|
exit
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Step 7 : Incremental Query for COPY-ON-WRITE Table
|
||||||
|
|
||||||
|
With 2 batches of data ingested, let's showcase the support for incremental queries in Hudi Copy-On-Write datasets
|
||||||
|
|
||||||
|
Let's take the same projection query example
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-2 /bin/bash
|
||||||
|
beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924064621 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
As you notice from the above queries, there are 2 commits - 20180924064621 and 20180924065039 in timeline order.
|
||||||
|
When you follow the steps, you will be getting different timestamps for commits. Substitute them
|
||||||
|
in place of the above timestamps.
|
||||||
|
|
||||||
|
To show the effects of incremental-query, let us assume that a reader has already seen the changes as part of
|
||||||
|
ingesting first batch. Now, for the reader to see effect of the second batch, he/she has to keep the start timestamp to
|
||||||
|
the commit time of the first batch (20180924064621) and run incremental query
|
||||||
|
|
||||||
|
`Hudi incremental mode` provides efficient scanning for incremental queries by filtering out files that do not have any
|
||||||
|
candidate rows using hudi-managed metadata.
|
||||||
|
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-2 /bin/bash
|
||||||
|
beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
|
||||||
|
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.mode=INCREMENTAL;
|
||||||
|
No rows affected (0.009 seconds)
|
||||||
|
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.max.commits=3;
|
||||||
|
No rows affected (0.009 seconds)
|
||||||
|
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.start.timestamp=20180924064621;
|
||||||
|
|
||||||
|
# With the above setting, file-ids that do not have any updates from the commit 20180924065039 are filtered out without scanning.
|
||||||
|
# Here is the incremental query :
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000>
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064621';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
1 row selected (0.83 seconds)
|
||||||
|
0: jdbc:hive2://hiveserver:10000>
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Incremental Query with Spark SQL:
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-1 /bin/bash
|
||||||
|
bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0
|
||||||
|
Welcome to
|
||||||
|
____ __
|
||||||
|
/ __/__ ___ _____/ /__
|
||||||
|
_\ \/ _ \/ _ `/ __/ '_/
|
||||||
|
/___/ .__/\_,_/_/ /_/\_\ version 2.3.1
|
||||||
|
/_/
|
||||||
|
|
||||||
|
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_181)
|
||||||
|
Type in expressions to have them evaluated.
|
||||||
|
Type :help for more information.
|
||||||
|
|
||||||
|
scala> import com.uber.hoodie.DataSourceReadOptions
|
||||||
|
import com.uber.hoodie.DataSourceReadOptions
|
||||||
|
|
||||||
|
# In the below query, 20180924064621 is the first commit's timestamp
|
||||||
|
scala> val hoodieIncViewDF = spark.read.format("com.uber.hoodie").option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY, DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL).option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20180924064621").load("/user/hive/warehouse/stock_ticks_cow")
|
||||||
|
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
|
||||||
|
SLF4J: Defaulting to no-operation (NOP) logger implementation
|
||||||
|
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
|
||||||
|
hoodieIncViewDF: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 15 more fields]
|
||||||
|
|
||||||
|
scala> hoodieIncViewDF.registerTempTable("stock_ticks_cow_incr_tmp1")
|
||||||
|
warning: there was one deprecation warning; re-run with -deprecation for details
|
||||||
|
|
||||||
|
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_incr_tmp1 where symbol = 'GOOG'").show(100, false);
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
#### Step 8: Schedule and Run Compaction for Merge-On-Read dataset
|
||||||
|
|
||||||
|
Let's schedule and run a compaction to create a new version of columnar file so that read-optimized readers will see fresher data.
|
||||||
|
Again, you can use the Hudi CLI to manually schedule and run compaction
|
||||||
|
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-1 /bin/bash
|
||||||
|
root@adhoc-1:/opt# /var/hoodie/ws/hoodie-cli/hoodie-cli.sh
|
||||||
|
============================================
|
||||||
|
* *
|
||||||
|
* _ _ _ _ *
|
||||||
|
* | | | | | (_) *
|
||||||
|
* | |__| | ___ ___ __| |_ ___ *
|
||||||
|
* | __ |/ _ \ / _ \ / _` | |/ _ \ *
|
||||||
|
* | | | | (_) | (_) | (_| | | __/ *
|
||||||
|
* |_| |_|\___/ \___/ \__,_|_|\___| *
|
||||||
|
* *
|
||||||
|
============================================
|
||||||
|
|
||||||
|
Welcome to Hoodie CLI. Please type help if you are looking for help.
|
||||||
|
hoodie->connect --path /user/hive/warehouse/stock_ticks_mor
|
||||||
|
18/09/24 06:59:34 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
|
||||||
|
18/09/24 06:59:35 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
|
||||||
|
18/09/24 06:59:35 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
|
||||||
|
18/09/24 06:59:35 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
|
||||||
|
18/09/24 06:59:36 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor
|
||||||
|
Metadata for table stock_ticks_mor loaded
|
||||||
|
|
||||||
|
# Ensure no compactions are present
|
||||||
|
|
||||||
|
hoodie:stock_ticks_mor->compactions show all
|
||||||
|
18/09/24 06:59:54 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED]]
|
||||||
|
___________________________________________________________________
|
||||||
|
| Compaction Instant Time| State | Total FileIds to be Compacted|
|
||||||
|
|==================================================================|
|
||||||
|
|
||||||
|
# Schedule a compaction. This will use Spark Launcher to schedule compaction
|
||||||
|
hoodie:stock_ticks_mor->compaction schedule
|
||||||
|
....
|
||||||
|
Compaction successfully scheduled with instant time 20180924070031
|
||||||
|
|
||||||
|
# Now refresh and check again. You will see that there is a new compaction requested
|
||||||
|
|
||||||
|
hoodie:stock_ticks->connect --path /user/hive/warehouse/stock_ticks_mor
|
||||||
|
18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
|
||||||
|
18/09/24 07:01:16 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
|
||||||
|
18/09/24 07:01:16 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
|
||||||
|
18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor
|
||||||
|
Metadata for table stock_ticks_mor loaded
|
||||||
|
|
||||||
|
hoodie:stock_ticks_mor->compactions show all
|
||||||
|
18/09/24 06:34:12 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED], [==>20180924070031__compaction__REQUESTED]]
|
||||||
|
___________________________________________________________________
|
||||||
|
| Compaction Instant Time| State | Total FileIds to be Compacted|
|
||||||
|
|==================================================================|
|
||||||
|
| 20180924070031 | REQUESTED| 1 |
|
||||||
|
|
||||||
|
# Execute the compaction. The compaction instant value passed below must be the one displayed in the above "compactions show all" query
|
||||||
|
hoodie:stock_ticks_mor->compaction run --compactionInstant 20180924070031 --parallelism 2 --sparkMemory 1G --schemaFilePath /var/demo/config/schema.avsc --retry 1
|
||||||
|
....
|
||||||
|
Compaction successfully completed for 20180924070031
|
||||||
|
|
||||||
|
|
||||||
|
## Now check if compaction is completed
|
||||||
|
|
||||||
|
hoodie:stock_ticks_mor->connect --path /user/hive/warehouse/stock_ticks_mor
|
||||||
|
18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
|
||||||
|
18/09/24 07:03:00 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
|
||||||
|
18/09/24 07:03:00 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
|
||||||
|
18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor
|
||||||
|
Metadata for table stock_ticks_mor loaded
|
||||||
|
|
||||||
|
hoodie:stock_ticks->compactions show all
|
||||||
|
18/09/24 07:03:15 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED], [20180924070031__commit__COMPLETED]]
|
||||||
|
___________________________________________________________________
|
||||||
|
| Compaction Instant Time| State | Total FileIds to be Compacted|
|
||||||
|
|==================================================================|
|
||||||
|
| 20180924070031 | COMPLETED| 1 |
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Step 9: Run Hive Queries including incremental queries
|
||||||
|
|
||||||
|
You will see that both ReadOptimized and Realtime Views will show the latest committed data.
|
||||||
|
Let's also run the incremental query for the MOR table.
|
||||||
|
From looking at the below query output, it will be clear that the first commit time for the MOR table is 20180924064636
|
||||||
|
and the second commit time is 20180924070031
|
||||||
|
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-2 /bin/bash
|
||||||
|
beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
|
||||||
|
|
||||||
|
# Read Optimized View
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
|
||||||
|
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:59:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
1 row selected (1.6 seconds)
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
# Realtime View
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
|
||||||
|
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:59:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
# Incremental View:
|
||||||
|
|
||||||
|
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.mode=INCREMENTAL;
|
||||||
|
No rows affected (0.008 seconds)
|
||||||
|
# Max-Commits covers both second batch and compaction commit
|
||||||
|
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.max.commits=3;
|
||||||
|
No rows affected (0.007 seconds)
|
||||||
|
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.start.timestamp=20180924064636;
|
||||||
|
No rows affected (0.013 seconds)
|
||||||
|
# Query:
|
||||||
|
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064636';
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
exit
|
||||||
|
exit
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Read Optimized and Realtime Views for MOR with Spark-SQL after compaction
|
||||||
|
|
||||||
|
```
|
||||||
|
docker exec -it adhoc-1 /bin/bash
|
||||||
|
bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0
|
||||||
|
|
||||||
|
# Read Optimized View
|
||||||
|
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false)
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:59:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
1 row selected (1.6 seconds)
|
||||||
|
|
||||||
|
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false)
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
|
||||||
|
# Realtime View
|
||||||
|
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| symbol | _c1 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
| GOOG | 2018-08-31 10:59:00 |
|
||||||
|
+---------+----------------------+--+
|
||||||
|
|
||||||
|
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| _hoodie_commit_time | symbol | ts | volume | open | close |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
|
||||||
|
| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
|
||||||
|
+----------------------+---------+----------------------+---------+------------+-----------+--+
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
This brings the demo to an end.
|
||||||
|
|
||||||
|
## Testing Hoodie in Local Docker environment
|
||||||
|
|
||||||
|
You can bring up a hadoop docker environment containing Hadoop, Hive and Spark services with support for hoodie.
|
||||||
|
```
|
||||||
|
$ mvn pre-integration-test -DskipTests
|
||||||
|
```
|
||||||
|
The above command builds docker images for all the services with
|
||||||
|
current hoodie source installed at /var/hoodie/ws and also brings up the services using a compose file. We
|
||||||
|
currently use Hadoop (v2.8.4), Hive (v2.3.3) and Spark (v2.3.1) in docker images.
|
||||||
|
|
||||||
|
To bring down the containers
|
||||||
|
```
|
||||||
|
$ cd hoodie-integ-test
|
||||||
|
$ mvn docker-compose:down
|
||||||
|
```
|
||||||
|
|
||||||
|
If you want to bring up the docker containers, use
|
||||||
|
```
|
||||||
|
$ cd hoodie-integ-test
|
||||||
|
$ mvn docker-compose:up -DdetachedMode=true
|
||||||
|
```
|
||||||
|
|
||||||
|
Hoodie is a library that is operated in a broader data analytics/ingestion environment
|
||||||
|
involving Hadoop, Hive and Spark. Interoperability with all these systems is a key objective for us. We are
|
||||||
|
actively adding integration-tests under __hoodie-integ-test/src/test/java__ that makes use of this
|
||||||
|
docker environment (See __hoodie-integ-test/src/test/java/com/uber/hoodie/integ/ITTestHoodieSanity.java__ )
|
||||||
|
|
||||||
|
|
||||||
|
#### Building Local Docker Containers:
|
||||||
|
|
||||||
|
The docker images required for demo and running integration test are already in docker-hub. The docker images
|
||||||
|
and compose scripts are carefully implemented so that they serve dual-purpose
|
||||||
|
|
||||||
|
1. The docker images have inbuilt hudi jar files with environment variable pointing to those jars (HUDI_HADOOP_BUNDLE, ...)
|
||||||
|
2. For running integration-tests, we need the jars generated locally to be used for running services within docker. The
|
||||||
|
docker-compose scripts (see `docker/compose/docker-compose_hadoop284_hive233_spark231.yml`) ensures local jars override
|
||||||
|
inbuilt jars by mounting local HUDI workspace over the docker location
|
||||||
|
|
||||||
|
This helps avoid maintaining separate docker images and avoids the costly step of building HUDI docker images locally.
|
||||||
|
But if users want to test hudi from locations with lower network bandwidth, they can still build local images
|
||||||
|
by running the script
|
||||||
|
`docker/build_local_docker_images.sh` to build local docker images before running `docker/setup_demo.sh`
|
||||||
|
|
||||||
|
Here are the commands:
|
||||||
|
|
||||||
|
```
|
||||||
|
cd docker
|
||||||
|
./build_local_docker_images.sh
|
||||||
|
.....
|
||||||
|
|
||||||
|
[INFO] Reactor Summary:
|
||||||
|
[INFO]
|
||||||
|
[INFO] Hoodie ............................................. SUCCESS [ 1.709 s]
|
||||||
|
[INFO] hoodie-common ...................................... SUCCESS [ 9.015 s]
|
||||||
|
[INFO] hoodie-hadoop-mr ................................... SUCCESS [ 1.108 s]
|
||||||
|
[INFO] hoodie-client ...................................... SUCCESS [ 4.409 s]
|
||||||
|
[INFO] hoodie-hive ........................................ SUCCESS [ 0.976 s]
|
||||||
|
[INFO] hoodie-spark ....................................... SUCCESS [ 26.522 s]
|
||||||
|
[INFO] hoodie-utilities ................................... SUCCESS [ 16.256 s]
|
||||||
|
[INFO] hoodie-cli ......................................... SUCCESS [ 11.341 s]
|
||||||
|
[INFO] hoodie-hadoop-mr-bundle ............................ SUCCESS [ 1.893 s]
|
||||||
|
[INFO] hoodie-hive-bundle ................................. SUCCESS [ 14.099 s]
|
||||||
|
[INFO] hoodie-spark-bundle ................................ SUCCESS [ 58.252 s]
|
||||||
|
[INFO] hoodie-hadoop-docker ............................... SUCCESS [ 0.612 s]
|
||||||
|
[INFO] hoodie-hadoop-base-docker .......................... SUCCESS [04:04 min]
|
||||||
|
[INFO] hoodie-hadoop-namenode-docker ...................... SUCCESS [ 6.142 s]
|
||||||
|
[INFO] hoodie-hadoop-datanode-docker ...................... SUCCESS [ 7.763 s]
|
||||||
|
[INFO] hoodie-hadoop-history-docker ....................... SUCCESS [ 5.922 s]
|
||||||
|
[INFO] hoodie-hadoop-hive-docker .......................... SUCCESS [ 56.152 s]
|
||||||
|
[INFO] hoodie-hadoop-sparkbase-docker ..................... SUCCESS [01:18 min]
|
||||||
|
[INFO] hoodie-hadoop-sparkmaster-docker ................... SUCCESS [ 2.964 s]
|
||||||
|
[INFO] hoodie-hadoop-sparkworker-docker ................... SUCCESS [ 3.032 s]
|
||||||
|
[INFO] hoodie-hadoop-sparkadhoc-docker .................... SUCCESS [ 2.764 s]
|
||||||
|
[INFO] hoodie-integ-test .................................. SUCCESS [ 1.785 s]
|
||||||
|
[INFO] ------------------------------------------------------------------------
|
||||||
|
[INFO] BUILD SUCCESS
|
||||||
|
[INFO] ------------------------------------------------------------------------
|
||||||
|
[INFO] Total time: 09:15 min
|
||||||
|
[INFO] Finished at: 2018-09-10T17:47:37-07:00
|
||||||
|
[INFO] Final Memory: 236M/1848M
|
||||||
|
[INFO] ------------------------------------------------------------------------
|
||||||
|
```
|
||||||
0
hoodie-hive/run_sync_tool.sh
Normal file → Executable file
0
hoodie-hive/run_sync_tool.sh
Normal file → Executable file
@@ -14,9 +14,8 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package com.uber.hoodie.hive.util;
|
package com.uber.hoodie.hive;
|
||||||
|
|
||||||
import com.uber.hoodie.hive.PartitionValueExtractor;
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@@ -26,7 +26,6 @@ import com.google.common.collect.Lists;
|
|||||||
import com.uber.hoodie.common.util.SchemaTestUtil;
|
import com.uber.hoodie.common.util.SchemaTestUtil;
|
||||||
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent;
|
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent;
|
||||||
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
|
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
|
||||||
import com.uber.hoodie.hive.util.MultiPartKeysValueExtractor;
|
|
||||||
import com.uber.hoodie.hive.util.SchemaUtil;
|
import com.uber.hoodie.hive.util.SchemaUtil;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
|
|||||||
212
hoodie-integ-test/pom.xml
Normal file
212
hoodie-integ-test/pom.xml
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<artifactId>hoodie</artifactId>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<version>0.4.5-SNAPSHOT</version>
|
||||||
|
<relativePath>../pom.xml</relativePath>
|
||||||
|
</parent>
|
||||||
|
<artifactId>hoodie-integ-test</artifactId>
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.glassfish.jersey.connectors</groupId>
|
||||||
|
<artifactId>jersey-apache-connector</artifactId>
|
||||||
|
<version>2.17</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.glassfish.jersey.core</groupId>
|
||||||
|
<artifactId>jersey-server</artifactId>
|
||||||
|
<version>2.17</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.glassfish.jersey.containers</groupId>
|
||||||
|
<artifactId>jersey-container-servlet-core</artifactId>
|
||||||
|
<version>2.17</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-spark</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>org.glassfish.**</groupId>
|
||||||
|
<artifactId>*</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-common</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<classifier>tests</classifier>
|
||||||
|
<type>test-jar</type>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.awaitility</groupId>
|
||||||
|
<artifactId>awaitility</artifactId>
|
||||||
|
<version>3.1.2</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-spark</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<classifier>tests</classifier>
|
||||||
|
<type>test-jar</type>
|
||||||
|
<scope>test</scope>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>org.glassfish.**</groupId>
|
||||||
|
<artifactId>*</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.google.guava</groupId>
|
||||||
|
<artifactId>guava</artifactId>
|
||||||
|
<version>20.0</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
|
<artifactId>jackson-annotations</artifactId>
|
||||||
|
<version>2.6.4</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
|
<artifactId>jackson-databind</artifactId>
|
||||||
|
<version>2.6.4</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.fasterxml.jackson.datatype</groupId>
|
||||||
|
<artifactId>jackson-datatype-guava</artifactId>
|
||||||
|
<version>2.9.4</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.github.docker-java</groupId>
|
||||||
|
<artifactId>docker-java</artifactId>
|
||||||
|
<version>3.1.0-rc-3</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>org.glassfish.**</groupId>
|
||||||
|
<artifactId>*</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.uber.hoodie</groupId>
|
||||||
|
<artifactId>hoodie-hadoop-sparkworker-docker</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
<type>pom</type>
|
||||||
|
<scope>import</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<version>${junit.version}</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<dockerCompose.envFile>${project.basedir}/compose_env</dockerCompose.envFile>
|
||||||
|
<dockerCompose.file>${project.basedir}/../docker/compose/docker-compose_hadoop284_hive233_spark231.yml</dockerCompose.file>
|
||||||
|
<skipITs>false</skipITs>
|
||||||
|
<docker.compose.skip>${skipITs}</docker.compose.skip>
|
||||||
|
<checkstyle.skip>true</checkstyle.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<artifactId>exec-maven-plugin</artifactId>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<executions>
|
||||||
|
<execution><!-- setup HUDI_WS variable in docker compose env file -->
|
||||||
|
<id>Setup HUDI_WS</id>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>exec</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<executable>/bin/bash</executable>
|
||||||
|
<arguments>
|
||||||
|
<argument> -c </argument>
|
||||||
|
<argument>echo HUDI_WS=`dirname ${project.basedir}`</argument>
|
||||||
|
</arguments>
|
||||||
|
<outputFile>${dockerCompose.envFile}</outputFile>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-failsafe-plugin</artifactId>
|
||||||
|
<version>2.22.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<goals>
|
||||||
|
<goal>integration-test</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>com.dkanejs.maven.plugins</groupId>
|
||||||
|
<artifactId>docker-compose-maven-plugin</artifactId>
|
||||||
|
<version>2.0.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<id>up</id>
|
||||||
|
<phase>pre-integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>up</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.compose.skip}</skip>
|
||||||
|
<host>unix:///var/run/docker.sock</host>
|
||||||
|
<composeFile>${project.basedir}/../docker/compose/docker-compose_hadoop284_hive233_spark231.yml</composeFile>
|
||||||
|
<detachedMode>true</detachedMode>
|
||||||
|
<envFile>${dockerCompose.envFile}</envFile>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
<execution>
|
||||||
|
<id>down</id>
|
||||||
|
<phase>integration-test</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>down</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<skip>${docker.compose.skip}</skip>
|
||||||
|
<composeFile>${project.basedir}/../docker/compose/docker-compose_hadoop284_hive233_spark231.yml</composeFile>
|
||||||
|
<removeVolumes>true</removeVolumes>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</project>
|
||||||
@@ -0,0 +1,178 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.uber.hoodie.integ;
|
||||||
|
|
||||||
|
import static java.util.concurrent.TimeUnit.SECONDS;
|
||||||
|
import static org.awaitility.Awaitility.await;
|
||||||
|
|
||||||
|
import com.github.dockerjava.api.DockerClient;
|
||||||
|
import com.github.dockerjava.api.command.DockerCmdExecFactory;
|
||||||
|
import com.github.dockerjava.api.command.ExecCreateCmd;
|
||||||
|
import com.github.dockerjava.api.command.ExecCreateCmdResponse;
|
||||||
|
import com.github.dockerjava.api.model.Container;
|
||||||
|
import com.github.dockerjava.core.DefaultDockerClientConfig;
|
||||||
|
import com.github.dockerjava.core.DockerClientBuilder;
|
||||||
|
import com.github.dockerjava.core.DockerClientConfig;
|
||||||
|
import com.github.dockerjava.core.command.ExecStartResultCallback;
|
||||||
|
import com.github.dockerjava.jaxrs.JerseyDockerCmdExecFactory;
|
||||||
|
import com.google.common.collect.ImmutableList;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Before;
|
||||||
|
|
||||||
|
public abstract class ITTestBase {
|
||||||
|
|
||||||
|
public static final Logger LOG = LogManager.getLogger(ITTestBase.class);
|
||||||
|
protected static final String SPARK_WORKER_CONTAINER = "/spark-worker-1";
|
||||||
|
protected static final String ADHOC_1_CONTAINER = "/adhoc-1";
|
||||||
|
protected static final String ADHOC_2_CONTAINER = "/adhoc-2";
|
||||||
|
protected static final String HIVESERVER = "/hiveserver";
|
||||||
|
protected static final String HOODIE_WS_ROOT = "/var/hoodie/ws";
|
||||||
|
protected static final String HOODIE_JAVA_APP = HOODIE_WS_ROOT + "/hoodie-spark/run_hoodie_app.sh";
|
||||||
|
protected static final String HUDI_HADOOP_BUNDLE =
|
||||||
|
HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-hadoop-mr-bundle.jar";
|
||||||
|
protected static final String HUDI_HIVE_BUNDLE =
|
||||||
|
HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-hive-bundle.jar";
|
||||||
|
protected static final String HUDI_SPARK_BUNDLE =
|
||||||
|
HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-spark-bundle.jar";
|
||||||
|
protected static final String HIVE_SERVER_JDBC_URL = "jdbc:hive2://hiveserver:10000";
|
||||||
|
// Skip these lines when capturing output from hive
|
||||||
|
protected static final Integer SLF4J_WARNING_LINE_COUNT_IN_HIVE_CMD = 9;
|
||||||
|
private static final String DEFAULT_DOCKER_HOST = "unix:///var/run/docker.sock";
|
||||||
|
private static final String OVERRIDDEN_DOCKER_HOST = System.getenv("DOCKER_HOST");
|
||||||
|
protected DockerClient dockerClient;
|
||||||
|
protected Map<String, Container> runningContainers;
|
||||||
|
|
||||||
|
protected static String[] getHiveConsoleCommand(String rawCommand) {
|
||||||
|
String jarCommand = "add jar " + HUDI_HADOOP_BUNDLE + ";";
|
||||||
|
String fullCommand = jarCommand + rawCommand;
|
||||||
|
|
||||||
|
List<String> cmd = new ImmutableList.Builder().add("hive")
|
||||||
|
.add("--hiveconf")
|
||||||
|
.add("hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat")
|
||||||
|
.add("--hiveconf")
|
||||||
|
.add("hive.stats.autogather=false")
|
||||||
|
.add("-e")
|
||||||
|
.add("\"" + fullCommand + "\"")
|
||||||
|
.build();
|
||||||
|
return cmd.stream().toArray(String[]::new);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void init() throws IOException {
|
||||||
|
String dockerHost = (OVERRIDDEN_DOCKER_HOST != null) ? OVERRIDDEN_DOCKER_HOST : DEFAULT_DOCKER_HOST;
|
||||||
|
//Assuming insecure docker engine
|
||||||
|
DockerClientConfig config = DefaultDockerClientConfig.createDefaultConfigBuilder()
|
||||||
|
.withDockerHost(dockerHost)
|
||||||
|
.build();
|
||||||
|
// using jaxrs/jersey implementation here (netty impl is also available)
|
||||||
|
DockerCmdExecFactory dockerCmdExecFactory = new JerseyDockerCmdExecFactory()
|
||||||
|
.withConnectTimeout(1000)
|
||||||
|
.withMaxTotalConnections(100)
|
||||||
|
.withMaxPerRouteConnections(10);
|
||||||
|
dockerClient = DockerClientBuilder.getInstance(config)
|
||||||
|
.withDockerCmdExecFactory(dockerCmdExecFactory)
|
||||||
|
.build();
|
||||||
|
await().atMost(60, SECONDS).until(this::servicesUp);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean servicesUp() {
|
||||||
|
List<Container> containerList = dockerClient.listContainersCmd().exec();
|
||||||
|
for (Container c : containerList) {
|
||||||
|
if (!c.getState().equalsIgnoreCase("running")) {
|
||||||
|
System.out.println("Container : " + Arrays.toString(c.getNames())
|
||||||
|
+ "not in running state, Curr State :" + c.getState());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
runningContainers = containerList.stream().map(c -> Pair.of(c.getNames()[0], c))
|
||||||
|
.collect(Collectors.toMap(Pair::getLeft, Pair::getRight));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected TestExecStartResultCallback executeCommandInDocker(String containerName, String[] command,
|
||||||
|
boolean expectedToSucceed)
|
||||||
|
throws Exception {
|
||||||
|
LOG.info("Executing command (" + Arrays.toString(command) + ") in container " + containerName);
|
||||||
|
Container sparkWorkerContainer = runningContainers.get(containerName);
|
||||||
|
ExecCreateCmd cmd = dockerClient.execCreateCmd(sparkWorkerContainer.getId())
|
||||||
|
.withCmd(command).withAttachStdout(true).withAttachStderr(true);
|
||||||
|
|
||||||
|
ExecCreateCmdResponse createCmdResponse = cmd.exec();
|
||||||
|
TestExecStartResultCallback callback = new TestExecStartResultCallback(new ByteArrayOutputStream(),
|
||||||
|
new ByteArrayOutputStream());
|
||||||
|
dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(false).withTty(false)
|
||||||
|
.exec(callback).awaitCompletion();
|
||||||
|
int exitCode = dockerClient.inspectExecCmd(createCmdResponse.getId()).exec().getExitCode();
|
||||||
|
LOG.info("Exit code for command (" + Arrays.toString(command) + ") is " + exitCode);
|
||||||
|
if (exitCode != 0) {
|
||||||
|
LOG.error("Command (" + Arrays.toString(command) + ") failed.");
|
||||||
|
LOG.error("Stdout is :" + callback.getStdout().toString());
|
||||||
|
LOG.error("Stderr is :" + callback.getStderr().toString());
|
||||||
|
}
|
||||||
|
if (expectedToSucceed) {
|
||||||
|
Assert.assertTrue("Command (" + Arrays.toString(command)
|
||||||
|
+ ") expected to succeed. Exit (" + exitCode + ")", exitCode == 0);
|
||||||
|
} else {
|
||||||
|
Assert.assertTrue("Command (" + Arrays.toString(command)
|
||||||
|
+ ") expected to fail. Exit (" + exitCode + ")", exitCode != 0);
|
||||||
|
}
|
||||||
|
cmd.close();
|
||||||
|
return callback;
|
||||||
|
}
|
||||||
|
|
||||||
|
public class TestExecStartResultCallback extends ExecStartResultCallback {
|
||||||
|
|
||||||
|
// Storing the reference in subclass to expose to clients
|
||||||
|
private final ByteArrayOutputStream stdout;
|
||||||
|
private final ByteArrayOutputStream stderr;
|
||||||
|
|
||||||
|
public TestExecStartResultCallback(ByteArrayOutputStream stdout, ByteArrayOutputStream stderr) {
|
||||||
|
super(stdout, stderr);
|
||||||
|
this.stdout = stdout;
|
||||||
|
this.stderr = stderr;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onComplete() {
|
||||||
|
super.onComplete();
|
||||||
|
LOG.info("onComplete called");
|
||||||
|
try {
|
||||||
|
stderr.flush();
|
||||||
|
stdout.flush();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public ByteArrayOutputStream getStdout() {
|
||||||
|
return stdout;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ByteArrayOutputStream getStderr() {
|
||||||
|
return stderr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,139 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.uber.hoodie.integ;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Smoke tests to run as part of verification.
|
||||||
|
*/
|
||||||
|
public class ITTestHoodieSanity extends ITTestBase {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRunEcho() throws Exception {
|
||||||
|
String[] cmd = new String[]{"echo", "Happy Testing"};
|
||||||
|
TestExecStartResultCallback callback = executeCommandInDocker(ADHOC_1_CONTAINER,
|
||||||
|
cmd, true);
|
||||||
|
String stdout = callback.getStdout().toString();
|
||||||
|
String stderr = callback.getStderr().toString();
|
||||||
|
LOG.info("Got output for (" + Arrays.toString(cmd) + ") :" + stdout);
|
||||||
|
LOG.info("Got error output for (" + Arrays.toString(cmd) + ") :" + stderr);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
/**
|
||||||
|
* A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie with single partition key
|
||||||
|
* data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count
|
||||||
|
* query in hive console.
|
||||||
|
*/
|
||||||
|
public void testRunHoodieJavaAppOnSinglePartitionKeyCOWTable() throws Exception {
|
||||||
|
String hiveTableName = "docker_hoodie_single_partition_key_cow_test";
|
||||||
|
testRunHoodieJavaAppOnCOWTable(hiveTableName, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
/**
|
||||||
|
* A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie with multiple partition-keys
|
||||||
|
* data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count
|
||||||
|
* query in hive console.
|
||||||
|
*/
|
||||||
|
public void testRunHoodieJavaAppOnMultiPartitionKeysCOWTable() throws Exception {
|
||||||
|
String hiveTableName = "docker_hoodie_multi_partition_key_cow_test";
|
||||||
|
testRunHoodieJavaAppOnCOWTable(hiveTableName, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie
|
||||||
|
* data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count
|
||||||
|
* query in hive console.
|
||||||
|
* TODO: Add spark-shell test-case
|
||||||
|
*/
|
||||||
|
public void testRunHoodieJavaAppOnCOWTable(String hiveTableName, boolean singlePartitionKey) throws Exception {
|
||||||
|
|
||||||
|
// Drop Table if it exists
|
||||||
|
{
|
||||||
|
String[] hiveDropCmd = getHiveConsoleCommand("drop table if exists " + hiveTableName);
|
||||||
|
executeCommandInDocker(HIVESERVER, hiveDropCmd, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure table does not exist
|
||||||
|
{
|
||||||
|
String[] hiveTableCheck = getHiveConsoleCommand("show tables like '" + hiveTableName + "'");
|
||||||
|
TestExecStartResultCallback callback =
|
||||||
|
executeCommandInDocker(HIVESERVER, hiveTableCheck, true);
|
||||||
|
String stderr = callback.getStderr().toString();
|
||||||
|
String stdout = callback.getStdout().toString();
|
||||||
|
LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") :" + stdout);
|
||||||
|
LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") :" + stderr);
|
||||||
|
Assert.assertTrue("Result :" + callback.getStdout().toString(), stdout.trim().isEmpty());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run Hoodie Java App
|
||||||
|
{
|
||||||
|
String[] cmd = null;
|
||||||
|
if (singlePartitionKey) {
|
||||||
|
cmd = new String[]{
|
||||||
|
HOODIE_JAVA_APP,
|
||||||
|
"--hive-sync",
|
||||||
|
"--hive-url", HIVE_SERVER_JDBC_URL,
|
||||||
|
"--hive-table", hiveTableName
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
cmd = new String[]{
|
||||||
|
HOODIE_JAVA_APP,
|
||||||
|
"--hive-sync",
|
||||||
|
"--hive-url", HIVE_SERVER_JDBC_URL,
|
||||||
|
"--use-multi-partition-keys",
|
||||||
|
"--hive-table", hiveTableName
|
||||||
|
};
|
||||||
|
}
|
||||||
|
TestExecStartResultCallback callback = executeCommandInDocker(ADHOC_1_CONTAINER,
|
||||||
|
cmd, true);
|
||||||
|
String stdout = callback.getStdout().toString().trim();
|
||||||
|
String stderr = callback.getStderr().toString().trim();
|
||||||
|
LOG.info("Got output for (" + Arrays.toString(cmd) + ") :" + stdout);
|
||||||
|
LOG.info("Got error output for (" + Arrays.toString(cmd) + ") :" + stderr);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure table does exist
|
||||||
|
{
|
||||||
|
String[] hiveTableCheck = getHiveConsoleCommand("show tables like '" + hiveTableName + "'");
|
||||||
|
TestExecStartResultCallback callback =
|
||||||
|
executeCommandInDocker(HIVESERVER, hiveTableCheck, true);
|
||||||
|
String stderr = callback.getStderr().toString().trim();
|
||||||
|
String stdout = callback.getStdout().toString().trim();
|
||||||
|
LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stdout + ")");
|
||||||
|
LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stderr + ")");
|
||||||
|
Assert.assertEquals("Table exists", hiveTableName, stdout);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure row count is 100 (without duplicates)
|
||||||
|
{
|
||||||
|
String[] hiveTableCheck = getHiveConsoleCommand("select count(1) from " + hiveTableName);
|
||||||
|
TestExecStartResultCallback callback =
|
||||||
|
executeCommandInDocker(ADHOC_1_CONTAINER, hiveTableCheck, true);
|
||||||
|
String stderr = callback.getStderr().toString().trim();
|
||||||
|
String stdout = callback.getStdout().toString().trim();
|
||||||
|
LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stdout + ")");
|
||||||
|
LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stderr + ")");
|
||||||
|
Assert.assertEquals("Expecting 100 rows to be present in the new table", 100,
|
||||||
|
Integer.parseInt(stdout.trim()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
log4j.rootLogger=WARN, A1
|
||||||
|
log4j.category.com.uber=INFO
|
||||||
|
log4j.category.org.apache.parquet.hadoop=WARN
|
||||||
|
# A1 is set to be a ConsoleAppender.
|
||||||
|
log4j.appender.A1=org.apache.log4j.ConsoleAppender
|
||||||
|
# A1 uses PatternLayout.
|
||||||
|
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
|
||||||
|
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
|
||||||
@@ -111,6 +111,21 @@
|
|||||||
</execution>
|
</execution>
|
||||||
</executions>
|
</executions>
|
||||||
</plugin>
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-jar-plugin</artifactId>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<goals>
|
||||||
|
<goal>test-jar</goal>
|
||||||
|
</goals>
|
||||||
|
<phase>test-compile</phase>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
<configuration>
|
||||||
|
<skip>false</skip>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.rat</groupId>
|
<groupId>org.apache.rat</groupId>
|
||||||
<artifactId>apache-rat-plugin</artifactId>
|
<artifactId>apache-rat-plugin</artifactId>
|
||||||
|
|||||||
2
hoodie-spark/run_hoodie_app.sh
Normal file → Executable file
2
hoodie-spark/run_hoodie_app.sh
Normal file → Executable file
@@ -21,4 +21,4 @@ fi
|
|||||||
OTHER_JARS=`ls -1 $DIR/target/lib/*jar | grep -v '*avro*-1.' | tr '\n' ':'`
|
OTHER_JARS=`ls -1 $DIR/target/lib/*jar | grep -v '*avro*-1.' | tr '\n' ':'`
|
||||||
#TODO - Need to move TestDataGenerator and HoodieJavaApp out of tests
|
#TODO - Need to move TestDataGenerator and HoodieJavaApp out of tests
|
||||||
echo "Running command : java -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp $@"
|
echo "Running command : java -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp $@"
|
||||||
java -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp "$@"
|
java -Xmx1G -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp "$@"
|
||||||
|
|||||||
@@ -153,7 +153,7 @@ object DataSourceWriteOptions {
|
|||||||
val HIVE_TABLE_OPT_KEY = "hoodie.datasource.hive_sync.table"
|
val HIVE_TABLE_OPT_KEY = "hoodie.datasource.hive_sync.table"
|
||||||
val HIVE_USER_OPT_KEY = "hoodie.datasource.hive_sync.username"
|
val HIVE_USER_OPT_KEY = "hoodie.datasource.hive_sync.username"
|
||||||
val HIVE_PASS_OPT_KEY = "hoodie.datasource.hive_sync.password"
|
val HIVE_PASS_OPT_KEY = "hoodie.datasource.hive_sync.password"
|
||||||
val HIVE_URL_OPT_KEY = "hoodie.datasource.hive_sync.jdbcUrl"
|
val HIVE_URL_OPT_KEY = "hoodie.datasource.hive_sync.jdbcurl"
|
||||||
val HIVE_PARTITION_FIELDS_OPT_KEY = "hoodie.datasource.hive_sync.partition_fields"
|
val HIVE_PARTITION_FIELDS_OPT_KEY = "hoodie.datasource.hive_sync.partition_fields"
|
||||||
val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = "hoodie.datasource.hive_sync.partition_extractor_class"
|
val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = "hoodie.datasource.hive_sync.partition_extractor_class"
|
||||||
val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = "hoodie.datasource.hive_sync.assume_date_partitioning"
|
val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = "hoodie.datasource.hive_sync.assume_date_partitioning"
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ import com.uber.hoodie.HoodieDataSourceHelpers;
|
|||||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||||
import com.uber.hoodie.common.model.HoodieTableType;
|
import com.uber.hoodie.common.model.HoodieTableType;
|
||||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||||
|
import com.uber.hoodie.hive.MultiPartKeysValueExtractor;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
@@ -65,7 +66,10 @@ public class HoodieJavaApp {
|
|||||||
private String hivePass = "hive";
|
private String hivePass = "hive";
|
||||||
|
|
||||||
@Parameter(names = {"--hive-url", "-hl"}, description = "hive JDBC URL")
|
@Parameter(names = {"--hive-url", "-hl"}, description = "hive JDBC URL")
|
||||||
private String hiveJdbcUrl = "jdbc:hive://localhost:10000";
|
private String hiveJdbcUrl = "jdbc:hive2://localhost:10000";
|
||||||
|
|
||||||
|
@Parameter(names = {"--use-multi-partition-keys", "-mp"}, description = "Use Multiple Partition Keys")
|
||||||
|
private Boolean useMultiPartitionKeys = false;
|
||||||
|
|
||||||
@Parameter(names = {"--help", "-h"}, help = true)
|
@Parameter(names = {"--help", "-h"}, help = true)
|
||||||
public Boolean help = false;
|
public Boolean help = false;
|
||||||
@@ -188,10 +192,16 @@ public class HoodieJavaApp {
|
|||||||
writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable)
|
writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable)
|
||||||
.option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB)
|
.option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB)
|
||||||
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl)
|
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl)
|
||||||
.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr")
|
|
||||||
.option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser)
|
.option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser)
|
||||||
.option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
|
.option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
|
||||||
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
|
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
|
||||||
|
if (useMultiPartitionKeys) {
|
||||||
|
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day")
|
||||||
|
.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
|
||||||
|
MultiPartKeysValueExtractor.class.getCanonicalName());
|
||||||
|
} else {
|
||||||
|
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return writer;
|
return writer;
|
||||||
}
|
}
|
||||||
|
|||||||
2
pom.xml
2
pom.xml
@@ -39,6 +39,8 @@
|
|||||||
<module>packaging/hoodie-hadoop-mr-bundle</module>
|
<module>packaging/hoodie-hadoop-mr-bundle</module>
|
||||||
<module>packaging/hoodie-hive-bundle</module>
|
<module>packaging/hoodie-hive-bundle</module>
|
||||||
<module>packaging/hoodie-spark-bundle</module>
|
<module>packaging/hoodie-spark-bundle</module>
|
||||||
|
<module>docker/hoodie/hadoop</module>
|
||||||
|
<module>hoodie-integ-test</module>
|
||||||
</modules>
|
</modules>
|
||||||
|
|
||||||
<licenses>
|
<licenses>
|
||||||
|
|||||||
Reference in New Issue
Block a user