1
0

[HUDI-82] Adds Presto integration in Docker demo (#847)

This commit is contained in:
Bhavani Sudha Saktheeswaran
2019-08-22 19:40:36 -07:00
committed by vinoth chandar
parent 1b79ef7672
commit 92eed6aca8
17 changed files with 590 additions and 0 deletions

View File

@@ -170,6 +170,42 @@ services:
- KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
- ALLOW_PLAINTEXT_LISTENER=yes
presto-coordinator-1:
  # Presto coordinator for the demo; its HTTP port 8090 is published on the host.
  image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.217:latest
  container_name: presto-coordinator-1
  hostname: presto-coordinator-1
  # Passed to the image entrypoint as the server role.
  command: coordinator
  ports:
    - "8090:8090"
  environment:
    - "PRESTO_JVM_MAX_HEAP=512M"
    - "PRESTO_QUERY_MAX_MEMORY=1GB"
    # NOTE(review): confirm the three *_PER_NODE values below are consumed by
    # a config template inside the image; otherwise they have no effect.
    - "PRESTO_QUERY_MAX_MEMORY_PER_NODE=256MB"
    - "PRESTO_QUERY_MAX_TOTAL_MEMORY_PER_NODE=384MB"
    - "PRESTO_MEMORY_HEAP_HEADROOM_PER_NODE=100MB"
    - "TERM=xterm"
  links:
    - hivemetastore
presto-worker-1:
  # Presto worker for the demo; started after the coordinator container.
  image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.217:latest
  container_name: presto-worker-1
  hostname: presto-worker-1
  # Passed to the image entrypoint as the server role.
  command: worker
  depends_on:
    - presto-coordinator-1
  environment:
    - "PRESTO_JVM_MAX_HEAP=512M"
    - "PRESTO_QUERY_MAX_MEMORY=1GB"
    # NOTE(review): confirm the three *_PER_NODE values below are consumed by
    # a config template inside the image; otherwise they have no effect.
    - "PRESTO_QUERY_MAX_MEMORY_PER_NODE=256MB"
    - "PRESTO_QUERY_MAX_TOTAL_MEMORY_PER_NODE=384MB"
    - "PRESTO_MEMORY_HEAP_HEADROOM_PER_NODE=100MB"
    - "TERM=xterm"
  links:
    - hivemetastore
    - hiveserver
    - hive-metastore-postgresql
    - namenode
adhoc-1:
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.3.1:latest
hostname: adhoc-1
@@ -187,6 +223,7 @@ services:
- "hiveserver"
- "hive-metastore-postgresql"
- "namenode"
- "presto-coordinator-1"
volumes:
- ${HUDI_WS}:/var/hoodie/ws
@@ -205,6 +242,7 @@ services:
- "hiveserver"
- "hive-metastore-postgresql"
- "namenode"
- "presto-coordinator-1"
volumes:
- ${HUDI_WS}:/var/hoodie/ws

View File

@@ -36,6 +36,7 @@
<module>sparkmaster</module>
<module>sparkworker</module>
<module>sparkadhoc</module>
<module>prestobase</module>
</modules>
<dependencies>
@@ -52,6 +53,7 @@
<docker.spark.version>2.3.1</docker.spark.version>
<docker.hive.version>2.3.3</docker.hive.version>
<docker.hadoop.version>2.8.4</docker.hadoop.version>
<docker.presto.version>0.217</docker.presto.version>
<dockerfile.maven.version>1.4.3</dockerfile.maven.version>
<checkstyle.skip>true</checkstyle.skip>
</properties>

View File

@@ -0,0 +1,69 @@
## Presto docker setup is based on https://github.com/smizy/docker-presto

# Build arguments declared before FROM; HADOOP_VERSION selects the base image.
ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-base:latest AS hadoop-base

ARG PRESTO_VERSION=0.217

# Each ENV is its own instruction so later values can reference earlier ones.
ENV PRESTO_VERSION=${PRESTO_VERSION}
ENV PRESTO_HOME=/opt/presto-server-${PRESTO_VERSION}
ENV PRESTO_CONF_DIR=${PRESTO_HOME}/etc
ENV PRESTO_LOG_DIR=/var/log/presto
# Defaults for the values substituted into the config templates at start-up;
# containers may override them through their environment.
ENV PRESTO_JVM_MAX_HEAP=2G
ENV PRESTO_QUERY_MAX_MEMORY=1GB
ENV PRESTO_QUERY_MAX_MEMORY_PER_NODE=512MB
ENV PRESTO_DISCOVERY_URI=http://presto-coordinator-1:8090
ENV PATH=$PATH:${PRESTO_HOME}/bin

# DEBIAN_FRONTEND is exported for the whole RUN (it is not persisted in the
# image) so that the package installation, not just the index update, runs
# without prompting.
RUN set -x \
    && export DEBIAN_FRONTEND=noninteractive \
    && apt-get -yq update \
    && apt-get -yq install \
        bash \
        less \
        python \
        tar \
        wget \
        ## - hadoop native dependency lib
        bzip2 \
        fts \
        fuse \
        libtirpc1 \
        libsnappy1v5 \
        zip \
        cron \
        gosu \
    && rm -rf /var/lib/apt/lists/* \
    ## presto-server
    && wget -q -O - https://repo1.maven.org/maven2/com/facebook/presto/presto-server/${PRESTO_VERSION}/presto-server-${PRESTO_VERSION}.tar.gz \
        | tar -xzf - -C /opt/ \
    && mkdir -p /var/hoodie/ws/docker/hoodie/hadoop/prestobase/target/ \
    ## presto-client
    && wget -q -O /usr/local/bin/presto https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar \
    && chmod +x /usr/local/bin/presto \
    ## user/dir/permission
    ## --disabled-password/--gecos keep adduser from prompting during the build
    && adduser --disabled-password --gecos "" --shell /sbin/nologin --uid 1000 docker \
    && adduser --disabled-password --gecos "" --shell /sbin/nologin presto \
    && mkdir -p \
        ${PRESTO_CONF_DIR} \
        ${PRESTO_LOG_DIR} \
    && chmod -R 777 \
        ${PRESTO_HOME} \
        ${PRESTO_LOG_DIR} \
    ## cleanup
    && rm -rf /tmp/nativelib

# Configuration templates, the entrypoint/mustache scripts and the mustache
# shell library.
COPY etc/ ${PRESTO_CONF_DIR}/
COPY bin/* /usr/local/bin/
COPY lib/* /usr/local/lib/
RUN chmod +x /usr/local/bin/entrypoint.sh

# Build output of this module (contains hudi-presto-bundle.jar); COPY is used
# because nothing here needs ADD's archive/URL handling.
COPY target/ /var/hoodie/ws/docker/hoodie/hadoop/prestobase/target/
ENV HUDI_PRESTO_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/prestobase/target/hudi-presto-bundle.jar
# Place the Hudi bundle next to the hive-hadoop2 plugin jars.
RUN cp ${HUDI_PRESTO_BUNDLE} ${PRESTO_HOME}/plugin/hive-hadoop2/

VOLUME ["${PRESTO_LOG_DIR}"]
WORKDIR ${PRESTO_HOME}
ENTRYPOINT ["entrypoint.sh"]

View File

@@ -0,0 +1,52 @@
#!/bin/bash
set -eo pipefail
# Poll a TCP endpoint with `nc -z` until it accepts connections.
#   $1 host name (required)
#   $2 port (required)
#   $3 maximum number of attempts (default 100)
#   $4 seconds to sleep between attempts (default 2)
# Returns 0 as soon as the endpoint is reachable; exits the shell with
# status 1 once the attempts are used up.
wait_until() {
  local hostname=${1?}
  local port=${2?}
  local remaining=${3:-100}
  local pause=${4:-2}

  while [ ${remaining} -gt 0 ] ; do
    echo "Waiting until ${hostname}:${port} is up ... with retry count: ${remaining}"
    if nc -z ${hostname} ${port}; then
      # Reachable: nothing more to do.
      return 0
    fi
    remaining=$((remaining-1))
    sleep ${pause}
  done

  # Only reached when every attempt failed (or none was allowed).
  echo "GIVE UP waiting until ${hostname}:${port} is up! "
  exit 1
}
# Create a node id once and keep it in the log directory so that it is reused
# on later starts.
node_id_file="${PRESTO_LOG_DIR}/node.id"
if [ ! -e "${node_id_file}" ]; then
  cat /proc/sys/kernel/random/uuid > "${node_id_file}"
fi
# Assign and export separately so a failing `cat` is not masked by `export`.
PRESTO_NODE_ID=$(cat "${node_id_file}")
export PRESTO_NODE_ID

# apply template: render every *.mustache file next to itself, dropping the
# .mustache suffix from the output name.
for template in "${PRESTO_CONF_DIR}"/*.mustache
do
  # An unmatched glob stays as the literal pattern; skip it.
  [ -e "${template}" ] || continue
  conf_file=${template%.mustache}
  mustache.sh < "${template}" > "${conf_file}"
done

case "$1" in
  "coordinator" | "worker" )
    # Start the Presto launcher as the `presto` user with the role-specific
    # properties file; remaining arguments are forwarded to the launcher.
    server_role="$1"
    shift
    exec gosu presto launcher --config="${PRESTO_CONF_DIR}/${server_role}.properties" "$@" run
    ;;
  *)
    ;;
esac

# Any other invocation runs the given command as-is.
exec "$@"

View File

@@ -0,0 +1,15 @@
#!/bin/sh

# `mustache.sh`, Mustache in POSIX shell.

set -e

# The `mustache` function and its helpers live in the `lib` directory that
# sits beside the `bin` directory holding this script. Work out that
# location in two steps, source the library, then drop the scratch names
# again so they are not visible to templates.
_M_BIN_DIR="$(dirname "$0")"
_M_ROOT_DIR="$(dirname "$_M_BIN_DIR")"
. "$_M_ROOT_DIR/lib/mustache.sh"
unset _M_BIN_DIR _M_ROOT_DIR

# Render standard input to standard output. This program is named
# `mustache.sh`(1) rather than `mustache`(1) because it takes neither the
# `--compile` / `--tokens` options nor input files as arguments.
mustache

View File

@@ -0,0 +1,18 @@
# Catalog definition that uses the hive-hadoop2 connector.
connector.name=hive-hadoop2
# Metastore cache, refresh and timeout settings.
hive.metastore-cache-ttl=1s
hive.metastore-refresh-interval=1m
hive.metastore-timeout=20s
# Thrift endpoint of the metastore; `hivemetastore` is the host name the
# Presto services link to in the compose file.
hive.metastore.uri=thrift://hivemetastore:9083
hive.storage-format=PARQUET
hive.parquet.use-column-names=true
hive.max-split-size=128MB
hive.assume-canonical-partition-keys=true
hive.recursive-directories=true
# Hadoop client configuration files handed to the connector.
hive.config.resources=/etc/hadoop/core-site.xml,/etc/hadoop/hdfs-site.xml
# HDFS is accessed without authentication or impersonation.
hive.hdfs.authentication.type=NONE
hive.hdfs.impersonation.enabled=false
hive.bucket-execution=false
hive.table-statistics-enabled=true
hive.max-partitions-per-writers=3000
hive.split-loader-concurrency=1
hive.orc.bloom-filters.enabled=true

View File

@@ -0,0 +1,4 @@
# Catalog definition that uses the jmx connector.
connector.name=jmx
# MBeans to dump, how often, and how many entries to keep.
jmx.dump-tables=java.lang:type=Runtime,com.facebook.presto.execution.scheduler:name=NodeScheduler
jmx.dump-period=10s
jmx.max-entries=86400

View File

@@ -0,0 +1,3 @@
# Catalog definition that uses the localfile connector.
connector.name=localfile
# HTTP request log location and file-name pattern; /var/log/presto is the
# value the image assigns to PRESTO_LOG_DIR.
presto-logs.http-request-log.location=/var/log/presto
presto-logs.http-request-log.pattern=http-request.*

View File

@@ -0,0 +1,6 @@
# Coordinator role settings. Placeholders in double curly braces are replaced
# at container start with the environment variable of the same name.
coordinator=true
node-scheduler.include-coordinator=false
http-server.http.port=8090
query.max-memory={{PRESTO_QUERY_MAX_MEMORY}}
# This node also runs the discovery server.
discovery-server.enabled=true
discovery.uri={{PRESTO_DISCOVERY_URI}}

View File

@@ -0,0 +1,9 @@
-server
-Xmx{{PRESTO_JVM_MAX_HEAP}}
-XX:+UseG1GC
-XX:G1HeapRegionSize=32M
-XX:+UseGCOverheadLimit
-XX:+ExplicitGCInvokesConcurrent
-XX:+HeapDumpOnOutOfMemoryError
-XX:OnOutOfMemoryError=kill -9 %p
-DHADOOP_USER_NAME=hive

View File

@@ -0,0 +1 @@
# Log level for the com.facebook.presto logger.
com.facebook.presto=INFO

View File

@@ -0,0 +1,3 @@
# Per-node settings. The node id placeholder is filled from PRESTO_NODE_ID,
# which the entrypoint reads from node.id in the log directory; the data
# directory is the same log directory.
node.environment=production
node.id={{PRESTO_NODE_ID}}
node.data-dir={{PRESTO_LOG_DIR}}

View File

@@ -0,0 +1,4 @@
# Worker role settings. Placeholders in double curly braces are replaced at
# container start with the environment variable of the same name.
coordinator=false
http-server.http.port=8090
query.max-memory={{PRESTO_QUERY_MAX_MEMORY}}
discovery.uri={{PRESTO_DISCOVERY_URI}}

View File

@@ -0,0 +1,252 @@
# `mustache.sh`, Mustache in POSIX shell.

set -e

# File descriptor 3 carries debug output: it is pointed at standard error
# when `MUSTACHE_DEBUG` is non-empty and discarded otherwise.
if [ -n "$MUSTACHE_DEBUG" ]
then exec 3>&2
else exec 3>/dev/null
fi

# File descriptor 4 is the sink for literal and variable output of
# (inverted) sections whose condition is not met, so that output never
# reaches standard output.
exec 4>/dev/null

# File descriptor 5 is reserved for capturing input during list processing.
exec 5>/dev/null
# Consume standard input one character at a time to render `mustache`(5)
# templates with data from the environment.
#
# Reads the template from standard input and writes the rendered result to
# standard output. The caller's `IFS` (including the "unset" state) is
# restored before returning, and the exit status of the rendering pipeline
# is returned.
mustache() {

	# Initialize the file descriptor to be used to emit characters.  At
	# times this value will be 4 to send output to `/dev/null`.
	_M_FD=1

	# Remember the caller's IFS so it can be put back afterwards.  An unset
	# IFS is distinct from an empty one, so track that separately.
	if [ -n "${IFS+set}" ]
	then _M_IFS_WAS_SET=1 _M_SAVED_IFS="$IFS"
	else _M_IFS_WAS_SET=""
	fi

	# IFS must only contain '\n' so as to be able to read space and tab
	# characters from standard input one-at-a-time.  The easiest way to
	# convince it to actually contain the correct byte, and only the
	# correct byte, is to use a single-quoted literal newline.
	IFS='
'

	# Consuming standard input one character at a time is quite a feat
	# within the confines of POSIX shell.  Bash's `read` builtin has
	# `-n` for limiting the number of characters consumed.  Here it is
	# faked using `sed`(1) to place each character on its own line.
	# The subtlety is that real newline characters are chomped so they
	# must be indirectly detected by checking for zero-length
	# characters, which is done as the character is emitted.
	_mustache_sed | _mustache
	_M_RC="$?"

	# Put IFS back exactly as it was found.
	if [ -n "$_M_IFS_WAS_SET" ]
	then IFS="$_M_SAVED_IFS"
	else unset IFS
	fi

	return "$_M_RC"

}
# Process the one-character-per-line stream from `sed` via a state machine.
# This function will be called recursively in subshell environments to
# isolate nested section tags from the outside environment.
#
# Input:  standard input, one character per line (an empty line stands for
#         a newline in the template).
# Output: literal characters are written to file descriptor `$_M_FD`; every
#         character read is also echoed to file descriptor 5, and a trace
#         line is written to file descriptor 3.
# State:  reads and writes the globals `_M_STATE`, `_M_C`, `_M_PREV_C`,
#         `_M_TAG` and `_M_TAG_TYPE`; completed tags are handed to
#         `_mustache_tag`.
_mustache() {
# Always start by assuming a character is a literal.
_M_STATE="literal"
# The `read` builtin consumes one line at a time but by now each line
# contains only a single character.
while read _M_C
do
echo " _M_C: $_M_C (${#_M_C}), _M_STATE: $_M_STATE" >&3
echo "$_M_C" >&5
case "$_M_STATE" in
# Consume a single character literal. In the event this
# character and the previous character have been opening
# braces, progress to the "tag" state and initialize the
# tag name to the empty string (this invariant is relied
# on by the "tag" state). If this is the first opening
# brace, wait and see. Otherwise, emit this character.
"literal")
if [ -z "$_M_PREV_C" ]
then
case "$_M_C" in
"{") ;;
"") echo;;
*) printf "%s" "$_M_C";;
esac
else
case "$_M_PREV_C$_M_C" in
"{{") _M_STATE="tag" _M_TAG="";;
?"{") ;;
*)
[ "$_M_PREV_C" = "{" ] && printf "%s" "{"
[ -z "$_M_C" ] && echo || printf "%s" "$_M_C";;
esac
# Everything emitted by the branches above goes to the currently selected
# output descriptor.
fi >&$_M_FD;;
# Consume the tag type and tag.
"tag")
case "$_M_PREV_C$_M_C" in
# A third opening brace in a row could be treated as
# a literal and the beginning of tag, as it is here,
# or as the beginning of a tag which begins with an
# opening brace.
"{{") printf "{" >&$_M_FD;;
# Note the type of this tag, defaulting to "variable".
"{#"|"{^"|"{/"|"{!"|"{>") _M_TAG_TYPE="$_M_C" _M_TAG="";;
# A variable tag must note the first character of the
# variable name. Since it's possible that an opening
# brace comes in the middle of the tag, check that
# this is indeed the beginning of the tag.
"{"?)
if [ -z "$_M_TAG" ]
then
_M_TAG_TYPE="variable" _M_TAG="$_M_C"
fi;;
# Two closing braces in a row closes the tag. The
# state resets to "literal" and the tag is processed,
# possibly in a subshell.
"}}")
_M_STATE="literal"
_mustache_tag;;
# A single closing brace is ignored at first.
?"}") ;;
# If the variable continues, the closing brace becomes
# part of the variable name.
"}"?) _M_TAG="$_M_TAG}";;
# Any other character becomes part of the variable name.
*) _M_TAG="$_M_TAG$_M_C";;
esac;;
esac
# This character becomes the previous character.
_M_PREV_C="$_M_C"
done
}
# Paper over different versions of cat: copy standard input to standard
# output with `cat -A`, or with `cat -e` when probing `cat -A` exits 1.
_mustache_cat() {
	# Probe with errexit suspended so a failing probe cannot end the shell;
	# the probe's status is kept in `_M_STATUS`.
	set +e
	cat -A <"/dev/null" >"/dev/null" 2>&1
	_M_STATUS="$?"
	set -e
	case "$_M_STATUS" in
		1) cat -e;;
		*) cat -A;;
	esac
}
# Execute a tag surrounded by backticks.  The arguments are joined into one
# string, one leading and one trailing backtick are removed, and the rest is
# run with `sh -c`.
_mustache_cmd() {
	_M_CMD="$*"
	_M_CMD=${_M_CMD#\`}
	_M_CMD=${_M_CMD%\`}
	sh -c "$_M_CMD"
}
# Report a fatal error and terminate.  All arguments are joined into one
# message, prefixed with `mustache.sh: `, written to standard error, and the
# shell exits with status 1.
_mustache_die() {
	>&2 echo "mustache.sh: $*"
	exit 1
}
# Paper over differences between GNU sed and BSD sed.  Rewrites standard
# input so every character is followed by a newline and every backslash is
# doubled; `sed -E` is used when probing `sed -r` exits 1, `sed -r` otherwise.
_mustache_sed() {
	# A literal newline for the `sed -E` replacement text.
	_M_NEWLINE="
"
	# Probe with errexit suspended; the probe's status is kept in
	# `_M_STATUS`.
	set +e
	sed -r <"/dev/null" >"/dev/null" 2>&1
	_M_STATUS="$?"
	set -e
	case "$_M_STATUS" in
		1) sed -E "s/./&\\$_M_NEWLINE/g; s/\\\\/\\\\\\\\/g";;
		*) sed -r "s/./&\\n/g; s/\\\\/\\\\\\\\/g";;
	esac
}
# Process a complete tag. Variables are emitted, sections are recursed
# into, comments are ignored, and (for now) partials raise an error.
#
# Takes no arguments; works on the globals `_M_TAG_TYPE` (tag kind) and
# `_M_TAG` (tag text) set by `_mustache`. Output goes to file descriptor
# `$_M_FD`, which is switched to 4 while a section is suppressed and reset
# to 1 once the section has been handled.
#
# NOTE(review): the tag text is interpolated into `eval` and, for
# backtick-wrapped tags, run through `sh -c`, so templates must come from a
# trusted source.
_mustache_tag() {
case "$_M_TAG_TYPE" in
# Variable tags expand to the value of an environment variable
# or the empty string if the environment variable is unset.
#
# If the tag is surrounded by backticks, execute it as a shell
# command, instead, using standard output as its value.
#
# Since the variable tag has been completely consumed, return
# to the assumption that everything's a literal until proven
# otherwise for this character.
"variable")
case "$_M_TAG" in
"\`"*"\`") _mustache_cmd "$_M_TAG";;
*) eval printf "%s" "\"\$$_M_TAG\"";;
esac >&$_M_FD;;
# Section tags expand to the expanded value of the section's
# literals and tags if and only if the section tag is in the
# environment and non-empty. Inverted section tags expand
# if the section tag is empty or unset in the environment.
#
# If the tag is surrounded by backticks, execute it as a shell
# command, instead, and process the section once for each line
# of standard output (made available as `_M_LINE`).
#
# Sections not being expanded are redirected to `/dev/null`.
"#"|"^")
echo " # _M_TAG: $_M_TAG" >&3
_M_TAG_V="$(eval printf "%s" "\"\$$_M_TAG\"")"
case "$_M_TAG_TYPE" in
"#") [ -z "$_M_TAG_V" ] && _M_FD=4;;
"^") [ -n "$_M_TAG_V" ] && _M_FD=4;;
esac
case "$_M_TAG" in
"\`"*"\`")
# Capture the section's raw characters (echoed on fd 5 by `_mustache`)
# while sending its rendered output to fd 4, then replay the capture once
# per line produced by the command.
_M_CAPTURE="$(_M_SECTION_TAG="$_M_TAG" _mustache 5>&1 >&4)"
echo " _M_CAPTURE: $_M_CAPTURE" | _mustache_cat >&3
_mustache_cmd "$_M_TAG" | while read _M_LINE
do
echo " _M_LINE: $_M_LINE" >&3
(
_M_SECTION_TAG="$_M_TAG"
echo "$_M_CAPTURE" | _mustache
)
done;;
*)
# Render the section body in a subshell so its state changes stay local.
(
_M_SECTION_TAG="$_M_TAG"
_mustache
);;
esac
_M_FD=1;;
# Closing tags for (inverted) sections must match the expected
# tag name. Any redirections made when the (inverted) section
# opened are reset when the section closes.
"/")
echo " / _M_TAG: $_M_TAG, _M_SECTION_TAG: $_M_SECTION_TAG" >&3
if [ "$_M_TAG" != "$_M_SECTION_TAG" ]
then
_mustache_die "mismatched closing tag $_M_TAG," \
"expected $_M_SECTION_TAG"
fi
# Leave the current shell; sections are rendered in subshells above, so
# this ends the rendering of the section being closed.
exit;;
# Comments do nothing.
"!") ;;
# TODO Partials.
">") _mustache_die "{{>$_M_TAG}} syntax not implemented";;
esac
}

View File

@@ -0,0 +1,106 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~ http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <artifactId>hudi-hadoop-docker</artifactId>
    <groupId>org.apache.hudi</groupId>
    <version>0.5.0-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>
  <packaging>pom</packaging>
  <artifactId>hudi-hadoop-presto-docker</artifactId>

  <description>Base Docker Image with Hoodie</description>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <checkstyle.skip>true</checkstyle.skip>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-hadoop-base-docker</artifactId>
      <version>${project.version}</version>
      <type>pom</type>
      <scope>import</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Copy the Presto bundle jar into target/ under a version-free name
           so the Dockerfile can refer to a fixed file name. -->
      <plugin>
        <artifactId>maven-antrun-plugin</artifactId>
        <version>1.7</version>
        <executions>
          <execution>
            <phase>package</phase>
            <configuration>
              <!-- "target" replaces the deprecated "tasks" parameter. -->
              <target>
                <copy file="${project.basedir}/../../../../packaging/hudi-presto-bundle/target/hudi-presto-bundle-${project.version}.jar" tofile="target/hudi-presto-bundle.jar" />
              </target>
            </configuration>
            <goals>
              <goal>run</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <!-- Build Docker image -->
      <plugin>
        <groupId>com.spotify</groupId>
        <artifactId>dockerfile-maven-plugin</artifactId>
        <version>${dockerfile.maven.version}</version>
        <executions>
          <!-- Build and tag the image as "latest". -->
          <execution>
            <id>tag-latest</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>apachehudi/hudi-hadoop_${docker.hadoop.version}-prestobase_${docker.presto.version}</repository>
              <forceTags>true</forceTags>
              <tag>latest</tag>
            </configuration>
          </execution>
          <!-- Build and tag the image with the project version. -->
          <execution>
            <id>tag-version</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>apachehudi/hudi-hadoop_${docker.hadoop.version}-prestobase_${docker.presto.version}</repository>
              <forceTags>true</forceTags>
              <tag>${project.version}</tag>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>

View File

@@ -21,10 +21,16 @@ ARG HIVE_VERSION=2.3.3
ARG SPARK_VERSION=2.3.1
FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}
ARG PRESTO_VERSION=0.217
COPY adhoc.sh /opt/spark
ENV SPARK_WORKER_WEBUI_PORT 8081
ENV SPARK_WORKER_LOG /spark/logs
ENV SPARK_MASTER "spark://spark-master:7077"
ENV PRESTO_VERSION ${PRESTO_VERSION}
RUN set -x \
## presto-client
&& wget -q -O /usr/local/bin/presto https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar \
&& chmod +x /usr/local/bin/presto
CMD ["/bin/bash", "/opt/spark/adhoc.sh"]

View File

@@ -24,8 +24,10 @@
export SPARK_HOME=/opt/spark
export PRESTO_CLI_CMD="/usr/local/bin/presto --server presto-coordinator-1"
date
echo "SPARK HOME is : $SPARK_HOME"
echo "PRESTO CLI CMD is : $PRESTO_CLI_CMD"
tail -f /dev/null