From 29020d0de04d96ed22588d8fa1d4bb50b57e1700 Mon Sep 17 00:00:00 2001
From: Philip Yang
Date: Sat, 14 Oct 2017 16:06:34 -0700
Subject: [PATCH 1/5] docker image for spark

---
 docker/Dockerfile    | 58 ++++++++++++++++++++++++++++++++++++++++++++
 docker/entrypoint.sh | 11 +++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 docker/Dockerfile
 create mode 100755 docker/entrypoint.sh

diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 00000000..81037d8c
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,58 @@
+FROM ubuntu:16.04
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        libfreetype6-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python \
+        python-dev \
+        rsync \
+        software-properties-common \
+        unzip \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN rm /bin/sh && ln -s /bin/bash /bin/sh
+
+# Please don't set DEBIAN_FRONTEND.
+# Check out "Why is DEBIAN_FRONTEND=noninteractive discouraged in Dockerfiles?"
+# from https://docs.docker.com/engine/faq/ for details.
+# ENV DEBIAN_FRONTEND noninteractive
+
+RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py
+
+RUN pip --no-cache-dir install \
+        Pillow \
+        h5py \
+        numpy \
+        pandas \
+        scipy \
+        sklearn \
+        ipython \
+        pyspark
+
+# Install the TensorFlow CPU version from the nightly CI builds
+RUN pip --no-cache-dir install \
+    'http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.head-cp27-cp27mu-linux_x86_64.whl'
+
+ENV SPARK_WORKER_PORT 8888
+
+EXPOSE 4040 6066 7077 8080 8888
+
+# Avoid the default Docker behavior of mapping our IP address to an unreachable host name
+
+RUN mkdir -p /phi9t
+COPY entrypoint.sh /phi9t/.
+
+WORKDIR /workspace
+
+ENTRYPOINT ["/phi9t/entrypoint.sh"]
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
new file mode 100755
index 00000000..38698cb9
--- /dev/null
+++ b/docker/entrypoint.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+IP_ADDR="$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }')"
+echo "Container IP Address: $IP_ADDR"
+#export MASTER="spark://${IP_ADDR}:7077"
+export SPARK_LOCAL_IP="${IP_ADDR}"
+export SPARK_PUBLIC_DNS="${IP_ADDR}"
+
+umount /etc/hosts
+
+exec ipython -i $@
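
[Usage sketch, not part of the patch] A minimal way to try the image above, assuming it is built from the repository root; the tag phi9t/spark-dl is an arbitrary placeholder, and the umount /etc/hosts step in entrypoint.sh may need elevated privileges depending on the Docker setup. Any arguments after the image name are forwarded to the ipython entrypoint.

    # build the image (Dockerfile and entrypoint.sh live under docker/)
    docker build -t phi9t/spark-dl docker/
    # run it interactively; the published ports mirror the EXPOSE list in the Dockerfile
    docker run -it --rm --privileged \
        -p 4040:4040 -p 7077:7077 -p 8080:8080 -p 8888:8888 \
        phi9t/spark-dl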

From 0b7b06b86c322a429ae4729050591a2ec23f4e55 Mon Sep 17 00:00:00 2001
From: Philip Yang
Date: Sat, 14 Oct 2017 16:12:51 -0700
Subject: [PATCH 2/5] docs

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index fa4a16d9..4f751912 100644
--- a/README.md
+++ b/README.md
@@ -244,4 +244,9 @@ registerKerasImageUDF("my_keras_inception_udf", InceptionV3(weights="imagenet"),
 ### Estimator
 
 ## Releases:
+* 0.2.0 Feature
+  - TFTransformer
+  - KerasEstimator
+  - TFImage Applications
+  - Developer tools
 * 0.1.0 initial release

From 120c00cb8bda578b60a0c97c8a6ddd577aebd94e Mon Sep 17 00:00:00 2001
From: Philip Yang
Date: Sat, 14 Oct 2017 16:34:42 -0700
Subject: [PATCH 3/5] generate pyspark scripts

---
 bin/totgen.sh | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 246 insertions(+)
 create mode 100755 bin/totgen.sh

diff --git a/bin/totgen.sh b/bin/totgen.sh
new file mode 100755
index 00000000..14bbd81e
--- /dev/null
+++ b/bin/totgen.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+
+#######################################################
+# Create the list of necessary environment variables
+scala_major_ver=2.11
+package_name="sparkdl"
+######################################################
+
+set -eu
+
+# The project root directory (one level above this script's directory).
+_bsd_="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.."
+
+function log_info { >&2 echo "$(tput setaf 6)INFO: $@$(tput sgr0)"; }
+function quit_with { >&2 echo "$(tput setaf 1)ERROR: $@$(tput sgr0)"; exit 1; }
+
+[[ -n "${SPARK_HOME:-}" ]] || \
+    quit_with "must provide Spark home"
+
+host_spark_home="${SPARK_HOME:-}"
+
+# Generate required classpaths
+[[ -x "${_bsd_}/sbt" ]] || \
+    quit_with "cannot locate runnable project sbt executable"
+
+sbt_path_root="${_bsd_}/.sbt.paths"
+function tr_classpath { perl -pe "s@~@$HOME@g" "${sbt_path_root}/$@"; }
+function mk_classpath { local IFS=':'; echo "$*"; }
+
+# Spark packages are cached in the local ivy cache
+(cd "${_bsd_}"
+ #./sbt genClasspath assembly
+ #./sbt genClasspath spPackage
+ ./sbt genClasspath spDist
+ cd "${sbt_path_root}"
+ rm -f SPARK_PACKAGE_PYREQ && touch $_
+ for spkg in $(cat SBT_SPARK_PACKAGE_CLASSPATH | tr ':' '\n'); do
+     log_info "[py-deps]: ${spkg}"
+     printf "\n# BEGIN: $(basename $spkg)\n" >> SPARK_PACKAGE_PYREQ
+     spkg_pyreq="$(jar -tf "${spkg}" | grep -Ei 'requirements.txt' || echo 'none')"
+     if [[ "none" != "${spkg_pyreq}" ]]; then
+         unzip -p "${spkg}" "${spkg_pyreq}" >> SPARK_PACKAGE_PYREQ
+     else
+         log_info "didn't detect requirements.txt"
+     fi
+     printf "\n# END: $(basename $spkg)\n" >> SPARK_PACKAGE_PYREQ
+ done
+ perl -pe "s@${HOME}@~@g" SBT_SPARK_PACKAGE_CLASSPATH > SPARK_PACKAGE_CLASSPATH
+ perl -pe "s@${HOME}@~@g" SBT_RUNTIME_CLASSPATH > SPARK_RUNTIME_EXTRA_CLASSPATH
+ rm -f SBT_*_CLASSPATH
+)
+
+# Find the local assembly jar, if any
+_sbt_target_path="${_bsd_}/target/scala-${scala_major_ver}"
+#assembly_jar="$(find ${_sbt_target_path} -name "*-assembly*.jar" -type f | uniq)"
+spkg_zip="$(find ${_sbt_target_path}/.. -name "*${scala_major_ver}.zip" -type f | uniq)"
-name "*${scala_major_ver}.zip" -type f | uniq)" + +# Setup python paths +_proj_pypath="${_bsd_}"/python +_spark_pypath="${SPARK_HOME}"/python:"$(find "${SPARK_HOME}"/python/lib/ -name 'py4j-*-src.zip' -type f | uniq)" +_spark_pkg_path="$(tr_classpath SPARK_PACKAGE_CLASSPATH)" +_spark_pkg_pyreq="${sbt_path_root}/SPARK_PACKAGE_PYREQ" + +# Notice that spark submit requires colons +#_spark_pkg_submit_common="${assembly_jar},${_spark_pkg_path}" +#_spark_pkg_submit_common="${_spark_pkg_path}" +_spark_pkg_submit_common="${spkg_zip},${_spark_pkg_path}" + +_submit_py_files="${_spark_pkg_submit_common}" +_submit_jars="${_spark_pkg_submit_common}" + +log_info "[spark submit] --jars ${_submit_jars}" +log_info "[spark submit] --py-files ${_submit_py_files}" + +# Provide required python and jar files for Spark testing cluster +# Create individial scripts + +####################################################### +# PySpark +####################################################### + +SCPT="${_bsd_}/.EXEC_SCRIPT" + +function SCPT_BEGIN { + rm -f "${SCPT}" && touch $_ && chmod +x $_ + cat << '_SCPT_HEADER_EOF_' >> "${SCPT}" +#!/usr/bin/env bash +##%%---- +## Generated automatically +##%%---- + +_bsd_="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +function quit_with { >&2 echo "ERROR: $@"; exit 1; } +function check_vars { + for _varname in ${@}; do + local _var="$(eval "echo \$${_varname}")" + [[ -n "${_var}" ]] || quit_with "${_varname} not defined" + done +} + +_SCPT_HEADER_EOF_ + +cat << _SCPT_HEADER_VAR_EOF_ >> "${SCPT}" +##%%---- +# Global variables + +spark_pkg_path="${_spark_pkg_path}" + +##%%---- +_SCPT_HEADER_VAR_EOF_ +} + +function SCPT_PYSPARK_BODY { + cat << _SCPT_LOCAL_VAR_EOF_ >> "${SCPT}" +##%%---- +# Local variables + +_proj_pypath="${_proj_pypath}" +_spark_pypath="${_spark_pypath}" +_spark_pkg_pypath="${_spark_pkg_path}" +_submit_jars="${_submit_jars}" +_submit_py_files="${_submit_py_files}" + +##%%---- +_SCPT_LOCAL_VAR_EOF_ + + cat << '_EXEC_SCRIPT_EOF_' >> "${SCPT}" +check_vars _py _ipy _pyspark + +_local_pypath="$(${_py} -c 'import site; print(site.USER_SITE)')" + +export PYSPARK_PYTHON=${_py} +export PYSPARK_DRIVER_PYTHON=${_ipy} +export PYSPARK_DRIVER_PYTHON_OPTS="-i --simple-prompt --pprint" +# We should only be using the assembly to make sure consistency +# REPL based development could always go and reload pieces +export PYTHONPATH="${_proj_pypath}:${_local_pypath}:${_spark_pypath}:${_spark_pkg_pypath}" +#export PYTHONPATH="${_local_pypath}:${_spark_pypath}:${_spark_pkg_pypath}" + +exec "${_pyspark}" \ + --master "local[4]" \ + --conf spark.app.name="[drgscl]::pyspark" \ + --conf spark.eventLog.enabled=false \ + --conf spark.driver.memory=10g \ + --conf spark.executor.memory=10g \ + --py-files "${_submit_py_files}" \ + --jars "${_submit_jars}" \ + --verbose \ + $@ + +_EXEC_SCRIPT_EOF_ +} + + +function SCPT_SPARK_SHELL_BODY { + cat << '_EXEC_SCRIPT_EOF_' >> "${SCPT}" +check_vars _spark_shell +check_vars _submit_jars + +exec "${_spark_shell}" \ + --master "local[4]" \ + --conf spark.app.name="[drgscl]::spark-shell" \ + --conf spark.eventLog.enabled=false \ + --jars "${_submit_jars}" \ + $@ + +_EXEC_SCRIPT_EOF_ +} + +####################################################### +# Documentation +####################################################### + +function gen_jekyll { + SCPT_BEGIN + cat << _SCPT_LOCAL_VAR_EOF_ >> "${SCPT}" +##%%---- +# Local variables + +this_package="${package_name}" +_proj_pypath="${_proj_pypath}" +_spark_pypath="${_spark_pypath}" 

From 49b9ea439f57d0198f2052754abacff70acc3998 Mon Sep 17 00:00:00 2001
From: Philip Yang
Date: Sat, 14 Oct 2017 16:36:14 -0700
Subject: [PATCH 4/5] tools

---
 bin/totgen.sh                    |  3 ++
 build.sbt                        | 16 ++-----
 linter.sh                        | 14 ++++++
 project/GenClasspathPlugin.scala | 78 ++++++++++++++++++++++++++++++++
 project/LibDeps.scala            | 26 +++++++++++
 prospector.yaml                  | 39 ++++++++++++++++
 6 files changed, 163 insertions(+), 13 deletions(-)
 create mode 100755 linter.sh
 create mode 100644 project/GenClasspathPlugin.scala
 create mode 100644 project/LibDeps.scala
 create mode 100644 prospector.yaml

diff --git a/bin/totgen.sh b/bin/totgen.sh
index 14bbd81e..06c7b454 100755
--- a/bin/totgen.sh
+++ b/bin/totgen.sh
@@ -162,7 +162,10 @@ exec "${_spark_shell}" \
   --master "local[4]" \
   --conf spark.app.name="[drgscl]::spark-shell" \
   --conf spark.eventLog.enabled=false \
+  --conf spark.driver.memory=10g \
+  --conf spark.executor.memory=10g \
   --jars "${_submit_jars}" \
+  --verbose \
   $@
 
 _EXEC_SCRIPT_EOF_
diff --git a/build.sbt b/build.sbt
index 95bc6043..bc04b090 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,16 +1,6 @@
 // Your sbt build file. Guides on how to write one can be found at
 // http://www.scala-sbt.org/0.13/docs/index.html
-
-val sparkVer = sys.props.getOrElse("spark.version", "2.1.1")
-val sparkBranch = sparkVer.substring(0, 3)
-val defaultScalaVer = sparkBranch match {
-  case "2.0" => "2.11.8"
-  case "2.1" => "2.11.8"
-  case "2.2" => "2.11.8"
-  case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.")
-}
-val scalaVer = sys.props.getOrElse("scala.version", defaultScalaVer)
-val scalaMajorVersion = scalaVer.substring(0, scalaVer.indexOf(".", scalaVer.indexOf(".") + 1))
+import libdeps.LibVers._
 
 sparkVersion := sparkVer
 
@@ -19,7 +9,7 @@ scalaVersion := scalaVer
 spName := "databricks/spark-deep-learning"
 
 // Don't forget to set the version
-version := s"0.1.0-spark$sparkBranch"
+version := s"0.2.0-spark$sparkBranch"
 
 // All Spark Packages need a license
 licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
@@ -34,7 +24,7 @@ sparkComponents ++= Seq("mllib-local", "mllib", "sql")
 
 // add any Spark Package dependencies using spDependencies.
 // e.g. spDependencies += "databricks/spark-avro:0.1"
-spDependencies += s"databricks/tensorframes:0.2.9-s_${scalaMajorVersion}"
+spDependencies += s"databricks/tensorframes:0.2.9-s_${scalaMajorVer}"
 
 // These versions are ancient, but they cross-compile around scala 2.10 and 2.11.
 // Update them when dropping support for scala 2.10
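
[Usage sketch, not part of the patch] The version vals removed from build.sbt above now live in project/LibDeps.scala (added later in this patch) and are still read from JVM system properties, so they can be overridden on the command line, assuming the ./sbt launcher forwards -D options to the JVM; spDist is the sbt-spark-package task already used by bin/totgen.sh.

    ./sbt spDist                                                # defaults: Spark 2.2.0, Scala 2.11.8
    ./sbt -Dspark.version=2.1.1 -Dscala.version=2.11.8 spDist   # build against another supported combination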
diff --git a/linter.sh b/linter.sh
new file mode 100755
index 00000000..1e51a7c6
--- /dev/null
+++ b/linter.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+_bsd_="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+if [[ $# -gt 1 ]]; then
+    target_files=(${@})
+else
+    target_files=($(git diff --name-only upstream/master HEAD))
+fi
+
+echo "${target_files[@]}"
+pushd "${_bsd_}"
+exec prospector --profile ${_bsd_}/prospector.yaml "${target_files[@]}"
+popd
diff --git a/project/GenClasspathPlugin.scala b/project/GenClasspathPlugin.scala
new file mode 100644
index 00000000..4f9ecb3d
--- /dev/null
+++ b/project/GenClasspathPlugin.scala
@@ -0,0 +1,78 @@
+package sbtgenclasspath
+
+import sbt._, Keys._
+import sbtsparkpackage.SparkPackagePlugin.autoImport._
+import libdeps.LibVers._
+
+object GenClasspathPlugin extends sbt.AutoPlugin {
+
+  object autoImport {
+
+    lazy val genClasspath = taskKey[Unit]("Build runnable script with classpath")
+    lazy val extraSparkSubmitModules = settingKey[Seq[ModuleID]]("Additional spark submit jar dependencies")
+
+    lazy val genClasspathSettings: Seq[Def.Setting[_]] = Seq(
+
+      extraSparkSubmitModules := Seq.empty[ModuleID],
+
+      genClasspath := {
+        import java.io.PrintWriter
+
+        val sbtPathRoot = baseDirectory.value / ".sbt.paths"
+        sbtPathRoot.mkdirs()
+
+        def writeClasspath(cpType: String)(R: => String): Unit = {
+          val fout = new PrintWriter((sbtPathRoot / s"SBT_${cpType}_CLASSPATH").toString)
+          println(s"Building ${cpType} classpath for current project")
+          try fout.write(R) finally fout.close()
+        }
+
+        writeClasspath("RUNTIME") {
+          (fullClasspath in Runtime).value.files.map(_.toString).mkString(":")
+        }
+
+        writeClasspath("SPARK_PACKAGE") {
+          import scala.util.matching.Regex
+          val patt = s"(.+?)/(.+?):(.+?)(-s_${scalaMajorVer})?".r
+          val pkgs = (spDependencies.value).map { _ match {
+            case patt(orgName, pkgName, pkgVer, stem, _*) =>
+              if (null != stem) {
+                println(s"org ${orgName}, pkg ${pkgName}, ver ${pkgVer}, ${stem}")
+                s"${pkgName}-${pkgVer}${stem}.jar"
+              } else {
+                println(s"org ${orgName}, pkg ${pkgName}, ver ${pkgVer}")
+                s"${pkgName}-${pkgVer}.jar"
+              }
+          }}.toSet
+
+          // TODO: lacking a better approach, fall back to regex matching
+          val extraSpModIds = (extraSparkSubmitModules in Compile).value.flatMap { mod =>
+            // e.g. "com.typesafe.scala-logging:scala-logging-api:2.1.2"
+            //  =>  scala-logging-api_2.11-2.1.2.jar
+            val patt = s"(.+?):(.+?):(.+?)".r
+            mod.toString match {
+              case patt(orgName, pkgName, pkgVer) =>
+                Seq(s"${pkgName}_${scalaMajorVer}-${pkgVer}.jar", s"${pkgName}-${pkgVer}.jar")
+            }
+          }.toSet
+
+          (fullClasspath in Compile).value.files.filter { cpFile =>
+            val cpName = cpFile.getName
+            println(cpName)
+            (pkgs contains cpName) || (extraSpModIds contains cpName)
+          }.map(_.toString).mkString(":")
+        }
+      }
+    )
+  }
+  import autoImport._
+
+  override def requires = sbt.plugins.JvmPlugin
+
+  // This plugin is automatically enabled for projects that enable the JvmPlugin.
+  override def trigger = allRequirements
+
+  // A group of settings that are automatically added to projects.
+  override val projectSettings =
+    inConfig(Compile)(genClasspathSettings) ++ inConfig(Test)(genClasspathSettings)
+}
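
[Usage sketch, not part of the patch] The genClasspath task can also be exercised on its own (bin/totgen.sh normally drives it); the file names follow the SBT_${cpType}_CLASSPATH pattern the plugin writes under .sbt.paths/, and extra submit-time jars can be declared through the extraSparkSubmitModules setting above.

    ./sbt genClasspath
    ls .sbt.paths/
    # SBT_RUNTIME_CLASSPATH  SBT_SPARK_PACKAGE_CLASSPATH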
diff --git a/project/LibDeps.scala b/project/LibDeps.scala
new file mode 100644
index 00000000..93b578c3
--- /dev/null
+++ b/project/LibDeps.scala
@@ -0,0 +1,26 @@
+package libdeps
+
+/**
+  ======================================================
+  * Build parameters
+  ======================================================
+  */
+object LibVers {
+
+  lazy val sparkVer = sys.props.getOrElse("spark.version", "2.2.0")
+  lazy val sparkBranch = sparkVer.substring(0, 3)
+  lazy val defaultScalaVer = sparkBranch match {
+    case "2.0" => "2.11.8"
+    case "2.1" => "2.11.8"
+    case "2.2" => "2.11.8"
+    case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.")
+  }
+
+  lazy val scalaVer = sys.props.getOrElse("scala.version", defaultScalaVer)
+  lazy val scalaMajorVer = scalaVer.substring(0, scalaVer.indexOf(".", scalaVer.indexOf(".") + 1))
+
+  lazy val defaultScalaTestVer = scalaVer match {
+    case s if s.startsWith("2.10") => "2.0"
+    case s if s.startsWith("2.11") => "2.2.6"  // scalatest_2.11 does not have 2.0 published
+  }
+}
diff --git a/prospector.yaml b/prospector.yaml
new file mode 100644
index 00000000..d8ffa3c7
--- /dev/null
+++ b/prospector.yaml
@@ -0,0 +1,39 @@
+strictness: high
+test-warnings: True
+doc-warnings: false
+
+ignore-paths:
+  - docs
+  - spark-warehouse
+  - cover
+
+max-line-length: 100
+
+pep8:
+  run: true
+  disable:
+    - N802
+    - N803
+    - N806
+    - E302
+
+pylint:
+  run: true
+  disable:
+    - too-many-instance-attributes
+    - cyclic-import
+    - len-as-condition
+    - invalid-name
+    - no-else-return
+    - no-self-use
+    - import-error
+    - protected-access
+    - reimported
+
+mccabe:
+  disable:
+    - MC0001
+
+pyroma:
+  run: true
+ 
\ No newline at end of file

From beb98193f3ede4e5c89abd6af800de200cbde2f0 Mon Sep 17 00:00:00 2001
From: Philip Yang
Date: Tue, 17 Oct 2017 12:08:41 -0700
Subject: [PATCH 5/5] Update README.md

Reflect the changes pertaining to this PR.
---
 README.md | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 4f751912..a0e7270c 100644
--- a/README.md
+++ b/README.md
@@ -244,9 +244,6 @@ registerKerasImageUDF("my_keras_inception_udf", InceptionV3(weights="imagenet"),
 ### Estimator
 
 ## Releases:
-* 0.2.0 Feature
-  - TFTransformer
-  - KerasEstimator
-  - TFImage Applications
+* 0.2.x Feature
   - Developer tools
 * 0.1.0 initial release
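
[Usage sketch, not part of the patch] A sketch of the lint workflow added in patch 4, assuming prospector is installed and an upstream remote points at the main repository; the example file paths are placeholders. Note that, as written, linter.sh only honors explicit paths when more than one is passed ($# -gt 1); otherwise it lints the files that differ from upstream/master.

    pip install --user prospector
    ./linter.sh
    ./linter.sh python/sparkdl/transformers/keras_image.py python/sparkdl/graph/utils.py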