diff --git a/.Doxyfile b/.Doxyfile index e33235a2..b33ffef3 100644 --- a/.Doxyfile +++ b/.Doxyfile @@ -811,7 +811,7 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = ./scanner/main.cpp +EXCLUDE = ./scanner/main.cpp ./python/scannerpy/build # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -838,7 +838,7 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = scanner::internal # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include diff --git a/.clang-format b/.clang-format index 6bbae787..52bb2edc 100644 --- a/.clang-format +++ b/.clang-format @@ -41,7 +41,7 @@ BreakStringLiterals: true ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' ConstructorInitializerAllOnOneLineOrOnePerLine: true -ConstructorInitializerIndentWidth: 4 +ConstructorInitializerIndentWidth: 2 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DerivePointerAlignment: false @@ -89,6 +89,6 @@ SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Auto -TabWidth: 8 +TabWidth: 2 UseTab: Never ... diff --git a/.gdbinit b/.gdbinit new file mode 100644 index 00000000..d70ae0b8 --- /dev/null +++ b/.gdbinit @@ -0,0 +1 @@ +handle SIG40 nostop noprint diff --git a/.gitignore b/.gitignore index 2f9a3231..aa1d5f3d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,15 +2,20 @@ # similar but **not quite** the same. If you add a file here, make sure that # the Docker build still works. 
+dependencies.txt +nvidia-docker-compose.yml .cache build **/*.pyc **/*.trace thirdparty +!thirdparty/resources !thirdparty/CMakeLists.txt +!docker/**/thirdparty python/*.egg-info - +docker/**/deps.sh **/*.mp4 **/*.mkv -python/scannerpy/include \ No newline at end of file +python/scannerpy/include +dist \ No newline at end of file diff --git a/.scanner.example.toml b/.scanner.example.toml index cad236b3..b0f153f8 100644 --- a/.scanner.example.toml +++ b/.scanner.example.toml @@ -11,4 +11,6 @@ scanner_path = "/opt/scanner" # bucket = "gcs-bucket" [network] - master = "localhost:5001" + master = "localhost" + master_port = "5001" + worker_port = "5002" diff --git a/.travis.yml b/.travis.yml index 19368d70..7398eabb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,17 +5,27 @@ language: cpp services: - docker +addons: + apt: + packages: + - docker-ce + env: global: - DOCKER_REPO=scannerresearch/scanner - DOCKER_EMAIL=wcrichto@cs.stanford.edu - DOCKER_USER=wcrichto - - secure: "Jf7f8UvpJ8yRA/2nnxaRb9jGrzWZvbRVRxzQtmcx+ltBpflmLFbXB6o4sySGETn5YrJe1g4VNWvkFBZIdLY0n+v4AQ/D1eMWtJT+kZLHihVGcxB3G7SUSOyEUaN1DzdHbaRW4VCSXQJWgK48amxlwnNfXtbNkSpiqIifDURBeHK1UT9+w1FKuTDZmEXv+ubDKjXvv1DuPdTtnolItDfrdtKLh5iXgTRvgXFanmNvydIcHqfz+72qJZJRCdTwUrA8FPq7jKNndftX9y2/dKuA/Xd/cP818iR57Z2PJXeR1c6ZmQ0RBTIZ8pdwG6bfix+48a5+aWlPJ3friA0G2b8psdZ/rQMxBbEClTI+BfnuJnYaIW8JREqmOBACBL7/4eo/Jifv1+vl+VhWHSicS6mWg2khCNAMPWj3cIA/bZrRlTmNlC06GkUgnHar/HhovGrowbZCGOxN0CNWWcuCGeLYfvPgPiI4JRU8eq0VqVhhqXZyPuS/RT1FoqZ4mMfNZG+RSIKHU56g5M4elakp/bZdu9lRoS4hr9VFpvPaLorOTOXPa3J+JxmB9OIP+NLxMIGjUhYnSYfTJ9/+Opxg2lWUOdqXzPZsyPyI+4RuIyyPh1w6jN/xD3EqQC55u5JlFlJ1WuNoyhx16QcKu1Eytxd2JxU6LutMw9WgQET53Q3EOVk=" # DOCKER_PASS + - secure: 
"DAwB02uem8/aIRNfHPknzqZCCmwDlJoQq6r3qE1CyUqkM9VMu6mQa/RBYMBwp3Upny6oI6yBlYJqqVr1rjZzI4by0j0IVhuhQPp2G5DJ/mVZ5PQSJ0bxJtCzm29hHo8hi+EgIk7r7YTcHOr3SRmplf0vM0WFuJplW5fJ6J/RbJSPPpnlTOIgp0Ry9B4eowLhi4b//TbwmtzEX6j4yKxWnDTJ6cV9xRVN1FoNd15PvpiBkOZ3dVGWtdUZm9K3Q3IumeyuemAoTSuKU3r3aqMSamdLf7pmYjpzTrYDT9CMbbEVjzr8M3Y5wFyjQsIdFKzYM/0jRBlUbKaLMhz7a/9vlkplBI954ufOqHiGAx3Cdk4jfkJ3hyucCQqYuQwHxWMmru3lV6jIZf+rYn6UBomHeNN1AIiikZ1EjvNDwY8iwGZPJbcZ42gn2mTaUVxsmwMD757AalTOzZo94+pdFJDjWY6y6kPQlZbdj8AT14bBHf8x8zF8EsZrh/WQMcRgzEU3BcjDE3RIqNgK38Sv1OzqNVTYy57PXbEMKMqfpAQx8FTcx/7NXBRrStrQ24pRZJA+wvkUefpyWZnayyMPvttzB4106GCOWAjSdJEyYQTnZMrXMRX4Qf+NHx6KHlQcExxpww/hSvewsh0JRrvMol5HKXJaZqnF6ZTqxeQdLUDdn3g=" # DOCKER_PASS - secure: "mJDNPmfRd3cyFNIhRg6TQo8ow5hOS+FC+DX6MLpM6Giv2nL1KkJD0fgnLqgiZeEKDOFHvGpGswGjYTAzNWYeocsneby88w58sNbueUE6PT74UqsbzrmvMhKh2xtEERkJFz5gKvdwc9a4nAvH5ejJW7OZ5OvSscR2vRBl4tUwTuA3czWbod6NwBC/sUrf0jTZhNgZWWWT1j2SLt379Q52Xmd5+ixAaSibctf8hCEMzE/lNsWprR1gO7H89eyLVRQc8VQh8msVtgo7QD/aapr+w96GDWWUAm6z5iNe8gLGZB9v4BtTm82XOV0iefNDfT0fwIhHrM6vqIdiiuMeYMteErT3rF55h0o8hJrP57lwT5u7hU/yxUhNTvtCdJTo8THJihXHbOT7LjxaF20SlJzjARSGZq9mYHeBgv9sGej2jF80iSn+xTAbalVFbc4hKumELnn/D7mGWRTjsGGTbTqNXvTQvxm/QjXmNusav5vKeRFwjbPGfb0iDSfhlZ7BHkR50OyskZ/R/69WjIFS9gBE2pvp4FlVpndKWkkkd4TVSgmnKCXxnQdeNS8We8NodvTBlTcvHFub8j2izKTZRU7FBXRgo9d+Gu+43nSRvjJUgwg/6D6Bou6boVHJo8ib9xzakVjojbJxehatj+dadpuVkG3tjULejPNg/qnjlJc10Pg=" # PYPI_PASS + matrix: + - BUILD_TYPE=cpu + - BUILD_TYPE=gpu install: -- sudo apt-get install -y doxygen graphviz python-pip -- pip install doxypypy twine +- sudo apt-get update +- sudo apt-get install -y doxygen graphviz +- sudo pip install requests[security] --upgrade +- sudo pip install doxypypy twine Sphinx script: ./scripts/travis-build.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 487721c2..9cacd409 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,8 +19,6 @@ project(Scanner) ###### Config options ##### option(BUILD_CUDA "" ON) option(BUILD_TESTS "" ON) -option(BUILD_SERVER 
"" OFF) -option(BUILD_EXAMPLES "" ON) option(ENABLE_PROFILING "" OFF) if (BUILD_TESTS) @@ -66,97 +64,7 @@ if (ENABLE_PROFILING) add_definitions(-DSCANNER_PROFILING) endif() -###### Optional Dependencies ####### -if (BUILD_CUDA) - find_package(CUDA REQUIRED) - add_definitions(-DHAVE_CUDA) - include_directories(${CUDA_INCLUDE_DIRS}) - if(COMPILER_SUPPORTS_CXX1Y) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") - endif() -endif() - -if (BUILD_CUDA) - add_library(scanner_halide scanner/engine/halide_context.cpp) -endif() - -###### Required Dependencies ####### -find_package(SaneProtobuf REQUIRED) -find_package(FFmpeg REQUIRED) -find_package(LibLZMA REQUIRED) -find_package(OpenSSL REQUIRED) -find_package(BZip2 REQUIRED) -find_package(Boost COMPONENTS thread program_options regex python REQUIRED) -find_package(GFlags REQUIRED) -find_package(Glog REQUIRED) -find_package(GoogleTest REQUIRED) -find_package(CURL REQUIRED) -find_package(Iconv REQUIRED) -find_package(Storehouse REQUIRED CONFIG - PATHS "thirdparty/build/bin/storehouse") -find_package(TinyToml REQUIRED) -find_package(PythonLibs 2.7 EXACT REQUIRED) - -set(GTEST_INCLUDE_DIRS - "${THIRDPARTY_OUTPUT_PATH}/googletest/include") -set(GTEST_LIBRARIES - "${THIRDPARTY_OUTPUT_PATH}/googletest/lib/libgtest.a") -set(GTEST_LIB_MAIN - "${THIRDPARTY_OUTPUT_PATH}/googletest/lib/libgtest_main.a") - -set(SCANNER_LIBRARIES - "${PROTOBUF_LIBRARY}" - "${STOREHOUSE_LIBRARIES}" - "${FFMPEG_LIBRARIES}" - "-L/opt/ffmpeg-3.2.2/lib" - "-lswscale" - "${LIBLZMA_LIBRARIES}" - "${OPENSSL_LIBRARIES}" - "${BZIP2_LIBRARIES}" - "${PROXYGEN_LIBRARIES}" - "${FOLLY_LIBRARIES}" - "${Boost_LIBRARIES}" - "${GFLAGS_LIBRARIES}" - "${GLOG_LIBRARIES}" - "${CURL_LIBRARIES}" - "${ICONV_LIBRARIES}" - "${SCANNER_LIBRARIES}" - "${PYTHON_LIBRARIES}" - "-ljpeg" - "-lz" - "-ldl" - "-lgrpc++_unsecure -lgrpc -lgpr") - -include_directories( - "." 
- "${CMAKE_CURRENT_BINARY_DIR}" # for protobuf generated files - "${PROTOBUF_INCLUDE_DIRS}" - "${FFMPEG_INCLUDE_DIR}" - "${TINYTOML_INCLUDE_DIR}" - "${STOREHOUSE_INCLUDE_DIRS}" - "${OPENSSL_INCLUDE_DIR}" - "${Boost_INCLUDE_DIRS}" - "${GLOG_INCLUDE_DIRS}" - "${LIBLZMA_INCLUDE_DIRS}" - "${PYTHON_INCLUDE_DIRS}") - -if (BUILD_TESTS) - include_directories("${GTEST_INCLUDE_DIRS}") -endif() - -if (BUILD_CUDA) - list(APPEND SCANNER_LIBRARIES - util_cuda - "${CUDA_LIBRARIES}" - "/usr/lib/x86_64-linux-gnu/libnvcuvid.so" - "-lcuda") -endif() - -if (APPLE) - include_directories( - "/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/") -elseif() -endif() +include(cmake/Dependencies.cmake) ###### Project code ####### set(PROTO_FILES @@ -200,6 +108,7 @@ add_library(scanner SHARED $ $ $ + scanner/util/halide_context.cpp ${PROTO_SRCS} ${GRPC_PROTO_SRCS} ${STRUCK_SOURCES} @@ -236,10 +145,6 @@ endforeach() add_subdirectory(stdlib) -if (BUILD_EXAMPLES) - add_subdirectory(examples) -endif() - if (BUILD_TESTS) add_subdirectory(tests) endif() diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index a5c4082f..00000000 --- a/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM scannerresearch/scanner-base:ubuntu16.04 -MAINTAINER Will Crichton "wcrichto@cs.stanford.edu" -ARG cores=1 -ARG gpu=ON - -ADD . /opt/scanner -WORKDIR /opt/scanner -RUN cd thirdparty && mkdir build && cd build && \ - cmake -D CMAKE_BUILD_TYPE=Release .. && \ - make -j ${cores} -RUN mkdir build && cd build && \ - cmake -D BUILD_IMGPROC_OPS=ON \ - -D BUILD_CAFFE_OPS=ON \ - -D BUILD_OPENFACE_OPS=ON \ - -D BUILD_TESTS=ON \ - -D BUILD_CUDA=${gpu} \ - .. 
&& \ - make -j ${cores} && \ - cd /opt/scanner && ./scripts/dev-setup.sh -ENV PYTHONPATH /opt/scanner/python:$PYTHONPATH diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 00000000..e6be923f --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,118 @@ +# Building Scanner + +*NOTE*: The following build instructions have only been tested on Ubuntu 16.04. + +There are five major steps to build and install Scanner: +1. Install apt-get dependencies +2. Install python pip dependencies +3. Run deps.sh to download and install external dependencies +4. Build Scanner +5. Install scannerpy python package + +Scanner depends on the following *major* dependencies: + +* Python == 2.7 +* boost >= 1.63.0 +* ffmpeg >= 3.3.1 +* opencv >= 3.2.0 +* protobuf == 3.4.0 +* grpc == 1.7.2 +* caffe >= rc5 OR intel-caffe >= 1.0.6 + +Scanner optionally requires: +* CUDA >= 8.0 + +Scanner provides a dependency script `deps.sh` to automatically install any or +all of the *major* dependencies if they are not already installed. Each of these +dependencies has a set of required system-level packages. If you need to install +all or most of these dependencies, run the 'All dependencies' apt-get command +below. If you only need to install a few, we also provide apt-get commands for +each package. + +## 1. 
apt-get Dependencies + +All dependencies +```bash +apt-get install \ + build-essential \ + cmake git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev \ + libswscale-dev unzip llvm clang libc++-dev libgflags-dev libgtest-dev \ + libssl-dev libcurl3-dev liblzma-dev libeigen3-dev \ + libgoogle-glog-dev libatlas-base-dev libsuitesparse-dev libgflags-dev \ + libx264-dev libopenjpeg-dev libxvidcore-dev \ + libpng-dev libjpeg-dev libbz2-dev git python-pip wget \ + libleveldb-dev libsnappy-dev libhdf5-serial-dev liblmdb-dev python-dev \ + python-tk autoconf autogen libtool libtbb-dev libopenblas-dev \ + liblapacke-dev swig yasm python2.7 cpio \ + automake libass-dev libfreetype6-dev libsdl2-dev libtheora-dev libtool \ + libva-dev libvdpau-dev libvorbis-dev libxcb1-dev libxcb-shm0-dev \ + libxcb-xfixes0-dev mercurial pkg-config texinfo wget zlib1g-dev \ + curl unzip +``` + +For gRPC +```bash +apt-get install \ + build-essential autoconf automake libtool curl make g++ \ + unzip clang libc++-dev libgflags-dev libgtest-dev unzip +``` + +For OpenCV +```bash +apt-get install \ + build-essential cmake git libgtk2.0-dev pkg-config \ + python-dev libtbb2 libtbb-dev libjpeg-dev libpng-dev libtiff-dev \ + libjasper-dev libdc1394-22-dev +``` + +For FFMPEG +```bash +apt-get -y install \ + build-essential autoconf automake build-essential cmake git libass-dev \ + libfreetype6-dev libsdl2-dev libtheora-dev libtool libva-dev libvdpau-dev \ + libvorbis-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev mercurial \ + pkg-config texinfo wget zlib1g-dev +``` + +## 2. Python Pip Dependencies + +Scanner depends on several python packages installable via pip. From the +top-level directory, run: + +```bash +pip install -r requirements.txt +``` + +## 3. 
Run deps.sh + +To install or specify where your *major* dependencies are, from the top-level +directory run: + +```bash +bash ./deps.sh +``` + +This script will query you for each major +dependency and install those that are not already installed. By default, +it will install the dependencies to a local directory inside the scanner repo +(it will not install system-wide). + +## 4. Build Scanner + +Run the following commands from the top-level directory: +```bash +mkdir build +cd build +cmake .. +make -j +``` + +## 5. Install scannerpy python package + +Run the following commands from the top-level directory: +```bash +python python/setup.py bdist_wheel +pip install dist/scannerpy-0.1.13-py2-none-any.whl +``` + +Congratulations! You've installed the scannerpy package. diff --git a/README.md b/README.md index 0484e2f3..7e5385ec 100644 --- a/README.md +++ b/README.md @@ -1,61 +1,92 @@ # Scanner: Efficient Video Analysis at Scale [![Build Status](https://travis-ci.org/scanner-research/scanner.svg?branch=master)](https://travis-ci.org/scanner-research/scanner) # -_For [build instructions](https://github.com/scanner-research/scanner/wiki/Building-Scanner), [tutorials](https://github.com/scanner-research/scanner/wiki/Getting-started), [documentation](https://github.com/scanner-research/scanner/wiki/Documentation), and [contributing guidelines](https://github.com/scanner-research/scanner/wiki/Contributing), visit the [Scanner wiki](https://github.com/scanner-research/scanner/wiki)._ +Scanner is a system for efficient video processing and understanding at scale. +Scanner provides a python API for expressing computations and a heterogeneous +runtime for scheduling these computations onto clusters of machines with +CPUs or GPUs. -Scanner lets you write stateful functions that get efficiently mapped across batches of video frames. These functions can execute on a multi-core CPU or GPU and can be distributed across multiple machines. 
You can think about Scanner like Spark for pixels. For example, you could use Scanner to: +* [Install](https://github.com/scanner-research/scanner#install) +* [Running Scanner](https://github.com/scanner-research/scanner#running-scanner) +* [Tutorials & Examples](https://github.com/scanner-research/scanner#tutorials--examples) +* [Documentation](https://github.com/scanner-research/scanner#documentation) +* [Contributing](https://github.com/scanner-research/scanner#contributing) -* [Locate and recognize faces](https://github.com/scanner-research/scanner/blob/master/examples/face_detection/face_detect.py) -* [Detect shots in a film](https://github.com/scanner-research/scanner/blob/master/examples/shot_detection/shot_detect.py) -* [Search videos by image](https://github.com/scanner-research/scanner/blob/master/examples/reverse_image_search/search.py) +Scanner is an active research project, part of a collaboration between Carnegie Mellon and Stanford. Please contact [Alex Poms](https://github.com/apoms) and [Will Crichton](https://github.com/willcrichton) with questions. -To support these applications, Scanner uses a Python interface similar to Tensorflow and Spark SQL. Videos are represented as tables in a database, and users write computation graphs to transform these tables. 
For example, to compute the color histogram for each frame in a set of videos on the GPU: +## Install -```python -from scannerpy import Database, DeviceType -from scannerpy.stdlib import parsers -db = Database() -videos = db.ingest_video_collection('my_videos', ['vid0.mp4', 'vid1.mkv']) -hist = db.ops.Histogram(device=DeviceType.GPU) -output = db.run(videos, hist, 'my_videos_hist') -vid0_hists = output.tables(0).columns(0).load(parsers.histograms) -``` +There are two ways to build and run Scanner on your machine: +* [Docker](https://github.com/scanner-research/scanner#docker) +* [From Source](https://github.com/scanner-research/scanner#from-source) -Scanner provides a convenient way to organize your videos as well as data derived from the videos (bounding boxes, histograms, feature maps, etc.) using a relational database. Behind the scenes, Scanner handles decoding the compressed videos into raw frames, allowing you to process an individual video in parallel. It then runs a computation graph on the decoded frames using kernels written in C++ for maximum performance and distributes the computation over a cluster. Scanner supports a number of operators and third-party libraries to reduce the work of writing new computations: +### Docker +First, install [Docker](https://docs.docker.com/engine/installation/#supported-platforms). 
+If you have a GPU and you're running on Linux, install [nvidia-docker 1.0](https://github.com/NVIDIA/nvidia-docker/tree/1.0) and run: + +```bash +pip install --upgrade nvidia-docker-compose +wget https://raw.githubusercontent.com/scanner-research/scanner/master/docker/docker-compose.yml +nvidia-docker-compose pull gpu +nvidia-docker-compose run --service-ports gpu /bin/bash +``` -* [Caffe](https://github.com/bvlc/caffe) support for neural network evaluation -* [OpenCV](https://github.com/opencv/opencv) support with included kernels for color histograms and optical flow -* Object tracking in videos with [Struck](https://github.com/samhare/struck) -* Image processing with [Halide](http://halide-lang.org/) +Otherwise, you should run: -Lastly, Scanner also offers some utilities for ease of development: +```bash +pip install --upgrade docker-compose +wget https://raw.githubusercontent.com/scanner-research/scanner/master/docker/docker-compose.yml +docker-compose pull cpu +docker-compose run --service-ports cpu /bin/bash +``` -* Profiling via [chrome://tracing](https://www.chromium.org/developers/how-tos/trace-event-profiling-tool) -* Support for different storage backends including [Google Cloud Storage](https://cloud.google.com/storage/) -* Custom operators for adding your own functionality outside the source tree +If these commands were successful, you should now have bash session at the +Scanner directory inside the docker container. To start processing some videos, +check out [Running Scanner](https://github.com/scanner-research/scanner#running-scanner) -Scanner is an active research project, part of a collaboration between Carnegie Mellon and Stanford. Please contact [Alex Poms](https://github.com/apoms) and [Will Crichton](https://github.com/willcrichton) with questions. +### From Source +Follow the instructions at [INSTALL](https://github.com/scanner-research/scanner/blob/master/INSTALL.md) +to build Scanner from source. 
To start processing some videos, check out [Running Scanner](https://github.com/scanner-research/scanner#running-scanner). -## Quick start ## +## Running Scanner -To quickly dive into Scanner, you can use one of our prebuilt [Docker images](https://hub.docker.com/r/scannerresearch/scanner). To run a GPU image, you must install and use [nvidia-docker](https://github.com/NVIDIA/nvidia-docker). +Since Scanner programs are written using a high-level python API, running a +Scanner program is as simple as executing a python script. Let's run a Scanner +job now to find all the faces of people in a video (you can also use your own +video if you have one on-hand). Run the following commands: ```bash -nvidia-docker run -d --name scanner -ti scannerresearch/scanner:gpu /bin/bash -nvidia-docker attach scanner +cd path/to/your/scanner/directory +# Download an example video (or use your own) +wget https://storage.googleapis.com/scanner-data/tutorial_assets/star_wars_heros.mp4 +# Run the Scanner program +python examples/apps/face_detection/main.py star_wars_heros.mp4 + ``` -_Note: if you don't have a GPU, then run `docker` instead of `nvidia-docker` and use `scanner:cpu` instead of `scanner:gpu` in the Docker image name._ +You should see several progress bars indicating the video is being processed. +When finished, there will be an mp4 file in your current directory called ` +star_wars_heros_faces.mp4` with bounding boxes drawn over every +face in the original video. Congratulations, you just ran your first Scanner +program! Here's a few next steps: -Then inside your Docker container, run: +* To learn how to start writing your own Scanner programs, dive into the API with the [tutorials](https://github.com/scanner-research/scanner#tutorials--examples). +* To run other Scanner programs on your videos, check out the [examples](https://github.com/scanner-research/scanner#tutorials--examples). 
+* If you're looking for a code reference, check out the [documentation](https://github.com/scanner-research/scanner#documentation) -```bash -python examples/face_detection/face_detect.py -``` +## Tutorials & Examples & How-To's -This runs a Scanner demo which detects faces in every frame of a short video from YouTube, creating a file `example_faces.mp4`. Type `Ctrl-P + Ctrl-Q` to detach from the container and then run: +The tutorials and examples are located in the +[examples](https://github.com/scanner-research/scanner/tree/master/examples) +directory. Some of the examples include: -```bash -nvidia-docker cp scanner:/opt/scanner/example_faces.mp4 . -``` +* [Locate and recognize faces in a video](https://github.com/scanner-research/scanner/blob/master/examples/apps/face_detection/) +* [Detect shots in a film](https://github.com/scanner-research/scanner/blob/master/examples/apps/shot_detection/) +* [Search videos by image](https://github.com/scanner-research/scanner/blob/master/examples/apps/reverse_image_search/) + +## Documentation + +TODO(apoms) + +## Contributing -Then you can view the generated video on your own machine. That's it! To learn more about Scanner, please visit the [Scanner wiki](https://github.com/scanner-research/scanner/wiki). 
+TODO(apoms) diff --git a/build.sh b/build.sh new file mode 100755 index 00000000..9987cc63 --- /dev/null +++ b/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +PKG=scannerpy + +pushd build +if make -j$(nproc); then + popd + if rm -rf dist && \ + python python/setup.py bdist_wheel; + then + cwd=$(pwd) + # cd to /tmp to avoid name clashes with Python module name and any + # directories of the same name in our cwd + pushd /tmp + (yes | pip uninstall $PKG) + (yes | pip install $cwd/dist/*) + popd + fi +else + popd +fi diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake new file mode 100644 index 00000000..8877be15 --- /dev/null +++ b/cmake/Dependencies.cmake @@ -0,0 +1,124 @@ +###### Parse dependency file ####### +file(STRINGS ${CMAKE_SOURCE_DIR}/dependencies.txt ConfigContents) +foreach(NameAndValue ${ConfigContents}) + # Strip leading spaces + string(REGEX REPLACE "^[ ]+" "" NameAndValue ${NameAndValue}) + # Find variable name + string(REGEX MATCH "^[^=]+" Name ${NameAndValue}) + # Find the value + string(REPLACE "${Name}=" "" Value ${NameAndValue}) + # Set the variable + set(${Name} "${Value}") +endforeach() + +list(APPEND CMAKE_PREFIX_PATH ${PROTOBUF_DIR}) + +###### Optional Dependencies ####### +if (BUILD_CUDA) + find_package(CUDA REQUIRED) + add_definitions(-DHAVE_CUDA) + include_directories(${CUDA_INCLUDE_DIRS}) + if(COMPILER_SUPPORTS_CXX1Y) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") + endif() +endif() + +if (BUILD_CUDA) + add_library(scanner_halide scanner/util/halide_context.cpp) +endif() + +set(OPENCV_DESIRED_COMPONENTS core highgui imgproc) +if (BUILD_CUDA) + list(APPEND OPENCV_DESIRED_COMPONENTS cudaimgproc cudaarithm) +endif() + +###### Required Dependencies ####### +find_package(SaneProtobuf REQUIRED) +find_package(GRPC REQUIRED) +find_package(FFmpeg REQUIRED) +find_package(LibLZMA REQUIRED) +find_package(OpenSSL REQUIRED) +find_package(BZip2 REQUIRED) +find_package(Boost COMPONENTS thread program_options regex python REQUIRED) 
+find_package(GFlags REQUIRED) +find_package(Glog REQUIRED) +find_package(GoogleTest REQUIRED) +find_package(CURL REQUIRED) +find_package(Iconv REQUIRED) +find_package(Storehouse REQUIRED CONFIG + PATHS "${CMAKE_SOURCE_DIR}/thirdparty/install") +find_package(Hwang REQUIRED) +find_package(TinyToml REQUIRED) +find_package(PythonLibs 2.7 EXACT REQUIRED) +find_package(OpenCV COMPONENTS ${OPENCV_DESIRED_COMPONENTS}) +find_package(OpenMP REQUIRED) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + +set(GTEST_INCLUDE_DIRS ${GOOGLETEST_INCLUDE_DIR}) +set(GTEST_LIBRARIES ${GOOGLETEST_LIBRARIES}) +set(GTEST_LIB_MAIN ${GOOGLETEST_MAIN}) + +set(SCANNER_LIBRARIES + "${HWANG_LIBRARY}" + "${PROTOBUF_LIBRARY}" + "${GRPC_LIBRARIES}" + "${FFMPEG_LIBRARIES}" + "${LIBLZMA_LIBRARIES}" + "${BZIP2_LIBRARIES}" + "${GFLAGS_LIBRARIES}" + "${GLOG_LIBRARIES}" + "${CURL_LIBRARIES}" + "${ICONV_LIBRARIES}" + "${SCANNER_LIBRARIES}" + "-lpython2.7" + "${Boost_LIBRARIES}" + "${Boost_LIBRARY_DIRS}/libboost_python.so" + "${Boost_LIBRARY_DIRS}/libboost_numpy.so" + "${STOREHOUSE_LIBRARIES}" + "${OPENSSL_LIBRARIES}" + "-ljpeg" + "-lz" + "-ldl" + ) + +message(${SCANNER_LIBRARIES}) + +include_directories( + "." 
+ "${CMAKE_CURRENT_BINARY_DIR}" # for protobuf generated files + "${HWANG_INCLUDE_DIRS}" + "${PROTOBUF_INCLUDE_DIRS}" + "${GRPC_INCLUDE_DIRS}" + "${FFMPEG_INCLUDE_DIR}" + "${TINYTOML_INCLUDE_DIR}" + "${STOREHOUSE_INCLUDE_DIRS}" + "${OPENSSL_INCLUDE_DIR}" + "${GLOG_INCLUDE_DIRS}" + "${LIBLZMA_INCLUDE_DIRS}" + "${PYTHON_INCLUDE_DIRS}" + "${Boost_INCLUDE_DIRS}") + +if (OpenCV_FOUND) + list(APPEND SCANNER_LIBRARIES ${OpenCV_LIBRARIES}) + include_directories(${OpenCV_INCLUDE_DIRS}) + add_definitions(-DHAVE_OPENCV) +endif() + +if (BUILD_TESTS) + include_directories("${GTEST_INCLUDE_DIRS}") +endif() + +if (BUILD_CUDA) + list(APPEND SCANNER_LIBRARIES + util_cuda + "${CUDA_LIBRARIES}" + "/usr/lib/x86_64-linux-gnu/libnvcuvid.so" + "-lcuda") +endif() + +if (APPLE) + include_directories( + "/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/") +elseif() +endif() diff --git a/cmake/Modules/FindFFmpeg.cmake b/cmake/Modules/FindFFmpeg.cmake index c511df95..b2269196 100644 --- a/cmake/Modules/FindFFmpeg.cmake +++ b/cmake/Modules/FindFFmpeg.cmake @@ -54,6 +54,11 @@ NAMES swresample PATHS ${_FFMPEG_AVUTIL_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib ) +find_library(FFMPEG_LIBSWSCALE +NAMES swscale +PATHS ${_FFMPEG_AVCODEC_LIBRARY_DIRS} /usr/lib /usr/local/lib /opt/local/lib /sw/lib +) + if (FFMPEG_LIBAVCODEC AND FFMPEG_LIBAVFORMAT) set(FFMPEG_FOUND TRUE) endif() @@ -66,6 +71,7 @@ ${FFMPEG_LIBAVCODEC} ${FFMPEG_LIBAVFORMAT} ${FFMPEG_LIBAVUTIL} ${FFMPEG_LIBSWRESAMPLE} +${FFMPEG_LIBSWSCALE} ) endif (FFMPEG_FOUND) diff --git a/cmake/Modules/FindGRPC.cmake b/cmake/Modules/FindGRPC.cmake new file mode 100644 index 00000000..2c0b3946 --- /dev/null +++ b/cmake/Modules/FindGRPC.cmake @@ -0,0 +1,48 @@ +# - Try to find grpc library +# +# The following variables are optionally searched for defaults +# GRPC_DIR: Base directory where all components are found +# +# The following are set after configuration is done: 
+# GRPC_FOUND +# GRPC_INCLUDE_DIRS +# GRPC_LIBRARIES +# GRPC_LIBRARY_DIRS + +include(FindPackageHandleStandardArgs) + +set(GRPC_ROOT_DIR "" CACHE PATH "Folder contains GRPC") + +if (NOT "$ENV{GRPC_DIR}" STREQUAL "") + set(GRPC_DIR $ENV{GRPC_DIR}) +endif() + +# We are testing only a couple of files in the include directories +if(WIN32) + find_path(GRPC_INCLUDE_DIR grpc/grpc.h + PATHS ${GRPC_ROOT_DIR}/src/windows) +else() + find_path(GRPC_INCLUDE_DIR grpc/grpc.h + PATHS + ${GRPC_DIR}/include) +endif() + +find_library(GRPCPP_UNSECURE_LIBRARY grpc++_unsecure + PATHS + ${GRPC_DIR}/lib) + +find_library(GRPC_LIBRARY grpc + PATHS + ${GRPC_DIR}/lib) + +find_library(GPR_LIBRARY gpr + PATHS + ${GRPC_DIR}/lib) + +find_package_handle_standard_args(GRPC DEFAULT_MSG + GRPC_INCLUDE_DIR GRPC_LIBRARY) + +if(GRPC_FOUND) + set(GRPC_INCLUDE_DIRS ${GRPC_INCLUDE_DIR}) + set(GRPC_LIBRARIES ${GRPCPP_UNSECURE_LIBRARY} ${GRPC_LIBRARY} ${GPR_LIBRARY}) +endif() diff --git a/cmake/Modules/FindGipuma.cmake b/cmake/Modules/FindGipuma.cmake new file mode 100644 index 00000000..8615d225 --- /dev/null +++ b/cmake/Modules/FindGipuma.cmake @@ -0,0 +1,38 @@ +# - Try to find Gipuma +# +# The following variables are optionally searched for defaults +# GIPUMA_ROOT_DIR: Base directory where all Gipuma components are found +# +# The following are set after configuration is done: +# GIPUMA_FOUND +# GIPUMA_INCLUDE_DIRS + +include(FindPackageHandleStandardArgs) + +set(GIPUMA_ROOT_DIR "" CACHE PATH "Folder contains Gipuma") + +if (NOT "$ENV{Gipuma_DIR}" STREQUAL "") + set(GIPUMA_ROOT_DIR $ENV{Gipuma_DIR} CACHE PATH "Folder contains Gipuma" FORCE) +endif() + +# We are testing only a couple of files in the include directories +if(WIN32) + find_path(GIPUMA_INCLUDE_DIR gipuma.h + PATHS ${GIPUMA_ROOT_DIR}/src/windows + PATH_SUFFIXES gipuma) +else() + find_path(GIPUMA_INCLUDE_DIR gipuma.h + PATHS ${GIPUMA_ROOT_DIR}/include + PATH_SUFFIXES gipuma) + +endif() + +find_library(GIPUMA_LIBRARY gipuma PATHS 
${GIPUMA_ROOT_DIR}/lib) + +find_package_handle_standard_args(GIPUMA DEFAULT_MSG GIPUMA_INCLUDE_DIR + GIPUMA_LIBRARY) + +if(GIPUMA_FOUND) + set(GIPUMA_INCLUDE_DIRS ${GIPUMA_INCLUDE_DIR}) + set(GIPUMA_LIBRARIES ${GIPUMA_LIBRARY}) +endif() diff --git a/cmake/Modules/FindGoogleTest.cmake b/cmake/Modules/FindGoogleTest.cmake index 68c1c44b..521ca65f 100644 --- a/cmake/Modules/FindGoogleTest.cmake +++ b/cmake/Modules/FindGoogleTest.cmake @@ -33,6 +33,11 @@ find_library(GOOGLETEST_LIBRARY gtest ${GOOGLETEST_ROOT_DIR}/lib ${CMAKE_SOURCE_DIR}/thirdparty/build/bin/googletest/lib) +find_library(GOOGLETEST_MAIN gtest_main + PATHS + ${GOOGLETEST_ROOT_DIR}/lib + ${CMAKE_SOURCE_DIR}/thirdparty/build/bin/googletest/lib) + find_package_handle_standard_args(GOOGLETEST DEFAULT_MSG GOOGLETEST_INCLUDE_DIR GOOGLETEST_LIBRARY) diff --git a/cmake/Modules/FindHalide.cmake b/cmake/Modules/FindHalide.cmake index e555696a..8d8509d0 100644 --- a/cmake/Modules/FindHalide.cmake +++ b/cmake/Modules/FindHalide.cmake @@ -1,12 +1,14 @@ # FindHalide.cmake # ... 
shamelessly based on FindJeMalloc.cmake - set(HALIDE_ROOT_DIR "" CACHE PATH "Folder contains Halide") if (NOT "$ENV{Halide_DIR}" STREQUAL "") set(HALIDE_ROOT_DIR $ENV{Halide_DIR} CACHE PATH "Folder contains Halide" FORCE) +elseif (Halide_DIR) + set(HALIDE_ROOT_DIR ${Halide_DIR} CACHE PATH "Folder contains Halide" + FORCE) endif() find_library(HALIDE_LIBRARIES @@ -25,8 +27,8 @@ find_package_handle_standard_args(Halide DEFAULT_MSG HALIDE_INCLUDE_DIR ) -set(HALIDE_LIBRARY HALIDE_LIBRARIES) -set(HALIDE_INCLUDE_DIRS HALIDE_INCLUDE_DIR) +set(HALIDE_LIBRARY ${HALIDE_LIBRARIES}) +set(HALIDE_INCLUDE_DIRS ${HALIDE_INCLUDE_DIR}) mark_as_advanced( HALIDE_ROOT_DIR diff --git a/cmake/Modules/FindHwang.cmake b/cmake/Modules/FindHwang.cmake new file mode 100644 index 00000000..abf30cd0 --- /dev/null +++ b/cmake/Modules/FindHwang.cmake @@ -0,0 +1,38 @@ +# FindHwang.cmake + +set(HWANG_ROOT_DIR "" CACHE PATH "Folder contains Hwang") + +if (NOT "$ENV{Hwang_DIR}" STREQUAL "") + set(HWANG_ROOT_DIR $ENV{Hwang_DIR} CACHE PATH "Folder contains Hwang" + FORCE) +elseif (Hwang_DIR) + set(HWANG_ROOT_DIR ${Hwang_DIR} CACHE PATH "Folder contains Hwang" + FORCE) +endif() + +find_library(HWANG_LIBRARIES + NAMES hwang + HINTS ${HWANG_ROOT_DIR}/lib + ) + +find_path(HWANG_INCLUDE_DIR + NAMES hwang/common.h + HINTS ${HWANG_ROOT_DIR}/include + ) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Hwang DEFAULT_MSG + HWANG_LIBRARIES + HWANG_INCLUDE_DIR + ) + +set(HWANG_LIBRARY ${HWANG_LIBRARIES}) +set(HWANG_INCLUDE_DIRS ${HWANG_INCLUDE_DIR}) + +mark_as_advanced( + HWANG_ROOT_DIR + HWANG_LIBRARY + HWANG_LIBRARIES + HWANG_INCLUDE_DIR + HWANG_INCLUDE_DIRS + ) diff --git a/cmake/Modules/FindOpenPose.cmake b/cmake/Modules/FindOpenPose.cmake new file mode 100644 index 00000000..dae6683d --- /dev/null +++ b/cmake/Modules/FindOpenPose.cmake @@ -0,0 +1,37 @@ +# - Try to find OpenPose +# +# The following variables are optionally searched for defaults +# OPENPOSE_ROOT_DIR: Base directory 
where all Caffe components are found +# +# The following are set after configuration is done: +# OPENPOSE_FOUND +# OPENPOSE_INCLUDE_DIRS +# OPENPOSE_LIBRARIES +# OPENPOSE_LIBRARY_DIRS + +include(FindPackageHandleStandardArgs) + +set(OPENPOSE_ROOT_DIR "" CACHE PATH "Folder contains OpenPose") + +if (NOT "$ENV{OpenPose_DIR}" STREQUAL "") + set(OPENPOSE_ROOT_DIR $ENV{OpenPose_DIR}) +endif() + +# We are testing only a couple of files in the include directories +if(WIN32) + find_path(OPENPOSE_INCLUDE_DIR openpose/headers.hpp + PATHS ${OPENPOSE_ROOT_DIR}/src/windows) +else() + find_path(OPENPOSE_INCLUDE_DIR openpose/headers.hpp + PATHS ${OPENPOSE_ROOT_DIR}/include) +endif() + +find_library(OPENPOSE_LIBRARY openpose PATHS ${OPENPOSE_ROOT_DIR}/lib) + +find_package_handle_standard_args(OPENPOSE DEFAULT_MSG + OPENPOSE_INCLUDE_DIR OPENPOSE_LIBRARY) + +if(OPENPOSE_FOUND) + set(OPENPOSE_INCLUDE_DIRS ${OPENPOSE_INCLUDE_DIR}) + set(OPENPOSE_LIBRARIES ${OPENPOSE_LIBRARY}) +endif() diff --git a/cmake/Modules/FindSaneProtobuf.cmake b/cmake/Modules/FindSaneProtobuf.cmake index ddc967b7..5df49fd3 100644 --- a/cmake/Modules/FindSaneProtobuf.cmake +++ b/cmake/Modules/FindSaneProtobuf.cmake @@ -152,7 +152,7 @@ function(PROTOBUF_GENERATE_CPP SRCS HDRS USE_GRPC) OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${DIR_FIL}/${FIL_WE}.grpc.pb.cc" "${CMAKE_CURRENT_BINARY_DIR}/${DIR_FIL}/${FIL_WE}.grpc.pb.h" COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - ARGS --plugin=protoc-gen-grpc=/usr/local/bin/grpc_cpp_plugin --grpc_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + ARGS --plugin=protoc-gen-grpc=${GRPC_CPP_PLUGIN} --grpc_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE} COMMENT "Running C++ protocol buffer compiler on ${FIL}" VERBATIM) @@ -218,7 +218,7 @@ function(PROTOBUF_GENERATE_PYTHON SRCS USE_GRPC) "${CMAKE_CURRENT_BINARY_DIR}/${DIR_FIL}/${FIL_WE}_pb2.py") add_custom_command( OUTPUT 
"${CMAKE_CURRENT_BINARY_DIR}/${DIR_FIL}/${FIL_WE}_pb2.py" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} --plugin=protoc-gen-grpc_python=/usr/local/bin/grpc_python_plugin --grpc_python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} --plugin=protoc-gen-grpc_python=${GRPC_PYTHON_PLUGIN} --grpc_python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE} COMMENT "Running Python protocol buffer compiler on ${FIL}" VERBATIM ) @@ -334,6 +334,23 @@ find_program(PROTOBUF_PROTOC_EXECUTABLE ) mark_as_advanced(PROTOBUF_PROTOC_EXECUTABLE) +find_program(GRPC_PYTHON_PLUGIN + NAMES grpc_python_plugin + DOC "" + PATHS + ${PROTOBUF_SRC_ROOT_FOLDER}/vsprojects/${_PROTOBUF_ARCH_DIR}Release + ${PROTOBUF_SRC_ROOT_FOLDER}/vsprojects/${_PROTOBUF_ARCH_DIR}Debug +) +mark_as_advanced(GRPC_PYTHON_PLUGIN) + +find_program(GRPC_CPP_PLUGIN + NAMES grpc_cpp_plugin + DOC "" + PATHS + ${PROTOBUF_SRC_ROOT_FOLDER}/vsprojects/${_PROTOBUF_ARCH_DIR}Release + ${PROTOBUF_SRC_ROOT_FOLDER}/vsprojects/${_PROTOBUF_ARCH_DIR}Debug +) +mark_as_advanced(GRPC_CPP_PLUGIN) include(FindPackageHandleStandardArgs) FIND_PACKAGE_HANDLE_STANDARD_ARGS(Protobuf DEFAULT_MSG diff --git a/cmake/Modules/FindTinyToml.cmake b/cmake/Modules/FindTinyToml.cmake index 06a039e3..755c47dd 100644 --- a/cmake/Modules/FindTinyToml.cmake +++ b/cmake/Modules/FindTinyToml.cmake @@ -12,7 +12,11 @@ include(FindPackageHandleStandardArgs) set(TINYTOML_ROOT_DIR "" CACHE PATH "Folder contains TinyToml") if (NOT "$ENV{TinyToml_DIR}" STREQUAL "") - set(TINYTOML_ROOT_DIR $ENV{TinyToml_DIR}) + set(TINYTOML_ROOT_DIR $ENV{TinyToml_DIR} CACHE PATH "Folder contains TinyToml" + FORCE) +elseif(TinyToml_DIR) + set(TINYTOML_ROOT_DIR ${TinyToml_DIR} CACHE PATH "Folder contains TinyToml" + FORCE) endif() # We are testing only a couple of files in the include 
directories diff --git a/cmake/Util/Op.cmake b/cmake/Util/Op.cmake index bf6bf46a..62669f35 100644 --- a/cmake/Util/Op.cmake +++ b/cmake/Util/Op.cmake @@ -2,15 +2,13 @@ # op. It sets a few default flags and exposes a function build_op for simplifying # the build process. See examples/tutorial/04_custom_op.py for an example usage. -if(NOT SCANNER_PATH) - message(FATAL_ERROR "Set SCANNER_PATH to the Scanner repo directory before including Op.cmake.") -endif() -list(APPEND CMAKE_MODULE_PATH "${SCANNER_PATH}/cmake/Modules/") +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/Modules") include(CheckCXXCompilerFlag) CHECK_CXX_COMPILER_FLAG("-std=c++1y" COMPILER_SUPPORTS_CXX1Y) if(NOT COMPILER_SUPPORTS_CXX1Y) - message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} has no C++1y support.") + message(FATAL_ERROR + "The compiler ${CMAKE_CXX_COMPILER} has no C++1y support.") endif() if (NOT CMAKE_BUILD_TYPE) @@ -19,32 +17,41 @@ if (NOT CMAKE_BUILD_TYPE) endif() function(build_op) - set(options ) + set(options) set(oneValueArgs LIB_NAME PROTO_SRC NO_FLAGS) set(multiValueArgs CPP_SRCS) cmake_parse_arguments(args "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) include_directories("${CMAKE_CURRENT_BINARY_DIR}") + # Build protobuf files if they exist if(NOT("${args_PROTO_SRC}" STREQUAL "")) find_package(SaneProtobuf REQUIRED) set(PROTOBUF_IMPORT_DIRS "${SCANNER_PATH}") protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS OFF ${args_PROTO_SRC}) protobuf_generate_python(PROTO_PY OFF ${args_PROTO_SRC}) - add_custom_target(${args_LIB_NAME}_proto_files DEPENDS ${PROTO_HDRS} ${PROTO_PY}) + add_custom_target(${args_LIB_NAME}_proto_files + DEPENDS ${PROTO_HDRS} ${PROTO_PY}) add_library(${args_LIB_NAME} SHARED ${args_CPP_SRCS} ${PROTO_SRCS}) add_dependencies(${args_LIB_NAME} ${args_LIB_NAME}_proto_files) - target_link_libraries(${args_LIB_NAME} PUBLIC - "${PROTOBUF_LIBRARY}" - "${SCANNER_PATH}/build/libscanner.so") + target_link_libraries(${args_LIB_NAME} PUBLIC "${PROTOBUF_LIBRARY}") 
else() add_library(${args_LIB_NAME} SHARED ${args_CPP_SRCS}) endif() + # NO_FLAGS is primarily for special treatment of libstdlib right now if("${args_NO_FLAGS}" STREQUAL "") + # Explictly link libscanner.so + execute_process( + OUTPUT_VARIABLE SCANNER_LIB_PATH + COMMAND + python -c "import scannerpy.stdlib.build_flags as b; b.print_lib()") + target_link_libraries(${args_LIB_NAME} PUBLIC "${SCANNER_LIB_PATH}/libscanner.so") + execute_process( OUTPUT_VARIABLE BUILD_FLAGS - COMMAND python -c "import scannerpy; scannerpy.Database().print_build_flags()") + COMMAND + python -c "import scannerpy.stdlib.build_flags as b; b.print_flags()") set_target_properties( ${args_LIB_NAME} PROPERTIES COMPILE_FLAGS "${BUILD_FLAGS}") diff --git a/deps.sh b/deps.sh new file mode 100644 index 00000000..9a8351a6 --- /dev/null +++ b/deps.sh @@ -0,0 +1,552 @@ +#!/bin/bash + +cores=$(nproc) + +LOCAL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +BUILD_DIR=$LOCAL_DIR/thirdparty/build +DEFAULT_INSTALL_DIR=$LOCAL_DIR/thirdparty/install +FILES_DIR=$LOCAL_DIR/thirdparty/resources + +POSITIONAL=() + +# Ask if installed +INSTALL_BOOST=true +INSTALL_FFMPEG=true +INSTALL_OPENCV=true +INSTALL_PROTOBUF=true +INSTALL_GRPC=true +INSTALL_CAFFE=true +INSTALL_HALIDE=true +INSTALL_OPENPOSE=true + +USE_GPU=false + +# Assume not installed +INSTALL_HWANG=true +INSTALL_TINYTOML=true +INSTALL_STOREHOUSE=true +INSTALL_GOOGLETEST=true + +INSTALL_PREFIX=$DEFAULT_INSTALL_DIR + +INSTALL_ALL=false +INSTALL_NONE=false + +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + -c|--cores) + cores="$2" + shift # past arg + shift # past value + ;; + -g|--use-gpu) + USE_GPU=true + shift # past arg + ;; + -p|--prefix) + INSTALL_PREFIX="$2" + shift # past arg + shift # past value + ;; + -a|--install-all) + INSTALL_ALL=true + shift # past arg + ;; + -n|--install-none) + INSTALL_NONE=true + shift # past arg + ;; + *) # unknown option + POSITIONAL+=("$1") # save it in an array for later + shift # past argument + ;; +esac 
+done + +echo "--------------------------------------------------------------" +echo "| Scanner Dependency Installation Script |" +echo "--------------------------------------------------------------" +echo "The script will ask if required dependencies are installed and" +echo "then install missing dependencies to " +echo "$INSTALL_PREFIX" +echo "(customized by specifying --prefix)" + +set -- "${POSITIONAL[@]}" # restore positional parameters + +# Directories for installed dependencies +BOOST_DIR=$INSTALL_PREFIX +FFMPEG_DIR=$INSTALL_PREFIX +OPENCV_DIR=$INSTALL_PREFIX +PROTOBUF_DIR=$INSTALL_PREFIX +GRPC_DIR=$INSTALL_PREFIX +CAFFE_DIR=$INSTALL_PREFIX +HALIDE_DIR=$INSTALL_PREFIX +HWANG_DIR=$INSTALL_PREFIX +STOREHOUSE_DIR=$INSTALL_PREFIX +TINYTOML_DIR=$INSTALL_PREFIX +OPENPOSE_DIR=$INSTALL_PREFIX + +export C_INCLUDE_PATH=$INSTALL_PREFIX/include:$C_INCLUDE_PATH +export LD_LIBRARY_PATH=$INSTALL_PREFIX/lib:$LD_LIBRARY_PATH +export PATH=$INSTALL_PREFIX/bin:$PATH +export PKG_CONFIG_PATH=$INSTALL_PREFIX/lib/pkgconfig:$PKG_CONFIG_PATH + +mkdir -p $BUILD_DIR +mkdir -p $INSTALL_PREFIX + +if [[ $INSTALL_NONE == true ]]; then + INSTALL_BOOST=false + INSTALL_FFMPEG=false + INSTALL_OPENCV=false + INSTALL_PROTOBUF=false + INSTALL_GRPC=false + INSTALL_CAFFE=false + INSTALL_HALIDE=false + INSTALL_OPENPOSE=false + INSTALL_HWANG=false + INSTALL_TINYTOML=false + INSTALL_STOREHOUSE=false + INSTALL_GOOGLETEST=false + +elif [[ $INSTALL_ALL == false ]]; then + # Ask about each library + while true; do + echo "Do you have boost>=1.63.0 installed with the modules: " + echo -n "thread, program_options, regex, python, numpy? [y/N]: " + read yn + if [[ $yn == y ]] || [[ $yn == Y ]]; then + INSTALL_BOOST=false + echo -n "Where is your boost install? 
[/usr/local]: " + read install_location + if [[ $install_location == "" ]]; then + BOOST_DIR=/usr/local + else + BOOST_DIR=$install_location + fi + break + else + INSTALL_BOOST=true + break + fi + done + + while true; do + echo -n "Do you have ffmpeg>=3.3.1 installed? [y/N]: " + read yn + if [[ $yn == y ]] || [[ $yn == Y ]]; then + INSTALL_FFMPEG=false + echo -n "Where is your ffmpeg install? [/usr/local]: " + read install_location + if [[ $install_location == "" ]]; then + FFMPEG_DIR=/usr/local + else + FFMPEG_DIR=$install_location + fi + break + else + INSTALL_FFMPEG=true + break + fi + done + + while true; do + echo -n "Do you have opencv>=3.2.0 with contrib installed? [y/N]: " + read yn + if [[ $yn == y ]] || [[ $yn == Y ]]; then + INSTALL_OPENCV=false + echo -n "Where is your opencv install? [/usr/local]: " + read install_location + if [[ $install_location == "" ]]; then + OPENCV_DIR=/usr/local + else + OPENCV_DIR=$install_location + fi + break + else + INSTALL_OPENCV=true + break + fi + done + + while true; do + echo -n "Do you have protobuf>=3.4.0 installed? [y/N]: " + read yn + if [[ $yn == y ]] || [[ $yn == Y ]]; then + INSTALL_PROTOBUF=false + echo -n "Where is your protobuf install? [/usr/local]: " + read install_location + if [[ $install_location == "" ]]; then + PROTOBUF_DIR=/usr/local + else + PROTOBUF_DIR=$install_location + fi + break + else + INSTALL_PROTOBUF=true + break + fi + done + + while true; do + echo -n "Do you have grpc>=1.7.2 installed? [y/N]: " + read yn + if [[ $yn == y ]] || [[ $yn == Y ]]; then + INSTALL_GRPC=false + echo -n "Where is your grpc install? [/usr/local]: " + read install_location + if [[ $install_location == "" ]]; then + GRPC_DIR=/usr/local + else + GRPC_DIR=$install_location + fi + break + else + INSTALL_GRPC=true + break + fi + done + + while true; do + echo -n "Do you have halide (release_2016_10_25) installed? 
[y/N]: " + read yn + if [[ $yn == y ]] || [[ $yn == Y ]]; then + INSTALL_HALIDE=false + echo -n "Where is your halide install? [/usr/local]: " + read install_location + if [[ $install_location == "" ]]; then + HALIDE_DIR=/usr/local + else + HALIDE_DIR=$install_location + fi + break + else + INSTALL_HALIDE=true + break + fi + done + + while true; do + echo -n "Do you have OpenPose (v1.2.0) installed? [y/N]: " + read yn + if [[ $yn == y ]] || [[ $yn == Y ]]; then + INSTALL_OPENPOSE=false + echo -n "Where is your OpenPose install? [/usr/local]: " + read install_location + if [[ $install_location == "" ]]; then + OPENPOSE_DIR=/usr/local + else + OPENPOSE_DIR=$install_location + fi + break + else + INSTALL_OPENPOSE=true + break + fi + done + + while true; do + echo -n "Do you have caffe>=rc5 or intel-caffe>=1.0.6 installed? [y/N]: " + read yn + if [[ $yn == y ]] || [[ $yn == Y ]]; then + INSTALL_CAFFE=false + echo -n "Where is your caffe install? [/usr/local]: " + read install_location + if [[ $install_location == "" ]]; then + CAFFE_DIR=/usr/local + else + CAFFE_DIR=$install_location + fi + break + else + INSTALL_CAFFE=true + echo -n "Do you plan to use GPUs for CNN evaluation? [y/N]: " + read yn + if [[ $yn == y ]] || [[ $yn == Y ]]; then + USE_GPU=true + break + else + USE_GPU=false + break + fi + fi + done +fi + +if [[ $INSTALL_BOOST == true ]] && [[ ! -f $BUILD_DIR/boost.done ]] ; then + echo "Installing boost 1.63.0..." + cd $BUILD_DIR + rm -fr boost* + wget "https://dl.bintray.com/boostorg/release/1.63.0/source/boost_1_63_0.tar.gz" && \ + tar -xf boost_1_63_0.tar.gz && cd boost_1_63_0 && ./bootstrap.sh && \ + ./b2 install --prefix=$INSTALL_PREFIX -j${cores} && \ + rm -rf $BUILD_DIR/boost_1_63_0.tar.gz && touch $BUILD_DIR/boost.done \ + || { echo 'Installing boost failed!' ; exit 1; } + echo "Done installing boost 1.63.0" +fi + + +if [[ $INSTALL_FFMPEG == true ]] && [[ ! -f $BUILD_DIR/ffmpeg.done ]] ; then + echo "Installing ffmpeg 3.3.1..." 
+ # FFMPEG + cd $BUILD_DIR + rm -fr ffmpeg + git clone -b n3.3.1 https://git.ffmpeg.org/ffmpeg.git && cd ffmpeg && \ + ./configure --prefix=$INSTALL_PREFIX --extra-version=0ubuntu0.16.04.1 \ + --toolchain=hardened --cc=cc --cxx=g++ --enable-gpl \ + --enable-shared --disable-stripping \ + --disable-decoder=libschroedinger \ + --enable-avresample --enable-libx264 --enable-nonfree && \ + make -j${cores} && make install && touch $BUILD_DIR/ffmpeg.done \ + || { echo 'Installing ffmpeg failed!' ; exit 1; } + echo "Done installing ffmpeg 3.3.1" +fi + +if [[ $INSTALL_OPENCV == true ]] && [[ ! -f $BUILD_DIR/opencv.done ]]; then + # OpenCV 3.2.0 + OpenCV contrib + echo "Installing OpenCV 3.2.0..." + cd $BUILD_DIR + rm -rf opencv opencv_contrib ceres-solver + git clone -b 3.2.0 https://github.com/opencv/opencv && \ + git clone -b 3.2.0 https://github.com/opencv/opencv_contrib && \ + git clone -b 1.12.0 https://github.com/ceres-solver/ceres-solver && \ + cd ceres-solver && mkdir build && cd build && \ + cmake .. -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX && \ + make install -j$cores && \ + mkdir -p $BUILD_DIR/opencv/build && cd $BUILD_DIR/opencv/build && \ + cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \ + -D BUILD_TESTS=OFF -D BUILD_PERF_TESTS=OFF -D ENABLE_FAST_MATH=1 \ + -D CUDA_FAST_MATH=1 -D WITH_CUBLAS=1 -D WITH_NVCUVID=1 \ + -D BUILD_opencv_rgbd=OFF \ + -D BUILD_opencv_cnn_3dobj=OFF \ + -D BUILD_TIFF=OFF \ + -D WITH_TIFF=OFF \ + -D OPENCV_EXTRA_MODULES_PATH=$BUILD_DIR/opencv_contrib/modules \ + .. && \ + make install -j$cores && touch $BUILD_DIR/opencv.done \ + || { echo 'Installing OpenCV failed!' ; exit 1; } + echo "Done installing OpenCV 3.2.0" +fi + +if [[ $INSTALL_PROTOBUF == true ]] && [[ ! -f $BUILD_DIR/protobuf.done ]] ; then + # protobuf 3.4.1 + echo "Installing protobuf 3.4.1..." 
+ cd $BUILD_DIR + rm -fr protobuf + git clone -b v3.4.1 https://github.com/google/protobuf.git && \ + cd protobuf && bash ./autogen.sh && \ + ./configure --prefix=$INSTALL_PREFIX && make -j$cores && \ + make install && touch $BUILD_DIR/protobuf.done \ + || { echo 'Installing protobuf failed!' ; exit 1; } + echo "Done installing protobuf 3.4.1" +fi + +if [[ $INSTALL_GRPC == true ]] && [[ ! -f $BUILD_DIR/grpc.done ]] ; then + # gRPC 1.7.2 + echo "Installing gRPC 1.7.2..." + cd $BUILD_DIR + rm -fr grpc + git clone -b v1.7.2 https://github.com/grpc/grpc && \ + cd grpc && git submodule update --init --recursive && \ + make EXTRA_CPPFLAGS=-I$INSTALL_PREFIX/include \ + EXTRA_LDFLAGS=-L$INSTALL_PREFIX/lib -j$cores && \ + make install EXTRA_CPPFLAGS=-I$INSTALL_PREFIX/include \ + EXTRA_LDFLAGS=-L$INSTALL_PREFIX/lib prefix=$INSTALL_PREFIX && \ + ldconfig -n $INSTALL_PREFIX/lib && \ + touch $BUILD_DIR/grpc.done \ + || { echo 'Installing gRPC failed!' ; exit 1; } + echo "Done installing gRPC 1.7.2" +fi + +if [[ $INSTALL_HALIDE == true ]] && [[ ! -f $BUILD_DIR/halide.done ]] ; then + # Halide + echo "Installing Halide..." + cd $BUILD_DIR + rm -fr Halide + git clone -b release_2016_10_25 https://github.com/halide/Halide && \ + cd Halide && \ + make distrib -j$cores && \ + cp -r distrib/* $INSTALL_PREFIX && \ + touch $BUILD_DIR/halide.done \ + || { echo 'Installing Halide failed!' ; exit 1; } + echo "Done installing Halide" +fi + +if [[ $INSTALL_STOREHOUSE == true ]] && [[ ! -f $BUILD_DIR/storehouse.done ]] ; then + echo "Installing storehouse..." + cd $BUILD_DIR + rm -fr storehouse + git clone https://github.com/scanner-research/storehouse && \ + cd storehouse && git checkout 1462a19fe9b3227a7abb0293e3f15709a4ed1f2f && \ + cd thirdparty && mkdir build && cd build && \ + cmake .. -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX && \ + make -j${cores} && cd ../../ && \ + mkdir build && cd build && \ + cmake .. 
-DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX -DBOOST_ROOT=$BOOST_DIR && make -j${cores} && \ + make install && \ + cd .. && ./build.sh && \ + touch $BUILD_DIR/storehouse.done \ + || { echo 'Installing storehouse failed!' ; exit 1; } + echo "Done installing storehouse" +fi + +if [[ $INSTALL_HWANG == true ]] && [[ ! -f $BUILD_DIR/hwang.done ]] ; then + echo "Installing hwang..." + cd $BUILD_DIR + rm -fr hwang + git clone https://github.com/scanner-research/hwang && \ + cd hwang && \ + git checkout 69f04c4e5df9c6db701a0cb5b79f964e66637f87 && \ + bash ./deps.sh -a \ + --with-boost $INSTALL_PREFIX \ + --with-ffmpeg $INSTALL_PREFIX \ + --with-protobuf $INSTALL_PREFIX \ + --cores ${cores} && \ + mkdir -p build && cd build && \ + cmake .. -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX -DBUILD_CUDA=ON && \ + make -j${cores} && make install -j${cores} && cd .. && \ + cd python && python setup.py bdist_wheel && \ + pip install dist/hwang-0.1.0-py2-none-any.whl && \ + touch $BUILD_DIR/hwang.done \ + || { echo 'Installing hwang failed!' ; exit 1; } + echo "Done installing hwang" +fi + +if [[ $INSTALL_TINYTOML == true ]] && [[ ! -f $BUILD_DIR/tinytoml.done ]]; then + echo "Installing tinytoml..." + cd $BUILD_DIR + rm -fr tinytoml + git clone https://github.com/mayah/tinytoml.git && \ + cd tinytoml && git checkout 3559856002eee57693349b8a2d8a0cf6250d269c && \ + cp -r include/* $INSTALL_PREFIX/include && \ + touch $BUILD_DIR/tinytoml.done \ + || { echo 'Installing tinytoml failed!' ; exit 1; } + echo "Done installing tinytoml" +fi + +if [[ $INSTALL_GOOGLETEST == true ]] && [[ ! -f $BUILD_DIR/googletest.done ]]; then + echo "Installing googletest..." + cd $BUILD_DIR + rm -fr googletest + git clone https://github.com/google/googletest && \ + cd googletest && mkdir build && cd build && \ + cmake .. -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX && \ + make -j${cores} && make install && \ + touch $BUILD_DIR/googletest.done \ + || { echo 'Installing googletest failed!' 
; exit 1; } + echo "Done installing googletest" +fi + +if [[ $INSTALL_CAFFE == true ]] && [[ $USE_GPU == false ]] && \ + [[ ! -f $BUILD_DIR/caffe.done ]]; then + # Intel Caffe 1.0.6 + cd $BUILD_DIR + rm -fr caffe + # Use more recent mkldnn commit to fix gcc bug + git clone -b 1.0.6 https://github.com/intel/caffe && \ + cd caffe && \ + cp $FILES_DIR/caffe/Makefile.config Makefile.config && \ + rm mkldnn.commit && \ + echo "2604f435da7bb9f1896ae37200d91734adfdba9c" > mkldnn.commit && \ + mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \ + -DCPU_ONLY=ON \ + -DOpenCV_DIR=$INSTALL_PREFIX \ + -DBLAS=mkl \ + .. && \ + make -j${cores} && \ + make install && \ + cd .. && \ + cp -r external/mkl/mklml_lnx_2018.0.20170908/* $INSTALL_PREFIX && \ + cp -r external/mkldnn/install/* $INSTALL_PREFIX && \ + touch $BUILD_DIR/caffe.done \ + || { echo 'Installing caffe failed!' ; exit 1; } +fi + +if [[ $INSTALL_CAFFE == true ]] && [[ $USE_GPU == true ]] && \ + [[ ! -f $BUILD_DIR/caffe.done ]]; then + cd $BUILD_DIR + # Intel MKL + rm -fr mkl + mkdir mkl && \ + cd mkl && \ + wget http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/12414/l_mkl_2018.1.163.tgz && \ + tar -zxf l_mkl_2018.1.163.tgz && \ + cp $FILES_DIR/mkl/silent.cfg silent.cfg && \ + echo "PSET_INSTALL_DIR=$INSTALL_PREFIX/intel" >> silent.cfg && \ + cd l_mkl_2018.1.163 && \ + bash install.sh --cli-mode --silent ../silent.cfg + + cd $BUILD_DIR + # Caffe rc5 + rm -fr caffe + git clone -b rc5 https://github.com/BVLC/caffe && \ + cd caffe && cp $FILES_DIR/caffe/Makefile.config Makefile.config && \ + mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \ + -DCMAKE_PREFIX_PATH=$INSTALL_PREFIX \ + -DINTEL_ROOT=$INSTALL_PREFIX/intel \ + -DBLAS=mkl \ + -DCUDA_ARCH_NAME="Manual" \ + -DCUDA_ARCH_BIN="30 35 50 60 61" \ + -DCUDA_ARCH_PTX="30 35 50 60 61" \ + -DOpenCV_DIR=$INSTALL_PREFIX \ + .. 
&& \ + make -j${cores} && \ + make install && \ + touch $BUILD_DIR/caffe.done \ + || { echo 'Installing caffe failed!' ; exit 1; } +fi + +if [[ $INSTALL_OPENPOSE == true ]] && [[ ! -f $BUILD_DIR/openpose.done ]]; then + cd $BUILD_DIR + rm -rf openpose + git clone -b v1.2.0 https://github.com/CMU-Perceptual-Computing-Lab/openpose && \ + cd openpose && mkdir build && cd build && \ + cmake -D CMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \ + -D CMAKE_PREFIX_PATH=$INSTALL_PREFIX \ + -D OpenCV_DIR=$INSTALL_PREFIX \ + -D BUILD_CAFFE=OFF \ + -D Caffe_INCLUDE_DIRS=$CAFFE_DIR/include \ + -D Caffe_LIBS=$CAFFE_DIR/lib/libcaffe.so \ + -D BOOST_ROOT=$BOOST_DIR \ + -D BUILD_EXAMPLES=Off \ + -D BUILD_DOCS=Off \ + -D DOWNLOAD_COCO_MODEL=Off \ + -D DOWNLOAD_HAND_MODEL=Off \ + -D DOWNLOAD_FACE_MODEL=Off \ + .. && \ + make install -j${cores} && \ + touch $BUILD_DIR/openpose.done \ + || { echo 'Installing OpenPose failed!'; exit 1; } +fi + + + +DEP_FILE=$LOCAL_DIR/dependencies.txt +rm -f $DEP_FILE +echo "BOOST_DIR=$BOOST_DIR" >> $DEP_FILE +echo "FFMPEG_DIR=$FFMPEG_DIR" >> $DEP_FILE +echo "OpenCV_DIR=$OPENCV_DIR" >> $DEP_FILE +echo "PROTOBUF_DIR=$PROTOBUF_DIR" >> $DEP_FILE +echo "GRPC_DIR=$GRPC_DIR" >> $DEP_FILE +echo "CAFFE_DIR=$CAFFE_DIR" >> $DEP_FILE +echo "Halide_DIR=$HALIDE_DIR" >> $DEP_FILE +echo "Hwang_DIR=$HWANG_DIR" >> $DEP_FILE +echo "STOREHOUSE_DIR=$STOREHOUSE_DIR" >> $DEP_FILE +echo "TinyToml_DIR=$TINYTOML_DIR" >> $DEP_FILE + +echo "Done installing required dependencies!" +echo -n "Add $INSTALL_PREFIX/lib to your LD_LIBRARY_PATH and " +echo -n "add $INSTALL_PREFIX/bin to your PATH so the installed " +echo -n "dependencies can be found! " +echo "e.g. 
export LD_LIBRARY_PATH=$INSTALL_PREFIX/lib:\$LD_LIBRARY_PATH" +if [[ $INSTALL_OPENCV == true ]]; then + echo "Add $INSTALL_PREFIX/lib/python2.7/dist-packages to your PYTHONPATH to use OpenCV from Python" +fi +if [[ $INSTALL_CAFFE == true ]]; then + echo "Add $INSTALL_PREFIX/python to your PYTHONPATH to use Caffe from Python" +fi +#echo "Add $INSTALL_PREFIX/lib to your LD_LIBRARY_PATH" diff --git a/docker/Dockerfile.scanner b/docker/Dockerfile.scanner new file mode 100644 index 00000000..68697aa1 --- /dev/null +++ b/docker/Dockerfile.scanner @@ -0,0 +1,24 @@ +ARG tag=gpu +FROM scannerresearch/scanner-base:ubuntu16.04-${tag} +MAINTAINER Will Crichton "wcrichto@cs.stanford.edu" +ARG cores=1 +ARG gpu=ON + +ADD . /opt/scanner +WORKDIR /opt/scanner +ENV Caffe_DIR /usr/local +ENV LD_LIBRARY_PATH \ + "/usr/local/cuda/lib64:$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs" +RUN cd /opt/scanner && \ + (bash deps.sh --install-none --prefix /usr/local) && \ + mkdir build && cd build && \ + cmake -D BUILD_IMGPROC_OPS=ON \ + -D BUILD_CAFFE_OPS=ON \ + -D BUILD_OPENFACE_OPS=OFF \ + -D BUILD_TESTS=ON \ + -D BUILD_CUDA=${gpu} \ + .. && \ + make -j ${cores} && \ + cd .. 
&& rm -fr python/scannerpy/build && \ + python python/setup.py bdist_wheel && \ + pip install dist/scannerpy-0.1.13-py2-none-any.whl diff --git a/docker/build-all-base.sh b/docker/build-all-base.sh new file mode 100755 index 00000000..7a440c19 --- /dev/null +++ b/docker/build-all-base.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set -e + +NO_CACHE=false +CORES=$(nproc) + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +for dir in $DIR/*/ +do + base=`basename ${dir%*/}` + + cp $DIR/../deps.sh $dir/deps.sh + + base_tag=scannerresearch/scanner-base:$base + + function build { + docker build \ + --build-arg cores=$CORES \ + --build-arg base_tag=$base_tag \ + --no-cache=$NO_CACHE \ + -t scannerresearch/scanner-base:$2 \ + -f $dir/Dockerfile.$1 \ + $dir + } + + function push { + docker push scannerresearch/scanner-base:$1 + } + + build base $base + + build gpu $base-gpu + push $base-gpu + + build cpu $base-cpu + push $base-cpu +done diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 00000000..34f97cdf --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,10 @@ +version: "2" +services: + gpu: + image: scannerresearch/scanner:gpu + ports: + - "8888:8888" + cpu: + image: scannerresearch/scanner:cpu + ports: + - "8888:8888" diff --git a/docker/ubuntu16.04/Dockerfile.base b/docker/ubuntu16.04/Dockerfile.base new file mode 100644 index 00000000..c6def0dc --- /dev/null +++ b/docker/ubuntu16.04/Dockerfile.base @@ -0,0 +1,40 @@ +# Scanner base image for Ubuntu 16.04 + +FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 +MAINTAINER Will Crichton "wcrichto@cs.stanford.edu" +ARG cores=1 +ARG cpu_only=OFF + +# Apt-installable dependencies +RUN apt-get update && apt-get upgrade -y && \ + apt-get install -y software-properties-common && \ + add-apt-repository -y ppa:git-core/ppa && \ + add-apt-repository -y ppa:jonathonf/python-2.7 && \ + apt-get update && \ + apt-get install -y libssl-dev libcurl3-dev liblzma-dev libeigen3-dev \ + 
libgoogle-glog-dev libatlas-base-dev libsuitesparse-dev libgflags-dev \ + libx264-dev libopenjpeg-dev libxvidcore-dev \ + libpng-dev libjpeg-dev libbz2-dev git python-pip wget \ + libleveldb-dev libsnappy-dev libhdf5-serial-dev liblmdb-dev python-dev \ + llvm clang python-tk autoconf autogen libtool libtbb-dev libopenblas-dev \ + liblapacke-dev swig yasm python2.7 cpio curl unzip +ADD thirdparty/resources/cuda/libnvcuvid.so.367.48 /usr/lib/x86_64-linux-gnu/libnvcuvid.so +RUN ln -s /usr/local/cuda-8.0/targets/x86_64-linux/lib/stubs/libcuda.so \ + /usr/local/cuda-8.0/targets/x86_64-linux/lib/stubs/libcuda.so.1 +ENV CUDA_LIB_PATH /usr/local/cuda/lib64/stubs + +# Non-apt-installable dependencies +ENV deps /deps +WORKDIR ${deps} + +# CMake +RUN wget "https://cmake.org/files/v3.8/cmake-3.8.1.tar.gz" && \ + tar -xf cmake-3.8.1.tar.gz && cd ${deps}/cmake-3.8.1 && \ + ./bootstrap --parallel=${cores} && \ + make install -j${cores} && \ + rm -rf ${deps}/cmake-3.8.1.tar.gz ${deps}/cmake-3.8.1 + +# Python dependencies +WORKDIR /opt/scanner-base +ADD . . 
+RUN pip install --upgrade pip && pip install -r requirements.txt diff --git a/docker/ubuntu16.04/Dockerfile.cpu b/docker/ubuntu16.04/Dockerfile.cpu new file mode 100644 index 00000000..a2422f8b --- /dev/null +++ b/docker/ubuntu16.04/Dockerfile.cpu @@ -0,0 +1,14 @@ +# Scanner base CPU image for Ubuntu 16.04 + +ARG base_tag +FROM ${base_tag} +MAINTAINER Will Crichton "wcrichto@cs.stanford.edu" +ARG cores=1 + +RUN bash ./deps.sh --install-all --prefix /usr/local && \ + rm -rf /opt/scanner-base + +ENV PYTHONPATH /usr/local/python:${PYTHONPATH} +ENV PYTHONPATH /usr/local/lib/python2.7/site-packages:${PYTHONPATH} + +WORKDIR / diff --git a/docker/ubuntu16.04/Dockerfile.gpu b/docker/ubuntu16.04/Dockerfile.gpu new file mode 100644 index 00000000..6b88ddf5 --- /dev/null +++ b/docker/ubuntu16.04/Dockerfile.gpu @@ -0,0 +1,15 @@ +# Scanner base GPU image for Ubuntu 16.04 + +ARG base_tag +FROM ${base_tag} +MAINTAINER Will Crichton "wcrichto@cs.stanford.edu" +ARG cores=1 + +RUN bash ./deps.sh --install-all --prefix /usr/local --use-gpu && \ + rm -rf /opt/scanner-base + +ENV LD_LIBRARY_PATH /usr/local/intel/mkl/lib:${LD_LIBRARY_PATH} +ENV PYTHONPATH /usr/local/python:${PYTHONPATH} +ENV PYTHONPATH /usr/local/lib/python2.7/site-packages:${PYTHONPATH} + +WORKDIR / diff --git a/docker/ubuntu16.04/requirements.txt b/docker/ubuntu16.04/requirements.txt new file mode 100644 index 00000000..f22e2356 --- /dev/null +++ b/docker/ubuntu16.04/requirements.txt @@ -0,0 +1,18 @@ +ipython==5.3.0 +numpy==1.12.0 +protobuf==3.2.0 +toml==0.9.2 +youtube-dl +scipy==0.18.1 +scikit-learn==0.18.1 +scikit-image==0.12.3 +enum34==1.1.6 +matplotlib==2.0.0 +seaborn==0.7.1 +grpcio==1.7.0 +doxypypy==0.8.8.6 +pytest==3.0.6 +twine==1.8.1 +ipaddress==1.0.18 +plotly==2.0.6 +jupyter==1.0.0 diff --git a/docker/ubuntu16.04/thirdparty/resources/caffe/Makefile.config b/docker/ubuntu16.04/thirdparty/resources/caffe/Makefile.config new file mode 100644 index 00000000..48dd599d --- /dev/null +++ 
b/docker/ubuntu16.04/thirdparty/resources/caffe/Makefile.config @@ -0,0 +1,112 @@ +## Refer to http://caffe.berkeleyvision.org/installation.html +# Contributions simplifying and improving our build system are welcome! + +# cuDNN acceleration switch (uncomment to build with cuDNN). +USE_CUDNN := 1 + +# CPU-only switch (uncomment to build without GPU support). +# CPU_ONLY := 1 + +# uncomment to disable IO dependencies and corresponding data layers +# USE_OPENCV := 0 +# USE_LEVELDB := 0 +# USE_LMDB := 0 + +# uncomment to allow MDB_NOLOCK when reading LMDB files (only if necessary) +# You should not set this flag if you will be reading LMDBs with any +# possibility of simultaneous read and write +# ALLOW_LMDB_NOLOCK := 1 + +# Uncomment if you're using OpenCV 3 +OPENCV_VERSION := 3 + +# To customize your choice of compiler, uncomment and set the following. +# N.B. the default for Linux is g++ and the default for OSX is clang++ +# CUSTOM_CXX := g++ + +# CUDA directory contains bin/ and lib/ directories that we need. +CUDA_DIR := /usr/local/cuda +# On Ubuntu 14.04, if cuda tools are installed via +# "sudo apt-get install nvidia-cuda-toolkit" then use this instead: +# CUDA_DIR := /usr + +# CUDA architecture setting: going with all of them. +# For CUDA < 6.0, comment the *_50 lines for compatibility. +CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ + -gencode arch=compute_20,code=sm_21 \ + -gencode arch=compute_30,code=sm_30 \ + -gencode arch=compute_35,code=sm_35 \ + -gencode arch=compute_50,code=sm_50 \ + -gencode arch=compute_50,code=compute_50 + +# BLAS choice: +# atlas for ATLAS (default) +# mkl for MKL +# open for OpenBlas +BLAS := atlas +# Custom (MKL/ATLAS/OpenBLAS) include and lib directories. +# Leave commented to accept the defaults for your choice of BLAS +# (which should work)! 
+# BLAS_INCLUDE := /path/to/your/blas +# BLAS_LIB := /path/to/your/blas + +# Homebrew puts openblas in a directory that is not on the standard search path +# BLAS_INCLUDE := $(shell brew --prefix openblas)/include +# BLAS_LIB := $(shell brew --prefix openblas)/lib + +# This is required only if you will compile the matlab interface. +# MATLAB directory should contain the mex binary in /bin. +# MATLAB_DIR := /usr/local +# MATLAB_DIR := /Applications/MATLAB_R2012b.app + +# NOTE: this is required only if you will compile the python interface. +# We need to be able to find Python.h and numpy/arrayobject.h. +PYTHON_INCLUDE := /usr/include/python2.7 \ + /usr/lib/python2.7/dist-packages/numpy/core/include +# Anaconda Python distribution is quite popular. Include path: +# Verify anaconda location, sometimes it's in root. +# ANACONDA_HOME := $(HOME)/anaconda +# PYTHON_INCLUDE := $(ANACONDA_HOME)/include \ + # $(ANACONDA_HOME)/include/python2.7 \ + # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ + +# Uncomment to use Python 3 (default is Python 2) +# PYTHON_LIBRARIES := boost_python3 python3.5m +# PYTHON_INCLUDE := /usr/include/python3.5m \ +# /usr/lib/python3.5/dist-packages/numpy/core/include + +# We need to be able to find libpythonX.X.so or .dylib. +PYTHON_LIB := /usr/lib +# PYTHON_LIB := $(ANACONDA_HOME)/lib + +# Homebrew installs numpy in a non standard path (keg only) +# PYTHON_INCLUDE += $(dir $(shell python -c 'import numpy.core; print(numpy.core.__file__)'))/include +# PYTHON_LIB += $(shell brew --prefix numpy)/lib + +# Uncomment to support layers written in Python (will link against Python libs) +# WITH_PYTHON_LAYER := 1 + +# Whatever else you find you need goes here. 
+INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include +LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib + +# If Homebrew is installed at a non standard location (for example your home directory) and you use it for general dependencies +# INCLUDE_DIRS += $(shell brew --prefix)/include +# LIBRARY_DIRS += $(shell brew --prefix)/lib + +# Uncomment to use `pkg-config` to specify OpenCV library paths. +# (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.) +# USE_PKG_CONFIG := 1 + +# N.B. both build and distribute dirs are cleared on `make clean` +BUILD_DIR := build +DISTRIBUTE_DIR := distribute + +# Uncomment for debugging. Does not work on OSX due to https://github.com/BVLC/caffe/issues/171 +# DEBUG := 1 + +# The ID of the GPU that 'make runtest' will use to run unit tests. +TEST_GPUID := 0 + +# enable pretty build (comment to see full commands) +Q ?= @ diff --git a/docker/ubuntu16.04/thirdparty/resources/cuda/libnvcuvid.so.367.48 b/docker/ubuntu16.04/thirdparty/resources/cuda/libnvcuvid.so.367.48 new file mode 100644 index 00000000..69fa92a2 Binary files /dev/null and b/docker/ubuntu16.04/thirdparty/resources/cuda/libnvcuvid.so.367.48 differ diff --git a/docker/ubuntu16.04/thirdparty/resources/mkl/silent.cfg b/docker/ubuntu16.04/thirdparty/resources/mkl/silent.cfg new file mode 100755 index 00000000..ec4faa35 --- /dev/null +++ b/docker/ubuntu16.04/thirdparty/resources/mkl/silent.cfg @@ -0,0 +1,37 @@ +# Patterns used to check silent configuration file +# +# anythingpat - any string +# filepat - the file location pattern (/file/location/to/license.lic) +# lspat - the license server address pattern (0123@hostname) +# snpat - the serial number pattern (ABCD-01234567) + +# Accept EULA, valid values are: {accept, decline} +ACCEPT_EULA=accept + +# Optional error behavior, valid values are: {yes, no} +CONTINUE_WITH_OPTIONAL_ERROR=yes + +# Install location, valid values are: {/opt/intel, filepat} 
+PSET_INSTALL_DIR=/opt/intel + +# Continue with overwrite of existing installation directory, valid values are: {yes, no} +CONTINUE_WITH_INSTALLDIR_OVERWRITE=yes + +# List of components to install, valid values are: {ALL, DEFAULTS, anythingpat} +COMPONENTS=DEFAULTS + +# Installation mode, valid values are: {install, repair, uninstall} +PSET_MODE=install + +# Directory for non-RPM database, valid values are: {filepat} +#NONRPM_DB_DIR=filepat + +# Path to the cluster description file, valid values are: {filepat} +#CLUSTER_INSTALL_MACHINES_FILE=filepat + +# Perform validation of digital signatures of RPM files, valid values are: {yes, no} +SIGNING_ENABLED=yes + +# Select target architecture of your applications, valid values are: {IA32, INTEL64, ALL} +ARCH_SELECTED=ALL + diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt deleted file mode 100644 index c01ac803..00000000 --- a/examples/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(simple) diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 00000000..5f3021ee --- /dev/null +++ b/examples/README.md @@ -0,0 +1,35 @@ +# Scanner examples + +This directory contains a number of simple examples and full applications that +show you how to use Scanner. We recommend starting with the +[tutorial](https://github.com/scanner-research/scanner/blob/master/examples/tutorial). + +## Tutorials +* [Walkthrough.ipynb](https://github.com/scanner-research/scanner/blob/master/examples/Walkthrough.ipynb): an IPython notebook that goes through a simple application (shot detection) using Scanner. +* [tutorial](https://github.com/scanner-research/scanner/blob/master/examples/tutorial): a set of well-commented files exploring different Scanner features in code. 
+ +If you want to run the notebook yourself so that you can interactively edit the +code, run: + +```bash +cd path/to/your/scanner/directory/ +cd examples +jupyter notebook --allow-root --ip=0.0.0.0 --port=8888 +``` + +Then in your browser, go to [http://localhost:8888/notebooks/Walkthrough.ipynb](http://localhost:8888/notebooks/Walkthrough.ipynb) and copy in the token from the console logs. Follow the instructions in the Jupyter notebook. + +## Applications + +* [face_detection](https://github.com/scanner-research/scanner/blob/master/examples/apps/face_detection): Location and recognizing faces in a video. +* [shot_detection](https://github.com/scanner-research/scanner/blob/master/examples/apps/shot_detection): Segmenting a video into shots. Same application as the walkthrough. +* [reverse_image_search](https://github.com/scanner-research/scanner/blob/master/examples/apps/reverse_image_search): Searching through a video by image. +* [depth_from_stereo](https://github.com/scanner-research/scanner/blob/master/examples/apps/depth_from_stereo): Computing a per-pixel depth image from two views of the same location. +* [hyperlapse](https://github.com/scanner-research/scanner/blob/master/examples/apps/hyperlapse): Creating more stable timelapse videos with the [Hyperlapse](https://www.microsoft.com/en-us/research/publication/real-time-hyperlapse-creation-via-optimal-frame-selection/) algorithm. +* [optical_flow](https://github.com/scanner-research/scanner/blob/master/examples/apps/optical_flow): Using OpenCV to compute flow fields within a video. + +## How-Tos +* [tensorflow](https://github.com/scanner-research/scanner/blob/master/examples/how-tos/tensorflow): Integrating TensorFlow networks into Scanner. +* [caffe](https://github.com/scanner-research/scanner/blob/master/examples/how-tos/caffe): How to use different Caffe nets in Scanner. 
+* [python_kernel](https://github.com/scanner-research/scanner/blob/master/examples/how-tos/python_kernel): How to create python kernels to use in scanner +* [halide](https://github.com/scanner-research/scanner/blob/master/examples/how-tos/halide): Integrating Halide kernels into Scanner. diff --git a/examples/Walkthrough.ipynb b/examples/Walkthrough.ipynb new file mode 100644 index 00000000..e0498a5c --- /dev/null +++ b/examples/Walkthrough.ipynb @@ -0,0 +1,282 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scanner walkthrough\n", + "\n", + "To explore how Scanner works, we're going to walk through a simple video analysis application. If you want to analyze a film, a common unit of analysis is the _shot_, short segments of video often delineated by the camera cutting to a different angle or location. In this walkthrough, we're going to use Scanner to implement _shot segmentation_, or breaking up a video into shots. To start, we need to get a video. We'll use a scene from Moana:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "We've set up some scripts to help you download the video in the snippet below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import util\n", + "path = util.download_video()\n", + "print path\n", + "\n", + "# Read all the frames\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import cv2\n", + "from timeit import default_timer as now\n", + "\n", + "start = now()\n", + "video = cv2.VideoCapture(path)\n", + "frames = []\n", + "while True:\n", + " ret, frame = video.read()\n", + " if not ret: break\n", + " frames.append(frame)\n", + "video.release()\n", + "print 'Time to read frames: {:.3f}s'.format(now() - start)\n", + "\n", + "# Display the tenth frame \n", + "plt.imshow(cv2.cvtColor(frames[10], cv2.COLOR_RGB2BGR))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Take another look at the video and see if you can identify when shots change. Our shot segmentation algorithm uses the following intuition: in a video, most frames are similar to the one following it. Because most shot changes happen with cuts (as opposed to dissolves or fades), there's an immediate visual break from one frame to the next. We want to identify when the change in visual content between two adjacent frames is substantially larger than normal. One way to estimate change in visual content is by computing a histogram of colors for each frame, i.e. count the number of dark pixels and light pixels in each color channel (red/green/blue), and then compute the magnitude of difference between adjacent frames' histograms. 
Let's visualize this for the above video:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from scipy.spatial import distance\n", + "\n", + "histograms = []\n", + "N = len(frames)\n", + "\n", + "# Compute 3 color histograms (one for each channel) for each video frame\n", + "start = now()\n", + "for frame in frames:\n", + " hists = [cv2.calcHist([frame], [channel], None, [16], [0, 256]) \n", + " for channel in range(3)]\n", + " histograms.append(hists)\n", + "print 'Time to compute histograms: {:.3f}s'.format(now() - start)\n", + "\n", + "# Compute differences between adjacent pairs of histograms\n", + "def compute_histogram_diffs(histograms): \n", + " diffs = [] \n", + " for i in range(1, N):\n", + " frame_diffs = [distance.chebyshev(histograms[i-1][channel], histograms[i][channel]) \n", + " for channel in range(3)]\n", + " avg_diff = np.mean(frame_diffs)\n", + " diffs.append(avg_diff)\n", + " return diffs\n", + " \n", + "diffs = compute_histogram_diffs(histograms)\n", + "\n", + "# Plot the differences\n", + "plt.rcParams[\"figure.figsize\"] = [16, 9]\n", + "plt.xlabel(\"Frame number\")\n", + "plt.ylabel(\"Difference from previous frame\")\n", + "plt.plot(range(1, N), diffs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This plot shows, for each frame, the difference between its color histograms and the previous frame's color histograms. Try playing around with the number of histogram bins as well as the [distance metric](https://docs.scipy.org/doc/scipy/reference/spatial.distance.html). As you can see, there are a number of sharp peaks interspersed throughout the video that likely correspond to shot boundaries. 
We can run a sliding window over the above graph to find the peaks:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "WINDOW_SIZE = 500 # The size of our sliding window (how many data points to include)\n", + "OUTLIER_STDDEV = 3 # Outliers are N standard deviations away from the mean of the sliding window\n", + "\n", + "def find_shot_boundaries(diffs):\n", + " boundaries = []\n", + " for i in range(1, N):\n", + " window = diffs[max(i-WINDOW_SIZE,0):min(i+WINDOW_SIZE,N)]\n", + " if diffs[i-1] - np.mean(window) > OUTLIER_STDDEV * np.std(window):\n", + " boundaries.append(i)\n", + " return boundaries\n", + "\n", + "boundaries = find_shot_boundaries(diffs) \n", + "\n", + "print 'Shot boundaries are:'\n", + "print boundaries\n", + "\n", + "from scannerpy.stdlib.montage import make_montage\n", + "montage = make_montage(len(boundaries), iter([frames[i] for i in boundaries]))\n", + "plt.imshow(cv2.cvtColor(montage, cv2.COLOR_RGB2BGR))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we've done it! The video is now segmented in shots. At this point, you're probably wondering: \"...but I thought this was a Scanner tutorial!\" Well, consider now: what if you wanted to run this pipeline over a second trailer? A movie? A thousand movies? The simple Python code we wrote above is great for experimenting, but doesn't scale. To accelerate this analysis, we need to speed up the core computation, computing the color histogram. Here are some ways we can make that faster:\n", + "\n", + "* Use a faster histogram implementation, e.g. using the GPU.\n", + "* Use a faster video decoder, e.g. the hardware decoder.\n", + "* Parallelize the histogram pipeline on multiple CPUs or GPUs.\n", + "* Parallelize the histogram pipeline across a cluster of machines.\n", + "\n", + "All of that is fairly difficult to do with Python, but easy with Scanner. 
\n", + "\n", + "Now I'm going to walk you through running the histogram computation in Scanner. First, we start by setting up our Scanner database." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from scannerpy import Database, DeviceType, Job, BulkJob\n", + "\n", + "db = Database()\n", + "# If you have a cluster, you can specify: Database(master='localhost:5001', workers=['worker1:5002', 'worker2:5002'...])\n", + "# By default, the database will only run on the local machine.\n", + "[input_table], _ = db.ingest_videos([('example', path)], force=True)\n", + "print db.summarize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Scanner, all data is organized into tables, just like a database. Videos are represented as tables where each frame is a row. You can see in the summary that the `example` table has two columns, `index` like the `id` field in a SQL database and `frame` which represents the raw RGB pixels. Scanner columns have one of two types: either it's a frame (i.e. image or video), or it's a byte array.\n", + "\n", + "To create a video table, you run `db.ingest_videos(list of (table name, path) pairs)`. Each table has a name specified the first element in the ingested pairs. By default, `ingest_videos` will raise a `ScannerException` if you attempt to ingest over an existing table, but `force=True` will allow such behavior. Next, we want to compute the histogram over each frame." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "frame = db.ops.FrameInput()\n", + "histogram = db.ops.Histogram(\n", + " frame = frame,\n", + " device = DeviceType.GPU) # Change this to DeviceType.CPU if you don't have a GPU\n", + "output = db.ops.Output(columns = [histogram])\n", + "job = Job(op_args = {\n", + " frame: db.table('example').column('frame'),\n", + " output: 'example_hist'\n", + "})\n", + "bulk_job = BulkJob(output = output, jobs = [job])\n", + "\n", + "start = now()\n", + "[output_table] = db.run(bulk_job, force=True) \n", + "print 'Time to decode + compute histograms: {:.3f}'.format(now() - start)\n", + "print db.summarize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Computations in Scanner are defined in a *data-parallel* manner--that is, you write a computation that takes in one (or a few) frames at a time, and then the Scanner runtime runs your computation in parallel across your video. Here, we define a computation that computes a color histogram for each frame in the video. This is done by defining a series of \"ops\" (operators, similar to TensorFlow):\n", + "1. The `FrameInput` op represents a single frame, the input to our computation. This will be drawn from a video.\n", + "2. `Histogram` is an op that computes a color histogram over the input `frame`. We specify that it should run on the CPU.\n", + "3. `Output` represents the final output of our computation, the data that will get written back to our database, in this case a table with a single column containing the histogram for each frame of the input table.\n", + "\n", + "Once we define a computation, then a `Job` provides parameters to the computation, here saying which table the frames should be drawn from (`example`), and what the name of the output table should be (`example_hist`). 
Lastly, we define a `BulkJob` with containing our job and the output node (\"bulk\" because we can run many jobs in parallel), and then tell the database to run those jobs.\n", + "\n", + "After a job is complete, we want to load the results of our computation into Python for further processing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scannerpy.stdlib import parsers\n", + "from pprint import pprint\n", + "rows = output_table.load(['histogram'], parsers.histograms)\n", + "histograms = [h for _, h in rows]\n", + "\n", + "# Run the same shot detection pipeline as before\n", + "diffs = compute_histogram_diffs(histograms)\n", + "boundaries = find_shot_boundaries(diffs)\n", + "montage = make_montage(len(boundaries), iter([frames[i] for i in boundaries]))\n", + "plt.imshow(cv2.cvtColor(montage, cv2.COLOR_RGB2BGR))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because Scanner does not have a built-in type system, loading columns from a table requires a parsing function that converts between raw byte strings and Python-understandable data types. For example, the `parsers.histograms` parser function returns a nested Numpy array of the three histograms. Once you've parsed the table column, that's it! You now have your Scanner computation results loaded and ready to go.\n", + "\n", + "Let's reflect for a moment on the script we just made. Is it any faster than before? Going back to our four bullet points:\n", + "\n", + "* Scanner will run your computation on the GPU (`device=DeviceType.GPU`).\n", + "* Scanner will use accelerated hardware video decode behind the scenes.\n", + "* Scanner will automatically run on all of your CPU cores and on multiple GPUs.\n", + "* Scanner will automatically distribute the work across a cluster.\n", + "\n", + "That's what you get for free using Scanner for your video analyses. 
All of the code for organizing, distributing, and decoding your videos is taken care of by the Scanner runtime. As an exercise, download a long video like a movie and try running both our Python histogram pipeline and the Scanner pipeline. You'll likely notice a substantial difference!\n", + "\n", + "So, where should you go from here? I would check out:\n", + "* [Extended tutorial](https://github.com/scanner-research/scanner/tree/master/examples/tutorial): covers more Scanner features like sampling patterns and building custom ops.\n", + "* [Code examples](https://github.com/scanner-research/scanner/tree/master/examples): other applications like face detection and reverse image search implemented with Scanner." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/apps/depth_from_stereo/main.py b/examples/apps/depth_from_stereo/main.py new file mode 100644 index 00000000..1b00d8a8 --- /dev/null +++ b/examples/apps/depth_from_stereo/main.py @@ -0,0 +1,272 @@ +from scannerpy import Database, DeviceType +from scannerpy.stdlib import NetDescriptor, parsers, bboxes +import scipy.misc +import numpy as np +import cv2 +import sys +import random +import json +import time +import os +import os.path +import struct + +def write_dmb_file(path, image): + with open(path, 'wb') as f: + # type + f.write(struct.pack('i', 1)) # type + # height + f.write(struct.pack('i', image.shape[0])) + # width + f.write(struct.pack('i', image.shape[1])) + # channels + if len(image.shape) > 2: + f.write(struct.pack('i', image.shape[2])) + else: + f.write(struct.pack('i', 1)) + f.write(image.tobytes()) + + +def 
make_p_matrices(calib): + cameras = calib['cameras'] + p_matrices = {} + for cam in cameras: + K = np.array(cam['K']) + R = np.array(cam['R']) + t = np.array(cam['t']) + p = K.dot(np.hstack((R, t))) + p_matrices[(cam['panel'], cam['node'])] = p + return p_matrices + + +def main(): + with open('/n/scanner/apoms/panoptic/160422_mafia2/calibration_160422_mafia2.json', 'r') as f: + calib = json.load(f) + + p_matrices = make_p_matrices(calib) + dataset = '160422_haggling1' + template_path = '/n/scanner/apoms/panoptic/' + dataset + '/vgaVideos/vga_{:02d}_{:02d}.mp4' + i = 0 + video_paths = [] + table_idx = {} + for p in range(1, 21): + for c in range(1, 25): + video_paths.append(template_path.format(p, c)) + table_idx[(p, c)] = i + i += 1 + + with Database(debug=True) as db: + # Ingest + if False: + #collection, _ = db.ingest_video_collection(dataset, video_paths, + # force=True) + collection = db.collection(dataset) + + # Setup tables with calibration data + calibration_table_names = [] + columns = ['P'] + for p in range(1, 21): + for c in range(1, 25): + table_name = 'calibration_{:02d}_{:02d}'.format(p, c) + num_rows = collection.tables(len(calibration_table_names)).num_rows() + cam = db.protobufs.Camera() + if (p == 14 and c == 18) or num_rows == 0: + rows = [[cam.SerializeToString()]] + db.new_table(table_name, columns, rows, force=True) + calibration_table_names.append(table_name) + continue + P = p_matrices[(p, c)] + for i in range(3): + for j in range(4): + cam.p.append(P[i, j]) + rows = [] + for i in range(num_rows): + rows.append([cam.SerializeToString()]) + print(table_name) + db.new_table(table_name, columns, rows, force=True) + calibration_table_names.append(table_name) + calib_collection = db.new_collection(dataset + '_calibration', + calibration_table_names, + force=True) + + collection = db.collection(dataset) + calib_collection = db.collection(dataset + '_calibration') + + gipuma_args = db.protobufs.GipumaArgs() + gipuma_args.min_disparity = 0 + 
gipuma_args.max_disparity = 384 + gipuma_args.min_depth = 30 + gipuma_args.max_depth = 500 + gipuma_args.iterations = 8 + gipuma_args.kernel_width = 19 + gipuma_args.kernel_height = 19 + + columns = [] + camera_groups_length = 4 + for i in range(camera_groups_length): + columns += ["frame" + str(i), "fi" + str(i), "calib" + str(i)] + input_op = db.ops.Input(["index"] + columns) + op = db.ops.Gipuma( + inputs=[(input_op, columns)], + args=gipuma_args, device=DeviceType.GPU) + + tasks = [] + + start_frame = 4300 + end_frame = 4302 + item_size = 64 + sampler_args = db.protobufs.StridedRangeSamplerArgs() + sampler_args.stride = 1 + start = start_frame + end = end_frame + while start < end: + sampler_args.warmup_starts.append(start) + sampler_args.starts.append(start) + sampler_args.ends.append(min(start + item_size, end)) + start += item_size + + camera_groups = [ + [(1, 1), (1, 2), (5, 1), (16, 13)], + # [(3, 1), (3, 3), (5, 3), (1, 6)], + # [(4, 2), (1, 3), (5, 3), (3, 3)], + # [(7, 4), (7, 8), (6, 3), (8, 3)], + # [(10, 4), (9, 3), (10, 3), (11, 3)], + # [(13, 8), (13, 10), (12, 8), (14, 20)], + # [(16, 4), (16, 16), (15, 2), (16, 8)], + ] + for group in camera_groups: + first_idx = table_idx[group[0]] + print(first_idx) + + first_table = collection.tables(first_idx) + first_calib_table = calib_collection.tables(first_idx) + + task = db.protobufs.Task() + task.output_table_name = 'disparity_{:02d}_{:02d}'.format( + group[0][0], group[0][1]) + column_names = [c.name() for c in first_table.columns()] + + # Load frames + sample = task.samples.add() + sample.table_name = first_table.name() + sample.column_names.extend(column_names) + sample.sampling_function = "StridedRange" + sample.sampling_args = sampler_args.SerializeToString() + + # Load calibration + sample = task.samples.add() + sample.table_name = first_calib_table.name() + sample.column_names.extend(['P']) + sample.sampling_function = "StridedRange" + sample.sampling_args = sampler_args.SerializeToString() + + 
for c, p in group[1:]: + idx = table_idx[(c, p)] + + print(idx) + table = collection.tables(idx) + calib_table = calib_collection.tables(idx) + + sample = task.samples.add() + sample.table_name = table.name() + sample.column_names.extend(["frame", "frame_info"]) + sample.sampling_function = "StridedRange" + sample.sampling_args = sampler_args.SerializeToString() + + sample = task.samples.add() + sample.table_name = calib_table.name() + sample.column_names.extend(['P']) + sample.sampling_function = "StridedRange" + sample.sampling_args = sampler_args.SerializeToString() + + tasks.append(task) + + # Output data for fusibile + top_folder = 'gipuma_results/' + frame_folder = top_folder + '{:08d}/' + images_folder = frame_folder + 'images/' + image_name = '{:03d}.png' + image_path = images_folder + image_name + krt_path = images_folder + 'cam.txt' + results_folder = frame_folder + 'results/' + cam_results_folder = results_folder + '2hat_cam_{:03d}/' + normals_path = cam_results_folder + 'normals.dmb' + depth_path = cam_results_folder + 'disp.dmb' + + output_tables = db.run(tasks, op, pipeline_instances_per_node=4, force=True) + + # Export data directory corresponding to image files + # for i, table in enumerate(collection.tables()): + # for fi, tup in table.load(['frame'], rows=range(start_frame, + # end_frame)): + # if not os.path.exists(images_folder.format(fi)): + # os.makedirs(images_folder.format(fi)) + # img = tup[0] + # cv2.imwrite(image_path.format(fi, i), img) + # Export camera calibration params file (krt_file) + for fi in range(end_frame - start_frame): + with open(krt_path.format(fi), 'w') as f: + f.write(str(479) + '\n') + i = -1 + offset = 0 + cameras = calib['cameras'] + for p in range(1, 21): + for c in range(1, 25): + i += 1 + if p == 14 and c == 18: + continue + f.write(image_name.format(i) + ' ') + cam = cameras[offset] + K = cam['K'] + for n in [item for sublist in K for item in sublist]: + f.write(str(n) + ' ') + R = cam['R'] + for n in [item for 
sublist in R for item in sublist]: + f.write(str(n) + ' ') + t = cam['t'] + for n in [item for sublist in t for item in sublist]: + f.write(str(n) + ' ') + f.write('\n') + offset += 1 + + # Export normals and depth dmb files + for i, table in enumerate(output_tables): + for fi, tup in table.load(['points', 'cost']): + if not os.path.exists(cam_results_folder.format(fi, i)): + os.makedirs(cam_results_folder.format(fi, i)) + points = np.frombuffer(tup[0], dtype=np.float32).reshape(480, 640, 4) + cost = np.frombuffer(tup[1], dtype=np.float32).reshape(480, 640, 1) + avg = np.median(cost[:]) + mask = np.where(cost > avg) + print(len(mask)) + print + + depth_img = points[:,:,3].copy() + depth_img[mask[0], mask[1]] = 0 + write_dmb_file(depth_path.format(fi, i), depth_img) + + normal_img = points[:,:,0:3].copy() + normal_img[mask[0],mask[1],:] = 0 + write_dmb_file(normals_path.format(fi, i), normal_img) + #scipy.misc.toimage(depth_img).save('depth{:05d}_01_01.png'.format(fi)) + + # For visualizing depth maps + if False: + disparity_table = db.table('disparity_01_01') + for fi, tup in disparity_table.load(['points']): + points = np.frombuffer(tup[0], dtype=np.float32).reshape(480, 640, 1) + avg = np.median(points[:]) + depth_img = points[:,:,0].copy() + depth_img[np.where(depth_img > avg)] = avg * 10 + print('avg', avg) + scipy.misc.toimage(depth_img).save('cost{:05d}_01_01.png'.format(fi)) + + disparity_table = db.table('disparity_03_01') + for fi, tup in disparity_table.load(['points']): + points = np.frombuffer(tup[0], dtype=np.float32).reshape(480, 640, 1) + depth_img = points[:,:,0] + scipy.misc.toimage(depth_img).save('cost{:05d}_03_01.png'.format(fi)) + + +if __name__ == "__main__": + main() diff --git a/examples/apps/face_detection/main.py b/examples/apps/face_detection/main.py new file mode 100644 index 00000000..23014a6d --- /dev/null +++ b/examples/apps/face_detection/main.py @@ -0,0 +1,46 @@ +from scannerpy import Database, DeviceType, Job, BulkJob +from 
scannerpy.stdlib import pipelines +import subprocess +import cv2 +import sys +import os.path +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../..') +import util + +if len(sys.argv) <= 1: + print('Usage: main.py ') + exit(1) + +movie_path = sys.argv[1] +print('Detecting faces in movie {}'.format(movie_path)) +movie_name = os.path.splitext(os.path.basename(movie_path))[0] + +with Database() as db: + print('Ingesting video into Scanner ...') + [input_table], _ = db.ingest_videos( + [(movie_name, movie_path)], force=True) + + sampler = db.sampler.all() + + print('Detecting faces...') + [bboxes_table] = pipelines.detect_faces( + db, [input_table.column('frame')], sampler, + movie_name + '_bboxes') + + print('Drawing faces onto video...') + frame = db.ops.FrameInput() + sampled_frame = frame.sample() + bboxes = db.ops.Input() + out_frame = db.ops.DrawBox(frame = sampled_frame, bboxes = bboxes) + output = db.ops.Output(columns=[out_frame]) + job = Job(op_args={ + frame: input_table.column('frame'), + sampled_frame: sampler, + bboxes: bboxes_table.column('bboxes'), + output: movie_name + '_bboxes_overlay', + }) + bulk_job = BulkJob(output=output, jobs=[job]) + [out_table] = db.run(bulk_job, force=True) + out_table.column('frame').save_mp4(movie_name + '_faces') + + print('Successfully generated {:s}_faces.mp4'.format(movie_name)) diff --git a/examples/apps/hyperlapse/main.py b/examples/apps/hyperlapse/main.py new file mode 100644 index 00000000..cf3d5627 --- /dev/null +++ b/examples/apps/hyperlapse/main.py @@ -0,0 +1,135 @@ +from scannerpy import Database, DeviceType, Job +from scannerpy.stdlib import parsers, video +import numpy as np +import math +from scipy import sparse +import matplotlib.pyplot as plt +import cv2 +from timeit import default_timer as now + +class Constants: + def __init__(self, iw, ih, T): + self.iw = iw + self.ih = ih + self.T =T + self.d = math.floor(math.sqrt(self.ih**2 + self.iw**2)) + self.tau_c = 0.1 * self.d + self.gamma = 0.5 * 
self.d + + w = 32 + g = 4 + lam_s = 200 + lam_a = 80 + # lam_s = .01 + # lam_a = .01 + tau_s = 200 + tau_a = 200 + + # Speedup should be user defined + v = 12 + +with Database(debug=True) as db: + def create_database(): + db.ingest_videos([('example', '/n/scanner/datasets/hyperlapse/long.mp4')], + force=True) + + def extract_features(): + frame = db.table('example').as_op().range(0, 1000, item_size=100) + features, keypoints = db.ops.FeatureExtractor( + frame = frame, + feature_type = db.protobufs.SURF, + device = DeviceType.GPU) + job = Job(columns = [features, keypoints], name = 'example_surf') + db.run(job, force = True) + + def compute_matches(): + features, keypoints = db.table('example_surf').as_op().all(item_size=100) + frame = db.table('example').as_op().range(0, 1000, item_size=100) + frame_info = db.ops.InfoFromFrame(frame = frame, device = DeviceType.GPU) + cost_matrix = db.ops.FeatureMatcher( + features = features, keypoints = keypoints, frame_info = frame_info, + stencil = range(0, 32), + device = DeviceType.GPU) + job = Job(columns = [cost_matrix], name = 'example_matches') + db.run(job, force = True) + + def build_path(): + matches = db.table('example_matches') + T = matches.num_rows() + + C = Constants(1080, 1920, T) + Cm = np.zeros((C.T+1, C.T+1)) + # Cm = sparse.eye(C.T+1, C.T+1, format='lil') + + rows = matches.load(['cost_matrix'], parsers.array(np.float32)) + for i, row in rows: + l = min(len(row), C.T+1 - (i+2+C.w)) + if l == 0: break + Cm[i+1, (i+2):(i+2+l)] = row[:l] + + def vel_cost(i, j): + return min(((j - i) - C.v) ** 2, C.tau_s) + + def acc_cost(h, i, j): + return min(((j - i) - (i - h)) ** 2, C.tau_a) + + Dv = np.zeros((C.T+1, C.T+1)) + Tv = np.zeros((C.T+1, C.T+1), dtype=np.int32) + + # Initialization + for i in range(1, C.g+1): + for j in range(i+1, i+C.w+1): + Dv[i,j] = Cm[i,j] + C.lam_s * vel_cost(i, j) + + # First pass: populate Dv + for i in range(C.g, C.T+1): + for j in range(i+1, min(i+C.w+1, C.T+1)): + c = Cm[i,j] + C.lam_s * 
vel_cost(i, j) + a = [Dv[i-k,i] + C.lam_a * acc_cost(i-k, i, j) + for k in range(1, C.w+1)] + Dv[i,j] = c + min(a) + Tv[i,j] = int(i - (np.argmin(a) + 1)) + + # Second pass: trace back min cost path + s = 0 + d = 0 + dmin = float("inf") + for i in range(C.T-C.g, C.T+1): + for j in range(i+1, min(i+C.w+1, C.T+1)): + if Dv[i,j] < dmin: + dmin = Dv[i,j] + s = i + d = j + + path = [d] + while s > C.g: + path.insert(0, s) + b = Tv[s, d] + d = s + s = b + + print path + + return path + + def encode_video(path): + frames = list(db.table('example').load(['frame'])) + video.write_video( + 'hyperlapse.mkv', + [f[0] for i, f in frames if i-1 in path], + fps=12.0) + # video.write_video( + # 'timelapse.mkv', + # [f[0] for i, f in frames if i % 12 == 0], + # fps=12.0) + + + # create_database() + t = now() + extract_features() + print 'extract: {:.3f}'.format(now() - t) + t = now() + compute_matches() + print 'match: {:.3f}'.format(now() - t) + # path = build_path() + # encode_video(path) diff --git a/examples/apps/object_detection/main.py b/examples/apps/object_detection/main.py new file mode 100644 index 00000000..b76476af --- /dev/null +++ b/examples/apps/object_detection/main.py @@ -0,0 +1,89 @@ +from scannerpy import Database, DeviceType, Job, BulkJob +from scannerpy.stdlib import NetDescriptor, parsers, bboxes +import math +import os +import subprocess +import cv2 +import sys +import os.path +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../..') +import util +import numpy as np + +util.download_video() + +with Database() as db: + video_path = util.download_video() + if True or not db.has_table('example'): + print('Ingesting video into Scanner ...') + db.ingest_videos([('example', video_path)], force=True) + + input_table = db.table('example') + + descriptor = NetDescriptor.from_file(db, 'nets/faster_rcnn_coco.toml') + caffe_args = db.protobufs.CaffeArgs() + caffe_args.net_descriptor.CopyFrom(descriptor.as_proto()) + caffe_args.batch_size = 1 + + + frame = 
db.ops.FrameInput() + caffe_frame = db.ops.CaffeInput( + frame = frame, + args = caffe_args, + device = DeviceType.GPU) + cls_prob, rois, fc7 = db.ops.FasterRCNN( + caffe_input = caffe_frame, + args = caffe_args, + device = DeviceType.GPU) + bboxes, feature = db.ops.FasterRCNNOutput( + cls_prob = cls_prob, + rois = rois, + fc7 = fc7, + args = caffe_args, + device = DeviceType.CPU) + output = db.ops.Output(columns=[bboxes, feature]) + + job = Job(op_args={ + frame: input_table.column('frame'), + output: input_table.name() + '_detections' + }) + bulk_job = BulkJob(output=output, jobs=[job]) + [output] = db.run(bulk_job, pipeline_instances_per_node = 1, + work_packet_size = 10, io_packet_size = 40, force=True) + + output = db.table(input_table.name() + '_detections') + + output.profiler().write_trace('detect_test.trace') + + print('Extracting frames...') + + def parse_features(buf, db): + if len(buf) == 1: + return np.zeros((1), dtype=np.dtype(np.float32)) + else: + out = np.frombuffer(buf, dtype=np.dtype(np.int32)) + return out.reshape((-1, 4096)) + + video_bboxes = [box for (_, box) in output.columns('bboxes').load(parsers.bboxes)] + video_features = [feature for (_, feature) in output.columns('features').load(parse_features)] + video_frames = [f[0] for _, f in db.table('example').load(['frame'], rows=range(800,1600))] + + print('Writing output video...') + frame_shape = video_frames[0].shape + print(frame_shape) + output = cv2.VideoWriter( + 'example_detections.mkv', + cv2.VideoWriter_fourcc(*'X264'), + #cv2.VideoWriter_fourcc('m', 'p', '4', 'v'), + 24.0, + (frame_shape[1], frame_shape[0])) + + for (frame, bboxes) in zip(video_frames, video_bboxes): + for bbox in bboxes: + cv2.rectangle( + frame, + (int(bbox.x1), int(bbox.y1)), + (int(bbox.x2), int(bbox.y2)), + (255, 0, 0), 3) + output.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) + output.release() diff --git a/examples/apps/optical_flow/main.py b/examples/apps/optical_flow/main.py new file mode 100644 index 
00000000..3fa8d307 --- /dev/null +++ b/examples/apps/optical_flow/main.py @@ -0,0 +1,41 @@ +from scannerpy import Database, DeviceType, Job, BulkJob +from scannerpy.stdlib import parsers +import os +import os.path as osp +import numpy as np +import time +import sys + +if len(sys.argv) <= 1: + print('Usage: main.py ') + exit(1) + +video_path = sys.argv[1] +print('Performing optical flow on {}...'.format(video_path)) +video_name = os.path.splitext(os.path.basename(video_path))[0] + +with Database() as db: + if not db.has_table(video_name): + db.ingest_videos([(video_name, video_path)]) + input_table = db.table(video_name) + + sampler = db.sampler.all() + + frame = db.ops.FrameInput() + flow = db.ops.OpticalFlow( + frame = frame, + device=DeviceType.CPU) + sampled_flow = flow.sample() + output = db.ops.Output(columns=[sampled_flow]) + + job = Job(op_args={ + frame: input_table.column('frame'), + sampled_flow: sampler, + output: input_table.name() + '_flow' + }) + bulk_job = BulkJob(output=output, jobs=[job]) + + [output_table] = db.run(bulk_job, pipeline_instances_per_node=1, force=True) + + vid_flows = [flow[0] for _, flow in output_table.load(['flow'], rows=[0])] + np.save('flows.npy', vid_flows) diff --git a/examples/apps/pose_detection/main.py b/examples/apps/pose_detection/main.py new file mode 100644 index 00000000..6a230fa6 --- /dev/null +++ b/examples/apps/pose_detection/main.py @@ -0,0 +1,58 @@ +from scannerpy import Database, DeviceType, Job, ColumnType, BulkJob +from scannerpy.stdlib import NetDescriptor, parsers, pipelines +import math +import os +import subprocess +import cv2 +import sys +import os.path +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../..') +import util + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +POSE_KERNEL_PATH = os.path.join(SCRIPT_DIR, 'pose_draw_kernel.py') + +if len(sys.argv) <= 1: + print('Usage: main.py ') + exit(1) + +movie_path = sys.argv[1] +print('Detecting poses in video 
{}'.format(movie_path)) +movie_name = os.path.splitext(os.path.basename(movie_path))[0] + +with Database() as db: + video_path = movie_path + if not db.has_table(movie_name): + print('Ingesting video into Scanner ...') + db.ingest_videos([(movie_name, video_path)], force=True) + input_table = db.table(movie_name) + + sampler = db.sampler.range(120, 480) + + [poses_table] = pipelines.detect_poses( + db, [input_table.column('frame')], + sampler, + '{:s}_poses'.format(movie_name)) + + print('Drawing on frames...') + db.register_op('PoseDraw', [('frame', ColumnType.Video), 'poses'], + [('frame', ColumnType.Video)]) + db.register_python_kernel('PoseDraw', DeviceType.CPU, POSE_KERNEL_PATH) + frame = db.ops.FrameInput() + sampled_frame = frame.sample() + poses = db.ops.Input() + drawn_frame = db.ops.PoseDraw( + frame = sampled_frame, + poses = poses) + output = db.ops.Output(columns=[drawn_frame]) + job = Job(op_args={ + frame: input_table.column('frame'), + sampled_frame: sampler, + poses: poses_table.column('pose'), + output: movie_name + '_drawn_poses', + }) + bulk_job = BulkJob(output=output, jobs=[job]) + [drawn_poses_table] = db.run(bulk_job, force=True) + print('Writing output video...') + drawn_poses_table.column('frame').save_mp4('{:s}_poses'.format( + movie_name)) diff --git a/examples/apps/pose_detection/pose_draw_kernel.py b/examples/apps/pose_detection/pose_draw_kernel.py new file mode 100644 index 00000000..2f6eb9c9 --- /dev/null +++ b/examples/apps/pose_detection/pose_draw_kernel.py @@ -0,0 +1,31 @@ +import cv2 + +import scannerpy +from scannerpy.stdlib import parsers + +class PoseDrawKernel(scannerpy.Kernel): + def __init__(self, config, protobufs): + self.protobufs = protobufs + + def close(self): + pass + + def execute(self, input_columns): + frame = input_columns[0] + frame_poses = input_columns[1] + for all_pose in parsers.poses(frame_poses, self.protobufs): + pose = all_pose.pose_keypoints() + for i in range(18): + if pose[i, 2] < 0.35: continue + 
print(pose[i, 1], pose[i, 0]) + print(frame.shape) + x = int(pose[i, 0] * frame.shape[1]) + y = int(pose[i, 1] * frame.shape[0]) + cv2.circle( + frame, + (x, y), + 8, + (255, 0, 0), 3) + return [frame] + +KERNEL = PoseDrawKernel diff --git a/examples/apps/reverse_image_search/search.py b/examples/apps/reverse_image_search/search.py new file mode 100644 index 00000000..1cff3fd3 --- /dev/null +++ b/examples/apps/reverse_image_search/search.py @@ -0,0 +1,120 @@ +from scannerpy import Database, DeviceType +from scannerpy.stdlib import NetDescriptor, parsers, bboxes +import numpy as np +import faiss +import cv2 +import sys +import random + +STATIC_DIR = 'examples/reverse_image_search/static' + +db = Database(debug=True) + +descriptor = NetDescriptor.from_file(db, 'nets/faster_rcnn_coco.toml') +caffe_args = db.protobufs.CaffeArgs() +caffe_args.net_descriptor.CopyFrom(descriptor.as_proto()) +caffe_args.batch_size = 1 + +def parse_fvec(bufs, db): + buf = bufs[0] + if len(buf) == 1: + return [] + else: + splits = len(buf) / (4096*4) + return np.split(np.frombuffer(buf, dtype=np.float32), splits) + +def make_op_graph(input): + caffe_input = db.ops.CaffeInput( + inputs=[(input, ["frame", "frame_info"])], + args=caffe_args, + device=DeviceType.GPU) + caffe = db.ops.FasterRCNN( + inputs=[(caffe_input, ["caffe_frame"]), (input, ["frame_info"])], + args=caffe_args, + device=DeviceType.GPU) + frcnn_output = db.ops.FasterRCNNOutput( + inputs=[(caffe, ["cls_prob", "rois", "fc7"])]) + return frcnn_output + +def build_index(): + print('Building index...') + if not db.has_table('example_frcnn'): + print('Object detections not found. 
Running Scanner job...') + [example], _ = db.ingest_videos( + [('example', '/bigdata/wcrichto/videos/movies/anewhope.m4v')], + force=True) + tasks = db.sampler().strided([(example.name(), 'example_frcnn')], 24) + db.run(tasks, make_op_graph(db.ops.Input()), force=True) + + output_table = db.table('example_frcnn') + # bboxes.draw(example, output_table, 'example_bboxes.mkv') + + fvec_index = faiss.IndexFlatL2(4096) + bbox_index = [] + for (frame, bboxes), (_, vec) in \ + zip(output_table.load([0], parsers.bboxes), + output_table.load([1], parse_fvec)): + # TODO(wcrichto): fix this frame*24 hack + if len(vec) > 0: + fvec_index.add(np.array(vec)) + for bbox in bboxes: + bbox_index.append((frame*24, bbox)) + + return fvec_index, bbox_index + +def query(path, fvec_index, bbox_index): + print('Running query with image {}'.format(path)) + with open(path) as f: + t = f.read() + + # TODO(wcrichto): fix this silly hack when new_table properly + # supports force=True + q_t = "query_image_{}".format(random.randint(0, 1000000)) + db.new_table(q_t, ["img"], [[t]], force=True) + + table_input = db.ops.Input(["img"]) + img_input = db.ops.ImageDecoder(inputs=[(table_input, ["img"])]) + [query_output_table] = db.run(db.sampler().all([(q_t, 'query_output')]), + make_op_graph(img_input), + force=True) + query_output_table = db.table('query_output') + _, qvecs = next(query_output_table.load([1], parse_fvec)) + if len(qvecs) == 0: + print('Error: could not find an object in query image.') + return [] + + _, neighbors = fvec_index.search(np.array(qvecs[:1]), 50) + return [bbox_index[i] for i in neighbors[0]] + +def visualize(results): + example = db.table('example') + to_vis = [] + for k, (i, bbox) in enumerate(results): + valid = True + for j, _1, _2 in to_vis: + if abs(i - j) < 10: + valid = False + break + if valid: + to_vis.append((i, bbox, k)) + if len(to_vis) == 5: break + + for i, (frame_index, bbox, k) in enumerate(to_vis): + _, frame = next(example.load([0], rows=[frame_index])) 
+ frame = frame[0] + cv2.rectangle( + frame, + (int(bbox.x1), int(bbox.y1)), + (int(bbox.x2), int(bbox.y2)), + (255, 0, 0), 3) + cv2.imwrite('{}/result{}.jpg'.format(STATIC_DIR, i), + cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) + +def main(): + path = sys.argv[1] if len(sys.argv) > 1 else '{}/query.jpg'.format(STATIC_DIR) + fvec_index, bbox_index = build_index() + results = query(path, fvec_index, bbox_index) + visualize(results) + +if __name__ == "__main__": + main() diff --git a/examples/apps/reverse_image_search/server.py b/examples/apps/reverse_image_search/server.py new file mode 100644 index 00000000..9f8536d6 --- /dev/null +++ b/examples/apps/reverse_image_search/server.py @@ -0,0 +1,42 @@ +import subprocess +try: + from flask import Flask, request, send_from_directory +except ImportError: + print('This example needs Flask to run. Try running:\n' + 'pip install flask') + +app = Flask(__name__) + + +STATIC_DIR = 'examples/reverse_image_search/static' + +# TODO(wcrichto): figure out how to prevent image caching + +@app.route('/mystatic/') +def mystatic(path): + return send_from_directory('static', path) + +@app.route('/', methods=['GET','POST']) +def index(): + if request.method == 'POST': + f = request.files['file'] + f.save('{}/query.jpg'.format(STATIC_DIR)) + subprocess.check_call(['python', 'examples/reverse_image_search/search.py']) + return """ + + + + + +""" + else: + return """ +
+ + +
+""" + + +if __name__ == "__main__": + app.run(host='0.0.0.0', debug=True) diff --git a/examples/apps/shot_detection/shot_detect.py b/examples/apps/shot_detection/shot_detect.py new file mode 100644 index 00000000..1dcb8f16 --- /dev/null +++ b/examples/apps/shot_detection/shot_detect.py @@ -0,0 +1,191 @@ +from scannerpy import Database, DeviceType, Job, BulkJob +from scannerpy.stdlib import parsers +from scipy.spatial import distance +from subprocess import check_call as run +import numpy as np +import cv2 +import math +import sys +import os.path +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/../..') +import util +import time + +try: + import plotly.offline as offline + import plotly.graph_objs as go +except ImportError: + print('You need to install plotly to run this. Try running:\npip install plotly') + exit() + +WINDOW_SIZE = 500 + +def compute_shot_boundaries(hists): + # Compute the mean difference between each pair of adjacent frames + diffs = np.array([np.mean([distance.chebyshev(hists[i-1][j], hists[i][j]) + for j in range(3)]) + for i in range(1, len(hists))]) + diffs = np.insert(diffs, 0, 0) + n = len(diffs) + + # Plot the differences. 
Look at histogram-diffs.html + #data = [go.Scatter(x=range(n),y=diffs)] + #offline.plot(data, filename='histogram-diffs.html') + + # Do simple outlier detection to find boundaries between shots + boundaries = [] + for i in range(1, n): + window = diffs[max(i-WINDOW_SIZE,0):min(i+WINDOW_SIZE,n)] + if diffs[i] - np.mean(window) > 3 * np.std(window): + boundaries.append(i) + return boundaries + + +def make_monrage(n, frames): + _, frame = frames.next() + frame = frame[0] + (frame_h, frame_w, _) = frame.shape + target_w = 64 + target_h = int(target_w / float(frame_w) * frame_h) + frames_per_row = 16 + img_w = frames_per_row * target_w + img_h = int(math.ceil(float(n) / frames_per_row)) * target_h + img = np.zeros((img_h, img_w, 3)) + + def place_image(i, fr): + fr = cv2.resize(fr, (target_w, target_h)) + fr = cv2.cvtColor(fr, cv2.COLOR_RGB2BGR) + row = i / frames_per_row + col = i % frames_per_row + img[(row * target_h):((row+1) * target_h), + (col * target_w):((col+1) * target_w), + :] = fr + + place_image(0, frame) + for i, (_, frame) in enumerate(frames): + place_image(i + 1, frame[0]) + + return img + +def main(movie_path): + total_start = time.time() + + print('Detecting shots in movie {}'.format(movie_path)) + movie_name = os.path.basename(movie_path) + + # Use GPU kernels if we have a GPU + with Database() as db: + print('Loading movie into Scanner database...') + s = time.time() + + if db.has_gpu(): + device = DeviceType.GPU + else: + device = DeviceType.CPU + + ############ ############ ############ ############ + # 0. Ingest the video into the database + ############ ############ ############ ############ + [movie_table], _ = db.ingest_videos([(movie_name, movie_path)], + force=True) + print('Time: {:.1f}s'.format(time.time() - s)) + print('Number of frames in movie: {:d}'.format(movie_table.num_rows())) + + s = time.time() + ############ ############ ############ ############ + # 1. 
Run Histogram over the entire video in Scanner + ############ ############ ############ ############ + print('Computing a color histogram for each frame...') + frame = db.ops.FrameInput() + histogram = db.ops.Histogram( + frame = frame, + device = device) + output = db.ops.Output(columns=[histogram]) + job = Job(op_args={ + frame: movie_table.column('frame'), + output: movie_name + '_hist' + }) + bulk_job = BulkJob(output=output, jobs=[job]) + [hists_table] = db.run(bulk_job, force=True) + print('\nTime: {:.1f}s, {:.1f} fps'.format( + time.time() - s, + movie_table.num_rows() / (time.time() - s))) + + s = time.time() + ############ ############ ############ ############ + # 2. Load histograms and compute shot boundaries + # in python + ############ ############ ############ ############ + print('Computing shot boundaries...') + # Read histograms from disk + hists = [h for _, h in hists_table.load(['histogram'], + parsers.histograms)] + boundaries = compute_shot_boundaries(hists) + print('Found {:d} shots.'.format(len(boundaries))) + print('Time: {:.1f}s'.format(time.time() - s)) + + s = time.time() + ############ ############ ############ ############ + # 3. 
Create montage in Scanner + ############ ############ ############ ############ + print('Creating shot montage...') + + row_length = 16 + rows_per_item = 1 + target_width = 256 + + # Compute partial row montages that we will stack together + # at the end + frame = db.ops.FrameInput() + gather_frame = frame.sample() + sliced_frame = gather_frame.slice() + montage = db.ops.Montage( + frame = sliced_frame, + num_frames = row_length * rows_per_item, + target_width = target_width, + frames_per_row = row_length, + device = device) + sampled_montage = montage.sample() + output = db.ops.Output( + columns=[sampled_montage.unslice().lossless()]) + + item_size = row_length * rows_per_item + + starts_remainder = len(boundaries) % item_size + evenly_divisible = (starts_remainder == 0) + if not evenly_divisible: + boundaries = boundaries[0:len(boundaries) - starts_remainder] + + job = Job(op_args={ + frame: movie_table.column('frame'), + gather_frame: db.sampler.gather(boundaries), + sliced_frame: db.partitioner.all(item_size), + sampled_montage: [db.sampler.gather([item_size - 1]) + for _ in range(len(boundaries) / item_size)], + output: 'montage_image' + }) + bulk_job = BulkJob(output=output, jobs=[job]) + + [montage_table] = db.run(bulk_job, force=True) + + # Stack all partial montages together + montage_img = np.zeros((1, target_width * row_length, 3), dtype=np.uint8) + for idx, img in montage_table.column('montage').load(): + img = np.flip(img, 2) + montage_img = np.vstack((montage_img, img)) + + print('') + print('Time: {:.1f}s'.format(time.time() - s)) + + ############ ############ ############ ############ + # 4. 
Write montage to disk + ############ ############ ############ ############ + cv2.imwrite('shots.jpg', montage_img) + print('Successfully generated shots.jpg') + print('Total time: {:.2f} s'.format(time.time() - total_start)) + +if __name__ == "__main__": + if len(sys.argv) <= 1: + print('Usage: main.py ') + exit(1) + main(sys.argv[1]) diff --git a/examples/caffe/facenet.py b/examples/caffe/facenet.py deleted file mode 100644 index 6530177e..00000000 --- a/examples/caffe/facenet.py +++ /dev/null @@ -1,48 +0,0 @@ -from scannerpy import Database, DeviceType, NetDescriptor -from functools import partial -import numpy as np -import cv2 -import struct - -db = Database() - -descriptor = NetDescriptor.from_file(db, 'features/caffe_facenet.toml') -facenet_args = db.protobufs.FacenetArgs() -facenet_args.scale = 1.0 -facenet_args.threshold = 0.5 -caffe_args = facenet_args.caffe_args -caffe_args.net_descriptor.CopyFrom(descriptor.as_proto()) -caffe_args.batch_size = 96 - -table_input = db.ops.Input() -caffe_input = db.ops.FacenetInput( - inputs=[(table_input, ["frame", "frame_info"])], - args=facenet_args, - device=DeviceType.GPU) -caffe = db.ops.Facenet( - inputs=[(caffe_input, ["caffe_frame"]), (table_input, ["frame_info"])], - args=facenet_args, - device=DeviceType.GPU) -caffe_output = db.ops.FacenetOutput( - inputs=[(caffe, ["caffe_output"]), (table_input, ["frame_info"])], - args=facenet_args) - -input_collection = db.ingest_video_collection('test', ['test.mp4']) -output_collection = db.run(input_collection, caffe_output, 'test_faces') - -def parse_bboxes(db, buf): - (num_bboxes,) = struct.unpack("=Q", buf[:8]) - buf = buf[8:] - bboxes = [] - for i in range(num_bboxes): - (bbox_size,) = struct.unpack("=i", buf[:4]) - buf = buf[4:] - box = db.protobufs.BoundingBox() - box.ParseFromString(buf[:bbox_size]) - buf = buf[bbox_size:] - bbox = [box.x1, box.y1, box.x2, box.y2, box.score, - box.track_id, box.track_score] - bboxes.append(bbox) - return bboxes - -bboxes = 
output_collection.tables(0).columns(0).load(parse_bboxes) diff --git a/examples/face_detection/face_detect.py b/examples/face_detection/face_detect.py deleted file mode 100644 index 940cc00b..00000000 --- a/examples/face_detection/face_detect.py +++ /dev/null @@ -1,82 +0,0 @@ -from scannerpy import Database, DeviceType -from scannerpy.stdlib import NetDescriptor, parsers, bboxes -import os -import subprocess -import cv2 -import sys -import os.path -sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/..') -import util - -db = Database() - -# TODO(wcrichto): comment the demo. Make the Scanner philosophy more clear. -# Add some figures to the wiki perhaps explaining the high level - -descriptor = NetDescriptor.from_file(db, 'nets/caffe_facenet.toml') -facenet_args = db.protobufs.FacenetArgs() -facenet_args.threshold = 0.5 -caffe_args = facenet_args.caffe_args -caffe_args.net_descriptor.CopyFrom(descriptor.as_proto()) -caffe_args.batch_size = 5 - -table_input = db.ops.Input() -facenet_input = db.ops.FacenetInput( - inputs=[(table_input, ["frame", "frame_info"])], - args=facenet_args, - device=DeviceType.GPU) -facenet = db.ops.Facenet( - inputs=[(facenet_input, ["facenet_input"]), (table_input, ["frame_info"])], - args=facenet_args, - device=DeviceType.GPU) -facenet_output = db.ops.FacenetOutput( - inputs=[(facenet, ["facenet_output"]), (table_input, ["frame_info"])], - args=facenet_args) - -if not db.has_table('example'): - print('Ingesting video into Scanner ...') - db.ingest_videos([('example', util.download_video())], force=True) - -sampler = db.sampler() -print('Running face detector...') -outputs = [] -for scale in [0.125, 0.25, 0.5, 1.0]: - print('Scale {}...'.format(scale)) - facenet_args.scale = scale - tasks = sampler.all([('example', 'example_faces_{}'.format(scale))], - item_size=50) - [output] = db.run(tasks, facenet_output, force=True, work_item_size=5) - outputs.append(output) - -all_bboxes = [ - [box for (_, box) in out.load([0], 
parsers.bboxes)] - for out in outputs] - -nms_bboxes = [] -frames = len(all_bboxes[0]) -runs = len(all_bboxes) -for fi in range(frames): - frame_bboxes = [] - for r in range(runs): - frame_bboxes += (all_bboxes[r][fi]) - frame_bboxes = bboxes.nms(frame_bboxes, 0.3) - nms_bboxes.append(frame_bboxes) - -print('Extracting frames...') -video_faces = nms_bboxes -video_frames = [f[0] for _, f in db.table('example').load([0])] - -print('Writing output video...') -frame_shape = video_frames[0].shape -output = cv2.VideoWriter( - 'example_faces.mkv', - cv2.VideoWriter_fourcc(*'X264'), - 24.0, - (frame_shape[1], frame_shape[0])) - -for (frame, frame_faces) in zip(video_frames, video_faces): - for face in frame_faces: - if face[4] < 0.5: continue - face = map(int, face) - cv2.rectangle(frame, (face[0], face[1]), (face[2], face[3]), (255, 0, 0), 3) - output.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) diff --git a/examples/how-tos/caffe/resnet.py b/examples/how-tos/caffe/resnet.py new file mode 100644 index 00000000..6d550a02 --- /dev/null +++ b/examples/how-tos/caffe/resnet.py @@ -0,0 +1,42 @@ +from scannerpy import Database, DeviceType, Job, BulkJob +from scannerpy.stdlib import NetDescriptor +import numpy as np +import cv2 +import struct +import sys +import os +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/..') +import util + +video_path = util.download_video() if len(sys.argv) <= 1 else sys.argv[1] +print('Performing classification on video {}'.format(video_path)) +video_name = os.path.splitext(os.path.basename(video_path))[0] + +with Database() as db: + [input_table], _ = db.ingest_videos( + [(video_name, video_path)], force=True) + + descriptor = NetDescriptor.from_file(db, 'nets/resnet.toml') + + batch_size = 48 + frame = db.ops.FrameInput() + caffe_input = db.ops.CaffeInput( + frame = frame, + net_descriptor = descriptor.as_proto(), + batch_size = batch_size, + device=DeviceType.GPU) + caffe_output = db.ops.Caffe( + caffe_frame = caffe_input, + 
net_descriptor = descriptor.as_proto(), + batch_size = batch_size, + batch = batch_size, + device=DeviceType.GPU) + output = db.ops.Output(columns=[caffe_output]) + + job = Job(op_args={ + frame: input_table.column('frame'), + output: input_table.name() + '_classification' + }) + bulk_job = BulkJob(output=output, jobs=[job]) + + [output] = db.run(bulk_job, pipeline_instances_per_node=1, force=True) diff --git a/examples/halide/CMakeLists.txt b/examples/how-tos/halide/CMakeLists.txt similarity index 100% rename from examples/halide/CMakeLists.txt rename to examples/how-tos/halide/CMakeLists.txt diff --git a/examples/halide/halide_resize.cpp b/examples/how-tos/halide/halide_resize.cpp similarity index 93% rename from examples/halide/halide_resize.cpp rename to examples/how-tos/halide/halide_resize.cpp index 18fd784b..ef0b494d 100644 --- a/examples/halide/halide_resize.cpp +++ b/examples/how-tos/halide/halide_resize.cpp @@ -11,7 +11,7 @@ Expr kernel_box(Expr x) { } class HalideResizeKernel : public Halide::Generator { -public: + public: ImageParam input{UInt(8), 3, "input"}; Param input_width{"input_width"}, input_height{"input_height"}; Param target_width{"target_width"}, target_height{"target_height"}; @@ -55,9 +55,7 @@ class HalideResizeKernel : public Halide::Generator { resized_final(x, y, c) = clamp(resized_y(x, y, c), 0.0f, 255.0f); resized_final.bound(c, 0, 3); - input - .dim(0).set_stride(3) - .dim(2).set_stride(1); + input.dim(0).set_stride(3).dim(2).set_stride(1); Target target = Halide::get_target_from_environment(); target.set_feature(Target::CUDA); @@ -69,5 +67,4 @@ class HalideResizeKernel : public Halide::Generator { } }; -Halide::RegisterGenerator register_me{ - "halide_resize"}; +Halide::RegisterGenerator register_me{"halide_resize"}; diff --git a/examples/halide/resize.proto b/examples/how-tos/halide/resize.proto similarity index 100% rename from examples/halide/resize.proto rename to examples/how-tos/halide/resize.proto diff --git 
a/examples/halide/resize_op.cpp b/examples/how-tos/halide/resize_op.cpp similarity index 62% rename from examples/halide/resize_op.cpp rename to examples/how-tos/halide/resize_op.cpp index 0b3bb6b5..48bcf4de 100644 --- a/examples/halide/resize_op.cpp +++ b/examples/how-tos/halide/resize_op.cpp @@ -1,23 +1,22 @@ -#include "scanner/api/op.h" +#include "halide_resize/halide_resize.h" +#include "resize.pb.h" #include "scanner/api/kernel.h" -#include "scanner/util/memory.h" +#include "scanner/api/op.h" #include "scanner/util/halide.h" -#include "resize.pb.h" -#include "halide_resize/halide_resize.h" +#include "scanner/util/memory.h" class ResizeKernel : public scanner::VideoKernel { -public: + public: ResizeKernel(const scanner::Kernel::Config& config) - : scanner::VideoKernel(config), device_(config.devices[0]) { + : scanner::VideoKernel(config), device_(config.devices[0]) { ResizeArgs args; args.ParseFromArray(config.args.data(), config.args.size()); width_ = args.width(); height_ = args.height(); } - - void execute(const scanner::BatchedColumns &input_columns, - scanner::BatchedColumns &output_columns) override { + void execute(const scanner::BatchedColumns& input_columns, + scanner::BatchedColumns& output_columns) override { int input_count = input_columns[0].rows.size(); // This must be called at the top of the execute method in any VideoKernel. 
@@ -26,32 +25,23 @@ class ResizeKernel : public scanner::VideoKernel { size_t output_size = width_ * height_ * 3; unsigned char* output_block = scanner::new_block_buffer( - device_, output_size * input_count, input_count); + device_, output_size * input_count, input_count); for (int i = 0; i < input_count; ++i) { buffer_t input_halide_buf = {0}; scanner::setup_halide_frame_buf(input_halide_buf, frame_info_); - scanner::set_halide_buf_ptr( - device_, - input_halide_buf, - input_columns[0].rows[i].buffer, - input_columns[0].rows[i].size); + scanner::set_halide_buf_ptr(device_, input_halide_buf, + input_columns[0].rows[i].buffer, + input_columns[0].rows[i].size); buffer_t output_halide_buf = {0}; scanner::setup_halide_frame_buf(output_halide_buf, frame_info_); - scanner::set_halide_buf_ptr( - device_, - output_halide_buf, - output_block + i * output_size, - output_size); + scanner::set_halide_buf_ptr(device_, output_halide_buf, + output_block + i * output_size, output_size); - int error = halide_resize( - &input_halide_buf, - frame_info_.width(), - frame_info_.height(), - width_, - height_, - &output_halide_buf); + int error = halide_resize(&input_halide_buf, frame_info_.width(), + frame_info_.height(), width_, height_, + &output_halide_buf); LOG_IF(FATAL, error != 0) << "Halide error " << error; scanner::unset_halide_buf_ptr(device_, input_halide_buf); @@ -59,15 +49,14 @@ class ResizeKernel : public scanner::VideoKernel { } } -private: + private: scanner::DeviceHandle device_; int width_; int height_; }; - REGISTER_OP(Resize).outputs({"frame"}); REGISTER_KERNEL(Resize, ResizeKernel) - .device(scanner::DeviceType::GPU) - .num_devices(1); + .device(scanner::DeviceType::GPU) + .num_devices(1); diff --git a/examples/how-tos/python_kernel/my_kernel.py b/examples/how-tos/python_kernel/my_kernel.py new file mode 100644 index 00000000..91414d94 --- /dev/null +++ b/examples/how-tos/python_kernel/my_kernel.py @@ -0,0 +1,14 @@ +import scannerpy +import struct + +class 
MyOpKernel(scannerpy.Kernel): + def __init__(self, config, protobufs): + self.protobufs = protobufs + + def close(self): + pass + + def execute(self, input_columns): + return [struct.pack('=q', 9000)] + +KERNEL = MyOpKernel diff --git a/examples/how-tos/python_kernel/python.py b/examples/how-tos/python_kernel/python.py new file mode 100644 index 00000000..1faa6506 --- /dev/null +++ b/examples/how-tos/python_kernel/python.py @@ -0,0 +1,20 @@ +from scannerpy import Database, Job, BulkJob, ColumnType, DeviceType +import os + +script_dir = os.path.dirname(os.path.abspath(__file__)) + +with Database() as db: + db.register_op('MyOp', [('frame', ColumnType.Video)], ['test']) + kernel_path = script_dir + '/my_kernel.py' + db.register_python_kernel('MyOp', DeviceType.CPU, kernel_path) + + frame = db.ops.FrameInput() + test = db.ops.MyOp(frame = frame) + output = db.ops.Output(columns=[test]) + + job = Job(op_args={ + frame: db.table('example').column('frame'), + output: 'example_py' + }) + bulk_job = BulkJob(output=output, jobs=[job]) + db.run(bulk_job, force=True, pipeline_instances_per_node=1) diff --git a/examples/how-tos/tensorflow/tensorflow_kernel.py b/examples/how-tos/tensorflow/tensorflow_kernel.py new file mode 100644 index 00000000..63de81e9 --- /dev/null +++ b/examples/how-tos/tensorflow/tensorflow_kernel.py @@ -0,0 +1,59 @@ +# Mostly taken from: https://github.com/tensorflow/models/blob/master/object_detection/object_detection_tutorial.ipynb + +import numpy as np +import tensorflow as tf +import cv2 +import os +from scannerpy.stdlib import pykernel +from utils import visualization_utils as vis_util +from utils import label_map_util +import six.moves.urllib as urllib + +PATH_TO_REPO = '/home/wcrichto/.deps/models' + +# What model to download. 
+MODEL_NAME = 'ssd_mobilenet_v1_coco_11_06_2017' +MODEL_FILE = MODEL_NAME + '.tar.gz' +DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/' + +# List of the strings that is used to add correct label for each box. +PATH_TO_LABELS = os.path.join(PATH_TO_REPO, 'object_detection', 'data', 'mscoco_label_map.pbtxt') + +PATH_TO_GRAPH = os.path.join(PATH_TO_REPO, 'object_detection', 'ssd_mobilenet_v1_coco_11_06_2017', 'frozen_inference_graph.pb') + +NUM_CLASSES = 90 + +label_map = label_map_util.load_labelmap(PATH_TO_LABELS) +categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True) +category_index = label_map_util.create_category_index(categories) + +class Kernel(pykernel.TensorFlowKernel): + def build_graph(self): + dnn = tf.Graph() + with dnn.as_default(): + od_graph_def = tf.GraphDef() + with tf.gfile.GFile(PATH_TO_GRAPH, 'rb') as fid: + serialized_graph = fid.read() + od_graph_def.ParseFromString(serialized_graph) + tf.import_graph_def(od_graph_def, name='') + return dnn + + def execute(self, cols): + print 'Execute' + image = cols[0] + image_tensor = self.graph.get_tensor_by_name('image_tensor:0') + boxes = self.graph.get_tensor_by_name('detection_boxes:0') + scores = self.graph.get_tensor_by_name('detection_scores:0') + classes = self.graph.get_tensor_by_name('detection_classes:0') + (boxes, scores, classes) = self.sess.run( + [boxes, scores, classes], + feed_dict={image_tensor: np.expand_dims(image, axis=0)}) + vis_util.visualize_boxes_and_labels_on_image_array( + image, + np.squeeze(boxes), + np.squeeze(classes).astype(np.int32), + np.squeeze(scores), + category_index, + use_normalized_coordinates=True, + line_thickness=8) + return [image.tobytes()] diff --git a/examples/opticalflow/flow.py b/examples/opticalflow/flow.py deleted file mode 100644 index 069777ed..00000000 --- a/examples/opticalflow/flow.py +++ /dev/null @@ -1,22 +0,0 @@ -from scannerpy import Database, DeviceType 
-from scannerpy.stdlib import parsers -import os.path as osp -import numpy as np - -db = Database() - -input = db.ops.Input() -flow = db.ops.OpticalFlow( - inputs=[(input,['frame', 'frame_info'])], - device=DeviceType.GPU) -output = db.ops.Output(inputs=[(flow, ['flow']), (input, ['frame_info'])]) - -if not db.has_table('example'): - db.ingest_videos([('example', 'example.mp4')]) -input_table = db.table('example') - -tasks = db.sampler().all([(input_table.name(), 'example_flows')], warmup_size=1) -[output_table] = db.run(tasks, output) - -vid_flows = [flow for _, flow in output_table.load((0, 1), parsers.flow)] -np.save('flows.npy', vid_flows) diff --git a/examples/shot_detection/shot_detect.py b/examples/shot_detection/shot_detect.py deleted file mode 100644 index de384c9b..00000000 --- a/examples/shot_detection/shot_detect.py +++ /dev/null @@ -1,90 +0,0 @@ -from scannerpy import Database, DeviceType -from scannerpy.stdlib import parsers -from scipy.spatial import distance -import numpy as np -import cv2 -import math -import sys -import os.path -sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/..') -import util - -try: - import plotly.offline as offline - import plotly.graph_objs as go -except ImportError: - print('You need to install plotly to run this. 
Try running:\npip install plotly') - exit() - -WINDOW_SIZE = 500 - -def main(): - movie_path = util.download_video() if len(sys.argv) <= 1 else sys.argv[1] - print('Detecting shots in movie {}'.format(movie_path)) - - db = Database() - if not db.has_table('movie'): - print('Loading movie into Scanner database...') - db.ingest_videos([('movie', movie_path)], force=True) - movie_table = db.table('movie') - - if not db.has_table('movie_hist'): - print('Computing a color histogram for each frame...') - db.run( - db.sampler().all([(movie_table.name(), 'movie_hist')]), - db.ops.Histogram(device=DeviceType.GPU), - force=True) - hists_table = db.table('movie_hist') - - print('Computing shot boundaries...') - - # Fetch histograms from disk - hists = [h for _, h in hists_table.load(['histogram'], parsers.histograms)] - - # Compute the mean difference between each pair of adjacent frames - diffs = np.array([np.mean([distance.chebyshev(hists[i-1][j], hists[i][j]) - for j in range(3)]) - for i in range(1, len(hists))]) - diffs = np.insert(diffs, 0, 0) - n = len(diffs) - - # Plot the differences. 
Look at histogram-diffs.html - data = [go.Scatter(x=range(n),y=diffs)] - offline.plot(data, filename='histogram-diffs.html') - - # Do simple outlier detection to find boundaries between shots - boundaries = [] - for i in range(1, n): - window = diffs[max(i-WINDOW_SIZE,0):min(i+WINDOW_SIZE,n)] - if diffs[i] - np.mean(window) > 3 * np.std(window): - boundaries.append(i) - - print('Visualizing shot boundaries...') - - # Loading the frames for each shot boundary - frames = [f[0] for _, f in - movie_table.load([0], rows=boundaries)] - n = len(frames) - - (frame_h, frame_w, _) = frames[0].shape - target_w = 256 - target_h = int(target_w / float(frame_w) * frame_h) - frames_per_row = 8 - img_w = frames_per_row * target_w - img_h = int(math.ceil(float(n) / frames_per_row)) * target_h - - img = np.zeros((img_h, img_w, 3)) - for i, frame in enumerate(frames): - frame = cv2.resize(frame, (target_w, target_h)) - frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) - row = i / frames_per_row - col = i % frames_per_row - img[(row * target_h):((row+1) * target_h), - (col * target_w):((col+1) * target_w), - :] = frame - - cv2.imwrite('shots.jpg', img) - print('Successfully generated shots.jpg') - -if __name__ == "__main__": - main() diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt deleted file mode 100644 index 4dc81ffd..00000000 --- a/examples/simple/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_executable(example_simple main.cpp) -target_link_libraries(example_simple scanner stdlib) diff --git a/examples/simple/main.alt.py b/examples/simple/main.alt.py deleted file mode 100644 index 563f1961..00000000 --- a/examples/simple/main.alt.py +++ /dev/null @@ -1,55 +0,0 @@ -from scannerpy import Database, DeviceType -from scannerpy.stdlib import parsers -from sklearn.preprocessing import normalize -import numpy as np -import cv2 - -db = Database() - -hist = db.ops.Histogram(device=DeviceType.CPU) - -input = db.ops.Input() -flow = db.ops.OpticalFlow( - 
inputs=[(input,['frame', 'frame_info'])], - device=DeviceType.GPU) -output = db.ops.Output(inputs=[(flow, ['flow']), (input, ['frame_info'])]) - -def parse_hist(buf): - return np.split(np.frombuffer(buf, dtype=np.dtype(np.int32)), 3) - -def single_video(): - video = '/bigdata/wcrichto/videos/meanGirls_short.mp4' - db.ingest_video(('meangirls', video)) - sampler = db.sampler() - tasks = sampler.all([('meangirls', 'meangirls_hist')]) - [table] = db.run(tasks, hist) - -def video_collection(): - input_collection, _ = db.ingest_video_collection( - 'meangirls', - ['/bigdata/wcrichto/videos/meanGirls_short.mp4'], - force=True) - input_collection = db.collection('meangirls') - sampler = db.sampler() - tasks = sampler.all(input_collection, warmup_size=1) - output_collection = db.run(tasks, output, 'meangirls_hist', force=True) - output_collection = db.collection('meangirls_hist') - table = output_collection.tables(0) - - vid = cv2.VideoWriter( - 'test.mkv', - cv2.VideoWriter_fourcc(*'X264'), - 24.0, - (640, 480)) - - for row, flow in table.load((0, 1), parsers.flow): - img = np.linalg.norm(flow, axis=(2,))*4 - normalize(img) - img = img.astype(np.uint8) - vid.write(cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)) - - # output_collection.profiler().write_trace('test.trace') - - - -video_collection() diff --git a/examples/simple/main.cpp b/examples/simple/main.cpp deleted file mode 100644 index 8de874a2..00000000 --- a/examples/simple/main.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include "scanner/api/op.h" -#include "scanner/api/database.h" -#include "stdlib/stdlib.pb.h" - -#include - -int main(int argc, char** argv) { - grpc_use_signal(-1); - - std::string db_path = "/tmp/test_db"; - std::unique_ptr sc( - storehouse::StorageConfig::make_posix_config()); - std::string master_address = "localhost:5001"; - - scanner::Database db(sc.get(), db_path, master_address); - - // Ingest video - scanner::Result result; - std::vector failed_videos; - result = db.ingest_videos( - {"mean"}, 
{"/n/scanner/wcrichto.new/videos/meanGirls_medium.mp4"}, - failed_videos); - assert(failed_videos.empty()); - - // Initialize master and one worker - scanner::MachineParameters machine_params = scanner::default_machine_params(); - db.start_master(machine_params); - db.start_worker(machine_params); - - // Construct job parameters - scanner::JobParameters params; - params.job_name = "test_job"; - params.memory_pool_config.mutable_cpu()->set_use_pool(false); - params.memory_pool_config.mutable_gpu()->set_use_pool(false); - params.pipeline_instances_per_node = 1; - params.work_item_size = 512; - - // Specify job tasks - scanner::Task task; - task.output_table_name = "blurred_mean"; - scanner::TableSample sample; - sample.table_name = "mean"; - sample.column_names = {"frame", "frame_info"}; - - sample.sampling_function = "Gather"; - scanner::proto::GatherSamplerArgs args; - auto& gather_sample = *args.add_samples(); - for (int i = 0; i < 100; i += 1) { - gather_sample.add_rows(i); - } - std::vector args_data(args.ByteSize()); - args.SerializeToArray(args_data.data(), args_data.size()); - sample.sampling_args = args_data; - - task.samples.push_back(sample); - params.task_set.tasks.push_back(task); - - scanner::proto::BlurArgs blur_args; - blur_args.set_sigma(0.5); - blur_args.set_kernel_size(3); - - size_t blur_args_size = blur_args.ByteSize(); - char* blur_args_buff = new char[blur_args_size]; - blur_args.SerializeToArray(blur_args_buff, blur_args_size); - - scanner::Op *input = - scanner::make_input_op({"frame", "frame_info"}); - - scanner::Op *blur = new scanner::Op( - "Blur", {scanner::OpInput(input, {"frame", "frame_info"})}, - scanner::DeviceType::CPU, blur_args_buff, blur_args_size); - - scanner::Op *output = scanner::make_output_op( - {scanner::OpInput(blur, {"frame", "frame_info"})}); - - // Launch job - params.task_set.output_op = output; - result = db.new_job(params); - assert(result.success()); -} diff --git a/examples/simple/main.py b/examples/simple/main.py 
deleted file mode 100644 index 555ddd1e..00000000 --- a/examples/simple/main.py +++ /dev/null @@ -1,72 +0,0 @@ -import grpc - -import sys -scanner_path = '/home/wcrichto/scanner' -sys.path.append(scanner_path + '/build') -sys.path.append(scanner_path + '/thirdparty/build/bin/storehouse/lib') - -from storehousepy import StorageConfig -import scanner.metadata_pb2 as metadata -import scanner.engine.rpc_pb2 as rpc -import scanner.kernels.args_pb2 as kernel_args - -import scanner_bindings - -storage_config = StorageConfig.make_posix_config() - -db_path = '/tmp/new_scanner_db' - -scanner_bindings.create_database(storage_config, db_path) -scanner_bindings.ingest_videos( - storage_config, - db_path, - ['meangirls'], - ['/bigdata/wcrichto/videos/meanGirls_short.mp4']) - -memory_config = metadata.MemoryPoolConfig() -memory_config.use_pool = False -db_params = scanner_bindings.make_database_parameters( - storage_config, memory_config.SerializeToString(), db_path) - -master_address = "localhost:5001" -master = scanner_bindings.start_master(db_params) -worker = scanner_bindings.start_worker(db_params, master_address) - -job_params = rpc.JobParameters() -job_params.job_name = "test_job" - -task_set = job_params.task_set -task = task_set.tasks.add() -task.output_table_name = "blurred_mean" -sample = task.samples.add() -sample.table_name = "meangirls" -sample.column_names.extend(["frame", "frame_info"]) -sample.rows.extend(range(1000)) - -input = task_set.evaluators.add() -input.name = "InputTable" -input.device_type = metadata.CPU -input_input = input.inputs.add() -input_input.evaluator_index = -1 -input_input.columns.extend(["frame", "frame_info"]) - -blur = task_set.evaluators.add() -blur.name = "Blur" -blur_input = blur.inputs.add() -blur_input.evaluator_index = 0 -blur_input.columns.extend(["frame", "frame_info"]) -blur.device_type = metadata.CPU -args = kernel_args.BlurArgs() -args.kernel_size = 3 -args.sigma = 0.5 -blur.kernel_args = args.SerializeToString() - -output = 
task_set.evaluators.add() -output.name = "OutputTable" -output_input = output.inputs.add() -output_input.evaluator_index = 1 -output_input.columns.extend(["frame"]) - -channel = grpc.insecure_channel(master_address) -stub = rpc.MasterStub(channel) -stub.NewJob(job_params) diff --git a/examples/simple/test.py b/examples/simple/test.py deleted file mode 100644 index 57cd462a..00000000 --- a/examples/simple/test.py +++ /dev/null @@ -1,12 +0,0 @@ -from scannerpy import Database - -db = Database() - -input_videos = db.ingest_video_collection( - 'meangirls', ['/bigdata/wcrichto.new/videos/meanGirls.mp4']) - -hist_evaluator = db.evaluators.Histogram() - -output_histograms = db.run(videos, hist_evaluator, 'meangirls_histogram') - -histogram = output_histograms.tables[0].columns[0].load() diff --git a/examples/tutorial/00_basic.py b/examples/tutorial/00_basic.py index ffdfa8ba..18b2b362 100644 --- a/examples/tutorial/00_basic.py +++ b/examples/tutorial/00_basic.py @@ -1,10 +1,11 @@ -from scannerpy import Database, DeviceType +from scannerpy import Database, DeviceType, Job, BulkJob from scannerpy.stdlib import parsers + import numpy as np import cv2 import sys import os.path -sys.path.append(os.path.dirname(__file__) + '/..') +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/..') import util ################################################################################ @@ -14,38 +15,63 @@ # Initialize a connection to the Scanner database. Loads configuration from the # ~/.scanner.toml configuration file. -db = Database() - -# Create an operator to run on our video. This computes a histogram with 16 bins -# for each color channel in a given frame. -hist_op = db.ops.Histogram() - -# Create a Scanner table from our video in the format (table name, video path). -# If any videos fail to ingest, they'll show up in the failed list. If force -# is true, it will overwrite existing tables of the same name. 
-example_video_path = util.download_video() -[input_table], failed = db.ingest_videos([ - ('example', example_video_path), - ('thisshouldfail', 'thisshouldfail.mp4')], force=True) -print(db.summarize()) -print('Failures:', failed) - -# Define which frames we're going to run the operator on (all of them, in this -# case). The sampler takes in pairs of (input table name, output table name). -sampler = db.sampler() -tasks = sampler.all([(input_table.name(), 'example_hist')]) - -# Run the operator on the input and get an output table. The columns of the -# output table are written to disk by the Scanner runtime. -[output_table] = db.run(tasks, hist_op, force=True) - -# Load the histograms from a column of the output table. The parsers.histograms -# function converts the raw bytes output by Scanner into a numpy array for each -# channel. -video_hists = output_table.load(['histogram'], parsers.histograms) - -# Loop over the column's rows. Each row is a tuple of the frame number and -# value for that row. -for (frame_index, frame_hists) in video_hists: - assert len(frame_hists) == 3 - assert frame_hists[0].shape[0] == 16 +with Database() as db: + + # Create a Scanner table from our video in the format (table name, + # video path). If any videos fail to ingest, they'll show up in the failed + # list. If force is true, it will overwrite existing tables of the same + # name. + example_video_path = util.download_video() + [input_table], failed = db.ingest_videos([ + ('example', example_video_path), + ('thisshouldfail', 'thisshouldfail.mp4')], force=True) + + print(db.summarize()) + print('Failures:', failed) + + # Scanner processes videos by forming a graph of operations that operate + # on input frames from a table and produce outputs to a new table. + + # FrameInput declares that we want to read from a table column that + # represents a video frame. 
+ frame = db.ops.FrameInput() + + # These frames are input into a Histogram op that computes a color histogram + # for each frame. + hist = db.ops.Histogram(frame=frame) + + # Finally, any columns provided to Output will be saved to the output + # table at the end of the computation. + output_op = db.ops.Output(columns=[hist]) + + # A job defines a table you want to create. In op_args, we bind the frame + # input column from above to the table we want to read from and name + # the output table 'example_hist' by binding a string to output_op. + job = Job( + op_args={ + frame: db.table('example').column('frame'), + output_op: 'example_hist' + } + ) + # Multiple tables can be created using the same execution graph using + # a bulk job. Here we specify the execution graph (or DAG) by providing + # the output_op and also specify the jobs we wish to compute. + bulk_job = BulkJob(output=output_op, jobs=[job]) + + # This executes the job and produces the output table. You'll see a progress + # bar while Scanner is computing the outputs. + output_tables = db.run(bulk_job, force=True) + + # Load the histograms from a column of the output table. The + # parsers.histograms function converts the raw bytes output by Scanner + # into a numpy array for each channel. + video_hists = output_tables[0].load(['histogram'], parsers.histograms) + + # Loop over the column's rows. Each row is a tuple of the frame number and + # value for that row. 
+ num_rows = 0 + for (frame_index, frame_hists) in video_hists: + assert len(frame_hists) == 3 + assert frame_hists[0].shape[0] == 16 + num_rows += 1 + assert num_rows == db.table('example').num_rows() diff --git a/examples/tutorial/01_sampling.py b/examples/tutorial/01_sampling.py index c1e1ddd6..e70d62c4 100644 --- a/examples/tutorial/01_sampling.py +++ b/examples/tutorial/01_sampling.py @@ -1,32 +1,51 @@ -from scannerpy import Database +from scannerpy import Database, Job, BulkJob +from scannerpy.stdlib import parsers ################################################################################ -# This tutorial shows how to use the Sampler class to select which parts of a # -# video to process with an op. # +# This tutorial shows how to select different frames of a video to process. # ################################################################################ -db = Database() -hist_op = db.ops.Histogram() - -# We can access previously created tables with db.table(name). -input_table = db.table('example') - -# The sampler lets you run operators over subsets of frames from your videos. -# Here, the "strided" sampling mode will run over every 8th frame, i.e. frames -# [0, 8, 16, ...] -sampler = db.sampler() -tables = [(input_table.name(), 'example_hist_subsampled')] -tasks = sampler.strided(tables, 8) - -# We pass the tasks to the database same as before, and can process the output -# same as before. -[output_table] = db.run(tasks, hist_op, force=True) - -# Here's some examples of other sampling modes. - -# Range takes a specific subset of a video. Here, it runs over all frames from -# 0 to 100 -sampler.range(tables, 0, 100) - -# Gather takes an arbitrary list of frames from a video. -sampler.gather(tables[0], [10, 17, 32]) +with Database() as db: + frame = db.ops.FrameInput() + + # You can tell Scanner which frames of the video (or which rows of a video + # table) you want to sample. 
Here, we indicate that we want to sample + the frame column (we will say how to sample when specifying a job). + strided_frame = frame.sample() + + # We process the sampled frame same as before. + hist = db.ops.Histogram(frame=strided_frame) + output_op = db.ops.Output(columns=[hist]) + + # For each job, you can specify how sampling should be performed for + # a specific column. In the same way we used the op_args argument to bind + # a table to an input column, we bind a sampling directive to strided_frame. + job = Job( + op_args={ + frame: db.table('example').column('frame'), + # The "strided" sampling mode will run over every 8th frame, + # i.e. frames [0, 8, 16, ...] + strided_frame: db.sampler.strided(8), + output_op: 'example_hist_strided' + } + ) + bulk_job = BulkJob(output=output_op, jobs=[job]) + output_tables = db.run(bulk_job, force=True, pipeline_instances_per_node=1) + + # Loop over the column's rows. Each row is a tuple of the frame number and + # value for that row. + video_hists = output_tables[0].load(['histogram'], parsers.histograms) + num_rows = 0 + for (frame_index, frame_hists) in video_hists: + assert len(frame_hists) == 3 + assert frame_hists[0].shape[0] == 16 + num_rows += 1 + assert num_rows == db.table('example').num_rows() / 8 + + # Here's some examples of other sampling modes. + # Range takes a specific subset of a video. Here, it runs over all frames + # from 0 to 100 + db.sampler.range(0, 100) + + # Gather takes an arbitrary list of frames from a video. 
+ db.sampler.gather([10, 17, 32]) diff --git a/examples/tutorial/02_collections.py b/examples/tutorial/02_collections.py index 8b234b36..c7812f4f 100644 --- a/examples/tutorial/02_collections.py +++ b/examples/tutorial/02_collections.py @@ -1,29 +1,43 @@ -from scannerpy import Database +from scannerpy import Database, Job import sys import os.path -sys.path.append(os.path.dirname(__file__) + '/..') +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/..') import util ################################################################################ # This tutorial shows how to organize your videos into Collections. # ################################################################################ -db = Database() -hist_op = db.ops.Histogram() +with Database() as db: -# Instead of ingesting each video into a table individually, we can group video -# tables into a single entity called a collection. Here, we create a collection -# called "example_collection" from the video in the previous example. -# Collections do not incur any runtime overhead, but are simply an abstraction -# for more easily managing your videos. -example_video_path = util.download_video() -input_collection, _ = db.ingest_video_collection( - 'example_collection', [example_video_path], force=True) -print(db.summarize()) + # Instead of ingesting each video into a table individually, we can group + # video tables into a single entity called a collection. Here, we create + # a collection called "example_collection" from the video in the previous + # example. Collections do not incur any runtime overhead, but are simply + # an abstraction for more easily managing your videos. + example_video_path = util.download_video() + input_collection, _ = db.ingest_video_collection( + 'example_collection', [example_video_path], force=True) + print(db.summarize()) -# We can also provide collections directly to the run function which will run -# the op over all frames in all videos in the collection. 
-output_collection = db.run(input_collection, hist_op, 'example_hist_collection') + # You can retrieve table objects off the collection. + table = output_collection.tables(0) -# You can retrieve table objects off the collection. -output_table = output_collection.tables(0) + frame = db.ops.FrameInput() + hist = db.ops.Histogram(frame=frame) + output_op = db.ops.Output(columns=[hist]) + # You can use a collection to enumerate tables + jobs = [] + for table in input_collection.tables(): + job = Job( + op_args={ + frame: table.column('frame'), + output_op: table.name() + '_output' + } + ) + jobs.append(job) + bulk_job = BulkJob(output=output_op, jobs=jobs) + output_tables = db.run(bulk_job, force=True, pipeline_instances_per_node=1) + + # You can create new collections from existing tables + hist_collection = db.new_collection('hist_collection', output_tables) diff --git a/examples/tutorial/03_ops.py b/examples/tutorial/03_ops.py index fd5b8a45..e4b7a2d0 100644 --- a/examples/tutorial/03_ops.py +++ b/examples/tutorial/03_ops.py @@ -1,40 +1,84 @@ -from scannerpy import Database +from scannerpy import Database, Job, DeviceType, BulkJob ################################################################################ # This tutorial shows how to combine multiple operators into a computation # # graph and wire inputs/outputs. # ################################################################################ -db = Database() -sampler = db.sampler() -tasks = sampler.all([('example', 'example_hist_blurred')]) - -# Scanner can take a directed acyclic graph (DAG) of operators and pass data -# between them. Each graph has an Input node at the beginning that represents -# the data from the input table. -input = db.ops.Input() - -# To wire up the graph, you set the inputs of an operator to be the outputs of -# another. Here, the input op outputs two columns, "frame" which is the raw -# bytes of the frame, and "frame_info" which contains information about the -# width/height/etc. 
of each frame. We feed these two columns into the Blur. -blur = db.ops.Blur( - inputs=[(input, ["frame", "frame_info"])], - kernel_size=3, - sigma=0.5) - -# An op can take inputs from multiple other ops, here taking the blurred frame -# from the Blur op and the frame info from the Input op. -hist = db.ops.Histogram(inputs=[(blur, ["frame"]), (input, ["frame_info"])]) - -# Each op graph must have an Output node at the end that determines which -# columns get saved into the output table. -output = db.ops.Output(inputs=[(hist, ["histogram"])]) - -# You provide the last op in the graph, here the output op, as the argument to -# db.run. -db.run(tasks, output, force=True) - -# Note: if you don't explicitly include an Input or Output node in your op graph -# they will be automatically added for you. This is how the previous examples -# have worked. +with Database() as db: + + # Scanner can take a directed acyclic graph (DAG) of operators and pass data + # between them. Each graph has starts with data from an input table. + frame = db.ops.FrameInput() + + blurred_frame = db.ops.Blur( + frame = frame, + kernel_size = 3, + sigma = 0.5) + + # Multiple operators can be hooked up in a computation by using the outputs + # of one as the inputs of another. + histogram = db.ops.Histogram( + frame = blurred_frame) + + output_op = db.ops.Output(columns=[histogram]) + + job = Job( + op_args={ + frame: db.table('example').column('frame'), + output_op: 'output_table', + } + ) + bulk_job = BulkJob(output=output_op, jobs=[job]) + + db.run(bulk_job, force=True) + + # Ops can have several attributes that affect which stream elements they + # will receive or how they will receive them. These attributes include: + # + # - Batch: The Op can receive multiple elements at once to enable SIMD + # or vector-style processing. + # + # - Stencil: The Op requires a window of input elements (for example, the + # previous and next element) at the same time to produce an + # output. 
+ # + # - Bounded State: For each output, the Op requires at least W sequential + # "warmup" elements before it can produce a valid output. + # For example, if the output of this Op is sampled + # sparsely, this guarantees that the Op can "warmup" + # its state on a stream of W elements before producing the + # requested output. + # + # - Unbounded State: This Op will always process all preceding elements of + # its input streams before producing a requested output. + # This means that sampling operations after this Op + # can not change how many inputs it receives. In the next + # tutorial, we will show how this can be relaxed for + # sub-streams of the input. + # + # The rest of this tutorial will show examples of each attribute in action. + + + # Batch + # Here we specify that the histogram kernel should receive a batch of 8 + # input elements at once. Logically, each element is still processed + # independently but multiple elements are provided for efficient + # batch processing. If there are not enough elements left in a stream, + # the Op may receive less than a batch worth of elements. + histogram = db.ops.Histogram( + frame = frame, + batch = 8) + + + # Stencil + diff = db.ops.FrameDifference( + frame = frame, + stencil = [-1, 0]) + + + # Bounded State + # Detailed documentation to come... + + # Unbounded State + # Detailed documentation to come... diff --git a/examples/tutorial/04_custom_op.py b/examples/tutorial/04_custom_op.py deleted file mode 100644 index f9e632cf..00000000 --- a/examples/tutorial/04_custom_op.py +++ /dev/null @@ -1,27 +0,0 @@ -from scannerpy import Database -import os.path - -################################################################################ -# This tutorial shows how to write and use your own custom op. # -################################################################################ - -# Look at resize_op/resize_op.cpp to start this tutorial. 
- -db = Database() - -if not os.path.isfile('resize_op/build/libresize_op.so'): - print('You need to build the custom op first: \n' - '$ cd resize_op; mkdir build && cd build; cmake ..; make') - exit() - -# To load a custom op into the Scanner runtime, we use db.load_op to open the -# shared library we compiled. If the op takes arguments, it also optionally -# takes a path to the generated python file for the arg protobuf. -db.load_op('resize_op/build/libresize_op.so', 'resize_op/build/resize_pb2.py') - -# Then we use our op just like in the other examples. -resize = db.ops.Resize(width=200, height=300) - -sampler = db.sampler() -tasks = sampler.all([('example', 'example_resized')]) -db.run(tasks, resize, force=True) diff --git a/examples/tutorial/04_slicing.py b/examples/tutorial/04_slicing.py new file mode 100644 index 00000000..8cac8384 --- /dev/null +++ b/examples/tutorial/04_slicing.py @@ -0,0 +1,85 @@ +from scannerpy import Database, Job, BulkJob, DeviceType +from scannerpy.stdlib import parsers +import math + +################################################################################ +# This tutorial shows how to use column slicing to limit Op dependencies # +# within subsequences of the input. # +################################################################################ + +with Database(debug=True) as db: + frame = db.ops.FrameInput() + + # + # table) you want to sample. Here, we indicate that we want to sample + # the frame column (we will say how to sample when specifying a job). + sliced_frame = frame.slice() + + # We process the sampled frame same as before. + hist = db.ops.Histogram(frame=sliced_frame) + unsliced_hist = hist.unslice() + + output_op = db.ops.Output(columns=[unsliced_hist]) + + # For each job, you can specify how sampling should be performed for + # a specific column. In the same way we used the op_args argument to bind + # a table to an input column, we bind a sampling directive to strided_frame. 
+ job = Job( + op_args={ + frame: db.table('example').column('frame'), + # The "strided" sampling mode will run over # every 8th frame, + # i.e. frames [0, 8, 16, ...] + sliced_frame: db.partitioner.all(500), + output_op: 'example_hist_sliced' + } + ) + bulk_job = BulkJob(output=output_op, jobs=[job]) + output_tables = db.run(bulk_job, force=True, pipeline_instances_per_node=2) + + # Loop over the column's rows. Each row is a tuple of the frame number and + # value for that row. + video_hists = output_tables[0].load(['histogram'], parsers.histograms) + num_rows = 0 + for (frame_index, frame_hists) in video_hists: + assert len(frame_hists) == 3 + assert frame_hists[0].shape[0] == 16 + num_rows += 1 + print(num_rows) + assert num_rows == db.table('example').num_rows() + + # + + frame = db.ops.FrameInput() + sliced_frame = frame.slice() + hist = db.ops.Histogram(frame=sliced_frame) + + gath_hist = hist.sample() + + unsliced_hist = gath_hist.unslice() + output_op = db.ops.Output(columns=[unsliced_hist]) + + # For each job, you can specify how sampling should be performed for + # a specific column. In the same way we used the op_args argument to bind + # a table to an input column, we bind a sampling directive to strided_frame. + num_slice_groups = int(math.ceil(db.table('example').num_rows() / 500.0)) + job = Job( + op_args={ + frame: db.table('example').column('frame'), + sliced_frame: db.partitioner.all(500), + gath_hist: [db.sampler.gather([0, 15]) + for _ in range(num_slice_groups)], + output_op: 'example_hist_sliced_gath' + } + ) + bulk_job = BulkJob(output=output_op, jobs=[job]) + output_tables = db.run(bulk_job, force=True, pipeline_instances_per_node=2) + + # Loop over the column's rows. Each row is a tuple of the frame number and + # value for that row. 
+ video_hists = output_tables[0].load(['histogram'], parsers.histograms) + num_rows = 0 + for (frame_index, frame_hists) in video_hists: + assert len(frame_hists) == 3 + assert frame_hists[0].shape[0] == 16 + num_rows += 1 + assert num_rows == num_slice_groups * 2 diff --git a/examples/tutorial/05_compression.py b/examples/tutorial/05_compression.py new file mode 100644 index 00000000..c0d178bd --- /dev/null +++ b/examples/tutorial/05_compression.py @@ -0,0 +1,70 @@ +from scannerpy import Database, Job, DeviceType, BulkJob + +################################################################################ +# This tutorial discusses how Scanner compresses output columns, how to # +# control how and when this compression happens, and how to export compressed # +# video files. +################################################################################ + +with Database() as db: + + # Frames on disk can either be stored uncompressed (raw bits) or compressed + # (encoded using some form of image or video compression). When Scanner + # reads frames from a table, it automatically decodes the data if necessary. + # The Op DAG only sees the raw frames. For example, this table is stored + # as compressed video. + def make_blurred_frame(): + frame = db.ops.FrameInput() + + blurred_frame = db.ops.Blur( + frame = frame, + kernel_size = 3, + sigma = 0.5) + return frame, blurred_frame + + # By default, if an Op outputs a frame with 3 channels with type uint8, + # those frames will be compressed using video encoding. No other frame + # type is currently compressed. 
+    frame, blurred_frame = make_blurred_frame() +    output_op = db.ops.Output(columns=[blurred_frame]) +    job = Job( +        op_args={ +            frame: db.table('example').column('frame'), +            output_op: 'output_table_name', +        } +    ) +    bulk_job = BulkJob(output=output_op, jobs=[job]) +    db.run(bulk_job, force=True) + +    frame, blurred_frame = make_blurred_frame() +    # The compression parameters can be controlled by annotating the column +    low_quality_frame = blurred_frame.compress_video(quality = 35) +    output_op = db.ops.Output(columns=[low_quality_frame]) +    job = Job( +        op_args={ +            frame: db.table('example').column('frame'), +            output_op: 'low_quality_table', +        } +    ) +    bulk_job = BulkJob(output=output_op, jobs=[job]) +    db.run(bulk_job, force=True) + +    # If no compression is desired, this can be specified by indicating that +    # the column should be lossless. +    frame, blurred_frame = make_blurred_frame() +    # The compression parameters can be controlled by annotating the column +    lossless_frame = blurred_frame.lossless() +    output_op = db.ops.Output(columns=[lossless_frame]) +    job = Job( +        op_args={ +            frame: db.table('example').column('frame'), +            output_op: 'pristine_frame', +        } +    ) +    bulk_job = BulkJob(output=output_op, jobs=[job]) +    db.run(bulk_job, force=True) + +    # Any column which is saved as compressed video can be exported as an mp4 +    # file by calling save_mp4 on the column. This will output a file called +    # 'low_quality_video.mp4' in the current directory. +    db.table('low_quality_table').column('frame').save_mp4('low_quality_video') diff --git a/examples/tutorial/06_custom_op.py b/examples/tutorial/06_custom_op.py new file mode 100644 index 00000000..c8f7bc5b --- /dev/null +++ b/examples/tutorial/06_custom_op.py @@ -0,0 +1,35 @@ +from scannerpy import Database, Job, BulkJob +import os.path + +################################################################################ +# This tutorial shows how to write and use your own custom op. 
 # +################################################################################ + +# Look at resize_op/resize_op.cpp to start this tutorial. + +with Database() as db: + +    if not os.path.isfile('resize_op/build/libresize_op.so'): +        print('You need to build the custom op first: \n' +            '$ cd resize_op; mkdir build && cd build; cmake ..; make') +        exit() + +    # To load a custom op into the Scanner runtime, we use db.load_op to open the +    # shared library we compiled. If the op takes arguments, it also optionally +    # takes a path to the generated python file for the arg protobuf. +    db.load_op('resize_op/build/libresize_op.so', 'resize_op/build/resize_pb2.py') + +    frame = db.ops.FrameInput() +    # Then we use our op just like in the other examples. +    resize = db.ops.MyResize( +        frame = frame, +        width = 200, height = 300) +    output_op = db.ops.Output(columns=[resize]) +    job = Job( +        op_args={ +            frame: db.table('example').column('frame'), +            output_op: 'example_resized', +        } +    ) +    bulk_job = BulkJob(output=output_op, jobs=[job]) +    db.run(bulk_job, force=True) diff --git a/examples/tutorial/07_profiling.py b/examples/tutorial/07_profiling.py new file mode 100644 index 00000000..6a5287d2 --- /dev/null +++ b/examples/tutorial/07_profiling.py @@ -0,0 +1,30 @@ +from scannerpy import Database, Job, BulkJob, DeviceType + +################################################################################ +# This tutorial shows how to look at profiling information for your job. 
 # +################################################################################ + +with Database() as db: + +    frame = db.ops.FrameInput() +    histogram = db.ops.Histogram(frame = frame) +    output_op = db.ops.Output(columns=[histogram]) +    job = Job( +        op_args={ +            frame: db.table('example').column('frame'), +            output_op: 'example_hist_profile', +        } +    ) +    bulk_job = BulkJob(output=output_op, jobs=[job]) +    [output_table] = db.run(bulk_job, force=True) + +    # The profiler contains information about how long different parts of your +    # computation take to run. We use Google Chrome's trace format, which you +    # can view by going to chrome://tracing in Chrome and clicking "load" in +    # the top left. +    output_table.profiler().write_trace('hist.trace') + +    # Each row corresponds to a different part of the system, e.g. the thread +    # loading bytes from disk or the thread running your kernels. If you have +    # multiple pipelines or multiple nodes, you will see many of these evaluate +    # threads. diff --git a/examples/tutorial/masterpy.py b/examples/tutorial/masterpy.py new file mode 100644 index 00000000..f7d9309f --- /dev/null +++ b/examples/tutorial/masterpy.py @@ -0,0 +1,2 @@ +from scannerpy import Database +db = Database(workers=['ocean.pdl.local.cmu.edu:15559', 'crissy.pdl.local.cmu.edu:15559']) diff --git a/examples/tutorial/resize_op/Makefile b/examples/tutorial/resize_op/Makefile index adfb8916..ad60ecb7 100644 --- a/examples/tutorial/resize_op/Makefile +++ b/examples/tutorial/resize_op/Makefile @@ -1,14 +1,23 @@ # Scanner outputs build flags that add Scanner to your library and include paths # as well as cover common flags for building a shared library. 
-SCANNER_FLAGS = `python -c "import scannerpy; scannerpy.Database().print_build_flags()"` +SCANNER_FLAGS = `python -c "import scannerpy.stdlib.build_flags"` # Change this to whatever your OpenCV directory is OpenCV_DIR=/opt/opencv-3.2.0 -all: +all: build/libresize_op.so + +clean: + rm -rf build/* + # protoc generates the Python and C++ bindings for the ResizeArgs class. +build/resize.pb.cc: resize.proto protoc resize.proto --python_out=build --cpp_out=build + # g++ builds the op library - g++ build/resize.pb.cc resize_op.cpp -o build/libresize_op.so ${SCANNER_FLAGS} \ +build/libresize_op.so: build/resize.pb.cc resize_op.cpp + g++ $^ -o $@ ${SCANNER_FLAGS} \ -I ${OpenCV_DIR}/include -L ${OpenCV_DIR}/lib -lopencv_core -lopencv_imgproc \ -I build + +.PHONY: clean diff --git a/examples/tutorial/resize_op/resize.proto b/examples/tutorial/resize_op/resize.proto index 5e0211d8..a5a4e542 100644 --- a/examples/tutorial/resize_op/resize.proto +++ b/examples/tutorial/resize_op/resize.proto @@ -1,6 +1,6 @@ syntax = "proto3"; -message ResizeArgs { +message MyResizeArgs { int32 width = 1; int32 height = 2; } \ No newline at end of file diff --git a/examples/tutorial/resize_op/resize_op.cpp b/examples/tutorial/resize_op/resize_op.cpp index 2469a775..5efd6972 100644 --- a/examples/tutorial/resize_op/resize_op.cpp +++ b/examples/tutorial/resize_op/resize_op.cpp @@ -1,8 +1,8 @@ -#include "scanner/api/op.h" // for REGISTER_OP -#include "scanner/api/kernel.h" // for VideoKernel and REGISTER_KERNEL -#include "scanner/util/opencv.h" // for using OpenCV -#include "scanner/util/memory.h" // for device-independent memory management -#include "resize.pb.h" // for ResizeArgs (generated file) +#include "resize.pb.h" // for ResizeArgs (generated file) +#include "scanner/api/kernel.h" // for VideoKernel and REGISTER_KERNEL +#include "scanner/api/op.h" // for REGISTER_OP +#include "scanner/util/memory.h" // for device-independent memory management +#include "scanner/util/opencv.h" // for using 
OpenCV /* * Ops in Scanner are abstract units of computation that are implemented by @@ -13,21 +13,20 @@ // Custom kernels must inherit the Kernel class or any subclass thereof, // e.g. the VideoKernel which provides support for processing video frames. -class ResizeKernel : public scanner::VideoKernel { -public: - +class MyResizeKernel : public scanner::VideoKernel { + public: // To allow ops to be customized by users at a runtime, e.g. to define the - // target width and height of the ResizeKernel, Scanner uses Google's Protocol + // target width and height of the MyResizeKernel, Scanner uses Google's Protocol // Buffers, or protobufs, to define serialzable types usable in C++ and // Python (see resize_op/args.proto). By convention, ops that take // arguments must define a protobuf called Args, e.g. ResizeArgs, // In Python, users will provide the argument fields to the op constructor, // and these will get serialized into a string. This string is part of the // general configuration each kernel receives from the runtime, config.args. - ResizeKernel(const scanner::Kernel::Config& config) - : scanner::VideoKernel(config) { + MyResizeKernel(const scanner::Kernel::Config& config) + : scanner::VideoKernel(config) { // The protobuf arguments must be decoded from the input string. - ResizeArgs args; + MyResizeArgs args; args.ParseFromArray(config.args.data(), config.args.size()); width_ = args.width(); height_ = args.height(); @@ -35,39 +34,38 @@ class ResizeKernel : public scanner::VideoKernel { // Execute is the core computation of the kernel. It maps a batch of rows // from an input table to a batch of rows of the output table. Here, we map - // from two input columns from the video, "frame" and "frame_info", and return + // from one input column from the video, "frame", and return // a single column, "frame". 
- void execute(const scanner::BatchedColumns &input_columns, - scanner::BatchedColumns &output_columns) override { - int input_count = input_columns[0].rows.size(); + void execute(const scanner::BatchedColumns& input_columns, + scanner::BatchedColumns& output_columns) override { + auto& frame_column = input_columns[0]; + int input_count = num_rows(frame_column); // This must be called at the top of the execute method in any VideoKernel. // See the VideoKernel for the implementation check_frame_info. - check_frame_info(scanner::CPU_DEVICE, input_columns[1]); + check_frame(scanner::CPU_DEVICE, frame_column[0]); + + auto& resized_frame_column = output_columns[0]; + scanner::FrameInfo output_frame_info( + height_, width_, 3, scanner::FrameType::U8); for (int i = 0; i < input_count; ++i) { - // Convert the raw input buffer into an OpenCV matrix - cv::Mat input( - frame_info_.height(), - frame_info_.width(), - CV_8UC3, - input_columns[0].rows[i].buffer); + // Get a frame from the batch of input frames + const scanner::Frame* frame = frame_column[i].as_const_frame(); + cv::Mat input = scanner::frame_to_mat(frame); - // Allocate a buffer for the output - size_t output_size = width_ * height_ * 3; - unsigned char* output_buf = - scanner::new_buffer(scanner::CPU_DEVICE, output_size); - cv::Mat output(height_, width_, CV_8UC3, output_buf); + // Allocate a frame for the resized output frame + scanner::Frame* resized_frame = + scanner::new_frame(scanner::CPU_DEVICE, output_frame_info); + cv::Mat output = scanner::frame_to_mat(resized_frame); - // Call to OpenCV for the resize cv::resize(input, output, cv::Size(width_, height_)); - // Add the buffer to an output column - INSERT_ROW(output_columns[0], output_buf, output_size); + scanner::insert_frame(resized_frame_column, resized_frame); } } -private: + private: int width_; int height_; }; @@ -75,8 +73,8 @@ class ResizeKernel : public scanner::VideoKernel { // These functions run statically when the shared library is loaded to 
tell the // Scanner runtime about your custom op. -REGISTER_OP(Resize).inputs({"frame", "frame_info"}).outputs({"frame"}); +REGISTER_OP(MyResize).frame_input("frame").frame_output("frame"); -REGISTER_KERNEL(Resize, ResizeKernel) - .device(scanner::DeviceType::CPU) - .num_devices(1); +REGISTER_KERNEL(MyResize, MyResizeKernel) + .device(scanner::DeviceType::CPU) + .num_devices(1); diff --git a/examples/tutorial/workerpy.py b/examples/tutorial/workerpy.py new file mode 100644 index 00000000..c51f2b71 --- /dev/null +++ b/examples/tutorial/workerpy.py @@ -0,0 +1,7 @@ +from scannerpy import Database +db = Database() + +db.start_worker() + +import time +time.sleep(1000) \ No newline at end of file diff --git a/examples/util.py b/examples/util.py index 69c07830..82c911a5 100644 --- a/examples/util.py +++ b/examples/util.py @@ -15,5 +15,5 @@ def download_video(): 'outtmpl': u'/tmp/example.%(ext)s' } with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download(["https://www.youtube.com/watch?v=tFMo3UJ4B4g"]) + ydl.download(["https://www.youtube.com/watch?v=79DijItQXMM"]) return VID_PATH diff --git a/google.md b/google.md new file mode 100644 index 00000000..806de091 --- /dev/null +++ b/google.md @@ -0,0 +1,59 @@ +# Getting started with Google Cloud + +This guide will walk you through setting up Scanner on Google Cloud. You will need to have a Google account. + +## 1. Install the Cloud SDK + +On your local machine (laptop/desktop), follow the instructions here to install Google's Cloud SDK: [https://cloud.google.com/sdk/downloads](https://cloud.google.com/sdk/downloads) + +## 2. Create a project + +If you do not already have a project created, pick a project ID for your application, e.g. `my-scanner-project`. Then run: +```bash +gcloud projects create +``` + +## 3. Make a bucket + +You will need to store your videos in Google Cloud Storage. Cloud Storage is organized into independent buckets (like top-level directories). Pick a name for your bucket, e.g. 
`scanner-data`, and run: +```bash +gsutil mb gs://scanner-data +``` + +## 4. Enable S3 interoperability + +We use an S3 API to access GCS (for good reasons), so you need to explicitly enable this feature. Go here: [https://console.cloud.google.com/storage/settings](https://console.cloud.google.com/storage/settings) + +Click *Enable interoperability access* and then click *Create a new key*. Into your local shell, run: +```bash +export AWS_ACCESS_KEY_ID= +export AWS_SECRET_ACCESS_KEY= +``` + +I would recommend putting these in your shell's `.*rc` file as well. + +## 5. Set up your Scanner config + +Change the storage heading in your `~/.scanner.toml` to use GCS: +```toml +[storage] +type = "gcs" +bucket = "" +db_path = "scanner_db" +``` + +## 6. Upload your videos into your bucket + +You can copy videos onto GCS like this: +```bash +gsutil cp example.mp4 gs://scanner-data/videos/ +``` + +## 7. You're done! + +Now, whenever you want to specify an ingest path, it does not need a leading slash and should not include the bucket name. 
For example, with the config above, the following is a valid ingest path: +``` +videos/example.mp4 +``` + +If you want to use Google Cloud to scale computation instead of just storage, take a look at our Kubernetes adapter: [https://github.com/scanner-research/scanner-kube](https://github.com/scanner-research/scanner-kube) diff --git a/nets/caffe_facenet.toml b/nets/caffe_facenet.toml deleted file mode 100644 index 055bbb4a..00000000 --- a/nets/caffe_facenet.toml +++ /dev/null @@ -1,24 +0,0 @@ -[net] - model = "nets/caffe_facenet/facenet_deploy.prototxt" - weights = "nets/caffe_facenet/facenet_deploy.caffemodel" - input_layers = ["data"] - output_layers = ["score_final"] - - [net.input] - dimensions = [ - "batch", - "channel", - "height", - "width" - ] - channel_ordering = [ - "red", - "green", - "blue" - ] - -[mean-image] - [mean-image.colors] - red = 119.29959869 - green = 110.54627228 - blue = 101.8384321 diff --git a/nets/cpm.toml b/nets/cpm.toml deleted file mode 100644 index b8369c43..00000000 --- a/nets/cpm.toml +++ /dev/null @@ -1,28 +0,0 @@ -[net] - model = "features/cpm/pose_deploy_resize.prototxt" - weights = "features/cpm/pose_iter_320000.caffemodel" - input_layers = ["data"] - output_layers = ["Mconv7_stage6"] - input_height = 368 - input_width = 368 - normalize = false - - [net.input] - dimensions = [ - "batch", - "channel", - "height", - "width" - ] - channel_ordering = [ - "blue", - "green", - "red", - "weight" - ] - -[mean-image] - [mean-image.colors] - blue = 0.0 - green = 0.0 - red = 0.0 diff --git a/nets/cpm2.toml b/nets/cpm2.toml deleted file mode 100644 index 03c815ab..00000000 --- a/nets/cpm2.toml +++ /dev/null @@ -1,28 +0,0 @@ -[net] - model = "features/cpm2/mpi_pose_deploy_linvec.prototxt" - weights = "features/cpm2/mpi_pose_iter_160000.caffemodel" - input_layers = ["image"] - output_layers = ["resized_map", "joints"] - input_height = 368 - preserve_aspect_ratio = true - pad_mod = 8 - normalize = false - - [net.input] - dimensions = [ - 
"batch", - "channel", - "height", - "width" - ] - channel_ordering = [ - "blue", - "green", - "red", - ] - -[mean-image] - [mean-image.colors] - blue = 0.0 - green = 0.0 - red = 0.0 diff --git a/nets/faster_rcnn.toml b/nets/faster_rcnn.toml index 05a4130e..b48473ef 100644 --- a/nets/faster_rcnn.toml +++ b/nets/faster_rcnn.toml @@ -1,6 +1,6 @@ [net] - model = "features/faster_rcnn/faster_rcnn_test.pt" - weights = "features/faster_rcnn/VGG16_faster_rcnn_final.caffemodel" + model = "nets/faster_rcnn/faster_rcnn_test.pt" + weights = "nets/faster_rcnn/VGG16_faster_rcnn_final.caffemodel" input_layers = ["data", "im_info"] output_layers = ["cls_prob", "rois", "fc7"] normalize = false diff --git a/nets/faster_rcnn_coco.toml b/nets/faster_rcnn_coco.toml index bf434b5b..f12d4304 100644 --- a/nets/faster_rcnn_coco.toml +++ b/nets/faster_rcnn_coco.toml @@ -1,9 +1,11 @@ [net] - model = "features/faster_rcnn_coco/test.prototxt" - weights = "features/faster_rcnn_coco/coco_vgg16_faster_rcnn_final.caffemodel" - input_layers = ["data", "im_info"] + model = "nets/faster_rcnn_coco/test.prototxt" + weights = "nets/faster_rcnn_coco/coco_vgg16_faster_rcnn_final.caffemodel" + input_layers = ["data"] output_layers = ["cls_prob", "rois", "fc7"] + # output_layers = ["fc7"] normalize = false + uses_python = true [net.input] dimensions = [ diff --git a/nets/get_cpm.sh b/nets/get_cpm.sh deleted file mode 100755 index 5bbf95f9..00000000 --- a/nets/get_cpm.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -CWD=`pwd` -DIR=`basename $CWD` - -prog() { - mkdir nets/cpm - cd nets/cpm - - # Person center detection - wget https://raw.githubusercontent.com/shihenw/convolutional-pose-machines-release/master/model/_trained_person_MPI/pose_deploy_copy_4sg_resize.prototxt - wget http://pearl.vasc.ri.cmu.edu/caffe_model_github/model/_trained_person_MPI/pose_iter_70000.caffemodel - - # Pose estimation - wget 
https://raw.githubusercontent.com/shihenw/convolutional-pose-machines-release/master/model/_trained_MPI/pose_deploy_resize.prototxt - wget http://pearl.vasc.ri.cmu.edu/caffe_model_github/model/_trained_MPI/pose_iter_320000.caffemodel - - cd $CWD -} - -if [[ "$DIR" != "scanner" ]] && [[ "$1" != "-f" ]]; -then - echo "Warning: you must run this script from the Scanner repo root, and I don't think you are." - echo "Run this again with -f if you're sure." -else - prog -fi diff --git a/nets/get_cpm2.sh b/nets/get_cpm2.sh deleted file mode 100644 index adbcbe46..00000000 --- a/nets/get_cpm2.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -CWD=`pwd` -DIR=`basename $CWD` - -prog() { - mkdir nets/cpm2 - cd nets/cpm2 - - # Prototxt for COCO version - wget https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/caffe_demo/master/model/coco/pose_deploy_linevec.prototxt - mv pose_deploy_linvec.prototxt coco_pose_deploy_linvec.prototxt - # Prototxt for MPI version - wget https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/caffe_demo/master/model/mpi/pose_deploy_linevec.prototxt - mv pose_deploy_linvec.prototxt mpi_pose_deploy_linvec.prototxt - # Caffemodel for COCO - wget http://posefs1.perception.cs.cmu.edu/Users/tsimon/Projects/coco/data/models/coco/pose_iter_440000.caffemodel - mv pose_iter_440000.caffemodel coco_pose_iter_440000.caffemodel - # Caffemodel for MPI - wget http://posefs1.perception.cs.cmu.edu/Users/tsimon/Projects/coco/data/models/mpi/pose_iter_160000.caffemodel - mv pose_iter_160000.caffemodel mpi_pose_iter_160000.caffemodel - - cd $CWD -} - -if [[ "$DIR" != "scanner" ]] && [[ "$1" != "-f" ]]; -then - echo "Warning: you must run this script from the Scanner repo root, and I don't think you are." - echo "Run this again with -f if you're sure." 
-else - prog -fi diff --git a/nets/get_frcnn_coco.sh b/nets/get_frcnn_coco.sh new file mode 100755 index 00000000..6d8fda2e --- /dev/null +++ b/nets/get_frcnn_coco.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +CWD=`pwd` +DIR=`basename $CWD` + +prog() { + mkdir nets/faster_rcnn_coco + cd nets/faster_rcnn_coco + + # Prototxt for MPI version + wget https://storage.googleapis.com/scanner-data/models/faster_rcnn_coco/test.prototxt + # Caffemodel for MPI + wget https://storage.googleapis.com/scanner-data/models/faster_rcnn_coco/coco_vgg16_faster_rcnn_final.caffemodel + + cd $CWD +} + +if [[ "$DIR" != "scanner" ]] && [[ "$1" != "-f" ]]; +then + echo "Warning: you must run this script from the Scanner repo root, and I don't think you are." + echo "Run this again with -f if you're sure." +else + prog +fi diff --git a/nets/get_resnet.sh b/nets/get_resnet.sh index 2f2e6318..24497fb2 100755 --- a/nets/get_resnet.sh +++ b/nets/get_resnet.sh @@ -7,11 +7,9 @@ prog() { mkdir -p nets/resnet cd nets/resnet - wget https://iuxblw-bn1306.files.1drv.com/y3ml4MHciDBiEDaTSmHlVGB9Hm9cIQNS53sbuCwaolComo2PZ55hhPo5SijUqhtgTv8cad4vvbn7LOY_KPNwJsz-NQTpJENAFTTdVIML1J7-_1uU2hQHE54eak7bf_ZjTJK9aOKxzBPrxrtm8Uu0d3TUPDmcG9ieDoSuonT_YpdKC0/ResNet-50-deploy.prototxt?download&psid=1 - mv ResNet-50-deploy.prototxt?download ResNet-50-deploy.prototxt + wget https://storage.googleapis.com/scanner-data/models/resnet/ResNet-50-deploy.prototxt - wget https://iuxblw-bn1306.files.1drv.com/y3mUZccSR9x9zlIg_J9kGKeSlmZVvbNxfV8Rajw74tIIsvfHH3AH9GZ3cmYZoaePkTzliM1K5fDEzPKW0z-BLyvxm8rdzNvwgwjUjo2RMsMcxBVd8gzKCKC6WPowuGzRDB9wFK942ZvywiJ12bwiar8OCKy2NlRQbCUw3f_PRaUVc0/ResNet-50-model.caffemodel?download&psid=1 - mv ResNet-50-model.caffemodel?download ResNet-50-model.caffemodel + wget https://storage.googleapis.com/scanner-data/models/resnet/ResNet-50-model.caffemodel cd $CWD } diff --git a/nets/googlenet.toml b/nets/googlenet.toml index 01c200a2..bb9e67e3 100644 --- a/nets/googlenet.toml +++ b/nets/googlenet.toml @@ -1,6 +1,6 @@ 
[net] - model = "features/googlenet/deploy.prototxt" - weights = "features/googlenet/bvlc_googlenet.caffemodel" + model = "nets/googlenet/deploy.prototxt" + weights = "nets/googlenet/bvlc_googlenet.caffemodel" source = "https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet" input_layers = ["data"] output_layers = ["loss3/classifier"] diff --git a/nets/resnet.toml b/nets/resnet.toml index d324e6ed..0edcf25e 100644 --- a/nets/resnet.toml +++ b/nets/resnet.toml @@ -1,6 +1,6 @@ [net] - model = "features/resnet/ResNet-50-deploy.prototxt" - weights = "features/resnet/ResNet-50-model.caffemodel" + model = "nets/resnet/ResNet-50-deploy.prototxt" + weights = "nets/resnet/ResNet-50-model.caffemodel" source = "https://github.com/KaimingHe/deep-residual-networks" input_layers = ["data"] output_layers = ["fc1000"] @@ -15,9 +15,9 @@ "width", ] channel_ordering = [ - "blue" - "green" - "red" + "blue", + "green", + "red", ] [mean-image] diff --git a/nets/resnet_101.toml b/nets/resnet_101.toml index 393805fa..0bbca454 100644 --- a/nets/resnet_101.toml +++ b/nets/resnet_101.toml @@ -15,9 +15,9 @@ "width", ] channel_ordering = [ - "blue" - "green" - "red" + "blue", + "green", + "red", ] [mean-image] diff --git a/examples/reverse_image_search/TODO b/python/scanner/__init__.py similarity index 100% rename from examples/reverse_image_search/TODO rename to python/scanner/__init__.py diff --git a/python/scanner/engine/__init__.py b/python/scanner/engine/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/scanner/stdlib/__init__.py b/python/scanner/stdlib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/scannerpy/__init__.py b/python/scannerpy/__init__.py index e82739ef..66aacc0c 100644 --- a/python/scannerpy/__init__.py +++ b/python/scannerpy/__init__.py @@ -1,3 +1,7 @@ -from common import ScannerException, DeviceType -from database import Database -from config import Config +from __future__ import absolute_import, division, 
print_function, unicode_literals +from scannerpy.common import ScannerException, DeviceType, DeviceHandle, ColumnType +from scannerpy.job import Job +from scannerpy.bulk_job import BulkJob +from scannerpy.database import Database, ProtobufGenerator, start_master, start_worker +from scannerpy.config import Config +from scannerpy.kernel import Kernel, KernelConfig diff --git a/python/scannerpy/bulk_job.py b/python/scannerpy/bulk_job.py new file mode 100644 index 00000000..0a6490f7 --- /dev/null +++ b/python/scannerpy/bulk_job.py @@ -0,0 +1,13 @@ +class BulkJob(object): + """ + Specifies a set of jobs that will share the same execution DAG. + """ + def __init__(self, output, jobs): + self._output = output + self._jobs = jobs + + def output(self): + return self._output + + def jobs(self): + return self._jobs diff --git a/python/scannerpy/collection.py b/python/scannerpy/collection.py deleted file mode 100644 index 3a2f64e3..00000000 --- a/python/scannerpy/collection.py +++ /dev/null @@ -1,25 +0,0 @@ -from common import * - - -class Collection: - """ - A set of Table objects. 
- """ - - def __init__(self, db, name, descriptor): - self._db = db - self._name = name - self._descriptor = descriptor - - def name(self): - return self._name - - def table_names(self): - return list(self._descriptor.tables) - - def tables(self, index=None): - tables = [self._db.table(t) for t in self._descriptor.tables] - return tables[index] if index is not None else tables - - def profiler(self): - return self._db.profiler(self._descriptor.job_id) diff --git a/python/scannerpy/column.py b/python/scannerpy/column.py index 64e9d2e8..e6935675 100644 --- a/python/scannerpy/column.py +++ b/python/scannerpy/column.py @@ -1,58 +1,123 @@ +from __future__ import absolute_import, division, print_function, unicode_literals import struct -import cv2 import math -from common import * +from subprocess import Popen, PIPE +import tempfile +import os +from scannerpy.stdlib import parsers +from scannerpy.common import * +from scannerpy.job import Job +from scannerpy.bulk_job import BulkJob -class Column: +class Column(object): """ A column of a Table. 
""" - def __init__(self, table, descriptor): + def __init__(self, table, name): self._table = table - self._descriptor = descriptor + self._name = name self._db = table._db self._storage = table._db.config.storage self._db_path = table._db.config.db_path + self._loaded = False + self._descriptor = None + self._video_descriptor = None + + def _load_meta(self): + if not self._loaded: + self._loaded = True + descriptor, video_descriptor = self._table._load_column(self._name) + self._descriptor = descriptor + self._video_descriptor = video_descriptor + def name(self): - return self._descriptor.name + return self._name + + def type(self): + self._load_meta() + return self._descriptor.type + + def id(self): + self._load_meta() + return self._descriptor.id + + def keyframes(self): + self._load_meta() + if (self._descriptor.type == self._db.protobufs.Video and + self._video_descriptor.codec_type == + self._db.protobufs.VideoDescriptor.H264): + # For each encoded video, add start frame offset + frame_offset = 0 + kf_offset = 0 + keyframes = [] + for frames_per_video, kfs_per_video in zip( + self._video_descriptor.frames_per_video, + self._video_descriptor.keyframes_per_video): + keyframes += [ + frame_offset + kfi + for kfi in self._video_descriptor.keyframe_indices[ + kf_offset:kf_offset + kfs_per_video]] + frame_offset += frames_per_video + kf_offset += kfs_per_video + return keyframes + else: + return range(self._table.num_rows()) def _load_output_file(self, item_id, rows, fn=None): assert len(rows) > 0 + metadata_path = '{}/tables/{}/{}_{}_metadata.bin'.format( + self._db_path, self._table._descriptor.id, + self._descriptor.id, item_id) + try: + metadata_contents = self._storage.read( + metadata_path.encode('ascii')) + except UserWarning: + raise ScannerException('Path {} does not exist'.format( + metadata_path)) + path = '{}/tables/{}/{}_{}.bin'.format( self._db_path, self._table._descriptor.id, self._descriptor.id, item_id) try: - contents = self._storage.read(path) + 
contents = self._storage.read(path.encode('ascii')) except UserWarning: raise ScannerException('Path {} does not exist'.format(path)) lens = [] + total_rows = 0 + i = 0 + while i < len(metadata_contents): + (num_rows,) = struct.unpack("=Q", metadata_contents[i:i+8]) + total_rows += num_rows + i += 8 + for fi in range(num_rows): + (buf_len,) = struct.unpack("=Q", metadata_contents[i:i+8]) + lens.append(buf_len) + i += 8 + start_pos = None pos = 0 - (num_rows,) = struct.unpack("l", contents[:8]) - - i = 8 - rows = rows if len(rows) > 0 else range(num_rows) - for fi in range(num_rows): - (buf_len,) = struct.unpack("l", contents[i:i+8]) - i += 8 + rows = rows if len(rows) > 0 else range(total_rows) + for fi in range(total_rows): old_pos = pos - pos += buf_len + pos += lens[fi] if start_pos is None: start_pos = old_pos - lens.append(buf_len) rows_idx = 0 - i = 8 + num_rows * 8 + start_pos + i = start_pos for j, buf_len in enumerate(lens): - if j == rows[rows_idx]: + if rows_idx < len(rows) and j == rows[rows_idx]: buf = contents[i:i+buf_len] - if fn is not None: - yield fn(buf, self._db) + # len(buf) == 0 when element is null + if len(buf) == 0: + yield None + elif fn is not None: + yield fn(buf, self._db.protobufs) else: yield buf rows_idx += 1 @@ -65,7 +130,7 @@ def _load(self, fn=None, rows=None): # Integer divide, round up num_items = len(table_descriptor.end_rows) bufs = [] - input_rows = self._table.rows() + input_rows = list(range(self._table.num_rows())) assert len(input_rows) == total_rows i = 0 rows_so_far = 0 @@ -75,7 +140,7 @@ def _load(self, fn=None, rows=None): for item_id in range(num_items): start_row = prev end_row = table_descriptor.end_rows[item_id] - item_rows = start_row - end_row + item_rows = end_row - start_row prev = end_row select_rows = [] while rows_idx < len(rows): @@ -91,10 +156,6 @@ def _load(self, fn=None, rows=None): i += 1 rows_so_far += item_rows - def _decode_png(self, png, db): - return cv2.imdecode(np.frombuffer(png, 
dtype=np.dtype(np.uint8)), - cv2.IMREAD_COLOR) - # TODO(wcrichto): don't show progress bar when running decode png def load(self, fn=None, rows=None): """ @@ -110,10 +171,12 @@ def load(self, fn=None, rows=None): `fn`). """ + self._load_meta() # If the column is a video, then dump the requested frames to disk as # PNGs and return the decoded PNGs - if self._descriptor.type == self._db.protobufs.Video: - sampler = self._db.sampler() + if (self._descriptor.type == self._db.protobufs.Video and + self._video_descriptor.codec_type == + self._db.protobufs.VideoDescriptor.H264): png_table_name = self._db._png_dump_prefix.format(self._table.name()) if self._db.has_table(png_table_name): png_table = self._db.table(png_table_name) @@ -121,14 +184,86 @@ def load(self, fn=None, rows=None): png_table.num_rows() == self._table.num_rows() and \ png_table._descriptor.timestamp > \ self._table._descriptor.timestamp: - return png_table.columns(0).load(self._decode_png) + return png_table.load(['img'], parsers.image) pair = [(self._table.name(), png_table_name)] - if rows is None: - tasks = sampler.all(pair) - else: - tasks = [sampler.gather(pair[0], rows)] - [out_tbl] = self._db.run(tasks, self._db.ops.ImageEncoder(), - force=True, show_progress=False) - return out_tbl.columns(0).load(self._decode_png) + op_args = {} + frame = self._db.ops.FrameInput() + op_args[frame] = self + enc_input = frame + if rows is not None: + sampled_frame = frame.sample() + op_args[sampled_frame] = self._db.sampler.gather(rows) + enc_input = sampled_frame + img = self._db.ops.ImageEncoder(frame = enc_input) + output_op = self._db.ops.Output(columns=[img]) + op_args[output_op] = png_table_name + job = Job(op_args=op_args) + bulk_job = BulkJob(output=output_op, jobs=[job]) + [out_tbl] = self._db.run(bulk_job, force=True, show_progress=False) + return out_tbl.load(['img'], parsers.image) + elif self._descriptor.type == self._db.protobufs.Video: + frame_type = self._video_descriptor.frame_type + if 
frame_type == self._db.protobufs.U8: + dtype = np.uint8 + elif frame_type == self._db.protobufs.F32: + dtype = np.float32 + elif frame_type == self._db.protobufs.F64: + dtype = np.float64 + parser_fn = parsers.raw_frame_gen(self._video_descriptor.height, + self._video_descriptor.width, + self._video_descriptor.channels, + dtype) + return self._load(fn=parser_fn, rows=rows) else: return self._load(fn, rows=rows) + + def save_mp4(self, output_name, fps=None, scale=None): + self._load_meta() + if not (self._descriptor.type == self._db.protobufs.Video and + self._video_descriptor.codec_type == + self._db.protobufs.VideoDescriptor.H264): + raise ScannerException('Attempted to save a non-h264-compressed ' + 'column as an mp4. Try compressing the ' + 'column first by saving the output as ' + 'an RGB24 frame') + num_items = len(self._table._descriptor.end_rows) + + paths = ['{}/tables/{:d}/{:d}_{:d}.bin'.format( + self._db._db_path, + self._table._descriptor.id, self._descriptor.id, item_id) + for item_id in range(num_items)] + temp_paths = [] + for _ in range(len(paths)): + fd, p = tempfile.mkstemp() + os.close(fd) + temp_paths.append(p) + # Copy all files locally before calling ffmpeg + for in_path, temp_path in zip(paths, temp_paths): + with open(temp_path, 'w') as f: + f.write(self._storage.read(in_path.encode('ascii'))) + + files = '|'.join(temp_paths) + + vid_fps = (fps or + (1.0/(self._video_descriptor.time_base_num / + float(self._video_descriptor.time_base_denom)))) + + args = '' + if scale: + args += '-filter:v "scale={:d}x{:d}" '.format(scale[0], scale[1]) + + cmd = ( + 'ffmpeg -y ' + '-r {fps:f} ' # set the input fps + '-i "concat:{input_files:s}" ' # concatenate the h264 files + '-c:v libx264 ' + '-filter:v "setpts=N" ' # h264 does not have pts' in it + '{extra_args:s}' + '{output_name:s}.mp4'.format( + input_files = files, + fps = vid_fps, + extra_args = args, + output_name=output_name)) + rc = Popen(cmd, shell=True).wait() + if rc != 0: + raise 
ScannerException('ffmpeg failed during mp4 export!') diff --git a/python/scannerpy/common.py b/python/scannerpy/common.py index 89364000..ff458da8 100644 --- a/python/scannerpy/common.py +++ b/python/scannerpy/common.py @@ -1,4 +1,4 @@ -import logging as log +from __future__ import absolute_import, division, print_function, unicode_literals import numpy as np import enum from collections import defaultdict @@ -14,10 +14,31 @@ class DeviceType(enum.Enum): GPU = 1 @staticmethod - def to_proto(db, device): + def to_proto(protobufs, device): if device == DeviceType.CPU: - return db.protobufs.CPU + return protobufs.CPU elif device == DeviceType.GPU: - return db.protobufs.GPU + return protobufs.GPU else: raise ScannerException('Invalid device type') + + +class DeviceHandle(object): + def __init__(self, device, device_id): + self.device = device + self.device_id = device_id + + +class ColumnType(enum.Enum): + """ Enum for specifying what the type of a column is. """ + Blob = 0 + Video = 1 + + @staticmethod + def to_proto(protobufs, ty): + if ty == ColumnType.Blob: + return protobufs.Other + elif ty == ColumnType.Video: + return protobufs.Video + else: + raise ScannerException('Invalid column type') diff --git a/python/scannerpy/config.py b/python/scannerpy/config.py index 63963fd6..b0da3409 100644 --- a/python/scannerpy/config.py +++ b/python/scannerpy/config.py @@ -1,9 +1,11 @@ +from __future__ import absolute_import, division, print_function, unicode_literals import os import toml import sys from subprocess import check_output -from common import * -from storehousepy import StorageConfig, StorageBackend + +from scannerpy.common import * +from storehouse import StorageConfig, StorageBackend def read_line(s): @@ -11,10 +13,8 @@ def read_line(s): class Config(object): - def __init__(self, config_path=None): - log.basicConfig( - level=log.DEBUG, - format='%(levelname)s %(asctime)s %(filename)s:%(lineno)03d] %(message)s') + def __init__(self, config_path=None, db_path=None): 
+ self.config_path = config_path or self.default_config_path() if not os.path.isfile(self.config_path): @@ -30,38 +30,59 @@ def __init__(self, config_path=None): f.write(toml.dumps(config)) print('Wrote Scanner configuration to {}'.format(path)) - config = self.load_config(self.config_path) + self.config = self.load_config(self.config_path) + config = self.config try: self.module_dir = os.path.dirname(os.path.realpath(__file__)) build_path = self.module_dir + '/build' sys.path.append(build_path) - storage = config['storage'] - storage_type = storage['type'] - self.db_path = str(storage['db_path']) - if storage_type == 'posix': - storage_config = StorageConfig.make_posix_config() - elif storage_type == 'gcs': - with open(storage['key_path']) as f: - key = f.read() - storage_config = StorageConfig.make_gcs_config( - storage['cert_path'].encode('latin-1'), - key, - storage['bucket'].encode('latin-1')) + if db_path is not None: + self.db_path = db_path else: - raise ScannerException('Unsupported storage type {}'.format(storage_type)) + storage = config['storage'] + self.db_path = str(storage['db_path']) + storage_config = self._make_storage_config(config) - self.master_address = 'localhost:5001' + self.master_address = 'localhost' + self.master_port = '5001' + self.worker_port = '5002' if 'network' in config: network = config['network'] - if 'master_address' in network: - self.master_address = network['master_address'] + if 'master' in network: + self.master_address = network['master'].encode( + 'ascii', 'ignore') + if 'master_port' in network: + self.master_port = network['master_port'].encode( + 'ascii', 'ignore') + if 'worker_port' in network: + self.worker_port = network['worker_port'].encode( + 'ascii', 'ignore') except KeyError as key: - raise ScannerException('Scanner config missing key: {}'.format(key)) + raise ScannerException( + 'Scanner config missing key: {}'.format(key)) self.storage_config = storage_config self.storage = 
StorageBackend.make_from_config(storage_config) + def _make_storage_config(self, config): + storage = config['storage'] + storage_type = storage['type'] + if storage_type == 'posix': + storage_config = StorageConfig.make_posix_config() + elif storage_type == 'gcs': + storage_config = StorageConfig.make_gcs_config( + storage['bucket'].encode('latin-1')) + elif storage_type == 's3': + storage_config = StorageConfig.make_s3_config( + storage['bucket'].encode('latin-1'), + storage['region'].encode('latin-1'), + storage['endpoint'].encode('latin-1')) + else: + raise ScannerException( + 'Unsupported storage type {}'.format(storage_type)) + return storage_config + @staticmethod def default_config_path(): return os.path.expanduser('~') + '/.scanner.toml' @@ -76,20 +97,40 @@ def load_config(self, path): @staticmethod def default_config(): - hostname = check_output(['hostname', '-A']).split(' ')[0] - - scanner_path = os.path.abspath( - os.path.join(os.path.dirname(__file__), '..', '..')) + hostname = check_output(['hostname']).strip() db_path = os.path.expanduser('~') + '/.scanner_db' return { - 'scanner_path': scanner_path, 'storage': { 'type': 'posix', 'db_path': db_path, }, 'network': { - 'master': hostname + ':5001' + 'master': hostname, + 'master_port': '5001', + 'worker_port': '5002' } } + + def __getstate__(self): + # capture what is normally pickled + state = self.__dict__.copy() + # Get rid of the storehouse objects + state.pop('storage_config', None) + state.pop('storage', None) + # what we return here will be stored in the pickle + return state + + def __setstate__(self, newstate): + self.module_dir = os.path.dirname(os.path.realpath(__file__)) + build_path = self.module_dir + '/build' + if not build_path in sys.path: + sys.path.append(build_path) + sys.stdout.flush() + + sc = self._make_storage_config(newstate['config']) + newstate['storage_config'] = sc + newstate['storage'] = StorageBackend.make_from_config(sc) + # re-instate our __dict__ state from the pickled 
state + self.__dict__.update(newstate) diff --git a/python/scannerpy/database.py b/python/scannerpy/database.py index 1b6f2228..b3cb3fa7 100644 --- a/python/scannerpy/database.py +++ b/python/scannerpy/database.py @@ -1,24 +1,127 @@ +from __future__ import absolute_import, division, print_function, unicode_literals import os import os.path import sys import grpc import imp import socket +import time +import ipaddress +import pickle +import struct +import signal +import copy +import collections +import subprocess +from tqdm import tqdm + +from multiprocessing import Process, Queue from subprocess import Popen, PIPE from random import choice from string import ascii_uppercase - -from common import * -from profiler import Profiler -from config import Config -from op import OpGenerator, Op -from sampler import Sampler -from collection import Collection -from table import Table -from column import Column - - -class Database: +from threading import Thread +from multiprocessing.dummy import Pool as ThreadPool + +from scannerpy.common import * +from scannerpy.profiler import Profiler +from scannerpy.config import Config +from scannerpy.op import OpGenerator, Op, OpColumn +from scannerpy.sampler import Sampler +from scannerpy.partitioner import TaskPartitioner +from scannerpy.table import Table +from scannerpy.column import Column +from scannerpy.protobuf_generator import ProtobufGenerator +from scannerpy.job import Job +from scannerpy.bulk_job import BulkJob + +from storehouse import StorageConfig, StorageBackend + +import scannerpy.libscanner as bindings +import scanner.metadata_pb2 as metadata_types +import scanner.engine.rpc_pb2 as rpc_types +import scanner.engine.rpc_pb2_grpc as grpc_types +import scanner.types_pb2 as misc_types + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +def start_master(port=None, config=None, config_path=None, block=False, + watchdog=True, prefetch_table_metadata=True, + no_workers_timeout=30): + """ + Start a master server 
instance on this node. + + Kwargs: + config: A scanner Config object. If specified, config_path is + ignored. + config_path: Path to a Scanner configuration TOML, by default + assumed to be `~/.scanner.toml`. + block: If true, will wait until the server is shutdown. Server + will not shutdown currently unless wait_For_server_shutdown + is eventually called. + + Returns: + A cpp database instance. + """ + config = config or Config(config_path) + port = port or config.master_port + + # Load all protobuf types + db = bindings.Database( + config.storage_config, + config.db_path.encode('ascii'), + (config.master_address + ':' + port).encode('ascii')) + result = bindings.start_master(db, port.encode('ascii'), watchdog, + prefetch_table_metadata, + no_workers_timeout) + if not result.success(): + raise ScannerException('Failed to start master: {}'.format(result.msg())) + if block: + bindings.wait_for_server_shutdown(db) + return db + + +def start_worker(master_address, machine_params=None, port=None, config=None, + config_path=None, block=False, watchdog=True, prefetch_table_metadata=True): + """ + Start a worker instance on this node. + + Args: + master_address: The address of the master server to connect this worker + to. + + Kwargs: + config: A scanner Config object. If specified, config_path is + ignored. + config_path: Path to a Scanner configuration TOML, by default + assumed to be `~/.scanner.toml`. + block: If true, will wait until the server is shutdown. Server + will not shutdown currently unless wait_ror_server_shutdown + is eventually called. + + Returns: + A cpp database instance. 
+ """ + config = config or Config(config_path) + port = port or config.worker_port + + # Load all protobuf types + db = bindings.Database( + config.storage_config, + #storage_config, + config.db_path.encode('ascii'), + master_address.encode('ascii')) + machine_params = machine_params or bindings.default_machine_params() + result = bindings.start_worker(db, machine_params, + str(port).encode('ascii'), watchdog, + prefetch_table_metadata) + if not result.success(): + raise ScannerException('Failed to start worker: {}'.format(result.msg())) + if block: + bindings.wait_for_server_shutdown(db) + return result + + +class Database(object): """ Entrypoint for all Scanner operations. @@ -28,7 +131,11 @@ class Database: protobufs: TODO(wcrichto) """ - def __init__(self, config_path=None, config=None): + def __init__(self, master=None, workers=None, + config_path=None, config=None, + debug=None, start_cluster=True, + prefetch_table_metadata=True, + no_workers_timeout=30): """ Initializes a Scanner database. 
@@ -49,108 +156,85 @@ def __init__(self, config_path=None, config=None): else: self.config = Config(config_path) - # Load all protobuf types - import scanner.metadata_pb2 as metadata_types - import scanner.engine.rpc_pb2 as rpc_types - import scanner.types_pb2 as misc_types - import libscanner as bindings + self._start_cluster = start_cluster + self._prefetch_table_metadata = prefetch_table_metadata + self._no_workers_timeout = no_workers_timeout + self._debug = debug + if debug is None: + self._debug = (master is None and workers is None) + + self._master = None - self._protobufs = [misc_types, rpc_types, metadata_types] self._bindings = bindings # Setup database metadata self._db_path = self.config.db_path self._storage = self.config.storage - self._master_address = self.config.master_address self._cached_db_metadata = None - self._png_dump_prefix = '__png_dump_' + self._png_dump_prefix = '__png_dump_{:s}' self.ops = OpGenerator(self) - self.protobufs = ProtobufGenerator(self) - - # Initialize database if it does not exist - pydb_path = '{}/pydb'.format(self._db_path) - self._db = self._bindings.Database( - self.config.storage_config, - self._db_path, - self._master_address) - if not os.path.isdir(pydb_path): - os.mkdir(pydb_path) - self._collections = self.protobufs.CollectionsDescriptor() - self._update_collections() + self.sampler = Sampler(self) + self.partitioner = TaskPartitioner(self) + self.protobufs = ProtobufGenerator(self.config) + self._op_cache = {} - # Load database descriptors from disk - self._collections = self._load_descriptor( - self.protobufs.CollectionsDescriptor, - 'pydb/descriptor.bin') + self._workers = {} + self._worker_conns = None + self.start_cluster(master, workers); - self._connect_to_master() + def __del__(self): + self.stop_cluster() - stdlib_path = '{}/build/stdlib'.format(self.config.module_dir) - self.load_op('{}/libstdlib.so'.format(stdlib_path), - '{}/stdlib_pb2.py'.format(stdlib_path)) + def __enter__(self): + return self - 
def get_build_flags(self): - """ - Gets the g++ build flags for compiling custom ops. + def __exit__(self, exception_type, exception_val, exception_tb): + self.stop_cluster() + del self._db - For example, to compile a custom kernel: - \code{.sh} - export SCANNER_FLAGS=`python -c "import scannerpy as sp; print(sp.Database().get_build_flags())"` - g++ mykernel.cpp -o mylib.so `echo $SCANNER_FLAGS` - \endcode - - Returns: - A flag string. - """ - - include_dirs = self._bindings.get_include().split(";") - include_dirs.append(self.config.module_dir + "/include") - include_dirs.append(self.config.module_dir + "/build") - flags = '{include} -std=c++11 -fPIC -shared -L{libdir} -lscanner {other}' - return flags.format( - include=" ".join(["-I " + d for d in include_dirs]), - libdir='{}/build'.format(self.config.module_dir), - other=self._bindings.other_flags()) - - def print_build_flags(self): - sys.stdout.write(self.get_build_flags()) + def has_gpu(self): + try: + with open(os.devnull, 'w') as f: + subprocess.check_call(['nvidia-smi'], stdout=f, stderr=f) + return True + except: + pass + return False def summarize(self): summary = '' db_meta = self._load_db_metadata() + if len(db_meta.tables) == 0: + return 'Your database is empty!' 
+ tables = [ ('TABLES', [ + ('ID', [str(t.id) for t in db_meta.tables]), ('Name', [t.name for t in db_meta.tables]), ('# rows', [ str(self.table(t.id).num_rows()) for t in db_meta.tables ]), ('Columns', [ - ', '.join([c.name() for c in self.table(t.id).columns()]) + ', '.join(self.table(t.id).column_names()) for t in db_meta.tables ]), + ('Committed', ['true' if self.table(t.id).committed() + else 'false' + for t in db_meta.tables]), ]), ] - if len(self._collections.names) > 0: - tables.append(('COLLECTIONS', [ - ('Name', self._collections.names), - ('# tables', [ - str(len(self.collection(id).table_names())) - for id in self._collections.ids - ]) - ])) - for table_idx, (label, cols) in enumerate(tables): if table_idx > 0: summary += '\n\n' num_cols = len(cols) - max_col_lens = [max(max([len(s) for s in c]), len(name)) + max_col_lens = [max(max([len(s) for s in c] or [0]), len(name)) for name, c in cols] table_width = sum(max_col_lens) + 3*(num_cols-1) label = '** {} **'.format(label) - summary += ' ' * (table_width/2 - len(label)/2) + label + '\n' - summary += '-'*table_width + '\n' + summary += ' ' * int(table_width/2 - len(label)/2) + label + '\n' + summary += '-' * table_width + '\n' col_name_fmt = ' | '.join(['{{:{}}}' for _ in range(num_cols)]) col_name_fmt = col_name_fmt.format(*max_col_lens) summary += col_name_fmt.format(*[s for s, _ in cols]) + '\n' @@ -163,12 +247,14 @@ def summarize(self): def _load_descriptor(self, descriptor, path): d = descriptor() - d.ParseFromString(self._storage.read('{}/{}'.format(self._db_path, path))) + d.ParseFromString( + self._storage.read( + ('{}/{}'.format(self._db_path, path)).encode('ascii'))) return d def _save_descriptor(self, descriptor, path): self._storage.write( - '{}/{}'.format(self._db_path, path), + ('{}/{}'.format(self._db_path, path)).encode('ascii'), descriptor.SerializeToString()) def _load_db_metadata(self): @@ -177,80 +263,306 @@ def _load_db_metadata(self): self.protobufs.DatabaseDescriptor, 
'db_metadata.bin') self._cached_db_metadata = desc + # table id cache + self._table_id = {} + self._table_name = {} + self._table_committed = {} + for i, table in enumerate(self._cached_db_metadata.tables): + if table.name in self._table_name: + raise ScannerException( + 'Internal error: multiple tables with same name: {}'.format(name)) + self._table_id[table.id] = i + self._table_name[table.name] = i + self._table_committed[table.id] = table.committed + + if self._prefetch_table_metadata: + self._table_descriptor = {} + self._video_descriptor = {} + # Read all table descriptors from database + NUM_TABLES_TO_READ = 10000 + table_names = self._table_name.keys() + for i in range(0, len(table_names), NUM_TABLES_TO_READ): + get_tables_params = self.protobufs.GetTablesParams() + for table_name in table_names[i:i+NUM_TABLES_TO_READ]: + get_tables_params.tables.append(table_name) + get_tables_result = self._try_rpc(lambda: self._master.GetTables( + get_tables_params)) + if not get_tables_result.result.success: + raise ScannerException( + 'Internal error: GetTables returned error: {}'.format( + get_tables_result.result.msg)) + for table, video in zip(get_tables_result.tables, get_tables_result.videos): + self._table_descriptor[table.id] = table + self._video_descriptor[table.id] = video + return self._cached_db_metadata + def _connect_to_worker(self, address): + channel = grpc.insecure_channel( + address, + options=[('grpc.max_message_length', 24499183 * 2)]) + worker = self.protobufs.WorkerStub(channel) + try: + self._master.Ping(self.protobufs.Empty()) + return worker + except grpc.RpcError as e: + status = e.code() + if status == grpc.StatusCode.UNAVAILABLE: + pass + else: + raise ScannerException('Master ping errored with status: {}' + .format(status)) + return None + + def _connect_to_master(self): channel = grpc.insecure_channel( self._master_address, options=[('grpc.max_message_length', 24499183 * 2)]) self._master = self.protobufs.MasterStub(channel) - - # Ping 
master and start master/worker locally if they don't exist. + result = False try: self._master.Ping(self.protobufs.Empty()) + result = True except grpc.RpcError as e: status = e.code() if status == grpc.StatusCode.UNAVAILABLE: - log.info("Master not started, creating temporary master/worker...") - # If they get GC'd then the masters/workers will die, so persist - # them until the database object dies - self.start_master() - self.start_worker() - log.info("Temporary master/worker started") - - # If we don't reconnect to master, there's a 5-10 sec delay for - # for original connection to reboot - channel = grpc.insecure_channel(self._master_address) - self._master = self.protobufs.MasterStub(channel) - elif status == grpc.StatusCode.OK: pass + elif status == grpc.StatusCode.OK: + result = True else: raise ScannerException('Master ping errored with status: {}' .format(status)) + return result - - def start_master(self): - """ - TODO(wcrichto) - """ - - return self._bindings.start_master(self._db) - - def start_worker(self): - """ - TODO(wcrichto) - """ - - machine_params = self._bindings.default_machine_params() - return self._bindings.start_worker(self._db, machine_params) - - def _run_remote_cmd(self, host, cmd): - local_ip = socket.gethostbyname(socket.gethostname()) - if socket.gethostbyname(host) == local_ip: + def _run_remote_cmd(self, host, cmd, nohup=False): + host_name, _, _ = host.partition(':') + host_ip = unicode(socket.gethostbyname(host_name), "utf-8") + if ipaddress.ip_address(host_ip).is_loopback: return Popen(cmd, shell=True) else: - print "ssh {} {}".format(host, cmd) - return Popen("ssh {} {}".format(host, cmd), shell=True) + cmd = cmd.replace('"', '\\"') + return Popen("ssh {} \"cd {} && {} {} {}\"".format( + host_name, + os.getcwd(), + '' if nohup else '', + cmd, + '' if nohup else ''), + shell=True) + + def _start_heartbeat(self): + # Start up heartbeat to keep master alive + def heartbeat_task(q, master_address): + channel = 
grpc.insecure_channel( + master_address, + options=[('grpc.max_message_length', 24499183 * 2)]) + master = grpc_types.MasterStub(channel) + while q.empty(): + master.PokeWatchdog(rpc_types.Empty()) + time.sleep(1) + + self._heartbeat_queue = Queue() + self._heartbeat_process = Process(target=heartbeat_task, + args=(self._heartbeat_queue, + self._master_address)) + self._heartbeat_process.daemon = True + self._heartbeat_process.start() + + def _stop_heartbeat(self): + self._heartbeat_queue.put(0) + + def _handle_signal(self, signum, frame): + if (signum == signal.SIGINT or + signum == signal.SIGTERM or + signum == signal.SIGKILL): + # Stop cluster + self._stop_heartbeat() + self.stop_cluster() + sys.exit(1) def start_cluster(self, master, workers): """ - Convenience method for starting a Scanner cluster. - - This should be run as a background/tmux/etc. script. + Starts a Scanner cluster. Args: master: ssh-able address of the master node. workers: list of ssh-able addresses of the worker nodes. 
""" - master_cmd = 'python -c "from scannerpy import Database as Db; Db().start_master(True)"' - worker_cmd = 'python -c "from scannerpy import Database as Db; Db().start_worker(\'{}:5001\', True)"' \ - .format(master) - master = self._run_remote_cmd(master, master_cmd) - workers = [self._run_remote_cmd(w, worker_cmd) for w in workers] - master.wait() - for worker in workers: - worker.wait() + if master is None: + self._master_address = ( + self.config.master_address + ':' + self.config.master_port) + else: + self._master_address = master + if workers is None: + self._worker_addresses = [ + self.config.master_address + ':' + self.config.worker_port] + else: + self._worker_addresses = workers + + # Boot up C++ database bindings + self._db = self._bindings.Database( + self.config.storage_config, + str(self._db_path).encode('ascii'), + str(self._master_address).encode('ascii')) + + if self._start_cluster: + # Set handler to shutdown cluster on signal + # TODO(apoms): we should clear these handlers when stopping + # the cluster + signal.signal(signal.SIGINT, self._handle_signal) + signal.signal(signal.SIGTERM, self._handle_signal) + + if self._debug: + self._master_conn = None + self._worker_conns = None + machine_params = self._bindings.default_machine_params() + res = self._bindings.start_master( + self._db, self.config.master_port.encode('ascii'), True, + self._prefetch_table_metadata, + self._no_workers_timeout).success + assert res + res = self._connect_to_master() + if not res: + raise ScannerException( + 'Failed to connect to local master process on port ' + '{:s}. 
(Is there another process that is bound to that ' + 'port already?)'.format(self.config.master_port)) + + self._start_heartbeat() + + for i in range(len(self._worker_addresses)): + res = self._bindings.start_worker( + self._db, machine_params, + str(int(self.config.worker_port) + i).encode('ascii'), + True, + self._prefetch_table_metadata).success + if not res: + raise ScannerException( + 'Failed to start local worker on port {:d} and ' + 'connect to master. (Is there another process that ' + 'is bound to that port already?)'.format( + self.config.worker_port)) + else: + master_port = self._master_address.partition(':')[2] + pickled_config = pickle.dumps(self.config) + master_cmd = ( + 'python -c ' + + '\"from scannerpy import start_master\n' + + 'import pickle\n' + + 'config=pickle.loads(\'\'\'{config:s}\'\'\')\n' + + 'start_master(port=\'{master_port:s}\', block=True,\n' + + ' config=config,\n' + + ' prefetch_table_metadata={prefetch},\n' + + ' no_workers_timeout={no_workers})\" ' + + '').format( + master_port=master_port, + config=pickled_config, + prefetch=self._prefetch_table_metadata, + no_workers=self._no_workers_timeout) + worker_cmd = ( + 'python -c ' + + '\"from scannerpy import start_worker\n' + + 'import pickle\n' + + 'config=pickle.loads(\'\'\'{config:s}\'\'\')\n' + + 'start_worker(\'{master:s}\', port=\'{worker_port:s}\',\n' + + ' block=True,\n' + + ' config=config)\" ' + + '') + self._master_conn = self._run_remote_cmd(self._master_address, + master_cmd, + nohup=True) + + # Wait for master to start + slept_so_far = 0 + sleep_time = 60 + while slept_so_far < sleep_time: + if self._connect_to_master(): + break + time.sleep(0.3) + slept_so_far += 0.3 + if slept_so_far >= sleep_time: + self._master_conn.kill() + self._master_conn = None + raise ScannerException('Timed out waiting to connect to master') + # Start up heartbeat to keep master alive + self._start_heartbeat() + + # Start workers now that master is ready + self._worker_conns = [] + 
ignored_nodes = 0 + for w in self._worker_addresses: + try: + self._worker_conns.append(self._run_remote_cmd(w, worker_cmd.format( + master=self._master_address, + config=pickled_config, + worker_port=w.partition(':')[2]), + nohup=True)) + except: + print('WARNING: Failed to ssh into {:s}, ignoring'.format(w)) + ignored_nodes += 1 + slept_so_far = 0 + # Has to be this long for GCS + sleep_time = 60 + while slept_so_far < sleep_time: + active_workers = self._master.ActiveWorkers(self.protobufs.Empty()) + if (len(active_workers.workers) > len(self._worker_conns)): + raise ScannerException( + ('Master has more workers than requested ' + + '({:d} vs {:d})').format(len(active_workers.workers), + len(self._worker_conns))) + if (len(active_workers.workers) == len(self._worker_conns)): + break + time.sleep(0.3) + slept_so_far += 0.3 + if slept_so_far >= sleep_time: + self._master_conn.kill() + for wc in self._worker_conns: + wc.kill() + self._master_conn = None + self._worker_conns = None + raise ScannerException( + 'Timed out waiting for workers to connect to master') + if ignored_nodes > 0: + print('Ignored {:d} nodes during startup.'.format( + ignored_nodes)) + else: + self._master_conn = None + self._worker_conns = None + + # Wait for master to start + slept_so_far = 0 + sleep_time = 20 + while slept_so_far < sleep_time: + if self._connect_to_master(): + break + time.sleep(0.3) + slept_so_far += 0.3 + if slept_so_far >= sleep_time: + raise ScannerException('Timed out waiting to connect to master') + + # Load stdlib + self.load_op('{}/libstdlib.so'.format(SCRIPT_DIR), + '{}/../scanner/stdlib/stdlib_pb2.py'.format(SCRIPT_DIR)) + + def stop_cluster(self): + if self._start_cluster: + if self._master: + # Stop heartbeat + self._stop_heartbeat() + try: + self._try_rpc( + lambda: self._master.Shutdown(self.protobufs.Empty())) + except: + pass + self._master = None + if self._master_conn: + self._master_conn.kill() + self._master_conn = None + if self._worker_conns: + for wc 
in self._worker_conns: + wc.kill() + self._worker_conns = None def _try_rpc(self, fn): try: @@ -280,67 +592,59 @@ def load_op(self, so_path, proto_path=None): if one exists. """ if proto_path is not None: - if not os.path.isfile(proto_path): - raise ScannerException('Protobuf path does not exist: {}' - .format(proto_path)) - mod = imp.load_source('_ignore', proto_path) - self._protobufs.append(mod) - op_info = self.protobufs.OpInfo() - op_info.so_path = so_path - self._try_rpc(lambda: self._master.LoadOp(op_info)) - - def _update_collections(self): - self._save_descriptor(self._collections, 'pydb/descriptor.bin') - - def delete_collection(self, collection_name): - if collection_name not in self._collections.names: - raise ScannerException('Collection with name {} does not exist' - .format(collection_name)) - - index = self._collections.names[:].index(collection_name) - id = self._collections.ids[index] - del self._collections.names[index] - del self._collections.ids[index] - - os.remove('{}/pydb/collection_{}.bin'.format(self._db_path, id)) - - def new_collection(self, collection_name, table_names, force=False, job_id=None): - """ - Creates a new Collection from a list of tables. - - Args: - collection_name: String name of the collection to create. - table_names: List of table name strings to put in the collection. - - Kwargs: - force: TODO(wcrichto) - job_id: TODO(wcrichto) - - Returns: - The new Collection object. 
- """ - - if collection_name in self._collections.names: - if force: - self.delete_collection(collection_name) + self.protobufs.add_module(proto_path) + op_path = self.protobufs.OpPath() + op_path.path = so_path + self._try_rpc(lambda: self._master.LoadOp(op_path)) + + def register_op(self, name, input_columns, output_columns, + variadic_inputs=False, stencil=None, proto_path=None): + op_registration = self.protobufs.OpRegistration() + op_registration.name = name + op_registration.variadic_inputs = variadic_inputs + + def add_col(columns, col): + if isinstance(col, basestring): + c = columns.add() + c.name = col + c.type = self.protobufs.Other + elif isinstance(col, collections.Iterable): + c = columns.add() + c.name = col[0] + c.type = ColumnType.to_proto(self.protobufs, col[1]) else: raise ScannerException( - 'Collection with name {} already exists' - .format(collection_name)) - - last_id = self._collections.ids[-1] if len(self._collections.ids) > 0 else -1 - new_id = last_id + 1 - self._collections.ids.append(new_id) - self._collections.names.append(collection_name) - self._update_collections() - collection = self.protobufs.CollectionDescriptor() - collection.tables.extend(table_names) - collection.job_id = -1 if job_id is None else job_id - self._save_descriptor(collection, 'pydb/collection_{}.bin'.format(new_id)) - - return self.collection(collection_name) - - def ingest_videos(self, videos, force=False): + 'Column ' + col + ' must be a string name or a tuple of ' + '(name, column_type)') + for in_col in input_columns: + add_col(op_registration.input_columns, in_col) + for out_col in output_columns: + add_col(op_registration.output_columns, out_col) + if stencil is None: + op_registration.can_stencil = False + else: + op_registration.can_stencil = True + op_registration.preferred_stencil.extend(stencil) + if proto_path is not None: + self.protobufs.add_module(proto_path) + self._try_rpc(lambda: self._master.RegisterOp(op_registration)) + + def 
register_python_kernel(self, op_name, device_type, kernel_path, + batch=1): + with open(kernel_path, 'r') as f: + kernel_str = f.read() + py_registration = self.protobufs.PythonKernelRegistration() + py_registration.op_name = op_name + py_registration.device_type = DeviceType.to_proto(self.protobufs, + device_type) + py_registration.kernel_str = kernel_str + py_registration.pickled_config = pickle.dumps(self.config) + py_registration.batch_size = batch + self._try_rpc( + lambda: self._master.RegisterPythonKernel(py_registration)) + + + def ingest_videos(self, videos, inplace=False, force=False): """ Creates a Table from a video. @@ -359,18 +663,20 @@ def ingest_videos(self, videos, force=False): raise ScannerException('Must ingest at least one video.') [table_names, paths] = zip(*videos) + to_delete = [] for table_name in table_names: if self.has_table(table_name): if force is True: - self._delete_table(table_name) + to_delete.append(table_name) else: raise ScannerException( 'Attempted to ingest over existing table {}' .format(table_name)) - self._save_descriptor(self._load_db_metadata(), 'db_metadata.bin') + self.delete_tables(to_delete) ingest_params = self.protobufs.IngestParameters() ingest_params.table_names.extend(table_names) ingest_params.video_paths.extend(paths) + ingest_params.inplace = inplace ingest_result = self._try_rpc( lambda: self._master.IngestVideos(ingest_params)) if not ingest_result.result.success: @@ -382,88 +688,92 @@ def ingest_videos(self, videos, force=False): if p not in ingest_result.failed_paths], failures) - def ingest_video_collection(self, collection_name, videos, force=False): + def has_table(self, name): + db_meta = self._load_db_metadata() + if name in self._table_name: + return True + return False + + def delete_tables(self, names): + delete_tables_params = self.protobufs.DeleteTablesParams() + for name in names: + delete_tables_params.tables.append(name) + self._try_rpc(lambda: self._master.DeleteTables(delete_tables_params)) 
+ self._cached_db_metadata = None + + def delete_table(self, name): + self.delete_tables([name]) + + def new_table(self, name, columns, rows, fn=None, force=False): """ - Creates a Collection from a list of videos. + Creates a new table from a list of rows. Args: - collection_name: String name of the Collection to create. - videos: List of video paths. + name: String name of the table to create + columns: List of names of table columns + rows: List of rows with each row a list of elements corresponding + to the specified columns. Elements must be strings of + serialized representations of the data. Kwargs: - force: TODO(wcrichto) + fn: TODO(wcrichto) + force: TODO(apoms) Returns: - (Collection, list of (path, reason) failures to ingest) + The new table object. """ - table_names = ['{}:{:03d}'.format(collection_name, i) - for i in range(len(videos))] - tables, failures = self.ingest_videos(zip(table_names, videos), force) - collection = self.new_collection( - collection_name, [t.name() for t in tables], force) - return collection, failures - def has_collection(self, name): - return name in self._collections.names + if self.has_table(name): + if force: + self.delete_table(name) + else: + raise ScannerException('Attempted to create table with existing ' + 'name {}'.format(name)) + if fn is not None: + rows = [fn(row, self.protobufs) for row in rows] - def collection(self, name): - if isinstance(name, basestring): - index = self._collections.names[:].index(name) - id = self._collections.ids[index] - else: - id = name - collection = self._load_descriptor( - self.protobufs.CollectionDescriptor, - 'pydb/collection_{}.bin'.format(id)) - return Collection(self, name, collection) + params = self.protobufs.NewTableParams(); + params.table_name = name + params.columns[:] = ["index"] + columns - def has_table(self, name): - db_meta = self._load_db_metadata() - for table in db_meta.tables: - if table.name == name: - return True - return False + for i, row in enumerate(rows): + 
row_proto = params.rows.add() + row_proto.columns[:] = [struct.pack('=Q', i)] + row - def _delete_table(self, name): - table = self.table(name) - db_meta = self._load_db_metadata() - for i, t in enumerate(db_meta.tables): - if t.id == table.id(): - del db_meta.tables[i] - return - assert False + self._try_rpc(lambda: self._master.NewTable(params)) - def delete_table(self, name): - self._delete_table(name) - self._save_descriptor(db_meta, 'db_metadata.bin') + self._cached_db_metadata = None + + return self.table(name) def table(self, name): db_meta = self._load_db_metadata() + table_name = None + table_id = None if isinstance(name, basestring): - table_id = None - for table in db_meta.tables: - if table.name == name: - table_id = table.id - break + if name in self._table_name: + table_name = name + table_id = db_meta.tables[self._table_name[name]].id if table_id is None: raise ScannerException('Table with name {} not found'.format(name)) - for table in db_meta.tables: - if table.name == name and table.id != table_id: - raise ScannerException( - 'Internal error: multiple tables with same name: {}'.format(name)) elif isinstance(name, int): - table_id = name + if name in self._table_id: + table_id = name + table_name = db_meta.tables[self._table_id[name]].name + if table_id is None: + raise ScannerException('Table with id {} not found'.format(name)) else: raise ScannerException('Invalid table identifier') - descriptor = self._load_descriptor( - self.protobufs.TableDescriptor, - 'tables/{}/descriptor.bin'.format(table_id)) - return Table(self, descriptor) + table = Table(self, table_name, table_id) + if self._prefetch_table_metadata: + table._descriptor = self._table_descriptor[table_id] + video_descriptor = self._video_descriptor[table_id] + if video_descriptor.table_id != -1: + table._video_descriptors = [None, video_descriptor] - def sampler(self): - return Sampler(self) + return table def profiler(self, job_name): db_meta = self._load_db_metadata() @@ -480,66 
+790,90 @@ def profiler(self, job_name): return Profiler(self, job_id) - def _toposort(self, op): + def _get_op_info(self, op_name): + if op_name in self._op_cache: + op_info = self._op_cache[op_name] + else: + op_info_args = self.protobufs.OpInfoArgs() + op_info_args.op_name = op_name + + op_info = self._try_rpc(lambda: self._master.GetOpInfo(op_info_args)) + + if not op_info.result.success: + raise ScannerException(op_info.result.msg) + + self._op_cache[op_name] = op_info + + return op_info + + def _check_has_op(self, op_name): + self._get_op_info(op_name) + + def _get_input_columns(self, op_name): + return self._get_op_info(op_name).input_columns + + def _get_output_columns(self, op_name): + return self._get_op_info(op_name).output_columns + + def _toposort(self, dag): + op = dag + # Perform DFS on modified graph edges = defaultdict(list) in_edges_left = defaultdict(int) - start_node = None + input_nodes = [] explored_nodes = set() stack = [op] while len(stack) > 0: c = stack.pop() + if c in explored_nodes: + continue explored_nodes.add(c) - if (c._name == "InputTable"): - start_node = c + + if c._name == "Input": + input_nodes.append(c) continue - elif len(c._inputs) == 0: - input = Op.input(self) - # TODO(wcrichto): allow non-frame input - c._inputs = [(input, ["frame", "frame_info"])] - start_node = input - for (parent, _) in c._inputs: - edges[parent].append(c) + + for input in c._inputs: + edges[input._op].append(c) in_edges_left[c] += 1 - if parent not in explored_nodes: - stack.append(parent) + if input._op not in explored_nodes: + stack.append(input._op) + # Keep track of position of input ops and sampling/slicing ops + # to use for associating job args to + input_ops = {} + sampling_slicing_ops = {} + output_ops = {} + + # Compute sorted list eval_sorted = [] eval_index = {} - stack = [start_node] + stack = input_nodes[:] while len(stack) > 0: c = stack.pop() eval_sorted.append(c) - eval_index[c] = len(eval_sorted) - 1 + op_idx = len(eval_sorted) - 1 + 
eval_index[c] = op_idx for child in edges[c]: in_edges_left[child] -= 1 if in_edges_left[child] == 0: stack.append(child) - - return [e.to_proto(eval_index) for e in eval_sorted] - - def _process_dag(self, op): - # If ops are passed as a list (e.g. [transform, caffe]) - # then hook up inputs to outputs of adjacent ops - if isinstance(op, list): - for i in range(len(op) - 1): - if len(op[i+1]._inputs) > 0: - continue - if op[i]._name == "InputTable": - out_cols = ["frame", "frame_info"] - else: - out_cols = self._bindings.get_output_columns(op[i]._name) - op[i+1]._inputs = [(op[i], out_cols)] - op = op[-1] - - # If the user doesn't explicitly specify an OutputTable, assume that - # it's all the output columns of the last op. - if op._name != "OutputTable": - out_cols = self._bindings.get_output_columns(str(op._name)) - op = Op.output(self, [(op, out_cols)]) - - return self._toposort(op) + if c._name == "Input": + input_ops[c] = op_idx + elif (c._name == "Sample" or + c._name == "Space" or + c._name == "Slice" or + c._name == "Unslice"): + sampling_slicing_ops[c] = op_idx + elif c._name == "OutputTable": + output_ops[c] = op_idx + + return [e.to_proto(eval_index) for e in eval_sorted], \ + input_ops, \ + sampling_slicing_ops, \ + output_ops def _parse_size_string(self, s): (prefix, suffix) = (s[:-1], s[-1]) @@ -548,92 +882,170 @@ def _parse_size_string(self, s): 'M': 1024**2, 'K': 1024**1 } + suffix = suffix.upper() if suffix not in mults: raise ScannerException('Invalid size suffix in "{}"'.format(s)) return int(prefix) * mults[suffix] - def run(self, tasks, op, - output_collection=None, - job_name=None, + def wait_on_current_job(self, show_progress=True): + pbar = None + total_tasks = None + last_task_count = 0 + last_jobs_failed = 0 + last_failed_workers = 0 + while True: + try: + job_status = self._master.GetJobStatus(self.protobufs.Empty()) + if show_progress and pbar is None and job_status.total_jobs != 0 \ + and job_status.total_tasks != 0: + total_tasks = 
job_status.total_tasks + pbar = tqdm(total=total_tasks) + except grpc.RpcError as e: + raise ScannerException(e) + if job_status.finished: + break + if pbar is not None: + tasks_completed = job_status.tasks_done + pbar.update(tasks_completed - last_task_count) + last_task_count = tasks_completed + pbar.set_postfix({ + 'jobs': job_status.total_jobs - job_status.jobs_done, + 'tasks': job_status.total_tasks - job_status.tasks_done, + 'workers': job_status.num_workers, + }) + time_str = time.strftime('%l:%M%p %z on %b %d, %Y') + if last_jobs_failed < job_status.jobs_failed: + num_jobs_failed = job_status.jobs_failed - last_jobs_failed + pbar.write('{:d} {:s} failed at {:s}'.format( + num_jobs_failed, + 'job' if num_jobs < 2 else 'jobs', + time_str)) + if last_failed_workers < job_status.failed_workers: + num_workers_failed = job_status.failed_workers - last_failed_workers + pbar.write('{:d} {:s} failed at {:s}'.format( + num_workers_failed, + 'worker' if num_workers_failed < 2 else 'workers', + time_str)) + last_jobs_failed = job_status.jobs_failed + last_failed_workers = job_status.failed_workers + time.sleep(1.0) + + if pbar is not None: + pbar.close() + + return job_status + + def run(self, bulk_job, force=False, - work_item_size=250, + work_packet_size=250, + io_packet_size=-1, cpu_pool=None, gpu_pool=None, - pipeline_instances_per_node=-1, - show_progress=True): - """ - Runs a computation over a set of inputs. - - Args: - tasks: The set of inputs to run the computation on. If tasks is a - Collection, then the computation is run on all frames of all - tables in the collection. Otherwise, tasks should be generated - by the Sampler. - op: The computation to run. Op is either a list of - ops to run in sequence, or a DAG with the output node - passed in as the argument. - - Kwargs: - output_collection: If this is not None, then a new collection with - this name will be created for all the output - tables. - job_name: An optional name to assign the job. 
It will be randomly - generated if none is given. - force: TODO(wcrichto) - work_item_size: TODO(wcrichto) - cpu_pool: TODO(wcrichto) - gpu_pool: TODO(wcrichto) - pipeline_instances_per_node: TODO(wcrichto) - show_progress: TODO(wcrichto) - - Returns: - Either the output Collection if output_collection is specified - or a list of Table objects. - """ - - # If the input is a collection, assume user is running over all frames - input_is_collection = isinstance(tasks, Collection) - if input_is_collection: - if output_collection is None: - raise ScannerException( - 'If Database.run input is a collection, output_collection_name ' - 'must be specified') - sampler = self.sampler() - tasks = sampler.all(tasks) - - # If the output should be a collection, then set the table names - if output_collection is not None: - if self.has_collection(output_collection) and not force: + pipeline_instances_per_node=None, + show_progress=True, + profiling=False, + load_sparsity_threshold=8, + tasks_in_queue_per_pu=4, + task_timeout=0): + assert isinstance(bulk_job, BulkJob) + assert isinstance(bulk_job.output(), Op) + + # Collect compression annotations to add to job + compression_options = [] + output_op = bulk_job.output() + for out_col in output_op.inputs(): + opts = self.protobufs.OutputColumnCompression() + opts.codec = 'default' + if out_col._type == self.protobufs.Video: + for k, v in out_col._encode_options.iteritems(): + if k == 'codec': + opts.codec = v + else: + opts.options[k] = str(v) + compression_options.append(opts) + + sorted_ops, input_ops, sampling_slicing_ops, output_ops = ( + self._toposort(bulk_job.output())) + + job_params = self.protobufs.BulkJobParameters() + job_name = ''.join(choice(ascii_uppercase) for _ in range(12)) + job_params.job_name = job_name + job_params.ops.extend(sorted_ops) + job_output_table_names = [] + for job in bulk_job.jobs(): + j = job_params.jobs.add() + output_table_name = None + for op_col, args in job.op_args().iteritems(): + if 
isinstance(op_col, Op): + op = op_col + else: + op = op_col._op + if op in input_ops: + op_idx = input_ops[op] + col_input = j.inputs.add() + col_input.op_index = op_idx + if not args._table.committed(): + raise ScannerException( + 'Attempted to bind table {name} to Input Op but ' + 'table {name} is not committed.' + .format(name=args._table.name())) + col_input.table_name = args._table.name() + col_input.column_name = args.name() + elif op in sampling_slicing_ops: + op_idx = sampling_slicing_ops[op] + saa = j.sampling_args_assignment.add() + saa.op_index = op_idx + if not isinstance(args, list): + args = [args] + for arg in args: + sa = saa.sampling_args.add() + sa.CopyFrom(arg) + elif op in output_ops: + op_idx = output_ops[op] + assert isinstance(args, basestring) + output_table_name = args + job_output_table_names.append(args) + else: + raise ScannerException( + 'Attempted to bind arguments to Op {} which is not ' + 'an input, sampling, spacing, slicing, or output Op.' + .format(op.name())) # FIXME(apoms): op.name() is unbound + if output_table_name is None: raise ScannerException( - 'Collection with name {} already exists' - .format(output_collection)) - for task in tasks: - new_name = '{}:{}'.format( - output_collection, - task.samples[0].table_name.split(':')[-1]) - task.output_table_name = new_name - - for task in tasks: - if self.has_table(task.output_table_name): + 'Did not specify the output table name by binding a ' + 'string to the output Op.') + j.output_table_name = output_table_name + + # Delete tables if they exist and force was specified + to_delete = [] + for name in job_output_table_names: + if self.has_table(name): if force: - self._delete_table(task.output_table_name) + to_delete.append(name) else: - raise ScannerException('Job would overwrite existing table {}' - .format(task.output_table_name)) - self._save_descriptor(self._load_db_metadata(), 'db_metadata.bin') - - job_params = self.protobufs.JobParameters() - # Generate a random job name 
if none given - job_name = job_name or ''.join(choice(ascii_uppercase) for _ in range(12)) - job_params.job_name = job_name - job_params.task_set.tasks.extend(tasks) - job_params.task_set.ops.extend(self._process_dag(op)) - job_params.pipeline_instances_per_node = pipeline_instances_per_node - job_params.work_item_size = work_item_size - job_params.show_progress = show_progress - + raise ScannerException( + 'Job would overwrite existing table {}' + .format(name)) + self.delete_tables(to_delete) + + job_params.compression.extend(compression_options) + job_params.pipeline_instances_per_node = ( + pipeline_instances_per_node or -1) + job_params.work_packet_size = work_packet_size + job_params.io_packet_size = io_packet_size + job_params.profiling = profiling + job_params.tasks_in_queue_per_pu = tasks_in_queue_per_pu + job_params.load_sparsity_threshold = load_sparsity_threshold + job_params.boundary_condition = ( + self.protobufs.BulkJobParameters.REPEAT_EDGE) + job_params.task_timeout = task_timeout + + job_params.memory_pool_config.pinned_cpu = False if cpu_pool is not None: job_params.memory_pool_config.cpu.use_pool = True + if cpu_pool[0] == 'p': + job_params.memory_pool_config.pinned_cpu = True + cpu_pool = cpu_pool[1:] size = self._parse_size_string(cpu_pool) job_params.memory_pool_config.cpu.free_space = size @@ -645,32 +1057,20 @@ def run(self, tasks, op, # Run the job self._try_rpc(lambda: self._master.NewJob(job_params)) + job_status = self.wait_on_current_job(show_progress) + + if not job_status.result.success: + raise ScannerException(job_status.result.msg) + # Invalidate db metadata because of job run self._cached_db_metadata = None db_meta = self._load_db_metadata() job_id = None - for job in db_meta.jobs: + for job in db_meta.bulk_jobs: if job.name == job_name: job_id = job.id if job_id is None: raise ScannerException('Internal error: job id not found after run') - # Return a new collection if the input was a collection, otherwise - # return a table 
list - table_names = [task.output_table_name for task in tasks] - if output_collection is not None: - return self.new_collection(output_collection, table_names, force, job_id) - else: - return [self.table(t) for t in table_names] - - -class ProtobufGenerator: - def __init__(self, db): - self._db = db - - def __getattr__(self, name): - for mod in self._db._protobufs: - if hasattr(mod, name): - return getattr(mod, name) - raise ScannerException('No protobuf with name {}'.format(name)) + return [self.table(t) for t in job_output_table_names] diff --git a/python/scannerpy/evaluator.py b/python/scannerpy/evaluator.py deleted file mode 100644 index 8183546d..00000000 --- a/python/scannerpy/evaluator.py +++ /dev/null @@ -1,87 +0,0 @@ -from common import * - - -class EvaluatorGenerator: - """ - Creates Evaluator instances to define a computation. - - When a particular evaluator is requested from the generator, e.g. - `db.evaluators.Histogram`, the generator does a dynamic lookup for the - evaluator in a C++ registry. 
- """ - - def __init__(self, db): - self._db = db - - def __getattr__(self, name): - if name == 'Input': - return lambda: Evaluator.input(self._db) - elif name == 'Output': - return lambda inputs: Evaluator.output(self._db, inputs) - - if not self._db._bindings.has_evaluator(name): - raise ScannerException('Evaluator {} does not exist'.format(name)) - - def make_evaluator(**kwargs): - inputs = kwargs.pop('inputs', []) - device = kwargs.pop('device', DeviceType.CPU) - args = kwargs.pop('args', None) - return Evaluator(self._db, name, inputs, device, - kwargs if args is None else args) - return make_evaluator - - -class Evaluator: - def __init__(self, db, name, inputs, device, args): - self._db = db - self._name = name - self._inputs = inputs - self._device = device - self._args = args - - @classmethod - def input(cls, db): - # TODO(wcrichto): allow non-frame inputs - return cls(db, "InputTable", [(None, ["frame", "frame_info"])], - DeviceType.CPU, {}) - - @classmethod - def output(cls, db, inputs): - return cls(db, "OutputTable", inputs, DeviceType.CPU, {}) - - def output_columns(self): - # TODO - pass - - def to_proto(self, indices): - e = self._db.protobufs.Evaluator() - e.name = self._name - - for (in_eval, cols) in self._inputs: - inp = e.inputs.add() - idx = indices[in_eval] if in_eval is not None else -1 - inp.evaluator_index = idx - inp.columns.extend(cols) - - e.device_type = DeviceType.to_proto(self._db, self._device) - - if isinstance(self._args, dict): - # To convert an arguments dict, we search for a protobuf with the - # name {Evaluator}Args (e.g. BlurArgs, HistogramArgs) in the - # args.proto module, and fill that in with keys from the args dict. - if len(self._args) > 0: - proto_name = self._name + 'Args' - args_proto = getattr(self._db.protobufs, proto_name)() - for k, v in self._args.iteritems(): - try: - setattr(args_proto, k, v) - except AttributeError: - # If the attribute is a nested proto, we can't assign - # directly, so copy from the value. 
- getattr(args_proto, k).CopyFrom(v) - e.kernel_args = args_proto.SerializeToString() - else: - # If arguments are a protobuf object, serialize it directly - e.kernel_args = self._args.SerializeToString() - - return e diff --git a/python/scannerpy/job.py b/python/scannerpy/job.py new file mode 100644 index 00000000..755c939a --- /dev/null +++ b/python/scannerpy/job.py @@ -0,0 +1,11 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +class Job(object): + """ + A specification of a table to produce as output of a bulk job. + """ + def __init__(self, op_args): + self._op_args = op_args + + def op_args(self): + return self._op_args diff --git a/python/scannerpy/kernel.py b/python/scannerpy/kernel.py new file mode 100644 index 00000000..baf0ef7f --- /dev/null +++ b/python/scannerpy/kernel.py @@ -0,0 +1,22 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +class KernelConfig(object): + def __init__(self, device_handles, input_columns, input_column_types, + output_columns, args, node_id): + self.devices = device_handles + self.input_columns = input_columns + self.input_column_types = input_column_types + self.output_columns = output_columns + self.args = args + self.node_id = node_id + + +class Kernel(object): + def __init__(self, config, protobufs): + self.protobufs = protobufs + + def close(self): + pass + + def execute(self, input_columns): + pass diff --git a/python/scannerpy/op.py b/python/scannerpy/op.py index 140bd90d..e0d19154 100644 --- a/python/scannerpy/op.py +++ b/python/scannerpy/op.py @@ -1,4 +1,75 @@ -from common import * +from __future__ import absolute_import, division, print_function, unicode_literals +import grpc +import copy + +from scannerpy.common import * + +class OpColumn: + def __init__(self, db, op, col, typ): + self._db = db + self._op = op + self._col = col + self._type = typ + self._encode_options = None + if self._type == self._db.protobufs.Video: + 
self._encode_options = {'codec': 'default'} + + def sample(self): + return self._db.ops.Sample(col=self) + + def space(self): + return self._db.ops.Space(col=self) + + def slice(self): + return self._db.ops.Slice(col=self) + + def unslice(self): + return self._db.ops.Unslice(col=self) + + def compress(self, codec = 'video', **kwargs): + self._assert_is_video() + codecs = {'video': self.compress_video, + 'default': self.compress_default, + 'raw': self.lossless} + if codec in codecs: + return codecs[codec](self, **kwargs) + else: + raise ScannerException('Compression codec {} not currently ' + 'supported. Available codecs are: {}.' + .format(' '.join(codecs.keys()))) + + def compress_video(self, quality = -1, bitrate = -1, keyframe_distance = -1): + self._assert_is_video() + encode_options = { + 'codec': 'h264', + 'quality': quality, + 'bitrate': bitrate, + 'keyframe_distance': keyframe_distance + } + return self._new_compressed_column(encode_options) + + def lossless(self): + self._assert_is_video() + encode_options = {'codec': 'raw'} + return self._new_compressed_column(encode_options) + + def compress_default(self): + self._assert_is_video() + encode_options = {'codec': 'default'} + return self._new_compressed_column(encode_options) + + def _assert_is_video(self): + if self._type != self._db.protobufs.Video: + raise ScannerException( + 'Compression only supported for columns of' + 'type "video". Column {} type is {}.' 
+ .format(self._col, + self.db.protobufs.ColumnType.Name(self._type))) + + def _new_compressed_column(self, encode_options): + new_col = OpColumn(self._db, self._op, self._col, self._type) + new_col._encode_options = encode_options + return new_col class OpGenerator: @@ -15,51 +86,111 @@ def __init__(self, db): def __getattr__(self, name): if name == 'Input': - return lambda: Op.input(self._db) + return lambda: Op.input(self._db).outputs() + elif name == 'FrameInput': + return lambda: Op.frame_input(self._db).outputs() elif name == 'Output': - return lambda inputs: Op.output(self._db, inputs) - - if not self._db._bindings.has_op(name): - raise ScannerException('Op {} does not exist'.format(name)) - - def make_op(**kwargs): - inputs = kwargs.pop('inputs', []) + def make_op(columns): + op = Op.output(self._db, columns) + return op + return make_op + + # This will raise an exception if the op does not exist. + op_info = self._db._get_op_info(name) + + def make_op(*args, **kwargs): + inputs = [] + if op_info.variadic_inputs: + inputs.extend(args) + else: + for c in op_info.input_columns: + val = kwargs.pop(c.name, None) + if val is None: + raise ScannerException('Op {} required column {} as input' + .format(name, c.name)) + inputs.append(val) device = kwargs.pop('device', DeviceType.CPU) + batch = kwargs.pop('batch', -1) + warmup = kwargs.pop('warmup', 0) + stencil = kwargs.pop('stencil', []) args = kwargs.pop('args', None) - return Op(self._db, name, inputs, device, - kwargs if args is None else args) + op = Op(self._db, name, inputs, device, batch, warmup, + stencil, kwargs if args is None else args) + return op.outputs() + return make_op class Op: - def __init__(self, db, name, inputs, device, args): + def __init__(self, db, name, inputs, device, batch=-1, warmup=0, + stencil=[0], args={}): self._db = db self._name = name self._inputs = inputs self._device = device + self._batch = batch + self._warmup = warmup + self._stencil = stencil self._args = args + if (name 
== 'Input' or + name == 'Space' or + name == 'Sample' or + name == 'Slice' or + name == 'Unslice'): + outputs = [] + for c in inputs: + outputs.append(OpColumn(db, self, c._col, c._type)) + elif name == "OutputTable": + outputs = [] + else: + cols = self._db._get_output_columns(self._name) + outputs = [OpColumn(self._db, self, c.name, c.type) for c in cols] + self._outputs = outputs + @classmethod def input(cls, db): - # TODO(wcrichto): allow non-frame inputs - return cls(db, "InputTable", [(None, ["frame", "frame_info"])], - DeviceType.CPU, {}) + c = cls(db, "Input", [OpColumn(db, None, 'col', db.protobufs.Other)], + DeviceType.CPU) + return c + + @classmethod + def frame_input(cls, db): + c = cls(db, "Input", [OpColumn(db, None, 'col', db.protobufs.Video)], + DeviceType.CPU) + return c @classmethod def output(cls, db, inputs): - return cls(db, "OutputTable", inputs, DeviceType.CPU, {}) + return cls(db, "OutputTable", inputs, DeviceType.CPU) + + def inputs(self): + return self._inputs + + def outputs(self): + if len(self._outputs) == 1: + return self._outputs[0] + else: + return tuple(self._outputs) def to_proto(self, indices): e = self._db.protobufs.Op() e.name = self._name + e.device_type = DeviceType.to_proto(self._db.protobufs, self._device) + e.stencil.extend(self._stencil) + e.batch = self._batch + e.warmup = self._warmup - for (in_eval, cols) in self._inputs: + if e.name == "Input": inp = e.inputs.add() - idx = indices[in_eval] if in_eval is not None else -1 - inp.op_index = idx - inp.columns.extend(cols) - - e.device_type = DeviceType.to_proto(self._db, self._device) + inp.column = self._inputs[0]._col + inp.op_index = -1 + else: + for i in self._inputs: + inp = e.inputs.add() + idx = indices[i._op] if i._op is not None else -1 + inp.op_index = idx + inp.column = i._col if isinstance(self._args, dict): # To convert an arguments dict, we search for a protobuf with the diff --git a/python/scannerpy/partitioner.py b/python/scannerpy/partitioner.py new file 
mode 100644 index 00000000..c69a1a81 --- /dev/null +++ b/python/scannerpy/partitioner.py @@ -0,0 +1,56 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +from scannerpy.common import * + +DEFAULT_GROUP_SIZE = 250 + +class TaskPartitioner: + """ + Utility for specifying how to partition the output domain of a job into + tasks. + """ + + def __init__(self, db): + self._db = db + + def all(self, group_size=DEFAULT_GROUP_SIZE): + return self.strided(1, group_size=group_size) + + def strided(self, stride, group_size=DEFAULT_GROUP_SIZE): + args = self._db.protobufs.StridedPartitionerArgs() + args.stride = stride + args.group_size = group_size + sampling_args = self._db.protobufs.SamplingArgs() + sampling_args.sampling_function = 'Strided' + sampling_args.sampling_args = args.SerializeToString() + return sampling_args + + def range(self, start, end): + return self.ranges([(start, end)]) + + def ranges(self, intervals): + return self.strided_ranges(intervals, 1) + + def gather(self, groups): + args = self._db.protobufs.GatherSamplerArgs() + for rows in groups: + gather_group = args.groups_add() + gather_group.rows[:] = rows + sampling_args = self._db.protobufs.SamplingArgs() + sampling_args.sampling_function = 'Gather' + sampling_args.sampling_args = args.SerializeToString() + return sampling_args + + def strided_range(self, start, end, stride): + return self.strided_ranges([(start, end)], stride) + + def strided_ranges(self, intervals, stride): + args = self._db.protobufs.StridedRangePartitionerArgs() + args.stride = stride + for start, end in intervals: + args.starts.append(start) + args.ends.append(end) + sampling_args = self._db.protobufs.SamplingArgs() + sampling_args.sampling_function = 'StridedRange' + sampling_args.sampling_args = args.SerializeToString() + return sampling_args diff --git a/python/scannerpy/profiler.py b/python/scannerpy/profiler.py index 140718d6..ed6d2ae0 100644 --- a/python/scannerpy/profiler.py +++ 
b/python/scannerpy/profiler.py @@ -1,6 +1,8 @@ +from __future__ import absolute_import, division, print_function, unicode_literals import struct import json -from common import * + +from scannerpy.common import * def read_advance(fmt, buf, offset): @@ -27,7 +29,7 @@ class Profiler: def __init__(self, db, job_id): self._storage = db._storage job = db._load_descriptor( - db.protobufs.JobDescriptor, + db.protobufs.BulkJobDescriptor, 'jobs/{}/descriptor.bin'.format(job_id)) self._profilers = {} @@ -92,7 +94,9 @@ def write_trace(self, path): def _convert_time(self, d): def convert(t): - return '{:2f}'.format(t / 1.0e9) + if isinstance(t, float): + return '{:2f}'.format(t / 1.0e9) + return t return {k: self._convert_time(v) if isinstance(v, dict) else convert(v) for (k, v) in d.iteritems()} @@ -109,10 +113,14 @@ def statistics(self): for thread in profiler[kind]: for (key, start, end) in thread['intervals']: if key not in totals[kind]: - totals[kind][key] = 0 + totals[kind][key] = 0.0 totals[kind][key] += end-start + for (name, value) in thread['counters'].iteritems(): + if name not in totals[kind]: + totals[kind][name] = 0 + totals[kind][name] += value - totals['total_time'] = (total_end - total_start) + totals['total_time'] = float(total_end - total_start) readable_totals = self._convert_time(totals) return readable_totals @@ -172,7 +180,7 @@ def _parse_profiler_output(self, bytes_buffer, offset): }, offset def _parse_profiler_file(self, profiler_path): - bytes_buffer = self._storage.read(profiler_path) + bytes_buffer = self._storage.read(profiler_path.encode('ascii')) offset = 0 # Read start and end time intervals t, offset = read_advance('q', bytes_buffer, offset) diff --git a/python/scannerpy/protobuf_generator.py b/python/scannerpy/protobuf_generator.py new file mode 100644 index 00000000..fe5e0395 --- /dev/null +++ b/python/scannerpy/protobuf_generator.py @@ -0,0 +1,39 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import 
os.path +import imp +import sys + +from scannerpy.common import * + +import scanner.stdlib.stdlib_pb2 as stdlib_types +import scannerpy.libscanner as bindings +import scanner.metadata_pb2 as metadata_types +import scanner.engine.rpc_pb2 as rpc_types +import scanner.engine.rpc_pb2_grpc as grpc_types +import scanner.types_pb2 as misc_types + +class ProtobufGenerator: + def __init__(self, cfg): + self._mods = [] + + for mod in [misc_types, rpc_types, grpc_types, metadata_types, + stdlib_types]: + self.add_module(mod) + + def add_module(self, path): + if isinstance(path, basestring): + if not os.path.isfile(path): + raise ScannerException('Protobuf path does not exist: {}' + .format(path)) + imp.acquire_lock() + mod = imp.load_source('_ignore', path) + imp.release_lock() + else: + mod = path + self._mods.append(mod) + + def __getattr__(self, name): + for mod in self._mods: + if hasattr(mod, name): + return getattr(mod, name) + raise ScannerException('No protobuf with name {}'.format(name)) diff --git a/python/scannerpy/sampler.py b/python/scannerpy/sampler.py index 1a341a3e..b06e1627 100644 --- a/python/scannerpy/sampler.py +++ b/python/scannerpy/sampler.py @@ -1,6 +1,8 @@ -from common import * -from collection import Collection +from __future__ import absolute_import, division, print_function, unicode_literals +from scannerpy.common import * + +DEFAULT_TASK_SIZE = 250 class Sampler: """ @@ -11,117 +13,59 @@ class Sampler: def __init__(self, db): self._db = db - def _convert_collection(self, videos): - if isinstance(videos, Collection): - return [(t, '') for t in videos.table_names()] - else: - return videos - - def all(self, videos, item_size=1000, warmup_size=0): - sampler_args = self._db.protobufs.AllSamplerArgs() - sampler_args.sample_size = item_size - sampler_args.warmup_size = warmup_size - videos = self._convert_collection(videos) - tasks = [] - for video in videos: - (input_table_name, output_table_name) = video - table = self._db.table(video[0]) - task = 
self._db.protobufs.Task() - task.output_table_name = output_table_name - input_table = self._db.table(input_table_name) - column_names = [c.name() for c in input_table.columns()] - sample = task.samples.add() - sample.table_name = input_table_name - sample.column_names.extend(column_names) - sample.sampling_function = "All" - sample.sampling_args = sampler_args.SerializeToString() - tasks.append(task) - return tasks + def all(self): + sampling_args = self._db.protobufs.SamplingArgs() + sampling_args.sampling_function = "All" + return sampling_args - def strided(self, videos, stride, item_size=1000): - videos = self._convert_collection(videos) - tasks = [] - for video in videos: - table = self._db.table(video[0]) - task = self.strided_range(video, 0, table.num_rows(), stride, - item_size=item_size) - tasks.append(task) - return tasks + def strided(self, stride): + args = self._db.protobufs.StridedSamplerArgs() + args.stride = stride + sampling_args = self._db.protobufs.SamplingArgs() + sampling_args.sampling_function = "Strided" + sampling_args.sampling_args = args.SerializeToString() + return sampling_args - def range(self, videos, start, end, item_size=1000, warmup_size=0): - return self.ranges(videos, [(start, end)], item_size=item_size, - warmup_size=warmup_size) + def range(self, start, end): + return self.ranges([(start, end)]) - def ranges(self, videos, intervals, item_size=1000, warmup_size=0): - videos = self._convert_collection(videos) - tasks = [] - for video in videos: - task = self.strided_ranges(video, intervals, 1, - item_size=item_size, - warmup_size=warmup_size) - tasks.append(task) - return tasks + def ranges(self, intervals): + return self.strided_ranges(intervals, 1) - def gather(self, video, rows, item_size=1000): - if isinstance(video, list) or isinstance(video, Collection): - raise ScannerException('Sampler.gather only takes a single video') - if not isinstance(video, tuple): - raise ScannerException("""Error: sampler input must either be a 
collection \ -or (input_table, output_table) pair')""") + def gather(self, rows): + args = self._db.protobufs.GatherSamplerArgs() + args.rows[:] = rows + sampling_args = self._db.protobufs.SamplingArgs() + sampling_args.sampling_function = 'Gather' + sampling_args.sampling_args = args.SerializeToString() + return sampling_args - (input_table_name, output_table_name) = video - task = self._db.protobufs.Task() - task.output_table_name = output_table_name - input_table = self._db.table(input_table_name) - column_names = [c.name() for c in input_table.columns()] - sample = task.samples.add() - sample.table_name = input_table_name - sample.column_names.extend(column_names) - sample.sampling_function = "Gather" - sampler_args = self._db.protobufs.GatherSamplerArgs() - s = 0 - while s < len(rows): - e = min(s + item_size, len(rows)) - sampler_args_sample = sampler_args.samples.add() - sampler_args_sample.rows[:] = rows[s:e] - s = e - sample.sampling_args = sampler_args.SerializeToString() - return task + def strided_range(self, start, end, stride): + return self.strided_ranges([(start, end)], stride) - def strided_range(self, video, start, end, stride, item_size=1000, - warmup_size=0): - return self.strided_ranges(video, [(start, end)], stride, - item_size=item_size, - warmup_size=warmup_size) + def strided_ranges(self, intervals, stride): + args = self._db.protobufs.StridedRangeSamplerArgs() + args.stride = stride + for start, end in intervals: + args.starts.append(start) + args.ends.append(end) + sampling_args = self._db.protobufs.SamplingArgs() + sampling_args.sampling_function = "StridedRanges" + sampling_args.sampling_args = args.SerializeToString() + return sampling_args - def strided_ranges(self, video, intervals, stride, item_size=1000, - warmup_size=0): - if isinstance(video, list) or isinstance(video, Collection): - raise ScannerException('Sampler.strided_range only takes a single video') - if not isinstance(video, tuple): - raise ScannerException("""Error: 
sampler input must either be a collection \ -or (input_table, output_table) pair')""") + def space_null(self, spacing): + args = self._db.protobufs.SpaceNullSamplerArgs() + args.spacing = spacing + sampling_args = self._db.protobufs.SamplingArgs() + sampling_args.sampling_function = "SpaceNull" + sampling_args.sampling_args = args.SerializeToString() + return sampling_args - (input_table_name, output_table_name) = video - task = self._db.protobufs.Task() - task.output_table_name = output_table_name - input_table = self._db.table(input_table_name) - num_rows = input_table.num_rows() - column_names = [c.name() for c in input_table.columns()] - sample = task.samples.add() - sample.table_name = input_table_name - sample.column_names.extend(column_names) - sample.sampling_function = "StridedRange" - sampler_args = self._db.protobufs.StridedRangeSamplerArgs() - sampler_args.stride = stride - for start, end in intervals: - s = start - while s < end: - ws = max(0, s - warmup_size * stride) - e = min(s + item_size * stride, end) - sampler_args.warmup_starts.append(ws) - sampler_args.starts.append(s) - sampler_args.ends.append(e) - s = e - sample.sampling_args = sampler_args.SerializeToString() - return task + def space_repeat(self, spacing): + args = self._db.protobufs.SpaceRepeatSamplerArgs() + args.spacing = spacing + sampling_args = self._db.protobufs.SamplingArgs() + sampling_args.sampling_function = "SpaceRepeat" + sampling_args.sampling_args = args.SerializeToString() + return sampling_args diff --git a/python/scannerpy/stdlib/bbox_nms_kernel.py b/python/scannerpy/stdlib/bbox_nms_kernel.py new file mode 100644 index 00000000..e7553a2e --- /dev/null +++ b/python/scannerpy/stdlib/bbox_nms_kernel.py @@ -0,0 +1,25 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import scannerpy +import scannerpy.stdlib.parsers as parsers +import scannerpy.stdlib.writers as writers +import scannerpy.stdlib.bboxes as bboxes + +class 
from __future__ import absolute_import, division, print_function, unicode_literals

import scannerpy
import scannerpy.stdlib.parsers as parsers
import scannerpy.stdlib.writers as writers
import scannerpy.stdlib.bboxes as bboxes

class BBoxNMSKernel(scannerpy.Kernel):
    """Python kernel that merges a variadic set of bbox columns and applies
    non-maximum suppression to the combined list."""

    def __init__(self, config, protobufs):
        self.protobufs = protobufs
        args = protobufs.BBoxNMSArgs()
        args.ParseFromString(config.args)
        # NOTE(review): `scale` is parsed but never used by execute().
        self.scale = args.scale

    def close(self):
        pass

    def execute(self, input_columns):
        # Parse and concatenate bboxes from every input column, then NMS
        # with a fixed overlap threshold of 0.1.
        combined = []
        for column in input_columns:
            combined.extend(parsers.bboxes(column, self.protobufs))
        kept = bboxes.nms(combined, 0.1)
        return writers.bboxes([kept], self.protobufs)

KERNEL = BBoxNMSKernel
+ color=(255,0,0)): + if isinstance(bbox_table, Table): + rows = bbox_table.parent_rows() + bboxes = [b for _, b in bbox_table.load([0], parsers.bboxes)] + else: + [rows, bboxes] = zip(*bbox_table) + frames = [f[0] for _, f in vid_table.load([0], rows=rows)] + + frame_shape = frames[0].shape + output = cv2.VideoWriter( + output_path, + cv2.VideoWriter_fourcc(*'X264'), + fps, + (frame_shape[1], frame_shape[0])) + + for (frame, frame_bboxes) in zip(frames, bboxes): + for bbox in frame_bboxes: + if bbox.score < threshold: continue + cv2.rectangle( + frame, + (int(bbox.x1), int(bbox.y1)), + (int(bbox.x2), int(bbox.y2)), + color, 3) + output.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) diff --git a/python/scannerpy/stdlib/build_flags.py b/python/scannerpy/stdlib/build_flags.py new file mode 100644 index 00000000..f49ac5fb --- /dev/null +++ b/python/scannerpy/stdlib/build_flags.py @@ -0,0 +1,32 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import os.path +import sys + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +def get_include(): + return os.path.abspath(os.path.join(SCRIPT_DIR, '..', 'include')) + +def print_include(): + sys.stdout.write(get_include()) + +def get_lib(): + return os.path.abspath(os.path.join(SCRIPT_DIR, '..')) + +def print_lib(): + sys.stdout.write(get_lib()) + +def get_cmake(): + return os.path.abspath(os.path.join(SCRIPT_DIR, '..', 'cmake', 'Op.cmake')) + +def print_cmake(): + sys.stdout.write(get_cmake()) + +def get_flags(): + return ( + '-std=c++11 -I{include} -L{libdir} -lscanner'.format( + include=get_include(), + libdir=get_lib())) + +def print_flags(): + sys.stdout.write(get_flags()) diff --git a/python/scannerpy/stdlib/loaders.py b/python/scannerpy/stdlib/loaders.py index 01906ef2..c09849f1 100644 --- a/python/scannerpy/stdlib/loaders.py +++ b/python/scannerpy/stdlib/loaders.py @@ -1,5 +1,5 @@ +from __future__ import absolute_import, division, print_function, unicode_literals import 
from __future__ import absolute_import, division, print_function, unicode_literals
import cv2
import math
import numpy as np

def make_montage(n, frames, frame_width=64, frames_per_row=16):
    """Tile `n` frames from the iterator `frames` into one uint8 RGB image.

    Each frame is resized to `frame_width` wide (height scaled to preserve
    aspect ratio, using the first frame's shape) and placed left-to-right,
    `frames_per_row` per row. Returns the montage as an (H, W, 3) array.
    """
    frame = next(frames)
    (frame_h, frame_w, _) = frame.shape
    target_w = frame_width
    target_h = int(target_w / float(frame_w) * frame_h)
    img_w = frames_per_row * target_w
    img_h = int(math.ceil(float(n) / frames_per_row)) * target_h
    img = np.zeros((img_h, img_w, 3), dtype=np.uint8)

    def place_image(i, fr):
        fr = cv2.resize(fr, (target_w, target_h))
        # BUG FIX: `/` is true division under `from __future__ import
        # division`, yielding a float row index that numpy slicing rejects;
        # floor division keeps the index integral.
        row = i // frames_per_row
        col = i % frames_per_row
        img[(row * target_h):((row + 1) * target_h),
            (col * target_w):((col + 1) * target_w),
            :] = fr

    place_image(0, frame)
    for i, frame in enumerate(frames):
        place_image(i + 1, frame)

    return img
@property + def model_weights_path(self): + return self._descriptor.model_weights_path + + @model_weights_path.setter + def model_weights_path(self, value): + self._descriptor.model_weights_path = value + + @property + def input_layer_names(self): + return self._descriptor.input_layer_names[:] + + @input_layer_names.setter + def input_layer_names(self, value): + del self._descriptor.input_layer_names[:] + self._descriptor.input_layer_names.extend(value) + + @property + def output_layer_names(self): + return self._descriptor.output_layer_names[:] + + @output_layer_names.setter + def output_layer_names(self, value): + del self._descriptor.output_layer_names[:] + self._descriptor.output_layer_names.extend(value) + + @property + def input_width(self): + return self._descriptor.input_width + + @input_width.setter + def input_width(self, value): + self._descriptor.input_width = value + + @property + def input_height(self): + return self._descriptor.input_height + + @input_width.setter + def input_height(self, value): + self._descriptor.input_height = value + + @property + def normalize(self): + return self._descriptor.normalize + + @normalize.setter + def normalize(self, value): + self._descriptor.normalize = value + + @property + def preserve_aspect_ratio(self): + return self._descriptor.preserve_aspect_ratio + + @preserve_aspect_ratio.setter + def normalize(self, value): + self._descriptor.preserve_aspect_ratio = value + + @property + def transpose(self): + return self._descriptor.transpose + + @transpose.setter + def transpose(self, value): + self._descriptor.transpose = value + + @property + def pad_mod(self): + return self._descriptor.pad_mod + + @pad_mod.setter + def pad_mod(self, value): + self._descriptor.pad_mod = value + + @property + def uses_python(self): + return self._descriptor.uses_python + + @uses_python.setter + def uses_python(self, value): + self._descriptor.uses_python = value + + @property + def mean_colors(self): + return 
self._descriptor.mean_colors + + @uses_python.setter + def mean_colors(self, value): + del self._descriptor.mean_colors[:] + self._descriptor.mean_colors.extend(value) + @classmethod def from_file(cls, db, path): self = cls(db) @@ -30,6 +133,7 @@ def from_file(cls, db, path): d.preserve_aspect_ratio = self._val(net, 'preserve_aspect_ratio', False) d.transpose = self._val(net, 'tranpose', False) d.pad_mod = self._val(net, 'pad_mod', -1) + d.uses_python = self._val(net, 'uses_python', False) mean = args['mean-image'] if 'colors' in mean: diff --git a/python/scannerpy/stdlib/parsers.py b/python/scannerpy/stdlib/parsers.py index ca39063e..ecc5405d 100644 --- a/python/scannerpy/stdlib/parsers.py +++ b/python/scannerpy/stdlib/parsers.py @@ -1,36 +1,71 @@ +from __future__ import absolute_import, division, print_function, unicode_literals import numpy as np import cv2 import struct +from scannerpy.stdlib.poses import Pose -def bboxes(bufs, db): - buf = bufs[0] +def bboxes(buf, protobufs): (num_bboxes,) = struct.unpack("=Q", buf[:8]) buf = buf[8:] bboxes = [] for i in range(num_bboxes): - (bbox_size,) = struct.unpack("=i", buf[:4]) - buf = buf[4:] - box = db.protobufs.BoundingBox() + (bbox_size,) = struct.unpack("=Q", buf[:8]) + buf = buf[8:] + box = protobufs.BoundingBox() box.ParseFromString(buf[:bbox_size]) buf = buf[bbox_size:] - bbox = [box.x1, box.y1, box.x2, box.y2, box.score, - box.track_id, box.track_score] - bboxes.append(bbox) + bboxes.append(box) return bboxes +def poses(buf, protobufs): + if len(buf) == 1: + return [] -def histograms(bufs, db): + kp_size = (Pose.POSE_KEYPOINTS + + Pose.FACE_KEYPOINTS + + Pose.HAND_KEYPOINTS * 2) * 3 + poses = [] + all_kp = np.frombuffer(buf, dtype=np.float32) + for j in range(0, len(all_kp), kp_size): + pose = Pose.from_buffer(all_kp[j:(j+kp_size)].tobytes()) + poses.append(pose) + return poses + + +def histograms(bufs, protobufs): + # bufs[0] is None when element is null + if bufs[0] is None: + return None return 
def frame_info(buf, protobufs):
    """Deserialize a FrameInfo proto from raw bytes."""
    info = protobufs.FrameInfo()
    info.ParseFromString(buf)
    return info


def flow(bufs, protobufs):
    """Parse an optical-flow element into an (height, width, 2) float32 array.

    bufs[0] holds the flow data and bufs[1] a serialized FrameInfo giving
    the frame dimensions. Returns None for null elements.
    """
    if bufs[0] is None:
        return None
    output = np.frombuffer(bufs[0], dtype=np.dtype(np.float32))
    # BUG FIX: was `frame_info(bufs[1], db)` -- `db` is undefined in this
    # module; the `protobufs` argument must be forwarded instead.
    info = frame_info(bufs[1], protobufs)
    return output.reshape((info.height, info.width, 2))


def array(ty):
    """Return a parser that views bufs[0] as a 1-D numpy array of dtype `ty`."""
    def parser(bufs, protobufs):
        return np.frombuffer(bufs[0], dtype=np.dtype(ty))
    return parser


def image(bufs, protobufs):
    """Decode a compressed image element (JPEG/PNG/...) into a BGR array."""
    return cv2.imdecode(np.frombuffer(bufs[0], dtype=np.dtype(np.uint8)),
                        cv2.IMREAD_COLOR)


def raw_frame_gen(shape0, shape1, shape2, typ):
    """Return a parser that reshapes a raw frame buffer to
    (shape0, shape1, shape2) with element type `typ`."""
    def parser(bufs, protobufs):
        output = np.frombuffer(bufs, dtype=typ)
        return output.reshape((shape0, shape1, shape2))
    return parser
if templates_path is None: + templates_path = download_temp_file( + 'https://storage.googleapis.com/scanner-data/nets/caffe_facenet/facenet_templates.bin' + ) + + descriptor = NetDescriptor(db) + descriptor.model_path = prototxt_path + descriptor.model_weights_path = model_weights_path + descriptor.input_layer_names = ['data'] + descriptor.output_layer_names = ['score_final'] + descriptor.mean_colors = [119.29959869, 110.54627228, 101.8384321] + + facenet_args = db.protobufs.FacenetArgs() + facenet_args.templates_path = templates_path + facenet_args.threshold = 0.5 + caffe_args = facenet_args.caffe_args + caffe_args.net_descriptor.CopyFrom(descriptor.as_proto()) + + if db.has_gpu(): + device = DeviceType.GPU + pipeline_instances = -1 + else: + device = DeviceType.CPU + pipeline_instances = 1 + + if type(output_names) is not list: + output_names = [ + '{}_{}'.format(output_names, i) + for i in range(len(input_frame_columns)) + ] + else: + assert (len(output_names) == len(input_frame_columns)) + + if type(output_samplings) is not list: + output_samplings = [ + output_samplings for _ in range(len(input_frame_columns)) + ] + else: + assert (len(output_samplings) == len(input_frame_columns)) + + outputs = [] + scales = [1.0, 0.5, 0.25, 0.125] + batch_sizes = [int((2**i)) for i in range(len(scales))] + profilers = {} + for scale, batch in zip(scales, batch_sizes): + facenet_args.scale = scale + caffe_args.batch_size = batch + + frame = db.ops.FrameInput() + #resized = db.ops.Resize( + # frame = frame, + # width = width, height = 0, + # min = True, preserve_aspect = True, + frame_info = db.ops.InfoFromFrame(frame=frame) + facenet_input = db.ops.FacenetInput( + frame=frame, args=facenet_args, device=device) + facenet = db.ops.Facenet( + facenet_input=facenet_input, args=facenet_args, device=device) + facenet_output = db.ops.FacenetOutput( + facenet_output=facenet, + original_frame_info=frame_info, + args=facenet_args) + sampled_output = facenet_output.sample() + output = 
db.ops.Output(columns=[sampled_output]) + + jobs = [] + for output_name, frame_column, output_sampling in zip( + output_names, input_frame_columns, output_samplings): + job = Job(op_args={ + frame: frame_column, + sampled_output: output_sampling, + output: '{}_{}'.format(output_name, scale) + }) + jobs.append(job) + + bulk_job = BulkJob(output=output, jobs=jobs) + output = db.run( + bulk_job, + force=True, + work_packet_size=batch * 4, + pipeline_instances_per_node=pipeline_instances) + profilers['scale_{}'.format(scale)] = output[0].profiler() + outputs.append(output) + + # Register nms bbox op and kernel + db.register_op('BBoxNMS', [], ['bboxes'], variadic_inputs=True) + kernel_path = script_dir + '/bbox_nms_kernel.py' + db.register_python_kernel('BBoxNMS', DeviceType.CPU, kernel_path) + # scale = max(width / float(max_width), 1.0) + scale = 1.0 + + bbox_inputs = [db.ops.Input() for _ in outputs] + nmsed_bboxes = db.ops.BBoxNMS(*bbox_inputs, scale=scale) + output = db.ops.Output(columns=[nmsed_bboxes]) + + jobs = [] + for i in range(len(input_frame_columns)): + op_args = {} + for bi, cols in enumerate(outputs): + op_args[bbox_inputs[bi]] = cols[i].column('bboxes') + op_args[output] = output_names[i] + jobs.append(Job(op_args=op_args)) + bulk_job = BulkJob(output=output, jobs=jobs) + return db.run(bulk_job, force=True) + + +def detect_poses(db, + input_frame_columns, + sampling, + output_name, + batch=1, + models_path=None, + pose_model_weights_path=None, + hand_prototxt_path=None, + hand_model_weights_path=None, + face_prototxt_path=None, + face_model_weights_path=None): + if models_path is None: + models_path = os.path.join(temp_directory(), 'openpose') + + pose_fs_url = 'http://posefs1.perception.cs.cmu.edu/OpenPose/models/' + # Pose prototxt + download_temp_file( + 'https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/' + 'openpose/master/models/pose/coco/pose_deploy_linevec.prototxt', + 'openpose/pose/coco/pose_deploy_linevec.prototxt') + # Pose 
model weights + download_temp_file( + os.path.join(pose_fs_url, 'pose/coco/pose_iter_440000.caffemodel'), + 'openpose/pose/coco/pose_iter_440000.caffemodel') + # Hands prototxt + download_temp_file( + 'https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/' + 'openpose/master/models/hand/pose_deploy.prototxt', + 'openpose/hand/pose_deploy.prototxt') + # Hands model weights + download_temp_file( + os.path.join(pose_fs_url, 'hand/pose_iter_102000.caffemodel'), + 'openpose/hand/pose_iter_102000.caffemodel') + # Face prototxt + download_temp_file( + 'https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/' + 'openpose/master/models/face/pose_deploy.prototxt', + 'openpose/face/pose_deploy.prototxt') + # Face model weights + download_temp_file( + os.path.join(pose_fs_url, 'face/pose_iter_116000.caffemodel'), + 'openpose/face/pose_iter_116000.caffemodel') + # Face haar cascades + download_temp_file( + 'https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/' + 'openpose/master/models/face/haarcascade_frontalface_alt.xml', + 'openpose/face/haarcascade_frontalface_alt.xml') + + pose_args = db.protobufs.OpenPoseArgs() + pose_args.model_directory = models_path + pose_args.pose_num_scales = 3 + pose_args.pose_scale_gap = 0.33 + pose_args.hand_num_scales = 4 + pose_args.hand_scale_gap = 0.4 + + if db.has_gpu(): + device = DeviceType.GPU + pipeline_instances = -1 + else: + device = DeviceType.CPU + pipeline_instances = 1 + + frame = db.ops.FrameInput() + poses_out = db.ops.OpenPose( + frame=frame, device=device, args=pose_args, batch=batch) + sampled_poses = poses_out.sample() + output = db.ops.Output(columns=[sampled_poses]) + + jobs = [] + for i, input_frame_column in enumerate(input_frame_columns): + job = Job(op_args={ + frame: input_frame_column, + sampled_poses: sampling, + output: '{}_{}_poses'.format(output_name, i) + }) + jobs.append(job) + bulk_job = BulkJob(output=output, jobs=jobs) + output = db.run( + bulk_job, + force=True, + 
work_packet_size=8, + pipeline_instances_per_node=pipeline_instances) + return output diff --git a/python/scannerpy/stdlib/pose_nms_kernel.py b/python/scannerpy/stdlib/pose_nms_kernel.py new file mode 100644 index 00000000..f6d4416b --- /dev/null +++ b/python/scannerpy/stdlib/pose_nms_kernel.py @@ -0,0 +1,25 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import scannerpy +import scannerpy.stdlib.parsers as parsers +import scannerpy.stdlib.writers as writers +import scannerpy.stdlib.poses as poses + +class PoseNMSKernel(scannerpy.Kernel): + def __init__(self, config, protobufs): + self.protobufs = protobufs + args = protobufs.PoseNMSArgs() + args.ParseFromString(config.args) + self.height = args.height + + def close(self): + pass + + def execute(self, input_columns): + pose_list = [] + for c in input_columns: + pose_list += parsers.poses(c, self.protobufs) + nmsed_poses = poses.nms(pose_list, self.height * 0.2) + return writers.poses([nmsed_poses], self.protobufs) + +KERNEL = PoseNMSKernel diff --git a/python/scannerpy/stdlib/poses.py b/python/scannerpy/stdlib/poses.py new file mode 100644 index 00000000..eb95b1ba --- /dev/null +++ b/python/scannerpy/stdlib/poses.py @@ -0,0 +1,164 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import numpy as np +import cv2 +import copy +from collections import defaultdict + +class Pose(object): + POSE_KEYPOINTS = 18 + FACE_KEYPOINTS = 70 + HAND_KEYPOINTS = 21 + + Nose = 0 + Neck = 1 + RShoulder = 2 + RElbow = 3 + RWrist = 4 + LShoulder = 5 + LElbow = 6 + LWrist = 7 + RHip = 8 + RKnee = 9 + RAnkle = 10 + LHip = 11 + LKnee = 12 + LAnkle = 13 + REye = 14 + LEye = 15 + REar = 16 + LEar = 17 + Background = 18 + + def __init__(self): + self.keypoints = np.zeros( + (Pose.POSE_KEYPOINTS + + Pose.FACE_KEYPOINTS + + Pose.HAND_KEYPOINTS * 2, + 3)) + + def _format_keypoints(self): + return self.keypoints + + def pose_keypoints(self): + kp = 
def nms(orig_poses, overlapThresh):
    """Greedy non-maximum suppression over pose keypoint arrays.

    orig_poses: list of (num_joints, 3) arrays of (y, x, score) keypoints.
    overlapThresh: spatial bucket size (pixels) used to test joint overlap.
    Returns the subset of input poses that survive suppression, best first.
    """
    if len(orig_poses) == 0:
        return []
    elif len(orig_poses) == 1:
        return orig_poses

    poses = copy.deepcopy(orig_poses)

    num_joints = poses[0].shape[0]

    # Per-pose total confidence and count of confident (> 0.2) joints.
    joints_4d = np.stack(poses, axis=2)
    pose_scores = np.sum(joints_4d[:, 2, :], axis=0)
    num_joints_per_pose = np.sum(joints_4d[:, 2, :] > 0.2, axis=0)
    # Ascending by score: the best remaining pose is always at the end.
    idxs = np.argsort(pose_scores)

    # Spatially hash confident joints into coarse buckets (bucket width =
    # overlapThresh, plus one neighbor each side) so overlap tests only
    # consider nearby poses. NOTE(review): buckets are never pruned, so
    # already-suppressed poses can still appear in overlap sets.
    x_buckets = [defaultdict(set) for _ in range(num_joints)]
    y_buckets = [defaultdict(set) for _ in range(num_joints)]
    for i, idx in enumerate(idxs):
        pose = poses[idx]
        for pi in range(num_joints):
            if pose[pi, 2] > 0.2:
                x_pos = int(pose[pi, 1] - (pose[pi, 1] % overlapThresh))
                y_pos = int(pose[pi, 0] - (pose[pi, 0] % overlapThresh))
                for xp in range(x_pos - 1, x_pos + 2):
                    x_buckets[pi][xp].add(idx)
                for yp in range(y_pos - 1, y_pos + 2):
                    y_buckets[pi][yp].add(idx)

    pick = []
    while len(idxs) > 0:
        # Pick the highest-scoring remaining pose.
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)

        # Count, for every pose, how many joints land in the same spatial
        # buckets as the picked pose's confident joints.
        overlaps = defaultdict(int)
        pose = poses[i]
        for pi in range(num_joints):
            if pose[pi, 2] > 0.2:
                x_pos = int(pose[pi, 1] - (pose[pi, 1] % overlapThresh))
                y_pos = int(pose[pi, 0] - (pose[pi, 0] % overlapThresh))

                x_set = set()
                for xp in range(x_pos - 1, x_pos + 2):
                    x_set.update(x_buckets[pi][xp])
                y_set = set()
                for yp in range(y_pos - 1, y_pos + 2):
                    y_set.update(y_buckets[pi][yp])
                for idx in x_set.intersection(y_set):
                    overlaps[idx] += 1

        # A pose sharing enough joints with the picked one is a duplicate.
        # (was .iteritems(): Python 2-only; .items() works on both.)
        duplicates = []
        for idx, num_overlaps in overlaps.items():
            if num_overlaps >= min(3, num_joints_per_pose[idx]):
                for ii, idx2 in enumerate(idxs):
                    if idx == idx2:
                        break
                duplicates.append(ii)

        # BUG FIX: force an integer dtype -- `np.array([])` is float64, and
        # modern numpy rejects float indices in np.delete when no
        # duplicates were found.
        idxs = np.delete(
            idxs,
            np.concatenate(([last], np.array(duplicates, dtype=np.int64))))

    # Return only the poses that were picked.
    return [orig_poses[i] for i in pick]
tf.Session(config=config) + self.graph = self.build_graph(self.sess) + + def close(self): + self.sess.close() + + def build_graph(self): + raise NotImplementedError + + def execute(self): + raise NotImplementedError diff --git a/python/scannerpy/stdlib/util.py b/python/scannerpy/stdlib/util.py new file mode 100644 index 00000000..52668ba0 --- /dev/null +++ b/python/scannerpy/stdlib/util.py @@ -0,0 +1,50 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import os +import urllib2 +import errno + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + + +def temp_directory(): + path = os.path.expanduser('~/.scanner/resources') + mkdir_p(path) + return path + + +def download_temp_file(url, local_path=None): + if local_path is None: + local_path = url.rsplit('/', 1)[-1] + local_path = os.path.join(temp_directory(), local_path) + mkdir_p(os.path.dirname(local_path)) + if not os.path.isfile(local_path): + print('Downloading {:s} to {:s}...'.format(url, local_path)) + f = urllib2.urlopen(url) + with open(local_path, 'wb') as local_f: + local_f.write(f.read()) + return local_path + + +# def download_temp_youtube_video(url, local_path=None): +# # Get filename +# https://youtu.be/SHoHdkUw-Is '--restrict-filenames --get-filename' +# ~/.local/bin/youtube-dl -f 266 https://youtu.be/SHoHdkUw-Is -o +# if local_path is None: +# local_path = url.rsplit('/', 1)[-1] +# local_path = os.path.join(temp_directory(), local_path) +# mkdir_p(os.path.dirname(local_path)) +# if not os.path.isfile(local_path): +# print('Downloading {:s} to {:s}...'.format(url, local_path)) +# f = urllib2.urlopen(url) +# with open(local_path, 'wb') as local_f: +# local_f.write(f.read()) +# return local_path + +# ~/.local/bin/youtube-dl -f 266 https://youtu.be/SHoHdkUw-Is -o diff --git a/python/scannerpy/stdlib/video.py b/python/scannerpy/stdlib/video.py new file 
mode 100644 index 00000000..29e3e169 --- /dev/null +++ b/python/scannerpy/stdlib/video.py @@ -0,0 +1,14 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import cv2 + +def write_video(path, frames, fps=24.0): + assert len(frames) > 0 + + output = cv2.VideoWriter( + path, + cv2.VideoWriter_fourcc(*'X264'), + fps, + (frames[0].shape[1], frames[0].shape[0])) + + for frame in frames: + output.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) diff --git a/python/scannerpy/stdlib/writers.py b/python/scannerpy/stdlib/writers.py new file mode 100644 index 00000000..e935195a --- /dev/null +++ b/python/scannerpy/stdlib/writers.py @@ -0,0 +1,25 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import struct + +def bboxes(bufs, protobufs): + s = struct.pack('=Q', len(bufs[0])) + for bbox in bufs[0]: + bs = bbox.SerializeToString() + s += struct.pack('=Q', len(bs)) + s += bs + return [s] + +def poses(bufs, protobufs): + s = struct.pack("=Q", len(bufs[0])) + for pose in bufs[0]: + # Num joints + s += struct.pack("=Q", len(pose)) + for i in range(len(pose)): + point = protobufs.Point() + point.y = pose[i, 0] + point.x = pose[i, 1] + point.score = pose[i, 2] + # Point size + s += struct.pack("=Q", point.ByteSize()) + s += point.SerializeToString() + return [s] diff --git a/python/scannerpy/table.py b/python/scannerpy/table.py index fa498e9c..6cedb673 100644 --- a/python/scannerpy/table.py +++ b/python/scannerpy/table.py @@ -1,6 +1,10 @@ -from common import * -from column import Column +from __future__ import absolute_import, division, print_function, unicode_literals +import struct +from itertools import izip +from timeit import default_timer as now +from scannerpy.common import * +from scannerpy.column import Column class Table: """ @@ -8,72 +12,103 @@ class Table: Can be part of many Collection objects. 
""" - def __init__(self, db, descriptor): + def __init__(self, db, name, id): self._db = db - self._descriptor = descriptor - job_id = self._descriptor.job_id - if job_id != -1: + # We pass name and id to avoid having to read the descriptor + self._name = name + self._id = id + self._descriptor = None + self._video_descriptors = None + + def id(self): + return self._id + + def name(self): + return self._name + + def _need_descriptor(self): + if self._descriptor is None: + self._descriptor = self._db._load_descriptor( + self._db.protobufs.TableDescriptor, + 'tables/{}/descriptor.bin'.format(self._id)) + + def _load_column(self, name): + if not self.committed(): + raise ScannerException('Table has not committed yet.') + self._need_descriptor() + if self._video_descriptors is None: + self._video_descriptors = [] + for c in self._descriptor.columns: + video_descriptor = None + if c.type == self._db.protobufs.Video: + video_descriptor = self._db._load_descriptor( + self._db.protobufs.VideoDescriptor, + 'tables/{:d}/{:d}_0_video_metadata.bin'.format( + self._id, + c.id)) + self._video_descriptors.append(video_descriptor) + for i, c in enumerate(self._descriptor.columns): + if c.name == name: + return c, self._video_descriptors[i] + raise ScannerException('Column {} not found in Table {}' + .format(name, self._name)) + + def _load_job(self): + self._need_descriptor() + if self._descriptor.job_id != -1: self._job = self._db._load_descriptor( self._db.protobufs.JobDescriptor, - 'jobs/{}/descriptor.bin'.format(job_id)) + 'jobs/{}/descriptor.bin'.format(self._descriptor.job_id)) self._task = None for task in self._job.tasks: - if task.output_table_name == self._descriptor.name: + if task.output_table_name == self._name: self._task = task if self._task is None: raise ScannerException('Table {} not found in job {}' - .format(self._descriptor.name, job_id)) + .format(self._name, self._descriptor.job_id)) else: self._job = None - def id(self): - return self._descriptor.id - def 
name(self): - return self._descriptor.name - - def columns(self, index=None): - columns = [Column(self, c) for c in self._descriptor.columns] - if index is not None: - col = None - if isinstance(index, basestring): - for c in columns: - if c.name() == index: - col = c - break - if col is None: - raise ScannerException('Could not find column with name {}' - .format(index)) - else: - assert isinstance(index, int) - if index < 0 or index >= len(columns): - raise ScannerException('No column with index {}' - .format(index)) - col = columns[index] - return col - else: - return columns + # HACK(wcrichto): reading from TableDescriptor to avoid loading VideoDescriptors + def column_names(self): + self._need_descriptor() + return [c.name for c in self._descriptor.columns] + + def column(self, name): + return Column(self, name) def num_rows(self): + self._need_descriptor() return self._descriptor.end_rows[-1] - def rows(self): - return list(range(self.num_rows())) + def _parse_index(self, bufs, db): + return struct.unpack("=Q", bufs[0])[0] + + def committed(self): + return self._db._table_committed[self._id] def parent_rows(self): - assert(False) - return list(range(self.num_rows())) + self._need_descriptor() + if self._descriptor.job_id == -1: + raise ScannerException('Table {} has no parent'.format(self.name())) + + return [i for _, i in self.load(['index'], fn=self._parse_index)] def profiler(self): - job_id = self._descriptor.job_id - if job_id != -1: - return self._db.profiler(job_id) + if not self.committed(): + raise ScannerException('Table has not committed yet.') + self._need_descriptor() + if self._descriptor.job_id != -1: + return self._db.profiler(self._descriptor.job_id) else: raise ScannerException('Ingested videos do not have profile data') def load(self, columns, fn=None, rows=None): - cols = [self.columns(c).load(rows=rows) for c in columns] - for tup in zip(*cols): + if not self.committed(): + raise ScannerException('Table has not committed yet.') + cols = 
[self.column(c).load(rows=rows) for c in columns] + for tup in izip(*cols): row = tup[0][0] vals = [x for _, x in tup] if fn is not None: diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 00000000..ab3501ea --- /dev/null +++ b/python/setup.py @@ -0,0 +1,140 @@ +from setuptools import setup, find_packages +import os +import os.path +import shutil +import glob + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +SCANNERPY_DIR = os.path.join(SCRIPT_DIR, 'scannerpy') +SCANNER_DIR = '.' +BUILD_DIR = os.path.join(SCANNER_DIR, 'build') +PIP_DIR = os.path.join(BUILD_DIR, 'pip') + +# Make a pip directory in the build directory +shutil.rmtree(PIP_DIR, ignore_errors=True) +shutil.copytree(SCRIPT_DIR, PIP_DIR) +#os.makedirs(PIP_DIR) +#os.makedirs(PIP_DIR + '/scanner') +#os.makedirs(PIP_DIR + '/scanner/stdlib') + +# Copy python into pip directory +#shutil.copytree(SCANNERPY_DIR, PIP_DIR + '/scannerpy') + +# Copy libraries into pip directory +LIBRARIES = [ + os.path.join(BUILD_DIR, 'libscanner.so'), + os.path.join(BUILD_DIR, 'stdlib', 'libstdlib.so') +] +for library in LIBRARIES: + shutil.copy(library, PIP_DIR + '/scannerpy/') + +def copy_partial_tree(from_dir, to_dir, pattern): + dest_paths = [] + try: + os.makedirs(to_dir) + except: + pass + for f in glob.glob(os.path.join(from_dir, pattern)): + print(f) + shutil.copy(f, to_dir) + dest_paths.append(os.path.join(to_dir, os.path.basename(f))) + + # List all directories in from_dir + for d in [p for p in os.listdir(from_dir) + if os.path.isdir(os.path.join(from_dir, p))]: + print('dir', d) + dest_paths += copy_partial_tree( + os.path.join(from_dir, d), + os.path.join(to_dir, d), + pattern) + return dest_paths + +def glob_files(path, prefix=''): + all_paths = os.listdir(path) + files = [os.path.join(prefix, p) for p in all_paths + if os.path.isfile(os.path.join(path, p))] + for d in [p for p in all_paths + if os.path.isdir(os.path.join(path, p))]: + files += glob_files(os.path.join(path, d), + 
prefix=os.path.join(prefix, d)) + return files + + +# Copy built protobuf python files +copy_partial_tree( + os.path.join(BUILD_DIR, 'scanner'), + os.path.join(PIP_DIR, 'scanner'), + '*.py') +copy_partial_tree( + os.path.join(BUILD_DIR, 'stdlib'), + os.path.join(PIP_DIR, 'scanner', 'stdlib'), + '*.py') + +# Copy cmake files +os.makedirs(os.path.join(PIP_DIR, 'scannerpy', 'cmake')) +shutil.copy( + os.path.join(SCANNER_DIR, 'cmake', 'Util', 'Op.cmake'), + os.path.join(PIP_DIR, 'scannerpy', 'cmake')) +copy_partial_tree( + os.path.join(SCANNER_DIR, 'cmake', 'Modules'), + os.path.join(PIP_DIR, 'scannerpy', 'cmake', 'Modules'), + '*') + +cmake_files = glob_files( + os.path.join(PIP_DIR, 'scannerpy', 'cmake'), 'cmake') + +# Copy scanner headers +copy_partial_tree( + os.path.join(SCANNER_DIR, 'scanner'), + os.path.join(PIP_DIR, 'scannerpy', 'include', 'scanner'), + '*.h') +copy_partial_tree( + os.path.join(SCANNER_DIR, 'scanner'), + os.path.join(PIP_DIR, 'scannerpy', 'include', 'scanner'), + '*.inl') + +copy_partial_tree( + os.path.join(BUILD_DIR, 'scanner'), + os.path.join(PIP_DIR, 'scannerpy', 'include', 'scanner'), + '*.h') + +include_files = glob_files( + os.path.join(PIP_DIR, 'scannerpy', 'include'), 'include') + +package_data = { + 'scannerpy': [ + './*.so' + ] + include_files + cmake_files +} + +REQUIRED_PACKAGES = [ + 'protobuf == 3.4.0', + 'grpcio == 1.7.3', + 'toml >= 0.9.2', + 'enum34 >= 1.1.6', + 'numpy >= 1.12.0', + 'scipy >= 0.18.1', + 'storehouse >= 0.1.0', + 'tqdm >= 4.19.5' +] + +print(find_packages(where=PIP_DIR)) +setup( + name='scannerpy', + version='0.1.13', + description='Efficient video analysis at scale', + long_description='', + url='https://github.com/scanner-research/scanner', + author='Alex Poms and Will Crichton', + author_email='wcrichto@cs.stanford.edu', + + package_dir={'': PIP_DIR}, + packages=find_packages(where=PIP_DIR), + install_requires=REQUIRED_PACKAGES, + include_package_data=True, + package_data=package_data, + zip_safe=False, + + 
license='Apache 2.0', + keywords='video distributed gpu', +) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..4c7821a2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +ipython==5.3.0 +numpy==1.12.0 +protobuf==3.4.0 +toml==0.9.2 +youtube-dl +scipy==0.18.1 +scikit-learn==0.18.1 +scikit-image==0.12.3 +enum34==1.1.6 +matplotlib==2.0.0 +seaborn==0.7.1 +grpcio==1.7.3 +doxypypy==0.8.8.6 +pytest==3.0.6 +twine==1.8.1 +storehouse +ipaddress==1.0.18 +plotly==2.0.6 +jupyter==1.0.0 diff --git a/scanner/api/CMakeLists.txt b/scanner/api/CMakeLists.txt index 58adc9cb..3d105ba9 100644 --- a/scanner/api/CMakeLists.txt +++ b/scanner/api/CMakeLists.txt @@ -1,4 +1,5 @@ set(SOURCE_FILES + frame.cpp kernel.cpp op.cpp database.cpp diff --git a/scanner/api/database.cpp b/scanner/api/database.cpp index 65fd96d2..0735bf85 100644 --- a/scanner/api/database.cpp +++ b/scanner/api/database.cpp @@ -14,12 +14,13 @@ */ #include "scanner/api/database.h" -#include "scanner/engine/runtime.h" #include "scanner/engine/ingest.h" -#include "scanner/engine/db.h" +#include "scanner/engine/master.h" +#include "scanner/engine/metadata.h" #include "scanner/engine/rpc.grpc.pb.h" #include "scanner/engine/rpc.pb.h" #include "scanner/engine/runtime.h" +#include "scanner/engine/worker.h" #include "scanner/metadata.pb.h" #include "scanner/util/cuda.h" @@ -36,7 +37,7 @@ namespace scanner { namespace { template -std::unique_ptr start(T &service, const std::string &port) { +std::unique_ptr start(T& service, const std::string& port) { std::string server_address("0.0.0.0:" + port); grpc::ServerBuilder builder; builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); @@ -46,103 +47,9 @@ std::unique_ptr start(T &service, const std::string &port) { return std::move(server); } -proto::TaskSet consume_task_set(TaskSet &ts) { - proto::TaskSet task_set; - // Parse tasks - for (Task &t : ts.tasks) { - proto::Task *task = task_set.add_tasks(); - 
task->set_output_table_name(t.output_table_name); - for (TableSample &ts : t.samples) { - proto::TableSample *sample = task->add_samples(); - sample->set_table_name(ts.table_name); - for (std::string &s : ts.column_names) { - sample->add_column_names(s); - } - sample->set_sampling_function(ts.sampling_function); - sample->set_sampling_args(ts.sampling_args.data(), - ts.sampling_args.size()); - } - } - // Parse ops - std::map> edges; // parent -> child - std::map in_edges_left; // parent -> child - Op *start_node = nullptr; - { - // Find all edges - std::set explored_nodes; - std::vector stack; - stack.push_back(ts.output_op); - while (!stack.empty()) { - Op *c = stack.back(); - stack.pop_back(); - explored_nodes.insert(c); - - if (c->get_name() == "InputTable") { - assert(start_node == nullptr); - start_node = c; - continue; - } - for (const OpInput &input : c->get_inputs()) { - Op *parent_eval = input.get_op(); - edges[parent_eval].push_back(c); - in_edges_left[c] += 1; - - if (explored_nodes.count(parent_eval) > 0) - continue; - stack.push_back(parent_eval); - } - } - } - std::vector sorted_ops; - std::map op_index; - { - // Perform topological sort - std::vector stack; - stack.push_back(start_node); - while (!stack.empty()) { - Op *curr = stack.back(); - stack.pop_back(); - - sorted_ops.push_back(curr); - op_index.insert({curr, sorted_ops.size() - 1}); - for (Op *child : edges[curr]) { - i32 &edges_left = in_edges_left[child]; - edges_left -= 1; - if (edges_left == 0) { - stack.push_back(child); - } - } - } - } - assert(sorted_ops.size() == in_edges_left.size() + 1); - // Translate sorted ops into serialized task set - for (Op *eval : sorted_ops) { - proto::Op *proto_eval = task_set.add_ops(); - proto_eval->set_name(eval->get_name()); - proto_eval->set_device_type(eval->get_device_type()); - proto_eval->set_kernel_args(eval->get_args(), eval->get_args_size()); - for (const OpInput &input : eval->get_inputs()) { - proto::OpInput *proto_input = 
proto_eval->add_inputs(); - i32 parent_index; - if (input.get_op() == nullptr) { - parent_index = -1; - } else { - parent_index = op_index.at(input.get_op()); - } - proto_input->set_op_index(parent_index); - for (const std::string &column_name : input.get_columns()) { - proto_input->add_columns(column_name); - } - } - } - - return task_set; -} - -internal::DatabaseParameters -machine_params_to_db_params(const MachineParameters ¶ms, - storehouse::StorageConfig *sc, - const std::string db_path) { +internal::DatabaseParameters machine_params_to_db_params( + const MachineParameters& params, storehouse::StorageConfig* sc, + const std::string db_path) { internal::DatabaseParameters db; db.storage_config = sc; db.db_path = db_path; @@ -150,6 +57,8 @@ machine_params_to_db_params(const MachineParameters ¶ms, db.num_load_workers = params.num_load_workers; db.num_save_workers = params.num_save_workers; db.gpu_ids = params.gpu_ids; + db.prefetch_table_metadata = true; + db.no_workers_timeout = 30; return db; } } @@ -157,7 +66,7 @@ machine_params_to_db_params(const MachineParameters ¶ms, MachineParameters default_machine_params() { MachineParameters machine_params; machine_params.num_cpus = std::thread::hardware_concurrency(); - machine_params.num_load_workers = 2; + machine_params.num_load_workers = 8; machine_params.num_save_workers = 2; #ifdef HAVE_CUDA i32 gpu_count; @@ -169,13 +78,13 @@ MachineParameters default_machine_params() { return machine_params; } -Database::Database(storehouse::StorageConfig *storage_config, - const std::string &db_path, - const std::string &master_address) - : storage_config_(storage_config), - storage_(storehouse::StorageBackend::make_from_config(storage_config)), - db_path_(db_path), master_address_(master_address) { - +Database::Database(storehouse::StorageConfig* storage_config, + const std::string& db_path, + const std::string& master_address) + : storage_config_(storage_config), + 
storage_(storehouse::StorageBackend::make_from_config(storage_config)), + db_path_(db_path), + master_address_(master_address) { internal::set_database_path(db_path); if (!database_exists()) { internal::DatabaseMetadata meta{}; @@ -185,41 +94,73 @@ Database::Database(storehouse::StorageConfig *storage_config, gpr_set_log_verbosity(GPR_LOG_SEVERITY_ERROR); } -Result Database::start_master(const MachineParameters& machine_params) { - if (master_state_.service.get() != nullptr) { +Result Database::start_master(const MachineParameters& machine_params, + const std::string& port, + bool watchdog, + bool prefetch_table_metadata, + i64 no_workers_timeout) { + if (master_state_ != nullptr) { LOG(WARNING) << "Master already started"; Result result; result.set_success(true); return result; } + master_state_.reset(new ServerState); internal::DatabaseParameters params = machine_params_to_db_params(machine_params, storage_config_, db_path_); - master_state_.service.reset(scanner::internal::get_master_service(params)); - master_state_.server = start(master_state_.service, "5001"); + params.prefetch_table_metadata = prefetch_table_metadata; + params.no_workers_timeout = no_workers_timeout; + + auto master_service = scanner::internal::get_master_service(params); + master_state_->service.reset(master_service); + master_state_->server = start(master_state_->service, port); + + // Register shutdown signal handler + + // Setup watchdog + master_service->start_watchdog(master_state_->server.get(), watchdog); Result result; result.set_success(true); return result; } -Result Database::start_worker(const MachineParameters& machine_params) { +Result Database::start_worker(const MachineParameters& machine_params, + const std::string& port, + bool watchdog, + bool prefetch_table_metadata) { internal::DatabaseParameters params = machine_params_to_db_params(machine_params, storage_config_, db_path_); - worker_states_.emplace_back(); - ServerState &state = worker_states_.back(); - 
state.service.reset( - scanner::internal::get_worker_service(params, master_address_)); - state.server = start(state.service, "5002"); + params.prefetch_table_metadata = prefetch_table_metadata; + ServerState* s = new ServerState; + ServerState& state = *s; + auto worker_service = + scanner::internal::get_worker_service(params, master_address_, port); + state.service.reset(worker_service); + state.server = start(state.service, port); + worker_states_.emplace_back(s); + + // Register shutdown signal handler + + Result register_result = worker_service->register_with_master(); + if (!register_result.success()) { + return register_result; + } + + // Setup watchdog + worker_service->start_watchdog(state.server.get(), watchdog); Result result; result.set_success(true); return result; } -Result Database::ingest_videos(const std::vector &table_names, - const std::vector &paths, - std::vector &failed_videos) { +Result Database::ingest_videos(const std::vector& table_names, + const std::vector& paths, + bool inplace, + std::vector& failed_videos) { internal::ingest_videos(storage_config_, db_path_, table_names, paths, + inplace, failed_videos); Result result; result.set_success(true); @@ -239,10 +180,9 @@ Result Database::ingest_videos(const std::vector &table_names, params.add_video_paths(p); } proto::IngestResult job_result; - grpc::Status status = - master_->IngestVideos(&context, params, &job_result); - LOG_IF(FATAL, !status.ok()) << "Could not contact master server: " - << status.error_message(); + grpc::Status status = master_->IngestVideos(&context, params, &job_result); + LOG_IF(FATAL, !status.ok()) + << "Could not contact master server: " << status.error_message(); for (i32 i = 0; i < job_result.failed_paths().size(); ++i) { FailedVideo failed; failed.path = job_result.failed_paths(i); @@ -252,25 +192,22 @@ Result Database::ingest_videos(const std::vector &table_names, return job_result.result(); } -Result Database::new_job(JobParameters ¶ms) { - auto channel = - 
grpc::CreateChannel(master_address_, grpc::InsecureChannelCredentials()); - std::unique_ptr master_ = - proto::Master::NewStub(channel); +Result Database::delete_table(const std::string& table_name) { + Result result; + internal::DatabaseMetadata meta = internal::read_database_metadata( + storage_.get(), internal::DatabaseMetadata::descriptor_path()); - grpc::ClientContext context; - proto::JobParameters job_params; - job_params.set_job_name(params.job_name); - job_params.set_pipeline_instances_per_node(params.pipeline_instances_per_node); - job_params.set_work_item_size(params.work_item_size); - proto::TaskSet set = consume_task_set(params.task_set); - job_params.mutable_task_set()->Swap(&set); - Result job_result; - grpc::Status status = master_->NewJob(&context, job_params, &job_result); - LOG_IF(FATAL, !status.ok()) << "Could not contact master server: " - << status.error_message(); - - return job_result; + i32 id = meta.get_table_id(table_name); + if (id == -1) { + RESULT_ERROR(&result, "Table %s does not exist", table_name.c_str()); + return result; + } + + meta.remove_table(id); + internal::write_database_metadata(storage_.get(), meta); + + internal::TableMetadata table = internal::read_table_metadata( + storage_.get(), internal::TableMetadata::descriptor_path(id)); } Result Database::shutdown_master() { @@ -290,12 +227,14 @@ Result Database::shutdown_worker() { } Result Database::wait_for_server_shutdown() { - if (master_state_.server.get() != nullptr) { - master_state_.server->Wait(); + if (master_state_ != nullptr) { + master_state_->server->Wait(); + master_state_.reset(nullptr); } - for (ServerState& state : worker_states_) { - state.server->Wait(); + for (auto& state : worker_states_) { + state->server->Wait(); } + worker_states_.clear(); Result result; result.set_success(true); @@ -314,8 +253,8 @@ bool Database::database_exists() { internal::set_database_path(db_path_); std::string db_meta_path = internal::DatabaseMetadata::descriptor_path(); 
storehouse::FileInfo info; - storehouse::StoreResult result = storage_->get_file_info(db_meta_path, info); - return (result != storehouse::StoreResult::FileDoesNotExist); + storehouse::StoreResult result; + EXP_BACKOFF(storage_->get_file_info(db_meta_path, info), result); + return (result == storehouse::StoreResult::Success); } - } diff --git a/scanner/api/database.h b/scanner/api/database.h index 9c61f560..b141b507 100644 --- a/scanner/api/database.h +++ b/scanner/api/database.h @@ -26,65 +26,45 @@ namespace scanner { +//! Description of resources for a given machine. struct MachineParameters { i32 num_cpus; i32 num_load_workers; i32 num_save_workers; - std::vector gpu_ids; + std::vector + gpu_ids; //!< List of CUDA device IDs that Scanner should use. }; +//! Pick smart defaults for the current machine. MachineParameters default_machine_params(); -struct TableSample { - std::string table_name; - std::vector column_names; - std::string sampling_function; - std::vector sampling_args; -}; - -struct Task { - std::string output_table_name; - std::vector samples; -}; - -struct TaskSet { - std::vector tasks; - Op *output_op; -}; - -struct JobParameters { - std::string job_name; - TaskSet task_set; - - MemoryPoolConfig memory_pool_config; - i32 pipeline_instances_per_node; - i64 work_item_size; -}; - +//! Info about a video that fails to ingest. struct FailedVideo { std::string path; std::string message; }; +//! Main entry point into Scanner. 
class Database { -public: - Database(storehouse::StorageConfig *storage_config, - const std::string &db_path, - const std::string &master_address); - - Result start_master(const MachineParameters ¶ms); + public: + Database(storehouse::StorageConfig* storage_config, + const std::string& db_path, const std::string& master_address); - Result start_worker(const MachineParameters ¶ms); + Result start_master(const MachineParameters& params, const std::string& port, + bool watchdog = true, + bool prefetch_table_metadata = true, + i64 no_workers_timeout = 30); - Result ingest_videos(const std::vector &table_names, - const std::vector &paths, - std::vector &failed_videos); + Result start_worker(const MachineParameters& params, const std::string& port, + bool watchdog = true, + bool prefetch_table_metadata = true); - // void ingest_images(storehouse::StorageConfig *storage_config, - // const std::string &db_path, const std::string &table_name, - // const std::vector &paths); + Result ingest_videos(const std::vector& table_names, + const std::vector& paths, + bool inplace, + std::vector& failed_videos); - Result new_job(JobParameters ¶ms); + Result delete_table(const std::string& table_name); Result shutdown_master(); @@ -94,21 +74,21 @@ class Database { Result destroy_database(); -protected: + protected: bool database_exists(); struct ServerState { std::unique_ptr server; - std::unique_ptr service; + std::shared_ptr service; }; -private: - storehouse::StorageConfig *storage_config_; + private: + storehouse::StorageConfig* storage_config_; std::unique_ptr storage_; std::string db_path_; std::string master_address_; - ServerState master_state_; - std::vector worker_states_; + std::unique_ptr master_state_; + std::vector> worker_states_; }; } diff --git a/scanner/api/frame.cpp b/scanner/api/frame.cpp new file mode 100644 index 00000000..267a83b1 --- /dev/null +++ b/scanner/api/frame.cpp @@ -0,0 +1,116 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the 
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "scanner/api/frame.h" +#include "scanner/util/memory.h" + +namespace scanner { + +size_t size_of_frame_type(FrameType type) { + size_t s; + switch (type) { + case FrameType::U8: + s = sizeof(u8); + break; + case FrameType::F32: + s = sizeof(f32); + break; + case FrameType::F64: + s = sizeof(f64); + break; + } + return s; +} + +FrameInfo::FrameInfo(int shape0, int shape1, int shape2, FrameType t) { + assert(shape0 >= 0); + assert(shape1 >= 0); + assert(shape2 >= 0); + + shape[0] = shape0; + shape[1] = shape1; + shape[2] = shape2; + type = t; +} + +FrameInfo::FrameInfo(const std::vector shapes, FrameType t) { + assert(shapes.size() <= 3); + + for (int i = 0; i < shapes.size(); ++i) { + shape[i] = shapes[i]; + assert(shape[i] >= 0); + } + type = t; +} + +bool FrameInfo::operator==(const FrameInfo& other) const { + bool same = (type == other.type); + for (int i = 0; i < FRAME_DIMS; ++i) { + same &= (shape[i] == other.shape[i]); + } + return same; +} + +bool FrameInfo::operator!=(const FrameInfo& other) const { + return !(*this == other); +} + +size_t FrameInfo::size() const { + size_t s = size_of_frame_type(type); + for (int i = 0; i < FRAME_DIMS; ++i) { + s *= shape[i]; + } + return s; +} + +int FrameInfo::width() const { return shape[1]; } + +int FrameInfo::height() const { return shape[0]; } + +//! 
Only valid when the dimensions are (height, width, channels) +int FrameInfo::channels() const { return shape[2]; } + +Frame::Frame(FrameInfo info, u8* b) : data(b) { + memcpy(shape, info.shape, sizeof(int) * FRAME_DIMS); + type = info.type; +} + +FrameInfo Frame::as_frame_info() const { + return FrameInfo(shape[0], shape[1], shape[2], type); +} + +size_t Frame::size() const { return as_frame_info().size(); } + +int Frame::width() const { return as_frame_info().width(); } + +int Frame::height() const { return as_frame_info().height(); } + +//! Only valid when the dimensions are (height, width, channels) +int Frame::channels() const { return as_frame_info().channels(); } + +Frame* new_frame(DeviceHandle device, FrameInfo info) { + u8* buffer = new_buffer(device, info.size()); + return new Frame(info, buffer); +} + +std::vector new_frames(DeviceHandle device, FrameInfo info, i32 num) { + u8* buffer = new_block_buffer(device, info.size() * num, num); + std::vector frames; + for (i32 i = 0; i < num; ++i) { + frames.push_back(new Frame(info, buffer + i * info.size())); + } + return frames; +} +} diff --git a/scanner/api/frame.h b/scanner/api/frame.h new file mode 100644 index 00000000..fed82776 --- /dev/null +++ b/scanner/api/frame.h @@ -0,0 +1,88 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "scanner/metadata.pb.h" +#include "scanner/util/common.h" +#include "scanner/util/profiler.h" + +#include + +namespace scanner { + +using proto::FrameType; + +size_t size_of_frame_type(FrameType type); + +const i32 FRAME_DIMS = 3; + +//! FrameInfo +struct FrameInfo { + FrameInfo() = default; + FrameInfo(const FrameInfo& info) = default; + FrameInfo(FrameInfo&& info) = default; + FrameInfo& operator=(const FrameInfo&) = default; + + FrameInfo(int shape0, int shape1, int shape2, FrameType type); + FrameInfo(const std::vector shapes, FrameType type); + + bool operator==(const FrameInfo& other) const; + bool operator!=(const FrameInfo& other) const; + + size_t size() const; + + //! Only valid when the dimensions are (height, width, channels) + int width() const; + + //! Only valid when the dimensions are (height, width, channels) + int height() const; + + //! Only valid when the dimensions are (height, width, channels) + int channels() const; + + int shape[FRAME_DIMS]; + FrameType type; +}; + +//! Frame +class Frame { + public: + Frame(FrameInfo info, u8* buffer); + + FrameInfo as_frame_info() const; + + size_t size() const; + + //! Only valid when the dimensions are (height, width, channels) + int width() const; + + //! Only valid when the dimensions are (height, width, channels) + int height() const; + + //! 
Only valid when the dimensions are (height, width, channels) + int channels() const; + + int shape[FRAME_DIMS]; + FrameType type; + u8* data; +}; + +Frame* new_frame(DeviceHandle device, FrameInfo info); + +void delete_frame(DeviceHandle device, u8* buffer); + +std::vector new_frames(DeviceHandle device, FrameInfo info, i32 num); +} diff --git a/scanner/api/kernel.cpp b/scanner/api/kernel.cpp index 84c9f457..f768c848 100644 --- a/scanner/api/kernel.cpp +++ b/scanner/api/kernel.cpp @@ -20,38 +20,127 @@ namespace scanner { -Kernel::Kernel(const Config &config) {} +Element::Element(u8* _buffer, size_t _size) + : buffer(_buffer), size(_size), is_frame(false) {} -void VideoKernel::check_frame_info(const DeviceHandle &device, - const RowList &row_list) { - auto &rows = row_list.rows; - assert(rows.size() > 0); +Element::Element(Frame* frame) + : buffer((u8*)frame), size(sizeof(Frame)), is_frame(true) {} +BaseKernel::BaseKernel(const KernelConfig& config) {} + +StenciledBatchedKernel::StenciledBatchedKernel(const KernelConfig& config) + : BaseKernel(config) {} + +void StenciledBatchedKernel::execute_kernel( + const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns) { + execute(input_columns, output_columns); +} + +StenciledKernel::StenciledKernel(const KernelConfig& config) + : BaseKernel(config) {} + +void StenciledKernel::execute_kernel( + const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns) { + StenciledColumns in; + for (auto& col : input_columns) { + in.emplace_back(); + std::vector& b = in.back(); + b = col[0]; + } + + Columns out_cols(output_columns.size()); + execute(in, out_cols); + for (size_t i = 0; i < out_cols.size(); ++i) { + output_columns[i].push_back(out_cols[i]); + } +} + +BatchedKernel::BatchedKernel(const KernelConfig& config) + : BaseKernel(config) {} + +void BatchedKernel::execute_kernel( + const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns) { + BatchedColumns in; + for 
(auto& col : input_columns) { + in.emplace_back(); + std::vector& b = in.back(); + for (auto& stencil : col) { + b.push_back(stencil[0]); + } + } + + execute(in, output_columns); +} + +Kernel::Kernel(const KernelConfig& config) + : BaseKernel(config) {} + +void Kernel::execute_kernel( + const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns) { + + Columns in_cols; + for (auto& col : input_columns) { + in_cols.push_back(col[0][0]); + } + + Columns out_cols(output_columns.size()); + execute(in_cols, out_cols); + for (size_t i = 0; i < out_cols.size(); ++i) { + output_columns[i].push_back(out_cols[i]); + } +} + +void VideoKernel::check_frame(const DeviceHandle& device, + const Element& element) { + const Frame* frame = element.as_const_frame(); + bool same = (frame->type == frame_info_.type); + for (i32 i = 0; i < 3; ++i) { + same &= (frame->shape[i] == frame_info_.shape[i]); + } + if (!same) { + memcpy(frame_info_.shape, frame->shape, sizeof(int) * 3); + frame_info_.type = frame->type; + new_frame_info(); + } +} + +void VideoKernel::check_frame_info(const DeviceHandle& device, + const Element& element) { // Assume that all the FrameInfos in the same batch are the same - u8 *buffer = new_buffer(CPU_DEVICE, rows[0].size); - memcpy_buffer((u8 *)buffer, CPU_DEVICE, rows[0].buffer, device, rows[0].size); - FrameInfo frame_info; - bool parsed = frame_info.ParseFromArray(buffer, rows[0].size); - LOG_IF(FATAL, !parsed) << "Invalid frame info"; - delete_buffer(CPU_DEVICE, buffer); + u8* buffer = new_buffer(CPU_DEVICE, element.size); + memcpy_buffer((u8*)buffer, CPU_DEVICE, element.buffer, device, element.size); + FrameInfo* frame_info = reinterpret_cast(buffer); - if (frame_info.width() != frame_info_.width() || - frame_info.height() != frame_info_.height()) { - frame_info_ = frame_info; + bool same = (frame_info->type == frame_info_.type); + for (i32 i = 0; i < 3; ++i) { + same &= (frame_info->shape[i] == frame_info_.shape[i]); + } + if (!same) { + 
memcpy(frame_info_.shape, frame_info->shape, sizeof(int) * 3); + frame_info_.type = frame_info->type; new_frame_info(); } + delete_buffer(CPU_DEVICE, buffer); } namespace internal { -KernelRegistration::KernelRegistration(const KernelBuilder &builder) { - - const std::string &name = builder.name_; +KernelRegistration::KernelRegistration(const KernelBuilder& builder) { + const std::string& name = builder.name_; DeviceType type = builder.device_type_; i32 num_devices = builder.num_devices_; + auto& input_devices = builder.input_devices_; + auto& output_devices = builder.output_devices_; + bool can_batch = builder.can_batch_; + i32 preferred_batch = builder.preferred_batch_size_; KernelConstructor constructor = builder.constructor_; - internal::KernelFactory *factory = - new internal::KernelFactory(name, type, num_devices, 0, constructor); - internal::KernelRegistry *registry = internal::get_kernel_registry(); + internal::KernelFactory* factory = new internal::KernelFactory( + name, type, num_devices, input_devices, output_devices, can_batch, + preferred_batch, constructor); + internal::KernelRegistry* registry = internal::get_kernel_registry(); registry->add_kernel(name, factory); } } diff --git a/scanner/api/kernel.h b/scanner/api/kernel.h index abc128b5..3a73ebb9 100644 --- a/scanner/api/kernel.h +++ b/scanner/api/kernel.h @@ -15,61 +15,195 @@ #pragma once +#include "scanner/api/frame.h" #include "scanner/util/common.h" +#include "scanner/util/memory.h" #include "scanner/util/profiler.h" #include namespace scanner { -using proto::FrameInfo; +//! Element in a Scanner table, byte buffer of arbitrary size. 
+struct Element { + Element() = default; + Element(const Element&) = default; + Element(Element&&) = default; + Element& operator=(const Element&) = default; -bool is_frame_column(const std::string& name); + Element(u8* buffer, size_t size); + Element(Frame* frame); -FrameInfo get_frame_info(const std::string& name); + inline Frame* as_frame() { return reinterpret_cast(buffer); } + inline const Frame* as_const_frame() const { + return reinterpret_cast(buffer); + } + inline FrameInfo* as_frame_info() { + return reinterpret_cast(buffer); + } + inline const FrameInfo* as_const_frame_info() const { + return reinterpret_cast(buffer); + } + + inline bool is_null() const { + return buffer == nullptr; + } -struct Row { u8* buffer; size_t size; + bool is_frame; + // @brief the index of the element in the input domain + i64 index; }; -struct RowList { - std::vector rows; -}; +using ElementList = std::vector; + +using BatchedColumns = std::vector; + +using StenciledColumns = std::vector; + +//! Column -> Batch -> Stencil +using StenciledBatchedColumns = std::vector>; + +using Columns = std::vector; + +inline size_t num_rows(const ElementList& column) { return column.size(); } + +inline void insert_element(ElementList& column, u8* buffer, size_t size) { + column.push_back(::scanner::Element{buffer, size}); +} + +inline void insert_frame(ElementList& column, Frame* frame) { + column.push_back(::scanner::Element{frame}); +} + +inline void insert_element(Element& element, u8* buffer, size_t size) { + element = ::scanner::Element{buffer, size}; +} + +inline void insert_frame(Element& element, Frame* frame) { + element = ::scanner::Element{frame}; +} -using BatchedColumns = std::vector; +inline Element add_element_ref(DeviceHandle device, Element& element) { + if (element.is_null()) { + return Element(); + } + Element ele; + if (element.is_frame) { + Frame* frame = element.as_frame(); + add_buffer_ref(device, frame->data); + // Copy frame because Frame is not referenced counted + 
ele = ::scanner::Element{new Frame(frame->as_frame_info(), frame->data)}; + } else { + add_buffer_ref(device, element.buffer); + ele = element; + } + ele.index = element.index; + return ele; +} + +inline void delete_element(DeviceHandle device, Element& element) { + if (element.is_null()) { + return; + } + if (element.is_frame) { + Frame* frame = element.as_frame(); + delete_buffer(device, frame->data); + delete frame; + } else { + delete_buffer(device, element.buffer); + } +} + +//! Kernel parameters provided at instantiation. +struct KernelConfig { + std::vector devices; //! Non-empty set of devices provided to + //! the kernel. + std::vector input_columns; + std::vector input_column_types; + std::vector output_columns; + std::vector args; //! Byte-string of proto args if given. + i32 node_id; +}; /** * @brief Interface for a unit of computation in a pipeline. * * Kernels form the core of Scanner's interface. They are essentially - * functions that take rows of inputs and produce an equal number rows of + * functions that take elements of inputs and produce an equal number elements + * of * outputs. Kernels are stateful operators that get reset when provided * non-contiguous batches of input. See KernelFactory for how an op * defines what hardware it can use for its computation. */ -class Kernel { +class BaseKernel { public: static const i32 UnlimitedDevices = 0; + BaseKernel(const KernelConfig& config); - struct Config { - std::vector devices; - std::vector input_columns; - std::vector output_columns; - std::vector args; - i32 work_item_size; - }; + virtual ~BaseKernel(){}; - Kernel(const Config& config); + /** + * @brief Checks if kernel arguments are valid. + * + * Only useful if your kernel has its own custom Protobuf arguments. + */ + virtual void validate(proto::Result* result) { result->set_success(true); } - virtual ~Kernel(){}; + /** + * @brief Requests that kernel resets its logical state. 
+ * + * Scanner calls reset on a kernel when it provides non-consecutive + * inputs or when about to provide inputs from a difference slice. This allows + * unbounded or bounded state kernels to clear their logical state so + * that state from logically unrelated parts of the input do not affect + * the output. + */ + virtual void reset(){}; + + /** + * @brief For internal use + **/ + virtual void execute_kernel(const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns) = 0; /** - * @brief TODO + * @brief For internal use + **/ + virtual void set_profiler(Profiler* profiler) { profiler_ = profiler; } + + /** + * The profiler allows an op to save profiling data for later + * visualization. It is not guaranteed to be non-null, so check before use. */ - virtual void validate(proto::Result* result) { - result->set_success(true); - } + Profiler* profiler_ = nullptr; +}; + + +/** + * @brief Interface for a unit of computation in a pipeline. + * + * Kernels form the core of Scanner's interface. They are essentially + * functions that take elements of inputs and produce an equal number elements + * of + * outputs. Kernels are stateful operators that get reset when provided + * non-contiguous batches of input. See KernelFactory for how an op + * defines what hardware it can use for its computation. + */ +class StenciledBatchedKernel : public BaseKernel { + public: + static const i32 UnlimitedDevices = 0; + StenciledBatchedKernel(const KernelConfig& config); + + virtual ~StenciledBatchedKernel(){}; + + /** + * @brief Checks if kernel arguments are valid. + * + * Only useful if your kernel has its own custom Protobuf arguments. + */ + virtual void validate(proto::Result* result) { result->set_success(true); } /** * @brief Resets ops when about to receive non-consecutive inputs. @@ -83,55 +217,176 @@ class Kernel { virtual void reset(){}; /** - * @brief Runs the op on input rows and produces equal number of - * output rows. 
+ * @brief For internal use + **/ + virtual void execute_kernel(const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns) override; + + //! Do not call this function. + virtual void set_profiler(Profiler* profiler) { profiler_ = profiler; } + + protected: + /** + * @brief Runs the op on input elements and produces equal number of + * output elements. * * @param input_columns - * vector of columns, where each column is a vector of inputs and each + * vector of columns, where each column is a vector of inputs and + * each * input is a byte array * @param output_columns * op output, each column must have same length as the number of - * input rows + * input elements * - * Evaluate gets run on batches of inputs. At the beginning of a pipeline this + * Evaluate gets run on batches of inputs. At the beginning of a pipeline + * this * is raw RGB images from the input images/videos, and after that the input * becomes whatever was returned by the previous op. * * Number of output columns must be non-zero. */ - virtual void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) = 0; + virtual void execute(const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns) = 0; /** - * Do not call this function. + * The profiler allows an op to save profiling data for later + * visualization. It is not guaranteed to be non-null, so check before use. */ - virtual void set_profiler(Profiler* profiler) { profiler_ = profiler; } + Profiler* profiler_ = nullptr; +}; +/** + * @brief Interface for a unit of computation in a pipeline. + * + * Kernels form the core of Scanner's interface. They are essentially + * functions that take elements of inputs and produce an equal number elements + * of + * outputs. Kernels are stateful operators that get reset when provided + * non-contiguous batches of input. See KernelFactory for how an op + * defines what hardware it can use for its computation. 
+ */ +class BatchedKernel : public BaseKernel { + public: + BatchedKernel(const KernelConfig& config); + + virtual ~BatchedKernel(){}; + + /** + * @brief For internal use + **/ + virtual void execute_kernel(const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns); protected: /** - * The profiler allows an op to save profiling data for later - * visualization. It is not guaranteed to be non-null, so check before use. + * @brief Runs the op on input elements and produces equal number of + * output elements. + * + * @param input_columns + * vector of columns, where each column is a vector of inputs and + * each + * input is a byte array + * @param output_columns + * op output, each column must have same length as the number of + * input elements + * + * Evaluate gets run on batches of inputs. At the beginning of a pipeline + * this + * is raw RGB images from the input images/videos, and after that the input + * becomes whatever was returned by the previous op. + * + * Number of output columns must be non-zero. */ - Profiler* profiler_ = nullptr; + virtual void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) = 0; }; -class VideoKernel : public Kernel { -public: - VideoKernel(const Config& config) : Kernel(config) {}; +class StenciledKernel : public BaseKernel { + public: + StenciledKernel(const KernelConfig& config); -protected: - void check_frame_info(const DeviceHandle& device, const RowList& row_list); - virtual void new_frame_info(){}; + virtual ~StenciledKernel(){}; - FrameInfo frame_info_; + /** + * @brief For internal use + **/ + virtual void execute_kernel(const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns); + protected: + /** + * @brief Runs the op on input elements and produces equal number of + * output elements. 
+ * + * @param input_columns + * vector of columns, where each column is a vector of inputs and + * each + * input is a byte array + * @param output_columns + * op output, each column must have same length as the number of + * input elements + * + * Evaluate gets run on batches of inputs. At the beginning of a pipeline + * this + * is raw RGB images from the input images/videos, and after that the input + * becomes whatever was returned by the previous op. + * + * Number of output columns must be non-zero. + */ + virtual void execute(const StenciledColumns& input_columns, + Columns& output_columns) = 0; }; -#define ROW_BUFFER(column__, row__) (column__.rows[row__].buffer) +class Kernel : public BaseKernel { + public: + Kernel(const KernelConfig& config); -#define ROW_SIZE(column__, row__) (column__.rows[row__].size) + virtual ~Kernel(){}; -#define INSERT_ROW(column__, buffer__, size__) \ - column__.rows.push_back(::scanner::Row{buffer__, size__}) + /** + * @brief For internal use + **/ + virtual void execute_kernel(const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns); + protected: + /** + * @brief Runs the op on input elements and produces equal number of + * output elements. + * + * @param input_columns + * vector of elements, where each element is from a different column + * @param output_columns + * op output, vector of elements, where each element is from a + * different column + * + * Evaluate gets run on batches of inputs. At the beginning of a pipeline + * this + * is raw RGB images from the input images/videos, and after that the input + * becomes whatever was returned by the previous op. + * + * Number of output columns must be non-zero. + */ + virtual void execute(const Columns& input_columns, + Columns& output_columns) = 0; +}; + +//! Kernel with support for frame and frame_info columns. +class VideoKernel { + protected: + /** + * @brief Checks frame info column against cached data. 
+ * + * This function should be called at the top of the execute function on the + * frame info column. If the frame info changes, e.g. the kernel is processing + * a new video, then this calls new_frame_info which you can override. + */ + void check_frame(const DeviceHandle& device, const Element& element); + + void check_frame_info(const DeviceHandle& device, const Element& element); + + //! Callback for if frame info changes. + virtual void new_frame_info(){}; + + FrameInfo frame_info_{}; +}; /////////////////////////////////////////////////////////////////////////////// /// Implementation Details @@ -139,7 +394,8 @@ namespace internal { class KernelBuilder; -using KernelConstructor = std::function; +using KernelConstructor = + std::function; class KernelRegistration { public: @@ -147,12 +403,16 @@ class KernelRegistration { }; class KernelBuilder { -public: + public: friend class KernelRegistration; - KernelBuilder(const std::string &name, - KernelConstructor constructor) - : name_(name), constructor_(constructor) {} + KernelBuilder(const std::string& name, KernelConstructor constructor) + : name_(name), + constructor_(constructor), + device_type_(DeviceType::CPU), + num_devices_(1), + can_batch_(false), + preferred_batch_size_(1) {} KernelBuilder& device(DeviceType device_type) { device_type_ = device_type; @@ -164,13 +424,34 @@ class KernelBuilder { return *this; } + KernelBuilder& input_device(const std::string& input_name, + DeviceType device_type) { + input_devices_[input_name] = device_type; + return *this; + } + + KernelBuilder& output_device(const std::string& output_name, + DeviceType device_type) { + output_devices_[output_name] = device_type; + return *this; + } + + KernelBuilder& batch(i32 preferred_batch_size = 1) { + can_batch_ = true; + preferred_batch_size = preferred_batch_size; + return *this; + } + private: std::string name_; KernelConstructor constructor_; DeviceType device_type_; i32 num_devices_; + std::map input_devices_; + std::map 
output_devices_; + bool can_batch_; + i32 preferred_batch_size_; }; - } #define REGISTER_KERNEL(name__, kernel__) \ @@ -179,11 +460,10 @@ class KernelBuilder { #define REGISTER_KERNEL_HELPER(uid__, name__, kernel__) \ REGISTER_KERNEL_UID(uid__, name__, kernel__) -#define REGISTER_KERNEL_UID(uid__, name__, kernel__) \ - static ::scanner::internal::KernelRegistration kernel_registration_##uid__ \ - __attribute__((unused)) = ::scanner::internal::KernelBuilder( \ - #name__, \ - [](const ::scanner::Kernel::Config &config) { \ - return new kernel__(config); }) - +#define REGISTER_KERNEL_UID(uid__, name__, kernel__) \ + static ::scanner::internal::KernelRegistration kernel_registration_##uid__ \ + __attribute__((unused)) = ::scanner::internal::KernelBuilder( \ + #name__, [](const ::scanner::KernelConfig& config) { \ + return new kernel__(config); \ + }) } diff --git a/scanner/api/op.cpp b/scanner/api/op.cpp index 117ae992..ee09bca4 100644 --- a/scanner/api/op.cpp +++ b/scanner/api/op.cpp @@ -18,47 +18,42 @@ #include "scanner/engine/op_registry.h" namespace scanner { - -Op::Op(const std::string &name, - const std::vector &inputs, - DeviceType device_type, char *args, size_t args_size) - : name_(name), inputs_(inputs), type_(device_type), args_(args), - args_size_(args_size) {} - -const std::string &Op::get_name() const { return name_; } - -const std::vector &Op::get_inputs() const { return inputs_; } - -DeviceType Op::get_device_type() const { return type_; } - -char *Op::get_args() const { return args_; } - -size_t Op::get_args_size() const { return args_size_; } - -Op *OpInput::get_op() const { return op; } - -const std::vector &OpInput::get_columns() const { - return columns; -} - -Op *make_input_op(const std::vector &columns) { - OpInput eval_input = {nullptr, columns}; - return new Op("InputTable", {eval_input}, DeviceType::CPU); -} - -Op *make_output_op(const std::vector &inputs) { - return new Op("OutputTable", inputs, DeviceType::CPU); -} - namespace internal { 
-OpRegistration::OpRegistration(const OpBuilder &builder) { - const std::string &name = builder.name_; - const std::vector &input_columns = builder.input_columns_; - const std::vector &output_columns = builder.output_columns_; - OpInfo *info = new OpInfo(name, input_columns, output_columns); - OpRegistry *registry = get_op_registry(); - registry->add_op(name, info); +OpRegistration::OpRegistration(const OpBuilder& builder) { + const std::string& name = builder.name_; + const bool variadic_inputs = builder.variadic_inputs_; + std::vector input_columns; + size_t i = 0; + for (auto& name_type : builder.input_columns_) { + Column col; + col.set_id(i++); + col.set_name(std::get<0>(name_type)); + col.set_type(std::get<1>(name_type)); + input_columns.push_back(col); + } + std::vector output_columns; + i = 0; + for (auto& name_type : builder.output_columns_) { + Column col; + col.set_id(i++); + col.set_name(std::get<0>(name_type)); + col.set_type(std::get<1>(name_type)); + output_columns.push_back(col); + } + bool can_stencil = builder.can_stencil_; + const std::vector& stencil = builder.preferred_stencil_; + bool has_bounded_state = builder.has_bounded_state_; + i32 warmup = builder.warmup_; + bool has_unbounded_state = builder.has_unbounded_state_; + OpInfo* info = new OpInfo(name, variadic_inputs, input_columns, + output_columns, can_stencil, stencil, + has_bounded_state, warmup, has_unbounded_state); + OpRegistry* registry = get_op_registry(); + Result result = registry->add_op(name, info); + if (!result.success()) { + LOG(WARNING) << "Failed to register op " << name << ": " << result.msg(); + } } } } diff --git a/scanner/api/op.h b/scanner/api/op.h index 400c7984..200ecc0b 100644 --- a/scanner/api/op.h +++ b/scanner/api/op.h @@ -22,52 +22,6 @@ namespace scanner { -struct OpInput; - -class Op { -public: - Op(const std::string &name, const std::vector &inputs, - DeviceType device_type, - char *args = nullptr, size_t args_size = 0); - - virtual ~Op(){}; - - const 
std::string& get_name() const; - - const std::vector& get_inputs() const; - - DeviceType get_device_type() const; - - char* get_args() const; - - size_t get_args_size() const; - -protected: - std::string name_; - std::vector inputs_; - DeviceType type_; - char* args_; - size_t args_size_; -}; - -class OpInput { -public: - OpInput(Op *op, const std::vector &columns) - : op(op), columns(columns) {} - - Op* get_op() const; - - const std::vector& get_columns() const; - -private: - Op *op; - std::vector columns; -}; - -Op* make_input_op(const std::vector& columns); - -Op* make_output_op(const std::vector& inputs); - /////////////////////////////////////////////////////////////////////////////// /// Implementation Details namespace internal { @@ -83,31 +37,88 @@ class OpBuilder { public: friend class OpRegistration; - OpBuilder(const std::string &name) - : name_(name) {} + OpBuilder(const std::string& name) + : name_(name), variadic_inputs_(false), can_stencil_(false), + has_bounded_state_(false), warmup_(0), has_unbounded_state_(false) {} - OpBuilder& inputs(const std::vector& columns) { - input_columns_ = columns; + OpBuilder& variadic_inputs() { + if (input_columns_.size() > 0) { + LOG(FATAL) << "Op " << name_ << " cannot have both fixed and variadic " + << "inputs"; + } + variadic_inputs_ = true; return *this; } - OpBuilder& outputs(const std::vector& columns) { - output_columns_ = columns; + OpBuilder& input(const std::string& name, + ColumnType type = ColumnType::Other) { + if (variadic_inputs_) { + LOG(FATAL) << "Op " << name_ << " cannot have both fixed and variadic " + << "inputs"; + } + input_columns_.push_back(std::make_tuple(name, type)); + return *this; + } + + OpBuilder& frame_input(const std::string& name) { + return input(name, ColumnType::Video); + } + + OpBuilder& output(const std::string& name, + ColumnType type = ColumnType::Other) { + output_columns_.push_back(std::make_tuple(name, type)); + return *this; + } + + OpBuilder& frame_output(const 
std::string& name) { + return output(name, ColumnType::Video); + } + + OpBuilder& stencil(const std::vector& stencil = {0}) { + can_stencil_ = true; + preferred_stencil_ = stencil; + return *this; + } + + OpBuilder& bounded_state(i32 warmup = 0) { + if (has_unbounded_state_) { + LOG(FATAL) << "Attempted to specify Op " << name_ + << " has bounded state but Op was already declared to have " + "unbounded state."; + } + has_bounded_state_ = true; + warmup_ = warmup; + return *this; + } + + OpBuilder& unbounded_state() { + if (has_bounded_state_) { + LOG(FATAL) << "Attempted to specify Op " << name_ + << " has unbounded state but Op was already declared to have " + "bounded state."; + } + has_unbounded_state_ = true; return *this; } private: std::string name_; - std::vector input_columns_; - std::vector output_columns_; + bool variadic_inputs_; + std::vector> input_columns_; + std::vector> output_columns_; + bool can_stencil_; + std::vector preferred_stencil_ = {0}; + bool has_bounded_state_; + i32 warmup_; + bool has_unbounded_state_; }; } -#define REGISTER_OP(name__) \ - REGISTER_OP_UID(__COUNTER__, name__) +#define REGISTER_OP(name__) REGISTER_OP_HELPER(__COUNTER__, name__) + +#define REGISTER_OP_HELPER(uid__, name__) REGISTER_OP_UID(uid__, name__) -#define REGISTER_OP_UID(uid__, name__) \ - static ::scanner::internal::OpRegistration \ - op_registration_##uid__ __attribute__((unused)) = \ - ::scanner::internal::OpBuilder(#name__) +#define REGISTER_OP_UID(uid__, name__) \ + static ::scanner::internal::OpRegistration op_registration_##uid__ \ + __attribute__((unused)) = ::scanner::internal::OpBuilder(#name__) } diff --git a/scanner/api/user_function.cpp b/scanner/api/user_function.cpp index a4af041a..26a3bdae 100644 --- a/scanner/api/user_function.cpp +++ b/scanner/api/user_function.cpp @@ -2,25 +2,25 @@ namespace scanner { -void UserFunctionRegistry::add_user_function(const std::string &name, +void UserFunctionRegistry::add_user_function(const std::string& name, 
const FnPtr fn) { fns_.insert({name, fn}); } -bool UserFunctionRegistry::has_user_function(const std::string &name) { +bool UserFunctionRegistry::has_user_function(const std::string& name) { return fns_.count(name) > 0; } -UserFunctionRegistry *get_user_function_registry() { - static UserFunctionRegistry *registry = new UserFunctionRegistry; +UserFunctionRegistry* get_user_function_registry() { + static UserFunctionRegistry* registry = new UserFunctionRegistry; return registry; } namespace internal { -UserFunctionRegistration::UserFunctionRegistration(const std::string &name, +UserFunctionRegistration::UserFunctionRegistration(const std::string& name, const FnPtr fn) { - UserFunctionRegistry *registry = get_user_function_registry(); + UserFunctionRegistry* registry = get_user_function_registry(); registry->add_user_function(name, fn); } } diff --git a/scanner/api/user_function.h b/scanner/api/user_function.h index 868a8432..1d026ce2 100644 --- a/scanner/api/user_function.h +++ b/scanner/api/user_function.h @@ -4,45 +4,46 @@ namespace scanner { -using FnPtr = void(*)(); +using FnPtr = void (*)(); +//! Convenience for dynamic registration of C++ functions. 
class UserFunctionRegistry { -public: - void add_user_function(const std::string &name, const FnPtr fn); + public: + void add_user_function(const std::string& name, const FnPtr fn); - template - T get_user_function(const std::string &name); + template + T get_user_function(const std::string& name); bool has_user_function(const std::string& name); -private: + private: std::map fns_; }; UserFunctionRegistry* get_user_function_registry(); -template -T UserFunctionRegistry::get_user_function(const std::string &name) { +template +T UserFunctionRegistry::get_user_function(const std::string& name) { return reinterpret_cast(fns_.at(name)); } namespace internal { class UserFunctionRegistration { -public: + public: UserFunctionRegistration(const std::string& name, const FnPtr fn); }; - } #define REGISTER_USER_FUNCTION(name__, function__) \ REGISTER_USER_FUNCTION_HELPER(__COUNTER__, name__, function__) -#define REGISTER_USER_FUNCTION_HELPER(uid__, name__, function__) \ +#define REGISTER_USER_FUNCTION_HELPER(uid__, name__, function__) \ REGISTER_USER_FUNCTION_UID(uid__, name__, function__) -#define REGISTER_USER_FUNCTION_UID(uid__, name__, function__) \ - static ::scaner::internal::UserFunctionRegistration user_function_registration_##uid__ = \ - ::scanner::internal::UserFunctionRegistration(#name__, static_cast(function__)); - +#define REGISTER_USER_FUNCTION_UID(uid__, name__, function__) \ + static ::scanner::internal::UserFunctionRegistration \ + user_function_registration_##uid__ = \ + ::scanner::internal::UserFunctionRegistration( \ + #name__, static_cast(function__)); } diff --git a/scanner/engine/CMakeLists.txt b/scanner/engine/CMakeLists.txt index b4496dc9..03371c2b 100644 --- a/scanner/engine/CMakeLists.txt +++ b/scanner/engine/CMakeLists.txt @@ -1,19 +1,29 @@ get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) -configure_file(python.in.cpp python.cpp) +configure_file(build_flags.in.cpp build_flags.cpp) + +add_executable(build_flags
build_flags.cpp) set(SOURCE_FILES + runtime.cpp master.cpp worker.cpp ingest.cpp + video_index_entry.cpp load_worker.cpp evaluate_worker.cpp save_worker.cpp - sampling.cpp sampler.cpp - db.cpp + dag_analysis.cpp + metadata.cpp kernel_registry.cpp op_registry.cpp - python.cpp) + table_meta_cache.cpp + python.cpp + python_kernel.cpp + sample_op.cpp + space_op.cpp + slice_op.cpp + unslice_op.cpp) add_library(engine OBJECT ${SOURCE_FILES}) diff --git a/scanner/engine/build_flags.in.cpp b/scanner/engine/build_flags.in.cpp new file mode 100644 index 00000000..56708727 --- /dev/null +++ b/scanner/engine/build_flags.in.cpp @@ -0,0 +1,10 @@ +#include +int main() { + std::cout << "@dirs@" << std::endl; +#ifdef HAVE_CUDA + std::cout << "-DHAVE_CUDA" << std::endl; +#else + std::cout << std::endl; +#endif + return 0; +} diff --git a/scanner/engine/dag_analysis.cpp b/scanner/engine/dag_analysis.cpp new file mode 100644 index 00000000..248f2a81 --- /dev/null +++ b/scanner/engine/dag_analysis.cpp @@ -0,0 +1,1440 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "scanner/engine/dag_analysis.h" +#include "scanner/engine/sampler.h" +#include "scanner/api/op.h" +#include "scanner/api/kernel.h" + +namespace scanner { +namespace internal { + +bool is_builtin_op(const std::string& name) { + for (const auto& n : BUILTIN_OP_NAMES) { + if (n == name) { + return true; + } + } + return false; +} + +Result validate_jobs_and_ops( + DatabaseMetadata& meta, TableMetaCache& table_metas, + const std::vector& jobs, + const std::vector& ops, + DAGAnalysisInfo& info) { + std::vector& op_slice_level = info.op_slice_level; + std::map& input_ops = info.input_ops; + std::map& slice_ops = info.slice_ops; + std::map& unslice_ops = info.unslice_ops; + std::map& sampling_ops = info.sampling_ops; + std::map>& op_children = info.op_children; + + Result result; + result.set_success(true); + { + // Validate ops + OpRegistry* op_registry = get_op_registry(); + KernelRegistry* kernel_registry = get_kernel_registry(); + + i32 op_idx = 0; + // Keep track of op names and outputs for verifying that requested + // edges between ops are valid + std::vector> op_outputs; + // Slices are currently restricted to not nest and there to only exist + // a single slice grouping from start to finish currently. 
+ std::vector op_names; + for (auto& op : ops) { + op_names.push_back(op.name()); + + // Input Op's output is defined by the input table column they sample + if (op.name() == INPUT_OP_NAME) { + if (op.inputs().size() == 0) { + RESULT_ERROR(&result, "Input op at %d did not specify any inputs.", + op_idx); + return result; + } + if (op.inputs().size() > 1) { + RESULT_ERROR(&result, "Input op at %d specified more than one input.", + op_idx); + return result; + } + op_outputs.emplace_back(); + op_outputs.back().push_back(op.inputs(0).column()); + size_t input_ops_size = input_ops.size(); + input_ops[op_idx] = input_ops_size; + op_slice_level.push_back(0); + op_idx++; + continue; + } + + // Verify the inputs for this Op + i32 input_count = op.inputs().size(); + i32 input_slice_level = 0; + if (input_count > 0) { + input_slice_level = op_slice_level.at(op.inputs(0).op_index()); + } + for (auto& input : op.inputs()) { + // Verify inputs are topologically sorted so we can traverse linearly + if (input.op_index() >= op_idx) { + RESULT_ERROR(&result, + "Op %s (%d) referenced input index %d." + "Ops must be specified in topo sort order.", + op.name().c_str(), op_idx, input.op_index()); + return result; + } + + // Verify the requested input is provided by the Op the input is being + // requested from. 
const std::string& input_op_name = op_names.at(input.op_index()); + const std::vector& inputs = + op_outputs.at(input.op_index()); + const std::string requested_input_column = input.column(); + bool found = false; + for (auto& out_col : inputs) { + if (requested_input_column == out_col) { + found = true; + break; + } + } + if (!found) { + RESULT_ERROR(&result, + "Op %s at index %d requested column %s from input " + "Op %s at index %d but that Op does not have the " + "requested column.", + op.name().c_str(), op_idx, + requested_input_column.c_str(), input_op_name.c_str(), + input.op_index()); + return result; + } + + // Verify inputs are from the same slice level + if (op_slice_level.at(input.op_index()) != input_slice_level) { + RESULT_ERROR(&result, + "Input Op %s (%d) specified inputs at " + "different slice levels (%d vs %d). Ops within at " + "a slice level should only receive inputs from other " + "Ops at the same slice level.", + op.name().c_str(), op_idx, input_slice_level, + op_slice_level.at(input.op_index())); + return result; + } + + // HACK(apoms): we currently restrict all unslice outputs to only + // be consumed by an output Op to make it easy to schedule each + // slice like an independent task. + if (input_op_name == UNSLICE_OP_NAME && + op.name() != OUTPUT_OP_NAME) { + RESULT_ERROR(&result, + "Unslice Op specified as input to %s Op. 
Scanner " + "currently only supports Output Ops consuming " + "the results of an Unslice Op.", + op.name().c_str()); + return result; + } + + // Keep op children info for later analysis + op_children[input.op_index()].push_back(op_idx); + } + + // Slice + int output_slice_level = input_slice_level; + if (op.name() == SLICE_OP_NAME) { + if (output_slice_level > 0) { + RESULT_ERROR(&result, "Nested slicing not currently supported."); + return result; + } + size_t slice_ops_size = slice_ops.size(); + slice_ops[op_idx] = slice_ops_size; + output_slice_level += 1; + op_outputs.emplace_back(); + for (auto& input : op.inputs()) { + op_outputs.back().push_back(input.column()); + } + } + // Unslice + else if (op.name() == UNSLICE_OP_NAME) { + if (input_slice_level == 0) { + RESULT_ERROR(&result, + "Unslice received inputs that have not been " + "sliced."); + return result; + } + size_t unslice_ops_size = unslice_ops.size(); + unslice_ops[op_idx] = unslice_ops_size; + output_slice_level -= 1; + op_outputs.emplace_back(); + for (auto& input : op.inputs()) { + op_outputs.back().push_back(input.column()); + } + } + // Sample & Space + else if (op.name() == "Sample" || op.name() == "SampleFrame" || + op.name() == "Space" || op.name() == "SpaceFrame") { + size_t sampling_ops_size = sampling_ops.size(); + sampling_ops[op_idx] = sampling_ops_size; + op_outputs.emplace_back(); + for (auto& input : op.inputs()) { + op_outputs.back().push_back(input.column()); + } + } + // Output + else if (op.name() == OUTPUT_OP_NAME) { + if (input_slice_level != 0) { + RESULT_ERROR(&result, + "Final output columns are sliced. 
Final outputs must " + "be unsliced."); + return result; + } + } + // Verify op exists and record outputs + else { + op_outputs.emplace_back(); + if (!op_registry->has_op(op.name())) { + RESULT_ERROR(&result, "Op %s is not registered.", op.name().c_str()); + return result; + } else { + // Keep track of op outputs for verifying dependent ops + for (auto& col : + op_registry->get_op_info(op.name())->output_columns()) { + op_outputs.back().push_back(col.name()); + } + } + if (!kernel_registry->has_kernel(op.name(), op.device_type())) { + RESULT_ERROR(&result, + "Op %s at index %d requested kernel with device type " + "%s but no such kernel exists.", + op.name().c_str(), op_idx, + (op.device_type() == DeviceType::CPU ? "CPU" : "GPU")); + return result; + } + } + op_slice_level.push_back(output_slice_level); + // Perform Op parameter verification (stenciling, batching, # inputs) + if (!is_builtin_op(op.name())) { + OpInfo* info = op_registry->get_op_info(op.name()); + KernelFactory* factory = + kernel_registry->get_kernel(op.name(), op.device_type()); + // Check that the # of inputs match up + // TODO(apoms): type check for frame + if (!info->variadic_inputs()) { + i32 expected_inputs = info->input_columns().size(); + if (expected_inputs != input_count) { + RESULT_ERROR( + &result, + "Op %s at index %d expects %d input columns, but received %d", + op.name().c_str(), op_idx, expected_inputs, input_count); + return result; + } + } + + // Check that a stencil is not set on a non-stenciling kernel + // If can't stencil, then should have a zero size stencil or a size 1 + // stencil with the element 0 + if (!info->can_stencil() && + !((op.stencil_size() == 0) || + (op.stencil_size() == 1 && op.stencil(0) == 0))) { + RESULT_ERROR( + &result, + "Op %s at index %d specified stencil but that Op was not " + "declared to support stenciling. 
Add .stencil() to the Op " + "declaration to support stenciling.", + op.name().c_str(), op_idx); + return result; + } + // Check that a batch size is not set on a non-batching kernel + if (!factory->can_batch() && op.batch() > 1) { + RESULT_ERROR( + &result, + "Op %s at index %d specified a batch size but the Kernel for " + "that Op was not declared to support batching. Add .batch() to " + "the Kernel declaration to support batching.", + op.name().c_str(), op_idx); + return result; + } + } + op_idx++; + } + if (op_names.size() < 2) { + RESULT_ERROR(&result, + "Must specify at least two Ops: " + "an Input Op, and an Output Op. " + "However, %lu Op(s) were specified.", + op_names.size()); + return result; + } else { + if (op_names.back() != OUTPUT_OP_NAME) { + RESULT_ERROR(&result, "Last Op is %s but must be %s", + op_names.back().c_str(), + OUTPUT_OP_NAME.c_str()); + return result; + } + } + } + + // Validate table tasks + std::set job_output_table_names; + for (auto& job : jobs) { + if (job.output_table_name() == "") { + RESULT_ERROR(&result, + "Job specified with empty output table name. Output " + "tables can not have empty names"); + return result; + } + if (meta.has_table(job.output_table_name())) { + RESULT_ERROR(&result, + "Job specified with duplicate output table name. " + "A table with name %s already exists.", + job.output_table_name().c_str()); + return result; + } + if (job_output_table_names.count(job.output_table_name()) > 0) { + RESULT_ERROR(&result, + "Multiple table tasks specified with output table name %s. " + "Table names must be unique.", + job.output_table_name().c_str()); + return result; + } + job_output_table_names.insert(job.output_table_name()); + + // Verify table task column inputs + if (job.inputs().size() == 0) { + RESULT_ERROR( + &result, + "Job %s did not specify any table inputs. 
Jobs " + "must specify at least one table to sample from.", + job.output_table_name().c_str()); + return result; + } else { + std::set used_input_ops; + for (auto& column_input : job.inputs()) { + // Verify input is specified on an Input Op + if (used_input_ops.count(column_input.op_index()) > 0) { + RESULT_ERROR(&result, + "Job %s tried to set input column for Input Op " + "at %d twice.", + job.output_table_name().c_str(), + column_input.op_index()); + return result; + } + if (input_ops.count(column_input.op_index()) == 0) { + RESULT_ERROR(&result, + "Job %s tried to set input column for Input Op " + "at %d, but this Op is not an Input Op.", + job.output_table_name().c_str(), + column_input.op_index()); + return result; + } + used_input_ops.insert(column_input.op_index()); + // Verify column input table exists + if (!meta.has_table(column_input.table_name())) { + RESULT_ERROR(&result, + "Job %s tried to sample from non-existent table " + "%s.", + job.output_table_name().c_str(), + column_input.table_name().c_str()); + return result; + } + // Verify column input column exists in the requested table + if (!table_metas.at(column_input.table_name()) + .has_column(column_input.column_name())) { + RESULT_ERROR(&result, + "Job %s tried to sample column %s from table %s, " + "but that column is not in that table.", + job.output_table_name().c_str(), + column_input.column_name().c_str(), + column_input.table_name().c_str()); + return result; + } + } + } + + // Verify sampling args for table task + { + std::set used_sampling_ops; + for (auto& sampling_args_assignment : job.sampling_args_assignment()) { + if (used_sampling_ops.count(sampling_args_assignment.op_index()) > 0) { + RESULT_ERROR(&result, + "Job %s tried to set sampling args for Op at %d " + "twice.", + job.output_table_name().c_str(), + sampling_args_assignment.op_index()); + return result; + } + if (sampling_ops.count(sampling_args_assignment.op_index()) == 0 && + 
slice_ops.count(sampling_args_assignment.op_index()) == 0) { + RESULT_ERROR(&result, + "Job %s tried to set sampling args for Op at %d, " + "but this Op is not a sampling or slicing Op.", + job.output_table_name().c_str(), + sampling_args_assignment.op_index()); + return result; + } + used_sampling_ops.insert(sampling_args_assignment.op_index()); + // TODO(apoms): verify sampling args are valid + if (sampling_args_assignment.sampling_args().size() == 0) { + RESULT_ERROR(&result, + "Job %s tried to set empty sampling args for Op at %d.", + job.output_table_name().c_str(), + sampling_args_assignment.op_index()); + return result; + } + i32 slice_level = + op_slice_level.at(sampling_args_assignment.op_index()); + if (slice_level == 0 && + sampling_args_assignment.sampling_args().size() > 1) { + RESULT_ERROR(&result, + "Job %s tried to set multiple sampling args for " + "Op at %d that has not been sliced.", + job.output_table_name().c_str(), + sampling_args_assignment.op_index()); + return result; + } + } + } + } + return result; +} + +Result determine_input_rows_to_slices( + DatabaseMetadata& meta, TableMetaCache& table_metas, + const std::vector& jobs, + const std::vector& ops, + DAGAnalysisInfo& info) { + Result result; + result.set_success(true); + const std::vector& op_slice_level = info.op_slice_level; + const std::map& input_ops = info.input_ops; + const std::map& slice_ops = info.slice_ops; + const std::map& unslice_ops = info.unslice_ops; + const std::map& sampling_ops = info.sampling_ops; + const std::map>& op_children = info.op_children; + std::vector>& slice_input_rows = info.slice_input_rows; + std::vector>>& slice_output_rows = + info.slice_output_rows; + std::vector>>& unslice_input_rows = + info.unslice_input_rows; + std::vector>>& total_rows_per_op = + info.total_rows_per_op; + std::vector& total_output_rows = info.total_output_rows; + // For each job, use table rows to determine number of total possible outputs + // by propagating downward through Op 
DAG + for (const proto::Job& job : jobs) { + slice_input_rows.emplace_back(); + std::map& job_slice_input_rows = slice_input_rows.back(); + slice_output_rows.emplace_back(); + std::map>& job_slice_output_rows = + slice_output_rows.back(); + unslice_input_rows.emplace_back(); + std::map>& job_unslice_input_rows = + unslice_input_rows.back(); + total_rows_per_op.emplace_back(); + std::map>& job_total_rows_per_op = + total_rows_per_op.back(); + // Create domain samplers using sampling args + // Op idx -> samplers for each slice group + std::map args_assignment; + std::map>> domain_samplers; + for (const proto::SamplingArgsAssignment& saa : + job.sampling_args_assignment()) { + if (ops.at(saa.op_index()).name() == SLICE_OP_NAME) { + args_assignment[saa.op_index()] = saa; + } else { + std::vector>& samplers = + domain_samplers[saa.op_index()]; + // Assign number of rows to correct op + for (auto& sa : saa.sampling_args()) { + DomainSampler* sampler; + result = make_domain_sampler_instance( + sa.sampling_function(), + std::vector(sa.sampling_args().begin(), + sa.sampling_args().end()), + sampler); + if (!result.success()) { + return result; + } + samplers.emplace_back(sampler); + } + } + } + // Each Op can have a vector of outputs because of one level slicing + // Op idx -> input columns -> slice groups + std::vector>> op_num_inputs(ops.size()); + // Currently, we constrain there to only be a single number of slice groups + // per job (no slicing in different ways) to make it easy to schedule + // as tasks + i64 number_of_slice_groups = -1; + // First populate num rows from table inputs + for (const proto::ColumnInput& ci : job.inputs()) { + // Determine number of rows for the requested table + i64 num_rows = table_metas.at(ci.table_name()).num_rows(); + // Assign number of rows to correct op + op_num_inputs[ci.op_index()] = {{num_rows}}; + } + bool success = false; + std::vector ready_ops; + for (auto& kv : input_ops) { + ready_ops.push_back(kv.first); + } + while 
(!ready_ops.empty()) { + i64 op_idx = ready_ops.back(); + ready_ops.pop_back(); + + // Verify inputs are rate matched + std::vector slice_group_outputs; + { + const std::vector& first_input_column_slice_groups = + op_num_inputs.at(op_idx).at(0); + // Check all columns match the first column + for (const auto& input_column_slice_groups : op_num_inputs.at(op_idx)) { + // Verify there are the same number of slice groups + if (input_column_slice_groups.size() != + first_input_column_slice_groups.size()) { + RESULT_ERROR( + &result, + "Job %s specified multiple inputs with a differing " + "number of slice groups for %s Op at %ld (%lu vs %lu).", + job.output_table_name().c_str(), ops[op_idx].name().c_str(), + op_idx, first_input_column_slice_groups.size(), + input_column_slice_groups.size()); + return result; + } + // Verify the number of rows for each slice group matches + for (size_t i = 0; i < first_input_column_slice_groups.size(); ++i) { + if (input_column_slice_groups.at(i) != + first_input_column_slice_groups.at(i)) { + RESULT_ERROR( + &result, + "Job %s specified multiple inputs with a differing " + "number of rows for slice group %lu for %s Op at %ld " + "(%lu vs %lu).", + job.output_table_name().c_str(), i, + ops[op_idx].name().c_str(), op_idx, + input_column_slice_groups.at(i), + first_input_column_slice_groups.at(i)); + return result; + } + } + } + slice_group_outputs = first_input_column_slice_groups; + } + // Check if we are done + if (ops[op_idx].name() == OUTPUT_OP_NAME) { + // Should always be at slice level 0 + assert(slice_group_outputs.size() == 1); + total_output_rows.push_back(slice_group_outputs.at(0)); + success = true; + break; + } + // Check if this is a sampling Op + if (sampling_ops.count(op_idx) > 0) { + i64 sampling_op_idx = sampling_ops.at(op_idx); + // Verify number of samplers is equal to number of slice groups + if (domain_samplers.at(op_idx).size() != slice_group_outputs.size()) { + RESULT_ERROR(&result, + "Job %s specified %lu samplers 
but there are %lu slice " + "groups for %s Op at %ld.", + job.output_table_name().c_str(), + domain_samplers.at(op_idx).size(), + slice_group_outputs.size(), ops[op_idx].name().c_str(), + op_idx); + return result; + } + // Apply domain samplers to determine downstream row count + std::vector new_slice_group_outputs; + for (size_t i = 0; i < slice_group_outputs.size(); ++i) { + auto& sampler = domain_samplers.at(op_idx).at(i); + i64 new_outputs = 0; + result = sampler->get_num_downstream_rows(slice_group_outputs.at(i), + new_outputs); + if (!result.success()) { + return result; + } + new_slice_group_outputs.push_back(new_outputs); + } + slice_group_outputs = new_slice_group_outputs; + } + + // Check if this is a slice op + if (slice_ops.count(op_idx) > 0) { + assert(op_slice_level.at(op_idx) == 1); + assert(slice_group_outputs.size() == 1); + // Create Partitioner to enumerate slices + Partitioner* partitioner = nullptr; + auto& args = args_assignment[op_idx].sampling_args(0); + result = make_partitioner_instance( + args.sampling_function(), + std::vector( + args.sampling_args().begin(), + args.sampling_args().end()), + slice_group_outputs.at(0), + partitioner); + if (!result.success()) { + return result; + } + + // Track job slice inputs so we can determine number of groups later + job_slice_input_rows.insert({op_idx, slice_group_outputs.at(0)}); + // Update outputs with the new slice group outputs for this partition + slice_group_outputs = partitioner->total_rows_per_group(); + delete partitioner; + + if (number_of_slice_groups == -1) { + number_of_slice_groups = slice_group_outputs.size(); + } else if (slice_group_outputs.size() != number_of_slice_groups) { + RESULT_ERROR( + &result, + "Job %s specified one slice with %lu groups and another " + "slice with %lu groups. 
Scanner currently does not " + "support multiple slices with different numbers of groups " + "in the same job.", + job.output_table_name().c_str(), + slice_group_outputs.size(), number_of_slice_groups); + return result; + } + job_slice_output_rows.insert({op_idx, slice_group_outputs}); + } + + // Check if this is an unslice op + if (unslice_ops.count(op_idx) > 0) { + assert(op_slice_level.at(op_idx) == 0); + + job_unslice_input_rows.insert({op_idx, slice_group_outputs}); + // Concatenate all slice group outputs + i64 new_outputs = 0; + for (i64 group_outputs : slice_group_outputs) { + new_outputs += group_outputs; + } + slice_group_outputs = {new_outputs}; + } + // Track size of output domain for this Op for use in boundary condition + // check + job_total_rows_per_op[op_idx] = slice_group_outputs; + + for (i64 child_op_idx : op_children.at(op_idx)) { + op_num_inputs.at(child_op_idx).push_back(slice_group_outputs); + // Check if Op has all of its inputs. If so, add to ready stack + if (op_num_inputs.at(child_op_idx).size() == + ops[child_op_idx].inputs_size()) { + ready_ops.push_back(child_op_idx); + } + } + } + if (!success) { + // This should never happen... 
+ assert(false); + } + } + return result; +} + +std::tuple determine_stencil_bounds( + const std::vector& ops) { + i64 min = std::numeric_limits::max(); + i64 max = std::numeric_limits::min(); + + OpRegistry* op_registry = get_op_registry(); + // Skip input and output table ops + for (size_t i = 0; i < ops.size() - 1; ++i) { + auto& op = ops[i]; + const auto& op_info = op_registry->get_op_info(op.name()); + + std::vector stencil; + if (op.stencil_size() > 0) { + stencil = std::vector(op.stencil().begin(), op.stencil().end()); + } else { + stencil = op_info->preferred_stencil(); + } + + min = std::min((i64)stencil[0], min); + max = std::max((i64)stencil[stencil.size() - 1], max); + } + + return std::make_tuple(min, max); +} + +Result derive_slice_final_output_rows( + const proto::Job& job, const std::vector& ops, i64 slice_op_idx, + i64 slice_input_rows, DAGAnalysisInfo& info, + std::vector& slice_output_partition) { + Result result; + result.set_success(true); + // First create partitioner to determine slice groups + Partitioner* partitioner = nullptr; + proto::SamplingArgs args; + { + bool found = false; + for (auto& saa : job.sampling_args_assignment()) { + if (saa.op_index() == slice_op_idx) { + args = saa.sampling_args(0); + found = true; + } + } + assert(found); + } + result = make_partitioner_instance( + args.sampling_function(), + std::vector(args.sampling_args().begin(), args.sampling_args().end()), + slice_input_rows, partitioner); + if (!result.success()) { + return result; + } + // Traverse down graph from each slice group and count the number of rows + // produced. 
This is the partition offset that we will use to split the graph + std::vector& slice_rows = slice_output_partition; + slice_rows.push_back(0); + i64 current_offset = 0; + for (size_t i = 0; i < partitioner->total_groups(); ++i) { + const PartitionGroup& g = partitioner->group_at(i); + // Traverse down all children until reaching the output + std::vector input_row_counts(ops.size()); + std::vector next_queue; + input_row_counts[slice_op_idx] = g.rows.size(); + next_queue.push_back(slice_op_idx); + while (!next_queue.empty()) { + i64 op_idx = next_queue.back(); + next_queue.pop_back(); + + auto& op = ops.at(op_idx); + // Check if sampling Op or unslice Op + i64 input_row_count = input_row_counts.at(op_idx); + i64 output_row_count = input_row_count; + if (op.name() == "Space" || op.name() == "SpaceFrame" || + op.name() == "Sample" || op.name() == "SampleFrame") { + proto::SamplingArgs sa; + for (auto& saa : job.sampling_args_assignment()) { + if (saa.op_index() == op_idx) { + sa = saa.sampling_args(i); + } + } + DomainSampler* sampler; + result = make_domain_sampler_instance( + sa.sampling_function(), std::vector(sa.sampling_args().begin(), + sa.sampling_args().end()), + sampler); + if (!result.success()) { + return result; + } + // Perform row count modification + result = + sampler->get_num_downstream_rows(input_row_count, output_row_count); + delete sampler; + if (!result.success()) { + return result; + } + } + else if (op.name() == UNSLICE_OP_NAME) { + } + else if (op.name() == OUTPUT_OP_NAME) { + // We are done + current_offset += input_row_count; + slice_rows.push_back(current_offset); + break; + } + + for (i64 cid : info.op_children.at(op_idx)) { + input_row_counts.at(cid) = output_row_count; + next_queue.push_back(cid); + } + } + } + assert(slice_rows.size() == partitioner->total_groups() + 1); + return result; +} + +void populate_analysis_info(const std::vector& ops, + DAGAnalysisInfo& info) { + std::vector& op_slice_level = info.op_slice_level; + std::map& 
input_ops = info.input_ops; + std::map& slice_ops = info.slice_ops; + std::map& unslice_ops = info.unslice_ops; + std::map& sampling_ops = info.sampling_ops; + std::map>& op_children = info.op_children; + std::map& bounded_state_ops = info.bounded_state_ops; + std::map& unbounded_state_ops = info.unbounded_state_ops; + + std::map& warmup_sizes = info.warmup_sizes; + std::map& batch_sizes = info.batch_sizes; + std::map>& stencils = info.stencils; + + // Validate ops + OpRegistry* op_registry = get_op_registry(); + KernelRegistry* kernel_registry = get_kernel_registry(); + + i32 op_idx = 0; + // Keep track of op names and outputs for verifying that requested + // edges between ops are valid + std::vector> op_outputs; + // Slices are currently restricted to not nest and there to only exist + // a single slice grouping from start to finish currently. + std::vector op_names; + for (auto& op : ops) { + op_names.push_back(op.name()); + + // Input Op's output is defined by the input table column they sample + if (op.name() == INPUT_OP_NAME) { + op_outputs.emplace_back(); + op_outputs.back().push_back(op.inputs(0).column()); + size_t input_ops_size = input_ops.size(); + input_ops[op_idx] = input_ops_size; + op_slice_level.push_back(0); + op_idx++; + continue; + } + + // Verify the inputs for this Op + i32 input_count = op.inputs().size(); + i32 input_slice_level = 0; + if (input_count > 0) { + input_slice_level = op_slice_level.at(op.inputs(0).op_index()); + } + for (auto& input : op.inputs()) { + // Keep op children info for later analysis + op_children[input.op_index()].push_back(op_idx); + } + + // Slice + int output_slice_level = input_slice_level; + if (op.name() == SLICE_OP_NAME) { + assert(output_slice_level == 0); + size_t slice_ops_size = slice_ops.size(); + slice_ops[op_idx] = slice_ops_size; + op_outputs.emplace_back(); + for (auto& input : op.inputs()) { + op_outputs.back().push_back(input.column()); + } + output_slice_level += 1; + } + // Unslice + else if 
(op.name() == UNSLICE_OP_NAME) { + assert(input_slice_level > 0); + size_t unslice_ops_size = unslice_ops.size(); + unslice_ops[op_idx] = unslice_ops_size; + op_outputs.emplace_back(); + for (auto& input : op.inputs()) { + op_outputs.back().push_back(input.column()); + } + output_slice_level -= 1; + } + // Sample & Space + else if (op.name() == "Sample" || op.name() == "SampleFrame" || + op.name() == "Space" || op.name() == "SpaceFrame") { + size_t sampling_ops_size = sampling_ops.size(); + sampling_ops[op_idx] = sampling_ops_size; + op_outputs.emplace_back(); + for (auto& input : op.inputs()) { + op_outputs.back().push_back(input.column()); + } + } + // Output + else if (op.name() == OUTPUT_OP_NAME) { + assert(input_slice_level == 0); + } + // Verify op exists and record outputs + else { + op_outputs.emplace_back(); + assert(op_registry->has_op(op.name())); + + // Keep track of op outputs for verifying dependent ops + for (auto& col : op_registry->get_op_info(op.name())->output_columns()) { + op_outputs.back().push_back(col.name()); + } + + assert(kernel_registry->has_kernel(op.name(), op.device_type())); + } + op_slice_level.push_back(output_slice_level); + + // Perform Op parameter verification (stenciling, batching, # inputs) + if (!is_builtin_op(op.name())) { + OpInfo* info = op_registry->get_op_info(op.name()); + KernelFactory* factory = + kernel_registry->get_kernel(op.name(), op.device_type()); + + // Use default batch if not specified + i32 batch_size = op.batch() != -1 + ? op.batch() + : factory->preferred_batch_size(); + batch_sizes[op_idx] = batch_size; + // Use default stencil if not specified + std::vector stencil; + if (op.stencil_size() > 0) { + stencil = std::vector(op.stencil().begin(), op.stencil().end()); + } else { + stencil = info->preferred_stencil(); + } + stencils[op_idx] = stencil; + if (info->has_bounded_state()) { + bounded_state_ops[op_idx] = true; + warmup_sizes[op_idx] = op.warmup() != -1 ? 
op.warmup() : info->warmup(); + } + else if (info->has_unbounded_state()) { + unbounded_state_ops[op_idx] = true; + } + } + op_idx++; + } +} + +void remap_input_op_edges(std::vector& ops, + DAGAnalysisInfo& info) { + auto rename_col = [](i32 op_idx, const std::string& n) { + return std::to_string(op_idx) + "_" + n; + }; + auto& remap_map = info.input_ops_to_first_op_columns; + { + auto first_op_input = ops.at(0).mutable_inputs(0); + first_op_input->set_column(rename_col(0, first_op_input->column())); + remap_map[0] = 0; + } + for (size_t op_idx = 1; op_idx < ops.size(); ++op_idx) { + auto& op = ops.at(op_idx); + // If input Op, add column to original input Op and get rid of existing + // column + if (op.name() == INPUT_OP_NAME) { + remap_map[op_idx] = ops.at(0).inputs_size(); + + std::string new_column_name = + rename_col(op_idx, op.inputs(0).column()); + proto::OpInput* new_input = ops.at(0).add_inputs(); + new_input->set_op_index(-1); + new_input->set_column(new_column_name); + + ops.at(op_idx).clear_inputs(); + } + // Remap all inputs to input Ops to the first Op + for (size_t i = 0; i < op.inputs_size(); ++i) { + auto input = op.mutable_inputs(i); + i32 input_op_idx = input->op_index(); + if (remap_map.count(input_op_idx) > 0) { + input->set_op_index(0); + input->set_column(rename_col(input_op_idx, input->column())); + } + } + } +} + +void perform_liveness_analysis(const std::vector& ops, + DAGAnalysisInfo& results) { + const std::map& bounded_state_ops = results.bounded_state_ops; + const std::map& unbounded_state_ops = results.unbounded_state_ops; + const std::map& warmup_sizes = results.warmup_sizes; + const std::map& batch_sizes = results.batch_sizes; + const std::map>& stencils = results.stencils; + + std::vector>>& live_columns = + results.live_columns; + std::vector>& dead_columns = results.dead_columns; + std::vector>& unused_outputs = results.unused_outputs; + std::vector>& column_mapping = results.column_mapping; + + // Start off with the columns from 
the gathered tables + OpRegistry* op_registry = get_op_registry(); + KernelRegistry* kernel_registry = get_kernel_registry(); + // Active intermediates + std::map>> intermediates; + { + auto& input_op = ops.at(0); + for (const auto& col : input_op.inputs()) { + const std::string& input_col = col.column(); + // Set last used to first op so that all input ops are live to start + // with. We could eliminate input columns which aren't used, but this + // also requires modifying the samples. + intermediates[0].push_back(std::make_tuple(input_col, 1)); + } + } + for (size_t i = 1; i < ops.size(); ++i) { + auto& op = ops.at(i); + // For each input, update the intermediate last used index to the + // current index + for (auto& eval_input : op.inputs()) { + i32 parent_index = eval_input.op_index(); + const std::string& parent_col = eval_input.column(); + bool found = false; + for (auto& kv : intermediates.at(parent_index)) { + if (std::get<0>(kv) == parent_col) { + found = true; + std::get<1>(kv) = i; + break; + } + } + assert(found); + } + if (op.name() == OUTPUT_OP_NAME) { + continue; + } + // Add this op's outputs to the intermediate list + if (is_builtin_op(op.name())) { + // Make sure it is initialized even if no inputs + intermediates[i] = {}; + for (auto& input : op.inputs()) { + std::string col = input.column(); + // HACK(apoms): we remap input column names but don't update + // the downstream column. A better solution would be to + // explicitly enumerate the output column names during the initial + // dag analysis and keep it around. 
+ if (ops.at(input.op_index()).name() == INPUT_OP_NAME) { + col = col.substr(col.find("_") + 1); + } + intermediates[i].push_back(std::make_tuple(col, i)); + } + } else { + const auto& op_info = op_registry->get_op_info(op.name()); + for (const auto& output_column : op_info->output_columns()) { + intermediates[i].push_back(std::make_tuple(output_column.name(), i)); + } + } + } + + // The live columns at each op index + live_columns.resize(ops.size()); + for (size_t i = 0; i < ops.size(); ++i) { + i32 op_index = i; + auto& columns = live_columns[i]; + size_t max_i = std::min((size_t)(ops.size() - 2), i); + for (size_t j = 0; j <= max_i; ++j) { + for (auto& kv : intermediates.at(j)) { + i32 last_used_index = std::get<1>(kv); + if (last_used_index > op_index) { + // Last used index is greater than current index, so still live + columns.push_back(std::make_tuple((i32)j, std::get<0>(kv))); + } + } + } + } + + // The columns to remove for the current kernel + dead_columns.resize(ops.size()); + // Outputs from the current kernel that are not used + unused_outputs.resize(ops.size()); + // Indices in the live columns list that are the inputs to the current + // kernel. + column_mapping.resize(ops.size()); + for (size_t i = 1; i < ops.size(); ++i) { + i32 op_index = i; + auto& prev_columns = live_columns[i - 1]; + auto& op = ops.at(op_index); + // Determine which columns are no longer live + { + auto& unused = unused_outputs[i]; + auto& dead = dead_columns[i]; + // For all parent Ops, check if we are the last Op to use + // their output column + size_t max_i = std::min((size_t)(ops.size() - 2), (size_t)i); + for (size_t j = 0; j <= max_i; ++j) { + i32 parent_index = j; + // For the current parent Op, check if we are the last to use + // any of its outputs + for (auto& kv : intermediates.at(j)) { + i32 last_used_index = std::get<1>(kv); + if (last_used_index == op_index) { + // We are the last to use the Op column. + // Column is no longer live, so remove it. 
+ const std::string& col_name = std::get<0>(kv); + if (j == i) { + // This column was produced by the current Op but not used + i32 col_index = -1; + const std::vector& op_cols = + op_registry->get_op_info(op.name())->output_columns(); + for (size_t k = 0; k < op_cols.size(); k++) { + if (col_name == op_cols[k].name()) { + col_index = k; + break; + } + } + assert(col_index != -1); + unused.push_back(col_index); + } else { + // This column was produced by a previous Op + // Determine where in the previous live columns list this + // column existed + i32 col_index = -1; + for (i32 k = 0; k < (i32)prev_columns.size(); ++k) { + const std::tuple& live_input = + prev_columns[k]; + if (parent_index == std::get<0>(live_input) && + col_name == std::get<1>(live_input)) { + col_index = k; + break; + } + } + assert(col_index != -1); + dead.push_back(col_index); + } + } + } + } + } + // For each input to the Op, determine where in the live column list + // that input is + auto& mapping = column_mapping.at(op_index); + for (const auto& eval_input : op.inputs()) { + i32 parent_index = eval_input.op_index(); + const std::string& col = eval_input.column(); + i32 col_index = -1; + for (i32 k = 0; k < (i32)prev_columns.size(); ++k) { + const std::tuple& live_input = prev_columns[k]; + if (parent_index == std::get<0>(live_input) && + col == std::get<1>(live_input)) { + col_index = k; + break; + } + } + assert(col_index != -1); + mapping.push_back(col_index); + } + } +} + +Result derive_stencil_requirements( + const DatabaseMetadata& meta, const TableMetaCache& table_meta, + const proto::Job& job, const std::vector& ops, + const DAGAnalysisInfo& analysis_results, + proto::BulkJobParameters::BoundaryCondition boundary_condition, + i64 table_id, i64 job_idx, i64 task_idx, + const std::vector& output_rows, LoadWorkEntry& output_entry, + std::deque& task_streams) { + const std::map>& stencils = analysis_results.stencils; + const std::vector>>& live_columns = + 
analysis_results.live_columns; + + output_entry.set_table_id(table_id); + output_entry.set_job_index(job_idx); + output_entry.set_task_index(task_idx); + + i64 num_ops = ops.size(); + + const std::map>& job_slice_output_rows = + analysis_results.slice_output_rows.at(job_idx); + const std::map>& job_unslice_input_rows = + analysis_results.unslice_input_rows.at(job_idx); + const std::map& bounded_state_ops = + analysis_results.bounded_state_ops; + const std::map& unbounded_state_ops = + analysis_results.unbounded_state_ops; + const std::map& warmup_sizes = analysis_results.warmup_sizes; + // Create domain samplers + // Op -> slice + std::map>> domain_samplers; + for (const proto::SamplingArgsAssignment& saa : + job.sampling_args_assignment()) { + std::vector>& samplers = + domain_samplers[saa.op_index()]; + // Assign number of rows to correct op + if (ops.at(saa.op_index()).name() != SLICE_OP_NAME) { + for (auto& sa : saa.sampling_args()) { + DomainSampler* sampler; + Result result = make_domain_sampler_instance( + sa.sampling_function(), + std::vector(sa.sampling_args().begin(), + sa.sampling_args().end()), + sampler); + if (!result.success()) { + return result; + } + samplers.emplace_back(sampler); + } + } + } + + // Associate input ops with table ids + std::vector table_ids(job.inputs_size()); + std::vector column_ids(job.inputs_size()); + for (auto& col_input : job.inputs()) { + i32 col_idx = + analysis_results.input_ops_to_first_op_columns.at(col_input.op_index()); + table_ids[col_idx] = meta.get_table_id(col_input.table_name()); + column_ids[col_idx] = table_meta.at(col_input.table_name()) + .column_id(col_input.column_name()); + } + + // Compute the required rows for each kernel based on the stencil, sampling + // operations, and slice operations. 
+ // For each Op, determine the set of rows needed in the live columns list + // and the set of rows to feed to the Op at the current column mapping + // Op -> Rows + std::vector> required_output_rows_at_op(ops.size()); + std::vector> required_input_rows_at_op(ops.size()); + // Track inputs for ecah column of the input Op since different rnput Op + // colums may correspond to different tables and conservatively requesting + // all rows could cause an invalid access + std::vector> required_input_op_output_rows; + required_input_op_output_rows.resize(ops.at(0).inputs_size()); + std::vector> required_input_op_input_rows; + required_input_op_input_rows.resize(ops.at(0).inputs_size()); + assert(ops.at(0).inputs_size() == job.inputs_size()); + // HACK(apoms): we currently propagate this boundary condition upward, + // but that would technically cause the upstream sequence to have more + // elements than required. Should we stop the boundary condition at the Op + // by deduplication? + auto handle_boundary = [boundary_condition]( + const std::vector& downstream_rows, i64 max_rows, + std::vector& bounded_rows) { + // Handle rows which touch boundaries + for (size_t i = 0; i < downstream_rows.size(); ++i) { + i64 r = downstream_rows[i]; + if (r < 0 || r >= max_rows) { + switch (boundary_condition) { + case proto::BulkJobParameters::REPEAT_EDGE: { + r = (r < 0) ? 
0 : max_rows - 1; + break; + } + case proto::BulkJobParameters::REPEAT_NULL: { + r = -1; + break; + } + case proto::BulkJobParameters::ERROR: { + Result result; + RESULT_ERROR(&result, "Boundary error."); + return result; + } + } + } + bounded_rows.push_back(r); + } + Result result; + result.set_success(true); + return result; + }; + // Walk up the Ops to derive upstream rows + i32 slice_group = 0; + { + // Initialize output rows + required_output_rows_at_op.at(num_ops - 1) = + std::set(output_rows.begin(), output_rows.end()); + // For each kernel, derive the minimal required upstream elements + for (i64 op_idx = num_ops - 1; op_idx >= 0; --op_idx) { + auto& op = ops.at(op_idx); + std::vector downstream_rows( + required_output_rows_at_op.at(op_idx).begin(), + required_output_rows_at_op.at(op_idx).end()); + std::sort(downstream_rows.begin(), downstream_rows.end()); + std::vector compute_rows; + // Determine which upstream rows are needed for the requested output rows + std::vector new_rows; + // Input Op + if (op.name() == INPUT_OP_NAME) { + // Ignore if it is not the first input + if (op_idx == 0) { + // Determine input table this column came from + for (size_t i = 0; i < table_ids.size(); ++i) { + i32 table_id = table_ids[i]; + std::vector output_rows( + required_input_op_output_rows.at(i).begin(), + required_input_op_output_rows.at(i).end()); + std::sort(output_rows.begin(), output_rows.end()); + std::vector& input_rows = required_input_op_input_rows.at(i); + i64 num_rows = table_meta.at(table_id).num_rows(); + + // Perform boundary restriction + Result result = handle_boundary(output_rows, num_rows, input_rows); + if (!result.success()) { + return result; + } + } + } + } + // Sample or Space Op + else if (op.name() == SAMPLE_OP_NAME) { + // Use domain sampler + i32 slice = 0; + if (analysis_results.op_slice_level.at(op_idx) > 0) { + assert(slice_group != -1); + slice = slice_group; + } + Result result = domain_samplers.at(op_idx) + .at(slice) + 
->get_upstream_rows(downstream_rows, new_rows); + if (!result.success()) { + return result; + } + } + // Space Op + else if (op.name() == SPACE_OP_NAME) { + // Use domain sampler + i32 slice = 0; + if (analysis_results.op_slice_level.at(op_idx) > 0) { + assert(slice_group != -1); + slice = slice_group; + } + Result result = domain_samplers.at(op_idx).at(slice)->get_upstream_rows( + downstream_rows, new_rows); + if (!result.success()) { + return result; + } + } + // Slice Op + else if (op.name() == SLICE_OP_NAME) { + // We know which slice group we are in already from the unslice + // HACK(apoms): we currently restrict pipelines such that slices + // can be computed entirely independently and choose output rows + // that do not cross state boundaries to make it possible to assume + // that all rows are in the same slice + assert(slice_group != -1); + + const auto& slice_output_counts = job_slice_output_rows.at(op_idx); + i64 offset = 0; + for (i64 i = 0; i < slice_group; ++i) { + offset += slice_output_counts.at(i); + } + + i64 rows_in_group = slice_output_counts.at(slice_group); + // Perform boundary restriction + std::vector bounded_rows; + Result result = + handle_boundary(downstream_rows, rows_in_group, bounded_rows); + if (!result.success()) { + return result; + } + + // Remap row indices + for (i64 r : bounded_rows) { + new_rows.push_back(r + offset); + } + } + // Unslice Op + else if (op.name() == UNSLICE_OP_NAME) { + // Determine which slices we are in and propagate those rows upwards + // HACK(apoms): we currently restrict pipelines such that slices + // can be computed entirely independently and choose output rows + // that do not cross state boundaries to make it possible to assume + // that all rows are in the same slice + i64 downstream_min = downstream_rows[0]; + i64 downstream_max = downstream_rows[downstream_rows.size() - 1]; + const auto& unslice_input_counts = job_unslice_input_rows.at(op_idx); + i64 offset = 0; + slice_group = 0; + bool found = 
false; + for (; slice_group < unslice_input_counts.size(); ++slice_group) { + if (downstream_min >= offset && + downstream_max < offset + unslice_input_counts.at(slice_group)) { + found = true; + break; + } + offset += unslice_input_counts.at(slice_group); + } + assert(found); + // Remap row indices + for (i64 r : downstream_rows) { + new_rows.push_back(r - offset); + } + } + // Output Op + else if (op.name() == OUTPUT_OP_NAME) { + new_rows = downstream_rows; + } + // Regular Op + else { + assert(!is_builtin_op(op.name())); + std::unordered_set current_rows; + current_rows.reserve(downstream_rows.size()); + // If bounded state, we need to handle warmup + if (bounded_state_ops.count(op_idx) > 0) { + i32 warmup = warmup_sizes.at(op_idx); + for (i64 r : downstream_rows) { + // Check that we have all warmup rows + for (i64 i = 0; i <= warmup; ++i) { + i64 req_row = r - i; + if (req_row < 0) { + continue; + } + current_rows.insert(req_row); + } + } + } + // If unbounded state, we need all upstream inputs from 0 + else if (unbounded_state_ops.count(op_idx) > 0) { + i32 max_required_row = downstream_rows.back(); + for (i64 i = 0; i <= max_required_row; ++i) { + current_rows.insert(i); + } + } else { + current_rows.insert(downstream_rows.begin(), downstream_rows.end()); + } + compute_rows = std::vector(current_rows.begin(), + current_rows.end()); + std::sort(compute_rows.begin(), compute_rows.end()); + + // Ensure we have inputs for stenciling kernels + std::unordered_set stencil_rows; + const std::vector& stencil = stencils.at(op_idx); + for (i64 r : current_rows) { + for (i64 s : stencil) { + stencil_rows.insert(r + s); + } + } + new_rows = std::vector(stencil_rows.begin(), stencil_rows.end()); + std::sort(new_rows.begin(), new_rows.end()); + } + + required_input_rows_at_op.at(op_idx) = new_rows; + // Input Op inputs do not connect to any other Ops + if (op.name() != INPUT_OP_NAME) { + for (auto& input : op.inputs()) { + if (input.op_index() == 0) { + // For the input 
Op, we track each input column separately since + // they may come from different tables + i64 col_id = -1; + for (size_t i = 0; i < ops.at(0).inputs_size(); ++i) { + const auto& col = ops.at(0).inputs(i); + if (col.column() == input.column()) { + col_id = i; + break; + } + } + assert(col_id != -1); + required_input_op_output_rows.at(col_id).insert(new_rows.begin(), + new_rows.end()); + } + auto& input_outputs = required_output_rows_at_op.at(input.op_index()); + input_outputs.insert(new_rows.begin(), new_rows.end()); + } + } + + if (compute_rows.empty()) { + compute_rows = new_rows; + } + + TaskStream s; + s.slice_group = slice_group; + s.valid_input_rows = new_rows; + s.compute_input_rows = compute_rows; + s.valid_output_rows = downstream_rows; + task_streams.push_front(s); + } + } + + // Get rid of input stream since this is already captured by the load samples + task_streams.pop_front(); + + for (size_t i = 0; i < table_ids.size(); ++i) { + auto out_sample = output_entry.add_samples(); + out_sample->set_table_id(table_ids[i]); + out_sample->set_column_id(column_ids[i]); + google::protobuf::RepeatedField input_data( + required_input_op_input_rows.at(i).begin(), + required_input_op_input_rows.at(i).end()); + out_sample->mutable_input_row_ids()->Swap(&input_data); + google::protobuf::RepeatedField output_data( + required_input_op_output_rows.at(i).begin(), + required_input_op_output_rows.at(i).end()); + out_sample->mutable_output_row_ids()->Swap(&output_data); + } + Result result; + result.set_success(true); + return result; +} + +} +} diff --git a/scanner/engine/dag_analysis.h b/scanner/engine/dag_analysis.h new file mode 100644 index 00000000..d94a37eb --- /dev/null +++ b/scanner/engine/dag_analysis.h @@ -0,0 +1,128 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "scanner/engine/metadata.h" +#include "scanner/engine/table_meta_cache.h" +#include "scanner/engine/runtime.h" + +#include + +namespace scanner { +namespace internal { + +const std::string INPUT_OP_NAME = "Input"; +const std::string OUTPUT_OP_NAME = "OutputTable"; +const std::string SAMPLE_OP_NAME = "Sample"; +const std::string SPACE_OP_NAME = "Space"; +const std::string SLICE_OP_NAME = "Slice"; +const std::string UNSLICE_OP_NAME = "Unslice"; + +const std::vector BUILTIN_OP_NAMES = { + INPUT_OP_NAME, + OUTPUT_OP_NAME, + SAMPLE_OP_NAME, + SPACE_OP_NAME, + SLICE_OP_NAME, + UNSLICE_OP_NAME, +}; + +bool is_builtin_op(const std::string& name); + +struct DAGAnalysisInfo { + std::vector op_slice_level; + std::map input_ops; + std::map slice_ops; + std::map unslice_ops; + std::map sampling_ops; + std::map> op_children; + + // Input rows to slice Ops per Job + std::vector> slice_input_rows; + // Job -> Op -> Slice + std::vector>> slice_output_rows; + // Input rows to unslice Ops per Job + // Job -> Op -> Slice + std::vector>> unslice_input_rows; + // Total rows for each ops domain + // Job -> Op -> Slice + std::vector>> total_rows_per_op; + // Total output rows per Job + std::vector total_output_rows; + + std::map bounded_state_ops; + std::map unbounded_state_ops; + std::map warmup_sizes; + std::map batch_sizes; + std::map> stencils; + + // Filled in by remap_input_op_edges + std::map input_ops_to_first_op_columns; + + // Op -> Columns + std::vector>> live_columns; + std::vector> dead_columns; + std::vector> unused_outputs; + std::vector> 
column_mapping; +}; + + +Result validate_jobs_and_ops( + DatabaseMetadata& meta, TableMetaCache& table_metas, + const std::vector& jobs, + const std::vector& ops, + DAGAnalysisInfo& info); + +Result determine_input_rows_to_slices( + DatabaseMetadata& meta, TableMetaCache& table_metas, + const std::vector& jobs, + const std::vector& ops, + DAGAnalysisInfo& info); + +Result derive_slice_final_output_rows( + const proto::Job& job, + const std::vector& ops, + i64 slice_op_idx, + i64 slice_input_rows, + DAGAnalysisInfo& info, + std::vector& slice_output_partition); + +void populate_analysis_info(const std::vector& ops, + DAGAnalysisInfo& info); + +// Change all edges from input Ops to instead come from the first Op. +// We currently only implement IO at the start and end of a pipeline. +void remap_input_op_edges(std::vector& ops, + DAGAnalysisInfo& info); + +void perform_liveness_analysis(const std::vector& ops, + DAGAnalysisInfo& info); + +Result derive_stencil_requirements( + const DatabaseMetadata& meta, const TableMetaCache& table_meta, + const proto::Job& job, const std::vector& ops, + const DAGAnalysisInfo& analysis_results, + proto::BulkJobParameters::BoundaryCondition boundary_condition, + i64 table_id, i64 job_idx, i64 task_idx, + const std::vector& output_rows, LoadWorkEntry& output_entry, + std::deque& task_streams); + +// Result derive_input_rows_from_output_rows( +// const std::vector& jobs, +// const std::vector& ops, +// const std::vector>& output_rows, +// DAGAnalysisInfo& info, +// std::vector>& input_rows); +} +} diff --git a/scanner/engine/db.cpp b/scanner/engine/db.cpp deleted file mode 100644 index 802dca49..00000000 --- a/scanner/engine/db.cpp +++ /dev/null @@ -1,418 +0,0 @@ -/* Copyright 2016 Carnegie Mellon University - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "scanner/engine/db.h" -#include "scanner/engine/runtime.h" -#include "scanner/util/storehouse.h" -#include "scanner/util/util.h" -#include "storehouse/storage_backend.h" - -#include -#include -#include -#include -#include -#include /* PATH_MAX */ -#include -#include -#include /* mkdir(2) */ - -using storehouse::WriteFile; -using storehouse::RandomReadFile; -using storehouse::StoreResult; - -namespace scanner { -using namespace proto; - -namespace internal { - -template <> std::string Metadata::descriptor_path() const { - const DatabaseMetadata *meta = (const DatabaseMetadata *)this; - return database_metadata_path(); -} - -template <> std::string Metadata::descriptor_path() const { - const VideoMetadata *meta = (const VideoMetadata *)this; - return table_item_video_metadata_path(meta->table_id(), meta->column_id(), - meta->item_id()); -} - -template <> std::string Metadata::descriptor_path() const { - const JobMetadata *meta = (const JobMetadata *)this; - return job_descriptor_path(meta->id()); -} - -template <> std::string Metadata::descriptor_path() const { - const TableMetadata *meta = (const TableMetadata *)this; - return table_descriptor_path(meta->id()); -} - -DatabaseMetadata::DatabaseMetadata() : next_table_id_(0), next_job_id_(0) {} - -DatabaseMetadata::DatabaseMetadata(const DatabaseDescriptor &d) - : Metadata(d), next_table_id_(d.next_table_id()), - next_job_id_(d.next_job_id()) { - for (int i = 0; i < descriptor_.tables_size(); ++i) { - const DatabaseDescriptor::Table &table = descriptor_.tables(i); - 
table_id_names_.insert({table.id(), table.name()}); - } - for (int i = 0; i < descriptor_.jobs_size(); ++i) { - const DatabaseDescriptor_Job &job = descriptor_.jobs(i); - job_id_names_.insert({job.id(), job.name()}); - job_names_.push_back(job.name()); - } -} - -const DatabaseDescriptor &DatabaseMetadata::get_descriptor() const { - descriptor_.set_next_table_id(next_table_id_); - descriptor_.set_next_job_id(next_job_id_); - descriptor_.clear_tables(); - descriptor_.clear_jobs(); - - for (auto &kv : table_id_names_) { - auto table = descriptor_.add_tables(); - table->set_id(kv.first); - table->set_name(kv.second); - } - - for (auto &kv : job_id_names_) { - auto job = descriptor_.add_jobs(); - job->set_id(kv.first); - job->set_name(kv.second); - } - - return descriptor_; -} - -std::string DatabaseMetadata::descriptor_path() { - return database_metadata_path(); -} - -const std::vector DatabaseMetadata::table_names() const { - std::vector names; - for (auto &entry : table_id_names_) { - names.push_back(entry.second); - } - return names; -} - -bool DatabaseMetadata::has_table(const std::string &table) const { - for (const auto &kv : table_id_names_) { - if (kv.second == table) { - return true; - } - } - return false; -} - -bool DatabaseMetadata::has_table(i32 table_id) const { - return table_id_names_.count(table_id) > 0; -} - -i32 DatabaseMetadata::get_table_id(const std::string &table) const { - i32 id = -1; - for (const auto &kv : table_id_names_) { - if (kv.second == table) { - id = kv.first; - break; - } - } - LOG_IF(FATAL, id == -1) << "Table " << table << " does not exist."; - return id; -} - -const std::string &DatabaseMetadata::get_table_name(i32 table_id) const { - return table_id_names_.at(table_id); -} - -i32 DatabaseMetadata::add_table(const std::string &table) { - i32 table_id = -1; - if (!has_table(table)) { - table_id = next_table_id_++; - table_id_names_[table_id] = table; - } - return table_id; -} - -void DatabaseMetadata::remove_table(i32 table_id) { 
- assert(table_id_names_.count(table_id) > 0); - table_id_names_.erase(table_id); -} - -const std::vector &DatabaseMetadata::job_names() const { - return job_names_; -} - -bool DatabaseMetadata::has_job(const std::string &job) const { - for (const auto &kv : job_id_names_) { - if (kv.second == job) { - return true; - } - } - return false; -} - -bool DatabaseMetadata::has_job(i32 job_id) const { - return job_id_names_.count(job_id) > 0; -} - -i32 DatabaseMetadata::get_job_id(const std::string &job) const { - i32 job_id = -1; - for (const auto &kv : job_id_names_) { - if (kv.second == job) { - job_id = kv.first; - break; - } - } - assert(job_id != -1); - return job_id; -} - -const std::string &DatabaseMetadata::get_job_name(i32 job_id) const { - return job_id_names_.at(job_id); -} - -i32 DatabaseMetadata::add_job(const std::string &job_name) { - i32 job_id = next_job_id_++; - job_id_names_[job_id] = job_name; - return job_id; -} - -void DatabaseMetadata::remove_job(i32 job_id) { - assert(job_id_names_.count(job_id) > 0); - job_id_names_.erase(job_id); -} - -/////////////////////////////////////////////////////////////////////////////// -/// VideoMetdata -VideoMetadata::VideoMetadata() {} - -VideoMetadata::VideoMetadata(const VideoDescriptor &descriptor) - : Metadata(descriptor) {} - -std::string VideoMetadata::descriptor_path(i32 table_id, i32 column_id, - i32 item_id) { - return table_item_video_metadata_path(table_id, column_id, item_id); -} - -i32 VideoMetadata::table_id() const { return descriptor_.table_id(); } - -i32 VideoMetadata::column_id() const { return descriptor_.column_id(); } - -i32 VideoMetadata::item_id() const { return descriptor_.item_id(); } - -i32 VideoMetadata::frames() const { return descriptor_.frames(); } - -i32 VideoMetadata::width() const { return descriptor_.width(); } - -i32 VideoMetadata::height() const { return descriptor_.height(); } - -std::vector VideoMetadata::keyframe_positions() const { - return 
std::vector(descriptor_.keyframe_positions().begin(), - descriptor_.keyframe_positions().end()); -} - -std::vector VideoMetadata::keyframe_byte_offsets() const { - return std::vector(descriptor_.keyframe_byte_offsets().begin(), - descriptor_.keyframe_byte_offsets().end()); -} - -/////////////////////////////////////////////////////////////////////////////// -/// ImageFormatGroupMetadata -ImageFormatGroupMetadata::ImageFormatGroupMetadata() {} - -ImageFormatGroupMetadata::ImageFormatGroupMetadata( - const ImageFormatGroupDescriptor &descriptor) - : Metadata(descriptor) {} - -i32 ImageFormatGroupMetadata::num_images() const { - return descriptor_.num_images(); -} - -i32 ImageFormatGroupMetadata::width() const { return descriptor_.width(); } - -i32 ImageFormatGroupMetadata::height() const { return descriptor_.height(); } - -ImageEncodingType ImageFormatGroupMetadata::encoding_type() const { - return descriptor_.encoding_type(); -} - -ImageColorSpace ImageFormatGroupMetadata::color_space() const { - return descriptor_.color_space(); -} - -std::vector ImageFormatGroupMetadata::compressed_sizes() const { - return std::vector(descriptor_.compressed_sizes().begin(), - descriptor_.compressed_sizes().end()); -} - -/////////////////////////////////////////////////////////////////////////////// -/// JobMetadata -JobMetadata::JobMetadata() {} -JobMetadata::JobMetadata(const JobDescriptor &job) : Metadata(job) { - for (auto &c : descriptor_.columns()) { - columns_.push_back(c); - column_ids_.insert({c.name(), c.id()}); - } - for (auto &t : descriptor_.tasks()) { - table_names_.push_back(t.output_table_name()); - } -} - -std::string JobMetadata::descriptor_path(i32 job_id) { - return job_descriptor_path(job_id); -} - -i32 JobMetadata::id() const { return descriptor_.id(); } - -std::string JobMetadata::name() const { return descriptor_.name(); } - -i32 JobMetadata::work_item_size() const { return descriptor_.work_item_size(); } - -i32 JobMetadata::num_nodes() const { return 
descriptor_.num_nodes(); } - -const std::vector &JobMetadata::columns() const { return columns_; } - -i32 JobMetadata::column_id(const std::string &column_name) const { - column_ids_.at(column_name); -} - -const std::vector &JobMetadata::table_names() const { - return table_names_; -} - -bool JobMetadata::has_table(const std::string &name) const { - for (const std::string &n : table_names_) { - if (n == name) { - return true; - } - } - return false; -} - -// i64 JobMetadata::rows_in_table(const std::string &name) const { -// i64 rows = -1; -// auto it = rows_in_table_.find(name); -// if (it == rows_in_table_.end()) { -// for (const proto::Task &task : descriptor_.tasks()) { -// assert(task.samples_size() > 0); -// const proto::TableSample &sample = task.samples(0); -// rows = sample.rows_size(); -// rows_in_table_.insert(std::make_pair(name, rows)); -// } -// } else { -// rows = it->second; -// } -// assert(rows != -1); -// return rows; -// } - -// i64 JobMetadata::total_rows() const { -// i64 rows = 0; -// for (const proto::Task &task : descriptor_.tasks()) { -// assert(task.samples_size() > 0); -// const proto::TableSample &sample = task.samples(0); -// rows += sample.rows_size(); -// } -// return rows; -// } - -/////////////////////////////////////////////////////////////////////////////// -/// TableMetadata -TableMetadata::TableMetadata() {} -TableMetadata::TableMetadata(const TableDescriptor &table) : Metadata(table) { - for (auto &c : descriptor_.columns()) { - columns_.push_back(c); - } -} - -std::string TableMetadata::descriptor_path(i32 table_id) { - return table_descriptor_path(table_id); -} - -i32 TableMetadata::id() const { return descriptor_.id(); } - -std::string TableMetadata::name() const { return descriptor_.name(); } - -i64 TableMetadata::num_rows() const { - return descriptor_.end_rows(descriptor_.end_rows_size() - 1); -} - -std::vector TableMetadata::end_rows() const { - return std::vector(descriptor_.end_rows().begin(), - 
descriptor_.end_rows().end()); -} - -const std::vector &TableMetadata::columns() const { return columns_; } - -std::string TableMetadata::column_name(i32 column_id) const { - for (auto &c : descriptor_.columns()) { - if (c.id() == column_id) { - return c.name(); - } - } - LOG(FATAL) << "Column id " << column_id << " not found!"; -} - -i32 TableMetadata::column_id(const std::string &column_name) const { - for (auto &c : descriptor_.columns()) { - if (c.name() == column_name) { - return c.id(); - } - } - LOG(FATAL) << "Column name " << column_name << " not found!"; -} - -ColumnType TableMetadata::column_type(i32 column_id) const { - for (auto &c : descriptor_.columns()) { - if (c.id() == column_id) { - return c.type(); - } - } - LOG(FATAL) << "Column id " << column_id << " not found!"; -} - -namespace { -std::string &get_database_path_ref() { - static std::string prefix = ""; - return prefix; -} -} - -const std::string &get_database_path() { - std::atomic_thread_fence(std::memory_order_acquire); - return get_database_path_ref(); -} - -void set_database_path(std::string path) { - VLOG(1) << "Setting DB path to " << path; - get_database_path_ref() = path + "/"; - std::atomic_thread_fence(std::memory_order_release); -} - -void write_new_table(storehouse::StorageBackend *storage, - DatabaseMetadata &meta, TableMetadata &table) { - VLOG(1) << "Writing new table " << table.name() << "..." << std::endl; - TableDescriptor &table_desc = table.get_descriptor(); - i32 table_id = meta.add_table(table.name()); - table_desc.set_id(table_id); - - write_table_metadata(storage, table); - write_database_metadata(storage, meta); - VLOG(1) << "Finished writing new table " << table.name() << "." 
- << std::endl; -} -} -} diff --git a/scanner/engine/evaluate_worker.cpp b/scanner/engine/evaluate_worker.cpp index 710aca10..7e8e427b 100644 --- a/scanner/engine/evaluate_worker.cpp +++ b/scanner/engine/evaluate_worker.cpp @@ -1,488 +1,1199 @@ #include "scanner/engine/evaluate_worker.h" #include "scanner/engine/op_registry.h" -#include "scanner/video/decoder_automata.h" +#include "scanner/engine/dag_analysis.h" +#include "scanner/util/cuda.h" -#include #include +#include #include namespace scanner { namespace internal { -namespace { -void move_if_different_address_space(Profiler &profiler, - DeviceHandle current_handle, - DeviceHandle target_handle, - RowList &column) { - if (!current_handle.is_same_address_space(target_handle)) { - std::vector dest_buffers, src_buffers; - std::vector sizes; - - size_t total_size = 0; - for (i32 b = 0; b < (i32)column.rows.size(); ++b) { - total_size += column.rows[b].size; - } - if (column.rows.size() > 0) { - u8 *block = - new_block_buffer(target_handle, total_size, column.rows.size()); - for (i32 b = 0; b < (i32)column.rows.size(); ++b) { - size_t size = column.rows[b].size; - dest_buffers.push_back(block); - block += size; - src_buffers.push_back(column.rows[b].buffer); - sizes.push_back(size); - } - - auto memcpy_start = now(); - memcpy_vec(dest_buffers, target_handle, src_buffers, current_handle, - sizes); - profiler.add_interval("memcpy", memcpy_start, now()); - - auto delete_start = now(); - for (i32 b = 0; b < (i32)column.rows.size(); ++b) { - delete_buffer(current_handle, column.rows[b].buffer); - column.rows[b].buffer = dest_buffers[b]; - } - } - } -} -} - -void move_if_different_address_space(Profiler &profiler, - DeviceHandle current_handle, - DeviceHandle target_handle, - BatchedColumns &columns) { - for (i32 i = 0; i < (i32)columns.size(); ++i) { - RowList &column = columns[i]; - move_if_different_address_space(profiler, current_handle, target_handle, - column); - } +PreEvaluateWorker::PreEvaluateWorker(const 
PreEvaluateWorkerArgs& args) + : node_id_(args.node_id), + worker_id_(args.worker_id), + device_handle_(args.device_handle), + num_cpus_(args.num_cpus), + profiler_(args.profiler) { } -void *pre_evaluate_thread(void *arg) { - PreEvaluateThreadArgs &args = *reinterpret_cast(arg); +void PreEvaluateWorker::feed(EvalWorkEntry& work_entry, bool first) { + auto feed_start = now(); - i64 work_item_size = args.job_params->work_item_size(); + entry_ = work_entry; - i32 last_table_id = -1; - i32 last_end_row = -1; - i32 last_item_id = -1; + needs_configure_ = !(work_entry.job_index == last_job_idx_); + needs_reset_ = true; - DeviceHandle decoder_output_handle; - std::vector> decoders; - while (true) { - auto idle_start = now(); - // Wait for next work item to process + last_job_idx_ = work_entry.job_index; - std::tuple entry; - args.input_work.pop(entry); - IOItem& io_item = std::get<0>(entry); - EvalWorkEntry& work_entry = std::get<1>(entry); - if (work_entry.io_item_index == -1) { - break; - } - - VLOG(1) << "Pre-evaluate (N/KI: " << args.node_id << "/" << args.id << "): " - << "processing item " << work_entry.io_item_index; - - args.profiler.add_interval("idle", idle_start, now()); - - auto work_start = now(); + // Split up a work entry into work item size chunks + total_rows_ = 0; + for (size_t i = 0; i < work_entry.columns.size(); ++i) { + total_rows_ = + std::max(total_rows_, (i64)work_entry.row_ids[i].size()); + } - bool needs_configure = !(io_item.table_id() == last_table_id); - bool needs_reset = true; - // NOTE(apoms): for avoiding warmup - // needs_configure || !(io_item.item_id() == last_item_id || - // (io_item.table_id() == last_table_id && - // io_item.start_row() == last_end_row)); + // FIXME: do we need this w/ multiple videos of different resolutions in the + // same task? 
+ if (needs_configure_) { + // decoders_.clear(); + } - last_table_id = io_item.table_id(); - last_end_row = io_item.end_row(); - last_item_id = io_item.item_id(); + // Setup decoders if they have not been initialized yet + i32 media_col_idx = 0; + if (decoders_.empty()) { + auto init_start = now(); + VideoDecoderType decoder_type; + i32 num_devices; + // Select a decoder type based on the type of the first op and + // the available decoders + if (device_handle_.type == DeviceType::GPU && + VideoDecoder::has_decoder_type(VideoDecoderType::NVIDIA)) { + decoder_output_handle_.type = DeviceType::GPU; + decoder_output_handle_.id = device_handle_.id; + decoder_type = VideoDecoderType::NVIDIA; + num_devices = 1; + } else { + decoder_output_handle_ = CPU_DEVICE; + decoder_type = VideoDecoderType::SOFTWARE; + num_devices = num_cpus_; + } + for (size_t c = 0; c < work_entry.columns.size(); ++c) { + if (work_entry.column_types[c] == ColumnType::Video && + work_entry.video_encoding_type[media_col_idx] == + proto::VideoDescriptor::H264) { + if (work_entry.inplace_video[c]) { + hwang::DeviceHandle hd; + switch (device_handle_.type) { + case DeviceType::CPU: + hd.type = hwang::DeviceType::CPU; + break; + case DeviceType::GPU: + hd.type = hwang::DeviceType::GPU; + break; + default: + std::abort(); + } + hd.id = device_handle_.id; - // Split up a work entry into work item size chunks - i64 total_rows = io_item.end_row() - io_item.start_row(); + hwang::VideoDecoderType vd; + switch (decoder_type) { + case VideoDecoderType::SOFTWARE: + vd = hwang::VideoDecoderType::SOFTWARE; + break; + case VideoDecoderType::NVIDIA: + vd = hwang::VideoDecoderType::NVIDIA; + break; + default: + std::abort(); + } - if (needs_configure) { - //decoders.clear(); + inplace_decoders_.emplace_back( + new hwang::DecoderAutomata(hd, num_devices, vd)); + //decoders_.back()->set_profiler(&profiler_); + decoders_.emplace_back(nullptr); + } else { + decoders_.emplace_back( + new DecoderAutomata(device_handle_, 
num_devices, decoder_type)); + decoders_.back()->set_profiler(&profiler_); + inplace_decoders_.emplace_back(nullptr); + } + media_col_idx++; + } } + profiler_.add_interval("init", init_start, now()); + } - // Setup decoders if they have not been initialized yet - if (decoders.empty()) { - auto init_start = now(); - VideoDecoderType decoder_type; - i32 num_devices; - // Select a decoder type based on the type of the first op and - // the available decoders - if (args.device_handle.type == DeviceType::GPU && - VideoDecoder::has_decoder_type(VideoDecoderType::NVIDIA)) { - decoder_output_handle.type = DeviceType::GPU; - decoder_output_handle.id = args.device_handle.id; - decoder_type = VideoDecoderType::NVIDIA; - num_devices = 1; + media_col_idx = 0; + auto setup_start = now(); + // Deserialize all decode args into protobufs + decode_args_.clear(); + for (size_t c = 0; c < work_entry.columns.size(); ++c) { + if (work_entry.column_types[c] == ColumnType::Video && + work_entry.video_encoding_type[media_col_idx] == + proto::VideoDescriptor::H264) { + decode_args_.emplace_back(); + auto& args = decode_args_.back(); + for (Element element : work_entry.columns[c]) { + args.emplace_back(); + proto::DecodeArgs& da = args.back(); + google::protobuf::io::ArrayInputStream in_stream(element.buffer, + element.size); + google::protobuf::io::CodedInputStream cstream(&in_stream); + cstream.SetTotalBytesLimit(element.size + 1, element.size + 1); + bool result = da.ParseFromCodedStream(&cstream); + assert(result); + delete_element(CPU_DEVICE, element); + } + + if (!work_entry.inplace_video[c]) { + decoders_[media_col_idx]->initialize(args); } else { - decoder_output_handle = CPU_DEVICE; - decoder_type = VideoDecoderType::SOFTWARE; - num_devices = args.num_cpus; - } - for (size_t c = 0; c < work_entry.columns.size(); ++c) { - if (work_entry.column_types[c] == ColumnType::Video) { - decoders.emplace_back(new DecoderAutomata(args.device_handle, - num_devices, decoder_type)); + // Translate 
into encoded data + std::vector encoded_data; + for (auto &da : args) { + encoded_data.emplace_back(); + hwang::DecoderAutomata::EncodedData& ed = encoded_data.back(); + u8* video_data = reinterpret_cast(da.encoded_video()); + size_t video_data_size = da.encoded_video_size(); + ed.encoded_video = + std::vector(video_data, video_data + video_data_size); + ed.width = da.width(); + ed.height = da.height(); + ed.start_keyframe = da.start_keyframe(); + ed.end_keyframe = da.end_keyframe(); + ed.sample_offsets = std::vector(da.sample_offsets().begin(), + da.sample_offsets().end()); + ed.sample_sizes = std::vector(da.sample_sizes().begin(), + da.sample_sizes().end()); + ed.keyframes = + std::vector(da.keyframes().begin(), da.keyframes().end()); + ed.valid_frames = std::vector(da.valid_frames().begin(), + da.valid_frames().end()); + } + if (args.size() > 0) { + std::vector metadata(args.back().metadata().begin(), + args.back().metadata().end()); + inplace_decoders_[media_col_idx]->initialize(encoded_data, metadata); } } - args.profiler.add_interval("init", init_start, now()); + media_col_idx++; } + } + first_item_ = first; + current_row_ = 0; + profiler_.add_interval("feed", feed_start, now()); +} - i32 media_col_idx = 0; - std::vector> decode_args; - bool first_item = true; - std::vector work_items; - auto setup_start = now(); - for (size_t c = 0; c < work_entry.columns.size(); ++c) { - if (work_entry.column_types[c] == ColumnType::Video) { - decode_args.emplace_back(); - auto &args = decode_args.back(); - for (Row row : work_entry.columns[c].rows) { - args.emplace_back(); - proto::DecodeArgs &da = args.back(); - google::protobuf::io::ArrayInputStream in_stream(row.buffer, - row.size); - google::protobuf::io::CodedInputStream cstream(&in_stream); - cstream.SetTotalBytesLimit(row.size + 1, row.size + 1); - bool result = da.ParseFromCodedStream(&cstream); - assert(result); - delete_buffer(CPU_DEVICE, row.buffer); +bool PreEvaluateWorker::yield(i32 item_size, + EvalWorkEntry& 
output_entry) { + if (current_row_ >= total_rows_) return false; + + auto yield_start = now(); + + EvalWorkEntry& work_entry = entry_; + + i64 start_row = current_row_; + i64 end_row = std::min(current_row_ + item_size, total_rows_); + + bool first_item = (start_row == 0); + i32 media_col_idx = 0; + EvalWorkEntry entry; + entry.table_id = work_entry.table_id; + entry.job_index = work_entry.job_index; + entry.task_index = work_entry.task_index; + entry.needs_configure = first_item ? needs_configure_ : false; + entry.needs_reset = first_item_ ? needs_reset_ : false; + entry.last_in_io_packet = (end_row >= total_rows_) ? true : false; + entry.columns.resize(work_entry.columns.size()); + entry.last_in_task = work_entry.last_in_task; + entry.row_ids.resize(work_entry.row_ids.size()); + + for (size_t c = 0; c < work_entry.columns.size(); ++c) { + i64 column_start_row = + std::min(start_row, (i64)work_entry.row_ids.at(c).size()); + i64 column_end_row = + std::min(end_row, (i64)work_entry.row_ids.at(c).size()); + if (work_entry.column_types[c] == ColumnType::Video) { + // Perform decoding + i64 num_rows = column_end_row - column_start_row; + if (work_entry.video_encoding_type[media_col_idx] == + proto::VideoDescriptor::H264) { + if (num_rows > 0) { + // Encoded as video + FrameInfo frame_info(decode_args_[media_col_idx][0].height(), + decode_args_[media_col_idx][0].width(), 3, + FrameType::U8); + u8* buffer = new_block_buffer(decoder_output_handle_, + num_rows * frame_info.size(), num_rows); + if (!work_entry.inplace_video[c]) { + decoders_[media_col_idx]->get_frames(buffer, num_rows); + } else { + inplace_decoders_[media_col_idx]->get_frames(buffer, num_rows); + } + for (i64 n = 0; n < num_rows; ++n) { + insert_frame(entry.columns[c], + new Frame(frame_info, buffer + frame_info.size() * n)); + } } - decoders[media_col_idx]->initialize(args); - media_col_idx++; - } - } - args.profiler.add_interval("setup", setup_start, now()); - - auto decode_start = now(); - for (i64 r = 
0; r < total_rows; r += work_item_size) { - media_col_idx = 0; - EvalWorkEntry entry; - entry.io_item_index = work_entry.io_item_index; - entry.needs_configure = first_item ? needs_configure : false; - entry.needs_reset = first_item ? needs_reset : false; - entry.last_in_io_item = (r + work_item_size >= total_rows) ? true : false; - entry.warmup_rows = work_entry.warmup_rows; - entry.columns.resize(work_entry.columns.size()); - for (size_t c = 0; c < work_entry.columns.size(); ++c) { - i64 start = r; - i64 end = std::min(r + work_item_size, total_rows); - if (work_entry.column_types[c] == ColumnType::Video) { - // Perform decoding - i64 num_rows = end - start; - size_t frame_size = decode_args[media_col_idx][0].width() * - decode_args[media_col_idx][0].height() * 3; - u8 *buffer = new_block_buffer(decoder_output_handle, - num_rows * frame_size, num_rows); - decoders[media_col_idx]->get_frames(buffer, num_rows); + entry.column_handles.push_back(decoder_output_handle_); + } else { + // Encoded as raw data + if (num_rows > 0) { + FrameInfo frame_info = work_entry.frame_sizes[media_col_idx]; for (i64 n = 0; n < num_rows; ++n) { - INSERT_ROW(entry.columns[c], buffer + frame_size * n, frame_size); + Element& e = work_entry.columns[c][column_start_row + n]; + assert(e.size == frame_info.size()); + insert_frame(entry.columns[c], new Frame(frame_info, e.buffer)); } - entry.column_handles.push_back(decoder_output_handle); - media_col_idx++; - } else { - entry.columns[c].rows = - std::vector(work_entry.columns[c].rows.begin() + start, - work_entry.columns[c].rows.begin() + end); - entry.column_handles.push_back(work_entry.column_handles[c]); } + entry.column_handles.push_back(work_entry.column_handles[c]); } - // Push entry to kernels - args.output_work.push(std::make_tuple(io_item, entry)); - first_item = false; + media_col_idx++; + } else { + entry.columns[c] = + std::vector(work_entry.columns[c].begin() + column_start_row, + work_entry.columns[c].begin() + column_end_row); 
+ entry.column_handles.push_back(work_entry.column_handles[c]); } - args.profiler.add_interval("decode", decode_start, now()); + entry.row_ids[c] = + std::vector(work_entry.row_ids[c].begin() + column_start_row, + work_entry.row_ids[c].begin() + column_end_row); } + profiler_.add_interval("yield", yield_start, now()); - VLOG(1) << "Pre-evaluate (N/PU: " << args.node_id << "/" << args.id - << "): thread finished "; - THREAD_RETURN_SUCCESS(); -} + current_row_ += item_size; -void *evaluate_thread(void *arg) { - EvaluateThreadArgs &args = *reinterpret_cast(arg); + output_entry = entry; + return true; +} +EvaluateWorker::EvaluateWorker(const EvaluateWorkerArgs& args) + : node_id_(args.node_id), + worker_id_(worker_id_), + profiler_(args.profiler), + arg_group_(args.arg_group) { auto setup_start = now(); - + for (auto& col : arg_group_.column_mapping) { + column_mapping_set_.emplace_back(col.begin(), col.end()); + } // Instantiate kernels - const std::vector> &dead_columns = args.dead_columns; - const std::vector> &unused_outputs = args.unused_outputs; - const std::vector> &column_mapping = args.column_mapping; - std::vector kernel_devices; - std::vector kernel_num_outputs; - std::vector> kernels; { - OpRegistry *registry = get_op_registry(); - for (size_t i = 0; i < args.kernel_factories.size(); ++i) { - KernelFactory *factory = std::get<0>(args.kernel_factories[i]); - const Kernel::Config &config = std::get<1>(args.kernel_factories[i]); - kernel_devices.push_back(config.devices[0]); - kernel_num_outputs.push_back(registry->get_op_info(factory->get_op_name()) - ->output_columns() - .size()); + OpRegistry* registry = get_op_registry(); + DeviceHandle last_device = CPU_DEVICE; + for (size_t i = 0; i < arg_group_.kernel_factories.size(); ++i) { + KernelFactory* factory = std::get<0>(arg_group_.kernel_factories[i]); + if (factory == nullptr) { + kernel_devices_.push_back(last_device); + kernel_input_devices_.push_back({last_device}); + 
kernel_output_devices_.push_back({last_device}); + kernel_num_outputs_.push_back(1); + kernels_.emplace_back(nullptr); + continue; + } + OpInfo* op_info = registry->get_op_info(factory->get_op_name()); + const KernelConfig& config = std::get<1>(arg_group_.kernel_factories[i]); + kernel_devices_.push_back(config.devices[0]); + kernel_input_devices_.emplace_back(); + if (op_info->variadic_inputs()) { + DeviceHandle handle = config.devices[0]; + for (int i = 0; i < config.input_columns.size(); ++i) { + kernel_input_devices_.back().push_back(handle); + } + } else { + const auto& input_devices = factory->get_input_devices(); + for (const auto& in_col : op_info->input_columns()) { + const auto& col_name = in_col.name(); + DeviceType type = config.devices[0].type; + if (input_devices.count(col_name)) { + type = input_devices.at(col_name); + } + kernel_input_devices_.back().push_back( + DeviceHandle{type, config.devices[0].id}); + } + } + kernel_output_devices_.emplace_back(); + { + const auto& output_devices = factory->get_output_devices(); + for (const auto& out_col : op_info->output_columns()) { + const auto& col_name = out_col.name(); + DeviceType type = config.devices[0].type; + if (output_devices.count(col_name)) { + type = output_devices.at(col_name); + } + kernel_output_devices_.back().push_back( + DeviceHandle{type, config.devices[0].id}); + } + } + last_device = config.devices[0]; + kernel_num_outputs_.push_back(op_info->output_columns().size()); + +#ifdef HAVE_CUDA + cudaSetDevice(0); +#endif auto kernel = factory->new_instance(config); kernel->validate(&args.result); VLOG(1) << "Kernel finished validation " << args.result.success(); if (!args.result.success()) { - VLOG(1) << "Kernel validate failed: " << args.result.msg(); + LOG(ERROR) << "Kernel validate failed: " << args.result.msg(); THREAD_RETURN_SUCCESS(); } - kernels.emplace_back(kernel); + kernels_.emplace_back(kernel); + } + } + assert(kernels_.size() > 0); + + for (auto& kernel : kernels_) { + if 
(kernel != nullptr) { + kernel->set_profiler(&args.profiler); } } - assert(kernels.size() > 0); + // Setup kernel cache sizes + element_cache_row_ids_.resize(kernels_.size()); + element_cache_.resize(kernels_.size()); + element_cache_devices_.resize(kernels_.size()); + for (size_t i = 0; i < kernels_.size(); ++i) { + // Resize stencil cache to be the same size as the number of inputs + // to the kernel + element_cache_[i].resize(arg_group_.column_mapping[i].size()); + element_cache_row_ids_[i].resize(arg_group_.column_mapping[i].size()); + } + valid_output_rows_.resize(kernels_.size()); + current_valid_input_idx_.resize(kernels_.size()); + current_valid_output_idx_.assign(kernels_.size(), 0); + + args.profiler.add_interval("setup", now(), setup_start); - for (auto &kernel : kernels) { - kernel->set_profiler(&args.profiler); + // Signal the main worker thread that we've finished startup + std::unique_lock lk(args.startup_lock); + args.startup_count += 1; + args.startup_cv.notify_one(); +} + +EvaluateWorker::~EvaluateWorker() { + // Clear the stencil cache + clear_stencil_cache(); +} + +void EvaluateWorker::new_task(i64 job_idx, i64 task_idx, + const std::vector& task_streams) { + job_idx_ = job_idx; + task_idx_ = task_idx; + for (size_t i = 0; i < task_streams.size(); ++i) { + for (i64 used_rows : current_valid_input_idx_[i]) { + assert(valid_input_rows_[i].size() == used_rows); + } } + valid_input_rows_.clear(); + valid_input_rows_set_.clear(); + current_valid_input_idx_.clear(); + + compute_rows_.clear(); + compute_rows_set_.clear(); + current_compute_idx_.clear(); - args.profiler.add_interval("setup", setup_start, now()); - - while (true) { - auto idle_start = now(); - // Wait for next work item to process - std::tuple entry; - args.input_work.pop(entry); - IOItem& io_item = std::get<0>(entry); - EvalWorkEntry& work_entry = std::get<1>(entry); - if (work_entry.io_item_index == -1) { - break; + valid_output_rows_.clear(); + valid_output_rows_set_.clear(); + 
current_valid_output_idx_.clear(); + + current_element_cache_input_idx_.clear(); + slice_group_ = -1; + for (size_t k = 0; k < task_streams.size(); ++k) { + auto& ts = task_streams[k]; + if (ts.slice_group != -1) { + slice_group_ = ts.slice_group; + } + valid_input_rows_.push_back(ts.valid_input_rows); + valid_input_rows_set_.push_back( + std::set(ts.valid_input_rows.begin(), ts.valid_input_rows.end())); + current_valid_input_idx_.emplace_back(); + for(i64 i = 0; i < arg_group_.column_mapping[k].size(); ++i) { + current_valid_input_idx_.back().push_back(0); } - VLOG(1) << "Evaluate (N/KI/G: " << args.node_id << "/" << args.ki << "/" - << args.kg << "): processing item " << work_entry.io_item_index; + compute_rows_.push_back(ts.compute_input_rows); + compute_rows_set_.push_back(std::set( + ts.compute_input_rows.begin(), ts.compute_input_rows.end())); + current_compute_idx_.push_back(0); + + valid_output_rows_.push_back(ts.valid_output_rows); + valid_output_rows_set_.push_back(std::set(ts.valid_output_rows.begin(), + ts.valid_output_rows.end())); + current_valid_output_idx_.push_back(0); + + current_element_cache_input_idx_.push_back(0); + } + + // Initialize domain samplers for this job and this slice + domain_samplers_.clear(); + for (auto& kv : arg_group_.sampling_args) { + i64 op_idx = kv.first; + i64 slice = 0; + if (arg_group_.sampling_args.at(op_idx).at(job_idx).size() > 1) { + slice = slice_group_; + } + auto& sampling_args = + arg_group_.sampling_args.at(op_idx).at(job_idx).at(slice); + DomainSampler* sampler = nullptr; + Result result = make_domain_sampler_instance( + sampling_args.sampling_function(), + std::vector(sampling_args.sampling_args().begin(), + sampling_args.sampling_args().end()), + sampler); + if (!result.success()) { + VLOG(1) << "Make domain sampler failed: " << result.msg(); + THREAD_RETURN_SUCCESS(); + } + domain_samplers_[op_idx].reset(sampler); + } + + // Make the op aware of the format of the data + for (auto& kernel : kernels_) { + if 
(kernel) { + kernel->reset(); + } + } + + final_output_handles_.clear();; + final_output_columns_.clear(); + final_row_ids_.clear(); + + clear_stencil_cache(); +} + +void EvaluateWorker::feed(EvalWorkEntry& work_entry) { + entry_ = work_entry; + + auto feed_start = now(); + + current_input_ = 0; + total_inputs_ = 0; + for (size_t i = 0; i < work_entry.columns.size(); ++i) { + total_inputs_ = // io_item.end_row - io_item.start_row; + std::max(total_inputs_, (i32)work_entry.columns[i].size()); + } + + std::vector side_output_handles = work_entry.column_handles; + BatchedColumns side_output_columns = work_entry.columns; + std::vector> side_row_ids = work_entry.row_ids; + + // For each kernel, produce as much output as can be produced given current + // input rows and stencil cache. + for (size_t k = 0; k < arg_group_.op_names.size(); ++k) { + const std::string& op_name = arg_group_.op_names.at(k); + DeviceHandle current_handle = kernel_devices_[k]; + const std::vector& current_input_handles = + kernel_input_devices_[k]; + const std::vector& current_output_handles = + kernel_output_devices_[k]; + + std::vector& kernel_valid_input_rows = valid_input_rows_[k]; + std::set& kernel_valid_input_rows_set = valid_input_rows_set_[k]; + std::vector& kernel_current_input_idx = current_valid_input_idx_[k]; + + std::vector& kernel_compute_rows = compute_rows_[k]; + i64& kernel_current_compute_idx = current_compute_idx_[k]; + + std::vector& kernel_valid_output_rows = valid_output_rows_[k]; + std::set& kernel_valid_output_rows_set = valid_output_rows_set_[k]; + i64& kernel_current_output_idx = current_valid_output_idx_[k]; + + i64& kernel_element_cache_input_idx = current_element_cache_input_idx_[k]; + std::vector>& kernel_cache = element_cache_[k]; + std::vector& kernel_cache_devices = element_cache_devices_[k]; + std::vector>& kernel_cache_row_ids = + element_cache_row_ids_[k]; + std::vector& input_column_idx = arg_group_.column_mapping[k]; + std::set& input_column_idx_set = 
column_mapping_set_[k]; + + // Since inputs can arrive at different rates, we need to keep + // inputs around until they have been used. + // Place all new input elements in side output columns into intermediate + // cache. If different device, move all required values in the side output + // columns to the proper device for this kernel + assert(op_name == INPUT_OP_NAME || + current_input_handles.size() == input_column_idx.size()); + if (kernel_cache_devices.empty()) { + for (i32 i = 0; i < input_column_idx.size(); ++i) { + kernel_cache_devices.push_back(current_input_handles[i]); + } + } + for (i32 i = 0; i < input_column_idx.size(); ++i) { + i32 in_col_idx = input_column_idx[i]; + assert(in_col_idx < side_output_columns.size()); + // Select elements which this kernel requires as inputs + auto& row_ids = side_row_ids[in_col_idx]; + ElementList valid_inputs; + i64& current_input_idx = kernel_current_input_idx[i]; + for (size_t r = 0; r < row_ids.size(); ++r) { + assert(current_input_idx >= kernel_valid_input_rows.size() || + row_ids[r] <= kernel_valid_input_rows[current_input_idx]); + if (current_input_idx < kernel_valid_input_rows.size() && + row_ids[r] == kernel_valid_input_rows[current_input_idx]) { + // Insert row ids for valid elements into cache + kernel_cache_row_ids[i].push_back(row_ids[r]); + Element element(side_output_columns[in_col_idx][r]); + // We provide the input index to the kernel so that it can detect + // non-consecutive elements + element.index = row_ids[r]; + valid_inputs.push_back(element); + current_input_idx++; + } + } + if (valid_inputs.size() > 0) { + auto copy_start = now(); + ElementList list = + copy_or_ref_elements(profiler_, side_output_handles[in_col_idx], + current_input_handles[i], valid_inputs); + profiler_.add_interval("op_marshal", copy_start, now()); + // Insert new elements into cache + kernel_cache[i].insert(kernel_cache[i].end(), list.begin(), list.end()); + } + } + // Determine the highest row seen so we know how many 
elements we + // might be able to produce + i64 max_row_id_seen = -1; + if (input_column_idx.size() > 0 && kernel_cache_row_ids[0].size() > 0) { + max_row_id_seen = kernel_cache_row_ids[0].back(); + for (i32 i = 1; i < input_column_idx.size(); ++i) { + max_row_id_seen = + std::min(max_row_id_seen, kernel_cache_row_ids[i].back()); + } + // Update current compute position + for (i64 i = 0; i < kernel_cache_row_ids[0].size(); ++i) { + i64 row_id = kernel_cache_row_ids[0][i]; + assert(kernel_current_compute_idx >= kernel_compute_rows.size() || + row_id <= kernel_compute_rows[kernel_current_compute_idx]); + if (kernel_current_compute_idx < kernel_compute_rows.size() && + row_id == kernel_compute_rows[kernel_current_compute_idx]) { + kernel_current_compute_idx++; + } + } + } - args.profiler.add_interval("idle", idle_start, now()); + // Figure out how many elements can be produced + auto compute_producible_elements = + [kernel_element_cache_input_idx, kernel_current_compute_idx, + &kernel_compute_rows, max_row_id_seen](i64 stencil, i64 batch) { + i64 producible_rows = 0; + for (i64 i = kernel_element_cache_input_idx; + i < kernel_current_compute_idx; ++i) { + i64 row = kernel_compute_rows[i]; + // Check if this row was seen by all inputs + if (row + stencil > max_row_id_seen) { + break; + } + producible_rows++; + } + i64 batch_over = producible_rows % batch; + return producible_rows - batch_over; + }; - auto work_start = now(); + // NOTE(apoms): the number of producible rows should be a multiple of the + // batch size (if not zero). If not, then this should be the last batch + // in the task we should add an assert to verify this is the case. 
+ i64 producible_elements = 0; + i32 num_output_columns = 0; + std::vector kernel_stencil; + if (is_builtin_op(op_name)) { + producible_elements = compute_producible_elements(0, 1); + num_output_columns = 1; + kernel_stencil = {0}; + if (op_name == INPUT_OP_NAME) { + num_output_columns = 0; + } + } else { + kernel_stencil = arg_group_.kernel_stencils[k]; + i32 kernel_batch_size = arg_group_.kernel_batch_sizes[k]; - // Make the op aware of the format of the data - if (work_entry.needs_reset) { - for (auto &kernel : kernels) { - kernel->reset(); + i64 bs = kernel_batch_size; + // If end of task, we set batch size to 1 to get all remaining elements + assert(kernel_current_input_idx.size() > 0); + i64 rows_left_in_task = + kernel_valid_input_rows.size() - kernel_current_input_idx[0]; + if (rows_left_in_task < kernel_batch_size) { + bs = 1; } + producible_elements = + compute_producible_elements(kernel_stencil.back(), bs); + + auto& unused_outputs = arg_group_.unused_outputs[k]; + num_output_columns = kernel_num_outputs_[k] - unused_outputs.size(); } - EvalWorkEntry output_work_entry; - output_work_entry.io_item_index = work_entry.io_item_index; - output_work_entry.needs_configure = work_entry.needs_configure; - output_work_entry.needs_reset = work_entry.needs_reset; - output_work_entry.last_in_io_item = work_entry.last_in_io_item; - output_work_entry.warmup_rows = work_entry.warmup_rows; - - BatchedColumns &work_item_output_columns = output_work_entry.columns; - std::vector &work_item_output_handles = - output_work_entry.column_handles; - i32 num_final_output_columns = 0; - - i32 current_input = 0; - i32 total_inputs = 0; - for (size_t i = 0; i < work_entry.columns.size(); ++i) { - total_inputs = // io_item.end_row - io_item.start_row; - std::max(total_inputs, (i32)work_entry.columns[i].rows.size()); + // Grab row ids corresponding to producible elements by walking through + // element cache + // NOTE(apoms): elements in kernel cache from each column should be the same 
+ // since the input domain for all inputs to a kernel must be the same + std::vector producible_row_ids( + kernel_compute_rows.begin() + kernel_element_cache_input_idx, + kernel_compute_rows.begin() + kernel_element_cache_input_idx + + producible_elements); + + { + // Get the output handles for only the columns that are used + std::vector used_output_column_handles; + auto& unused_outputs = arg_group_.unused_outputs[k]; + for (i32 c = 0; c < kernel_num_outputs_[k]; ++c) { + bool found = false; + for (int i = 0; i < unused_outputs.size(); ++i) { + if (c == unused_outputs[i]) { + found = true; + break; + } + } + if (!found) { + used_output_column_handles.push_back(current_output_handles[c]); + } + } + for (i32 c = 0; c < num_output_columns; ++c) { + side_output_handles.push_back(used_output_column_handles[c]); + side_output_columns.emplace_back(); + side_row_ids.emplace_back(); + } } - while (current_input < total_inputs) { - i32 batch_size = - std::min(total_inputs - current_input, args.job_params->work_item_size()); - - BatchedColumns side_input_columns; - DeviceHandle input_handle; - // Initialize the output buffers with the frame input because we - // perform a swap from output to input on each iterator to pass outputs - // from the previous op into the input of the next one - std::vector side_output_handles = work_entry.column_handles; - BatchedColumns side_output_columns; - side_output_columns.resize(work_entry.columns.size()); - for (size_t i = 0; i < work_entry.columns.size(); ++i) { - i32 batch = - std::min(batch_size, (i32)work_entry.columns[i].rows.size()); - assert(batch > 0); - side_output_columns[i].rows.insert( - side_output_columns[i].rows.end(), - work_entry.columns[i].rows.begin() + current_input, - work_entry.columns[i].rows.begin() + current_input + batch); - } - for (size_t k = 0; k < kernels.size(); ++k) { - DeviceHandle current_handle = kernel_devices[k]; - std::unique_ptr &kernel = kernels[k]; - i32 num_outputs = kernel_num_outputs[k]; - // 
Map from previous output columns to the set of input columns needed - // by the kernel - BatchedColumns input_columns; - for (i32 in_col_idx : column_mapping[k]) { - assert(in_col_idx < side_output_columns.size()); - - // If current op type and input buffer type differ, then move - // the data in the input buffer into a new buffer which has the same - // type as the op input - auto copy_start = now(); - move_if_different_address_space( - args.profiler, side_output_handles[in_col_idx], current_handle, - side_output_columns[in_col_idx]); - side_output_handles[in_col_idx] = current_handle; - - input_handle = current_handle; - args.profiler.add_interval("op_marshal", copy_start, now()); - - input_columns.push_back(side_output_columns[in_col_idx]); + if (op_name == INPUT_OP_NAME) { + // Should ignore it since we remapped inputs + } else if (op_name == SAMPLE_OP_NAME) { + // Filter and remap row ids + auto& sampler = domain_samplers_.at(k); + // For each available input, check if it maps to a valid downstream + // element + std::vector downstream_rows; + std::vector downstream_upstream_mapping; + Result result = sampler->get_downstream_rows( + producible_row_ids, downstream_rows, downstream_upstream_mapping); + if (!result.success()) { + VLOG(1) << "Sampler failed: " << result.msg(); + THREAD_RETURN_SUCCESS(); + } + + // Pass down rows and ref elements + auto& output_column = side_output_columns.back(); + for (size_t i = 0; i < downstream_rows.size(); ++i) { + i64 upstream_row_idx = downstream_upstream_mapping[i]; + auto& element = *(kernel_cache.at(0).begin() + upstream_row_idx); + Element ele = add_element_ref(current_handle, element); + output_column.push_back(ele); + } + side_row_ids.back() = downstream_rows; + } else if (op_name == SPACE_OP_NAME) { + // Space and remap row ids + auto& sampler = domain_samplers_.at(k); + std::vector downstream_rows; + std::vector downstream_upstream_mapping; + Result result = sampler->get_downstream_rows( + producible_row_ids, 
downstream_rows, downstream_upstream_mapping); + if (!result.success()) { + VLOG(1) << "Sampler failed: " << result.msg(); + THREAD_RETURN_SUCCESS(); + } + // For each available input, expand it by placing nulls or repeats + auto& output_column = side_output_columns.back(); + for (size_t i = 0; i < downstream_rows.size(); ++i) { + i64 upstream_row_idx = downstream_upstream_mapping[i]; + if (upstream_row_idx == -1) { + // Put null element + output_column.emplace_back(); + } else { + auto& element = *(kernel_cache.at(0).begin() + upstream_row_idx); + Element ele = add_element_ref(current_handle, element); + output_column.push_back(ele); + } + } + side_row_ids.back() = downstream_rows; + } else if (op_name == SLICE_OP_NAME) { + // Remap row ids from original domain to sub domain + const auto& slice_output_counts = + arg_group_.slice_output_rows.at(k).at(job_idx_); + i64 offset = 0; + for (i64 i = 0; i < slice_group_; ++i) { + offset += slice_output_counts.at(i); + } + // For each row id, remap it and keep output element the same + auto& output_column = side_output_columns.back(); + auto& output_row_ids = side_row_ids.back(); + for (size_t i = 0; i < producible_row_ids.size(); ++i) { + output_row_ids.push_back(producible_row_ids[i] - offset); + auto& element = *(kernel_cache.at(0).begin() + i); + Element ele = add_element_ref(current_handle, element); + output_column.push_back(ele); + } + } else if (op_name == UNSLICE_OP_NAME) { + // Remap row ids from sub domain to original domain + const auto& unslice_input_counts = + arg_group_.unslice_input_rows.at(k).at(job_idx_); + i64 offset = 0; + for (i64 i = 0; i < slice_group_; ++i) { + offset += unslice_input_counts.at(i); + } + // For each row id, remap it and keep output element the same + auto& output_column = side_output_columns.back(); + auto& output_row_ids = side_row_ids.back(); + for (size_t i = 0; i < producible_row_ids.size(); ++i) { + output_row_ids.push_back(producible_row_ids[i] + offset); + auto& element = 
*(kernel_cache.at(0).begin() + i); + Element ele = add_element_ref(current_handle, element); + output_column.push_back(ele); + } + } else { + assert(!is_builtin_op(op_name)); + // If a regular kernel + std::unique_ptr& kernel = kernels_[k]; + i32 kernel_batch_size = arg_group_.kernel_batch_sizes[k]; + i64 row_start = kernel_element_cache_input_idx; + i64 row_end = row_start + producible_elements; + + for (i32 start = row_start; start < row_end; start += kernel_batch_size) { + i32 batch = std::min((i64)kernel_batch_size, row_end - start); + i32 end = start + batch; + // Stage inputs to the kernel using the stencil cache + StenciledBatchedColumns input_columns(input_column_idx.size()); + // For each column + // NOTE(apoms): choosing the first columns row ids is fine because all + // input row ids for each column should be the same since all inputs + // must have the same domain + auto& cache_row_deque = kernel_cache_row_ids[0]; + for (size_t i = 0; i < input_column_idx.size(); ++i) { + auto& cache_deque = kernel_cache[i]; + auto& col = input_columns[i]; + col.resize(batch); + // For each batch element + for (i64 r = start; r < end; ++r) { + auto& input_stencil = col[r - start]; + i64 last_cache_element = 0; + // Place elements in "stencil" dimension of input columns + i64 curr_row = kernel_compute_rows[r]; + for (i64 s : kernel_stencil) { + i64 desired_row = curr_row + s; + // Search for desired stencil element + bool found = false; + for (; last_cache_element < cache_row_deque.size(); + ++last_cache_element) { + i64 cache_row_id = cache_row_deque[last_cache_element]; + if (desired_row == cache_row_id) { + input_stencil.push_back(cache_deque[last_cache_element]); + found = true; + break; + } + } + assert(found); + } + assert(input_stencil.size() == kernel_stencil.size()); + } } // Setup output buffers to receive op output - DeviceHandle output_handle = current_handle; BatchedColumns output_columns; - output_columns.resize(num_outputs); + 
output_columns.resize(num_output_columns); + // Map from previous output columns to the set of input columns needed + // by the kernel auto eval_start = now(); - kernel->execute(input_columns, output_columns); - args.profiler.add_interval("evaluate", eval_start, now()); - // Delete unused outputs - for (size_t y = 0; y < unused_outputs[k].size(); ++y) { - i32 unused_col_idx = unused_outputs[k][unused_outputs[k].size() - 1 - y]; - RowList &column = output_columns[unused_col_idx]; - for (Row &row : column.rows) { - u8 *buff = row.buffer; - delete_buffer(current_handle, buff); + kernel->execute_kernel(input_columns, output_columns); + profiler_.add_interval("evaluate:" + op_name, eval_start, now()); + + // Delete unused output columns + auto& unused_outputs = arg_group_.unused_outputs[k]; + for (size_t y = 0; y < unused_outputs.size(); ++y) { + i32 unused_col_idx = + unused_outputs[unused_outputs.size() - 1 - y]; + ElementList& column = output_columns[unused_col_idx]; + for (Element& element : column) { + delete_element(current_output_handles[unused_col_idx], element); } output_columns.erase(output_columns.begin() + unused_col_idx); } + // Verify the kernel produced the correct amount of output for (size_t i = 0; i < output_columns.size(); ++i) { - LOG_IF(FATAL, output_columns[i].rows.size() != batch_size) - << "Op " << k << " produced " - << output_columns[i].rows.size() << " output rows for column " - << i << ". Expected " << batch_size << " outputs."; + LOG_IF(FATAL, output_columns[i].size() != batch) + << "Op " << k << " produced " << output_columns[i].size() + << " output elements for column " << i << ". 
Expected " << batch + << " outputs."; } - // Delete dead columns - for (size_t y = 0; y < dead_columns[k].size(); ++y) { - i32 dead_col_idx = dead_columns[k][dead_columns[k].size() - 1 - y]; - RowList &column = side_output_columns[dead_col_idx]; - for (Row &row : column.rows) { - u8 *buff = row.buffer; - delete_buffer(side_output_handles[dead_col_idx], buff); + + // Add new output columns + for (size_t cidx = 0; cidx < output_columns.size(); ++cidx) { + const ElementList& column = output_columns[cidx]; + i32 col_idx = side_output_columns.size() - num_output_columns + cidx; + side_output_columns[col_idx].insert( + side_output_columns[col_idx].end(), column.begin(), column.end()); + auto& output_row_ids = side_row_ids[col_idx]; + output_row_ids.insert( + output_row_ids.end(), + producible_row_ids.begin() + start - row_start, + producible_row_ids.begin() + start - row_start + batch); + } + } + } + + i64 row_start = kernel_element_cache_input_idx; + i64 row_end = row_start + producible_elements; + // Filter outputs to only the ones that will be used downstream + // For each output row, check if it is in the valid output rows + if (num_output_columns > 0) { + BatchedColumns temp_output_columns(num_output_columns); + std::vector> temp_row_ids(num_output_columns); + + // For each column, transfer all valid rows to temp output, deleting all + // the non valid rows, and then swap the temp rows into the side + // output columns + i32 first_col_idx = side_output_columns.size() - num_output_columns; + for (i64 row_start = 0; + row_start < side_output_columns[first_col_idx].size(); ++row_start) { + assert(!side_row_ids[first_col_idx].empty()); + // assert(side_row_ids[first_col_idx][row_start] <= + // kernel_valid_output_rows[kernel_current_output_idx]); + if (kernel_current_output_idx < kernel_valid_output_rows.size() && + side_row_ids[first_col_idx][row_start] == + kernel_valid_output_rows[kernel_current_output_idx]) { + i64 next_row = 
kernel_valid_output_rows[kernel_current_output_idx]; + // Is a valid row, so keep + for (i64 i = 0; i < num_output_columns; ++i) { + i32 col_idx = side_output_columns.size() - num_output_columns + i; + auto& element = side_output_columns[col_idx][row_start]; + temp_output_columns[i].push_back(element); + temp_row_ids[i].push_back(next_row); + } + kernel_current_output_idx++; + } else { + // Is not a valid row, so delete + for (i64 i = 0; i < num_output_columns; ++i) { + i32 col_idx = side_output_columns.size() - num_output_columns + i; + auto& element = side_output_columns[col_idx][row_start]; + delete_element(side_output_handles[col_idx], element); } - side_output_columns.erase(side_output_columns.begin() + dead_col_idx); - side_output_handles.erase(side_output_handles.begin() + dead_col_idx); } - // Add new output columns - for (const RowList &column : output_columns) { - side_output_columns.push_back(column); - side_output_handles.push_back(current_handle); + } + for (i64 i = 0; i < num_output_columns; ++i) { + i32 col_idx = side_output_columns.size() - num_output_columns + i; + side_output_columns[col_idx].swap(temp_output_columns[i]); + side_row_ids[col_idx].swap(temp_row_ids[i]); + } + } + + // Remove elements from the element cache we won't access anymore + if (kernel_valid_input_rows.size() > 0) { + i64 last_cache_element = 0; + i64 min_used_row = kernel_valid_input_rows[std::min( + row_end, (i64)kernel_valid_input_rows.size() - 1)]; + min_used_row += kernel_stencil[0]; + { + auto& row_id_deque = kernel_cache_row_ids[0]; + while (row_id_deque.size() > 0) { + i64 cache_row = row_id_deque.front(); + if (cache_row < min_used_row) { + for (auto& deqs : kernel_cache_row_ids) { + deqs.pop_front(); + } + for (size_t i = 0; i < kernel_cache.size(); ++i) { + auto device = kernel_cache_devices[i]; + auto& cache_deque = kernel_cache[i]; + assert(cache_deque.size() > 0); + Element element = cache_deque.front(); + delete_element(device, element); + 
cache_deque.pop_front(); + } + } else { + break; + } } + kernel_element_cache_input_idx += producible_elements; } - if (work_item_output_columns.size() == 0) { - num_final_output_columns = side_output_columns.size(); - work_item_output_columns.resize(side_output_columns.size()); - work_item_output_handles = side_output_handles; - } - assert(num_final_output_columns == side_output_columns.size()); - for (i32 i = 0; i < num_final_output_columns; ++i) { - i32 num_output_rows = - static_cast(side_output_columns[i].rows.size()); - work_item_output_columns[i].rows.insert( - work_item_output_columns[i].rows.end(), - side_output_columns[i].rows.begin(), - side_output_columns[i].rows.end()); - } - current_input += batch_size; } - args.profiler.add_interval("task", work_start, now()); + // Remove dead columns from side_output_handles + // TODO(apoms): move this to before the Op eval + auto& dead_columns = arg_group_.dead_columns[k]; + for (size_t y = 0; y < dead_columns.size(); ++y) { + i32 dead_col_idx = dead_columns[dead_columns.size() - 1 - y]; + ElementList& column = side_output_columns[dead_col_idx]; + for (Element& element : column) { + delete_element(side_output_handles[dead_col_idx], element); + } + side_output_columns.erase(side_output_columns.begin() + dead_col_idx); + side_output_handles.erase(side_output_handles.begin() + dead_col_idx); + side_row_ids.erase(side_row_ids.begin() + dead_col_idx); + } + // Delete elements from stencil cache that will no longer be used + } + + final_output_handles_ = side_output_handles; + if (final_output_columns_.size() == 0) { + final_output_columns_.resize(side_output_columns.size()); + final_row_ids_.resize(side_output_columns.size()); + } + for (size_t i = 0; i < side_output_columns.size(); ++i) { + final_output_columns_[i].insert(final_output_columns_[i].end(), + side_output_columns[i].begin(), + side_output_columns[i].end()); + } + for (size_t i = 0; i < side_output_columns.size(); ++i) { + 
final_row_ids_[i].insert(final_row_ids_[i].end(), + side_row_ids[i].begin(), + side_row_ids[i].end()); + } + + profiler_.add_interval("feed", feed_start, now()); +} + +bool EvaluateWorker::yield(i32 item_size, EvalWorkEntry& output_entry) { + EvalWorkEntry& work_entry = entry_; + + auto yield_start = now(); - VLOG(1) << "Evaluate (N/KI/G: " << args.node_id << "/" << args.ki << "/" - << args.kg << "): finished item " << work_entry.io_item_index; + EvalWorkEntry output_work_entry; + output_work_entry.table_id = work_entry.table_id; + output_work_entry.job_index = work_entry.job_index; + output_work_entry.task_index = work_entry.task_index; + output_work_entry.needs_configure = work_entry.needs_configure; + output_work_entry.needs_reset = work_entry.needs_reset; + output_work_entry.last_in_io_packet = work_entry.last_in_io_packet; + output_work_entry.last_in_task = work_entry.last_in_task; - args.output_work.push(std::make_tuple(io_item, output_work_entry)); + BatchedColumns& work_item_output_columns = output_work_entry.columns; + std::vector& work_item_output_handles = + output_work_entry.column_handles; + std::vector>& work_item_row_ids = + output_work_entry.row_ids; + i32 num_final_output_columns = 0; + num_final_output_columns = final_output_columns_.size(); + work_item_output_columns.resize(num_final_output_columns); + work_item_output_handles = final_output_handles_; + work_item_row_ids.resize(num_final_output_columns); + + for (i32 i = 0; i < num_final_output_columns; ++i) { + work_item_output_columns[i].insert(work_item_output_columns[i].end(), + final_output_columns_[i].begin(), + final_output_columns_[i].end()); + work_item_row_ids[i].insert(work_item_row_ids[i].end(), + final_row_ids_[i].begin(), + final_row_ids_[i].end()); + final_output_columns_[i].clear(); + final_row_ids_[i].clear(); } - VLOG(1) << "Evaluate (N/KI: " << args.node_id << "/" << args.ki - << "): thread finished"; + output_entry = output_work_entry; + + profiler_.add_interval("yield", 
yield_start, now()); - THREAD_RETURN_SUCCESS(); + return true; } -void *post_evaluate_thread(void *arg) { - PostEvaluateThreadArgs &args = - *reinterpret_cast(arg); - std::set column_set(args.column_mapping.begin(), - args.column_mapping.end()); - - EvalWorkEntry buffered_entry; - i64 current_offset = 0; - while (true) { - auto idle_start = now(); - // Wait for next work item to process - std::tuple entry; - args.input_work.pop(entry); - IOItem& io_item = std::get<0>(entry); - EvalWorkEntry& work_entry = std::get<1>(entry); - - if (work_entry.io_item_index == -1) { - break; +void EvaluateWorker::clear_stencil_cache() { + for (size_t k = 0; k < kernels_.size(); ++k) { + std::vector& kernel_stencil = arg_group_.kernel_stencils[k]; + bool degenerate_stencil = + (kernel_stencil.size() == 1 && kernel_stencil[0] == 0); + std::vector>& kernel_cache = element_cache_[k]; + std::vector& kernel_cache_devices = element_cache_devices_[k]; + std::vector>& kernel_cache_row_ids = + element_cache_row_ids_[k]; + auto& input_column_idx = arg_group_.column_mapping[k]; + for (i32 i = 0; i < input_column_idx.size(); ++i) { + auto& row_id_deque = kernel_cache_row_ids[i]; + row_id_deque.clear(); + auto& cache_deque = kernel_cache[i]; + for (i64 j = 0; j < cache_deque.size(); ++j) { + assert(!kernel_cache_devices.empty()); + Element element = cache_deque.back(); + delete_element(kernel_cache_devices[i], element); + cache_deque.pop_back(); + } } + } +} - VLOG(1) << "Post-evaluate (N/PU: " << args.node_id << "/" << args.id - << "): processing item " << work_entry.io_item_index; +PostEvaluateWorker::PostEvaluateWorker(const PostEvaluateWorkerArgs& args) + : profiler_(args.profiler), + column_mapping_(args.column_mapping), + columns_(args.columns), + column_set_(args.column_mapping.begin(), args.column_mapping.end()) { + assert(args.column_mapping.size() == args.columns.size()); - args.profiler.add_interval("idle", idle_start, now()); + encoder_handle_ = CPU_DEVICE; + encoder_type_ = 
VideoEncoderType::SOFTWARE; - auto work_start = now(); + // Setup video encoders + // TODO(apoms): Make this dynamic based on the encoded column type + for (size_t i = 0; i < args.columns.size(); ++i) { + auto& col = args.columns[i]; + auto& compression_opts = args.column_compression[i]; + ColumnType type = col.type(); + if (type != ColumnType::Video || compression_opts.codec == "raw") continue; + encoders_.emplace_back( + VideoEncoder::make_from_config(encoder_handle_, 1, encoder_type_)); + encoder_configured_.push_back(false); - if (buffered_entry.columns.size() == 0) { - buffered_entry.io_item_index = work_entry.io_item_index; - buffered_entry.columns.resize(args.column_mapping.size()); - for (i32 col_idx : args.column_mapping) { - buffered_entry.column_handles.push_back( - work_entry.column_handles[col_idx]); - } + EncodeOptions opts; + if (compression_opts.codec == "h264") { + opts.quality = std::atoi(compression_opts.options.at("quality").c_str()); + opts.bitrate = std::atoi(compression_opts.options.at("bitrate").c_str()); + opts.keyframe_distance = + std::atoi(compression_opts.options.at("keyframe_distance").c_str()); } + encode_options_.push_back(opts); + } + for (auto& compression_opts : args.column_compression) { + auto& codec = compression_opts.codec; + bool enabled = true; + if (codec == "raw") { + enabled = false; + } + compression_enabled_.push_back(enabled); + } + + current_offset_ = 0; +} + +void PostEvaluateWorker::feed(EvalWorkEntry& entry) { + EvalWorkEntry& work_entry = entry; + // HACK(apoms): this will fail horribly and leak memory if + // we receive outputs at different rates. 
+ if (entry.columns.empty() || entry.columns[0].empty()) { + return; + } - i64 num_rows = work_entry.columns[0].rows.size(); - i32 warmup_frames = work_entry.warmup_rows; - current_offset += num_rows; - // Swizzle columns correctly - for (size_t i = 0; i < args.column_mapping.size(); ++i) { - i32 col_idx = args.column_mapping[i]; - // Delete warmup frame outputs - for (i32 w = 0; w < warmup_frames; ++w) { - delete_buffer(work_entry.column_handles[col_idx], - work_entry.columns[col_idx].rows[w].buffer); - } - // Keep non-warmup frame outputs - buffered_entry.columns[i].rows.insert( - buffered_entry.columns[i].rows.end(), - work_entry.columns[col_idx].rows.begin() + warmup_frames, - work_entry.columns[col_idx].rows.end()); + // Setup row buffer if it was emptied + if (buffered_entry_.columns.size() == 0) { + buffered_entry_.table_id = work_entry.table_id; + buffered_entry_.job_index = work_entry.job_index; + buffered_entry_.task_index = work_entry.task_index; + buffered_entry_.last_in_task = work_entry.last_in_task; + buffered_entry_.columns.resize(column_mapping_.size()); + buffered_entry_.row_ids.resize(column_mapping_.size()); + assert(work_entry.column_handles.size() == columns_.size()); + buffered_entry_.column_types.clear(); + buffered_entry_.column_handles.clear(); + buffered_entry_.frame_sizes.clear(); + buffered_entry_.compressed.clear(); + for (size_t i = 0; i < columns_.size(); ++i) { + i32 col_idx = column_mapping_[i]; + buffered_entry_.column_types.push_back(columns_[i].type()); + buffered_entry_.column_handles.push_back(CPU_DEVICE); + if (columns_[i].type() == ColumnType::Video) { + assert(work_entry.columns[col_idx].size() > 0); + Frame* frame = work_entry.columns[col_idx][0].as_frame(); + buffered_entry_.frame_sizes.push_back(frame->as_frame_info()); + } + buffered_entry_.compressed.push_back(compression_enabled_[i]); } - // Delete unused columns - for (size_t i = 0; i < work_entry.columns.size(); ++i) { - if (column_set.count(i) > 0) { - continue; + 
if (work_entry.needs_configure) { + for (size_t i = 0; i < encoder_configured_.size(); ++i) { + encoder_configured_[i] = false; } - for (i32 b = 0; b < work_entry.columns[i].rows.size(); ++b) { - delete_buffer(work_entry.column_handles[i], - work_entry.columns[i].rows[b].buffer); + } + } + + i64 num_rows = work_entry.columns[0].size(); + current_offset_ += num_rows; + + i32 encoder_idx = 0; + // Swizzle columns correctly + for (size_t i = 0; i < column_mapping_.size(); ++i) { + i32 col_idx = column_mapping_[i]; + ColumnType column_type = columns_[i].type(); + // Encode video frames + if (compression_enabled_[i] && column_type == ColumnType::Video && + buffered_entry_.frame_sizes[encoder_idx].type == FrameType::U8) { + auto& encoder = encoders_[encoder_idx]; + if (!encoder_configured_[encoder_idx]) { + // Configure encoder + encoder_configured_[encoder_idx] = true; + Frame* frame = work_entry.columns[col_idx][0].as_frame(); + encoder->configure(frame->as_frame_info(), + + encode_options_[encoder_idx]); + } + + // Move frames to device for the encoder + move_if_different_address_space( + profiler_, work_entry.column_handles[col_idx], encoder_handle_, + work_entry.columns[col_idx]); + + // Pass frames into encoder + auto encode_start = now(); + for (auto& row : work_entry.columns[col_idx]) { + Frame* frame = row.as_frame(); + bool new_packet = encoder->feed(frame->data, frame->size()); + while (new_packet) { + size_t buffer_size = 4 * 1024 * 1024; + u8* buffer = new_buffer(CPU_DEVICE, buffer_size); + size_t actual_size; + new_packet = encoder->get_packet(buffer, buffer_size, actual_size); + LOG_IF(FATAL, new_packet && actual_size > buffer_size) + << "Packet buffer not large enough (" << buffer_size << " vs " + << actual_size << ")"; + insert_element(buffered_entry_.columns[i], buffer, actual_size); + } + delete_element(encoder_handle_, row); + } + profiler_.add_interval("encode", encode_start, now()); + encoder_idx++; + } else { + // Move data to CPU to avoid overflow 
on GPU + move_if_different_address_space( + profiler_, work_entry.column_handles[col_idx], CPU_DEVICE, + work_entry.columns[col_idx]); + buffered_entry_.columns[i].insert( + buffered_entry_.columns[i].end(), + work_entry.columns[col_idx].begin(), + work_entry.columns[col_idx].end()); + buffered_entry_.row_ids[i].insert( + buffered_entry_.row_ids[i].end(), + work_entry.row_ids[col_idx].begin(), + work_entry.row_ids[col_idx].end()); + } + } + // Delete unused columns + for (size_t i = 0; i < work_entry.columns.size(); ++i) { + if (column_set_.count(i) > 0) { + continue; + } + for (i32 b = 0; b < work_entry.columns[i].size(); ++b) { + delete_element(work_entry.column_handles[i], work_entry.columns[i][b]); + } + } + + encoder_idx = 0; + + // Flush row buffer + if (work_entry.last_in_io_packet) { + // Flush video encoder and get rest of packets + for (size_t i = 0; i < column_mapping_.size(); ++i) { + ColumnType column_type = columns_[i].type(); + if (compression_enabled_[i] && column_type == ColumnType::Video && + buffered_entry_.frame_sizes[encoder_idx].type == FrameType::U8) { + auto& encoder = encoders_[encoder_idx]; + + // Get last packets in encoder + auto encode_flush_start = now(); + bool new_packet = encoder->flush(); + while (new_packet) { + size_t buffer_size = 4 * 1024 * 1024; + u8* buffer = new_buffer(CPU_DEVICE, buffer_size); + size_t actual_size; + new_packet = encoder->get_packet(buffer, buffer_size, actual_size); + LOG_IF(FATAL, new_packet && actual_size > buffer_size) + << "Packet buffer not large enough (" << buffer_size << " vs " + << actual_size << ")"; + insert_element(buffered_entry_.columns[i], buffer, actual_size); + } + profiler_.add_interval("encode_flush", encode_flush_start, now()); + encoder_configured_[encoder_idx] = false; + encoder_idx++; } } - if (work_entry.last_in_io_item) { - args.output_work.push(std::make_tuple(io_item, buffered_entry)); - buffered_entry.columns.clear(); + // Only push an entry if it is non empty + if 
(buffered_entry_.columns.size() > 0 && + buffered_entry_.columns[0].size() > 0) { + buffered_entries_.push_back(buffered_entry_); + buffered_entry_.columns.clear(); + buffered_entry_.row_ids.clear(); } } +} - VLOG(1) << "Post-evaluate (N/PU: " << args.node_id << "/" << args.id - << "): thread finished "; +bool PostEvaluateWorker::yield(EvalWorkEntry& output) { + auto yield_start = now(); - THREAD_RETURN_SUCCESS(); + bool got_result = false; + if (buffered_entries_.size() > 0) { + output = buffered_entries_.front(); + buffered_entries_.pop_front(); + got_result = true; + } + + profiler_.add_interval("yield", yield_start, now()); + return got_result; } + } } diff --git a/scanner/engine/evaluate_worker.h b/scanner/engine/evaluate_worker.h index 02ce33b8..da6b26f3 100644 --- a/scanner/engine/evaluate_worker.h +++ b/scanner/engine/evaluate_worker.h @@ -17,44 +17,82 @@ #include "scanner/engine/kernel_factory.h" #include "scanner/engine/runtime.h" +#include "scanner/engine/sampler.h" #include "scanner/util/common.h" #include "scanner/util/queue.h" +#include "scanner/video/decoder_automata.h" +#include "scanner/video/video_encoder.h" + +#include "hwang/decoder_automata.h" namespace scanner { namespace internal { -void move_if_different_address_space(Profiler &profiler, +void move_if_different_address_space(Profiler& profiler, DeviceHandle current_handle, DeviceHandle target_handle, - BatchedColumns &columns); + BatchedColumns& columns); /////////////////////////////////////////////////////////////////////////////// /// Worker thread arguments -struct PreEvaluateThreadArgs { +struct PreEvaluateWorkerArgs { // Uniform arguments i32 node_id; i32 num_cpus; - const proto::JobParameters* job_params; + i32 work_packet_size; // Per worker arguments - i32 id; + i32 worker_id; DeviceHandle device_handle; Profiler& profiler; - - // Queues for communicating work - Queue>& input_work; - Queue>& output_work; }; -struct EvaluateThreadArgs { - // Uniform arguments - i32 node_id; - const 
proto::JobParameters* job_params; +class PreEvaluateWorker { + public: + PreEvaluateWorker(const PreEvaluateWorkerArgs& args); - // Per worker arguments - i32 ki; - i32 kg; - std::vector> kernel_factories; + void feed(EvalWorkEntry& entry, bool is_first_in_task); + + bool yield(i32 item_size, EvalWorkEntry& output); + + private: + const i32 node_id_; + const i32 worker_id_; + const DeviceHandle device_handle_; + const i32 num_cpus_; + + Profiler& profiler_; + + i32 last_job_idx_ = -1; + + DeviceHandle decoder_output_handle_; + std::vector> decoders_; + std::vector> inplace_decoders_; + + // Continuation state + bool first_item_; + bool needs_configure_; + bool needs_reset_; + EvalWorkEntry entry_; + i64 current_row_; + i64 total_rows_; + + std::vector> decode_args_; +}; + +struct OpArgGroup { + std::vector op_names; + /// For sampling ops + // Op -> Job -> slice + std::map>> sampling_args; + /// For slice ops + // Op -> Job -> slice + std::map>> slice_output_rows; + /// For unslice ops + // Op -> Job -> slice + std::map>> unslice_input_rows; + /// For regular kernels + std::vector> kernel_factories; std::vector>> live_columns; // Discarded after kernel use std::vector> dead_columns; @@ -62,15 +100,107 @@ struct EvaluateThreadArgs { std::vector> unused_outputs; // Index in columns for inputs std::vector> column_mapping; + // Stencil needed by kernels + std::vector> kernel_stencils; + // Batch size needed by kernels + std::vector kernel_batch_sizes; +}; + +struct EvaluateWorkerArgs { + // Uniform arguments + i32 node_id; + std::mutex& startup_lock; + std::condition_variable& startup_cv; + i32& startup_count; + + // Per worker arguments + i32 ki; + i32 kg; + OpArgGroup arg_group; + Profiler& profiler; proto::Result& result; +}; + + +class EvaluateWorker { + public: + EvaluateWorker(const EvaluateWorkerArgs& args); + ~EvaluateWorker(); + + void new_task(i64 job_idx, i64 task_idx, + const std::vector& task_streams); + + void feed(EvalWorkEntry& entry); + + bool 
yield(i32 item_size, EvalWorkEntry& output); + + private: + void clear_stencil_cache(); + + const i32 node_id_; + const i32 worker_id_; + + Profiler& profiler_; + + OpArgGroup arg_group_; + std::vector kernel_devices_; + std::vector> kernel_input_devices_; + std::vector> kernel_output_devices_; + std::vector kernel_num_outputs_; + std::vector> kernels_; + + // Used for computing complement of column mapping + std::vector> column_mapping_set_; + + /// Task state + i64 job_idx_; + i64 task_idx_; + i64 slice_group_; + std::map> domain_samplers_; + + // Inputs + std::vector> valid_input_rows_set_; + std::vector> valid_input_rows_; + // Tracks which input we should expect next for which column + std::vector> current_valid_input_idx_; + + // List of row ids of the outputs to compute + std::vector> compute_rows_set_; + std::vector> compute_rows_; + // Tracks which index in compute_rows_ we should expect next + std::vector current_compute_idx_; + + // Outputs to keep + std::vector> valid_output_rows_set_; + std::vector> valid_output_rows_; + // Tracks which output we should expect next + std::vector current_valid_output_idx_; + + // Per kernel -> per input column -> deque of elements + std::vector current_element_cache_input_idx_; + std::vector>> element_cache_; + // Per kernel -> per input column -> device handle + std::vector> element_cache_devices_; + // Per kernel -> per input column -> deque of row ids + std::vector>> element_cache_row_ids_; + + // Continuation state + EvalWorkEntry entry_; + i32 current_input_; + i32 total_inputs_; + + std::vector final_output_handles_; + std::vector> final_output_columns_; + std::vector> final_row_ids_; +}; - // Queues for communicating work - Queue>& input_work; - Queue>& output_work; +struct ColumnCompressionOptions { + std::string codec; + std::map options; }; -struct PostEvaluateThreadArgs { +struct PostEvaluateWorkerArgs { // Uniform arguments i32 node_id; @@ -79,17 +209,35 @@ struct PostEvaluateThreadArgs { Profiler& 
profiler; // Index in columns for inputs std::vector column_mapping; - - // Queues for communicating work - Queue>& input_work; - Queue>& output_work; + std::vector columns; + std::vector column_compression; }; -void* pre_evaluate_thread(void* arg); +class PostEvaluateWorker { + public: + PostEvaluateWorker(const PostEvaluateWorkerArgs& args); + + void feed(EvalWorkEntry& entry); -void* evaluate_thread(void* arg); + bool yield(EvalWorkEntry& output); -void* post_evaluate_thread(void* arg); + private: + Profiler& profiler_; + std::vector column_mapping_; + std::vector columns_; + std::set column_set_; + DeviceHandle encoder_handle_; + VideoEncoderType encoder_type_; + std::vector> encoders_; + std::vector encoder_configured_; + std::vector encode_options_; + std::vector compression_enabled_; + + // Generator state + EvalWorkEntry buffered_entry_; + i64 current_offset_; + std::deque buffered_entries_; +}; } } diff --git a/scanner/engine/halide_context.h b/scanner/engine/halide_context.h deleted file mode 100644 index 69686ca3..00000000 --- a/scanner/engine/halide_context.h +++ /dev/null @@ -1,5 +0,0 @@ -#include "scanner/util/cuda.h" - -namespace Halide { namespace Runtime { namespace Internal { namespace Cuda { -CUcontext context = 0; -}}}} diff --git a/scanner/engine/ingest.cpp b/scanner/engine/ingest.cpp index efea9456..2806ec97 100644 --- a/scanner/engine/ingest.cpp +++ b/scanner/engine/ingest.cpp @@ -14,7 +14,9 @@ */ #include "scanner/api/database.h" -#include "scanner/engine/db.h" +#include "scanner/api/frame.h" +#include "scanner/engine/metadata.h" +#include "scanner/video/h264_byte_stream_index_creator.h" #include "scanner/util/common.h" #include "scanner/util/h264.h" @@ -22,6 +24,9 @@ #include "storehouse/storage_backend.h" +#include "hwang/video_index.h" +#include "hwang/mp4_index_creator.h" + #include #include @@ -52,67 +57,83 @@ const std::string BAD_VIDEOS_FILE_PATH = "bad_videos.txt"; struct FFStorehouseState { std::unique_ptr file = nullptr; - size_t 
size = 0; // total file size + size_t size = 0; // total file size u64 pos = 0; + + u64 buffer_start = 0; + u64 buffer_end = 0; + std::vector buffer; }; // For custom AVIOContext that loads from memory -i32 read_packet(void *opaque, u8 *buf, i32 buf_size) { - FFStorehouseState *fs = (FFStorehouseState *)opaque; - size_t size_read; - storehouse::StoreResult result; - EXP_BACKOFF(fs->file->read(fs->pos, buf_size, buf, size_read), result); - if (result != storehouse::StoreResult::EndOfFile) { - exit_on_error(result); +i32 read_packet(void* opaque, u8* buf, i32 buf_size) { + FFStorehouseState* fs = (FFStorehouseState*)opaque; + if (!(fs->buffer_start <= fs->pos && fs->pos + buf_size < fs->buffer_end)) { + // Not in cache + size_t buffer_size = 64 * 1024 * 1024; + fs->buffer.resize(buffer_size); + size_t size_read; + storehouse::StoreResult result; + EXP_BACKOFF( + fs->file->read(fs->pos, buffer_size, fs->buffer.data(), size_read), + result); + if (result != storehouse::StoreResult::EndOfFile) { + exit_on_error(result); + } + + fs->buffer_start = fs->pos; + fs->buffer_end = fs->pos + size_read; } + size_t size_read = std::min((size_t)buf_size, fs->buffer_end - fs->pos); + memcpy(buf, fs->buffer.data() + (fs->pos - fs->buffer_start), size_read); fs->pos += size_read; return static_cast(size_read); } -i64 seek(void *opaque, i64 offset, i32 whence) { - FFStorehouseState *fs = (FFStorehouseState *)opaque; +i64 seek(void* opaque, i64 offset, i32 whence) { + FFStorehouseState* fs = (FFStorehouseState*)opaque; switch (whence) { - case SEEK_SET: - assert(offset >= 0); - fs->pos = static_cast(offset); - break; - case SEEK_CUR: - fs->pos += offset; - break; - case SEEK_END: - fs->pos = fs->size; - break; - case AVSEEK_SIZE: - return fs->size; - break; + case SEEK_SET: + assert(offset >= 0); + fs->pos = static_cast(offset); + break; + case SEEK_CUR: + fs->pos += offset; + break; + case SEEK_END: + fs->pos = fs->size; + break; + case AVSEEK_SIZE: + return fs->size; + break; } 
return fs->size - fs->pos; } struct CodecState { AVPacket av_packet; - AVFrame *picture; - AVFormatContext *format_context; - AVIOContext *io_context; - AVCodec *in_codec; - AVCodecContext *in_cc; + AVFrame* picture; + AVFormatContext* format_context; + AVIOContext* io_context; + AVCodec* in_codec; + AVCodecContext* in_cc; #if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(57, 34, 0) - AVCodecParameters *in_cc_params; + AVCodecParameters* in_cc_params; #endif i32 video_stream_index; - AVBitStreamFilterContext *annexb; + AVBitStreamFilterContext* annexb; }; -bool setup_video_codec(FFStorehouseState *fs, CodecState &state) { +bool setup_video_codec(FFStorehouseState* fs, CodecState& state) { VLOG(1) << "Setting up video codec"; av_init_packet(&state.av_packet); state.picture = av_frame_alloc(); state.format_context = avformat_alloc_context(); size_t avio_context_buffer_size = 4096; - u8 *avio_context_buffer = - static_cast(av_malloc(avio_context_buffer_size)); + u8* avio_context_buffer = + static_cast(av_malloc(avio_context_buffer_size)); state.io_context = avio_alloc_context(avio_context_buffer, avio_context_buffer_size, 0, fs, &read_packet, NULL, &seek); @@ -141,7 +162,7 @@ bool setup_video_codec(FFStorehouseState *fs, CodecState &state) { return false; } - AVStream const *const in_stream = + AVStream const* const in_stream = state.format_context->streams[state.video_stream_index]; state.in_codec = avcodec_find_decoder(AV_CODEC_ID_H264); @@ -197,29 +218,171 @@ void cleanup_video_codec(CodecState state) { av_bitstream_filter_close(state.annexb); } -bool parse_and_write_video(storehouse::StorageBackend *storage, - const std::string &table_name, - i32 table_id, - const std::string &path, - std::string& error_message) { +bool parse_video_inplace(storehouse::StorageBackend* storage, + const std::string& table_name, i32 table_id, + const std::string& path, std::string& error_message) { proto::TableDescriptor table_desc; table_desc.set_id(table_id); 
table_desc.set_name(table_name); table_desc.set_job_id(-1); table_desc.set_timestamp( - std::chrono::duration_cast(now().time_since_epoch()) - .count()); + std::chrono::duration_cast(now().time_since_epoch()) + .count()); { - Column *frame_col = table_desc.add_columns(); - frame_col->set_name("frame"); - frame_col->set_id(0); + Column* index_col = table_desc.add_columns(); + index_col->set_name(index_column_name()); + index_col->set_id(0); + index_col->set_type(ColumnType::Other); + + Column* frame_col = table_desc.add_columns(); + frame_col->set_name(frame_column_name()); + frame_col->set_id(1); frame_col->set_type(ColumnType::Video); + } + + StoreResult result; + std::unique_ptr file = nullptr; + EXP_BACKOFF(make_unique_random_read_file(storage, path, file), + result); + if (result != StoreResult::Success) { + error_message = "Can not open video file"; + return false; + } + + u64 file_size; + EXP_BACKOFF(file->get_size(file_size), result); + if (result != StoreResult::Success) { + error_message = "Can not get file size"; + return false; + } + if (file_size <= 0) { + error_message = "Can not ingest empty video file"; + return false; + } + + hwang::MP4IndexCreator index_creator(file_size); + u64 offset = 0; + u64 size_to_read = 1024; + while (!index_creator.is_done()) { + std::vector data(size_to_read); + size_t size_read; + EXP_BACKOFF( + file->read(offset, size_to_read, data.data(), size_read), + result); + exit_on_error(result); + assert(size_read == size_to_read); + u64 next_offset; + u64 next_size; + index_creator.feed(data.data(), size_read, + next_offset, + next_size); + offset = next_offset; + size_to_read = next_size; + } + if (index_creator.is_error()) { + error_message = index_creator.error_message(); + return false; + } + hwang::VideoIndex index = index_creator.get_video_index(); + + VideoMetadata video_meta; + proto::VideoDescriptor& video_descriptor = video_meta.get_descriptor(); + video_descriptor.set_table_id(table_id); + 
video_descriptor.set_column_id(1); + video_descriptor.set_item_id(0); + + video_descriptor.set_width(index.frame_width()); + video_descriptor.set_height(index.frame_height()); + video_descriptor.set_channels(3); + video_descriptor.set_frame_type(FrameType::U8); + video_descriptor.set_chroma_format(proto::VideoDescriptor::YUV_420); + video_descriptor.set_codec_type(proto::VideoDescriptor::H264); + + video_descriptor.set_data_path(path); + video_descriptor.set_inplace(true); + + i64 frame = index.sample_sizes().size(); + const std::vector& metadata_bytes = index.metadata_bytes(); + const std::vector& keyframe_indices = index.keyframe_indices(); + const std::vector& sample_offsets = index.sample_offsets(); + const std::vector& sample_sizes = index.sample_sizes(); + + VLOG(2) << "Num frames: " << frame; + VLOG(2) << "Average GOP length: " << frame / (float)keyframe_indices.size(); + + // Create index column + std::string index_path = table_item_output_path(table_id, 0, 0); + std::unique_ptr index_file{}; + BACKOFF_FAIL(make_unique_write_file(storage, index_path, index_file)); + + std::string index_metadata_path = table_item_metadata_path(table_id, 0, 0); + std::unique_ptr index_metadata_file{}; + BACKOFF_FAIL(make_unique_write_file(storage, index_metadata_path, + index_metadata_file)); + s_write(index_metadata_file.get(), frame); + for (i64 i = 0; i < frame; ++i) { + s_write(index_metadata_file.get(), sizeof(i64)); + } + BACKOFF_FAIL(index_metadata_file->save()); + for (i64 i = 0; i < frame; ++i) { + s_write(index_file.get(), i); + } + BACKOFF_FAIL(index_file->save()); + + table_desc.add_end_rows(frame); + video_descriptor.set_frames(frame); + video_descriptor.set_num_encoded_videos(1); + video_descriptor.add_frames_per_video(frame); + video_descriptor.add_keyframes_per_video(keyframe_indices.size()); + video_descriptor.add_size_per_video(file_size); + video_descriptor.set_metadata_packets(metadata_bytes.data(), + metadata_bytes.size()); + + for (u64 v : 
keyframe_indices) { + video_descriptor.add_keyframe_indices(v); + } + for (u64 v : sample_offsets) { + video_descriptor.add_sample_offsets(v); + } + for (u64 v : sample_sizes) { + video_descriptor.add_sample_sizes(v); + } + + // Save our metadata for the frame column + write_video_metadata(storage, video_meta); + + // Save the table descriptor + write_table_metadata(storage, TableMetadata(table_desc)); + + std::fflush(NULL); + sync(); + + return true; +} + +bool parse_and_write_video(storehouse::StorageBackend* storage, + const std::string& table_name, i32 table_id, + const std::string& path, + std::string& error_message) { + proto::TableDescriptor table_desc; + table_desc.set_id(table_id); + table_desc.set_name(table_name); + table_desc.set_job_id(-1); + table_desc.set_timestamp( + std::chrono::duration_cast(now().time_since_epoch()) + .count()); - Column *frame_info_col = table_desc.add_columns(); - frame_info_col->set_name("frame_info"); - frame_info_col->set_id(1); - frame_info_col->set_type(ColumnType::Other); + { + Column* index_col = table_desc.add_columns(); + index_col->set_name(index_column_name()); + index_col->set_id(0); + index_col->set_type(ColumnType::Other); + + Column* frame_col = table_desc.add_columns(); + frame_col->set_name(frame_column_name()); + frame_col->set_id(1); + frame_col->set_type(ColumnType::Video); } // Setup custom buffer for libavcodec so that we can read from a storehouse @@ -252,43 +415,27 @@ bool parse_and_write_video(storehouse::StorageBackend *storage, } VideoMetadata video_meta; - proto::VideoDescriptor &video_descriptor = video_meta.get_descriptor(); + proto::VideoDescriptor& video_descriptor = video_meta.get_descriptor(); video_descriptor.set_table_id(table_id); - video_descriptor.set_column_id(0); + video_descriptor.set_column_id(1); video_descriptor.set_item_id(0); video_descriptor.set_width(state.in_cc->width); video_descriptor.set_height(state.in_cc->height); + video_descriptor.set_channels(3); + 
video_descriptor.set_frame_type(FrameType::U8); video_descriptor.set_chroma_format(proto::VideoDescriptor::YUV_420); video_descriptor.set_codec_type(proto::VideoDescriptor::H264); - std::string data_path = table_item_output_path(table_id, 0, 0); + std::string data_path = table_item_output_path(table_id, 1, 0); std::unique_ptr demuxed_bytestream{}; BACKOFF_FAIL(make_unique_write_file(storage, data_path, demuxed_bytestream)); - u64 bytestream_pos = 0; - std::vector metadata_bytes; - std::vector keyframe_positions; - std::vector keyframe_timestamps; - std::vector keyframe_byte_offsets; + video_descriptor.set_data_path(data_path); + video_descriptor.set_inplace(false); bool succeeded = true; - i32 frame = 0; - bool extradata_extracted = false; - bool in_meta_packet_sequence = false; - i64 meta_packet_sequence_start_offset = 0; - bool saw_sps_nal = false; - bool saw_pps_nal = false; - std::map sps_map; - std::map pps_map; - u32 last_sps = -1; - u32 last_pps = -1; - std::map> sps_nal_bytes; - std::map> pps_nal_bytes; - SliceHeader prev_sh; - - i32 num_non_ref_frames = 0; - i32 avcodec_frame = 0; + H264ByteStreamIndexCreator index_creator(demuxed_bytestream.get()); while (true) { // Read from format context i32 err = av_read_frame(state.format_context, &state.av_packet); @@ -298,6 +445,7 @@ bool parse_and_write_video(storehouse::StorageBackend *storage, } else if (err != 0) { char err_msg[256]; av_strerror(err, err_msg, 256); + int frame = index_creator.frames(); LOG(ERROR) << "Error while decoding frame " << frame << " (" << err << "): " << err_msg; cleanup_video_codec(state); @@ -327,15 +475,17 @@ bool parse_and_write_video(storehouse::StorageBackend *storage, /* here, we use a stream based decoder (mpeg1video), so we feed decoder and see if it could decode a frame */ - u8 *orig_data = state.av_packet.data; + u8* orig_data = state.av_packet.data; i32 orig_size = state.av_packet.size; - u8 *filtered_data; + u8* filtered_data; i32 filtered_data_size; - if 
(av_bitstream_filter_filter( - state.annexb, state.in_cc, NULL, &filtered_data, - &filtered_data_size, state.av_packet.data, state.av_packet.size, - state.av_packet.flags & AV_PKT_FLAG_KEY) < 0) { + err = av_bitstream_filter_filter(state.annexb, state.in_cc, NULL, + &filtered_data, &filtered_data_size, + state.av_packet.data, state.av_packet.size, + state.av_packet.flags & AV_PKT_FLAG_KEY); + if (err < 0) { + int frame = index_creator.frames(); char err_msg[256]; av_strerror(err, err_msg, 256); LOG(ERROR) << "Error while filtering " << frame << " (" << frame @@ -346,210 +496,71 @@ bool parse_and_write_video(storehouse::StorageBackend *storage, return false; } - if (!extradata_extracted) { - const u8 *extradata = state.in_cc->extradata; - i32 extradata_size_left = state.in_cc->extradata_size; - - metadata_bytes.resize(extradata_size_left); - memcpy(metadata_bytes.data(), extradata, extradata_size_left); - - while (extradata_size_left > 3) { - const u8 *nal_start = nullptr; - i32 nal_size = 0; - next_nal(extradata, extradata_size_left, nal_start, nal_size); - i32 nal_ref_idc = (*nal_start >> 5); - i32 nal_unit_type = (*nal_start) & 0x1F; - VLOG(1) << "extradata nal size: " << nal_size << ", nal ref " - << nal_ref_idc << ", nal unit " << nal_unit_type; - } - extradata_extracted = true; - } - - i64 nal_bytestream_offset = bytestream_pos; - - VLOG(1) << "new packet " << nal_bytestream_offset; - bool insert_sps_nal = false; - // Parse NAL unit - const u8 *nal_parse = filtered_data; - i32 size_left = filtered_data_size; - i32 nals_parsed = 0; - while (size_left > 3) { - const u8 *nal_start = nullptr; - i32 nal_size = 0; - next_nal(nal_parse, size_left, nal_start, nal_size); - - i32 nal_ref_idc = (*nal_start >> 5); - i32 nal_unit_type = (*nal_start) & 0x1F; - VLOG(1) << "frame " << frame << ", nal size " << nal_size - << ", nal_ref_idc " << nal_ref_idc << ", nal unit " - << nal_unit_type; - if (nal_ref_idc == 0) { - num_non_ref_frames += 1; - } - if (nal_unit_type > 4) { - 
if (!in_meta_packet_sequence) { - meta_packet_sequence_start_offset = nal_bytestream_offset; - filtered_data_size - size_left; - VLOG(1) << "in meta sequence " << nal_bytestream_offset; - in_meta_packet_sequence = true; - saw_sps_nal = false; - } - } - std::vector rbsp_buffer; - rbsp_buffer.reserve(64 * 1024); - u32 consecutive_zeros = 0; - i32 bytes = nal_size - 1; - const u8* pb = nal_start + 1; - while (bytes > 0) { - /* Copy the byte into the rbsp, unless it - * is the 0x03 in a 0x000003 */ - if (consecutive_zeros < 2 || *pb != 0x03) { - rbsp_buffer.push_back(*pb); - } - if (*pb == 0) { - ++consecutive_zeros; - } else { - consecutive_zeros = 0; - } - ++pb; - --bytes; - } - - // We need to track the last SPS NAL because some streams do - // not insert an SPS every keyframe and we need to insert it - // ourselves. - // fprintf(stderr, "nal_size %d, rbsp size %lu\n", nal_size, rbsp_buffer.size()); - const u8* rbsp_start = rbsp_buffer.data(); - i32 rbsp_size = rbsp_buffer.size(); - - // SPS - if (nal_unit_type == 7) { - saw_sps_nal = true; - i32 offset = 8; - GetBitsState gb; - gb.buffer = rbsp_start; - gb.offset = 0; - SPS sps; - if (!parse_sps(gb, sps)) { - return false; - } - i32 sps_id = sps.sps_id; - sps_map[sps_id] = sps; - last_sps = sps.sps_id; - - sps_nal_bytes[sps_id].clear(); - sps_nal_bytes[sps_id].insert(sps_nal_bytes[sps_id].end(), nal_start - 3, - nal_start + nal_size + 3); - VLOG(1) << "Last SPS NAL (" << sps_id << ", " << offset << ")" - << " seen at frame " << frame; - } - // PPS - if (nal_unit_type == 8) { - GetBitsState gb; - gb.buffer = rbsp_start; - gb.offset = 0; - PPS pps; - if (!parse_pps(gb, pps)) { - return false; - } - pps_map[pps.pps_id] = pps; - last_pps = pps.pps_id; - saw_pps_nal = true; - i32 pps_id = pps.pps_id; - pps_nal_bytes[pps_id].clear(); - pps_nal_bytes[pps_id].insert(pps_nal_bytes[pps_id].end(), nal_start - 3, - nal_start + nal_size + 3); - VLOG(1) << "PPS id " << pps.pps_id << ", SPS id " << pps.sps_id - << ", frame " << 
frame; - } - if (is_vcl_nal(nal_unit_type)) { - assert(last_pps != -1); - assert(last_sps != -1); - GetBitsState gb; - gb.buffer = nal_start; - gb.offset = 8; - SliceHeader sh; - if(!parse_slice_header(gb, sps_map.at(last_sps), pps_map, - nal_unit_type, nal_ref_idc, sh)) { - return false; - } - if (frame == 0 || is_new_access_unit(sps_map, pps_map, prev_sh, sh)) { - frame++; - size_t bytestream_offset; - if (nal_unit_type == 5) { - // Insert an SPS NAL if we did not see one in the meta packet - // sequence - keyframe_byte_offsets.push_back(nal_bytestream_offset); - keyframe_positions.push_back(frame - 1); - keyframe_timestamps.push_back(state.av_packet.pts); - saw_sps_nal = false; - VLOG(1) << "keyframe " << frame - 1 << ", byte offset " - << meta_packet_sequence_start_offset; - - // Insert metadata - VLOG(1) << "inserting sps and pss nals"; - i32 size = filtered_data_size; - for (auto& kv : sps_nal_bytes) { - auto& sps_nal = kv.second; - size += static_cast(sps_nal.size()); - } - for (auto& kv : pps_nal_bytes) { - auto& pps_nal = kv.second; - size += static_cast(pps_nal.size()); - } - - s_write(demuxed_bytestream.get(), size); - for (auto &kv : sps_nal_bytes) { - auto &sps_nal = kv.second; - s_write(demuxed_bytestream.get(), sps_nal.data(), sps_nal.size()); - } - for (auto &kv : pps_nal_bytes) { - auto &pps_nal = kv.second; - s_write(demuxed_bytestream.get(), pps_nal.data(), pps_nal.size()); - } - // Append the packet to the stream - s_write(demuxed_bytestream.get(), filtered_data, - filtered_data_size); - - bytestream_pos += sizeof(size) + size; - } else { - s_write(demuxed_bytestream.get(), filtered_data_size); - bytestream_pos += sizeof(filtered_data_size) + filtered_data_size; - // Append the packet to the stream - s_write(demuxed_bytestream.get(), filtered_data, - filtered_data_size); - } - } - in_meta_packet_sequence = false; - prev_sh = sh; - } - nals_parsed++; + if (!index_creator.feed_packet(filtered_data, filtered_data_size)) { + error_message = 
index_creator.error_message(); + return false; } - free(filtered_data); av_packet_unref(&state.av_packet); } + video_descriptor.set_time_base_num(state.in_cc->time_base.num); + video_descriptor.set_time_base_denom(state.in_cc->time_base.den); + + i64 frame = index_creator.frames(); + i32 num_non_ref_frames = index_creator.num_non_ref_frames(); + const std::vector& metadata_bytes = index_creator.metadata_bytes(); + const std::vector& keyframe_indices = index_creator.keyframe_indices(); + const std::vector& sample_offsets = index_creator.sample_offsets(); + const std::vector& sample_sizes = index_creator.sample_sizes(); + + VLOG(2) << "Num frames: " << frame; + VLOG(2) << "Num non-reference frames: " << num_non_ref_frames; + VLOG(2) << "% non-reference frames: " << num_non_ref_frames / (float)frame; + VLOG(2) << "Average GOP length: " << frame / (float)keyframe_indices.size(); + // Cleanup video decoder cleanup_video_codec(state); // Save demuxed stream BACKOFF_FAIL(demuxed_bytestream->save()); + // Create index column + std::string index_path = table_item_output_path(table_id, 0, 0); + std::unique_ptr index_file{}; + BACKOFF_FAIL(make_unique_write_file(storage, index_path, index_file)); + + std::string index_metadata_path = table_item_metadata_path(table_id, 0, 0); + std::unique_ptr index_metadata_file{}; + BACKOFF_FAIL(make_unique_write_file(storage, index_metadata_path, index_metadata_file)); + s_write(index_metadata_file.get(), frame); + for (i64 i = 0; i < frame; ++i) { + s_write(index_metadata_file.get(), sizeof(i64)); + } + BACKOFF_FAIL(index_metadata_file->save()); + for (i64 i = 0; i < frame; ++i) { + s_write(index_file.get(), i); + } + BACKOFF_FAIL(index_file->save()); + table_desc.add_end_rows(frame); video_descriptor.set_frames(frame); + video_descriptor.set_num_encoded_videos(1); + video_descriptor.add_frames_per_video(frame); + video_descriptor.add_keyframes_per_video(keyframe_indices.size()); + 
video_descriptor.add_size_per_video(index_creator.bytestream_pos()); video_descriptor.set_metadata_packets(metadata_bytes.data(), metadata_bytes.size()); - for (i64 v : keyframe_positions) { - video_descriptor.add_keyframe_positions(v); + for (u64 v : sample_offsets) { + video_descriptor.add_sample_offsets(v); } - for (i64 v : keyframe_timestamps) { - video_descriptor.add_keyframe_timestamps(v); + for (u64 v : sample_sizes) { + video_descriptor.add_sample_sizes(v); } - for (i64 v : keyframe_byte_offsets) { - video_descriptor.add_keyframe_byte_offsets(v); + for (u64 v : keyframe_indices) { + video_descriptor.add_keyframe_indices(v); } // Save our metadata for the frame column @@ -558,6 +569,9 @@ bool parse_and_write_video(storehouse::StorageBackend *storage, // Save the table descriptor write_table_metadata(storage, TableMetadata(table_desc)); + std::fflush(NULL); + sync(); + return succeeded; } @@ -874,13 +888,14 @@ bool parse_and_write_video(storehouse::StorageBackend *storage, // BACKOFF_FAIL(metadata_file->save()); // } // } -} // end anonymous namespace - -Result ingest_videos(storehouse::StorageConfig *storage_config, - const std::string &db_path, - const std::vector &table_names, - const std::vector &paths, - std::vector &failed_videos) { +} // end anonymous namespace + +Result ingest_videos(storehouse::StorageConfig* storage_config, + const std::string& db_path, + const std::vector& table_names, + const std::vector& paths, + bool inplace, + std::vector& failed_videos) { Result result; result.set_success(true); @@ -892,6 +907,7 @@ Result ingest_videos(storehouse::StorageConfig *storage_config, internal::DatabaseMetadata meta = internal::read_database_metadata( storage.get(), internal::DatabaseMetadata::descriptor_path()); + std::vector table_ids; std::set inserted_table_names; for (size_t i = 0; i < table_names.size(); ++i) { @@ -926,11 +942,29 @@ Result ingest_videos(storehouse::StorageConfig *storage_config, i32 end = videos_allocated; 
ingest_threads.emplace_back([&, start, end]() { for (i32 i = start; i < end; ++i) { - if (!internal::parse_and_write_video(storage.get(), table_names[i], - table_ids[i], paths[i], - bad_messages[i])) { - // Did not ingest correctly, skip it - bad_videos[i] = true; + // If inplace, try to run inplace first + bool inplace_succeeded = false; + if (inplace) { + std::string inplace_error_string; + if (internal::parse_video_inplace(storage.get(), table_names[i], + table_ids[i], paths[i], + inplace_error_string)) { + inplace_succeeded = true; + } else { + LOG(WARNING) << "Failed to ingest " << paths[i] + << " inplace: " << inplace_error_string; + std::cerr << "Failed to ingest " << paths[i] + << " inplace: " << inplace_error_string; + } + } + // If inplace failed or not specified, copy + if (!inplace_succeeded) { + if (!internal::parse_and_write_video(storage.get(), table_names[i], + table_ids[i], paths[i], + bad_messages[i])) { + // Did not ingest correctly, skip it + bad_videos[i] = true; + } } } }); @@ -946,6 +980,8 @@ Result ingest_videos(storehouse::StorageConfig *storage_config, LOG(WARNING) << "Failed to ingest video " << paths[i] << "!"; failed_videos.push_back({paths[i], bad_messages[i]}); meta.remove_table(table_ids[i]); + } else { + meta.commit_table(table_ids[i]); } } if (num_bad_videos == table_names.size()) { @@ -959,9 +995,9 @@ Result ingest_videos(storehouse::StorageConfig *storage_config, return result; } -void ingest_images(storehouse::StorageConfig *storage_config, - const std::string &db_path, const std::string &table_name, - const std::vector &paths) { +void ingest_images(storehouse::StorageConfig* storage_config, + const std::string& db_path, const std::string& table_name, + const std::vector& paths) { internal::set_database_path(db_path); std::unique_ptr storage{ diff --git a/scanner/engine/ingest.h b/scanner/engine/ingest.h index 6cc7a58a..8291d5d3 100644 --- a/scanner/engine/ingest.h +++ b/scanner/engine/ingest.h @@ -18,6 +18,7 @@ #include 
"scanner/api/database.h" #include "scanner/util/common.h" +#include "storehouse/storage_backend.h" #include "storehouse/storage_config.h" #include @@ -25,11 +26,12 @@ namespace scanner { namespace internal { -Result ingest_videos(storehouse::StorageConfig *storage_config, - const std::string &db_path, - const std::vector &table_names, - const std::vector &paths, - std::vector &failed_videos); +Result ingest_videos(storehouse::StorageConfig* storage_config, + const std::string& db_path, + const std::vector& table_names, + const std::vector& paths, + bool inplace, + std::vector& failed_videos); // void ingest_images(storehouse::StorageConfig *storage_config, // const std::string &db_path, const std::string &table_name, diff --git a/scanner/engine/kernel_factory.h b/scanner/engine/kernel_factory.h index 8cff18b5..63a6de81 100644 --- a/scanner/engine/kernel_factory.h +++ b/scanner/engine/kernel_factory.h @@ -37,33 +37,41 @@ namespace internal { */ class KernelFactory { public: - KernelFactory(const std::string& op_name, - DeviceType type, i32 max_devices, i32 warmup_size, - KernelConstructor constructor) - : op_name_(op_name), - type_(type), max_devices_(max_devices), warmup_size_(warmup_size), - constructor_(constructor) {} + KernelFactory(const std::string& op_name, DeviceType type, i32 max_devices, + const std::map& input_devices, + const std::map& output_devices, + bool can_batch, i32 batch_size, KernelConstructor constructor) + : op_name_(op_name), + type_(type), + max_devices_(max_devices), + input_devices_(input_devices), + output_devices_(output_devices), + can_batch_(can_batch), + preferred_batch_size_(batch_size), + constructor_(constructor) {} - const std::string& get_op_name() const { - return op_name_; - } + const std::string& get_op_name() const { return op_name_; } /** Describes the capabilities of the ops the factory produces. 
*/ - DeviceType get_device_type() const { - return type_; - } + DeviceType get_device_type() const { return type_; } + + i32 get_max_devices() const { return max_devices_; } - i32 get_max_devices() const { - return max_devices_; + const std::map& get_input_devices() const { + return input_devices_; } - i32 get_warmup_size() const { - return warmup_size_; + const std::map& get_output_devices() const { + return output_devices_; } - /* @brief Constructs a kernel to be used for processing rows of data. + bool can_batch() const { return can_batch_; } + + i32 preferred_batch_size() const { return preferred_batch_size_; } + + /* @brief Constructs a kernel to be used for processing elements of data. */ - Kernel* new_instance(const Kernel::Config& config) { + BaseKernel* new_instance(const KernelConfig& config) { return constructor_(config); } @@ -71,9 +79,11 @@ class KernelFactory { std::string op_name_; DeviceType type_; i32 max_devices_; - i32 warmup_size_; + std::map input_devices_; + std::map output_devices_; + bool can_batch_; + i32 preferred_batch_size_; KernelConstructor constructor_; }; - } } diff --git a/scanner/engine/kernel_registry.cpp b/scanner/engine/kernel_registry.cpp index 85f00dca..896cd133 100644 --- a/scanner/engine/kernel_registry.cpp +++ b/scanner/engine/kernel_registry.cpp @@ -18,28 +18,28 @@ namespace scanner { namespace internal { -void KernelRegistry::add_kernel(const std::string &name, - KernelFactory *factory) { +void KernelRegistry::add_kernel(const std::string& name, + KernelFactory* factory) { DeviceType type = factory->get_device_type(); factories_.insert({factory_name(name, type), factory}); } -bool KernelRegistry::has_kernel(const std::string &name, DeviceType type) { +bool KernelRegistry::has_kernel(const std::string& name, DeviceType type) { return factories_.count(factory_name(name, type)) > 0; } -KernelFactory *KernelRegistry::get_kernel(const std::string &name, +KernelFactory* KernelRegistry::get_kernel(const std::string& name, 
DeviceType type) { return factories_.at(factory_name(name, type)); } -std::string KernelRegistry::factory_name(const std::string &name, +std::string KernelRegistry::factory_name(const std::string& name, DeviceType type) { return name + ((type == DeviceType::CPU) ? "_cpu" : "_gpu"); } -KernelRegistry *get_kernel_registry() { - static KernelRegistry *registry = new KernelRegistry; +KernelRegistry* get_kernel_registry() { + static KernelRegistry* registry = new KernelRegistry; return registry; } } diff --git a/scanner/engine/kernel_registry.h b/scanner/engine/kernel_registry.h index 3d638962..ae187d51 100644 --- a/scanner/engine/kernel_registry.h +++ b/scanner/engine/kernel_registry.h @@ -27,21 +27,19 @@ namespace internal { class KernelRegistry { public: - void add_kernel(const std::string &name, KernelFactory *factory); + void add_kernel(const std::string& name, KernelFactory* factory); - bool has_kernel(const std::string &name, DeviceType device_type); + bool has_kernel(const std::string& name, DeviceType device_type); - KernelFactory *get_kernel(const std::string &name, DeviceType device_type); + KernelFactory* get_kernel(const std::string& name, DeviceType device_type); protected: - static std::string factory_name(const std::string &name, DeviceType type); + static std::string factory_name(const std::string& name, DeviceType type); private: - std::map factories_; }; KernelRegistry* get_kernel_registry(); - } } diff --git a/scanner/engine/load_worker.cpp b/scanner/engine/load_worker.cpp index bd495b0a..00a0184b 100644 --- a/scanner/engine/load_worker.cpp +++ b/scanner/engine/load_worker.cpp @@ -14,7 +14,6 @@ */ #include "scanner/engine/load_worker.h" -#include "scanner/engine/sampling.h" #include "storehouse/storage_backend.h" @@ -27,9 +26,145 @@ using storehouse::RandomReadFile; namespace scanner { namespace internal { namespace { -std::tuple -find_keyframe_indices(i32 start_frame, i32 end_frame, - const std::vector &keyframe_positions) { + +struct RowIntervals { 
+ std::vector item_ids; + std::vector item_start_offsets; + std::vector> item_intervals; + std::vector> valid_offsets; +}; + +struct VideoIntervals { + std::vector> keyframe_index_intervals; + std::vector> valid_frames; +}; + +// Gets the list of work items for a sequence of rows in the job +RowIntervals slice_into_row_intervals(const TableMetadata& table, + const std::vector& rows) { + RowIntervals info; + // Analyze rows and table to determine what item ids and offsets in them to + // sample from + std::vector end_rows = table.end_rows(); + auto item_from_row = [&end_rows](i64 r) -> i32 { + i64 i = 0; + for (; i < end_rows.size(); ++i) { + if (r < end_rows[i]) { + break; + } + } + assert(i != end_rows.size()); + return i; + }; + + auto offset_from_row = [&end_rows](i64 r) -> i64 { + i64 i = 0; + i64 last_end_row = 0; + for (; i < end_rows.size(); ++i) { + if (r < end_rows[i]) { + break; + } + last_end_row = end_rows[i]; + } + assert(i != end_rows.size()); + return r - last_end_row; + }; + + assert(!rows.empty()); + i32 current_item = item_from_row(rows[0]); + i64 item_start = offset_from_row(rows[0]); + i64 item_end = item_start + 1; + i64 prev_row = -1; + std::vector valid_offsets; + for (i64 row : rows) { + i32 item = item_from_row(row); + i64 item_offset = offset_from_row(row); + // We check two cases: + // 1. if the row is in a new item, then we have found all the consecutive + // increasing rows that will be in this item and we should move on + // to the next one. + // 2. if the row we are asking for is the same as the existing row or + // before it, we end the current item and start back with the item + // for this new row, even if the item is the same as the current item. + // NOTE(apoms): We could fuse these together and only load the item + // once, but to do so requires reordering the data after it is read + // from disk to match the ordering requested. 
+ if (item != current_item || row <= prev_row) { + // Start a new item and push the current one into the list + info.item_ids.push_back(current_item); + info.item_start_offsets.push_back(current_item == 0 ? 0 : + end_rows[current_item - 1]); + info.item_intervals.push_back(std::make_tuple(item_start, item_end)); + info.valid_offsets.push_back(valid_offsets); + + current_item = item; + item_start = item_offset; + item_end = item_offset + 1; + valid_offsets.clear(); + } + + valid_offsets.push_back(item_offset); + item_end = item_offset + 1; + prev_row = row; + } + info.item_ids.push_back(current_item); + info.item_start_offsets.push_back( + current_item == 0 ? 0 : end_rows[current_item - 1]); + info.item_intervals.push_back(std::make_tuple(item_start, item_end)); + info.valid_offsets.push_back(valid_offsets); + + return info; +} + +VideoIntervals slice_into_video_intervals( + const std::vector& keyframe_positions, + const std::vector& sample_offsets, + const std::vector& sample_sizes, + const std::vector& rows) { + VideoIntervals info; + assert(keyframe_positions.size() >= 2); + size_t start_keyframe_index = 0; + size_t end_keyframe_index = 1; + i64 next_keyframe = keyframe_positions[end_keyframe_index]; + std::vector valid_frames; + for (i64 row : rows) { + if (row >= next_keyframe) { + // Check if this keyframe is adjacent + uint64_t last_endpoint = sample_offsets.at(next_keyframe - 1) + + sample_sizes.at(next_keyframe - 1); + bool is_adjacent = + (last_endpoint == sample_offsets.at(next_keyframe)); + + assert(end_keyframe_index < keyframe_positions.size() - 1); + next_keyframe = keyframe_positions[++end_keyframe_index]; + if (row >= next_keyframe) { + // Skipped a keyframe, so make a new interval + if (!valid_frames.empty()) { + info.keyframe_index_intervals.push_back( + std::make_tuple(start_keyframe_index, end_keyframe_index - 1)); + info.valid_frames.push_back(valid_frames); + } + + while (row >= keyframe_positions[end_keyframe_index]) { + end_keyframe_index++; 
+ assert(end_keyframe_index < keyframe_positions.size()); + } + valid_frames.clear(); + start_keyframe_index = end_keyframe_index - 1; + next_keyframe = keyframe_positions[end_keyframe_index]; + } + } + valid_frames.push_back(row); + } + info.keyframe_index_intervals.push_back( + std::make_tuple(start_keyframe_index, end_keyframe_index)); + info.valid_frames.push_back(valid_frames); + return info; +} + +std::tuple find_keyframe_indices( + i32 start_frame, i32 end_frame, + const std::vector& keyframe_positions) { size_t start_keyframe_index = std::numeric_limits::max(); for (size_t i = 1; i < keyframe_positions.size(); ++i) { if (keyframe_positions[i] > start_frame) { @@ -49,55 +184,177 @@ find_keyframe_indices(i32 start_frame, i32 end_frame, assert(end_keyframe_index != 0); return std::make_tuple(start_keyframe_index, end_keyframe_index); } +} -struct VideoIndexEntry { - i32 width; - i32 height; - std::unique_ptr file; - u64 file_size; - std::vector keyframe_positions; - std::vector keyframe_byte_offsets; -}; +LoadWorker::LoadWorker(const LoadWorkerArgs& args) + : node_id_(args.node_id), + worker_id_(args.worker_id), + profiler_(args.profiler), + load_sparsity_threshold_(args.load_sparsity_threshold), + io_packet_size_(args.io_packet_size), + work_packet_size_(args.work_packet_size) { + storage_.reset( + storehouse::StorageBackend::make_from_config(args.storage_config)); + meta_ = read_database_metadata(storage_.get(), + DatabaseMetadata::descriptor_path()); + table_metadata_.reset(new TableMetaCache(storage_.get(), meta_)); +} + +void LoadWorker::feed(LoadWorkEntry& input_entry) { + LoadWorkEntry& load_work_entry = input_entry; + + if (load_work_entry.table_id() != last_table_id_) { + // Not from the same task so clear cached data + last_table_id_ = load_work_entry.table_id(); + index_.clear(); + } + + entry_ = input_entry; + current_row_ = 0; + total_rows_ = 0; + for (auto& sample : load_work_entry.samples()) { + total_rows_ = 
std::max((i64)sample.input_row_ids_size(), total_rows_); + } +} + +bool LoadWorker::yield(i32 item_size, + EvalWorkEntry& output_entry) { + LoadWorkEntry& load_work_entry = entry_; + + // Ignoring item size for now and just yielding one IO item at a time + if (current_row_ >= total_rows_) { + return false; + } + + EvalWorkEntry eval_work_entry; + eval_work_entry.table_id = load_work_entry.table_id(); + eval_work_entry.job_index = load_work_entry.job_index(); + eval_work_entry.task_index = load_work_entry.task_index(); + + const auto& samples = load_work_entry.samples(); + assert(!samples.empty()); + + // Aggregate all sample columns so we know the tuple size + i32 num_columns = samples.size(); + eval_work_entry.columns.resize(num_columns); + + // For each sample, insert the row ids and read the rows from disk + // NOTE(apoms): if the requested rows are different for each column, + // some of the output work entries will have an uneven number of rows + i32 media_col_idx = 0; + i32 out_col_idx = 0; + for (const proto::LoadSample& sample : samples) { + i32 table_id = sample.table_id(); + const TableMetadata& table_meta = table_metadata_->at(table_id); + + i64 total_rows = sample.input_row_ids_size(); + i64 row_start = current_row_; + i64 row_end = std::min(current_row_ + item_size, total_rows); + + const auto& sample_rows = sample.input_row_ids(); + const auto& output_row_ids = sample.output_row_ids(); + std::vector rows(sample_rows.begin() + row_start, + sample_rows.begin() + row_end); + std::vector output_rows(output_row_ids.begin() + row_start, + output_row_ids.begin() + row_end); + eval_work_entry.row_ids.push_back(output_rows); + + RowIntervals intervals = slice_into_row_intervals(table_meta, rows); + size_t num_items = intervals.item_ids.size(); + i32 col_id = sample.column_id(); + + ColumnType column_type = ColumnType::Other; + if (table_meta.column_type(col_id) == ColumnType::Video) { + column_type = ColumnType::Video; + // video frame column + FrameInfo info; 
+ proto::VideoDescriptor::VideoCodecType encoding_type; + bool inplace = false; + for (size_t i = 0; i < num_items; ++i) { + i32 item_id = intervals.item_ids[i]; + i64 item_start_row = intervals.item_start_offsets[i]; + const std::vector& valid_offsets = intervals.valid_offsets[i]; + + auto key = std::make_tuple(table_id, col_id, item_id); + if (index_.count(key) == 0) { + index_[key] = + read_video_index(storage_.get(), table_id, col_id, item_id); + } + const VideoIndexEntry& entry = index_.at(key); + inplace = entry.inplace; + info = FrameInfo(entry.height, entry.width, entry.channels, + entry.frame_type); + encoding_type = entry.codec_type; + if (entry.codec_type == proto::VideoDescriptor::H264) { + // Video was encoded using h264 + read_video_column(profiler_, entry, valid_offsets, item_start_row, + eval_work_entry.columns[out_col_idx]); + } else { + // Video was encoded as individual images + i32 item_id = intervals.item_ids[i]; + i64 item_start; + i64 item_end; + std::tie(item_start, item_end) = intervals.item_intervals[i]; + + read_other_column(table_id, col_id, item_id, item_start, item_end, + valid_offsets, + eval_work_entry.columns[out_col_idx]); + } + } + if (num_items == 0) { + eval_work_entry.frame_sizes.emplace_back(); + eval_work_entry.video_encoding_type.emplace_back(); + eval_work_entry.inplace_video.push_back(false); + } else { + eval_work_entry.frame_sizes.push_back(info); + eval_work_entry.video_encoding_type.push_back(encoding_type); + eval_work_entry.inplace_video.push_back(inplace); + } + media_col_idx++; + } else { + // regular column + for (size_t i = 0; i < num_items; ++i) { + i32 item_id = intervals.item_ids[i]; + i64 item_start; + i64 item_end; + std::tie(item_start, item_end) = intervals.item_intervals[i]; + const std::vector& valid_offsets = intervals.valid_offsets[i]; + + read_other_column(table_id, col_id, item_id, item_start, item_end, + valid_offsets, eval_work_entry.columns[out_col_idx]); + } + 
eval_work_entry.inplace_video.push_back(false); + } + eval_work_entry.column_types.push_back(column_type); + eval_work_entry.column_handles.push_back(CPU_DEVICE); + out_col_idx++; + } + + output_entry = eval_work_entry; + + current_row_ += item_size; -VideoIndexEntry read_video_index(storehouse::StorageBackend *storage, - i32 table_id, i32 column_id, i32 item_id) { - VideoIndexEntry index_entry; - VideoMetadata video_meta = read_video_metadata( - storage, VideoMetadata::descriptor_path(table_id, column_id, item_id)); - - // Open the video file for reading - index_entry.width = video_meta.width(); - index_entry.height = video_meta.height(); - BACKOFF_FAIL(storehouse::make_unique_random_read_file( - storage, table_item_output_path(table_id, column_id, item_id), - index_entry.file)); - BACKOFF_FAIL(index_entry.file->get_size(index_entry.file_size)); - index_entry.keyframe_positions = video_meta.keyframe_positions(); - index_entry.keyframe_byte_offsets = video_meta.keyframe_byte_offsets(); - // Place total frames at the end of keyframe positions and total file size - // at the end of byte offsets to make interval calculation not need to - // deal with edge cases surrounding those - index_entry.keyframe_positions.push_back(video_meta.frames()); - index_entry.keyframe_byte_offsets.push_back(index_entry.file_size); - - return index_entry; + return true; } -void read_video_column(Profiler &profiler, VideoIndexEntry &index_entry, - const std::vector &rows, RowList &row_list) { - RandomReadFile *video_file = index_entry.file.get(); +bool LoadWorker::done() { return current_row_ >= total_rows_; } + +void read_video_column(Profiler& profiler, const VideoIndexEntry& index_entry, + const std::vector& rows, i64 start_frame, + ElementList& element_list) { + std::unique_ptr video_file = index_entry.open_file(); u64 file_size = index_entry.file_size; - const std::vector &keyframe_positions = index_entry.keyframe_positions; - const std::vector &keyframe_byte_offsets = - 
index_entry.keyframe_byte_offsets; + const std::vector& keyframe_indices = index_entry.keyframe_indices; + const std::vector& sample_offsets = index_entry.sample_offsets; + const std::vector& sample_sizes = index_entry.sample_sizes; // Read the bytes from the file that correspond to the sequences of // frames we are interested in decoding. This sequence will contain - // the bytes starting at the iframe at or preceding the first frame + // the bytes starting at the first iframe at or preceding the first frame // we are interested and will continue up to the bytes before the - // iframe at or after the last frame we are interested in. - VideoIntervals intervals = - slice_into_video_intervals(keyframe_positions, rows); + // first iframe at or after the last frame we are interested in. + VideoIntervals intervals = slice_into_video_intervals( + keyframe_indices, sample_offsets, sample_sizes, rows); size_t num_intervals = intervals.keyframe_index_intervals.size(); for (size_t i = 0; i < num_intervals; ++i) { size_t start_keyframe_index; @@ -105,31 +362,38 @@ void read_video_column(Profiler &profiler, VideoIndexEntry &index_entry, std::tie(start_keyframe_index, end_keyframe_index) = intervals.keyframe_index_intervals[i]; + i64 start_keyframe = keyframe_indices[start_keyframe_index]; + i64 end_keyframe = keyframe_indices[end_keyframe_index]; + u64 start_keyframe_byte_offset = - static_cast(keyframe_byte_offsets[start_keyframe_index]); + static_cast(sample_offsets[start_keyframe]); u64 end_keyframe_byte_offset = - static_cast(keyframe_byte_offsets[end_keyframe_index]); + static_cast(sample_offsets[end_keyframe]); - i64 start_keyframe = keyframe_positions[start_keyframe_index]; - i64 end_keyframe = keyframe_positions[end_keyframe_index]; std::vector all_keyframes; - for (size_t i = start_keyframe_index; i < end_keyframe_index + 1; ++i) { - all_keyframes.push_back(keyframe_positions[i]); + std::vector all_keyframe_indices; + for (size_t i = start_keyframe_index; i <= 
end_keyframe_index; ++i) { + all_keyframes.push_back(keyframe_indices[i]); + all_keyframe_indices.push_back(keyframe_indices[i] - keyframe_indices[0]); } - std::vector all_keyframes_byte_offsets; - for (size_t i = start_keyframe_index; i < end_keyframe_index + 1; ++i) { - all_keyframes_byte_offsets.push_back(keyframe_byte_offsets[i] - - start_keyframe_byte_offset); + std::vector all_offsets; + std::vector all_sizes; + for (size_t i = start_keyframe; i <= end_keyframe; ++i) { + all_offsets.push_back(sample_offsets[i] - start_keyframe_byte_offset); + all_sizes.push_back(sample_sizes[i]); } size_t buffer_size = end_keyframe_byte_offset - start_keyframe_byte_offset; - u8 *buffer = new_buffer(CPU_DEVICE, buffer_size); + u8* buffer = new_buffer(CPU_DEVICE, buffer_size); auto io_start = now(); u64 pos = start_keyframe_byte_offset; - s_read(video_file, buffer, buffer_size, pos); + size_t size_read; + storehouse::StoreResult r = + video_file->read(pos, buffer_size, buffer, size_read); + //s_read(video_file.get(), buffer, buffer_size, pos); profiler.add_interval("io", io_start, now()); profiler.increment("io_read", static_cast(buffer_size)); @@ -137,235 +401,137 @@ void read_video_column(Profiler &profiler, VideoIndexEntry &index_entry, proto::DecodeArgs decode_args; decode_args.set_width(index_entry.width); decode_args.set_height(index_entry.height); - decode_args.set_start_keyframe(keyframe_positions[start_keyframe_index]); - decode_args.set_end_keyframe(keyframe_positions[end_keyframe_index]); + // We add the start frame of this item to all frames since the decoder + // works in terms of absolute frame numbers, instead of item relative + // frame numbers + decode_args.set_start_keyframe(keyframe_indices[start_keyframe_index] + + start_frame); + decode_args.set_end_keyframe(keyframe_indices[end_keyframe_index] + + start_frame); for (i64 k : all_keyframes) { - decode_args.add_keyframes(k); + decode_args.add_keyframes(k + start_frame); + } + for (i64 k : 
all_keyframe_indices) { + decode_args.add_keyframe_indices(k); } - for (i64 k : all_keyframes_byte_offsets) { - decode_args.add_keyframe_byte_offsets(k); + for (u64 k : all_offsets) { + decode_args.add_sample_offsets(k); + } + for (u64 k : all_sizes) { + decode_args.add_sample_sizes(k); } for (size_t j = 0; j < intervals.valid_frames[i].size(); ++j) { - decode_args.add_valid_frames(intervals.valid_frames[i][j]); + decode_args.add_valid_frames(intervals.valid_frames[i][j] + start_frame); } - decode_args.set_encoded_video(buffer, buffer_size); + decode_args.set_encoded_video((i64)buffer); + decode_args.set_encoded_video_size(buffer_size); + decode_args.set_metadata(index_entry.metadata.data(), + index_entry.metadata.size()); - size_t size = decode_args.ByteSize(); - u8 *decode_args_buffer = new_buffer(CPU_DEVICE, size); + size_t size = decode_args.ByteSizeLong(); + u8* decode_args_buffer = new_buffer(CPU_DEVICE, size); bool result = decode_args.SerializeToArray(decode_args_buffer, size); assert(result); - INSERT_ROW(row_list, decode_args_buffer, size); - - delete_buffer(CPU_DEVICE, buffer); + insert_element(element_list, decode_args_buffer, size); } } -void read_other_column(storehouse::StorageBackend *storage, i32 table_id, - i32 column_id, i32 item_id, i32 item_start, i32 item_end, - const std::vector &rows, RowList &row_list) { - const std::vector &valid_offsets = rows; +void LoadWorker::read_other_column(i32 table_id, i32 column_id, i32 item_id, + i32 item_start, i32 item_end, + const std::vector& rows, + ElementList& element_list) { + const std::vector& valid_offsets = rows; + + // Read metadata file to determine num rows and sizes + u64 num_elements = 0; + std::vector element_sizes; + { + std::unique_ptr file; + StoreResult result; + BACKOFF_FAIL(make_unique_random_read_file( + storage_.get(), table_item_metadata_path(table_id, column_id, item_id), + file)); + + u64 file_size = 0; + BACKOFF_FAIL(file->get_size(file_size)); + + // Read number of elements in file 
+ u64 pos = 0; + while (pos < file_size) { + u64 elements = s_read(file.get(), pos); + + // Read element sizes from work item file header + size_t prev_size = element_sizes.size(); + element_sizes.resize(prev_size + elements); + s_read(file.get(), + reinterpret_cast(element_sizes.data() + prev_size), + elements * sizeof(i64), pos); + + num_elements += elements; + } + assert(pos == file_size); + } std::unique_ptr file; StoreResult result; BACKOFF_FAIL(make_unique_random_read_file( - storage, table_item_output_path(table_id, column_id, item_id), file)); + storage_.get(), table_item_output_path(table_id, column_id, item_id), + file)); u64 file_size = 0; BACKOFF_FAIL(file->get_size(file_size)); - // Read number of rows in file u64 pos = 0; - u64 num_rows = s_read(file.get(), pos); - - // Read row sizes from work item file header - std::vector row_sizes(num_rows); - s_read(file.get(), reinterpret_cast(row_sizes.data()), - row_sizes.size() * sizeof(i64), pos); - - // Determine start and end position of rows to read in file + // Determine start and end position of elements to read in file u64 start_offset = 0; + assert(item_start <= element_sizes.size()); for (i64 i = 0; i < item_start; ++i) { - start_offset += row_sizes[i]; + start_offset += element_sizes[i]; } u64 end_offset = start_offset; + assert(item_end <= element_sizes.size()); for (i64 i = item_start; i < item_end; ++i) { - end_offset += row_sizes[i]; - } - u64 row_data_size = end_offset - start_offset; - std::vector row_data(row_data_size); - - // Read chunk of file corresponding to requested rows - pos += start_offset; - s_read(file.get(), row_data.data(), row_data.size(), pos); - - // Extract individual rows and insert into output work entry - u64 offset = 0; - size_t valid_idx = 0; - for (i32 i = item_start; i < item_end; ++i) { - size_t buffer_size = static_cast(row_sizes[i]); - if (i == valid_offsets[valid_idx]) { - u8 *buffer = new_buffer(CPU_DEVICE, buffer_size); - memcpy(buffer, row_data.data() + offset, 
buffer_size); - INSERT_ROW(row_list, buffer, buffer_size); - valid_idx++; - } - offset += buffer_size; + end_offset += element_sizes[i]; } - assert(valid_idx == valid_offsets.size()); -} -} - -void *load_thread(void *arg) { - LoadThreadArgs &args = *reinterpret_cast(arg); - - auto setup_start = now(); - - const i32 work_item_size = args.job_params->work_item_size(); - - // Setup a distinct storage backend for each IO thread - storehouse::StorageBackend *storage = - storehouse::StorageBackend::make_from_config(args.storage_config); - - // Caching table metadata - std::map table_metadata; - - // To ammortize opening files - i32 last_table_id = -1; - std::vector index; - args.profiler.add_interval("setup", setup_start, now()); - while (true) { - auto idle_start = now(); - - std::tuple entry; - args.load_work.pop(entry); - IOItem& io_item = std::get<0>(entry); - LoadWorkEntry& load_work_entry = std::get<1>(entry); - - if (load_work_entry.io_item_index() == -1) { - break; - } - - VLOG(1) << "Load (N/PU: " << args.node_id << "/" << args.id - << "): processing item " << load_work_entry.io_item_index(); - - args.profiler.add_interval("idle", idle_start, now()); - - auto work_start = now(); - - const auto &samples = load_work_entry.samples(); - - if (io_item.table_id() != last_table_id) { - // Not from the same task so clear cached data - last_table_id = io_item.table_id(); - index.clear(); - } - - EvalWorkEntry eval_work_entry; - eval_work_entry.io_item_index = load_work_entry.io_item_index(); - - // Aggregate all sample columns so we know the tuple size - assert(!samples.empty()); - eval_work_entry.warmup_rows = samples.Get(0).warmup_rows_size(); - - i32 num_columns = 0; - for (size_t i = 0; i < samples.size(); ++i) { - num_columns += samples.Get(i).column_ids_size(); - } - eval_work_entry.columns.resize(num_columns); - - i32 media_col_idx = 0; - i32 out_col_idx = 0; - for (const proto::LoadSample &sample : samples) { - i32 table_id = sample.table_id(); - auto it = 
table_metadata.find(table_id); - if (it == table_metadata.end()) { - table_metadata[table_id] = read_table_metadata( - storage, TableMetadata::descriptor_path(table_id)); - it = table_metadata.find(table_id); + // If the requested elements are sufficiently sparse by some threshold, we + // read each element individually. Otherwise, we read the entire block and + // copy out only the necessary elements. + if ((item_end - item_start) / rows.size() >= load_sparsity_threshold_) { + for (i32 row : rows) { + size_t buffer_size = static_cast(element_sizes[row]); + u8* buffer = new_buffer(CPU_DEVICE, buffer_size); + u64 row_offset = pos + start_offset; + for (i32 i = item_start; i < row; ++i) { + row_offset += element_sizes[i]; } - const TableMetadata &table_meta = it->second; - - const google::protobuf::RepeatedField &sample_warmup_rows = - sample.warmup_rows(); - const google::protobuf::RepeatedField &sample_rows = sample.rows(); - std::vector rows(sample_warmup_rows.begin(), - sample_warmup_rows.end()); - rows.insert(rows.end(), sample_rows.begin(), sample_rows.end()); - RowIntervals intervals = slice_into_row_intervals(table_meta, rows); - size_t num_items = intervals.item_ids.size(); - for (i32 col_id : sample.column_ids()) { - ColumnType column_type = ColumnType::Other; - if (table_meta.column_type(col_id) == ColumnType::Video) { - column_type = ColumnType::Video; - // video frame column - for (size_t i = 0; i < num_items; ++i) { - i32 item_id = intervals.item_ids[i]; - const std::vector &valid_offsets = intervals.valid_offsets[i]; - - // TODO(apoms): cache this so we avoid the IO and recompute for each - // request - VideoIndexEntry entry = - read_video_index(storage, table_id, col_id, item_id); - read_video_column(args.profiler, entry, valid_offsets, - eval_work_entry.columns[out_col_idx]); - } - media_col_idx++; - } else if (col_id > 0 && - // Convention is that frame info column is immediately - // after frame column - table_meta.column_type(col_id - 1) == 
ColumnType::Video) { - // video meta column - VideoIndexEntry entry = - read_video_index(storage, table_id, col_id - 1, 0); - proto::FrameInfo frame_info; - frame_info.set_width(entry.width); - frame_info.set_height(entry.height); - - size_t frame_info_size = frame_info.ByteSize(); - for (size_t i = 0; i < num_items; ++i) { - size_t total_rows = intervals.valid_offsets[i].size(); - u8 *buffer = new_block_buffer( - CPU_DEVICE, frame_info_size * total_rows, total_rows); - for (size_t j = 0; j < intervals.valid_offsets[i].size(); ++j) { - u8 *b = buffer + frame_info_size * j; - frame_info.SerializeToArray(b, frame_info_size); - INSERT_ROW(eval_work_entry.columns[out_col_idx], b, - frame_info_size); - } - } - } else { - // regular column - for (size_t i = 0; i < num_items; ++i) { - i32 item_id = intervals.item_ids[i]; - i64 item_start; - i64 item_end; - std::tie(item_start, item_end) = intervals.item_intervals[i]; - const std::vector &valid_offsets = intervals.valid_offsets[i]; - - read_other_column(storage, table_id, col_id, item_id, item_start, - item_end, valid_offsets, - eval_work_entry.columns[out_col_idx]); - } - } - eval_work_entry.column_types.push_back(column_type); - eval_work_entry.column_handles.push_back(CPU_DEVICE); - out_col_idx++; + s_read(file.get(), buffer, buffer_size, row_offset); + insert_element(element_list, buffer, buffer_size); + } + } else { + pos += start_offset; + + u64 element_data_size = end_offset - start_offset; + std::vector element_data(element_data_size); + + // Read chunk of file corresponding to requested elements + s_read(file.get(), element_data.data(), element_data.size(), pos); + + // Extract individual elements and insert into output work entry + u64 offset = 0; + size_t valid_idx = 0; + for (i32 i = item_start; i < item_end; ++i) { + size_t buffer_size = static_cast(element_sizes[i]); + if (i == valid_offsets[valid_idx]) { + u8* buffer = new_buffer(CPU_DEVICE, buffer_size); + memcpy(buffer, element_data.data() + offset, 
buffer_size); + insert_element(element_list, buffer, buffer_size); + valid_idx++; } + offset += buffer_size; } - - args.profiler.add_interval("task", work_start, now()); - - args.eval_work.push(std::make_tuple(io_item, eval_work_entry)); + assert(valid_idx == valid_offsets.size()); } - - VLOG(1) << "Load (N/PU: " << args.node_id << "/" << args.id - << "): thread finished"; - - // Cleanup - delete storage; - - THREAD_RETURN_SUCCESS(); } } } diff --git a/scanner/engine/load_worker.h b/scanner/engine/load_worker.h index 91d2b8b0..12c81be4 100644 --- a/scanner/engine/load_worker.h +++ b/scanner/engine/load_worker.h @@ -16,28 +16,68 @@ #pragma once #include "scanner/engine/runtime.h" +#include "scanner/engine/video_index_entry.h" +#include "scanner/engine/table_meta_cache.h" #include "scanner/util/common.h" #include "scanner/util/queue.h" namespace scanner { namespace internal { -struct LoadThreadArgs { +struct LoadWorkerArgs { // Uniform arguments i32 node_id; - const proto::JobParameters* job_params; - // Per worker arguments - int id; + int worker_id; storehouse::StorageConfig* storage_config; Profiler& profiler; - - // Queues for communicating work - Queue>& load_work; // in - Queue>& eval_work; // out + i32 load_sparsity_threshold; + i32 io_packet_size; + i32 work_packet_size; }; -void* load_thread(void* arg); +class LoadWorker { + public: + LoadWorker(const LoadWorkerArgs& args); + + void feed(LoadWorkEntry& input_entry); + + bool yield(i32 item_size, EvalWorkEntry& output_entry); + + bool done(); + + private: + void read_other_column(i32 table_id, i32 column_id, i32 item_id, + i32 item_start, i32 item_end, + const std::vector& rows, + ElementList& element_list); + const i32 node_id_; + const i32 worker_id_; + Profiler& profiler_; + // Setup a distinct storage backend for each IO thread + std::unique_ptr storage_; + // Caching table metadata + DatabaseMetadata meta_; + std::unique_ptr table_metadata_; + // To ammortize opening files + i32 last_table_id_ = -1; + 
std::map, VideoIndexEntry> index_; + i32 load_sparsity_threshold_; + i32 io_packet_size_; + i32 work_packet_size_; + + // Continuation state + bool first_item_; + bool needs_configure_; + bool needs_reset_; + LoadWorkEntry entry_; + i64 current_row_; + i64 total_rows_; +}; +void read_video_column(Profiler& profiler, + const VideoIndexEntry& index_entry, + const std::vector& rows, i64 start_offset, + ElementList& element_list); } } diff --git a/scanner/engine/master.cpp b/scanner/engine/master.cpp index 9bd61f04..5f79d850 100644 --- a/scanner/engine/master.cpp +++ b/scanner/engine/master.cpp @@ -13,514 +13,1590 @@ * limitations under the License. */ -#include "scanner/engine/runtime.h" +#include "scanner/engine/master.h" #include "scanner/engine/ingest.h" #include "scanner/engine/sampler.h" -#include "scanner/util/progress_bar.h" -#include +#include "scanner/engine/dag_analysis.h" +#include "scanner/util/cuda.h" +#include "scanner/util/util.h" +#include "scanner/util/glog.h" +#include "scanner/util/grpc.h" +#include "scanner/engine/python_kernel.h" +#include "scanner/util/thread_pool.h" +#include +#include #include + namespace scanner { namespace internal { -namespace { -void validate_task_set(DatabaseMetadata &meta, const proto::TaskSet &task_set, - Result *result) { - auto &tasks = task_set.tasks(); - // Validate tasks - std::set task_output_table_names; - for (auto &task : task_set.tasks()) { - if (task.output_table_name() == "") { - LOG(WARNING) << "Task specified with empty output table name. Output " - "tables can not have empty names"; - result->set_success(false); - } - if (meta.has_table(task.output_table_name())) { - LOG(WARNING) << "Task specified with duplicate output table name. 
" - << "A table with name " << task.output_table_name() << " " - << "already exists."; - result->set_success(false); - } - if (task_output_table_names.count(task.output_table_name()) > 0) { - LOG(WARNING) << "Mulitple tasks specified with output table name " - << task.output_table_name() - << ". Table names must be unique."; - result->set_success(false); - } - task_output_table_names.insert(task.output_table_name()); - if (task.samples().size() == 0) { - LOG(WARNING) << "Task " << task.output_table_name() << " did not " - << "specify any tables to sample from. Tasks must sample " - << "from at least one table."; - result->set_success(false); + +MasterImpl::MasterImpl(DatabaseParameters& params) + : watchdog_awake_(true), db_params_(params) { + VLOG(1) << "Creating master..."; + + init_glog("scanner_master"); + storage_ = + storehouse::StorageBackend::make_from_config(db_params_.storage_config); + set_database_path(params.db_path); + + // Perform database consistency checks on startup + recover_and_init_database(); + + start_job_processor(); + VLOG(1) << "Master created."; +} + +MasterImpl::~MasterImpl() { + trigger_shutdown_.set(); + { + std::unique_lock lock(finished_mutex_); + finished_ = true; + } + finished_cv_.notify_all(); + + { + std::unique_lock lk(work_mutex_); + } + + stop_job_processor(); + + stop_worker_pinger(); + if (watchdog_thread_.joinable()) { + watchdog_thread_.join(); + } + delete storage_; +} + +// Expects context->peer() to return a string in the format +// ipv4:: +// Returns the from the above format. 
+std::string MasterImpl::get_worker_address_from_grpc_context( + grpc::ServerContext* context) { + std::string worker_address = context->peer(); + std::size_t portSep = worker_address.find_last_of(':'); + if (portSep == std::string::npos) { + } + std::string worker_address_base = worker_address.substr(0, portSep); + + portSep = worker_address_base.find_first_of(':'); + if (portSep == std::string::npos) { + } + + std::string worker_address_actual = worker_address_base.substr(portSep + 1); + + return worker_address_actual; +} + +grpc::Status MasterImpl::ListTables(grpc::ServerContext* context, + const proto::Empty* empty, + proto::ListTablesResult* result) { + std::unique_lock lk(work_mutex_); + + for (const auto& table_name : meta_.table_names()) { + result->add_tables(table_name); + } + + return grpc::Status::OK; +} + +grpc::Status MasterImpl::GetTables(grpc::ServerContext* context, + const proto::GetTablesParams* params, + proto::GetTablesResult* result) { + std::unique_lock lk(work_mutex_); + result->mutable_result()->set_success(true); + + std::vector table_names; + for (const auto& table_name : params->tables()) { + table_names.push_back(table_name); + } + table_metas_->prefetch(table_names); + + std::vector video_descriptors; + for (const auto& table_name : params->tables()) { + video_descriptors.push_back(result->add_videos()); + } + + VLOG(1) << "Prefetching video metadata"; + auto load_video_meta = [&](i32 i) { + const std::string& table_name = params->tables(i); + const TableMetadata& table_meta = table_metas_->at(table_name); + proto::VideoDescriptor* desc_dst = video_descriptors[i]; + if (table_meta.columns().size() == 2 && table_meta.column_type(1) == ColumnType::Video) { + VideoMetadata video_meta = read_video_metadata( + storage_, VideoMetadata::descriptor_path(table_meta.id(), 1, 0)); + proto::VideoDescriptor& desc = video_meta.get_descriptor(); + desc.clear_sample_offsets(); + desc.clear_sample_sizes(); + desc.clear_keyframe_indices(); + 
desc.clear_frames_per_video(); + desc.clear_keyframes_per_video(); + desc.clear_size_per_video(); + desc_dst->CopyFrom(desc); } else { - for (auto &sample : task.samples()) { - if (!meta.has_table(sample.table_name())) { - LOG(WARNING) << "Task " << task.output_table_name() << " tried to " - << "sample from non-existent table " - << sample.table_name() - << ". TableSample must sample from existing table."; - result->set_success(false); - } - // TODO(apoms): validate sampler functions - if (sample.column_names().size() == 0) { - LOG(WARNING) << "Task" << task.output_table_name() << " tried to " - << "sample zero columns from table " - << sample.table_name() - << ". TableSample must sample at least one column"; - result->set_success(false); - } - } + desc_dst->set_table_id(-1); } + }; + + ThreadPool prefetch_pool(64); + std::vector> futures; + for (i32 i = 0; i < params->tables().size(); ++i) { + futures.emplace_back(prefetch_pool.enqueue(load_video_meta, i)); } - // Validate ops - { - OpRegistry *op_registry = get_op_registry(); - KernelRegistry *kernel_registry = get_kernel_registry(); - - i32 op_idx = 0; - std::vector op_names; - std::vector> op_outputs; - for (auto &op : task_set.ops()) { - op_names.push_back(op.name()); - if (op_idx == 0) { - if (op.name() != "InputTable") { - RESULT_ERROR(result, "First Op is %s but must be Op InputTable", - op.name().c_str()); - break; - } - op_outputs.emplace_back(); - for (auto &input : op.inputs()) { - for (auto &col : input.columns()) { - op_outputs.back().push_back(col); - } - } - op_idx++; - continue; - } - if (op.name() != "OutputTable") { - op_outputs.emplace_back(); - if (!op_registry->has_op(op.name())) { - RESULT_ERROR(result, "Op %s is not registered.", op.name().c_str()); - } else { - op_outputs.back() = - op_registry->get_op_info(op.name())->output_columns(); - } - if (!kernel_registry->has_kernel(op.name(), op.device_type())) { - RESULT_ERROR(result, - "Op %s at index %d requested kernel with device type " - "%s 
but no such kernel exists.", - op.name().c_str(), op_idx, - (op.device_type() == DeviceType::CPU ? "CPU" : "GPU")); - } - } - for (auto &input : op.inputs()) { - if (input.op_index() >= op_idx) { - RESULT_ERROR(result, - "Op %s at index %d referenced input index %d." - "Ops must be specified in topo sort order.", - op.name().c_str(), op_idx, input.op_index()); - } else { - std::string &input_op_name = op_names.at(input.op_index()); - std::vector &inputs = op_outputs.at(input.op_index()); - for (auto &col : input.columns()) { - bool found = false; - for (auto &out_col : inputs) { - if (col == out_col) { - found = true; - break; - } - } - if (!found) { - RESULT_ERROR(result, - "Op %s at index %d requested column %s from input " - "Op %s at index %d but that Op does not have the " - "requsted column.", - op.name().c_str(), op_idx, col.c_str(), - input_op_name.c_str(), input.op_index()); - } - } - } - } - op_idx++; - } - if (op_names.size() < 3) { - RESULT_ERROR(result, - "Task set must specify at least three Ops: " - "an InputTable Op, any other Op, and an OutputTable Op. 
" - "However, only %lu Ops were specified.", - op_names.size()); + + for (auto& future : futures) { + future.wait(); + } + + VLOG(1) << "Creating output"; + for (const auto& table_name : params->tables()) { + // Check if has table + if (!meta_.has_table(table_name)) { + RESULT_ERROR(result->mutable_result(), + "Requested table %s is not in database.", + table_name.c_str()); + result->clear_tables(); + break; } else { - if (op_names.front() != "InputTable") { - RESULT_ERROR(result, "First Op is %s but must be InputTable", - op_names.front().c_str()); - } - if (op_names.back() != "OutputTable") { - RESULT_ERROR(result, "Last Op is %s but must be OutputTable", - op_names.back().c_str()); - } + // Add table descriptor to result + const TableMetadata& table_meta = table_metas_->at(table_name); + proto::TableDescriptor& descriptor = table_meta.get_descriptor(); + proto::TableDescriptor* desc = result->add_tables(); + desc->CopyFrom(descriptor); } } + + return grpc::Status::OK; } -Result -get_task_end_rows(const std::map &table_metas, - const proto::Task &task, std::vector &rows) { - Result result; - result.set_success(true); +grpc::Status MasterImpl::DeleteTables(grpc::ServerContext* context, + const proto::DeleteTablesParams* params, + proto::Empty* empty) { + std::unique_lock lk(work_mutex_); - TaskSampler sampler(table_metas, task); - result = sampler.validate(); - if (!result.success()) { - return result; + // For each table, remove the entry from the database + for (const auto& table_name : params->tables()) { + if (meta_.has_table(table_name)) { + meta_.remove_table(meta_.get_table_id(table_name)); + } } - i64 num_samples = sampler.total_samples(); - for (i64 i = 0; i < num_samples; ++i) { - proto::NewWork new_work; - result = sampler.next_work(new_work); - if (!result.success()) { - rows.clear(); - return result; + + // TODO(apoms): delete the actual table data + + write_database_metadata(storage_, meta_); + + return grpc::Status::OK; +} + +grpc::Status 
MasterImpl::NewTable(grpc::ServerContext* context, + const proto::NewTableParams* params, + proto::Empty* empty) { + std::unique_lock lk(work_mutex_); + + const std::string& table_name = params->table_name(); + const auto& columns = params->columns(); + const auto& rows = params->rows(); + + i32 table_id = meta_.add_table(table_name); + LOG_IF(FATAL, table_id == -1) << "failed to add table"; + proto::TableDescriptor table_desc; + table_desc.set_id(table_id); + table_desc.set_name(table_name); + table_desc.set_timestamp( + std::chrono::duration_cast(now().time_since_epoch()) + .count()); + for (size_t i = 0; i < columns.size(); ++i) { + proto::Column* col = table_desc.add_columns(); + col->set_id(i); + col->set_name(columns[i]); + col->set_type(proto::ColumnType::Other); + } + + table_desc.add_end_rows(rows.size()); + table_desc.set_job_id(-1); + meta_.commit_table(table_id); + + write_table_metadata(storage_, TableMetadata(table_desc)); + write_database_metadata(storage_, meta_); + + LOG_IF(FATAL, rows[0].columns().size() != columns.size()) << "Row 0 doesn't have # entries == # columns"; + for (size_t j = 0; j < columns.size(); ++j) { + const std::string output_path = + table_item_output_path(table_id, j, 0); + + const std::string output_metadata_path = + table_item_metadata_path(table_id, j, 0); + + std::unique_ptr output_file; + storehouse::make_unique_write_file(storage_, output_path, + output_file); + + std::unique_ptr output_metadata_file; + storehouse::make_unique_write_file(storage_, output_metadata_path, + output_metadata_file); + + u64 num_rows = rows.size(); + s_write(output_metadata_file.get(), num_rows); + for (size_t i = 0; i < num_rows; ++i) { + u64 buffer_size = rows[i].columns()[j].size(); + s_write(output_metadata_file.get(), buffer_size); + } + + for (size_t i = 0; i < num_rows; ++i) { + i64 buffer_size = rows[i].columns()[j].size(); + u8* buffer = (u8*)rows[i].columns()[j].data(); + s_write(output_file.get(), buffer, buffer_size); } - 
rows.push_back(new_work.io_item().end_row()); + + BACKOFF_FAIL(output_file->save()); + BACKOFF_FAIL(output_metadata_file->save()); + } + + return grpc::Status::OK; +} + +grpc::Status MasterImpl::RegisterWorker(grpc::ServerContext* context, + const proto::WorkerParams* worker_info, + proto::Registration* registration) { + std::unique_lock lk(work_mutex_); + + set_database_path(db_params_.db_path); + + std::string worker_address = get_worker_address_from_grpc_context(context); + worker_address += ":" + worker_info->port(); + + i32 node_id = next_worker_id_++; + VLOG(1) << "Adding worker: " << node_id << ", " << worker_address; + workers_[node_id] = proto::Worker::NewStub( + grpc::CreateChannel(worker_address, grpc::InsecureChannelCredentials())); + registration->set_node_id(node_id); + worker_addresses_[node_id] = worker_address; + worker_active_[node_id] = true; + + // Load ops into worker + for (const std::string& so_path : so_paths_) { + proto::OpPath op_path; + proto::Empty empty; + op_path.set_path(so_path); + grpc::Status status; + GRPC_BACKOFF(workers_[node_id]->LoadOp(&ctx, op_path, &empty), status); + LOG_IF(WARNING, !status.ok()) + << "Master could not load op for worker at " << worker_address << " (" + << status.error_code() << "): " << status.error_message(); } - return result; + + unstarted_workers_.push_back(node_id); + + return grpc::Status::OK; } + +grpc::Status MasterImpl::UnregisterWorker(grpc::ServerContext* context, + const proto::NodeInfo* node_info, + proto::Empty* empty) { + std::unique_lock lk(work_mutex_); + + set_database_path(db_params_.db_path); + + i32 node_id = node_info->node_id(); + remove_worker(node_id); + + return grpc::Status::OK; } -class MasterImpl final : public proto::Master::Service { -public: - MasterImpl(DatabaseParameters ¶ms) : db_params_(params), bar_(nullptr) { - storage_ = - storehouse::StorageBackend::make_from_config(db_params_.storage_config); - set_database_path(params.db_path); +grpc::Status 
MasterImpl::ActiveWorkers( + grpc::ServerContext* context, const proto::Empty* empty, + proto::RegisteredWorkers* registered_workers) { + std::unique_lock lk(work_mutex_); + + set_database_path(db_params_.db_path); + + for (auto& kv : worker_active_) { + if (kv.second) { + i32 worker_id = kv.first; + proto::WorkerInfo* info = registered_workers->add_workers(); + info->set_id(worker_id); + info->set_address(worker_addresses_.at(worker_id)); + } } - ~MasterImpl() { delete storage_; } + return grpc::Status::OK; +} - grpc::Status RegisterWorker(grpc::ServerContext *context, - const proto::WorkerParams *worker_info, - proto::Registration *registration) { - set_database_path(db_params_.db_path); +grpc::Status MasterImpl::IngestVideos(grpc::ServerContext* context, + const proto::IngestParameters* params, + proto::IngestResult* result) { + std::vector failed_videos; + result->mutable_result()->CopyFrom( + ingest_videos(db_params_.storage_config, db_params_.db_path, + std::vector(params->table_names().begin(), + params->table_names().end()), + std::vector(params->video_paths().begin(), + params->video_paths().end()), + params->inplace(), failed_videos)); + for (auto& failed : failed_videos) { + result->add_failed_paths(failed.path); + result->add_failed_messages(failed.message); + } + + // HACK(apoms): instead of doing this, we should just add tables to db and + // table cache. 
+ recover_and_init_database(); + + return grpc::Status::OK; +} + +grpc::Status MasterImpl::GetJobStatus(grpc::ServerContext* context, + const proto::Empty* empty, + proto::JobStatus* job_status) { + VLOG(2) << "Master received GetJobStatus command"; + std::unique_lock lock(active_mutex_); + if (!active_bulk_job_) { + job_status->set_finished(true); + job_status->mutable_result()->CopyFrom(job_result_); + + job_status->set_tasks_done(0); + job_status->set_total_tasks(0); + + job_status->set_jobs_done(0); + job_status->set_jobs_failed(0); + job_status->set_total_jobs(0); + } else { + job_status->set_finished(false); - workers_.push_back(proto::Worker::NewStub(grpc::CreateChannel( - worker_info->address(), grpc::InsecureChannelCredentials()))); - registration->set_node_id(workers_.size() - 1); - addresses_.push_back(worker_info->address()); + job_status->set_tasks_done(total_tasks_used_); + job_status->set_total_tasks(total_tasks_); + job_status->set_jobs_done(next_job_ - 1); + job_status->set_jobs_failed(0); + job_status->set_total_jobs(num_jobs_); + } + // Num workers + i32 num_workers = 0; + for (auto& kv : worker_active_) { + if (kv.second) { + num_workers++; + } + } + job_status->set_num_workers(num_workers); + job_status->set_failed_workers(num_failed_workers_); + return grpc::Status::OK; +} + +grpc::Status MasterImpl::Ping(grpc::ServerContext* context, + const proto::Empty* empty1, + proto::Empty* empty2) { + return grpc::Status::OK; +} + +grpc::Status MasterImpl::GetOpInfo(grpc::ServerContext* context, + const proto::OpInfoArgs* op_info_args, + proto::OpInfo* op_info) { + OpRegistry* registry = get_op_registry(); + std::string op_name = op_info_args->op_name(); + if (!registry->has_op(op_name)) { + op_info->mutable_result()->set_success(false); + op_info->mutable_result()->set_msg("Op " + op_name + " does not exist"); return grpc::Status::OK; } - grpc::Status ActiveWorkers(grpc::ServerContext *context, - const proto::Empty *empty, - proto::RegisteredWorkers 
*registered_workers) { - set_database_path(db_params_.db_path); + OpInfo* info = registry->get_op_info(op_name); - for (size_t i = 0; i < workers_.size(); ++i) { - proto::WorkerInfo* info = registered_workers->add_workers(); - info->set_id(i); - info->set_address(addresses_[i]); + op_info->set_variadic_inputs(info->variadic_inputs()); + for (auto& input_column : info->input_columns()) { + Column* info = op_info->add_input_columns(); + info->CopyFrom(input_column); + } + for (auto& output_column : info->output_columns()) { + Column* info = op_info->add_output_columns(); + info->CopyFrom(output_column); + } + op_info->mutable_result()->set_success(true); + + return grpc::Status::OK; +} + +grpc::Status MasterImpl::LoadOp(grpc::ServerContext* context, + const proto::OpPath* op_path, Result* result) { + std::unique_lock lk(work_mutex_); + const std::string& so_path = op_path->path(); + + for (auto& loaded_path : so_paths_) { + if (loaded_path == so_path) { + LOG(WARNING) << "Master received redundant request to load op " << so_path; + result->set_success(true); + return grpc::Status::OK; } + } + { + std::ifstream infile(so_path); + if (!infile.good()) { + RESULT_ERROR(result, "Op library was not found: %s", so_path.c_str()); + return grpc::Status::OK; + } + } + + void* handle = dlopen(so_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (handle == nullptr) { + RESULT_ERROR(result, "Failed to load op library: %s", dlerror()); return grpc::Status::OK; } + so_paths_.push_back(so_path); + + for (auto& kv : worker_active_) { + if (kv.second) { + auto& worker = workers_[kv.first]; + proto::Empty empty; + grpc::Status status; + GRPC_BACKOFF(worker->LoadOp(&ctx, *op_path, &empty), status); + const std::string& worker_address = worker_addresses_[kv.first]; + LOG_IF(WARNING, !status.ok()) + << "Master could not load op for worker at " << worker_address << " (" + << status.error_code() << "): " << status.error_message(); + } + } + + result->set_success(true); + return grpc::Status::OK; 
+} - grpc::Status IngestVideos(grpc::ServerContext *context, - const proto::IngestParameters *params, - proto::IngestResult *result) { - std::vector failed_videos; - result->mutable_result()->CopyFrom( - ingest_videos(db_params_.storage_config, db_params_.db_path, - std::vector(params->table_names().begin(), - params->table_names().end()), - std::vector(params->video_paths().begin(), - params->video_paths().end()), - failed_videos)); - for (auto& failed : failed_videos) { - result->add_failed_paths(failed.path); - result->add_failed_messages(failed.message); +grpc::Status MasterImpl::RegisterOp( + grpc::ServerContext* context, const proto::OpRegistration* op_registration, + proto::Result* result) { + std::unique_lock lk(work_mutex_); + VLOG(1) << "Master registering Op: " << op_registration->name(); + + result->set_success(true); + const std::string& name = op_registration->name(); + { + const bool variadic_inputs = op_registration->variadic_inputs(); + std::vector input_columns; + size_t i = 0; + for (auto& c : op_registration->input_columns()) { + Column col; + col.set_id(i++); + col.set_name(c.name()); + col.set_type(c.type()); + input_columns.push_back(col); + } + std::vector output_columns; + i = 0; + for (auto& c : op_registration->output_columns()) { + Column col; + col.set_id(i++); + col.set_name(c.name()); + col.set_type(c.type()); + output_columns.push_back(col); } + bool can_stencil = op_registration->can_stencil(); + std::vector stencil(op_registration->preferred_stencil().begin(), + op_registration->preferred_stencil().end()); + if (stencil.empty()) { + stencil = {0}; + } + bool has_bounded_state = op_registration->has_bounded_state(); + i32 warmup = op_registration->warmup(); + bool has_unbounded_state = op_registration->has_unbounded_state(); + OpInfo* info = new OpInfo(name, variadic_inputs, input_columns, + output_columns, can_stencil, stencil, + has_bounded_state, warmup, has_unbounded_state); + OpRegistry* registry = get_op_registry(); + *result 
= registry->add_op(name, info); + } + if (!result->success()) { + LOG(WARNING) << "Master failed to register op " << name; return grpc::Status::OK; } - grpc::Status NextWork(grpc::ServerContext *context, - const proto::NodeInfo *node_info, - proto::NewWork *new_work) { - std::unique_lock lk(work_mutex_); - if (samples_left_ <= 0) { - if (next_task_ < num_tasks_ && task_result_.success()) { - // More tasks left - task_sampler_.reset(new TaskSampler( - table_metas_, job_params_.task_set().tasks(next_task_))); - task_result_ = task_sampler_->validate(); - if (task_result_.success()) { - samples_left_ = task_sampler_->total_samples(); - next_task_++; - VLOG(1) << "Tasks left: " << num_tasks_ - next_task_; - } + for (auto& kv : worker_active_) { + if (kv.second) { + auto& worker = workers_[kv.first]; + proto::Result w_result; + grpc::Status status; + GRPC_BACKOFF(worker->RegisterOp(&ctx, *op_registration, &w_result), + status); + const std::string& worker_address = worker_addresses_[kv.first]; + LOG_IF(WARNING, !status.ok()) + << "Master could not load op for worker at " << worker_address << " (" + << status.error_code() << "): " << status.error_message(); + } + } + + op_registrations_.push_back(*op_registration); + return grpc::Status::OK; +} + +grpc::Status MasterImpl::RegisterPythonKernel( + grpc::ServerContext* context, + const proto::PythonKernelRegistration* python_kernel, + proto::Result* result) { + std::unique_lock lk(work_mutex_); + VLOG(1) << "Master registering Python Kernel: " << python_kernel->op_name(); + + { + const std::string& op_name = python_kernel->op_name(); + DeviceType device_type = python_kernel->device_type(); + const std::string& kernel_str = python_kernel->kernel_str(); + const std::string& pickled_config = python_kernel->pickled_config(); + const int batch_size = python_kernel->batch_size(); + // Create a kernel builder function + auto constructor = [kernel_str, pickled_config, + batch_size](const KernelConfig& config) { + return new 
PythonKernel(config, kernel_str, pickled_config, batch_size); + }; + // Set all input and output columns to be CPU + std::map input_devices; + std::map output_devices; + { + OpRegistry* registry = get_op_registry(); + OpInfo* info = registry->get_op_info(op_name); + if (info->variadic_inputs()) { + assert(device_type != DeviceType::GPU); } else { - // No more tasks left - new_work->mutable_io_item()->set_item_id(-1); - return grpc::Status::OK; + for (const auto& in_col : info->input_columns()) { + input_devices[in_col.name()] = DeviceType::CPU; + } + } + for (const auto& out_col : info->output_columns()) { + output_devices[out_col.name()] = DeviceType::CPU; } } - if (!task_result_.success()) { - new_work->mutable_io_item()->set_item_id(-1); - return grpc::Status::OK; + // Create a new kernel factory + bool can_batch = (batch_size > 1); + KernelFactory* factory = + new KernelFactory(op_name, device_type, 1, input_devices, + output_devices, can_batch, batch_size, constructor); + + // Register the kernel + KernelRegistry* registry = get_kernel_registry(); + registry->add_kernel(op_name, factory); + } + + for (auto& kv : worker_active_) { + if (kv.second) { + auto& worker = workers_[kv.first]; + proto::Result w_result; + grpc::Status status; + GRPC_BACKOFF(worker->RegisterPythonKernel(&ctx, *python_kernel, &w_result), + status); + const std::string& worker_address = worker_addresses_[kv.first]; + LOG_IF(WARNING, !status.ok()) + << "Master could not register python kernel for worker at " + << worker_address << " (" << status.error_code() + << "): " << status.error_message(); } + } - assert(samples_left_ > 0); - task_result_ = task_sampler_->next_work(*new_work); - if (!task_result_.success()) { - new_work->mutable_io_item()->set_item_id(-1); - return grpc::Status::OK; + py_kernel_registrations_.push_back(*python_kernel); + result->set_success(true); + return grpc::Status::OK; +} + +grpc::Status MasterImpl::Shutdown(grpc::ServerContext* context, + const proto::Empty* 
empty, Result* result) { + VLOG(1) << "Master received shutdown!"; + result->set_success(true); + trigger_shutdown_.set(); + return grpc::Status::OK; +} + +grpc::Status MasterImpl::PokeWatchdog(grpc::ServerContext* context, + const proto::Empty* empty, + proto::Empty* result) { + watchdog_awake_ = true; + + std::map ws; + { + std::unique_lock lk(work_mutex_); + for (auto& kv : workers_) { + i32 worker_id = kv.first; + auto& worker = kv.second; + if (!worker_active_[worker_id]) continue; + + ws.insert({worker_id, kv.second.get()}); + } + } + + std::vector contexts(ws.size()); + std::vector statuses(ws.size()); + std::vector results(ws.size()); + std::vector>> + rpcs(ws.size()); + grpc::CompletionQueue cq; + int i = 0; + for (auto& kv : ws) { + i64 id = kv.first; + auto& worker = kv.second; + proto::Empty em; + rpcs[i] = worker->AsyncPokeWatchdog(&contexts[i], em, &cq); + rpcs[i]->Finish(&results[i], &statuses[i], (void*)id); + i++; + } + for (int i = 0; i < ws.size(); ++i) { + void* got_tag; + bool ok = false; + GPR_ASSERT(cq.Next(&got_tag, &ok)); + // GPR_ASSERT((i64)got_tag < workers_.size()); + i64 worker_id = (i64)got_tag; + if (!ok) { + LOG(WARNING) << "Could not poke worker " << worker_id << "!"; } + } + cq.Shutdown(); + return grpc::Status::OK; +} - samples_left_--; - total_samples_used_++; - if (bar_) { bar_->Progressed(total_samples_used_); } +grpc::Status MasterImpl::NextWork(grpc::ServerContext* context, + const proto::NodeInfo* node_info, + proto::NewWork* new_work) { + std::unique_lock lk(work_mutex_); + VLOG(2) << "Master received NextWork command"; + if (!worker_active_.at(node_info->node_id())) { + // Worker is not active + new_work->set_no_more_work(true); return grpc::Status::OK; } - grpc::Status NewJob(grpc::ServerContext *context, - const proto::JobParameters *job_params, - proto::Result *job_result) { - job_result->set_success(true); - set_database_path(db_params_.db_path); + // If we do not have any outstanding work, try and create more + if 
(unallocated_job_tasks_.empty()) { + // If we have no more samples for this task, try and get another task + if (next_task_ == num_tasks_) { + // Check if there are any tasks left + if (next_job_ < num_jobs_ && task_result_.success()) { + next_task_ = 0; + num_tasks_ = job_tasks_.at(next_job_).size(); + next_job_++; + VLOG(1) << "Tasks left: " << total_tasks_ - total_tasks_used_; + } + } - job_params_.CopyFrom(*job_params); + // Create more work if possible + if (next_task_ < num_tasks_) { + i64 current_job = next_job_ - 1; + i64 current_task = next_task_; - const i32 io_item_size = job_params->io_item_size(); - const i32 work_item_size = job_params->work_item_size(); + unallocated_job_tasks_.push_front( + std::make_tuple(current_job, current_task)); + next_task_++; + } + } - i32 warmup_size = 0; - i32 total_rows = 0; + if (unallocated_job_tasks_.empty()) { + if (finished_) { + // No more work + new_work->set_no_more_work(true); + } else { + // Still have tasks that might be reassigned + new_work->set_wait_for_work(true); + } + return grpc::Status::OK; + } - proto::JobDescriptor job_descriptor; - job_descriptor.set_io_item_size(io_item_size); - job_descriptor.set_work_item_size(work_item_size); - job_descriptor.set_num_nodes(workers_.size()); + // Grab the next task sample + std::tuple job_task_id = unallocated_job_tasks_.back(); + unallocated_job_tasks_.pop_back(); + + assert(next_task_ <= num_tasks_); + + i64 job_idx; + i64 task_idx; + std::tie(job_idx, task_idx) = job_task_id; + + // If the job was blacklisted, then we throw it away + if (blacklisted_jobs_.count(job_idx) > 0) { + // TODO(apoms): we are telling the worker to re request work here + // but we should just loop this whole process again + new_work->set_wait_for_work(true); + return grpc::Status::OK; + } + + new_work->set_table_id(job_to_table_id_.at(job_idx)); + new_work->set_job_index(job_idx); + new_work->set_task_index(task_idx); + const auto& task_rows = job_tasks_.at(job_idx).at(task_idx); + for 
(i64 r : task_rows) { + new_work->add_output_rows(r); + } + + auto task_start = + std::chrono::duration_cast(now().time_since_epoch()) + .count(); + // Track sample assigned to worker + active_job_tasks_[node_info->node_id()].insert(job_task_id); + active_job_tasks_starts_[std::make_tuple( + (i64)node_info->node_id(), job_idx, task_idx)] = task_start; + worker_histories_[node_info->node_id()].tasks_assigned += 1; + + return grpc::Status::OK; +} + +grpc::Status MasterImpl::FinishedWork( + grpc::ServerContext* context, const proto::FinishedWorkParameters* params, + proto::Empty* empty) { + std::unique_lock lk(work_mutex_); + VLOG(2) << "Master received FinishedWork command"; + + i32 worker_id = params->node_id(); + i64 job_id = params->job_id(); + i64 task_id = params->task_id(); + i64 num_rows = params->num_rows(); + + if (!worker_active_[worker_id]) { + // Technically the task was finished, but we don't count it for now + // because it would have been reinstered into the work queue + return grpc::Status::OK; + } + + auto& worker_tasks = active_job_tasks_.at(worker_id); + + std::tuple job_tasks = std::make_tuple(job_id, task_id); + assert(worker_tasks.count(job_tasks) > 0); + worker_tasks.erase(job_tasks); + active_job_tasks_starts_.erase(std::make_tuple((i64)worker_id, job_id, task_id)); + + worker_histories_[worker_id].tasks_retired += 1; - // Get output columns from last output op - auto &ops = job_params->task_set().ops(); - // OpRegistry* op_registry = get_op_registry(); - // OpInfo* output_op = op_registry->get_op_info( - // ops.Get(ops.size()-1).name()); - // const std::vector& output_columns = - // output_op->output_columns(); - auto &last_op = ops.Get(ops.size() - 1); - assert(last_op.name() == "OutputTable"); - std::vector output_columns; - for (const auto &eval_input : last_op.inputs()) { - for (const std::string &name : eval_input.columns()) { - output_columns.push_back(name); + i64 active_job = next_job_ - 1; + + // If job was blacklisted, then we have 
already updated total tasks + // used to reflect that and we should ignore it + if (blacklisted_jobs_.count(job_id) == 0) { + total_tasks_used_++; + tasks_used_per_job_[job_id]++; + + if (tasks_used_per_job_[job_id] == job_tasks_[job_id].size()) { + i32 tid = job_uncommitted_tables_[job_id]; + meta_.commit_table(tid); + write_database_metadata(storage_, meta_); + + } + } + + if (total_tasks_used_ == total_tasks_) { + VLOG(1) << "Master FinishedWork triggered finished!"; + assert(next_job_ == num_jobs_); + { + std::unique_lock lock(finished_mutex_); + finished_ = true; + } + finished_cv_.notify_all(); + } + + return grpc::Status::OK; +} + +grpc::Status MasterImpl::FinishedJob(grpc::ServerContext* context, + const proto::FinishedJobParams* params, + proto::Empty* empty) { + std::unique_lock lk(work_mutex_); + VLOG(1) << "Master received FinishedJob command"; + + i32 worker_id = params->node_id(); + + unfinished_workers_[worker_id] = false; + + if (!worker_active_.at(worker_id)) { + return grpc::Status::OK; + } + + if (!params->result().success()) { + LOG(WARNING) << "Worker " << worker_id << " sent FinishedJob with error: " + << params->result().msg(); + } + + if (active_bulk_job_) { + stop_job_on_worker(worker_id); + } + + return grpc::Status::OK; +} + +grpc::Status MasterImpl::NewJob(grpc::ServerContext* context, + const proto::BulkJobParameters* job_params, + proto::Result* job_result) { + VLOG(1) << "Master received NewJob command"; + job_result->set_success(true); + set_database_path(db_params_.db_path); + + job_params_.Clear(); + job_params_.MergeFrom(*job_params); + { + std::unique_lock lock(finished_mutex_); + finished_ = false; + } + finished_cv_.notify_all(); + + { + std::unique_lock lock(active_mutex_); + active_bulk_job_ = true; + } + active_cv_.notify_all(); + + return grpc::Status::OK; +} + + +void MasterImpl::start_watchdog(grpc::Server* server, bool enable_timeout, + i32 timeout_ms) { + watchdog_thread_ = std::thread([this, server, enable_timeout, 
timeout_ms]() { + double time_since_check = 0; + // Wait until shutdown is triggered or watchdog isn't woken up + if (!enable_timeout) { + trigger_shutdown_.wait(); + } + while (!trigger_shutdown_.raised()) { + auto sleep_start = now(); + trigger_shutdown_.wait_for(timeout_ms); + time_since_check += nano_since(sleep_start) / 1e6; + if (time_since_check > timeout_ms) { + if (!watchdog_awake_) { + // Watchdog not woken, time to bail out + LOG(ERROR) << "Master did not receive heartbeat in " << timeout_ms + << "ms. Shutting down."; + trigger_shutdown_.set(); + } + watchdog_awake_ = false; + time_since_check = 0; + } + } + // Shutdown workers + std::vector worker_ids; + std::map workers_copy; + { + std::unique_lock lk(work_mutex_); + for (auto& kv : workers_) { + if (worker_active_[kv.first]) { + worker_ids.push_back(kv.first); + workers_copy[kv.first] = workers_[kv.first].get(); + } } } - for (size_t i = 0; i < output_columns.size(); ++i) { - auto &col_name = output_columns[i]; - Column *col = job_descriptor.add_columns(); - col->set_id(i); - col->set_name(col_name); - col->set_type(ColumnType::Other); + for (i32 i : worker_ids) { + proto::Empty empty; + proto::Result wresult; + grpc::Status status; + GRPC_BACKOFF(workers_copy.at(i)->Shutdown(&ctx, empty, &wresult), status); + const std::string& worker_address = worker_addresses_[i]; + LOG_IF(WARNING, !status.ok()) + << "Master could not send shutdown message to worker at " + << worker_address << " (" << status.error_code() + << "): " << status.error_message(); } + // Shutdown self + server->Shutdown(); + }); +} - DatabaseMetadata meta = - read_database_metadata(storage_, DatabaseMetadata::descriptor_path()); +void MasterImpl::recover_and_init_database() { + VLOG(1) << "Initializing database..."; - auto &tasks = job_params->task_set().tasks(); - job_descriptor.mutable_tasks()->CopyFrom(tasks); + VLOG(1) << "Reading database metadata"; + // TODO(apoms): handle uncommitted database tables + meta_ = 
read_database_metadata(storage_, DatabaseMetadata::descriptor_path()); - validate_task_set(meta, job_params->task_set(), job_result); - if (!job_result->success()) { - // No database changes made at this point, so just return - return grpc::Status::OK; + VLOG(1) << "Setting up table metadata cache"; + // Setup table metadata cache + table_metas_.reset(new TableMetaCache(storage_, meta_)); + + // std::vector valid_table_names; + // for (const auto& name : meta_.table_names()) { + // i32 table_id = meta_.get_table_id(name); + // if (!meta_.table_is_committed(table_id)) { + // // + // } + // } + + // Prefetch table metadata for all tables + if (db_params_.prefetch_table_metadata) { + table_metas_->prefetch(meta_.table_names()); + } + + VLOG(1) << "Writing database metadata"; + write_database_metadata(storage_, meta_); + VLOG(1) << "Database initialized."; +} + +void MasterImpl::start_job_processor() { + VLOG(1) << "Starting job processor"; + job_processor_thread_ = std::thread([this]() { + while (!trigger_shutdown_.raised()) { + // Wait on not finished + { + std::unique_lock lock(active_mutex_); + active_cv_.wait(lock, [this] { + return active_bulk_job_ || trigger_shutdown_.raised(); + }); + } + if (trigger_shutdown_.raised()) break; + // Start processing job + bool result = process_job(&job_params_, &job_result_); } + }); +} + +void MasterImpl::stop_job_processor() { + // Wake up job processor + { + std::unique_lock lock(active_mutex_); + active_bulk_job_ = true; + } + active_cv_.notify_all(); + if (job_processor_thread_.joinable()) { + job_processor_thread_.join(); + } +} - // Add job name into database metadata so we can look up what jobs have - // been ran - i32 job_id = meta.add_job(job_params->job_name()); - job_descriptor.set_id(job_id); - job_descriptor.set_name(job_params->job_name()); +bool MasterImpl::process_job(const proto::BulkJobParameters* job_params, + proto::Result* job_result) { + // Reset job state + job_to_table_id_.clear(); + 
slice_input_rows_per_job_.clear(); + total_output_rows_per_job_.clear(); + unallocated_job_tasks_.clear(); + job_tasks_.clear(); + next_job_ = 0; + num_jobs_ = -1; + next_task_ = 0; + num_tasks_ = -1; + task_result_.set_success(true); + active_job_tasks_.clear(); + active_job_tasks_starts_.clear(); + worker_histories_.clear(); + unfinished_workers_.clear(); + local_ids_.clear(); + local_totals_.clear(); + total_tasks_used_ = 0; + total_tasks_ = 0; + tasks_used_per_job_.clear(); + num_failed_workers_ = 0; + + job_result->set_success(true); + + auto finished_fn = [this]() { + { + std::unique_lock lock(finished_mutex_); + finished_ = true; + } + finished_cv_.notify_all(); + { + std::unique_lock lock(finished_mutex_); + active_bulk_job_ = false; + } + active_cv_.notify_all(); + }; + + std::vector jobs(job_params->jobs().begin(), + job_params->jobs().end()); + std::vector ops(job_params->ops().begin(), + job_params->ops().end()); + + const i32 work_packet_size = job_params->work_packet_size(); + const i32 io_packet_size = job_params->io_packet_size() != -1 + ? 
job_params->io_packet_size() + : work_packet_size; + if (io_packet_size > 0 && io_packet_size % work_packet_size != 0) { + RESULT_ERROR(job_result, + "IO packet size (%d) must be a multiple of work packet size (%d).", + io_packet_size, + work_packet_size); + finished_fn(); + return false; + } + + i32 total_rows = 0; + + DAGAnalysisInfo dag_info; + *job_result = + validate_jobs_and_ops(meta_, *table_metas_.get(), jobs, ops, dag_info); + if (!job_result->success()) { + // No database changes made at this point, so just return + finished_fn(); + return false; + } + + // Map all input Ops into a single input collection + const std::map& input_op_idx_to_column_idx = dag_info.input_ops; + + // Get output columns from last output op to set as output table columns + OpRegistry* op_registry = get_op_registry(); + auto& last_op = ops.at(ops.size() - 1); + assert(last_op.name() == OUTPUT_OP_NAME); + std::vector> job_output_columns; + for (const auto& job : jobs) { + // Get input columns from column inputs specified for each job + std::map input_op_idx_to_column; + { + for (auto& ci : job.inputs()) { + const TableMetadata& table = table_metas_->at(ci.table_name()); + std::vector table_columns = table.columns(); + const std::string& c = ci.column_name(); + bool found = false; + for (Column& col : table_columns) { + if (c == col.name()) { + Column new_col; + new_col.CopyFrom(col); + new_col.set_id(0); + input_op_idx_to_column[ci.op_index()] = new_col; + found = true; + break; + } + } + assert(found); + } + } + + job_output_columns.emplace_back(); + std::vector& output_columns = job_output_columns.back(); + // For an op column, find the Column info + std::function determine_column_info = + [&determine_column_info, &ops, &input_op_idx_to_column, + op_registry](const proto::OpInput& op_input) -> Column { + i64 op_idx = op_input.op_index(); + const std::string& col = op_input.column(); + auto& input_op = ops.at(op_idx); + // For builtin ops, find non bulit-in parent column + if 
(input_op.name() != INPUT_OP_NAME && is_builtin_op(input_op.name())) { + // Find the column + for (auto& in : input_op.inputs()) { + if (in.column() == col) { + return determine_column_info(in); + } + } + assert(false); + } + + std::vector input_columns; + std::vector actual_columns; + if (input_op.name() == INPUT_OP_NAME) { + Column col = input_op_idx_to_column.at(op_idx); + actual_columns = {col}; + col.set_name(input_op.inputs(0).column()); + input_columns = {col}; + } else { + OpInfo* input_op_info = op_registry->get_op_info(input_op.name()); + input_columns = input_op_info->output_columns(); + actual_columns = input_columns; + } + const std::string& name = col; + bool found = false; + for (size_t i = 0; i < input_columns.size(); ++i) { + auto& in_col = input_columns[i]; + if (in_col.name() == name) { + Column c = actual_columns[i]; + return c; + } + } + assert(false); + }; + for (const auto& input : last_op.inputs()) { + Column c = determine_column_info(input); + c.set_id(output_columns.size()); + output_columns.push_back(c); + } + } + proto::BulkJobDescriptor job_descriptor; + job_descriptor.set_io_packet_size(io_packet_size); + job_descriptor.set_work_packet_size(work_packet_size); + job_descriptor.set_num_nodes(workers_.size()); + + { + auto& jobs = job_params->jobs(); + job_descriptor.mutable_jobs()->CopyFrom(jobs); + } + + // Add job name into database metadata so we can look up what jobs have + // been run + i32 bulk_job_id = meta_.add_bulk_job(job_params->job_name()); + job_descriptor.set_id(bulk_job_id); + job_descriptor.set_name(job_params->job_name()); + // Determine total output rows and slice input rows for using to + // split stream + *job_result = determine_input_rows_to_slices(meta_, *table_metas_.get(), jobs, + ops, dag_info); + slice_input_rows_per_job_ = dag_info.slice_input_rows; + total_output_rows_per_job_ = dag_info.total_output_rows; + + if (!job_result->success()) { + // No database changes made at this point, so just return + 
finished_fn(); + return false; + } + + // HACK(apoms): we currently split work into tasks in two ways: + // a) align with the natural boundaries defined by the slice partitioner + // b) use a user-specified size to chunk up the output sequence + + // Job -> task -> rows + i32 total_tasks_temp = 0; + for (size_t i = 0; i < jobs.size(); ++i) { + tasks_used_per_job_.push_back(0); + + auto& slice_input_rows = slice_input_rows_per_job_[i]; + i64 total_output_rows = total_output_rows_per_job_[i]; + + std::vector partition_boundaries; + if (slice_input_rows.size() == 0) { + // No slices, so we can split as desired. Currently use IO packet size + // since it is the smallest granularity we can specify + for (i64 r = 0; r < total_output_rows; r += io_packet_size) { + partition_boundaries.push_back(r); + } + partition_boundaries.push_back(total_output_rows); + } else { + // Split stream into partitions, respecting slice boundaries + // We assume there is only one slice for now since + // they all must have the same number of groups + assert(slice_input_rows.size() == 1); + // Derive the output rows produced by each slice group + i64 slice_op_idx = slice_input_rows.begin()->first; + i64 slice_in_rows = slice_input_rows.begin()->second; + *job_result = derive_slice_final_output_rows( + jobs.at(i), ops, slice_op_idx, slice_in_rows, dag_info, + partition_boundaries); + if (!job_result->success()) { + // No database changes made at this point, so just return + finished_fn(); + return false; + } + } + assert(partition_boundaries.back() == total_output_rows); + job_tasks_.emplace_back(); + auto& tasks = job_tasks_.back(); + for (i64 pi = 0; pi < partition_boundaries.size() - 1; ++pi) { + tasks.emplace_back(); + auto& task_rows = tasks.back(); - // Read all table metadata - for (const std::string &table_name : meta.table_names()) { - std::string table_path = - TableMetadata::descriptor_path(meta.get_table_id(table_name)); - table_metas_[table_name] = read_table_metadata(storage_, 
table_path); + i64 s = partition_boundaries[pi]; + i64 e = partition_boundaries[pi + 1]; + for (i64 r = s; r < e; ++r) { + task_rows.push_back(r); + } + total_tasks_temp++; } + } + total_tasks_ = total_tasks_temp; + + if (!job_result->success()) { + // No database changes made at this point, so just return + finished_fn(); + return false; + } + + // Write out database metadata so that workers can read it + write_bulk_job_metadata(storage_, BulkJobMetadata(job_descriptor)); - total_samples_used_ = 0; - total_samples_ = 0; - for (auto &task : job_params->task_set().tasks()) { - i32 table_id = meta.add_table(task.output_table_name()); + job_uncommitted_tables_.clear(); + { + for (i64 job_idx = 0; job_idx < job_params->jobs_size(); ++job_idx) { + auto& job = job_params->jobs(job_idx); + i32 table_id = meta_.add_table(job.output_table_name()); + job_to_table_id_[job_idx] = table_id; proto::TableDescriptor table_desc; table_desc.set_id(table_id); - table_desc.set_name(task.output_table_name()); + table_desc.set_name(job.output_table_name()); table_desc.set_timestamp(std::chrono::duration_cast( now().time_since_epoch()) .count()); // Set columns equal to the last op's output columns - for (size_t i = 0; i < output_columns.size(); ++i) { - Column *col = table_desc.add_columns(); - col->set_id(i); - col->set_name(output_columns[i]); - col->set_type(ColumnType::Other); + for (size_t i = 0; i < job_output_columns[job_idx].size(); ++i) { + Column* col = table_desc.add_columns(); + col->CopyFrom(job_output_columns[job_idx][i]); } - table_metas_[task.output_table_name()] = TableMetadata(table_desc); + table_metas_->update(TableMetadata(table_desc)); + + i64 total_rows = 0; std::vector end_rows; - Result result = get_task_end_rows(table_metas_, task, end_rows); - if (!result.success()) { - *job_result = result; - break; + auto& tasks = job_tasks_.at(job_idx); + for (i64 task_id = 0; task_id < tasks.size(); ++task_id) { + i64 task_rows = tasks.at(task_id).size(); + total_rows += 
task_rows; + end_rows.push_back(total_rows); } - total_samples_ += end_rows.size(); for (i64 r : end_rows) { table_desc.add_end_rows(r); } - table_desc.set_job_id(job_id); - - write_table_metadata(storage_, TableMetadata(table_desc)); - table_metas_[task.output_table_name()] = TableMetadata(table_desc); + table_desc.set_job_id(bulk_job_id); + job_uncommitted_tables_.push_back(table_id); + table_metas_->update(TableMetadata(table_desc)); } - if (!job_result->success()) { - // No database changes made at this point, so just return - return grpc::Status::OK; + // Write table metadata in parallel + auto write_meta = [&](std::vector table_ids) { + for (i32 table_id : table_ids) { + write_table_metadata(storage_, table_metas_->at(table_id)); + } + }; + std::vector threads; + i32 num_threads = std::thread::hardware_concurrency() * 4; + i32 job_idx = 0; + for (i64 tid = 0; tid < num_threads; ++tid) { + std::vector table_ids; + i32 jobs_to_compute = + (job_params->jobs_size() - job_idx) / (num_threads - tid); + for (i32 i = job_idx; i < job_idx + jobs_to_compute; ++i) { + table_ids.push_back(job_uncommitted_tables_[i]); + } + threads.emplace_back(write_meta, table_ids); + job_idx += jobs_to_compute; } + for (i64 tid = 0; tid < num_threads; ++tid) { + threads[tid].join(); + } + } - // Write out database metadata so that workers can read it - write_job_metadata(storage_, JobMetadata(job_descriptor)); - - // Setup initial task sampler - task_result_.set_success(true); - samples_left_ = 0; - next_task_ = 0; - num_tasks_ = job_params->task_set().tasks_size(); - - write_database_metadata(storage_, meta); + // Setup initial task sampler + task_result_.set_success(true); + next_task_ = 0; + num_tasks_ = 0; + next_job_ = 0; + num_jobs_ = jobs.size(); - VLOG(1) << "Total tasks: " << num_tasks_; + write_database_metadata(storage_, meta_); - grpc::CompletionQueue cq; - std::vector client_contexts(workers_.size()); - std::vector statuses(workers_.size()); - std::vector 
replies(workers_.size()); - std::vector>> - rpcs; + VLOG(1) << "Total jobs: " << num_jobs_; - if (bar_) { delete bar_; } - if (job_params->show_progress()) { - bar_ = new ProgressBar(total_samples_, ""); - } else { - bar_ = nullptr; + // TODO(apoms): change this to support adding and removing nodes + // the main change is that the workers should handle + // spawning sub processes instead of appearing as + // multiple logical nodes + for (auto kv : worker_addresses_) { + const std::string& address = kv.second; + // Strip port + std::vector split_addr = split(address, ':'); + std::string sans_port = split_addr[0]; + if (local_totals_.count(sans_port) == 0) { + local_totals_[sans_port] = 0; } + local_totals_[sans_port] += 1; + } - std::map local_ids; - std::map local_totals; - for (std::string &address : addresses_) { - if (local_totals.count(address) == 0) { - local_totals[address] = 0; + // Send new job command to workers + VLOG(1) << "Sending new job command to workers"; + { + std::unique_lock lk(work_mutex_); + std::vector worker_ids; + for (auto& kv : worker_active_) { + if (kv.second) { + worker_ids.push_back(kv.first); } - local_totals[address] += 1; } + start_job_on_workers(worker_ids); + unstarted_workers_.clear(); + } - proto::JobParameters w_job_params; - w_job_params.CopyFrom(*job_params); - for (size_t i = 0; i < workers_.size(); ++i) { - auto &worker = workers_[i]; - std::string &address = addresses_[i]; - if (local_ids.count(address) == 0) { - local_ids[address] = 0; + // Ping workers every 10 seconds to make sure they are alive + start_worker_pinger(); + + // Wait for all workers to finish + VLOG(1) << "Waiting for workers to finish"; + + // Wait until all workers are done and work has been completed + auto all_workers_finished_start = now(); + while (true) { + // Check if we have unfinished workers + bool all_workers_finished = true; + { + std::unique_lock lk(work_mutex_); + for (auto& kv : unfinished_workers_) { + // If the worker is active and it is 
not finished, then + // we need to keep working + if (worker_active_[kv.first] && kv.second) { + all_workers_finished = false; + break; + } } - w_job_params.set_local_id(local_ids[address]); - w_job_params.set_local_total(local_totals[address]); - local_ids[address] += 1; - rpcs.emplace_back( - worker->AsyncNewJob(&client_contexts[i], w_job_params, &cq)); - rpcs[i]->Finish(&replies[i], &statuses[i], (void *)i); - } - - for (size_t i = 0; i < workers_.size(); ++i) { - void *got_tag; - bool ok = false; - GPR_ASSERT(cq.Next(&got_tag, &ok)); - GPR_ASSERT((i64)got_tag < workers_.size()); - assert(ok); - - if (!replies[i].success()) { - LOG(WARNING) << "Worker returned error: " << replies[i].msg(); - job_result->set_success(false); - job_result->set_msg(replies[i].msg()); - next_task_ = num_tasks_; + } + if (all_workers_finished && !finished_) { + // If we have unfinished work but no workers for some period of time, + // then fail + double seconds_since = std::chrono::duration_cast( + now() - all_workers_finished_start) + .count(); + if (seconds_since > db_params_.no_workers_timeout) { + RESULT_ERROR(job_result, + "No workers but have unfinished work after %ld seconds", + db_params_.no_workers_timeout); + finished_fn(); + return false; } + } else { + // Reset timer + all_workers_finished_start = now(); } - - if (!job_result->success()) { - // TODO(apoms): We wrote the db meta with the tables so we should clear - // them out here since the job failed. 
+ if (all_workers_finished && finished_) { + break; } - if (!task_result_.success()) { - job_result->CopyFrom(task_result_); - } else { - assert(next_task_ == num_tasks_); - if (bar_) { bar_->Progressed(total_samples_); } + // Check if any tasks have gone on longer than timeout + if (job_params_.task_timeout() > 0.0001) { + std::unique_lock lk(work_mutex_); + auto current_time = std::chrono::duration_cast( + now().time_since_epoch()) + .count(); + for (const auto& kv : active_job_tasks_starts_) { + if (current_time - kv.second > job_params_.task_timeout()) { + i64 worker_id; + i64 job_id; + i64 task_id; + std::tie(worker_id, job_id, task_id) = kv.first; + // Task has timed out, stop the worker + LOG(WARNING) << "Node " << worker_id << " (" + << worker_addresses_.at(worker_id) << ") " + << "failed to finish task (" << job_id << ", " << task_id + << ") after " << job_params_.task_timeout() + << " seconds. Removing that worker as an active worker."; + remove_worker(worker_id); + num_failed_workers_++; + } + } + } + // Check if we have unstarted workers and start them if so + { + std::unique_lock lk(work_mutex_); + if (!unstarted_workers_.empty()) { + // Update locals + for (i32 wid : unstarted_workers_) { + const std::string& address = worker_addresses_.at(wid); + std::vector split_addr = split(address, ':'); + std::string sans_port = split_addr[0]; + if (local_totals_.count(sans_port) == 0) { + local_totals_[sans_port] = 0; + } + local_totals_[sans_port] += 1; + } + start_job_on_workers(unstarted_workers_); + } + unstarted_workers_.clear(); } + std::this_thread::yield(); + } - return grpc::Status::OK; + { + std::unique_lock lock(finished_mutex_); + finished_cv_.wait(lock, [this] { return finished_.load(); }); } - grpc::Status Ping(grpc::ServerContext *context, const proto::Empty *empty1, - proto::Empty *empty2) { - return grpc::Status::OK; + // If we are shutting down, then the job did not finish and we should fail + if (trigger_shutdown_.raised()) { + 
job_result->set_success(false); } - grpc::Status LoadOp(grpc::ServerContext *context, - const proto::OpInfo *op_info, Result *result) { - const std::string &so_path = op_info->so_path(); - { - std::ifstream infile(so_path); - if (!infile.good()) { - RESULT_ERROR(result, "Op library was not found: %s", so_path.c_str()); - return grpc::Status::OK; + if (job_result->success()) { + // Commit job since it was successful + meta_.commit_bulk_job(bulk_job_id); + } + write_database_metadata(storage_, meta_); + + if (!task_result_.success()) { + job_result->CopyFrom(task_result_); + } else { + assert(next_job_ == num_jobs_); + } + + std::fflush(NULL); + sync(); + + // No need to check status of workers anymore + stop_worker_pinger(); + + // Update job metadata with new # of nodes + { + std::unique_lock lk(work_mutex_); + job_descriptor.set_num_nodes(workers_.size()); + } + write_bulk_job_metadata(storage_, BulkJobMetadata(job_descriptor)); + + finished_fn(); + + VLOG(1) << "Master finished job"; + + return true; +} + +void MasterImpl::start_worker_pinger() { + VLOG(1) << "Starting worker pinger"; + pinger_active_ = true; + pinger_thread_ = std::thread([this]() { + while (!finished_ && pinger_active_) { + std::map ws; + { + std::unique_lock lk(work_mutex_); + for (auto& kv : workers_) { + i32 worker_id = kv.first; + auto& worker = kv.second; + if (!worker_active_[worker_id]) continue; + + ws.insert({worker_id, kv.second.get()}); + } } - } - void *handle = dlopen(so_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (handle == nullptr) { - RESULT_ERROR(result, "Failed to load op library: %s", dlerror()); - return grpc::Status::OK; - } + for (auto& kv : ws) { + i32 worker_id = kv.first; + auto& worker = kv.second; - for (auto &worker : workers_) { - grpc::ClientContext ctx; - proto::Empty empty; - worker->LoadOp(&ctx, *op_info, &empty); + grpc::ClientContext ctx; + proto::Empty empty1; + proto::Empty empty2; + grpc::Status status = worker->Ping(&ctx, empty1, &empty2); + if 
(!status.ok()) { + // Worker not responding, increment ping count + i64 num_failed_pings = ++pinger_number_of_failed_pings_[worker_id]; + const i64 FAILED_PINGS_BEFORE_REMOVAL = 3; + if (num_failed_pings >= FAILED_PINGS_BEFORE_REMOVAL) { + // remove it from active workers + LOG(WARNING) << "Worker " << worker_id + << " did not respond to Ping. " + << "Removing worker from active list."; + remove_worker(worker_id); + num_failed_workers_++; + } + } else { + pinger_number_of_failed_pings_[worker_id] = 0; + } + } + // FIXME(apoms): this sleep is unfortunate because it means a + // job must take at least this long. A solution + // would be to have this wait on a cv so it could + // be woken up early. + std::this_thread::sleep_for(std::chrono::seconds(5)); } + }); +} - result->set_success(true); - return grpc::Status::OK; +void MasterImpl::stop_worker_pinger() { + if (pinger_thread_.joinable()) { + pinger_active_ = false; + pinger_thread_.join(); } +} -private: - std::vector> workers_; - std::vector addresses_; - DatabaseParameters db_params_; - storehouse::StorageBackend *storage_; - std::map table_metas_; - proto::JobParameters job_params_; - ProgressBar *bar_; +void MasterImpl::start_job_on_workers(const std::vector& worker_ids) { + proto::BulkJobParameters w_job_params; + w_job_params.MergeFrom(job_params_); - i64 total_samples_used_; - i64 total_samples_; + grpc::CompletionQueue cq; + std::map> client_contexts; + std::map> statuses; + std::map> replies; + std::map>> + rpcs; + for (i32 worker_id : worker_ids) { + const std::string& address = worker_addresses_.at(worker_id); + auto& worker = workers_.at(worker_id); + std::vector split_addr = split(address, ':'); + std::string sans_port = split_addr[0]; + w_job_params.set_local_id(local_ids_[sans_port]); + w_job_params.set_local_total(local_totals_[sans_port]); + local_ids_[sans_port] += 1; + client_contexts[worker_id] = + std::unique_ptr(new grpc::ClientContext); + statuses[worker_id] = std::unique_ptr(new 
grpc::Status); + replies[worker_id] = std::unique_ptr(new proto::Result); + rpcs[worker_id] = worker->AsyncNewJob(client_contexts[worker_id].get(), + w_job_params, &cq); + rpcs[worker_id]->Finish(replies[worker_id].get(), statuses[worker_id].get(), + (void*)worker_id); + worker_histories_[worker_id].start_time = now(); + worker_histories_[worker_id].tasks_assigned = 0; + worker_histories_[worker_id].tasks_retired = 0; + unfinished_workers_[worker_id] = true; + VLOG(2) << "Sent NewJob command to worker " << worker_id; + } + + for (i64 i = 0; i < worker_ids.size(); ++i) { + void* got_tag; + bool ok = false; + auto status = (cq.Next(&got_tag, &ok)); + assert(status != grpc::CompletionQueue::NextStatus::SHUTDOWN); + assert(ok); - std::mutex work_mutex_; - i64 next_task_; - i64 num_tasks_; - std::unique_ptr task_sampler_; - i64 samples_left_; - Result task_result_; -}; + i64 worker_id = (i64)got_tag; + VLOG(2) << "Worker " << worker_id << " NewJob returned."; -proto::Master::Service *get_master_service(DatabaseParameters ¶m) { - return new MasterImpl(param); + if (worker_active_[worker_id] && !replies[worker_id]->success()) { + LOG(WARNING) << "Worker " << worker_id << " (" + << worker_addresses_.at(worker_id) << ") " + << "returned error: " << replies[worker_id]->msg(); + } + } + cq.Shutdown(); +} + +void MasterImpl::stop_job_on_worker(i32 worker_id) { + // Place workers active tasks back into the unallocated task samples + if (active_job_tasks_.count(worker_id) > 0) { + // Place workers active tasks back into the unallocated task samples + VLOG(1) << "Reassigning worker " << worker_id << "'s " + << active_job_tasks_.at(worker_id).size() << " task samples."; + for (const std::tuple& worker_job_task : + active_job_tasks_.at(worker_id)) { + unallocated_job_tasks_.push_back(worker_job_task); + active_job_tasks_starts_.erase( + std::make_tuple((i64)worker_id, std::get<0>(worker_job_task), + std::get<1>(worker_job_task))); + + // The worker failure may be due to a bad task. 
We track number of times + // a task has failed to detect a bad task and remove it from this bulk + // job if it exceeds some threshold. + i64 job_id = std::get<0>(worker_job_task); + i64 task_id = std::get<1>(worker_job_task); + + i64 num_failures = ++job_tasks_num_failures_[job_id][task_id]; + const i64 TOTAL_FAILURES_BEFORE_REMOVAL = 3; + if (num_failures >= TOTAL_FAILURES_BEFORE_REMOVAL) { + blacklist_job(job_id); + } + } + active_job_tasks_.erase(worker_id); + } + + worker_histories_[worker_id].end_time = now(); + unfinished_workers_[worker_id] = false; } + +void MasterImpl::remove_worker(i32 node_id) { + assert(workers_.count(node_id) > 0); + + std::string worker_address = worker_addresses_.at(node_id); + // Remove worker from list + worker_active_[node_id] = false; + + { + std::unique_lock lock(active_mutex_); + if (active_bulk_job_) { + stop_job_on_worker(node_id); + } + } + + // Update locals + /*std::vector split_addr = split(worker_address, ':'); + std::string sans_port = split_addr[0]; + assert(local_totals_.count(sans_port) > 0); + local_totals_[sans_port] -= 1; + local_ids_[sans_port] -= 1;*/ + + VLOG(1) << "Removing worker " << node_id << " (" << worker_address << ")."; } + +void MasterImpl::blacklist_job(i64 job_id) { + // All tasks in unallocated_job_tasks_ with this job id will be thrown away + blacklisted_jobs_.insert(job_id); + // Add number of remaining tasks to tasks used + i64 num_tasks_left_in_job = + job_tasks_[job_id].size() - tasks_used_per_job_[job_id]; + total_tasks_used_ += num_tasks_left_in_job; + + VLOG(1) << "Blacklisted job " << job_id; + + // Check if blacklisting job finished the bulk job + if (total_tasks_used_ == total_tasks_) { + VLOG(1) << "Master blacklisting job triggered finished!"; + assert(next_job_ == num_jobs_); + { + std::unique_lock lock(finished_mutex_); + finished_ = true; + } + finished_cv_.notify_all(); + } } + +} // namespace internal +} // namespace scanner diff --git a/scanner/engine/master.h 
b/scanner/engine/master.h new file mode 100644 index 00000000..377fd3d6 --- /dev/null +++ b/scanner/engine/master.h @@ -0,0 +1,249 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "scanner/engine/rpc.grpc.pb.h" +#include "scanner/engine/runtime.h" +#include "scanner/engine/sampler.h" +#include "scanner/util/util.h" + +#include +#include + +namespace scanner { +namespace internal { + +class MasterImpl final : public proto::Master::Service { + public: + MasterImpl(DatabaseParameters& params); + + ~MasterImpl(); + + // Expects context->peer() to return a string in the format + // ipv4:: + // Returns the from the above format. 
+ static std::string get_worker_address_from_grpc_context( + grpc::ServerContext* context); + + grpc::Status Shutdown(grpc::ServerContext* context, const proto::Empty* empty, + Result* result); + + // Database query methods + grpc::Status ListTables(grpc::ServerContext* context, + const proto::Empty* empty, + proto::ListTablesResult* result); + + grpc::Status GetTables(grpc::ServerContext* context, + const proto::GetTablesParams* params, + proto::GetTablesResult* result); + + grpc::Status DeleteTables(grpc::ServerContext* context, + const proto::DeleteTablesParams* params, + proto::Empty* empty); + + grpc::Status NewTable(grpc::ServerContext* context, + const proto::NewTableParams* params, + proto::Empty* empty); + + // Worker methods + grpc::Status RegisterWorker(grpc::ServerContext* context, + const proto::WorkerParams* worker_info, + proto::Registration* registration); + + grpc::Status UnregisterWorker(grpc::ServerContext* context, + const proto::NodeInfo* node_info, + proto::Empty* empty); + + grpc::Status ActiveWorkers(grpc::ServerContext* context, + const proto::Empty* empty, + proto::RegisteredWorkers* registered_workers); + + + grpc::Status IngestVideos(grpc::ServerContext* context, + const proto::IngestParameters* params, + proto::IngestResult* result); + + // Op and Kernel methods + + grpc::Status GetOpInfo(grpc::ServerContext* context, + const proto::OpInfoArgs* op_info_args, + proto::OpInfo* op_info); + + grpc::Status LoadOp(grpc::ServerContext* context, + const proto::OpPath* op_path, Result* result); + + grpc::Status RegisterOp(grpc::ServerContext* context, + const proto::OpRegistration* op_registration, + proto::Result* result); + + grpc::Status RegisterPythonKernel( + grpc::ServerContext* context, + const proto::PythonKernelRegistration* python_kernel, + proto::Result* result); + + grpc::Status GetJobStatus(grpc::ServerContext* context, + const proto::Empty* empty, + proto::JobStatus* job_status); + + grpc::Status NextWork(grpc::ServerContext* 
context, + const proto::NodeInfo* node_info, + proto::NewWork* new_work); + + grpc::Status FinishedWork(grpc::ServerContext* context, + const proto::FinishedWorkParameters* params, + proto::Empty* empty); + + grpc::Status FinishedJob(grpc::ServerContext* context, + const proto::FinishedJobParams* params, + proto::Empty* empty); + + grpc::Status NewJob(grpc::ServerContext* context, + const proto::BulkJobParameters* job_params, + proto::Result* job_result); + + // Misc methods + grpc::Status Ping(grpc::ServerContext* context, const proto::Empty* empty1, + proto::Empty* empty2); + + grpc::Status PokeWatchdog(grpc::ServerContext* context, + const proto::Empty* empty, proto::Empty* result); + + // + + void start_watchdog(grpc::Server* server, bool enable_timeout, + i32 timeout_ms = 50000); + + private: + void recover_and_init_database(); + + void start_job_processor(); + + void stop_job_processor(); + + bool process_job(const proto::BulkJobParameters* job_params, + proto::Result* job_result); + + void start_worker_pinger(); + + void stop_worker_pinger(); + + void start_job_on_workers(const std::vector& worker_ids); + + void stop_job_on_worker(i32 node_id); + + void remove_worker(i32 node_id); + + void blacklist_job(i64 job_id); + + DatabaseParameters db_params_; + + std::thread pinger_thread_; + std::atomic pinger_active_; + // Tracks number of times the pinger has failed to reach a worker + std::map pinger_number_of_failed_pings_; + + std::thread watchdog_thread_; + std::atomic watchdog_awake_; + Flag trigger_shutdown_; + storehouse::StorageBackend* storage_; + DatabaseMetadata meta_; + std::unique_ptr table_metas_; + std::vector so_paths_; + std::vector op_registrations_; + std::vector py_kernel_registrations_; + + // Worker state + i32 next_worker_id_ = 0; + std::map worker_active_; + std::map> workers_; + std::map worker_addresses_; + + i64 total_tasks_used_; + i64 total_tasks_; + std::vector tasks_used_per_job_; + + // True if the master is executing a job + 
std::mutex active_mutex_; + std::condition_variable active_cv_; + bool active_bulk_job_ = false; + proto::BulkJobParameters job_params_; + + // True if all work for job is done + std::mutex finished_mutex_; + std::condition_variable finished_cv_; + std::atomic finished_{true}; + Result job_result_; + + std::thread job_processor_thread_; + // Manages modification of all of the below structures + std::mutex work_mutex_; + // Mapping from jobs to table ids + std::map job_to_table_id_; + // Slice input rows for each job at each slice op + std::vector> slice_input_rows_per_job_; + // Output rows for each job + std::vector total_output_rows_per_job_; + // All job task output rows + // Job -> Task -> task output rows + std::vector>> job_tasks_; + // Outstanding set of generated task samples that should be processed + std::deque> unallocated_job_tasks_; + // The next job to use to generate tasks + i64 next_job_; + // Total number of jobs + i64 num_jobs_; + // Next sample index in the current task + i64 next_task_; + // Total samples in the current task + i64 num_tasks_; + Result task_result_; + + //============================================================================ + // Assignment of tasks to workers + //============================================================================ + // Tracks tasks assigned to worker so they can be reassigned if the worker + // fails + // Worker id -> (job_id, task_id) + std::map>> active_job_tasks_; + // (Worker id, job_id, task_id) -> start_time + std::map, double> active_job_tasks_starts_; + // Tracks number of times a task has been failed so that a job can be removed + // if it is causing consistent failures + // job_id -> task_id -> num_failures + std::map> job_tasks_num_failures_; + // Tracks the jobs that have failed too many times and should be ignored + std::set blacklisted_jobs_; + struct WorkerHistory { + timepoint_t start_time; + timepoint_t end_time; + i64 tasks_assigned; + i64 tasks_retired; + }; + std::map 
worker_histories_; + std::map unfinished_workers_; + std::vector unstarted_workers_; + std::atomic num_failed_workers_{0}; + std::vector job_uncommitted_tables_; + + std::map> job_task_num_rows_; + + // Worker connections + std::map local_ids_; + std::map local_totals_; +}; +} +} diff --git a/scanner/engine/metadata.cpp b/scanner/engine/metadata.cpp new file mode 100644 index 00000000..e3b74bd0 --- /dev/null +++ b/scanner/engine/metadata.cpp @@ -0,0 +1,463 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "scanner/engine/metadata.h" +#include "scanner/engine/runtime.h" +#include "scanner/util/storehouse.h" +#include "scanner/util/util.h" +#include "storehouse/storage_backend.h" + +#include +#include +#include /* PATH_MAX */ +#include +#include /* mkdir(2) */ +#include +#include +#include +#include + +using storehouse::WriteFile; +using storehouse::RandomReadFile; +using storehouse::StoreResult; + +namespace scanner { +using namespace proto; + +namespace internal { + +template <> +std::string Metadata::descriptor_path() const { + const DatabaseMetadata* meta = (const DatabaseMetadata*)this; + return database_metadata_path(); +} + +template <> +std::string Metadata::descriptor_path() const { + const VideoMetadata* meta = (const VideoMetadata*)this; + return table_item_video_metadata_path(meta->table_id(), meta->column_id(), + meta->item_id()); +} + +template <> +std::string Metadata::descriptor_path() const { + const BulkJobMetadata* meta = (const BulkJobMetadata*)this; + return bulk_job_descriptor_path(meta->id()); +} + +template <> +std::string Metadata::descriptor_path() const { + const TableMetadata* meta = (const TableMetadata*)this; + return table_descriptor_path(meta->id()); +} + +DatabaseMetadata::DatabaseMetadata() : next_table_id_(0), next_bulk_job_id_(0) {} + +DatabaseMetadata::DatabaseMetadata(const DatabaseDescriptor& d) + : Metadata(d), + next_table_id_(d.next_table_id()), + next_bulk_job_id_(d.next_bulk_job_id()) { + for (int i = 0; i < descriptor_.tables_size(); ++i) { + const DatabaseDescriptor::Table& table = descriptor_.tables(i); + table_id_names_.insert({table.id(), table.name()}); + table_committed_.insert({table.id(), table.committed()}); + } + for (int i = 0; i < descriptor_.bulk_jobs_size(); ++i) { + const DatabaseDescriptor_BulkJob& bulk_job = descriptor_.bulk_jobs(i); + bulk_job_id_names_.insert({bulk_job.id(), bulk_job.name()}); + bulk_job_committed_.insert({bulk_job.id(), bulk_job.committed()}); + } +} + +const 
DatabaseDescriptor& DatabaseMetadata::get_descriptor() const { + descriptor_.set_next_table_id(next_table_id_); + descriptor_.set_next_bulk_job_id(next_bulk_job_id_); + descriptor_.clear_tables(); + descriptor_.clear_bulk_jobs(); + + for (auto& kv : table_id_names_) { + auto table = descriptor_.add_tables(); + table->set_id(kv.first); + table->set_name(kv.second); + table->set_committed(table_committed_.at(kv.first)); + } + + for (auto& kv : bulk_job_id_names_) { + auto bulk_job = descriptor_.add_bulk_jobs(); + bulk_job->set_id(kv.first); + bulk_job->set_name(kv.second); + bulk_job->set_committed(bulk_job_committed_.at(kv.first)); + } + + return descriptor_; +} + +std::string DatabaseMetadata::descriptor_path() { + return database_metadata_path(); +} + +std::vector DatabaseMetadata::table_names() const { + std::vector names; + for (auto& entry : table_id_names_) { + names.push_back(entry.second); + } + return names; +} + +bool DatabaseMetadata::has_table(const std::string& table) const { + for (const auto& kv : table_id_names_) { + if (kv.second == table) { + return true; + } + } + return false; +} + +bool DatabaseMetadata::has_table(i32 table_id) const { + return table_id_names_.count(table_id) > 0; +} + +i32 DatabaseMetadata::get_table_id(const std::string& table) const { + i32 id = -1; + for (const auto& kv : table_id_names_) { + if (kv.second == table) { + id = kv.first; + break; + } + } + LOG_IF(WARNING, id == -1) << "Table " << table << " does not exist."; + return id; +} + +const std::string& DatabaseMetadata::get_table_name(i32 table_id) const { + return table_id_names_.at(table_id); +} + +i32 DatabaseMetadata::add_table(const std::string& table) { + i32 table_id = -1; + if (!has_table(table)) { + table_id = next_table_id_++; + table_id_names_[table_id] = table; + table_committed_[table_id] = false; + } + return table_id; +} + +void DatabaseMetadata::commit_table(i32 table_id) { + assert(table_id_names_.count(table_id) > 0); + table_committed_[table_id] = 
true; +} + +bool DatabaseMetadata::table_is_committed(i32 table_id) const { + assert(table_id_names_.count(table_id) > 0); + return table_committed_.at(table_id); +} + +void DatabaseMetadata::remove_table(i32 table_id) { + assert(table_id_names_.count(table_id) > 0); + table_id_names_.erase(table_id); +} + +const std::vector& DatabaseMetadata::bulk_job_names() const { + std::vector names; + for (auto& entry : bulk_job_id_names_) { + names.push_back(entry.second); + } + return names; +} + +bool DatabaseMetadata::has_bulk_job(const std::string& bulk_job) const { + for (const auto& kv : bulk_job_id_names_) { + if (kv.second == bulk_job) { + return true; + } + } + return false; +} + +bool DatabaseMetadata::has_bulk_job(i32 bulk_job_id) const { + return bulk_job_id_names_.count(bulk_job_id) > 0; +} + +i32 DatabaseMetadata::get_bulk_job_id(const std::string& bulk_job) const { + i32 bulk_job_id = -1; + for (const auto& kv : bulk_job_id_names_) { + if (kv.second == bulk_job) { + bulk_job_id = kv.first; + break; + } + } + assert(bulk_job_id != -1); + return bulk_job_id; +} + +const std::string& DatabaseMetadata::get_bulk_job_name(i32 bulk_job_id) const { + return bulk_job_id_names_.at(bulk_job_id); +} + +i32 DatabaseMetadata::add_bulk_job(const std::string& bulk_job_name) { + i32 bulk_job_id = next_bulk_job_id_++; + bulk_job_id_names_[bulk_job_id] = bulk_job_name; + bulk_job_committed_[bulk_job_id] = false; + return bulk_job_id; +} + +void DatabaseMetadata::commit_bulk_job(i32 bulk_job_id) { + assert(bulk_job_id_names_.count(bulk_job_id) > 0); + bulk_job_committed_[bulk_job_id] = true; + +} + +bool DatabaseMetadata::bulk_job_is_committed(i32 bulk_job_id) const { + assert(bulk_job_id_names_.count(bulk_job_id) > 0); + return bulk_job_committed_.at(bulk_job_id); +} + +void DatabaseMetadata::remove_bulk_job(i32 bulk_job_id) { + assert(bulk_job_id_names_.count(bulk_job_id) > 0); + bulk_job_id_names_.erase(bulk_job_id); +} + 
+/////////////////////////////////////////////////////////////////////////////// +/// VideoMetdata +VideoMetadata::VideoMetadata() {} + +VideoMetadata::VideoMetadata(const VideoDescriptor& descriptor) + : Metadata(descriptor) {} + +std::string VideoMetadata::descriptor_path(i32 table_id, i32 column_id, + i32 item_id) { + return table_item_video_metadata_path(table_id, column_id, item_id); +} + +i32 VideoMetadata::table_id() const { return descriptor_.table_id(); } + +i32 VideoMetadata::column_id() const { return descriptor_.column_id(); } + +i32 VideoMetadata::item_id() const { return descriptor_.item_id(); } + +i32 VideoMetadata::frames() const { return descriptor_.frames(); } + +i32 VideoMetadata::width() const { return descriptor_.width(); } + +i32 VideoMetadata::height() const { return descriptor_.height(); } + +i32 VideoMetadata::channels() const { return descriptor_.channels(); } + +FrameType VideoMetadata::frame_type() const { return descriptor_.frame_type(); } + +VideoDescriptor::VideoCodecType VideoMetadata::codec_type() const { + return descriptor_.codec_type(); +} + +i64 VideoMetadata::num_encoded_videos() const { + return descriptor_.num_encoded_videos(); +} + +std::vector VideoMetadata::frames_per_video() const { + return std::vector(descriptor_.frames_per_video().begin(), + descriptor_.frames_per_video().end()); +} + +std::vector VideoMetadata::keyframes_per_video() const { + return std::vector(descriptor_.keyframes_per_video().begin(), + descriptor_.keyframes_per_video().end()); +} + +std::vector VideoMetadata::size_per_video() const { + return std::vector(descriptor_.size_per_video().begin(), + descriptor_.size_per_video().end()); +} + +std::vector VideoMetadata::keyframe_indices() const { + return std::vector(descriptor_.keyframe_indices().begin(), + descriptor_.keyframe_indices().end()); +} + +std::vector VideoMetadata::sample_offsets() const { + return std::vector(descriptor_.sample_offsets().begin(), + descriptor_.sample_offsets().end()); +} + 
+std::vector VideoMetadata::sample_sizes() const { + return std::vector(descriptor_.sample_sizes().begin(), + descriptor_.sample_sizes().end()); +} + +std::vector VideoMetadata::metadata() const { + return std::vector(descriptor_.metadata_packets().begin(), + descriptor_.metadata_packets().end()); +} + +std::string VideoMetadata::data_path() const { + return descriptor_.data_path(); +} + +bool VideoMetadata::inplace() const { + return descriptor_.inplace(); +} + +/////////////////////////////////////////////////////////////////////////////// +/// ImageFormatGroupMetadata +ImageFormatGroupMetadata::ImageFormatGroupMetadata() {} + +ImageFormatGroupMetadata::ImageFormatGroupMetadata( + const ImageFormatGroupDescriptor& descriptor) + : Metadata(descriptor) {} + +i32 ImageFormatGroupMetadata::num_images() const { + return descriptor_.num_images(); +} + +i32 ImageFormatGroupMetadata::width() const { return descriptor_.width(); } + +i32 ImageFormatGroupMetadata::height() const { return descriptor_.height(); } + +ImageEncodingType ImageFormatGroupMetadata::encoding_type() const { + return descriptor_.encoding_type(); +} + +ImageColorSpace ImageFormatGroupMetadata::color_space() const { + return descriptor_.color_space(); +} + +std::vector ImageFormatGroupMetadata::compressed_sizes() const { + return std::vector(descriptor_.compressed_sizes().begin(), + descriptor_.compressed_sizes().end()); +} + +/////////////////////////////////////////////////////////////////////////////// +/// BulkJobMetadata +BulkJobMetadata::BulkJobMetadata() {} +BulkJobMetadata::BulkJobMetadata(const BulkJobDescriptor& job) : Metadata(job) { + for (auto& t : descriptor_.jobs()) { + table_names_.push_back(t.output_table_name()); + } +} + +std::string BulkJobMetadata::descriptor_path(i32 job_id) { + return bulk_job_descriptor_path(job_id); +} + +i32 BulkJobMetadata::id() const { return descriptor_.id(); } + +std::string BulkJobMetadata::name() const { return descriptor_.name(); } + +i32 
BulkJobMetadata::io_packet_size() const { + return descriptor_.io_packet_size(); +} + +i32 BulkJobMetadata::work_packet_size() const { + return descriptor_.work_packet_size(); +} + +i32 BulkJobMetadata::num_nodes() const { return descriptor_.num_nodes(); } + +const std::vector& BulkJobMetadata::table_names() const { + return table_names_; +} + +bool BulkJobMetadata::has_table(const std::string& name) const { + for (const std::string& n : table_names_) { + if (n == name) { + return true; + } + } + return false; +} + +/////////////////////////////////////////////////////////////////////////////// +/// TableMetadata +TableMetadata::TableMetadata() {} +TableMetadata::TableMetadata(const TableDescriptor& table) : Metadata(table) { + for (auto& c : descriptor_.columns()) { + columns_.push_back(c); + } +} + +std::string TableMetadata::descriptor_path(i32 table_id) { + return table_descriptor_path(table_id); +} + +i32 TableMetadata::id() const { return descriptor_.id(); } + +std::string TableMetadata::name() const { return descriptor_.name(); } + +i64 TableMetadata::num_rows() const { + return descriptor_.end_rows(descriptor_.end_rows_size() - 1); +} + +std::vector TableMetadata::end_rows() const { + return std::vector(descriptor_.end_rows().begin(), + descriptor_.end_rows().end()); +} + +const std::vector& TableMetadata::columns() const { return columns_; } + +bool TableMetadata::has_column(const std::string& name) const { + for (auto& c : descriptor_.columns()) { + if (c.name() == name) { + return true; + } + } + return false; +} + +std::string TableMetadata::column_name(i32 column_id) const { + for (auto& c : descriptor_.columns()) { + if (c.id() == column_id) { + return c.name(); + } + } + LOG(FATAL) << "Column id " << column_id << " not found!"; +} + +i32 TableMetadata::column_id(const std::string& column_name) const { + for (auto& c : descriptor_.columns()) { + if (c.name() == column_name) { + return c.id(); + } + } + LOG(FATAL) << "Column name " << column_name << " 
not found!"; +} + +ColumnType TableMetadata::column_type(i32 column_id) const { + for (auto& c : descriptor_.columns()) { + if (c.id() == column_id) { + return c.type(); + } + } + LOG(FATAL) << "Column id " << column_id << " not found!"; +} + +namespace { +std::string& get_database_path_ref() { + static std::string prefix = ""; + return prefix; +} +} + +const std::string& get_database_path() { + std::atomic_thread_fence(std::memory_order_acquire); + return get_database_path_ref(); +} + +void set_database_path(std::string path) { + VLOG(1) << "Setting DB path to " << path; + get_database_path_ref() = path + "/"; + std::atomic_thread_fence(std::memory_order_release); +} +} +} diff --git a/scanner/engine/db.h b/scanner/engine/metadata.h similarity index 61% rename from scanner/engine/db.h rename to scanner/engine/metadata.h index 5621665a..131e4751 100644 --- a/scanner/engine/db.h +++ b/scanner/engine/metadata.h @@ -52,41 +52,46 @@ inline std::string table_item_output_path(i32 table_id, i32 column_id, std::to_string(item_id) + ".bin"; } -inline std::string -table_item_video_metadata_path(i32 table_id, i32 column_id, i32 item_id) { +inline std::string table_item_video_metadata_path(i32 table_id, i32 column_id, + i32 item_id) { return table_directory(table_id) + "/" + std::to_string(column_id) + "_" + std::to_string(item_id) + "_video_metadata.bin"; } -inline std::string job_directory(i32 job_id) { - return get_database_path() + "jobs/" + std::to_string(job_id); +inline std::string table_item_metadata_path(i32 table_id, i32 column_id, + i32 item_id) { + return table_directory(table_id) + "/" + std::to_string(column_id) + "_" + + std::to_string(item_id) + "_metadata.bin"; } -inline std::string job_descriptor_path(i32 job_id) { - return job_directory(job_id) + "/descriptor.bin"; +inline std::string bulk_job_directory(i32 bulk_job_id) { + return get_database_path() + "jobs/" + std::to_string(bulk_job_id); } -inline std::string job_profiler_path(i32 job_id, i32 node) { - 
return job_directory(job_id) + "/profile_" + - std::to_string(node) + ".bin"; +inline std::string bulk_job_descriptor_path(i32 bulk_job_id) { + return bulk_job_directory(bulk_job_id) + "/descriptor.bin"; } +inline std::string bulk_job_profiler_path(i32 bulk_job_id, i32 node) { + return bulk_job_directory(bulk_job_id) + "/profile_" + std::to_string(node) + + ".bin"; +} /////////////////////////////////////////////////////////////////////////////// /// Common persistent data structs and their serialization helpers -template +template class Metadata { -public: + public: using Descriptor = T; Metadata() {} Metadata(const Descriptor& d) : descriptor_(d) {} - Descriptor &get_descriptor() const { return descriptor_; } + Descriptor& get_descriptor() const { return descriptor_; } std::string descriptor_path() const; -protected: + protected: mutable Descriptor descriptor_; }; @@ -95,35 +100,40 @@ class DatabaseMetadata : public Metadata { DatabaseMetadata(); DatabaseMetadata(const Descriptor& descriptor); - const Descriptor &get_descriptor() const; + const Descriptor& get_descriptor() const; static std::string descriptor_path(); - const std::vector table_names() const; + std::vector table_names() const; bool has_table(const std::string& table) const; bool has_table(i32 table_id) const; i32 get_table_id(const std::string& table) const; const std::string& get_table_name(i32 table_id) const; i32 add_table(const std::string& table); + void commit_table(i32 table_id); + bool table_is_committed(i32 table_id) const; void remove_table(i32 table_id); - const std::vector& job_names() const; + const std::vector& bulk_job_names() const; - bool has_job(const std::string& job) const; - bool has_job(i32 job_id) const; - i32 get_job_id(const std::string& job_name) const; - const std::string& get_job_name(i32 job_id) const; - i32 add_job(const std::string& job_name); - void remove_job(i32 job_id); + bool has_bulk_job(const std::string& job) const; + bool has_bulk_job(i32 job_id) const; + i32 
get_bulk_job_id(const std::string& job_name) const; + const std::string& get_bulk_job_name(i32 job_id) const; + i32 add_bulk_job(const std::string& job_name); + void commit_bulk_job(i32 job_id); + bool bulk_job_is_committed(i32 job_id) const; + void remove_bulk_job(i32 job_id); private: i32 next_table_id_; - i32 next_job_id_; - std::vector table_names_; - std::vector job_names_; + i32 next_bulk_job_id_; std::map table_id_names_; - std::map job_id_names_; + std::map table_committed_; + + std::map bulk_job_id_names_; + std::map bulk_job_committed_; }; class VideoMetadata : public Metadata { @@ -139,11 +149,24 @@ class VideoMetadata : public Metadata { i32 frames() const; i32 width() const; i32 height() const; - std::vector keyframe_positions() const; - std::vector keyframe_byte_offsets() const; + i32 channels() const; + proto::FrameType frame_type() const; + proto::VideoDescriptor::VideoCodecType codec_type() const; + i64 num_encoded_videos() const; + std::vector frames_per_video() const; + std::vector keyframes_per_video() const; + std::vector size_per_video() const; + + std::vector keyframe_indices() const; + std::vector sample_offsets() const; + std::vector sample_sizes() const; + std::vector metadata() const; + std::string data_path() const; + bool inplace() const; }; -class ImageFormatGroupMetadata : public Metadata { +class ImageFormatGroupMetadata + : public Metadata { public: ImageFormatGroupMetadata(); ImageFormatGroupMetadata(const Descriptor& descriptor); @@ -156,10 +179,10 @@ class ImageFormatGroupMetadata : public Metadata compressed_sizes() const; }; -class JobMetadata : public Metadata { +class BulkJobMetadata : public Metadata { public: - JobMetadata(); - JobMetadata(const Descriptor& job); + BulkJobMetadata(); + BulkJobMetadata(const Descriptor& job); static std::string descriptor_path(i32 job_id); @@ -167,7 +190,9 @@ class JobMetadata : public Metadata { std::string name() const; - i32 work_item_size() const; + i32 io_packet_size() const; + + i32 
work_packet_size() const; i32 num_nodes() const; @@ -183,7 +208,7 @@ class JobMetadata : public Metadata { // i64 total_rows() const; -private: + private: std::vector columns_; std::map column_ids_; std::vector table_names_; @@ -191,7 +216,7 @@ class JobMetadata : public Metadata { }; class TableMetadata : public Metadata { -public: + public: TableMetadata(); TableMetadata(const Descriptor& table); @@ -207,6 +232,8 @@ class TableMetadata : public Metadata { const std::vector& columns() const; + bool has_column(const std::string& name) const; + std::string column_name(i32 column_id) const; i32 column_id(const std::string& name) const; @@ -217,10 +244,11 @@ class TableMetadata : public Metadata { std::vector columns_; }; - /////////////////////////////////////////////////////////////////////////////// /// Constants +inline std::string index_column_name() { return "index"; } + inline std::string frame_column_name() { return "frame"; } inline std::string frame_info_column_name() { return "frame_info"; } @@ -230,7 +258,7 @@ inline std::string frame_info_column_name() { return "frame_info"; } template void serialize_db_proto(storehouse::WriteFile* file, const T& descriptor) { - int size = descriptor.ByteSize(); + size_t size = descriptor.ByteSizeLong(); std::vector data(size); descriptor.SerializeToArray(data.data(), size); s_write(file, data.data(), size); @@ -239,64 +267,58 @@ void serialize_db_proto(storehouse::WriteFile* file, const T& descriptor) { template T deserialize_db_proto(storehouse::RandomReadFile* file, u64& pos) { T descriptor; - std::vector data = storehouse::read_entire_file(file, pos); + size_t size; + BACKOFF_FAIL(file->get_size(size)); + std::vector data = storehouse::read_entire_file( + file, pos, std::max(size, (size_t)1024 * 1024)); descriptor.ParseFromArray(data.data(), data.size()); return descriptor; } template -void write_db_proto(storehouse::StorageBackend *storage, T db_proto) { +void write_db_proto(storehouse::StorageBackend* storage, T 
db_proto) { std::unique_ptr output_file; - BACKOFF_FAIL( - make_unique_write_file( - storage, - db_proto.Metadata::descriptor_path(), + BACKOFF_FAIL(make_unique_write_file( + storage, db_proto.Metadata::descriptor_path(), output_file)); - serialize_db_proto( - output_file.get(), db_proto.get_descriptor()); + serialize_db_proto(output_file.get(), + db_proto.get_descriptor()); BACKOFF_FAIL(output_file->save()); } template T read_db_proto(storehouse::StorageBackend* storage, const std::string& path) { std::unique_ptr db_in_file; - BACKOFF_FAIL( - make_unique_random_read_file(storage, path, db_in_file)); + BACKOFF_FAIL(make_unique_random_read_file(storage, path, db_in_file)); u64 pos = 0; return T(deserialize_db_proto(db_in_file.get(), pos)); } template -using WriteFn = void(*)(storehouse::StorageBackend *storage, T db_proto); +using WriteFn = void (*)(storehouse::StorageBackend* storage, T db_proto); template -using ReadFn = T(*)(storehouse::StorageBackend *storage, - const std::string& path); +using ReadFn = T (*)(storehouse::StorageBackend* storage, + const std::string& path); constexpr WriteFn write_database_metadata = - write_db_proto; + write_db_proto; constexpr ReadFn read_database_metadata = - read_db_proto; + read_db_proto; -constexpr WriteFn write_job_metadata = - write_db_proto; -constexpr ReadFn read_job_metadata = - read_db_proto; +constexpr WriteFn write_bulk_job_metadata = + write_db_proto; +constexpr ReadFn read_bulk_job_metadata = + read_db_proto; constexpr WriteFn write_table_metadata = - write_db_proto; + write_db_proto; constexpr ReadFn read_table_metadata = - read_db_proto; + read_db_proto; constexpr WriteFn write_video_metadata = - write_db_proto; + write_db_proto; constexpr ReadFn read_video_metadata = - read_db_proto; - -/////////////////////////////////////////////////////////////////////////////// -/// Database modification helper functions -void write_new_table(storehouse::StorageBackend *storage, - const DatabaseMetadata &meta, - const 
proto::TableDescriptor &table); + read_db_proto; } } diff --git a/scanner/engine/op_info.h b/scanner/engine/op_info.h index 4b4355ba..b85840d9 100644 --- a/scanner/engine/op_info.h +++ b/scanner/engine/op_info.h @@ -24,29 +24,54 @@ namespace scanner { namespace internal { class OpInfo { -public: - OpInfo(const std::string &name, const std::vector &input_columns, - const std::vector &output_columns) - : name_(name), input_columns_(input_columns), - output_columns_(output_columns) {} - - const std::string& name() const { - return name_; - } + public: + OpInfo(const std::string& name, bool variadic_inputs, + const std::vector& input_columns, + const std::vector& output_columns, bool can_stencil, + const std::vector preferred_stencil, bool bounded_state, + i32 warmup, bool unbounded_state) + : name_(name), + variadic_inputs_(variadic_inputs), + input_columns_(input_columns), + output_columns_(output_columns), + can_stencil_(can_stencil), + preferred_stencil_(preferred_stencil), + bounded_state_(bounded_state), + warmup_(warmup), + unbounded_state_(unbounded_state) {} + + const std::string& name() const { return name_; } + + const bool variadic_inputs() const { return variadic_inputs_; } + + const std::vector& input_columns() const { return input_columns_; } + + const std::vector& output_columns() const { return output_columns_; } - const std::vector& input_columns() const { - return input_columns_; + const bool can_stencil() const { return can_stencil_; } + + const std::vector& preferred_stencil() const { + return preferred_stencil_; } - const std::vector& output_columns() const { - return output_columns_; + const bool has_bounded_state() const { return bounded_state_; } + + const i32 warmup() const { + return warmup_; } -private: + const bool has_unbounded_state() const { return unbounded_state_; } + + private: std::string name_; - std::vector input_columns_; - std::vector output_columns_; + bool variadic_inputs_; + std::vector input_columns_; + std::vector 
output_columns_; + bool can_stencil_; + std::vector preferred_stencil_; + bool bounded_state_; + i32 warmup_; + bool unbounded_state_; }; - } } diff --git a/scanner/engine/op_registry.cpp b/scanner/engine/op_registry.cpp index 2b07599f..6ee350c8 100644 --- a/scanner/engine/op_registry.cpp +++ b/scanner/engine/op_registry.cpp @@ -18,32 +18,39 @@ namespace scanner { namespace internal { -void OpRegistry::add_op(const std::string &name, OpInfo *info) { +Result OpRegistry::add_op(const std::string& name, OpInfo* info) { + Result result; + result.set_success(true); if (ops_.count(name) > 0) { - LOG(FATAL) << "Attempted to re-register op " << name; + RESULT_ERROR(&result, "Attempted to re-register op %s", name.c_str()); + return result; } - if (info->input_columns().empty()) { - LOG(FATAL) << "Attempted to register op " << name - << " with empty input columns."; + if (info->input_columns().empty() && !info->variadic_inputs()) { + RESULT_ERROR(&result, + "Attempted to register op %s with empty input columns", + name.c_str()); + return result; } if (info->output_columns().empty()) { - LOG(FATAL) << "Attempted to register op " << name - << " with empty output columns."; + RESULT_ERROR(&result, + "Attempted to register op %s with empty output columns", + name.c_str()); + return result; } ops_.insert({name, info}); + return result; } -OpInfo * -OpRegistry::get_op_info(const std::string &name) const { +OpInfo* OpRegistry::get_op_info(const std::string& name) const { return ops_.at(name); } -bool OpRegistry::has_op(const std::string &name) const { +bool OpRegistry::has_op(const std::string& name) const { return ops_.count(name) > 0; } -OpRegistry *get_op_registry() { - static OpRegistry *registry = new OpRegistry; +OpRegistry* get_op_registry() { + static OpRegistry* registry = new OpRegistry; return registry; } } diff --git a/scanner/engine/op_registry.h b/scanner/engine/op_registry.h index 92fb188e..8638598a 100644 --- a/scanner/engine/op_registry.h +++ 
b/scanner/engine/op_registry.h @@ -26,18 +26,17 @@ namespace scanner { namespace internal { class OpRegistry { -public: - void add_op(const std::string &name, OpInfo *info); + public: + Result add_op(const std::string& name, OpInfo* info); - OpInfo* get_op_info(const std::string &name) const; + OpInfo* get_op_info(const std::string& name) const; bool has_op(const std::string& name) const; -private: + private: std::map ops_; }; OpRegistry* get_op_registry(); - } } diff --git a/scanner/engine/python.cpp b/scanner/engine/python.cpp new file mode 100644 index 00000000..c152ac28 --- /dev/null +++ b/scanner/engine/python.cpp @@ -0,0 +1,133 @@ +#include "scanner/api/database.h" +#include "scanner/engine/op_info.h" +#include "scanner/engine/op_registry.h" +#include "scanner/util/common.h" + +#include +#include +#include +#include + +namespace scanner { +namespace { +class GILRelease { + public: + inline GILRelease() { + PyEval_InitThreads(); + m_thread_state = PyEval_SaveThread(); + } + + inline ~GILRelease() { + PyEval_RestoreThread(m_thread_state); + m_thread_state = NULL; + } + + private: + PyThreadState* m_thread_state; +}; +} + +namespace py = boost::python; + +template +inline std::vector to_std_vector(const py::object& iterable) { + return std::vector(py::stl_input_iterator(iterable), + py::stl_input_iterator()); +} + +template +py::list to_py_list(std::vector vector) { + typename std::vector::iterator iter; + py::list list; + for (iter = vector.begin(); iter != vector.end(); ++iter) { + list.append(*iter); + } + return list; +} + +std::string default_machine_params_wrapper() { + MachineParameters params = default_machine_params(); + proto::MachineParameters params_proto; + params_proto.set_num_cpus(params.num_cpus); + params_proto.set_num_load_workers(params.num_load_workers); + params_proto.set_num_save_workers(params.num_save_workers); + for (auto gpu_id : params.gpu_ids) { + params_proto.add_gpu_ids(gpu_id); + } + + std::string output; + bool success = 
params_proto.SerializeToString(&output); + LOG_IF(FATAL, !success) << "Failed to serialize machine params"; + return output; +} + +proto::Result start_master_wrapper(Database& db, const std::string& port, + bool watchdog, bool prefetch_table_metadata, + i64 no_workers_timeout) { + GILRelease r; + return db.start_master(default_machine_params(), port, watchdog, + prefetch_table_metadata, + no_workers_timeout); +} + +proto::Result start_worker_wrapper(Database& db, const std::string& params_s, + const std::string& port, bool watchdog, + bool prefetch_table_metadata) { + GILRelease r; + proto::MachineParameters params_proto; + params_proto.ParseFromString(params_s); + MachineParameters params; + params.num_cpus = params_proto.num_cpus(); + params.num_load_workers = params_proto.num_load_workers(); + params.num_save_workers = params_proto.num_save_workers(); + for (auto gpu_id : params_proto.gpu_ids()) { + params.gpu_ids.push_back(gpu_id); + } + + return db.start_worker(params, port, watchdog, prefetch_table_metadata); +} + +py::list ingest_videos_wrapper(Database& db, const py::list table_names, + const py::list paths, + bool inplace) { + std::vector failed_videos; + { + GILRelease r; + db.ingest_videos(to_std_vector(table_names), + to_std_vector(paths), inplace, failed_videos); + } + return to_py_list(failed_videos); +} + +Result wait_for_server_shutdown_wrapper(Database& db) { + GILRelease r; + return db.wait_for_server_shutdown(); +} + +boost::shared_ptr initWrapper(storehouse::StorageConfig* sc, + const std::string& db_path, + const std::string& master_addr) { + GILRelease r; + return boost::shared_ptr( new Database(sc, db_path, master_addr) ); +} + +BOOST_PYTHON_MODULE(libscanner) { + boost::python::numpy::initialize(); + using namespace py; + class_("Database", no_init) + .def("__init__", make_constructor(&initWrapper)) + .def("ingest_videos", &Database::ingest_videos); + class_("FailedVideo", no_init) + .def_readonly("path", &FailedVideo::path) + 
.def_readonly("message", &FailedVideo::message); + class_("Result", no_init) + .def("success", &proto::Result::success, + return_value_policy()) + .def("msg", &proto::Result::msg, return_value_policy()); + def("start_master", start_master_wrapper); + def("start_worker", start_worker_wrapper); + def("ingest_videos", ingest_videos_wrapper); + def("wait_for_server_shutdown", wait_for_server_shutdown_wrapper); + def("default_machine_params", default_machine_params_wrapper); +} +} diff --git a/scanner/engine/python.in.cpp b/scanner/engine/python.in.cpp deleted file mode 100644 index 0311f237..00000000 --- a/scanner/engine/python.in.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include "scanner/api/database.h" -#include "scanner/engine/op_info.h" -#include "scanner/engine/op_registry.h" -#include "scanner/util/common.h" - -#include -#include - -namespace scanner { - -namespace py = boost::python; - -std::string get_include() { - // This variable is filled in at compile time by CMake. - return "@dirs@"; -} - -std::string other_flags() { -#ifdef HAVE_CUDA - return "-DHAVE_CUDA"; -#else - return ""; -#endif -} - -template -inline std::vector to_std_vector(const py::object &iterable) { - return std::vector(py::stl_input_iterator(iterable), - py::stl_input_iterator()); -} - -template py::list to_py_list(std::vector vector) { - typename std::vector::iterator iter; - py::list list; - for (iter = vector.begin(); iter != vector.end(); ++iter) { - list.append(*iter); - } - return list; -} - -py::list get_output_columns(const std::string &op_name) { - internal::OpRegistry *registry = internal::get_op_registry(); - LOG_IF(FATAL, !registry->has_op(op_name)) - << "Op " << op_name << " does not exist."; - internal::OpInfo *info = registry->get_op_info(op_name); - return to_py_list(info->output_columns()); -} - -bool has_op(const std::string &name) { - internal::OpRegistry *registry = internal::get_op_registry(); - return registry->has_op(name); -} - -std::string 
default_machine_params_wrapper() { - MachineParameters params = default_machine_params(); - proto::MachineParameters params_proto; - params_proto.set_num_cpus(params.num_cpus); - params_proto.set_num_load_workers(params.num_load_workers); - params_proto.set_num_save_workers(params.num_save_workers); - for (auto gpu_id : params.gpu_ids) { - params_proto.add_gpu_ids(gpu_id); - } - - std::string output; - bool success = params_proto.SerializeToString(&output); - LOG_IF(FATAL, !success) << "Failed to serialize machine params"; - return output; -} - -void start_master_wrapper(Database& db) { - db.start_master(default_machine_params()); -} - -void start_worker_wrapper(Database& db, const std::string& params_s) { - proto::MachineParameters params_proto; - params_proto.ParseFromString(params_s); - MachineParameters params; - params.num_cpus = params_proto.num_cpus(); - params.num_load_workers = params_proto.num_load_workers(); - params.num_save_workers = params_proto.num_save_workers(); - for (auto gpu_id : params_proto.gpu_ids()) { - params.gpu_ids.push_back(gpu_id); - } - - db.start_worker(params); -} - -py::list ingest_videos_wrapper( - Database& db, - const py::list table_names, - const py::list paths) { - std::vector failed_videos; - db.ingest_videos( - to_std_vector(table_names), - to_std_vector(paths), - failed_videos); - return to_py_list(failed_videos); -} - - -BOOST_PYTHON_MODULE(libscanner) { - using namespace py; - class_( - "Database", init()) - .def("ingest_videos", &Database::ingest_videos); - class_("FailedVideo", no_init) - .def_readonly("path", &FailedVideo::path) - .def_readonly("message", &FailedVideo::message); - def("start_master", start_master_wrapper); - def("start_worker", start_worker_wrapper); - def("ingest_videos", ingest_videos_wrapper); - def("get_include", get_include); - def("other_flags", other_flags); - def("get_output_columns", get_output_columns); - def("has_op", has_op); - def("default_machine_params", default_machine_params_wrapper); 
-} -} diff --git a/scanner/engine/python_kernel.cpp b/scanner/engine/python_kernel.cpp new file mode 100644 index 00000000..c2f34c14 --- /dev/null +++ b/scanner/engine/python_kernel.cpp @@ -0,0 +1,321 @@ +#include "scanner/engine/python_kernel.h" +#include "scanner/util/util.h" + +#include +#include + +namespace scanner { + +namespace py = boost::python; +namespace np = boost::python::numpy; + +std::string handle_pyerror() { + using namespace boost::python; + using namespace boost; + + PyObject *exc, *val, *tb; + object formatted_list, formatted; + PyErr_Fetch(&exc, &val, &tb); + handle<> hexc(exc), hval(allow_null(val)), htb(allow_null(tb)); + object traceback(import("traceback")); + if (!tb) { + object format_exception_only(traceback.attr("format_exception_only")); + formatted_list = format_exception_only(hexc, hval); + } else { + object format_exception(traceback.attr("format_exception")); + formatted_list = format_exception(hexc, hval, htb); + } + formatted = str("\n").join(formatted_list); + return extract(formatted); +} + +PythonKernel::PythonKernel(const KernelConfig& config, + const std::string& kernel_str, + const std::string& pickled_config, + const int preferred_batch) + : BatchedKernel(config), config_(config), device_(config.devices[0]) { + PyGILState_STATE gstate = PyGILState_Ensure(); + can_batch_ = (preferred_batch > 1); + try { + py::object main = py::import("__main__"); + main.attr("kernel_str") = py::str(kernel_str); + main.attr("config_str") = py::str(pickled_config); + + py::list devices; + py::list device_ids; + for (auto& handle : config.devices) { + devices.append(py::object(handle.type == DeviceType::CPU ? 0 : 1)); + device_ids.append(py::object(handle.id)); + } + py::list input_columns; + for (auto& inc : config.input_columns) { + input_columns.append(inc); + } + py::list input_column_types; + for (auto& inc : config.input_column_types) { + input_column_types.append(py::object(inc == ColumnType::Other ? 
0 : 1)); + } + py::list output_columns; + for (auto& outc : config.output_columns) { + output_columns.append(outc); + } + py::str args((const char*)config.args.data(), config.args.size()); + py::object node_id(config.node_id); + + main.attr("devices") = devices; + main.attr("device_ids") = device_ids; + main.attr("input_columns") = input_columns; + main.attr("input_column_types") = input_column_types; + main.attr("output_columns") = output_columns; + main.attr("args") = args; + main.attr("node_id") = node_id; + + py::object main_namespace = main.attr("__dict__"); + py::exec( + "import pickle\n" + "from scannerpy import Config, DeviceType, DeviceHandle, KernelConfig, " + "ColumnType\n" + "from scannerpy.protobuf_generator import ProtobufGenerator\n" + "config = pickle.loads(config_str)\n" + "protobufs = ProtobufGenerator(config)\n" + "handles = [DeviceHandle(DeviceType(d), di)\n" + " for d, di in zip(devices, device_ids)]\n" + "input_types = [ColumnType(c) for c in input_column_types]\n" + "kernel_config = KernelConfig(handles, input_columns,\n" + " input_column_types, output_columns,\n" + " args, node_id)\n" + "exec(kernel_str)\n" + "kernel = KERNEL(kernel_config, protobufs)", + main_namespace); + } catch (py::error_already_set& e) { + LOG(FATAL) << handle_pyerror(); + } + PyGILState_Release(gstate); +} + +PythonKernel::~PythonKernel() { + PyGILState_STATE gstate = PyGILState_Ensure(); + try { + py::object main = py::import("__main__"); + py::object kernel = main.attr("kernel"); + kernel.attr("close")(); + } catch (py::error_already_set& e) { + LOG(FATAL) << handle_pyerror(); + } + PyGILState_Release(gstate); +} + +void PythonKernel::batched_python_execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) { + i32 input_count = (i32)num_rows(input_columns[0]); + PyGILState_STATE gstate = PyGILState_Ensure(); + + try { + py::object main = py::import("__main__"); + py::object kernel = main.attr("kernel"); + + py::list batched_cols; + for (i32 j = 
0; j < input_columns.size(); ++j) { + py::list rows; + // HACK(wcrichto): should pass column type in config and check here + if (config_.input_column_types[j] == proto::ColumnType::Video) { + for (i32 i = 0; i < input_count; ++i) { + const Frame *frame = input_columns[j][i].as_const_frame(); + np::ndarray frame_np = + np::from_data(frame->data, np::dtype::get_builtin(), + py::make_tuple(frame->height(), frame->width(), + frame->channels()), + py::make_tuple(frame->width() * frame->channels(), + frame->channels(), 1), + py::object()); + rows.append(frame_np); + } + } else { + for (i32 i = 0; i < input_count; ++i) { + rows.append(py::str((char const*)input_columns[j][i].buffer, + input_columns[j][i].size)); + } + } + batched_cols.append(rows); + } + + py::list batched_out_cols = + py::extract(kernel.attr("execute")(batched_cols)); + LOG_IF(FATAL, py::len(batched_out_cols) != output_columns.size()) + << "Incorrect number of output columns. Expected " + << output_columns.size(); + + for (i32 j = 0; j < output_columns.size(); ++j) { + // push all rows to that column + LOG_IF(FATAL, py::len(batched_out_cols[j]) != input_count) + << "Incorrect number of output rows. 
Expected " + << input_count; + if (config_.output_columns[j] == "frame") { + for (i32 i = 0; i < input_count; ++i) { + np::ndarray frame_np = + py::extract(batched_out_cols[j][i]); + FrameType frame_type; + { + np::dtype dtype = frame_np.get_dtype(); + if (dtype == np::dtype::get_builtin()) { + frame_type = FrameType::U8; + } else if (dtype == np::dtype::get_builtin()) { + frame_type = FrameType::F32; + } else if (dtype == np::dtype::get_builtin()) { + frame_type = FrameType::F64; + } else { + LOG(FATAL) << "Invalid numpy dtype: " + << py::extract(py::str(dtype)); + } + } + i32 ndim = frame_np.get_nd(); + if (ndim > 3) { + LOG(FATAL) << "Invalid number of dimensions (must be less than 4): " + << ndim; + } + std::vector shapes; + std::vector strides; + for (int n = 0; n < ndim; ++n) { + shapes.push_back(frame_np.shape(n)); + strides.push_back(frame_np.strides(n)); + } + FrameInfo frame_info(shapes, frame_type); + Frame* frame = new_frame(CPU_DEVICE, frame_info); + const char* frame_data = frame_np.get_data(); + + if (ndim == 3) { + assert(strides[1] % strides[2] == 0); + for (int i = 0; i < shapes[0]; ++i) { + u64 offset = strides[0] * i; + memcpy(frame->data + offset, frame_data + offset, + shapes[2] * shapes[1] * strides[2]); + } + } else { + LOG(FATAL) << "Can not support ndim != 3."; + } + insert_frame(output_columns[j], frame); + } + } else { + for (i32 i = 0; i < input_count; ++i) { + std::string field = py::extract(batched_out_cols[j][i]); + size_t size = field.size(); + u8* buf = new_buffer(CPU_DEVICE, size); + memcpy_buffer(buf, CPU_DEVICE, (u8*)field.data(), CPU_DEVICE, size); + insert_element(output_columns[j], buf, size); + } + } + } + + } catch (py::error_already_set& e) { + LOG(FATAL) << handle_pyerror(); + } + + PyGILState_Release(gstate); +} + +void PythonKernel::single_python_execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) { + i32 input_count = (i32)num_rows(input_columns[0]); + + PyGILState_STATE gstate = 
PyGILState_Ensure(); + + try { + py::object main = py::import("__main__"); + py::object kernel = main.attr("kernel"); + + for (i32 i = 0; i < input_count; ++i) { + py::list cols; + for (i32 j = 0; j < input_columns.size(); ++j) { + // HACK(wcrichto): should pass column type in config and check here + if (config_.input_column_types[j] == proto::ColumnType::Video) { + const Frame* frame = input_columns[j][i].as_const_frame(); + np::ndarray frame_np = + np::from_data(frame->data, np::dtype::get_builtin(), + py::make_tuple(frame->height(), frame->width(), + frame->channels()), + py::make_tuple(frame->width() * frame->channels(), + frame->channels(), 1), + py::object()); + cols.append(frame_np); + } else { + cols.append(py::str((char const*)input_columns[j][i].buffer, + input_columns[j][i].size)); + } + } + + py::list out_cols = py::extract(kernel.attr("execute")(cols)); + LOG_IF(FATAL, py::len(out_cols) != output_columns.size()) + << "Incorrect number of output columns. Expected " + << output_columns.size(); + + for (i32 j = 0; j < output_columns.size(); ++j) { + // HACK(wcrichto): should pass column type in config and check here + if (config_.output_columns[j] == "frame") { + np::ndarray frame_np = py::extract(out_cols[j]); + FrameType frame_type; + { + np::dtype dtype = frame_np.get_dtype(); + if (dtype == np::dtype::get_builtin()) { + frame_type = FrameType::U8; + } else if (dtype == np::dtype::get_builtin()) { + frame_type = FrameType::F32; + } else if (dtype == np::dtype::get_builtin()) { + frame_type = FrameType::F64; + } else { + LOG(FATAL) << "Invalid numpy dtype: " + << py::extract(py::str(dtype)); + } + } + i32 ndim = frame_np.get_nd(); + if (ndim > 3) { + LOG(FATAL) << "Invalid number of dimensions (must be less than 4): " + << ndim; + } + std::vector shapes; + std::vector strides; + for (int n = 0; n < ndim; ++n) { + shapes.push_back(frame_np.shape(n)); + strides.push_back(frame_np.strides(n)); + } + FrameInfo frame_info(shapes, frame_type); + Frame* frame 
= new_frame(CPU_DEVICE, frame_info); + const char* frame_data = frame_np.get_data(); + + if (ndim == 3) { + assert(strides[1] % strides[2] == 0); + for (int i = 0; i < shapes[0]; ++i) { + u64 offset = strides[0] * i; + memcpy(frame->data + offset, frame_data + offset, + shapes[2] * shapes[1] * strides[2]); + } + } else { + LOG(FATAL) << "Can not support ndim != 3."; + } + insert_frame(output_columns[j], frame); + } else { + std::string field = py::extract(out_cols[j]); + size_t size = field.size(); + u8* buf = new_buffer(CPU_DEVICE, size); + memcpy_buffer(buf, CPU_DEVICE, (u8*)field.data(), CPU_DEVICE, size); + insert_element(output_columns[j], buf, size); + } + } + } + } catch (py::error_already_set& e) { + LOG(FATAL) << handle_pyerror(); + } + + PyGILState_Release(gstate); +} + +void PythonKernel::execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) { + if (can_batch_) { + batched_python_execute(input_columns, output_columns); + } else { + single_python_execute(input_columns, output_columns); + } + +} + +} diff --git a/scanner/engine/python_kernel.h b/scanner/engine/python_kernel.h new file mode 100644 index 00000000..5a949d7e --- /dev/null +++ b/scanner/engine/python_kernel.h @@ -0,0 +1,32 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" +#include "scanner/metadata.pb.h" + +#include +#include + +namespace scanner { + +class PythonKernel : public BatchedKernel { + public: + PythonKernel(const KernelConfig& config, const std::string& kernel_str, + const std::string& pickled_config, + const int preferred_batch = 1); + + ~PythonKernel(); + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override; + + private: + void batched_python_execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns); + void single_python_execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns); + KernelConfig config_; + DeviceHandle 
device_; + bool can_batch_; +}; + +} diff --git a/scanner/engine/rpc.proto b/scanner/engine/rpc.proto index 2e1ad304..ad4d7ee4 100644 --- a/scanner/engine/rpc.proto +++ b/scanner/engine/rpc.proto @@ -4,20 +4,49 @@ import "scanner/metadata.proto"; package scanner.proto; service Master { + /// Database metadata methods + + // List table info + rpc ListTables (Empty) returns (ListTablesResult) {} + // Get table info + rpc GetTables (GetTablesParams) returns (GetTablesResult) {} + // Delete table in the database + rpc DeleteTables (DeleteTablesParams) returns (Empty) {} + rpc NewTable (NewTableParams) returns (Empty) {} + + /// Worker registration methods // Called after a new worker spawns to register with the master rpc RegisterWorker (WorkerParams) returns (Registration) {} + // Called when a worker is removed + rpc UnregisterWorker (NodeInfo) returns (Empty) {} rpc ActiveWorkers (Empty) returns (RegisteredWorkers) {} // Ingest videos into the system rpc IngestVideos (IngestParameters) returns (IngestResult) {} - rpc NextWork (NodeInfo) returns (NewWork) {} - rpc NewJob (JobParameters) returns (Result) {} + rpc GetJobStatus (Empty) returns (JobStatus) {} + rpc Ping (Empty) returns (Empty) {} - rpc LoadOp (OpInfo) returns (Result) {} + rpc LoadOp (OpPath) returns (Result) {} + rpc RegisterOp (OpRegistration) returns (Result) {} + rpc RegisterPythonKernel (PythonKernelRegistration) returns (Result) {} + rpc GetOpInfo (OpInfoArgs) returns (OpInfo) {} + rpc Shutdown (Empty) returns (Result) {} + rpc PokeWatchdog (Empty) returns (Empty) {} + + // Internal + rpc NextWork (NodeInfo) returns (NewWork) {} + rpc FinishedWork (FinishedWorkParameters) returns (Empty) {} + rpc FinishedJob (FinishedJobParams) returns (Empty) {} + rpc NewJob (BulkJobParameters) returns (Result) {} } service Worker { - rpc NewJob (JobParameters) returns (Result) {} - rpc LoadOp (OpInfo) returns (Empty) {} + rpc NewJob (BulkJobParameters) returns (Result) {} + rpc LoadOp (OpPath) returns (Empty) {} 
+ rpc RegisterOp (OpRegistration) returns (Result) {} + rpc RegisterPythonKernel (PythonKernelRegistration) returns (Result) {} + rpc Shutdown (Empty) returns (Result) {} + rpc PokeWatchdog (Empty) returns (Empty) {} + rpc Ping (Empty) returns (Empty) {} } message Empty {} @@ -27,8 +56,53 @@ message Result { string msg = 2; } +message JobStatus { + bool finished = 1; + Result result = 2; + + int32 tasks_done = 3; + int32 total_tasks = 4; + + int32 jobs_done = 5; + int32 jobs_failed = 6; + int32 total_jobs = 7; + + int32 num_workers = 8; + int32 failed_workers = 9; +} + +message ListTablesResult { + repeated string tables = 1; +} + +message GetTablesParams { + repeated string tables = 1; +} + +message GetTablesResult { + Result result = 1; + repeated TableDescriptor tables = 2; + repeated VideoDescriptor videos = 3; +} + + +message DeleteTablesParams { + repeated string tables = 1; +} + +message NewTableRow { + repeated bytes columns = 1; +} + +message NewTableParams { + string table_name = 1; + repeated string columns = 2; + repeated NewTableRow rows = 3; +} + message WorkerParams { - string address = 1; + string port = 1; + MachineParameters params = 2; } message Registration { @@ -44,13 +118,34 @@ message RegisteredWorkers { repeated WorkerInfo workers = 1; } -message OpInfo { - string so_path = 1; +message OpPath { + string path = 1; +} + +message OpRegistration { + string name = 1; + bool variadic_inputs = 2; + repeated Column input_columns = 3; + repeated Column output_columns = 4; + bool can_stencil = 5; + repeated int32 preferred_stencil = 6; + bool has_bounded_state = 7; + int32 warmup = 8; + bool has_unbounded_state = 9; +} + +message PythonKernelRegistration { + string op_name = 1; + DeviceType device_type = 2; + string kernel_str = 3; + string pickled_config = 4; + int32 batch_size = 5; } message IngestParameters { repeated string table_names = 1; repeated string video_paths = 2; + bool inplace = 3; } message IngestResult { @@ -63,19 +158,61 @@ message 
NodeInfo { int32 node_id = 1; } -message JobParameters { +message FinishedWorkParameters { + int32 node_id = 1; + int64 job_id = 2; + int64 task_id = 3; + int64 num_rows = 4; +} + +message FinishedJobParams { + int32 node_id = 1; + Result result = 2; +} + +message BulkJobParameters { string job_name = 1; - TaskSet task_set = 2; - MemoryPoolConfig memory_pool_config = 3; - int32 pipeline_instances_per_node = 4; - int32 io_item_size = 5; - int32 work_item_size = 6; - int32 local_id = 7; - int32 local_total = 8; - bool show_progress = 9; + + repeated Op ops = 2; // Linearized DAG of Ops + repeated Job jobs = 3; + + // Parameters + repeated OutputColumnCompression compression = 4; + MemoryPoolConfig memory_pool_config = 5; + int32 pipeline_instances_per_node = 6; + int32 io_packet_size = 7; + int32 work_packet_size = 8; + int32 local_id = 9; + int32 local_total = 10; + bool profiling = 12; + int32 load_sparsity_threshold = 13; + int32 tasks_in_queue_per_pu = 14; + enum BoundaryCondition { + REPEAT_EDGE = 0; + REPEAT_NULL = 1; + ERROR = 2; + }; + BoundaryCondition boundary_condition = 15; + + float task_timeout = 16; } message NewWork { - IOItem io_item = 1; - LoadWorkEntry load_work = 2; -}; + int32 table_id = 1; + int32 job_index = 2; + int32 task_index = 3; + repeated int64 output_rows = 4 [packed=true]; + bool wait_for_work = 5; + bool no_more_work = 6; +} + +message OpInfoArgs { + string op_name = 1; +} + +message OpInfo { + Result result = 1; + bool variadic_inputs = 2; + repeated Column input_columns = 3; + repeated Column output_columns = 4; +} diff --git a/scanner/engine/runtime.cpp b/scanner/engine/runtime.cpp new file mode 100644 index 00000000..17ee0e1f --- /dev/null +++ b/scanner/engine/runtime.cpp @@ -0,0 +1,202 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "scanner/engine/runtime.h" +#include "scanner/engine/master.h" +#include "scanner/engine/worker.h" + +namespace scanner { +namespace internal { + +MasterImpl* get_master_service(DatabaseParameters& param) { + return new MasterImpl(param); +} + +WorkerImpl* get_worker_service(DatabaseParameters& params, + const std::string& master_address, + const std::string& worker_port) { + return new WorkerImpl(params, master_address, worker_port); +} + +void move_if_different_address_space(Profiler& profiler, + DeviceHandle current_handle, + DeviceHandle target_handle, + ElementList& column) { + if (!current_handle.is_same_address_space(target_handle) && + column.size() > 0) { + bool is_frame = column[0].is_frame; + + std::vector src_buffers; + std::vector dest_buffers; + std::vector sizes; + if (is_frame) { + for (i32 b = 0; b < (i32)column.size(); ++b) { + Frame* frame = column[b].as_frame(); + src_buffers.push_back(frame->data); + sizes.push_back(frame->size()); + } + } else { + for (i32 b = 0; b < (i32)column.size(); ++b) { + src_buffers.push_back(column[b].buffer); + sizes.push_back(column[b].size); + } + } + + size_t total_size = 0; + for (i32 b = 0; b < (i32)column.size(); ++b) { + total_size += sizes[b]; + } + + u8* block = new_block_buffer(target_handle, total_size, column.size()); + for (i32 b = 0; b < (i32)column.size(); ++b) { + size_t size = sizes[b]; + dest_buffers.push_back(block); + block += size; + } + + auto memcpy_start = now(); + memcpy_vec(dest_buffers, target_handle, src_buffers, current_handle, sizes); + 
profiler.add_interval("memcpy", memcpy_start, now()); + + auto delete_start = now(); + if (is_frame) { + for (i32 b = 0; b < (i32)column.size(); ++b) { + Frame* frame = column[b].as_frame(); + delete_buffer(current_handle, frame->data); + frame->data = dest_buffers[b]; + } + } else { + for (i32 b = 0; b < (i32)column.size(); ++b) { + delete_buffer(current_handle, column[b].buffer); + column[b].buffer = dest_buffers[b]; + } + } + } +} + +void move_if_different_address_space(Profiler& profiler, + DeviceHandle current_handle, + DeviceHandle target_handle, + BatchedColumns& columns) { + for (i32 i = 0; i < (i32)columns.size(); ++i) { + ElementList& column = columns[i]; + move_if_different_address_space(profiler, current_handle, target_handle, + column); + } +} + +ElementList copy_elements(Profiler& profiler, DeviceHandle current_handle, + DeviceHandle target_handle, ElementList& column) { + bool is_frame = column[0].is_frame; + + std::vector src_buffers; + std::vector dest_buffers; + std::vector sizes; + if (is_frame) { + for (i32 b = 0; b < (i32)column.size(); ++b) { + Frame* frame = column[b].as_frame(); + src_buffers.push_back(frame->data); + sizes.push_back(frame->size()); + } + } else { + for (i32 b = 0; b < (i32)column.size(); ++b) { + src_buffers.push_back(column[b].buffer); + sizes.push_back(column[b].size); + } + } + + size_t total_size = 0; + for (i32 b = 0; b < (i32)column.size(); ++b) { + total_size += sizes[b]; + } + + u8* block = new_block_buffer(target_handle, total_size, column.size()); + for (i32 b = 0; b < (i32)column.size(); ++b) { + size_t size = sizes[b]; + dest_buffers.push_back(block); + block += size; + } + + auto memcpy_start = now(); + memcpy_vec(dest_buffers, target_handle, src_buffers, current_handle, sizes); + profiler.add_interval("memcpy", memcpy_start, now()); + + ElementList output_list; + if (is_frame) { + for (i32 b = 0; b < (i32)column.size(); ++b) { + Frame* frame = + new Frame(column[b].as_frame()->as_frame_info(), 
dest_buffers[b]); + insert_frame(output_list, frame); + } + } else { + for (i32 b = 0; b < (i32)column.size(); ++b) { + insert_element(output_list, dest_buffers[b], sizes[b]); + } + } + return output_list; +} + +ElementList copy_or_ref_elements(Profiler& profiler, + DeviceHandle current_handle, + DeviceHandle target_handle, + ElementList& column) { + bool is_frame = column[0].is_frame; + + std::vector src_buffers; + std::vector sizes; + if (is_frame) { + for (i32 b = 0; b < (i32)column.size(); ++b) { + Frame* frame = column[b].as_frame(); + src_buffers.push_back(frame->data); + sizes.push_back(frame->size()); + } + } else { + for (i32 b = 0; b < (i32)column.size(); ++b) { + src_buffers.push_back(column[b].buffer); + sizes.push_back(column[b].size); + } + } + + size_t total_size = 0; + for (i32 b = 0; b < (i32)column.size(); ++b) { + total_size += sizes[b]; + } + + auto memcpy_start = now(); + std::vector dest_buffers; + copy_or_ref_buffers(dest_buffers, target_handle, src_buffers, current_handle, + sizes); + profiler.add_interval("memcpy", memcpy_start, now()); + + ElementList output_list; + if (is_frame) { + for (i32 b = 0; b < (i32)column.size(); ++b) { + Frame* frame = + new Frame(column[b].as_frame()->as_frame_info(), dest_buffers[b]); + insert_frame(output_list, frame); + } + } else { + for (i32 b = 0; b < (i32)column.size(); ++b) { + insert_element(output_list, dest_buffers[b], sizes[b]); + } + } + for (i32 b = 0; b < (i32)column.size(); ++b) { + output_list[b].index = column[b].index; + } + return output_list; +} + +} +} diff --git a/scanner/engine/runtime.h b/scanner/engine/runtime.h index ccc7295e..abc8b9d2 100644 --- a/scanner/engine/runtime.h +++ b/scanner/engine/runtime.h @@ -18,10 +18,11 @@ #include "scanner/api/database.h" #include "scanner/api/kernel.h" #include "scanner/api/op.h" -#include "scanner/engine/rpc.grpc.pb.h" -#include "scanner/engine/db.h" -#include "scanner/engine/op_registry.h" #include "scanner/engine/kernel_registry.h" +#include 
"scanner/engine/metadata.h" +#include "scanner/engine/op_registry.h" +#include "scanner/engine/rpc.grpc.pb.h" +#include "scanner/util/queue.h" #include "storehouse/storage_backend.h" @@ -30,9 +31,9 @@ #include #include -#include -#include #include +#include +#include namespace scanner { namespace internal { @@ -41,17 +42,45 @@ namespace internal { /// Work structs - structs used to exchange data between workers during /// execution of the run command. struct EvalWorkEntry { - i32 io_item_index; + i64 table_id; + i64 job_index; + i64 task_index; + std::vector> row_ids; BatchedColumns columns; std::vector column_handles; // Below only for pre/evaluate/post workers + std::vector inplace_video; std::vector column_types; bool needs_configure; bool needs_reset; - bool last_in_io_item; - i64 warmup_rows; + bool last_in_io_packet; + // Only for pre worker + std::vector video_encoding_type; + bool first; + bool last_in_task; + // For save and pre worker + std::vector frame_sizes; + std::vector compressed; }; +struct TaskStream { + i64 slice_group; + std::vector valid_input_rows; + std::vector compute_input_rows; + std::vector valid_output_rows; +}; + +using LoadInputQueue = + Queue, LoadWorkEntry>>; +using EvalQueue = + Queue, EvalWorkEntry>>; +using OutputEvalQueue = + Queue>; +using SaveInputQueue = + Queue>; +using SaveOutputQueue = + Queue>; + struct DatabaseParameters { storehouse::StorageConfig* storage_config; std::string db_path; @@ -59,12 +88,36 @@ struct DatabaseParameters { i32 num_load_workers; i32 num_save_workers; std::vector gpu_ids; + bool prefetch_table_metadata; + i64 no_workers_timeout; // in seconds }; -proto::Master::Service *get_master_service(DatabaseParameters ¶m); +class MasterImpl; +class WorkerImpl; + +MasterImpl* get_master_service(DatabaseParameters& param); + +WorkerImpl* get_worker_service(DatabaseParameters& params, + const std::string& master_address, + const std::string& worker_port); + +// Utilities +void 
move_if_different_address_space(Profiler& profiler, + DeviceHandle current_handle, + DeviceHandle target_handle, + ElementList& column); + +void move_if_different_address_space(Profiler& profiler, + DeviceHandle current_handle, + DeviceHandle target_handle, + BatchedColumns& columns); -proto::Worker::Service *get_worker_service(DatabaseParameters ¶ms, - const std::string &master_address); +ElementList copy_elements(Profiler& profiler, DeviceHandle current_handle, + DeviceHandle target_handle, ElementList& column); +ElementList copy_or_ref_elements(Profiler& profiler, + DeviceHandle current_handle, + DeviceHandle target_handle, + ElementList& column); } } diff --git a/scanner/engine/sample_kernel.h b/scanner/engine/sample_kernel.h new file mode 100644 index 00000000..eb190b0b --- /dev/null +++ b/scanner/engine/sample_kernel.h @@ -0,0 +1,26 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" +#include "scanner/metadata.pb.h" + +#include +#include + +namespace scanner { + +class SampleKernel : public Kernel { + public: + SampleKernel(const KernelConfig& config, const std::string& kernel_str, + const std::string& pickled_config); + + ~SamplehonKernel(); + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override; + + private: + KernelConfig config_; + DeviceHandle device_; +}; + +} diff --git a/scanner/engine/sample_op.cpp b/scanner/engine/sample_op.cpp new file mode 100644 index 00000000..051f4a84 --- /dev/null +++ b/scanner/engine/sample_op.cpp @@ -0,0 +1,40 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" + +namespace scanner { + +// Dummy Kernel +class SampleKernel : public BatchedKernel { + public: + SampleKernel(const KernelConfig& config) + : BatchedKernel(config) {} + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + // No implementation + } +}; + + +// Reserve Op name as builtin 
+REGISTER_OP(Sample).input("col").output("out"); + +REGISTER_KERNEL(Sample, SampleKernel).device(DeviceType::CPU).num_devices(1); + +REGISTER_KERNEL(Sample, SampleKernel).device(DeviceType::GPU).num_devices(1); + + +REGISTER_OP(SampleFrame).frame_input("col").frame_output("out"); + +REGISTER_KERNEL(SampleFrame, SampleKernel) + .device(DeviceType::CPU) + .batch() + .num_devices(1); + +REGISTER_KERNEL(SampleFrame, SampleKernel) + .device(DeviceType::GPU) + .batch() + .num_devices(1); + +} diff --git a/scanner/engine/sampler.cpp b/scanner/engine/sampler.cpp index 34dd9c75..65d93b19 100644 --- a/scanner/engine/sampler.cpp +++ b/scanner/engine/sampler.cpp @@ -16,37 +16,78 @@ #include "scanner/engine/sampler.h" #include "scanner/metadata.pb.h" -#include #include +#include +#include namespace scanner { namespace internal { namespace { -using SamplerFactory = - std::function &, const TableMetadata &)>; +using DomainSamplerFactory = + std::function&)>; -class AllSampler : public Sampler { -public: - AllSampler(const std::vector& args, const TableMetadata &table) - : Sampler("All", table) { +// 1 to 1 mapping +class DefaultDomainSampler : public DomainSampler { + public: + DefaultDomainSampler(const std::vector& args) + : DomainSampler("Default") { valid_.set_success(true); - if (!args_.ParseFromArray(args.data(), args.size())) { - RESULT_ERROR(&valid_, - "All sampler provided with invalid protobuf args"); - return; + } + + Result validate() override { + Result result; + result.set_success(true); + return result; + } + + Result get_upstream_rows(const std::vector& input_rows, + std::vector& output_rows) const { + output_rows = input_rows; + Result result; + result.set_success(true); + return result; + } + + Result get_num_downstream_rows(i64 num_upstream_rows, + i64& num_downstream_rows) const { + num_downstream_rows = num_upstream_rows; + Result result; + result.set_success(true); + return result; + } + + Result get_downstream_rows( + const std::vector& upstream_rows, 
std::vector& downstream_rows, + std::vector& downstream_upstream_mapping) const { + downstream_rows = upstream_rows; + for (i64 i = 0; i < upstream_rows.size(); ++i) { + downstream_upstream_mapping.push_back(i); } - if (args_.sample_size() <= 0) { + Result result; + result.set_success(true); + return result; + } + + private: + Result valid_; +}; + +class StridedDomainSampler : public DomainSampler { + public: + StridedDomainSampler(const std::vector& args) + : DomainSampler("Strided") { + valid_.set_success(true); + if (!args_.ParseFromArray(args.data(), args.size())) { RESULT_ERROR(&valid_, - "All sampler sample size (%ld) must be greater than 0", - args_.sample_size()); + "StridedSampler provided with invalid protobuf args"); return; } - if (args_.warmup_size() < 0) { + if (args_.stride() <= 0) { RESULT_ERROR(&valid_, - "All sampler warmup size (%ld) must be non-negative", - args_.warmup_size()); + "Strided sampler stride (%ld) must be greater than zero", + args_.stride()); return; } } @@ -57,44 +98,48 @@ class AllSampler : public Sampler { return result; } - i64 total_rows() const override { - return table_.num_rows(); + Result get_upstream_rows(const std::vector& downstream_rows, + std::vector& upstream_rows) const { + for (i64 in : downstream_rows) { + upstream_rows.push_back(in * args_.stride()); + } + Result result; + result.set_success(true); + return result; } - i64 total_samples() const override { - return (int) std::ceil((float) table_.num_rows() / args_.sample_size()); + Result get_num_downstream_rows(i64 num_upstream_rows, + i64& num_downstream_rows) const { + num_downstream_rows = ceil(num_upstream_rows / float(args_.stride())); + Result result; + result.set_success(true); + return result; } - RowSample next_sample() override { - RowSample sample; - i64 ws = std::max(0l, rows_pos_ - args_.warmup_size()); - i64 s = rows_pos_; - i64 e = std::min(total_rows(), rows_pos_ + args_.sample_size()); - rows_pos_ = e; - assert(rows_pos_ <= total_rows()); - for 
(i64 i = ws; i < s; ++i) { - sample.warmup_rows.push_back(i); - } - for (i64 i = s; i < e; ++i) { - sample.rows.push_back(i); + Result get_downstream_rows( + const std::vector& upstream_rows, std::vector& downstream_rows, + std::vector& downstream_upstream_mapping) const { + for (i64 i = 0; i < upstream_rows.size(); ++i) { + i64 in = upstream_rows[i]; + if (in % args_.stride() == 0) { + downstream_rows.push_back(in / args_.stride()); + downstream_upstream_mapping.push_back(i); + } } - return sample; - } - - void reset() override { - rows_pos_ = 0; + Result result; + result.set_success(true); + return result; } -private: + private: Result valid_; - proto::AllSamplerArgs args_; - i64 rows_pos_ = 0; + proto::StridedSamplerArgs args_; }; -class StridedRangeSampler : public Sampler { -public: - StridedRangeSampler(const std::vector& args, const TableMetadata &table) - : Sampler("StridedRange", table) { +class StridedRangesDomainSampler : public DomainSampler { + public: + StridedRangesDomainSampler(const std::vector& args) + : DomainSampler("StridedRanges") { valid_.set_success(true); if (!args_.ParseFromArray(args.data(), args.size())) { RESULT_ERROR(&valid_, @@ -107,271 +152,325 @@ class StridedRangeSampler : public Sampler { args_.stride()); return; } - if (args_.warmup_starts_size() != args_.starts_size() || - args_.starts_size() != args_.ends_size()) { + if (args_.starts_size() != args_.ends_size()) { RESULT_ERROR(&valid_, - "StridedRange warmups, starts, and ends not the same size"); + "StridedRange starts and ends not the same size"); return; } - for (i64 i = 0; i < args_.warmup_starts_size(); ++i) { - if (args_.warmup_starts(i) > args_.starts(i)) { - RESULT_ERROR( - &valid_, - "StridedRange warmup start (%ld) should not be after start (%ld)", - args_.warmup_starts(i), args_.starts(i)); - return; - } + i64 offset = 0; + for (i64 i = 0; i < args_.starts_size(); ++i) { if (args_.starts(i) > args_.ends(i)) { - RESULT_ERROR( - &valid_, - "StridedRange start (%ld) 
should not be after end (%ld)", - args_.starts(i), args_.ends(i)); - return; - } - if (args_.ends(i) > table.num_rows()) { - RESULT_ERROR( - &valid_, - "StridedRange end (%ld) should be less than table num rows (%ld)", - args_.ends(i), table.num_rows()); + RESULT_ERROR(&valid_, + "StridedRange start (%ld) should not be after end (%ld)", + args_.starts(i), args_.ends(i)); return; } - total_rows_ += (args_.ends(i) - args_.starts(i)) / args_.stride(); + i64 rows = + ceil((args_.ends(i) - args_.starts(i)) / (float)args_.stride()); + offset_at_range_starts_.push_back(offset); + offset += rows; } - total_samples_ = args_.warmup_starts_size(); + offset_at_range_starts_.push_back(offset); } - Result validate() override { - return valid_; - } - - i64 total_rows() const override { - return total_rows_; - } - - i64 total_samples() const override { - return total_samples_; + Result validate() override { return valid_; } + + Result get_upstream_rows(const std::vector& downstream_rows, + std::vector& upstream_rows) const override { + Result valid; + valid.set_success(true); + for (i64 in_row : downstream_rows) { + i64 range_idx = -1; + for (i64 i = 1; i < offset_at_range_starts_.size(); ++i) { + i64 start_offset = offset_at_range_starts_[i]; + if (in_row < start_offset) { + range_idx = i - 1; + break; + } + } + if (range_idx == -1) { + RESULT_ERROR(&valid, + "StridedRange received out of bounds request for row %ld " + "(max requestable row is %ld).", + in_row, + offset_at_range_starts_.back()); + return valid; + } + i64 normed_in = in_row - offset_at_range_starts_[range_idx]; + i64 out_row = args_.starts(range_idx) + normed_in * args_.stride(); + upstream_rows.push_back(out_row); + } + return valid; } - RowSample next_sample() override { - RowSample sample; - i64 stride = args_.stride(); - i64 ws = args_.warmup_starts(samples_pos_); - i64 s = args_.starts(samples_pos_); - i64 e = args_.ends(samples_pos_); - for (i64 i = ws; i < s; i += stride) { - 
sample.warmup_rows.push_back(i); + Result get_num_downstream_rows(i64 num_upstream_rows, + i64& num_downstream_rows) const { + i64 i = 0; + for (; i < args_.ends_size(); ++i) { + i64 start_offset = offset_at_range_starts_[i]; + if (num_upstream_rows < args_.ends(i)) { + break; + } } - for (i64 i = s; i < e; i += stride) { - sample.rows.push_back(i); + num_downstream_rows = 0; + for (i64 se = 0; se < i; ++se) { + num_downstream_rows += + ceil((args_.ends(se) - args_.starts(se)) / float(args_.stride())); + } + if (i != args_.ends_size()) { + num_downstream_rows += + ceil((num_upstream_rows - args_.starts(i)) / float(args_.stride())); } - samples_pos_++; - assert(samples_pos_ <= args_.warmup_starts_size()); - return sample; + Result valid; + valid.set_success(true); + return valid; } - void reset() override { - samples_pos_ = 0; + Result get_downstream_rows( + const std::vector& upstream_rows, std::vector& downstream_rows, + std::vector& downstream_upstream_mapping) const { + i64 offset = 0; + i64 range_idx = 0; + for (i64 i = 0; i < upstream_rows.size(); ++i) { + i64 r = upstream_rows[i]; + while (range_idx < args_.ends_size() && + !(r >= args_.starts(range_idx) && r < args_.ends(range_idx))) { + // Add number of valid rows in this range sequence to offset + offset += (args_.starts(range_idx) - args_.ends(range_idx) + + args_.stride() - 1) / + args_.stride(); + range_idx++; + } + if (range_idx == args_.ends_size()) { + break; + } + i64 relative_r = (r - args_.starts(range_idx)); + if (relative_r % args_.stride() == 0) { + downstream_rows.push_back(offset + relative_r / args_.stride()); + downstream_upstream_mapping.push_back(i); + } + } + Result valid; + valid.set_success(true); + return valid; } -private: + private: Result valid_; proto::StridedRangeSamplerArgs args_; - i64 total_rows_ = 0; - i64 total_samples_ = 0; - size_t samples_pos_ = 0; + std::vector offset_at_range_starts_; }; -class StencilSampler : public Sampler { -public: - StencilSampler(const 
std::vector &args, - const TableMetadata &table) - : Sampler("Stencil", table) { +class GatherDomainSampler : public DomainSampler { + public: + GatherDomainSampler(const std::vector& args) + : DomainSampler("Gather") { valid_.set_success(true); if (!args_.ParseFromArray(args.data(), args.size())) { - RESULT_ERROR( - &valid_, - "Stencil sampler provided with invalid protobuf args"); + RESULT_ERROR(&valid_, + "Gather sampler provided with invalid protobuf args"); return; } - if (args_.stride() <= 0) { - RESULT_ERROR( - &valid_, - "Stencil stride (%ld) must be greater than zero", - args_.stride()); - return; + i64 offset = 0; + for (i64 r : args_.rows()) { + gather_rows_[r] = offset++; } - for (i64 i = 0; i < args_.stencil_size(); ++i) { - if (args_.stencil(i) >= 0) { - RESULT_ERROR(&valid_, "Stencil elements (%ld) must be less than zero", - args_.stencil(i)); - return; + } + + Result validate() override { return valid_; } + + Result get_upstream_rows(const std::vector& upstream_rows, + std::vector& downstream_rows) const override { + Result valid; + valid.set_success(true); + for (i64 in_row : upstream_rows) { + if (in_row >= args_.rows_size()) { + RESULT_ERROR(&valid, + "Gather sampler received out of bounds request for " + "row %ld (max requestable row is %d).", + in_row, + args_.rows_size()); + return valid; } + downstream_rows.push_back(args_.rows(in_row)); } - for (i64 i = 0; i < args_.starts_size(); ++i) { - if (args_.starts(i) > args_.ends(i)) { - RESULT_ERROR( - &valid_, - "Stencil start (%ld) should not be after end (%ld)", - args_.starts(i), args_.ends(i)); - return; - } - for (i64 j = 0; j < args_.stencil_size(); ++j) { - if (args_.starts(i) + args_.stencil(j) < 0) { - RESULT_ERROR( - &valid_, - "Stencil start (%ld) with stencil offset %ld should be greater " - "than zero", - args_.starts(i), args_.stencil(j)); - return; - } - } - if (args_.ends(i) > table.num_rows()) { - RESULT_ERROR(&valid_, "Stencil end (%ld) should be less " - "than table num rows 
(%ld)", - args_.ends(i), table.num_rows()); - return; + return valid; + } + + Result get_num_downstream_rows(i64 num_upstream_rows, + i64& num_downstream_rows) const { + num_downstream_rows = 0; + for (i64 r : args_.rows()) { + if (r >= num_upstream_rows) { + break; } - total_rows_ += (args_.ends(i) - args_.starts(i)) / args_.stride(); + num_downstream_rows++; } - total_samples_ = args_.starts_size(); + Result valid; + valid.set_success(true); + return valid; } - Result validate() override { - return valid_; + Result get_downstream_rows( + const std::vector& upstream_rows, std::vector& downstream_rows, + std::vector& downstream_upstream_mapping) const { + for (i64 i = 0; i < upstream_rows.size(); ++i) { + i64 r = upstream_rows[i]; + if (gather_rows_.count(r) > 0) { + downstream_rows.push_back(gather_rows_.at(r)); + downstream_upstream_mapping.push_back(i); + } + } + Result valid; + valid.set_success(true); + return valid; } - i64 total_rows() const override { - return total_rows_; + private: + Result valid_; + proto::GatherSamplerArgs args_; + std::map gather_rows_; +}; + + +class SpaceNullDomainSampler : public DomainSampler { + public: + SpaceNullDomainSampler(const std::vector& args) + : DomainSampler("SpaceNull") { + valid_.set_success(true); + if (!args_.ParseFromArray(args.data(), args.size())) { + RESULT_ERROR(&valid_, + "SpaceNull sampler provided with invalid protobuf args"); + return; + } } - i64 total_samples() const override { - // NOTE(apoms): not a mistake, stencil sampler returns 1 row each time - return total_rows_; + Result validate() override { + return valid_; } - RowSample next_sample() override { - RowSample sample; - i64 stride = args_.stride(); - i64 s = args_.starts(samples_pos_); - int curr_start = s + stride * rows_pos_; - for (i64 off : args_.stencil()) { - sample.warmup_rows.push_back(curr_start + off); + Result get_upstream_rows(const std::vector& downstream_rows, + std::vector& upstream_rows) const { + std::set required_rows; + for 
(i64 r : downstream_rows) { + required_rows.insert(r / args_.spacing()); } - sample.rows.push_back(curr_start); - - rows_pos_++; - i64 e = args_.ends(samples_pos_); - if (curr_start + stride >= e) { - rows_pos_ = 0; - samples_pos_++; + for (i64 r : required_rows) { + upstream_rows.push_back(r); } - assert(samples_pos_ <= args_.starts_size()); - return sample; + std::sort(upstream_rows.begin(), upstream_rows.end()); + Result result; + result.set_success(true); + return result; } - void reset() override { - samples_pos_ = 0; - rows_pos_ = 0; + Result get_num_downstream_rows(i64 num_upstream_rows, + i64& num_downstream_rows) const { + num_downstream_rows = num_upstream_rows * args_.spacing(); + Result result; + result.set_success(true); + return result; + } + + Result get_downstream_rows( + const std::vector& upstream_rows, std::vector& downstream_rows, + std::vector& downstream_upstream_mapping) const { + for (i64 i = 0; i < upstream_rows.size(); ++i) { + i64 r = upstream_rows[i]; + i64 base = r * args_.spacing(); + downstream_rows.push_back(base); + downstream_upstream_mapping.push_back(i); + for (i64 offset = base + 1; offset < base + args_.spacing(); ++offset) { + downstream_rows.push_back(offset); + downstream_upstream_mapping.push_back(-1); + } + } + Result valid; + valid.set_success(true); + return valid; } -private: + private: Result valid_; - proto::StencilSamplerArgs args_; - i64 total_rows_ = 0; - i64 total_samples_ = 0; - size_t samples_pos_ = 0; - size_t rows_pos_ = 0; + proto::SpaceNullSamplerArgs args_; }; -class GatherSampler : public Sampler { -public: - GatherSampler(const std::vector& args, const TableMetadata &table) - : Sampler("Gather", table) { + +class SpaceRepeatDomainSampler : public DomainSampler { + public: + SpaceRepeatDomainSampler(const std::vector& args) + : DomainSampler("SpaceRepeat") { valid_.set_success(true); if (!args_.ParseFromArray(args.data(), args.size())) { RESULT_ERROR(&valid_, - "Gather sampler provided with invalid 
protobuf args"); + "SpaceRepeat sampler provided with invalid protobuf args"); return; } - for (i32 i = 0; i < args_.samples_size(); ++i) { - auto& s = args_.samples(i); - i64 max = -1; - for (i32 j = 0; j < s.warmup_rows_size(); ++j) { - if (s.warmup_rows(j) <= max) { - RESULT_ERROR( - &valid_, - "Gather sampler warmup row (%ld) less than previous row (%ld)", - s.warmup_rows(j), max); - return; - } - max = s.warmup_rows(j); - } - for (i32 j = 0; j < s.rows_size(); ++j) { - if (s.rows(j) <= max) { - RESULT_ERROR( - &valid_, - "Gather sampler row (%ld) less than previous row (%ld)", - s.rows(j), max); - return; - } - max = s.rows(j); - } - total_rows_ += args_.samples(i).rows_size(); - } } - Result validate() override { - return valid_; - } + Result validate() override { return valid_; } - i64 total_rows() const override { - return total_rows_; - } - - i64 total_samples() const override { - return args_.samples_size(); + Result get_upstream_rows(const std::vector& input_rows, + std::vector& output_rows) const { + std::unordered_set required_rows; + for (i64 r : input_rows) { + required_rows.insert(r / args_.spacing()); + } + output_rows = std::vector(required_rows.begin(), required_rows.end()); + std::sort(output_rows.begin(), output_rows.end()); + Result result; + result.set_success(true); + return result; } - RowSample next_sample() override { - RowSample sample; - auto &s = args_.samples(samples_pos_); - sample.warmup_rows = - std::vector(s.warmup_rows().begin(), s.warmup_rows().end()); - sample.rows = std::vector(s.rows().begin(), s.rows().end()); - samples_pos_++; - assert(samples_pos_ <= args_.samples_size()); - return sample; + Result get_num_downstream_rows(i64 num_upstream_rows, + i64& num_downstream_rows) const { + num_downstream_rows = num_upstream_rows * args_.spacing(); + Result result; + result.set_success(true); + return result; } - void reset() override { - samples_pos_ = 0; + Result get_downstream_rows( + const std::vector& upstream_rows, 
std::vector& downstream_rows, + std::vector& downstream_upstream_mapping) const { + for (i64 i = 0; i < upstream_rows.size(); ++i) { + i64 r = upstream_rows[i]; + i64 base = r * args_.spacing(); + for (i64 offset = base; offset < base + args_.spacing(); ++offset) { + downstream_rows.push_back(offset); + downstream_upstream_mapping.push_back(i); + } + } + Result valid; + valid.set_success(true); + return valid; } -private: + private: Result valid_; - proto::GatherSamplerArgs args_; - i64 total_rows_ = 0; - size_t samples_pos_ = 0; + proto::SpaceRepeatSamplerArgs args_; }; template -SamplerFactory make_factory() { - return [](const std::vector& args, const TableMetadata& table) { - return new T(args, table); +DomainSamplerFactory make_domain_factory() { + return [](const std::vector& args) { + return new T(args); }; } - } -Result make_sampler_instance(const std::string& sampler_type, - const std::vector& sampler_args, - const TableMetadata& sampled_table, - Sampler*& sampler) { - static std::map samplers = { - {"All", make_factory()}, - {"StridedRange", make_factory()}, - {"Stencil", make_factory()}, - {"Gather", make_factory()}}; +Result make_domain_sampler_instance(const std::string& sampler_type, + const std::vector& sampler_args, + DomainSampler*& sampler) { + static std::map samplers = { + {"All", make_domain_factory()}, + {"Strided", make_domain_factory()}, + {"StridedRanges", make_domain_factory()}, + {"Gather", make_domain_factory()}, + {"SpaceNull", make_domain_factory()}, + {"SpaceRepeat", make_domain_factory()}, + }; Result result; result.set_success(true); @@ -379,13 +478,14 @@ Result make_sampler_instance(const std::string& sampler_type, // Check if sampler type exists auto it = samplers.find(sampler_type); if (it == samplers.end()) { - RESULT_ERROR(&result, "Sampler type not found: %s", sampler_type.c_str()); + RESULT_ERROR(&result, "DomainSampler type not found: %s", + sampler_type.c_str()); return result; } // Validate sampler args - SamplerFactory 
factory = it->second; - Sampler* potential_sampler = factory(sampler_args, sampled_table); + DomainSamplerFactory factory = it->second; + DomainSampler* potential_sampler = factory(sampler_args); result = potential_sampler->validate(); if (!result.success()) { delete potential_sampler; @@ -396,131 +496,277 @@ Result make_sampler_instance(const std::string& sampler_type, return result; } -TaskSampler::TaskSampler( - const std::map &table_metas, - const proto::Task &task) - : table_metas_(table_metas), task_(task) { - valid_.set_success(true); - if (table_metas.count(task.output_table_name()) == 0) { - RESULT_ERROR(&valid_, "Output table %s does not exist.", - task.output_table_name().c_str()); - return; - } - // Create samplers for this task - for (auto& sample : task.samples()) { - if (table_metas.count(sample.table_name()) == 0) { - RESULT_ERROR(&valid_, "Requested table %s does not exist.", - sample.table_name().c_str()); +namespace { + +using PartitionerFactory = + std::function&, i64 num_rows)>; + +class StridedPartitioner : public Partitioner { + public: + StridedPartitioner(const std::vector& args, i64 num_rows) + : Partitioner("Strided", num_rows) { + valid_.set_success(true); + if (!args_.ParseFromArray(args.data(), args.size())) { + RESULT_ERROR(&valid_, "All sampler provided with invalid protobuf args"); return; } - const TableMetadata &t_meta = table_metas.at(sample.table_name()); - std::vector sampler_args(sample.sampling_args().begin(), - sample.sampling_args().end()); - Sampler* sampler = nullptr; - valid_ = make_sampler_instance(sample.sampling_function(), sampler_args, - t_meta, sampler); - if (!valid_.success()) { + if (args_.stride() <= 0) { + RESULT_ERROR(&valid_, + "Strided partitioner stride (%ld) must be greater than 0", + args_.stride()); return; } - samplers_.emplace_back(sampler); - } - total_rows_ = samplers_[0]->total_rows(); - total_samples_ = samplers_[0]->total_samples(); - for (auto &sampler : samplers_) { - if (sampler->total_rows() 
!= total_rows_) { - RESULT_ERROR(&valid_, "Samplers for task %s output a different number " - "of rows (%ld vs. %ld)", - task.output_table_name().c_str(), sampler->total_rows(), - total_rows_); + if (args_.group_size() <= 0) { + RESULT_ERROR( + &valid_, + "Strided partitioner group size (%ld) must be greater than 0", + args_.group_size()); return; } - if (sampler->total_samples() != total_samples_) { - RESULT_ERROR(&valid_, "Samplers for task %s output a different number " - "of samples (%ld vs. %ld)", - task.output_table_name().c_str(), sampler->total_samples(), - total_samples_); - return; + i64 num_strided_rows = (num_rows_ + args_.stride() - 1) / args_.stride(); + total_groups_ = + (i64)std::ceil(num_strided_rows / (float)args_.group_size()); + for (i64 i = 0; i < num_strided_rows; i += args_.group_size()) { + offset_at_group_.push_back(i); } + offset_at_group_.push_back(num_strided_rows); } - table_id_ = table_metas.at(task.output_table_name()).id(); -} -Result TaskSampler::validate() { - return valid_; -} + Result validate() override { return valid_; } -i64 TaskSampler::total_rows() { - return total_rows_; -} + i64 total_rows() const override { + return (num_rows_ + args_.stride() - 1) / args_.stride(); + } -i64 TaskSampler::total_samples() { - return total_samples_; -} + i64 total_groups() const override { return total_groups_; } -Result TaskSampler::next_work(proto::NewWork& new_work) { - if (!valid_.success()) { - return valid_; + std::vector total_rows_per_group() const override { + std::vector rows; + for (i64 i = 0; i < total_groups_; ++i) { + rows.push_back(offset_at_group_[i + 1] - offset_at_group_[i]); + } + return rows; + } + + PartitionGroup next_group() override { + assert(curr_group_idx_ < total_groups_); + return group_at(curr_group_idx_++); + } + + void reset() override { curr_group_idx_ = 0; } + + PartitionGroup group_at(i64 group_idx) override { + i64 pos = args_.group_size() * group_idx; + i64 s = pos; + i64 e = std::min(total_rows(), pos + 
args_.group_size()); + assert(s >= 0); + assert(e <= total_rows()); + PartitionGroup group; + for (i64 i = s; i < e; ++i) { + group.rows.push_back(i * args_.stride()); + } + return group; + } + + i64 offset_at_group(i64 group_idx) const override { + return offset_at_group_.at(group_idx); } - i64 sample_num = samples_pos_; - samples_pos_++; - assert(samples_pos_ <= total_samples_); - - i64 item_id = sample_num; - - proto::LoadWorkEntry &load_item = *new_work.mutable_load_work(); - load_item.set_io_item_index(item_id); - i64 warmup_rows = 0; - i64 rows = 0; - for (i32 i = 0; i < task_.samples_size(); ++i) { - auto &sample = task_.samples(i); - const TableMetadata &t_meta = table_metas_.at(sample.table_name()); - i32 sample_table_id = t_meta.id(); - - auto &sampler = samplers_[i]; - RowSample row_sample = sampler->next_sample(); - - proto::LoadSample *load_sample = load_item.add_samples(); - load_sample->set_table_id(sample_table_id); - for (auto col_name : sample.column_names()) { - load_sample->add_column_ids(t_meta.column_id(col_name)); - } - for (i64 r : row_sample.warmup_rows) { - load_sample->add_warmup_rows(r); - } - for (i64 r : row_sample.rows) { - load_sample->add_rows(r); - } - if (i == 0) { - warmup_rows = row_sample.warmup_rows.size(); - rows = row_sample.rows.size(); - } else { - if (row_sample.warmup_rows.size() != warmup_rows) { - RESULT_ERROR(&valid_, "Samplers for task %s output a different number " - "of warmup rows per sample (%ld vs. 
%ld)", - task_.output_table_name().c_str(), - row_sample.warmup_rows.size(), warmup_rows); - return valid_; + private: + Result valid_; + proto::StridedPartitionerArgs args_; + i64 curr_group_idx_ = 0; + i64 total_groups_; + std::vector offset_at_group_; +}; + +class StridedRangePartitioner : public Partitioner { + public: + StridedRangePartitioner(const std::vector& args, i64 num_rows) + : Partitioner("StridedRange", num_rows) { + valid_.set_success(true); + if (!args_.ParseFromArray(args.data(), args.size())) { + RESULT_ERROR(&valid_, + "StridedRange sampler provided with invalid protobuf args"); + return; + } + if (args_.stride() <= 0) { + RESULT_ERROR(&valid_, + "StridedRange stride (%ld) must be greater than zero", + args_.stride()); + return; + } + if (args_.starts_size() != args_.ends_size()) { + RESULT_ERROR(&valid_, + "StridedRange tarts and ends not the same size"); + return; + } + for (i64 i = 0; i < args_.starts_size(); ++i) { + if (args_.starts(i) > args_.ends(i)) { + RESULT_ERROR(&valid_, + "StridedRange start (%ld) should not be after end (%ld)", + args_.starts(i), args_.ends(i)); + return; } - if (row_sample.rows.size() != rows) { - RESULT_ERROR(&valid_, "Samplers for task %s output a different number " - "of rows per sample (%ld vs. 
%ld)", - task_.output_table_name().c_str(), row_sample.rows.size(), - rows); - return valid_; + if (args_.ends(i) > num_rows_) { + RESULT_ERROR( + &valid_, + "StridedRange end (%ld) should be less than table num rows (%ld)", + args_.ends(i), num_rows_); + return; } + i64 rows = + ceil((args_.ends(i) - args_.starts(i)) / (float)args_.stride()); + offset_at_group_.push_back(total_rows_); + total_rows_ += rows; + } + offset_at_group_.push_back(total_rows_); + total_groups_ = args_.starts_size(); + } + + Result validate() override { return valid_; } + + i64 total_rows() const override { return total_rows_; } + + i64 total_groups() const override { return total_groups_; } + + std::vector total_rows_per_group() const override { + std::vector rows; + for (i64 i = 0; i < total_groups_; ++i) { + rows.push_back(offset_at_group_[i + 1] - offset_at_group_[i]); + } + return rows; + } + + PartitionGroup next_group() override { + assert(curr_group_idx_ < total_groups_); + return group_at(curr_group_idx_++); + } + + void reset() override { curr_group_idx_ = 0; } + + PartitionGroup group_at(i64 group_idx) override { + i64 stride = args_.stride(); + i64 s = args_.starts(group_idx); + i64 e = args_.ends(group_idx); + PartitionGroup group; + for (i64 i = s; i < e; i += stride) { + group.rows.push_back(i); + } + return group; + } + + i64 offset_at_group(i64 group_idx) const override { + return offset_at_group_.at(group_idx); + } + + private: + Result valid_; + proto::StridedRangePartitionerArgs args_; + i64 total_rows_ = 0; + i64 total_groups_ = 0; + std::vector offset_at_group_; + i64 curr_group_idx_ = 0; +}; + +class GatherPartitioner : public Partitioner { + public: + GatherPartitioner(const std::vector& args, i64 num_rows) + : Partitioner("Gather", num_rows) { + valid_.set_success(true); + if (!args_.ParseFromArray(args.data(), args.size())) { + RESULT_ERROR(&valid_, + "Gather sampler provided with invalid protobuf args"); + return; + } + for (i32 i = 0; i < args_.groups_size(); 
++i) { + auto& s = args_.groups(i); + i64 rows = s.rows_size(); + offset_at_group_.push_back(total_rows_); + total_rows_ += rows; + } + offset_at_group_.push_back(total_rows_); + total_groups_ = args_.groups_size(); + } + + Result validate() override { return valid_; } + + i64 total_rows() const override { return total_rows_; } + + i64 total_groups() const override { return total_groups_; } + + std::vector total_rows_per_group() const override { + std::vector rows; + for (i64 i = 0; i < total_groups_; ++i) { + rows.push_back(offset_at_group_[i + 1] - offset_at_group_[i]); } + return rows; + } + + PartitionGroup next_group() override { + assert(curr_group_idx_ < total_groups_); + return group_at(curr_group_idx_++); + } + + void reset() override { curr_group_idx_ = 0; } + + PartitionGroup group_at(i64 group_idx) override { + PartitionGroup group; + auto& s = args_.groups(curr_group_idx_); + group.rows = std::vector(s.rows().begin(), s.rows().end()); + return group; + } + + i64 offset_at_group(i64 group_idx) const override { + return offset_at_group_.at(group_idx); } - proto::IOItem &item = *new_work.mutable_io_item(); - item.set_table_id(table_id_); - item.set_item_id(item_id); - item.set_start_row(allocated_rows_); - item.set_end_row(allocated_rows_ + rows); + private: + Result valid_; + proto::GatherPartitionerArgs args_; + i64 total_rows_ = 0; + i64 total_groups_ = 0; + std::vector offset_at_group_; + i64 curr_group_idx_ = 0; +}; - allocated_rows_ += rows; +template +PartitionerFactory make_factory() { + return [](const std::vector& args, i64 num_rows) { + return new T(args, num_rows); + }; +} +} - return valid_; +Result make_partitioner_instance(const std::string& sampler_type, + const std::vector& sampler_args, + i64 num_rows, Partitioner*& sampler) { + static std::map samplers = { + {"Strided", make_factory()}, + {"StridedRange", make_factory()}, + {"Gather", make_factory()}}; + + Result result; + result.set_success(true); + + // Check if sampler type exists + 
auto it = samplers.find(sampler_type); + if (it == samplers.end()) { + RESULT_ERROR(&result, "Partitioner type not found: %s", sampler_type.c_str()); + return result; + } + + // Validate sampler args + PartitionerFactory factory = it->second; + Partitioner* potential_sampler = factory(sampler_args, num_rows); + result = potential_sampler->validate(); + if (!result.success()) { + delete potential_sampler; + } else { + sampler = potential_sampler; + } + + return result; } } diff --git a/scanner/engine/sampler.h b/scanner/engine/sampler.h index 98d34dff..7b67a44d 100644 --- a/scanner/engine/sampler.h +++ b/scanner/engine/sampler.h @@ -15,7 +15,8 @@ #pragma once -#include "scanner/engine/db.h" +#include "scanner/engine/metadata.h" +#include "scanner/engine/table_meta_cache.h" #include "scanner/util/common.h" #include "scanner/util/profiler.h" @@ -35,62 +36,74 @@ namespace internal { - Filter: select all rows where some predicate holds on one of the columns */ -struct RowSample { - std::vector warmup_rows; - std::vector rows; -}; +class DomainSampler { + public: + DomainSampler(const std::string& name) + : name_(name) {} -class Sampler { -public: - Sampler(const std::string& name, const TableMetadata& table) : - name_(name), table_(table) {} + virtual ~DomainSampler() {} const std::string& name() const { return name_; } virtual Result validate() = 0; - virtual i64 total_rows() const = 0; + virtual Result get_upstream_rows(const std::vector& downstream_rows, + std::vector& upstream_rows) const = 0; - virtual i64 total_samples() const = 0; + virtual Result get_num_downstream_rows( + i64 num_upstream_rows, + i64& num_downstream_rows) const = 0; - virtual RowSample next_sample() = 0; - - virtual void reset() = 0; + virtual Result get_downstream_rows( + const std::vector& upstream_rows, + std::vector& downstream_rows, + std::vector& downstream_upstream_mapping) const = 0; -protected: + protected: std::string name_; - TableMetadata table_; }; -Result 
make_sampler_instance(const std::string& sampler_type, +Result +make_domain_sampler_instance(const std::string& sampler_type, const std::vector& sampler_args, - const TableMetadata& sampled_table, - Sampler*& sampler); + DomainSampler*& sampler); -class TaskSampler { -public: - TaskSampler(const std::map &table_metas, - const proto::Task &task); +struct PartitionGroup { + std::vector rows; +}; - Result validate(); +class Partitioner { + public: + Partitioner(const std::string& name, i64 num_rows) + : name_(name), num_rows_(num_rows) {} - i64 total_rows(); + virtual ~Partitioner() {} - i64 total_samples(); + const std::string& name() const { return name_; } - Result next_work(proto::NewWork& new_work); + virtual Result validate() = 0; -private: - const std::map& table_metas_; - const proto::Task& task_; - Result valid_; - std::vector> samplers_; - i64 total_rows_ = 0; - i32 table_id_; - i64 total_samples_ = 0; - i64 samples_pos_ = 0; - i64 allocated_rows_ = 0; + virtual i64 total_rows() const = 0; + + virtual i64 total_groups() const = 0; + + virtual std::vector total_rows_per_group() const = 0; + + virtual PartitionGroup next_group() = 0; + + virtual void reset() = 0; + + virtual PartitionGroup group_at(i64 group_idx) = 0; + + virtual i64 offset_at_group(i64 group_idx) const = 0; + + protected: + std::string name_; + i64 num_rows_; }; +Result make_partitioner_instance(const std::string& sampler_type, + const std::vector& sampler_args, + i64 num_rows, Partitioner*& partitioner); } } diff --git a/scanner/engine/sampling.h b/scanner/engine/sampler_registry.h similarity index 51% rename from scanner/engine/sampling.h rename to scanner/engine/sampler_registry.h index cf0fd643..402ba665 100644 --- a/scanner/engine/sampling.h +++ b/scanner/engine/sampler_registry.h @@ -15,30 +15,28 @@ #pragma once -#include "scanner/engine/runtime.h" +#include "scanner/api/op.h" +#include "scanner/engine/op_info.h" + #include "scanner/util/common.h" +#include + namespace scanner { 
namespace internal { -struct RowIntervals { - std::vector item_ids; - std::vector> item_intervals; - std::vector> valid_offsets; -}; +class SamplerRegistry { + public: + void add_sampler(const std::string& name, Sampler* info); + + OpInfo* get_op_info(const std::string& name) const; -// Gets the list of work items for a sequence of rows in the job -RowIntervals -slice_into_row_intervals(const TableMetadata &table, - const std::vector &rows); + bool has_op(const std::string& name) const; -struct VideoIntervals { - std::vector> keyframe_index_intervals; - std::vector> valid_frames; + private: + std::map ops_; }; -VideoIntervals -slice_into_video_intervals(const std::vector &keyframe_positions, - const std::vector &rows); +OpRegistry* get_op_registry(); } } diff --git a/scanner/engine/sampling.cpp b/scanner/engine/sampling.cpp deleted file mode 100644 index 13f3730d..00000000 --- a/scanner/engine/sampling.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright 2016 Carnegie Mellon University - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "scanner/engine/sampling.h" - -namespace scanner { -namespace internal { - -// Gets the list of work items for a sequence of rows in the job -RowIntervals slice_into_row_intervals(const TableMetadata &table, - const std::vector &rows) { - RowIntervals info; - // Analyze rows and table to determine what item ids and offsets in them to - // sample from - std::vector end_rows = table.end_rows(); - auto item_from_row = [&end_rows](i64 r) -> i32 { - i64 i = 0; - for (; i < end_rows.size(); ++i) { - if (r < end_rows[i]) { - break; - } - } - assert(i != end_rows.size()); - return i; - }; - - auto offset_from_row = [&end_rows](i64 r) -> i64 { - i64 i = 0; - i64 sum = 0; - for (; i < end_rows.size(); ++i) { - if (r < end_rows[i]) { - break; - } - sum += end_rows[i]; - } - assert(i != end_rows.size()); - return r - sum; - }; - - assert(!rows.empty()); - i32 current_item = item_from_row(rows[0]); - i64 item_start = offset_from_row(rows[0]); - i64 item_end = item_start + 1; - std::vector valid_offsets; - for (i64 row : rows) { - i32 item = item_from_row(row); - i64 item_offset = offset_from_row(row); - if (item != current_item) { - // Start a new item and push the current one into the list - info.item_ids.push_back(current_item); - info.item_intervals.push_back(std::make_tuple(item_start, item_end)); - info.valid_offsets.push_back(valid_offsets); - - current_item = item; - item_start = item_offset; - item_end = item_offset + 1; - valid_offsets.clear(); - } - - valid_offsets.push_back(item_offset); - item_end = item_offset + 1; - } - info.item_ids.push_back(current_item); - info.item_intervals.push_back(std::make_tuple(item_start, item_end)); - info.valid_offsets.push_back(valid_offsets); - - return info; -} - -VideoIntervals -slice_into_video_intervals(const std::vector &keyframe_positions, - const std::vector &rows) { - VideoIntervals info; - assert(keyframe_positions.size() >= 2); - size_t start_keyframe_index = 0; - size_t end_keyframe_index = 1; - i64 
next_keyframe = keyframe_positions[end_keyframe_index]; - std::vector valid_frames; - for (i64 row : rows) { - if (row >= next_keyframe) { - assert(end_keyframe_index < keyframe_positions.size() - 1); - next_keyframe = keyframe_positions[++end_keyframe_index]; - if (row >= next_keyframe) { - // Skipped a keyframe, so make a new interval - if (!valid_frames.empty()) { - info.keyframe_index_intervals.push_back( - std::make_tuple(start_keyframe_index, end_keyframe_index - 1)); - info.valid_frames.push_back(valid_frames); - } - - while (row >= keyframe_positions[end_keyframe_index]) { - end_keyframe_index++; - assert(end_keyframe_index < keyframe_positions.size()); - } - valid_frames.clear(); - start_keyframe_index = end_keyframe_index - 1; - next_keyframe = keyframe_positions[end_keyframe_index]; - } - } - valid_frames.push_back(row); - } - info.keyframe_index_intervals.push_back( - std::make_tuple(start_keyframe_index, end_keyframe_index)); - info.valid_frames.push_back(valid_frames); - return info; -} -} -} diff --git a/scanner/engine/save_worker.cpp b/scanner/engine/save_worker.cpp index ae5f746d..5459a6a6 100644 --- a/scanner/engine/save_worker.cpp +++ b/scanner/engine/save_worker.cpp @@ -15,9 +15,10 @@ #include "scanner/engine/save_worker.h" -#include "scanner/engine/db.h" +#include "scanner/engine/metadata.h" #include "scanner/util/common.h" #include "scanner/util/storehouse.h" +#include "scanner/video/h264_byte_stream_index_creator.h" #include "storehouse/storage_backend.h" @@ -30,133 +31,224 @@ using storehouse::RandomReadFile; namespace scanner { namespace internal { -void *save_thread(void *arg) { - SaveThreadArgs &args = *reinterpret_cast(arg); - +SaveWorker::SaveWorker(const SaveWorkerArgs& args) + : node_id_(args.node_id), worker_id_(args.worker_id), profiler_(args.profiler) { auto setup_start = now(); - // Setup a distinct storage backend for each IO thread - storehouse::StorageBackend *storage = - 
storehouse::StorageBackend::make_from_config(args.storage_config); + storage_.reset( + storehouse::StorageBackend::make_from_config(args.storage_config)); args.profiler.add_interval("setup", setup_start, now()); - while (true) { - auto idle_start = now(); - - std::tuple entry; - args.input_work.pop(entry); - IOItem &io_item = std::get<0>(entry); - EvalWorkEntry &work_entry = std::get<1>(entry); - - if (work_entry.io_item_index == -1) { - break; - } - - VLOG(1) << "Save (N/KI: " << args.node_id << "/" << args.id - << "): processing item " << work_entry.io_item_index; - - args.profiler.add_interval("idle", idle_start, now()); - - auto work_start = now(); - - // Write out each output column to an individual data file - for (size_t out_idx = 0; out_idx < work_entry.columns.size(); ++out_idx) { - u64 num_rows = static_cast(work_entry.columns[out_idx].rows.size()); - - const std::string output_path = table_item_output_path( - io_item.table_id(), out_idx, io_item.item_id()); +} - auto io_start = now(); +SaveWorker::~SaveWorker() { + for (auto& file : output_) { + file->save(); + } + for (auto& file : output_metadata_) { + file->save(); + } + for (auto& meta : video_metadata_) { + write_video_metadata(storage_.get(), meta); + } + output_.clear(); + output_metadata_.clear(); + video_metadata_.clear(); +} - WriteFile *output_file = nullptr; - BACKOFF_FAIL(storage->make_write_file(output_path, output_file)); +void SaveWorker::feed(EvalWorkEntry& input_entry) { + EvalWorkEntry& work_entry = input_entry; - if (work_entry.columns[out_idx].rows.size() != num_rows) { - LOG(FATAL) << "Output layer's row vector has wrong length"; - } + // Write out each output column to an individual data file + i32 video_col_idx = 0; + for (size_t out_idx = 0; out_idx < work_entry.columns.size(); ++out_idx) { + u64 num_elements = static_cast(work_entry.columns[out_idx].size()); - if (!work_entry.column_handles[out_idx].is_same_address_space( - CPU_DEVICE)) { - std::vector dest_buffers, src_buffers; 
- std::vector sizes; - size_t total_size = 0; - for (i32 f = 0; f < num_rows; ++f) { - Row &row = work_entry.columns[out_idx].rows[f]; - total_size += row.size; - } + auto io_start = now(); - if (num_rows > 0) { - u8 *output_block = new_block_buffer(CPU_DEVICE, total_size, num_rows); - for (i32 f = 0; f < num_rows; ++f) { - Row &row = work_entry.columns[out_idx].rows[f]; - size_t size = row.size; - u8 *src_buffer = row.buffer; - u8 *dest_buffer = output_block; + WriteFile* output_file = output_.at(out_idx).get(); + WriteFile* output_metadata_file = output_metadata_.at(out_idx).get(); - dest_buffers.push_back(dest_buffer); - src_buffers.push_back(src_buffer); - sizes.push_back(size); + if (work_entry.columns[out_idx].size() != num_elements) { + LOG(FATAL) << "Output layer's element vector has wrong length"; + } - output_block += size; + // Ensure the data is on the CPU + move_if_different_address_space(profiler_, + work_entry.column_handles[out_idx], + CPU_DEVICE, work_entry.columns[out_idx]); + + bool compressed = work_entry.compressed[out_idx]; + // If this is a video... 
+ i64 size_written = 0; + if (work_entry.column_types[out_idx] == ColumnType::Video) { + // Read frame info column + assert(work_entry.columns[out_idx].size() > 0); + FrameInfo frame_info = work_entry.frame_sizes[video_col_idx]; + + // Create index column + VideoMetadata& video_meta = video_metadata_[video_col_idx]; + proto::VideoDescriptor& video_descriptor = video_meta.get_descriptor(); + + video_descriptor.set_width(frame_info.width()); + video_descriptor.set_height(frame_info.height()); + video_descriptor.set_channels(frame_info.channels()); + video_descriptor.set_frame_type(frame_info.type); + + video_descriptor.set_time_base_num(1); + video_descriptor.set_time_base_denom(25); + + video_descriptor.set_num_encoded_videos( + video_descriptor.num_encoded_videos() + 1); + + if (compressed && frame_info.type == FrameType::U8 && + frame_info.channels() == 3) { + H264ByteStreamIndexCreator index_creator(output_file); + for (size_t i = 0; i < num_elements; ++i) { + Element& element = work_entry.columns[out_idx][i]; + if (!index_creator.feed_packet(element.buffer, element.size)) { + LOG(FATAL) << "Error in save worker h264 index creator: " + << index_creator.error_message(); } + size_written += element.size; + } - memcpy_vec(dest_buffers, CPU_DEVICE, src_buffers, - work_entry.column_handles[out_idx], sizes); - - for (i32 f = 0; f < num_rows; ++f) { - delete_buffer(work_entry.column_handles[out_idx] , src_buffers[f]); - work_entry.columns[out_idx].rows[f].buffer = dest_buffers[f]; - } + i64 frame = index_creator.frames(); + i32 num_non_ref_frames = index_creator.num_non_ref_frames(); + const std::vector& metadata_bytes = index_creator.metadata_bytes(); + const std::vector& keyframe_indices = + index_creator.keyframe_indices(); + const std::vector& sample_offsets = + index_creator.sample_offsets(); + const std::vector& sample_sizes = + index_creator.sample_sizes(); + + video_descriptor.set_chroma_format(proto::VideoDescriptor::YUV_420); + 
video_descriptor.set_codec_type(proto::VideoDescriptor::H264); + + video_descriptor.set_frames(video_descriptor.frames() + frame); + video_descriptor.add_frames_per_video(frame); + video_descriptor.add_keyframes_per_video(keyframe_indices.size()); + video_descriptor.add_size_per_video(index_creator.bytestream_pos()); + video_descriptor.set_metadata_packets(metadata_bytes.data(), + metadata_bytes.size()); + + const std::string output_path = + table_item_output_path(video_descriptor.table_id(), out_idx, + video_descriptor.item_id()); + video_descriptor.set_data_path(output_path); + video_descriptor.set_inplace(false); + + for (u64 v : keyframe_indices) { + video_descriptor.add_keyframe_indices(v); + } + for (u64 v : sample_offsets) { + video_descriptor.add_sample_offsets(v); + } + for (u64 v : sample_sizes) { + video_descriptor.add_sample_sizes(v); + } + } else { + // Non h264 compressible video column + video_descriptor.set_codec_type(proto::VideoDescriptor::RAW); + // Need to specify but not used for this type + video_descriptor.set_chroma_format(proto::VideoDescriptor::YUV_420); + video_descriptor.set_frames(video_descriptor.frames() + num_elements); + + // Write number of elements in the file + s_write(output_metadata_file, num_elements); + // Write out all output sizes first so we can easily index into the + // file + for (size_t i = 0; i < num_elements; ++i) { + Frame* frame = work_entry.columns[out_idx][i].as_frame(); + u64 buffer_size = frame->size(); + s_write(output_metadata_file, buffer_size); + size_written += sizeof(u64); + } + // Write actual output data + for (size_t i = 0; i < num_elements; ++i) { + Frame* frame = work_entry.columns[out_idx][i].as_frame(); + i64 buffer_size = frame->size(); + u8* buffer = frame->data; + s_write(output_file, buffer, buffer_size); + size_written += buffer_size; } } - // Write number of rows in the file - s_write(output_file, num_rows); - // Write out all output sizes first so we can easily index into the file - i64 
size_written = 0; - for (size_t i = 0; i < num_rows; ++i) { - i64 buffer_size = work_entry.columns[out_idx].rows[i].size; - s_write(output_file, buffer_size); - size_written += sizeof(i64); + video_col_idx++; + } else { + // Write number of elements in the file + s_write(output_metadata_file, num_elements); + // Write out all output sizes to metadata file so we can easily index into the data file + for (size_t i = 0; i < num_elements; ++i) { + u64 buffer_size = work_entry.columns[out_idx][i].size; + s_write(output_metadata_file, buffer_size); + size_written += sizeof(u64); } // Write actual output data - for (size_t i = 0; i < num_rows; ++i) { - i64 buffer_size = work_entry.columns[out_idx].rows[i].size; - u8 *buffer = work_entry.columns[out_idx].rows[i].buffer; + for (size_t i = 0; i < num_elements; ++i) { + i64 buffer_size = work_entry.columns[out_idx][i].size; + u8* buffer = work_entry.columns[out_idx][i].buffer; s_write(output_file, buffer, buffer_size); size_written += buffer_size; } - - BACKOFF_FAIL(output_file->save()); - - // TODO(apoms): For now, all evaluators are expected to return CPU - // buffers as output so just assume CPU - for (size_t i = 0; i < num_rows; ++i) { - delete_buffer(CPU_DEVICE, work_entry.columns[out_idx].rows[i].buffer); - } - - delete output_file; - - args.profiler.add_interval("io", io_start, now()); - args.profiler.increment("io_write", size_written); } - VLOG(1) << "Save (N/KI: " << args.node_id << "/" << args.id - << "): finished item " << work_entry.io_item_index; - - args.profiler.add_interval("task", work_start, now()); + // TODO(apoms): For now, all evaluators are expected to return CPU + // buffers as output so just assume CPU + for (size_t i = 0; i < num_elements; ++i) { + delete_element(CPU_DEVICE, work_entry.columns[out_idx][i]); + } - args.retired_items++; + profiler_.add_interval("io", io_start, now()); + profiler_.increment("io_write", size_written); } +} - VLOG(1) << "Save (N/KI: " << args.node_id << "/" << args.id - 
<< "): thread finished "; - - // Cleanup - delete storage; - - THREAD_RETURN_SUCCESS(); +void SaveWorker::new_task(i32 table_id, i32 task_id, + std::vector column_types) { + auto io_start = now(); + for (auto& file : output_) { + file->save(); + } + for (auto& file : output_metadata_) { + file->save(); + } + for (auto& meta : video_metadata_) { + write_video_metadata(storage_.get(), meta); + } + output_.clear(); + output_metadata_.clear(); + video_metadata_.clear(); + + profiler_.add_interval("io", io_start, now()); + + for (size_t out_idx = 0; out_idx < column_types.size(); ++out_idx) { + const std::string output_path = + table_item_output_path(table_id, out_idx, task_id); + const std::string output_metdata_path = + table_item_metadata_path(table_id, out_idx, task_id); + + WriteFile* output_file = nullptr; + BACKOFF_FAIL(storage_->make_write_file(output_path, output_file)); + output_.emplace_back(output_file); + + WriteFile* output_metadata_file = nullptr; + BACKOFF_FAIL( + storage_->make_write_file(output_metdata_path, output_metadata_file)); + output_metadata_.emplace_back(output_metadata_file); + + if (column_types[out_idx] == ColumnType::Video) { + video_metadata_.emplace_back(); + + VideoMetadata& video_meta = video_metadata_.back(); + proto::VideoDescriptor& video_descriptor = video_meta.get_descriptor(); + video_descriptor.set_table_id(table_id); + video_descriptor.set_column_id(out_idx); + video_descriptor.set_item_id(task_id); + } + } } } } diff --git a/scanner/engine/save_worker.h b/scanner/engine/save_worker.h index 8f8b0632..cb9f847e 100644 --- a/scanner/engine/save_worker.h +++ b/scanner/engine/save_worker.h @@ -18,26 +18,52 @@ #include "scanner/engine/runtime.h" #include "scanner/util/common.h" #include "scanner/util/queue.h" +#include "scanner/util/storehouse.h" namespace scanner { namespace internal { -struct SaveThreadArgs { +struct SaveWorkerArgs { // Uniform arguments i32 node_id; - std::string job_name; // Per worker arguments - int id; + int 
worker_id; storehouse::StorageConfig* storage_config; Profiler& profiler; - - // Queues for communicating work - Queue>& input_work; - std::atomic& retired_items; }; -void* save_thread(void* arg); +class SaveWorker { + public: + SaveWorker(const SaveWorkerArgs& args); + ~SaveWorker(); + + void feed(EvalWorkEntry& input_entry); + + void new_task(i32 table_id, i32 task_id, + std::vector column_types); + + private: + const i32 node_id_; + const i32 worker_id_; + Profiler& profiler_; + // Setup a distinct storage backend for each IO thread + std::unique_ptr storage_; + // Files to write io packets to + std::vector> output_; + std::vector> output_metadata_; + std::vector video_metadata_; + + // Continuation state + bool first_item_; + bool needs_configure_; + bool needs_reset_; + + i64 current_work_item_; + i64 current_row_; + i64 total_work_items_; + +}; } } diff --git a/scanner/engine/slice_op.cpp b/scanner/engine/slice_op.cpp new file mode 100644 index 00000000..068d968e --- /dev/null +++ b/scanner/engine/slice_op.cpp @@ -0,0 +1,40 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" + +namespace scanner { + +// Dummy Kernel +class SliceKernel : public BatchedKernel { + public: + SliceKernel(const KernelConfig& config) + : BatchedKernel(config) {} + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + // No implementation + } +}; + + +// Reserve Op name as builtin +REGISTER_OP(Slice).input("col").output("out"); + +REGISTER_KERNEL(Slice, SliceKernel).device(DeviceType::CPU).num_devices(1); + +REGISTER_KERNEL(Slice, SliceKernel).device(DeviceType::GPU).num_devices(1); + + +REGISTER_OP(SliceFrame).frame_input("col").frame_output("out"); + +REGISTER_KERNEL(SliceFrame, SliceKernel) + .device(DeviceType::CPU) + .batch() + .num_devices(1); + +REGISTER_KERNEL(SliceFrame, SliceKernel) + .device(DeviceType::GPU) + .batch() + .num_devices(1); + +} diff --git a/scanner/engine/space_op.cpp 
b/scanner/engine/space_op.cpp new file mode 100644 index 00000000..ea636a72 --- /dev/null +++ b/scanner/engine/space_op.cpp @@ -0,0 +1,44 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" + +namespace scanner { + +// Dummy Kernel +class SpaceKernel : public BatchedKernel { + public: + SpaceKernel(const KernelConfig& config) + : BatchedKernel(config) {} + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + // No implementation + } +}; + + +// Reserve Op name as builtin +REGISTER_OP(Space).input("col").output("out"); + +REGISTER_KERNEL(Space, SpaceKernel) + .device(DeviceType::CPU) + .batch() + .num_devices(1); + +REGISTER_KERNEL(Space, SpaceKernel) + .device(DeviceType::GPU) + .batch() + .num_devices(1); + +REGISTER_OP(SpaceFrame).frame_input("col").frame_output("out"); + +REGISTER_KERNEL(spaceFrame, SpaceKernel) + .device(DeviceType::CPU) + .batch() + .num_devices(1); + +REGISTER_KERNEL(SpaceFrame, SpaceKernel) + .device(DeviceType::GPU) + .batch() + .num_devices(1); +} diff --git a/scanner/engine/table_meta_cache.cpp b/scanner/engine/table_meta_cache.cpp new file mode 100644 index 00000000..a09de5cf --- /dev/null +++ b/scanner/engine/table_meta_cache.cpp @@ -0,0 +1,97 @@ +/* Copyright 2017 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "scanner/engine/table_meta_cache.h" +#include "scanner/util/thread_pool.h" + +namespace scanner { +namespace internal { + +static const i32 NUM_PREFETCH_THREADS = 64; + +TableMetaCache::TableMetaCache(storehouse::StorageBackend* storage, + const DatabaseMetadata& meta) + : storage_(storage), meta_(meta) {} + +const TableMetadata& TableMetaCache::at(const std::string& table_name) const { + i32 table_id = meta_.get_table_id(table_name); + memoized_read(table_id); + std::lock_guard lock(lock_); + return cache_.at(table_id); +} + +const TableMetadata& TableMetaCache::at(i32 table_id) const { + memoized_read(table_id); + std::lock_guard lock(lock_); + return cache_.at(table_id); +} + +bool TableMetaCache::exists(const std::string& table_name) const { + return meta_.has_table(table_name); +} + +bool TableMetaCache::exists(i32 table_id) const { + return meta_.has_table(table_id); +} + +void TableMetaCache::update(const TableMetadata& meta) { + std::lock_guard lock(lock_); + i32 table_id = meta_.get_table_id(meta.name()); + cache_[table_id] = meta; +} + +void TableMetaCache::prefetch(const std::vector table_names) { + VLOG(1) << "Prefetching table metadata"; + auto load_table_meta = [&](const std::string& table_name) { + std::string table_path = TableMetadata::descriptor_path(meta_.get_table_id(table_name)); + update(read_table_metadata(storage_, table_path)); + }; + + VLOG(1) << "Spawning thread pool"; + ThreadPool prefetch_pool(NUM_PREFETCH_THREADS); + std::vector> futures; + for (const auto& t : table_names) { + futures.emplace_back(prefetch_pool.enqueue(load_table_meta, t)); + } + + VLOG(1) << "Waiting on futures"; + for (auto& future : futures) { + future.wait(); + } + + VLOG(1) << "Prefetch complete."; +} + + +void TableMetaCache::memoized_read(const std::string& table_name) const { + memoized_read(meta_.get_table_id(table_name)); +} + +void TableMetaCache::memoized_read(i32 table_id) const { + bool b; + { + std::lock_guard lock(lock_); + b = 
cache_.count(table_id) == 0 && meta_.has_table(table_id); + } + if (b) { + std::string table_path = TableMetadata::descriptor_path(table_id); + TableMetadata meta = read_table_metadata(storage_, table_path); + std::lock_guard lock(lock_); + cache_.insert({table_id, meta}); + } +} + +} +} diff --git a/scanner/engine/table_meta_cache.h b/scanner/engine/table_meta_cache.h new file mode 100644 index 00000000..b9a41061 --- /dev/null +++ b/scanner/engine/table_meta_cache.h @@ -0,0 +1,55 @@ +/* Copyright 2017 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "scanner/engine/metadata.h" + +#include +#include + +namespace scanner { +namespace internal { + +class TableMetaCache { + public: + TableMetaCache(storehouse::StorageBackend* storage, + const DatabaseMetadata& meta); + + const TableMetadata& at(const std::string& table_name) const; + + const TableMetadata& at(i32 table_id) const; + + bool exists(const std::string& table_name) const; + + bool exists(i32 table_id) const; + + void update(const TableMetadata& meta); + + void prefetch(const std::vector table_names); + + private: + void memoized_read(const std::string& table_name) const; + + void memoized_read(i32 table_id) const; + + storehouse::StorageBackend* storage_; + const DatabaseMetadata& meta_; + mutable std::mutex lock_; + mutable std::map cache_; +}; + +} +} diff --git a/scanner/engine/unslice_op.cpp b/scanner/engine/unslice_op.cpp new file mode 100644 index 00000000..2d3df796 --- /dev/null +++ b/scanner/engine/unslice_op.cpp @@ -0,0 +1,40 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" + +namespace scanner { + +// Dummy Kernel +class UnsliceKernel : public BatchedKernel { + public: + UnsliceKernel(const KernelConfig& config) + : BatchedKernel(config) {} + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + // No implementation + } +}; + + +// Reserve Op name as builtin +REGISTER_OP(Unslice).input("col").output("out"); + +REGISTER_KERNEL(Unslice, UnsliceKernel).device(DeviceType::CPU).num_devices(1); + +REGISTER_KERNEL(Unslice, UnsliceKernel).device(DeviceType::GPU).num_devices(1); + + +REGISTER_OP(UnsliceFrame).frame_input("col").frame_output("out"); + +REGISTER_KERNEL(UnsliceFrame, UnsliceKernel) + .device(DeviceType::CPU) + .batch() + .num_devices(1); + +REGISTER_KERNEL(UnsliceFrame, UnsliceKernel) + .device(DeviceType::GPU) + .batch() + .num_devices(1); + +} diff --git a/scanner/engine/video_index_entry.cpp 
b/scanner/engine/video_index_entry.cpp new file mode 100644 index 00000000..cd8bcb3b --- /dev/null +++ b/scanner/engine/video_index_entry.cpp @@ -0,0 +1,98 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "scanner/engine/video_index_entry.h" + +namespace scanner { +namespace internal { + +std::unique_ptr VideoIndexEntry::open_file() const { + std::unique_ptr file; + const std::string p = + inplace ? path : table_item_output_path(table_id, column_id, item_id); + BACKOFF_FAIL(storehouse::make_unique_random_read_file(storage, p, file)); + return std::move(file); +} + +VideoIndexEntry read_video_index(storehouse::StorageBackend* storage, + i32 table_id, i32 column_id, i32 item_id) { + VideoMetadata video_meta = read_video_metadata( + storage, VideoMetadata::descriptor_path(table_id, column_id, item_id)); + return read_video_index(storage, video_meta); +} + +VideoIndexEntry read_video_index(storehouse::StorageBackend* storage, + const VideoMetadata& video_meta) { + VideoIndexEntry index_entry; + + i32 table_id = video_meta.table_id(); + i32 column_id = video_meta.column_id(); + i32 item_id = video_meta.item_id(); + + // Open the video file for reading + index_entry.path = video_meta.data_path(); + index_entry.inplace = video_meta.inplace(); + index_entry.storage = storage; + index_entry.table_id = table_id; + index_entry.column_id = column_id; + index_entry.item_id = item_id; + index_entry.width = 
video_meta.width(); + index_entry.height = video_meta.height(); + index_entry.channels = video_meta.channels(); + index_entry.frame_type = video_meta.frame_type(); + index_entry.codec_type = video_meta.codec_type(); + + std::unique_ptr file = index_entry.open_file(); + BACKOFF_FAIL(file->get_size(index_entry.file_size)); + index_entry.num_encoded_videos = video_meta.num_encoded_videos(); + index_entry.frames_per_video = video_meta.frames_per_video(); + index_entry.keyframes_per_video = video_meta.keyframes_per_video(); + index_entry.size_per_video = video_meta.size_per_video(); + index_entry.keyframe_indices = video_meta.keyframe_indices(); + index_entry.sample_offsets = video_meta.sample_offsets(); + index_entry.sample_sizes = video_meta.sample_sizes(); + if (index_entry.codec_type == proto::VideoDescriptor::H264) { + index_entry.metadata = video_meta.metadata(); + // Update keyframe positions and byte offsets so that the separately + // encoded videos seem like they are one + i64 frame_offset = 0; + i64 keyframe_offset = 0; + i64 byte_offset = 0; + for (i64 v = 0; v < index_entry.num_encoded_videos; ++v) { + for (i64 i = 0; i < index_entry.keyframes_per_video[v]; ++i) { + i64 fo = keyframe_offset + i; + index_entry.keyframe_indices[fo] += frame_offset; + } + for (i64 i = 0; i < index_entry.frames_per_video[v]; ++i) { + i64 fo = frame_offset + i; + index_entry.sample_offsets[fo] += byte_offset; + } + frame_offset += index_entry.frames_per_video[v]; + keyframe_offset += index_entry.keyframes_per_video[v]; + byte_offset += index_entry.size_per_video[v]; + } + + // Place total frames at the end of keyframe positions and total file size + // at the end of byte offsets to make interval calculation not need to + // deal with edge cases surrounding those + index_entry.keyframe_indices.push_back(video_meta.frames()); + index_entry.sample_offsets.push_back(index_entry.file_size); + } + + return index_entry; +} + +} +} diff --git a/scanner/engine/video_index_entry.h 
b/scanner/engine/video_index_entry.h new file mode 100644 index 00000000..c884c353 --- /dev/null +++ b/scanner/engine/video_index_entry.h @@ -0,0 +1,59 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "scanner/engine/metadata.h" +#include "scanner/engine/runtime.h" +#include "scanner/util/common.h" + +#include "storehouse/storage_backend.h" + +namespace scanner { +namespace internal { + +struct VideoIndexEntry { + std::unique_ptr open_file() const; + + storehouse::StorageBackend* storage; + std::string path; + bool inplace; + i32 table_id; + i32 column_id; + i32 item_id; + i32 width; + i32 height; + i32 channels; + FrameType frame_type; + proto::VideoDescriptor::VideoCodecType codec_type; + u64 file_size; + i32 num_encoded_videos; + std::vector frames_per_video; + std::vector keyframes_per_video; + std::vector size_per_video; + + std::vector keyframe_indices; + std::vector sample_offsets; + std::vector sample_sizes; + std::vector metadata; +}; + +VideoIndexEntry read_video_index(storehouse::StorageBackend *storage, + i32 table_id, i32 column_id, i32 item_id); + +VideoIndexEntry read_video_index(storehouse::StorageBackend *storage, + const VideoMetadata& video_meta); +} +} diff --git a/scanner/engine/worker.cpp b/scanner/engine/worker.cpp index b6c81649..f6470dfe 100644 --- a/scanner/engine/worker.cpp +++ b/scanner/engine/worker.cpp @@ -13,14 +13,31 @@ * limitations under the 
License. */ -#include "scanner/engine/runtime.h" +#include "scanner/engine/worker.h" #include "scanner/engine/evaluate_worker.h" #include "scanner/engine/kernel_registry.h" #include "scanner/engine/load_worker.h" +#include "scanner/engine/runtime.h" #include "scanner/engine/save_worker.h" - -#include +#include "scanner/engine/table_meta_cache.h" +#include "scanner/engine/python_kernel.h" +#include "scanner/engine/dag_analysis.h" +#include "scanner/util/cuda.h" +#include "scanner/util/glog.h" +#include "scanner/util/grpc.h" + +#include #include +#include +#include +#include +#include +#include + +// For avcodec_register_all()... should go in software video with global mutex +extern "C" { +#include "libavcodec/avcodec.h" +} using storehouse::StoreResult; using storehouse::WriteFile; @@ -30,855 +47,1656 @@ namespace scanner { namespace internal { namespace { -inline bool operator==(const MemoryPoolConfig &lhs, - const MemoryPoolConfig &rhs) { +inline bool operator==(const MemoryPoolConfig& lhs, + const MemoryPoolConfig& rhs) { return (lhs.cpu().use_pool() == rhs.cpu().use_pool()) && (lhs.cpu().free_space() == rhs.cpu().free_space()) && (lhs.gpu().use_pool() == rhs.gpu().use_pool()) && (lhs.gpu().free_space() == rhs.gpu().free_space()); } -inline bool operator!=(const MemoryPoolConfig &lhs, - const MemoryPoolConfig &rhs) { + +inline bool operator!=(const MemoryPoolConfig& lhs, + const MemoryPoolConfig& rhs) { return !(lhs == rhs); } -void analyze_dag( - const proto::TaskSet &task_set, - std::vector>> &live_columns, - std::vector> &dead_columns, - std::vector> &unused_outputs, - std::vector> &column_mapping) { - // Start off with the columns from the gathered tables - OpRegistry *op_registry = get_op_registry(); - auto &ops = task_set.ops(); - std::map>> intermediates; - { - auto &input_op = ops.Get(0); - for (const std::string &input_col : input_op.inputs(0).columns()) { - intermediates[0].push_back(std::make_tuple(input_col, 0)); - } - } - for (size_t i = 1; i < 
ops.size(); ++i) { - auto &op = ops.Get(i); - // For each input, update the intermediate last used index to the - // current index - for (auto &eval_input : op.inputs()) { - i32 parent_index = eval_input.op_index(); - for (const std::string &parent_col : eval_input.columns()) { - bool found = false; - for (auto &kv : intermediates.at(parent_index)) { - if (std::get<0>(kv) == parent_col) { - found = true; - std::get<1>(kv) = i; - break; - } - } - assert(found); +void load_driver(LoadInputQueue& load_work, + std::vector& initial_eval_work, + LoadWorkerArgs args) { + Profiler& profiler = args.profiler; + LoadWorker worker(args); + while (true) { + auto idle_start = now(); + + std::tuple, LoadWorkEntry> entry; + load_work.pop(entry); + i32& output_queue_idx = std::get<0>(entry); + auto& task_streams = std::get<1>(entry); + LoadWorkEntry& load_work_entry = std::get<2>(entry); + + args.profiler.add_interval("idle", idle_start, now()); + + if (load_work_entry.job_index() == -1) { + break; + } + + VLOG(2) << "Load (N/PU: " << args.node_id << "/" << args.worker_id + << "): processing job task (" << load_work_entry.job_index() << ", " + << load_work_entry.task_index() << ")"; + + auto work_start = now(); + + auto input_entry = load_work_entry; + worker.feed(input_entry); + + while (true) { + EvalWorkEntry output_entry; + i32 io_packet_size = args.io_packet_size; + if (worker.yield(io_packet_size, output_entry)) { + auto& work_entry = output_entry; + work_entry.first = !task_streams.empty(); + work_entry.last_in_task = worker.done(); + initial_eval_work[output_queue_idx].push( + std::make_tuple(task_streams, work_entry)); + // We use the task streams being empty to indicate that this is + // a new task, so clear it here to show that this is from the same task + task_streams.clear(); + } else { + break; } } - // Add this op's outputs to the intermediate list - if (i == ops.size() - 1) { + profiler.add_interval("task", work_start, now()); + VLOG(2) << "Load (N/PU: " << 
args.node_id << "/" << args.worker_id + << "): finished job task (" << load_work_entry.job_index() << ", " + << load_work_entry.task_index() << "), pushed to worker " + << output_queue_idx; + } + VLOG(1) << "Load (N/PU: " << args.node_id << "/" << args.worker_id + << "): thread finished"; +} + +std::map no_pipelining_locks; +std::map no_pipelining_cvars; +std::map no_pipelining_conditions; + +void pre_evaluate_driver(EvalQueue& input_work, EvalQueue& output_work, + PreEvaluateWorkerArgs args) { + Profiler& profiler = args.profiler; + PreEvaluateWorker worker(args); + // We sort inputs into task work queues to ensure we process them + // sequentially + std::map, + Queue, EvalWorkEntry>>> + task_work_queue; + i32 work_packet_size = args.work_packet_size; + + std::tuple active_job_task = std::make_tuple(-1, -1); + while (true) { + auto idle_start = now(); + + // If we have no work at all or we do not have work for our current task.. + if (task_work_queue.empty() || + (std::get<0>(active_job_task) != -1 && + task_work_queue.at(active_job_task).size() <= 0)) { + std::tuple, EvalWorkEntry> entry; + input_work.pop(entry); + + + auto& task_streams = std::get<0>(entry); + EvalWorkEntry& work_entry = std::get<1>(entry); + VLOG(1) << "Pre-evaluate (N/KI: " << args.node_id << "/" << args.worker_id + << "): got work " << work_entry.job_index << " " << work_entry.task_index; + if (work_entry.job_index == -1) { + break; + } + + VLOG(1) << "Pre-evaluate (N/KI: " << args.node_id << "/" << args.worker_id + << "): " + << "received job task " << work_entry.job_index << ", " + << work_entry.task_index; + + task_work_queue[std::make_tuple(work_entry.job_index, + work_entry.task_index)] + .push(entry); + } + + args.profiler.add_interval("idle", idle_start, now()); + + if (std::get<0>(active_job_task) == -1) { + // Choose the next task to work on + active_job_task = task_work_queue.begin()->first; + } + + // Wait until we have the next io item for the current task + if 
(task_work_queue.at(active_job_task).size() <= 0) { + std::this_thread::yield(); continue; } - const auto &op_info = op_registry->get_op_info(op.name()); - for (const auto &output_column : op_info->output_columns()) { - intermediates[i].push_back(std::make_tuple(output_column, i)); + + // Grab next entry for active task + std::tuple, EvalWorkEntry> entry; + task_work_queue.at(active_job_task).pop(entry); + + auto& task_streams = std::get<0>(entry); + EvalWorkEntry& work_entry = std::get<1>(entry); + + VLOG(1) << "Pre-evaluate (N/KI: " << args.node_id << "/" << args.worker_id + << "): " + << "processing job task " << work_entry.job_index << ", " + << work_entry.task_index; + + auto work_start = now(); + + i32 total_rows = 0; + for (size_t i = 0; i < work_entry.row_ids.size(); ++i) { + total_rows = std::max(total_rows, (i32)work_entry.row_ids[i].size()); } - } - // The live columns at each op index - live_columns.resize(ops.size()); - for (size_t i = 0; i < ops.size(); ++i) { - i32 op_index = i; - auto &columns = live_columns[i]; - size_t max_i = std::min((size_t)(ops.size() - 2), i); - for (size_t j = 0; j <= max_i; ++j) { - for (auto &kv : intermediates.at(j)) { - i32 last_used_index = std::get<1>(kv); - if (last_used_index > op_index) { - // Last used index is greater than current index, so still live - columns.push_back(std::make_tuple((i32)j, std::get<0>(kv))); - } + bool first = work_entry.first; + bool last = work_entry.last_in_task; + + auto input_entry = work_entry; + worker.feed(input_entry, first); + i32 rows_used = 0; + while (rows_used < total_rows) { + EvalWorkEntry output_entry; + if (!worker.yield(work_packet_size, output_entry)) { + break; + } + + if (std::getenv("NO_PIPELINING")) { + no_pipelining_conditions[args.worker_id] = true; } + + if (first) { + output_work.push(std::make_tuple(task_streams, output_entry)); + first = false; + } else { + output_work.push( + std::make_tuple(std::deque(), output_entry)); + } + + if (std::getenv("NO_PIPELINING")) 
{ + std::unique_lock lk(no_pipelining_locks[args.worker_id]); + no_pipelining_cvars[args.worker_id].wait(lk, [&] { + return !no_pipelining_conditions[args.worker_id]; + }); + } + rows_used += work_packet_size; + } + + if (last) { + task_work_queue.erase(active_job_task); + active_job_task = std::make_tuple(-1, -1); } + + profiler.add_interval("task", work_start, now()); } - // The columns to remove for the current kernel - dead_columns.resize(ops.size() - 1); - // Outputs from the current kernel that are not used - unused_outputs.resize(ops.size() - 1); - // Indices in the live columns list that are the inputs to the current - // kernel. Starts from the second evalutor (index 1) - column_mapping.resize(ops.size() - 1); - for (size_t i = 1; i < ops.size(); ++i) { - i32 op_index = i; - auto &prev_columns = live_columns[i - 1]; - auto &op = ops.Get(op_index); - // Determine which columns are no longer live - { - auto &unused = unused_outputs[i - 1]; - auto &dead = dead_columns[i - 1]; - size_t max_i = std::min((size_t)(ops.size() - 2), (size_t)i); - for (size_t j = 0; j <= max_i; ++j) { - i32 parent_index = j; - for (auto &kv : intermediates.at(j)) { - i32 last_used_index = std::get<1>(kv); - if (last_used_index == op_index) { - // Column is no longer live, so remove it. 
- const std::string &col_name = std::get<0>(kv); - if (j == i) { - // This op has an unused output - i32 col_index = -1; - const std::vector &op_cols = - op_registry->get_op_info(op.name())->output_columns(); - for (size_t k = 0; k < op_cols.size(); k++) { - if (col_name == op_cols[k]) { - col_index = k; - break; - } - } - assert(col_index != -1); - unused.push_back(col_index); - } else { - // Determine where in the previous live columns list this - // column existed - i32 col_index = -1; - for (i32 k = 0; k < (i32)prev_columns.size(); ++k) { - const std::tuple &live_input = - prev_columns[k]; - if (parent_index == std::get<0>(live_input) && - col_name == std::get<1>(live_input)) { - col_index = k; - break; - } - } - assert(col_index != -1); - dead.push_back(col_index); - } - } - } + VLOG(1) << "Pre-evaluate (N/PU: " << args.node_id << "/" << args.worker_id + << "): thread finished "; +} + +void evaluate_driver(EvalQueue& input_work, EvalQueue& output_work, + EvaluateWorkerArgs args) { + Profiler& profiler = args.profiler; + EvaluateWorker worker(args); + while (true) { + auto idle_pull_start = now(); + + std::tuple, EvalWorkEntry> entry; + input_work.pop(entry); + + auto& task_streams = std::get<0>(entry); + EvalWorkEntry& work_entry = std::get<1>(entry); + + args.profiler.add_interval("idle_pull", idle_pull_start, now()); + + if (work_entry.job_index == -1) { + break; + } + + VLOG(1) << "Evaluate (N/KI/G: " << args.node_id << "/" << args.ki << "/" + << args.kg << "): processing job task " << work_entry.job_index + << ", " << work_entry.task_index; + + auto work_start = now(); + + if (task_streams.size() > 0) { + // Start of a new task. Tell kernels what outputs they should produce. 
+ std::vector streams; + for (i32 i = 0; i < args.arg_group.kernel_factories.size(); ++i) { + assert(!task_streams.empty()); + streams.push_back(task_streams.front()); + task_streams.pop_front(); } + worker.new_task(work_entry.job_index, work_entry.task_index, streams); } - auto &mapping = column_mapping[op_index - 1]; - for (const auto &eval_input : op.inputs()) { - i32 parent_index = eval_input.op_index(); - for (const std::string &col : eval_input.columns()) { - i32 col_index = -1; - for (i32 k = 0; k < (i32)prev_columns.size(); ++k) { - const std::tuple &live_input = prev_columns[k]; - if (parent_index == std::get<0>(live_input) && - col == std::get<1>(live_input)) { - col_index = k; - break; - } - } - assert(col_index != -1); - mapping.push_back(col_index); + + i32 work_packet_size = 0; + for (size_t i = 0; i < work_entry.columns.size(); ++i) { + work_packet_size = + std::max(work_packet_size, (i32)work_entry.columns[i].size()); + } + + auto input_entry = work_entry; + worker.feed(input_entry); + EvalWorkEntry output_entry; + bool result = worker.yield(work_packet_size, output_entry); + (void)result; + assert(result); + + profiler.add_interval("task", work_start, now()); + + auto idle_push_start = now(); + output_work.push(std::make_tuple(task_streams, output_entry)); + args.profiler.add_interval("idle_push", idle_push_start, now()); + + } + VLOG(1) << "Evaluate (N/KI: " << args.node_id << "/" << args.ki + << "): thread finished"; +} + +void post_evaluate_driver(EvalQueue& input_work, OutputEvalQueue& output_work, + PostEvaluateWorkerArgs args) { + Profiler& profiler = args.profiler; + PostEvaluateWorker worker(args); + while (true) { + auto idle_start = now(); + + std::tuple, EvalWorkEntry> entry; + input_work.pop(entry); + EvalWorkEntry& work_entry = std::get<1>(entry); + + args.profiler.add_interval("idle", idle_start, now()); + + if (work_entry.job_index == -1) { + break; + } + + VLOG(1) << "Post-evaluate (N/PU: " << args.node_id << "/" << args.id + << "): 
processing task " << work_entry.job_index << ", " + << work_entry.task_index; + + auto work_start = now(); + + auto input_entry = work_entry; + worker.feed(input_entry); + EvalWorkEntry output_entry; + bool result = worker.yield(output_entry); + profiler.add_interval("task", work_start, now()); + + if (result) { + output_entry.last_in_task = work_entry.last_in_task; + output_work.push(std::make_tuple(args.id, output_entry)); + } + + if (std::getenv("NO_PIPELINING")) { + { + std::unique_lock lk(no_pipelining_locks[args.id]); + no_pipelining_conditions[args.id] = false; } + no_pipelining_cvars[args.id].notify_one(); } } + + VLOG(1) << "Post-evaluate (N/PU: " << args.node_id << "/" << args.id + << "): thread finished "; } + +void save_coordinator(OutputEvalQueue& eval_work, + std::vector& save_work) { + i32 num_save_workers = save_work.size(); + std::map, i32> task_to_worker_mapping; + i32 last_worker_assigned = 0; + while (true) { + auto idle_start = now(); + + std::tuple entry; + eval_work.pop(entry); + EvalWorkEntry& work_entry = std::get<1>(entry); + + //args.profiler.add_interval("idle", idle_start, now()); + + if (work_entry.job_index == -1) { + break; + } + + auto job_task_id = + std::make_tuple(work_entry.job_index, work_entry.task_index); + if (task_to_worker_mapping.count(job_task_id) == 0) { + // Assign worker to this task + task_to_worker_mapping[job_task_id] = + last_worker_assigned++ % num_save_workers; + } + + i32 assigned_worker = task_to_worker_mapping.at(job_task_id); + save_work[assigned_worker].push(entry); + + if (work_entry.last_in_task) { + task_to_worker_mapping.erase(job_task_id); + } + } } -class WorkerImpl final : public proto::Worker::Service { -public: - WorkerImpl(DatabaseParameters &db_params, std::string master_address) - : db_params_(db_params) { - set_database_path(db_params.db_path); +void save_driver(SaveInputQueue& save_work, + SaveOutputQueue& output_work, + SaveWorkerArgs args) { + Profiler& profiler = args.profiler; + std::map, 
std::unique_ptr> workers; + while (true) { + auto idle_start = now(); + std::tuple entry; + save_work.pop(entry); + + i32 pipeline_instance = std::get<0>(entry); + EvalWorkEntry& work_entry = std::get<1>(entry); + + args.profiler.add_interval("idle", idle_start, now()); + + if (work_entry.job_index == -1) { + break; + } + + VLOG(1) << "Save (N/KI: " << args.node_id << "/" << args.worker_id + << "): processing job task (" << work_entry.job_index << ", " + << work_entry.task_index << ")"; + + auto work_start = now(); + + // Check if we have a worker for this task + auto job_task_id = + std::make_tuple(work_entry.job_index, work_entry.task_index); + if (workers.count(job_task_id) == 0) { + SaveWorker* worker = new SaveWorker(args); + worker->new_task(work_entry.table_id, work_entry.task_index, + work_entry.column_types); + workers[job_task_id].reset(worker); + } + + auto& worker = workers.at(job_task_id); + + auto input_entry = work_entry; + worker->feed(input_entry); + + VLOG(1) << "Save (N/KI: " << args.node_id << "/" << args.worker_id + << "): finished task (" << work_entry.job_index << ", " + << work_entry.task_index << ")"; + + args.profiler.add_interval("task", work_start, now()); + + if (work_entry.last_in_task) { + output_work.push(std::make_tuple(pipeline_instance, work_entry.job_index, + work_entry.task_index)); + workers.erase(job_task_id); + } + } + + VLOG(1) << "Save (N/KI: " << args.node_id << "/" << args.worker_id + << "): thread finished "; +} +} + +WorkerImpl::WorkerImpl(DatabaseParameters& db_params, + std::string master_address, std::string worker_port) + : watchdog_awake_(true), + db_params_(db_params), + state_(State::INITIALIZING), + master_address_(master_address), + worker_port_(worker_port) { + init_glog("scanner_worker"); + + set_database_path(db_params.db_path); + + avcodec_register_all(); #ifdef DEBUG - // Stop SIG36 from grpc when debugging - grpc_use_signal(-1); + // Stop SIG36 from grpc when debugging + grpc_use_signal(-1); #endif - // 
google::protobuf::io::CodedInputStream::SetTotalBytesLimit(67108864 * 4, - // 67108864 * 2); + // google::protobuf::io::CodedInputStream::SetTotalBytesLimit(67108864 * 4, + // 67108864 * 2); + + VLOG(1) << "Create master stub"; + master_ = proto::Master::NewStub( + grpc::CreateChannel(master_address, grpc::InsecureChannelCredentials())); + VLOG(1) << "Finish master stub"; + + storage_ = + storehouse::StorageBackend::make_from_config(db_params_.storage_config); + + // Set up Python runtime if any kernels need it + Py_Initialize(); + + // Processes jobs in the background + start_job_processor(); +} - master_ = proto::Master::NewStub(grpc::CreateChannel( - master_address, grpc::InsecureChannelCredentials())); +WorkerImpl::~WorkerImpl() { + State state = state_.get(); + bool was_initializing = state == State::INITIALIZING; + state_.set(State::SHUTTING_DOWN); - proto::WorkerParams worker_info; - char hostname[1024]; - if (gethostname(hostname, 1024)) { - LOG(FATAL) << "gethostname failed"; + // Master is dead if we failed during initialization + if (!was_initializing) { + try_unregister(); + } + + trigger_shutdown_.set(); + + stop_job_processor(); + + if (watchdog_thread_.joinable()) { + watchdog_thread_.join(); + } + delete storage_; + if (memory_pool_initialized_) { + destroy_memory_allocators(); + } +} + +grpc::Status WorkerImpl::NewJob(grpc::ServerContext* context, + const proto::BulkJobParameters* job_params, + proto::Result* job_result) { + // Ensure that only one job is running at a time and that the worker + // is in idle mode before transitioning to job start + State state = state_.get(); + bool ready = false; + while (!ready) { + switch (state) { + case RUNNING_JOB: { + RESULT_ERROR(job_result, "This worker is already running a job!"); + return grpc::Status::OK; + } + case SHUTTING_DOWN: { + RESULT_ERROR(job_result, "This worker is preparing to shutdown!"); + return grpc::Status::OK; + } + case INITIALIZING: { + state_.wait_for_change(INITIALIZING); + break; + 
} + case IDLE: { + if (state_.test_and_set(state, RUNNING_JOB)) { + ready = true; + break; + } + } } - worker_info.set_address(std::string(hostname) + ":5002"); + state = state_.get(); + } - grpc::ClientContext context; - proto::Registration registration; - grpc::Status status = - master_->RegisterWorker(&context, worker_info, ®istration); - LOG_IF(FATAL, !status.ok()) << "Worker could not contact master server at " - << master_address << " (" << status.error_code() - << "): " << status.error_message(); + job_result->set_success(true); + set_database_path(db_params_.db_path); - node_id_ = registration.node_id(); + job_params_.Clear(); + job_params_.MergeFrom(*job_params); + { + std::unique_lock lock(finished_mutex_); + finished_ = false; + } + finished_cv_.notify_all(); - storage_ = - storehouse::StorageBackend::make_from_config(db_params_.storage_config); + { + std::unique_lock lock(active_mutex_); + active_bulk_job_ = true; } + active_cv_.notify_all(); - ~WorkerImpl() { - delete storage_; - if (memory_pool_initialized_) { - destroy_memory_allocators(); + return grpc::Status::OK; +} + +grpc::Status WorkerImpl::LoadOp(grpc::ServerContext* context, + const proto::OpPath* op_path, + proto::Empty* empty) { + const std::string& so_path = op_path->path(); + VLOG(1) << "Worker " << node_id_ << " loading Op library: " << so_path; + void* handle = dlopen(so_path.c_str(), RTLD_NOW | RTLD_LOCAL); + LOG_IF(FATAL, handle == nullptr) + << "dlopen of " << so_path << " failed: " << dlerror(); + return grpc::Status::OK; +} + +grpc::Status WorkerImpl::RegisterOp( + grpc::ServerContext* context, const proto::OpRegistration* op_registration, + proto::Result* result) { + const std::string& name = op_registration->name(); + const bool variadic_inputs = op_registration->variadic_inputs(); + std::vector input_columns; + size_t i = 0; + for (auto& c : op_registration->input_columns()) { + Column col; + col.set_id(i++); + col.set_name(c.name()); + col.set_type(c.type()); + 
input_columns.push_back(col); + } + std::vector output_columns; + i = 0; + for (auto& c : op_registration->output_columns()) { + Column col; + col.set_id(i++); + col.set_name(c.name()); + col.set_type(c.type()); + output_columns.push_back(col); + } + bool can_stencil = op_registration->can_stencil(); + std::vector stencil(op_registration->preferred_stencil().begin(), + op_registration->preferred_stencil().end()); + if (stencil.empty()) { + stencil = {0}; + } + bool has_bounded_state = op_registration->has_bounded_state(); + i32 warmup = op_registration->warmup(); + bool has_unbounded_state = op_registration->has_unbounded_state(); + OpInfo* info = new OpInfo(name, variadic_inputs, input_columns, + output_columns, can_stencil, stencil, + has_bounded_state, warmup, has_unbounded_state); + OpRegistry* registry = get_op_registry(); + registry->add_op(name, info); + VLOG(1) << "Worker " << node_id_ << " registering Op: " << name; + + return grpc::Status::OK; +} + +grpc::Status WorkerImpl::RegisterPythonKernel( + grpc::ServerContext* context, + const proto::PythonKernelRegistration* python_kernel, + proto::Result* result) { + const std::string& op_name = python_kernel->op_name(); + DeviceType device_type = python_kernel->device_type(); + const std::string& kernel_str = python_kernel->kernel_str(); + const std::string& pickled_config = python_kernel->pickled_config(); + const int batch_size = python_kernel->batch_size(); + // Create a kernel builder function + auto constructor = [kernel_str, pickled_config, batch_size]( + const KernelConfig& config) { + return new PythonKernel(config, kernel_str, pickled_config, batch_size); + }; + // Set all input and output columns to be CPU + std::map input_devices; + std::map output_devices; + { + OpRegistry* registry = get_op_registry(); + OpInfo* info = registry->get_op_info(op_name); + if (info->variadic_inputs()) { + assert(device_type != DeviceType::GPU); + } else { + for (const auto& in_col : info->input_columns()) { + 
input_devices[in_col.name()] = DeviceType::CPU; + } + } + for (const auto& out_col : info->output_columns()) { + output_devices[out_col.name()] = DeviceType::CPU; } } + // Create a new kernel factory + bool can_batch = (batch_size > 1); + KernelFactory* factory = + new KernelFactory(op_name, device_type, 1, input_devices, output_devices, + can_batch, batch_size, constructor); + // Register the kernel + KernelRegistry* registry = get_kernel_registry(); + registry->add_kernel(op_name, factory); + VLOG(1) << "Worker " << node_id_ << " registering Python Kernel: " << op_name; + return grpc::Status::OK; +} - grpc::Status NewJob(grpc::ServerContext *context, - const proto::JobParameters *job_params, - proto::Result *job_result) { - job_result->set_success(true); - set_database_path(db_params_.db_path); - - i32 local_id = job_params->local_id(); - i32 local_total = job_params->local_total(); - - timepoint_t base_time = now(); - const i32 work_item_size = job_params->work_item_size(); - i32 warmup_size = 0; - - OpRegistry *op_registry = get_op_registry(); - auto &ops = job_params->task_set().ops(); - - // Analyze op DAG to determine what inputs need to be pipped along - // and when intermediates can be retired -- essentially liveness analysis - // The live columns at each op index - std::vector>> live_columns; - // The columns to remove for the current kernel - std::vector> dead_columns; - // Outputs from the current kernel that are not used - std::vector> unused_outputs; - // Indices in the live columns list that are the inputs to the current - // kernel. 
Starts from the second evalutor (index 1) - std::vector> column_mapping; - analyze_dag(job_params->task_set(), live_columns, dead_columns, - unused_outputs, column_mapping); - - // Setup kernel factories and the kernel configs that will be used - // to instantiate instances of the op pipeline - KernelRegistry *kernel_registry = get_kernel_registry(); - std::vector kernel_factories; - std::vector kernel_configs; - i32 num_cpus = db_params_.num_cpus; - assert(num_cpus > 0); - - i32 num_gpus = static_cast(db_params_.gpu_ids.size()); - for (size_t i = 1; i < ops.size() - 1; ++i) { - auto &op = ops.Get(i); - const std::string &name = op.name(); - OpInfo *op_info = - op_registry->get_op_info(name); - - DeviceType requested_device_type = op.device_type(); - if (requested_device_type == DeviceType::GPU && num_gpus == 0) { - RESULT_ERROR( - job_result, - "Scanner is configured with zero available GPUs but a GPU " - "op was requested! Please configure Scanner to have " - "at least one GPU using the `gpu_ids` config option."); - return grpc::Status::OK; +grpc::Status WorkerImpl::Shutdown(grpc::ServerContext* context, + const proto::Empty* empty, Result* result) { + State state = state_.get(); + switch (state) { + case RUNNING_JOB: { + // trigger_shutdown will inform job to stop working + break; + } + case SHUTTING_DOWN: { + // Already shutting down + result->set_success(true); + return grpc::Status::OK; + } + case INITIALIZING: { + break; + } + case IDLE: { + break; + } + } + state_.set(SHUTTING_DOWN); + try_unregister(); + // Inform watchdog that we are done for + trigger_shutdown_.set(); + result->set_success(true); + return grpc::Status::OK; +} + +grpc::Status WorkerImpl::PokeWatchdog(grpc::ServerContext* context, + const proto::Empty* empty, + proto::Empty* result) { + watchdog_awake_ = true; + return grpc::Status::OK; +} + +grpc::Status WorkerImpl::Ping(grpc::ServerContext* context, + const proto::Empty* empty1, + proto::Empty* empty2) { + return grpc::Status::OK; +} + 
+void WorkerImpl::start_watchdog(grpc::Server* server, bool enable_timeout, + i32 timeout_ms) { + watchdog_thread_ = std::thread([this, server, enable_timeout, timeout_ms]() { + double time_since_check = 0; + // Wait until shutdown is triggered or watchdog isn't woken up + if (!enable_timeout) { + trigger_shutdown_.wait(); + } + while (!trigger_shutdown_.raised()) { + auto sleep_start = now(); + trigger_shutdown_.wait_for(timeout_ms); + time_since_check += nano_since(sleep_start) / 1e6; + if (time_since_check > timeout_ms) { + if (!watchdog_awake_) { + // Watchdog not woken, time to bail out + LOG(ERROR) << "Worker did not receive heartbeat in " << timeout_ms + << "ms. Shutting down."; + trigger_shutdown_.set(); + } + watchdog_awake_ = false; + time_since_check = 0; + } + } + // Shutdown self + server->Shutdown(); + }); +} + +Result WorkerImpl::register_with_master() { + assert(state_.get() == State::INITIALIZING); + + VLOG(1) << "Worker try to register with master"; + + proto::WorkerParams worker_info; + worker_info.set_port(worker_port_); + proto::MachineParameters* params = worker_info.mutable_params(); + params->set_num_cpus(db_params_.num_cpus); + params->set_num_load_workers(db_params_.num_cpus); + params->set_num_save_workers(db_params_.num_cpus); + for (i32 gpu_id : db_params_.gpu_ids) { + params->add_gpu_ids(gpu_id); + } + + proto::Registration registration; + grpc::Status status; + GRPC_BACKOFF(master_->RegisterWorker(&ctx, worker_info, ®istration), + status); + if (!status.ok()) { + Result result; + result.set_success(false); + LOG(WARNING) + << "Worker could not contact master server at " << master_address_ << " (" + << status.error_code() << "): " << status.error_message(); + return result; + } + + VLOG(1) << "Worker registered with master"; + + node_id_ = registration.node_id(); + + state_.set(State::IDLE); + + Result result; + result.set_success(true); + return result; +} + +void WorkerImpl::try_unregister() { + if (state_.get() != 
State::INITIALIZING && !unregistered_.test_and_set()) { + proto::NodeInfo node_info; + node_info.set_node_id(node_id_); + + proto::Empty em; + grpc::Status status; + GRPC_BACKOFF(master_->UnregisterWorker(&ctx, node_info, &em), status); + if (!status.ok()) { + LOG(WARNING) << "Worker could not unregister from master server " + << "(" << status.error_code() + << "): " << status.error_message(); + return; + } + VLOG(1) << "Worker unregistered from master server."; + } +} + +void WorkerImpl::start_job_processor() { + job_processor_thread_ = std::thread([this]() { + while (!trigger_shutdown_.raised()) { + // Wait on not finished + { + std::unique_lock lock(active_mutex_); + active_cv_.wait(lock, [this] { + return active_bulk_job_ || trigger_shutdown_.raised(); + }); } + if (trigger_shutdown_.raised()) break; + // Start processing job + bool result = process_job(&job_params_, &job_result_); + // Set to idle if we finished without a shutdown + state_.test_and_set(RUNNING_JOB, IDLE); + } + }); +} + +void WorkerImpl::stop_job_processor() { + // Wake up job processor + { + std::unique_lock lock(active_mutex_); + active_bulk_job_ = true; + } + active_cv_.notify_all(); + if (job_processor_thread_.joinable()) { + job_processor_thread_.join(); + } +} + +bool WorkerImpl::process_job(const proto::BulkJobParameters* job_params, + proto::Result* job_result) { + job_result->set_success(true); + auto finished_fn = [&]() { + { + proto::FinishedJobParams node_info; + node_info.set_node_id(node_id_); + proto::Empty empty; + grpc::Status status; + GRPC_BACKOFF(master_->FinishedJob(&ctx, node_info, &empty), status); + LOG_IF(FATAL, !status.ok()) + << "Worker could not send FinishedJob to master (" + << status.error_code() << "): " << status.error_message() << ". 
" + << "Failing since the master could hang if it sees the worker is " + << "still alive but has not finished its job."; + } + + { + std::unique_lock lock(finished_mutex_); + finished_ = true; + } + finished_cv_.notify_all(); + { + std::unique_lock lock(finished_mutex_); + active_bulk_job_ = false; + } + active_cv_.notify_all(); + }; + + set_database_path(db_params_.db_path); + + i32 local_id = job_params->local_id(); + i32 local_total = job_params->local_total(); + // Controls if work should be distributed roundrobin or dynamically + bool distribute_work_dynamically = true; + + timepoint_t base_time = now(); + const i32 work_packet_size = job_params->work_packet_size(); + const i32 io_packet_size = job_params->io_packet_size() != -1 + ? job_params->io_packet_size() + : work_packet_size; + i32 warmup_size = 0; + + OpRegistry* op_registry = get_op_registry(); + std::vector jobs(job_params->jobs().begin(), + job_params->jobs().end()); + std::vector ops(job_params->ops().begin(), + job_params->ops().end()); + + + // Setup table metadata cache for use in other operations + DatabaseMetadata meta = + read_database_metadata(storage_, DatabaseMetadata::descriptor_path()); + TableMetaCache table_meta(storage_, meta); + + // Initialize worker table metadata + std::vector required_tables; + for (auto& job : jobs) { + for (auto& input : job.inputs()) { + required_tables.push_back(input.table_name()); + } + } + table_meta.prefetch(required_tables); + + DAGAnalysisInfo analysis_results; + populate_analysis_info(ops, analysis_results); + // Need slice input rows to know which slice we are in + determine_input_rows_to_slices(meta, table_meta, jobs, ops, analysis_results); + remap_input_op_edges(ops, analysis_results); + // Analyze op DAG to determine what inputs need to be pipped along + // and when intermediates can be retired -- essentially liveness analysis + perform_liveness_analysis(ops, analysis_results); + // The live columns at each op index + std::vector>>& live_columns = 
+ analysis_results.live_columns; + // The columns to remove for the current kernel + std::vector> dead_columns = + analysis_results.dead_columns; + // Outputs from the current kernel that are not used + std::vector> unused_outputs = + analysis_results.unused_outputs; + // Indices in the live columns list that are the inputs to the current + // kernel. Starts from the second evalutor (index 1) + std::vector> column_mapping = + analysis_results.column_mapping; + + // Read final output columns for use in post-evaluate worker + // (needed for determining column types) + std::vector final_output_columns; + { + std::string output_name = jobs.at(0).output_table_name(); + const TableMetadata& table = table_meta.at(output_name); + final_output_columns = table.columns(); + } + std::vector final_compression_options; + for (auto& opts : job_params->compression()) { + ColumnCompressionOptions o; + o.codec = opts.codec(); + for (auto& kv : opts.options()) { + o.options[kv.first] = kv.second; + } + final_compression_options.push_back(o); + } + assert(final_output_columns.size() == final_compression_options.size()); + + // Setup kernel factories and the kernel configs that will be used + // to instantiate instances of the op pipeline + KernelRegistry* kernel_registry = get_kernel_registry(); + std::vector kernel_factories; + std::vector kernel_configs; + i32 num_cpus = db_params_.num_cpus; + assert(num_cpus > 0); + + i32 total_gpus = db_params_.gpu_ids.size(); + i32 num_gpus = db_params_.gpu_ids.size() / local_total; + // Should have at least one gpu if there are gpus + assert(db_params_.gpu_ids.size() == 0 || num_gpus > 0); + std::vector gpu_ids; + { + i32 start_idx = num_gpus * local_id; + for (i32 i = 0; i < num_gpus; ++i) { + gpu_ids.push_back(db_params_.gpu_ids[(start_idx + i) % total_gpus]); + } + } - if (!kernel_registry->has_kernel(name, requested_device_type)) { - RESULT_ERROR( + // Populate kernel_factories and kernel_configs + for (size_t i = 0; i < ops.size(); ++i) { + 
auto& op = ops.at(i); + const std::string& name = op.name(); + if (is_builtin_op(name)) { + kernel_factories.push_back(nullptr); + kernel_configs.emplace_back(); + continue; + } + OpInfo* op_info = op_registry->get_op_info(name); + + DeviceType requested_device_type = op.device_type(); + if (requested_device_type == DeviceType::GPU && num_gpus == 0) { + RESULT_ERROR(job_result, + "Scanner is configured with zero available GPUs but a GPU " + "op was requested! Please configure Scanner to have " + "at least one GPU using the `gpu_ids` config option."); + finished_fn(); + return false; + } + + if (!kernel_registry->has_kernel(name, requested_device_type)) { + RESULT_ERROR( job_result, "Requested an instance of op %s with device type %s, but no kernel " "exists for that configuration.", op.name().c_str(), (requested_device_type == DeviceType::CPU ? "CPU" : "GPU")); - return grpc::Status::OK; - } + finished_fn(); + return false; + } - KernelFactory *kernel_factory = - kernel_registry->get_kernel(name, requested_device_type); - kernel_factories.push_back(kernel_factory); + KernelFactory* kernel_factory = + kernel_registry->get_kernel(name, requested_device_type); + kernel_factories.push_back(kernel_factory); + + // Setup kernel config with args from Op DAG + KernelConfig kernel_config; + kernel_config.node_id = node_id_; + kernel_config.args = + std::vector(op.kernel_args().begin(), op.kernel_args().end()); + const std::vector& output_columns = op_info->output_columns(); + for (auto& col : output_columns) { + kernel_config.output_columns.push_back(col.name()); + } - Kernel::Config kernel_config; - kernel_config.work_item_size = work_item_size; - kernel_config.args = std::vector(op.kernel_args().begin(), - op.kernel_args().end()); - const std::vector &output_columns = - op_info->output_columns(); - kernel_config.output_columns = std::vector( - output_columns.begin(), output_columns.end()); + // Tell kernel what its inputs are from the Op DAG + // (for variadic inputs) + 
auto& input_columns = op_info->input_columns(); + for (int i = 0; i < op.inputs().size(); ++i) { + auto input = op.inputs(i); + kernel_config.input_columns.push_back(input.column()); + if (input_columns.size() == 0) { + // We ccan have 0 columns in op info if variadic arguments + kernel_config.input_column_types.push_back(ColumnType::Other); + } else { + kernel_config.input_column_types.push_back(input_columns[i].type()); + } + } + kernel_configs.push_back(kernel_config); + } - for (auto &input : op.inputs()) { - const proto::Op &input_op = - ops.Get(input.op_index()); - if (input_op.name() == "InputTable") { - } else { - OpInfo *input_op_info = - op_registry->get_op_info(input_op.name()); - // TODO: verify that input.columns() are all in - // op_info->output_columns() + // Break up kernels into groups that run on the same device + std::vector groups; + if (!kernel_factories.empty()) { + bool first_op = true; + DeviceType last_device_type; + groups.emplace_back(); + for (size_t i = 1; i < kernel_factories.size() - 1; ++i) { + KernelFactory* factory = kernel_factories[i]; + // Factory is nullptr when we are on a builtin op + if (factory != nullptr && first_op) { + last_device_type = factory->get_device_type(); + first_op = false; + } + if (factory != nullptr && + factory->get_device_type() != last_device_type) { + // Does not use the same device as previous kernel, so push into new + // group + last_device_type = factory->get_device_type(); + groups.emplace_back(); + } + auto& op_group = groups.back().op_names; + auto& op_sampling = groups.back().sampling_args; + auto& group = groups.back().kernel_factories; + auto& lc = groups.back().live_columns; + auto& dc = groups.back().dead_columns; + auto& uo = groups.back().unused_outputs; + auto& cm = groups.back().column_mapping; + auto& st = groups.back().kernel_stencils; + auto& bt = groups.back().kernel_batch_sizes; + const std::string& op_name = ops.at(i).name(); + op_group.push_back(op_name); + if 
(analysis_results.slice_ops.count(i) > 0) { + i64 local_op_idx = group.size(); + // Set sampling args + auto& slice_outputs_per_job = + groups.back().slice_output_rows[local_op_idx]; + for (auto& job_slice_outputs : analysis_results.slice_output_rows) { + auto& slice_groups = job_slice_outputs.at(i); + slice_outputs_per_job.push_back(slice_groups); } - kernel_config.input_columns.insert(kernel_config.input_columns.end(), - input.columns().begin(), - input.columns().end()); - } - kernel_configs.push_back(kernel_config); - } - - // Break up kernels into groups that run on the same device - std::vector>> - kernel_groups; - std::vector>>> - kg_live_columns; - std::vector>> kg_dead_columns; - std::vector>> kg_unused_outputs; - std::vector>> kg_column_mapping; - if (!kernel_factories.empty()) { - DeviceType last_device_type = kernel_factories[0]->get_device_type(); - kernel_groups.emplace_back(); - kg_live_columns.emplace_back(); - kg_dead_columns.emplace_back(); - kg_unused_outputs.emplace_back(); - kg_column_mapping.emplace_back(); - for (size_t i = 0; i < kernel_factories.size(); ++i) { - KernelFactory *factory = kernel_factories[i]; - if (factory->get_device_type() != last_device_type) { - // Does not use the same device as previous kernel, so push into new - // group - last_device_type = factory->get_device_type(); - kernel_groups.emplace_back(); - kg_live_columns.emplace_back(); - kg_dead_columns.emplace_back(); - kg_unused_outputs.emplace_back(); - kg_column_mapping.emplace_back(); + } + if (analysis_results.unslice_ops.count(i) > 0) { + i64 local_op_idx = group.size(); + // Set sampling args + auto& unslice_inputs_per_job = + groups.back().unslice_input_rows[local_op_idx]; + for (auto& job_unslice_inputs : analysis_results.unslice_input_rows) { + auto& slice_groups = job_unslice_inputs.at(i); + unslice_inputs_per_job.push_back(slice_groups); } - auto &group = kernel_groups.back(); - auto &lc = kg_live_columns.back(); - auto &dc = kg_dead_columns.back(); - auto 
&uo = kg_unused_outputs.back(); - auto &cm = kg_column_mapping.back(); - group.push_back(std::make_tuple(factory, kernel_configs[i])); - lc.push_back(live_columns[i]); - dc.push_back(dead_columns[i]); - uo.push_back(unused_outputs[i]); - cm.push_back(column_mapping[i]); - } - } - - i32 num_kernel_groups = static_cast(kernel_groups.size()); - assert(num_kernel_groups > 0); // is this actually necessary? - - i32 pipeline_instances_per_node = job_params->pipeline_instances_per_node(); - // If ki per node is -1, we set a smart default. Currently, we calculate the - // maximum possible kernel instances without oversubscribing any part of the - // pipeline, either CPU or GPU. - bool has_gpu_kernel = false; - if (pipeline_instances_per_node == -1) { - pipeline_instances_per_node = std::numeric_limits::max(); - for (i32 kg = 0; kg < num_kernel_groups; ++kg) { - auto &group = kernel_groups[kg]; - for (i32 k = 0; k < group.size(); ++k) { - KernelFactory *factory = std::get<0>(group[k]); - DeviceType device_type = factory->get_device_type(); - i32 max_devices = factory->get_max_devices(); - if (max_devices == Kernel::UnlimitedDevices) { - pipeline_instances_per_node = 1; - } else { - pipeline_instances_per_node = std::min( - pipeline_instances_per_node, - device_type == DeviceType::CPU - ? 
db_params_.num_cpus / local_total / max_devices - : (i32) db_params_.gpu_ids.size() / local_total / max_devices); - } - if (device_type == DeviceType::GPU) { - has_gpu_kernel = true; + } + if (analysis_results.sampling_ops.count(i) > 0) { + i64 local_op_idx = group.size(); + // Set sampling args + auto& sampling_args_per_job = groups.back().sampling_args[local_op_idx]; + for (auto& job : jobs) { + for (auto& saa : job.sampling_args_assignment()) { + if (saa.op_index() == i) { + sampling_args_per_job.emplace_back( + saa.sampling_args().begin(), + saa.sampling_args().end()); + break; + } } } + assert(sampling_args_per_job.size() == jobs.size()); } + group.push_back(std::make_tuple(factory, kernel_configs[i])); + lc.push_back(live_columns[i]); + dc.push_back(dead_columns[i]); + uo.push_back(unused_outputs[i]); + cm.push_back(column_mapping[i]); + st.push_back(analysis_results.stencils[i]); + bt.push_back(analysis_results.batch_sizes[i]); } + } - if (pipeline_instances_per_node <= 0) { - RESULT_ERROR( - job_result, - "JobParameters.pipeline_instances_per_node must -1 for auto-default or " - " greater than 0 for manual configuration."); - return grpc::Status::OK; - } + i32 num_kernel_groups = static_cast(groups.size()); + assert(num_kernel_groups > 0); // is this actually necessary? 
- // Set up memory pool if different than previous memory pool - if (!memory_pool_initialized_ || - job_params->memory_pool_config() != cached_memory_pool_config_) { - if (db_params_.num_cpus < local_total * pipeline_instances_per_node && - job_params->memory_pool_config().cpu().use_pool()) { - RESULT_ERROR( - job_result, - "Cannot oversubscribe CPUs and also use CPU memory pool"); - return grpc::Status::OK; - } - if (db_params_.gpu_ids.size() < local_total * pipeline_instances_per_node && - job_params->memory_pool_config().gpu().use_pool()) { - RESULT_ERROR( - job_result, - "Cannot oversubscribe GPUs and also use GPU memory pool"); - return grpc::Status::OK; + i32 pipeline_instances_per_node = job_params->pipeline_instances_per_node(); + // If ki per node is -1, we set a smart default. Currently, we calculate the + // maximum possible kernel instances without oversubscribing any part of the + // pipeline, either CPU or GPU. + bool has_gpu_kernel = false; + if (pipeline_instances_per_node == -1) { + pipeline_instances_per_node = std::numeric_limits::max(); + for (i32 kg = 0; kg < num_kernel_groups; ++kg) { + auto& group = groups[kg].kernel_factories; + for (i32 k = 0; k < group.size(); ++k) { + // Skip builtin ops + if (std::get<0>(group[k]) == nullptr) { + continue; + } + KernelFactory* factory = std::get<0>(group[k]); + DeviceType device_type = factory->get_device_type(); + i32 max_devices = factory->get_max_devices(); + if (max_devices == Kernel::UnlimitedDevices) { + pipeline_instances_per_node = 1; + } else { + pipeline_instances_per_node = + std::min(pipeline_instances_per_node, + device_type == DeviceType::CPU + ? 
db_params_.num_cpus / local_total / max_devices + : (i32)num_gpus / max_devices); + } + if (device_type == DeviceType::GPU) { + has_gpu_kernel = true; + } } - if (memory_pool_initialized_) { - destroy_memory_allocators(); - } - init_memory_allocators(job_params->memory_pool_config(), db_params_.gpu_ids); - cached_memory_pool_config_ = job_params->memory_pool_config(); - memory_pool_initialized_ = true; - } - - - // Load table metadata for use in constructing io items - DatabaseMetadata meta = - read_database_metadata(storage_, DatabaseMetadata::descriptor_path()); - std::map table_meta; - for (const std::string &table_name : meta.table_names()) { - std::string table_path = - TableMetadata::descriptor_path(meta.get_table_id(table_name)); - table_meta[table_name] = read_table_metadata(storage_, table_path); - } - - // Setup shared resources for distributing work to processing threads - i64 accepted_items = 0; - Queue> load_work; - Queue> initial_eval_work; - std::vector>>> - eval_work(pipeline_instances_per_node); - Queue> save_work; - std::atomic retired_items{0}; - - // Setup load workers - i32 num_load_workers = db_params_.num_load_workers; - std::vector load_thread_profilers(num_load_workers, - Profiler(base_time)); - std::vector load_thread_args; - for (i32 i = 0; i < num_load_workers; ++i) { - // Create IO thread for reading and decoding data - load_thread_args.emplace_back(LoadThreadArgs{ - // Uniform arguments - node_id_, job_params, + } + if (pipeline_instances_per_node == std::numeric_limits::max()) { + pipeline_instances_per_node = 1; + } + } - // Per worker arguments - i, db_params_.storage_config, load_thread_profilers[i], + if (pipeline_instances_per_node <= 0) { + RESULT_ERROR(job_result, + "BulkJobParameters.pipeline_instances_per_node (%d) must be -1 " + "for auto-default or greater than 0 for manual configuration.", + pipeline_instances_per_node); + finished_fn(); + return false; + } - // Queues - load_work, initial_eval_work, - }); + // Set up 
memory pool if different than previous memory pool + if (!memory_pool_initialized_ || + job_params->memory_pool_config() != cached_memory_pool_config_) { + if (db_params_.num_cpus < local_total * pipeline_instances_per_node && + job_params->memory_pool_config().cpu().use_pool()) { + RESULT_ERROR(job_result, + "Cannot oversubscribe CPUs and also use CPU memory pool"); + finished_fn(); + return false; } - std::vector load_threads(num_load_workers); - for (i32 i = 0; i < num_load_workers; ++i) { - pthread_create(&load_threads[i], NULL, load_thread, &load_thread_args[i]); + if (db_params_.gpu_ids.size() < local_total * pipeline_instances_per_node && + job_params->memory_pool_config().gpu().use_pool()) { + RESULT_ERROR(job_result, + "Cannot oversubscribe GPUs and also use GPU memory pool"); + finished_fn(); + return false; } + if (memory_pool_initialized_) { + destroy_memory_allocators(); + } + init_memory_allocators(job_params->memory_pool_config(), gpu_ids); + cached_memory_pool_config_ = job_params->memory_pool_config(); + memory_pool_initialized_ = true; + } - // Setup evaluate workers - std::vector> eval_profilers( - pipeline_instances_per_node); - std::vector> eval_results( + omp_set_num_threads(std::thread::hardware_concurrency()); + + // Setup shared resources for distributing work to processing threads + i64 accepted_tasks = 0; + LoadInputQueue load_work; + std::vector initial_eval_work(pipeline_instances_per_node); + std::vector> eval_work(pipeline_instances_per_node); + OutputEvalQueue output_eval_work(pipeline_instances_per_node); + std::vector save_work(db_params_.num_save_workers); + SaveOutputQueue retired_tasks; + + // Setup load workers + i32 num_load_workers = db_params_.num_load_workers; + std::vector load_thread_profilers; + for (i32 i = 0; i < num_load_workers; ++i) { + load_thread_profilers.emplace_back(Profiler(base_time)); + } + std::vector load_threads; + for (i32 i = 0; i < num_load_workers; ++i) { + LoadWorkerArgs args{// Uniform arguments + 
node_id_, + // Per worker arguments + i, db_params_.storage_config, load_thread_profilers[i], + job_params->load_sparsity_threshold(), io_packet_size, + work_packet_size}; + + load_threads.emplace_back(load_driver, std::ref(load_work), + std::ref(initial_eval_work), args); + } + + // Setup evaluate workers + std::vector> eval_profilers( pipeline_instances_per_node); - std::vector pre_eval_args; - std::vector> eval_args( - pipeline_instances_per_node); - std::vector post_eval_args; - - - i32 next_cpu_num = 0; - i32 next_gpu_idx = db_params_.gpu_ids.size() / local_total * local_id; - for (i32 ki = 0; ki < pipeline_instances_per_node; ++ki) { - std::vector>> &work_queues = - eval_work[ki]; - std::vector &eval_thread_profilers = eval_profilers[ki]; - std::vector& results = eval_results[ki]; - work_queues.resize(num_kernel_groups - 1 + 2); // +2 for pre/post - results.resize(num_kernel_groups); - for (auto& result : results) { - result.set_success(true); - } - for (i32 i = 0; i < num_kernel_groups + 2; ++i) { - eval_thread_profilers.push_back(Profiler(base_time)); - } - - // Evaluate worker - DeviceHandle first_kernel_type; - for (i32 kg = 0; kg < num_kernel_groups; ++kg) { - auto &group = kernel_groups[kg]; - auto &lc = kg_live_columns[kg]; - auto &dc = kg_dead_columns[kg]; - auto &uo = kg_unused_outputs[kg]; - auto &cm = kg_column_mapping[kg]; - std::vector &thread_args = eval_args[ki]; - // HACK(apoms): we assume all ops in a kernel group use the - // same number of devices for now. 
- // for (size_t i = 0; i < group.size(); ++i) { - KernelFactory *factory = std::get<0>(group[0]); - DeviceType device_type = factory->get_device_type(); - if (device_type == DeviceType::CPU) { - for (i32 i = 0; i < factory->get_max_devices(); ++i) { - i32 device_id = next_cpu_num++ % num_cpus; - for (size_t i = 0; i < group.size(); ++i) { - Kernel::Config &config = std::get<1>(group[i]); - config.devices.clear(); - config.devices.push_back({device_type, device_id}); - } - } - } else { - for (i32 i = 0; i < factory->get_max_devices(); ++i) { - i32 device_id = db_params_.gpu_ids[next_gpu_idx++ % num_gpus]; - for (size_t i = 0; i < group.size(); ++i) { - Kernel::Config &config = std::get<1>(group[i]); - config.devices.clear(); - config.devices.push_back({device_type, device_id}); - } + std::vector> eval_results( + pipeline_instances_per_node); + + std::vector> pre_eval_queues; + std::vector pre_eval_args; + std::vector>> eval_queues( + pipeline_instances_per_node); + std::vector> eval_args( + pipeline_instances_per_node); + std::vector> post_eval_queues; + std::vector post_eval_args; + + i32 next_cpu_num = 0; + i32 next_gpu_idx = 0; + std::mutex startup_lock; + std::condition_variable startup_cv; + i32 startup_count = 0; + i32 eval_total = 0; + for (i32 ki = 0; ki < pipeline_instances_per_node; ++ki) { + auto& work_queues = eval_work[ki]; + std::vector& eval_thread_profilers = eval_profilers[ki]; + std::vector& results = eval_results[ki]; + work_queues.resize(num_kernel_groups - 1 + 2); // +2 for pre/post + results.resize(num_kernel_groups); + for (auto& result : results) { + result.set_success(true); + } + for (i32 i = 0; i < num_kernel_groups + 2; ++i) { + eval_thread_profilers.push_back(Profiler(base_time)); + } + + // Evaluate worker + DeviceHandle first_kernel_type; + for (i32 kg = 0; kg < num_kernel_groups; ++kg) { + auto& group = groups[kg].kernel_factories; + std::vector& thread_args = eval_args[ki]; + std::vector>& thread_qs = + eval_queues[ki]; + // 
HACK(apoms): we assume all ops in a kernel group use the + // same number of devices for now. + // for (size_t i = 0; i < group.size(); ++i) { + KernelFactory* factory = nullptr; + for (size_t i = 0; i < group.size(); ++i) { + if (std::get<0>(group[i]) != nullptr) { + factory = std::get<0>(group[i]) ; + } + } + DeviceType device_type = DeviceType::CPU; + i32 max_devices = 1; + // Factory should only be null if we only have builtin ops + if (factory != nullptr) { + device_type = factory->get_device_type(); + max_devices = factory->get_max_devices(); + } + if (device_type == DeviceType::CPU) { + for (i32 i = 0; i < max_devices; ++i) { + i32 device_id = 0; + next_cpu_num++ % num_cpus; + for (size_t i = 0; i < group.size(); ++i) { + KernelConfig& config = std::get<1>(group[i]); + config.devices.clear(); + config.devices.push_back({device_type, device_id}); } } - // Get the device handle for the first kernel in the pipeline - if (kg == 0) { - first_kernel_type = std::get<1>(group[0]).devices[0]; + } else { + for (i32 i = 0; i < max_devices; ++i) { + i32 device_id = gpu_ids[next_gpu_idx++ % num_gpus]; + for (size_t i = 0; i < group.size(); ++i) { + KernelConfig& config = std::get<1>(group[i]); + config.devices.clear(); + config.devices.push_back({device_type, device_id}); + } } + } + // Get the device handle for the first kernel in the pipeline + if (kg == 0) { + first_kernel_type = std::get<1>(group[0]).devices[0]; + } - // Input work queue - Queue> *input_work_queue = - &work_queues[kg]; - // Create new queue for output, reuse previous queue as input - Queue> *output_work_queue = - &work_queues[kg + 1]; - // Create eval thread for passing data through neural net - thread_args.emplace_back(EvaluateThreadArgs{ - // Uniform arguments - node_id_, job_params, - - // Per worker arguments - ki, kg, group, lc, dc, uo, cm, eval_thread_profilers[kg+1], - results[kg], - - // Queues - *input_work_queue, *output_work_queue}); - } - // Pre evaluate worker - { - Queue> 
*input_work_queue = - &initial_eval_work; - Queue> *output_work_queue = - &work_queues[0]; - assert(kernel_groups.size() > 0); - pre_eval_args.emplace_back(PreEvaluateThreadArgs{ - // Uniform arguments - node_id_, num_cpus, job_params, + // Input work queue + EvalQueue* input_work_queue = &work_queues[kg]; + // Create new queue for output, reuse previous queue as input + EvalQueue* output_work_queue = &work_queues[kg + 1]; + // Create eval thread for passing data through neural net + thread_qs.push_back( + std::make_tuple(input_work_queue, output_work_queue)); + thread_args.emplace_back(EvaluateWorkerArgs{ + // Uniform arguments + node_id_, startup_lock, startup_cv, startup_count, - // Per worker arguments - ki, first_kernel_type, eval_thread_profilers.front(), + // Per worker arguments + ki, kg, groups[kg], eval_thread_profilers[kg + 1], results[kg]}); + eval_total += 1; + } + // Pre evaluate worker + { + EvalQueue* input_work_queue; + if (distribute_work_dynamically) { + input_work_queue = &initial_eval_work[ki]; + } else { + input_work_queue = &initial_eval_work[0]; + } + EvalQueue* output_work_queue = + &work_queues[0]; + assert(groups.size() > 0); + pre_eval_queues.push_back( + std::make_tuple(input_work_queue, output_work_queue)); + DeviceHandle decoder_type = std::getenv("FORCE_CPU_DECODE") + ? 
CPU_DEVICE + : first_kernel_type; + pre_eval_args.emplace_back(PreEvaluateWorkerArgs{ + // Uniform arguments + node_id_, num_cpus, job_params->work_packet_size(), - // Queues - *input_work_queue, *output_work_queue}); + // Per worker arguments + ki, decoder_type, eval_thread_profilers.front(), + }); + } + + // Post evaluate worker + { + auto& output_op = ops.at(ops.size() - 1); + std::vector column_names; + for (auto& op_input : output_op.inputs()) { + column_names.push_back(op_input.column()); } - // Post evaluate worker - { - Queue> *input_work_queue = - &work_queues.back(); - Queue> *output_work_queue = - &save_work; - post_eval_args.emplace_back(PostEvaluateThreadArgs{ - // Uniform arguments - node_id_, + EvalQueue* input_work_queue = &work_queues.back(); + OutputEvalQueue* output_work_queue = &output_eval_work; + post_eval_queues.push_back( + std::make_tuple(input_work_queue, output_work_queue)); + post_eval_args.emplace_back(PostEvaluateWorkerArgs{ + // Uniform arguments + node_id_, - // Per worker arguments - ki, eval_thread_profilers.back(), column_mapping.back(), + // Per worker arguments + ki, eval_thread_profilers.back(), column_mapping.back(), + final_output_columns, final_compression_options, + }); + } + } - // Queues - *input_work_queue, *output_work_queue}); - } + // Launch eval worker threads + std::vector pre_eval_threads; + std::vector> eval_threads; + std::vector post_eval_threads; + for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { + // Pre thread + pre_eval_threads.emplace_back( + pre_evaluate_driver, std::ref(*std::get<0>(pre_eval_queues[pu])), + std::ref(*std::get<1>(pre_eval_queues[pu])), pre_eval_args[pu]); + // Op threads + eval_threads.emplace_back(); + std::vector& threads = eval_threads.back(); + for (i32 kg = 0; kg < num_kernel_groups; ++kg) { + threads.emplace_back( + evaluate_driver, std::ref(*std::get<0>(eval_queues[pu][kg])), + std::ref(*std::get<1>(eval_queues[pu][kg])), eval_args[pu][kg]); } + // Post threads + 
post_eval_threads.emplace_back( + post_evaluate_driver, std::ref(*std::get<0>(post_eval_queues[pu])), + std::ref(*std::get<1>(post_eval_queues[pu])), post_eval_args[pu]); + } - // Launch eval worker threads - std::vector pre_eval_threads(pipeline_instances_per_node); - std::vector> eval_threads(pipeline_instances_per_node); - std::vector post_eval_threads(pipeline_instances_per_node); - for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { - // Pre thread - pthread_create(&pre_eval_threads[pu], NULL, pre_evaluate_thread, - &pre_eval_args[pu]); - // Op threads - std::vector &threads = eval_threads[pu]; - threads.resize(num_kernel_groups); - for (i32 kg = 0; kg < num_kernel_groups; ++kg) { - pthread_create(&threads[kg], NULL, evaluate_thread, &eval_args[pu][kg]); - } - // Post threads - pthread_create(&post_eval_threads[pu], NULL, post_evaluate_thread, - &post_eval_args[pu]); - } - - // Setup save workers - i32 num_save_workers = db_params_.num_save_workers; - std::vector save_thread_profilers(num_save_workers, - Profiler(base_time)); - std::vector save_thread_args; - for (i32 i = 0; i < num_save_workers; ++i) { - // Create IO thread for reading and decoding data - save_thread_args.emplace_back( - SaveThreadArgs{// Uniform arguments - node_id_, job_params->job_name(), + // Setup save coordinator + std::thread save_coordinator_thread( + save_coordinator, std::ref(output_eval_work), std::ref(save_work)); - // Per worker arguments - i, db_params_.storage_config, save_thread_profilers[i], + // Setup save workers + i32 num_save_workers = db_params_.num_save_workers; + std::vector save_thread_profilers; + for (i32 i = 0; i < num_save_workers; ++i) { + save_thread_profilers.emplace_back(Profiler(base_time)); + } + std::vector save_threads; + for (i32 i = 0; i < num_save_workers; ++i) { + SaveWorkerArgs args{// Uniform arguments + node_id_, - // Queues - save_work, retired_items}); - } - std::vector save_threads(num_save_workers); - for (i32 i = 0; i < num_save_workers; 
++i) { - pthread_create(&save_threads[i], NULL, save_thread, &save_thread_args[i]); - } + // Per worker arguments + i, db_params_.storage_config, save_thread_profilers[i]}; -#ifdef SCANNER_PROFILING - sleep(10); -#endif - timepoint_t start_time = now(); + save_threads.emplace_back(save_driver, std::ref(save_work[i]), + std::ref(retired_tasks), args); + } - // Monitor amount of work left and request more when running low - while (true) { - i32 local_work = accepted_items - retired_items; - if (local_work < pipeline_instances_per_node * TASKS_IN_QUEUE_PER_PU) { - grpc::ClientContext context; - proto::NodeInfo node_info; - proto::NewWork new_work; - - node_info.set_node_id(node_id_); - grpc::Status status = - master_->NextWork(&context, node_info, &new_work); - if (!status.ok()) { - RESULT_ERROR(job_result, - "Worker %d could not get next work from master", - node_id_); - break; - } + if (job_params->profiling()) { + // Wait until all evaluate workers have started up + std::unique_lock lk(startup_lock); + startup_cv.wait(lk, [&] { + return eval_total == startup_count; + }); + } - i32 next_item = new_work.io_item().item_id(); - if (next_item == -1) { - // No more work left - VLOG(1) << "Node " << node_id_ << " received done signal."; - break; - } else { - load_work.push( - std::make_tuple(new_work.io_item(), new_work.load_work())); - accepted_items++; - } + timepoint_t start_time = now(); + + // Monitor amount of work left and request more when running low + // Round robin work + std::vector allocated_work_to_queues(pipeline_instances_per_node); + std::vector retired_work_for_queues(pipeline_instances_per_node); + bool finished = false; + while (true) { + if (trigger_shutdown_.raised()) { + // Abandon ship! 
+ VLOG(1) << "Worker " << node_id_ << " received shutdown while in NewJob"; + RESULT_ERROR(job_result, "Worker %d shutdown while processing NewJob", + node_id_); + break; + } + if (!job_result->success()) { + VLOG(1) << "Worker " << node_id_ << " in error, stopping."; + break; + } + // We batch up retired tasks to avoid sync overhead + std::vector> batched_retired_tasks; + while (retired_tasks.size() > 0) { + // Pull retired tasks + std::tuple task_retired; + retired_tasks.pop(task_retired); + batched_retired_tasks.push_back(task_retired); + } + if (!batched_retired_tasks.empty()) { + // Make sure the retired tasks were flushed to disk before confirming + std::fflush(NULL); + sync(); + } + for (std::tuple& task_retired : batched_retired_tasks) { + // Inform master that this task was finished + proto::FinishedWorkParameters params; + + params.set_node_id(node_id_); + params.set_job_id(std::get<1>(task_retired)); + params.set_task_id(std::get<2>(task_retired)); + + proto::Empty empty; + grpc::Status status; + GRPC_BACKOFF(master_->FinishedWork(&ctx, params, &empty), status); + + // Update how much is in each pipeline instances work queue + retired_work_for_queues[std::get<0>(task_retired)] += 1; + + if (!status.ok()) { + RESULT_ERROR(job_result, + "Worker %d could not tell finished work to master", + node_id_); + break; + } + } + i64 total_tasks_processed = 0; + for (i64 t : retired_work_for_queues) { + total_tasks_processed += t; + } + if (finished) { + if (total_tasks_processed == accepted_tasks) { + break; + } else { + std::this_thread::yield(); + continue; + } + } + i32 local_work = accepted_tasks - total_tasks_processed; + if (local_work < + pipeline_instances_per_node * job_params->tasks_in_queue_per_pu()) { + proto::NodeInfo node_info; + node_info.set_node_id(node_id_); + + proto::NewWork new_work; + grpc::Status status; + GRPC_BACKOFF(master_->NextWork(&ctx, node_info, &new_work), status); + if (!status.ok()) { + RESULT_ERROR(job_result, + "Worker %d could 
not get next work from master", node_id_); + break; } - for (size_t i = 0; i < eval_results.size(); ++i) { - for (size_t j = 0; j < eval_results[i].size(); ++j) { - auto &result = eval_results[i][j]; - if (!result.success()) { - LOG(WARNING) << "(N/KI/KG: " << node_id_ << "/" << i << "/" << j - << ") returned error result: " << result.msg(); - job_result->set_success(false); - job_result->set_msg(result.msg()); - goto leave_loop; + if (new_work.wait_for_work()) { + // Waiting for more work + VLOG(1) << "Node " << node_id_ << " received wait for work signal."; + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + else if (new_work.no_more_work()) { + // No more work left + VLOG(1) << "Node " << node_id_ << " received done signal."; + finished = true; + } else { + // Perform analysis on load work entry to determine upstream + // requirements and when to discard elements. + std::deque task_stream; + LoadWorkEntry stenciled_entry; + derive_stencil_requirements( + meta, table_meta, jobs.at(new_work.job_index()), ops, + analysis_results, job_params->boundary_condition(), + new_work.table_id(), new_work.job_index(), new_work.task_index(), + std::vector(new_work.output_rows().begin(), + new_work.output_rows().end()), + stenciled_entry, task_stream); + + // Determine which worker to allocate to + i32 target_work_queue = -1; + i32 min_work = std::numeric_limits::max(); + for (int i = 0; i < pipeline_instances_per_node; ++i) { + i64 outstanding_work = + allocated_work_to_queues[i] - retired_work_for_queues[i]; + if (outstanding_work < min_work) { + min_work = outstanding_work; + target_work_queue = i; } } + load_work.push( + std::make_tuple(target_work_queue, task_stream, stenciled_entry)); + allocated_work_to_queues[target_work_queue]++; + accepted_tasks++; } - goto remain_loop; - leave_loop: - break; - remain_loop: - - std::this_thread::yield(); } - // If the job failed, can't expect queues to have drained, so - // attempt to flush all all queues here (otherwise we 
could block - // on pushing into a queue) - if (!job_result->success()) { - load_work.clear(); - initial_eval_work.clear(); - for (i32 kg = 0; kg < num_kernel_groups; ++kg) { - for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { - eval_work[pu][kg].clear(); + for (size_t i = 0; i < eval_results.size(); ++i) { + for (size_t j = 0; j < eval_results[i].size(); ++j) { + auto& result = eval_results[i][j]; + if (!result.success()) { + LOG(WARNING) << "(N/KI/KG: " << node_id_ << "/" << i << "/" << j + << ") returned error result: " << result.msg(); + job_result->set_success(false); + job_result->set_msg(result.msg()); + goto leave_loop; } } + } + goto remain_loop; + leave_loop: + break; + remain_loop: + + std::this_thread::yield(); + } + + // If the job failed, can't expect queues to have drained, so + // attempt to flush all queues here (otherwise we could block + // on pushing into a queue) + if (!job_result->success()) { + load_work.clear(); + for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { + initial_eval_work[pu].clear(); + } + for (i32 kg = 0; kg < num_kernel_groups; ++kg) { for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { - eval_work[pu].back().clear(); + eval_work[pu][kg].clear(); } - save_work.clear(); } - - // Push sentinel work entries into queue to terminate load threads - for (i32 i = 0; i < num_load_workers; ++i) { - LoadWorkEntry entry; - entry.set_io_item_index(-1); - load_work.push(std::make_tuple(IOItem{}, entry)); + for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { + eval_work[pu].back().clear(); } - - for (i32 i = 0; i < num_load_workers; ++i) { - // Wait until load has finished - void *result; - i32 err = pthread_join(load_threads[i], &result); - LOG_IF(FATAL, err != 0) << "error in pthread_join of load thread"; - free(result); + output_eval_work.clear(); + for (i32 i = 0; i < num_save_workers; ++i) { + save_work[i].clear(); } + retired_tasks.clear(); + } - // Push sentinel work entries into queue to terminate eval 
threads - for (i32 i = 0; i < pipeline_instances_per_node; ++i) { - EvalWorkEntry entry; - entry.io_item_index = -1; - initial_eval_work.push(std::make_tuple(IOItem{}, entry)); - } + auto push_exit_message = [](EvalQueue& q) { + EvalWorkEntry entry; + entry.job_index = -1; + q.push(std::make_tuple(std::deque(), entry)); + }; + + auto push_output_eval_exit_message = [](OutputEvalQueue& q) { + EvalWorkEntry entry; + entry.job_index = -1; + q.push(std::make_tuple(0, entry)); + }; + + auto push_save_exit_message = [](SaveInputQueue& q) { + EvalWorkEntry entry; + entry.job_index = -1; + q.push(std::make_tuple(0, entry)); + }; + + // Push sentinel work entries into queue to terminate load threads + for (i32 i = 0; i < num_load_workers; ++i) { + LoadWorkEntry entry; + entry.set_job_index(-1); + load_work.push( + std::make_tuple(0, std::deque(), entry)); + } - for (i32 i = 0; i < pipeline_instances_per_node; ++i) { - // Wait until pre eval has finished - void *result; - i32 err = pthread_join(pre_eval_threads[i], &result); - LOG_IF(FATAL, err != 0) << "error in pthread_join of pre eval thread"; - free(result); - } + for (i32 i = 0; i < num_load_workers; ++i) { + // Wait until all load threads have finished + load_threads[i].join(); + } - for (i32 kg = 0; kg < num_kernel_groups; ++kg) { - for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { - EvalWorkEntry entry; - entry.io_item_index = -1; - eval_work[pu][kg].push(std::make_tuple(IOItem{}, entry)); - } - for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { - // Wait until eval has finished - void *result; - i32 err = pthread_join(eval_threads[pu][kg], &result); - LOG_IF(FATAL, err != 0) << "error in pthread_join of eval thread"; - free(result); - } + // Push sentinel work entries into queue to terminate eval threads + for (i32 i = 0; i < pipeline_instances_per_node; ++i) { + if (distribute_work_dynamically) { + push_exit_message(initial_eval_work[i]); + } else { + push_exit_message(initial_eval_work[0]); } + } 
+ + for (i32 i = 0; i < pipeline_instances_per_node; ++i) { + // Wait until pre eval has finished + LOG(INFO) << "Pre join " << i; + pre_eval_threads[i].join(); + } - // Terminate post eval threads + for (i32 kg = 0; kg < num_kernel_groups; ++kg) { for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { - EvalWorkEntry entry; - entry.io_item_index = -1; - eval_work[pu].back().push(std::make_tuple(IOItem{}, entry)); + push_exit_message(eval_work[pu][kg]); } for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { // Wait until eval has finished - void *result; - i32 err = pthread_join(post_eval_threads[pu], &result); - LOG_IF(FATAL, err != 0) << "error in pthread_join of post eval thread"; - free(result); + eval_threads[pu][kg].join(); } + } - // Push sentinel work entries into queue to terminate save threads - for (i32 i = 0; i < num_save_workers; ++i) { - EvalWorkEntry entry; - entry.io_item_index = -1; - save_work.push(std::make_tuple(IOItem{}, entry)); - } - for (i32 i = 0; i < num_save_workers; ++i) { - // Wait until eval has finished - void *result; - i32 err = pthread_join(save_threads[i], &result); - LOG_IF(FATAL, err != 0) << "error in pthread_join of save thread"; - free(result); + // Terminate post eval threads + for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { + push_exit_message(eval_work[pu].back()); + } + for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { + // Wait until eval has finished + post_eval_threads[pu].join(); + } + + // Push sentinel work entries into queue to terminate coordinator thread + push_output_eval_exit_message(output_eval_work); + save_coordinator_thread.join(); + + // Push sentinel work entries into queue to terminate save threads + for (i32 i = 0; i < num_save_workers; ++i) { + // Wait until save thread is polling on save_work + while(save_work[i].size() >= 0) { + retired_tasks.clear(); } + push_save_exit_message(save_work[i]); + } + for (i32 i = 0; i < num_save_workers; ++i) { + save_threads[i].join(); + 
} -// Ensure all files are flushed -#ifdef SCANNER_PROFILING + // Ensure all files are flushed + if (job_params->profiling()) { std::fflush(NULL); sync(); -#endif + } - if (!job_result->success()) { - return grpc::Status::OK; - } + if (!job_result->success()) { + finished_fn(); + return false; + } - // Write out total time interval - timepoint_t end_time = now(); - - // Execution done, write out profiler intervals for each worker - // TODO: job_name -> job_id? - i32 job_id = meta.get_job_id(job_params->job_name()); - std::string profiler_file_name = job_profiler_path(job_id, node_id_); - std::unique_ptr profiler_output; - BACKOFF_FAIL( - make_unique_write_file(storage_, profiler_file_name, profiler_output)); - - i64 base_time_ns = - std::chrono::time_point_cast(base_time) - .time_since_epoch() - .count(); - i64 start_time_ns = - std::chrono::time_point_cast(start_time) - .time_since_epoch() - .count(); - i64 end_time_ns = - std::chrono::time_point_cast(end_time) - .time_since_epoch() - .count(); - s_write(profiler_output.get(), start_time_ns); - s_write(profiler_output.get(), end_time_ns); - - i64 out_rank = node_id_; - // Load worker profilers - u8 load_worker_count = num_load_workers; - s_write(profiler_output.get(), load_worker_count); - for (i32 i = 0; i < num_load_workers; ++i) { - write_profiler_to_file(profiler_output.get(), out_rank, "load", "", i, - load_thread_profilers[i]); - } - - // Evaluate worker profilers - u8 eval_worker_count = pipeline_instances_per_node; - s_write(profiler_output.get(), eval_worker_count); - u8 profilers_per_chain = 3; - s_write(profiler_output.get(), profilers_per_chain); - for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { - i32 i = pu; - { - std::string tag = "pre"; - write_profiler_to_file(profiler_output.get(), out_rank, "eval", tag, i, - eval_profilers[pu][0]); - } - { - std::string tag = "eval"; - write_profiler_to_file(profiler_output.get(), out_rank, "eval", tag, i, - eval_profilers[pu][1]); - } - { - 
std::string tag = "post"; - write_profiler_to_file(profiler_output.get(), out_rank, "eval", tag, i, - eval_profilers[pu][2]); - } - } + // Write out total time interval + timepoint_t end_time = now(); + + // Execution done, write out profiler intervals for each worker + // TODO: job_name -> job_id? + i32 job_id = meta.get_bulk_job_id(job_params->job_name()); + std::string profiler_file_name = bulk_job_profiler_path(job_id, node_id_); + std::unique_ptr profiler_output; + BACKOFF_FAIL( + make_unique_write_file(storage_, profiler_file_name, profiler_output)); + + i64 base_time_ns = + std::chrono::time_point_cast(base_time) + .time_since_epoch() + .count(); + i64 start_time_ns = + std::chrono::time_point_cast(start_time) + .time_since_epoch() + .count(); + i64 end_time_ns = + std::chrono::time_point_cast(end_time) + .time_since_epoch() + .count(); + s_write(profiler_output.get(), start_time_ns); + s_write(profiler_output.get(), end_time_ns); + + i64 out_rank = node_id_; + // Load worker profilers + u8 load_worker_count = num_load_workers; + s_write(profiler_output.get(), load_worker_count); + for (i32 i = 0; i < num_load_workers; ++i) { + write_profiler_to_file(profiler_output.get(), out_rank, "load", "", i, + load_thread_profilers[i]); + } - // Save worker profilers - u8 save_worker_count = num_save_workers; - s_write(profiler_output.get(), save_worker_count); - for (i32 i = 0; i < num_save_workers; ++i) { - write_profiler_to_file(profiler_output.get(), out_rank, "save", "", i, - save_thread_profilers[i]); + // Evaluate worker profilers + u8 eval_worker_count = pipeline_instances_per_node; + s_write(profiler_output.get(), eval_worker_count); + u8 profilers_per_chain = 3; + s_write(profiler_output.get(), profilers_per_chain); + for (i32 pu = 0; pu < pipeline_instances_per_node; ++pu) { + i32 i = pu; + { + std::string tag = "pre"; + write_profiler_to_file(profiler_output.get(), out_rank, "eval", tag, i, + eval_profilers[pu][0]); + } + { + std::string tag = "eval"; + 
write_profiler_to_file(profiler_output.get(), out_rank, "eval", tag, i, + eval_profilers[pu][1]); + } + { + std::string tag = "post"; + write_profiler_to_file(profiler_output.get(), out_rank, "eval", tag, i, + eval_profilers[pu][2]); } + } - BACKOFF_FAIL(profiler_output->save()); + // Save worker profilers + u8 save_worker_count = num_save_workers; + s_write(profiler_output.get(), save_worker_count); + for (i32 i = 0; i < num_save_workers; ++i) { + write_profiler_to_file(profiler_output.get(), out_rank, "save", "", i, + save_thread_profilers[i]); + } - VLOG(1) << "Worker " << node_id_ << " finished NewJob"; + BACKOFF_FAIL(profiler_output->save()); - return grpc::Status::OK; - } + std::fflush(NULL); + sync(); - grpc::Status LoadOp(grpc::ServerContext* context, const proto::OpInfo* op_info, - proto::Empty* empty) { - const std::string& so_path = op_info->so_path(); - void *handle = dlopen(so_path.c_str(), RTLD_NOW | RTLD_LOCAL); - LOG_IF(FATAL, handle == nullptr) - << "dlopen of " << so_path << " failed: " << dlerror(); - return grpc::Status::OK; - } - -private: - std::unique_ptr master_; - storehouse::StorageConfig *storage_config_; - DatabaseParameters db_params_; - i32 node_id_; - storehouse::StorageBackend *storage_; - std::map table_metas_; - bool memory_pool_initialized_ = false; - MemoryPoolConfig cached_memory_pool_config_; -}; - -proto::Worker::Service *get_worker_service(DatabaseParameters ¶ms, - const std::string &master_address) { - return new WorkerImpl(params, master_address); + finished_fn(); + + VLOG(1) << "Worker " << node_id_ << " finished job"; + + return true; } } diff --git a/scanner/engine/worker.h b/scanner/engine/worker.h new file mode 100644 index 00000000..3a1e4d01 --- /dev/null +++ b/scanner/engine/worker.h @@ -0,0 +1,120 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "scanner/engine/metadata.h" +#include "scanner/engine/rpc.grpc.pb.h" +#include "scanner/engine/runtime.h" + +#include +#include +#include +#include +#include + +namespace scanner { +namespace internal { + +class WorkerImpl final : public proto::Worker::Service { + public: + WorkerImpl(DatabaseParameters& db_params, std::string master_address, + std::string worker_port); + + ~WorkerImpl(); + + grpc::Status NewJob(grpc::ServerContext* context, + const proto::BulkJobParameters* job_params, + proto::Result* job_result); + + grpc::Status LoadOp(grpc::ServerContext* context, + const proto::OpPath* op_path, proto::Empty* empty); + + grpc::Status RegisterOp(grpc::ServerContext* context, + const proto::OpRegistration* op_registration, + proto::Result* result); + + grpc::Status RegisterPythonKernel( + grpc::ServerContext* context, + const proto::PythonKernelRegistration* python_kernel, + proto::Result* result); + + grpc::Status Shutdown(grpc::ServerContext* context, const proto::Empty* empty, + Result* result); + + grpc::Status PokeWatchdog(grpc::ServerContext* context, + const proto::Empty* empty, proto::Empty* result); + + grpc::Status Ping(grpc::ServerContext* context, const proto::Empty* empty, + proto::Empty* result); + + void start_watchdog(grpc::Server* server, bool enable_timeout, + i32 timeout_ms = 50000); + + Result register_with_master(); + + private: + void try_unregister(); + + void start_job_processor(); + + void stop_job_processor(); + + bool process_job(const proto::BulkJobParameters* job_params, + 
proto::Result* job_result); + + enum State { + INITIALIZING, + IDLE, + RUNNING_JOB, + SHUTTING_DOWN, + }; + + Condition state_; + std::atomic_flag unregistered_; + + std::thread watchdog_thread_; + std::atomic watchdog_awake_; + std::unique_ptr master_; + storehouse::StorageConfig* storage_config_; + DatabaseParameters db_params_; + Flag trigger_shutdown_; + std::string master_address_; + std::string worker_port_; + i32 node_id_; + storehouse::StorageBackend* storage_; + std::map table_metas_; + bool memory_pool_initialized_ = false; + MemoryPoolConfig cached_memory_pool_config_; + + // True if the worker is executing a job + std::mutex active_mutex_; + std::condition_variable active_cv_; + bool active_bulk_job_ = false; + proto::BulkJobParameters job_params_; + + // True if all work for job is done + std::mutex finished_mutex_; + std::condition_variable finished_cv_; + std::atomic finished_{true}; + Result job_result_; + + + std::thread job_processor_thread_; + // Manages modification of all of the below structures + std::mutex work_mutex_; +}; +} +} diff --git a/scanner/metadata.proto b/scanner/metadata.proto index c7daac81..0b0bd279 100644 --- a/scanner/metadata.proto +++ b/scanner/metadata.proto @@ -4,19 +4,21 @@ package scanner.proto; // Database metadata messages message DatabaseDescriptor { - message Job { + message BulkJob { int32 id = 1; string name = 2; + bool committed = 3; } message Table { int32 id = 1; string name = 2; + bool committed = 3; } - int32 next_job_id = 1; + int32 next_bulk_job_id = 1; int32 next_table_id = 2; - repeated Job jobs = 3; + repeated BulkJob bulk_jobs = 3; repeated Table tables = 4; } @@ -44,6 +46,12 @@ enum ColumnType { Image = 2; } +enum FrameType { + U8 = 0; + F32 = 1; + F64 = 2; +} + message Column { int32 id = 1; string name = 2; @@ -51,8 +59,14 @@ message Column { } message VideoDescriptor { + enum VideoContainerType { + MP4 = 0; + NONE = 1; + } + enum VideoCodecType { H264 = 0; + RAW = 1; } enum VideoChromaFormat { @@ 
-69,14 +83,25 @@ message VideoDescriptor { int64 frames = 4; int32 width = 5; int32 height = 6; + int32 channels = 14; + FrameType frame_type = 13; VideoCodecType codec_type = 7; VideoChromaFormat chroma_format = 8; + int32 time_base_num = 15; + int32 time_base_denom = 16; + + int64 num_encoded_videos = 17; + repeated int64 frames_per_video = 18; + repeated int64 keyframes_per_video = 19; + repeated int64 size_per_video = 20; - repeated int64 keyframe_positions = 9 [packed=true]; - repeated int64 keyframe_timestamps = 10 [packed=true]; - repeated int64 keyframe_byte_offsets = 11 [packed=true]; + repeated uint64 sample_offsets = 9 [packed=true]; + repeated uint64 sample_sizes = 10 [packed=true]; + repeated uint64 keyframe_indices = 11 [packed=true]; bytes metadata_packets = 12; + string data_path = 21; + bool inplace = 22; } message ImageFormatGroupDescriptor { @@ -99,56 +124,73 @@ message TableDescriptor { int64 timestamp = 7; } -// Task set messages -message TableSample { - string table_name = 1; - repeated string column_names = 2; - string sampling_function = 3; - bytes sampling_args = 4; -} - -message Task { - string output_table_name = 2; - repeated TableSample samples = 3; -} - message OpInput { int32 op_index = 1; - repeated string columns = 2; + string column = 2; } message Op { string name = 1; repeated OpInput inputs = 2; - DeviceType device_type = 3; - bytes kernel_args = 4; + bytes kernel_args = 3; + // Used by kernels + DeviceType device_type = 4; + repeated int32 stencil = 5; + int32 batch = 6; + int32 warmup = 7; +} + +message OutputColumnCompression { + string codec = 1; + map options = 2; +} + +message ColumnInput { + int32 op_index = 1; + string table_name = 2; + string column_name = 3; +} + +message SamplingArgs { + string sampling_function = 1; + bytes sampling_args = 2; } -message TaskSet { - repeated Task tasks = 1; - repeated Op ops = 2; +message SamplingArgsAssignment { + int32 op_index = 1; + // Repeated to allow for slices + repeated 
SamplingArgs sampling_args = 2; } -message JobDescriptor { +message Job { + string output_table_name = 1; + repeated ColumnInput inputs = 2; + repeated SamplingArgsAssignment sampling_args_assignment = 3; +} + +message BulkJobDescriptor { int32 id = 1; string name = 2; - int32 io_item_size = 3; - int32 work_item_size = 4; + int32 io_packet_size = 3; + int32 work_packet_size = 4; int32 num_nodes = 5; - repeated Task tasks = 6; - repeated Column columns = 7; + repeated Job jobs = 6; } // Interal messages message DecodeArgs { - int32 width = 4; - int32 height = 5; + int32 width = 10; + int32 height = 11; int64 start_keyframe = 6; int64 end_keyframe = 7; repeated int64 keyframes = 1; - repeated int64 keyframe_byte_offsets = 2; - repeated int64 valid_frames = 3; - bytes encoded_video = 8; + repeated int64 keyframe_indices = 2; + repeated uint64 sample_offsets = 3; + repeated uint64 sample_sizes = 4; + repeated int64 valid_frames = 5; + int64 encoded_video = 8; + int64 encoded_video_size = 9; + bytes metadata = 12; } message ImageDecodeArgs { @@ -162,14 +204,16 @@ message ImageDecodeArgs { message LoadSample { int32 table_id = 1; - repeated int32 column_ids = 2; - repeated int64 warmup_rows = 3 [packed=true]; - repeated int64 rows = 4 [packed=true]; + int32 column_id = 2; + repeated int64 input_row_ids = 3 [packed=true]; + repeated int64 output_row_ids = 4 [packed=true]; } message LoadWorkEntry { - int32 io_item_index = 1; - repeated LoadSample samples = 2; + int32 table_id = 1; + int32 job_index = 2; + int32 task_index = 3; + repeated LoadSample samples = 4; } message MemoryPoolConfig { @@ -178,6 +222,7 @@ message MemoryPoolConfig { int64 free_space = 2; } + bool pinned_cpu = 1; Pool cpu = 3; Pool gpu = 4; } @@ -193,8 +238,8 @@ message CollectionsDescriptor { } message FrameInfo { - int32 width = 1; - int32 height = 2; + repeated int32 shape = 1; + int32 type = 2; } message MachineParameters { @@ -204,43 +249,49 @@ message MachineParameters { repeated int32 gpu_ids = 4; 
} -message IOItem { - // @brief the output table id - int32 table_id = 1; - // @brief the unique id for this item in the table - int64 item_id = 2; - // @brief the first row in this item - int64 start_row = 3; - // @brief the row after the last row in this item - int64 end_row = 4; +// Sampler args +message StridedSamplerArgs { + int64 stride = 1; } -// Sampler args +message StridedRangeSamplerArgs { + int64 stride = 1; + repeated int64 starts = 2; + repeated int64 ends = 3; +} -message AllSamplerArgs { - int64 sample_size = 1; - int64 warmup_size = 2; +message GatherSamplerArgs { + repeated int64 rows = 1 [packed=true]; } -message StridedRangeSamplerArgs { + +message SpaceNullSamplerArgs { + int64 spacing = 1; +} + +message SpaceRepeatSamplerArgs { + int64 spacing = 1; +} + +message StridedPartitionerArgs { int64 stride = 1; - repeated int64 warmup_starts = 2; - repeated int64 starts = 3; - repeated int64 ends = 4; + int64 group_size = 2; } -message StencilSamplerArgs { +message StridedRangePartitionerArgs { int64 stride = 1; - repeated int64 stencil = 2; - repeated int64 starts = 3; - repeated int64 ends = 4; + repeated int64 starts = 2; + repeated int64 ends = 3; } -message GatherSamplerArgs { - message Sample { - repeated int64 warmup_rows = 1 [packed=true]; - repeated int64 rows = 2 [packed=true]; +message GatherPartitionerArgs { + message GatherList { + repeated int64 rows = 1 [packed=true]; } - repeated Sample samples = 1; + repeated GatherList groups = 1; +} + +message PythonArgs { + bytes py_args = 1; } diff --git a/scanner/types.proto b/scanner/types.proto index 6dce1240..dfc4c158 100644 --- a/scanner/types.proto +++ b/scanner/types.proto @@ -45,4 +45,5 @@ message NetDescriptor { bool preserve_aspect_ratio = 12; bool transpose = 13; int32 pad_mod = 14; -} \ No newline at end of file + bool uses_python = 15; +} diff --git a/scanner/util/CMakeLists.txt b/scanner/util/CMakeLists.txt index 014b82da..699f70e4 100644 --- a/scanner/util/CMakeLists.txt +++ 
b/scanner/util/CMakeLists.txt @@ -18,9 +18,9 @@ set(SOURCE_FILES profiler.cpp fs.cpp bbox.cpp - progress_bar.cpp) + glog.cpp) -if (OPENCV_FOUND) +if (OpenCV_FOUND) list(APPEND SOURCE_FILES opencv.cpp) endif() diff --git a/scanner/util/bbox.cpp b/scanner/util/bbox.cpp index 0b70f0d2..def7f67a 100644 --- a/scanner/util/bbox.cpp +++ b/scanner/util/bbox.cpp @@ -4,7 +4,7 @@ namespace scanner { -std::vector best_nms(const std::vector &boxes, +std::vector best_nms(const std::vector& boxes, f32 overlap) { std::vector valid(boxes.size(), true); auto cmp = [](std::pair left, std::pair right) { @@ -21,14 +21,12 @@ std::vector best_nms(const std::vector &boxes, std::pair entry = q.top(); q.pop(); i32 c_idx = entry.second; - if (!valid[c_idx]) - continue; + if (!valid[c_idx]) continue; best.push_back(c_idx); for (i32 i = 0; i < (i32)boxes.size(); ++i) { - if (!valid[i]) - continue; + if (!valid[i]) continue; f32 x1 = std::max(boxes[c_idx].x1(), boxes[i].x1()); f32 y1 = std::max(boxes[c_idx].y1(), boxes[i].y1()); @@ -52,7 +50,7 @@ std::vector best_nms(const std::vector &boxes, return out_boxes; } -std::vector average_nms(const std::vector &boxes, +std::vector average_nms(const std::vector& boxes, f32 overlap) { std::vector best_boxes; std::vector valid(boxes.size(), true); @@ -70,22 +68,20 @@ std::vector average_nms(const std::vector &boxes, std::pair entry = q.top(); q.pop(); i32 c_idx = entry.second; - if (!valid[c_idx]) - continue; + if (!valid[c_idx]) continue; best.push_back(c_idx); - const BoundingBox ¤t_box = boxes[c_idx]; + const BoundingBox& current_box = boxes[c_idx]; f64 total_weight = current_box.score(); f64 best_x1 = current_box.x1() * current_box.score(); f64 best_y1 = current_box.y1() * current_box.score(); f64 best_x2 = current_box.x2() * current_box.score(); f64 best_y2 = current_box.y2() * current_box.score(); for (i32 i = 0; i < (i32)boxes.size(); ++i) { - if (!valid[i]) - continue; + if (!valid[i]) continue; - const BoundingBox &candidate = boxes[i]; + const 
BoundingBox& candidate = boxes[i]; f32 x1 = std::max(current_box.x1(), candidate.x1()); f32 y1 = std::max(current_box.y1(), candidate.y1()); diff --git a/scanner/util/common.cpp b/scanner/util/common.cpp index d94c5b18..29a9a92b 100644 --- a/scanner/util/common.cpp +++ b/scanner/util/common.cpp @@ -17,7 +17,7 @@ namespace scanner { -std::ostream &operator<<(std::ostream &os, DeviceHandle const &handle) { +std::ostream& operator<<(std::ostream& os, DeviceHandle const& handle) { std::string name; if (handle.type == DeviceType::CPU) { name = "CPU"; @@ -30,13 +30,13 @@ std::ostream &operator<<(std::ostream &os, DeviceHandle const &handle) { } StridedInterval::StridedInterval(i32 start, i32 end, i32 stride) - : start(start), end(end), stride(stride) {} + : start(start), end(end), stride(stride) {} -StridedInterval::StridedInterval(const Interval &i) - : start(i.start), end(i.end), stride(1) {} +StridedInterval::StridedInterval(const Interval& i) + : start(i.start), end(i.end), stride(1) {} -bool string_to_image_encoding_type(const std::string &s, - ImageEncodingType &type) { +bool string_to_image_encoding_type(const std::string& s, + ImageEncodingType& type) { bool success = true; if (s == "png" || s == "PNG") { type = ImageEncodingType::PNG; @@ -55,26 +55,23 @@ bool string_to_image_encoding_type(const std::string &s, std::string image_encoding_type_to_string(ImageEncodingType t) { std::string s; switch (t) { - case ImageEncodingType::JPEG: - s = "jpeg"; - break; - case ImageEncodingType::PNG: - s = "png"; - break; - case ImageEncodingType::BMP: - s = "bmp"; - break; - case ImageEncodingType::RAW: - s = "raw"; - break; - default: - assert(false); + case ImageEncodingType::JPEG: + s = "jpeg"; + break; + case ImageEncodingType::PNG: + s = "png"; + break; + case ImageEncodingType::BMP: + s = "bmp"; + break; + case ImageEncodingType::RAW: + s = "raw"; + break; + default: + assert(false); } return s; } -i64 IO_ITEM_SIZE = 64; // Number of rows to load and save at a time -i64 
WORK_ITEM_SIZE = 8; // Max size of a work item -i32 TASKS_IN_QUEUE_PER_PU = 4; // How many tasks per PU to allocate to a node -i32 NUM_CUDA_STREAMS = 32; // Number of cuda streams for image processing +i32 NUM_CUDA_STREAMS = 32; // Number of cuda streams for image processing } diff --git a/scanner/util/common.h b/scanner/util/common.h index 44a9a3d1..ca36a72d 100644 --- a/scanner/util/common.h +++ b/scanner/util/common.h @@ -15,10 +15,10 @@ #pragma once +#include "glog/logging.h" +#include "scanner/engine/rpc.pb.h" #include "scanner/metadata.pb.h" #include "scanner/types.pb.h" -#include "scanner/engine/rpc.pb.h" -#include "glog/logging.h" #include #include @@ -47,37 +47,35 @@ using proto::MemoryPoolConfig; using proto::BoundingBox; using proto::Point; using proto::Result; -using proto::IOItem; struct DeviceHandle { -public: + public: bool operator==(const DeviceHandle& other) { return type == other.type && id == other.id; } - bool operator!=(const DeviceHandle& other) { - return !(*this == other); + bool operator!=(const DeviceHandle& other) { return !(*this == other); } + + bool operator<(const DeviceHandle& other) const { + return type < other.type && id < other.id; } bool can_copy_to(const DeviceHandle& other) { - return !(this->type == DeviceType::GPU && - other.type == DeviceType::GPU && + return !(this->type == DeviceType::GPU && other.type == DeviceType::GPU && this->id != other.id); } bool is_same_address_space(const DeviceHandle& other) { - return - this->type == other.type && - ((this->type == DeviceType::CPU) || - (this->type == DeviceType::GPU && - this->id == other.id)); + return this->type == other.type && + ((this->type == DeviceType::CPU) || + (this->type == DeviceType::GPU && this->id == other.id)); } DeviceType type; i32 id; }; -std::ostream& operator<<(std::ostream &os, const DeviceHandle& handle); +std::ostream& operator<<(std::ostream& os, const DeviceHandle& handle); static const DeviceHandle CPU_DEVICE = {DeviceType::CPU, 0}; @@ -97,19 +95,20 
@@ struct StridedInterval { i32 stride = 1; }; -bool string_to_image_encoding_type(const std::string& s, proto::ImageEncodingType& t); +bool string_to_image_encoding_type(const std::string& s, + proto::ImageEncodingType& t); std::string image_encoding_type_to_string(proto::ImageEncodingType d); -#define RESULT_ERROR(result__, str__, ...) { \ - char errstr__[1024]; \ - snprintf(errstr__, 1024, str__, ## __VA_ARGS__); \ - LOG(ERROR) << errstr__; \ - (result__)->set_success(false); \ - (result__)->set_msg(errstr__); \ +#define RESULT_ERROR(result__, str__, ...) \ + { \ + char errstr__[1024]; \ + snprintf(errstr__, 1024, str__, ##__VA_ARGS__); \ + LOG(ERROR) << errstr__; \ + (result__)->set_success(false); \ + (result__)->set_msg(errstr__); \ } /////////////////////////////////////////////////////////////////////////////// /// Global constants -extern i32 TASKS_IN_QUEUE_PER_PU; // How many tasks per PU to allocate -extern i32 NUM_CUDA_STREAMS; // # of cuda streams for image processing +extern i32 NUM_CUDA_STREAMS; // # of cuda streams for image processing } diff --git a/scanner/util/cuda.h b/scanner/util/cuda.h index b85dab18..82cfe9a7 100644 --- a/scanner/util/cuda.h +++ b/scanner/util/cuda.h @@ -25,36 +25,36 @@ #ifdef HAVE_CUDA #define CUDA_PROTECT(s) (s); #else -#define CUDA_PROTECT(s) { \ - LOG(FATAL) << "Cuda not enabled."; \ - } +#define CUDA_PROTECT(s) \ + { LOG(FATAL) << "Cuda not enabled."; } #endif #ifdef HAVE_CUDA +#include +#include +#include +#include +#include + #define CU_CHECK(ans) \ { cuAssert((ans), __FILE__, __LINE__); } -inline void cuAssert(cudaError_t code, const char *file, int line) { +inline void cuAssert(cudaError_t code, const char* file, int line) { if (code != cudaSuccess) { - LOG(FATAL) << "GPUassert: " - << cudaGetErrorString(code) << " " - << file << " " - << line; + LOG(FATAL) << "GPUassert: " << cudaGetErrorString(code) << " " << file + << " " << line; } } #define CUD_CHECK(ans) \ { cudAssert((ans), __FILE__, __LINE__); } -inline void 
cudAssert(CUresult code, const char *file, int line) { +inline void cudAssert(CUresult code, const char* file, int line) { if (code != CUDA_SUCCESS) { - const char *err_str; + const char* err_str; cuGetErrorString(code, &err_str); - LOG(FATAL) << "GPUassert: " - << err_str << " " - << file << " " - << line; + LOG(FATAL) << "GPUassert: " << err_str << " " << file << " " << line; } } diff --git a/scanner/util/fs.cpp b/scanner/util/fs.cpp index 3475d225..d1d970e9 100644 --- a/scanner/util/fs.cpp +++ b/scanner/util/fs.cpp @@ -7,17 +7,17 @@ #include /* mkdir(2) */ #include /* access(2) */ #include -#include #include +#include namespace scanner { // Stolen from // https://gist.github.com/JonathonReinhart/8c0d90191c38af2dcadb102c4e202950 -int mkdir_p(const char *path, mode_t mode) { +int mkdir_p(const char* path, mode_t mode) { /* Adapted from http://stackoverflow.com/a/2336245/119527 */ const size_t len = strlen(path); char _path[PATH_MAX]; - char *p; + char* p; errno = 0; @@ -40,8 +40,7 @@ int mkdir_p(const char *path, mode_t mode) { return -1; } if (mkdir(_path, mode) != 0) { - if (errno != EEXIST) - return -1; + if (errno != EEXIST) return -1; } } @@ -50,14 +49,13 @@ int mkdir_p(const char *path, mode_t mode) { } if (mkdir(_path, mode) != 0) { - if (errno != EEXIST) - return -1; + if (errno != EEXIST) return -1; } return 0; } -void temp_file(FILE **fp, std::string &name) { +void temp_file(FILE** fp, std::string& name) { char n[] = "/tmp/scannerXXXXXX"; int fd = mkstemp(n); *fp = fdopen(fd, "wb+"); @@ -72,7 +70,7 @@ void temp_file(std::string& name) { void temp_dir(std::string& name) { char n[] = "/tmp/scannerXXXXXX"; - (void) mkdtemp(n); + (void)mkdtemp(n); name = std::string(n); } @@ -102,5 +100,4 @@ std::vector read_entire_file(const std::string& file_name) { std::istreambuf_iterator()); return data; } - } diff --git a/scanner/util/fs.h b/scanner/util/fs.h index 8828826b..2c5d3dd4 100644 --- a/scanner/util/fs.h +++ b/scanner/util/fs.h @@ -15,14 +15,14 @@ #pragma 
once +#include #include #include #include #include -#include #include +#include #include -#include namespace scanner { @@ -55,5 +55,4 @@ std::string download_temp(const std::string& url); void delete_file(const std::string& path); std::vector read_entire_file(const std::string& file_name); - } diff --git a/scanner/util/glog.cpp b/scanner/util/glog.cpp new file mode 100644 index 00000000..b0d1283f --- /dev/null +++ b/scanner/util/glog.cpp @@ -0,0 +1,31 @@ +/* Copyright 2017 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "glog/logging.h" + +#include + +namespace scanner { +namespace { +std::atomic_flag glog_initialized; +} + +void init_glog(const char* prog_name) { + if (!glog_initialized.test_and_set()) { + google::InitGoogleLogging(prog_name); + } +} + +} diff --git a/scanner/util/glog.h b/scanner/util/glog.h new file mode 100644 index 00000000..380cd4eb --- /dev/null +++ b/scanner/util/glog.h @@ -0,0 +1,28 @@ +/* Copyright 2017 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "glog/logging.h" + +#include +#include +#include + +namespace scanner { + +void init_glog(const char* program_name); + +} diff --git a/scanner/util/grpc.h b/scanner/util/grpc.h new file mode 100644 index 00000000..9007fe96 --- /dev/null +++ b/scanner/util/grpc.h @@ -0,0 +1,44 @@ +/* Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace scanner { + +#define GRPC_BACKOFF(expression__, status__) \ + do { \ + int sleep_debt__ = 1; \ + while (true) { \ + grpc::ClientContext ctx; \ + const grpc::Status result__ = (expression__); \ + if (result__.error_code() == grpc::StatusCode::UNAVAILABLE) { \ + double sleep_time__ = \ + (sleep_debt__ + (static_cast(rand()) / RAND_MAX)); \ + if (sleep_debt__ < 64) { \ + sleep_debt__ *= 2; \ + } else { \ + LOG(WARNING) << "GRPC_BACKOFF: reached max backoff."; \ + } \ + LOG(WARNING) << "GRPC_BACKOFF: transient failure, sleeping for " \ + << sleep_time__ << " seconds."; \ + usleep(sleep_time__ * 1000000); \ + continue; \ + } \ + status__ = result__; \ + break; \ + } \ + } while (0); +} diff --git a/scanner/util/h264.h b/scanner/util/h264.h index 97082c29..c3d4e410 100644 --- a/scanner/util/h264.h +++ b/scanner/util/h264.h @@ -25,14 +25,14 @@ struct GetBitsState { i64 size; }; -inline u32 get_bit(GetBitsState &gb) { +inline u32 get_bit(GetBitsState& gb) 
{ u8 v = ((*(gb.buffer + (gb.offset >> 0x3))) >> (0x7 - (gb.offset & 0x7))) & 0x1; gb.offset++; return v; } -inline u32 get_bits(GetBitsState &gb, i32 bits) { +inline u32 get_bits(GetBitsState& gb, i32 bits) { u32 v = 0; for (i32 i = bits - 1; i >= 0; i--) { v |= get_bit(gb) << i; @@ -40,7 +40,7 @@ inline u32 get_bits(GetBitsState &gb, i32 bits) { return v; } -inline u32 get_ue_golomb(GetBitsState &gb) { +inline u32 get_ue_golomb(GetBitsState& gb) { // calculate zero bits. Will be optimized. i32 zeros = 0; while (0 == get_bit(gb)) { @@ -57,7 +57,7 @@ inline u32 get_ue_golomb(GetBitsState &gb) { return (info - 1); } -inline u32 get_se_golomb(GetBitsState &gb) { +inline u32 get_se_golomb(GetBitsState& gb) { // calculate zero bits. Will be optimized. i32 zeros = 0; while (0 == get_bit(gb)) { @@ -74,11 +74,14 @@ inline u32 get_se_golomb(GetBitsState &gb) { return (info - 1); } - inline void next_nal(const u8*& buffer, i32& buffer_size_left, const u8*& nal_start, i32& nal_size) { - while (buffer_size_left > 2 && - !(buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0x01)) { + bool found = false; + while (buffer_size_left > 2) { + if (buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0x01) { + found = true; + break; + } buffer++; buffer_size_left--; } @@ -88,17 +91,22 @@ inline void next_nal(const u8*& buffer, i32& buffer_size_left, nal_start = buffer; nal_size = 0; - if (buffer_size_left > 2) { - while (!(buffer[0] == 0x00 && buffer[1] == 0x00 && - (buffer[2] == 0x00 || buffer[2] == 0x01))) { - buffer++; - buffer_size_left--; - nal_size++; - if (buffer_size_left < 3) { - nal_size += buffer_size_left; - break; - } - } + + if (!found) { + return; + } + while (buffer_size_left > 2 && + !(buffer[0] == 0x00 && buffer[1] == 0x00 && + (buffer[2] == 0x00 || buffer[2] == 0x01))) { + buffer++; + buffer_size_left--; + nal_size++; + } + if (!(buffer_size_left > 3)) { + nal_size += buffer_size_left; + // Not sure if this is needed or not... 
+ // buffer += buffer_size_left; + // buffer_size_left = 0; } } @@ -124,7 +132,7 @@ struct SPS { bool frame_mbs_only_flag; }; -inline bool parse_sps(GetBitsState &gb, SPS& info) { +inline bool parse_sps(GetBitsState& gb, SPS& info) { // profile_idc info.profile_idc = get_bits(gb, 8); // constraint_set0_flag @@ -139,17 +147,18 @@ inline bool parse_sps(GetBitsState &gb, SPS& info) { get_bits(gb, 8); // seq_parameter_set_id info.sps_id = get_ue_golomb(gb); - if (info.profile_idc == 100 || // High profile - info.profile_idc == 110 || // High10 profile - info.profile_idc == 122 || // High422 profile - info.profile_idc == 244 || // High444 Predictive profile - info.profile_idc == 44 || // Cavlc444 profile - info.profile_idc == 83 || // Scalable Constrained High profile (SVC) - info.profile_idc == 86 || // Scalable High Intra profile (SVC) - info.profile_idc == 118 || // Stereo High profile (MVC) - info.profile_idc == 128 || // Multiview High profile (MVC) - info.profile_idc == 138 || // Multiview Depth High profile (MVCD) - info.profile_idc == 144) { + if (info.profile_idc == 100 || // High profile + info.profile_idc == 110 || // High10 profile + info.profile_idc == 122 || // High422 profile + info.profile_idc == 244 || // High444 Predictive profile + info.profile_idc == 44 || // Cavlc444 profile + info.profile_idc == 83 || // Scalable Constrained High profile (SVC) + info.profile_idc == 86 || // Scalable High Intra profile (SVC) + info.profile_idc == 118 || // Stereo High profile (MVC) + info.profile_idc == 128 || // Multiview High profile (MVC) + info.profile_idc == 138 || // Multiview Depth High profile (MVCD) + info.profile_idc == 139 || info.profile_idc == 134 || + info.profile_idc == 135 || info.profile_idc == 144) { // chroma_format_idc u32 chroma_format_idc = get_ue_golomb(gb); if (chroma_format_idc > 3U) { @@ -190,28 +199,31 @@ inline bool parse_sps(GetBitsState &gb, SPS& info) { // pic_order_cnt_type info.poc_type = get_ue_golomb(gb); switch (info.poc_type) { - 
case 0: { - // log2_max_pic_order_cnt_lsb_minus4 - info.log2_max_pic_order_cnt_lsb = get_ue_golomb(gb) + 4; - } break; - case 1: { - // delta_pic_order_always_zero_flag - info.delta_pic_order_always_zero_flag = get_bit(gb); - // offset_for_non_ref_pic - get_se_golomb(gb); - // offset_for_top_to_bottom_field - get_se_golomb(gb); - // num_ref_frames_in_pic_order_cnt_cycle - u32 num_ref_frames = get_ue_golomb(gb); - for (u32 i = 0; i < num_ref_frames; i++) { - // offset_for_ref_frame[ i ]; + case 0: { + // log2_max_pic_order_cnt_lsb_minus4 + info.log2_max_pic_order_cnt_lsb = get_ue_golomb(gb) + 4; + } break; + case 1: { + // delta_pic_order_always_zero_flag + info.delta_pic_order_always_zero_flag = get_bit(gb); + // offset_for_non_ref_pic get_se_golomb(gb); - } - } break; - default: { - LOG(WARNING) << "Illegal picture_order_count type: " << info.poc_type; - return false; - } break; + // offset_for_top_to_bottom_field + get_se_golomb(gb); + // num_ref_frames_in_pic_order_cnt_cycle + u32 num_ref_frames = get_ue_golomb(gb); + for (u32 i = 0; i < num_ref_frames; i++) { + // offset_for_ref_frame[ i ]; + get_se_golomb(gb); + } + } break; + case 2: { + // NOTE(apoms): Nothing to do here + } break; + default: { + LOG(WARNING) << "Illegal picture_order_count type: " << info.poc_type; + return false; + } break; } // num_ref_frames get_ue_golomb(gb); @@ -227,15 +239,18 @@ inline bool parse_sps(GetBitsState &gb, SPS& info) { return true; } - struct PPS { u32 pps_id; u32 sps_id; bool pic_order_present_flag; bool redundant_pic_cnt_present_flag; + u32 num_ref_idx_l0_default_active; + u32 num_ref_idx_l1_default_active; + bool weighted_pred_flag; + u8 weighted_bipred_idc; }; -inline bool parse_pps(GetBitsState &gb, PPS& info) { +inline bool parse_pps(GetBitsState& gb, PPS& info) { // pic_parameter_set_id info.pps_id = get_ue_golomb(gb); // seq_parameter_set_id @@ -254,13 +269,13 @@ inline bool parse_pps(GetBitsState &gb, PPS& info) { return false; } // num_ref_idx_l0_active_minus1 - 
u32 num_ref_idx_l0_active_minus1 = get_ue_golomb(gb); + info.num_ref_idx_l0_default_active = get_ue_golomb(gb) + 1; // num_ref_idx_l1_active_minus1 - u32 num_ref_idx_l1_active_minus1 = get_ue_golomb(gb); + info.num_ref_idx_l1_default_active = get_ue_golomb(gb) + 1; // weighted_pred_flag - bool weighted_pred_flag = get_bit(gb); + info.weighted_pred_flag = get_bit(gb); // weighted_bipred_idc - bool weighted_bipred_idc = get_bits(gb, 2); + info.weighted_bipred_idc = get_bits(gb, 2); // pic_init_qp_minus26 /* relative to 26 */ u32 pic_init_qp_minus26 = get_se_golomb(gb); // pic_init_qs_minus26 /* relative to 26 */ @@ -268,9 +283,9 @@ inline bool parse_pps(GetBitsState &gb, PPS& info) { // chroma_qp_index_offset u32 chroma_qp_index_offset = get_se_golomb(gb); // deblocking_filter_control_present_flag - (void) get_bit(gb); + (void)get_bit(gb); // constrained_intra_pred_flag - (void) get_bit(gb); + (void)get_bit(gb); // redundant_pic_cnt_present_flag info.redundant_pic_cnt_present_flag = get_bit(gb); // rbsp_trailing_bits() @@ -282,7 +297,7 @@ struct SliceHeader { u32 nal_unit_type; u32 nal_ref_idc; u32 slice_type; - u32 sps_id; // Added for convenience + u32 sps_id; // Added for convenience u32 pps_id; u32 frame_num; bool field_pic_flag; @@ -292,13 +307,13 @@ struct SliceHeader { i32 delta_pic_order_cnt_bottom; u32 delta_pic_order_cnt[2]; u32 redundant_pic_cnt; + u32 num_ref_idx_l0_active; + u32 num_ref_idx_l1_active; }; -inline bool parse_slice_header(GetBitsState &gb, - SPS &sps, - std::map &pps_map, - u32 nal_unit_type, u32 nal_ref_idc, - SliceHeader& info) { +inline bool parse_slice_header(GetBitsState& gb, SPS& sps, + std::map& pps_map, u32 nal_unit_type, + u32 nal_ref_idc, SliceHeader& info) { info.nal_unit_type = nal_unit_type; info.nal_ref_idc = nal_ref_idc; // first_mb_in_slice @@ -352,15 +367,34 @@ inline bool parse_slice_header(GetBitsState &gb, } info.redundant_pic_cnt = pps.redundant_pic_cnt_present_flag ? 
get_ue_golomb(gb) : 0; + if (info.slice_type == 1 || info.slice_type == 6) { + bool direct_spatial_mv_pred_flag = get_bit(gb); + } + if (info.slice_type == 0 || info.slice_type == 5 || // P + info.slice_type == 1 || info.slice_type == 6 || // B + info.slice_type == 3 || info.slice_type == 8 // SP + ) { + bool num_ref_idx_active_override_flag = get_bit(gb); + if (num_ref_idx_active_override_flag) { + info.num_ref_idx_l0_active = get_ue_golomb(gb); + if (info.slice_type == 1 || info.slice_type == 6) { + info.num_ref_idx_l1_active = get_ue_golomb(gb); + } + } else { + info.num_ref_idx_l0_active = pps.num_ref_idx_l0_default_active; + info.num_ref_idx_l1_active = pps.num_ref_idx_l1_default_active; + ; + } + } return true; } -inline bool is_new_access_unit(std::map &sps_map, - std::map &pps_map, SliceHeader &prev, - SliceHeader &curr) { - SPS &prev_sps = sps_map.at(prev.sps_id); - SPS &curr_sps = sps_map.at(curr.sps_id); - PPS &curr_pps = pps_map.at(curr.pps_id); +inline bool is_new_access_unit(std::map& sps_map, + std::map& pps_map, SliceHeader& prev, + SliceHeader& curr) { + SPS& prev_sps = sps_map.at(prev.sps_id); + SPS& curr_sps = sps_map.at(curr.sps_id); + PPS& curr_pps = pps_map.at(curr.pps_id); if (curr.nal_unit_type != 5 && curr.frame_num != prev.frame_num) { VLOG(1) << "frame num"; return true; @@ -383,9 +417,8 @@ inline bool is_new_access_unit(std::map &sps_map, prev.delta_pic_order_cnt_bottom != curr.delta_pic_order_cnt_bottom)) { VLOG(1) << "poc type 0: " << prev.pic_order_cnt_lsb << " vs. " - << curr.pic_order_cnt_lsb << ", " - << prev.delta_pic_order_cnt_bottom << " vs. " - << curr.delta_pic_order_cnt_bottom; + << curr.pic_order_cnt_lsb << ", " << prev.delta_pic_order_cnt_bottom + << " vs. 
" << curr.delta_pic_order_cnt_bottom; return true; } else if ((prev_sps.poc_type == 1 && curr_sps.poc_type == 1) && (prev.delta_pic_order_cnt[0] != curr.delta_pic_order_cnt[0] || diff --git a/scanner/util/halide.h b/scanner/util/halide.h index 3e401c11..e690152c 100644 --- a/scanner/util/halide.h +++ b/scanner/util/halide.h @@ -1,13 +1,13 @@ #pragma once -#include "scanner/util/common.h" #include "scanner/api/kernel.h" +#include "scanner/util/common.h" #include "HalideRuntime.h" #ifdef HAVE_CUDA #include "HalideRuntimeCuda.h" -#include "scanner/engine/halide_context.h" +#include "scanner/util/halide_context.h" #endif namespace scanner { @@ -23,10 +23,8 @@ void setup_halide_frame_buf(buffer_t& halide_buf, FrameInfo& frame_info) { halide_buf.elem_size = 1; } -void set_halide_buf_ptr(const DeviceHandle& device, - buffer_t &halide_buf, - u8 *buf, - size_t size) { +void set_halide_buf_ptr(const DeviceHandle& device, buffer_t& halide_buf, + u8* buf, size_t size) { if (device.type == DeviceType::GPU) { CUDA_PROTECT({ halide_buf.dev = (uintptr_t) nullptr; @@ -42,18 +40,16 @@ void set_halide_buf_ptr(const DeviceHandle& device, // "You'll need to set the host field of the buffer_t structs to // something other than nullptr as that is used to indicate bounds query // calls" - Zalman Stern - halide_buf.host = (u8 *)0xdeadbeef; - }); + halide_buf.host = (u8*)0xdeadbeef; + }); } else { halide_buf.host = buf; } } -void unset_halide_buf_ptr(const DeviceHandle& device, - buffer_t &halide_buf) { +void unset_halide_buf_ptr(const DeviceHandle& device, buffer_t& halide_buf) { if (device.type == DeviceType::GPU) { CUDA_PROTECT({ halide_cuda_detach_device_ptr(nullptr, &halide_buf); }); } } - } diff --git a/scanner/engine/halide_context.cpp b/scanner/util/halide_context.cpp similarity index 85% rename from scanner/engine/halide_context.cpp rename to scanner/util/halide_context.cpp index 1dff8661..de9e811c 100644 --- a/scanner/engine/halide_context.cpp +++ b/scanner/util/halide_context.cpp 
@@ -1,5 +1,6 @@ #include "scanner/util/cuda.h" +#ifdef HAVE_CUDA namespace Halide { namespace Runtime { namespace Internal { @@ -9,3 +10,4 @@ CUcontext context = 0; } } } +#endif diff --git a/scanner/util/halide_context.h b/scanner/util/halide_context.h new file mode 100644 index 00000000..3b480e2b --- /dev/null +++ b/scanner/util/halide_context.h @@ -0,0 +1,13 @@ +#include "scanner/util/cuda.h" + +#pragma once + +namespace Halide { +namespace Runtime { +namespace Internal { +namespace Cuda { +extern CUcontext context; +} +} +} +} diff --git a/scanner/util/memory.cpp b/scanner/util/memory.cpp index 32a54164..1275222d 100644 --- a/scanner/util/memory.cpp +++ b/scanner/util/memory.cpp @@ -16,11 +16,11 @@ #include "scanner/util/memory.h" #include "scanner/util/cuda.h" -#include -#include #include #include #include +#include +#include #ifdef HAVE_CUDA #include @@ -48,14 +48,15 @@ namespace scanner { // // 3. Block allocations allow ops to allocate a single block of memory // for -// their returned rows instead of allocating individually for each row. This +// their returned elements instead of allocating individually for each +// element. This // again reduces the number of cudaMallocs if not using a memory pool. // Regardless of pool usage, blocks can also be copied in a single memcpy // instead of many, which reduces memcpy calls. To avoid complexity in the -// core Scanner engine, it is oblivious to whether a u8* in an output row +// core Scanner engine, it is oblivious to whether a u8* in an output element // is from a block or an individual allocation. Instead, the allocation // runtime does reference counting when the engine calls free on a memory -// block, e.g. if a memory block is allocated for 96 rows (96 different +// block, e.g. if a memory block is allocated for 96 elements (96 different // pointers in the same block), then each free to a pointer into the block // decrements a reference counter until freeing the block at 0 refs. 
// @@ -65,35 +66,47 @@ namespace scanner { // and the latter by the pool if it exists. class Allocator { -public: + public: virtual ~Allocator(){}; - virtual u8 *allocate(size_t size) = 0; - virtual void free(u8 *buffer) = 0; + virtual u8* allocate(size_t size) = 0; + virtual void free(u8* buffer) = 0; }; class SystemAllocator : public Allocator { -public: - SystemAllocator(DeviceHandle device) : device_(device) {} + public: + SystemAllocator(DeviceHandle device) + : device_(device) { + } - u8 *allocate(size_t size) { + ~SystemAllocator() { + // Device reset ensures cuda-memcheck will work + if (device_.type == DeviceType::GPU) { + CUDA_PROTECT({ + CU_CHECK(cudaSetDevice(device_.id)); + CU_CHECK(cudaDeviceReset()); + }); + } + } + + u8* allocate(size_t size) { if (device_.type == DeviceType::CPU) { try { return new u8[size]; - } catch (const std::bad_alloc &e) { + } catch (const std::bad_alloc& e) { LOG(FATAL) << "CPU memory allocation failed: " << e.what(); } } else if (device_.type == DeviceType::GPU) { - u8 *buffer; + u8* buffer; CUDA_PROTECT({ CU_CHECK(cudaSetDevice(device_.id)); - CU_CHECK(cudaMalloc((void **)&buffer, size)); - }) + CU_CHECK(cudaMalloc((void**)&buffer, size)); + }); return buffer; } } - void free(u8 *buffer) { + void free(u8* buffer) { if (device_.type == DeviceType::CPU) { delete[] buffer; } else if (device_.type == DeviceType::GPU) { @@ -112,25 +125,27 @@ class SystemAllocator : public Allocator { } } -private: + private: DeviceHandle device_; }; -bool pointer_in_buffer(u8 *ptr, u8 *buf_start, u8 *buf_end) { +bool pointer_in_buffer(u8* ptr, u8* buf_start, u8* buf_end) { return (size_t)ptr >= (size_t)buf_start && (size_t)ptr < (size_t)buf_end; } class PoolAllocator : public Allocator { -public: - PoolAllocator(DeviceHandle device, SystemAllocator *allocator, + public: + PoolAllocator(DeviceHandle device, SystemAllocator* allocator, size_t pool_size) - : device_(device), system_allocator(allocator), pool_size_(pool_size) { + : device_(device), 
system_allocator(allocator), pool_size_(pool_size) { pool_ = system_allocator->allocate(pool_size_); } - ~PoolAllocator() { system_allocator->free(pool_); } + ~PoolAllocator() { + system_allocator->free(pool_); + } - u8 *allocate(size_t size) { + u8* allocate(size_t size) { Allocation alloc; alloc.length = size; @@ -158,7 +173,7 @@ class PoolAllocator : public Allocator { if (!found) { if (num_alloc > 0) { - Allocation &last = allocations_[num_alloc - 1]; + Allocation& last = allocations_[num_alloc - 1]; alloc.offset = align(last.offset + last.length); } else { alloc.offset = 0; @@ -169,7 +184,7 @@ class PoolAllocator : public Allocator { LOG_IF(FATAL, alloc.offset + alloc.length >= pool_size_) << "Exceeded pool size"; - u8 *buffer = pool_ + alloc.offset; + u8* buffer = pool_ + alloc.offset; return buffer; } @@ -183,7 +198,7 @@ class PoolAllocator : public Allocator { } } - void free(u8 *buffer) { + void free(u8* buffer) { LOG_IF(FATAL, !pointer_in_buffer(buffer, pool_, pool_ + pool_size_)) << "Pool allocator tried to free buffer not in pool"; @@ -192,12 +207,12 @@ class PoolAllocator : public Allocator { bool found = find_buffer(buffer, index); LOG_IF(FATAL, !found) << "Attempted to free unallocated buffer in pool"; - Allocation &alloc = allocations_[index]; + Allocation& alloc = allocations_[index]; allocations_.erase(allocations_.begin() + index); } -private: - bool find_buffer(u8 *buffer, i32 &index) { + private: + bool find_buffer(u8* buffer, i32& index) { i32 num_alloc = allocations_.size(); for (i32 i = 0; i < num_alloc; ++i) { Allocation alloc = allocations_[i]; @@ -215,29 +230,29 @@ class PoolAllocator : public Allocator { } Allocation; DeviceHandle device_; - u8 *pool_ = nullptr; + u8* pool_ = nullptr; size_t pool_size_; std::mutex lock_; std::vector allocations_; - SystemAllocator *system_allocator; + SystemAllocator* system_allocator; }; class BlockAllocator { -public: - BlockAllocator(Allocator *allocator) : allocator_(allocator) {} + public: + 
BlockAllocator(Allocator* allocator) : allocator_(allocator) {} ~BlockAllocator() { std::lock_guard guard(lock_); - for (Allocation& alloc : allocations_) { assert(alloc.refs > 0); allocator_->free(alloc.buffer); } + allocations_.clear(); } - u8 *allocate(size_t size, i32 refs) { - u8 *buffer = allocator_->allocate(size); + u8* allocate(size_t size, i32 refs) { + u8* buffer = allocator_->allocate(size); Allocation alloc; alloc.buffer = buffer; @@ -250,25 +265,36 @@ class BlockAllocator { return buffer; } - void free(u8 *buffer) { + void add_refs(u8* buffer, size_t refs) { + std::lock_guard guard(lock_); + + i32 index; + bool found = find_buffer(buffer, index); + LOG_IF(FATAL, !found) + << "Block allocator tried to add ref to non-block buffer"; + + Allocation& alloc = allocations_[index]; + alloc.refs += refs; + } + + void free(u8* buffer) { std::lock_guard guard(lock_); i32 index; bool found = find_buffer(buffer, index); LOG_IF(FATAL, !found) << "Block allocator freed non-block buffer"; - Allocation &alloc = allocations_[index]; + Allocation& alloc = allocations_[index]; assert(alloc.refs > 0); alloc.refs -= 1; if (alloc.refs == 0) { allocator_->free(alloc.buffer); allocations_.erase(allocations_.begin() + index); - return; } } - bool buffers_in_same_block(std::vector buffers) { + bool buffers_in_same_block(std::vector buffers) { assert(buffers.size() > 0); std::lock_guard guard(lock_); @@ -289,14 +315,13 @@ class BlockAllocator { return true; } - bool buffer_in_block(u8 *buffer) { + bool buffer_in_block(u8* buffer) { std::lock_guard guard(lock_); i32 index; return find_buffer(buffer, index); } -private: - bool find_buffer(u8 *buffer, i32 &index) { + bool find_buffer(u8* buffer, i32& index) { i32 num_alloc = allocations_.size(); for (i32 i = 0; i < num_alloc; ++i) { Allocation alloc = allocations_[i]; @@ -307,29 +332,192 @@ class BlockAllocator { } return false; } + private: typedef struct { - u8 *buffer; + u8* buffer; size_t size; i32 refs; } Allocation; 
std::mutex lock_; std::vector allocations_; - Allocator *allocator_; + Allocator* allocator_; +}; + +class LinkedAllocator { + public: + LinkedAllocator(std::map allocators) + : allocators_(allocators) {} + + ~LinkedAllocator() { + std::lock_guard guard(lock_); + for (Allocation& alloc : allocations_) { + for (auto kv : alloc.buffers) { + auto& allocator = allocators_.at(kv.first); + allocator->free(kv.second); + } + } + allocations_.clear(); + } + + u8* allocate(DeviceHandle device, size_t size, i32 refs) { + auto& allocator = allocators_.at(device); + u8* buffer = allocator->allocate(size); + + Allocation alloc; + alloc.buffers[device] = buffer; + alloc.size = size; + alloc.refs[device] = refs; + + std::lock_guard guard(lock_); + allocations_.push_back(alloc); + + return buffer; + } + + void add_refs(DeviceHandle device, u8* buffer, size_t refs) { + auto& allocator = allocators_.at(device); + + std::lock_guard guard(lock_); + + i32 index; + bool found = find_buffer(device, buffer, index); + LOG_IF(FATAL, !found) + << "Block allocator tried to add ref to non-block buffer"; + + Allocation& alloc = allocations_[index]; + alloc.refs[device] += refs; + } + + void copy_or_add_refs(DeviceHandle source_device, u8* source_buffer, + size_t refs, DeviceHandle target_device, + u8*& target_buffer) { + std::lock_guard guard(lock_); + + // Check if buffer exists + i32 index; + bool found = find_buffer(source_device, source_buffer, index); + LOG_IF(FATAL, !found) + << "Linked allocator tried to copy or add ref to non-block buffer"; + // Check if requested device exists + Allocation& alloc = allocations_[index]; + if (alloc.refs.count(target_device) > 0) { + // Add ref + alloc.refs[target_device] += refs; + } else { + // Copy + auto& allocator = allocators_.at(source_device); + u8* new_buffer = allocator->allocate(alloc.size); + memcpy_buffer(new_buffer, target_device, + alloc.buffers[source_device], source_device, alloc.size); + alloc.refs[target_device] = refs; + 
alloc.buffers[target_device] = new_buffer; + } + // Set target_buffer to same offset as it would be in the allocation that + // source_buffer is from + target_buffer = (u64)(source_buffer - alloc.buffers[source_device]) + + alloc.buffers[target_device]; + } + + void free(DeviceHandle device, u8* buffer) { + auto& allocator = allocators_.at(device); + + std::lock_guard guard(lock_); + + i32 index; + bool found = find_buffer(device, buffer, index); + LOG_IF(FATAL, !found) << "Block allocator freed non-block buffer"; + + Allocation& alloc = allocations_[index]; + assert(alloc.refs[device] > 0); + alloc.refs[device] -= 1; + + if (alloc.refs[device] == 0) { + allocator->free(alloc.buffers[device]); + alloc.buffers.erase(device); + alloc.refs.erase(device); + if (alloc.refs.size() == 0) { + allocations_.erase(allocations_.begin() + index); + } + } + } + + bool buffers_in_same_block(DeviceHandle device, std::vector buffers) { + assert(buffers.size() > 0); + + std::lock_guard guard(lock_); + i32 base_index; + bool found = find_buffer(device, buffers[0], base_index); + if (!found) { + return false; + } + + for (i32 i = 1; i < buffers.size(); ++i) { + i32 index; + found = find_buffer(device, buffers[i], index); + if (!found || base_index != index) { + return false; + } + } + + return true; + } + + bool buffer_in_block(DeviceHandle device, u8* buffer) { + std::lock_guard guard(lock_); + i32 index; + return find_buffer(device, buffer, index); + } + + private: + bool find_buffer(DeviceHandle device, u8* buffer, i32& index) { + auto& allocations = allocations_; + i32 num_alloc = allocations_.size(); + for (i32 i = 0; i < num_alloc; ++i) { + Allocation alloc = allocations_[i]; + if (alloc.buffers.count(device) > 0) { + u8* alloc_buffer = alloc.buffers[device]; + if (pointer_in_buffer(buffer, alloc_buffer, + alloc_buffer + alloc.size)) { + index = i; + return true; + } + } + } + return false; + } + + typedef struct { + std::map buffers; + size_t size; + std::map refs; + } 
Allocation; + + std::mutex lock_; + i64 last_allocation_id_; + std::vector allocations_; + std::map allocators_; }; -static SystemAllocator *cpu_system_allocator = nullptr; -static std::map gpu_system_allocators; -static PoolAllocator *cpu_pool_allocator = nullptr; -static BlockAllocator *cpu_block_allocator = nullptr; +static std::unique_ptr cpu_system_allocator; +static std::map gpu_system_allocators; +static PoolAllocator* cpu_pool_allocator = nullptr; +static std::unique_ptr cpu_block_allocator; static std::map gpu_pool_allocators; -static std::map gpu_block_allocators; +static std::map gpu_block_allocators; +static std::unique_ptr linked_allocator; + +//#define USE_LINKED_ALLOCATOR + +#define PINNED_BUFFER_SIZE (32<<20) +static std::map pinned_cpu_buffers; +static std::map pinned_cpu_locks; void init_memory_allocators(MemoryPoolConfig config, std::vector gpu_device_ids) { - cpu_system_allocator = new SystemAllocator(CPU_DEVICE); - Allocator *cpu_block_allocator_base = cpu_system_allocator; + cpu_system_allocator.reset(new SystemAllocator(CPU_DEVICE)); + Allocator* cpu_block_allocator_base = cpu_system_allocator.get(); if (config.cpu().use_pool()) { struct sysinfo info; i32 err = sysinfo(&info); @@ -339,23 +527,29 @@ void init_memory_allocators(MemoryPoolConfig config, << "Requested CPU free space (" << config.cpu().free_space() << ") " << "larger than total CPU memory size ( " << total_mem << ")"; cpu_pool_allocator = - new PoolAllocator(CPU_DEVICE, cpu_system_allocator, + new PoolAllocator(CPU_DEVICE, cpu_system_allocator.get(), total_mem - config.cpu().free_space()); cpu_block_allocator_base = cpu_pool_allocator; } - cpu_block_allocator = new BlockAllocator(cpu_block_allocator_base); +#ifdef USE_LINKED_ALLOCATOR + std::map allocators; + allocators[CPU_DEVICE] = cpu_block_allocator_base; +#else + cpu_block_allocator.reset(new BlockAllocator(cpu_block_allocator_base)); +#endif #ifdef HAVE_CUDA for (i32 device_id : gpu_device_ids) { + cudaSetDevice(device_id); 
DeviceHandle device = {DeviceType::GPU, device_id}; - SystemAllocator *gpu_system_allocator = new SystemAllocator(device); + SystemAllocator* gpu_system_allocator = new SystemAllocator(device); gpu_system_allocators[device.id] = gpu_system_allocator; - Allocator *gpu_block_allocator_base = gpu_system_allocator; + Allocator* gpu_block_allocator_base = gpu_system_allocator; if (config.gpu().use_pool()) { cudaDeviceProp prop; CU_CHECK(cudaGetDeviceProperties(&prop, device_id)); size_t total_mem = prop.totalGlobalMem; - LOG_IF(FATAL, config.cpu().free_space() > total_mem) + LOG_IF(FATAL, config.gpu().free_space() > total_mem) << "Requested GPU free space (" << config.gpu().free_space() << ") " << "larger than total GPU memory size ( " << total_mem << ") " << "on device " << device_id; @@ -363,19 +557,29 @@ void init_memory_allocators(MemoryPoolConfig config, device, gpu_system_allocator, total_mem - config.gpu().free_space()); gpu_block_allocator_base = gpu_pool_allocators[device.id]; } +#ifdef USE_LINKED_ALLOCATOR + allocators[device] = gpu_block_allocator_base; +#else gpu_block_allocators[device.id] = new BlockAllocator(gpu_block_allocator_base); +#endif + CU_CHECK(cudaMallocHost((void**)&pinned_cpu_buffers[device.id], + PINNED_BUFFER_SIZE)); } #endif +#ifdef USE_LINKED_ALLOCATOR + linked_allocator.reset(new LinkedAllocator(allocators)); +#endif } void destroy_memory_allocators() { - delete cpu_block_allocator; + linked_allocator.reset(nullptr); + cpu_block_allocator.reset(nullptr); if (cpu_pool_allocator) { delete cpu_pool_allocator; cpu_pool_allocator = nullptr; } - delete cpu_system_allocator; + cpu_system_allocator.reset(nullptr); #ifdef HAVE_CUDA for (auto entry : gpu_block_allocators) { @@ -387,15 +591,19 @@ void destroy_memory_allocators() { for (auto entry : gpu_system_allocators) { delete entry.second; } + for (auto entry : pinned_cpu_buffers) { + cudaFreeHost(entry.second); + } gpu_block_allocators.clear(); gpu_pool_allocators.clear(); 
gpu_system_allocators.clear(); + pinned_cpu_buffers.clear(); #endif } -SystemAllocator *system_allocator_for_device(DeviceHandle device) { +SystemAllocator* system_allocator_for_device(DeviceHandle device) { if (device.type == DeviceType::CPU) { - return cpu_system_allocator; + return cpu_system_allocator.get(); } else if (device.type == DeviceType::GPU) { CUDA_PROTECT({/* dummy to trigger cuda check */}); return gpu_system_allocators.at(device.id); @@ -404,9 +612,9 @@ SystemAllocator *system_allocator_for_device(DeviceHandle device) { } } -BlockAllocator *block_allocator_for_device(DeviceHandle device) { +BlockAllocator* block_allocator_for_device(DeviceHandle device) { if (device.type == DeviceType::CPU) { - return cpu_block_allocator; + return cpu_block_allocator.get(); } else if (device.type == DeviceType::GPU) { CUDA_PROTECT({/* dummy to trigger cuda check */}); return gpu_block_allocators.at(device.id); @@ -415,41 +623,76 @@ BlockAllocator *block_allocator_for_device(DeviceHandle device) { } } -u8 *new_buffer(DeviceHandle device, size_t size) { - assert(size > 0); - SystemAllocator *allocator = system_allocator_for_device(device); - return allocator->allocate(size); +u8* new_buffer(DeviceHandle device, size_t size) { + return new_block_buffer(device, size, 1); } -u8 *new_block_buffer(DeviceHandle device, size_t size, i32 refs) { +u8* new_block_buffer(DeviceHandle device, size_t size, i32 refs) { assert(size > 0); - BlockAllocator *allocator = block_allocator_for_device(device); +#ifdef USE_LINKED_ALLOCATOR + return linked_allocator->allocate(device, size, refs); +#else + BlockAllocator* allocator = block_allocator_for_device(device); return allocator->allocate(size, refs); +#endif +} + +void add_buffer_ref(DeviceHandle device, u8* buffer) { + add_buffer_refs(device, buffer, 1); } -void delete_buffer(DeviceHandle device, u8 *buffer) { +void add_buffer_refs(DeviceHandle device, u8* buffer, i32 refs) { assert(buffer != nullptr); - BlockAllocator *block_allocator 
= block_allocator_for_device(device); +#ifdef USE_LINKED_ALLOCATOR + return linked_allocator->add_refs(device, buffer, refs); +#else + BlockAllocator* block_allocator = block_allocator_for_device(device); + block_allocator->add_refs(buffer, refs); +#endif +} + +void delete_buffer(DeviceHandle device, u8* buffer) { + assert(buffer != nullptr); +#ifdef USE_LINKED_ALLOCATOR + linked_allocator->free(device, buffer); +#else + BlockAllocator* block_allocator = block_allocator_for_device(device); if (block_allocator->buffer_in_block(buffer)) { block_allocator->free(buffer); } else { - SystemAllocator *system_allocator = system_allocator_for_device(device); + SystemAllocator* system_allocator = system_allocator_for_device(device); system_allocator->free(buffer); } +#endif } // FIXME(wcrichto): case if transferring between two different GPUs -void memcpy_buffer(u8 *dest_buffer, DeviceHandle dest_device, - const u8 *src_buffer, DeviceHandle src_device, size_t size) { - if (dest_device.type == DeviceType::CPU && src_device.type == DeviceType::CPU) { +void memcpy_buffer(u8* dest_buffer, DeviceHandle dest_device, + const u8* src_buffer, DeviceHandle src_device, size_t size) { + if (dest_device.type == DeviceType::CPU && + src_device.type == DeviceType::CPU) { memcpy(dest_buffer, src_buffer, size); } else { assert(!(dest_device.type == DeviceType::GPU && src_device.type == DeviceType::GPU && dest_device.id != src_device.id)); CUDA_PROTECT({ - CU_CHECK(cudaSetDevice(src_device.id)); + CU_CHECK(cudaSetDevice(src_device.id)); + if (size <= PINNED_BUFFER_SIZE) { + if (dest_device.type == DeviceType::CPU) { + CU_CHECK(cudaMemcpy(pinned_cpu_buffers[src_device.id], src_buffer, + size, cudaMemcpyDefault)); + memcpy(dest_buffer, pinned_cpu_buffers[src_device.id], size); + } else if (src_device.type == DeviceType::CPU) { + memcpy(pinned_cpu_buffers[dest_device.id], src_buffer, size); + CU_CHECK(cudaMemcpy(dest_buffer, pinned_cpu_buffers[dest_device.id], size, + cudaMemcpyDefault)); + } 
else { + CU_CHECK(cudaMemcpy(dest_buffer, src_buffer, size, cudaMemcpyDefault)); + } + } else { CU_CHECK(cudaMemcpy(dest_buffer, src_buffer, size, cudaMemcpyDefault)); + } }); } } @@ -457,61 +700,102 @@ void memcpy_buffer(u8 *dest_buffer, DeviceHandle dest_device, #define NUM_CUDA_STREAMS 32 // TODO(wcrichto): implement CPU-CPU transfer -void memcpy_vec(std::vector dest_buffers, DeviceHandle dest_device, - const std::vector src_buffers, DeviceHandle src_device, - std::vector sizes) { - assert(dest_device.type == DeviceType::GPU || - src_device.type == DeviceType::GPU); +void memcpy_vec(std::vector& dest_buffers, DeviceHandle dest_device, + const std::vector& src_buffers, DeviceHandle src_device, + const std::vector& sizes) { assert(src_device.can_copy_to(dest_device)); assert(dest_buffers.size() > 0); assert(src_buffers.size() > 0); assert(dest_buffers.size() == src_buffers.size()); -#ifdef HAVE_CUDA - thread_local std::vector streams; - if (streams.size() == 0) { - streams.resize(NUM_CUDA_STREAMS); - for (i32 i = 0; i < NUM_CUDA_STREAMS; ++i) { - cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking); - } - } - - BlockAllocator *dest_allocator = block_allocator_for_device(dest_device); - BlockAllocator *src_allocator = block_allocator_for_device(src_device); +#ifndef USE_LINKED_ALLOCATOR + BlockAllocator* dest_allocator = block_allocator_for_device(dest_device); + BlockAllocator* src_allocator = block_allocator_for_device(src_device); +#endif - if (src_device.type == DeviceType::GPU) { - CU_CHECK(cudaSetDevice(src_device.id)); - } else if (dest_device.type == DeviceType::GPU) { - CU_CHECK(cudaSetDevice(dest_device.id)); + size_t total_size = 0; + for (auto size : sizes) { + total_size += size; } // In the case where the dest and src vectors are each respectively drawn // from a single block, we do a single memcpy from one block to the other. 
- if (dest_allocator->buffers_in_same_block(dest_buffers) && - src_allocator->buffers_in_same_block(src_buffers)) { - size_t total_size = 0; - for (auto size : sizes) { - total_size += size; + bool from_same_block = false; +#ifdef USE_LINKED_ALLOCATOR + from_same_block = + linked_allocator->buffers_in_same_block(dest_device, dest_buffers) && + linked_allocator->buffers_in_same_block(src_device, src_buffers); +#else + from_same_block = dest_allocator->buffers_in_same_block(dest_buffers) && + src_allocator->buffers_in_same_block(src_buffers); +#endif + + if (dest_device.type == DeviceType::GPU || + src_device.type == DeviceType::GPU) { +#ifdef HAVE_CUDA + static thread_local std::vector streams; + if (streams.size() == 0) { + streams.resize(NUM_CUDA_STREAMS); + for (i32 i = 0; i < NUM_CUDA_STREAMS; ++i) { + CU_CHECK(cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking)); + } } - CU_CHECK(cudaMemcpyAsync(dest_buffers[0], src_buffers[0], total_size, - cudaMemcpyDefault, streams[0])); - CU_CHECK(cudaStreamSynchronize(streams[0])); - } else { - i32 n = dest_buffers.size(); + if (from_same_block) { + memcpy_buffer(dest_buffers[0], dest_device, src_buffers[0], src_device, + total_size); + } else { + i32 n = dest_buffers.size(); - for (i32 i = 0; i < n; ++i) { - CU_CHECK(cudaMemcpyAsync(dest_buffers[i], src_buffers[i], sizes[i], - cudaMemcpyDefault, - streams[i % NUM_CUDA_STREAMS])); + for (i32 i = 0; i < n; ++i) { + memcpy_buffer(dest_buffers[i], dest_device, src_buffers[i], src_device, + sizes[i]); + } } - - for (i32 i = 0; i < std::min(n, NUM_CUDA_STREAMS); ++i) { - cudaStreamSynchronize(streams[i]); +#else + LOG(FATAL) << "Cuda not installed"; +#endif + } else { + if (from_same_block) { + memcpy(dest_buffers[0], src_buffers[0], total_size); + } else { + for (i32 i = 0; i < dest_buffers.size(); ++i) { + memcpy(dest_buffers[i], src_buffers[i], sizes[i]); + } } } +} + +void copy_or_ref_buffers(std::vector& dest_buffers, + DeviceHandle dest_device, + const 
std::vector& src_buffers, + DeviceHandle src_device, + const std::vector& sizes) { + assert(src_device.can_copy_to(dest_device)); + assert(src_buffers.size() > 0); + +#ifdef USE_LINKED_ALLOCATOR + // If source buffers are all from same block, this will perform only one + // copy. However, it will perform multiple lookups in the allocator. + dest_buffers.resize(src_buffers.size()); + for (i32 i = 0; i < dest_buffers.size(); ++i) { + linked_allocator->copy_or_add_refs(src_device, src_buffers[i], 1, + dest_device, dest_buffers[i]); + } #else - LOG(FATAL) << "Cuda not installed"; + size_t total_size = 0; + for (auto size : sizes) { + total_size += size; + } + + BlockAllocator* dest_allocator = block_allocator_for_device(dest_device); + u8* dest_buff = dest_allocator->allocate(total_size, sizes.size()); + for (size_t size : sizes) { + dest_buffers.push_back(dest_buff); + dest_buff += size; + } + memcpy_vec(dest_buffers, dest_device, src_buffers, src_device, sizes); #endif } + } diff --git a/scanner/util/memory.h b/scanner/util/memory.h index f6225fd4..42844079 100644 --- a/scanner/util/memory.h +++ b/scanner/util/memory.h @@ -21,9 +21,10 @@ namespace scanner { -static const i64 DEFAULT_POOL_SIZE = 2L*1024L*1024L*1024L; +static const i64 DEFAULT_POOL_SIZE = 2L * 1024L * 1024L * 1024L; -void init_memory_allocators(MemoryPoolConfig config, std::vector gpu_device_ids); +void init_memory_allocators(MemoryPoolConfig config, + std::vector gpu_device_ids); void destroy_memory_allocators(); @@ -31,13 +32,22 @@ u8* new_buffer(DeviceHandle device, size_t size); u8* new_block_buffer(DeviceHandle device, size_t size, i32 refs); +void add_buffer_ref(DeviceHandle device, u8* buffer); + +void add_buffer_refs(DeviceHandle device, u8* buffer, i32 refs); + void delete_buffer(DeviceHandle device, u8* buffer); void memcpy_buffer(u8* dest_buffer, DeviceHandle dest_device, - const u8* src_buffer, DeviceHandle src_device, - size_t size); + const u8* src_buffer, DeviceHandle src_device, size_t 
size); + +void memcpy_vec(std::vector& dest_buffers, DeviceHandle dest_device, + const std::vector& src_buffers, DeviceHandle src_device, + const std::vector& sizes); -void memcpy_vec(std::vector dest_buffers, DeviceHandle dest_device, - const std::vector src_buffers, DeviceHandle src_device, - std::vector sizes); +void copy_or_ref_buffers(std::vector& dest_buffers, + DeviceHandle dest_device, + const std::vector& src_buffers, + DeviceHandle src_device, + const std::vector& sizes); } diff --git a/scanner/util/opencv.cpp b/scanner/util/opencv.cpp index 7cf2852e..696da546 100644 --- a/scanner/util/opencv.cpp +++ b/scanner/util/opencv.cpp @@ -1,7 +1,7 @@ #include "scanner/util/opencv.h" #include "scanner/api/kernel.h" -#include "scanner/engine/db.h" +#include "scanner/engine/metadata.h" #include "scanner/util/image.h" #ifdef HAVE_CUDA @@ -10,28 +10,96 @@ namespace scanner { -cv::Mat bytesToImage(u8 *buf, const FrameInfo &metadata) { +int frame_to_cv_type(FrameType type, int channels) { + int cv_type; + switch (type) { + case FrameType::U8: { + cv_type = CV_8U; + break; + } + case FrameType::F32: { + cv_type = CV_32F; + break; + } + case FrameType::F64: { + cv_type = CV_64F; + break; + } + } + return CV_MAKETYPE(cv_type, channels); +} + +FrameType cv_to_frame_type(int t) { + FrameType type; + switch (t) { + case CV_8U: { + type = FrameType::U8; + break; + } + case CV_32F: { + type = FrameType::F32; + break; + } + case CV_64F: { + type = FrameType::F64; + break; + } + default: { LOG(FATAL) << "Unsupported OpenCV type: " << t; } + } + return type; +} + +FrameInfo mat_to_frame_info(const cv::Mat& mat) { + return FrameInfo(mat.rows, mat.cols, mat.channels(), + cv_to_frame_type(mat.depth())); +} + +cv::Mat frame_to_mat(const Frame* frame) { return frame_to_mat((Frame*)frame); } + +cv::Mat frame_to_mat(Frame* frame) { + return cv::Mat(frame->height(), frame->width(), + frame_to_cv_type(frame->type, frame->channels()), frame->data); +} + +cv::Mat bytesToImage(u8* buf, const 
FrameInfo& metadata) { return cv::Mat(metadata.height(), metadata.width(), CV_8UC3, buf); } #ifdef HAVE_CUDA -cvc::GpuMat bytesToImage_gpu(u8 *buf, const FrameInfo &metadata) { +cvc::GpuMat frame_to_gpu_mat(const Frame* frame) { + return frame_to_gpu_mat((Frame*)frame); +} + +cvc::GpuMat frame_to_gpu_mat(Frame* frame) { + return cvc::GpuMat(frame->height(), frame->width(), + frame_to_cv_type(frame->type, frame->channels()), + frame->data); +} + +cvc::GpuMat bytesToImage_gpu(u8* buf, const FrameInfo& metadata) { return cvc::GpuMat(metadata.height(), metadata.width(), CV_8UC3, buf); } -cudaError_t convertNV12toRGBA(const cv::cuda::GpuMat &in, - cv::cuda::GpuMat &outFrame, int width, int height, - cv::cuda::Stream &stream) { + +FrameInfo gpu_mat_to_frame_info(const cv::cuda::GpuMat& mat) { + return FrameInfo(mat.channels(), mat.cols, mat.rows, + cv_to_frame_type(mat.depth())); +} + + +cudaError_t convertNV12toRGBA(const cv::cuda::GpuMat& in, + cv::cuda::GpuMat& outFrame, int width, int height, + cv::cuda::Stream& stream) { cudaStream_t s = cv::cuda::StreamAccessor::getStream(stream); return convertNV12toRGBA(in.ptr(), in.step, outFrame.ptr(), outFrame.step, width, height, s); } -cudaError_t convertRGBInterleavedToPlanar(const cv::cuda::GpuMat &in, - cv::cuda::GpuMat &outFrame, int width, +cudaError_t convertRGBInterleavedToPlanar(const cv::cuda::GpuMat& in, + cv::cuda::GpuMat& outFrame, int width, int height, - cv::cuda::Stream &stream) { + cv::cuda::Stream& stream) { cudaStream_t s = cv::cuda::StreamAccessor::getStream(stream); return convertRGBInterleavedToPlanar(in.ptr(), in.step, outFrame.ptr(), outFrame.step, diff --git a/scanner/util/opencv.h b/scanner/util/opencv.h index bd4283c6..5b8a1c53 100644 --- a/scanner/util/opencv.h +++ b/scanner/util/opencv.h @@ -15,14 +15,22 @@ #pragma once +#include "scanner/api/frame.h" #include "scanner/util/common.h" #include namespace scanner { -namespace proto { -class FrameInfo; -} + +int frame_to_cv_type(FrameType type, int 
channels = 1); + +FrameType cv_to_frame_type(int type); + +FrameInfo mat_to_frame_info(const cv::Mat& mat); + +cv::Mat frame_to_mat(const Frame* frame); + +cv::Mat frame_to_mat(Frame* frame); cv::Mat bytesToImage(u8* buf, const proto::FrameInfo& metadata); } @@ -37,8 +45,14 @@ namespace scanner { class InputFormat; +cvc::GpuMat frame_to_gpu_mat(const Frame* frame); + +cvc::GpuMat frame_to_gpu_mat(Frame* frame); + cvc::GpuMat bytesToImage_gpu(u8* buf, const proto::FrameInfo& metadata); +FrameInfo gpu_mat_to_frame_info(const cv::cuda::GpuMat& mat); + cudaError_t convertNV12toRGBA( const cv::cuda::GpuMat& in, cv::cuda::GpuMat& outFrame, int width, int height, cv::cuda::Stream& stream = cv::cuda::Stream::Null()); diff --git a/scanner/util/profiler.cpp b/scanner/util/profiler.cpp index a70a32a6..4d05cd5b 100644 --- a/scanner/util/profiler.cpp +++ b/scanner/util/profiler.cpp @@ -15,6 +15,7 @@ #include "scanner/util/profiler.h" #include "scanner/util/storehouse.h" +#include "storehouse/storage_backend.h" #include #include @@ -24,22 +25,20 @@ namespace scanner { Profiler::Profiler(timepoint_t base_time) : base_time_(base_time), lock_(0) {} -Profiler::Profiler(const Profiler &other) - : base_time_(other.base_time_), records_(other.records_), lock_(0) {} +Profiler::Profiler(const Profiler& other) + : base_time_(other.base_time_), records_(other.records_), lock_(0) {} -Profiler::~Profiler(void) {} - -const std::vector &Profiler::get_records() const { +const std::vector& Profiler::get_records() const { return records_; } -const std::map &Profiler::get_counters() const { +const std::map& Profiler::get_counters() const { return counters_; } -void write_profiler_to_file(storehouse::WriteFile *file, int64_t node, +void write_profiler_to_file(storehouse::WriteFile* file, int64_t node, std::string type_name, std::string tag, - int64_t worker_num, const Profiler &profiler) { + int64_t worker_num, const Profiler& profiler) { // Write worker header information // Node s_write(file, 
node); @@ -50,27 +49,28 @@ void write_profiler_to_file(storehouse::WriteFile *file, int64_t node, // Worker number s_write(file, worker_num); // Intervals - const std::vector &records = + const std::vector& records = profiler.get_records(); // Perform dictionary compression on interval key names uint8_t record_key_id = 0; std::map key_names; for (size_t j = 0; j < records.size(); j++) { - const std::string &key = records[j].key; + const std::string& key = records[j].key; if (key_names.count(key) == 0) { key_names.insert({key, record_key_id++}); } } if (key_names.size() > std::pow(2, sizeof(record_key_id) * 8)) { - fprintf(stderr, "WARNING: Number of record keys (%lu) greater than " - "max key id (%lu). Recorded intervals will alias in " - "profiler file.\n", + fprintf(stderr, + "WARNING: Number of record keys (%lu) greater than " + "max key id (%lu). Recorded intervals will alias in " + "profiler file.\n", key_names.size(), std::pow(2, sizeof(record_key_id) * 8)); } // Write out key name dictionary int64_t num_keys = static_cast(key_names.size()); s_write(file, num_keys); - for (auto &kv : key_names) { + for (auto& kv : key_names) { std::string key = kv.first; uint8_t key_index = kv.second; s_write(file, key); @@ -80,7 +80,7 @@ void write_profiler_to_file(storehouse::WriteFile *file, int64_t node, int64_t num_records = static_cast(records.size()); s_write(file, num_records); for (size_t j = 0; j < records.size(); j++) { - const scanner::Profiler::TaskRecord &record = records[j]; + const scanner::Profiler::TaskRecord& record = records[j]; uint8_t key_index = key_names[record.key]; int64_t start = record.start; int64_t end = record.end; @@ -89,10 +89,10 @@ void write_profiler_to_file(storehouse::WriteFile *file, int64_t node, s_write(file, end); } // S_Write out counters - const std::map &counters = profiler.get_counters(); + const std::map& counters = profiler.get_counters(); int64_t num_counters = static_cast(counters.size()); s_write(file, num_counters); - for 
(auto &kv : counters) { + for (auto& kv : counters) { s_write(file, kv.first); s_write(file, kv.second); } diff --git a/scanner/util/profiler.h b/scanner/util/profiler.h index b8674b54..66a2bdc2 100644 --- a/scanner/util/profiler.h +++ b/scanner/util/profiler.h @@ -16,7 +16,6 @@ #pragma once #include "scanner/util/util.h" -#include "storehouse/storage_backend.h" #include #include @@ -24,6 +23,10 @@ #include #include +namespace storehouse { +class WriteFile; +} + namespace scanner { class Profiler { @@ -32,8 +35,6 @@ class Profiler { Profiler(const Profiler& other); - ~Profiler(void); - void add_interval(const std::string& key, timepoint_t start, timepoint_t end); void increment(const std::string& key, int64_t value); diff --git a/scanner/util/progress_bar.cpp b/scanner/util/progress_bar.cpp deleted file mode 100644 index c1421b5f..00000000 --- a/scanner/util/progress_bar.cpp +++ /dev/null @@ -1,108 +0,0 @@ -#include "scanner/util/progress_bar.h" -#include -#include -namespace scanner { - -ProgressBar::ProgressBar() {} - -ProgressBar::ProgressBar(u64 n_, const char* description_, - std::ostream& out_) { - n = n_; - frequency_update = n_; - description = description_; - out = &out_; - - unit_bar = "="; - unit_space = " "; - desc_width = - std::strlen(description); // character width of description field -} - -void ProgressBar::SetFrequencyUpdate(u64 frequency_update_) { - if (frequency_update_ > n) { - frequency_update = n; // prevents crash if freq_updates_ > n_ - } else { - frequency_update = frequency_update_; - } -} - -void ProgressBar::SetStyle(const char* unit_bar_, const char* unit_space_) { - unit_bar = unit_bar_; - unit_space = unit_space_; -} - -int ProgressBar::GetConsoleWidth() { - int width; - -#ifdef _WINDOWS - CONSOLE_SCREEN_BUFFER_INFO csbi; - GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi); - width = csbi.srWindow.Right - csbi.srWindow.Left; -#else - struct winsize win; - ioctl(0, TIOCGWINSZ, &win); - width = win.ws_col; -#endif - 
- return width; -} - -int ProgressBar::GetBarLength() { - // get console width and according adjust the length of the progress bar - - int bar_length = static_cast( - (GetConsoleWidth() - desc_width - CHARACTER_WIDTH_PERCENTAGE) / 2.); - - return bar_length; -} - -void ProgressBar::ClearBarField() { - for (int i = 0; i < GetConsoleWidth(); ++i) { - *out << " "; - } - *out << "\r" << std::flush; -} - -void ProgressBar::Progressed(u64 idx_) { - if (!isatty(fileno(stdin))) { - return; - } - try { - if (idx_ > n) throw idx_; - - // determines whether to update the progress bar from frequency_update - if ((idx_ != n) && (idx_ % (n / frequency_update) != 0)) return; - - // calculate the size of the progress bar - int bar_size = GetBarLength(); - - // calculate percentage of progress - double progress_percent = idx_ * TOTAL_PERCENTAGE / n; - - // calculate the percentage value of a unit bar - double percent_per_unit_bar = TOTAL_PERCENTAGE / bar_size; - - // display progress bar - *out << " " << description << " ["; - - for (int bar_length = 0; bar_length <= bar_size - 1; ++bar_length) { - if (bar_length * percent_per_unit_bar < progress_percent) { - *out << unit_bar; - } else { - *out << unit_space; - } - } - - *out << "]" << std::setw(CHARACTER_WIDTH_PERCENTAGE + 1) - << std::setprecision(1) << std::fixed << progress_percent << "%\r" - << std::flush; - } catch (u64 e) { - ClearBarField(); - std::cerr << "PROGRESS_BAR_EXCEPTION: _idx (" << e - << ") went out of bounds, greater than n (" << n << ")." 
- << std::endl - << std::flush; - } -} - -} diff --git a/scanner/util/progress_bar.h b/scanner/util/progress_bar.h deleted file mode 100644 index 58060a9b..00000000 --- a/scanner/util/progress_bar.h +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once - -#include "scanner/util/common.h" -#include - -#include -#include -#include - -#define TOTAL_PERCENTAGE 100.0 -#define CHARACTER_WIDTH_PERCENTAGE 4 - -namespace scanner { - -class ProgressBar { - public: - ProgressBar(); - ProgressBar(u64 n_, const char* description_ = "", - std::ostream& out_ = std::cerr); - - void SetFrequencyUpdate(u64 frequency_update_); - void SetStyle(const char* unit_bar_, const char* unit_space_); - - void Progressed(u64 idx_); - - private: - u64 n; - unsigned int desc_width; - u64 frequency_update; - std::ostream* out; - - const char* description; - const char* unit_bar; - const char* unit_space; - - void ClearBarField(); - int GetConsoleWidth(); - int GetBarLength(); -}; - -} diff --git a/scanner/util/queue.h b/scanner/util/queue.h index 94fc67ec..4b1b3a4c 100644 --- a/scanner/util/queue.h +++ b/scanner/util/queue.h @@ -39,11 +39,16 @@ class Queue { void pop(T& item); + void peek(T& item); + void clear(); + void wait_until_empty(); + private: i32 max_size_; std::mutex mutex_; + std::condition_variable empty_; std::condition_variable not_empty_; std::condition_variable not_full_; std::deque data_; diff --git a/scanner/util/queue.inl b/scanner/util/queue.inl index 484a5f7e..b223e1d0 100644 --- a/scanner/util/queue.inl +++ b/scanner/util/queue.inl @@ -67,9 +67,11 @@ bool Queue::try_pop(T& item) { } else { item = data_.front(); data_.pop_front(); - lock.unlock(); not_full_.notify_one(); + if (size() <= 0) { + empty_.notify_all(); + } return true; } } @@ -85,9 +87,24 @@ void Queue::pop(T& item) { data_.pop_front(); lock.unlock(); + if (size() <= 0) { + empty_.notify_all(); + } not_full_.notify_one(); } +template +void Queue::peek(T& item) { + std::unique_lock lock(mutex_); + pop_waiters_++; + 
not_empty_.wait(lock, [this]{ return data_.size() > 0; }); + pop_waiters_--; + + item = data_.front(); + + lock.unlock(); +} + template void Queue::clear() { std::unique_lock lock(mutex_); @@ -97,4 +114,10 @@ void Queue::clear() { not_full_.notify_one(); } +template +void Queue::wait_until_empty() { + std::unique_lock lock(mutex_); + empty_.wait(lock, [this]{ return data_.size() <= 0; }); +} + } diff --git a/scanner/util/serialize.h b/scanner/util/serialize.h index c657869e..91cc9c2c 100644 --- a/scanner/util/serialize.h +++ b/scanner/util/serialize.h @@ -35,7 +35,7 @@ inline T deser(const u8*& buffer, size_t& size_left) { template void serialize_proto(const T& element, u8*& buffer, size_t& size) { - i32 element_size = element.ByteSize(); + size_t element_size = element.ByteSizeLong(); buffer = new_buffer(CPU_DEVICE, size); size = element_size; element.SerializeToArray(buffer, element_size); @@ -53,7 +53,7 @@ void serialize_proto_vector(const std::vector& elements, u8*& buffer, size_t& size) { size = sizeof(size_t); for (auto& e : elements) { - size += e.ByteSize() + sizeof(i32); + size += e.ByteSizeLong() + sizeof(size_t); } buffer = new_buffer(CPU_DEVICE, size); @@ -62,9 +62,9 @@ void serialize_proto_vector(const std::vector& elements, u8*& buffer, buf += sizeof(size_t); for (size_t i = 0; i < elements.size(); ++i) { const T& e = elements[i]; - i32 element_size = e.ByteSize(); - *((i32*)buf) = element_size; - buf += sizeof(i32); + size_t element_size = e.ByteSizeLong(); + *((size_t*)buf) = element_size; + buf += sizeof(size_t); e.SerializeToArray(buf, element_size); buf += element_size; } @@ -74,10 +74,10 @@ template void serialize_proto_vector_of_vectors( const std::vector>& elements, u8*& buffer, size_t& size) { size = sizeof(size_t); - for (auto &vec_of_e : elements) { + for (auto& vec_of_e : elements) { size += sizeof(size_t); - for (auto &e : vec_of_e) { - size += e.ByteSize() + sizeof(i32); + for (auto& e : vec_of_e) { + size += e.ByteSizeLong() + 
sizeof(size_t); } } buffer = new_buffer(CPU_DEVICE, size); @@ -91,9 +91,9 @@ void serialize_proto_vector_of_vectors( buf += sizeof(size_t); for (size_t j = 0; j < vec_of_e.size(); ++j) { const T& e = vec_of_e[j]; - i32 element_size = e.ByteSize(); - *((i32*)buf) = element_size; - buf += sizeof(i32); + size_t element_size = e.ByteSizeLong(); + *((size_t*)buf) = element_size; + buf += sizeof(size_t); e.SerializeToArray(buf, element_size); buf += element_size; } @@ -106,7 +106,7 @@ std::vector deserialize_proto_vector(const u8* buffer, size_t size) { size_t num_elements = deser(buf, size); std::vector elements; for (size_t i = 0; i < num_elements; ++i) { - i32 element_size = deser(buf, size); + size_t element_size = deser(buf, size); assert(size >= element_size); T e; e.ParseFromArray(buf, element_size); @@ -122,6 +122,11 @@ inline void serialize_bbox_vector(const std::vector& bboxes, serialize_proto_vector(bboxes, buffer, size); } +inline std::vector deserialize_bbox_vector(const u8* buffer, + size_t size) { + return deserialize_proto_vector(buffer, size); +} + // inline void serialize_decode_args(const DecodeArgs& args, u8*& buffer, // size_t& size) { // size = args.ByteSize(); diff --git a/scanner/util/storehouse.h b/scanner/util/storehouse.h index 4a59d4e4..73034260 100644 --- a/scanner/util/storehouse.h +++ b/scanner/util/storehouse.h @@ -23,7 +23,8 @@ namespace scanner { -inline void s_write(storehouse::WriteFile* file, const u8* buffer, size_t size) { +inline void s_write(storehouse::WriteFile* file, const u8* buffer, + size_t size) { storehouse::StoreResult result; EXP_BACKOFF(file->append(size, buffer), result); exit_on_error(result); @@ -40,10 +41,9 @@ inline void s_write(storehouse::WriteFile* file, const std::string& s) { } inline void s_read(storehouse::RandomReadFile* file, u8* buffer, size_t size, - u64& pos) { - VLOG(1) << "Reading " << file->path() - << " (size " << size - << ", pos " << pos << ")"; + u64& pos) { + VLOG(1) << "Reading " << 
file->path() << " (size " << size << ", pos " << pos + << ")"; storehouse::StoreResult result; size_t size_read; EXP_BACKOFF(file->read(pos, size, buffer, size_read), result); diff --git a/scanner/util/thread_pool.h b/scanner/util/thread_pool.h new file mode 100644 index 00000000..5bbddf52 --- /dev/null +++ b/scanner/util/thread_pool.h @@ -0,0 +1,99 @@ +/* Taken from https://github.com/progschj/ThreadPool */ +#ifndef THREAD_POOL_H +#define THREAD_POOL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class ThreadPool { +public: + ThreadPool(size_t); + template + auto enqueue(F&& f, Args&&... args) + -> std::future::type>; + ~ThreadPool(); +private: + // need to keep track of threads so we can join them + std::vector< std::thread > workers; + // the task queue + std::queue< std::function > tasks; + + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) + : stop(false) +{ + for(size_t i = 0;i task; + + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, + [this]{ return this->stop || !this->tasks.empty(); }); + if(this->stop && this->tasks.empty()) + return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + } + ); +} + +// add new work item to the pool +template +auto ThreadPool::enqueue(F&& f, Args&&... args) + -> std::future::type> +{ + using return_type = typename std::result_of::type; + + auto task = std::make_shared< std::packaged_task >( + std::bind(std::forward(f), std::forward(args)...) 
+ ); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if(stop) + throw std::runtime_error("enqueue on stopped ThreadPool"); + + tasks.emplace([task](){ (*task)(); }); + } + condition.notify_one(); + return res; +} + +// the destructor joins all threads +inline ThreadPool::~ThreadPool() +{ + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for(std::thread &worker: workers) + worker.join(); +} + +#endif diff --git a/scanner/util/util.h b/scanner/util/util.h index a435d00b..4e8b3f96 100644 --- a/scanner/util/util.h +++ b/scanner/util/util.h @@ -15,16 +15,22 @@ #pragma once +#include #include #include +#include +#include #include +#include +#include #include #include +#include #include -#include +#include #include +#include #include -#include namespace scanner { @@ -51,8 +57,8 @@ inline double nano_since(timepoint_t then) { /////////////////////////////////////////////////////////////////////////////// /// String processing -inline void split(const std::string &s, char delim, - std::vector &elems) { +inline void split(const std::string& s, char delim, + std::vector& elems) { std::stringstream ss; ss.str(s); std::string item; @@ -61,7 +67,7 @@ inline void split(const std::string &s, char delim, } } -inline std::vector split(const std::string &s, char delim) { +inline std::vector split(const std::string& s, char delim) { std::vector elems; split(s, delim, elems); return elems; @@ -89,4 +95,123 @@ template T nano_to_ms(T ns) { return ns / 1000000; } + +template +class Condition { + public: + Condition(const T& v) : value_(v) {} + + T get() { + return value_.load(); + } + + void set(const T& v) { + std::unique_lock lock(m_); + value_ = v; + lock.unlock(); + cv_.notify_all(); + } + + bool test_and_set(const T& test, const T& set) { + std::unique_lock lock(m_); + if (value_ == test) { + value_ = set; + lock.unlock(); + cv_.notify_all(); + return 
true; + } + return false; + } + + bool wait_and_set(const T& wait_for, const T& set_to) { + std::unique_lock lock(m_); + if (value_ == wait_for) { + value_ = set_to; + lock.unlock(); + cv_.notify_all(); + return true; + } + T temp = value_.load(); + cv_.wait(lock, [&] { return value_ == wait_for; }); + value_ = set_to; + lock.unlock(); + cv_.notify_all(); + return; + } + + T wait_for_change(const T& v) { + std::unique_lock lock(m_); + if (value_ != v) return value_; + cv_.wait(lock, [&] { return value_ != v; }); + return value_; + } + + void wait_until_changed_to(const T& v) { + std::unique_lock lock(m_); + if (value_ == v) return; + cv_.wait(lock, [&] { return value_ == v; }); + } + + void wait_until_changed_to_for(const T& v, int ms) { + std::unique_lock lock(m_); + if (value_ == v) return; + cv_.wait_for(lock, std::chrono::milliseconds(ms), + [&] { return value_ = v; }); + } + + private: + std::mutex m_; + std::condition_variable cv_; + std::atomic value_; +}; + +class Flag { + public: + void set() { + std::unique_lock lock(m_); + bit_ = true; + lock.unlock(); + cv_.notify_all(); + } + + bool raised() { return bit_.load(); } + + void wait() { + std::unique_lock lock(m_); + cv_.wait(lock, [&] { return bit_.load(); }); + } + + void wait_for(int ms) { + std::unique_lock lock(m_); + cv_.wait_for(lock, std::chrono::milliseconds(ms), + [&] { return bit_.load(); }); + } + + private: + std::mutex m_; + std::condition_variable cv_; + std::atomic bit_{false}; +}; +/////////////////////////////////////////////////////////////////////////////// +/// Debugging utils + +// Hacky way to print a stack trace while running. Useful right before +// a LOG(FATAL) or other type of fatal event. 
+inline void print_trace() { + char pid_buf[30]; + sprintf(pid_buf, "%d", getpid()); + char name_buf[512]; + name_buf[readlink("/proc/self/exe", name_buf, 511)] = 0; + prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); + int child_pid = fork(); + if (!child_pid) { + dup2(2, 1); // redirect output to stderr + fprintf(stdout, "stack trace for %s pid=%s\n", name_buf, pid_buf); + execlp("gdb", "gdb", "--batch", "-n", "-ex", "thread apply all bt", + name_buf, pid_buf, NULL); + abort(); /* If gdb failed to start */ + } else { + waitpid(child_pid, NULL, 0); + } +} } diff --git a/scanner/video/CMakeLists.txt b/scanner/video/CMakeLists.txt index 21a3a3b2..a25379fe 100644 --- a/scanner/video/CMakeLists.txt +++ b/scanner/video/CMakeLists.txt @@ -1,6 +1,8 @@ set(SOURCE_FILES + h264_byte_stream_index_creator.cpp decoder_automata.cpp - video_decoder.cpp) + video_decoder.cpp + video_encoder.cpp) if (BUILD_CUDA) add_definitions(-DHAVE_NVIDIA_VIDEO_HARDWARE) @@ -15,7 +17,8 @@ if (MFX_FOUND) endif() list(APPEND SOURCE_FILES - software/software_video_decoder.cpp) + software/software_video_decoder.cpp + software/software_video_encoder.cpp) add_library(video OBJECT ${SOURCE_FILES}) @@ -23,8 +26,9 @@ add_library(video OBJECT set_source_files_properties(${PROTO_SRCS} ${GRPC_PROTO_SRCS} PROPERTIES GENERATED TRUE) -add_executable(DecoderAutomataTest decoder_automata_test.cpp) -target_link_libraries(DecoderAutomataTest - ${GTEST_LIBRARIES} ${GTEST_LIB_MAIN} - scanner) -add_test(DecoderAutomataTest DecoderAutomataTest) +# TODO(apoms): fix since change in decoder API +# add_executable(DecoderAutomataTest decoder_automata_test.cpp) +# target_link_libraries(DecoderAutomataTest +# ${GTEST_LIBRARIES} ${GTEST_LIB_MAIN} +# scanner) +# add_test(DecoderAutomataTest DecoderAutomataTest) diff --git a/scanner/video/decoder_automata.cpp b/scanner/video/decoder_automata.cpp index ffd910cd..97b4c623 100644 --- a/scanner/video/decoder_automata.cpp +++ b/scanner/video/decoder_automata.cpp @@ -26,12 +26,15 
@@ namespace internal { DecoderAutomata::DecoderAutomata(DeviceHandle device_handle, i32 num_devices, VideoDecoderType decoder_type) - : device_handle_(device_handle), num_devices_(num_devices), - decoder_type_(decoder_type), - decoder_(VideoDecoder::make_from_config(device_handle, - num_devices, - decoder_type)), - feeder_waiting_(false), not_done_(true), frames_retrieved_(0) { + : device_handle_(device_handle), + num_devices_(num_devices), + decoder_type_(decoder_type), + decoder_(VideoDecoder::make_from_config(device_handle, num_devices, + decoder_type)), + feeder_waiting_(false), + not_done_(true), + frames_retrieved_(0), + skip_frames_(false) { feeder_thread_ = std::thread(&DecoderAutomata::feeder, this); } @@ -51,13 +54,28 @@ DecoderAutomata::~DecoderAutomata() { not_done_ = false; feeder_waiting_ = false; } + wake_feeder_.notify_one(); feeder_thread_.join(); + + for (auto& args : encoded_data_) { + delete_buffer(CPU_DEVICE, (u8*)args.encoded_video()); + } } void DecoderAutomata::initialize( - const std::vector &encoded_data) { + const std::vector& encoded_data) { assert(!encoded_data.empty()); + while (decoder_->discard_frame()) { + } + + std::unique_lock lk(feeder_mutex_); + wake_feeder_.wait(lk, [this] { return feeder_waiting_.load(); }); + + for (auto& args : encoded_data_) { + delete_buffer(CPU_DEVICE, (u8*)args.encoded_video()); + } + encoded_data_ = encoded_data; frame_size_ = encoded_data[0].width() * encoded_data[0].height() * 3; current_frame_ = encoded_data[0].start_keyframe(); @@ -65,31 +83,17 @@ void DecoderAutomata::initialize( retriever_data_idx_.store(0, std::memory_order_release); retriever_valid_idx_ = 0; - FrameInfo info; - info.set_width(encoded_data[0].width()); - info.set_height(encoded_data[0].height()); + FrameInfo info(encoded_data[0].height(), encoded_data[0].width(), 3, + FrameType::U8); - while (decoder_->discard_frame()) { - } - - std::unique_lock lk(feeder_mutex_); - wake_feeder_.wait(lk, [this] { return feeder_waiting_.load(); 
}); - - if (info_.width() != info.width() - || info_.height() != info.height()) { + if (info_ != info) { decoder_->configure(info); } if (frames_retrieved_ > 0) { decoder_->feed(nullptr, 0, true); } - feeder_data_idx_.store(0, std::memory_order_release); - feeder_buffer_offset_.store(0, std::memory_order_release); - feeder_next_keyframe_.store(encoded_data[0].keyframes(1), - std::memory_order_release); - feeder_data_idx_.store(0); - feeder_buffer_offset_.store(0); - feeder_next_keyframe_.store(encoded_data[0].keyframes(1)); + set_feeder_idx(0); info_ = info; std::atomic_thread_fence(std::memory_order_release); seeking_ = false; @@ -99,10 +103,6 @@ void DecoderAutomata::get_frames(u8* buffer, i32 num_frames) { i64 total_frames_decoded = 0; i64 total_frames_used = 0; - // profiler_->add_interval("decode", start, now()); - // profiler_->increment("effective_frames", total_frames_used); - // profiler_->increment("decoded_frames", total_frames_decoded); - auto start = now(); // Wait until feeder is waiting @@ -112,6 +112,14 @@ void DecoderAutomata::get_frames(u8* buffer, i32 num_frames) { wake_feeder_.wait(lk, [this] { return feeder_waiting_.load(); }); } + if (encoded_data_.size() > feeder_data_idx_) { + // Make sure to not feed seek packet if we reached end of stream + if (seeking_) { + decoder_->feed(nullptr, 0, true); + seeking_ = false; + } + } + // Start up feeder thread { std::unique_lock lk(feeder_mutex_); @@ -121,19 +129,22 @@ void DecoderAutomata::get_frames(u8* buffer, i32 num_frames) { } wake_feeder_.notify_one(); + if (profiler_) { + profiler_->add_interval("get_frames_wait", start, now()); + } + while (frames_retrieved_ < frames_to_get_) { if (decoder_->decoded_frames_buffered() > 0) { + auto iter = now(); // New frames bool more_frames = true; while (more_frames && frames_retrieved_ < frames_to_get_) { - const auto &valid_frames = + const auto& valid_frames = encoded_data_[retriever_data_idx_].valid_frames(); assert(valid_frames.size() > 
retriever_valid_idx_.load()); assert(current_frame_ <= valid_frames.Get(retriever_valid_idx_)); - // printf("has buffered frames, curr %d, next %d\n", - // current_frame_, valid_frames.Get(retriever_valid_idx_)); if (current_frame_ == valid_frames.Get(retriever_valid_idx_)) { - u8 *decoded_buffer = buffer + frames_retrieved_ * frame_size_; + u8* decoded_buffer = buffer + frames_retrieved_ * frame_size_; more_frames = decoder_->get_frame(decoded_buffer, frame_size_); retriever_valid_idx_++; if (retriever_valid_idx_ == valid_frames.size()) { @@ -144,9 +155,9 @@ void DecoderAutomata::get_frames(u8* buffer, i32 num_frames) { // Trigger feeder to start again and set ourselves to the // start of that keyframe if (retriever_data_idx_ < encoded_data_.size()) { - // Wait until feeder is waiting { - // Wait until frames are being requested + // Wait until feeder is waiting + // skip_frames_ = true; std::unique_lock lk(feeder_mutex_); wake_feeder_.wait(lk, [this, &total_frames_decoded] { while (decoder_->discard_frame()) { @@ -154,6 +165,12 @@ void DecoderAutomata::get_frames(u8* buffer, i32 num_frames) { } return feeder_waiting_.load(); }); + // skip_frames_ = false; + } + + if (seeking_) { + decoder_->feed(nullptr, 0, true); + seeking_ = false; } { @@ -163,6 +180,7 @@ void DecoderAutomata::get_frames(u8* buffer, i32 num_frames) { encoded_data_[retriever_data_idx_].keyframes(0) - 1; } wake_feeder_.notify_one(); + more_frames = false; } else { assert(frames_retrieved_ + 1 == frames_to_get_); } @@ -183,12 +201,23 @@ void DecoderAutomata::get_frames(u8* buffer, i32 num_frames) { // printf("curr frame %d, frames decoded %d\n", current_frame_, // total_frames_decoded); } + if (profiler_) { + profiler_->add_interval("iter", iter, now()); + } } std::this_thread::yield(); } decoder_->wait_until_frames_copied(); - // printf("total frames decoded %d\n", total_frames_decoded); - // printf("total frames used %d\n", total_frames_used); + if (profiler_) { + 
profiler_->add_interval("get_frames", start, now()); + profiler_->increment("frames_used", total_frames_used); + profiler_->increment("frames_decoded", total_frames_decoded); + } +} + +void DecoderAutomata::set_profiler(Profiler* profiler) { + profiler_ = profiler; + decoder_->set_profiler(profiler); } void DecoderAutomata::feeder() { @@ -215,52 +244,56 @@ void DecoderAutomata::feeder() { continue; } - if (seeking_) { - decoder_->feed(nullptr, 0, true); - seeking_ = false; + if (profiler_) { + profiler_->increment("frames_fed", frames_fed); } - frames_fed = 0; bool seen_metadata = false; while (frames_retrieved_ < frames_to_get_) { i32 frames_to_wait = 8; while (frames_retrieved_ < frames_to_get_ && - decoder_->decoded_frames_buffered() > frames_to_wait) { + decoder_->decoded_frames_buffered() > frames_to_wait) { wake_feeder_.notify_one(); std::this_thread::yield(); } + if (skip_frames_) { + seen_metadata = false; + seeking_ = true; + set_feeder_idx(feeder_data_idx_ + 1); + break; + } frames_fed++; i32 fdi = feeder_data_idx_.load(std::memory_order_acquire); - const u8 *encoded_buffer = - (const u8 *)encoded_data_[fdi].mutable_encoded_video()->data(); - size_t encoded_buffer_size = - encoded_data_[fdi].mutable_encoded_video()->size(); + const u8* encoded_buffer = (const u8*)encoded_data_[fdi].encoded_video(); + size_t encoded_buffer_size = encoded_data_[fdi].encoded_video_size(); i32 encoded_packet_size = 0; - const u8 *encoded_packet = NULL; + const u8* encoded_packet = NULL; if (feeder_buffer_offset_ < encoded_buffer_size) { - encoded_packet_size = *reinterpret_cast( - encoded_buffer + feeder_buffer_offset_); - feeder_buffer_offset_ += sizeof(i32); + u64 start_keyframe = encoded_data_[fdi].start_keyframe(); + encoded_packet_size = encoded_data_[fdi].sample_sizes().Get( + feeder_current_frame_ - start_keyframe); encoded_packet = encoded_buffer + feeder_buffer_offset_; - assert(encoded_packet_size < encoded_buffer_size); + assert(0 <= encoded_packet_size && + 
encoded_packet_size < encoded_buffer_size); feeder_buffer_offset_ += encoded_packet_size; // printf("encoded packet size %d, ptr %p\n", encoded_packet_size, // encoded_packet); } if (seen_metadata && encoded_packet_size > 0) { - const u8 *start_buffer = encoded_packet; + const u8* start_buffer = encoded_packet; i32 original_size = encoded_packet_size; while (encoded_packet_size > 0) { - const u8 *nal_start; + const u8* nal_start; i32 nal_size; next_nal(encoded_packet, encoded_packet_size, nal_start, nal_size); if (encoded_packet_size == 0) { break; } i32 nal_type = get_nal_unit_type(nal_start); + i32 nal_ref = get_nal_ref_idc(nal_start); if (is_vcl_nal(nal_type)) { encoded_packet = nal_start -= 3; encoded_packet_size = nal_size + encoded_packet_size + 3; @@ -270,6 +303,20 @@ void DecoderAutomata::feeder() { } decoder_->feed(encoded_packet, encoded_packet_size, false); + + if (feeder_current_frame_ == feeder_next_frame_) { + feeder_valid_idx_++; + if (feeder_valid_idx_ < + encoded_data_[feeder_data_idx_].valid_frames_size()) { + feeder_next_frame_ = + encoded_data_[feeder_data_idx_].valid_frames(feeder_valid_idx_); + } else { + // Done + feeder_next_frame_ = -1; + } + } + feeder_current_frame_++; + // Set a discontinuity if we sent an empty packet to reset // the stream next time if (encoded_packet_size == 0) { @@ -277,19 +324,25 @@ void DecoderAutomata::feeder() { // Reached the end of a decoded segment so wait for decoder to flush // before moving onto next segment seen_metadata = false; - feeder_data_idx_ += 1; - feeder_buffer_offset_ = 0; seeking_ = true; - if (feeder_data_idx_ < encoded_data_.size()) { - feeder_next_keyframe_ = encoded_data_[feeder_data_idx_].keyframes(1); - } + set_feeder_idx(feeder_data_idx_ + 1); break; } else { seen_metadata = true; } std::this_thread::yield(); } - //printf("frames fed %d\n", frames_fed); + } +} + +void DecoderAutomata::set_feeder_idx(i32 data_idx) { + feeder_data_idx_ = data_idx; + feeder_valid_idx_ = 0; + 
feeder_buffer_offset_ = 0; + if (feeder_data_idx_ < encoded_data_.size()) { + feeder_current_frame_ = encoded_data_[feeder_data_idx_].keyframes(0); + feeder_next_frame_ = encoded_data_[feeder_data_idx_].valid_frames(0); + feeder_next_keyframe_ = encoded_data_[feeder_data_idx_].keyframes(1); } } } diff --git a/scanner/video/decoder_automata.h b/scanner/video/decoder_automata.h index e59357ff..3993bd06 100644 --- a/scanner/video/decoder_automata.h +++ b/scanner/video/decoder_automata.h @@ -17,10 +17,10 @@ #include "scanner/video/video_decoder.h" -#include -#include #include #include +#include +#include namespace scanner { namespace internal { @@ -29,7 +29,8 @@ class DecoderAutomata { DecoderAutomata() = delete; DecoderAutomata(const DecoderAutomata&) = delete; DecoderAutomata(const DecoderAutomata&& other) = delete; -public: + + public: DecoderAutomata(DeviceHandle device_handle, i32 num_devices, VideoDecoderType decoder_type); ~DecoderAutomata(); @@ -38,11 +39,17 @@ class DecoderAutomata { void get_frames(u8* buffer, i32 num_frames); -private: + void set_profiler(Profiler* profiler); + + private: void feeder(); + void set_feeder_idx(i32 data_idx); + const i32 MAX_BUFFERED_FRAMES = 8; + Profiler* profiler_ = nullptr; + DeviceHandle device_handle_; i32 num_devices_; VideoDecoderType decoder_type_; @@ -51,7 +58,7 @@ class DecoderAutomata { std::thread feeder_thread_; std::atomic not_done_; - FrameInfo info_; + FrameInfo info_{}; size_t frame_size_; i32 current_frame_; std::atomic reset_current_frame_; @@ -64,14 +71,17 @@ class DecoderAutomata { std::atomic retriever_data_idx_; std::atomic retriever_valid_idx_; + std::atomic skip_frames_; std::atomic seeking_; std::atomic feeder_data_idx_; + std::atomic feeder_valid_idx_; + std::atomic feeder_current_frame_; + std::atomic feeder_next_frame_; + std::atomic feeder_buffer_offset_; std::atomic feeder_next_keyframe_; std::mutex feeder_mutex_; std::condition_variable wake_feeder_; - }; - } } diff --git 
a/scanner/video/decoder_automata_test.cpp b/scanner/video/decoder_automata_test.cpp index c63108bc..94c7cde6 100644 --- a/scanner/video/decoder_automata_test.cpp +++ b/scanner/video/decoder_automata_test.cpp @@ -21,11 +21,20 @@ #include +extern "C" { +#include "libavcodec/avcodec.h" +} + namespace scanner { namespace internal { TEST(DecoderAutomata, GetAllFrames) { + avcodec_register_all(); + + MemoryPoolConfig config; + init_memory_allocators(config, {}); std::unique_ptr sc( storehouse::StorageConfig::make_posix_config()); + auto storage = storehouse::StorageBackend::make_from_config(sc.get()); VideoDecoderType decoder_type = VideoDecoderType::SOFTWARE; DeviceHandle device = CPU_DEVICE; @@ -35,6 +44,9 @@ TEST(DecoderAutomata, GetAllFrames) { VideoMetadata video_meta = read_video_metadata(storage, download_video_meta(short_video)); std::vector video_bytes = read_entire_file(download_video(short_video)); + u8* video_buffer = new_buffer(CPU_DEVICE, video_bytes.size()); + memcpy_buffer(video_buffer, CPU_DEVICE, video_bytes.data(), CPU_DEVICE, + video_bytes.size()); std::vector args; args.emplace_back(); @@ -46,13 +58,12 @@ TEST(DecoderAutomata, GetAllFrames) { for (i64 r = 0; r < video_meta.frames(); ++r) { decode_args.add_valid_frames(r); } - for (i64 k : video_meta.keyframe_positions()) { - decode_args.add_keyframes(k); - } - for (i64 k : video_meta.keyframe_byte_offsets()) { - decode_args.add_keyframe_byte_offsets(k); - } - decode_args.set_encoded_video(video_bytes.data(), video_bytes.size()); + // for (i64 k : video_meta.keyframe_indices()) { + // decode_args.add_keyframes(k); + // decode_args.add_keyframe_byte_offsets(video_meta.sample_offsets().at(k)); + // } + decode_args.set_encoded_video((i64)video_buffer); + decode_args.set_encoded_video_size(video_bytes.size()); decoder->initialize(args); @@ -63,11 +74,17 @@ TEST(DecoderAutomata, GetAllFrames) { delete decoder; delete storage; + destroy_memory_allocators(); } TEST(DecoderAutomata, GetStridedFrames) { + 
avcodec_register_all(); + + MemoryPoolConfig config; + init_memory_allocators(config, {}); std::unique_ptr sc( storehouse::StorageConfig::make_posix_config()); + auto storage = storehouse::StorageBackend::make_from_config(sc.get()); VideoDecoderType decoder_type = VideoDecoderType::SOFTWARE; DeviceHandle device = CPU_DEVICE; @@ -77,6 +94,9 @@ TEST(DecoderAutomata, GetStridedFrames) { VideoMetadata video_meta = read_video_metadata(storage, download_video_meta(short_video)); std::vector video_bytes = read_entire_file(download_video(short_video)); + u8* video_buffer = new_buffer(CPU_DEVICE, video_bytes.size()); + memcpy_buffer(video_buffer, CPU_DEVICE, video_bytes.data(), CPU_DEVICE, + video_bytes.size()); std::vector args; args.emplace_back(); @@ -85,16 +105,15 @@ TEST(DecoderAutomata, GetStridedFrames) { decode_args.set_height(video_meta.height()); decode_args.set_start_keyframe(0); decode_args.set_end_keyframe(video_meta.frames()); - for (i64 r = 0; r < video_meta.frames(); r+=2) { + for (i64 r = 0; r < video_meta.frames(); r += 2) { decode_args.add_valid_frames(r); } - for (i64 k : video_meta.keyframe_positions()) { - decode_args.add_keyframes(k); - } - for (i64 k : video_meta.keyframe_byte_offsets()) { - decode_args.add_keyframe_byte_offsets(k); - } - decode_args.set_encoded_video(video_bytes.data(), video_bytes.size()); + // for (i64 k : video_meta.keyframe_indices()) { + // decode_args.add_keyframes(k); + // decode_args.add_keyframe_byte_offsets(video_meta.sample_offsets().at(k)); + // } + decode_args.set_encoded_video((i64)video_buffer); + decode_args.set_encoded_video_size(video_bytes.size()); decoder->initialize(args); @@ -105,7 +124,7 @@ TEST(DecoderAutomata, GetStridedFrames) { delete decoder; delete storage; + destroy_memory_allocators(); } - } } diff --git a/scanner/video/h264_byte_stream_index_creator.cpp b/scanner/video/h264_byte_stream_index_creator.cpp new file mode 100644 index 00000000..abdc4b13 --- /dev/null +++ 
b/scanner/video/h264_byte_stream_index_creator.cpp @@ -0,0 +1,232 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "scanner/video/h264_byte_stream_index_creator.h" +#include "scanner/util/common.h" +#include "scanner/util/storehouse.h" +#include "scanner/util/util.h" + +#include "storehouse/storage_backend.h" +#include "storehouse/storage_config.h" + +#include +#include + +// For video +extern "C" { +#include "libavcodec/avcodec.h" +#include "libavfilter/avfilter.h" +#include "libavformat/avformat.h" +#include "libavformat/avio.h" +#include "libavutil/error.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libswscale/swscale.h" +} + +#include +#include + +using storehouse::StoreResult; +using storehouse::WriteFile; +using storehouse::RandomReadFile; + +namespace scanner { +namespace internal { + +H264ByteStreamIndexCreator::H264ByteStreamIndexCreator(WriteFile* b) + : demuxed_bytestream_(b) {} + +bool H264ByteStreamIndexCreator::feed_packet(u8* data, size_t size) { + u8* orig_data = data; + i32 orig_size = size; + + i64 nal_bytestream_offset = bytestream_pos_; + + VLOG(2) << "new packet " << nal_bytestream_offset; + bool insert_sps_nal = false; + // Parse NAL unit + const u8* nal_parse = data; + i32 size_left = size; + i32 nals_parsed = 0; + + i32 write_size = 0; + while (size_left > 3) { + const u8* nal_start = nullptr; + i32 nal_size = 0; + next_nal(nal_parse, 
size_left, nal_start, nal_size); + + if (size_left < 0 || nal_size < 1) { + continue; + } + + i32 nal_ref_idc = (*nal_start >> 5); + i32 nal_unit_type = (*nal_start) & 0x1F; + VLOG(2) << "frame " << frame_ << ", nal size " << nal_size + << ", nal_ref_idc " << nal_ref_idc << ", nal unit " + << nal_unit_type; + if (nal_ref_idc == 0) { + num_non_ref_frames_ += 1; + } + if (nal_unit_type > 4) { + if (!in_meta_packet_sequence_) { + meta_packet_sequence_start_offset_ = nal_bytestream_offset; + VLOG(2) << "in meta sequence " << nal_bytestream_offset; + in_meta_packet_sequence_ = true; + saw_sps_nal_ = false; + } + } + std::vector rbsp_buffer; + rbsp_buffer.reserve(64 * 1024); + u32 consecutive_zeros = 0; + i32 bytes = nal_size - 1; + const u8* pb = nal_start + 1; + while (bytes > 0) { + /* Copy the byte into the rbsp, unless it + * is the 0x03 in a 0x000003 */ + if (consecutive_zeros < 2 || *pb != 0x03) { + rbsp_buffer.push_back(*pb); + } + if (*pb == 0) { + ++consecutive_zeros; + } else { + consecutive_zeros = 0; + } + ++pb; + --bytes; + } + + // We need to track the last SPS NAL because some streams do + // not insert an SPS every keyframe and we need to insert it + // ourselves. 
+ // fprintf(stderr, "nal_size %d, rbsp size %lu\n", nal_size, + // rbsp_buffer.size()); + const u8* rbsp_start = rbsp_buffer.data(); + i32 rbsp_size = rbsp_buffer.size(); + + // SPS + if (nal_unit_type == 7) { + saw_sps_nal_ = true; + i32 offset = 8; + GetBitsState gb; + gb.buffer = rbsp_start; + gb.offset = 0; + SPS sps; + if (!parse_sps(gb, sps)) { + error_message_ = "Failed to parse sps"; + return false; + } + i32 sps_id = sps.sps_id; + sps_map_[sps_id] = sps; + last_sps_ = sps.sps_id; + + sps_nal_bytes_[sps_id].clear(); + sps_nal_bytes_[sps_id].insert(sps_nal_bytes_[sps_id].end(), nal_start - 3, + nal_start + nal_size + 3); + VLOG(2) << "Last SPS NAL (" << sps_id << ", " << offset << ")" + << " seen at frame " << frame_; + } + // PPS + if (nal_unit_type == 8) { + GetBitsState gb; + gb.buffer = rbsp_start; + gb.offset = 0; + PPS pps; + if (!parse_pps(gb, pps)) { + error_message_ = "Failed to parse pps"; + return false; + } + pps_map_[pps.pps_id] = pps; + last_pps_ = pps.pps_id; + saw_pps_nal_ = true; + i32 pps_id = pps.pps_id; + pps_nal_bytes_[pps_id].clear(); + pps_nal_bytes_[pps_id].insert(pps_nal_bytes_[pps_id].end(), nal_start - 3, + nal_start + nal_size + 3); + VLOG(2) << "PPS id " << pps.pps_id << ", SPS id " << pps.sps_id + << ", frame " << frame_; + } + if (is_vcl_nal(nal_unit_type)) { + assert(last_pps_ != -1); + assert(last_sps_ != -1); + GetBitsState gb; + gb.buffer = nal_start; + gb.offset = 8; + SliceHeader sh; + if (!parse_slice_header(gb, sps_map_.at(last_sps_), pps_map_, + nal_unit_type, nal_ref_idc, sh)) { + error_message_ = "Failed to parse slice header"; + return false; + } + // printf("ref_idx_l0 %d, ref_idx_l1 %d\n", + // sh.num_ref_idx_l0_active, sh.num_ref_idx_l1_active); + if (frame_ == 0 || is_new_access_unit(sps_map_, pps_map_, prev_sh_, sh)) { + frame_++; + size_t bytestream_offset; + sample_offsets_.push_back(nal_bytestream_offset); + size_t total_size = 0; + if (nal_unit_type == 5) { + // Insert an SPS NAL if we did not see one in 
the meta packet + // sequence + keyframe_indices_.push_back(frame_ - 1); + saw_sps_nal_ = false; + VLOG(2) << "keyframe " << frame_ - 1 << ", byte offset " + << meta_packet_sequence_start_offset_; + + // Insert metadata + VLOG(2) << "inserting sps and pss nals"; + i32 size = orig_size; + for (auto& kv : sps_nal_bytes_) { + auto& sps_nal = kv.second; + size += static_cast(sps_nal.size()); + } + for (auto& kv : pps_nal_bytes_) { + auto& pps_nal = kv.second; + size += static_cast(pps_nal.size()); + } + + for (auto& kv : sps_nal_bytes_) { + auto& sps_nal = kv.second; + s_write(demuxed_bytestream_, sps_nal.data(), sps_nal.size()); + } + for (auto& kv : pps_nal_bytes_) { + auto& pps_nal = kv.second; + s_write(demuxed_bytestream_, pps_nal.data(), pps_nal.size()); + } + // Append the packet to the stream + s_write(demuxed_bytestream_, orig_data, orig_size); + + bytestream_pos_ += size; + + total_size = size; + } else { + // Append the packet to the stream + s_write(demuxed_bytestream_, orig_data, orig_size); + + bytestream_pos_ += orig_size; + + total_size = orig_size; + } + sample_sizes_.push_back(total_size); + } + in_meta_packet_sequence_ = false; + prev_sh_ = sh; + } + nals_parsed_++; + } + return true; +} +} +} diff --git a/scanner/video/h264_byte_stream_index_creator.h b/scanner/video/h264_byte_stream_index_creator.h new file mode 100644 index 00000000..48f0736f --- /dev/null +++ b/scanner/video/h264_byte_stream_index_creator.h @@ -0,0 +1,76 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "scanner/api/database.h" +#include "scanner/util/common.h" +#include "scanner/util/h264.h" + +#include "storehouse/storage_backend.h" +#include "storehouse/storage_config.h" + +#include + +namespace scanner { +namespace internal { + +class H264ByteStreamIndexCreator { + public: + H264ByteStreamIndexCreator(storehouse::WriteFile* demuxed_bytestream); + + bool feed_packet(u8* data, size_t size); + + const std::vector& metadata_bytes() { return metadata_bytes_; } + const std::vector& sample_offsets() { return sample_offsets_; } + const std::vector& sample_sizes() { return sample_sizes_; } + const std::vector& keyframe_indices() { return keyframe_indices_; } + + i32 frames() { return frame_; }; + i32 num_non_ref_frames() { return num_non_ref_frames_; }; + i32 nals_parsed() { return nals_parsed_; }; + i64 bytestream_pos() { return bytestream_pos_; } + + std::string error_message() { return error_message_; } + + private: + std::string error_message_; + + storehouse::WriteFile* demuxed_bytestream_; + + u64 bytestream_pos_ = 0; + std::vector metadata_bytes_; + std::vector sample_offsets_; + std::vector sample_sizes_; + std::vector keyframe_indices_; + + i64 frame_ = 0; + bool in_meta_packet_sequence_ = false; + i64 meta_packet_sequence_start_offset_ = 0; + bool saw_sps_nal_ = false; + bool saw_pps_nal_ = false; + std::map sps_map_; + std::map pps_map_; + u32 last_sps_ = -1; + u32 last_pps_ = -1; + std::map> sps_nal_bytes_; + std::map> pps_nal_bytes_; + SliceHeader prev_sh_; + + i32 num_non_ref_frames_ = 0; + i32 nals_parsed_ = 0; +}; +} +} diff --git a/scanner/video/intel/intel_video_decoder.cpp b/scanner/video/intel/intel_video_decoder.cpp index f314244a..6c14e5f0 100644 --- a/scanner/video/intel/intel_video_decoder.cpp +++ b/scanner/video/intel/intel_video_decoder.cpp @@ -37,8 +37,12 @@ namespace scanner { 
/////////////////////////////////////////////////////////////////////////////// /// IntelVideoDecoder IntelVideoDecoder::IntelVideoDecoder(int device_id, DeviceType output_type) - : device_id_(device_id), output_type_(output_type), codec_(nullptr), - cc_(nullptr), reset_context_(true), sws_context_(nullptr) { + : device_id_(device_id), + output_type_(output_type), + codec_(nullptr), + cc_(nullptr), + reset_context_(true), + sws_context_(nullptr) { if (output_type != DeviceType::CPU && output_type != DeviceType::GPU) { LOG(FATAL) << "Unsupported output type for intel decoder"; } @@ -69,15 +73,15 @@ IntelVideoDecoder::~IntelVideoDecoder() { avcodec_close(cc_); av_freep(&cc_); #endif - for (AVFrame *frame : frame_pool_) { + for (AVFrame* frame : frame_pool_) { av_frame_free(&frame); } - for (AVFrame *frame : decoded_frame_queue_) { + for (AVFrame* frame : decoded_frame_queue_) { av_frame_free(&frame); } } -void IntelVideoDecoder::configure(const InputFormat &metadata) { +void IntelVideoDecoder::configure(const InputFormat& metadata) { metadata_ = metadata; reset_context_ = true; @@ -87,7 +91,7 @@ void IntelVideoDecoder::configure(const InputFormat &metadata) { conversion_buffer_.resize(required_size); } -bool IntelVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, +bool IntelVideoDecoder::feed(const u8* encoded_buffer, size_t encoded_size, bool discontinuity) { // Debug read packets #if 0 @@ -122,7 +126,7 @@ bool IntelVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, } memcpy(packet_.data, encoded_buffer, encoded_size); - uint8_t *orig_data = packet_.data; + uint8_t* orig_data = packet_.data; int orig_size = packet_.size; int got_picture = 0; do { @@ -131,7 +135,7 @@ bool IntelVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, // Create a new frame if our pool is empty frame_pool_.push_back(av_frame_alloc()); } - AVFrame *frame = frame_pool_.back(); + AVFrame* frame = frame_pool_.back(); frame_pool_.pop_back(); int 
consumed_length = @@ -146,7 +150,7 @@ bool IntelVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, if (got_picture) { if (frame->buf[0] == NULL) { // Must copy packet as data is stored statically - AVFrame *cloned_frame = av_frame_clone(frame); + AVFrame* cloned_frame = av_frame_clone(frame); if (cloned_frame == NULL) { fprintf(stderr, "could not clone frame\n"); assert(false); @@ -171,7 +175,7 @@ bool IntelVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, } bool IntelVideoDecoder::discard_frame() { - AVFrame *frame = decoded_frame_queue_.front(); + AVFrame* frame = decoded_frame_queue_.front(); decoded_frame_queue_.pop_front(); av_frame_unref(frame); frame_pool_.push_back(frame); @@ -179,10 +183,10 @@ bool IntelVideoDecoder::discard_frame() { return decoded_frame_queue_.size() > 0; } -bool IntelVideoDecoder::get_frame(u8 *decoded_buffer, size_t decoded_size) { +bool IntelVideoDecoder::get_frame(u8* decoded_buffer, size_t decoded_size) { int64_t size_left = decoded_size; - AVFrame *frame = decoded_frame_queue_.front(); + AVFrame* frame = decoded_frame_queue_.front(); decoded_frame_queue_.pop_front(); if (reset_context_) { @@ -198,14 +202,14 @@ bool IntelVideoDecoder::get_frame(u8 *decoded_buffer, size_t decoded_size) { exit(EXIT_FAILURE); } - u8 *scale_buffer = nullptr; + u8* scale_buffer = nullptr; if (output_type_ == DeviceType::GPU) { scale_buffer = conversion_buffer_.data(); } else if (output_type_ == DeviceType::CPU) { scale_buffer = decoded_buffer; } - uint8_t *out_slices[4]; + uint8_t* out_slices[4]; int out_linesizes[4]; int required_size = av_image_fill_arrays( out_slices, out_linesizes, scale_buffer, AV_PIX_FMT_RGB24, diff --git a/scanner/video/nvidia/nvidia_video_decoder.cpp b/scanner/video/nvidia/nvidia_video_decoder.cpp index 9b44561c..76e6a7e2 100644 --- a/scanner/video/nvidia/nvidia_video_decoder.cpp +++ b/scanner/video/nvidia/nvidia_video_decoder.cpp @@ -31,13 +31,19 @@ namespace internal { 
NVIDIAVideoDecoder::NVIDIAVideoDecoder(int device_id, DeviceType output_type, CUcontext cuda_context) - : device_id_(device_id), output_type_(output_type), - cuda_context_(cuda_context), streams_(max_mapped_frames_), - parser_(nullptr), decoder_(nullptr), frame_queue_read_pos_(0), - frame_queue_elements_(0), last_displayed_frame_(-1) { + : device_id_(device_id), + output_type_(output_type), + cuda_context_(cuda_context), + streams_(max_mapped_frames_), + parser_(nullptr), + decoder_(nullptr), + frame_queue_read_pos_(0), + frame_queue_elements_(0), + last_displayed_frame_(-1) { CUcontext dummy; CUD_CHECK(cuCtxPushCurrent(cuda_context_)); + cudaSetDevice(device_id_); for (int i = 0; i < max_mapped_frames_; ++i) { cudaStreamCreate(&streams_[i]); @@ -48,9 +54,15 @@ NVIDIAVideoDecoder::NVIDIAVideoDecoder(int device_id, DeviceType output_type, undisplayed_frames_[i] = false; invalid_frames_[i] = false; } + + CUD_CHECK(cuCtxPopCurrent(&dummy)); + } NVIDIAVideoDecoder::~NVIDIAVideoDecoder() { + CUD_CHECK(cuCtxPushCurrent(cuda_context_)); + cudaSetDevice(device_id_); + for (int i = 0; i < max_mapped_frames_; ++i) { if (mapped_frames_[i] != 0) { CUD_CHECK(cuvidUnmapVideoFrame(decoder_, mapped_frames_[i])); @@ -58,17 +70,19 @@ NVIDIAVideoDecoder::~NVIDIAVideoDecoder() { } if (parser_) { - cuvidDestroyVideoParser(parser_); + CUD_CHECK(cuvidDestroyVideoParser(parser_)); } if (decoder_) { - cuvidDestroyDecoder(decoder_); + CUD_CHECK(cuvidDestroyDecoder(decoder_)); } for (int i = 0; i < max_mapped_frames_; ++i) { - cudaStreamDestroy(streams_[i]); + CU_CHECK(cudaStreamDestroy(streams_[i])); } + CUcontext dummy; + CUD_CHECK(cuCtxPopCurrent(&dummy)); // HACK(apoms): We are only using the primary context right now instead of // allowing the user to specify their own CUcontext. 
Thus we need to release // the primary context we retained when using the factory function to create @@ -76,11 +90,13 @@ NVIDIAVideoDecoder::~NVIDIAVideoDecoder() { CUD_CHECK(cuDevicePrimaryCtxRelease(device_id_)); } -void NVIDIAVideoDecoder::configure(const FrameInfo &metadata) { +void NVIDIAVideoDecoder::configure(const FrameInfo& metadata) { frame_width_ = metadata.width(); frame_height_ = metadata.height(); CUcontext dummy; + CUD_CHECK(cuCtxPushCurrent(cuda_context_)); + cudaSetDevice(device_id_); for (int i = 0; i < max_mapped_frames_; ++i) { if (mapped_frames_[i] != 0) { @@ -89,11 +105,11 @@ void NVIDIAVideoDecoder::configure(const FrameInfo &metadata) { } if (parser_) { - cuvidDestroyVideoParser(parser_); + CUD_CHECK(cuvidDestroyVideoParser(parser_)); } if (decoder_) { - cuvidDestroyDecoder(decoder_); + CUD_CHECK(cuvidDestroyDecoder(decoder_)); } for (int i = 0; i < max_mapped_frames_; ++i) { @@ -152,24 +168,25 @@ void NVIDIAVideoDecoder::configure(const FrameInfo &metadata) { size_t pos = 0; while (pos < metadata_packets_.size()) { int encoded_packet_size = - *reinterpret_cast(metadata_packets_.data() + pos); + *reinterpret_cast(metadata_packets_.data() + pos); pos += sizeof(int); - u8 *encoded_packet = (u8 *)(metadata_packets_.data() + pos); + u8* encoded_packet = (u8*)(metadata_packets_.data() + pos); pos += encoded_packet_size; feed(encoded_packet, encoded_packet_size); } } -bool NVIDIAVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, +bool NVIDIAVideoDecoder::feed(const u8* encoded_buffer, size_t encoded_size, bool discontinuity) { CUD_CHECK(cuCtxPushCurrent(cuda_context_)); + cudaSetDevice(device_id_); if (discontinuity) { { std::unique_lock lock(frame_queue_mutex_); while (frame_queue_elements_ > 0) { - const auto &dispinfo = frame_queue_[frame_queue_read_pos_]; + const auto& dispinfo = frame_queue_[frame_queue_read_pos_]; frame_in_use_[dispinfo.picture_index] = false; frame_queue_read_pos_ = (frame_queue_read_pos_ + 1) % 
max_output_frames_; @@ -189,17 +206,19 @@ bool NVIDIAVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, undisplayed_frames_[i] = false; } while (frame_queue_elements_ > 0) { - const auto &dispinfo = frame_queue_[frame_queue_read_pos_]; + const auto& dispinfo = frame_queue_[frame_queue_read_pos_]; frame_in_use_[dispinfo.picture_index] = false; frame_queue_read_pos_ = (frame_queue_read_pos_ + 1) % max_output_frames_; frame_queue_elements_--; } + CUcontext dummy; + CUD_CHECK(cuCtxPopCurrent(&dummy)); return false; } CUVIDSOURCEDATAPACKET cupkt = {}; cupkt.payload_size = encoded_size; - cupkt.payload = reinterpret_cast(encoded_buffer); + cupkt.payload = reinterpret_cast(encoded_buffer); if (encoded_size == 0) { cupkt.flags |= CUVID_PKT_ENDOFSTREAM; } @@ -211,9 +230,9 @@ bool NVIDIAVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, size_t pos = 0; while (pos < metadata_packets_.size()) { int encoded_packet_size = - *reinterpret_cast(metadata_packets_.data() + pos); + *reinterpret_cast(metadata_packets_.data() + pos); pos += sizeof(int); - u8 *encoded_packet = (u8 *)(metadata_packets_.data() + pos); + u8* encoded_packet = (u8*)(metadata_packets_.data() + pos); pos += encoded_packet_size; feed(encoded_packet, encoded_packet_size); @@ -229,9 +248,10 @@ bool NVIDIAVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, bool NVIDIAVideoDecoder::discard_frame() { std::unique_lock lock(frame_queue_mutex_); CUD_CHECK(cuCtxPushCurrent(cuda_context_)); + cudaSetDevice(device_id_); if (frame_queue_elements_ > 0) { - const auto &dispinfo = frame_queue_[frame_queue_read_pos_]; + const auto& dispinfo = frame_queue_[frame_queue_read_pos_]; frame_in_use_[dispinfo.picture_index] = false; frame_queue_read_pos_ = (frame_queue_read_pos_ + 1) % max_output_frames_; frame_queue_elements_--; @@ -243,9 +263,11 @@ bool NVIDIAVideoDecoder::discard_frame() { return frame_queue_elements_ > 0; } -bool NVIDIAVideoDecoder::get_frame(u8 *decoded_buffer, size_t 
decoded_size) { +bool NVIDIAVideoDecoder::get_frame(u8* decoded_buffer, size_t decoded_size) { + auto start = now(); std::unique_lock lock(frame_queue_mutex_); CUD_CHECK(cuCtxPushCurrent(cuda_context_)); + cudaSetDevice(device_id_); if (frame_queue_elements_ > 0) { CUVIDPARSERDISPINFO dispinfo = frame_queue_[frame_queue_read_pos_]; frame_queue_read_pos_ = (frame_queue_read_pos_ + 1) % max_output_frames_; @@ -270,7 +292,7 @@ bool NVIDIAVideoDecoder::get_frame(u8 *decoded_buffer, size_t decoded_size) { profiler_->add_interval("map_frame", start_map, now()); } CUdeviceptr mapped_frame = mapped_frames_[mapped_frame_index]; - CU_CHECK(convertNV12toRGBA((const u8 *)mapped_frame, pitch, decoded_buffer, + CU_CHECK(convertNV12toRGBA((const u8*)mapped_frame, pitch, decoded_buffer, frame_width_ * 3, frame_width_, frame_height_, 0)); CU_CHECK(cudaDeviceSynchronize()); @@ -286,6 +308,10 @@ bool NVIDIAVideoDecoder::get_frame(u8 *decoded_buffer, size_t decoded_size) { CUcontext dummy; CUD_CHECK(cuCtxPopCurrent(&dummy)); + if (profiler_) { + profiler_->add_interval("get_frame", start, now()); + } + return frame_queue_elements_; } @@ -295,15 +321,15 @@ int NVIDIAVideoDecoder::decoded_frames_buffered() { void NVIDIAVideoDecoder::wait_until_frames_copied() {} -int NVIDIAVideoDecoder::cuvid_handle_video_sequence(void *opaque, - CUVIDEOFORMAT *format) { - NVIDIAVideoDecoder &decoder = *reinterpret_cast(opaque); +int NVIDIAVideoDecoder::cuvid_handle_video_sequence(void* opaque, + CUVIDEOFORMAT* format) { + NVIDIAVideoDecoder& decoder = *reinterpret_cast(opaque); return 1; } -int NVIDIAVideoDecoder::cuvid_handle_picture_decode(void *opaque, - CUVIDPICPARAMS *picparams) { - NVIDIAVideoDecoder &decoder = *reinterpret_cast(opaque); +int NVIDIAVideoDecoder::cuvid_handle_picture_decode(void* opaque, + CUVIDPICPARAMS* picparams) { + NVIDIAVideoDecoder& decoder = *reinterpret_cast(opaque); int mapped_frame_index = picparams->CurrPicIdx; while (decoder.frame_in_use_[picparams->CurrPicIdx]) { @@ 
-319,8 +345,8 @@ int NVIDIAVideoDecoder::cuvid_handle_picture_decode(void *opaque, } int NVIDIAVideoDecoder::cuvid_handle_picture_display( - void *opaque, CUVIDPARSERDISPINFO *dispinfo) { - NVIDIAVideoDecoder &decoder = *reinterpret_cast(opaque); + void* opaque, CUVIDPARSERDISPINFO* dispinfo) { + NVIDIAVideoDecoder& decoder = *reinterpret_cast(opaque); if (!decoder.invalid_frames_[dispinfo->picture_index]) { { std::unique_lock lock(decoder.frame_queue_mutex_); diff --git a/scanner/video/nvidia/nvidia_video_decoder.h b/scanner/video/nvidia/nvidia_video_decoder.h index 8913256b..282b4667 100644 --- a/scanner/video/nvidia/nvidia_video_decoder.h +++ b/scanner/video/nvidia/nvidia_video_decoder.h @@ -15,10 +15,10 @@ #pragma once -#include "scanner/video/video_decoder.h" #include "scanner/api/kernel.h" #include "scanner/util/common.h" #include "scanner/util/queue.h" +#include "scanner/video/video_decoder.h" #include #include diff --git a/scanner/video/software/CMakeLists.txt b/scanner/video/software/CMakeLists.txt index 7fb6377d..97418285 100644 --- a/scanner/video/software/CMakeLists.txt +++ b/scanner/video/software/CMakeLists.txt @@ -1,2 +1,3 @@ add_library(video_software OBJECT - software_video_decoder.cpp) + software_video_decoder.cpp + software_video_encoder.cpp) diff --git a/scanner/video/software/software_video_decoder.cpp b/scanner/video/software/software_video_decoder.cpp index 35623163..6f30281c 100644 --- a/scanner/video/software/software_video_decoder.cpp +++ b/scanner/video/software/software_video_decoder.cpp @@ -40,11 +40,14 @@ namespace internal { SoftwareVideoDecoder::SoftwareVideoDecoder(i32 device_id, DeviceType output_type, i32 thread_count) - : device_id_(device_id), output_type_(output_type), codec_(nullptr), - cc_(nullptr), reset_context_(true), sws_context_(nullptr), - frame_pool_(1024), decoded_frame_queue_(1024) { - avcodec_register_all(); - + : device_id_(device_id), + output_type_(output_type), + codec_(nullptr), + cc_(nullptr), + 
reset_context_(true), + sws_context_(nullptr), + frame_pool_(1024), + decoded_frame_queue_(1024) { av_init_packet(&packet_); codec_ = avcodec_find_decoder(AV_CODEC_ID_H264); @@ -59,7 +62,8 @@ SoftwareVideoDecoder::SoftwareVideoDecoder(i32 device_id, exit(EXIT_FAILURE); } - cc_->thread_count = thread_count; + // cc_->thread_count = thread_count; + cc_->thread_count = 4; if (avcodec_open2(cc_, codec_, NULL) < 0) { fprintf(stderr, "could not open codec\n"); @@ -88,7 +92,7 @@ SoftwareVideoDecoder::~SoftwareVideoDecoder() { sws_freeContext(sws_context_); } -void SoftwareVideoDecoder::configure(const FrameInfo &metadata) { +void SoftwareVideoDecoder::configure(const FrameInfo& metadata) { metadata_ = metadata; frame_width_ = metadata_.width(); frame_height_ = metadata_.height(); @@ -100,7 +104,7 @@ void SoftwareVideoDecoder::configure(const FrameInfo &metadata) { conversion_buffer_.resize(required_size); } -bool SoftwareVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, +bool SoftwareVideoDecoder::feed(const u8* encoded_buffer, size_t encoded_size, bool discontinuity) { // Debug read packets #if 0 @@ -126,39 +130,117 @@ bool SoftwareVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, } } #endif - static thread_local i32 what = 0; if (discontinuity) { - // printf("what %d, frames %d\n", - // what, - // decoded_frame_queue_.size() + frame_pool_.size()); while (decoded_frame_queue_.size() > 0) { - AVFrame *frame; + AVFrame* frame; decoded_frame_queue_.pop(frame); av_frame_free(&frame); - what--; } while (frame_pool_.size() > 0) { - AVFrame *frame; + AVFrame* frame; frame_pool_.pop(frame); av_frame_free(&frame); - what--; } - // printf("disc, what %d\n", what); - avcodec_flush_buffers(cc_); + + packet_.data = NULL; + packet_.size = 0; + feed_packet(true); return false; } if (encoded_size > 0) { - // if (av_new_packet(&packet_, encoded_size) < 0) { - // fprintf(stderr, "could not allocate packet for feeding into decoder\n"); - // assert(false); - 
// } - // memcpy(packet_.data, encoded_buffer, encoded_size); - packet_.data = const_cast(encoded_buffer); - packet_.size = encoded_size; + if (av_new_packet(&packet_, encoded_size) < 0) { + fprintf(stderr, "could not allocate packet for feeding into decoder\n"); + assert(false); + } + memcpy(packet_.data, encoded_buffer, encoded_size); } else { packet_.data = NULL; packet_.size = 0; } + + feed_packet(false); + av_packet_unref(&packet_); + + return decoded_frame_queue_.size() > 0; +} + +bool SoftwareVideoDecoder::discard_frame() { + if (decoded_frame_queue_.size() > 0) { + AVFrame* frame; + decoded_frame_queue_.pop(frame); + av_frame_unref(frame); + frame_pool_.push(frame); + } + + return decoded_frame_queue_.size() > 0; +} + +bool SoftwareVideoDecoder::get_frame(u8* decoded_buffer, size_t decoded_size) { + int64_t size_left = decoded_size; + + AVFrame* frame; + if (decoded_frame_queue_.size() > 0) { + decoded_frame_queue_.pop(frame); + } else { + return false; + } + + if (reset_context_) { + auto get_context_start = now(); + AVPixelFormat decoder_pixel_format = cc_->pix_fmt; + sws_freeContext(sws_context_); + sws_context_ = sws_getContext( + frame_width_, frame_height_, decoder_pixel_format, frame_width_, + frame_height_, AV_PIX_FMT_RGB24, SWS_BICUBIC, NULL, NULL, NULL); + reset_context_ = false; + auto get_context_end = now(); + if (profiler_) { + profiler_->add_interval("ffmpeg:get_sws_context", get_context_start, + get_context_end); + } + } + + if (sws_context_ == NULL) { + LOG(FATAL) << "Could not get sws_context for rgb conversion"; + } + + u8* scale_buffer = decoded_buffer; + + uint8_t* out_slices[4]; + int out_linesizes[4]; + int required_size = + av_image_fill_arrays(out_slices, out_linesizes, scale_buffer, + AV_PIX_FMT_RGB24, frame_width_, frame_height_, 1); + if (required_size < 0) { + LOG(FATAL) << "Error in av_image_fill_arrays"; + } + if (required_size > decoded_size) { + LOG(FATAL) << "Decode buffer not large enough for image"; + } + auto scale_start 
= now(); + if (sws_scale(sws_context_, frame->data, frame->linesize, 0, frame->height, + out_slices, out_linesizes) < 0) { + LOG(FATAL) << "sws_scale failed"; + } + auto scale_end = now(); + + av_frame_unref(frame); + frame_pool_.push(frame); + + if (profiler_) { + profiler_->add_interval("ffmpeg:scale_frame", scale_start, scale_end); + } + + return decoded_frame_queue_.size() > 0; +} + +int SoftwareVideoDecoder::decoded_frames_buffered() { + return decoded_frame_queue_.size(); +} + +void SoftwareVideoDecoder::wait_until_frames_copied() {} + +void SoftwareVideoDecoder::feed_packet(bool flush) { #if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(57, 25, 0) auto send_start = now(); int error = avcodec_send_packet(cc_, &packet_); @@ -167,7 +249,7 @@ bool SoftwareVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, char err_msg[256]; av_strerror(error, err_msg, 256); fprintf(stderr, "Error while sending packet (%d): %s\n", error, err_msg); - assert(false); + LOG(FATAL) << "Error while sending packet"; } } auto send_end = now(); @@ -175,14 +257,11 @@ bool SoftwareVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, auto received_start = now(); bool done = false; while (!done) { - AVFrame *frame; + AVFrame* frame; { if (frame_pool_.size() <= 0) { // Create a new frame if our pool is empty frame_pool_.push(av_frame_alloc()); - what++; - // printf("what %d, frame pool %d, decoded %d\n", what, frame_pool_.size(), - // decoded_frame_queue_.size()); } frame_pool_.pop(frame); } @@ -193,8 +272,12 @@ bool SoftwareVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, break; } if (error == 0) { - //printf("decoded_frame_queue %d\n", decoded_frame_queue_.size()); - decoded_frame_queue_.push(frame); + if (!flush) { + decoded_frame_queue_.push(frame); + } else { + av_frame_unref(frame); + frame_pool_.push(frame); + } } else if (error == AVERROR(EAGAIN)) { done = true; frame_pool_.push(frame); @@ -202,7 +285,7 @@ bool SoftwareVideoDecoder::feed(const u8 
*encoded_buffer, size_t encoded_size, char err_msg[256]; av_strerror(error, err_msg, 256); fprintf(stderr, "Error while receiving frame (%d): %s\n", error, err_msg); - exit(1); + LOG(FATAL) << "Error while receiving frame"; } } auto received_end = now(); @@ -212,12 +295,12 @@ bool SoftwareVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, received_end); } #else - uint8_t *orig_data = packet_.data; + uint8_t* orig_data = packet_.data; int orig_size = packet_.size; int got_picture = 0; do { // Get frame from pool of allocated frames to decode video into - AVFrame *frame; + AVFrame* frame; { if (frame_pool_.size() <= 0) { // Create a new frame if our pool is empty @@ -237,21 +320,26 @@ bool SoftwareVideoDecoder::feed(const u8 *encoded_buffer, size_t encoded_size, av_strerror(consumed_length, err_msg, 256); fprintf(stderr, "Error while decoding frame (%d): %s\n", consumed_length, err_msg); - assert(false); + LOG(FATAL) << "Error while decoding frame"; } if (got_picture) { - if (frame->buf[0] == NULL) { - // Must copy packet as data is stored statically - AVFrame *cloned_frame = av_frame_clone(frame); - if (cloned_frame == NULL) { - fprintf(stderr, "could not clone frame\n"); - assert(false); + if (!flush) { + if (frame->buf[0] == NULL) { + // Must copy packet as data is stored statically + AVFrame* cloned_frame = av_frame_clone(frame); + if (cloned_frame == NULL) { + fprintf(stderr, "could not clone frame\n"); + assert(false); + } + decoded_frame_queue_.push(cloned_frame); + av_frame_free(&frame); + } else { + // Frame is reference counted so we can just take it directly + decoded_frame_queue_.push(frame); } - decoded_frame_queue_.push(cloned_frame); - av_frame_free(&frame); } else { - // Frame is reference counted so we can just take it directly - decoded_frame_queue_.push(frame); + av_frame_unref(frame); + frame_pool_.push(frame); } } else { frame_pool_.push(frame); @@ -262,89 +350,9 @@ bool SoftwareVideoDecoder::feed(const u8 *encoded_buffer, size_t 
encoded_size, packet_.data = orig_data; packet_.size = orig_size; #endif - av_packet_unref(&packet_); - - return decoded_frame_queue_.size() > 0; -} - -bool SoftwareVideoDecoder::discard_frame() { - if (decoded_frame_queue_.size() > 0) { - AVFrame *frame; - decoded_frame_queue_.pop(frame); - av_frame_unref(frame); - frame_pool_.push(frame); - } - - return decoded_frame_queue_.size() > 0; -} - -bool SoftwareVideoDecoder::get_frame(u8 *decoded_buffer, size_t decoded_size) { - int64_t size_left = decoded_size; - - AVFrame *frame; - if (decoded_frame_queue_.size() > 0) { - decoded_frame_queue_.pop(frame); - } else { - return false; - } - - if (reset_context_) { - auto get_context_start = now(); - AVPixelFormat decoder_pixel_format = cc_->pix_fmt; - sws_freeContext(sws_context_); - sws_context_ = sws_getContext( - frame_width_, frame_height_, decoder_pixel_format, frame_width_, - frame_height_, AV_PIX_FMT_RGB24, SWS_BICUBIC, NULL, NULL, NULL); - reset_context_ = false; - auto get_context_end = now(); - if (profiler_) { - profiler_->add_interval("ffmpeg:get_sws_context", get_context_start, - get_context_end); - } - } - - if (sws_context_ == NULL) { - fprintf(stderr, "Could not get sws_context for rgb conversion\n"); - exit(EXIT_FAILURE); - } - - u8 *scale_buffer = decoded_buffer; - - uint8_t *out_slices[4]; - int out_linesizes[4]; - int required_size = - av_image_fill_arrays(out_slices, out_linesizes, scale_buffer, - AV_PIX_FMT_RGB24, frame_width_, frame_height_, 1); - if (required_size < 0) { - fprintf(stderr, "Error in av_image_fill_arrays\n"); - exit(EXIT_FAILURE); - } - if (required_size > decoded_size) { - fprintf(stderr, "Decode buffer not large enough for image\n"); - exit(EXIT_FAILURE); - } - auto scale_start = now(); - if (sws_scale(sws_context_, frame->data, frame->linesize, 0, frame->height, - out_slices, out_linesizes) < 0) { - fprintf(stderr, "sws_scale failed\n"); - exit(EXIT_FAILURE); - } - auto scale_end = now(); - - av_frame_unref(frame); - 
frame_pool_.push(frame); - - if (profiler_) { - profiler_->add_interval("ffmpeg:scale_frame", scale_start, scale_end); + if (packet_.size == 0) { + avcodec_flush_buffers(cc_); } - - return decoded_frame_queue_.size() > 0; -} - -int SoftwareVideoDecoder::decoded_frames_buffered() { - return decoded_frame_queue_.size(); } - -void SoftwareVideoDecoder::wait_until_frames_copied() {} } } diff --git a/scanner/video/software/software_video_decoder.h b/scanner/video/software/software_video_decoder.h index df696eb9..3a460e08 100644 --- a/scanner/video/software/software_video_decoder.h +++ b/scanner/video/software/software_video_decoder.h @@ -15,9 +15,9 @@ #pragma once -#include "scanner/video/video_decoder.h" #include "scanner/api/kernel.h" #include "scanner/util/queue.h" +#include "scanner/video/video_decoder.h" extern "C" { #include "libavcodec/avcodec.h" @@ -31,8 +31,8 @@ extern "C" { } #include -#include #include +#include namespace scanner { namespace internal { @@ -59,6 +59,8 @@ class SoftwareVideoDecoder : public VideoDecoder { void wait_until_frames_copied() override; private: + void feed_packet(bool flush); + int device_id_; DeviceType output_type_; AVPacket packet_; diff --git a/scanner/video/software/software_video_encoder.cpp b/scanner/video/software/software_video_encoder.cpp new file mode 100644 index 00000000..64fea473 --- /dev/null +++ b/scanner/video/software/software_video_encoder.cpp @@ -0,0 +1,307 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "scanner/video/software/software_video_encoder.h" +#include "scanner/util/h264.h" + +extern "C" { +#include "libavcodec/avcodec.h" +#include "libavformat/avformat.h" +#include "libavutil/error.h" +#include "libavutil/frame.h" +#include "libavutil/imgutils.h" +#include "libavutil/opt.h" +#include "libswscale/swscale.h" +} + +#ifdef HAVE_CUDA +#include "scanner/util/cuda.h" +#endif + +#include + +#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(57, 5, 0) +#define PACKET_FREE(pkt) av_packet_free(&pkt); +#else +#define PACKET_FREE(pkt) \ + av_packet_unref(pkt); \ + av_freep(&pkt); +#endif + +namespace scanner { +namespace internal { + +/////////////////////////////////////////////////////////////////////////////// +/// SoftwareVideoEncoder +SoftwareVideoEncoder::SoftwareVideoEncoder(i32 device_id, + DeviceType output_type, + i32 thread_count) + : device_id_(device_id), + output_type_(output_type), + codec_(nullptr), + cc_(nullptr), + sws_context_(nullptr), + was_reset_(false), + ready_packet_queue_(1024), + frame_id_(0), + frame_(nullptr) { + avcodec_register_all(); + + codec_ = avcodec_find_encoder(AV_CODEC_ID_H264); + if (!codec_) { + fprintf(stderr, "could not find h264 encoder\n"); + exit(EXIT_FAILURE); + } + + annexb_ = av_bitstream_filter_init("h264_mp4toannexb"); +} + +SoftwareVideoEncoder::~SoftwareVideoEncoder() { + if (cc_) { +#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(55, 53, 0) + avcodec_free_context(&cc_); +#else + avcodec_close(cc_); + av_freep(&cc_); +#endif + } + if (frame_) { + av_frame_free(&frame_); + } + + if (sws_context_) { + sws_freeContext(sws_context_); + } + + av_bitstream_filter_close(annexb_); +} + +void SoftwareVideoEncoder::configure(const FrameInfo& metadata, + const EncodeOptions& opts) { + if (cc_ != NULL) { +#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(55, 53, 0) + avcodec_free_context(&cc_); +#else + 
avcodec_close(cc_); + av_freep(&cc_); +#endif + while (ready_packet_queue_.size() > 0) { + AVPacket* packet; + ready_packet_queue_.pop(packet); + PACKET_FREE(packet); + } + } + + cc_ = avcodec_alloc_context3(codec_); + if (!cc_) { + LOG(FATAL) << "could not alloc codec context"; + } + + metadata_ = metadata; + frame_width_ = metadata_.width(); + frame_height_ = metadata_.height(); + frame_id_ = 0; + + int required_size = av_image_get_buffer_size(AV_PIX_FMT_RGB24, frame_width_, + frame_height_, 1); + + cc_->thread_count = 4; + cc_->width = frame_width_; // Note Resolution must be a multiple of 2!! + cc_->height = frame_height_; // Note Resolution must be a multiple of 2!! + // TODO(apoms): figure out this fps from the input video automatically + cc_->time_base.den = 24; + cc_->time_base.num = 1; + cc_->gop_size = 120; // Intra frames per x P frames + cc_->pix_fmt = + AV_PIX_FMT_YUV420P; // Do not change this, H264 needs YUV format not RGB + if (opts.quality != -1) { + if (av_opt_set_int(cc_->priv_data, "crf", opts.quality, 0) < 0) { + LOG(FATAL) << "Could not set CRF on codec context"; + } + } + if (opts.bitrate != -1) { + cc_->bit_rate = opts.bitrate; + } + if (opts.keyframe_distance != -1) { + cc_->gop_size = opts.keyframe_distance; + } + + if (avcodec_open2(cc_, codec_, NULL) < 0) { + LOG(FATAL) << "could not open codec"; + } + + AVPixelFormat encoder_pixel_format = cc_->pix_fmt; + sws_context_ = sws_getContext( + frame_width_, frame_height_, AV_PIX_FMT_RGB24, frame_width_, + frame_height_, encoder_pixel_format, SWS_BICUBIC, NULL, NULL, NULL); + if (sws_context_ == NULL) { + LOG(FATAL) << "Could not get sws_context for rgb conversion"; + } +} + +bool SoftwareVideoEncoder::feed(const u8* frame_buffer, size_t frame_size) { + assert(frame_size > 0); + if (was_reset_) { + avcodec_flush_buffers(cc_); + } + + // Convert image into YUV format from RGB + frame_ = av_frame_alloc(); + if (!frame_) { + LOG(FATAL) << "Could not alloc frame"; + } + + frame_->format = 
cc_->pix_fmt; + frame_->width = frame_width_; + frame_->height = frame_height_; + if (av_frame_get_buffer(frame_, 32) < 0) { + LOG(FATAL) << "Could not get frame buffer"; + } + + uint8_t* out_slices[4]; + int out_linesizes[4]; + int required_size = + av_image_fill_arrays(out_slices, out_linesizes, frame_buffer, + AV_PIX_FMT_RGB24, frame_width_, frame_height_, 1); + if (required_size < 0) { + LOG(FATAL) << "Error in av_image_fill_arrays"; + } + if (required_size > frame_size) { + LOG(FATAL) << "Encode buffer not large enough for image"; + } + auto scale_start = now(); + if (sws_scale(sws_context_, out_slices, out_linesizes, 0, frame_height_, + frame_->data, frame_->linesize) < 0) { + LOG(FATAL) << "sws_scale failed"; + } + auto scale_end = now(); + if (profiler_) { + profiler_->add_interval("ffmpeg:scale_frame", scale_start, scale_end); + } + + frame_->pts = frame_id_++; + feed_frame(false); + + return ready_packet_queue_.size() > 0; +} + +bool SoftwareVideoEncoder::flush() { + feed_frame(false); + was_reset_ = true; + return ready_packet_queue_.size() > 0; +} + +bool SoftwareVideoEncoder::get_packet(u8* packet_buffer, size_t packet_size, + size_t& actual_packet_size) { + actual_packet_size = 0; + + AVPacket* packet; + if (ready_packet_queue_.size() > 0) { + ready_packet_queue_.peek(packet); + } else { + return false; + } + + u8* filtered_data; + i32 filtered_data_size; + int err = av_bitstream_filter_filter( + annexb_, cc_, NULL, &filtered_data, &filtered_data_size, packet->data, + packet->size, packet->flags & AV_PKT_FLAG_KEY); + if (err < 0) { + char err_msg[256]; + av_strerror(err, err_msg, 256); + LOG(ERROR) << "Error while filtering: " << err_msg; + exit(1); + } + + // Make sure we have space for this packet, otherwise return + actual_packet_size = filtered_data_size; + if (actual_packet_size > packet_size) { + free(filtered_data); + return true; + } + + memcpy(packet_buffer, filtered_data, filtered_data_size); + free(filtered_data); + + // Only pop packet 
when we know we can copy it out + ready_packet_queue_.pop(packet); + PACKET_FREE(packet); + + return ready_packet_queue_.size() > 0; +} + +int SoftwareVideoEncoder::decoded_packets_buffered() { + return ready_packet_queue_.size(); +} + +void SoftwareVideoEncoder::wait_until_packets_copied() {} + +void SoftwareVideoEncoder::feed_frame(bool flush) { +#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(57, 25, 0) + auto send_start = now(); + AVFrame* f = flush ? NULL : frame_; + int ret = avcodec_send_frame(cc_, f); + if (ret != AVERROR_EOF) { + if (ret < 0) { + char err_msg[256]; + av_strerror(ret, err_msg, 256); + fprintf(stderr, "Error while sending frame (%d): %s\n", ret, err_msg); + LOG(FATAL) << "Error while sending frame"; + } + } + + auto send_end = now(); + + auto receive_start = now(); + while (ret == 0) { + AVPacket* packet = av_packet_alloc(); + ret = avcodec_receive_packet(cc_, packet); + if (ret == 0) { + ready_packet_queue_.push(packet); + } else if (ret == AVERROR(EAGAIN)) { + PACKET_FREE(packet); + } else if (ret == AVERROR_EOF) { + PACKET_FREE(packet); + } else { + char err_msg[256]; + av_strerror(ret, err_msg, 256); + fprintf(stderr, "Error while receiving packet (%d): %s\n", ret, err_msg); + LOG(FATAL) << "Error while receiving packet"; + } + } + auto receive_end = now(); + + if (f) { + av_frame_free(&frame_); + frame_ = nullptr; + } +#else + auto send_start = now(); + auto send_end = now(); + auto receive_start = now(); + auto receive_end = now(); + LOG(FATAL) << "Frame output requires libavcodec >= 57.25.0 (current is " + << LIBAVCODEC_VERSION_MAJOR << "." << LIBAVCODEC_VERSION_MINOR + << "." 
<< LIBAVCODEC_VERSION_MICRO << ")"; +#endif + if (profiler_) { + profiler_->add_interval("ffmpeg:send_frame", send_start, send_end); + profiler_->add_interval("ffmpeg:receive_packet", receive_start, + receive_end); + } +} +} +} diff --git a/scanner/video/software/software_video_encoder.h b/scanner/video/software/software_video_encoder.h new file mode 100644 index 00000000..674c9b5a --- /dev/null +++ b/scanner/video/software/software_video_encoder.h @@ -0,0 +1,81 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "scanner/api/kernel.h" +#include "scanner/util/queue.h" +#include "scanner/video/video_encoder.h" + +extern "C" { +#include "libavcodec/avcodec.h" +#include "libavfilter/avfilter.h" +#include "libavformat/avformat.h" +#include "libavformat/avio.h" +#include "libavutil/error.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libswscale/swscale.h" +} + +#include +#include +#include + +namespace scanner { +namespace internal { + +/////////////////////////////////////////////////////////////////////////////// +/// SoftwareVideoEncoder +class SoftwareVideoEncoder : public VideoEncoder { + public: + SoftwareVideoEncoder(i32 device_id, DeviceType output_type, i32 thread_count); + + ~SoftwareVideoEncoder(); + + void configure(const FrameInfo& metadata, const EncodeOptions& opts) override; + + bool feed(const u8* frame_buffer, size_t frame_size) override; + + bool flush() override; + + bool get_packet(u8* packet_buffer, size_t packet_size, + size_t& actual_packet_size) override; + + int decoded_packets_buffered() override; + + void wait_until_packets_copied() override; + + private: + void feed_frame(bool flush); + + int device_id_; + DeviceType output_type_; + AVCodec* codec_; + AVCodecContext* cc_; + AVBitStreamFilterContext* annexb_; + + FrameInfo metadata_; + i32 frame_width_; + i32 frame_height_; + SwsContext* sws_context_; + bool was_reset_; + + i32 frame_id_; + AVFrame* frame_; + Queue ready_packet_queue_; +}; +} +} diff --git a/scanner/video/video_decoder.cpp b/scanner/video/video_decoder.cpp index ba115a9f..c77e8456 100644 --- a/scanner/video/video_decoder.cpp +++ b/scanner/video/video_decoder.cpp @@ -48,60 +48,59 @@ bool VideoDecoder::has_decoder_type(VideoDecoderType type) { std::vector types = VideoDecoder::get_supported_decoder_types(); - for (const VideoDecoderType &supported_type : types) { - if (type == supported_type) - return true; + for (const VideoDecoderType& supported_type : types) { + if (type == 
supported_type) return true; } return false; } -VideoDecoder *VideoDecoder::make_from_config(DeviceHandle device_handle, +VideoDecoder* VideoDecoder::make_from_config(DeviceHandle device_handle, i32 num_devices, VideoDecoderType type) { - VideoDecoder *decoder = nullptr; + VideoDecoder* decoder = nullptr; switch (type) { - case VideoDecoderType::NVIDIA: { + case VideoDecoderType::NVIDIA: { #ifdef HAVE_NVIDIA_VIDEO_HARDWARE - // HACK(apoms): we are just going to assume all processing is done in the - // default context for now and retain it ourselves. Ideally we would - // allow the user to pass in the CUcontext they want to use for - // decoding frames into but that would require providing opaque - // configuration data to this function which we are avoiding for now. - // The - // reason we are avoding it for now is that by having configuration data - // for different decoders, the client code ends up needing to do - // conditional includes depending on which decoders are available in - // order to fill in the configuration data, which is just messy. - CUD_CHECK(cuInit(0)); - CUcontext cuda_context; - CUD_CHECK(cuDevicePrimaryCtxRetain(&cuda_context, device_handle.id)); - - decoder = new NVIDIAVideoDecoder(device_handle.id, device_handle.type, - cuda_context); + // HACK(apoms): we are just going to assume all processing is done in the + // default context for now and retain it ourselves. Ideally we would + // allow the user to pass in the CUcontext they want to use for + // decoding frames into but that would require providing opaque + // configuration data to this function which we are avoiding for now. + // The + // reason we are avoding it for now is that by having configuration data + // for different decoders, the client code ends up needing to do + // conditional includes depending on which decoders are available in + // order to fill in the configuration data, which is just messy. 
+ CUD_CHECK(cuInit(0)); + CUcontext cuda_context; + CUD_CHECK(cuDevicePrimaryCtxRetain(&cuda_context, device_handle.id)); + + decoder = new NVIDIAVideoDecoder(device_handle.id, device_handle.type, + cuda_context); #else #endif - break; - } - case VideoDecoderType::INTEL: { + break; + } + case VideoDecoderType::INTEL: { #ifdef HAVE_INTEL_VIDEO_HARDWARE - decoder = new IntelVideoDecoder(device_handle.id, device_handle.type); + decoder = new IntelVideoDecoder(device_handle.id, device_handle.type); #else #endif - break; - } - case VideoDecoderType::SOFTWARE: { - decoder = new SoftwareVideoDecoder(device_handle.id, device_handle.type, - num_devices); - break; - } - default: {} + break; + } + case VideoDecoderType::SOFTWARE: { + decoder = new SoftwareVideoDecoder(device_handle.id, device_handle.type, + num_devices); + break; + } + default: {} } return decoder; } -void VideoDecoder::set_profiler(Profiler *profiler) { profiler_ = profiler; } +void VideoDecoder::set_profiler(Profiler* profiler) { profiler_ = profiler; } } } diff --git a/scanner/video/video_decoder.h b/scanner/video/video_decoder.h index 3e7224ab..9fdca5fe 100644 --- a/scanner/video/video_decoder.h +++ b/scanner/video/video_decoder.h @@ -16,7 +16,7 @@ #pragma once #include "scanner/api/kernel.h" -#include "scanner/engine/db.h" +#include "scanner/engine/metadata.h" #include "scanner/util/common.h" #include "scanner/util/profiler.h" @@ -42,8 +42,7 @@ class VideoDecoder { static bool has_decoder_type(VideoDecoderType type); static VideoDecoder* make_from_config(DeviceHandle device_handle, - i32 num_devices, - VideoDecoderType type); + i32 num_devices, VideoDecoderType type); virtual ~VideoDecoder(){}; diff --git a/scanner/video/video_encoder.cpp b/scanner/video/video_encoder.cpp new file mode 100644 index 00000000..f558af45 --- /dev/null +++ b/scanner/video/video_encoder.cpp @@ -0,0 +1,106 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + 
* you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "scanner/video/video_encoder.h" + +#ifdef HAVE_NVIDIA_VIDEO_HARDWARE +#include "scanner/util/cuda.h" +//#include "scanner/video/nvidia/nvidia_video_encoder.h" +#endif + +#ifdef HAVE_INTEL_VIDEO_HARDWARE +#include "scanner/video/intel/intel_video_encoder.h" +#endif + +#include "scanner/video/software/software_video_encoder.h" + +#include + +namespace scanner { +namespace internal { + +std::vector VideoEncoder::get_supported_encoder_types() { + std::vector encoder_types; +#ifdef HAVE_NVIDIA_VIDEO_HARDWARE +// encoder_types.push_back(VideoEncoderType::NVIDIA); +#endif +#ifdef HAVE_INTEL_VIDEO_HARDWARE + encoder_types.push_back(VideoEncoderType::INTEL); +#endif + encoder_types.push_back(VideoEncoderType::SOFTWARE); + + return encoder_types; +} + +bool VideoEncoder::has_encoder_type(VideoEncoderType type) { + std::vector types = + VideoEncoder::get_supported_encoder_types(); + + for (const VideoEncoderType& supported_type : types) { + if (type == supported_type) return true; + } + + return false; +} + +VideoEncoder* VideoEncoder::make_from_config(DeviceHandle device_handle, + i32 num_devices, + VideoEncoderType type) { + VideoEncoder* encoder = nullptr; + + switch (type) { + case VideoEncoderType::NVIDIA: { +#ifdef HAVE_NVIDIA_VIDEO_HARDWARE + // HACK(apoms): we are just going to assume all processing is done in the + // default context for now and retain it ourselves. 
Ideally we would + // allow the user to pass in the CUcontext they want to use for + // decoding frames into but that would require providing opaque + // configuration data to this function which we are avoiding for now. + // The + // reason we are avoding it for now is that by having configuration data + // for different encoders, the client code ends up needing to do + // conditional includes depending on which encoders are available in + // order to fill in the configuration data, which is just messy. + CUD_CHECK(cuInit(0)); + CUcontext cuda_context; + CUD_CHECK(cuDevicePrimaryCtxRetain(&cuda_context, device_handle.id)); + +// encoder = new NVIDIAVideoEncoder(device_handle.id, device_handle.type, +// cuda_context); +#else +#endif + break; + } + case VideoEncoderType::INTEL: { +#ifdef HAVE_INTEL_VIDEO_HARDWARE + encoder = new IntelVideoEncoder(device_handle.id, device_handle.type); +#else +#endif + break; + } + case VideoEncoderType::SOFTWARE: { + encoder = new SoftwareVideoEncoder(device_handle.id, device_handle.type, + num_devices); + break; + } + default: {} + } + + return encoder; +} + +void VideoEncoder::set_profiler(Profiler* profiler) { profiler_ = profiler; } +} +} diff --git a/scanner/video/video_encoder.h b/scanner/video/video_encoder.h new file mode 100644 index 00000000..5118888c --- /dev/null +++ b/scanner/video/video_encoder.h @@ -0,0 +1,73 @@ +/* Copyright 2016 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "scanner/api/kernel.h" +#include "scanner/engine/metadata.h" +#include "scanner/util/common.h" +#include "scanner/util/profiler.h" + +#include + +namespace scanner { +namespace internal { + +enum class VideoEncoderType { + NVIDIA, + INTEL, + SOFTWARE, +}; + +struct EncodeOptions { + i32 quality = -1; + i64 bitrate = -1; + i64 keyframe_distance = -1; +}; + +/////////////////////////////////////////////////////////////////////////////// +/// VideoEncoder +class VideoEncoder { + public: + static std::vector get_supported_encoder_types(); + + static bool has_encoder_type(VideoEncoderType type); + + static VideoEncoder* make_from_config(DeviceHandle device_handle, + i32 num_devices, VideoEncoderType type); + + virtual ~VideoEncoder(){}; + + virtual void configure(const FrameInfo& metadata, + const EncodeOptions& opts) = 0; + + virtual bool feed(const u8* frame_buffer, size_t frame_size) = 0; + + virtual bool flush() = 0; + + virtual bool get_packet(u8* decoded_buffer, size_t decoded_size, + size_t& actual_packet_size) = 0; + + virtual int decoded_packets_buffered() = 0; + + virtual void wait_until_packets_copied() = 0; + + void set_profiler(Profiler* profiler); + + protected: + Profiler* profiler_ = nullptr; +}; +} +} diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh deleted file mode 100755 index 4e8957b6..00000000 --- a/scripts/dev-setup.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -mkdir -p python/scannerpy/include -ln -fs ../../../scanner python/scannerpy/include -ln -fs ../../build python/scannerpy diff --git a/scripts/startup_node.sh b/scripts/startup_node.sh new file mode 100644 index 00000000..ad015ba5 --- /dev/null +++ b/scripts/startup_node.sh @@ -0,0 +1,16 @@ +if [ -z "$1" ] + then + echo "Usage: startup_node.sh " + exit +fi + +num_gpus=$2 +if [ -z "$2" ] + then + num_gpus=1 + echo "num_gpus not specified. 
Defaulting to 1" +fi + +gcloud compute --project "visualdb-1046" disks create "hackinstance-$1" --size "20" --zone "us-east1-d" --source-snapshot "hacksnapshot" --type "pd-standard" +gcloud beta compute --project "visualdb-1046" instances create "hackinstance-$1" --zone "us-east1-d" --machine-type "n1-standard-4" --network "default" --metadata "ssh-keys=ubuntu:ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDXJ3JrrWKc0TAM5KBXYmuTVAG06DyA8F1hHbqUULCNp767bDNN1dTF9zTo+ZDWdCuHm49XWrpRK552G8U0A55HvBEjOj4eEUSuAibd0uDAYMZr3dJNTzXNU/KfgnbJYGbRboBk3fu47D4bhKPmjX5ZDsSN++BuUYpf1bH829invPBzlGeBb/QRe3Jk9DMK/swIqFc4j6PWeOItj4/1flXFFruR/bT0p2/MIxTTAMAWlhHRYqhtia1YYMbfdv38eqZMH1GY+n7GQJTuKBDvz0qPxCus86xaE4vCawD+iQJFuD8XxppsHbc1+oCAmi5AtbUeHXjXirN95itMBi7S2evd ubuntu,node_id=$1" --maintenance-policy "TERMINATE" --service-account "50518136478-compute@developer.gserviceaccount.com" --scopes "https://www.googleapis.com/auth/cloud-platform" --accelerator type=nvidia-tesla-k80,count=$num_gpus --tags "http-server","https-server" --disk "name=hackinstance-$1,device-name=hackinstance-$1,mode=rw,boot=yes,auto-delete=yes" + diff --git a/scripts/travis-build.sh b/scripts/travis-build.sh index dd2bf5e7..535bc10a 100755 --- a/scripts/travis-build.sh +++ b/scripts/travis-build.sh @@ -1,26 +1,57 @@ #!/bin/bash +# Writing output (bell) keeps travis from timing out +# https://github.com/travis-ci/travis-ci/issues/7961 +function bell() { + while true; do + echo -e "\a" + sleep 60 + done +} +bell & + +set -e + # The Travis VM isn't big enough to hold two Docker images of Scanner, # so we have to push and delete the CPU image before building the GPU one. -set -e +if [ "$TRAVIS_BRANCH" = "master" -a "$TRAVIS_PULL_REQUEST" = "false" ]; then + PUSH=0 +else + PUSH=1 +fi build_docker() { # We add -local to make sure it doesn't run the remote image if the build fails. - if [[ "$1" -eq "cpu" ]] + if [ "$1" = "cpu" ] then - docker build -t $DOCKER_REPO:$1-local . 
--build-arg gpu=OFF - docker run $DOCKER_REPO:$1-local /bin/bash \ - -c "cd /opt/scanner/build && CTEST_OUTPUT_ON_FAILURE=1 make test" + docker build -t $DOCKER_REPO:$1-local . \ + --build-arg gpu=OFF --build-arg tag=cpu \ + -f docker/Dockerfile.scanner + # travis_wait allows tests to run for N minutes with no output + # https://docs.travis-ci.com/user/common-build-problems/#Build-times-out-because-no-output-was-received + docker run $DOCKER_REPO:$1-local /bin/bash \ + -c "cd /opt/scanner/build && CTEST_OUTPUT_ON_FAILURE=1 make test" + docker rm $(docker ps -a -f status=exited -q) else - docker build -t $DOCKER_REPO:$1-local . --build-arg gpu=ON + docker build -t $DOCKER_REPO:$1-local . \ + --build-arg gpu=ON --build-arg tag=gpu \ + -f docker/Dockerfile.scanner fi - docker tag $DOCKER_REPO:$1-local $DOCKER_REPO:$1 - docker push $DOCKER_REPO:$1 - docker rm $(docker ps -a -f status=exited -q) - docker rmi -f $DOCKER_REPO:$1 + + if [ $PUSH -eq 0 ]; then + docker tag $DOCKER_REPO:$1-local $DOCKER_REPO:$1 + docker push $DOCKER_REPO:$1 + docker rmi -f $DOCKER_REPO:$1 + fi + + docker rmi -f $DOCKER_REPO:$1-local } -docker login -u="$DOCKER_USER" -p="$DOCKER_PASS" -build_docker cpu -build_docker gpu +if [ $PUSH -eq 0 ]; then + docker login -u="$DOCKER_USER" -p="$DOCKER_PASS" +fi + +build_docker $BUILD_TYPE + +exit $? diff --git a/scripts/travis-publish.sh b/scripts/travis-publish.sh index ef8da355..d07a735f 100755 --- a/scripts/travis-publish.sh +++ b/scripts/travis-publish.sh @@ -1,3 +1,8 @@ +# Exit if this is not the master branch +if ! 
[ "$TRAVIS_BRANCH" = "master" -a "$TRAVIS_PULL_REQUEST" = "false" ]; then + exit 0 +fi + # Commit docs REPO_PATH=git@github.com:scanner-research/scanner.git HTML_PATH=build/doc/html diff --git a/setup.py b/setup.py deleted file mode 100644 index cd38826c..00000000 --- a/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -from setuptools import setup, find_packages -import os - -REQUIRED_PACKAGES = [ - 'protobuf >= 3.1.0', - 'grpcio >= 1.1.0', - 'toml >= 0.9.2', - 'enum34 >= 1.1.6', - 'numpy >= 1.12.0', - 'scipy >= 0.18.1', - 'storehouse >= 0.1.0' -] - -package_data = { - 'scannerpy': [ - 'build/*.so', - ] -} - -def get_build_dirs(d): - return [t[0]+'/*.*' for t in os.walk('build/'+d) if 'CMakeFiles' not in t[0]] - -package_data['scannerpy'] += get_build_dirs('scanner') -package_data['scannerpy'] += get_build_dirs('stdlib') -package_data['scannerpy'] += ['include/{}/*.h'.format(t[0]) - for t in os.walk('scanner')] - -setup( - name='scannerpy', - version='0.1.13', - description='Efficient video analysis at scale', - long_description='', - url='https://github.com/scanner-research/scanner', - author='Alex Poms and Will Crichton', - author_email='wcrichto@cs.stanford.edu', - - package_dir={'': 'python'}, - packages=find_packages(where='python'), - install_requires=REQUIRED_PACKAGES, - package_data=package_data, - - license='Apache 2.0', - keywords='video distributed gpu', -) diff --git a/sphinx/Makefile b/sphinx/Makefile new file mode 100644 index 00000000..4ba0d4a2 --- /dev/null +++ b/sphinx/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = scanner +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". 
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/sphinx/conf.py b/sphinx/conf.py new file mode 100644 index 00000000..dfcadbc0 --- /dev/null +++ b/sphinx/conf.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# scanner documentation build configuration file, created by +# sphinx-quickstart on Sun Nov 26 19:06:21 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('../python')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.githubpages'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. 
+# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'scanner' +copyright = '2017, Alex Poms, Will Crichton' +author = 'Alex Poms, Will Crichton' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0' +# The full version, including alpha/beta/rc tags. +release = '1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# This is required for the alabaster theme +# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars +html_sidebars = { + '**': [ + 'relations.html', # needs 'show_related': True theme option to display + 'searchbox.html', + ] +} + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'scannerdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'scanner.tex', 'scanner Documentation', + 'Alex Poms, Will Crichton', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'scanner', 'scanner Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'scanner', 'scanner Documentation', + author, 'scanner', 'One line description of project.', + 'Miscellaneous'), +] + + + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'https://docs.python.org/': None} diff --git a/sphinx/index.rst b/sphinx/index.rst new file mode 100644 index 00000000..1b1ee80f --- /dev/null +++ b/sphinx/index.rst @@ -0,0 +1,23 @@ +.. scanner documentation master file, created by + sphinx-quickstart on Sun Nov 26 19:06:21 2017. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to scanner's documentation! +=================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + intro + tutorial + modules + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/sphinx/intro.rst b/sphinx/intro.rst new file mode 100644 index 00000000..29f600fc --- /dev/null +++ b/sphinx/intro.rst @@ -0,0 +1,2 @@ +Intro +=================================== diff --git a/sphinx/modules.rst b/sphinx/modules.rst new file mode 100644 index 00000000..112d6022 --- /dev/null +++ b/sphinx/modules.rst @@ -0,0 +1,7 @@ +python +====== + +.. toctree:: + :maxdepth: 4 + + scannerpy diff --git a/sphinx/scannerpy.rst b/sphinx/scannerpy.rst new file mode 100644 index 00000000..65df9ccb --- /dev/null +++ b/sphinx/scannerpy.rst @@ -0,0 +1,133 @@ +scannerpy package +================= + +Subpackages +----------- + +.. toctree:: + + scannerpy.stdlib + +Submodules +---------- + +scannerpy\.bulk\_job module +--------------------------- + +.. automodule:: scannerpy.bulk_job + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.collection module +---------------------------- + +.. 
automodule:: scannerpy.collection + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.column module +------------------------ + +.. automodule:: scannerpy.column + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.common module +------------------------ + +.. automodule:: scannerpy.common + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.config module +------------------------ + +.. automodule:: scannerpy.config + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.database module +-------------------------- + +.. automodule:: scannerpy.database + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.job module +--------------------- + +.. automodule:: scannerpy.job + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.kernel module +------------------------ + +.. automodule:: scannerpy.kernel + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.op module +-------------------- + +.. automodule:: scannerpy.op + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.partitioner module +----------------------------- + +.. automodule:: scannerpy.partitioner + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.profiler module +-------------------------- + +.. automodule:: scannerpy.profiler + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.protobuf\_generator module +------------------------------------- + +.. automodule:: scannerpy.protobuf_generator + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.sampler module +------------------------- + +.. automodule:: scannerpy.sampler + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.table module +----------------------- + +.. automodule:: scannerpy.table + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. 
automodule:: scannerpy + :members: + :undoc-members: + :show-inheritance: diff --git a/sphinx/scannerpy.stdlib.rst b/sphinx/scannerpy.stdlib.rst new file mode 100644 index 00000000..4837bf9a --- /dev/null +++ b/sphinx/scannerpy.stdlib.rst @@ -0,0 +1,126 @@ +scannerpy\.stdlib package +========================= + +Submodules +---------- + +scannerpy\.stdlib\.bbox\_nms\_kernel module +------------------------------------------- + +.. automodule:: scannerpy.stdlib.bbox_nms_kernel + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.bboxes module +-------------------------------- + +.. automodule:: scannerpy.stdlib.bboxes + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.build\_flags module +-------------------------------------- + +.. automodule:: scannerpy.stdlib.build_flags + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.loaders module +--------------------------------- + +.. automodule:: scannerpy.stdlib.loaders + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.montage module +--------------------------------- + +.. automodule:: scannerpy.stdlib.montage + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.net\_descriptor module +----------------------------------------- + +.. automodule:: scannerpy.stdlib.net_descriptor + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.parsers module +--------------------------------- + +.. automodule:: scannerpy.stdlib.parsers + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.pipelines module +----------------------------------- + +.. automodule:: scannerpy.stdlib.pipelines + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.pose\_nms\_kernel module +------------------------------------------- + +.. 
automodule:: scannerpy.stdlib.pose_nms_kernel + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.poses module +------------------------------- + +.. automodule:: scannerpy.stdlib.poses + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.pykernel module +---------------------------------- + +.. automodule:: scannerpy.stdlib.pykernel + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.util module +------------------------------ + +.. automodule:: scannerpy.stdlib.util + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.video module +------------------------------- + +.. automodule:: scannerpy.stdlib.video + :members: + :undoc-members: + :show-inheritance: + +scannerpy\.stdlib\.writers module +--------------------------------- + +.. automodule:: scannerpy.stdlib.writers + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: scannerpy.stdlib + :members: + :undoc-members: + :show-inheritance: diff --git a/sphinx/tutorial.rst b/sphinx/tutorial.rst new file mode 100644 index 00000000..352a1d7a --- /dev/null +++ b/sphinx/tutorial.rst @@ -0,0 +1,2 @@ +Tutorial +=================================== diff --git a/stdlib/CMakeLists.txt b/stdlib/CMakeLists.txt index b996984a..88f1de17 100644 --- a/stdlib/CMakeLists.txt +++ b/stdlib/CMakeLists.txt @@ -3,9 +3,11 @@ include(${SCANNER_PATH}/cmake/Util/Op.cmake) option(BUILD_IMGPROC_OPS "" ON) option(BUILD_CAFFE_OPS "" ON) -option(BUILD_CPM2_OPS "" OFF) +option(BUILD_OPENPOSE_OPS "" ON) option(BUILD_MOTION_OPS "" ON) +option(BUILD_VIZ_OPS "" ON) option(BUILD_OPENFACE_OPS "" OFF) +option(BUILD_GIPUMA_OPS "" OFF) set(STDLIB_LIBRARIES) set(OPENCV_MAJOR_VERSION 3) @@ -43,6 +45,12 @@ else() add_definitions(-DCPU_ONLY) endif() +if (BUILD_OPENPOSE_OPS) + find_package(OpenPose REQUIRED) + include_directories(${OPENPOSE_INCLUDE_DIRS}) + list(APPEND STDLIB_LIBRARIES "${OPENPOSE_LIBRARIES}") +endif() + 
set(TARGETS) if (BUILD_CAFFE_OPS) add_subdirectory(caffe) @@ -59,11 +67,21 @@ if (BUILD_OPENFACE_OPS) list(APPEND TARGETS openface) endif() +if (BUILD_GIPUMA_OPS) + add_subdirectory(gipuma) + list(APPEND TARGETS gipuma) +endif() + if (BUILD_MOTION_OPS) add_subdirectory(motion) list(APPEND TARGETS motion) endif() +if (BUILD_VIZ_OPS) + add_subdirectory(viz) + list(APPEND TARGETS viz) +endif() + add_subdirectory(misc) list(APPEND TARGETS misc) @@ -87,7 +105,7 @@ foreach (TARGET ${TARGETS}) add_dependencies(${TARGET} stdlib_proto_files) endforeach() -target_link_libraries(stdlib PUBLIC "${STDLIB_LIBRARIES}") +target_link_libraries(stdlib PUBLIC "${STDLIB_LIBRARIES}" scanner) if (HALIDE_TARGETS) foreach(HALIDE_TARGET ${HALIDE_TARGETS}) @@ -96,6 +114,11 @@ if (HALIDE_TARGETS) endforeach() if (BUILD_CUDA) add_dependencies(stdlib scanner_halide) - target_link_libraries(stdlib PRIVATE scanner_halide) endif() endif() + +find_package(Boost COMPONENTS thread program_options regex python REQUIRED) +target_link_libraries(stdlib PUBLIC + "${Boost_LIBRAIRES}" + "${Boost_LIBRARY_DIRS}/libboost_numpy.so") +target_include_directories(stdlib PUBLIC "${Boost_INCLUDE_DIRS}") diff --git a/stdlib/caffe/CMakeLists.txt b/stdlib/caffe/CMakeLists.txt index 0a621ea9..4dc43830 100644 --- a/stdlib/caffe/CMakeLists.txt +++ b/stdlib/caffe/CMakeLists.txt @@ -3,23 +3,25 @@ set(SOURCE_FILES caffe_kernel_cpu.cpp caffe_input_kernel.cpp caffe_input_kernel_cpu.cpp - facenet_output_kernel_cpu.cpp) + facenet_input_kernel_cpu.cpp + facenet_kernel.cpp + facenet_output_kernel_cpu.cpp + yolo_output_kernel_cpu.cpp + faster_rcnn_kernel.cpp + faster_rcnn_output_kernel_cpu.cpp) if (BUILD_CUDA) list(APPEND SOURCE_FILES caffe_kernel_gpu.cpp caffe_input_kernel_gpu.cpp - facenet_input_kernel_gpu.cpp - facenet_kernel.cpp) + facenet_input_kernel_gpu.cpp) endif() -if (BUILD_CPM2_OPS) - list(APPEND SOURCE_FILES - cpm2_output_kernel_cpu.cpp) - if (BUILD_CUDA) - list(APPEND SOURCE_FILES - cpm2_input_kernel_gpu.cpp - 
cpm2_kernel.cpp) +if (BUILD_OPENPOSE_OPS) + if (NOT BUILD_CUDA) + # TODO(apoms): add warning endif() + list(APPEND SOURCE_FILES + openpose_kernel.cpp) endif() add_halide_target(caffe_input_transformer_cpu.cpp host) @@ -27,6 +29,8 @@ if (BUILD_CUDA) add_halide_target(caffe_input_transformer_gpu.cpp cuda) endif() +add_definitions(-DUSE_OPENCV) + add_library(caffe OBJECT ${SOURCE_FILES}) # TODO(wcrichto): auto add these dependencies? diff --git a/stdlib/caffe/caffe_input_kernel.cpp b/stdlib/caffe/caffe_input_kernel.cpp index 3723e7c4..b041d871 100644 --- a/stdlib/caffe/caffe_input_kernel.cpp +++ b/stdlib/caffe/caffe_input_kernel.cpp @@ -6,13 +6,13 @@ #ifdef HAVE_CUDA #include "HalideRuntimeCuda.h" -#include "scanner/engine/halide_context.h" +#include "scanner/util/halide_context.h" #endif namespace scanner { -CaffeInputKernel::CaffeInputKernel(const Kernel::Config &config) - : VideoKernel(config), device_(config.devices[0]) { +CaffeInputKernel::CaffeInputKernel(const KernelConfig& config) + : BatchedKernel(config), device_(config.devices[0]) { args_.ParseFromArray(config.args.data(), config.args.size()); if (device_.type == DeviceType::GPU) { CUDA_PROTECT({ @@ -41,7 +41,7 @@ void CaffeInputKernel::new_frame_info() { } } -void CaffeInputKernel::set_halide_buf(buffer_t &halide_buf, u8 *buf, +void CaffeInputKernel::set_halide_buf(buffer_t& halide_buf, u8* buf, size_t size) { if (device_.type == DeviceType::GPU) { CUDA_PROTECT({ @@ -58,20 +58,21 @@ void CaffeInputKernel::set_halide_buf(buffer_t &halide_buf, u8 *buf, // "You'll need to set the host field of the buffer_t structs to // something other than nullptr as that is used to indicate bounds query // calls" - Zalman Stern - halide_buf.host = (u8 *)0xdeadbeef; - }); + halide_buf.host = (u8*)0xdeadbeef; + }); } else { halide_buf.host = buf; } } -void CaffeInputKernel::unset_halide_buf(buffer_t &halide_buf) { +void CaffeInputKernel::unset_halide_buf(buffer_t& halide_buf) { if (device_.type == DeviceType::GPU) { 
CUDA_PROTECT({ halide_cuda_detach_device_ptr(nullptr, &halide_buf); }); } } -void CaffeInputKernel::transform_halide(u8 *input_buffer, u8 *output_buffer) { +void CaffeInputKernel::transform_halide(const u8* input_buffer, + u8* output_buffer) { i32 frame_width = frame_info_.width(); i32 frame_height = frame_info_.height(); size_t net_input_size = @@ -79,7 +80,8 @@ void CaffeInputKernel::transform_halide(u8 *input_buffer, u8 *output_buffer) { buffer_t input_buf = {0}, output_buf = {0}; - set_halide_buf(input_buf, input_buffer, frame_width * frame_height * 3); + set_halide_buf(input_buf, const_cast(input_buffer), + frame_width * frame_height * 3); set_halide_buf(output_buf, output_buffer, net_input_size); // Halide has the input format x * stride[0] + y * stride[1] + c * stride[2] @@ -120,7 +122,7 @@ void CaffeInputKernel::transform_halide(u8 *input_buffer, u8 *output_buffer) { unset_halide_buf(output_buf); } -void CaffeInputKernel::transform_caffe(u8 *input_buffer, u8 *output_buffer) { +void CaffeInputKernel::transform_caffe(u8* input_buffer, u8* output_buffer) { i32 frame_width = frame_info_.width(); i32 frame_height = frame_info_.height(); size_t net_input_size = @@ -137,11 +139,11 @@ void CaffeInputKernel::transform_caffe(u8 *input_buffer, u8 *output_buffer) { caffe::Blob output_blob; output_blob.Reshape(1, 3, net_input_height_, net_input_width_); - output_blob.set_cpu_data((f32 *)output_buffer); + output_blob.set_cpu_data((f32*)output_buffer); caffe::TransformationParameter param; - auto &descriptor = args_.net_descriptor(); - auto &mean_colors = descriptor.mean_colors(); + auto& descriptor = args_.net_descriptor(); + auto& mean_colors = descriptor.mean_colors(); param.set_force_color(true); if (descriptor.normalize()) { param.set_scale(1.0 / 255.0); @@ -151,30 +153,28 @@ void CaffeInputKernel::transform_caffe(u8 *input_buffer, u8 *output_buffer) { } caffe::DataTransformer transformer(param, caffe::TEST); - // transformer.Transform(input_mats, &output_blob); + 
transformer.Transform(input_mats, &output_blob); } -void CaffeInputKernel::execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) { - check_frame_info(device_, input_columns[1]); +void CaffeInputKernel::execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) { + auto& frame_col = input_columns[0]; + check_frame(device_, frame_col[0]); auto eval_start = now(); - i32 input_count = input_columns[0].rows.size(); + i32 input_count = num_rows(frame_col); size_t net_input_size = net_input_width_ * net_input_height_ * 3 * sizeof(float); set_device(); - u8 *output_block = - new_block_buffer(device_, net_input_size * input_count, input_count); - + FrameInfo info(3, net_input_height_, net_input_width_, FrameType::F32); + std::vector frames = new_frames(device_, info, input_count); for (i32 frame = 0; frame < input_count; frame++) { - u8 *input_buffer = input_columns[0].rows[frame].buffer; - u8 *output_buffer = output_block + frame * net_input_size; + const u8* input_buffer = frame_col[frame].as_const_frame()->data; + transform_halide(input_buffer, frames[frame]->data); - transform_halide(input_buffer, output_buffer); - - INSERT_ROW(output_columns[0], output_buffer, net_input_size); + insert_frame(output_columns[0], frames[frame]); } extra_inputs(input_columns, output_columns); @@ -191,5 +191,4 @@ void CaffeInputKernel::set_device() { halide_set_gpu_device(device_.id); }); } - } diff --git a/stdlib/caffe/caffe_input_kernel.h b/stdlib/caffe/caffe_input_kernel.h index 8b29e48c..fe73b4b0 100644 --- a/stdlib/caffe/caffe_input_kernel.h +++ b/stdlib/caffe/caffe_input_kernel.h @@ -1,10 +1,10 @@ #pragma once -#include "stdlib/stdlib.pb.h" -#include "scanner/api/op.h" #include "scanner/api/kernel.h" -#include "scanner/util/opencv.h" +#include "scanner/api/op.h" #include "scanner/util/cuda.h" +#include "scanner/util/opencv.h" +#include "stdlib/stdlib.pb.h" #ifdef HAVE_CUDA #include "caffe_input_transformer_gpu/caffe_input_transformer_gpu.h" 
@@ -13,9 +13,9 @@ namespace scanner { -class CaffeInputKernel : public VideoKernel { -public: - CaffeInputKernel(const Kernel::Config& config); +class CaffeInputKernel : public BatchedKernel, public VideoKernel { + public: + CaffeInputKernel(const KernelConfig& config); ~CaffeInputKernel(); void new_frame_info() override; @@ -28,10 +28,10 @@ class CaffeInputKernel : public VideoKernel { virtual void extra_inputs(const BatchedColumns& input_columns, BatchedColumns& output_columns) {} -protected: + protected: void set_halide_buf(buffer_t& halide_buf, u8* buf, size_t size); void unset_halide_buf(buffer_t& halide_buf); - void transform_halide(u8* input_buffer, u8* output_buffer); + void transform_halide(const u8* input_buffer, u8* output_buffer); void transform_caffe(u8* input_buffer, u8* output_buffer); DeviceHandle device_; @@ -42,5 +42,4 @@ class CaffeInputKernel : public VideoKernel { CUcontext context_; #endif }; - } diff --git a/stdlib/caffe/caffe_input_kernel_cpu.cpp b/stdlib/caffe/caffe_input_kernel_cpu.cpp index 0409697b..a2fc8193 100644 --- a/stdlib/caffe/caffe_input_kernel_cpu.cpp +++ b/stdlib/caffe/caffe_input_kernel_cpu.cpp @@ -2,11 +2,10 @@ namespace scanner { -REGISTER_OP(CaffeInput) - .inputs({"frame", "frame_info"}) - .outputs({"caffe_frame"}); +REGISTER_OP(CaffeInput).frame_input("frame").frame_output("caffe_frame"); REGISTER_KERNEL(CaffeInput, CaffeInputKernel) .device(DeviceType::CPU) + .batch() .num_devices(1); } diff --git a/stdlib/caffe/caffe_input_kernel_gpu.cpp b/stdlib/caffe/caffe_input_kernel_gpu.cpp index 80ac7e00..20d8495b 100644 --- a/stdlib/caffe/caffe_input_kernel_gpu.cpp +++ b/stdlib/caffe/caffe_input_kernel_gpu.cpp @@ -4,5 +4,6 @@ namespace scanner { REGISTER_KERNEL(CaffeInput, CaffeInputKernel) .device(DeviceType::GPU) + .batch() .num_devices(1); } diff --git a/stdlib/caffe/caffe_input_transformer_base.h b/stdlib/caffe/caffe_input_transformer_base.h index dc026dd4..95402859 100644 --- a/stdlib/caffe/caffe_input_transformer_base.h +++ 
b/stdlib/caffe/caffe_input_transformer_base.h @@ -11,32 +11,28 @@ Expr kernel_box(Expr x) { return select(xx <= 0.5f, 1.0f, 0.0f); } -Expr sinc(Expr x) { - return sin(float(M_PI) * x) / x; -} +Expr sinc(Expr x) { return sin(float(M_PI) * x) / x; } Expr kernel_lanczos(Expr x) { - Expr value = sinc(x) * sinc(x/3); - value = select(x == 0.0f, 1.0f, value); // Take care of singularity at zero - value = select(x > 3 || x < -3, 0.0f, value); // Clamp to zero out of bounds + Expr value = sinc(x) * sinc(x / 3); + value = select(x == 0.0f, 1.0f, value); // Take care of singularity at zero + value = select(x > 3 || x < -3, 0.0f, value); // Clamp to zero out of bounds return value; } struct KernelInfo { - const char *name; + const char* name; float size; Expr (*kernel)(Expr); }; -static KernelInfo kernelInfo[] = { - { "box", 0.5f, kernel_box }, - // { "linear", 1.0f, kernel_linear }, - // { "cubic", 2.0f, kernel_cubic }, - { "lanczos", 3.0f, kernel_lanczos } -}; +static KernelInfo kernelInfo[] = {{"box", 0.5f, kernel_box}, + // { "linear", 1.0f, kernel_linear }, + // { "cubic", 2.0f, kernel_cubic }, + {"lanczos", 3.0f, kernel_lanczos}}; class CaffeInputTransformer : public Halide::Generator { -public: + public: ImageParam input{UInt(8), 3, "input"}; Param input_width{"input_width"}, input_height{"input_height"}; Param target_width{"target_width"}, target_height{"target_height"}; @@ -54,7 +50,7 @@ class CaffeInputTransformer : public Halide::Generator { Expr scaleX = target_width / cast(input_width); Expr scaleY = target_height / cast(input_height); - const KernelInfo &info = kernelInfo[0]; + const KernelInfo& info = kernelInfo[0]; Expr kernelSizeX = info.size / scaleX; Expr kernelSizeY = info.size / scaleY; @@ -83,17 +79,16 @@ class CaffeInputTransformer : public Halide::Generator { resized_final(x, y, c) = clamp(resized_y(x, y, c), 0.0f, 255.0f); Func mean_subtract("mean_subtract"); - mean_subtract(x, y, c) = resized_final(x, y, c) - - select(c==0, mean_r, - select(c==1, 
mean_g, mean_b)); + mean_subtract(x, y, c) = + resized_final(x, y, c) - + select(c == 0, mean_r, select(c == 1, mean_g, mean_b)); Func rescaled("rescaled"); - rescaled(x, y, c) = mean_subtract(x, y, 2-c) / select(normalize, 255.0f, 1.0f); + rescaled(x, y, c) = + mean_subtract(x, y, 2 - c) / select(normalize, 255.0f, 1.0f); rescaled.bound(c, 0, 3); - input - .dim(0).set_stride(3) - .dim(2).set_stride(1); + input.dim(0).set_stride(3).dim(2).set_stride(1); Target target = Halide::get_target_from_environment(); #ifdef HALIDE_USE_GPU diff --git a/stdlib/caffe/caffe_kernel.cpp b/stdlib/caffe/caffe_kernel.cpp index f809e3d7..e1114e91 100644 --- a/stdlib/caffe/caffe_kernel.cpp +++ b/stdlib/caffe/caffe_kernel.cpp @@ -1,5 +1,5 @@ #include "stdlib/caffe/caffe_kernel.h" -#include "scanner/engine/db.h" +#include "scanner/engine/metadata.h" #include "caffe/blob.hpp" #include "caffe/common.hpp" @@ -9,6 +9,9 @@ #include "caffe/util/io.hpp" #include "toml/toml.h" +#include +#include + namespace scanner { using caffe::Blob; @@ -20,30 +23,30 @@ caffe::Caffe::Brew device_type_to_caffe_mode(DeviceType type) { caffe::Caffe::Brew caffe_type; switch (type) { - case DeviceType::GPU: - caffe_type = caffe::Caffe::GPU; - break; - case DeviceType::CPU: - caffe_type = caffe::Caffe::CPU; - break; - default: - // TODO(apoms): error message - exit(EXIT_FAILURE); - break; + case DeviceType::GPU: + caffe_type = caffe::Caffe::GPU; + break; + case DeviceType::CPU: + caffe_type = caffe::Caffe::CPU; + break; + default: + // TODO(apoms): error message + exit(EXIT_FAILURE); + break; } return caffe_type; } -proto::NetDescriptor -descriptor_from_net_file(const std::string &net_file_path) { +proto::NetDescriptor descriptor_from_net_file( + const std::string& net_file_path) { std::ifstream net_file{net_file_path}; toml::ParseResult pr = toml::parse(net_file); if (!pr.valid()) { LOG(FATAL) << pr.errorReason; } - const toml::Value &root = pr.value; + const toml::Value& root = pr.value; proto::NetDescriptor 
descriptor; @@ -96,10 +99,10 @@ descriptor_from_net_file(const std::string &net_file_path) { descriptor.set_model_path(model_path->as()); descriptor.set_model_weights_path(weights_path->as()); - for (const toml::Value &v : input_layers->as()) { + for (const toml::Value& v : input_layers->as()) { descriptor.add_input_layer_names(v.as()); } - for (const toml::Value &v : output_layers->as()) { + for (const toml::Value& v : output_layers->as()) { descriptor.add_output_layer_names(v.as()); } @@ -133,6 +136,9 @@ descriptor_from_net_file(const std::string &net_file_path) { auto pad_mod = net->find("pad_mod"); descriptor.set_pad_mod(pad_mod ? pad_mod->as() : -1); + auto uses_python = net->find("uses_python"); + descriptor.set_uses_python(uses_python ? uses_python->as() : false); + auto normalize = net->find("normalize"); descriptor.set_normalize(normalize ? normalize->as() : false); @@ -166,7 +172,7 @@ descriptor_from_net_file(const std::string &net_file_path) { float green = mean_green->as(); float red = mean_red->as(); - for (const toml::Value &v : channel_ordering->as()) { + for (const toml::Value& v : channel_ordering->as()) { std::string color = v.as(); if (color == "red") { descriptor.add_mean_colors(red); @@ -218,8 +224,8 @@ bool file_exists(const std::string& path) { return stat(path.c_str(), &buffer) == 0; } -CaffeKernel::CaffeKernel(const Kernel::Config &config) - : VideoKernel(config), device_(config.devices[0]) { +CaffeKernel::CaffeKernel(const KernelConfig& config) + : BatchedKernel(config), device_(config.devices[0]) { valid_.set_success(true); if (!args_.ParseFromArray(config.args.data(), config.args.size())) { @@ -229,24 +235,28 @@ CaffeKernel::CaffeKernel(const Kernel::Config &config) set_device(); // Initialize our network - auto &descriptor = args_.net_descriptor(); + auto& descriptor = args_.net_descriptor(); if (!file_exists(descriptor.model_path())) { - RESULT_ERROR( - &valid_, - "Model path %s does not exist.", - descriptor.model_path().c_str()); + 
RESULT_ERROR(&valid_, "Model path %s does not exist.", + descriptor.model_path().c_str()); return; } if (!file_exists(descriptor.model_weights_path())) { - RESULT_ERROR( - &valid_, - "Model weights path %s does not exist.", - descriptor.model_weights_path().c_str()); + RESULT_ERROR(&valid_, "Model weights path %s does not exist.", + descriptor.model_weights_path().c_str()); return; } + PyGILState_STATE gstate; + if (descriptor.uses_python()) { + gstate = PyGILState_Ensure(); + } net_.reset(new caffe::Net(descriptor.model_path(), caffe::TEST)); net_->CopyTrainedLayersFrom(descriptor.model_weights_path()); + if (descriptor.uses_python()) { + PyGILState_Release(gstate); + } + // Initialize memory const boost::shared_ptr> input_blob{ net_->blob_by_name(descriptor.input_layer_names(0))}; @@ -258,12 +268,11 @@ CaffeKernel::CaffeKernel(const Kernel::Config &config) if (intended_output != actual_output) { RESULT_ERROR( - &valid_, - "# output columns in net descriptor (%lu) does not match number of " - "output columns registered for op (%lu) If you have multiple net " - "outputs, you must register your own op using the CaffeKernel.", - intended_output, - actual_output); + &valid_, + "# output columns in net descriptor (%lu) does not match number of " + "output columns registered for op (%lu) If you have multiple net " + "outputs, you must register your own op using the CaffeKernel.", + intended_output, actual_output); return; } } @@ -274,12 +283,12 @@ void CaffeKernel::validate(proto::Result* result) { } void CaffeKernel::new_frame_info() { - i32 frame_width = frame_info_.width(); - i32 frame_height = frame_info_.height(); + i32 frame_width = frame_info_.shape[2]; + i32 frame_height = frame_info_.shape[1]; set_device(); - auto &descriptor = args_.net_descriptor(); + auto& descriptor = args_.net_descriptor(); assert(descriptor.input_layer_names().size() > 0); const boost::shared_ptr> input_blob{ net_->blob_by_name(descriptor.input_layer_names(0))}; @@ -324,20 +333,25 @@ void 
CaffeKernel::new_frame_info() { net_config(); } -void CaffeKernel::execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) { - check_frame_info(device_, input_columns.back()); +void CaffeKernel::execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) { + check_frame(device_, input_columns[0][0]); set_device(); - auto &descriptor = args_.net_descriptor(); + auto& descriptor = args_.net_descriptor(); std::vector>> input_blobs; - for (const std::string &name : descriptor.input_layer_names()) { + for (const std::string& name : descriptor.input_layer_names()) { input_blobs.emplace_back(net_->blob_by_name(name)); } assert(input_blobs.size() > 0); + PyGILState_STATE gstate; + if (descriptor.uses_python()) { + gstate = PyGILState_Ensure(); + } + size_t num_outputs = descriptor.output_layer_names().size(); - i32 input_count = (i32)input_columns[0].rows.size(); + i32 input_count = (i32)input_columns[0].size(); i32 batch_size = args_.batch_size(); for (i32 frame = 0; frame < input_count; frame += batch_size) { i32 batch_count = std::min(input_count - frame, batch_size); @@ -348,7 +362,7 @@ void CaffeKernel::execute(const BatchedColumns &input_columns, } for (i32 i = 0; i < input_blobs.size(); ++i) { - f32 *net_input_buffer = nullptr; + f32* net_input_buffer = nullptr; if (device_.type == DeviceType::GPU) { net_input_buffer = input_blobs[i]->mutable_gpu_data(); } else { @@ -357,43 +371,71 @@ void CaffeKernel::execute(const BatchedColumns &input_columns, size_t offset = 0; for (i32 j = 0; j < batch_count; ++j) { - memcpy_buffer((u8 *)net_input_buffer + offset, device_, - input_columns[i].rows[frame + j].buffer, device_, - input_columns[i].rows[frame + j].size); - offset += input_columns[i].rows[frame + j].size; + const Frame* fr = input_columns[i][frame + j].as_const_frame(); + memcpy_buffer((u8*)net_input_buffer + offset, device_, fr->data, + device_, fr->size()); + offset += fr->size(); } } // Compute features auto net_start = 
now(); - net_->ForwardPrefilled(); + try { + net_->ForwardPrefilled(); + } catch (boost::python::error_already_set) { + PyErr_Print(); + exit(0); + } if (profiler_) { - CUDA_PROTECT({ cudaDeviceSynchronize(); }); + // #ifdef SCANNER_PROFILING + // CUDA_PROTECT({ cudaDeviceSynchronize(); }); + // #endif profiler_->add_interval("caffe:net", net_start, now()); } // Save batch of frames i32 total_rows = num_outputs * batch_count; for (size_t i = 0; i < num_outputs; ++i) { - const std::string &output_layer_name = descriptor.output_layer_names(i); + const std::string& output_layer_name = descriptor.output_layer_names(i); const boost::shared_ptr> output_blob{ net_->blob_by_name(output_layer_name)}; - size_t output_length = output_blob->count(1); - size_t output_size = output_length * sizeof(float); - size_t total_size = output_size * batch_count; - u8 *output_block = new_block_buffer(device_, total_size, batch_count); - - u8 *src_buffer = - (u8 *)(device_.type == DeviceType::CPU ? output_blob->cpu_data() - : output_blob->gpu_data()); - memcpy_buffer(output_block, device_, src_buffer, device_, total_size); + i32 num_axes = output_blob->num_axes(); + FrameInfo info(output_blob->shape(1), + num_axes >= 3 ? output_blob->shape(2) : 1, + num_axes >= 4 ? output_blob->shape(3) : 1, FrameType::F32); + + // This is a special case to handle networks with multiple + // outputs (like FRCNN). It checks if the batch size is 1 + // but the number of outputs is higher than that and forces the + // frame shape to incorporate the batch size dimension + // (the batch dimension would usually be output as multiple frames). 
+ // if (batch_size == 1 && output_blob->shape(0) > 1) { + // info = FrameInfo(output_blob->shape(0) * output_blob->shape(1), + // info.shape[1], + // info.shape[2], + // FrameType::F32); + // } else { + // assert(batch_size == output_blob->shape(0)); + // } + u8* output_block = new_block_buffer( + device_, info.size() * batch_count, batch_count); + + u8* src_buffer = + (u8*)(device_.type == DeviceType::CPU ? output_blob->cpu_data() + : output_blob->gpu_data()); + memcpy_buffer(output_block, device_, src_buffer, device_, + info.size() * batch_count); for (i32 b = 0; b < batch_count; b++) { - output_columns[i].rows.push_back( - Row{output_block + output_size * b, output_size}); + insert_frame(output_columns[i], + new Frame(info, output_block + info.size() * b)); } } } + + if (descriptor.uses_python()) { + PyGILState_Release(gstate); + } } void CaffeKernel::set_device() { @@ -408,5 +450,4 @@ void CaffeKernel::set_device() { }); } } - } diff --git a/stdlib/caffe/caffe_kernel.h b/stdlib/caffe/caffe_kernel.h index ace12497..ba5daa75 100644 --- a/stdlib/caffe/caffe_kernel.h +++ b/stdlib/caffe/caffe_kernel.h @@ -1,10 +1,10 @@ #pragma once -#include "stdlib/stdlib.pb.h" -#include "scanner/api/op.h" #include "scanner/api/kernel.h" +#include "scanner/api/op.h" #include "scanner/util/cuda.h" #include "scanner/util/memory.h" +#include "stdlib/stdlib.pb.h" #include "caffe/blob.hpp" #include "caffe/common.hpp" @@ -15,12 +15,12 @@ namespace scanner { -using CustomNetConfiguration = void (*)(const FrameInfo &frame_info, - caffe::Net *net); +using CustomNetConfiguration = void (*)(const FrameInfo& frame_info, + caffe::Net* net); -class CaffeKernel : public VideoKernel { -public: - CaffeKernel(const Kernel::Config& config); +class CaffeKernel : public BatchedKernel, public VideoKernel { + public: + CaffeKernel(const KernelConfig& config); void validate(proto::Result* result) override; void new_frame_info() override; void execute(const BatchedColumns& input_columns, @@ -29,7 +29,7 @@ 
class CaffeKernel : public VideoKernel { virtual void net_config() {} -protected: + protected: proto::Result valid_; DeviceHandle device_; proto::CaffeArgs args_; @@ -38,5 +38,4 @@ class CaffeKernel : public VideoKernel { }; proto::NetDescriptor descriptor_from_net_file(const std::string& path); - } diff --git a/stdlib/caffe/caffe_kernel_cpu.cpp b/stdlib/caffe/caffe_kernel_cpu.cpp index 3d448ca2..823c7b11 100644 --- a/stdlib/caffe/caffe_kernel_cpu.cpp +++ b/stdlib/caffe/caffe_kernel_cpu.cpp @@ -2,8 +2,10 @@ namespace scanner { -REGISTER_OP(Caffe) - .inputs({"caffe_frame", "frame_info"}) - .outputs({"caffe_output"}); -REGISTER_KERNEL(Caffe, CaffeKernel).device(DeviceType::CPU).num_devices(1); +REGISTER_OP(Caffe).frame_input("caffe_frame").frame_output("caffe_output"); + +REGISTER_KERNEL(Caffe, CaffeKernel) + .device(DeviceType::CPU) + .batch() + .num_devices(Kernel::UnlimitedDevices); } diff --git a/stdlib/caffe/caffe_kernel_gpu.cpp b/stdlib/caffe/caffe_kernel_gpu.cpp index 1e92f6dc..6ac2792b 100644 --- a/stdlib/caffe/caffe_kernel_gpu.cpp +++ b/stdlib/caffe/caffe_kernel_gpu.cpp @@ -2,6 +2,8 @@ namespace scanner { -REGISTER_KERNEL(Caffe, CaffeKernel).device(DeviceType::GPU).num_devices(1); - +REGISTER_KERNEL(Caffe, CaffeKernel) + .device(DeviceType::GPU) + .batch() + .num_devices(1); } diff --git a/stdlib/caffe/cpm2_input_kernel_gpu.cpp b/stdlib/caffe/cpm2_input_kernel_gpu.cpp index a7f113bc..e450f443 100644 --- a/stdlib/caffe/cpm2_input_kernel_gpu.cpp +++ b/stdlib/caffe/cpm2_input_kernel_gpu.cpp @@ -13,24 +13,26 @@ * limitations under the License. 
*/ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" -#include "stdlib/stdlib.pb.h" -#include "scanner/util/opencv.h" +#include "scanner/api/op.h" #include "scanner/util/cuda.h" #include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "stdlib/stdlib.pb.h" #include namespace scanner { -class CPM2InputKernel : public VideoKernel { -public: - CPM2InputKernel(const Kernel::Config &config) - : VideoKernel(config), device_(config.devices[0]) +class CPM2InputKernel : public BatchedKernel, public VideoKernel { + public: + CPM2InputKernel(const KernelConfig& config) + : BatchedKernel(config), + device_(config.devices[0]) #ifdef HAVE_CUDA - , - num_cuda_streams_(32), streams_(num_cuda_streams_) + , + num_cuda_streams_(32), + streams_(num_cuda_streams_) #endif { proto::CPM2Args args; @@ -40,8 +42,8 @@ class CPM2InputKernel : public VideoKernel { } void new_frame_info() override { - frame_width_ = frame_info_.width(); - frame_height_ = frame_info_.height(); + frame_width_ = frame_info_.shape[1]; + frame_height_ = frame_info_.shape[0]; resize_width_ = frame_width_ * scale_; resize_height_ = frame_height_ * scale_; @@ -85,34 +87,35 @@ class CPM2InputKernel : public VideoKernel { } } - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { auto eval_start = now(); - i32 input_count = input_columns[0].rows.size(); - check_frame_info(device_, input_columns[1]); + auto& frame_col = input_columns[0]; + check_frame(device_, frame_col[0]); + + i32 input_count = num_rows(frame_col); streams_.resize(0); streams_.resize(num_cuda_streams_); - size_t net_input_size = - net_input_width_ * net_input_height_ * 3 * sizeof(f32); - u8 *output_block = - new_block_buffer(device_, input_count * net_input_size, input_count); + FrameInfo net_input_info(3, net_input_height_, net_input_width_, + FrameType::F32); + i32 net_input_size = 
net_input_info.size(); + std::vector output_frames = + new_frames(device_, net_input_info, input_count); for (i32 i = 0; i < input_count; ++i) { - f32 *net_input = - reinterpret_cast(output_block + net_input_size * i); + Frame* output_frame = output_frames[i]; + f32* net_input = reinterpret_cast(output_frame->data); int sid = i % num_cuda_streams_; - cv::cuda::Stream &cv_stream = streams_[sid]; - - u8 *buffer = input_columns[0].rows[i].buffer; - assert(input_columns[0].rows[i].size == frame_height_ * frame_width_ * 3); - frame_input_[sid] = - cv::cuda::GpuMat(frame_height_, frame_width_, CV_8UC3, buffer); - cv::cuda::cvtColor(frame_input_[sid], bgr_input_[sid], - cv::COLOR_RGB2BGR, 0, cv_stream); + cv::cuda::Stream& cv_stream = streams_[sid]; + + const Frame* input_frame = frame_col[i].as_const_frame(); + frame_input_[sid] = frame_to_gpu_mat(input_frame); + cv::cuda::cvtColor(frame_input_[sid], bgr_input_[sid], cv::COLOR_RGB2BGR, + 0, cv_stream); cv::cuda::resize(bgr_input_[sid], resized_input_[sid], cv::Size(resize_width_, resize_height_), 0, 0, cv::INTER_CUBIC, cv_stream); @@ -120,14 +123,14 @@ class CPM2InputKernel : public VideoKernel { height_padding_, 0, width_padding_, cv::BORDER_CONSTANT, cv::Scalar(128, 128, 128), cv_stream); - padded_input_[sid].convertTo(float_input_[sid], CV_32FC3, - (1.0f / 256.0f), -0.5f, cv_stream); + padded_input_[sid].convertTo(float_input_[sid], CV_32FC3, (1.0f / 256.0f), + -0.5f, cv_stream); // Changed from interleaved BGR to planar RGB cv::cuda::split(float_input_[sid], input_planes_[sid], cv_stream); - auto &plane1 = input_planes_[sid][0]; - auto &plane2 = input_planes_[sid][1]; - auto &plane3 = input_planes_[sid][2]; - auto &planar_input = planar_input_[sid]; + auto& plane1 = input_planes_[sid][0]; + auto& plane2 = input_planes_[sid][1]; + auto& plane3 = input_planes_[sid][2]; + auto& planar_input = planar_input_[sid]; plane1.copyTo(planar_input(cv::Rect( 0, net_input_height_ * 0, net_input_width_, net_input_height_))); 
plane2.copyTo(planar_input(cv::Rect( @@ -141,9 +144,9 @@ class CPM2InputKernel : public VideoKernel { planar_input.step, net_input_width_ * sizeof(float), net_input_height_ * 3, cudaMemcpyDeviceToDevice, s)); - INSERT_ROW(output_columns[0], (u8 *)net_input, net_input_size); + insert_frame(output_columns[0], output_frame); } - for (cv::cuda::Stream &s : streams_) { + for (cv::cuda::Stream& s : streams_) { s.waitForCompletion(); } @@ -152,7 +155,7 @@ class CPM2InputKernel : public VideoKernel { } } -private: + private: DeviceHandle device_; proto::CaffeArgs args_; f32 scale_; @@ -178,7 +181,8 @@ class CPM2InputKernel : public VideoKernel { std::vector planar_input_; }; -REGISTER_OP(CPM2Input).inputs({"frame", "frame_info"}).outputs({"cpm2_input"}); +REGISTER_OP(CPM2Input).frame_input("frame").frame_output("cpm2_input"); + REGISTER_KERNEL(CPM2Input, CPM2InputKernel) .device(DeviceType::GPU) .num_devices(1); diff --git a/stdlib/caffe/cpm2_kernel.cpp b/stdlib/caffe/cpm2_kernel.cpp index 7eb9d777..1d851915 100644 --- a/stdlib/caffe/cpm2_kernel.cpp +++ b/stdlib/caffe/cpm2_kernel.cpp @@ -1,28 +1,21 @@ #include "scanner/api/op.h" #include "stdlib/caffe/caffe_kernel.h" -#include "caffe/layers/imresize_layer.hpp" +#include "caffe/cpm/layers/imresize_layer.hpp" namespace scanner { class CPM2Kernel : public CaffeKernel { -public: - CPM2Kernel(const Kernel::Config& config) + public: + CPM2Kernel(const KernelConfig& config) : CaffeKernel(get_caffe_config(config)) {} void net_config() override { - // Calculate width by scaling by box size - int resize_width = frame_info_.width() * scale_; - int resize_height = frame_info_.height() * scale_; + int net_input_width = frame_info_.shape[2]; + int net_input_height = frame_info_.shape[1]; - int width_padding = (resize_width % 8) ? 8 - (resize_width % 8) : 0; - int height_padding = (resize_height % 8) ? 
8 - (resize_height % 8) : 0; - - int net_input_width = resize_width + width_padding; - int net_input_height = resize_height + height_padding; - - caffe::ImResizeLayer *resize_layer = - (caffe::ImResizeLayer *)net_->layer_by_name("resize").get(); + caffe::ImResizeLayer* resize_layer = + (caffe::ImResizeLayer*)net_->layer_by_name("resize").get(); resize_layer->SetStartScale(1); resize_layer->SetScaleGap(0.1); @@ -34,26 +27,27 @@ class CPM2Kernel : public CaffeKernel { net_input_height, net_input_width}); } - Kernel::Config get_caffe_config(const Kernel::Config& config) { + KernelConfig get_caffe_config(const KernelConfig& config) { proto::CPM2Args args; args.ParseFromArray(config.args.data(), config.args.size()); scale_ = args.scale(); - Kernel::Config new_config(config); + KernelConfig new_config(config); std::string caffe_string; args.caffe_args().SerializeToString(&caffe_string); new_config.args = std::vector(caffe_string.begin(), caffe_string.end()); return new_config; } -private: + private: f32 scale_; }; REGISTER_OP(CPM2) - .inputs({"cpm2_input", "frame_info"}) - .outputs({"cpm2_resized_map", "cpm2_joints"}); -REGISTER_KERNEL(CPM2, CPM2Kernel).device(DeviceType::CPU).num_devices(1); -REGISTER_KERNEL(CPM2, CPM2Kernel).device(DeviceType::GPU).num_devices(1); + .frame_input("cpm2_input") + .frame_output("cpm2_resized_map") + .frame_output("cpm2_joints"); +REGISTER_KERNEL(CPM2, CPM2Kernel).device(DeviceType::CPU).num_devices(1).batch(); +REGISTER_KERNEL(CPM2, CPM2Kernel).device(DeviceType::GPU).num_devices(1).batch(); } diff --git a/stdlib/caffe/cpm2_output_kernel_cpu.cpp b/stdlib/caffe/cpm2_output_kernel_cpu.cpp index e9bee629..885b0a64 100644 --- a/stdlib/caffe/cpm2_output_kernel_cpu.cpp +++ b/stdlib/caffe/cpm2_output_kernel_cpu.cpp @@ -13,34 +13,34 @@ * limitations under the License. 
*/ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" -#include "stdlib/stdlib.pb.h" +#include "scanner/api/op.h" #include "scanner/types.pb.h" #include "scanner/util/common.h" #include "scanner/util/serialize.h" #include "scanner/util/util.h" +#include "stdlib/stdlib.pb.h" +#include #include #include -#include #include namespace scanner { struct ModelDescriptor { virtual ~ModelDescriptor() {} - virtual const std::string &get_part_name(int n) = 0; + virtual const std::string& get_part_name(int n) = 0; virtual int num_parts() = 0; virtual int num_limb_seq() = 0; - virtual const int *get_limb_seq() = 0; - virtual const int *get_map_idx() = 0; + virtual const int* get_limb_seq() = 0; + virtual const int* get_map_idx() = 0; virtual const std::string name() = 0; }; namespace { struct ColumnCompare { - bool operator()(const std::vector &lhs, - const std::vector &rhs) const { + bool operator()(const std::vector& lhs, + const std::vector& rhs) const { return lhs[2] > rhs[2]; } }; @@ -54,17 +54,17 @@ struct MPIModelDescriptor : public ModelDescriptor { 42, 43, 32, 33, 34, 35, 36, 37}; virtual int num_parts() { return 15; } virtual int num_limb_seq() { return 14; } - virtual const int *get_limb_seq() { return limbSeq; } - virtual const int *get_map_idx() { return mapIdx; } + virtual const int* get_limb_seq() { return limbSeq; } + virtual const int* get_map_idx() { return mapIdx; } virtual const std::string name() { return "MPI_15"; } MPIModelDescriptor() - : part2name{ - {0, "Head"}, {1, "Neck"}, {2, "RShoulder"}, {3, "RElbow"}, - {4, "RWrist"}, {5, "LShoulder"}, {6, "LElbow"}, {7, "LWrist"}, - {8, "RHip"}, {9, "RKnee"}, {10, "RAnkle"}, {11, "LHip"}, - {12, "LKnee"}, {13, "LAnkle"}, {14, "Chest"}, {15, "Bkg"}, - } /* End initializers */ { + : part2name{ + {0, "Head"}, {1, "Neck"}, {2, "RShoulder"}, {3, "RElbow"}, + {4, "RWrist"}, {5, "LShoulder"}, {6, "LElbow"}, {7, "LWrist"}, + {8, "RHip"}, {9, "RKnee"}, {10, "RAnkle"}, {11, "LHip"}, + {12, "LKnee"}, {13, 
"LAnkle"}, {14, "Chest"}, {15, "Bkg"}, + } /* End initializers */ { for (int l = 0; l < num_limb_seq(); l++) { int la = limbSeq[2 * l + 0]; int lb = limbSeq[2 * l + 1]; @@ -74,7 +74,7 @@ struct MPIModelDescriptor : public ModelDescriptor { part2name[mb] = part2name[la] + "->" + part2name[lb] + "(Y)"; } } - virtual const std::string &get_part_name(int n) { return part2name.at(n); } + virtual const std::string& get_part_name(int n) { return part2name.at(n); } }; struct COCOModelDescriptor : public ModelDescriptor { @@ -87,18 +87,18 @@ struct COCOModelDescriptor : public ModelDescriptor { 49, 50, 53, 54, 51, 52, 55, 56, 37, 38, 45, 46}; virtual int num_parts() { return 18; } virtual int num_limb_seq() { return 38 / 2; } - virtual const int *get_limb_seq() { return limbSeq; } - virtual const int *get_map_idx() { return mapIdx; } + virtual const int* get_limb_seq() { return limbSeq; } + virtual const int* get_map_idx() { return mapIdx; } virtual const std::string name() { return "COCO_18"; } COCOModelDescriptor() - : part2name{ - {0, "Nose"}, {1, "Neck"}, {2, "RShoulder"}, {3, "RElbow"}, - {4, "RWrist"}, {5, "LShoulder"}, {6, "LElbow"}, {7, "LWrist"}, - {8, "RHip"}, {9, "RKnee"}, {10, "RAnkle"}, {11, "LHip"}, - {12, "LKnee"}, {13, "LAnkle"}, {14, "REye"}, {15, "LEye"}, - {16, "REar"}, {17, "LEar"}, {18, "Bkg"}, - } /* End initializers */ { + : part2name{ + {0, "Nose"}, {1, "Neck"}, {2, "RShoulder"}, {3, "RElbow"}, + {4, "RWrist"}, {5, "LShoulder"}, {6, "LElbow"}, {7, "LWrist"}, + {8, "RHip"}, {9, "RKnee"}, {10, "RAnkle"}, {11, "LHip"}, + {12, "LKnee"}, {13, "LAnkle"}, {14, "REye"}, {15, "LEye"}, + {16, "REar"}, {17, "LEar"}, {18, "Bkg"}, + } /* End initializers */ { for (int l = 0; l < num_limb_seq(); l++) { int la = limbSeq[2 * l + 0]; int lb = limbSeq[2 * l + 1]; @@ -108,17 +108,17 @@ struct COCOModelDescriptor : public ModelDescriptor { part2name[mb] = part2name[la] + "->" + part2name[lb] + "(Y)"; } } - virtual const std::string &get_part_name(int n) { return 
part2name.at(n); } + virtual const std::string& get_part_name(int n) { return part2name.at(n); } }; } -class CPM2OutputKernel : public VideoKernel { -public: - CPM2OutputKernel(const Kernel::Config &config) : VideoKernel(config) { +class CPM2OutputKernel : public BatchedKernel, public VideoKernel { + public: + CPM2OutputKernel(const KernelConfig& config) : BatchedKernel(config) { proto::CPM2Args args; args.ParseFromArray(config.args.data(), config.args.size()); scale_ = args.scale(); - modeldesc.reset(new MPIModelDescriptor()); + modeldesc.reset(new COCOModelDescriptor()); joints_.resize(max_people_ * 3 * max_num_parts_); } @@ -137,38 +137,41 @@ class CPM2OutputKernel : public VideoKernel { feature_width_ = net_input_width_; feature_height_ = net_input_height_; - feature_channels_ = 44; + feature_channels_ = 57; } - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { assert(input_columns.size() == 3); i32 heatmap_idx = 0; i32 joints_idx = 1; i32 frame_info_idx = 2; - check_frame_info(CPU_DEVICE, input_columns[frame_info_idx]); + check_frame_info(CPU_DEVICE, input_columns[frame_info_idx][0]); - i32 input_count = (i32)input_columns[0].rows.size(); + i32 input_count = (i32)num_rows(input_columns[0]); for (i32 b = 0; b < input_count; ++b) { - assert(input_columns[heatmap_idx].rows[b].size == + const Frame* heatmap_frame = + input_columns[heatmap_idx][b].as_const_frame(); + const Frame* joints_frame = input_columns[joints_idx][b].as_const_frame(); + assert(heatmap_frame->size() == feature_width_ * feature_height_ * feature_channels_ * sizeof(f32)); - float *heatmap = - reinterpret_cast(input_columns[heatmap_idx].rows[b].buffer); - float *peaks = - reinterpret_cast(input_columns[joints_idx].rows[b].buffer); + const float* heatmap = reinterpret_cast(heatmap_frame->data); + const float* peaks = reinterpret_cast(joints_frame->data); 
std::vector> subset; std::vector>> connection; - int count = - connect_limbs(subset, connection, heatmap, peaks, joints_.data()); + // int count = + // connect_limbs(subset, connection, heatmap, peaks, joints_.data()); + int count = connect_limbs_coco(subset, connection, heatmap, peaks, + joints_.data()); std::vector> bodies(count); for (int p = 0; p < count; ++p) { - std::vector &body_joints = bodies[p]; + std::vector& body_joints = bodies[p]; for (i32 j = 0; j < num_joints_; ++j) { int offset = p * num_joints_ * 3 + j * 3; float score = joints_[offset + 2]; @@ -183,17 +186,17 @@ class CPM2OutputKernel : public VideoKernel { } } size_t size; - u8 *buffer; + u8* buffer; serialize_proto_vector_of_vectors(bodies, buffer, size); - output_columns.at(heatmap_idx).rows.push_back(Row{buffer, size}); + insert_element(output_columns.at(heatmap_idx), buffer, size); } } -protected: - int connect_limbs(std::vector> &subset, - std::vector>> &connection, - const float *heatmap_pointer, const float *peaks, - float *joints) { + protected: + int connect_limbs(std::vector>& subset, + std::vector>>& connection, + const float* heatmap_pointer, const float* peaks, + float* joints) { /* Parts Connection ---------------------------------------*/ // limbSeq = [15 2; 2 1; 2 3; 3 4; 4 5; 2 6; 6 7; 7 8; 15 12; 12 13; 13 14; // 15 @@ -204,8 +207,8 @@ class CPM2OutputKernel : public VideoKernel { // 24}; const int NUM_PARTS = modeldesc->num_parts(); - const int *limbSeq = modeldesc->get_limb_seq(); - const int *mapIdx = modeldesc->get_map_idx(); + const int* limbSeq = modeldesc->get_limb_seq(); + const int* mapIdx = modeldesc->get_map_idx(); const int num_limb_seq = modeldesc->num_limb_seq(); int SUBSET_CNT = NUM_PARTS + 2; @@ -223,14 +226,14 @@ class CPM2OutputKernel : public VideoKernel { // float* score_mid = heatmap_pointer + mapIdx[k] * INIT_PERSON_NET_HEIGHT // * // INIT_PERSON_NET_WIDTH; - const float *map_x = heatmap_pointer + + const float* map_x = heatmap_pointer + mapIdx[2 * k] * 
net_input_height_ * net_input_width_; - const float *map_y = + const float* map_y = heatmap_pointer + mapIdx[2 * k + 1] * net_input_height_ * net_input_width_; - const float *candA = peaks + limbSeq[2 * k] * peaks_offset; - const float *candB = peaks + limbSeq[2 * k + 1] * peaks_offset; + const float* candA = peaks + limbSeq[2 * k] * peaks_offset; + const float* candB = peaks + limbSeq[2 * k + 1] * peaks_offset; std::vector> connection_k; int nA = candA[0]; @@ -243,12 +246,12 @@ class CPM2OutputKernel : public VideoKernel { for (int i = 1; i <= nB; i++) { std::vector row_vec(SUBSET_SIZE, 0); row_vec[limbSeq[2 * k + 1]] = - limbSeq[2 * k + 1] * peaks_offset + i * 3 + 2; // store the index + limbSeq[2 * k + 1] * peaks_offset + i * 3 + 2; // store the index row_vec[SUBSET_CNT] = - 1; // last number in each row is the parts number of that person + 1; // last number in each row is the parts number of that person row_vec[SUBSET_SCORE] = candB[i * 3 + - 2]; // second last number in each row is the total score + 2]; // second last number in each row is the total score subset.push_back(row_vec); } continue; @@ -256,12 +259,12 @@ class CPM2OutputKernel : public VideoKernel { for (int i = 1; i <= nA; i++) { std::vector row_vec(SUBSET_SIZE, 0); row_vec[limbSeq[2 * k]] = - limbSeq[2 * k] * peaks_offset + i * 3 + 2; // store the index + limbSeq[2 * k] * peaks_offset + i * 3 + 2; // store the index row_vec[SUBSET_CNT] = - 1; // last number in each row is the parts number of that person + 1; // last number in each row is the parts number of that person row_vec[SUBSET_SCORE] = candA[i * 3 + - 2]; // second last number in each row is the total score + 2]; // second last number in each row is the total score subset.push_back(row_vec); } continue; @@ -302,7 +305,7 @@ class CPM2OutputKernel : public VideoKernel { // parts score + cpnnection score std::vector row_vec(4, 0); row_vec[3] = - sum / count + candA[i * 3 + 2] + candB[j * 3 + 2]; // score_all + sum / count + candA[i * 3 + 2] + 
candB[j * 3 + 2]; // score_all row_vec[2] = sum / count; row_vec[0] = i; row_vec[1] = j; @@ -314,8 +317,7 @@ class CPM2OutputKernel : public VideoKernel { //** select the top num connection, assuming that each part occur only // once // sort rows in descending order based on parts + connection score - if (temp.size() > 0) - std::sort(temp.begin(), temp.end(), ColumnCompare()); + if (temp.size() > 0) std::sort(temp.begin(), temp.end(), ColumnCompare()); int num = std::min(nA, nB); int cnt = 0; @@ -344,7 +346,7 @@ class CPM2OutputKernel : public VideoKernel { int i = int(temp[row][0]); int j = int(temp[row][1]); float score = temp[row][2]; - if (occurA[i - 1] == 0 && occurB[j - 1] == 0) { // && score> (1+thre) + if (occurA[i - 1] == 0 && occurB[j - 1] == 0) { // && score> (1+thre) std::vector row_vec(3, 0); row_vec[0] = limbSeq[2 * k] * peaks_offset + i * 3 + 2; row_vec[1] = limbSeq[2 * k + 1] * peaks_offset + j * 3 + 2; @@ -438,10 +440,11 @@ class CPM2OutputKernel : public VideoKernel { int idx = int(subset[i][j]); if (idx) { joints[cnt * NUM_PARTS * 3 + j * 3 + 2] = peaks[idx]; - joints[cnt * NUM_PARTS * 3 + j * 3 + 1] = - peaks[idx - 1] * frame_info_.height() / (float)net_input_height_; + joints[cnt * NUM_PARTS * 3 + j * 3 + 1] = peaks[idx - 1] * + frame_info_.shape[0] / + (float)net_input_height_; joints[cnt * NUM_PARTS * 3 + j * 3] = - peaks[idx - 2] * frame_info_.width() / (float)net_input_width_; + peaks[idx - 2] * frame_info_.shape[1] / (float)net_input_width_; } else { joints[cnt * NUM_PARTS * 3 + j * 3 + 2] = 0; joints[cnt * NUM_PARTS * 3 + j * 3 + 1] = 0; @@ -457,6 +460,299 @@ class CPM2OutputKernel : public VideoKernel { return cnt; } + int connect_limbs_coco( + std::vector>& subset, + std::vector>>& connection, + const float* heatmap_pointer, const float* peaks, + float* joints) { + /* Parts Connection ---------------------------------------*/ + const auto num_parts = modeldesc->num_parts(); + const auto limbSeq = modeldesc->get_limb_seq(); + const auto 
mapIdx = modeldesc->get_map_idx(); + const auto number_limb_seq = modeldesc->num_limb_seq(); + + CHECK_EQ(num_parts, 18) << "Wrong connection function for model"; + CHECK_EQ(number_limb_seq, 19) << "Wrong connection function for model"; + + int SUBSET_CNT = num_parts + 2; + int SUBSET_SCORE = num_parts + 1; + int SUBSET_SIZE = num_parts + 3; + + const int peaks_offset = 3 * (max_peaks_ + 1); + + subset.clear(); + connection.clear(); + + for (int k = 0; k < number_limb_seq; k++) { + const float* map_x = + heatmap_pointer + + mapIdx[2 * k] * net_input_height_ * net_input_width_; + const float* map_y = + heatmap_pointer + + mapIdx[2 * k + 1] * net_input_height_ * net_input_width_; + + const float* candA = peaks + limbSeq[2 * k] * peaks_offset; + const float* candB = peaks + limbSeq[2 * k + 1] * peaks_offset; + + std::vector> connection_k; + int nA = candA[0]; + int nB = candB[0]; + + // add parts into the subset in special case + if (nA == 0 && nB == 0) { + continue; + } else if (nA == 0) { + for (int i = 1; i <= nB; i++) { + int num = 0; + int indexB = limbSeq[2 * k + 1]; + for (int j = 0; j < subset.size(); j++) { + int off = limbSeq[2 * k + 1] * peaks_offset + i * 3 + 2; + if (subset[j][indexB] == off) { + num = num + 1; + continue; + } + } + if (num != 0) { + // LOG(INFO) << " else if (nA==0) shouldn't have any nB already + // assigned?"; + } else { + std::vector row_vec(SUBSET_SIZE, 0); + row_vec[limbSeq[2 * k + 1]] = limbSeq[2 * k + 1] * peaks_offset + + i * 3 + 2; // store the index + row_vec[SUBSET_CNT] = 1; // last number in each row is the parts + // number of that person + row_vec[SUBSET_SCORE] = + candB[i * 3 + + 2]; // second last number in each row is the total score + subset.push_back(row_vec); + } + // LOG(INFO) << "nA==0 New subset on part " << k << " subsets: " << + // subset.size(); + } + continue; + } else if (nB == 0) { + for (int i = 1; i <= nA; i++) { + int num = 0; + int indexA = limbSeq[2 * k]; + for (int j = 0; j < subset.size(); j++) { + int 
off = limbSeq[2 * k] * peaks_offset + i * 3 + 2; + if (subset[j][indexA] == off) { + num = num + 1; + continue; + } + } + if (num == 0) { + std::vector row_vec(SUBSET_SIZE, 0); + row_vec[limbSeq[2 * k]] = + limbSeq[2 * k] * peaks_offset + i * 3 + 2; // store the index + row_vec[SUBSET_CNT] = 1; // last number in each row is the parts + // number of that person + row_vec[SUBSET_SCORE] = + candA[i * 3 + + 2]; // second last number in each row is the total score + subset.push_back(row_vec); + // LOG(INFO) << "nB==0 New subset on part " << k << " subsets: " << + // subset.size(); + } else { + // LOG(INFO) << "nB==0 discarded would have added"; + } + } + continue; + } + + std::vector> temp; + const int num_inter = 10; + + for (int i = 1; i <= nA; i++) { + for (int j = 1; j <= nB; j++) { + float s_x = candA[i * 3]; + float s_y = candA[i * 3 + 1]; + float d_x = candB[j * 3] - candA[i * 3]; + float d_y = candB[j * 3 + 1] - candA[i * 3 + 1]; + float norm_vec = sqrt(d_x * d_x + d_y * d_y); + if (norm_vec < 1e-6) { + // The peaks are coincident. Don't connect them. 
+ continue; + } + float vec_x = d_x / norm_vec; + float vec_y = d_y / norm_vec; + + float sum = 0; + int count = 0; + + for (int lm = 0; lm < num_inter; lm++) { + int my = round(s_y + lm * d_y / num_inter); + int mx = round(s_x + lm * d_x / num_inter); + if (mx >= net_input_width_) { + // LOG(ERROR) << "mx " << mx << "out of range"; + mx = net_input_width_ - 1; + } + if (my >= net_input_height_) { + // LOG(ERROR) << "my " << my << "out of range"; + my = net_input_height_ - 1; + } + CHECK_GE(mx, 0); + CHECK_GE(my, 0); + int idx = my * net_input_width_ + mx; + float score = (vec_x * map_x[idx] + vec_y * map_y[idx]); + if (score > connect_inter_threshold_) { + sum = sum + score; + count++; + } + } + // float score = sum / count; // + std::min((130/dist-1),0.f) + + if (count > + connect_inter_min_above_threshold_) { // num_inter*0.8) { + // //thre/2 + // parts score + cpnnection score + std::vector row_vec(4, 0); + row_vec[3] = + sum / count + candA[i * 3 + 2] + candB[j * 3 + 2]; // score_all + row_vec[2] = sum / count; + row_vec[0] = i; + row_vec[1] = j; + temp.push_back(row_vec); + } + } + } + + //** select the top num connection, assuming that each part occur only + //once + // sort rows in descending order based on parts + connection score + if (temp.size() > 0) std::sort(temp.begin(), temp.end(), ColumnCompare()); + + int num = std::min(nA, nB); + int cnt = 0; + std::vector occurA(nA, 0); + std::vector occurB(nB, 0); + + for (int row = 0; row < temp.size(); row++) { + if (cnt == num) { + break; + } else { + int i = int(temp[row][0]); + int j = int(temp[row][1]); + float score = temp[row][2]; + if (occurA[i - 1] == 0 && occurB[j - 1] == 0) { // && score> (1+thre) + std::vector row_vec(3, 0); + row_vec[0] = limbSeq[2 * k] * peaks_offset + i * 3 + 2; + row_vec[1] = limbSeq[2 * k + 1] * peaks_offset + j * 3 + 2; + row_vec[2] = score; + connection_k.push_back(row_vec); + cnt = cnt + 1; + occurA[i - 1] = 1; + occurB[j - 1] = 1; + } + } + } + + //** cluster all the 
joints candidates into subset based on the part + //connection + // initialize first body part connection 15&16 + if (k == 0) { + std::vector row_vec(num_parts + 3, 0); + for (int i = 0; i < connection_k.size(); i++) { + double indexB = connection_k[i][1]; + double indexA = connection_k[i][0]; + row_vec[limbSeq[0]] = indexA; + row_vec[limbSeq[1]] = indexB; + row_vec[SUBSET_CNT] = 2; + // add the score of parts and the connection + row_vec[SUBSET_SCORE] = + peaks[int(indexA)] + peaks[int(indexB)] + connection_k[i][2]; + // LOG(INFO) << "New subset on part " << k << " subsets: " << + // subset.size(); + subset.push_back(row_vec); + } + } /* else if (k==17 || k==18) { // TODO: Check k numbers? + // %add 15 16 connection + for(int i = 0; i < connection_k.size(); i++) { + double indexA = connection_k[i][0]; + double indexB = connection_k[i][1]; + + for(int j = 0; j < subset.size(); j++) { + // if subset(j, indexA) == partA(i) && subset(j, indexB) == 0 + // subset(j, indexB) = partB(i); + // elseif subset(j, indexB) == partB(i) && subset(j, indexA) == 0 + // subset(j, indexA) = partA(i); + // end + if (subset[j][limbSeq[2*k]] == indexA && subset[j][limbSeq[2*k+1]]==0) { + subset[j][limbSeq[2*k+1]] = indexB; + } else if (subset[j][limbSeq[2*k+1]] == indexB && subset[j][limbSeq[2*k]]==0) { + subset[j][limbSeq[2*k]] = indexA; + } + } + continue; + } + }*/ else { + if (connection_k.size() == 0) { + continue; + } + + // A is already in the subset, find its connection B + for (int i = 0; i < connection_k.size(); i++) { + int num = 0; + double indexA = connection_k[i][0]; + double indexB = connection_k[i][1]; + + for (int j = 0; j < subset.size(); j++) { + if (subset[j][limbSeq[2 * k]] == indexA) { + subset[j][limbSeq[2 * k + 1]] = indexB; + num = num + 1; + subset[j][SUBSET_CNT] = subset[j][SUBSET_CNT] + 1; + subset[j][SUBSET_SCORE] = subset[j][SUBSET_SCORE] + + peaks[int(indexB)] + connection_k[i][2]; + } + } + // if can not find partA in the subset, create a new subset + if 
(num == 0) { + // LOG(INFO) << "New subset on part " << k << " subsets: " << + // subset.size(); + std::vector row_vec(SUBSET_SIZE, 0); + row_vec[limbSeq[2 * k]] = indexA; + row_vec[limbSeq[2 * k + 1]] = indexB; + row_vec[SUBSET_CNT] = 2; + row_vec[SUBSET_SCORE] = + peaks[int(indexA)] + peaks[int(indexB)] + connection_k[i][2]; + subset.push_back(row_vec); + } + } + } + } + + //** joints by deleteing some rows of subset which has few parts occur + int cnt = 0; + for (int i = 0; i < subset.size(); i++) { + if (subset[i][SUBSET_CNT] < 1) { + LOG(INFO) << "BAD SUBSET_CNT"; + } + if (subset[i][SUBSET_CNT] >= connect_min_subset_cnt_ && + (subset[i][SUBSET_SCORE] / subset[i][SUBSET_CNT]) > + connect_min_subset_score_) { + for (int j = 0; j < num_parts; j++) { + int idx = int(subset[i][j]); + if (idx) { + joints[cnt * num_parts * 3 + j * 3 + 2] = peaks[idx]; + joints[cnt * num_parts * 3 + j * 3 + 1] = peaks[idx - 1] * + frame_info_.shape[0] / + (float)net_input_height_; + joints[cnt * num_parts * 3 + j * 3] = + peaks[idx - 2] * frame_info_.shape[1] / (float)net_input_width_; + } else { + joints[cnt * num_parts * 3 + j * 3 + 2] = 0; + joints[cnt * num_parts * 3 + j * 3 + 1] = 0; + joints[cnt * num_parts * 3 + j * 3] = 0; + } + } + cnt++; + if (cnt == max_people_) break; + } + } + + return cnt; + } + private: f32 threshold_ = 0.5f; @@ -478,18 +774,30 @@ class CPM2OutputKernel : public VideoKernel { const int max_people_ = 96; const int max_num_parts_ = 70; - const int max_peaks_ = 20; - const int num_joints_ = 15; - int connect_min_subset_cnt_ = 3; + // For MPI + //const int max_peaks_ = 64; + // For COCO + const int max_peaks_ = 64; + const int num_joints_ = 18; + // For MPI + /*int connect_min_subset_cnt_ = 3; float connect_min_subset_score_ = 0.4; float connect_inter_threshold_ = 0.01; - int connect_inter_min_above_threshold_ = 8; + int connect_inter_min_above_threshold_ = 8;*/ + // For COCO + int connect_min_subset_cnt_ = 3; + float connect_min_subset_score_ = 0.4; + 
float connect_inter_threshold_ = 0.050; + int connect_inter_min_above_threshold_ = 9; std::vector joints_; }; REGISTER_OP(CPM2Output) - .inputs({"cpm2_resized_map", "cpm2_joints", "frame_info"}) - .outputs({"poses"}); + .frame_input("cpm2_resized_map") + .frame_input("cpm2_joints") + .input("original_frame_info") + .output("poses"); + REGISTER_KERNEL(CPM2Output, CPM2OutputKernel) .device(DeviceType::CPU) .num_devices(1); diff --git a/stdlib/caffe/facenet_input_kernel_cpu.cpp b/stdlib/caffe/facenet_input_kernel_cpu.cpp new file mode 100644 index 00000000..4d951d39 --- /dev/null +++ b/stdlib/caffe/facenet_input_kernel_cpu.cpp @@ -0,0 +1,143 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "stdlib/stdlib.pb.h" + +namespace scanner { + +class FacenetInputKernelCPU : public BatchedKernel, public VideoKernel { + public: + FacenetInputKernelCPU(const KernelConfig& config) + : BatchedKernel(config), + device_(config.devices[0]) + { + proto::FacenetArgs args; + args.ParseFromArray(config.args.data(), config.args.size()); + args_.CopyFrom(args.caffe_args()); + scale_ = args.scale(); + } + + void new_frame_info() override { + net_input_width_ = std::floor(frame_info_.width() * scale_); + net_input_height_ = std::floor(frame_info_.height() * scale_); + if (net_input_width_ % 8 != 0) { + net_input_width_ += 8 - (net_input_width_ % 8); + }; + if (net_input_height_ % 8 != 0) { + net_input_height_ += 8 - (net_input_height_ % 8); + } + + mean_mat_ = + cv::Mat(net_input_height_, net_input_width_, CV_32FC3, + cv::Scalar(args_.net_descriptor().mean_colors(0), + args_.net_descriptor().mean_colors(1), + args_.net_descriptor().mean_colors(2))); + + frame_input_.clear(); + resized_input_.clear(); + float_input_.clear(); + flipped_planes_.clear(); + normalized_input_.clear(); + input_planes_.clear(); + planar_input_.clear(); + flipped_planes_.clear(); + for (size_t i = 0; i < 1; ++i) { + 
frame_input_.push_back( + cv::Mat(frame_info_.height(), frame_info_.width(), CV_8UC3)); + resized_input_.push_back( + cv::Mat(net_input_height_, net_input_width_, CV_8UC3)); + float_input_.push_back( + cv::Mat(net_input_height_, net_input_width_, CV_32FC3)); + normalized_input_.push_back( + cv::Mat(net_input_height_, net_input_width_, CV_32FC3)); + std::vector planes; + std::vector flipped_planes; + for (i32 i = 0; i < 3; ++i) { + planes.push_back( + cv::Mat(net_input_height_, net_input_width_, CV_32FC1)); + flipped_planes.push_back( + cv::Mat(net_input_width_, net_input_height_, CV_32FC1)); + } + input_planes_.push_back(planes); + flipped_planes_.push_back(flipped_planes); + planar_input_.push_back( + cv::Mat(net_input_width_ * 3, net_input_height_, CV_32FC1)); + } + } + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + auto& frame_col = input_columns[0]; + check_frame(device_, frame_col[0]); + + i32 input_count = (i32)frame_col.size(); + FrameInfo net_input_info(3, net_input_width_, net_input_height_, + FrameType::F32); + i32 net_input_size = net_input_info.size(); + std::vector output_frames = + new_frames(device_, net_input_info, input_count); + + for (i32 i = 0; i < input_count; ++i) { + Frame* output_frame = output_frames[i]; + f32* net_input = (f32*)output_frame->data; + + i32 sid = 0; + + // Convert input frame to gpu mat + frame_input_[sid] = frame_to_mat(frame_col[i].as_const_frame()); + + cv::resize(frame_input_[sid], resized_input_[sid], + cv::Size(net_input_width_, net_input_height_), 0, 0, + cv::INTER_LINEAR); + resized_input_[sid].convertTo(float_input_[sid], CV_32FC3); + cv::subtract(float_input_[sid], mean_mat_, normalized_input_[sid], + cv::noArray(), -1); + // Changed from interleaved RGB to planar RGB + cv::split(normalized_input_[sid], input_planes_[sid]); + cv::transpose(input_planes_[sid][0], flipped_planes_[sid][0]); + cv::transpose(input_planes_[sid][1], flipped_planes_[sid][1]); + 
cv::transpose(input_planes_[sid][2], flipped_planes_[sid][2]); + auto& plane1 = flipped_planes_[sid][0]; + auto& plane2 = flipped_planes_[sid][1]; + auto& plane3 = flipped_planes_[sid][2]; + auto& planar_input = planar_input_[sid]; + plane1.copyTo(planar_input(cv::Rect( + 0, net_input_width_ * 0, net_input_height_, net_input_width_))); + plane2.copyTo(planar_input(cv::Rect( + 0, net_input_width_ * 1, net_input_height_, net_input_width_))); + plane3.copyTo(planar_input(cv::Rect( + 0, net_input_width_ * 2, net_input_height_, net_input_width_))); + assert(planar_input.cols == net_input_height_); + for (int j = 0; j < net_input_width_ * 3; ++j) { + memcpy(net_input + j * net_input_height_, + planar_input.data + j * planar_input.step, + net_input_height_ * sizeof(float)); + } + insert_frame(output_columns[0], output_frame); + } + } + + private: + DeviceHandle device_; + proto::CaffeArgs args_; + f32 scale_; + i32 net_input_width_; + i32 net_input_height_; + + cv::Mat mean_mat_; + std::vector frame_input_; + std::vector resized_input_; + std::vector float_input_; + std::vector normalized_input_; + std::vector> input_planes_; + std::vector> flipped_planes_; + std::vector planar_input_; +}; + +REGISTER_OP(FacenetInput).frame_input("frame").frame_output("facenet_input"); + +REGISTER_KERNEL(FacenetInput, FacenetInputKernelCPU) + .device(DeviceType::CPU) + .num_devices(1); +} diff --git a/stdlib/caffe/facenet_input_kernel_gpu.cpp b/stdlib/caffe/facenet_input_kernel_gpu.cpp index bfcd6d63..c0700c0a 100644 --- a/stdlib/caffe/facenet_input_kernel_gpu.cpp +++ b/stdlib/caffe/facenet_input_kernel_gpu.cpp @@ -1,9 +1,9 @@ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" -#include "stdlib/stdlib.pb.h" -#include "scanner/util/opencv.h" +#include "scanner/api/op.h" #include "scanner/util/cuda.h" #include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "stdlib/stdlib.pb.h" #ifdef HAVE_CUDA #include @@ -11,28 +11,28 @@ namespace scanner { -class 
FacenetInputKernel : public VideoKernel { -public: - FacenetInputKernel(const Kernel::Config &config) - : VideoKernel(config), +class FacenetInputKernel : public BatchedKernel, public VideoKernel { + public: + FacenetInputKernel(const KernelConfig& config) + : BatchedKernel(config), device_(config.devices[0]) #ifdef HAVE_CUDA - , + , num_cuda_streams_(32), streams_(num_cuda_streams_) #endif - { - proto::FacenetArgs args; - args.ParseFromArray(config.args.data(), config.args.size()); - args_.CopyFrom(args.caffe_args()); - scale_ = args.scale(); - } + { + proto::FacenetArgs args; + args.ParseFromArray(config.args.data(), config.args.size()); + args_.CopyFrom(args.caffe_args()); + scale_ = args.scale(); + } void new_frame_info() override { net_input_width_ = std::floor(frame_info_.width() * scale_); net_input_height_ = std::floor(frame_info_.height() * scale_); - if (net_input_width_ % 8 != 0) { - net_input_width_ += 8 - (net_input_width_ % 8); + if (net_input_width_ % 8 != 0) { + net_input_width_ += 8 - (net_input_width_ % 8); }; if (net_input_height_ % 8 != 0) { net_input_height_ += 8 - (net_input_height_ % 8); @@ -59,7 +59,7 @@ class FacenetInputKernel : public VideoKernel { frame_input_g_.push_back( cv::cuda::GpuMat(frame_info_.height(), frame_info_.width(), CV_8UC3)); resized_input_g_.push_back( - cv::cuda::GpuMat(net_input_height_, net_input_width_, CV_8UC3)); + cv::cuda::GpuMat(net_input_height_, net_input_width_, CV_8UC3)); float_input_g_.push_back( cv::cuda::GpuMat(net_input_height_, net_input_width_, CV_32FC3)); normalized_input_g_.push_back( @@ -79,29 +79,33 @@ class FacenetInputKernel : public VideoKernel { } } - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { - i32 input_count = (i32)input_columns[0].rows.size(); - check_frame_info(device_, input_columns[1]); + // TODO(wcrichto): set_device - size_t frame_size = net_input_width_ * net_input_height_ * 3; - i32 net_input_size = frame_size * sizeof(f32); + void 
execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + auto& frame_col = input_columns[0]; + check_frame(device_, frame_col[0]); - u8 *output_block = new_block_buffer( - device_, input_count * net_input_size, input_count); + i32 input_count = (i32)frame_col.size(); + FrameInfo net_input_info(3, net_input_width_, net_input_height_, + FrameType::F32); + i32 net_input_size = net_input_info.size(); + std::vector output_frames = + new_frames(device_, net_input_info, input_count); streams_.resize(0); streams_.resize(num_cuda_streams_); for (i32 i = 0; i < input_count; ++i) { int sid = i % num_cuda_streams_; - cv::cuda::Stream &cv_stream = streams_[sid]; + cv::cuda::Stream& cv_stream = streams_[sid]; + + Frame* output_frame = output_frames[i]; + f32* net_input = (f32*)output_frame->data; - f32 *net_input = (f32 *)(output_block + i * net_input_size); + // Convert input frame to gpu mat + frame_input_g_[sid] = frame_to_gpu_mat(frame_col[i].as_const_frame()); - u8 *buffer = input_columns[0].rows[i].buffer; - frame_input_g_[sid] = cv::cuda::GpuMat( - frame_info_.height(), frame_info_.width(), CV_8UC3, buffer); cv::cuda::resize(frame_input_g_[sid], resized_input_g_[sid], cv::Size(net_input_width_, net_input_height_), 0, 0, cv::INTER_LINEAR, cv_stream); @@ -118,10 +122,10 @@ class FacenetInputKernel : public VideoKernel { cv_stream); cv::cuda::transpose(input_planes_g_[sid][2], flipped_planes_g_[sid][2], cv_stream); - auto &plane1 = flipped_planes_g_[sid][0]; - auto &plane2 = flipped_planes_g_[sid][1]; - auto &plane3 = flipped_planes_g_[sid][2]; - auto &planar_input = planar_input_g_[sid]; + auto& plane1 = flipped_planes_g_[sid][0]; + auto& plane2 = flipped_planes_g_[sid][1]; + auto& plane3 = flipped_planes_g_[sid][2]; + auto& planar_input = planar_input_g_[sid]; plane1.copyTo(planar_input(cv::Rect( 0, net_input_width_ * 0, net_input_height_, net_input_width_))); plane2.copyTo(planar_input(cv::Rect( @@ -135,14 +139,14 @@ class 
FacenetInputKernel : public VideoKernel { planar_input.step, net_input_height_ * sizeof(float), net_input_width_ * 3, cudaMemcpyDeviceToDevice, s)); - INSERT_ROW(output_columns[0], (u8 *)net_input, net_input_size); + insert_frame(output_columns[0], output_frame); } - for (cv::cuda::Stream &s : streams_) { + for (cv::cuda::Stream& s : streams_) { s.waitForCompletion(); } } -private: + private: DeviceHandle device_; proto::CaffeArgs args_; f32 scale_; @@ -161,9 +165,6 @@ class FacenetInputKernel : public VideoKernel { std::vector planar_input_g_; }; -REGISTER_OP(FacenetInput) - .inputs({"frame", "frame_info"}) - .outputs({"facenet_input"}); REGISTER_KERNEL(FacenetInput, FacenetInputKernel) .device(DeviceType::GPU) .num_devices(1); diff --git a/stdlib/caffe/facenet_kernel.cpp b/stdlib/caffe/facenet_kernel.cpp index 33142d41..c8821d9e 100644 --- a/stdlib/caffe/facenet_kernel.cpp +++ b/stdlib/caffe/facenet_kernel.cpp @@ -4,47 +4,43 @@ namespace scanner { class FacenetKernel : public CaffeKernel { -public: - FacenetKernel(const Kernel::Config& config) + public: + FacenetKernel(const KernelConfig& config) : CaffeKernel(get_caffe_config(config)) {} void net_config() override { - // Calculate width by scaling by box size - int resize_width = std::floor(frame_info_.width() * scale_); - int resize_height = std::floor(frame_info_.height() * scale_); - - if (resize_width % 8 != 0) { resize_width += 8 - (resize_width % 8); } - if (resize_height % 8 != 0) { resize_height += 8 - (resize_height % 8); } - - int net_input_width = resize_height; - int net_input_height = resize_width; + int net_input_width = frame_info_.shape[1]; + int net_input_height = frame_info_.shape[2]; const boost::shared_ptr> input_blob{ - net_->blob_by_name("data")}; + net_->blob_by_name("data")}; input_blob->Reshape({input_blob->shape(0), input_blob->shape(1), - net_input_height, net_input_width}); + net_input_width, net_input_height}); } - Kernel::Config get_caffe_config(const Kernel::Config& config) { + 
KernelConfig get_caffe_config(const KernelConfig& config) { proto::FacenetArgs args; args.ParseFromArray(config.args.data(), config.args.size()); scale_ = args.scale(); - Kernel::Config new_config(config); + KernelConfig new_config(config); std::string caffe_string; args.caffe_args().SerializeToString(&caffe_string); new_config.args = std::vector(caffe_string.begin(), caffe_string.end()); return new_config; } -private: + private: f32 scale_; }; REGISTER_OP(Facenet) - .inputs({"facenet_input", "frame_info"}) - .outputs({"facenet_output"}); -REGISTER_KERNEL(Facenet, FacenetKernel).device(DeviceType::CPU).num_devices(1); -REGISTER_KERNEL(Facenet, FacenetKernel).device(DeviceType::GPU).num_devices(1); + .frame_input("facenet_input") + .frame_output("facenet_output"); +REGISTER_KERNEL(Facenet, FacenetKernel) + .device(DeviceType::CPU) + .num_devices(Kernel::UnlimitedDevices); + +REGISTER_KERNEL(Facenet, FacenetKernel).device(DeviceType::GPU).num_devices(1); } diff --git a/stdlib/caffe/facenet_output_kernel_cpu.cpp b/stdlib/caffe/facenet_output_kernel_cpu.cpp index f3f42f9b..5fbf6135 100644 --- a/stdlib/caffe/facenet_output_kernel_cpu.cpp +++ b/stdlib/caffe/facenet_output_kernel_cpu.cpp @@ -1,31 +1,30 @@ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" -#include "scanner/util/opencv.h" +#include "scanner/api/op.h" +#include "scanner/types.pb.h" #include "scanner/util/bbox.h" +#include "scanner/util/opencv.h" #include "scanner/util/serialize.h" #include "stdlib/stdlib.pb.h" -#include "scanner/types.pb.h" namespace scanner { -class FacenetOutputKernel : public VideoKernel { -public: - FacenetOutputKernel(const Kernel::Config& config) : VideoKernel(config) { +class FacenetOutputKernel : public BatchedKernel, public VideoKernel { + public: + FacenetOutputKernel(const KernelConfig& config) : BatchedKernel(config) { proto::FacenetArgs args; args.ParseFromArray(config.args.data(), config.args.size()); scale_ = args.scale(); threshold_ = args.threshold(); - 
std::ifstream template_file{"nets/caffe_facenet/facenet_templates.bin", - std::ifstream::binary}; + std::ifstream template_file{args.templates_path(), std::ifstream::binary}; LOG_IF(FATAL, !template_file.good()) << "Could not find template file."; templates_.resize(num_templates_, std::vector(4)); for (i32 t = 0; t < 25; ++t) { for (i32 i = 0; i < 4; ++i) { LOG_IF(FATAL, !template_file.good()) << "Template file not correct."; f32 d; - template_file.read(reinterpret_cast(&d), sizeof(f32)); + template_file.read(reinterpret_cast(&d), sizeof(f32)); templates_[t][i] = d; } } @@ -35,8 +34,8 @@ class FacenetOutputKernel : public VideoKernel { net_input_width_ = std::floor(frame_info_.width() * scale_); net_input_height_ = std::floor(frame_info_.height() * scale_); - if (net_input_width_ % 8 != 0) { - net_input_width_ += 8 - (net_input_width_ % 8); + if (net_input_width_ % 8 != 0) { + net_input_width_ += 8 - (net_input_width_ % 8); }; if (net_input_height_ % 8 != 0) { net_input_height_ += 8 - (net_input_height_ % 8); @@ -46,22 +45,23 @@ class FacenetOutputKernel : public VideoKernel { grid_height_ = std::ceil(float(net_input_height_) / cell_height_); feature_vector_lengths_ = { - grid_width_ * grid_height_ * num_templates_, // template probabilities - grid_width_ * grid_height_ * num_templates_ * 4, // template adjustments + grid_width_ * grid_height_ * num_templates_, // template probabilities + grid_width_ * grid_height_ * num_templates_ * + 4, // template adjustments }; feature_vector_sizes_ = { - sizeof(f32) * feature_vector_lengths_[0], - sizeof(f32) * feature_vector_lengths_[1], + sizeof(f32) * feature_vector_lengths_[0], + sizeof(f32) * feature_vector_lengths_[1], }; } - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { - check_frame_info(CPU_DEVICE, input_columns[1]); - - i32 input_count = (i32)input_columns[0].rows.size(); + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + 
auto& frame_col = input_columns[0]; + auto& orig_frame_info_col = input_columns[1]; + check_frame_info(CPU_DEVICE, orig_frame_info_col[0]); - assert(input_columns.size() >= 2); + i32 input_count = (i32)frame_col.size(); std::vector valid_templates = regular_valid_templates_; if (scale_ > 1.0) { @@ -70,17 +70,18 @@ class FacenetOutputKernel : public VideoKernel { // Get bounding box data from output feature vector and turn it // into canonical center x, center y, width, height for (i32 b = 0; b < input_count; ++b) { + const Frame* frame = frame_col[b].as_const_frame(); - assert(input_columns[0].rows[b].size == + assert(frame->type == FrameType::F32); + assert(frame->size() == (feature_vector_sizes_[0] + feature_vector_sizes_[1])); std::vector bboxes; // Track confidence per pixel for each category so we can calculate // uncertainty across the frame - f32 *template_confidences = - reinterpret_cast(input_columns[0].rows[b].buffer); - f32 *template_adjustments = - template_confidences + feature_vector_lengths_[0]; + f32* template_confidences = reinterpret_cast(frame->data); + f32* template_adjustments = + template_confidences + feature_vector_lengths_[0]; for (i32 t : valid_templates) { for (i32 xi = 0; xi < grid_width_; ++xi) { @@ -88,36 +89,36 @@ class FacenetOutputKernel : public VideoKernel { i32 vec_offset = xi * grid_height_ + yi; f32 confidence = - template_confidences[t * grid_width_ * grid_height_ + vec_offset]; + template_confidences[t * grid_width_ * grid_height_ + + vec_offset]; // Apply sigmoid to confidence confidence = 1.0 / (1.0 + std::exp(-confidence)); - if (confidence < threshold_) - continue; + if (confidence < threshold_) continue; - f32 x = xi * cell_width_ - 2; - f32 y = yi * cell_height_ - 2; + f32 x = xi * cell_width_ - 1; + f32 y = yi * cell_height_ - 1; f32 width = templates_[t][2] - templates_[t][0] + 1; f32 height = templates_[t][3] - templates_[t][1] + 1; f32 dcx = template_adjustments[(num_templates_ * 0 + t) * - grid_width_ * 
grid_height_ + + grid_width_ * grid_height_ + vec_offset]; x += width * dcx; f32 dcy = template_adjustments[(num_templates_ * 1 + t) * - grid_width_ * grid_height_ + + grid_width_ * grid_height_ + vec_offset]; y += height * dcy; f32 dcw = template_adjustments[(num_templates_ * 2 + t) * - grid_width_ * grid_height_ + + grid_width_ * grid_height_ + vec_offset]; width *= std::exp(dcw); f32 dch = template_adjustments[(num_templates_ * 3 + t) * - grid_width_ * grid_height_ + + grid_width_ * grid_height_ + vec_offset]; height *= std::exp(dch); @@ -140,7 +141,8 @@ class FacenetOutputKernel : public VideoKernel { bbox.set_y2(y + height / 2); bbox.set_score(confidence); - // if (bbox.x1() < 0 || bbox.y1() < 0 || bbox.x2() > frame_info_.width() + // if (bbox.x1() < 0 || bbox.y1() < 0 || bbox.x2() > + // frame_info_.width() // || // bbox.y2() > frame_info_.height()) // continue; @@ -155,13 +157,13 @@ class FacenetOutputKernel : public VideoKernel { // Assume size of a bounding box is the same size as all bounding boxes size_t size; - u8 *buffer; + u8* buffer; serialize_bbox_vector(best_bboxes, buffer, size); - output_columns[0].rows.push_back(Row{buffer, size}); + output_columns[0].push_back(Element{buffer, size}); } } -private: + private: f32 scale_; const std::vector regular_valid_templates_ = { 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24}; @@ -181,9 +183,12 @@ class FacenetOutputKernel : public VideoKernel { double threshold_; }; -REGISTER_OP(FacenetOutput).inputs({"facenet_output"}).outputs({"bboxes"}); +REGISTER_OP(FacenetOutput) + .frame_input("facenet_output") + .input("original_frame_info") + .output("bboxes"); + REGISTER_KERNEL(FacenetOutput, FacenetOutputKernel) .device(DeviceType::CPU) .num_devices(1); - } diff --git a/stdlib/caffe/faster_rcnn_kernel.cpp b/stdlib/caffe/faster_rcnn_kernel.cpp new file mode 100644 index 00000000..a742e396 --- /dev/null +++ b/stdlib/caffe/faster_rcnn_kernel.cpp @@ -0,0 +1,32 @@ +#include "scanner/api/op.h" +#include 
"stdlib/caffe/caffe_kernel.h" + +namespace scanner { + +class FasterRCNNKernel : public CaffeKernel { + public: + FasterRCNNKernel(const KernelConfig& config) : CaffeKernel(config) {} + + void net_config() override { + boost::shared_ptr> blob = net_->blob_by_name("im_info"); + f32 buf[3] = {frame_info_.shape[2], frame_info_.shape[1], 1.0}; + f32* blob_data = device_.type == DeviceType::GPU ? blob->mutable_gpu_data() + : blob->mutable_cpu_data(); + memcpy_buffer((u8*)blob_data, device_, (u8*)buf, CPU_DEVICE, + 3 * sizeof(f32)); + } +}; + +REGISTER_OP(FasterRCNN) + .frame_input("caffe_input") + .frame_output("cls_prob") + .frame_output("rois") + .frame_output("fc7"); + +REGISTER_KERNEL(FasterRCNN, FasterRCNNKernel) + .device(DeviceType::CPU) + .num_devices(1); +REGISTER_KERNEL(FasterRCNN, FasterRCNNKernel) + .device(DeviceType::GPU) + .num_devices(1); +} diff --git a/stdlib/caffe/faster_rcnn_output_kernel_cpu.cpp b/stdlib/caffe/faster_rcnn_output_kernel_cpu.cpp new file mode 100644 index 00000000..9fc61d79 --- /dev/null +++ b/stdlib/caffe/faster_rcnn_output_kernel_cpu.cpp @@ -0,0 +1,114 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/types.pb.h" +#include "scanner/util/bbox.h" +#include "scanner/util/opencv.h" +#include "scanner/util/serialize.h" +#include "stdlib/stdlib.pb.h" + +namespace scanner { + +#define CLASSES 81 +#define SCORE_THRESHOLD 0.7 +#define BOX_SIZE 5 +#define FEATURES 4096 + +class FasterRCNNOutputKernel : public BatchedKernel { + public: + FasterRCNNOutputKernel(const KernelConfig& config) : BatchedKernel(config) {} + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + assert(input_columns.size() == 3); + + i32 input_count = num_rows(input_columns[0]); + i32 cls_prob_idx = 0; + i32 rois_idx = 1; + i32 fc7_idx = 2; + const ElementList &cls_prob = input_columns[cls_prob_idx], + &rois = input_columns[rois_idx], + &fc7 = input_columns[fc7_idx]; + + for (i32 i = 0; i < 
input_count; ++i) { + const Frame* cls_p = cls_prob[i].as_const_frame(); + const Frame* roi = rois[i].as_const_frame(); + const Frame* fc = fc7[i].as_const_frame(); + + i32 proposal_count = roi->size() / (BOX_SIZE * sizeof(f32)); + assert(roi->size() == BOX_SIZE * sizeof(f32) * proposal_count); + assert(cls_p->size() == CLASSES * sizeof(f32) * proposal_count); + std::vector bboxes; + for (i32 j = 0; j < proposal_count; ++j) { + f32* ro = (f32*)(roi->data + (j * BOX_SIZE * sizeof(f32))); + f32 x1 = ro[1], y1 = ro[2], x2 = ro[3], y2 = ro[4]; + + BoundingBox bbox; + bbox.set_x1(x1); + bbox.set_y1(y1); + bbox.set_x2(x2); + bbox.set_y2(y2); + + f32 max_score = std::numeric_limits::min(); + i32 max_cls = 0; + // Start at cls = 1 to skip background + for (i32 cls = 1; cls < CLASSES; ++cls) { + f32* scores = + (f32*)(cls_p->data + (j * CLASSES * sizeof(f32))); + f32 score = scores[cls]; + if (score > max_score) { + max_score = score; + max_cls = cls; + } + } + + if (max_score > SCORE_THRESHOLD) { + assert(max_cls != 0); + bbox.set_score(max_score); + bbox.set_track_id(j); + bbox.set_label(max_cls); + bboxes.push_back(bbox); + } + } + + std::vector best_bboxes; + best_bboxes = best_nms(bboxes, 0.3); + + { + size_t size; + u8* buffer; + serialize_bbox_vector(best_bboxes, buffer, size); + insert_element(output_columns[0], buffer, size); + } + + if (best_bboxes.size() == 0) { + u8* buffer = new_buffer(CPU_DEVICE, 1); + insert_element(output_columns[1], buffer, 1); + } else { + { + size_t size = + std::max(best_bboxes.size() * FEATURES * sizeof(f32), (size_t)1); + u8* buffer = new_buffer(CPU_DEVICE, size); + for (i32 k = 0; k < best_bboxes.size(); ++k) { + i32 j = best_bboxes[k].track_id(); + f32* fvec = (f32*)(fc->data + (j * FEATURES * sizeof(f32))); + std::memcpy(buffer + (k * FEATURES * sizeof(f32)), fvec, + FEATURES * sizeof(f32)); + } + insert_element(output_columns[1], buffer, size); + } + } + } + } +}; + +REGISTER_OP(FasterRCNNOutput) + .frame_input("cls_prob") + 
.frame_input("rois") + .frame_input("fc7") + .output("bboxes") + .output("features"); + +REGISTER_KERNEL(FasterRCNNOutput, FasterRCNNOutputKernel) + .device(DeviceType::CPU) + .num_devices(Kernel::UnlimitedDevices); +} diff --git a/stdlib/caffe/faster_rcnn_parser_evaluator.cpp b/stdlib/caffe/faster_rcnn_parser_evaluator.cpp deleted file mode 100644 index 94a862a3..00000000 --- a/stdlib/caffe/faster_rcnn_parser_evaluator.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include "scanner/evaluators/caffe/faster_rcnn/faster_rcnn_parser_evaluator.h" -#include "scanner/evaluators/serialize.h" -#include "scanner/util/bbox.h" - -namespace scanner { - -#define CLASSES 81 -#define SCORE_THRESHOLD 0.8 -#define BOX_SIZE 5 -#define FEATURES 4096 - -void FasterRCNNParserEvaluator::evaluate(const BatchedColumns &input_columns, - BatchedColumns &output_columns) { - assert(input_columns.size() == 4); - - i32 input_count = input_columns[0].rows.size(); - i32 cls_prob_idx = 1; - i32 rois_idx = 2; - i32 fc7_idx = 3; - const std::vector &cls_prob = input_columns[cls_prob_idx].rows, - &rois = input_columns[rois_idx].rows, - &fc7 = input_columns[fc7_idx].rows; - - for (i32 i = 0; i < input_count; ++i) { - i32 proposal_count = - input_columns[rois_idx].rows[i].size / (BOX_SIZE * sizeof(f32)); - assert(rois[i].size == BOX_SIZE * sizeof(f32) * proposal_count); - assert(cls_prob[i].size == CLASSES * sizeof(f32) * proposal_count); - std::vector bboxes; - for (i32 j = 0; j < proposal_count; ++j) { - f32 *roi = (f32 *)(rois[i].buffer + (j * BOX_SIZE * sizeof(f32))); - f32 x1 = roi[1], y1 = roi[2], x2 = roi[3], y2 = roi[4]; - - BoundingBox bbox; - bbox.set_x1(x1); - bbox.set_y1(y1); - bbox.set_x2(x2); - bbox.set_y2(y2); - - f32 max_score; - // Start at cls = 1 to skip background - for (i32 cls = 1; cls < CLASSES; ++cls) { - f32 *scores = (f32 *)(cls_prob[i].buffer + (j * CLASSES * sizeof(f32))); - f32 score = scores[cls]; - if (score > SCORE_THRESHOLD) { - bbox.set_score(score); - bbox.set_track_id(j); - 
bbox.set_label(cls); - bboxes.push_back(bbox); - break; - } - } - } - - std::vector best_bboxes; - best_bboxes = best_nms(bboxes, 0.3); - - { - size_t size; - u8 *buffer; - serialize_bbox_vector(best_bboxes, buffer, size); - output_columns[0].rows.push_back(Row{buffer, size}); - } - - { - size_t size = - std::max(best_bboxes.size() * FEATURES * sizeof(f32), (size_t)1); - u8 *buffer = new_buffer(CPU_DEVICE, size); - for (i32 k = 0; k < best_bboxes.size(); ++k) { - i32 j = best_bboxes[k].track_id(); - f32 *fvec = (f32 *)(fc7[i].buffer + (j * FEATURES * sizeof(f32))); - std::memcpy(buffer + (k * FEATURES * sizeof(f32)), fvec, - FEATURES * sizeof(f32)); - } - output_columns[1].rows.push_back(Row{buffer, size}); - } - } -} - -EvaluatorCapabilities FasterRCNNParserEvaluatorFactory::get_capabilities() { - EvaluatorCapabilities caps; - caps.device_type = DeviceType::CPU; - caps.max_devices = 1; - caps.warmup_size = 0; - return caps; -} - -std::vector FasterRCNNParserEvaluatorFactory::get_output_columns( - const std::vector &input_column) { - return {"bboxes", "fc7"}; -} - -Evaluator * -FasterRCNNParserEvaluatorFactory::new_evaluator(const EvaluatorConfig &config) { - return new FasterRCNNParserEvaluator; -} -} diff --git a/stdlib/caffe/faster_rcnn_parser_evaluator.h b/stdlib/caffe/faster_rcnn_parser_evaluator.h deleted file mode 100644 index 94ef63a8..00000000 --- a/stdlib/caffe/faster_rcnn_parser_evaluator.h +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once - -#include "scanner/eval/evaluator.h" -#include "scanner/eval/evaluator_factory.h" - -namespace scanner { - -class FasterRCNNParserEvaluator : public Evaluator { - public: - void evaluate(const BatchedColumns& input_columns, - BatchedColumns& output_columns) override; -}; - -class FasterRCNNParserEvaluatorFactory : public EvaluatorFactory { - public: - EvaluatorCapabilities get_capabilities() override; - - std::vector get_output_columns( - const std::vector& input_columns) override; - - Evaluator* new_evaluator(const 
EvaluatorConfig& config) override; -}; -} diff --git a/stdlib/caffe/openpose_kernel.cpp b/stdlib/caffe/openpose_kernel.cpp new file mode 100644 index 00000000..5fa3ef72 --- /dev/null +++ b/stdlib/caffe/openpose_kernel.cpp @@ -0,0 +1,142 @@ +#include +#include +#include + +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/cuda.h" +#include "scanner/util/opencv.h" +#include "stdlib/stdlib.pb.h" + +namespace scanner { + +const int POSE_KEYPOINTS = 18; +const int FACE_KEYPOINTS = 70; +const int HAND_KEYPOINTS = 21; +const int TOTAL_KEYPOINTS = + POSE_KEYPOINTS + FACE_KEYPOINTS + 2 * HAND_KEYPOINTS; + +class OpenPoseKernel : public scanner::BatchedKernel, + public scanner::VideoKernel { + public: + OpenPoseKernel(const scanner::KernelConfig& config) + : scanner::BatchedKernel(config), + opWrapper_{op::ThreadManagerMode::Asynchronous}, + device_(config.devices[0]) { + + proto::OpenPoseArgs args; + args.ParseFromArray(config.args.data(), config.args.size()); + + const op::WrapperStructPose wrapperStructPose{true, + {-1, 368}, + {-1, -1}, + op::ScaleMode::ZeroToOne, + 1, + device_.id, + args.pose_num_scales(), + args.pose_scale_gap(), + op::RenderMode::None, + op::PoseModel::COCO_18, + false, + 0.6, + 0.7, + 0, + args.model_directory(), + {op::HeatMapType::Parts}, + op::ScaleMode::ZeroToOne, + 0.05, + false}; + + const op::WrapperStructFace wrapperStructFace{ + args.compute_face(), {368, 368}, op::RenderMode::None, 0.6, 0.7, 0.2}; + + const op::WrapperStructHand wrapperStructHand{args.compute_hands(), + {368, 368}, + args.hand_num_scales(), + args.hand_scale_gap(), + false, + op::RenderMode::None, + 0.6, + 0.7, + 0.2}; + + opWrapper_.configure(wrapperStructPose, wrapperStructFace, + wrapperStructHand, op::WrapperStructInput{}, + op::WrapperStructOutput{}); + opWrapper_.start(); + } + + void execute(const scanner::BatchedColumns& input_columns, + scanner::BatchedColumns& output_columns) override { + auto& frame_col = input_columns[0]; + + 
auto datumsPtr = std::make_shared>(); + for (int i = 0; i < num_rows(frame_col); ++i) { + datumsPtr->emplace_back(); + auto& datum = datumsPtr->at(datumsPtr->size() - 1); + CUDA_PROTECT({ + cv::cuda::GpuMat gpu_input = + scanner::frame_to_gpu_mat(frame_col[i].as_const_frame()); + datum.cvInputData = cv::Mat(gpu_input); + }); + } + + bool emplaced = opWrapper_.waitAndEmplace(datumsPtr); + LOG_IF(FATAL, !emplaced) << "Failed to emplace pose work"; + std::shared_ptr> datumProcessed; + bool popped = opWrapper_.waitAndPop(datumProcessed); + LOG_IF(FATAL, !popped) << "Failed to pop pose results"; + + for (auto& datum : *datumProcessed) { + int num_people = datum.poseKeypoints.getSize(0); + size_t size = + num_people > 0 ? TOTAL_KEYPOINTS * num_people * 3 * sizeof(float) : 1; + float* kp = new float[size / sizeof(float)]; + std::memset(kp, 0, size); + float* curr_kp = kp; + for (int i = 0; i < num_people; ++i) { + std::memcpy(curr_kp, + datum.poseKeypoints.getPtr() + i * POSE_KEYPOINTS * 3, + POSE_KEYPOINTS * 3 * sizeof(float)); + curr_kp += POSE_KEYPOINTS * 3; + if (datum.faceKeypoints.getPtr() != nullptr) { + std::memcpy(curr_kp, + datum.faceKeypoints.getPtr() + i * FACE_KEYPOINTS * 3, + FACE_KEYPOINTS * 3 * sizeof(float)); + } + curr_kp += FACE_KEYPOINTS * 3; + if (datum.handKeypoints[0].getPtr() != nullptr) { + std::memcpy(curr_kp, + datum.handKeypoints[0].getPtr() + i * HAND_KEYPOINTS * 3, + HAND_KEYPOINTS * 3 * sizeof(float)); + } + curr_kp += HAND_KEYPOINTS * 3; + if (datum.handKeypoints[1].getPtr() != nullptr) { + std::memcpy(curr_kp, + datum.handKeypoints[1].getPtr() + i * HAND_KEYPOINTS * 3, + HAND_KEYPOINTS * 3 * sizeof(float)); + } + curr_kp += HAND_KEYPOINTS * 3; + } + + float* gpu_kp = (float*)scanner::new_buffer(device_, size); + scanner::memcpy_buffer((scanner::u8*)gpu_kp, device_, (scanner::u8*)kp, + scanner::CPU_DEVICE, size); + scanner::insert_element(output_columns[0], (scanner::u8*)gpu_kp, size); + delete[] kp; + } + } + + private: + 
scanner::DeviceHandle device_; + op::Wrapper> opWrapper_; +}; + +REGISTER_OP(OpenPose).frame_input("frame").output("pose"); + +REGISTER_KERNEL(OpenPose, OpenPoseKernel) + .device(scanner::DeviceType::GPU) + .num_devices(1) + .batch(); + +} // namespace scanner diff --git a/stdlib/caffe/yolo_output_kernel_cpu.cpp b/stdlib/caffe/yolo_output_kernel_cpu.cpp new file mode 100644 index 00000000..a16375b8 --- /dev/null +++ b/stdlib/caffe/yolo_output_kernel_cpu.cpp @@ -0,0 +1,175 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/types.pb.h" +#include "scanner/util/bbox.h" +#include "scanner/util/opencv.h" +#include "scanner/util/serialize.h" +#include "stdlib/stdlib.pb.h" + +namespace scanner { + +class YoloOutputKernel : public BatchedKernel, public VideoKernel { + public: + YoloOutputKernel(const KernelConfig& config) : BatchedKernel(config) { + categories_ = { + "aeroplane", "bicycle", "bird", "boat", "bottle", + "bus", "car", "cat", "chair", "cow", + "diningtable", "dog", "horse", "motorbike", "person", + "pottedplant", "sheep", "sofa", "train", "tvmonitor", + }; + num_categories_ = static_cast(categories_.size()); + + input_width_ = 448; + input_height_ = 448; + grid_width_ = 7; + grid_height_ = 7; + cell_width_ = input_width_ / grid_width_; + cell_height_ = input_height_ / grid_height_; + num_bboxes_ = 2; + + feature_vector_lengths_ = { + grid_width_ * grid_height_ * num_categories_, // category confidences + grid_width_ * grid_height_ * num_bboxes_, // objectness + grid_width_ * grid_height_ * num_bboxes_ * 4 // bbox attributes + }; + feature_vector_sizes_ = { + sizeof(f32) * feature_vector_lengths_[0], + sizeof(f32) * feature_vector_lengths_[1], + sizeof(f32) * feature_vector_lengths_[2], + }; + + threshold_ = 0.5; + } + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + i32 input_count = (i32)num_rows(input_columns[0]); + for (i32 i = 0; i < input_count; ++i) { + 
assert(input_columns[0][i].as_const_frame()->size() == + (feature_vector_sizes_[0] + feature_vector_sizes_[1] + + feature_vector_sizes_[2])); + f32* category_confidences_vector = + reinterpret_cast(input_columns[0][i].as_const_frame()->data); + f32* objectness_vector = + category_confidences_vector + feature_vector_lengths_[0]; + f32* bbox_vector = objectness_vector + feature_vector_lengths_[1]; + + std::vector pixel_confidences( + input_height_ * input_width_ * num_categories_, 0.0f); + + // Get bounding box data from output feature vector and turn it + // into canonical center x, center y, width, height + std::vector bboxes; + for (i32 yi = 0; yi < grid_height_; ++yi) { + for (i32 xi = 0; xi < grid_width_; ++xi) { + for (i32 bi = 0; bi < num_bboxes_; ++bi) { + i32 vec_offset = yi * grid_width_ + xi; + + f32 x = ((xi + bbox_vector[(vec_offset)*num_bboxes_ + bi * 4 + 0]) / + grid_width_) * + input_width_; + f32 y = ((yi + bbox_vector[(vec_offset)*num_bboxes_ + bi * 4 + 1]) / + grid_height_) * + input_height_; + + f32 width = + std::pow(bbox_vector[(vec_offset)*num_bboxes_ + bi * 4 + 3], + 2) * + input_width_; + f32 height = + std::pow(bbox_vector[(vec_offset)*num_bboxes_ + bi * 4 + 4], + 2) * + input_height_; + + std::vector category_probabilities(num_categories_); + for (i32 c = 0; c < num_categories_; ++c) { + f64 prob = objectness_vector[vec_offset * num_bboxes_ + bi] * + category_confidences_vector[vec_offset + c]; + category_probabilities[c] = prob; + + if (prob < threshold_) continue; + + for (i32 bbox_y = std::max(y - height / 2, 0.0f); + bbox_y < std::min(y + height / 2, (f32)input_height_); + ++bbox_y) { + for (i32 bbox_x = std::max(x - width / 2, 0.0f); + bbox_x < std::min(x + width / 2, (f32)input_width_); + ++bbox_x) { + f32& max_confidence = + pixel_confidences[bbox_y * input_width_ + + bbox_x * num_categories_ + c]; + if (prob > max_confidence) { + max_confidence = prob; + } + } + } + + if (width < 0 || height < 0) continue; + + BoundingBox bbox; + 
bbox.set_x1(x); + bbox.set_y1(y); + bbox.set_x2(x + width); + bbox.set_y2(y + height); + bbox.set_score(prob); + bbox.set_label(c); + bboxes.push_back(bbox); + } + } + } + } + + i32 non_thresholded_pixels = 1; + f64 certainty = 0.0f; + for (i32 yi = 0; yi < input_height_; ++yi) { + for (i32 xi = 0; xi < input_width_; ++xi) { + // For each pixel, compute difference between two most + // confident classes + f32 max1 = std::numeric_limits::lowest(); + f32 max2 = std::numeric_limits::lowest(); + for (i32 c = 0; c < num_categories_; ++c) { + const f32& confidence = + pixel_confidences[yi * input_width_ + xi * num_categories_ + c]; + if (confidence > max1) { + max2 = max1; + max1 = confidence; + } else if (confidence > max2) { + max2 = confidence; + } + } + certainty += (max1 - max2); + if (max1 > threshold_ || max2 > threshold_) { + non_thresholded_pixels++; + } + } + } + + size_t size; + u8* buffer; + serialize_bbox_vector(bboxes, buffer, size); + insert_element(output_columns[0], buffer, size); + } + } + + private: + std::vector categories_; + i32 num_categories_; + i32 input_width_; + i32 input_height_; + i32 grid_width_; + i32 grid_height_; + i32 cell_width_; + i32 cell_height_; + i32 num_bboxes_; + std::vector feature_vector_lengths_; + std::vector feature_vector_sizes_; + + double threshold_; +}; + +REGISTER_OP(YoloOutput).frame_input("caffe_output").output("bboxes"); + +REGISTER_KERNEL(YoloOutput, YoloOutputKernel) + .device(DeviceType::CPU) + .num_devices(1); +} diff --git a/stdlib/gipuma/CMakeLists.txt b/stdlib/gipuma/CMakeLists.txt new file mode 100644 index 00000000..417daeba --- /dev/null +++ b/stdlib/gipuma/CMakeLists.txt @@ -0,0 +1,10 @@ +find_package(Gipuma REQUIRED) + +set(SOURCE_FILES gipuma_kernel.cpp) + +add_library(gipuma OBJECT ${SOURCE_FILES}) + +target_include_directories(gipuma PUBLIC "${GIPUMA_INCLUDE_DIRS}") +list(APPEND STDLIB_LIBRARIES "${GIPUMA_LIBRARIES}") + +set(STDLIB_LIBRARIES ${STDLIB_LIBRARIES} PARENT_SCOPE) diff --git 
a/stdlib/gipuma/gipuma_kernel.cpp b/stdlib/gipuma/gipuma_kernel.cpp new file mode 100644 index 00000000..518905e8 --- /dev/null +++ b/stdlib/gipuma/gipuma_kernel.cpp @@ -0,0 +1,257 @@ +#include +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/cuda.h" +#include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "stdlib/stdlib.pb.h" + +#include "gipuma/cameraGeometryUtils.h" +#include "gipuma/gipuma.h" + +namespace scanner { + +class GipumaKernel : public VideoKernel { + public: + GipumaKernel(const KernelConfig& config) + : VideoKernel(config), device_(config.devices[0]), was_reset_(true) { + set_device(); + + state_.reset(new GlobalState); + algo_params_ = new AlgorithmParameters; + + valid_.set_success(true); + if (!args_.ParseFromArray(config.args.data(), config.args.size())) { + RESULT_ERROR(&valid_, "GipumaKernel could not parse protobuf args"); + return; + } + + num_cameras_ = config.input_columns.size() / 3; + algo_params_->num_img_processed = num_cameras_; + algo_params_->min_angle = 1.00; + algo_params_->max_angle = 70.00; + + algo_params_->min_disparity = args_.min_disparity(); + algo_params_->max_disparity = args_.max_disparity(); + algo_params_->depthMin = args_.min_depth(); + algo_params_->depthMax = args_.max_depth(); + algo_params_->iterations = args_.iterations(); + algo_params_->box_hsize = args_.kernel_width(); + algo_params_->box_vsize = args_.kernel_height(); + + algo_params_->n_best = 3; + algo_params_->normTol = 0.1f; + } + + ~GipumaKernel() { delete algo_params_; } + + void validate(proto::Result* result) { + result->set_msg(valid_.msg()); + result->set_success(valid_.success()); + } + + void reset() { + camera_params_ = CameraParameters(); + delete state_->cameras; + state_->cameras = new CameraParameters_cu; + was_reset_ = true; + } + + void setup_gipuma(const BatchedColumns& input_columns) { + i32 frame_width = frame_info_.width(); + i32 frame_height = frame_info_.height(); + + // Read 
camera calibration matrix from columns + for (i32 i = 0; i < num_cameras_; ++i) { + i32 col_idx = 2 + 3 * i; + auto& calibration_col = input_columns[col_idx]; + // Read camera parameters from camera calibration column + + u8* buffer = new_buffer(CPU_DEVICE, calibration_col.rows[0].size); + memcpy_buffer((u8*)buffer, CPU_DEVICE, calibration_col.rows[0].buffer, + device_, calibration_col.rows[0].size); + proto::Camera cam; + cam.ParseFromArray(buffer, calibration_col.rows[0].size); + delete_buffer(CPU_DEVICE, buffer); + + camera_params_.cameras.emplace_back(); + auto& c = camera_params_.cameras.back(); + for (i32 i = 0; i < 3; ++i) { + for (i32 j = 0; j < 4; ++j) { + i32 idx = i * 4 + j; + c.P(i, j) = cam.p(idx); + } + } + } + camera_params_ = getCameraParameters(*(state_->cameras), camera_params_); + + selectViews(camera_params_, frame_width, frame_height, *algo_params_); + i32 selected_views = camera_params_.viewSelectionSubset.size(); + assert(selected_views > 0); + printf("Num cameras selected %d\n", selected_views); + + for (i32 i = 0; i < num_cameras_; ++i) { + camera_params_.cameras[i].depthMin = algo_params_->depthMin; + camera_params_.cameras[i].depthMax = algo_params_->depthMax; + state_->cameras->cameras[i].depthMin = algo_params_->depthMin; + state_->cameras->cameras[i].depthMax = algo_params_->depthMax; + + algo_params_->min_disparity = disparityDepthConversion( + camera_params_.f, camera_params_.cameras[i].baseline, + camera_params_.cameras[i].depthMax); + + algo_params_->max_disparity = disparityDepthConversion( + camera_params_.f, camera_params_.cameras[i].baseline, + camera_params_.cameras[i].depthMin); + } + + for (i32 i = 0; i < selected_views; ++i) { + state_->cameras->viewSelectionSubset[i] = + camera_params_.viewSelectionSubset[i]; + } + + state_->params = algo_params_; + state_->cameras->viewSelectionSubsetNumber = selected_views; + + state_->cameras->cols = frame_width; + state_->cameras->rows = frame_height; + algo_params_->cols = 
frame_width; + algo_params_->rows = frame_height; + + // Resize lines + state_->lines->n = frame_height * frame_width; + state_->lines->resize(frame_height * frame_width); + state_->lines->s = frame_width; + state_->lines->l = frame_width; + } + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + set_device(); + + auto& frame_info = input_columns[1]; + check_frame_info(device_, frame_info); + + if (was_reset_) { + setup_gipuma(input_columns); + } + + i32 width = frame_info_.width(); + i32 height = frame_info_.height(); + i32 points_output_size = width * height * sizeof(float4); + i32 cost_output_size = width * height * sizeof(float); + + i32 input_count = (i32)input_columns[0].rows.size(); + std::vector grayscale_images_gpu(num_cameras_); + std::vector grayscale_images(num_cameras_); + u8* points_output_buffer = new_block_buffer( + device_, points_output_size * input_count, input_count); + u8* cost_output_buffer = + new_block_buffer(device_, cost_output_size * input_count, input_count); + for (i32 i = 0; i < input_count; ++i) { + for (i32 c = 0; c < num_cameras_; ++c) { + auto& frame_column = input_columns[c * 3]; + cvc::GpuMat frame_input(frame_info_.height(), frame_info_.width(), + CV_8UC3, frame_column.rows[i].buffer); + assert(frame_column.rows[i].size == width * height * 3); + + grayscale_images[c] = + cv::Mat(frame_info_.height(), frame_info_.width(), CV_8UC3); + frame_input.download(grayscale_images[c]); + cv::cvtColor(grayscale_images[c], grayscale_images[c], CV_BGR2GRAY, 0); + grayscale_images[c].convertTo(grayscale_images[c], CV_32FC1); + } + + addImageToTextureFloatGray(grayscale_images, state_->imgs, + state_->cuArray); + + runcuda(*state_.get()); + + // Copy estiamted points to output buffer + cudaMemcpy(points_output_buffer + points_output_size * i, + state_->lines->norm4, points_output_size, cudaMemcpyDefault); + insert_element(output_columns[0], + points_output_buffer + points_output_size * i, + 
points_output_size); + + // Copy costs to output buffer + cudaMemcpy(cost_output_buffer + cost_output_size * i, state_->lines->c, + cost_output_size, cudaMemcpyDefault); + insert_element(output_columns[1], + cost_output_buffer + cost_output_size * i, + cost_output_size); + + delTexture(algo_params_->num_img_processed, state_->imgs, + state_->cuArray); + } + } + + void set_device() { + cudaSetDevice(device_.id); + cvc::setDevice(device_.id); + } + + private: + DeviceHandle device_; + proto::Result valid_; + proto::GipumaArgs args_; + CameraParameters camera_params_; + AlgorithmParameters* algo_params_; + std::unique_ptr state_; + i32 num_cameras_; + bool was_reset_; +}; + +REGISTER_OP(Gipuma).variadic_inputs().outputs({"points", "cost"}); + +REGISTER_KERNEL(Gipuma, GipumaKernel).device(DeviceType::GPU).num_devices(1); +} + +// { +// cv::Mat left_cam(3, 3, CV_32F); +// cv::Mat left_rot(3, 3, CV_64F); +// cv::Mat left_t(3, 1, CV_32F); +// // left_cam.at(0, 0) = 745.606; +// // left_cam.at(1, 0) = 0; +// // left_cam.at(2, 0) = 0; +// // left_cam.at(0, 1) = 0; +// // left_cam.at(1, 1) = 746.049; +// // left_cam.at(2, 1) = 0; +// // left_cam.at(0, 2) = 374.278; +// // left_cam.at(1, 2) = 226.198; +// // left_cam.at(2, 2) = 1; + +// // left_rot.at(0, 0) = 0.968079; +// // left_rot.at(1, 0) = -0.0488040; +// // left_rot.at(2, 0) = 0.245846; +// // left_rot.at(0, 1) = 0.0286566; +// // left_rot.at(1, 1) = 0.9959522125; +// // left_rot.at(2, 1) = 0.0852241737; +// // left_rot.at(0, 2) = -0.2490111439; +// // left_rot.at(1, 2) = -0.0754808267; +// // left_rot.at(2, 2) = 0.965554812; + +// // left_t.at(0, 0) = -49.73322; +// // left_t.at(1, 0) = 142.7355424; +// // left_t.at(2, 0) = 288.2857244; +// cv::decomposeProjectionMatrix(camera_params_.cameras[0].P, left_cam, +// left_rot, +// left_t); +// left_rot.convertTo(left_rot, CV_64F); +// cv::Mat_ t(3, 1); +// t(0, 0) = left_t.at(0, 0); +// t(1, 0) = left_t.at(1, 0); +// t(2, 0) = left_t.at(2, 0); +// std::vector 
project_points; +// cv::Mat dist(5, 1, CV_32F); +// dist.at(0) = -0.319142; +// dist.at(1) = 0.0562943; +// dist.at(2) = -0.000819917; +// dist.at(3) = 0.000917149; +// dist.at(4) = 0.054014; +// cv::projectPoints(points, left_rot, t, left_cam, dist, project_points); +// cv::circle(grayscale_images[0], project_points[0], 10, cv::Scalar(255, 0, +// 0), +// 3); +// cv::imwrite("left.png", grayscale_images[0]); +// } diff --git a/stdlib/imgproc/CMakeLists.txt b/stdlib/imgproc/CMakeLists.txt index beced41e..105134a5 100644 --- a/stdlib/imgproc/CMakeLists.txt +++ b/stdlib/imgproc/CMakeLists.txt @@ -1,16 +1,23 @@ set(SOURCE_FILES blur_kernel_cpu.cpp histogram_kernel_cpu.cpp - image_encoder_kernel_cpu.cpp) + montage_kernel_cpu.cpp + image_encoder_kernel_cpu.cpp + image_decoder_kernel_cpu.cpp + resize_kernel.cpp) if (BUILD_CUDA) list(APPEND SOURCE_FILES - histogram_kernel_gpu.cpp) + histogram_kernel_gpu.cpp + montage_kernel_gpu.cpp + feature_extractor_kernel.cpp + feature_matcher_kernel.cpp) + #image_decoder_kernel_gpu.cpp endif() add_library(imgproc OBJECT ${SOURCE_FILES}) -list(APPEND OPENCV_COMPONENTS core highgui imgproc) +list(APPEND OPENCV_COMPONENTS core highgui imgproc xfeatures2d cudafeatures2d cudacodec) set(OPENCV_COMPONENTS ${OPENCV_COMPONENTS} PARENT_SCOPE) set(STDLIB_LIBRARIES ${STDLIB_LIBRARIES} PARENT_SCOPE) diff --git a/stdlib/imgproc/blur_kernel_cpu.cpp b/stdlib/imgproc/blur_kernel_cpu.cpp index 08be06b0..7338ce9c 100644 --- a/stdlib/imgproc/blur_kernel_cpu.cpp +++ b/stdlib/imgproc/blur_kernel_cpu.cpp @@ -13,18 +13,18 @@ * limitations under the License. 
*/ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" -#include "stdlib/stdlib.pb.h" +#include "scanner/api/op.h" #include "scanner/util/memory.h" +#include "stdlib/stdlib.pb.h" #include namespace scanner { -class BlurKernel : public VideoKernel { -public: - BlurKernel(const Kernel::Config &config) : VideoKernel(config) { +class BlurKernel : public Kernel, public VideoKernel { + public: + BlurKernel(const KernelConfig& config) : Kernel(config) { scanner::proto::BlurArgs args; bool parsed = args.ParseFromArray(config.args.data(), config.args.size()); if (!parsed || config.args.size() == 0) { @@ -41,62 +41,45 @@ class BlurKernel : public VideoKernel { valid_.set_success(true); } - void validate(Result* result) override { - result->CopyFrom(valid_); - } + void validate(Result* result) override { result->CopyFrom(valid_); } void new_frame_info() { frame_width_ = frame_info_.width(); frame_height_ = frame_info_.height(); } - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { - i32 input_count = (i32)input_columns[0].rows.size(); - check_frame_info(CPU_DEVICE, input_columns[1]); + void execute(const Columns& input_columns, + Columns& output_columns) override { + auto& frame_col = input_columns[0]; + check_frame(CPU_DEVICE, frame_col); i32 width = frame_width_; i32 height = frame_height_; size_t frame_size = width * height * 3 * sizeof(u8); - - for (i32 i = 0; i < input_count; ++i) { - u8 *input_buffer = input_columns[0].rows[i].buffer; - u8 *output_buffer = new u8[frame_size]; - - u8 *frame_buffer = input_buffer; - u8 *blurred_buffer = (output_buffer); - for (i32 y = filter_left_; y < height - filter_right_; ++y) { - for (i32 x = filter_left_; x < width - filter_right_; ++x) { - for (i32 c = 0; c < 3; ++c) { - u32 value = 0; - for (i32 ry = -filter_left_; ry < filter_right_ + 1; ++ry) { - for (i32 rx = -filter_left_; rx < filter_right_ + 1; ++rx) { - value += frame_buffer[(y + ry) * width * 3 + (x + rx) * 3 + c]; - } 
+ FrameInfo info = frame_col.as_const_frame()->as_frame_info(); + Frame* output_frame = new_frame(CPU_DEVICE, info); + + const u8* frame_buffer = frame_col.as_const_frame()->data; + u8* blurred_buffer = output_frame->data; + for (i32 y = filter_left_; y < height - filter_right_; ++y) { + for (i32 x = filter_left_; x < width - filter_right_; ++x) { + for (i32 c = 0; c < 3; ++c) { + u32 value = 0; + for (i32 ry = -filter_left_; ry < filter_right_ + 1; ++ry) { + for (i32 rx = -filter_left_; rx < filter_right_ + 1; ++rx) { + value += frame_buffer[(y + ry) * width * 3 + (x + rx) * 3 + c]; } - blurred_buffer[y * width * 3 + x * 3 + c] = - value / ((filter_right_ + filter_left_ + 1) * - (filter_right_ + filter_left_ + 1)); } + blurred_buffer[y * width * 3 + x * 3 + c] = + value / ((filter_right_ + filter_left_ + 1) * + (filter_right_ + filter_left_ + 1)); } } - output_columns[0].rows.push_back(Row{output_buffer, frame_size}); - } - FrameInfo info; - info.set_width(frame_width_); - info.set_height(frame_height_); - u8 *buffer = new_block_buffer(CPU_DEVICE, info.ByteSize() * input_count, - input_count); - for (i32 i = 0; i < input_count; ++i) { - Row row; - row.buffer = buffer + i * info.ByteSize(); - row.size = info.ByteSize(); - info.SerializeToArray(row.buffer, row.size); - output_columns[1].rows.push_back(row); } + insert_frame(output_columns[0], output_frame); } -private: + private: i32 kernel_size_; i32 filter_left_; i32 filter_right_; @@ -107,9 +90,7 @@ class BlurKernel : public VideoKernel { Result valid_; }; -REGISTER_OP(Blur) - .inputs({"frame", "frame_info"}) - .outputs({"frame", "frame_info"}); +REGISTER_OP(Blur).frame_input("frame").frame_output("frame"); REGISTER_KERNEL(Blur, BlurKernel).device(DeviceType::CPU).num_devices(1); } diff --git a/stdlib/imgproc/feature_extractor_kernel.cpp b/stdlib/imgproc/feature_extractor_kernel.cpp new file mode 100644 index 00000000..2e2d258d --- /dev/null +++ b/stdlib/imgproc/feature_extractor_kernel.cpp @@ -0,0 +1,132 @@ 
+#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/cuda.h" +#include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "scanner/util/serialize.h" +#include "stdlib/stdlib.pb.h" + +#include + +namespace scanner { + +class FeatureExtractorKernel : public Kernel, public VideoKernel { + public: + FeatureExtractorKernel(const KernelConfig& config) + : Kernel(config), device_(config.devices[0]) { + set_device(); + + if (!args_.ParseFromArray(config.args.data(), config.args.size())) { + LOG(FATAL) << "Failed to parse args"; + } + + if (args_.feature_type() == proto::ExtractorType::SIFT) { + if (device_.type == DeviceType::GPU) { + LOG(FATAL) << "GPU SIFT not supported yet"; + } else { + cpu_extractor_ = cv::xfeatures2d::SIFT::create(); + } + } else if (args_.feature_type() == proto::ExtractorType::SURF) { + if (device_.type == DeviceType::GPU) { + gpu_extractor_ = new cvc::SURF_CUDA(100); + } else { + cpu_extractor_ = cv::xfeatures2d::SURF::create(); + } + } else { + LOG(FATAL) << "Invalid feature type"; + } + } + + void execute(const Columns& input_columns, Columns& output_columns) override { + set_device(); + + auto& frame_col = input_columns[0]; + check_frame(device_, frame_col); + + std::vector keypoints; + std::tuple features; + + cvc::GpuMat feat_gpus; + cv::Mat cv_features; + cvc::GpuMat kp_gpu; + + if (device_.type == DeviceType::GPU) { + if (args_.feature_type() == proto::ExtractorType::SURF) { + cvc::SURF_CUDA* surf = (cvc::SURF_CUDA*)gpu_extractor_; + cvc::GpuMat img = frame_to_gpu_mat(frame_col.as_const_frame()); + cvc::cvtColor(img, img, CV_RGB2GRAY); + (*surf)(img, cvc::GpuMat(), kp_gpu, feat_gpus); + surf->downloadKeypoints(kp_gpu, keypoints); + + LOG_IF(FATAL, !feat_gpus.empty() && feat_gpus.cols != 64) + << "Not 64 SURF columns?"; + + features = std::make_tuple( + feat_gpus.data, feat_gpus.step * feat_gpus.rows); + } else { + LOG(FATAL) << "SIFT GPU not supported"; + } + } else { + cv::Mat img = 
frame_to_mat(frame_col.as_const_frame()); + cv::cvtColor(img, img, CV_RGB2GRAY); + cpu_extractor_->detectAndCompute(img, cv::Mat(), keypoints, + cv_features); + + features = std::make_tuple( + cv_features.data, + cv_features.total() * cv_features.elemSize()); + } + +#define OR_4(N) std::max((N), (size_t)4) + + u8* cv_buf = std::get<0>(features); + size_t size = std::get<1>(features); + u8* output_buf = new_buffer(device_, OR_4(size)); + memcpy_buffer(output_buf, device_, cv_buf, CPU_DEVICE, size); + insert_element(output_columns[0], output_buf, OR_4(size)); + + std::vector kps_proto; + for (auto& kp : keypoints) { + proto::Keypoint kp_proto; + kp_proto.set_x(kp.pt.x); + kp_proto.set_y(kp.pt.y); + kps_proto.push_back(kp_proto); + } + + + output_buf = new_buffer(CPU_DEVICE, OR_4(size)); + serialize_proto_vector(kps_proto, output_buf, size); + if (device_.type == DeviceType::GPU) { + u8* gpu_buf = new_buffer(device_, OR_4(size)); + memcpy_buffer(gpu_buf, device_, output_buf, CPU_DEVICE, size); + delete_buffer(CPU_DEVICE, output_buf); + output_buf = gpu_buf; + } + insert_element(output_columns[1], output_buf, OR_4(size)); + } + + void set_device() { + CUDA_PROTECT({ CU_CHECK(cudaSetDevice(device_.id)); }); + cvc::setDevice(device_.id); + } + + private: + DeviceHandle device_; + proto::FeatureExtractorArgs args_; + void* gpu_extractor_; + cv::Ptr cpu_extractor_; +}; + +REGISTER_OP(FeatureExtractor) + .frame_input("frame") + .output("features") + .output("keypoints"); + +REGISTER_KERNEL(FeatureExtractor, FeatureExtractorKernel) + .device(DeviceType::GPU) + .num_devices(1); + +REGISTER_KERNEL(FeatureExtractor, FeatureExtractorKernel) + .device(DeviceType::CPU) + .num_devices(1); +} diff --git a/stdlib/imgproc/feature_matcher_kernel.cpp b/stdlib/imgproc/feature_matcher_kernel.cpp new file mode 100644 index 00000000..2fd34424 --- /dev/null +++ b/stdlib/imgproc/feature_matcher_kernel.cpp @@ -0,0 +1,219 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" 
+#include "scanner/util/cuda.h" +#include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "scanner/util/serialize.h" +#include "scanner/util/cycle_timer.h" +#include "stdlib/stdlib.pb.h" + +#include + +namespace scanner { + +class Constants { + public: + int w = 32; + int g = 4; + int iw, ih; + int T; + int d; + float tau_c, gamma; + + Constants(int iw, int ih, int T) { + this->iw = iw; + this->ih = ih; + this->T = T; + d = (int)sqrt((float)(ih * ih + iw * iw)); + tau_c = 0.1 * (float)d; + gamma = 0.5 * (float)d; + } +}; + +class FeatureMatcherKernel : public StenciledKernel, public VideoKernel { + public: + FeatureMatcherKernel(const KernelConfig& config) + : StenciledKernel(config), device_(config.devices[0]), C_(0, 0, 0) { + set_device(); + + matcher_ = cvc::DescriptorMatcher::createBFMatcher(); + features_suffix_.resize(C_.w); + kps_suffix_.resize(C_.w); + } + + void new_frame_info() override { + set_device(); + + C_ = Constants(frame_info_.width(), frame_info_.height(), 0); + features_suffix_.clear(); + kps_suffix_.clear(); + features_suffix_.resize(C_.w); + kps_suffix_.resize(C_.w); + } + + void set_device() { + CUDA_PROTECT({ CU_CHECK(cudaSetDevice(device_.id)); }); + cvc::setDevice(device_.id); + } + +protected: + void execute(const StenciledColumns& input_columns, + Columns& output_columns) override { + set_device(); + + auto& features_col = input_columns[0]; + auto& keypoints_col = input_columns[1]; + auto& frame_info_col = input_columns[2]; + check_frame_info(device_, frame_info_col[0]); + + i32 window_size = features_col.size(); + + std::vector features; + std::vector> kps; + + for (i32 i = 0; i < window_size; ++i) { + size_t size = keypoints_col[i].size; + u8* buf = new_buffer(CPU_DEVICE, size); + memcpy_buffer(buf, CPU_DEVICE, keypoints_col[i].buffer, device_, + size); + std::vector kp = + deserialize_proto_vector(buf, size); + kps.push_back(kp); + + size = features_col[i].size; + if (kp.size() == 0) { + 
features.push_back(cvc::GpuMat()); + } else { + i32 step = size / kp.size(); + i32 cols; + if (kp.size() == 1) { + cols = step / sizeof(f32); + } else { + cols = step / (sizeof(f32) * 2); + } + LOG_IF(FATAL, cols != 64) << "Not 64 cols: " << cols; + features.push_back(cvc::GpuMat(kp.size(), cols, CV_32F, + features_col[i].buffer, step)); + } + } + + size_t size = window_size * sizeof(f32); + f32* cost_buf = (f32*)new_buffer(CPU_DEVICE, size); + + std::vector> matches; + matches.resize(window_size); + for (i32 j = 1; j < window_size; j++) { + if (kps[0].size() == 0 || kps[j].size() == 0) { + continue; + } + matcher_->match(features[0], features[j], matches[j]); + } + +#pragma omp parallel for + for (i32 j = 1; j < window_size; j++) { + f32 cost = match_cost(kps[0], kps[j], matches[j]); + cost_buf[j] = cost; + } + + if (device_.type == DeviceType::GPU) { + u8* gpu_buf = new_buffer(device_, size); + memcpy_buffer(gpu_buf, device_, (u8*) cost_buf, CPU_DEVICE, size); + delete_buffer(CPU_DEVICE, (u8*) cost_buf); + cost_buf = (f32*) gpu_buf; + } + + insert_element(output_columns[0], (u8*) cost_buf, size); + } + + private: + float reprojection_error(std::vector& src, + std::vector& dst, cv::Mat& H) { + std::vector dst_proj; + perspectiveTransform(src, dst_proj, H); + int N = src.size(); + cv::Mat dst_proj_m = cv::Mat::zeros(N, 2, CV_32F), + dst_m = cv::Mat::zeros(N, 2, CV_32F); + for (int i = 0; i < N; i++) { + dst_proj_m.at(i, 0) = dst_proj[i].x; + dst_proj_m.at(i, 1) = dst_proj[i].y; + dst_m.at(i, 0) = dst[i].x; + dst_m.at(i, 1) = dst[i].y; + } + cv::Mat diff = dst_m - dst_proj_m; + cv::Mat summed, sq; + reduce(diff.mul(diff), summed, 1, CV_REDUCE_SUM); + sqrt(summed, sq); + return mean(sq)[0]; + } + + float match_cost(std::vector& kp1, + std::vector& kp2, + std::vector& matches) { + if (matches.size() == 0) { + return C_.gamma; + } + + double min_dist = std::numeric_limits::max(); + for (auto& match : matches) { + double dist = match.distance; + if (dist < min_dist) { + 
min_dist = dist; + } + } + + std::vector good_matches; + std::vector fr1, fr2; + for (auto& match : matches) { + if (match.distance <= std::max(1.5 * min_dist, 0.01)) { + good_matches.push_back(match); + fr1.push_back( + cv::Point2f(kp1[match.queryIdx].x(), kp1[match.queryIdx].y())); + fr2.push_back( + cv::Point2f(kp2[match.trainIdx].x(), kp2[match.trainIdx].y())); + } + } + + // Need at least 4 points to find a homography + if (fr1.size() < 4) { + return C_.gamma; + } + + cv::Mat H = cv::findHomography(fr1, fr2, CV_RANSAC); + // If H is empty, then homography could not be found + if (H.rows == 0) { + return C_.gamma; + } + + float cr = reprojection_error(fr1, fr2, H); + + cv::Point2f x(C_.ih / 2), y(C_.iw / 2); + std::vector center = {x, y}; + float co = reprojection_error(center, center, H); + + // LOG(INFO) << "cr: " << cr << ", co: " << co << ", C_.tau_c: " << + // C_.tau_c; + if (cr < C_.tau_c) { + return co; + } else { + return C_.gamma; + } + } + + DeviceHandle device_; + Constants C_; + cv::Ptr matcher_; + std::vector features_suffix_; + std::vector> kps_suffix_; +}; + +REGISTER_OP(FeatureMatcher) + .input("features") + .input("keypoints") + .input("frame_info") + .stencil() + .output("cost_matrix"); + +REGISTER_KERNEL(FeatureMatcher, FeatureMatcherKernel) + .device(DeviceType::GPU) + .num_devices(1); +} diff --git a/stdlib/imgproc/frame_difference_kernel_cpu.cpp b/stdlib/imgproc/frame_difference_kernel_cpu.cpp new file mode 100644 index 00000000..2f960e83 --- /dev/null +++ b/stdlib/imgproc/frame_difference_kernel_cpu.cpp @@ -0,0 +1,81 @@ +/* Copyright 2017 Carnegie Mellon University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" +#include "stdlib/stdlib.pb.h" + +#include + +namespace scanner { + +class FrameDifferenceKernel : public StenciledKernel { + public: + FrameDifferenceKernel(const KernelConfig& config) : StenciledKernel(config) { + valid_.set_success(true); + } + + void validate(Result* result) override { + result->CopyFrom(valid_); + } + + void new_frame_info() { + frame_width_ = frame_info_.width(); + frame_height_ = frame_info_.height(); + } + + void execute(const StenciledColumns& input_columns, + Columns& output_columns) override { + auto& frame_col = input_columns[0]; + check_frame(CPU_DEVICE, frame_col); + + FrameInfo info = frame_col.as_const_frame()->as_frame_info(); + i32 width = info.width(); + i32 height = info.height(); + i32 channels = info.channels(); + size_t frame_size = width * height * channels * sizeof(u8); + + const u8* secondary_frame_buffer = frame_col[0].as_const_frame()->data; + const u8* primary_frame_buffer = frame_col[1].as_const_frame()->data; + + Frame* output_frame = new_frame(CPU_DEVICE, info); + u8* output_buffer = output_frame->data; + for (i32 y = 0; y < height; ++y) { + for (i32 x = 0; x < width; ++x) { + for (i32 c = 0; c < channels; ++c) { + i64 offset = y * width * channels + x * channels + c; + output_buffer[offset] = + primary_frame_buffer[offset] - secondary_frame_buffer[offset]; + } + } + } + insert_frame(output_columns[0], output_frame); + } + + private: + i32 frame_width_; + i32 frame_height_; + Result valid_; +}; + 
+REGISTER_OP(FrameDifference).frame_input("frame").frame_output("frame"); + +REGISTER_KERNEL(FrameDifference, FrameDifferenceKernel) +.device(DeviceType::CPU) +.batch(); +.stencil({-1, 0}); +.num_devices(1); +} diff --git a/stdlib/imgproc/histogram_kernel_cpu.cpp b/stdlib/imgproc/histogram_kernel_cpu.cpp index df6e7ff3..63d5d4bc 100644 --- a/stdlib/imgproc/histogram_kernel_cpu.cpp +++ b/stdlib/imgproc/histogram_kernel_cpu.cpp @@ -1,5 +1,5 @@ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" +#include "scanner/api/op.h" #include "scanner/util/memory.h" #include "scanner/util/opencv.h" @@ -8,50 +8,52 @@ namespace { const i32 BINS = 16; } -class HistogramKernelCPU : public VideoKernel { -public: - HistogramKernelCPU(const Kernel::Config &config) - : VideoKernel(config), device_(config.devices[0]) { - assert(config.input_columns.size() == 2); - } +class HistogramKernelCPU : public BatchedKernel { + public: + HistogramKernelCPU(const KernelConfig& config) + : BatchedKernel(config), device_(config.devices[0]) {} - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { - check_frame_info(device_, input_columns[1]); + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + auto& frame_col = input_columns[0]; - size_t hist_size = BINS * 3 * sizeof(float); - i32 input_count = input_columns[0].rows.size(); - u8 *output_block = + size_t hist_size = BINS * 3 * sizeof(int); + i32 input_count = num_rows(frame_col); + u8* output_block = new_block_buffer(device_, hist_size * input_count, input_count); - cv::Mat tmp; for (i32 i = 0; i < input_count; ++i) { - cv::Mat img(frame_info_.height(), frame_info_.width(), CV_8UC3, - (u8 *)input_columns[0].rows[i].buffer); + cv::Mat img = frame_to_mat(frame_col[i].as_const_frame()); float range[] = {0, 256}; - const float *histRange = {range}; + const float* histRange = {range}; - u8 *output_buf = output_block + i * hist_size; + u8* output_buf = 
output_block + i * hist_size; for (i32 j = 0; j < 3; ++j) { int channels[] = {j}; - cv::Mat out(BINS, 1, CV_32F , output_buf + BINS * sizeof(float)); - cv::calcHist(&img, 1, channels, cv::Mat(), out, 1, &BINS, &histRange); - out.convertTo(out, CV_32S); + cv::Mat hist; + cv::calcHist(&img, 1, channels, cv::Mat(), + hist, + 1, &BINS, + &histRange); + cv::Mat out(BINS, 1, CV_32SC1, output_buf + j * BINS * sizeof(int)); + hist.convertTo(out, CV_32SC1); } - output_columns[0].rows.push_back(Row{output_buf, hist_size}); + insert_element(output_columns[0], output_buf, hist_size); } } -private: + private: DeviceHandle device_; }; -REGISTER_OP(Histogram).inputs({"frame", "frame_info"}).outputs({"histogram"}); +REGISTER_OP(Histogram).frame_input("frame").output("histogram"); REGISTER_KERNEL(Histogram, HistogramKernelCPU) .device(DeviceType::CPU) + .batch() .num_devices(1); } + diff --git a/stdlib/imgproc/histogram_kernel_gpu.cpp b/stdlib/imgproc/histogram_kernel_gpu.cpp index 77b27a84..48973a3b 100644 --- a/stdlib/imgproc/histogram_kernel_gpu.cpp +++ b/stdlib/imgproc/histogram_kernel_gpu.cpp @@ -1,5 +1,5 @@ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" +#include "scanner/api/op.h" #include "scanner/util/cuda.h" #include "scanner/util/memory.h" #include "scanner/util/opencv.h" @@ -9,11 +9,13 @@ namespace { const i32 BINS = 16; } -class HistogramKernelGPU : public VideoKernel { -public: - HistogramKernelGPU(const Kernel::Config &config) - : VideoKernel(config), device_(config.devices[0]), num_cuda_streams_(32), - streams_(num_cuda_streams_) {} +class HistogramKernelGPU : public BatchedKernel, public VideoKernel { + public: + HistogramKernelGPU(const KernelConfig& config) + : BatchedKernel(config), + device_(config.devices[0]), + num_cuda_streams_(32), + streams_(num_cuda_streams_) {} void new_frame_info() override { set_device(); @@ -22,30 +24,31 @@ class HistogramKernelGPU : public VideoKernel { planes_.clear(); for (i32 i = 0; i < 3; ++i) { planes_.push_back( - 
cvc::GpuMat(frame_info_.height(), frame_info_.width(), CV_8UC1)); + cvc::GpuMat(frame_info_.width(), frame_info_.height(), CV_8UC1)); } } - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + auto& frame_col = input_columns[0]; + set_device(); - check_frame_info(device_, input_columns[1]); + check_frame(device_, frame_col[0]); size_t hist_size = BINS * 3 * sizeof(float); - i32 input_count = input_columns[0].rows.size(); - u8 *output_block = + i32 input_count = num_rows(frame_col); + u8* output_block = new_block_buffer(device_, hist_size * input_count, input_count); for (i32 i = 0; i < input_count; ++i) { i32 sid = i % num_cuda_streams_; - cv::cuda::Stream &s = streams_[sid]; + cv::cuda::Stream& s = streams_[sid]; // TODO(wcrichto): implement correctly w/ streams - cvc::GpuMat img(frame_info_.height(), frame_info_.width(), CV_8UC3, - input_columns[0].rows[i].buffer); + cvc::GpuMat img = frame_to_gpu_mat(frame_col[i].as_const_frame()); cvc::split(img, planes_); - u8 *output_buf = output_block + i * hist_size; + u8* output_buf = output_block + i * hist_size; cvc::GpuMat out_mat(1, BINS * 3, CV_32S, output_buf); for (i32 j = 0; j < 3; ++j) { @@ -53,10 +56,10 @@ class HistogramKernelGPU : public VideoKernel { 0, 256); } - output_columns[0].rows.push_back(Row{output_buf, hist_size}); + insert_element(output_columns[0], output_buf, hist_size); } - for (cv::cuda::Stream &s : streams_) { + for (cv::cuda::Stream& s : streams_) { s.waitForCompletion(); } } @@ -66,7 +69,7 @@ class HistogramKernelGPU : public VideoKernel { cvc::setDevice(device_.id); } -private: + private: DeviceHandle device_; i32 num_cuda_streams_; std::vector streams_; @@ -75,5 +78,6 @@ class HistogramKernelGPU : public VideoKernel { REGISTER_KERNEL(Histogram, HistogramKernelGPU) .device(DeviceType::GPU) + .batch() .num_devices(1); } diff --git 
a/stdlib/imgproc/image_decoder_kernel_cpu.cpp b/stdlib/imgproc/image_decoder_kernel_cpu.cpp new file mode 100644 index 00000000..2f2ecaa3 --- /dev/null +++ b/stdlib/imgproc/image_decoder_kernel_cpu.cpp @@ -0,0 +1,36 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "stdlib/stdlib.pb.h" + +namespace scanner { + +class ImageDecoderKernelCPU : public BatchedKernel { + public: + ImageDecoderKernelCPU(const KernelConfig& config) : BatchedKernel(config) {} + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + i32 input_count = num_rows(input_columns[0]); + + for (i32 i = 0; i < input_count; ++i) { + std::vector input_buf( + input_columns[0][i].buffer, + input_columns[0][i].buffer + input_columns[0][i].size); + cv::Mat img = cv::imdecode(input_buf, CV_LOAD_IMAGE_COLOR); + LOG_IF(FATAL, img.empty() || !img.data) << "Failed to decode image"; + size_t size = img.total() * img.elemSize(); + Frame* frame = new_frame(CPU_DEVICE, mat_to_frame_info(img)); + std::memcpy(frame->data, img.data, size); + insert_frame(output_columns[0], frame); + } + } +}; + +REGISTER_OP(ImageDecoder).input("img").frame_output("frame"); + +REGISTER_KERNEL(ImageDecoder, ImageDecoderKernelCPU) + .device(DeviceType::CPU) + .num_devices(1); +} diff --git a/stdlib/imgproc/image_decoder_kernel_gpu.cpp b/stdlib/imgproc/image_decoder_kernel_gpu.cpp new file mode 100644 index 00000000..338673bc --- /dev/null +++ b/stdlib/imgproc/image_decoder_kernel_gpu.cpp @@ -0,0 +1,105 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "scanner/util/cuda.h" +#include "stdlib/stdlib.pb.h" + +namespace scanner { + +namespace codec = cv::cudacodec; + +class ImageSource : public codec::RawVideoSource { +public: + ImageSource(const BatchedColumns& input_columns, const cv::Mat& img) + : 
input_columns_(input_columns), img_(img) {} + + bool getNextPacket(unsigned char** data, int* size, bool* endOfFile) override { + const Element& element = input_columns_[0][i_]; + *data = element.buffer; + *size = element.size; + *endOfFile = false; + // Theoretically we should be able to set endOfFile to true at the last + // frame, but the OpenCV VideoReader appears to return false on a valid + // nextFrame request if I do this, so instead I just keep feeding packets + // until the loader thread dies. + i_ = (i_ + 1) % input_columns_[0].size(); + return true; + } + + codec::FormatInfo format() const override { + codec::FormatInfo format_info; + format_info.codec = codec::Codec::JPEG; + format_info.chromaFormat = codec::ChromaFormat::YUV420; + format_info.width = img_.cols; + format_info.height = img_.rows; + return format_info; + } + +private: + int i_ = 0; + const cv::Mat& img_; + const BatchedColumns& input_columns_; +}; + +class ImageDecoderKernelGPU : public Kernel { + public: + ImageDecoderKernelGPU(const Kernel::Config& config) + : Kernel(config), device_(config.devices[0]) { + if (!args_.ParseFromArray(config.args.data(), config.args.size())) { + LOG(FATAL) << "Failed to parse args"; + } + } + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + i32 input_count = num_rows(input_columns[0]); + + set_device(); + + // Assumes all images are the same size + size_t sz = input_columns[0][0].size; + u8* cpu_buf = new_buffer(CPU_DEVICE, sz); + memcpy_buffer(cpu_buf, CPU_DEVICE, input_columns[0][0].buffer, + device_, sz); + std::vector input_buf(cpu_buf, cpu_buf + sz); + cv::Mat img = cv::imdecode(input_buf, CV_LOAD_IMAGE_COLOR); + FrameInfo frame_info(img.rows, img.cols, 3, FrameType::U8); + delete_buffer(CPU_DEVICE, cpu_buf); + + proto::ImageDecoderArgs_ImageType image_type = args_.image_type(); + std::vector frames = new_frames(device_, frame_info, input_count); + + cv::Ptr src = + cv::Ptr(new 
ImageSource(input_columns, img)); + cv::Ptr d_reader = codec::createVideoReader(src); + + for (i32 i = 0; i < input_count; ++i) { + if (image_type == proto::ImageDecoderArgs_ImageType_JPEG) { + cvc::GpuMat gpu_mat = frame_to_gpu_mat(frames[i]); + if (!d_reader->nextFrame(gpu_mat)) { + LOG(FATAL) << "Failed to decode image"; + } + insert_frame(output_columns[0], frames[i]); + } else if (image_type == proto::ImageDecoderArgs_ImageType_ANY) { + LOG(FATAL) << "Not yet supported"; + } else { + LOG(FATAL) << "Invalid image type"; + } + } + } + + void set_device() { + CU_CHECK(cudaSetDevice(device_.id)); + cvc::setDevice(device_.id); + } + +private: + proto::ImageDecoderArgs args_; + DeviceHandle device_; +}; + +REGISTER_KERNEL(ImageDecoder, ImageDecoderKernelGPU) + .device(DeviceType::GPU) + .num_devices(1); +} diff --git a/stdlib/imgproc/image_encoder_kernel_cpu.cpp b/stdlib/imgproc/image_encoder_kernel_cpu.cpp index f8df98f9..7ddbd1d1 100644 --- a/stdlib/imgproc/image_encoder_kernel_cpu.cpp +++ b/stdlib/imgproc/image_encoder_kernel_cpu.cpp @@ -1,33 +1,39 @@ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" +#include "scanner/api/op.h" #include "scanner/util/memory.h" #include "scanner/util/opencv.h" namespace scanner { -class ImageEncoderKernel : public VideoKernel { -public: - ImageEncoderKernel(const Kernel::Config &config) : VideoKernel(config) {} +class ImageEncoderKernel : public BatchedKernel, public VideoKernel { + public: + ImageEncoderKernel(const KernelConfig& config) : BatchedKernel(config) {} + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + auto& frame_col = input_columns[0]; + check_frame(CPU_DEVICE, frame_col[0]); - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { - check_frame_info(CPU_DEVICE, input_columns[1]); + std::vector encode_params; + encode_params.push_back(CV_IMWRITE_JPEG_QUALITY); + encode_params.push_back(100); - i32 input_count = 
input_columns[0].rows.size(); + i32 input_count = num_rows(frame_col); for (i32 i = 0; i < input_count; ++i) { - cv::Mat img(frame_info_.height(), frame_info_.width(), CV_8UC3, - (u8 *)input_columns[0].rows[i].buffer); + cv::Mat img = frame_to_mat(frame_col[i].as_const_frame()); std::vector buf; - bool success = cv::imencode(".png", img, buf); + cv::Mat recolored; + cv::cvtColor(img, recolored, CV_RGB2BGR); + bool success = cv::imencode(".jpg", recolored, buf, encode_params); LOG_IF(FATAL, !success) << "Failed to encode image"; - u8 *output_buf = new_buffer(CPU_DEVICE, buf.size()); + u8* output_buf = new_buffer(CPU_DEVICE, buf.size()); std::memcpy(output_buf, buf.data(), buf.size()); - output_columns[0].rows.push_back(Row{output_buf, buf.size()}); + insert_element(output_columns[0], output_buf, buf.size()); } } }; -REGISTER_OP(ImageEncoder).inputs({"frame", "frame_info"}).outputs({"png"}); +REGISTER_OP(ImageEncoder).frame_input("frame").output("img"); REGISTER_KERNEL(ImageEncoder, ImageEncoderKernel) .device(DeviceType::CPU) diff --git a/stdlib/imgproc/montage_kernel_cpu.cpp b/stdlib/imgproc/montage_kernel_cpu.cpp new file mode 100644 index 00000000..99fa0740 --- /dev/null +++ b/stdlib/imgproc/montage_kernel_cpu.cpp @@ -0,0 +1,117 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "stdlib/stdlib.pb.h" + +namespace scanner { + +class MontageKernelCPU : public BatchedKernel, public VideoKernel { + public: + MontageKernelCPU(const KernelConfig& config) + : BatchedKernel(config), + device_(config.devices[0]), + frames_seen_(0), + montage_width_(0), + montage_buffer_(nullptr) { + valid_.set_success(true); + if (!args_.ParseFromArray(config.args.data(), config.args.size())) { + RESULT_ERROR(&valid_, "MontageKernel could not parse protobuf args"); + return; + } + + num_frames_ = args_.num_frames(); + target_width_ = args_.target_width(); + frames_per_row_ = args_.frames_per_row(); 
+ } + + ~MontageKernelCPU() { + if (montage_buffer_ != nullptr) { + delete_buffer(device_, montage_buffer_); + } + } + + void reset() { + if (montage_width_ != 0) { + if (montage_buffer_ != nullptr) { + delete_buffer(device_, montage_buffer_); + } + montage_buffer_ = + new_buffer(device_, montage_width_ * montage_height_ * 3); + montage_image_ = + cv::Mat(montage_height_, montage_width_, CV_8UC3, montage_buffer_); + montage_image_.setTo(0); + frames_seen_ = 0; + } + } + + void new_frame_info() override { + frame_width_ = frame_info_.width(); + frame_height_ = frame_info_.height(); + + target_height_ = (target_width_ / (1.0 * frame_width_) * frame_height_); + + montage_width_ = frames_per_row_ * target_width_; + montage_height_ = + std::ceil(num_frames_ / (1.0 * frames_per_row_)) * target_height_; + reset(); + } + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + auto& frame_col = input_columns[0]; + check_frame(device_, frame_col[0]); + + assert(montage_buffer_ != nullptr); + i32 input_count = num_rows(frame_col); + for (i32 i = 0; i < input_count; ++i) { + cv::Mat img = frame_to_mat(frame_col[i].as_const_frame()); + i64 x = frames_seen_ % frames_per_row_; + i64 y = frames_seen_ / frames_per_row_; + cv::Mat montage_subimg = + montage_image_(cv::Rect(target_width_ * x, target_height_ * y, + target_width_, target_height_)); + cv::resize(img, montage_subimg, cv::Size(target_width_, target_height_)); + + frames_seen_++; + if (frames_seen_ == num_frames_) { + assert(montage_buffer_ != nullptr); + FrameInfo info(montage_height_, montage_width_, 3, FrameType::U8); + insert_frame(output_columns[0], new Frame(info, montage_buffer_)); + montage_image_ = cv::Mat(); + montage_buffer_ = nullptr; + } else { + FrameInfo info(montage_height_, montage_width_, 3, FrameType::U8); + insert_frame(output_columns[0], new_frame(device_, info)); + } + } + } + + private: + proto::Result valid_; + DeviceHandle device_; + proto::MontageArgs 
args_; + i64 num_frames_; + i32 frame_width_; + i32 frame_height_; + i32 target_width_; + i32 target_height_; + i32 frames_per_row_; + + i64 montage_width_; + i64 montage_height_; + + u8* montage_buffer_; + cv::Mat montage_image_; + i64 frames_seen_; +}; + +REGISTER_OP(Montage) +.frame_input("frame") +.frame_output("montage") +.unbounded_state(); + +REGISTER_KERNEL(Montage, MontageKernelCPU) + .device(DeviceType::CPU) + .num_devices(1); +} diff --git a/stdlib/imgproc/montage_kernel_gpu.cpp b/stdlib/imgproc/montage_kernel_gpu.cpp new file mode 100644 index 00000000..0961e1d8 --- /dev/null +++ b/stdlib/imgproc/montage_kernel_gpu.cpp @@ -0,0 +1,123 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/cuda.h" +#include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "stdlib/stdlib.pb.h" + +namespace scanner { + +class MontageKernelGPU : public BatchedKernel, public VideoKernel { + public: + MontageKernelGPU(const KernelConfig& config) + : BatchedKernel(config), + device_(config.devices[0]), + frames_seen_(0), + montage_width_(0), + montage_buffer_(nullptr) { + valid_.set_success(true); + if (!args_.ParseFromArray(config.args.data(), config.args.size())) { + RESULT_ERROR(&valid_, "MontageKernel could not parse protobuf args"); + return; + } + + num_frames_ = args_.num_frames(); + target_width_ = args_.target_width(); + frames_per_row_ = args_.frames_per_row(); + } + + ~MontageKernelGPU() { + if (montage_buffer_ != nullptr) { + delete_buffer(device_, montage_buffer_); + } + } + + void reset() { + set_device(); + if (montage_width_ != 0) { + if (montage_buffer_ != nullptr) { + delete_buffer(device_, montage_buffer_); + } + montage_buffer_ = + new_buffer(device_, montage_width_ * montage_height_ * 3); + montage_image_ = cvc::GpuMat(montage_height_, montage_width_, CV_8UC3, + montage_buffer_); + montage_image_.setTo(0); + frames_seen_ = 0; + } + } + + void new_frame_info() override { + set_device(); + frame_width_ = 
frame_info_.width(); + frame_height_ = frame_info_.height(); + + target_height_ = (target_width_ / (1.0 * frame_width_) * frame_height_); + + montage_width_ = frames_per_row_ * target_width_; + montage_height_ = + std::ceil(num_frames_ / (1.0 * frames_per_row_)) * target_height_; + reset(); + } + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + auto& frame_col = input_columns[0]; + check_frame(device_, frame_col[0]); + + set_device(); + + assert(montage_buffer_ != nullptr); + i32 input_count = num_rows(frame_col); + for (i32 i = 0; i < input_count; ++i) { + cvc::GpuMat img = frame_to_gpu_mat(frame_col[i].as_const_frame()); + i64 x = frames_seen_ % frames_per_row_; + i64 y = frames_seen_ / frames_per_row_; + cvc::GpuMat montage_subimg = + montage_image_(cv::Rect(target_width_ * x, target_height_ * y, + target_width_, target_height_)); + cvc::resize(img, montage_subimg, cv::Size(target_width_, target_height_)); + + frames_seen_++; + if (frames_seen_ == num_frames_) { + assert(montage_buffer_ != nullptr); + FrameInfo info(montage_height_, montage_width_, 3, FrameType::U8); + insert_frame(output_columns[0], new Frame(info, montage_buffer_)); + montage_image_ = cvc::GpuMat(); + montage_buffer_ = nullptr; + } else { + FrameInfo info(montage_height_, montage_width_, 3, FrameType::U8); + insert_frame(output_columns[0], new_frame(device_, info)); + } + } + } + + void set_device() { + CUDA_PROTECT({ CU_CHECK(cudaSetDevice(device_.id)); }); + cvc::setDevice(device_.id); + } + + private: + proto::Result valid_; + DeviceHandle device_; + proto::MontageArgs args_; + i64 num_frames_; + i32 frame_width_; + i32 frame_height_; + i32 target_width_; + i32 target_height_; + i32 frames_per_row_; + + i64 montage_width_; + i64 montage_height_; + + u8* montage_buffer_; + cvc::GpuMat montage_image_; + i64 frames_seen_; +}; + +REGISTER_KERNEL(Montage, MontageKernelGPU) + .device(DeviceType::GPU) + .batch() + .num_devices(1); +} diff --git 
a/stdlib/imgproc/resize_kernel.cpp b/stdlib/imgproc/resize_kernel.cpp new file mode 100644 index 00000000..039657d4 --- /dev/null +++ b/stdlib/imgproc/resize_kernel.cpp @@ -0,0 +1,84 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/cuda.h" +#include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "stdlib/stdlib.pb.h" + +namespace scanner { + +class ResizeKernel : public BatchedKernel { + public: + ResizeKernel(const KernelConfig& config) + : BatchedKernel(config), device_(config.devices[0]) { + args_.ParseFromArray(config.args.data(), config.args.size()); + } + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + auto& frame_col = input_columns[0]; + set_device(); + + const Frame* frame = frame_col[0].as_const_frame(); + + i32 target_width = args_.width(); + i32 target_height = args_.height(); + if (args_.preserve_aspect()) { + if (target_width == 0) { + target_width = + frame->width() * target_height / frame->height(); + } else { + target_height = + frame->height() * target_width / frame->width(); + } + } + if (args_.min()) { + if (frame->width() <= target_width && + frame->height() <= target_height) { + target_width = frame->width(); + target_height = frame->height(); + } + } + + i32 input_count = num_rows(frame_col); + FrameInfo info(target_height, target_width, 3, FrameType::U8); + std::vector output_frames = new_frames(device_, info, input_count); + + for (i32 i = 0; i < input_count; ++i) { + if (device_.type == DeviceType::CPU) { + cv::Mat img = frame_to_mat(frame_col[i].as_const_frame()); + cv::Mat out_mat = frame_to_mat(output_frames[i]); + cv::resize(img, out_mat, cv::Size(target_width, target_height)); + } else { + CUDA_PROTECT({ + cvc::GpuMat img = frame_to_gpu_mat(frame_col[i].as_const_frame()); + cvc::GpuMat out_mat = frame_to_gpu_mat(output_frames[i]); + cvc::resize(img, out_mat, cv::Size(target_width, target_height)); + }); + } + 
insert_frame(output_columns[0], output_frames[i]); + } + } + + void set_device() { + if (device_.type == DeviceType::GPU) { + CUDA_PROTECT({ + CU_CHECK(cudaSetDevice(device_.id)); + cvc::setDevice(device_.id); + }); + } + } + + private: + DeviceHandle device_; + proto::ResizeArgs args_; +}; + +REGISTER_OP(Resize).frame_input("frame").frame_output("frame"); + +REGISTER_KERNEL(Resize, ResizeKernel).device(DeviceType::CPU).num_devices(1); + +#ifdef HAVE_CUDA +REGISTER_KERNEL(Resize, ResizeKernel).device(DeviceType::GPU).num_devices(1); +#endif +} diff --git a/stdlib/misc/CMakeLists.txt b/stdlib/misc/CMakeLists.txt index 186daa5c..c8cfe1f6 100644 --- a/stdlib/misc/CMakeLists.txt +++ b/stdlib/misc/CMakeLists.txt @@ -1,4 +1,7 @@ set(SOURCE_FILES - discard_kernel.cpp) + info_from_frame_kernel.cpp + discard_kernel.cpp + sleep_kernel.cpp + test_increment_kernel.cpp) add_library(misc OBJECT ${SOURCE_FILES}) diff --git a/stdlib/misc/discard_kernel.cpp b/stdlib/misc/discard_kernel.cpp index 511ade2a..66d59b22 100644 --- a/stdlib/misc/discard_kernel.cpp +++ b/stdlib/misc/discard_kernel.cpp @@ -1,36 +1,42 @@ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" +#include "scanner/api/op.h" #include "scanner/util/memory.h" namespace scanner { -class DiscardKernel : public Kernel { -public: - DiscardKernel(const Kernel::Config &config) - : Kernel(config), device_(config.devices[0]), - work_item_size_(config.work_item_size) {} +class DiscardKernel : public BatchedKernel { + public: + DiscardKernel(const KernelConfig& config) + : BatchedKernel(config), + device_(config.devices[0]) {} - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { - i32 input_count = (i32)input_columns[0].rows.size(); - u8 *output_block = new_block_buffer(device_, 1, input_count); + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + i32 input_count = (i32)num_rows(input_columns[0]); for (i32 i = 0; i < input_count; 
++i) { - output_columns[0].rows.push_back(Row{output_block, 1}); + insert_element(output_columns[0], new_buffer(device_, 1), 1); } } -private: + private: DeviceHandle device_; - i32 work_item_size_; }; -REGISTER_OP(Discard).inputs({"ignore"}).outputs({"dummy"}); +REGISTER_OP(Discard).input("ignore").output("dummy"); + +REGISTER_OP(DiscardFrame).frame_input("ignore").output("dummy"); + +REGISTER_KERNEL(Discard, DiscardKernel).device(DeviceType::CPU).num_devices(1); + +REGISTER_KERNEL(Discard, DiscardKernel).device(DeviceType::GPU).num_devices(1); -REGISTER_KERNEL(Discard, DiscardKernel) +REGISTER_KERNEL(DiscardFrame, DiscardKernel) .device(DeviceType::CPU) + .batch() .num_devices(1); -REGISTER_KERNEL(Discard, DiscardKernel) +REGISTER_KERNEL(DiscardFrame, DiscardKernel) .device(DeviceType::GPU) + .batch() .num_devices(1); } diff --git a/stdlib/misc/info_from_frame_kernel.cpp b/stdlib/misc/info_from_frame_kernel.cpp new file mode 100644 index 00000000..f0225499 --- /dev/null +++ b/stdlib/misc/info_from_frame_kernel.cpp @@ -0,0 +1,44 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" + +namespace scanner { + +class InfoFromFrameKernel : public BatchedKernel { + public: + InfoFromFrameKernel(const KernelConfig& config) + : BatchedKernel(config), + device_(config.devices[0]) {} + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + i32 input_count = (i32)num_rows(input_columns[0]); + u8* output_block = + new_block_buffer(device_, sizeof(FrameInfo) * input_count, input_count); + for (i32 i = 0; i < input_count; ++i) { + const Frame* frame = input_columns[0][i].as_const_frame(); + + u8* buffer = output_block + i * sizeof(FrameInfo); + FrameInfo* info = reinterpret_cast(buffer); + FrameInfo info_cpu = frame->as_frame_info(); + memcpy_buffer((u8*) info, device_, + (u8*) &info_cpu, CPU_DEVICE, + sizeof(FrameInfo)); + insert_element(output_columns[0], buffer, sizeof(FrameInfo)); + } + 
} + + private: + DeviceHandle device_; +}; + +REGISTER_OP(InfoFromFrame).frame_input("frame").output("frame_info"); + +REGISTER_KERNEL(InfoFromFrame, InfoFromFrameKernel) + .device(DeviceType::CPU) + .num_devices(1); + +REGISTER_KERNEL(InfoFromFrame, InfoFromFrameKernel) + .device(DeviceType::GPU) + .num_devices(1); +} diff --git a/stdlib/misc/sleep_kernel.cpp b/stdlib/misc/sleep_kernel.cpp new file mode 100644 index 00000000..776fcc82 --- /dev/null +++ b/stdlib/misc/sleep_kernel.cpp @@ -0,0 +1,33 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" + +namespace scanner { + +class SleepKernel : public Kernel { + public: + SleepKernel(const KernelConfig& config) + : Kernel(config), + device_(config.devices[0]) {} + + void execute(const Columns& input_columns, Columns& output_columns) override { + sleep(2); + insert_element(output_columns[0], new_buffer(device_, 1), 1); + } + + private: + DeviceHandle device_; +}; + +REGISTER_OP(Sleep).input("ignore").output("dummy"); + +REGISTER_OP(SleepFrame).frame_input("ignore").output("dummy"); + +REGISTER_KERNEL(Sleep, SleepKernel).device(DeviceType::CPU).num_devices(1); + +REGISTER_KERNEL(Sleep, SleepKernel).device(DeviceType::GPU).num_devices(1); + +REGISTER_KERNEL(SleepFrame, SleepKernel).device(DeviceType::CPU).num_devices(1); + +REGISTER_KERNEL(SleepFrame, SleepKernel).device(DeviceType::GPU).num_devices(1); +} diff --git a/stdlib/misc/test_increment_kernel.cpp b/stdlib/misc/test_increment_kernel.cpp new file mode 100644 index 00000000..aa93f2f6 --- /dev/null +++ b/stdlib/misc/test_increment_kernel.cpp @@ -0,0 +1,71 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" + +namespace scanner { + +class TestIncrementKernel : public Kernel { + public: + TestIncrementKernel(const KernelConfig& config) + : Kernel(config), + device_(config.devices[0]) {} + + void reset() { + next_int_ = 0; + } + + void execute(const Columns& input_columns, 
+ Columns& output_columns) override { + if (last_row_ + 1 != input_columns[0].index) { + last_row_ = input_columns[0].index - 1; + reset(); + } + last_row_++; + + u8* buffer = new_buffer(device_, sizeof(i64)); + *((i64*)buffer) = next_int_++; + insert_element(output_columns[0], buffer, sizeof(i64)); + } + + private: + DeviceHandle device_; + i64 next_int_ = 0; + i64 last_row_ = 0; +}; + +REGISTER_OP(TestIncrementUnbounded) +.input("ignore") +.output("integer") +.unbounded_state(); + +REGISTER_OP(TestIncrementUnboundedFrame) +.frame_input("ignore") +.output("integer") +.unbounded_state(); + +REGISTER_KERNEL(TestIncrementUnbounded, TestIncrementKernel) +.device(DeviceType::CPU) +.num_devices(1); + +REGISTER_KERNEL(TestIncrementUnboundedFrame, TestIncrementKernel) +.device(DeviceType::CPU) +.num_devices(1); + +REGISTER_OP(TestIncrementBounded) +.input("ignore") +.output("integer") +.bounded_state(); + +REGISTER_OP(TestIncrementBoundedFrame) +.frame_input("ignore") +.output("integer") +.bounded_state(); + +REGISTER_KERNEL(TestIncrementBounded, TestIncrementKernel) +.device(DeviceType::CPU) +.num_devices(1); + +REGISTER_KERNEL(TestIncrementBoundedFrame, TestIncrementKernel) +.device(DeviceType::CPU) +.num_devices(1); +} diff --git a/stdlib/motion/optical_flow_kernel_cpu.cpp b/stdlib/motion/optical_flow_kernel_cpu.cpp index 97dd01fa..b754304d 100644 --- a/stdlib/motion/optical_flow_kernel_cpu.cpp +++ b/stdlib/motion/optical_flow_kernel_cpu.cpp @@ -1,6 +1,5 @@ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" -#include "scanner/util/cycle_timer.h" +#include "scanner/api/op.h" #include "scanner/util/memory.h" #include "scanner/util/opencv.h" @@ -8,74 +7,51 @@ namespace scanner { -class OpticalFlowKernelCPU : public VideoKernel { -public: - OpticalFlowKernelCPU(const Kernel::Config &config) - : VideoKernel(config), device_(config.devices[0]), - work_item_size_(config.work_item_size) { - flow_finder_ = cv::FarnebackOpticalFlow::create(); +class 
OpticalFlowKernelCPU : public StenciledKernel, public VideoKernel { + public: + OpticalFlowKernelCPU(const KernelConfig& config) + : StenciledKernel(config), + device_(config.devices[0]) { + flow_finder_ = + cv::FarnebackOpticalFlow::create(3, 0.5, false, 15, 3, 5, 1.2, 0); } void new_frame_info() override { grayscale_.resize(0); - for (i32 i = 0; i < work_item_size_; ++i) { + for (i32 i = 0; i < 2; ++i) { grayscale_.emplace_back(frame_info_.height(), frame_info_.width(), CV_8UC1); } } - void reset() override { initial_frame_ = cv::Mat(); } - - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { - check_frame_info(device_, input_columns[1]); - - i32 input_count = (i32)input_columns[0].rows.size(); - size_t out_buf_size = - frame_info_.width() * frame_info_.height() * 2 * sizeof(float); - - u8 *output_block = - new_block_buffer(device_, out_buf_size * input_count, input_count); - - for (i32 i = 0; i < input_count; ++i) { - cv::Mat input(frame_info_.height(), frame_info_.width(), CV_8UC3, - input_columns[0].rows[i].buffer); - cv::cvtColor(input, grayscale_[i], CV_BGR2GRAY); - } - - double start = CycleTimer::currentSeconds(); - - cv::Ptr flow_finder = - cv::FarnebackOpticalFlow::create(); - - for (i32 i = 0; i < input_count; ++i) { - cv::Mat flow(frame_info_.height(), frame_info_.width(), CV_32FC2, - output_block + i * out_buf_size); - - if (i == 0) { - if (initial_frame_.empty()) { - output_columns[0].rows.push_back(Row{flow.data, out_buf_size}); - continue; - } else { - flow_finder_->calc(initial_frame_, grayscale_[0], flow); - } - } else { - flow_finder_->calc(grayscale_[i - 1], grayscale_[i], flow); - } - - output_columns[0].rows.push_back(Row{flow.data, out_buf_size}); - } + void execute(const StenciledColumns& input_columns, + Columns& output_columns) override { + auto& frame_col = input_columns[0]; + check_frame(device_, frame_col[0]); + + FrameInfo out_frame_info(frame_info_.height(), frame_info_.width(), 2, + 
FrameType::F32); + Frame* output_frame = new_frame(device_, out_frame_info); + + cv::Mat input0 = frame_to_mat(frame_col[0].as_const_frame()); + cv::Mat input1 = frame_to_mat(frame_col[1].as_const_frame()); + cv::cvtColor(input0, grayscale_[0], CV_BGR2GRAY); + cv::cvtColor(input1, grayscale_[1], CV_BGR2GRAY); + cv::Mat flow = frame_to_mat(output_frame); + flow_finder_->calc(grayscale_[0], grayscale_[1], flow); + insert_frame(output_columns[0], output_frame); } -private: + private: DeviceHandle device_; cv::Ptr flow_finder_; - cv::Mat initial_frame_; std::vector grayscale_; - i32 work_item_size_; }; -REGISTER_OP(OpticalFlow).inputs({"frame", "frame_info"}).outputs({"flow"}); +REGISTER_OP(OpticalFlow) + .frame_input("frame") + .frame_output("flow") + .stencil({0, 1}); REGISTER_KERNEL(OpticalFlow, OpticalFlowKernelCPU) .device(DeviceType::CPU) diff --git a/stdlib/motion/optical_flow_kernel_gpu.cpp b/stdlib/motion/optical_flow_kernel_gpu.cpp index f9cfa968..693dba61 100644 --- a/stdlib/motion/optical_flow_kernel_gpu.cpp +++ b/stdlib/motion/optical_flow_kernel_gpu.cpp @@ -1,5 +1,5 @@ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" +#include "scanner/api/op.h" #include "scanner/util/cuda.h" #include "scanner/util/cycle_timer.h" #include "scanner/util/memory.h" @@ -9,96 +9,90 @@ namespace scanner { -class OpticalFlowKernelGPU : public VideoKernel { -public: - OpticalFlowKernelGPU(const Kernel::Config &config) - : VideoKernel(config), device_(config.devices[0]), - work_item_size_(config.work_item_size), num_cuda_streams_(4) { +class OpticalFlowKernelGPU : public StenciledBatchedKernel, public VideoKernel { + public: + OpticalFlowKernelGPU(const KernelConfig& config) + : StenciledBatchedKernel(config), + device_(config.devices[0]), + num_cuda_streams_(8) { set_device(); + cv::cuda::setBufferPoolUsage(true); + cv::cuda::setBufferPoolConfig(device_.id, 50 * 1024 * 1024, 5); streams_.resize(num_cuda_streams_); for (i32 i = 0; i < num_cuda_streams_; ++i) { - 
flow_finders_.push_back(cvc::FarnebackOpticalFlow::create()); + flow_finders_.push_back( + cvc::FarnebackOpticalFlow::create(3, 0.5, false, 15, 3, 5, 1.2, 0)); } } ~OpticalFlowKernelGPU() { set_device(); + flow_finders_.clear(); + streams_.clear(); + cv::cuda::setBufferPoolConfig(device_.id, 0, 0); + cv::cuda::setBufferPoolUsage(false); } void new_frame_info() override { set_device(); - grayscale_.resize(0); - for (i32 i = 0; i < work_item_size_; ++i) { - grayscale_.emplace_back(frame_info_.height(), frame_info_.width(), - CV_8UC1); - } } void reset() override { set_device(); initial_frame_ = cvc::GpuMat(); - flow_finders_.resize(0); - for (i32 i = 0; i < num_cuda_streams_; ++i) { - flow_finders_.push_back(cvc::FarnebackOpticalFlow::create()); - } } - void execute(const BatchedColumns &input_columns, - BatchedColumns &output_columns) override { + void execute(const StenciledBatchedColumns& input_columns, + BatchedColumns& output_columns) override { set_device(); - check_frame_info(device_, input_columns[1]); - - i32 input_count = (i32)input_columns[0].rows.size(); - size_t out_buf_size = - frame_info_.width() * frame_info_.height() * 2 * sizeof(float); - u8 *output_block = - new_block_buffer(device_, out_buf_size * input_count, input_count); + auto& frame_col = input_columns[0]; + check_frame(device_, frame_col[0][0]); + i32 input_count = (i32)frame_col.size(); + std::vector input_frames; for (i32 i = 0; i < input_count; ++i) { - i32 sid = i % num_cuda_streams_; - cv::cuda::Stream &s = streams_[sid]; - cvc::GpuMat input(frame_info_.height(), frame_info_.width(), CV_8UC3, - input_columns[0].rows[i].buffer); - cvc::cvtColor(input, grayscale_[i], CV_BGR2GRAY, 0, s); + input_frames.push_back(frame_col[i][0].as_const_frame()); } + input_frames.push_back(frame_col.back()[1].as_const_frame()); - for (cv::cuda::Stream &s : streams_) { + grayscale_.resize(input_count + 1); + + FrameInfo out_frame_info(frame_info_.height(), frame_info_.width(), 2, + FrameType::F32); + 
std::vector output_frames = + new_frames(device_, out_frame_info, input_count); + + for (i32 i = 0; i < input_count + 1; ++i) { + i32 sidx = i % num_cuda_streams_; + streams_[sidx].waitForCompletion(); + cvc::GpuMat input = frame_to_gpu_mat(input_frames[i]); + cvc::cvtColor(input, grayscale_[i], CV_BGR2GRAY, 0, streams_[sidx]); + } + for (auto& s : streams_) { s.waitForCompletion(); } - double start = CycleTimer::currentSeconds(); + for (i32 i = 1; i < input_count + 1; ++i) { + i32 sidx = i % num_cuda_streams_; - // FIXME(wcrichto): TVL1 flow doesn't seem to work with multiple Cuda - // streams, segfaults in the TVL1 destructor. Investigate? - for (i32 i = 0; i < input_count; ++i) { - i32 sid = i % num_cuda_streams_; - cv::cuda::Stream &s = streams_[sid]; - cvc::GpuMat flow(frame_info_.height(), frame_info_.width(), CV_32FC2, - output_block + i * out_buf_size); - - if (i == 0) { - if (initial_frame_.empty()) { - output_columns[0].rows.push_back(Row{flow.data, out_buf_size}); - continue; - } else { - flow_finders_[sid]->calc(initial_frame_, grayscale_[0], flow, s); - } - } else { - flow_finders_[sid]->calc(grayscale_[i - 1], grayscale_[i], flow, s); - } - - output_columns[0].rows.push_back(Row{flow.data, out_buf_size}); - } + i32 curr_idx = i; + i32 prev_idx = (i - 1); - grayscale_[input_count - 1].copyTo(initial_frame_); + cvc::GpuMat& input0 = grayscale_[curr_idx]; + cvc::GpuMat& input1 = grayscale_[prev_idx]; - for (cv::cuda::Stream &s : streams_) { + //streams_[sidx].waitForCompletion(); + cvc::GpuMat output_mat = frame_to_gpu_mat(output_frames[i - 1]); + flow_finders_[0]->calc(input0, input1, output_mat); + insert_frame(output_columns[0], output_frames[i - 1]); + } + for (auto& s : streams_) { s.waitForCompletion(); } } -private: + private: void set_device() { CU_CHECK(cudaSetDevice(device_.id)); cvc::setDevice(device_.id); @@ -108,12 +102,12 @@ class OpticalFlowKernelGPU : public VideoKernel { std::vector> flow_finders_; cvc::GpuMat initial_frame_; std::vector 
grayscale_; - i32 work_item_size_; i32 num_cuda_streams_; std::vector streams_; }; REGISTER_KERNEL(OpticalFlow, OpticalFlowKernelGPU) .device(DeviceType::GPU) + .batch() .num_devices(1); } diff --git a/stdlib/motion/tracker_evaluator.cpp b/stdlib/motion/tracker_evaluator.cpp index 16eb5ea4..7b3fc3ae 100644 --- a/stdlib/motion/tracker_evaluator.cpp +++ b/stdlib/motion/tracker_evaluator.cpp @@ -31,17 +31,20 @@ namespace scanner { -TrackerEvaluator::TrackerEvaluator(const EvaluatorConfig &config, +TrackerEvaluator::TrackerEvaluator(const EvaluatorConfig& config, DeviceType device_type, i32 device_id, i32 warmup_count, i32 max_tracks) - : config_(config), device_type_(device_type), device_id_(device_id), - warmup_count_(warmup_count), max_tracks_(max_tracks) { + : config_(config), + device_type_(device_type), + device_id_(device_id), + warmup_count_(warmup_count), + max_tracks_(max_tracks) { if (device_type_ == DeviceType::GPU) { LOG(FATAL) << "GPU tracker support not implemented yet"; } } -void TrackerEvaluator::configure(const BatchConfig &config) { +void TrackerEvaluator::configure(const BatchConfig& config) { VLOG(1) << "Tracker configure"; assert(config.formats.size() == 1); metadata_ = config.formats[0]; @@ -52,8 +55,8 @@ void TrackerEvaluator::reset() { tracks_.clear(); } -void TrackerEvaluator::evaluate(const BatchedColumns &input_columns, - BatchedColumns &output_columns) { +void TrackerEvaluator::evaluate(const BatchedColumns& input_columns, + BatchedColumns& output_columns) { assert(input_columns.size() >= 2); i32 input_count = input_columns[0].rows.size(); @@ -75,10 +78,10 @@ void TrackerEvaluator::evaluate(const BatchedColumns &input_columns, // For boxes which don't overlap existing ones, create a new track for them std::vector detected_bboxes; std::vector new_detected_bboxes; - for (const BoundingBox &box : all_boxes) { + for (const BoundingBox& box : all_boxes) { i32 overlap_idx = -1; for (size_t j = 0; j < tracks_.size(); ++j) { - auto &tracked_bbox = 
tracks_[j].box; + auto& tracked_bbox = tracks_[j].box; if (iou(box, tracked_bbox) > IOU_THRESHOLD) { overlap_idx = j; break; @@ -98,30 +101,30 @@ void TrackerEvaluator::evaluate(const BatchedColumns &input_columns, // Perform tracking for all existing tracks that we have std::vector generated_bboxes; { - u8 *buffer = input_columns[frame_idx].rows[b].buffer; + u8* buffer = input_columns[frame_idx].rows[b].buffer; assert(input_columns[frame_idx].rows[b].size == metadata_.height() * metadata_.width() * 3 * sizeof(u8)); cv::Mat frame(metadata_.height(), metadata_.width(), CV_8UC3, buffer); std::vector scores(tracks_.size()); std::vector tracked_bboxes(tracks_.size()); std::vector tracker_threads(tracks_.size()); - auto track_fn = [](struck::Tracker *tracker, const cv::Mat &frame, - f64 &score, struck::FloatRect &tracked_bbox) { + auto track_fn = [](struck::Tracker* tracker, const cv::Mat& frame, + f64& score, struck::FloatRect& tracked_bbox) { tracker->Track(frame); score = tracker->GetScore(); tracked_bbox = tracker->GetBB(); }; for (i32 i = 0; i < (i32)tracks_.size(); ++i) { - auto &track = tracks_[i]; - auto &tracker = track.tracker; + auto& track = tracks_[i]; + auto& tracker = track.tracker; tracker_threads[i] = std::thread(track_fn, tracker.get(), std::ref(frame), std::ref(scores[i]), std::ref(tracked_bboxes[i])); } for (i32 i = 0, jid = 0; i < (i32)tracks_.size(); ++i, ++jid) { - auto &track = tracks_[i]; - auto &tracker = track.tracker; + auto& track = tracks_[i]; + auto& tracker = track.tracker; tracker_threads[jid].join(); f64 score = scores[jid]; struck::FloatRect tracked_bbox = tracked_bboxes[jid]; @@ -152,7 +155,7 @@ void TrackerEvaluator::evaluate(const BatchedColumns &input_columns, std::make_tuple(tracks_[i].tracker->GetScore(), (i32)i)); } std::sort(track_thresholds.begin(), track_thresholds.end(), - [](auto &left, auto &right) { + [](auto& left, auto& right) { return std::get<0>(left) < std::get<0>(right); }); i32 num_tracks_to_remove = @@ -169,14 
+172,14 @@ void TrackerEvaluator::evaluate(const BatchedColumns &input_columns, } } assert(tracks_.size() <= max_tracks_); - for (BoundingBox &box : new_detected_bboxes) { + for (BoundingBox& box : new_detected_bboxes) { tracks_.resize(tracks_.size() + 1); - Track &track = tracks_.back(); + Track& track = tracks_.back(); // i32 tracker_id = next_tracker_id_++; i32 tracker_id = unif(gen); track.id = tracker_id; track.config.reset(new struck::Config{}); - struck::Config &config = *track.config.get(); + struck::Config& config = *track.config.get(); config.frameWidth = metadata_.width(); config.frameHeight = metadata_.height(); struck::Config::FeatureKernelPair fkp; @@ -185,7 +188,7 @@ void TrackerEvaluator::evaluate(const BatchedColumns &input_columns, config.features.push_back(fkp); track.tracker.reset(new struck::Tracker(config)); - u8 *buffer = input_columns[frame_idx].rows[b].buffer; + u8* buffer = input_columns[frame_idx].rows[b].buffer; assert(input_columns[frame_idx].rows[b].size == metadata_.height() * metadata_.width() * 3); cv::Mat frame(metadata_.height(), metadata_.width(), CV_8UC3, buffer); @@ -205,7 +208,7 @@ void TrackerEvaluator::evaluate(const BatchedColumns &input_columns, { size_t size; - u8 *buffer; + u8* buffer; serialize_bbox_vector(detected_bboxes, buffer, size); output_columns[1].rows.push_back(Row{buffer, size}); @@ -220,7 +223,7 @@ void TrackerEvaluator::evaluate(const BatchedColumns &input_columns, } } -float TrackerEvaluator::iou(const BoundingBox &bl, const BoundingBox &br) { +float TrackerEvaluator::iou(const BoundingBox& bl, const BoundingBox& br) { float x1 = std::max(bl.x1(), br.x1()); float y1 = std::max(bl.y1(), br.y1()); float x2 = std::min(bl.x2(), br.x2()); @@ -242,8 +245,9 @@ float TrackerEvaluator::iou(const BoundingBox &bl, const BoundingBox &br) { TrackerEvaluatorFactory::TrackerEvaluatorFactory(DeviceType device_type, i32 warmup_count, i32 max_tracks) - : device_type_(device_type), warmup_count_(warmup_count), - 
max_tracks_(max_tracks) { + : device_type_(device_type), + warmup_count_(warmup_count), + max_tracks_(max_tracks) { if (device_type_ == DeviceType::GPU) { LOG(FATAL) << "GPU tracker support not implemented yet"; } @@ -258,12 +262,12 @@ EvaluatorCapabilities TrackerEvaluatorFactory::get_capabilities() { } std::vector TrackerEvaluatorFactory::get_output_columns( - const std::vector &input_columns) { + const std::vector& input_columns) { return {"frame", "before_bboxes", "after_bboxes"}; } -Evaluator * -TrackerEvaluatorFactory::new_evaluator(const EvaluatorConfig &config) { +Evaluator* TrackerEvaluatorFactory::new_evaluator( + const EvaluatorConfig& config) { return new TrackerEvaluator(config, device_type_, 0, warmup_count_, max_tracks_); } diff --git a/stdlib/openface/CMakeLists.txt b/stdlib/openface/CMakeLists.txt index 26b67ece..28385e07 100644 --- a/stdlib/openface/CMakeLists.txt +++ b/stdlib/openface/CMakeLists.txt @@ -4,7 +4,11 @@ set(SOURCE_FILES openface_kernel.cpp) add_library(openface OBJECT ${SOURCE_FILES}) +message(STATUS "${OpenFace_LIBRARIES}") + target_include_directories(openface PUBLIC "${OpenFace_INCLUDE_DIRS}") list(APPEND STDLIB_LIBRARIES "${OpenFace_LIBRARIES}") - set(STDLIB_LIBRARIES ${STDLIB_LIBRARIES} PARENT_SCOPE) + +list(APPEND OPENCV_COMPONENTS core objdetect) +set(OPENCV_COMPONENTS ${OPENCV_COMPONENTS} PARENT_SCOPE) diff --git a/stdlib/openface/openface_kernel.cpp b/stdlib/openface/openface_kernel.cpp index 187f008c..07fb339e 100644 --- a/stdlib/openface/openface_kernel.cpp +++ b/stdlib/openface/openface_kernel.cpp @@ -1,33 +1,37 @@ -#include "scanner/api/op.h" #include "scanner/api/kernel.h" +#include "scanner/api/op.h" #include "scanner/util/memory.h" +#include "scanner/util/opencv.h" #include "scanner/util/serialize.h" -#include "OpenFace/LandmarkCoreIncludes.h" #include "OpenFace/FaceAnalyser.h" #include "OpenFace/GazeEstimation.h" +#include "OpenFace/LandmarkCoreIncludes.h" #include #include namespace scanner { -class 
OpenFaceEvaluator : public VideoKernel { -public: - OpenFaceEvaluator(const Kernel::Config& config) - : VideoKernel(config), clnf_model(det_parameters.model_location) { +class OpenFaceKernel : public BatchedKernel, public VideoKernel { + public: + OpenFaceKernel(const KernelConfig& config) + : BatchedKernel(config), clnf_model(det_parameters.model_location) { boost::filesystem::path au_loc_path = - boost::filesystem::path("AU_predictors/AU_all_static.txt"); + boost::filesystem::path("AU_predictors/AU_all_static.txt"); boost::filesystem::path tri_loc_path = - boost::filesystem::path("model/tris_68_full.txt"); - face_analyser_ = FaceAnalysis::FaceAnalyser( - vector(), 0.7, 112, 112, au_loc_path.string(), - tri_loc_path.string()); + boost::filesystem::path("model/tris_68_full.txt"); + face_analyser_ = + FaceAnalysis::FaceAnalyser(vector(), 0.7, 112, 112, + au_loc_path.string(), tri_loc_path.string()); } void execute(const BatchedColumns& input_columns, BatchedColumns& output_columns) override { - check_frame_info(CPU_DEVICE, input_columns[1]); + auto& frame_col = input_columns[0]; + auto& bbox_col = input_columns[1]; + check_frame(CPU_DEVICE, frame_col[0]); + i32 width = frame_info_.width(); i32 height = frame_info_.height(); cx = width / 2.0f; @@ -37,64 +41,74 @@ class OpenFaceEvaluator : public VideoKernel { fx = (fx + fy) / 2.0; fy = fx; - i32 input_count = input_columns[0].rows.size(); + i32 input_count = num_rows(frame_col); for (i32 b = 0; b < input_count; ++b) { - cv::Mat img(frame_info_.height(), frame_info_.width(), CV_8UC3, - (u8 *)input_columns[0].rows[b].buffer); + Frame* output_frame = new_frame(CPU_DEVICE, frame_info_); + memcpy(output_frame->data, frame_col[b].as_const_frame()->data, + output_frame->size()); + cv::Mat img = frame_to_mat(output_frame); cv::Mat grey; cv::cvtColor(img, grey, CV_BGR2GRAY); - std::vector all_bboxes = deserialize_proto_vector( - input_columns[2].rows[b].buffer, input_columns[2].rows[b].size); + std::vector all_bboxes = + 
deserialize_proto_vector(bbox_col[b].buffer, + bbox_col[b].size); + for (auto& bbox : all_bboxes) { f64 x1 = bbox.x1(), y1 = bbox.y1(), x2 = bbox.x2(), y2 = bbox.y2(); f64 w = x2 - x1, h = y2 - y1; f64 nw = w, nh = h, dw = nw - w, dh = nh - h; - x1 = std::max(x1 - dw/2, 0.0); - y1 = std::max(y1 - dh/2, 0.0); - x2 = std::min(x2 + dw/2, (f64)(frame_info_.width()-1)); - y2 = std::min(y2 + dh/2, (f64)(frame_info_.height()-1)); - cv::Rect_ cv_bbox(x1, y1, x2-x1, y2-y1); + x1 = std::max(x1 - dw / 2, 0.0); + y1 = std::max(y1 - dh / 2, 0.0); + x2 = std::min(x2 + dw / 2, (f64)(frame_info_.width() - 1)); + y2 = std::min(y2 + dh / 2, (f64)(frame_info_.height() - 1)); + cv::Rect_ cv_bbox(x1, y1, x2 - x1, y2 - y1); cv::rectangle(img, cv_bbox, cv::Scalar(0, 255, 0)); bool success = LandmarkDetector::DetectLandmarksInImage( - grey, cv_bbox, clnf_model, det_parameters); + grey, cv_bbox, clnf_model, det_parameters); if (success) { std::vector landmarks = - LandmarkDetector::CalculateLandmarks(clnf_model); + LandmarkDetector::CalculateLandmarks(clnf_model); cv::Point3f gazeDirection0(0, 0, -1); cv::Point3f gazeDirection1(0, 0, -1); - FaceAnalysis::EstimateGaze( - clnf_model, gazeDirection0, fx, fy, cx, cy, true); - FaceAnalysis::EstimateGaze( - clnf_model, gazeDirection1, fx, fy, cx, cy, false); + FaceAnalysis::EstimateGaze(clnf_model, gazeDirection0, fx, fy, cx, cy, + true); + FaceAnalysis::EstimateGaze(clnf_model, gazeDirection1, fx, fy, cx, cy, + false); - auto ActionUnits = face_analyser_.PredictStaticAUs(grey, clnf_model, false); + auto ActionUnits = + face_analyser_.PredictStaticAUs(grey, clnf_model, false); - cv::Vec6d headPose = - LandmarkDetector::GetCorrectedPoseWorld(clnf_model, fx, fy, cx, cy); + cv::Vec6d headPose = LandmarkDetector::GetCorrectedPoseWorld( + clnf_model, fx, fy, cx, cy); - LandmarkDetector::DrawBox( - img, headPose, cv::Scalar(255.0, 0, 0), 3, fx, fy, cx, cy); - FaceAnalysis::DrawGaze( - img, clnf_model, gazeDirection0, gazeDirection1, fx, fy, cx, cy); + 
LandmarkDetector::DrawBox(img, headPose, cv::Scalar(255.0, 0, 0), 3, + fx, fy, cx, cy); + FaceAnalysis::DrawGaze(img, clnf_model, gazeDirection0, + gazeDirection1, fx, fy, cx, cy); LandmarkDetector::Draw(img, clnf_model); } } - INSERT_ROW(output_columns[0], img.data, input_columns[0].rows[b].size); + insert_frame(output_columns[0], output_frame); } } -private: + private: LandmarkDetector::FaceModelParameters det_parameters; LandmarkDetector::CLNF clnf_model; std::vector files, depth_files, output_images, - output_landmark_locations, output_pose_locations; + output_landmark_locations, output_pose_locations; std::vector> bounding_boxes; int device; float fx, fy, cx, cy; FaceAnalysis::FaceAnalyser face_analyser_; }; +REGISTER_OP(OpenFace).frame_input("frame").input("faces").output("features"); + +REGISTER_KERNEL(OpenFace, OpenFaceKernel) + .device(DeviceType::CPU) + .num_devices(1); } diff --git a/stdlib/stdlib.proto b/stdlib/stdlib.proto index 3297b2be..93ab0b82 100644 --- a/stdlib/stdlib.proto +++ b/stdlib/stdlib.proto @@ -8,6 +8,12 @@ message BlurArgs { float sigma = 2; } +message MontageArgs { + int64 num_frames = 1; + int32 target_width = 4; + int32 frames_per_row = 6; +} + message CaffeInputArgs { NetDescriptor net_descriptor = 1; int32 batch_size = 2; @@ -20,11 +26,77 @@ message CaffeArgs { message FacenetArgs { CaffeArgs caffe_args = 1; - float scale = 2; - float threshold = 3; + string templates_path = 2; + float scale = 3; + float threshold = 4; } message CPM2Args { CaffeArgs caffe_args = 1; float scale = 2; } + +message OpenPoseArgs { + string model_directory = 1; + + int32 pose_num_scales = 2; + float pose_scale_gap = 3; + + bool compute_hands = 4; + int32 hand_num_scales = 5; + float hand_scale_gap = 6; + + bool compute_face = 7; +} + +message Camera { + repeated float p = 1 [packed=true]; +} + +message GipumaArgs { + float min_disparity = 2; + float max_disparity = 3; + float min_depth = 4; + float max_depth = 5; + float iterations = 6; + int32 
kernel_width = 7; + int32 kernel_height = 8; +} + +enum ExtractorType { + SIFT = 0; + SURF = 1; +} + +message FeatureExtractorArgs { + ExtractorType feature_type = 1; +} + +message Keypoint { + float x = 1; + float y = 2; +} + +message ResizeArgs { + int32 width = 1; + int32 height = 2; + bool min = 3; + bool preserve_aspect = 4; +} + +message ImageDecoderArgs { + enum ImageType { + JPEG = 0; + ANY = 1; + } + + ImageType image_type = 1; +} + +message PoseNMSArgs { + int32 height = 1; +} + +message BBoxNMSArgs { + float scale = 1; +} diff --git a/stdlib/viz/CMakeLists.txt b/stdlib/viz/CMakeLists.txt new file mode 100644 index 00000000..99a4f196 --- /dev/null +++ b/stdlib/viz/CMakeLists.txt @@ -0,0 +1,13 @@ +set(SOURCE_FILES + draw_box_kernel_cpu.cpp) + +if (BUILD_CUDA) + list(APPEND SOURCE_FILES) +endif() + +add_library(viz OBJECT ${SOURCE_FILES}) + +list(APPEND OPENCV_COMPONENTS core highgui imgproc) +set(OPENCV_COMPONENTS ${OPENCV_COMPONENTS} PARENT_SCOPE) + +set(STDLIB_LIBRARIES ${STDLIB_LIBRARIES} PARENT_SCOPE) diff --git a/stdlib/viz/draw_box_kernel_cpu.cpp b/stdlib/viz/draw_box_kernel_cpu.cpp new file mode 100644 index 00000000..fb44baea --- /dev/null +++ b/stdlib/viz/draw_box_kernel_cpu.cpp @@ -0,0 +1,52 @@ +#include "scanner/api/kernel.h" +#include "scanner/api/op.h" +#include "scanner/util/memory.h" +#include "scanner/util/opencv.h" +#include "scanner/util/serialize.h" + +namespace scanner { + +class DrawBoxKernelCPU : public BatchedKernel { + public: + DrawBoxKernelCPU(const KernelConfig& config) + : BatchedKernel(config), device_(config.devices[0]) {} + + void execute(const BatchedColumns& input_columns, + BatchedColumns& output_columns) override { + auto& frame_col = input_columns[0]; + auto& bbox_col = input_columns[1]; + + i32 input_count = num_rows(frame_col); + FrameInfo info = frame_col[0].as_const_frame()->as_frame_info(); + std::vector output_frames = new_frames(device_, info, input_count); + + for (i32 i = 0; i < input_count; ++i) { + cv::Mat img 
= frame_to_mat(frame_col[i].as_const_frame()); + cv::Mat out_img = frame_to_mat(output_frames[i]); + img.copyTo(out_img); + + // Deserialize bboxes + std::vector bboxes = + deserialize_bbox_vector(bbox_col[i].buffer, bbox_col[i].size); + + // Draw all bboxes + for (auto& bbox : bboxes) { + i32 width = bbox.x2() - bbox.x1(); + i32 height = bbox.y2() - bbox.y1(); + cv::rectangle(out_img, cv::Rect(bbox.x1(), bbox.y1(), width, height), + cv::Scalar(255, 0, 0), 2); + } + insert_frame(output_columns[0], output_frames[i]); + } + } + + private: + DeviceHandle device_; +}; + +REGISTER_OP(DrawBox).frame_input("frame").input("bboxes").frame_output("frame"); + +REGISTER_KERNEL(DrawBox, DrawBoxKernelCPU) + .device(DeviceType::CPU) + .num_devices(1); +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8d0e6a82..02a4409a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,14 +1,6 @@ -add_executable(CppTests cpp_test.cpp) -target_link_libraries(CppTests ${GTEST_LIBRARIES} ${GTEST_LIB_MAIN} scanner stdlib) -add_test(CppTests CppTests) - -if(NOT BUILD_CUDA) - set(MARKERS -m "not gpu") -endif() add_test( NAME PythonTests - COMMAND pytest ${CMAKE_CURRENT_SOURCE_DIR} -xv ${MARKERS}) - + COMMAND pytest ${CMAKE_CURRENT_SOURCE_DIR} -x -vv) add_executable(FfmpegTest ffmpeg_test.cpp) target_link_libraries(FfmpegTest ${GTEST_LIBRARIES} ${GTEST_LIB_MAIN} scanner stdlib) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..206a7af4 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,5 @@ +import pytest + +def pytest_addoption(parser): + parser.addoption('--runslow', action='store_true', + help='Run slow tests') diff --git a/tests/cpp_test.cpp b/tests/cpp_test.cpp deleted file mode 100644 index ce396e7f..00000000 --- a/tests/cpp_test.cpp +++ /dev/null @@ -1,187 +0,0 @@ -#include "scanner/api/op.h" -#include "stdlib/stdlib.pb.h" -#include "scanner/api/database.h" -#include "scanner/util/fs.h" - -#include - -namespace scanner { - -// 
Fixtures are taken down after every test, so to avoid-redownloading and -// ingesting the files, we use static globals. -static bool downloaded = false; -static std::string db_path = ""; - -class ScannerTest : public ::testing::Test { -protected: - void SetUp() override { - // Create database - if (!downloaded) { - scanner::temp_dir(db_path); - } - sc_.reset(storehouse::StorageConfig::make_posix_config()); - std::string master_address = "localhost:5001"; - db_ = new scanner::Database(sc_.get(), db_path, master_address); - - // Ingest video - if (!downloaded) { - std::string video_path = scanner::download_temp( - "https://storage.googleapis.com/scanner-data/test/short_video.mp4"); - scanner::Result result; - std::vector failed_videos; - result = db_->ingest_videos( - {"test"}, {video_path}, - failed_videos); - assert(result.success()); - assert(failed_videos.empty()); - downloaded = true; - } - - // Initialize master and one worker - scanner::MachineParameters machine_params = - scanner::default_machine_params(); - db_->start_master(machine_params); - db_->start_worker(machine_params); - - // Construct job parameters - params_.memory_pool_config.mutable_cpu()->set_use_pool(false); - params_.memory_pool_config.mutable_gpu()->set_use_pool(false); - params_.pipeline_instances_per_node = 1; - params_.work_item_size = 25; - } - - void TearDown() { - delete db_; - } - - scanner::Op* blur_op(scanner::Op* input, scanner::DeviceType device_type) { - scanner::proto::BlurArgs blur_args; - blur_args.set_sigma(0.5); - blur_args.set_kernel_size(3); - - size_t blur_args_size = blur_args.ByteSize(); - char* blur_args_buff = new char[blur_args_size]; - blur_args.SerializeToArray(blur_args_buff, blur_args_size); - - return new scanner::Op( - "Blur", {scanner::OpInput(input, {"frame", "frame_info"})}, - device_type, blur_args_buff, blur_args_size); - } - - scanner::Op* blur_dag() { - scanner::Op *input = - scanner::make_input_op({"frame", "frame_info"}); - scanner::Op *output = 
scanner::make_output_op( - {scanner::OpInput(blur_op(input, DeviceType::CPU), {"frame", "frame_info"})}); - return output; - } - - void gen_random(char *s, const int len) { - static const char alphanum[] = - "0123456789" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz"; - - for (int i = 0; i < len; ++i) { - s[i] = alphanum[rand() % (sizeof(alphanum) - 1)]; - } - - s[len] = 0; - } - - void run_task(scanner::Task task, scanner::Op* op) { - char job_name[12]; - gen_random(job_name, 12); - params_.job_name = job_name; - params_.task_set.tasks.clear(); - params_.task_set.tasks.push_back(task); - params_.task_set.output_op = op; - - scanner::Result result = db_->new_job(params_); - ASSERT_TRUE(result.success()) - << "Run job failed: " << result.msg(); - } - - scanner::Task range_task(std::string output_table_name) { - scanner::Task task; - task.output_table_name = output_table_name; - scanner::TableSample sample; - sample.table_name = "test"; - sample.column_names = {"frame", "frame_info"}; - sample.sampling_function = "Gather"; - scanner::proto::GatherSamplerArgs args; - auto &gather_sample = *args.add_samples(); - for (int i = 0; i < 100; i += 1) { - gather_sample.add_rows(i); - } - std::vector args_data(args.ByteSize()); - args.SerializeToArray(args_data.data(), args_data.size()); - sample.sampling_args = args_data; - - task.samples.push_back(sample); - return task; - } - - scanner::JobParameters params_; - std::unique_ptr sc_; - scanner::Database* db_; -}; - -TEST_F(ScannerTest, Range) { - run_task(range_task("Range"), blur_dag()); -} - -TEST_F(ScannerTest, NonLinearDAG) { - scanner::Op *input = - scanner::make_input_op({"frame", "frame_info"}); - - scanner::Op *hist = new scanner::Op( - "Histogram", - {scanner::OpInput(blur_op(input, DeviceType::CPU), {"frame"}), - scanner::OpInput(input, {"frame_info"})}, - scanner::DeviceType::CPU); - - scanner::Op *output = scanner::make_output_op( - {scanner::OpInput(hist, {"histogram"})}); - - 
run_task(range_task("NonLinearDAG"), output); -} - -#ifdef HAVE_CUDA - -TEST_F(ScannerTest, CPUToGPU) { - scanner::Op *input = - scanner::make_input_op({"frame", "frame_info"}); - - scanner::Op *hist = new scanner::Op( - "Histogram", - {scanner::OpInput(blur_op(input, DeviceType::CPU), {"frame"}), - scanner::OpInput(input, {"frame_info"})}, - scanner::DeviceType::GPU); - - scanner::Op *output = scanner::make_output_op( - {scanner::OpInput(hist, {"histogram"})}); - - run_task(range_task("CPUToGPU"), output); -} - -// TODO: need a GPU blur op -// TEST_F(ScannerTest, GPUToCPU) { -// scanner::Op *input = -// scanner::make_input_op({"frame", "frame_info"}); - -// scanner::Op *hist = new scanner::Op( -// "Histogram", -// {scanner::OpInput(blur_op(input, DeviceType::GPU), {"frame"}), -// scanner::OpInput(input, {"frame_info"})}, -// scanner::DeviceType::CPU); - -// scanner::Op *output = scanner::make_output_op( -// {scanner::OpInput(hist, {"histogram"})}); - -// run_task(range_task("GPUToCPU"), output); -// } - -#endif - -} diff --git a/tests/ffmpeg_test.cpp b/tests/ffmpeg_test.cpp index 830fb6c1..fe3c4697 100644 --- a/tests/ffmpeg_test.cpp +++ b/tests/ffmpeg_test.cpp @@ -13,12 +13,12 @@ * limitations under the License. 
*/ -#include "scanner/engine/db.h" +#include "scanner/engine/metadata.h" #include "scanner/util/fs.h" #include "scanner/util/h264.h" #include "scanner/util/queue.h" -#include "tests/videos.h" #include "storehouse/storage_backend.h" +#include "tests/videos.h" extern "C" { #include "libavcodec/avcodec.h" @@ -53,15 +53,14 @@ struct DecoderState { AVCodecContext* cc; SwsContext* sws_context; - //AVFrame* frame; + // AVFrame* frame; Queue pool; Queue frame; i32 frame_width; i32 frame_height; - DecoderState() - :pool(100000), frame(100000) { + DecoderState() : pool(100000), frame(100000) { av_init_packet(&packet); codec = avcodec_find_decoder(AV_CODEC_ID_H264); @@ -76,7 +75,6 @@ struct DecoderState { EXPECT_TRUE(result >= 0) << "could not open codec"; sws_context = nullptr; - }; ~DecoderState() { @@ -94,7 +92,7 @@ struct DecoderState { sws_freeContext(sws_context); } - void feed_frame(const u8 *encoded_buffer, i32 encoded_size) { + void feed_frame(const u8* encoded_buffer, i32 encoded_size) { if (encoded_size > 0) { if (av_new_packet(&packet, encoded_size) < 0) { fprintf(stderr, "could not allocate packet for feeding into decoder\n"); @@ -145,118 +143,117 @@ struct DecoderState { } } #else - // uint8_t *orig_data = packet_.data; - // int orig_size = packet_.size; - // int got_picture = 0; - // do { - // // Get frame from pool of allocated frames to decode video into - // AVFrame *frame; - // { - // std::lock_guard lock(frame_mutex_); - // if (frame_pool_.empty()) { - // // Create a new frame if our pool is empty - // frame_pool_.push_back(av_frame_alloc()); - // } - // frame = frame_pool_.back(); - // frame_pool_.pop_back(); - // } - - // auto decode_start = now(); - // int consumed_length = - // avcodec_decode_video2(cc_, frame, &got_picture, &packet_); - // if (profiler_) { - // profiler_->add_interval("ffmpeg:decode_video", decode_start, now()); - // } - // if (consumed_length < 0) { - // char err_msg[256]; - // av_strerror(consumed_length, err_msg, 256); - // 
fprintf(stderr, "Error while decoding frame (%d): %s\n", consumed_length, - // err_msg); - // assert(false); - // } - // if (got_picture) { - // if (frame->buf[0] == NULL) { - // // Must copy packet as data is stored statically - // AVFrame *cloned_frame = av_frame_clone(frame); - // if (cloned_frame == NULL) { - // fprintf(stderr, "could not clone frame\n"); - // assert(false); - // } - // std::lock_guard lock(frame_mutex_); - // printf("clone\n"); - // decoded_frame_queue_.push_back(cloned_frame); - // av_frame_unref(frame); - // frame_pool_.push_back(frame); - // } else { - // // Frame is reference counted so we can just take it directly - // std::lock_guard lock(frame_mutex_); - // printf("push\n"); - // decoded_frame_queue_.push_back(frame); - // } - // } else { - // std::lock_guard lock(frame_mutex_); - // frame_pool_.push_back(frame); - // } - // packet_.data += consumed_length; - // packet_.size -= consumed_length; - // } while (packet_.size > 0 || (orig_size == 0 && got_picture)); - // packet_.data = orig_data; - // packet_.size = orig_size; +// uint8_t *orig_data = packet_.data; +// int orig_size = packet_.size; +// int got_picture = 0; +// do { +// // Get frame from pool of allocated frames to decode video into +// AVFrame *frame; +// { +// std::lock_guard lock(frame_mutex_); +// if (frame_pool_.empty()) { +// // Create a new frame if our pool is empty +// frame_pool_.push_back(av_frame_alloc()); +// } +// frame = frame_pool_.back(); +// frame_pool_.pop_back(); +// } + +// auto decode_start = now(); +// int consumed_length = +// avcodec_decode_video2(cc_, frame, &got_picture, &packet_); +// if (profiler_) { +// profiler_->add_interval("ffmpeg:decode_video", decode_start, now()); +// } +// if (consumed_length < 0) { +// char err_msg[256]; +// av_strerror(consumed_length, err_msg, 256); +// fprintf(stderr, "Error while decoding frame (%d): %s\n", consumed_length, +// err_msg); +// assert(false); +// } +// if (got_picture) { +// if (frame->buf[0] == NULL) { 
+// // Must copy packet as data is stored statically +// AVFrame *cloned_frame = av_frame_clone(frame); +// if (cloned_frame == NULL) { +// fprintf(stderr, "could not clone frame\n"); +// assert(false); +// } +// std::lock_guard lock(frame_mutex_); +// printf("clone\n"); +// decoded_frame_queue_.push_back(cloned_frame); +// av_frame_unref(frame); +// frame_pool_.push_back(frame); +// } else { +// // Frame is reference counted so we can just take it directly +// std::lock_guard lock(frame_mutex_); +// printf("push\n"); +// decoded_frame_queue_.push_back(frame); +// } +// } else { +// std::lock_guard lock(frame_mutex_); +// frame_pool_.push_back(frame); +// } +// packet_.data += consumed_length; +// packet_.size -= consumed_length; +// } while (packet_.size > 0 || (orig_size == 0 && got_picture)); +// packet_.data = orig_data; +// packet_.size = orig_size; #endif - av_packet_unref(&packet); -} - -void get_frame(u8* decoded_buffer, i32 decoded_size) { - if (frame.size() <= 0) { - return; + av_packet_unref(&packet); } - AVFrame* f; - frame.pop(f); + void get_frame(u8* decoded_buffer, i32 decoded_size) { + if (frame.size() <= 0) { + return; + } - printf("decode\n"); - if (sws_context == nullptr) { - AVPixelFormat decoder_pixel_format = cc->pix_fmt; - sws_context = sws_getContext( - frame_width, frame_height, decoder_pixel_format, frame_width, - frame_height, AV_PIX_FMT_RGB24, SWS_BICUBIC, NULL, NULL, NULL); - } + AVFrame* f; + frame.pop(f); - if (sws_context == NULL) { - fprintf(stderr, "Could not get sws_context for rgb conversion\n"); - exit(EXIT_FAILURE); - } + printf("decode\n"); + if (sws_context == nullptr) { + AVPixelFormat decoder_pixel_format = cc->pix_fmt; + sws_context = sws_getContext( + frame_width, frame_height, decoder_pixel_format, frame_width, + frame_height, AV_PIX_FMT_RGB24, SWS_BICUBIC, NULL, NULL, NULL); + } - u8 *scale_buffer = decoded_buffer; + if (sws_context == NULL) { + fprintf(stderr, "Could not get sws_context for rgb conversion\n"); + 
exit(EXIT_FAILURE); + } - uint8_t *out_slices[4]; - int out_linesizes[4]; - int required_size = av_image_fill_arrays(out_slices, out_linesizes, - scale_buffer, AV_PIX_FMT_RGB24, - frame_width, frame_height, 1); - if (required_size < 0) { - fprintf(stderr, "Error in av_image_fill_arrays\n"); - exit(EXIT_FAILURE); - } - if (required_size > decoded_size) { - fprintf(stderr, "Decode buffer not large enough for image\n"); - exit(EXIT_FAILURE); - } - if (sws_scale(sws_context, f->data, f->linesize, 0, - f->height, out_slices, out_linesizes) < 0) { - fprintf(stderr, "sws_scale failed\n"); - exit(EXIT_FAILURE); - } - sws_freeContext(sws_context); - sws_context = nullptr; + u8* scale_buffer = decoded_buffer; - pool.push(f); -} + uint8_t* out_slices[4]; + int out_linesizes[4]; + int required_size = + av_image_fill_arrays(out_slices, out_linesizes, scale_buffer, + AV_PIX_FMT_RGB24, frame_width, frame_height, 1); + if (required_size < 0) { + fprintf(stderr, "Error in av_image_fill_arrays\n"); + exit(EXIT_FAILURE); + } + if (required_size > decoded_size) { + fprintf(stderr, "Decode buffer not large enough for image\n"); + exit(EXIT_FAILURE); + } + if (sws_scale(sws_context, f->data, f->linesize, 0, f->height, out_slices, + out_linesizes) < 0) { + fprintf(stderr, "sws_scale failed\n"); + exit(EXIT_FAILURE); + } + sws_freeContext(sws_context); + sws_context = nullptr; + pool.push(f); + } }; class FfmpegTest : public ::testing::Test { -protected: + protected: void SetUp() override { avcodec_register_all(); @@ -267,14 +264,13 @@ class FfmpegTest : public ::testing::Test { } } - void TearDown() { - } + void TearDown() {} - void gen_random(char *s, const int len) { + void gen_random(char* s, const int len) { static const char alphanum[] = - "0123456789" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz"; + "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; for (int i = 0; i < len; ++i) { s[i] = alphanum[rand() % (sizeof(alphanum) - 1)]; @@ -284,7 
+280,6 @@ class FfmpegTest : public ::testing::Test { } }; - TEST_F(FfmpegTest, MemoryLeak) { std::unique_ptr sc( storehouse::StorageConfig::make_posix_config()); @@ -304,7 +299,7 @@ TEST_F(FfmpegTest, MemoryLeak) { u8* decode_buffer = new u8[required_size]; - const u8 *encoded_buffer = (const u8 *)video_bytes.data(); + const u8* encoded_buffer = (const u8*)video_bytes.data(); size_t encoded_buffer_size = video_bytes.size(); printf("Starting decoding...\n"); @@ -314,10 +309,10 @@ TEST_F(FfmpegTest, MemoryLeak) { size_t buffer_offset = 0; while (buffer_offset < encoded_buffer_size) { i32 encoded_packet_size = 0; - const u8 *encoded_packet = NULL; + const u8* encoded_packet = NULL; if (buffer_offset < encoded_buffer_size) { - encoded_packet_size = *reinterpret_cast( - encoded_buffer + buffer_offset); + encoded_packet_size = + *reinterpret_cast(encoded_buffer + buffer_offset); buffer_offset += sizeof(i32); encoded_packet = encoded_buffer + buffer_offset; assert(encoded_packet_size < encoded_buffer_size); @@ -332,5 +327,4 @@ TEST_F(FfmpegTest, MemoryLeak) { } delete[] decode_buffer; } - } diff --git a/tests/py_test.py b/tests/py_test.py index 5734a1f1..761b13ba 100644 --- a/tests/py_test.py +++ b/tests/py_test.py @@ -1,100 +1,962 @@ -from scannerpy import Database, Config, DeviceType +from scannerpy import (Database, Config, DeviceType, ColumnType, BulkJob, Job, + ProtobufGenerator, ScannerException) from scannerpy.stdlib import parsers import tempfile import toml import pytest -import subprocess +from subprocess import check_call as run +from multiprocessing import Process, Queue import requests +import imp +import os.path +import socket +import numpy as np +import sys +import grpc +import struct + +try: + run(['nvidia-smi']) + has_gpu = True +except OSError: + has_gpu = False + +gpu = pytest.mark.skipif(not has_gpu, reason='need GPU to run') +slow = pytest.mark.skipif( + not pytest.config.getoption('--runslow'), + reason='need --runslow option to run') + +cwd = 
os.path.dirname(os.path.abspath(__file__)) + + +@slow +def test_tutorial(): + def run_py(path): + print(path) + run('cd {}/../examples/tutorial && python {}.py'.format(cwd, path), + shell=True) + + run('cd {}/../examples/tutorial/resize_op && ' + 'mkdir -p build && cd build && cmake -D SCANNER_PATH={} .. && ' + 'make'.format(cwd, cwd + '/..'), + shell=True) + + tutorials = [ + '00_basic', '01_sampling', '02_collections', '03_ops', + '04_compression', '05_custom_op' + ] + + for t in tutorials: + run_py(t) + + +@slow +def test_examples(): + def run_py((d, f)): + print(f) + run('cd {}/../examples/{} && python {}.py'.format(cwd, d, f), + shell=True) + + examples = [('face_detection', 'face_detect'), ('shot_detection', + 'shot_detect')] + + for e in examples: + run_py(e) + @pytest.fixture(scope="module") def db(): # Create new config with tempfile.NamedTemporaryFile(delete=False) as f: cfg = Config.default_config() + cfg['network']['master'] = 'localhost' cfg['storage']['db_path'] = tempfile.mkdtemp() f.write(toml.dumps(cfg)) cfg_path = f.name # Setup and ingest video - db = Database(cfg_path) - url = "https://storage.googleapis.com/scanner-data/test/short_video.mp4" - with tempfile.NamedTemporaryFile(delete=False) as f: - resp = requests.get(url, stream=True) - assert resp.ok - for block in resp.iter_content(1024): - f.write(block) - vid_path = f.name - db.ingest_videos([('test', vid_path)]) + with Database(config_path=cfg_path, debug=True) as db: + # Download video from GCS + url = "https://storage.googleapis.com/scanner-data/test/short_video.mp4" + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f: + host = socket.gethostname() + # HACK: special proxy case for Ocean cluster + if host in ['ocean', 'crissy', 'pismo', 'stinson']: + resp = requests.get( + url, + stream=True, + proxies={'https': 'http://proxy.pdl.cmu.edu:3128/'}) + else: + resp = requests.get(url, stream=True) + assert resp.ok + for block in resp.iter_content(1024): + f.write(block) + 
vid1_path = f.name + + # Make a second one shorter than the first + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f: + vid2_path = f.name + run([ + 'ffmpeg', '-y', '-i', vid1_path, '-ss', '00:00:00', '-t', + '00:00:10', '-c:v', 'libx264', '-strict', '-2', vid2_path + ]) + + db.ingest_videos([('test1', vid1_path), ('test2', vid2_path)]) + + db.ingest_videos( + [('test1_inplace', vid1_path), ('test2_inplace', vid2_path)], + inplace=True) - yield db + yield db - # Tear down - subprocess.check_call(['rm', '-rf', - cfg['storage']['db_path'], - cfg_path, - vid_path]) + # Tear down + run([ + 'rm', '-rf', cfg['storage']['db_path'], cfg_path, vid1_path, + vid2_path + ]) + + +def test_new_database(db): + pass -@pytest.mark.first -def test_new_database(db): pass def test_table_properties(db): - table = db.table('test') - assert table.id() == 0 - assert table.name() == 'test' - assert table.num_rows() == 720 - assert len(table.columns()) == 2 - assert [c.name() for c in table.columns()] == ['frame', 'frame_info'] + for name, i in [('test1', 0), ('test1_inplace', 2)]: + table = db.table(name) + assert table.id() == i + assert table.name() == name + assert table.num_rows() == 720 + assert [c for c in table.column_names()] == ['index', 'frame'] + + +def test_summarize(db): + db.summarize() -def test_make_collection(db): - db.new_collection('test', ['test']) def test_load_video_column(db): - next(db.table('test').columns(0).load()) + for name in ['test1', 'test1_inplace']: + next(db.table(name).load(['frame'])) + + +def test_gather_video_column(db): + for name in ['test1', 'test1_inplace']: + # Gather rows + rows = [0, 10, 100, 200] + frames = [_ for _ in db.table(name).load(['frame'], rows=rows)] + assert len(frames) == len(rows) + def test_profiler(db): - [output] = db.run( - db.sampler().all([('test', '_ignore')]), - db.ops.Histogram()) - profiler = output.profiler() + frame = db.ops.FrameInput() + hist = db.ops.Histogram(frame=frame) + output_op = 
db.ops.Output(columns=[hist]) + + job = Job(op_args={ + frame: db.table('test1').column('frame'), + output_op: '_ignore' + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + + output = db.run(bulk_job, show_progress=False, force=True) + profiler = output[0].profiler() f = tempfile.NamedTemporaryFile(delete=False) f.close() profiler.write_trace(f.name) profiler.statistics() - subprocess.check_call(['rm', '-f', f.name]) + run(['rm', '-f', f.name]) + + +def test_new_table(db): + db.new_table('test', ['col1', 'col2'], [['r00', 'r01'], ['r10', 'r11']]) + t = db.table('test') + assert (t.num_rows() == 2) + assert (next(t.column('col2').load())[1] == 'r01') + + +def test_sample(db): + def run_sampler_job(sampler_args, expected_rows): + frame = db.ops.FrameInput() + sample_frame = frame.sample() + output_op = db.ops.Output(columns=[sample_frame]) + + job = Job(op_args={ + frame: db.table('test1').column('frame'), + sample_frame: sampler_args, + output_op: 'test_sample', + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + tables = db.run(bulk_job, force=True, show_progress=False) + num_rows = 0 + for (frame_index, _) in tables[0].column('frame').load(): + num_rows += 1 + assert num_rows == expected_rows + + # Stride + expected = (db.table('test1').num_rows() + 8 - 1) / 8 + run_sampler_job(db.sampler.strided(8), expected) + # Range + run_sampler_job(db.sampler.range(0, 30), 30) + # Strided Range + run_sampler_job(db.sampler.strided_range(0, 300, 10), 30) + # Gather + run_sampler_job(db.sampler.gather([0, 150, 377, 500]), 4) + + +def test_space(db): + def run_spacer_job(spacing_args): + frame = db.ops.FrameInput() + hist = db.ops.Histogram(frame=frame) + space_hist = hist.space() + output_op = db.ops.Output(columns=[space_hist]) + + job = Job(op_args={ + frame: db.table('test1').column('frame'), + space_hist: spacing_args, + output_op: 'test_space', + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + tables = db.run(bulk_job, force=True, show_progress=False) 
+ return tables[0] + + # Repeat + spacing_distance = 8 + table = run_spacer_job(db.sampler.space_repeat(spacing_distance)) + num_rows = 0 + for (frame_index, hist) in table.load(['histogram'], parsers.histograms): + # Verify outputs are repeated correctly + if num_rows % spacing_distance == 0: + ref_hist = hist + assert len(hist) == 3 + for c in range(len(hist)): + assert (ref_hist[c] == hist[c]).all() + num_rows += 1 + assert num_rows == db.table('test1').num_rows() * spacing_distance + + # Null + table = run_spacer_job(db.sampler.space_null(spacing_distance)) + num_rows = 0 + for (frame_index, hist) in table.load(['histogram'], parsers.histograms): + # Verify outputs are None for null rows + if num_rows % spacing_distance == 0: + assert hist is not None + assert len(hist) == 3 + assert hist[0].shape[0] == 16 + else: + assert hist is None + num_rows += 1 + assert num_rows == db.table('test1').num_rows() * spacing_distance + + +def test_slicing(db): + frame = db.ops.FrameInput() + slice_frame = frame.slice() + unsliced_frame = slice_frame.unslice() + output_op = db.ops.Output(columns=[unsliced_frame]) + job = Job(op_args={ + frame: db.table('test1').column('frame'), + slice_frame: db.partitioner.all(50), + output_op: 'test_slicing', + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + tables = db.run(bulk_job, force=True, show_progress=False) + + num_rows = 0 + for (frame_index, _) in tables[0].column('frame').load(): + num_rows += 1 + assert num_rows == db.table('test1').num_rows() + + +def test_bounded_state(db): + warmup = 3 + + frame = db.ops.FrameInput() + increment = db.ops.TestIncrementBounded(ignore=frame, warmup=warmup) + sampled_increment = increment.sample() + output_op = db.ops.Output(columns=[sampled_increment]) + job = Job(op_args={ + frame: db.table('test1').column('frame'), + sampled_increment: db.sampler.gather([0, 10, 25, 26, 27]), + output_op: 'test_slicing', + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + tables = db.run(bulk_job, 
force=True, show_progress=False) + + num_rows = 0 + expected_output = [0, warmup, warmup, warmup + 1, warmup + 2] + for (frame_index, buf) in tables[0].column('integer').load(): + (val, ) = struct.unpack('=q', buf) + assert val == expected_output[num_rows] + print(num_rows) + num_rows += 1 + assert num_rows == 5 + + +def test_unbounded_state(db): + frame = db.ops.FrameInput() + slice_frame = frame.slice() + increment = db.ops.TestIncrementUnbounded(ignore=slice_frame) + unsliced_increment = increment.unslice() + output_op = db.ops.Output(columns=[unsliced_increment]) + job = Job(op_args={ + frame: db.table('test1').column('frame'), + slice_frame: db.partitioner.all(50), + output_op: 'test_slicing', + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + tables = db.run(bulk_job, force=True, show_progress=False) + + num_rows = 0 + for (frame_index, buf) in tables[0].column('integer').load(): + (val, ) = struct.unpack('=q', buf) + assert val == frame_index % 50 + num_rows += 1 + assert num_rows == db.table('test1').num_rows() + def builder(cls): inst = cls() class Generated: def test_cpu(self, db): - inst.run(db, inst.op(db, DeviceType.CPU)) + inst.run(db, inst.job(db, DeviceType.CPU)) - @pytest.mark.gpu + @gpu def test_gpu(self, db): - inst.run(db, inst.op(db, DeviceType.GPU)) + inst.run(db, inst.job(db, DeviceType.GPU)) return Generated + @builder class TestHistogram: - def op(self, db, ty): - return db.ops.Histogram(device=ty) + def job(self, db, ty): + frame = db.ops.FrameInput() + hist = db.ops.Histogram(frame=frame, device=ty) + output_op = db.ops.Output(columns=[hist]) + + job = Job(op_args={ + frame: db.table('test1').column('frame'), + output_op: 'test_hist' + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + return bulk_job + + def run(self, db, job): + tables = db.run(job, force=True, show_progress=False) + next(tables[0].load(['histogram'], parsers.histograms)) - def run(self, db, op): - [table] = db.run(db.sampler().all([('test', 'test_hist')]), 
op, force=True) - next(table.load([0], parsers.histograms)) @builder class TestOpticalFlow: - def op(self, db, ty): - input = db.ops.Input() - flow = db.ops.OpticalFlow( - inputs=[(input,['frame', 'frame_info'])], - device=ty) - output = db.ops.Output(inputs=[(flow, ['flow']), (input, ['frame_info'])]) - return output - - def run(self, db, op): - tasks = db.sampler().range([('test', 'test_flow')], 0, 50, warmup_size=1) - [table] = db.run(tasks, op, force=True) - next(table.load([0, 1], parsers.flow)) + def job(self, db, ty): + frame = db.ops.FrameInput() + flow = db.ops.OpticalFlow(frame=frame, stencil=[-1, 0], device=ty) + flow_range = flow.sample() + out = db.ops.Output(columns=[flow_range]) + job = Job(op_args={ + frame: db.table('test1').column('frame'), + flow_range: db.sampler.range(0, 50), + out: 'test_flow', + }) + return BulkJob(output=out, jobs=[job]) + + def run(self, db, job): + [table] = db.run(job, force=True, show_progress=False) + num_rows = 0 + for (frame_index, _) in table.column('flow').load(): + num_rows += 1 + assert num_rows == 50 + + fid, flows = next(table.load(['flow'])) + flow_array = flows[0] + assert fid == 0 + assert flow_array.dtype == np.float32 + assert flow_array.shape[0] == 480 + assert flow_array.shape[1] == 640 + assert flow_array.shape[2] == 2 + + +def test_python_kernel(db): + db.register_op('TestPy', [('frame', ColumnType.Video)], ['dummy']) + db.register_python_kernel('TestPy', DeviceType.CPU, + cwd + '/test_py_kernel.py') + + frame = db.ops.FrameInput() + range_frame = frame.sample() + test_out = db.ops.TestPy(frame=range_frame) + output_op = db.ops.Output(columns=[test_out]) + job = Job(op_args={ + frame: db.table('test1').column('frame'), + range_frame: db.sampler.range(0, 30), + output_op: 'test_hist' + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + + tables = db.run(bulk_job, force=True, show_progress=False) + next(tables[0].load(['dummy'])) + + +def test_python_batch_kernel(db): + db.register_op('TestPyBatch', 
[('frame', ColumnType.Video)], ['dummy']) + db.register_python_kernel( + 'TestPyBatch', + DeviceType.CPU, + cwd + '/test_py_batch_kernel.py', + batch=10) + + frame = db.ops.FrameInput() + range_frame = frame.sample() + test_out = db.ops.TestPyBatch(frame=range_frame, batch=50) + output_op = db.ops.Output(columns=[test_out]) + job = Job(op_args={ + frame: db.table('test1').column('frame'), + range_frame: db.sampler.range(0, 30), + output_op: 'test_hist' + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + + tables = db.run(bulk_job, force=True, show_progress=False) + next(tables[0].load(['dummy'])) + + +def test_blur(db): + frame = db.ops.FrameInput() + range_frame = frame.sample() + blurred_frame = db.ops.Blur(frame=range_frame, kernel_size=3, sigma=0.1) + output_op = db.ops.Output(columns=[blurred_frame]) + job = Job(op_args={ + frame: db.table('test1').column('frame'), + range_frame: db.sampler.range(0, 30), + output_op: 'test_blur', + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + tables = db.run(bulk_job, force=True, show_progress=False) + table = tables[0] + + fid, frames = next(table.load(['frame'])) + frame_array = frames[0] + assert fid == 0 + assert frame_array.dtype == np.uint8 + assert frame_array.shape[0] == 480 + assert frame_array.shape[1] == 640 + assert frame_array.shape[2] == 3 + + +def test_lossless(db): + frame = db.ops.FrameInput() + range_frame = frame.sample() + blurred_frame = db.ops.Blur(frame=range_frame, kernel_size=3, sigma=0.1) + output_op = db.ops.Output(columns=[blurred_frame.lossless()]) + + job = Job(op_args={ + frame: db.table('test1').column('frame'), + range_frame: db.sampler.range(0, 30), + output_op: 'test_blur_lossless' + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + tables = db.run(bulk_job, force=True, show_progress=False) + table = tables[0] + next(table.load(['frame'])) + + +def test_compress(db): + frame = db.ops.FrameInput() + range_frame = frame.sample() + blurred_frame = 
db.ops.Blur(frame=range_frame, kernel_size=3, sigma=0.1) + compressed_frame = blurred_frame.compress('video', bitrate=1 * 1024 * 1024) + output_op = db.ops.Output(columns=[compressed_frame]) + + job = Job(op_args={ + frame: db.table('test1').column('frame'), + range_frame: db.sampler.range(0, 30), + output_op: 'test_blur_compressed' + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + tables = db.run(bulk_job, force=True, show_progress=False) + table = tables[0] + next(table.load(['frame'])) + + +def test_save_mp4(db): + frame = db.ops.FrameInput() + range_frame = frame.sample() + blurred_frame = db.ops.Blur(frame=range_frame, kernel_size=3, sigma=0.1) + output_op = db.ops.Output(columns=[blurred_frame]) + + job = Job(op_args={ + frame: db.table('test1').column('frame'), + range_frame: db.sampler.range(0, 30), + output_op: 'test_save_mp4' + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + tables = db.run(bulk_job, force=True, show_progress=False) + table = tables[0] + f = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') + f.close() + table.column('frame').save_mp4(f.name) + run(['rm', '-rf', f.name]) + + +@pytest.fixture() +def no_workers_db(): + # Create new config + #with tempfile.NamedTemporaryFile(delete=False) as f: + with open('/tmp/config_test', 'w') as f: + cfg = Config.default_config() + cfg['storage']['db_path'] = tempfile.mkdtemp() + cfg['network']['master'] = 'localhost' + cfg['network']['master_port'] = '5020' + cfg['network']['worker_port'] = '5021' + f.write(toml.dumps(cfg)) + cfg_path = f.name + + # Setup and ingest video + with Database(debug=True, workers=[], config_path=cfg_path) as db: + # Download video from GCS + url = "https://storage.googleapis.com/scanner-data/test/short_video.mp4" + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f: + host = socket.gethostname() + # HACK: special proxy case for Ocean cluster + if host in ['ocean', 'crissy', 'pismo', 'stinson']: + resp = requests.get( + url, + 
stream=True, + proxies={'https': 'http://proxy.pdl.cmu.edu:3128/'}) + else: + resp = requests.get(url, stream=True) + assert resp.ok + for block in resp.iter_content(1024): + f.write(block) + vid1_path = f.name + + # Make a second one shorter than the first + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f: + vid2_path = f.name + run([ + 'ffmpeg', '-y', '-i', vid1_path, '-ss', '00:00:00', '-t', + '00:00:10', '-c:v', 'libx264', '-strict', '-2', vid2_path + ]) + + db.ingest_videos([('test1', vid1_path), ('test2', vid2_path)]) + + yield db + + # Tear down + run([ + 'rm', '-rf', cfg['storage']['db_path'], cfg_path, vid1_path, + vid2_path + ]) + + +def test_no_workers(no_workers_db): + db = no_workers_db + + frame = db.ops.FrameInput() + hist = db.ops.Histogram(frame=frame) + output_op = db.ops.Output(columns=[hist]) + + job = Job(op_args={ + frame: db.table('test1').column('frame'), + output_op: '_ignore' + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + + exc = False + try: + output = db.run(bulk_job, show_progress=False, force=True) + except ScannerException: + exc = True + + assert exc + + +@pytest.fixture() +def fault_db(): + # Create new config + #with tempfile.NamedTemporaryFile(delete=False) as f: + with open('/tmp/config_test', 'w') as f: + cfg = Config.default_config() + cfg['storage']['db_path'] = tempfile.mkdtemp() + cfg['network']['master'] = 'localhost' + cfg['network']['master_port'] = '5010' + cfg['network']['worker_port'] = '5011' + f.write(toml.dumps(cfg)) + cfg_path = f.name + + # Setup and ingest video + with Database(master='localhost:5010', + workers=[], + config_path=cfg_path, no_workers_timeout=120) as db: + # Download video from GCS + url = "https://storage.googleapis.com/scanner-data/test/short_video.mp4" + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f: + host = socket.gethostname() + # HACK: special proxy case for Ocean cluster + if host in ['ocean', 'crissy', 'pismo', 'stinson']: + resp = 
requests.get( + url, + stream=True, + proxies={'https': 'http://proxy.pdl.cmu.edu:3128/'}) + else: + resp = requests.get(url, stream=True) + assert resp.ok + for block in resp.iter_content(1024): + f.write(block) + vid1_path = f.name + + # Make a second one shorter than the first + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f: + vid2_path = f.name + run([ + 'ffmpeg', '-y', '-i', vid1_path, '-ss', '00:00:00', '-t', + '00:00:10', '-c:v', 'libx264', '-strict', '-2', vid2_path + ]) + + db.ingest_videos([('test1', vid1_path), ('test2', vid2_path)]) + + yield db + + # Tear down + run([ + 'rm', '-rf', cfg['storage']['db_path'], cfg_path, vid1_path, + vid2_path + ]) + + +# def test_clean_worker_shutdown(fault_db): +# spawn_port = 5010 +# def worker_shutdown_task(config, master_address, worker_address): +# from scannerpy import ProtobufGenerator, Config, start_worker +# import time +# import grpc +# import subprocess + +# c = Config(None) + +# import scanner.metadata_pb2 as metadata_types +# import scanner.engine.rpc_pb2 as rpc_types +# import scanner.types_pb2 as misc_types +# import libscanner as bindings + +# protobufs = ProtobufGenerator(config) + +# # Wait to kill worker +# time.sleep(8) +# # Kill worker +# channel = grpc.insecure_channel( +# worker_address, +# options=[('grpc.max_message_length', 24499183 * 2)]) +# worker = protobufs.WorkerStub(channel) + +# try: +# worker.Shutdown(protobufs.Empty()) +# except grpc.RpcError as e: +# status = e.code() +# if status == grpc.StatusCode.UNAVAILABLE: +# print('could not shutdown worker!') +# exit(1) +# else: +# raise ScannerException('Worker errored with status: {}' +# .format(status)) + +# # Wait a bit +# time.sleep(15) +# script_dir = os.path.dirname(os.path.realpath(__file__)) +# subprocess.call(['python ' + script_dir + '/spawn_worker.py'], +# shell=True) + +# master_addr = fault_db._master_address +# worker_addr = fault_db._worker_addresses[0] +# shutdown_process = 
Process(target=worker_shutdown_task, +# args=(fault_db.config, master_addr, worker_addr)) +# shutdown_process.daemon = True +# shutdown_process.start() + +# frame = fault_db.ops.FrameInput() +# range_frame = frame.sample() +# sleep_frame = fault_db.ops.SleepFrame(ignore = range_frame) +# output_op = fault_db.ops.Output(columns=[sleep_frame]) + +# job = Job( +# op_args={ +# frame: fault_db.table('test1').column('frame'), +# range_frame: fault_db.sampler.range(0, 15), +# output_op: 'test_shutdown', +# } +# ) +# bulk_job = BulkJob(output=output_op, jobs=[job]) +# table = fault_db.run(bulk_job, pipeline_instances_per_node=1, force=True, +# show_progress=False) +# table = table[0] +# assert len([_ for _, _ in table.column('dummy').load()]) == 15 + +# # Shutdown the spawned worker +# channel = grpc.insecure_channel( +# 'localhost:' + str(spawn_port), +# options=[('grpc.max_message_length', 24499183 * 2)]) +# worker = fault_db.protobufs.WorkerStub(channel) + +# try: +# worker.Shutdown(fault_db.protobufs.Empty()) +# except grpc.RpcError as e: +# status = e.code() +# if status == grpc.StatusCode.UNAVAILABLE: +# print('could not shutdown worker!') +# exit(1) +# else: +# raise ScannerException('Worker errored with status: {}' +# .format(status)) +# shutdown_process.join() + + +def test_fault_tolerance(fault_db): + force_kill_spawn_port = 5012 + normal_spawn_port = 5013 + + def worker_killer_task(config, master_address): + from scannerpy import ProtobufGenerator, Config, start_worker + import time + import grpc + import subprocess + import signal + import os + + c = Config(None) + + import scanner.metadata_pb2 as metadata_types + import scanner.engine.rpc_pb2 as rpc_types + import scanner.types_pb2 as misc_types + import scannerpy.libscanner as bindings + + protobufs = ProtobufGenerator(config) + + # Spawn a worker that we will force kill + script_dir = os.path.dirname(os.path.realpath(__file__)) + with open(os.devnull, 'w') as fp: + p = subprocess.Popen( + [ + 'python ' + 
script_dir + + '/spawn_worker.py {:d}'.format(force_kill_spawn_port) + ], + shell=True, + stdout=fp, + stderr=fp, + preexec_fn=os.setsid) + + # Wait a bit for the worker to do its thing + time.sleep(10) + + # Force kill worker process to trigger fault tolerance + os.killpg(os.getpgid(p.pid), signal.SIGTERM) + p.kill() + p.communicate() + + # Wait for fault tolerance to kick in + time.sleep(15) + + # Spawn the worker again + subprocess.call( + [ + 'python ' + script_dir + + '/spawn_worker.py {:d}'.format(normal_spawn_port) + ], + shell=True) + + master_addr = fault_db._master_address + killer_process = Process( + target=worker_killer_task, + args=(fault_db.config, master_addr)) + killer_process.daemon = True + killer_process.start() + + frame = fault_db.ops.FrameInput() + range_frame = frame.sample() + sleep_frame = fault_db.ops.SleepFrame(ignore=range_frame) + output_op = fault_db.ops.Output(columns=[sleep_frame]) + + job = Job(op_args={ + frame: fault_db.table('test1').column('frame'), + range_frame: fault_db.sampler.range(0, 20), + output_op: 'test_fault', + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + table = fault_db.run( + bulk_job, + pipeline_instances_per_node=1, + force=True, + show_progress=False) + table = table[0] + + assert len([_ for _, _ in table.column('dummy').load()]) == 20 + + # Shutdown the spawned worker + channel = grpc.insecure_channel( + 'localhost:' + str(normal_spawn_port), + options=[('grpc.max_message_length', 24499183 * 2)]) + worker = fault_db.protobufs.WorkerStub(channel) + + try: + worker.Shutdown(fault_db.protobufs.Empty()) + except grpc.RpcError as e: + status = e.code() + if status == grpc.StatusCode.UNAVAILABLE: + print('could not shutdown worker!') + exit(1) + else: + raise ScannerException('Worker errored with status: {}' + .format(status)) + killer_process.join() + +@pytest.fixture() +def blacklist_db(): + # Create new config + #with tempfile.NamedTemporaryFile(delete=False) as f: + with open('/tmp/config_test', 'w') 
as f: + cfg = Config.default_config() + cfg['storage']['db_path'] = tempfile.mkdtemp() + cfg['network']['master'] = 'localhost' + cfg['network']['master_port'] = '5055' + cfg['network']['worker_port'] = '5060' + f.write(toml.dumps(cfg)) + cfg_path = f.name + + # Setup and ingest video + master = 'localhost:5055' + workers = ['localhost:{:04d}'.format(5060 + d) for d in range(4)] + with Database(config_path=cfg_path, no_workers_timeout=120, + master=master, workers=workers) as db: + # Download video from GCS + url = "https://storage.googleapis.com/scanner-data/test/short_video.mp4" + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f: + host = socket.gethostname() + # HACK: special proxy case for Ocean cluster + if host in ['ocean', 'crissy', 'pismo', 'stinson']: + resp = requests.get(url, stream=True, proxies={ + 'https': 'http://proxy.pdl.cmu.edu:3128/' + }) + else: + resp = requests.get(url, stream=True) + assert resp.ok + for block in resp.iter_content(1024): + f.write(block) + vid1_path = f.name + + # Make a second one shorter than the first + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f: + vid2_path = f.name + run(['ffmpeg', '-y', '-i', vid1_path, '-ss', '00:00:00', '-t', + '00:00:10', '-c:v', 'libx264', '-strict', '-2', vid2_path]) + + db.ingest_videos([('test1', vid1_path), ('test2', vid2_path)]) + + yield db + + # Tear down + run(['rm', '-rf', + cfg['storage']['db_path'], + cfg_path, + vid1_path, + vid2_path]) + +def test_job_blacklist(blacklist_db): + db = blacklist_db + db.register_op('TestPyFail', + [('frame', ColumnType.Video)], + ['dummy']) + db.register_python_kernel('TestPyFail', DeviceType.CPU, + cwd + '/test_py_fail_kernel.py') + + frame = db.ops.FrameInput() + range_frame = frame.sample() + failed_output = db.ops.TestPyFail(frame=range_frame) + output_op = db.ops.Output(columns=[failed_output]) + + job = Job( + op_args={ + frame: db.table('test1').column('frame'), + range_frame: db.sampler.range(0, 1), + 
output_op: 'test_py_fail' + } + ) + bulk_job = BulkJob(output=output_op, jobs=[job]) + tables = db.run(bulk_job, force=True, show_progress=False, + pipeline_instances_per_node=1) + table = tables[0] + assert table.committed() == False + + +@pytest.fixture() +def timeout_db(): + # Create new config + #with tempfile.NamedTemporaryFile(delete=False) as f: + with open('/tmp/config_test', 'w') as f: + cfg = Config.default_config() + cfg['storage']['db_path'] = tempfile.mkdtemp() + cfg['network']['master'] = 'localhost' + cfg['network']['master_port'] = '5155' + cfg['network']['worker_port'] = '5160' + f.write(toml.dumps(cfg)) + cfg_path = f.name + + # Setup and ingest video + master = 'localhost:5155' + workers = ['localhost:{:04d}'.format(5160 + d) for d in range(4)] + with Database(config_path=cfg_path, no_workers_timeout=120, + master=master, workers=workers) as db: + # Download video from GCS + url = "https://storage.googleapis.com/scanner-data/test/short_video.mp4" + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f: + host = socket.gethostname() + # HACK: special proxy case for Ocean cluster + if host in ['ocean', 'crissy', 'pismo', 'stinson']: + resp = requests.get(url, stream=True, proxies={ + 'https': 'http://proxy.pdl.cmu.edu:3128/' + }) + else: + resp = requests.get(url, stream=True) + assert resp.ok + for block in resp.iter_content(1024): + f.write(block) + vid1_path = f.name + + # Make a second one shorter than the first + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as f: + vid2_path = f.name + run(['ffmpeg', '-y', '-i', vid1_path, '-ss', '00:00:00', '-t', + '00:00:10', '-c:v', 'libx264', '-strict', '-2', vid2_path]) + + db.ingest_videos([('test1', vid1_path), ('test2', vid2_path)]) + + yield db + + # Tear down + run(['rm', '-rf', + cfg['storage']['db_path'], + cfg_path, + vid1_path, + vid2_path]) + + +def test_job_timeout(timeout_db): + db = timeout_db + + frame = db.ops.FrameInput() + range_frame = frame.sample() + 
sleep_frame = db.ops.SleepFrame(ignore=range_frame) + output_op = db.ops.Output(columns=[sleep_frame]) + + job = Job(op_args={ + frame: db.table('test1').column('frame'), + range_frame: db.sampler.range(0, 1), + output_op: 'test_timeout', + }) + bulk_job = BulkJob(output=output_op, jobs=[job]) + table = db.run( + bulk_job, + pipeline_instances_per_node=1, + task_timeout=0.1, + force=True, + show_progress=False) + table = table[0] + + assert table.committed() == False diff --git a/tests/spawn_worker.py b/tests/spawn_worker.py new file mode 100644 index 00000000..fc6906b5 --- /dev/null +++ b/tests/spawn_worker.py @@ -0,0 +1,27 @@ +from scannerpy import ProtobufGenerator, Config, start_worker +import time +import grpc +import sys + +c = Config(None) + +import scanner.metadata_pb2 as metadata_types +import scanner.engine.rpc_pb2 as rpc_types +import scanner.types_pb2 as misc_types +import scannerpy.libscanner as bindings + +con = Config(config_path='/tmp/config_test') +protobufs = ProtobufGenerator(con) + +master_address = str(con.master_address) + ':' + str(con.master_port) +port = int(sys.argv[1]) + +params = bindings.default_machine_params() +mp = protobufs.MachineParameters() +mp.ParseFromString(params) +del mp.gpu_ids[:] +params = mp.SerializeToString() + +start_worker(master_address, machine_params=params, config=con, block=True, + port=port, + watchdog=False) diff --git a/tests/test_py_batch_kernel.py b/tests/test_py_batch_kernel.py new file mode 100644 index 00000000..0fbdce60 --- /dev/null +++ b/tests/test_py_batch_kernel.py @@ -0,0 +1,21 @@ +import scannerpy +import scannerpy.stdlib.writers as writers + +class TestPyBatchKernel(scannerpy.Kernel): + def __init__(self, config, protobufs): + self.protobufs = protobufs + pass + + def close(self): + pass + + def execute(self, input_columns): + point = protobufs.Point() + point.x = 10 + point.y = 5 + input_count = len(input_columns[0]) + column_count = len(input_columns) + return [[point.SerializeToString() for _ 
in xrange(input_count)] + for _ in xrange(column_count)] + +KERNEL = TestPyBatchKernel diff --git a/tests/test_py_fail_kernel.py b/tests/test_py_fail_kernel.py new file mode 100644 index 00000000..9408096f --- /dev/null +++ b/tests/test_py_fail_kernel.py @@ -0,0 +1,15 @@ +import scannerpy +import scannerpy.stdlib.writers as writers + +class TestPyFailKernel(scannerpy.Kernel): + def __init__(self, config, protobufs): + self.protobufs = protobufs + pass + + def close(self): + pass + + def execute(self, input_columns): + raise scannerpy.ScannerException('Test') + +KERNEL = TestPyFailKernel diff --git a/tests/test_py_kernel.py b/tests/test_py_kernel.py new file mode 100644 index 00000000..c9b8cdb2 --- /dev/null +++ b/tests/test_py_kernel.py @@ -0,0 +1,18 @@ +import scannerpy +import scannerpy.stdlib.writers as writers + +class TestPyKernel(scannerpy.Kernel): + def __init__(self, config, protobufs): + self.protobufs = protobufs + pass + + def close(self): + pass + + def execute(self, input_columns): + point = self.protobufs.Point() + point.x = 10 + point.y = 5 + return [point.SerializeToString()] + +KERNEL = TestPyKernel diff --git a/tests/videos.h b/tests/videos.h index ba6fffd3..556b3181 100644 --- a/tests/videos.h +++ b/tests/videos.h @@ -13,14 +13,14 @@ * limitations under the License.
*/ -#include "scanner/video/decoder_automata.h" #include "scanner/util/fs.h" +#include "scanner/video/decoder_automata.h" #include namespace scanner { struct TestVideoInfo { - TestVideoInfo(i32 w, i32 h, const std::string &u, const std::string& m) + TestVideoInfo(i32 w, i32 h, const std::string& u, const std::string& m) : width(w), height(h), data_url(u), metadata_url(m) {} i32 width; diff --git a/thirdparty/CMakeLists.txt b/thirdparty/CMakeLists.txt deleted file mode 100644 index 11ca4cb3..00000000 --- a/thirdparty/CMakeLists.txt +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright 2016 Carnegie Mellon University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR) - -project(ScannerThirdparty) - -include(ExternalProject) - -list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/../cmake/Modules/") - -set(GLOBAL_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) -set(THIRDPARTY_SOURCE_DIR "${CMAKE_SOURCE_DIR}") - -find_package(TinyToml) -find_package(Storehouse CONFIG) -find_package(Struck CONFIG) -find_package(GoogleTest) - -if (NOT TINYTOML_FOUND) - ExternalProject_Add(TinyToml - GIT_REPOSITORY "https://github.com/mayah/tinytoml.git" - GIT_TAG "3559856002eee57693349b8a2d8a0cf6250d269c" - - UPDATE_COMMAND "" - - SOURCE_DIR "${THIRDPARTY_SOURCE_DIR}/tinytoml" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - - INSTALL_COMMAND - mkdir -p ${GLOBAL_OUTPUT_PATH}/tinytoml && cp -r ${THIRDPARTY_SOURCE_DIR}/tinytoml/include ${GLOBAL_OUTPUT_PATH}/tinytoml) -endif() - -if (NOT STOREHOUSE_FOUND) - # storehouse - if (NOT OPENSSL_ROOT_DIR) - set(OPENSSL_ROOT_DIR $ENV{OPENSSL_ROOT_DIR}) - endif() - - include(ProcessorCount) - ProcessorCount(N) - ExternalProject_Add(Storehouse - GIT_REPOSITORY "https://github.com/scanner-research/storehouse" - GIT_TAG "52189a0ed9958c24b8451127ea60dbf1ddbe4237" - - UPDATE_COMMAND mkdir -p thirdparty/build && cd thirdparty/build && - cmake ${THIRDPARTY_SOURCE_DIR}/storehouse/thirdparty && make -j${N} - - SOURCE_DIR "${THIRDPARTY_SOURCE_DIR}/storehouse" - - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOBAL_OUTPUT_PATH}/storehouse - -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DBZIP2_INCLUDE_DIR=${BZIP2_INCLUDE_DIR} - -DCMAKE_LIBRARY_PATH=${CMAKE_LIBRARY_PATH} - -DOPENSSL_ROOT_DIR=${OPENSSL_ROOT_DIR} - -DBUILD_STATIC=ON - - CMAKE_CACHE_ARGS - -DGLOG_ROOT_DIR:FILEPATH=${GLOG_ROOT_DIR} - -DGFLAGS_ROOT_DIR:FILEPATH=${GFLAGS_ROOT_DIR} - -DBOOST_ROOT:FILEPATH=${BOOST_ROOT} - - INSTALL_DIR "${GLOBAL_OUTPUT_PATH}/storehouse") -endif() - -if (NOT STRUCK_FOUND) - find_package(OpenCV) - # struck - 
ExternalProject_Add(Struck - GIT_REPOSITORY "https://github.com/scanner-research/struck" - GIT_TAG "3ad24858e2ab08b7188e1401b66e5925c4b7ad4f" - - UPDATE_COMMAND "" - - SOURCE_DIR "${THIRDPARTY_SOURCE_DIR}/struck" - - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOBAL_OUTPUT_PATH}/struck - -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_LIBRARY_PATH=${CMAKE_LIBRARY_PATH} - -DOPENCV_ROOT_DIR=${OPENCV_ROOT_DIR} - - CMAKE_CACHE_ARGS "" - - INSTALL_DIR "${GLOBAL_OUTPUT_PATH}/struck") -endif() - -if (NOT GOOGLETEST_FOUND) - ExternalProject_Add(GoogleTest - GIT_REPOSITORY "https://github.com/google/googletest" - GIT_TAG 0a439623f75c029912728d80cb7f1b8b48739ca4 - - UPDATE_COMMAND "" - - SOURCE_DIR "${THIRDPARTY_SOURCE_DIR}/googletest" - - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOBAL_OUTPUT_PATH}/googletest - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - - INSTALL_DIR "${GLOBAL_OUTPUT_PATH}/googletest" - ) -endif() diff --git a/thirdparty/resources/caffe/Makefile.config b/thirdparty/resources/caffe/Makefile.config new file mode 100644 index 00000000..48dd599d --- /dev/null +++ b/thirdparty/resources/caffe/Makefile.config @@ -0,0 +1,112 @@ +## Refer to http://caffe.berkeleyvision.org/installation.html +# Contributions simplifying and improving our build system are welcome! + +# cuDNN acceleration switch (uncomment to build with cuDNN). +USE_CUDNN := 1 + +# CPU-only switch (uncomment to build without GPU support). 
+# CPU_ONLY := 1 + +# uncomment to disable IO dependencies and corresponding data layers +# USE_OPENCV := 0 +# USE_LEVELDB := 0 +# USE_LMDB := 0 + +# uncomment to allow MDB_NOLOCK when reading LMDB files (only if necessary) +# You should not set this flag if you will be reading LMDBs with any +# possibility of simultaneous read and write +# ALLOW_LMDB_NOLOCK := 1 + +# Uncomment if you're using OpenCV 3 +OPENCV_VERSION := 3 + +# To customize your choice of compiler, uncomment and set the following. +# N.B. the default for Linux is g++ and the default for OSX is clang++ +# CUSTOM_CXX := g++ + +# CUDA directory contains bin/ and lib/ directories that we need. +CUDA_DIR := /usr/local/cuda +# On Ubuntu 14.04, if cuda tools are installed via +# "sudo apt-get install nvidia-cuda-toolkit" then use this instead: +# CUDA_DIR := /usr + +# CUDA architecture setting: going with all of them. +# For CUDA < 6.0, comment the *_50 lines for compatibility. +CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ + -gencode arch=compute_20,code=sm_21 \ + -gencode arch=compute_30,code=sm_30 \ + -gencode arch=compute_35,code=sm_35 \ + -gencode arch=compute_50,code=sm_50 \ + -gencode arch=compute_50,code=compute_50 + +# BLAS choice: +# atlas for ATLAS (default) +# mkl for MKL +# open for OpenBlas +BLAS := atlas +# Custom (MKL/ATLAS/OpenBLAS) include and lib directories. +# Leave commented to accept the defaults for your choice of BLAS +# (which should work)! +# BLAS_INCLUDE := /path/to/your/blas +# BLAS_LIB := /path/to/your/blas + +# Homebrew puts openblas in a directory that is not on the standard search path +# BLAS_INCLUDE := $(shell brew --prefix openblas)/include +# BLAS_LIB := $(shell brew --prefix openblas)/lib + +# This is required only if you will compile the matlab interface. +# MATLAB directory should contain the mex binary in /bin. +# MATLAB_DIR := /usr/local +# MATLAB_DIR := /Applications/MATLAB_R2012b.app + +# NOTE: this is required only if you will compile the python interface. 
+# We need to be able to find Python.h and numpy/arrayobject.h. +PYTHON_INCLUDE := /usr/include/python2.7 \ + /usr/lib/python2.7/dist-packages/numpy/core/include +# Anaconda Python distribution is quite popular. Include path: +# Verify anaconda location, sometimes it's in root. +# ANACONDA_HOME := $(HOME)/anaconda +# PYTHON_INCLUDE := $(ANACONDA_HOME)/include \ + # $(ANACONDA_HOME)/include/python2.7 \ + # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ + +# Uncomment to use Python 3 (default is Python 2) +# PYTHON_LIBRARIES := boost_python3 python3.5m +# PYTHON_INCLUDE := /usr/include/python3.5m \ +# /usr/lib/python3.5/dist-packages/numpy/core/include + +# We need to be able to find libpythonX.X.so or .dylib. +PYTHON_LIB := /usr/lib +# PYTHON_LIB := $(ANACONDA_HOME)/lib + +# Homebrew installs numpy in a non standard path (keg only) +# PYTHON_INCLUDE += $(dir $(shell python -c 'import numpy.core; print(numpy.core.__file__)'))/include +# PYTHON_LIB += $(shell brew --prefix numpy)/lib + +# Uncomment to support layers written in Python (will link against Python libs) +# WITH_PYTHON_LAYER := 1 + +# Whatever else you find you need goes here. +INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include +LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib + +# If Homebrew is installed at a non standard location (for example your home directory) and you use it for general dependencies +# INCLUDE_DIRS += $(shell brew --prefix)/include +# LIBRARY_DIRS += $(shell brew --prefix)/lib + +# Uncomment to use `pkg-config` to specify OpenCV library paths. +# (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.) +# USE_PKG_CONFIG := 1 + +# N.B. both build and distribute dirs are cleared on `make clean` +BUILD_DIR := build +DISTRIBUTE_DIR := distribute + +# Uncomment for debugging. 
Does not work on OSX due to https://github.com/BVLC/caffe/issues/171 +# DEBUG := 1 + +# The ID of the GPU that 'make runtest' will use to run unit tests. +TEST_GPUID := 0 + +# enable pretty build (comment to see full commands) +Q ?= @ diff --git a/thirdparty/resources/mkl/silent.cfg b/thirdparty/resources/mkl/silent.cfg new file mode 100644 index 00000000..f436bbf3 --- /dev/null +++ b/thirdparty/resources/mkl/silent.cfg @@ -0,0 +1,36 @@ +# Patterns used to check silent configuration file +# +# anythingpat - any string +# filepat - the file location pattern (/file/location/to/license.lic) +# lspat - the license server address pattern (0123@hostname) +# snpat - the serial number pattern (ABCD-01234567) + +# Accept EULA, valid values are: {accept, decline} +ACCEPT_EULA=accept + +# Optional error behavior, valid values are: {yes, no} +CONTINUE_WITH_OPTIONAL_ERROR=yes + +# Continue with overwrite of existing installation directory, valid values are: {yes, no} +CONTINUE_WITH_INSTALLDIR_OVERWRITE=yes + +# List of components to install, valid values are: {ALL, DEFAULTS, anythingpat} +COMPONENTS=DEFAULTS + +# Installation mode, valid values are: {install, repair, uninstall} +PSET_MODE=install + +# Directory for non-RPM database, valid values are: {filepat} +#NONRPM_DB_DIR=filepat + +# Path to the cluster description file, valid values are: {filepat} +#CLUSTER_INSTALL_MACHINES_FILE=filepat + +# Perform validation of digital signatures of RPM files, valid values are: {yes, no} +SIGNING_ENABLED=yes + +# Select target architecture of your applications, valid values are: {IA32, INTEL64, ALL} +ARCH_SELECTED=ALL + +# Install location, valid values are: {/opt/intel, filepat} +#PSET_INSTALL_DIR=/opt/intel