From c522427c8dfac7173ac244cdb665476e3bf63c36 Mon Sep 17 00:00:00 2001
From: Leo Vandriel <git@leovandriel.com>
Date: Thu, 12 Oct 2017 16:29:52 -0700
Subject: [PATCH] fix use of utils

---
 README.md                         |  22 +-
 include/caffe2/util/cmd.h         |   7 +-
 include/caffe2/util/misc.h        | 430 ------------------------------
 include/caffe2/util/model.h       |  14 +-
 include/caffe2/util/net.h         |   7 +
 include/caffe2/util/preprocess.h  | 240 +++++++++++++++++
 include/caffe2/util/train.h       |  95 +++++++
 include/caffe2/zoo/keeper.h       |  56 ++--
 src/caffe2/binaries/diff.cc       |  10 +-
 src/caffe2/binaries/dream.cc      |  82 +++---
 src/caffe2/binaries/imagenet.cc   |  39 ++-
 src/caffe2/binaries/inspect.cc    |  35 ++-
 src/caffe2/binaries/mnist.cc      |  63 +++--
 src/caffe2/binaries/pretrained.cc |  10 +-
 src/caffe2/binaries/retrain.cc    | 264 ------------------
 src/caffe2/binaries/rnn.cc        | 129 ++++-----
 src/caffe2/binaries/train.cc      | 269 ++++++++-----------
 src/caffe2/util/model.cc          | 215 ++++++++++++++-
 src/caffe2/util/net.cc            |  45 ++--
 19 files changed, 936 insertions(+), 1096 deletions(-)
 delete mode 100644 include/caffe2/util/misc.h
 create mode 100644 include/caffe2/util/preprocess.h
 create mode 100644 include/caffe2/util/train.h
 delete mode 100644 src/caffe2/binaries/retrain.cc
diff --git a/README.md b/README.md
index 69872046..997b7a76 100644
--- a/README.md
+++ b/README.md
@@ -130,17 +130,17 @@ This tutorial is transcribed in [rnn.cc](src/caffe2/binaries/rnn.cc). It takes t
 
     ./bin/rnn
 
-In contrast to the tutorial, this script terminates after 10K iterations. To get more, use `--train_runs`:
+In contrast to the tutorial, this script terminates after 10K iterations. To get more, use `--train-runs`:
 
-    ./bin/run --train_runs 100000
+    ./bin/rnn --train-runs 100000
 
 To get better results (loss < 1), expand the hidden layer:
 
-    ./bin/rnn --train_runs 100000 --batch_size 32 --hidden_size 512 --seq_length 32
+    ./bin/rnn --train-runs 100000 --batch-size 32 --hidden-size 512 --seq-length 32
 
 The file `res/dickens.txt` contains a larger volume of text. Because the writing is a bit more recent, it's more challenging to generate convincing results. Also, single newlines are stripped to allow for more creativity.
 
-    ./bin/rnn --train_runs 100000 --batch_size 32 --hidden_size 768 --seq_length 32 --train_data res/dickens.txt
+    ./bin/rnn --train-runs 100000 --batch-size 32 --hidden-size 768 --seq-length 32 --train-data res/dickens.txt
 
 After 200K runs, the loss has not dropped below 36, in contrast to the shakespeare text. Perhaps this requires an additional hidden layer in the LSTM model.
 
@@ -150,7 +150,7 @@ Much of the progress in image recognition is published after the yearly [ImageNe
 
 To classify the content of an image, run:
 
-    ./bin/imagenet --model <name-of-model> --image_file <some-image>
+    ./bin/imagenet --model <name-of-model> --image-file <some-image>
 
 Where the model name is one of the following:
 
@@ -180,21 +180,21 @@ The article [DeCAF: A Deep Convolutional Activation Feature for Generic Visual R
 
 First divide all images in subfolders with the label a folder name. Then to retrain the final layer of GoogleNet:
 
-    ./bin/retrain --model googlenet --folder <image-folder> --layer pool5/7x7_s1
+    ./bin/train --model googlenet --folder <image-folder> --layer pool5/7x7_s1
 
 The script starts out by collecting all images and running them through the pre-trained part of the model. This allows for very fast training on the pre-processed image data.
 
 If you have more (GPU) power at your disposal retrain VGG16's final 2 layers:
 
-    ./bin/retrain --model vgg16 --folder <image-folder> --layer fc6
+    ./bin/train --model vgg16 --folder <image-folder> --layer fc6
 
 Some models, like SqueezeNet require reshaping of their output to N x D tensor:
 
-    ./bin/retrain --model squeezenet --folder <image-folder> --layer fire9/concat --reshape_output
+    ./bin/train --model squeezenet --folder <image-folder> --layer fire9/concat --reshape-output
 
 You can also provide your own pre-trained model. Specify the location of the init and predict `.pb` file including a `%` character:
 
-    ./bin/retrain --model res/googlenet_%_net.pb --folder <image-folder> --layer pool5/7x7_s1
+    ./bin/train --model res/googlenet_%_net.pb --folder <image-folder> --layer pool5/7x7_s1
 
 See also:
 
@@ -203,7 +203,7 @@ See also:
 
 ## Training from scratch
 
-To fully train an existing image classification model from scratch, run:
+To fully train an existing image classification model from scratch, run without the `--layer` option:
 
     ./bin/train --model <model-name> --folder <image-folder>
 
@@ -213,7 +213,7 @@ Add `--display` for training visualization.
 
 Some models, like SqueezeNet require reshaping of their output to N x D tensor:
 
-    ./bin/train --model squeezenet --folder <image-folder> --reshape_output
+    ./bin/train --model squeezenet --folder <image-folder> --reshape-output
 
 ## Deep Dream
 
diff --git a/include/caffe2/util/cmd.h b/include/caffe2/util/cmd.h
index af0c5ebe..2e1807f4 100644
--- a/include/caffe2/util/cmd.h
+++ b/include/caffe2/util/cmd.h
@@ -22,7 +22,6 @@ bool cmd_setup_cuda() {
   option.set_device_type(CUDA);
 #ifdef WITH_CUDA
   new CUDAContext(option);
-  std::cout << std::endl << "using CUDA" << std::endl;
   return true;
 #else
   return false;
@@ -50,11 +49,13 @@ bool cmd_init(const std::string title) {
     return false;
   }
 
-  if (FLAGS_device != "cpu") cmd_setup_cuda();
+  auto cuda = (FLAGS_device != "cpu" && cmd_setup_cuda());
 
   std::cout << "optimizer: " << FLAGS_optimizer << std::endl;
   std::cout << "device: " << FLAGS_device << std::endl;
-  std::cout << "dump_model: " << (FLAGS_dump_model ? "true" : "false")
+  std::cout << "using cuda: " << (cuda ? "true" : "false") << std::endl;
+  ;
+  std::cout << "dump-model: " << (FLAGS_dump_model ? "true" : "false")
             << std::endl;
 
   return true;
diff --git a/include/caffe2/util/misc.h b/include/caffe2/util/misc.h
deleted file mode 100644
index ad79f17e..00000000
--- a/include/caffe2/util/misc.h
+++ /dev/null
@@ -1,430 +0,0 @@
-#ifndef MISC_H
-#define MISC_H
-
-#include <caffe2/core/db.h>
-#include <caffe2/core/init.h>
-#include <caffe2/core/net.h>
-
-#include "caffe2/util/blob.h"
-#include "caffe2/util/model.h"
-#include "caffe2/util/tensor.h"
-
-#include <dirent.h>
-#include <sys/stat.h>
-
-namespace caffe2 {
-
-enum { kRunTrain = 0, kRunValidate = 1, kRunTest = 2, kRunNum = 3 };
-
-static std::map<int, std::string> name_for_run({
-    {kRunTrain, "train"}, {kRunValidate, "validate"}, {kRunTest, "test"},
-});
-
-static std::map<int, int> percentage_for_run({
-    {kRunTest, 10}, {kRunValidate, 20}, {kRunTrain, 70},
-});
-
-std::string filename_to_key(const std::string &filename) {
-  // return filename;
-  return std::to_string(std::hash<std::string>{}(filename)) + "_" + filename;
-}
-
-void load_labels(const std::string &folder, const std::string &path_prefix,
-                 std::vector<std::string> &class_labels,
-                 std::vector<std::pair<std::string, int>> &image_files) {
-  std::cout << "load class labels.." << std::endl;
-  auto classes_text_path = path_prefix + "classes.txt";
-  ;
-  std::ifstream infile(classes_text_path);
-  std::string line;
-  while (std::getline(infile, line)) {
-    if (line.size()) {
-      class_labels.push_back(line);
-      // std::cout << '.' << line << '.' << std::endl;
-    }
-  }
-
-  std::cout << "load image folder.." << std::endl;
-  auto directory = opendir(folder.c_str());
-  CAFFE_ENFORCE(directory, "no image folder " + folder);
-  if (directory) {
-    struct stat s;
-    struct dirent *entry;
-    while ((entry = readdir(directory))) {
-      auto class_name = entry->d_name;
-      auto class_path = folder + '/' + class_name;
-      if (class_name[0] != '.' && class_name[0] != '_' &&
-          !stat(class_path.c_str(), &s) && (s.st_mode & S_IFDIR)) {
-        auto subdir = opendir(class_path.c_str());
-        if (subdir) {
-          auto class_index =
-              find(class_labels.begin(), class_labels.end(), class_name) -
-              class_labels.begin();
-          if (class_index == class_labels.size()) {
-            class_labels.push_back(class_name);
-          }
-          while ((entry = readdir(subdir))) {
-            auto image_file = entry->d_name;
-            auto image_path = class_path + '/' + image_file;
-            if (image_file[0] != '.' && !stat(image_path.c_str(), &s) &&
-                (s.st_mode & S_IFREG)) {
-              // std::cout << class_name << ' ' <<  image_path << std::endl;
-              image_files.push_back({image_path, class_index});
-            }
-          }
-          closedir(subdir);
-        }
-      }
-    }
-    closedir(directory);
-  }
-  CAFFE_ENFORCE(image_files.size(), "no images found in " + folder);
-  std::random_shuffle(image_files.begin(), image_files.end());
-  std::cout << class_labels.size() << " labels found" << std::endl;
-  std::cout << image_files.size() << " images found" << std::endl;
-
-  std::cout << "write class labels.." << std::endl;
-  std::ofstream class_file(classes_text_path);
-  if (class_file.is_open()) {
-    for (auto &label : class_labels) {
-      class_file << label << std::endl;
-    }
-    class_file.close();
-  }
-  auto classes_header_path = path_prefix + "classes.h";
-  std::ofstream labels_file(classes_header_path.c_str());
-  if (labels_file.is_open()) {
-    labels_file << "const char * retrain_classes[] {";
-    bool first = true;
-    for (auto &label : class_labels) {
-      if (first) {
-        first = false;
-      } else {
-        labels_file << ',';
-      }
-      labels_file << std::endl << '"' << label << '"';
-    }
-    labels_file << std::endl << "};" << std::endl;
-    labels_file.close();
-  }
-}
-
-void write_batch(Workspace &workspace, NetBase *predict_net,
-                 std::string &input_name, std::string &output_name,
-                 std::vector<std::pair<std::string, int>> &batch_files,
-                 std::unique_ptr<db::DB> *database, int size_to_fit) {
-  std::unique_ptr<db::Transaction> transaction[kRunNum];
-  for (int i = 0; i < kRunNum; i++) {
-    transaction[i] = database[i]->NewTransaction();
-  }
-
-  std::vector<std::string> filenames;
-  for (auto &pair : batch_files) {
-    filenames.push_back(pair.first);
-  }
-  std::vector<int> indices;
-  TensorCPU input;
-  TensorUtil(input).ReadImages(filenames, size_to_fit, indices);
-  TensorCPU output;
-  if (predict_net) {
-    BlobUtil(*workspace.GetBlob(input_name)).Set(input);
-    predict_net->Run();
-    auto tensor = BlobUtil(*workspace.GetBlob(output_name)).Get();
-    output.ResizeLike(tensor);
-    output.ShareData(tensor);
-  } else {
-    output.ResizeLike(input);
-    output.ShareData(input);
-  }
-
-  TensorProtos protos;
-  TensorProto *data = protos.add_protos();
-  TensorProto *label = protos.add_protos();
-  data->set_data_type(TensorProto::FLOAT);
-  label->set_data_type(TensorProto::INT32);
-  label->add_int32_data(0);
-  TensorSerializer<CPUContext> serializer;
-  std::string value;
-  std::vector<TIndex> dims(output.dims().begin() + 1, output.dims().end());
-  auto size = output.size() / output.dim(0);
-  auto output_data = output.data<float>();
-  for (auto i : indices) {
-    auto single = TensorCPU(
-        dims, std::vector<float>(output_data, output_data + size), NULL);
-    output_data += size;
-    data->Clear();
-    serializer.Serialize(single, "", data, 0, kDefaultChunkSize);
-    label->set_int32_data(0, batch_files[i].second);
-    protos.SerializeToString(&value);
-    int percentage = 0, p = (int)(rand() * 100.0 / RAND_MAX);
-    auto key = filename_to_key(batch_files[i].first);
-    for (auto pair : percentage_for_run) {
-      percentage += pair.second;
-      if (p < percentage) {
-        transaction[pair.first]->Put(key, value);
-        break;
-      }
-    }
-  }
-
-  for (int i = 0; i < kRunNum; i++) {
-    transaction[i]->Commit();
-  }
-}
-
-void pre_process(const std::vector<std::pair<std::string, int>> &image_files,
-                 const std::string *db_paths, NetDef &init_model,
-                 NetDef &predict_model, const std::string &db_type,
-                 int batch_size, int size_to_fit) {
-  std::cout << "store partial prediction.." << std::endl;
-  std::unique_ptr<db::DB> database[kRunNum];
-  for (int i = 0; i < kRunNum; i++) {
-    database[i] = db::CreateDB(db_type, db_paths[i], db::WRITE);
-  }
-  auto image_count = 0;
-  Workspace workspace;
-  auto init_net = CreateNet(init_model, &workspace);
-  init_net->Run();
-  auto predict_net = predict_model.external_input_size()
-                         ? CreateNet(predict_model, &workspace)
-                         : NULL;
-  auto input_name = predict_model.external_input_size()
-                        ? predict_model.external_input(0)
-                        : "";
-  auto output_name = predict_model.external_output_size()
-                         ? predict_model.external_output(0)
-                         : "";
-  std::vector<std::pair<std::string, int>> batch_files;
-  for (auto &pair : image_files) {
-    auto &filename = pair.first;
-    auto class_index = pair.second;
-    image_count++;
-    auto in_db = false;
-    auto key = filename_to_key(filename);
-    for (int i = 0; i < kRunNum && !in_db; i++) {
-      auto cursor = database[i]->NewCursor();
-      cursor->Seek(key);
-      in_db |= (cursor->Valid() && cursor->key() == key);
-    }
-    if (!in_db) {
-      batch_files.push_back({filename, class_index});
-    }
-    if (image_count % 10 == 0) {
-      std::cerr << '\r' << std::string(40, ' ') << '\r' << "pre-processing.. "
-                << image_count << '/' << image_files.size() << " "
-                << std::setprecision(3)
-                << ((float)100 * image_count / image_files.size()) << "%"
-                << std::flush;
-    }
-    if (batch_files.size() == batch_size) {
-      write_batch(workspace, predict_net ? predict_net.get() : NULL, input_name,
-                  output_name, batch_files, database, size_to_fit);
-      batch_files.clear();
-    }
-  }
-  if (batch_files.size() > 0) {
-    write_batch(workspace, predict_net ? predict_net.get() : NULL, input_name,
-                output_name, batch_files, database, size_to_fit);
-  }
-  for (int i = 0; i < kRunNum; i++) {
-    CAFFE_ENFORCE(database[i]->NewCursor()->Valid(),
-                  "database " + name_for_run[i] + " is empty");
-  }
-  std::cerr << '\r' << std::string(80, ' ') << '\r' << image_files.size()
-            << " images processed" << std::endl;
-}
-
-void dump_database(const std::string db_path, const std::string &db_type) {
-  std::cout << "dumping database.." << std::endl;
-  std::unique_ptr<db::DB> database = db::CreateDB(db_type, db_path, db::READ);
-
-  for (auto cursor = database->NewCursor(); cursor->Valid(); cursor->Next()) {
-    auto key = cursor->key().substr(0, 48);
-    auto value = cursor->value();
-    TensorProtos protos;
-    protos.ParseFromString(value);
-    auto tensor_proto = protos.protos(0);
-    auto label_proto = protos.protos(1);
-    TensorDeserializer<CPUContext> deserializer;
-    TensorCPU tensor;
-    int label = label_proto.int32_data(0);
-    deserializer.Deserialize(tensor_proto, &tensor);
-    auto dims = tensor.dims();
-    dims.insert(dims.begin(), 1);
-    tensor.Resize(dims);
-    std::cout << key << "  "
-              << (value.size() > 1000 ? value.size() / 1000 : value.size())
-              << (value.size() > 1000 ? "K" : "B") << "  (" << tensor.dims()
-              << ")  " << label << std::endl;
-    TensorUtil(tensor).ShowImage("inspect", 0, 1.0, 128);
-  }
-}
-
-void pre_process(const std::vector<std::pair<std::string, int>> &image_files,
-                 const std::string *db_paths, const std::string &db_type,
-                 int size_to_fit) {
-  NetDef none;
-  pre_process(image_files, db_paths, none, none, db_type, 64, size_to_fit);
-}
-
-void split_model(NetDef &base_init_model, NetDef &base_predict_model,
-                 const std::string &layer, NetDef &first_init_model,
-                 NetDef &first_predict_model, NetDef &second_init_model,
-                 NetDef &second_predict_model, bool force_cpu,
-                 bool inclusive = true) {
-  std::cout << "split model.." << std::endl;
-  std::set<std::string> static_inputs =
-      NetUtil(base_predict_model).CollectLayers(layer);
-
-  // copy operators
-  for (const auto &op : base_init_model.op()) {
-    auto is_first = (static_inputs.find(op.output(0)) != static_inputs.end());
-    auto new_op = (is_first ? first_init_model : second_init_model).add_op();
-    new_op->CopyFrom(op);
-  }
-  for (const auto &op : base_predict_model.op()) {
-    auto is_first = (static_inputs.find(op.output(0)) != static_inputs.end() &&
-                     (inclusive || op.input(0) != op.output(0)));
-    auto new_op =
-        (is_first ? first_predict_model : second_predict_model).add_op();
-    new_op->CopyFrom(op);
-    if (!force_cpu) {
-      new_op->set_engine("CUDNN");  // TODO: not here
-    }
-  }
-
-  // copy externals
-  if (first_predict_model.op().size()) {
-    // first_predict_model.add_external_input(base_predict_model.external_input(0));
-  }
-  if (second_predict_model.op().size()) {
-    // second_predict_model.add_external_input(layer);
-  }
-  for (const auto &output : base_init_model.external_output()) {
-    auto is_first = (static_inputs.find(output) != static_inputs.end());
-    if (is_first) {
-      first_init_model.add_external_output(output);
-    } else {
-      second_init_model.add_external_output(output);
-    }
-  }
-  for (const auto &input : base_predict_model.external_input()) {
-    auto is_first = (static_inputs.find(input) != static_inputs.end());
-    if (is_first) {
-      first_predict_model.add_external_input(input);
-    } else {
-      second_predict_model.add_external_input(input);
-    }
-  }
-  if (first_predict_model.op().size()) {
-    first_predict_model.add_external_output(layer);
-  }
-  if (second_predict_model.op().size()) {
-    second_predict_model.add_external_output(
-        base_predict_model.external_output(0));
-  }
-
-  if (base_init_model.has_name()) {
-    if (!first_init_model.has_name()) {
-      first_init_model.set_name(base_init_model.name() + "_first");
-    }
-    if (!second_init_model.has_name()) {
-      second_init_model.set_name(base_init_model.name() + "_second");
-    }
-  }
-  if (base_predict_model.has_name()) {
-    if (!first_predict_model.has_name()) {
-      first_predict_model.set_name(base_predict_model.name() + "_first");
-    }
-    if (!second_predict_model.has_name()) {
-      second_predict_model.set_name(base_predict_model.name() + "_second");
-    }
-  }
-}
-
-void set_trainable(OperatorDef &op, bool train) {
-  if (op.type() == "Dropout") {
-    for (auto &arg : *op.mutable_arg()) {
-      if (arg.name() == "is_test") {
-        arg.set_i(!train);
-      }
-    }
-  }
-}
-
-void copy_train_model(NetDef &base_init_model, NetDef &base_predict_model,
-                      const std::string &layer, int out_size,
-                      NetDef &train_init_model, NetDef &train_predict_model) {
-  std::string last_w, last_b;
-  for (const auto &op : base_predict_model.op()) {
-    auto new_op = train_predict_model.add_op();
-    new_op->CopyFrom(op);
-    set_trainable(*new_op, true);
-    if (op.type() == "FC") {
-      last_w = op.input(1);
-      last_b = op.input(2);
-    }
-  }
-  NetUtil(train_predict_model).SetRenameInplace();
-  for (const auto &op : base_init_model.op()) {
-    auto &output = op.output(0);
-    auto init_op = train_init_model.add_op();
-    bool uniform = (output.find("_b") != std::string::npos);
-    init_op->set_type(uniform ? "ConstantFill" : "XavierFill");
-    for (const auto &arg : op.arg()) {
-      if (arg.name() == "shape") {
-        auto init_arg = init_op->add_arg();
-        init_arg->set_name("shape");
-        if (output == last_w) {
-          init_arg->add_ints(out_size);
-          init_arg->add_ints(arg.ints(1));
-        } else if (output == last_b) {
-          init_arg->add_ints(out_size);
-        } else {
-          init_arg->CopyFrom(arg);
-        }
-      }
-    }
-    init_op->add_output(output);
-  }
-  std::set<std::string> existing_inputs;
-  existing_inputs.insert(train_predict_model.external_input().begin(),
-                         train_predict_model.external_input().end());
-  for (const auto &op : train_predict_model.op()) {
-    for (auto &output : op.output()) {
-      existing_inputs.insert(output);
-    }
-  }
-  for (const auto &input : base_predict_model.external_input()) {
-    if (existing_inputs.find(input) == existing_inputs.end()) {
-      train_predict_model.add_external_input(input);
-    }
-  }
-  for (const auto &output : base_predict_model.external_output()) {
-    train_predict_model.add_external_output(output);
-  }
-  // auto op = train_init_model.add_op();
-  // op->set_type("ConstantFill");
-  // auto arg = op->add_arg();
-  // arg->set_name("shape");
-  // arg->add_ints(1);
-  // op->add_output(layer);
-}
-
-void copy_test_model(NetDef &base_predict_model, NetDef &test_predict_model) {
-  for (const auto &op : base_predict_model.op()) {
-    auto new_op = test_predict_model.add_op();
-    new_op->CopyFrom(op);
-    set_trainable(*new_op, false);
-  }
-  for (const auto &input : base_predict_model.external_input()) {
-    test_predict_model.add_external_input(input);
-  }
-  for (const auto &output : base_predict_model.external_output()) {
-    test_predict_model.add_external_output(output);
-  }
-}
-
-}  // namespace caffe2
-
-#endif  // MISC_H
diff --git a/include/caffe2/util/model.h b/include/caffe2/util/model.h
index 59304cf4..8fd2c3bf 100644
--- a/include/caffe2/util/model.h
+++ b/include/caffe2/util/model.h
@@ -16,7 +16,6 @@ class ModelUtil {
   }
   ModelUtil(NetUtil &init, NetUtil &predict) : init(init), predict(predict) {}
 
-  void SetName(const std::string &name);
   void AddDatabaseOps(const std::string &name, const std::string &data,
                       const std::string &db, const std::string &db_type,
                       int batch_size);
@@ -43,6 +42,19 @@ class ModelUtil {
 
   std::vector<std::string> Params() { return predict.CollectParams(); }
 
+  void Split(const std::string &layer, ModelUtil &firstModel,
+             ModelUtil &secondModel, bool force_cpu, bool inclusive = true);
+  void CopyTrain(const std::string &layer, int out_size,
+                 ModelUtil &train) const;
+  void CopyTest(ModelUtil &test) const;
+  void CopyDeploy(ModelUtil &deploy, Workspace &workspace) const;
+
+  size_t Write(const std::string &path_prefix) const;
+  size_t Read(const std::string &path_prefix);
+  void SetName(const std::string &name);
+  void SetDeviceCUDA();
+  std::string Short();
+
  public:
   NetUtil init;
   NetUtil predict;
diff --git a/include/caffe2/util/net.h b/include/caffe2/util/net.h
index 27fd9b48..e1b88cf9 100644
--- a/include/caffe2/util/net.h
+++ b/include/caffe2/util/net.h
@@ -166,6 +166,9 @@ class NetUtil {
 
   void AddInput(const std::string input);
   void AddOutput(const std::string output);
+  const std::string& Input(int i) { return net.external_input(i); }
+  const std::string& Output(int i) { return net.external_output(i); }
+
   void SetName(const std::string name);
   void SetType(const std::string type);
 
@@ -193,6 +196,10 @@ class NetUtil {
   std::string Proto();
   std::string Short();
   void Print();
+  size_t Write(const std::string& path) const;
+  size_t WriteText(const std::string& path) const;
+  size_t Read(const std::string& path);
+
   void SetDeviceCUDA();
 
  public:
diff --git a/include/caffe2/util/preprocess.h b/include/caffe2/util/preprocess.h
new file mode 100644
index 00000000..af6f5782
--- /dev/null
+++ b/include/caffe2/util/preprocess.h
@@ -0,0 +1,240 @@
+#ifndef PREPROCESS_H
+#define PREPROCESS_H
+
+#include <caffe2/core/db.h>
+#include <caffe2/core/init.h>
+#include <caffe2/core/net.h>
+
+#include "caffe2/util/blob.h"
+#include "caffe2/util/model.h"
+#include "caffe2/util/net.h"
+#include "caffe2/util/tensor.h"
+#include "caffe2/util/train.h"
+
+#include <dirent.h>
+#include <sys/stat.h>
+
+namespace caffe2 {
+
+// Percentage of samples routed to each run's database (sums to 100).
+static std::map<int, int> percentage_for_run(
+    {{kRunTest, 10}, {kRunValidate, 20}, {kRunTrain, 70}});
+
+// Database key for an image: hash of the path, then "_", then the path itself.
+inline std::string filename_to_key(const std::string &filename) {
+  return std::to_string(std::hash<std::string>{}(filename)) + "_" + filename;
+}
+
+// Scans `folder` (one sub-folder per class), appending each image path and its
+// class index to `image_files`, which is then shuffled. Labels are seeded from
+// <path_prefix>classes.txt and written back to it and <path_prefix>classes.h.
+// Fails via CAFFE_ENFORCE if the folder cannot be opened or has no images.
+inline void load_labels(
+    const std::string &folder, const std::string &path_prefix,
+    std::vector<std::string> &class_labels,
+    std::vector<std::pair<std::string, int>> &image_files) {
+  auto classes_text_path = path_prefix + "classes.txt";
+  std::ifstream infile(classes_text_path);
+  std::string line;
+  while (std::getline(infile, line)) {
+    if (line.size()) {
+      class_labels.push_back(line);
+    }
+  }
+
+  auto directory = opendir(folder.c_str());
+  CAFFE_ENFORCE(directory, "no image folder " + folder);
+  if (directory) {
+    struct stat s;
+    struct dirent *entry;
+    while ((entry = readdir(directory))) {
+      auto class_name = entry->d_name;
+      auto class_path = folder + '/' + class_name;
+      // S_ISDIR/S_ISREG compare the whole file-type field; a bare `& S_IFDIR`
+      // or `& S_IFREG` may also match other types (block devices, sockets).
+      if (class_name[0] != '.' && class_name[0] != '_' &&
+          !stat(class_path.c_str(), &s) && S_ISDIR(s.st_mode)) {
+        auto subdir = opendir(class_path.c_str());
+        if (subdir) {
+          const auto it =
+              std::find(class_labels.begin(), class_labels.end(), class_name);
+          const auto class_index = static_cast<int>(it - class_labels.begin());
+          if (it == class_labels.end()) {
+            class_labels.push_back(class_name);
+          }
+          while ((entry = readdir(subdir))) {
+            auto image_file = entry->d_name;
+            auto image_path = class_path + '/' + image_file;
+            if (image_file[0] != '.' && !stat(image_path.c_str(), &s) &&
+                S_ISREG(s.st_mode)) {
+              image_files.push_back({image_path, class_index});
+            }
+          }
+          closedir(subdir);
+        }
+      }
+    }
+    closedir(directory);
+  }
+  CAFFE_ENFORCE(image_files.size(), "no images found in " + folder);
+  std::random_shuffle(image_files.begin(), image_files.end());
+
+  std::ofstream class_file(classes_text_path);
+  if (class_file.is_open()) {
+    for (auto &label : class_labels) {
+      class_file << label << std::endl;
+    }
+  }
+  auto classes_header_path = path_prefix + "classes.h";
+  std::ofstream labels_file(classes_header_path.c_str());
+  if (labels_file.is_open()) {
+    labels_file << "const char * retrain_classes[] {";
+    const char *separator = "";
+    for (auto &label : class_labels) {
+      labels_file << separator << std::endl << '"' << label << '"';
+      separator = ",";
+    }
+    labels_file << std::endl << "};" << std::endl;
+  }
+}
+
+// Runs a batch of images through `predict_net` (when non-null) and stores each
+// result with its label in a randomly chosen run database. Returns the count.
+inline int write_batch(Workspace &workspace, NetBase *predict_net,
+                       std::string &input_name, std::string &output_name,
+                       std::vector<std::pair<std::string, int>> &batch_files,
+                       std::unique_ptr<db::DB> *database, int size_to_fit) {
+  std::unique_ptr<db::Transaction> transaction[kRunNum];
+  for (int i = 0; i < kRunNum; i++) {
+    transaction[i] = database[i]->NewTransaction();
+  }
+  std::vector<std::string> filenames;
+  for (auto &pair : batch_files) {
+    filenames.push_back(pair.first);
+  }
+  std::vector<int> indices;
+  TensorCPU input;
+  TensorUtil(input).ReadImages(filenames, size_to_fit, indices);
+  TensorCPU output;
+  if (predict_net && input.size() > 0) {
+    BlobUtil(*workspace.GetBlob(input_name)).Set(input);
+    predict_net->Run();
+    auto tensor = BlobUtil(*workspace.GetBlob(output_name)).Get();
+    output.ResizeLike(tensor);
+    output.ShareData(tensor);
+  } else {
+    output.ResizeLike(input);
+    output.ShareData(input);
+  }
+
+  TensorProtos protos;
+  TensorProto *data = protos.add_protos();
+  TensorProto *label = protos.add_protos();
+  data->set_data_type(TensorProto::FLOAT);
+  label->set_data_type(TensorProto::INT32);
+  label->add_int32_data(0);
+  TensorSerializer<CPUContext> serializer;
+  std::string value;
+  std::vector<TIndex> dims(output.dims().begin() + 1, output.dims().end());
+  auto size = output.dim(0) ? output.size() / output.dim(0) : 0;
+  auto output_data = output.data<float>();
+  for (auto i : indices) {
+    auto single = TensorCPU(
+        dims, std::vector<float>(output_data, output_data + size), NULL);
+    output_data += size;
+    data->Clear();
+    serializer.Serialize(single, "", data, 0, kDefaultChunkSize);
+    label->set_int32_data(0, batch_files[i].second);
+    protos.SerializeToString(&value);
+    // Divide by RAND_MAX + 1 so p stays in [0, 99]; p == 100 matched no run.
+    int percentage = 0, p = (int)(rand() * 100.0 / (RAND_MAX + 1.0));
+    auto key = filename_to_key(batch_files[i].first);
+    for (auto pair : percentage_for_run) {
+      percentage += pair.second;
+      if (p < percentage) {
+        transaction[pair.first]->Put(key, value);
+        break;
+      }
+    }
+  }
+  for (int i = 0; i < kRunNum; i++) {
+    transaction[i]->Commit();
+  }
+  return static_cast<int>(indices.size());
+}
+
+// Pre-processes every image not yet stored in one of the run databases at
+// `db_paths`, in batches of `batch_size`, and returns the number of samples
+// written. Fails via CAFFE_ENFORCE if any run database ends up empty.
+inline int preprocess(
+    const std::vector<std::pair<std::string, int>> &image_files,
+    const std::string *db_paths, ModelUtil &model, const std::string &db_type,
+    int batch_size, int size_to_fit) {
+  std::unique_ptr<db::DB> database[kRunNum];
+  for (int i = 0; i < kRunNum; i++) {
+    database[i] = db::CreateDB(db_type, db_paths[i], db::WRITE);
+  }
+  int image_count = 0, sample_count = 0;
+  Workspace workspace;
+  auto init_net = CreateNet(model.init.net, &workspace);
+  init_net->Run();
+  auto predict_net = model.predict.net.external_input_size()
+                         ? CreateNet(model.predict.net, &workspace)
+                         : NULL;
+  auto input_name = model.predict.net.external_input_size()
+                        ? model.predict.net.external_input(0)
+                        : "";
+  auto output_name = model.predict.net.external_output_size()
+                         ? model.predict.net.external_output(0)
+                         : "";
+  std::vector<std::pair<std::string, int>> batch_files;
+  for (auto &pair : image_files) {
+    image_count++;
+    auto in_db = false;
+    auto key = filename_to_key(pair.first);
+    for (int i = 0; i < kRunNum && !in_db; i++) {
+      auto cursor = database[i]->NewCursor();
+      cursor->Seek(key);
+      in_db |= (cursor->Valid() && cursor->key() == key);
+    }
+    if (!in_db) {
+      batch_files.push_back(pair);
+    }
+    if (image_count % 10 == 0) {
+      std::cerr << '\r' << std::string(40, ' ') << '\r' << "pre-processing.. "
+                << image_count << '/' << image_files.size() << " "
+                << std::setprecision(3)
+                << ((float)100 * image_count / image_files.size()) << "%"
+                << std::flush;
+    }
+    if (batch_files.size() == static_cast<size_t>(batch_size)) {
+      sample_count += write_batch(workspace, predict_net.get(), input_name,
+                                  output_name, batch_files, database,
+                                  size_to_fit);
+      batch_files.clear();
+    }
+  }
+  if (batch_files.size() > 0) {
+    sample_count += write_batch(workspace, predict_net.get(), input_name,
+                                output_name, batch_files, database,
+                                size_to_fit);
+  }
+  for (int i = 0; i < kRunNum; i++) {
+    CAFFE_ENFORCE(database[i]->NewCursor()->Valid(),
+                  "database " + name_for_run[i] + " is empty");
+  }
+  std::cerr << '\r' << std::string(80, ' ') << '\r';
+  return sample_count;
+}
+
+inline void preprocess(
+    const std::vector<std::pair<std::string, int>> &image_files,
+    const std::string *db_paths, const std::string &db_type, int size_to_fit) {
+  NetDef n;  // empty net: no prediction is run, the raw images are stored
+  ModelUtil none(n, n);
+  preprocess(image_files, db_paths, none, db_type, 64, size_to_fit);
+}
+
+}  // namespace caffe2
+
+#endif  // PREPROCESS_H
diff --git a/include/caffe2/util/train.h b/include/caffe2/util/train.h
new file mode 100644
index 00000000..a475a836
--- /dev/null
+++ b/include/caffe2/util/train.h
@@ -0,0 +1,95 @@
+#ifndef TRAIN_H
+#define TRAIN_H
+
+#include <caffe2/core/db.h>
+#include <caffe2/core/init.h>
+#include <caffe2/core/net.h>
+
+#include "caffe2/util/blob.h"
+#include "caffe2/util/model.h"
+#include "caffe2/util/net.h"
+#include "caffe2/util/tensor.h"
+
+namespace caffe2 {
+
+// Kinds of run; kRunNum is the number of kinds and serves as a loop bound.
+enum { kRunTrain = 0, kRunValidate = 1, kRunTest = 2, kRunNum = 3 };
+// Display name of each run kind; non-const since callers index it with [].
+static std::map<int, std::string> name_for_run = {
+    {kRunTrain, "train"}, {kRunValidate, "validate"}, {kRunTest, "test"}};
+
+// Trains `train` for `epochs` iterations. About every 5 s of clock() time and
+// after the last iteration it runs `validate` once and prints loss/accuracy
+// averaged since the previous report. clock() ticks spent inside the nets are
+// added to `train_time` / `validate_time`. Inline: defined in a header.
+inline void run_trainer(int epochs, ModelUtil &train, ModelUtil &validate,
+                        Workspace &workspace, clock_t &train_time,
+                        clock_t &validate_time) {
+  CreateNet(train.init.net, &workspace)->Run();
+  CreateNet(validate.init.net, &workspace)->Run();
+  auto train_net = CreateNet(train.predict.net, &workspace);
+  auto validate_net = CreateNet(validate.predict.net, &workspace);
+  // Reads element 0 of the named float blob ("loss", "accuracy", "lr").
+  auto scalar = [&workspace](const std::string &name) {
+    return BlobUtil(*workspace.GetBlob(name)).Get().data<float>()[0];
+  };
+  auto last_time = clock();
+  auto last_i = 0;
+  auto sum_accuracy = 0.f, sum_loss = 0.f;
+  for (auto i = 1; i <= epochs; i++) {
+    train_time -= clock();
+    train_net->Run();
+    train_time += clock();
+    sum_accuracy += scalar("accuracy");
+    sum_loss += scalar("loss");
+    auto steps_time = (float)(clock() - last_time) / CLOCKS_PER_SEC;
+    if (steps_time <= 5 && i < epochs) continue;  // no report due yet
+    auto iter = BlobUtil(*workspace.GetBlob("iter")).Get().data<int64_t>()[0];
+    auto lr = scalar("lr");
+    auto steps = i - last_i;  // iterations since the previous report
+    auto train_loss = sum_loss / steps, train_accuracy = sum_accuracy / steps;
+    sum_loss = sum_accuracy = 0;
+    validate_time -= clock();
+    validate_net->Run();
+    validate_time += clock();
+    auto old_precision = std::cout.precision();  // setprecision is sticky
+    std::cout << "step: " << iter << "  rate: " << lr
+              << "  loss: " << train_loss << "  accuracy: " << train_accuracy
+              << " | " << scalar("accuracy")
+              << "  step_time: " << std::setprecision(3) << steps_time / steps
+              << "s" << std::endl;
+    std::cout.precision(old_precision);
+    last_i = i;
+    last_time = clock();
+  }
+}
+
+// Runs `test` for `epochs` iterations and prints the loss and accuracy
+// averaged over each window of 10 iterations; a trailing partial window is
+// reported too, averaged over its own length. clock() ticks spent inside the
+// net are added to `test_time`. Inline because it is defined in a header.
+inline void run_tester(int epochs, ModelUtil &test, Workspace &workspace,
+                       clock_t &test_time) {
+  CreateNet(test.init.net, &workspace)->Run();
+  auto test_net = CreateNet(test.predict.net, &workspace);
+  auto sum_accuracy = 0.f, sum_loss = 0.f;
+  const auto test_step = 10;
+  for (auto i = 1, last_i = 0; i <= epochs; i++) {
+    test_time -= clock();
+    test_net->Run();
+    test_time += clock();
+    sum_accuracy +=
+        BlobUtil(*workspace.GetBlob("accuracy")).Get().data<float>()[0];
+    sum_loss += BlobUtil(*workspace.GetBlob("loss")).Get().data<float>()[0];
+    if (i % test_step != 0 && i < epochs) continue;  // window still open
+    auto steps = i - last_i;  // test_step, or fewer for the last window
+    std::cout << "step: " << i << " loss: " << sum_loss / steps
+              << " accuracy: " << sum_accuracy / steps << std::endl;
+    sum_loss = sum_accuracy = 0;
+    last_i = i;
+  }
+}
+
+}  // namespace caffe2
+
+#endif  // TRAIN_H
diff --git a/include/caffe2/zoo/keeper.h b/include/caffe2/zoo/keeper.h
index 4bbaf890..b221f8a1 100644
--- a/include/caffe2/zoo/keeper.h
+++ b/include/caffe2/zoo/keeper.h
@@ -119,7 +119,7 @@ class Keeper {
               << '\r';
     return result == CURLE_OK;
 #else
-    std::cout << "model download not supported, install cURL" << std::endl;
+    CAFFE_THROW("model download not supported, install cURL");
     return false;
 #endif
   }
@@ -145,50 +145,50 @@ class Keeper {
     return true;
   }
 
-  void addTrainedModel(NetDef &init_model, NetDef &predict_model) {
-    auto at = name_.find("%");
-    if (at == std::string::npos) {
-      CAFFE_ENFORCE(ensureModel(), "model ", name_, " not found");
-      std::string init_filename = "res/" + name_ + "_init_net.pb";
-      std::string predict_filename = "res/" + name_ + "_predict_net.pb";
-      CAFFE_ENFORCE(ReadProtoFromFile(init_filename.c_str(), &init_model));
-      CAFFE_ENFORCE(ReadProtoFromFile(predict_filename.c_str(), &predict_model));
-    } else {
-      std::string init_filename = name_.substr(0, at) + "init" + name_.substr(at + 1);
-      std::string predict_filename = name_.substr(0, at) + "predict" + name_.substr(at + 1);
-      CAFFE_ENFORCE(ReadProtoFromFile(init_filename.c_str(), &init_model));
-      CAFFE_ENFORCE(ReadProtoFromFile(predict_filename.c_str(), &predict_model));
-    }
+  size_t addTrainedModel(ModelUtil &model) {
+    CAFFE_ENFORCE(ensureModel(), "model ", name_, " not found");
+    return model.Read("res/" + name_);
   }
 
-  void addUntrainedModel(NetDef &init_model, NetDef &predict_model) {
+  size_t addUntrainedModel(ModelUtil &model) {
     if (name_ == "alexnet") {
-      AlexNetModel(init_model, predict_model).Add();
+      AlexNetModel(model.init.net, model.predict.net).Add();
     } else if (name_ == "googlenet") {
-      GoogleNetModel(init_model, predict_model).Add();
+      GoogleNetModel(model.init.net, model.predict.net).Add();
     } else if (name_ == "squeezenet") {
-      SqueezeNetModel(init_model, predict_model).Add();
+      SqueezeNetModel(model.init.net, model.predict.net).Add();
     } else if (name_ == "vgg16") {
-      VGGModel(init_model, predict_model).Add(16);
+      VGGModel(model.init.net, model.predict.net).Add(16);
     } else if (name_ == "vgg19") {
-      VGGModel(init_model, predict_model).Add(19);
+      VGGModel(model.init.net, model.predict.net).Add(19);
     } else if (name_ == "resnet50") {
-      ResNetModel(init_model, predict_model).Add(50);
+      ResNetModel(model.init.net, model.predict.net).Add(50);
     } else if (name_ == "resnet101") {
-      ResNetModel(init_model, predict_model).Add(101);
+      ResNetModel(model.init.net, model.predict.net).Add(101);
     } else if (name_ == "resnet152") {
-      ResNetModel(init_model, predict_model).Add(152);
+      ResNetModel(model.init.net, model.predict.net).Add(152);
     } else {
       CAFFE_THROW("model " + name_ + " not implemented");
     }
+    return 0;
   }
 
-  void AddModel(NetDef &init_model, NetDef &predict_model, bool trained) {
-    if (trained) {
-      addTrainedModel(init_model, predict_model);
+  size_t AddModel(ModelUtil &model, bool trained) {
+    auto at = name_.find("%");
+    size_t size = 0;
+    if (at == std::string::npos) {
+      if (trained) {
+        size = addTrainedModel(model);
+      } else {
+        size = addUntrainedModel(model);
+      }
     } else {
-      addUntrainedModel(init_model, predict_model);
+      size +=
+          model.init.Read(name_.substr(0, at) + "init" + name_.substr(at + 1));
+      size += model.predict.Read(name_.substr(0, at) + "predict" +
+                                 name_.substr(at + 1));
     }
+    return size;
   }
 
  protected:
diff --git a/src/caffe2/binaries/diff.cc b/src/caffe2/binaries/diff.cc
index 3ca0c884..9fb86d90 100644
--- a/src/caffe2/binaries/diff.cc
+++ b/src/caffe2/binaries/diff.cc
@@ -15,19 +15,19 @@ namespace caffe2 {
 
 void run() {
   NetDef init_model, predict_model;
+  ModelUtil model(init_model, predict_model);
 
   if (FLAGS_code && !FLAGS_file) {
-    Keeper(FLAGS_model).AddModel(init_model, predict_model, false);
+    Keeper(FLAGS_model).AddModel(model, false);
   } else if (!FLAGS_code && FLAGS_file) {
-    Keeper(FLAGS_model).AddModel(init_model, predict_model, true);
-    NetUtil(init_model).SetFillToTrain();
+    Keeper(FLAGS_model).AddModel(model, true);
+    model.init.SetFillToTrain();
   } else {
     std::cerr << "set either --code or --file" << std::endl;
   }
 
   if (FLAGS_short) {
-    std::cout << NetUtil(predict_model).Short();
-    std::cout << NetUtil(init_model).Short();
+    std::cout << model.Short();
   } else {
     google::protobuf::io::OstreamOutputStream stream(&std::cout);
     google::protobuf::TextFormat::Print(predict_model, &stream);
diff --git a/src/caffe2/binaries/dream.cc b/src/caffe2/binaries/dream.cc
index 32fbf1ef..d1516ce7 100644
--- a/src/caffe2/binaries/dream.cc
+++ b/src/caffe2/binaries/dream.cc
@@ -1,16 +1,15 @@
 #include <caffe2/core/init.h>
 #include <caffe2/core/net.h>
+#include <caffe2/utils/proto_utils.h>
 #include "caffe2/util/blob.h"
 #include "caffe2/util/plot.h"
 #include "caffe2/util/tensor.h"
 #include "caffe2/util/window.h"
-#include "caffe2/utils/proto_utils.h"
 #include "caffe2/zoo/keeper.h"
 
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/imgproc/imgproc.hpp>
 
-#include "caffe2/util/misc.h"
 #include "res/imagenet_classes.h"
 
 CAFFE2_DEFINE_string(model, "", "Name of one of the pre-trained models.");
@@ -32,44 +31,41 @@ CAFFE2_DEFINE_bool(display, false, "Show image while dreaming.");
 
 namespace caffe2 {
 
-void AddNaive(NetDef &init_model, NetDef &dream_model, NetDef &display_model,
-              int size) {
-  auto &input = dream_model.external_input(0);
-  auto &output = dream_model.external_output(0);
-
-  NetUtil init(init_model), dream(dream_model), display(display_model);
+void AddNaive(ModelUtil &dream, NetUtil &display, int size) {
+  auto &input = dream.predict.Input(0);
+  auto &output = dream.predict.Output(0);
 
   // initialize input data
-  init.AddUniformFillOp({FLAGS_batch, 3, size, size}, FLAGS_initial,
-                        FLAGS_initial + 1, input);
+  dream.init.AddUniformFillOp({FLAGS_batch, 3, size, size}, FLAGS_initial,
+                              FLAGS_initial + 1, input);
 
   // add squared l2 distance to zero as loss
   if (FLAGS_channel >= 0) {
-    dream.AddSquaredL2ChannelOp(output, "loss", FLAGS_channel);
+    dream.predict.AddSquaredL2ChannelOp(output, "loss", FLAGS_channel);
   } else {
-    dream.AddSquaredL2Op(output, "loss");
+    dream.predict.AddSquaredL2Op(output, "loss");
   }
-  dream.AddConstantFillWithOp(1.f, "loss", "loss_grad");
+  dream.predict.AddConstantFillWithOp(1.f, "loss", "loss_grad");
 
   if (FLAGS_display) {
-    NetUtil(dream).AddTimePlotOp("loss");
+    dream.predict.AddTimePlotOp("loss");
   }
 
   // add back prop
-  dream.AddAllGradientOp();
+  dream.predict.AddAllGradientOp();
 
   // scale gradient
-  dream.AddMeanStdevOp(input + "_grad", "_", input + "_grad_stdev");
-  dream.AddConstantFillWithOp(0.f, input + "_grad_stdev", "zero");
-  dream.AddScaleOp(input + "_grad_stdev", input + "_grad_stdev",
-                   1 / FLAGS_learning_rate);
-  dream.AddAffineScaleOp(input + "_grad", "zero", input + "_grad_stdev",
-                         input + "_grad", true);
+  dream.predict.AddMeanStdevOp(input + "_grad", "_", input + "_grad_stdev");
+  dream.predict.AddConstantFillWithOp(0.f, input + "_grad_stdev", "zero");
+  dream.predict.AddScaleOp(input + "_grad_stdev", input + "_grad_stdev",
+                           1 / FLAGS_learning_rate);
+  dream.predict.AddAffineScaleOp(input + "_grad", "zero", input + "_grad_stdev",
+                                 input + "_grad", true);
 
   // apply gradient to input data
-  init.AddConstantFillOp({1}, 1.f, "one");
-  dream.AddInput("one");
-  dream.AddWeightedSumOp({input, "one", input + "_grad", "one"}, input);
+  dream.init.AddConstantFillOp({1}, 1.f, "one");
+  dream.predict.AddInput("one");
+  dream.predict.AddWeightedSumOp({input, "one", input + "_grad", "one"}, input);
 
   // scale data to image
   if (FLAGS_image_file.size()) {
@@ -107,11 +103,11 @@ void run() {
   std::cout << "batch: " << FLAGS_batch << std::endl;
   std::cout << "size: " << FLAGS_size << std::endl;
 
-  std::cout << "train_runs: " << FLAGS_train_runs << std::endl;
-  std::cout << "scale_runs: " << FLAGS_scale_runs << std::endl;
-  std::cout << "percent_incr: " << FLAGS_percent_incr << std::endl;
+  std::cout << "train-runs: " << FLAGS_train_runs << std::endl;
+  std::cout << "scale-runs: " << FLAGS_scale_runs << std::endl;
+  std::cout << "percent-incr: " << FLAGS_percent_incr << std::endl;
   std::cout << "initial: " << FLAGS_initial << std::endl;
-  std::cout << "learning_rate: " << FLAGS_learning_rate << std::endl;
+  std::cout << "learning-rate: " << FLAGS_learning_rate << std::endl;
   std::cout << "display: " << (FLAGS_display ? "true" : "false") << std::endl;
 
   std::cout << std::endl;
@@ -143,21 +139,23 @@ void run() {
   std::cout << "loading model.." << std::endl;
   clock_t load_time = 0;
   NetDef base_init_model, base_predict_model;
-
+  ModelUtil base(base_init_model, base_predict_model);
   // read model files
   load_time -= clock();
-  Keeper(FLAGS_model).AddModel(base_init_model, base_predict_model, true);
+  Keeper(FLAGS_model).AddModel(base, true);
   load_time += clock();
 
   // extract dream model
-  NetUtil(base_predict_model).CheckLayerAvailable(FLAGS_layer);
+  base.predict.CheckLayerAvailable(FLAGS_layer);
   NetDef init_model, dream_model, display_model, unused_model;
-  NetUtil init(init_model), dream(dream_model), display(display_model);
-  split_model(base_init_model, base_predict_model, FLAGS_layer, init_model,
-              dream_model, unused_model, unused_model, FLAGS_device != "cudnn",
-              false);
+  NetUtil display(display_model);
+  ModelUtil dream(init_model, dream_model);
+  ModelUtil unused(unused_model, unused_model);
+
+  base.Split(FLAGS_layer, dream, unused, FLAGS_device != "cudnn", false);
 
-  // add_cout_op(dream_model, { "_conv2/norm2_scale" })->set_engine("CUDNN");
+  // add_cout_op(dream.predict.net, { "_conv2/norm2_scale"
+  // })->set_engine("CUDNN");
 
   // add dream operators
   auto image_size = FLAGS_size;
@@ -167,18 +165,16 @@ void run() {
   if (image_size < 20) {
     image_size = 20;
   }
-  AddNaive(init_model, dream_model, display_model, image_size);
+  AddNaive(dream, display, image_size);
 
   // set model to use CUDA
   if (FLAGS_device != "cpu") {
-    init.SetDeviceCUDA();
     dream.SetDeviceCUDA();
     display.SetDeviceCUDA();
     // dream.SetEngineCudnnOps();
   }
 
   if (FLAGS_dump_model) {
-    std::cout << init.Short();
     std::cout << dream.Short();
     std::cout << display.Short();
   }
@@ -188,14 +184,14 @@ void run() {
   Workspace workspace;
 
   // setup workspace
-  auto init_net = CreateNet(init_model, &workspace);
-  auto predict_net = CreateNet(dream_model, &workspace);
-  auto display_net = CreateNet(display_model, &workspace);
+  auto init_net = CreateNet(dream.init.net, &workspace);
+  auto predict_net = CreateNet(dream.predict.net, &workspace);
+  auto display_net = CreateNet(display.net, &workspace);
   init_net->Run();
 
   // read image as tensor
   if (FLAGS_image_file.size()) {
-    auto &input_name = dream_model.external_input(0);
+    auto &input_name = dream.predict.Input(0);
     TensorCPU input;
     std::vector<int> x;
     TensorUtil(input).ReadImages({FLAGS_image_file}, image_size, x, 128);
diff --git a/src/caffe2/binaries/imagenet.cc b/src/caffe2/binaries/imagenet.cc
index 7df15a43..aa901b7c 100644
--- a/src/caffe2/binaries/imagenet.cc
+++ b/src/caffe2/binaries/imagenet.cc
@@ -1,8 +1,8 @@
 #include <caffe2/core/init.h>
 #include <caffe2/core/net.h>
+#include <caffe2/utils/proto_utils.h>
 #include "caffe2/util/blob.h"
 #include "caffe2/util/tensor.h"
-#include "caffe2/utils/proto_utils.h"
 #include "caffe2/zoo/keeper.h"
 
 #include "caffe2/util/cmd.h"
@@ -54,12 +54,14 @@ void run() {
     return;
   }
 
+  auto cuda = (FLAGS_device != "cpu" && cmd_setup_cuda());
+
   std::cout << "model: " << FLAGS_model << std::endl;
-  std::cout << "image_file: " << FLAGS_image_file << std::endl;
-  std::cout << "size_to_fit: " << FLAGS_size_to_fit << std::endl;
+  std::cout << "image-file: " << FLAGS_image_file << std::endl;
+  std::cout << "size-to-fit: " << FLAGS_size_to_fit << std::endl;
   std::cout << "device: " << FLAGS_device << std::endl;
-
-  if (FLAGS_device != "cpu") cmd_setup_cuda();
+  std::cout << "using cuda: " << (cuda ? "true" : "false") << std::endl;
+  ;
 
   std::cout << std::endl;
 
@@ -70,31 +72,20 @@ void run() {
   std::cout << "loading model.." << std::endl;
   clock_t load_time = 0;
   NetDef init_model, predict_model;
-  NetUtil init(init_model), predict(predict_model);
+  ModelUtil model(init_model, predict_model);
 
   // read model files
   load_time -= clock();
-  Keeper(FLAGS_model).AddModel(init_model, predict_model, true);
+  size_t model_size = Keeper(FLAGS_model).AddModel(model, true);
   load_time += clock();
 
-  // get model size
-  auto init_size = std::ifstream("res/" + FLAGS_model + "_init_net.pb",
-                                 std::ifstream::ate | std::ifstream::binary)
-                       .tellg();
-  auto predict_size = std::ifstream("res/" + FLAGS_model + "_predict_net.pb",
-                                    std::ifstream::ate | std::ifstream::binary)
-                          .tellg();
-  auto model_size = init_size + predict_size;
-
   // set model to use CUDA
   if (FLAGS_device != "cpu") {
-    init.SetDeviceCUDA();
-    predict.SetDeviceCUDA();
+    model.SetDeviceCUDA();
   }
 
   if (FLAGS_dump_model) {
-    std::cout << init.Short();
-    std::cout << predict.Short();
+    std::cout << model.Short();
   }
 
   std::cout << "running model.." << std::endl;
@@ -102,10 +93,10 @@ void run() {
   Workspace workspace;
 
   // setup workspace
-  auto &input_name = predict_model.external_input(0);
-  auto &output_name = predict_model.external_output(0);
-  auto init_net = CreateNet(init_model, &workspace);
-  auto predict_net = CreateNet(predict_model, &workspace);
+  auto &input_name = model.predict.Input(0);
+  auto &output_name = model.predict.Output(0);
+  auto init_net = CreateNet(model.init.net, &workspace);
+  auto predict_net = CreateNet(model.predict.net, &workspace);
   init_net->Run();
 
   // run predictor
diff --git a/src/caffe2/binaries/inspect.cc b/src/caffe2/binaries/inspect.cc
index 22ad1d02..41a7fa4c 100644
--- a/src/caffe2/binaries/inspect.cc
+++ b/src/caffe2/binaries/inspect.cc
@@ -1,4 +1,9 @@
-#include "caffe2/util/misc.h"
+#include <caffe2/core/db.h>
+#include <caffe2/core/init.h>
+
+#include "caffe2/util/model.h"
+#include "caffe2/util/net.h"
+#include "caffe2/util/tensor.h"
 
 CAFFE2_DEFINE_string(path, "res/mnist-test-nchw-leveldb",
                      "path of the database");
@@ -6,13 +11,39 @@ CAFFE2_DEFINE_string(db_type, "leveldb", "The database type.");
 
 namespace caffe2 {
 
+// Prints key, size, dims and label of each database record and shows it.
+void dump_database(const std::string& db_path, const std::string& db_type) {
+  std::cout << "dumping database.." << std::endl;
+  std::unique_ptr<db::DB> database = db::CreateDB(db_type, db_path, db::READ);
+  for (auto cursor = database->NewCursor(); cursor->Valid(); cursor->Next()) {
+    auto key = cursor->key().substr(0, 48);  // show at most 48 key characters
+    auto value = cursor->value();
+    TensorProtos protos;
+    if (!protos.ParseFromString(value) || protos.protos_size() < 2 ||
+        protos.protos(1).int32_data_size() < 1) {
+      std::cerr << "skipping malformed record " << key << std::endl;
+      continue;  // a record needs a tensor followed by an int32 label
+    }
+    TensorCPU tensor;
+    TensorDeserializer<CPUContext>().Deserialize(protos.protos(0), &tensor);
+    auto dims = tensor.dims();
+    dims.insert(dims.begin(), 1);  // prepend a leading dimension of 1
+    tensor.Resize(dims);
+    auto bytes = value.size();
+    std::cout << key << "  " << (bytes > 1000 ? bytes / 1000 : bytes)
+              << (bytes > 1000 ? "K" : "B") << "  (" << tensor.dims() << ")  "
+              << protos.protos(1).int32_data(0) << std::endl;
+    TensorUtil(tensor).ShowImage("inspect", 0, 1.0, 128);
+  }
+}
+
 void run() {
   std::cout << std::endl;
   std::cout << "## Database inspector ##" << std::endl;
   std::cout << std::endl;
 
   std::cout << "path: " << FLAGS_path << std::endl;
-  std::cout << "db_type: " << FLAGS_db_type << std::endl;
+  std::cout << "db-type: " << FLAGS_db_type << std::endl;
 
   dump_database(FLAGS_path, FLAGS_db_type);
 }
diff --git a/src/caffe2/binaries/mnist.cc b/src/caffe2/binaries/mnist.cc
index 7879f146..57497d99 100644
--- a/src/caffe2/binaries/mnist.cc
+++ b/src/caffe2/binaries/mnist.cc
@@ -162,11 +162,11 @@ void run() {
     return;
   }
 
-  std::cout << "train_db: " << FLAGS_train_db << std::endl;
-  std::cout << "test_db: " << FLAGS_test_db << std::endl;
-  std::cout << "train_runs: " << FLAGS_train_runs << std::endl;
-  std::cout << "test_runs: " << FLAGS_test_runs << std::endl;
-  std::cout << "force_cpu: " << (FLAGS_force_cpu ? "true" : "false")
+  std::cout << "train-db: " << FLAGS_train_db << std::endl;
+  std::cout << "test-db: " << FLAGS_test_db << std::endl;
+  std::cout << "train-runs: " << FLAGS_train_runs << std::endl;
+  std::cout << "test-runs: " << FLAGS_test_runs << std::endl;
+  std::cout << "force-cpu: " << (FLAGS_force_cpu ? "true" : "false")
             << std::endl;
   std::cout << "display: " << (FLAGS_display ? "true" : "false") << std::endl;
 
@@ -200,64 +200,62 @@ void run() {
 
   // >>> train_model = model_helper.ModelHelper(name="mnist_train",
   // arg_scope={"order": "NCHW"})
-  NetDef initTrainModel, predictTrainModel;
-  ModelUtil trainModel(initTrainModel, predictTrainModel, "mnist_train");
+  NetDef train_init_model, train_predict_model;
+  ModelUtil train(train_init_model, train_predict_model, "mnist_train");
 
   // >>> data, label = AddInput(train_model, batch_size=64,
   // db=os.path.join(data_folder, 'mnist-train-nchw-leveldb'),
   // db_type='leveldb')
-  AddInput(trainModel, 64, FLAGS_train_db, "leveldb");
+  AddInput(train, 64, FLAGS_train_db, "leveldb");
 
   // >>> softmax = AddLeNetModel(train_model, data)
-  AddLeNetModel(trainModel, false);
+  AddLeNetModel(train, false);
 
   // >>> AddTrainingOperators(train_model, softmax, label)
-  AddTrainingOperators(trainModel);
+  AddTrainingOperators(train);
 
   // >>> AddBookkeepingOperators(train_model)
-  AddBookkeepingOperators(trainModel);
+  AddBookkeepingOperators(train);
 
   // >>> test_model = model_helper.ModelHelper(name="mnist_test",
   // arg_scope=arg_scope, init_params=False)
-  NetDef initTestModel, predictTestModel;
-  ModelUtil testModel(initTestModel, predictTestModel, "mnist_test");
+  NetDef test_init_model, test_predict_model;
+  ModelUtil test(test_init_model, test_predict_model, "mnist_test");
 
   // >>> data, label = AddInput(test_model, batch_size=100,
   // db=os.path.join(data_folder, 'mnist-test-nchw-leveldb'), db_type='leveldb')
-  AddInput(testModel, 100, FLAGS_test_db, "leveldb");
+  AddInput(test, 100, FLAGS_test_db, "leveldb");
 
   // >>> softmax = AddLeNetModel(test_model, data)
-  AddLeNetModel(testModel, true);
+  AddLeNetModel(test, true);
 
   // >>> AddAccuracy(test_model, softmax, label)
-  AddAccuracy(testModel);
+  AddAccuracy(test);
 
   // >>> deploy_model = model_helper.ModelHelper(name="mnist_deploy",
   // arg_scope=arg_scope, init_params=False)
-  NetDef initDeployModel, predictDeployModel;
-  ModelUtil deployModel(initDeployModel, predictDeployModel, "mnist_model");
-  predictDeployModel.add_external_input("data");
+  NetDef deploy_init_model, deploy_predict_model;
+  ModelUtil deploy(deploy_init_model, deploy_predict_model, "mnist_model");
+  deploy.predict.AddInput("data");
 
   // >>> AddLeNetModel(deploy_model, "data")
-  AddLeNetModel(deployModel, true);
+  AddLeNetModel(deploy, true);
 
 #ifdef WITH_CUDA
   if (!FLAGS_force_cpu) {
-    initTrainModel.mutable_device_option()->set_device_type(CUDA);
-    predictTrainModel.mutable_device_option()->set_device_type(CUDA);
-    initTestModel.mutable_device_option()->set_device_type(CUDA);
-    predictTestModel.mutable_device_option()->set_device_type(CUDA);
+    train.SetDeviceCUDA();
+    test.SetDeviceCUDA();
   }
 #endif
 
   std::cout << std::endl;
 
   // >>> workspace.RunNetOnce(train_model.param_init_net)
-  auto initTrainNet = CreateNet(initTrainModel, &workspace);
+  auto initTrainNet = CreateNet(train.init.net, &workspace);
   initTrainNet->Run();
 
   // >>> workspace.CreateNet(train_model.net)
-  auto predictTrainNet = CreateNet(predictTrainModel, &workspace);
+  auto predictTrainNet = CreateNet(train.predict.net, &workspace);
 
   std::cout << "training.." << std::endl;
 
@@ -280,11 +278,11 @@ void run() {
   std::cout << std::endl;
 
   // >>> workspace.RunNetOnce(test_model.param_init_net)
-  auto initTestNet = CreateNet(initTestModel, &workspace);
+  auto initTestNet = CreateNet(test.init.net, &workspace);
   initTestNet->Run();
 
   // >>> workspace.CreateNet(test_model.net)
-  auto predictTestNet = CreateNet(predictTestModel, &workspace);
+  auto predictTestNet = CreateNet(test.predict.net, &workspace);
 
   std::cout << "testing.." << std::endl;
 
@@ -303,9 +301,9 @@ void run() {
 
   // with open(os.path.join(root_folder, "deploy_net.pbtxt"), 'w') as fid:
   // fid.write(str(deploy_model.net.Proto()))
-  for (auto &param : predictDeployModel.external_input()) {
+  for (auto &param : deploy.predict.net.external_input()) {
     auto tensor = BlobUtil(*workspace.GetBlob(param)).Get();
-    auto op = initDeployModel.add_op();
+    auto op = deploy.init.net.add_op();
     op->set_type("GivenTensorFill");
     auto arg1 = op->add_arg();
     arg1->set_name("shape");
@@ -320,9 +318,8 @@ void run() {
     }
     op->add_output(param);
   }
-  WriteProtoToTextFile(predictDeployModel, "tmp/mnist_predict_net.pbtxt");
-  WriteProtoToBinaryFile(initDeployModel, "tmp/mnist_init_net.pb");
-  WriteProtoToBinaryFile(predictDeployModel, "tmp/mnist_predict_net.pb");
+  deploy.predict.WriteText("tmp/mnist_predict_net.pbtxt");
+  deploy.Write("tmp/mnist");
 }
 
 void predict_example() {
diff --git a/src/caffe2/binaries/pretrained.cc b/src/caffe2/binaries/pretrained.cc
index 1e047b24..c7c0ac92 100644
--- a/src/caffe2/binaries/pretrained.cc
+++ b/src/caffe2/binaries/pretrained.cc
@@ -1,6 +1,6 @@
 #include <caffe2/core/init.h>
 #include <caffe2/core/predictor.h>
-#include "caffe2/utils/proto_utils.h"
+#include <caffe2/utils/proto_utils.h>
 
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/imgproc/imgproc.hpp>
@@ -42,10 +42,10 @@ void run() {
     return;
   }
 
-  std::cout << "init_net: " << FLAGS_init_net << std::endl;
-  std::cout << "predict_net: " << FLAGS_predict_net << std::endl;
-  std::cout << "image_file: " << FLAGS_image_file << std::endl;
-  std::cout << "size_to_fit: " << FLAGS_size_to_fit << std::endl;
+  std::cout << "init-net: " << FLAGS_init_net << std::endl;
+  std::cout << "predict-net: " << FLAGS_predict_net << std::endl;
+  std::cout << "image-file: " << FLAGS_image_file << std::endl;
+  std::cout << "size-to-fit: " << FLAGS_size_to_fit << std::endl;
 
   std::cout << std::endl;
 
diff --git a/src/caffe2/binaries/retrain.cc b/src/caffe2/binaries/retrain.cc
deleted file mode 100644
index 7aed57d6..00000000
--- a/src/caffe2/binaries/retrain.cc
+++ /dev/null
@@ -1,264 +0,0 @@
-#include "caffe2/util/misc.h"
-
-#include <caffe2/core/db.h>
-#include <caffe2/core/init.h>
-#include <caffe2/core/operator_gradient.h>
-#include "caffe2/utils/proto_utils.h"
-#include "caffe2/zoo/keeper.h"
-
-#include "res/imagenet_classes.h"
-
-CAFFE2_DEFINE_string(model, "", "Name of one of the pre-trained models.");
-CAFFE2_DEFINE_string(layer, "",
-                     "Name of the layer on which to split the model.");
-CAFFE2_DEFINE_string(folder, "", "Folder with subfolders with images");
-
-CAFFE2_DEFINE_string(db_type, "leveldb", "The database type.");
-CAFFE2_DEFINE_int(size_to_fit, 224, "The image file.");
-CAFFE2_DEFINE_int(train_runs, 100, "The of training runs.");
-CAFFE2_DEFINE_int(test_runs, 50, "The of training runs.");
-CAFFE2_DEFINE_int(batch_size, 64, "Training batch size.");
-CAFFE2_DEFINE_double(learning_rate, 1e-4, "Learning rate.");
-CAFFE2_DEFINE_bool(reshape_output, false,
-                   "Reshape output (necessary for squeeznet)");
-
-#include "caffe2/util/cmd.h"
-
-namespace caffe2 {
-
-void run() {
-  if (!cmd_init("Partial Retrain Example")) {
-    return;
-  }
-
-  if (!FLAGS_model.size()) {
-    std::cerr << "specify a model name using --model <name>" << std::endl;
-    for (auto const &pair : keeper_model_lookup) {
-      std::cerr << "  " << pair.first << std::endl;
-    }
-    return;
-  }
-
-  if (!FLAGS_folder.size()) {
-    std::cerr << "specify a image folder using --folder <name>" << std::endl;
-    return;
-  }
-
-  if (!FLAGS_layer.size()) {
-    std::cerr << "specify a layer layer using --layer <name>" << std::endl;
-    return;
-  }
-
-  std::cout << "model: " << FLAGS_model << std::endl;
-  std::cout << "layer: " << FLAGS_layer << std::endl;
-  std::cout << "image_dir: " << FLAGS_folder << std::endl;
-  std::cout << "db_type: " << FLAGS_db_type << std::endl;
-  std::cout << "size_to_fit: " << FLAGS_size_to_fit << std::endl;
-  std::cout << "train_runs: " << FLAGS_train_runs << std::endl;
-  std::cout << "test_runs: " << FLAGS_test_runs << std::endl;
-  std::cout << "batch_size: " << FLAGS_batch_size << std::endl;
-  std::cout << "learning_rate: " << FLAGS_learning_rate << std::endl;
-  std::cout << "reshape_output: " << FLAGS_reshape_output << std::endl;
-
-  std::string layer_safe = FLAGS_layer;
-  std::replace(layer_safe.begin(), layer_safe.end(), '/', '_');
-  std::replace(layer_safe.begin(), layer_safe.end(), '.', '_');
-
-  std::string model_safe = FLAGS_model;
-  std::replace(model_safe.begin(), model_safe.end(), '/', '_');
-  auto path_prefix =
-      FLAGS_folder + '/' + '_' + model_safe + '_' + layer_safe + '_';
-  std::string db_paths[kRunNum];
-  for (int i = 0; i < kRunNum; i++) {
-    db_paths[i] = path_prefix + name_for_run[i] + ".db";
-  }
-
-  std::cout << std::endl;
-
-  auto load_time = -clock();
-  std::vector<std::string> class_labels;
-  std::vector<std::pair<std::string, int>> image_files;
-  load_labels(FLAGS_folder, path_prefix, class_labels, image_files);
-
-  std::cout << "load model.." << std::endl;
-  NetDef full_init_model, full_predict_model;
-  NetDef init_model[kRunNum], predict_model[kRunNum];
-  for (int i = 0; i < kRunNum; i++) {
-    init_model[i].set_name(name_for_run[i] + "_init_model");
-    predict_model[i].set_name(name_for_run[i] + "_predict_model");
-  }
-  Keeper(FLAGS_model).AddModel(full_init_model, full_predict_model, true);
-
-  NetUtil(full_predict_model).CheckLayerAvailable(FLAGS_layer);
-
-  NetDef first_init_model, first_predict_model, second_init_model,
-      second_predict_model;
-  split_model(full_init_model, full_predict_model, FLAGS_layer,
-              first_init_model, first_predict_model, second_init_model,
-              second_predict_model, FLAGS_device != "cudnn");
-
-  if (FLAGS_device != "cpu") {
-    NetUtil(first_init_model).SetDeviceCUDA();
-    NetUtil(first_predict_model).SetDeviceCUDA();
-  }
-
-  pre_process(image_files, db_paths, first_init_model, first_predict_model,
-              FLAGS_db_type, FLAGS_batch_size, FLAGS_size_to_fit);
-  load_time += clock();
-
-  for (int i = 0; i < kRunNum; i++) {
-    ModelUtil(init_model[i], predict_model[i])
-        .AddDatabaseOps(name_for_run[i], FLAGS_layer, db_paths[i],
-                        FLAGS_db_type, FLAGS_batch_size);
-  }
-  copy_train_model(second_init_model, second_predict_model, FLAGS_layer,
-                   class_labels.size(), init_model[kRunTrain],
-                   predict_model[kRunTrain]);
-  copy_test_model(second_predict_model, predict_model[kRunValidate]);
-  copy_test_model(second_predict_model, predict_model[kRunTest]);
-
-  auto output = predict_model[kRunTrain].external_output(0);
-  if (FLAGS_reshape_output) {
-    auto output_reshaped = output + "_reshaped";
-    for (int i = 0; i < kRunNum; i++) {
-      NetUtil(predict_model[i]).AddReshapeOp(output, output_reshaped, {0, -1});
-    }
-    output = output_reshaped;
-  }
-
-  ModelUtil(init_model[kRunTrain], predict_model[kRunTrain])
-      .AddTrainOps(output, FLAGS_learning_rate, FLAGS_optimizer);
-  ModelUtil(second_predict_model, predict_model[kRunValidate])
-      .AddTestOps(output);
-  ModelUtil(second_predict_model, predict_model[kRunTest]).AddTestOps(output);
-
-  if (FLAGS_device != "cpu") {
-    for (int i = 0; i < kRunNum; i++) {
-      NetUtil(init_model[i]).SetDeviceCUDA();
-      NetUtil(predict_model[i]).SetDeviceCUDA();
-    }
-  }
-
-  if (FLAGS_dump_model) {
-    std::cout << NetUtil(init_model[kRunTrain]).Short();
-    std::cout << NetUtil(predict_model[kRunTrain]).Short();
-  }
-
-  std::cout << std::endl;
-
-  Workspace workspace("tmp");
-  unique_ptr<caffe2::NetBase> predict_net[kRunNum];
-  for (int i = 0; i < kRunNum; i++) {
-    auto init_net = CreateNet(init_model[i], &workspace);
-    init_net->Run();
-    predict_net[i] = CreateNet(predict_model[i], &workspace);
-  }
-
-  clock_t train_time = 0;
-  clock_t validate_time = 0;
-  clock_t test_time = 0;
-
-  auto last_time = clock();
-  auto last_i = 0;
-
-  std::cout << "training.." << std::endl;
-  for (auto i = 1; i <= FLAGS_train_runs; i++) {
-    train_time -= clock();
-    predict_net[kRunTrain]->Run();
-    train_time += clock();
-
-    auto steps_time = (float)(clock() - last_time) / CLOCKS_PER_SEC;
-    if (steps_time > 5 || i == FLAGS_train_runs) {
-      auto iter = BlobUtil(*workspace.GetBlob("iter")).Get().data<int64_t>()[0];
-      auto lr = BlobUtil(*workspace.GetBlob("lr")).Get().data<float>()[0];
-      auto train_accuracy =
-          BlobUtil(*workspace.GetBlob("accuracy")).Get().data<float>()[0];
-      auto train_loss =
-          BlobUtil(*workspace.GetBlob("loss")).Get().data<float>()[0];
-      validate_time -= clock();
-      predict_net[kRunValidate]->Run();
-      validate_time += clock();
-      auto validate_accuracy =
-          BlobUtil(*workspace.GetBlob("accuracy")).Get().data<float>()[0];
-      std::cout << "step: " << iter << "  rate: " << lr
-                << "  loss: " << train_loss << "  accuracy: " << train_accuracy
-                << " | " << validate_accuracy
-                << "  step_time: " << std::setprecision(3)
-                << steps_time / (i - last_i) << "s" << std::endl;
-      last_i = i;
-      last_time = clock();
-    }
-  }
-
-  std::cout << std::endl;
-
-  std::cout << "testing.." << std::endl;
-  for (auto i = 1; i <= FLAGS_test_runs; i++) {
-    test_time -= clock();
-    predict_net[kRunTest]->Run();
-    test_time += clock();
-
-    if (i % 10 == 0) {
-      auto accuracy =
-          BlobUtil(*workspace.GetBlob("accuracy")).Get().data<float>()[0];
-      auto loss = BlobUtil(*workspace.GetBlob("loss")).Get().data<float>()[0];
-      std::cout << "step: " << i << " loss: " << loss
-                << " accuracy: " << accuracy << std::endl;
-    }
-  }
-
-  NetDef deploy_init_model;  // the final initialization model
-  deploy_init_model.set_name("retrain_" + full_init_model.name());
-  for (const auto &op : full_init_model.op()) {
-    auto &output = op.output(0);
-    auto blob = workspace.GetBlob(output);
-    if (blob) {
-      auto tensor = BlobUtil(*blob).Get();
-      auto init_op = deploy_init_model.add_op();
-      init_op->set_type("GivenTensorFill");
-      auto arg1 = init_op->add_arg();
-      arg1->set_name("shape");
-      for (auto dim : tensor.dims()) {
-        arg1->add_ints(dim);
-      }
-      auto arg2 = init_op->add_arg();
-      arg2->set_name("values");
-      const auto &data = tensor.data<float>();
-      for (auto i = 0; i < tensor.size(); ++i) {
-        arg2->add_floats(data[i]);
-      }
-      init_op->add_output(output);
-    } else {
-      deploy_init_model.add_op()->CopyFrom(op);
-    }
-  }
-
-  WriteProtoToBinaryFile(deploy_init_model, path_prefix + "init_net.pb");
-  WriteProtoToBinaryFile(full_predict_model, path_prefix + "predict_net.pb");
-  auto init_size = std::ifstream(path_prefix + "init_net.pb",
-                                 std::ifstream::ate | std::ifstream::binary)
-                       .tellg();
-  auto predict_size = std::ifstream(path_prefix + "predict_net.pb",
-                                    std::ifstream::ate | std::ifstream::binary)
-                          .tellg();
-  auto model_size = init_size + predict_size;
-
-  std::cout << std::endl;
-
-  std::cout << std::setprecision(3)
-            << "load: " << ((float)load_time / CLOCKS_PER_SEC)
-            << "s  train: " << ((float)train_time / CLOCKS_PER_SEC)
-            << "s  validate: " << ((float)validate_time / CLOCKS_PER_SEC)
-            << "s  test: " << ((float)test_time / CLOCKS_PER_SEC)
-            << "s  model: " << ((float)model_size / 1000000) << "MB"
-            << std::endl;
-}
-
-}  // namespace caffe2
-
-int main(int argc, char **argv) {
-  caffe2::GlobalInit(&argc, &argv);
-  caffe2::run();
-  google::protobuf::ShutdownProtobufLibrary();
-  return 0;
-}
diff --git a/src/caffe2/binaries/rnn.cc b/src/caffe2/binaries/rnn.cc
index 6589da17..b48cdd81 100644
--- a/src/caffe2/binaries/rnn.cc
+++ b/src/caffe2/binaries/rnn.cc
@@ -1,5 +1,6 @@
 #include <caffe2/core/init.h>
 #include "caffe2/util/blob.h"
+#include "caffe2/util/model.h"
 #include "caffe2/util/net.h"
 
 #include "caffe2/util/cmd.h"
@@ -18,56 +19,58 @@ CAFFE2_DEFINE_int(gen_length, 500, "One forward example sequence length");
 
 namespace caffe2 {
 
-void AddFC(NetUtil &init, NetUtil &predict, const std::string &input,
+void AddFC(ModelUtil &model, const std::string &input,
            const std::string &output, int in_size, int out_size) {
-  init.AddXavierFillOp({out_size, in_size}, output + "_w");
-  predict.AddInput(output + "_w");
-  init.AddConstantFillOp({out_size}, output + "_b");
-  predict.AddInput(output + "_b");
-  predict.AddFcOp(input, output + "_w", output + "_b", output, 2)
+  model.init.AddXavierFillOp({out_size, in_size}, output + "_w");
+  model.predict.AddInput(output + "_w");
+  model.init.AddConstantFillOp({out_size}, output + "_b");
+  model.predict.AddInput(output + "_b");
+  model.predict.AddFcOp(input, output + "_w", output + "_b", output, 2)
       ->set_engine("CUDNN");
 }
 
-void AddLSTM(NetUtil &init, NetUtil &predict, const std::string &input_blob,
+void AddLSTM(ModelUtil &model, const std::string &input_blob,
              const std::string &seq_lengths, const std::string &hidden_init,
              const std::string &cell_init, int vocab_size, int hidden_size,
              const std::string &scope, std::string *hidden_output,
              std::string *cell_state) {
   *hidden_output = scope + "/hidden_t_last";
   *cell_state = scope + "/cell_t_last";
-  AddFC(init, predict, input_blob, scope + "/i2h", vocab_size, 4 * hidden_size);
+  AddFC(model, input_blob, scope + "/i2h", vocab_size, 4 * hidden_size);
   // sight hack
-  init.AddXavierFillOp({4 * hidden_size, hidden_size}, scope + "/gates_t_w");
-  predict.AddInput(scope + "/gates_t_w");
-  init.AddConstantFillOp({4 * hidden_size}, scope + "/gates_t_b");
-  predict.AddInput(scope + "/gates_t_b");
-  predict.AddRecurrentNetworkOp(seq_lengths, hidden_init, cell_init, scope,
-                                *hidden_output, *cell_state,
-                                FLAGS_device == "cpu");
+  model.init.AddXavierFillOp({4 * hidden_size, hidden_size},
+                             scope + "/gates_t_w");
+  model.predict.AddInput(scope + "/gates_t_w");
+  model.init.AddConstantFillOp({4 * hidden_size}, scope + "/gates_t_b");
+  model.predict.AddInput(scope + "/gates_t_b");
+  model.predict.AddRecurrentNetworkOp(seq_lengths, hidden_init, cell_init,
+                                      scope, *hidden_output, *cell_state,
+                                      FLAGS_device == "cpu");
 }
 
-void AddSGD(NetUtil &init, NetUtil &predict, float base_learning_rate,
+void AddSGD(ModelUtil &model, float base_learning_rate,
             const std::string &policy, int stepsize, float gamma) {
-  predict.AddAtomicIterOp("iteration_mutex", "optimizer_iteration")
+  model.predict.AddAtomicIterOp("iteration_mutex", "optimizer_iteration")
       ->mutable_device_option()
       ->set_device_type(CPU);
-  init.AddConstantFillOp({1}, (int64_t)0, "optimizer_iteration")
+  model.init.AddConstantFillOp({1}, (int64_t)0, "optimizer_iteration")
       ->mutable_device_option()
       ->set_device_type(CPU);
-  init.AddCreateMutexOp("iteration_mutex")
+  model.init.AddCreateMutexOp("iteration_mutex")
       ->mutable_device_option()
       ->set_device_type(CPU);
-  predict.AddInput("iteration_mutex");
-  predict.AddInput("optimizer_iteration");
-  init.AddConstantFillOp({1}, 1.f, "ONE");
-  predict.AddInput("ONE");
-  predict.AddLearningRateOp("optimizer_iteration", "lr", base_learning_rate,
-                            gamma);
+  model.predict.AddInput("iteration_mutex");
+  model.predict.AddInput("optimizer_iteration");
+  model.init.AddConstantFillOp({1}, 1.f, "ONE");
+  model.predict.AddInput("ONE");
+  model.predict.AddLearningRateOp("optimizer_iteration", "lr",
+                                  base_learning_rate, gamma);
   std::vector<std::string> params({"LSTM/gates_t_w", "LSTM/i2h_b",
                                    "char_rnn_blob_0_w", "char_rnn_blob_0_b",
                                    "LSTM/gates_t_b", "LSTM/i2h_w"});
   for (auto &param : params) {
-    predict.AddWeightedSumOp({param, "ONE", param + "_grad", "lr"}, param);
+    model.predict.AddWeightedSumOp({param, "ONE", param + "_grad", "lr"},
+                                   param);
   }
 }
 
@@ -85,21 +88,23 @@ void run() {
     return;
   }
 
+  auto cuda = (FLAGS_device != "cpu" && cmd_setup_cuda());
+
   std::cout << "model: " << FLAGS_model << std::endl;
-  std::cout << "train_data: " << FLAGS_train_data << std::endl;
-  std::cout << "train_runs: " << FLAGS_train_runs << std::endl;
-  std::cout << "seq_length: " << FLAGS_seq_length << std::endl;
-  std::cout << "batch_size: " << FLAGS_batch_size << std::endl;
-  std::cout << "iters_to_report: " << FLAGS_iters_to_report << std::endl;
-  std::cout << "hidden_size: " << FLAGS_hidden_size << std::endl;
-  std::cout << "gen_length: " << FLAGS_gen_length << std::endl;
+  std::cout << "train-data: " << FLAGS_train_data << std::endl;
+  std::cout << "train-runs: " << FLAGS_train_runs << std::endl;
+  std::cout << "seq-length: " << FLAGS_seq_length << std::endl;
+  std::cout << "batch-size: " << FLAGS_batch_size << std::endl;
+  std::cout << "iters-to-report: " << FLAGS_iters_to_report << std::endl;
+  std::cout << "hidden-size: " << FLAGS_hidden_size << std::endl;
+  std::cout << "gen-length: " << FLAGS_gen_length << std::endl;
 
   std::cout << "device: " << FLAGS_device << std::endl;
-  std::cout << "dump_model: " << (FLAGS_dump_model ? "true" : "false")
+  // True only when --device is not "cpu" and cmd_setup_cuda() returned true.
+  std::cout << "using cuda: " << (cuda ? "true" : "false") << std::endl;
+  std::cout << "dump-model: " << (FLAGS_dump_model ? "true" : "false")
             << std::endl;
 
-  if (FLAGS_device != "cpu") cmd_setup_cuda();
-
   std::cout << std::endl;
 
   // >>> with open(args.train_data) as f: self.text = f.read()
@@ -140,44 +145,40 @@ void run() {
   std::cout << "Start training" << std::endl;
 
   // >>> model = model_helper.ModelHelper(name="char_rnn")
-  NetDef initModel, forwardModel;
-  NetUtil init(initModel), forward(forwardModel);
-  init.SetName("char_rnn_init");
-  forward.SetName("char_rnn");
+  NetDef init_model, predict_model;
+  ModelUtil model(init_model, predict_model, "char_rnn");
 
   // >>> input_blob, seq_lengths, hidden_init, cell_init, target =
   // model.net.AddExternalInputs('input_blob', 'seq_lengths', 'hidden_init',
   // 'cell_init', 'target')
-  forward.AddInput("input_blob");
-  forward.AddInput("seq_lengths");
-  forward.AddInput("hidden_init");
-  forward.AddInput("cell_init");
-  forward.AddInput("target");
+  model.predict.AddInput("input_blob");
+  model.predict.AddInput("seq_lengths");
+  model.predict.AddInput("hidden_init");
+  model.predict.AddInput("cell_init");
+  model.predict.AddInput("target");
 
   // >>> hidden_output_all, self.hidden_output, _, self.cell_state = LSTM(model,
   // input_blob, seq_lengths, (hidden_init, cell_init), self.D,
   // self.hidden_size, scope="LSTM")
   std::string hidden_output;
   std::string cell_state;
-  AddLSTM(init, forward, "input_blob", "seq_lengths", "hidden_init",
-          "cell_init", D, FLAGS_hidden_size, "LSTM", &hidden_output,
-          &cell_state);
+  AddLSTM(model, "input_blob", "seq_lengths", "hidden_init", "cell_init", D,
+          FLAGS_hidden_size, "LSTM", &hidden_output, &cell_state);
 
   // >>> output = brew.fc(model, hidden_output_all, None,
   // dim_in=self.hidden_size, dim_out=self.D, axis=2)
-  AddFC(init, forward, "LSTM/hidden_t_all", "char_rnn_blob_0",
-        FLAGS_hidden_size, D);
+  AddFC(model, "LSTM/hidden_t_all", "char_rnn_blob_0", FLAGS_hidden_size, D);
 
   // >>> softmax = model.net.Softmax(output, 'softmax', axis=2)
-  forward.AddSoftmaxOp("char_rnn_blob_0", "softmax", 2);
+  model.predict.AddSoftmaxOp("char_rnn_blob_0", "softmax", 2);
 
   // >>> softmax_reshaped, _ = model.net.Reshape(softmax, ['softmax_reshaped',
   // '_'], shape=[-1, self.D])
-  forward.AddReshapeOp("softmax", "softmax_reshaped", {-1, D});
+  model.predict.AddReshapeOp("softmax", "softmax_reshaped", {-1, D});
 
   // >>> self.forward_net = core.Net(model.net.Proto())
-  NetDef trainModel(forwardModel);
-  NetUtil train(trainModel);
+  NetDef train_model(model.predict.net);
+  NetUtil train(train_model);
 
   // >>> xent = model.net.LabelCrossEntropy([softmax_reshaped, target], 'xent')
   train.AddLabelCrossEntropyOp("softmax_reshaped", "target", "xent");
@@ -191,7 +192,8 @@ void run() {
 
   // >>> build_sgd(model, base_learning_rate=0.1 * self.seq_length,
   // policy="step", stepsize=1, gamma=0.9999)
-  AddSGD(init, train, 0.1 * FLAGS_seq_length, "step", 1, 0.9999);
+  ModelUtil sgd_model(model.init, train);
+  AddSGD(sgd_model, 0.1 * FLAGS_seq_length, "step", 1, 0.9999);
 
   // >>> self.model = model
   // >>> self.predictions = softmax
@@ -202,22 +204,21 @@ void run() {
   // >>> self.prepare_state = core.Net("prepare_state")
   // >>> self.prepare_state.Copy(self.hidden_output, hidden_init)
   // >>> self.prepare_state.Copy(self.cell_state, cell_init)
-  NetDef prepareModel;
-  NetUtil prepare(prepareModel);
+  NetDef prepare_model;
+  NetUtil prepare(prepare_model);
   prepare.AddCopyOp(hidden_output, "hidden_init");
   prepare.AddCopyOp(cell_state, "cell_init");
   prepare.AddInput(hidden_output);
   prepare.AddInput(cell_state);
 
   if (FLAGS_device != "cpu") {
-    init.SetDeviceCUDA();
-    forward.SetDeviceCUDA();
+    model.SetDeviceCUDA();
     train.SetDeviceCUDA();
     prepare.SetDeviceCUDA();
   }
 
   if (FLAGS_dump_model) {
-    std::cout << init.Short();
+    std::cout << model.init.Short();
     std::cout << train.Short();
     std::cout << prepare.Short();
   }
@@ -229,7 +230,7 @@ void run() {
   std::cout << "Train model" << std::endl;
 
   // >>> workspace.RunNetOnce(self.model.param_init_net)
-  auto initNet = CreateNet(initModel, &workspace);
+  auto initNet = CreateNet(model.init.net, &workspace);
   initNet->Run();
 
   // >>> smooth_loss = -np.log(1.0 / self.D) * self.seq_length
@@ -278,7 +279,7 @@ void run() {
     BlobUtil(*workspace.CreateBlob(cell_state)).Set(value, true);
   }
   // >>> workspace.CreateNet(self.prepare_state)
-  auto prepareNet = CreateNet(prepareModel, &workspace);
+  auto prepareNet = CreateNet(prepare.net, &workspace);
 
   // >>> last_time = datetime.now()
   auto last_time = clock();
@@ -289,10 +290,10 @@ void run() {
   workspace.CreateBlob("input_blob");
   workspace.CreateBlob("seq_lengths");
   workspace.CreateBlob("target");
-  auto trainNet = CreateNet(trainModel, &workspace);
+  auto trainNet = CreateNet(train.net, &workspace);
 
   // >>> CreateNetOnce(self.forward_net)
-  auto forwardNet = CreateNet(forwardModel, &workspace);
+  auto forwardNet = CreateNet(model.predict.net, &workspace);
 
   // >>> while True:
   while (num_iter < FLAGS_train_runs) {
diff --git a/src/caffe2/binaries/train.cc b/src/caffe2/binaries/train.cc
index 83daddf4..b9c23b35 100644
--- a/src/caffe2/binaries/train.cc
+++ b/src/caffe2/binaries/train.cc
@@ -1,16 +1,18 @@
-#include "caffe2/util/misc.h"
-
+#include "caffe2/util/train.h"
 #include <caffe2/core/db.h>
 #include <caffe2/core/init.h>
 #include <caffe2/core/operator_gradient.h>
+#include <caffe2/utils/proto_utils.h>
 #include "caffe2/util/plot.h"
+#include "caffe2/util/preprocess.h"
 #include "caffe2/util/window.h"
-#include "caffe2/utils/proto_utils.h"
 #include "caffe2/zoo/keeper.h"
 
 #include "res/imagenet_classes.h"
 
 CAFFE2_DEFINE_string(model, "", "Name of one of the pre-trained models.");
+CAFFE2_DEFINE_string(layer, "",
+                     "Name of the layer on which to split the model.");
 CAFFE2_DEFINE_string(folder, "", "Folder with subfolders with images");
 
 CAFFE2_DEFINE_string(db_type, "leveldb", "The database type.");
@@ -31,7 +33,7 @@ CAFFE2_DEFINE_bool(reshape_output, false,
 namespace caffe2 {
 
 void run() {
-  if (!cmd_init("Full Train Example")) {
+  if (!cmd_init("CNN Training Example")) {
     return;
   }
 
@@ -49,22 +51,30 @@ void run() {
   }
 
   std::cout << "model: " << FLAGS_model << std::endl;
-  std::cout << "image_dir: " << FLAGS_folder << std::endl;
-  std::cout << "db_type: " << FLAGS_db_type << std::endl;
-  std::cout << "size_to_fit: " << FLAGS_size_to_fit << std::endl;
-  std::cout << "train_runs: " << FLAGS_train_runs << std::endl;
-  std::cout << "test_runs: " << FLAGS_test_runs << std::endl;
-  std::cout << "batch_size: " << FLAGS_batch_size << std::endl;
-  std::cout << "learning_rate: " << FLAGS_learning_rate << std::endl;
-  std::cout << "zero_one: " << (FLAGS_zero_one ? "true" : "false") << std::endl;
+  std::cout << "layer: " << FLAGS_layer << std::endl;
+  std::cout << "image-dir: " << FLAGS_folder << std::endl;
+  std::cout << "db-type: " << FLAGS_db_type << std::endl;
+  std::cout << "size-to-fit: " << FLAGS_size_to_fit << std::endl;
+  std::cout << "train-runs: " << FLAGS_train_runs << std::endl;
+  std::cout << "test-runs: " << FLAGS_test_runs << std::endl;
+  std::cout << "batch-size: " << FLAGS_batch_size << std::endl;
+  std::cout << "learning-rate: " << FLAGS_learning_rate << std::endl;
+  std::cout << "zero-one: " << (FLAGS_zero_one ? "true" : "false") << std::endl;
   std::cout << "display: " << (FLAGS_display ? "true" : "false") << std::endl;
-  std::cout << "reshape_output: " << FLAGS_reshape_output << std::endl;
+  std::cout << "reshape-output: " << (FLAGS_reshape_output ? "true" : "false")
+            << std::endl;
 
-  auto path_prefix = FLAGS_folder + '/' + '_';
-  std::string db_paths[kRunNum];
-  for (int i = 0; i < kRunNum; i++) {
-    db_paths[i] = path_prefix + name_for_run[i] + ".db";
+  // Prefix for generated files: <folder>/_<model>_[<layer>_]. Any '/' in the
+  // model name, and any '/' or '.' in the layer name, is replaced by '_'.
+  // The layer part is only present when --layer was given.
+  const bool has_split = !FLAGS_layer.empty();
+  std::string model_safe = FLAGS_model;
+  std::replace(model_safe.begin(), model_safe.end(), '/', '_');
+  std::string layer_prefix = has_split ? FLAGS_layer + '_' : std::string();
+  for (auto &c : layer_prefix) {
+    if (c == '/' || c == '.') c = '_';
+  }
+  auto path_prefix = FLAGS_folder + '/' + '_' + model_safe + '_' + layer_prefix;
 
   if (FLAGS_display) {
     superWindow("Full Train Example");
@@ -80,200 +90,133 @@ void run() {
     resizeWindow("loss", 500, 300);
   }
 
-  std::cout << std::endl;
+  std::string db_paths[kRunNum];
+  for (int i = 0; i < kRunNum; i++) {
+    db_paths[i] = path_prefix + name_for_run[i] + ".db";
+  }
 
-  auto load_time = -clock();
-  std::vector<std::string> class_labels;
-  std::vector<std::pair<std::string, int>> image_files;
-  load_labels(FLAGS_folder, path_prefix, class_labels, image_files);
+  std::cout << std::endl;
 
   std::cout << "load model.." << std::endl;
   NetDef full_init_model, full_predict_model;
-  Keeper(FLAGS_model).AddModel(full_init_model, full_predict_model, false);
+  ModelUtil full(full_init_model, full_predict_model);
+  Keeper(FLAGS_model).AddModel(full, has_split);
 
   if (FLAGS_device == "cudnn") {
-    NetUtil(full_init_model).SetEngineOps("CUDNN");
-    NetUtil(full_predict_model).SetEngineOps("CUDNN");
-  }
-
-  if (FLAGS_dump_model) {
-    std::cout << NetUtil(full_init_model).Short();
-    std::cout << NetUtil(full_predict_model).Short();
+    full.init.SetEngineOps("CUDNN");
+    full.predict.SetEngineOps("CUDNN");
+  }
+
+  NetDef init_model[kRunNum], predict_model[kRunNum];
+  ModelUtil models[kRunNum] = {
+      {init_model[kRunTrain], predict_model[kRunTrain],
+       name_for_run[kRunTrain]},
+      {init_model[kRunTest], predict_model[kRunTest], name_for_run[kRunTest]},
+      {init_model[kRunValidate], predict_model[kRunValidate],
+       name_for_run[kRunValidate]},
+  };
+
+  NetDef first_init_model, first_predict_model;
+  ModelUtil first(first_init_model, first_predict_model);
+  NetDef second_init_model, second_predict_model;
+  ModelUtil second(second_init_model, second_predict_model);
+
+  if (has_split) {
+    full.predict.CheckLayerAvailable(FLAGS_layer);
+    std::cout << "split model.. (at " << FLAGS_layer << ")" << std::endl;
+    full.Split(FLAGS_layer, first, second, FLAGS_device != "cudnn");
+    if (FLAGS_device != "cpu") {
+      first.SetDeviceCUDA();
+    }
+  } else {
+    second.init.net = full.init.net;
+    second.predict.net = full.predict.net;
   }
 
-  NetDef init_model[kRunNum];
-  NetDef predict_model[kRunNum];
-  for (int i = 0; i < kRunNum; i++) {
-    init_model[i].set_name(name_for_run[i] + "_init_model");
-    predict_model[i].set_name(name_for_run[i] + "_predict_model");
-  }
+  std::cout << "collect images.." << std::endl;
+  auto load_time = -clock();
+  std::vector<std::string> class_labels;
+  std::vector<std::pair<std::string, int>> image_files;
+  load_labels(FLAGS_folder, path_prefix, class_labels, image_files);
+  std::cout << class_labels.size() << " labels found" << std::endl;
+  std::cout << image_files.size() << " images found" << std::endl;
 
-  pre_process(image_files, db_paths, FLAGS_db_type, FLAGS_size_to_fit);
+  std::cout << "cache images.." << std::endl;
+  auto count = preprocess(image_files, db_paths, first, FLAGS_db_type,
+                          FLAGS_batch_size, FLAGS_size_to_fit);
+  std::cout << count << " images processed" << std::endl;
   load_time += clock();
 
+  auto model_in = has_split ? FLAGS_layer : full.predict.Input(0);
   for (int i = 0; i < kRunNum; i++) {
-    ModelUtil(init_model[i], predict_model[i])
-        .AddDatabaseOps(name_for_run[i], full_predict_model.external_input(0),
-                        db_paths[i], FLAGS_db_type, FLAGS_batch_size);
+    models[i].AddDatabaseOps(name_for_run[i], model_in, db_paths[i],
+                             FLAGS_db_type, FLAGS_batch_size);
   }
-  copy_train_model(full_init_model, full_predict_model,
-                   full_predict_model.external_input(0), class_labels.size(),
-                   init_model[kRunTrain], predict_model[kRunTrain]);
-  copy_test_model(full_predict_model, predict_model[kRunValidate]);
-  copy_test_model(full_predict_model, predict_model[kRunTest]);
+  second.CopyTrain(model_in, class_labels.size(), models[kRunTrain]);
+  second.CopyTest(models[kRunValidate]);
+  second.CopyTest(models[kRunTest]);
 
-  auto output = predict_model[kRunTrain].external_output(0);
+  auto output = models[kRunTrain].predict.Output(0);
   if (FLAGS_reshape_output) {
     auto output_reshaped = output + "_reshaped";
     for (int i = 0; i < kRunNum; i++) {
-      NetUtil(predict_model[i]).AddReshapeOp(output, output_reshaped, {0, -1});
+      models[i].predict.AddReshapeOp(output, output_reshaped, {0, -1});
     }
     output = output_reshaped;
   }
 
-  ModelUtil(init_model[kRunTrain], predict_model[kRunTrain])
-      .AddTrainOps(output, FLAGS_learning_rate, FLAGS_optimizer);
-  ModelUtil(full_predict_model, predict_model[kRunValidate]).AddTestOps(output);
-  ModelUtil(full_predict_model, predict_model[kRunTest]).AddTestOps(output);
+  models[kRunTrain].AddTrainOps(output, FLAGS_learning_rate, FLAGS_optimizer);
+  ModelUtil(second.predict, models[kRunValidate].predict).AddTestOps(output);
+  ModelUtil(second.predict, models[kRunTest].predict).AddTestOps(output);
 
   if (FLAGS_zero_one) {
-    NetUtil(predict_model[kRunValidate])
-        .AddZeroOneOp(output, "label");
-  }
-  if (FLAGS_display) {
-    NetUtil(predict_model[kRunValidate])
-        .AddShowWorstOp(output, "label",
-                        full_predict_model.external_input(0));
+    models[kRunValidate].predict.AddZeroOneOp(output, "label");
   }
 
   if (FLAGS_display) {
-    NetUtil(predict_model[kRunTrain])
-        .AddTimePlotOp("accuracy", "iter", "accuracy", "train", 10);
-    NetUtil(predict_model[kRunValidate])
-        .AddTimePlotOp("accuracy", "iter", "accuracy", "test");
-    NetUtil(predict_model[kRunTrain])
-        .AddTimePlotOp("loss", "iter", "loss", "train", 10);
-    NetUtil(predict_model[kRunValidate])
-        .AddTimePlotOp("loss", "iter", "loss", "test");
+    models[kRunValidate].predict.AddShowWorstOp(output, "label",
+                                                second.predict.Input(0));
+    models[kRunTrain].predict.AddTimePlotOp("accuracy", "iter", "accuracy",
+                                            "train", 10);
+    models[kRunValidate].predict.AddTimePlotOp("accuracy", "iter", "accuracy",
+                                               "test");
+    models[kRunTrain].predict.AddTimePlotOp("loss", "iter", "loss", "train",
+                                            10);
+    models[kRunValidate].predict.AddTimePlotOp("loss", "iter", "loss", "test");
   }
 
   if (FLAGS_device != "cpu") {
     for (int i = 0; i < kRunNum; i++) {
-      NetUtil(init_model[i]).SetDeviceCUDA();
-      NetUtil(predict_model[i]).SetDeviceCUDA();
+      models[i].SetDeviceCUDA();
     }
   }
 
+  if (FLAGS_dump_model) {
+    std::cout << models[kRunTrain].Short();
+  }
+
   std::cout << std::endl;
 
   Workspace workspace("tmp");
-  unique_ptr<caffe2::NetBase> predict_net[kRunNum];
-  for (int i = 0; i < kRunNum; i++) {
-    auto init_net = CreateNet(init_model[i], &workspace);
-    init_net->Run();
-    predict_net[i] = CreateNet(predict_model[i], &workspace);
-  }
 
   clock_t train_time = 0;
   clock_t validate_time = 0;
   clock_t test_time = 0;
 
-  auto last_time = clock();
-  auto last_i = 0;
-  auto sum_accuracy = 0.f, sum_loss = 0.f;
-
   std::cout << "training.." << std::endl;
-  for (auto i = 1; i <= FLAGS_train_runs; i++) {
-    train_time -= clock();
-    predict_net[kRunTrain]->Run();
-    train_time += clock();
-
-    sum_accuracy +=
-        BlobUtil(*workspace.GetBlob("accuracy")).Get().data<float>()[0];
-    sum_loss += BlobUtil(*workspace.GetBlob("loss")).Get().data<float>()[0];
-
-    auto steps_time = (float)(clock() - last_time) / CLOCKS_PER_SEC;
-    if (steps_time > 5 || i == FLAGS_train_runs) {
-      auto iter = BlobUtil(*workspace.GetBlob("iter")).Get().data<int64_t>()[0];
-      auto lr = BlobUtil(*workspace.GetBlob("lr")).Get().data<float>()[0];
-      auto train_loss = sum_loss / (i - last_i),
-           train_accuracy = sum_accuracy / (i - last_i);
-      sum_loss = 0;
-      sum_accuracy = 0;
-      validate_time -= clock();
-      predict_net[kRunValidate]->Run();
-      validate_time += clock();
-      auto validate_accuracy =
-          BlobUtil(*workspace.GetBlob("accuracy")).Get().data<float>()[0];
-      std::cout << "step: " << iter << "  rate: " << lr
-                << "  loss: " << train_loss << "  accuracy: " << train_accuracy
-                << " | " << validate_accuracy
-                << "  step_time: " << std::setprecision(3)
-                << steps_time / (i - last_i) << "s" << std::endl;
-      last_i = i;
-      last_time = clock();
-    }
-  }
+  run_trainer(FLAGS_train_runs, models[kRunTrain], models[kRunValidate],
+              workspace, train_time, validate_time);
 
   std::cout << std::endl;
-
   std::cout << "testing.." << std::endl;
-  auto test_step = 10;
-  for (auto i = 1; i <= FLAGS_test_runs; i++) {
-    test_time -= clock();
-    predict_net[kRunTest]->Run();
-    test_time += clock();
-
-    sum_accuracy +=
-        BlobUtil(*workspace.GetBlob("accuracy")).Get().data<float>()[0];
-    sum_loss += BlobUtil(*workspace.GetBlob("loss")).Get().data<float>()[0];
-
-    if (i % test_step == 0) {
-      auto loss = sum_loss / test_step, accuracy = sum_accuracy / test_step;
-      sum_loss = 0;
-      sum_accuracy = 0;
-      std::cout << "step: " << i << " loss: " << loss
-                << " accuracy: " << accuracy << std::endl;
-    }
-  }
+  run_tester(FLAGS_test_runs, models[kRunTest], workspace, test_time);
 
   NetDef deploy_init_model;  // the final initialization model
-  deploy_init_model.set_name("train_" + full_init_model.name());
-  for (const auto &op : full_init_model.op()) {
-    auto &output = op.output(0);
-    auto blob = workspace.GetBlob(output);
-    if (blob) {
-      auto tensor = BlobUtil(*blob).Get();
-      auto init_op = deploy_init_model.add_op();
-      init_op->set_type("GivenTensorFill");
-      auto arg1 = init_op->add_arg();
-      arg1->set_name("shape");
-      for (auto dim : tensor.dims()) {
-        arg1->add_ints(dim);
-      }
-      auto arg2 = init_op->add_arg();
-      arg2->set_name("values");
-      const auto &data = tensor.data<float>();
-      for (auto i = 0; i < tensor.size(); ++i) {
-        arg2->add_floats(data[i]);
-      }
-      init_op->add_output(output);
-    } else {
-      deploy_init_model.add_op()->CopyFrom(op);
-    }
-  }
+  ModelUtil deploy(deploy_init_model, full.predict.net,
+                   "train_" + full.init.net.name());
+  full.CopyDeploy(deploy, workspace);
 
-  auto init_path = path_prefix + FLAGS_model + "_init_net.pb";
-  auto predict_path = path_prefix + FLAGS_model + "_predict_net.pb";
-  WriteProtoToBinaryFile(deploy_init_model, init_path);
-  WriteProtoToBinaryFile(full_predict_model, predict_path);
-  auto init_size =
-      std::ifstream(init_path, std::ifstream::ate | std::ifstream::binary)
-          .tellg();
-  auto predict_size =
-      std::ifstream(predict_path, std::ifstream::ate | std::ifstream::binary)
-          .tellg();
-  auto model_size = init_size + predict_size;
+  size_t model_size = deploy.Write(path_prefix);
 
   std::cout << std::endl;
 
diff --git a/src/caffe2/util/model.cc b/src/caffe2/util/model.cc
index 096500d1..ac3b42a7 100644
--- a/src/caffe2/util/model.cc
+++ b/src/caffe2/util/model.cc
@@ -1,4 +1,5 @@
 #include "caffe2/util/model.h"
+#include "caffe2/util/blob.h"
 
 namespace caffe2 {
 
@@ -6,6 +7,11 @@ const std::string gradient_suffix("_grad");
 const std::string moment_suffix("_moment");
 const std::string meansq_suffix("_meansq");
 const std::string reader_suffix("_reader");
+const std::string init_net_suffix("_init_net.pb");
+const std::string predict_net_suffix("_predict_net.pb");
+const std::string init_name_suffix("_init");
+const std::string predict_name_suffix("_predict");
+
 const std::string iter_name("iter");
 const std::string lr_name("lr");
 const std::string one_name("one");
@@ -14,11 +20,6 @@ const std::string label_name("label");
 const std::string xent_name("xent");
 const std::string accuracy_name("accuracy");
 
-void ModelUtil::SetName(const std::string &name) {
-  init.SetName(name + "_init");
-  predict.SetName(name + "_predict");
-}
-
 void ModelUtil::AddDatabaseOps(const std::string &name, const std::string &data,
                                const std::string &db,
                                const std::string &db_type, int batch_size) {
@@ -166,4 +167,208 @@ void ModelUtil::AddConvOps(const std::string &input, const std::string &output,
                     padding, kernel);
 }
 
+// Splits this model at blob layer into firstModel and secondModel. Ops and
+// externals whose blob is in predict.CollectLayers(layer) are appended to
+// firstModel, all others to secondModel. Unless inclusive, a predict op that
+// works in place (input(0) == output(0)) on such a blob goes to secondModel.
+// Unless force_cpu, every copied predict op gets engine "CUDNN". Target nets
+// without a name are named after this model's nets plus "_first"/"_second".
+void ModelUtil::Split(const std::string &layer, ModelUtil &firstModel,
+                      ModelUtil &secondModel, bool force_cpu, bool inclusive) {
+  const std::set<std::string> static_inputs = predict.CollectLayers(layer);
+  const auto in_first = [&static_inputs](const std::string &blob) {
+    return static_inputs.find(blob) != static_inputs.end();
+  };
+
+  // copy operators; an op without outputs never matches a collected blob
+  for (const auto &op : init.net.op()) {
+    const bool is_first = op.output_size() > 0 && in_first(op.output(0));
+    (is_first ? firstModel : secondModel).init.net.add_op()->CopyFrom(op);
+  }
+  for (const auto &op : predict.net.op()) {
+    // input(0)/output(0) are only read when present: indexing an empty
+    // repeated field is out of range, and an op without inputs is not in-place
+    const bool has_output = op.output_size() > 0;
+    const bool in_place =
+        has_output && op.input_size() > 0 && op.input(0) == op.output(0);
+    const bool is_first =
+        has_output && in_first(op.output(0)) && (inclusive || !in_place);
+    auto new_op =
+        (is_first ? firstModel : secondModel).predict.net.add_op();
+    new_op->CopyFrom(op);
+    if (!force_cpu) {
+      new_op->set_engine("CUDNN");  // TODO: not here
+    }
+  }
+
+  // copy externals
+  for (const auto &output : init.net.external_output()) {
+    auto &target = in_first(output) ? firstModel : secondModel;
+    target.init.net.add_external_output(output);
+  }
+  for (const auto &input : predict.net.external_input()) {
+    auto &target = in_first(input) ? firstModel : secondModel;
+    target.predict.net.add_external_input(input);
+  }
+  // a non-empty first part ends at layer, a non-empty second part at Output(0)
+  if (firstModel.predict.net.op().size()) {
+    firstModel.predict.net.add_external_output(layer);
+  }
+  if (secondModel.predict.net.op().size()) {
+    secondModel.predict.net.add_external_output(predict.Output(0));
+  }
+
+  // name the parts after this model, keeping names that are already set
+  if (init.net.has_name()) {
+    if (!firstModel.init.net.has_name()) {
+      firstModel.init.SetName(init.net.name() + "_first");
+    }
+    if (!secondModel.init.net.has_name()) {
+      secondModel.init.SetName(init.net.name() + "_second");
+    }
+  }
+  if (predict.net.has_name()) {
+    if (!firstModel.predict.net.has_name()) {
+      firstModel.predict.SetName(predict.net.name() + "_first");
+    }
+    if (!secondModel.predict.net.has_name()) {
+      secondModel.predict.SetName(predict.net.name() + "_second");
+    }
+  }
+}
+
+// Sets each existing "is_test" arg of a Dropout op to !train; no other change.
+void set_trainable(OperatorDef &op, bool train) {
+  if (op.type() != "Dropout") {
+    return;
+  }
+  for (auto &arg : *op.mutable_arg()) {
+    if (arg.name() == "is_test") arg.set_i(!train);
+  }
+}
+
+// Builds the trainable variant of this model in train:
+//  - predict ops are copied with set_trainable(op, true), after which
+//    train.predict.SetRenameInplace() is called;
+//  - each init op becomes a fill op for the same output: "ConstantFill" if
+//    the blob name contains "_b", else "XavierFill". "shape" args are kept,
+//    but the last FC op's weight gets {out_size, ints(1)}, its bias {out_size};
+//  - external inputs that train neither declares nor produces, and all
+//    external outputs, are carried over. layer is currently unused.
+void ModelUtil::CopyTrain(const std::string &layer, int out_size,
+                          ModelUtil &train) const {
+  std::string fc_weight, fc_bias;  // inputs 1 and 2 of the last FC op seen
+  for (const auto &source : predict.net.op()) {
+    auto copy = train.predict.net.add_op();
+    copy->CopyFrom(source);
+    set_trainable(*copy, true);
+    if (source.type() == "FC") {
+      fc_weight = source.input(1);
+      fc_bias = source.input(2);
+    }
+  }
+  train.predict.SetRenameInplace();
+  // replace every init op by a fill op that only keeps the shape
+  for (const auto &source : init.net.op()) {
+    const auto &blob = source.output(0);
+    auto fill = train.init.net.add_op();
+    const bool is_bias = blob.find("_b") != std::string::npos;
+    fill->set_type(is_bias ? "ConstantFill" : "XavierFill");
+    for (const auto &arg : source.arg()) {
+      if (arg.name() != "shape") continue;
+      auto shape = fill->add_arg();
+      shape->set_name("shape");
+      if (blob == fc_weight) {
+        shape->add_ints(out_size);
+        shape->add_ints(arg.ints(1));
+      } else if (blob == fc_bias) {
+        shape->add_ints(out_size);
+      } else {
+        shape->CopyFrom(arg);
+      }
+    }
+    fill->add_output(blob);
+  }
+  // blobs train already declares as inputs or produces as an op output
+  const auto &declared = train.predict.net.external_input();
+  std::set<std::string> available(declared.begin(), declared.end());
+  for (const auto &op : train.predict.net.op()) {
+    available.insert(op.output().begin(), op.output().end());
+  }
+  for (const auto &input : predict.net.external_input()) {
+    if (available.count(input) == 0) {
+      train.predict.net.add_external_input(input);
+    }
+  }
+  for (const auto &output : predict.net.external_output()) {
+    train.predict.net.add_external_output(output);
+  }
+}
+
+// Copies the predict net into test: ops (with set_trainable(op, false)
+// applied), external inputs and external outputs.
+void ModelUtil::CopyTest(ModelUtil &test) const {
+  for (const auto &source : predict.net.op()) {
+    auto copy = test.predict.net.add_op();
+    copy->CopyFrom(source);
+    set_trainable(*copy, false);
+  }
+  for (const auto &name : predict.net.external_input())
+    test.predict.net.add_external_input(name);
+  for (const auto &name : predict.net.external_output())
+    test.predict.net.add_external_output(name);
+}
+
+// For each init op: if its first output is a blob in workspace, emit a
+// "GivenTensorFill" with that tensor's shape and float values; else copy it.
+void ModelUtil::CopyDeploy(ModelUtil &deploy, Workspace &workspace) const {
+  for (const auto &op : init.net.op()) {
+    const auto &output = op.output(0);
+    auto blob = workspace.GetBlob(output);
+    if (!blob) {
+      deploy.init.net.add_op()->CopyFrom(op);
+      continue;
+    }
+    auto tensor = BlobUtil(*blob).Get();
+    auto init_op = deploy.init.net.add_op();
+    init_op->set_type("GivenTensorFill");
+    auto shape = init_op->add_arg();
+    shape->set_name("shape");
+    for (auto dim : tensor.dims()) shape->add_ints(dim);
+    auto values = init_op->add_arg();
+    values->set_name("values");
+    const auto &data = tensor.data<float>();
+    // index with the tensor's own size type rather than int
+    for (decltype(tensor.size()) i = 0; i < tensor.size(); ++i)
+      values->add_floats(data[i]);
+    init_op->add_output(output);
+  }
+}
+
+// Writes both nets to path_prefix + "_init_net.pb" / "_predict_net.pb" and
+// returns the sum of the two NetUtil::Write results.
+size_t ModelUtil::Write(const std::string &path_prefix) const {
+  const size_t init_size = init.Write(path_prefix + init_net_suffix);
+  return init_size + predict.Write(path_prefix + predict_net_suffix);
+}
+
+// Reads both nets from path_prefix + "_init_net.pb" / "_predict_net.pb" and
+// returns the sum of the two NetUtil::Read results.
+size_t ModelUtil::Read(const std::string &path_prefix) {
+  const size_t init_size = init.Read(path_prefix + init_net_suffix);
+  return init_size + predict.Read(path_prefix + predict_net_suffix);
+}
+
+void ModelUtil::SetName(const std::string &name) {
+  init.SetName(name + init_name_suffix);        // name + "_init"
+  predict.SetName(name + predict_name_suffix);  // name + "_predict"
+}
+
+void ModelUtil::SetDeviceCUDA() {  // forwards to both nets' SetDeviceCUDA()
+  init.SetDeviceCUDA();
+  predict.SetDeviceCUDA();
+}
+
+std::string ModelUtil::Short() { return predict.Short() + init.Short(); }  // predict text, then init text
+
 }  // namespace caffe2
diff --git a/src/caffe2/util/net.cc b/src/caffe2/util/net.cc
index 9626f74b..9bc976d1 100644
--- a/src/caffe2/util/net.cc
+++ b/src/caffe2/util/net.cc
@@ -700,11 +700,11 @@ OperatorDef* NetUtil::AddGradientOp(
     GradientOpsMeta meta = GetGradientForOp(op, output);
     if (meta.ops_.size()) {
       if (meta.ops_.size() > 1) {
-        std::cout << "multiple gradients for operator (" << op.type();
+        std::cerr << "multiple gradients for operator (" << op.type();
         for (auto& o : meta.ops_) {
-          std::cout << " " << o.type();
+          std::cerr << " " << o.type();
         }
-        std::cout << ")" << std::endl;
+        std::cerr << ")" << std::endl;
       }
       grad->CopyFrom(meta.ops_[0]);
     } else {
@@ -794,7 +794,6 @@ std::vector<OperatorDef> NetUtil::CollectGradientOps(
   for (auto& op : net.op()) {
     if (trainable_ops.find(op.type()) != trainable_ops.end()) {
       gradient_ops.push_back(op);
-      // std::cout << "type: " << op.type() << std::endl;
       for (auto& input : op.input()) {
         auto& output = op.output();
         if (std::find(output.begin(), output.end(), input) == output.end()) {
@@ -806,7 +805,7 @@ std::vector<OperatorDef> NetUtil::CollectGradientOps(
         }
       }
     } else if (non_trainable_ops.find(op.type()) == non_trainable_ops.end()) {
-      std::cout << "unknown backprop operator type: " << op.type() << std::endl;
+      CAFFE_THROW("unknown backprop operator type: " + op.type());
     }
   }
   std::reverse(gradient_ops.begin(), gradient_ops.end());
@@ -850,9 +849,9 @@ void NetUtil::CheckLayerAvailable(const std::string& layer) {
     }
   }
   if (!layer_found) {
-    std::cout << "available layers:" << std::endl;
+    std::cerr << "available layers:" << std::endl;
     for (auto& layer : available_layers) {
-      std::cout << "  " << layer.first << " (" << layer.second << ")"
+      std::cerr << "  " << layer.first << " (" << layer.second << ")"
                 << std::endl;
     }
     LOG(FATAL) << "~ no layer with name " << layer << " in model.";
@@ -969,6 +968,24 @@ void NetUtil::Print() {
   google::protobuf::TextFormat::Print(net, &stream);
 }
 
+size_t NetUtil::Write(const std::string& path) const {
+  WriteProtoToBinaryFile(net, path);
+  std::ifstream written(path, std::ifstream::ate | std::ifstream::binary);
+  return written.tellg();  // opened at the end: position == size in bytes
+}
+
+size_t NetUtil::WriteText(const std::string& path) const {
+  WriteProtoToTextFile(net, path);
+  std::ifstream written(path, std::ifstream::ate | std::ifstream::binary);
+  return written.tellg();  // opened at the end: position == size in bytes
+}
+
+size_t NetUtil::Read(const std::string& path) {
+  CAFFE_ENFORCE(ReadProtoFromFile(path.c_str(), &net));
+  std::ifstream loaded(path, std::ifstream::ate | std::ifstream::binary);
+  return loaded.tellg();  // opened at the end: position == size in bytes
+}
+
 void NetUtil::SetDeviceCUDA() {
 #ifdef WITH_CUDA
   net.mutable_device_option()->set_device_type(CUDA);
@@ -982,8 +999,8 @@ OperatorDef* NetUtil::AddRecurrentNetworkOp(const std::string& seq_lengths,
                                             const std::string& hidden_output,
                                             const std::string& cell_state,
                                             bool force_cpu) {
-  NetDef forwardModel;
-  NetUtil forward(forwardModel);
+  NetDef forward_model;
+  NetUtil forward(forward_model);
   forward.SetName(scope);
   forward.SetType("rnn");
   forward.AddInput("input_t");
@@ -1009,12 +1026,12 @@ OperatorDef* NetUtil::AddRecurrentNetworkOp(const std::string& seq_lengths,
     fc->mutable_device_option()->set_device_type(CUDA);
     sum->mutable_device_option()->set_device_type(CUDA);
     lstm->mutable_device_option()->set_device_type(CUDA);
-    forwardModel.mutable_device_option()->set_device_type(CUDA);
+    forward.SetDeviceCUDA();
   }
 #endif
 
-  NetDef backwardModel;
-  NetUtil backward(backwardModel);
+  NetDef backward_model;
+  NetUtil backward(backward_model);
   backward.SetName("RecurrentBackwardStep");
   backward.SetType("simple");
   backward.AddGradientOp(*lstm);
@@ -1035,11 +1052,9 @@ OperatorDef* NetUtil::AddRecurrentNetworkOp(const std::string& seq_lengths,
   backward.AddInput(seq_lengths);
   backward.AddInput(scope + "/hidden_t");
   backward.AddInput(scope + "/cell_t");
-#ifdef WITH_CUDA
   if (!force_cpu) {
-    backwardModel.mutable_device_option()->set_device_type(CUDA);
+    backward.SetDeviceCUDA();
   }
-#endif
 
   auto op =
       AddOp("RecurrentNetwork",