From 93584ef682177e486f4b9dd4da86e4998eebf113 Mon Sep 17 00:00:00 2001
From: Manish R Jain <manishrjain@gmail.com>
Date: Mon, 30 Nov 2015 10:56:50 +1100
Subject: [PATCH] Update README with notes about loader performance

---
 Dockerfile |  2 +-
 README.md  | 31 ++++++++++++++++++++++++++++---
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index a781ae2d..c8eed48b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,7 +21,7 @@ ENV LD_LIBRARY_PATH "/usr/local/lib"
 # Install DGraph and update dependencies to right versions.
 RUN go get -v github.com/robfig/glock && \
 	go get -v github.com/dgraph-io/dgraph/... && \
-	glock sync github.com/dgraph-io/dgraph && echo "v0.1"
+	glock sync github.com/dgraph-io/dgraph && echo "v0.1.0"
 
 # Run some tests, don't build an image if we're failing tests.
 RUN go test github.com/dgraph-io/dgraph/...
diff --git a/README.md b/README.md
index 8f60b62a..0e96975e 100644
--- a/README.md
+++ b/README.md
@@ -41,15 +41,40 @@ $ git clone https://github.com/dgraph-io/benchmarks.git
 ```
 
 To load the data in bulk, use the data loader binary in dgraph/server/loader.
-Loader needs 2 directories:
-- mutations, where mutation commit logs are stored and
-- postings, where posting lists are stored.
+The loader needs a postings directory, where posting lists are stored.
 
 ```
 $ cd $GOPATH/src/github.com/dgraph-io/dgraph/server/loader
 $ go build . && ./loader --rdfgzips=path_of_benchmarks_dir/data/rdf-films.gz,path_of_benchmarks_dir/data/names.gz --postings DIRPATH/p
 ```
 
+### Loading performance
+The loader is memory bound. Every mutation loads a posting list into memory, and mutations
+are applied in layers above the posting lists.
+While the loader doesn't write to disk on every mutation, it does periodically
+merge all the mutations into posting lists and write them to RocksDB, which persists them.
+How often this merging happens can be fine-tuned by specifying `max_ram_mb`.
+Whenever the loader determines that its memory usage exceeds this threshold, it *stops the world* and starts the merge process.
+The more memory available to the loader, the less merging needs to be done, and the faster the loading.
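+
+For example, assuming the flag is passed as `--max_ram_mb` (spelling inferred from the flag name above, in the same style as the other loader flags), an invocation with a 3000 MB cap would look like:
+
+```
+$ ./loader --rdfgzips=path_of_benchmarks_dir/data/names.gz --postings DIRPATH/p --max_ram_mb=3000
+```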
+
+Thus, loader performance is highly dependent on merging performance, which in turn depends on how fast the underlying persistent storage is.
+So, when it comes to loading performance: *RAMFS/TMPFS > SSD > hard disk*.
+
+As a reference point, it takes 220 seconds to load 4.1M RDFs from `names.gz` (from the benchmarks repository) on my 6-core Intel Xeon Dell Precision T3500, using a 1G TMPFS for the postings directory and with the `max_ram_mb=3000` flag set.
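+
+To experiment with a TMPFS-backed postings directory like the one above, a 1G mount can be set up with the standard Linux `mount` command (the mount point here is illustrative):
+
+```
+$ mkdir -p /mnt/ramdisk
+$ sudo mount -t tmpfs -o size=1g tmpfs /mnt/ramdisk
+```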
+
 ## Querying
 Once data is loaded, point the dgraph server to the postings and mutations directory.
 ```
-- 
GitLab