From 9a3472b750bfc03b906ebe19506c56cbdb1eb22d Mon Sep 17 00:00:00 2001 From: Manish R Jain <manishrjain@gmail.com> Date: Thu, 22 Oct 2015 15:03:49 +1100 Subject: [PATCH] More thought spent on how query execution would work. Changing things. --- query/query.go | 30 +++++++++++++++++++++++++----- query/result.fbs | 14 ++++++++++++++ query/thoughts.md | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 query/thoughts.md diff --git a/query/query.go b/query/query.go index fceda987..9426db65 100644 --- a/query/query.go +++ b/query/query.go @@ -46,12 +46,30 @@ import ( var log = x.Log("query") -type Mattr struct { - Attr string - Msg *Message +type SubGraph struct { + Attr string + Children []*SubGraph + + Query []byte + Result []byte +} + +func NewGraph(id uint64, xid string) *SubGraph { + // This would set the Result field in SubGraph, + // and populate the children for attributes. + return nil +} - ResultUids []byte // Flatbuffer result.Uids - ResultValue []byte // gob.Encode +type Mattr struct { + Attr string + Msg *Mattr + Query []byte // flatbuffer + Result []byte // flatbuffer + + /* + ResultUids []byte // Flatbuffer result.Uids + ResultValue []byte // gob.Encode + */ } type Message struct { @@ -130,7 +148,9 @@ func Run(m *Message) error { x.Err(log, err).WithField("uid", m.Id).WithField("attr", mattr.Attr). Error("While extracting data from posting list") } + if mattr.Msg != nil { + // Now this would most likely be sent over wire to other servers. 
			if err := Run(mattr.Msg); err != nil { return err } diff --git a/query/result.fbs b/query/result.fbs index 8517d3fb..e75f4fc3 100644 --- a/query/result.fbs +++ b/query/result.fbs @@ -4,4 +4,18 @@ table Uids { uid:[ulong]; } +table TaskQuery { + attr:string; + uids:[ulong]; +} + +struct Value { + val:[ubyte]; +} + +table TaskResult { + uids:[ulong]; + values:[Value]; +} + root_type Uids; diff --git a/query/thoughts.md b/query/thoughts.md new file mode 100644 index 00000000..4e9ee160 --- /dev/null +++ b/query/thoughts.md @@ -0,0 +1,32 @@ +How to generate a unique list of uids by querying a list of posting lists? + +Sol 1: +- Say there are k posting lists involved. +- One way to do so is to have a heap of k elements. +- At each iteration, we pop() an element from the heap (log k) +- Advance the pointer of that posting list, and retrieve another element (involves mutex read lock) +- Push() that element into the heap (log k) +- This would give us O(N*log k), with mutex lock acquired N times. +- With N=1000 and k=5, this gives us 1000 * ln(5) ~ 1600 + +Performance Improvements (memory tradeoff) [Sol1a]: +- We can alleviate the need for mutex locks by copying over all the posting list uids into separate vectors. +- This would avoid N lock acquisitions, only requiring the best-case scenario of k locks. +- But this also means all the posting list uids would be stored in memory. + +Performance with Memory [Sol1b]: +- Use k channels, with each channel only maintaining a buffer of say 1000 uids. +- In fact, keep the read lock acquired during this process, to prevent the posting list from changing during a query. +- So, basically have a way for a posting list to stream uids to a blocking channel, after having acquired a read lock. +- Overall this process of merging uids shouldn't take that long anyway; so this won't starve writes, only delay them. + +Another way [Sol2]: +- Pick a posting list, copy all its uids in one go (one mutex lock) +- Use a binary tree to store uids. 
Eliminate duplicates. +- Iterate over each element in the uids vector, and insert it into the binary tree. [O(log N) max per insert, assuming the tree is kept balanced] +- Repeat with other posting lists. +- This would give us O(N log N) complexity, with mutex lock acquired k times. +- With N=1000 and k=5, this gives us 1000 * ln(1000) ~ 7000 +- Not choosing this path. + +Solution: Sol1b -- GitLab