uidlist.go

package algo

import (
	"bytes"
	"container/heap"
	"sort"
	"strconv"

	"github.com/google/flatbuffers/go"

	"github.com/dgraph-io/dgraph/task"
	"github.com/dgraph-io/dgraph/x"
)

type UIDList struct {
	uints []uint64
	list  *task.UidList
}

// NewUIDList creates a new UIDList given uint64 array. If you want your list
// to be allocated on stack, do consider using FromUints instead. However, if you
// are certain you are allocating on heap, you can use this.
func NewUIDList(data []uint64) *UIDList {
	u := new(UIDList)
	u.FromUints(data)
	return u
}

// FromUints initialize UIDList from []uint64.
func (u *UIDList) FromUints(data []uint64) {
	x.Assert(u != nil && u.uints == nil && u.list == nil)
	u.uints = data
}

// FromUints initialize UIDList from task.UidList.
func (u *UIDList) FromTask(data *task.UidList) {
	x.Assert(u != nil && u.uints == nil && u.list == nil)
	u.list = data
}

// Get returns the i-th element of UIDList.
func (u *UIDList) Get(i int) uint64 {
	if u.list != nil {
		return u.list.Uids(i)
	}
	return u.uints[i]
}

// Size returns size of UIDList.
func (u *UIDList) Size() int {
	if u == nil {
		// In a subgraph node, in processGraph, sometimes we might fan out to zero
		// nodes, i.e., sg.destUIDs is empty. In this case, child subgraph might not
		// have its srcUIDs initialized.
		return 0
	}
	if u.list != nil {
		return u.list.UidsLength()
	}
	return len(u.uints)
}

// Reslice selects a slice of the data.
func (u *UIDList) ApplyFilter(f func(uid uint64) bool) {
	x.Assert(u != nil && (u.uints != nil || u.list != nil))
	var out []uint64
	if u.uints != nil {
		out = u.uints[:0]
	} else {
		out = make([]uint64, 0, 30)
	}
	n := u.Size()
	for i := 0; i < n; i++ {
		uid := u.Get(i)
		if f(uid) {
			out = append(out, uid)
		}
	}
	u.uints = out
	u.list = nil
}

// Slice selects a slice of the data.
func (u *UIDList) Slice(start, end int) {
	x.Assert(u != nil && (u.uints != nil || u.list != nil))
	if u.uints != nil {
		u.uints = u.uints[start:end]
		return
	}
	// This is a task list. Let's copy what we want and convert to a []uint64.
	x.Assert(start >= 0)
	x.Assert(end <= u.Size())
	x.Assert(start <= end)
	output := make([]uint64, 0, end-start)
	for i := start; i < end; i++ {
		output = append(output, u.list.Uids(i))
	}
	u.uints = output
	u.list = nil
}

// Intersect intersects with another list and updates this list.
func (u *UIDList) Intersect(v *UIDList) {
	x.Assert(u != nil && (u.uints != nil || u.list != nil))
	var out []uint64
	if u.uints != nil {
		out = u.uints[:0]
	} else {
		n := u.Size()
		if v.Size() < n {
			n = v.Size()
		}
		out = make([]uint64, 0, n)
	}

	n := u.Size()
	m := v.Size()
	var k int
	for i := 0; i < n; i++ {
		uid := u.Get(i)
		for ; k < m && v.Get(k) < uid; k++ {
		}
		if k < m && v.Get(k) == uid {
			out = append(out, uid)
		}
	}
	u.uints = out
	u.list = nil
}

// IntersectLists intersect a list of UIDLists. An alternative is to do
// pairwise intersections n-1 times where n=number of lists. This is less
// efficient:
// Let p be length of shortest list. Let q be average length of lists. So
// nq = total number of elements.
// There are many possible cases. Consider the case where the shortest list
// is the answer (or close to the answer). The following method requires nq
// reads (each element is read only once) whereas pairwise intersections can
// require np + nq - p reads, which can be up to ~twice as many.
func IntersectLists(lists []*UIDList) *UIDList {
	if len(lists) == 0 {
		return NewUIDList([]uint64{})
	}

	// Scan through the smallest list. Denote as A.
	// For each x in A,
	//   For each other list B,
	//     Keep popping elements until we get a y >= x.
	//     If y > x, mark x as "skipped". Break out of loop.
	//   If x is not marked as "skipped", append x to result.
	var minLenIdx int
	minLen := lists[0].Size()
	for i := 1; i < len(lists); i++ { // Start from 1.
		l := lists[i]
		n := l.Size()
		if n < minLen {
			minLen = n
			minLenIdx = i
		}
	}

	// Our final output. Give it some capacity.
	output := make([]uint64, 0, minLen)
	// idx[i] is the element we are looking at for lists[i].
	idx := make([]int, len(lists))
	shortList := lists[minLenIdx]
	for i := 0; i < shortList.Size(); i++ {
		val := shortList.Get(i)
		if i > 0 && val == shortList.Get(i-1) {
			continue // Avoid duplicates.
		}

		var skip bool                     // Should we skip val in output?
		for j := 0; j < len(lists); j++ { // For each other list in lists.
			if j == minLenIdx {
				// No point checking yourself.
				continue
			}
			l := lists[j]
			lidx := idx[j]
			lsz := l.Size()
			for ; lidx < lsz && l.Get(lidx) < val; lidx++ {
			}
			idx[j] = lidx
			if lidx >= lsz || l.Get(lidx) > val {
				skip = true
				break
			}
			// Otherwise, l[k] = val and we continue checking other lists.
		}
		if !skip {
			output = append(output, val)
		}
	}

	ul := new(UIDList)
	ul.FromUints(output)
	return ul
}

// MergeLists merges sorted uint64 lists. Only unique numbers are returned.
func MergeLists(lists []*UIDList) *UIDList {
	if len(lists) == 0 {
		return NewUIDList([]uint64{})
	}

	h := &uint64Heap{}
	heap.Init(h)

	for i, l := range lists {
		if l.Size() > 0 {
			heap.Push(h, elem{
				val:     l.Get(0),
				listIdx: i,
			})
		}
	}

	// Our final output. Give it some capacity.
	output := make([]uint64, 0, 100)
	// idx[i] is the element we are looking at for lists[i].
	idx := make([]int, len(lists))
	var last uint64   // Last element added to sorted / final output.
	for h.Len() > 0 { // While heap is not empty.
		me := (*h)[0] // Peek at the top element in heap.
		if len(output) == 0 || me.val != last {
			output = append(output, me.val) // Add if unique.
			last = me.val
		}
		l := lists[me.listIdx]
		if idx[me.listIdx] >= l.Size()-1 {
			heap.Pop(h)
		} else {
			idx[me.listIdx]++
			val := l.Get(idx[me.listIdx])
			(*h)[0].val = val
			heap.Fix(h, 0) // Faster than Pop() followed by Push().
		}
	}
	ul := new(UIDList)
	ul.FromUints(output)
	return ul
}

// IndexOf performs a binary search on the uids slice and returns the index at
// which it finds the uid, else returns -1
func (u *UIDList) IndexOf(uid uint64) int {
	x.Assert(u != nil && (u.uints != nil || u.list != nil))
	i := sort.Search(u.Size(), func(i int) bool { return u.Get(i) >= uid })
	if i < u.Size() && u.Get(i) == uid {
		return i
	}
	return -1
}

// UidlistOffset adds a UidList to buffer and returns the offset.
func (u *UIDList) AddTo(b *flatbuffers.Builder) flatbuffers.UOffsetT {
	x.Assert(u != nil && (u.uints != nil || u.list != nil))
	n := u.Size()
	task.UidListStartUidsVector(b, n)
	for i := n - 1; i >= 0; i-- {
		b.PrependUint64(u.Get(i))
	}
	ulist := b.EndVector(n)
	task.UidListStart(b)
	task.UidListAddUids(b, ulist)
	return task.UidListEnd(b)
}

// DebugString returns a debug string for UIDList.
func (u *UIDList) DebugString() string {
	var b bytes.Buffer
	for i := 0; i < u.Size(); i++ {
		b.WriteRune('[')
		b.WriteString(strconv.FormatUint(u.Get(i), 10))
		b.WriteString("] ")
	}
	return b.String()
}