uidlist.go

package algo

import (
	"container/heap"
	"sort"

	"github.com/dgraph-io/dgraph/task"
)

const blockSize = 100

// ListIterator is used to read through the task.List.
type ListIterator struct {
	list     *task.List
	curBlock *task.Block
	bidx     int  // Block index
	lidx     int  // List index
	isEnd    bool // To indicate reaching the end
}

// WriteIterator is used to append UIDs to a task.List.
type WriteIterator struct {
	list     *task.List
	curBlock *task.Block
	bidx     int // Block index
	lidx     int // List index

}

// AsList implements sort.Interface by for block lists
type AsList struct{ l *task.List }

func (s AsList) Len() int { return ListLen(s.l) }
func (s AsList) Swap(i, j int) {
	p, q := ridx(s.l, i)
	m, n := ridx(s.l, j)
	s.l.Blocks[p].List[q], s.l.Blocks[m].List[n] = s.l.Blocks[m].List[n], s.l.Blocks[p].List[q]
}
func (s AsList) Less(i, j int) bool {
	p, q := ridx(s.l, i)
	m, n := ridx(s.l, j)
	return s.l.Blocks[p].List[q] < s.l.Blocks[m].List[n]
}

func Sort(ul *task.List) {
	sort.Sort(AsList{ul})
	for _, it := range ul.Blocks {
		it.MaxInt = it.List[len(it.List)-1]
	}
}

func NewWriteIterator(l *task.List) WriteIterator {
	var cur *task.Block

	// Initialise and allocate some memory.
	if len(l.Blocks) == 0 {
		l.Blocks = make([]*task.Block, 0, 2)
	}

	if len(l.Blocks) > 0 {
		cur = l.Blocks[0]
	}
	return WriteIterator{
		list:     l,
		curBlock: cur,
		bidx:     0,
		lidx:     0,
	}
}

// Append appends an UID to the end of task.List following the blockSize specified.
func (l *WriteIterator) Append(uid uint64) {
	if l.lidx == blockSize {
		l.curBlock.MaxInt = l.curBlock.List[blockSize-1]
		l.lidx = 0
		l.bidx++
	}
	if l.bidx == len(l.list.Blocks) {
		// If we reached the end of blocks, add a new one.
		l.list.Blocks = append(l.list.Blocks, &task.Block{List: make([]uint64, blockSize)})
	}
	l.curBlock = l.list.Blocks[l.bidx]
	l.curBlock.List[l.lidx] = uid
	l.lidx++
}

// End is called after the write is over to update the MaxInt of the last block.
func (l *WriteIterator) End() {
	l.list.Blocks = l.list.Blocks[:l.bidx+1]
	if l.lidx == 0 {
		l.list.Blocks = l.list.Blocks[:l.bidx]
		return
	}
	l.curBlock.List = l.curBlock.List[:l.lidx]
	l.curBlock.MaxInt = l.curBlock.List[l.lidx-1]
}

// ListLen returns the number of items written.
func (l *WriteIterator) ListLen() int {
	return l.bidx*blockSize + l.lidx
}

// NewListIterator returns a read iterator for the list passed to it.
func NewListIterator(l *task.List) ListIterator {
	var isEnd bool
	if l == nil || len(l.Blocks) == 0 ||
		len(l.Blocks[0].List) == 0 {
		isEnd = true
	}

	var cur *task.Block
	if l != nil && len(l.Blocks) > 0 {
		cur = l.Blocks[0]
	}

	return ListIterator{
		list:     l,
		curBlock: cur,
		bidx:     0,
		lidx:     0,
		isEnd:    isEnd,
	}
}

// SeekToIndex moves the iterator to the specified index.
func (l *ListIterator) SeekToIndex(idx int) {
	i, j := ridx(l.list, idx)
	if i == -1 {
		l.isEnd = true
		return
	}
	l.bidx = i
	l.lidx = j
}

// Seek seeks to the index whose value is greater than or equal to the given UID.
// It uses binary search to move around.
func (l *ListIterator) Seek(uid uint64, whence int) {
	if whence == 1 {
		if l.isEnd {
			return
		}
		// Seek the current list first.
		for l.lidx < len(l.curBlock.List) && l.curBlock.List[l.lidx] < uid {
			l.lidx++
		}
		if l.lidx < len(l.curBlock.List) {
			return
		}
	}
	// TODO(Ashwin): Do a benchmark to see if linear scan is better than binary if whence is 1
	u := l.list
	i := sort.Search(len(u.Blocks), func(i int) bool { return u.Blocks[i].MaxInt >= uid })
	if i >= len(u.Blocks) {
		l.isEnd = true
		return
	}
	j := sort.Search(len(u.Blocks[i].List), func(j int) bool { return u.Blocks[i].List[j] >= uid })
	if j == len(u.Blocks[i].List) {
		l.isEnd = true
		return
	}
	l.bidx = i
	l.curBlock = l.list.Blocks[l.bidx]
	l.lidx = j
}

// Valid returns true if we haven't reached the end of the list.
func (l *ListIterator) Valid() bool {
	return !l.isEnd
}

// Val returns the value pointed to by the iterator.
func (l *ListIterator) Val() uint64 {
	return l.curBlock.List[l.lidx]
}

// Next moves the iterator to the next element and also sets the end if the last element
// is consumed already.
func (l *ListIterator) Next() {
	if l.isEnd {
		return
	}
	l.lidx++
	if l.lidx >= len(l.curBlock.List) {
		l.lidx = 0
		l.bidx++
	}
	if l.bidx >= len(l.list.Blocks) {
		l.isEnd = true
		return
	}
	// Update the current block.
	l.curBlock = l.list.Blocks[l.bidx]
	if len(l.curBlock.List) == 0 {
		l.isEnd = true
	}
}

// Slice returns a new task.List with the elements between start index and end index
// of  the list passed to it.
func Slice(ul *task.List, start, end int) {
	out := NewWriteIterator(ul)
	it := NewListIterator(ul)
	it.SeekToIndex(start)

	i := 0
	for ; start+i < end && it.Valid(); it.Next() {
		out.Append(it.Val())
		i++
	}
	out.End()
}

func SortedListToBlock(l []uint64) *task.List {
	b := new(task.List)
	if len(l) == 0 {
		return b
	}
	wit := NewWriteIterator(b)
	for _, it := range l {
		wit.Append(it)
	}

	wit.End()
	return b
}

func ListLen(l *task.List) int {
	if l == nil || len(l.Blocks) == 0 {
		return 0
	}

	n := len(l.Blocks) - 1
	length := n*blockSize + len(l.Blocks[n].List)
	return length
}

func IntersectWith(u, v *task.List) {
	itu := NewListIterator(u)
	itv := NewListIterator(v)
	out := NewWriteIterator(u)
	for itu.Valid() && itv.Valid() {
		uid := itu.Val()
		vid := itv.Val()
		if uid == vid {
			out.Append(uid)
			itu.Next()
			itv.Next()
		} else if uid < vid {
			itu.Seek(vid, 1)
		} else if uid > vid {
			itv.Seek(uid, 1)
		}
	}
	out.End()
}

// ApplyFilter applies a filter to our UIDList.
func ApplyFilter(u *task.List, f func(uint64, int) bool) {
	out := NewWriteIterator(u)
	for i, block := range u.Blocks {
		for j, uid := range block.List {
			if f(uid, Idx(u, i, j)) {
				out.Append(uid)
			}
		}
	}
	out.End()
}

// IntersectSorted intersect a list of UIDLists. An alternative is to do
// pairwise intersections n-1 times where n=number of lists. This is less
// efficient:
// Let p be length of shortest list. Let q be average length of lists. So
// nq = total number of elements.
// There are many possible cases. Consider the case where the shortest list
// is the answer (or close to the answer). The following method requires nq
// reads (each element is read only once) whereas pairwise intersections can
// require np + nq - p reads, which can be up to ~twice as many.
func IntersectSorted(lists []*task.List) *task.List {
	o := new(task.List)
	if len(lists) == 0 {
		return o
	}
	out := NewWriteIterator(o)
	// Scan through the smallest list. Denote as A.
	// For each x in A,
	//   For each other list B,
	//     Keep popping elements until we get a y >= x.
	//     If y > x, mark x as "skipped". Break out of loop.
	//   If x is not marked as "skipped", append x to result.
	var minLenIdx int
	minLen := ListLen(lists[0])
	for i := 1; i < len(lists); i++ { // Start from 1.
		l := lists[i]
		n := ListLen(l)
		if n < minLen {
			minLen = n
			minLenIdx = i
		}
	}

	// lptrs[j] is the element we are looking at for lists[j].
	lptrs := make([]ListIterator, len(lists))
	for i, l := range lists {
		lptrs[i] = NewListIterator(l)
	}
	shortListIt := lptrs[minLenIdx]
	elemsLeft := true // If some list has no elems left, we can't intersect more.

	for ; shortListIt.Valid() && elemsLeft; shortListIt.Next() { //for i := 0; i < len(shortList.Uids) && elemsLeft; i++ {
		val := shortListIt.Val()
		var skip bool                     // Should we skip val in output?
		for j := 0; j < len(lists); j++ { // For each other list in lists.
			if j == minLenIdx {
				// No point checking yourself.
				continue
			}

			for ; lptrs[j].Valid() && lptrs[j].Val() < val; lptrs[j].Next() {
			}

			if !lptrs[j].Valid() || lptrs[j].Val() > val {
				elemsLeft = lptrs[j].Valid()
				skip = true
				break
			}
			// Otherwise, lj.Get(ljp) = val and we continue checking other lists.
		}
		if !skip {
			out.Append(val)
		}
	}
	out.End()
	return o
}

func Difference(u, v *task.List) {
	itu := NewListIterator(u)
	itv := NewListIterator(v)
	out := NewWriteIterator(u)
	for itu.Valid() && itv.Valid() {
		uid := itu.Val()
		vid := itv.Val()
		if uid == vid {
			itu.Next()
			itv.Next()
		} else if uid < vid {
			out.Append(uid)
			itu.Next()
		} else if uid > vid {
			itv.Seek(uid, 1)
		}
	}
	out.End()
}

// MergeSorted merges sorted lists.
func MergeSorted(lists []*task.List) *task.List {
	o := new(task.List)
	if len(lists) == 0 {
		return o
	}
	out := NewWriteIterator(o)
	h := &uint64Heap{}
	heap.Init(h)
	var lIt []ListIterator
	for i, l := range lists {
		it := NewListIterator(l)
		lIt = append(lIt, it)
		if it.Valid() {
			heap.Push(h, elem{
				val:     it.Val(),
				listIdx: i,
			})
		}
	}

	var last uint64   // Last element added to sorted / final output.
	for h.Len() > 0 { // While heap is not empty.
		me := (*h)[0] // Peek at the top element in heap.
		if (out.lidx == 0 && out.bidx == 0) || me.val != last {
			out.Append(me.val) // Add if unique.
			last = me.val
		}
		lIt[me.listIdx].Next()
		if !lIt[me.listIdx].Valid() {
			heap.Pop(h)
		} else {
			val := lIt[me.listIdx].Val()
			(*h)[0].val = val
			heap.Fix(h, 0) // Faster than Pop() followed by Push().
		}
	}
	out.End()
	return o
}

// IndexOf performs a binary search on the uids slice and returns the index at
// which it finds the uid, else returns -1
func IndexOf(u *task.List, uid uint64) (int, int) {
	i := sort.Search(len(u.Blocks), func(i int) bool { return u.Blocks[i].MaxInt >= uid })
	if i < len(u.Blocks) {
		j := sort.Search(len(u.Blocks[i].List), func(j int) bool { return u.Blocks[i].List[j] >= uid })
		if j < len(u.Blocks[i].List) && u.Blocks[i].List[j] == uid {
			return i, j
		}
	}
	return -1, -1
}

func Swap(ul *task.List, i, j int) {
	i1, i2 := ridx(ul, i)
	j1, j2 := ridx(ul, j)
	ul.Blocks[i1].List[i2], ul.Blocks[j1].List[j2] = ul.Blocks[j1].List[j2], ul.Blocks[i1].List[i2]
}

func ItemAtIndex(ul *task.List, i int) uint64 {
	i, j := ridx(ul, i)
	return ul.Blocks[i].List[j]
}

func ridx(ul *task.List, i int) (int, int) {
	if i >= ListLen(ul) {
		return -1, -1
	}

	return i / blockSize, i % blockSize
}

func Idx(ul *task.List, i, j int) int {
	return i*blockSize + j
}

// ToUintsListForTest converts to list of uints for testing purpose only.
func ToUintsListForTest(ul []*task.List) [][]uint64 {
	out := make([][]uint64, 0, len(ul))
	for _, u := range ul {
		out = append(out, BlockToList(u))
	}
	return out
}

func BlockToList(b *task.List) []uint64 {
	res := make([]uint64, 0, 5)
	for _, it := range b.Blocks {
		for _, el := range it.List {
			res = append(res, el)
		}
	}
	return res
}