Newer
Older
/*
* Copyright (C) 2017 Dgraph Labs, Inc. and Contributors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
Manish R Jain
committed
package worker
import (
Manish R Jain
committed
"encoding/binary"
Manish R Jain
committed
"log"
Manish R Jain
committed
"math/rand"
Manish R Jain
committed
"sync"
"time"
"github.com/coreos/etcd/raft"
"github.com/coreos/etcd/raft/raftpb"
"golang.org/x/net/context"
"golang.org/x/net/trace"
Manish R Jain
committed
"github.com/dgraph-io/dgraph/conn"
"github.com/dgraph-io/dgraph/protos"
"github.com/dgraph-io/dgraph/schema"
Manish R Jain
committed
"github.com/dgraph-io/dgraph/x"
)
errorNodeIDExists = "Error Node ID already exists in the cluster"
type proposalCtx struct {
ch chan error
ctx context.Context
cnt int // used for reference counting
// Since each proposal consists of multiple tasks we need to store
// non-nil error returned by task
err error
index uint64
n *node
Manish R Jain
committed
type proposals struct {
sync.RWMutex
Manish R Jain
committed
}
func (p *proposals) Store(pid uint32, pctx *proposalCtx) bool {
Manish R Jain
committed
p.Lock()
defer p.Unlock()
if _, has := p.ids[pid]; has {
return false
}
Manish R Jain
committed
}
// IncRef raises the reference count of proposal pid by count and records
// index on it. The proposal must already be registered; this is asserted.
func (p *proposals) IncRef(pid uint32, index uint64, count int) {
	p.Lock()
	defer p.Unlock()

	pctx, ok := p.ids[pid]
	x.AssertTrue(ok)
	pctx.index = index
	pctx.cnt += count
}
// Ctx looks up the context stored for proposal pid. The boolean result is
// false, and the context nil, when pid is not registered.
func (p *proposals) Ctx(pid uint32) (context.Context, bool) {
	p.RLock()
	defer p.RUnlock()

	pctx, ok := p.ids[pid]
	if !ok {
		return nil, false
	}
	return pctx.ctx, true
}
Manish R Jain
committed
func (p *proposals) Done(pid uint32, err error) {
p.Lock()
Manish R Jain
committed
if !has {
return
}
x.AssertTrue(pd.cnt > 0 && pd.index != 0)
pd.cnt -= 1
if err != nil {
pd.err = err
}
if pd.cnt > 0 {
return
}
delete(p.ids, pid)
pd.ch <- pd.err
// We emit one pending watermark as soon as we read from rd.committedentries.
// Since the tasks are executed in goroutines we need on guarding watermark which
// is done only when all the pending sync/applied marks have been emitted.
pd.n.Applied.Done(pd.index)
posting.SyncMarkFor(pd.n.gid).Done(pd.index)
Manish R Jain
committed
}
// Has reports whether a proposal is currently registered under pid.
func (p *proposals) Has(pid uint32) bool {
	p.RLock()
	_, ok := p.ids[pid]
	p.RUnlock()
	return ok
}
Manish R Jain
committed
type node struct {
*conn.Node
// Changed after init but not protected by SafeMutex
requestCh chan linReadReq
// Fields which are never changed after init.
applyCh chan raftpb.Entry
ctx context.Context
stop chan struct{} // to send the stop signal to Run
done chan struct{} // to check whether node is running or not
gid uint32
props proposals
sch *scheduler
Manish R Jain
committed
func newNode(gid uint32, id uint64, myAddr string) *node {
x.Printf("Node with GroupID: %v, ID: %v\n", gid, id)
Manish R Jain
committed
rc := &protos.RaftContext{
Manish R Jain
committed
Addr: myAddr,
Group: gid,
Id: id,
}
m := conn.NewNode(rc)
props := proposals{
ids: make(map[uint32]*proposalCtx),
}
Manish R Jain
committed
n := &node{
Node: m,
requestCh: make(chan linReadReq),
ctx: context.Background(),
gid: gid,
// processConfChange etc are not throttled so some extra delta, so that we don't
// block tick when applyCh is full
applyCh: make(chan raftpb.Entry, Config.NumPendingProposals+1000),
props: props,
stop: make(chan struct{}),
done: make(chan struct{}),
sch: new(scheduler),
Manish R Jain
committed
}
Manish R Jain
committed
return n
}
Manish R Jain
committed
// header pairs a proposal id with a message id. Its encoded form is six
// bytes, little-endian: the proposal id followed by the message id.
type header struct {
	proposalId uint32
	msgId      uint16
}

// Length returns the size in bytes of an encoded header.
func (h *header) Length() int {
	// 4 bytes for proposalId, 2 bytes for msgId.
	return 6
}

// Encode serialises the header into a freshly allocated slice of Length bytes.
func (h *header) Encode() []byte {
	buf := make([]byte, h.Length())
	idPart, msgPart := buf[:4], buf[4:6]
	binary.LittleEndian.PutUint32(idPart, h.proposalId)
	binary.LittleEndian.PutUint16(msgPart, h.msgId)
	return buf
}

// Decode fills the header from the first six bytes of in, which must hold an
// encoded header; a shorter slice makes the slicing panic.
func (h *header) Decode(in []byte) {
	h.proposalId = binary.LittleEndian.Uint32(in[:4])
	h.msgId = binary.LittleEndian.Uint16(in[4:6])
}
func (n *node) ProposeAndWait(ctx context.Context, proposal *protos.Proposal) error {
return x.Errorf("RAFT isn't initialized yet")
}
// TODO: Should be based on number of edges (amount of work)
x.PendingProposals.Add(1)
defer func() { <-pendingProposals; x.PendingProposals.Add(-1) }()
if ctx.Err() != nil {
return ctx.Err()
}
// Do a type check here if schema is present
// In very rare cases invalid entries might pass through raft, which would
// be persisted, we do best effort schema check while writing
if proposal.Mutations != nil {
for _, edge := range proposal.Mutations.Edges {
if typ, err := schema.State().TypeOf(edge.Attr); err != nil {
continue
} else if err := ValidateAndConvert(edge, typ); err != nil {
for _, schema := range proposal.Mutations.Schema {
if err := checkSchema(schema); err != nil {
return err
}
}
pctx := &proposalCtx{
ch: che,
ctx: ctx,
Manish R Jain
committed
Manish R Jain
committed
if err != nil {
return err
}
Manish R Jain
committed
// we don't timeout on a mutation which has already been proposed.
if err = n.Raft().Propose(ctx, slice[:upto]); err != nil {
Manish R Jain
committed
return x.Wrapf(err, "While proposing")
}
// Wait for the proposal to be committed.
if tr, ok := trace.FromContext(ctx); ok {
tr.LazyPrintf("Waiting for the proposal: mutations.")
}
if tr, ok := trace.FromContext(ctx); ok {
tr.LazyPrintf("Waiting for the proposal: membership update.")
}
log.Fatalf("Unknown proposal")
err = <-che
if err != nil {
if tr, ok := trace.FromContext(ctx); ok {
tr.LazyPrintf(err.Error())
Manish R Jain
committed
}
Manish R Jain
committed
}
// processMutation applies a single edge mutation that belongs to proposal pid
// and was committed at raft index.
//
// The context passed to runMutation is the one stored for pid in n.props,
// falling back to n.ctx when the proposal is not registered. It is tagged
// under the "raft" key with an x.RaftValue holding this node's group id and
// the entry index. A failure is written to the trace attached to that
// context, if there is one, and returned to the caller.
func (n *node) processMutation(pid uint32, index uint64, edge *protos.DirectedEdge) error {
	ctx, has := n.props.Ctx(pid)
	if !has {
		ctx = n.ctx
	}
	rv := x.RaftValue{Group: n.gid, Index: index}
	ctx = context.WithValue(ctx, "raft", rv)
	if err := runMutation(ctx, edge); err != nil {
		if tr, ok := trace.FromContext(ctx); ok {
			// Pass the message as an argument: used as the format string, any
			// '%' in the error text would be misread as a verb.
			tr.LazyPrintf("%s", err.Error())
		}
		return err
	}
	return nil
}
// processSchemaMutations applies the schema update s that belongs to proposal
// pid and was committed at raft index.
//
// The context passed to runSchemaMutation is the one stored for pid in
// n.props, falling back to n.ctx when the proposal is not registered. It is
// tagged under the "raft" key with an x.RaftValue holding this node's group id
// and the entry index. A failure is written to the trace attached to that
// context, if there is one, and returned to the caller.
func (n *node) processSchemaMutations(pid uint32, index uint64, s *protos.SchemaUpdate) error {
	ctx, has := n.props.Ctx(pid)
	if !has {
		ctx = n.ctx
	}
	rv := x.RaftValue{Group: n.gid, Index: index}
	// Derive from ctx, not n.ctx, so the proposal's own context looked up
	// above (and any trace attached to it) is actually the one used.
	ctx = context.WithValue(ctx, "raft", rv)
	if err := runSchemaMutation(ctx, s); err != nil {
		if tr, ok := trace.FromContext(ctx); ok {
			// Pass the message as an argument: used as the format string, any
			// '%' in the error text would be misread as a verb.
			tr.LazyPrintf("%s", err.Error())
		}
		return err
	}
	return nil
}
Manish R Jain
committed
// applyConfChange applies the raft configuration change carried by entry e.
//
// When the change has a context attached, it is decoded as a
// protos.RaftContext and n.Connect is called with that peer's id and address.
// The change is then handed to raft, the resulting ConfState is stored on the
// node, and e.Index is marked done on the applied and sync watermarks.
func (n *node) applyConfChange(e raftpb.Entry) {
	var cc raftpb.ConfChange
	// Fail loudly on an undecodable entry; ignoring the error would apply a
	// zero-valued ConfChange as if it were the committed one.
	x.Check(cc.Unmarshal(e.Data))

	if len(cc.Context) > 0 {
		var rc protos.RaftContext
		x.Check(rc.Unmarshal(cc.Context))
		n.Connect(rc.Id, rc.Addr)
	}

	cs := n.Raft().ApplyConfChange(cc)
	n.SetConfState(cs)
	n.Applied.Done(e.Index)
	posting.SyncMarkFor(n.gid).Done(e.Index)
}
func (n *node) processApplyCh() {
for e := range n.applyCh {
n.Applied.Done(e.Index)
posting.SyncMarkFor(n.gid).Done(e.Index)
if e.Type == raftpb.EntryConfChange {
n.applyConfChange(e)
Manish R Jain
committed
}
x.AssertTrue(e.Type == raftpb.EntryNormal)
proposal := &protos.Proposal{}
if err := proposal.Unmarshal(e.Data); err != nil {
log.Fatalf("Unable to unmarshal proposal: %v %q\n", err, e.Data)
// One final applied and synced watermark would be emitted when proposal ctx ref count
// becomes zero.
Janardhan Reddy
committed
if !n.props.Has(proposal.Id) {
pctx := &proposalCtx{
ch: make(chan error, 1),
ctx: n.ctx,
n: n,
}
n.props.Store(proposal.Id, pctx)
}
n.sch.schedule(proposal, e.Index)
} else if proposal.Membership != nil {
x.Fatalf("Dgraph does not handle membership proposals anymore.")
Manish R Jain
committed
}
}
func (n *node) retrieveSnapshot(peerID uint64) {
pool, err := n.GetPeerPool(peerID)
// err is just going to be errNoConnection
log.Fatalf("Cannot retrieve snapshot from peer %v, no connection. Error: %v\n",
peerID, err)
Manish R Jain
committed
defer conn.Get().Release(pool)
lastIndex, err := n.Store.LastIndex()
x.Checkf(err, "Error while getting last index")
// Wait for watermarks to sync since populateShard writes directly to db, otherwise
// the values might get overwritten
// Safe to keep this line
// Need to clear pl's stored in memory for the case when retrieving snapshot with
// index greater than this node's last index
// Should invalidate/remove pl's to this group only ideally
Janardhan Reddy
committed
posting.EvictGroup(n.gid)
Janardhan Reddy
committed
if _, err := populateShard(n.ctx, pstore, pool, n.gid); err != nil {
// TODO: We definitely don't want to just fall flat on our face if we can't
// retrieve a simple snapshot.
log.Fatalf("Cannot retrieve snapshot from peer %v, error: %v\n", peerID, err)
}
// Populate shard stores the streamed data directly into db, so we need to refresh
// schema for current group id
x.Checkf(schema.LoadFromDb(n.gid), "Error while initilizating schema")
Manish R Jain
committed
}
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
// linReadReq is a single request for a linearizable read index.
type linReadReq struct {
	// indexCh is used once: the resulting raft index is sent on it.
	indexCh chan<- uint64
}
// readIndex submits a linearizable read request on n.requestCh and returns the
// one-slot channel on which the resulting raft index will be delivered. If ctx
// is done before the request can be handed over, it returns ctx's error and a
// nil channel.
func (n *node) readIndex(ctx context.Context) (chan uint64, error) {
	replyCh := make(chan uint64, 1)
	req := linReadReq{indexCh: replyCh}
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case n.requestCh <- req:
		return replyCh, nil
	}
}
// runReadIndexLoop serves linearizable read requests until stop fires.
//
// Requests arriving on requestCh are gathered into a batch, a single raft
// ReadIndex call tagged with a fresh nonce is issued for the batch, and every
// request in the batch is answered with the index of the ReadState whose
// RequestCtx matches that nonce. If no matching ReadState arrives on
// readStateCh within 10ms, every request is answered with raft.None instead.
// ReadStates that arrive while no batch is outstanding are dropped.
//
// finished is closed when the loop returns.
func (n *node) runReadIndexLoop(stop <-chan struct{}, finished chan<- struct{},
	requestCh <-chan linReadReq, readStateCh <-chan raft.ReadState) {
	defer close(finished)
	counter := x.NewNonceCounter()
	var requests []linReadReq
	// We maintain one linearizable ReadIndex request at a time. Others wait queued behind
	// requestCh.
	for {
		select {
		case <-stop:
			return
		case <-readStateCh:
			// Do nothing, discard ReadState info we don't have an activeRctx for
		case req := <-requestCh:
			// Batch this request with whatever else is already waiting.
			requests = append(requests, req)
		slurpLoop:
			for {
				select {
				case req = <-requestCh:
					requests = append(requests, req)
				default:
					break slurpLoop
				}
			}
			activeRctx := counter.Generate()
			// We ignore the err - it would be n.ctx cancellation (which we must ignore because
			// it's our duty to continue until `stop` is triggered) or raft.ErrStopped (which we
			// must ignore for the same reason).
			_ = n.Raft().ReadIndex(n.ctx, activeRctx[:])
			// To see if the ReadIndex request succeeds, we need to use a timeout and wait for a
			// successful response. If we don't see one, the raft leader wasn't configured, or the
			// raft leader didn't respond.

			// This is supposed to use context.Background(). We don't want to cancel the timer
			// externally. We want equivalent functionality to time.NewTimer.
			timer, cancelTimer := context.WithTimeout(context.Background(), 10*time.Millisecond)
			// Stays raft.None unless a matching ReadState shows up before the timeout.
			index := raft.None
		waitLoop:
			for {
				select {
				case <-stop:
					cancelTimer()
					return
				case rs := <-readStateCh:
					if !bytes.Equal(activeRctx[:], rs.RequestCtx) {
						// Reply to some other ReadIndex call; keep waiting for ours.
						continue
					}
					index = rs.Index
				case <-timer.Done():
				}
				break waitLoop
			}
			// Call cancel on every path, including the timeout one.
			cancelTimer()
			for _, req := range requests {
				req.indexCh <- index
			}
			requests = requests[:0]
		}
	}
}
Manish R Jain
committed
func (n *node) Run() {
// See also our configuration of HeartbeatTick and ElectionTick.
ticker := time.NewTicker(20 * time.Millisecond)
rcBytes, err := n.RaftContext.Marshal()
// This chan could have capacity zero, because runReadIndexLoop never blocks without selecting
// on readStateCh. It's 2 so that sending rarely blocks (so the Go runtime doesn't have to
// switch threads as much.)
readStateCh := make(chan raft.ReadState, 2)
{
// We only stop runReadIndexLoop after the for loop below has finished interacting with it.
// That way we know sending to readStateCh will not deadlock.
finished := make(chan struct{})
stop := make(chan struct{})
defer func() { <-finished }()
defer close(stop)
go n.runReadIndexLoop(stop, finished, n.requestCh, readStateCh)
}
Manish R Jain
committed
for {
select {
Manish R Jain
committed
case rd := <-n.Raft().Ready():
for _, rs := range rd.ReadStates {
readStateCh <- rs
}
// TODO: Consider if we need to quickly update membership info.
leader = rd.RaftState == raft.StateLeader
}
if leader {
// Leader can send messages in parallel with writing to disk.
for _, msg := range rd.Messages {
// NOTE: We can do some optimizations here to drop messages.
msg.Context = rcBytes
n.Send(msg)
}
}
// First store the entries, then the hardstate and snapshot.
x.Check(n.Wal.Store(n.gid, rd.HardState, rd.Entries))
x.Check(n.Wal.StoreSnapshot(n.gid, rd.Snapshot))
// Now store them in the in-memory store.
n.SaveToStorage(rd.Snapshot, rd.HardState, rd.Entries)
Manish R Jain
committed
if !raft.IsEmptySnap(rd.Snapshot) {
// We don't send snapshots to other nodes. But, if we get one, that means
// either the leader is trying to bring us up to state; or this is the
// snapshot that I created. Only the former case should be handled.
var rc protos.RaftContext
x.AssertTrue(rc.Group == n.gid)
if rc.Id != n.Id {
// NOTE: Retrieving snapshot here is OK, after storing it above in WAL, because
// rc.Id != n.Id.
x.Printf("-------> SNAPSHOT [%d] from %d\n", n.gid, rc.Id)
// It's ok to block tick while retrieving snapshot, since it's a follower
n.retrieveSnapshot(rc.Id)
x.Printf("-------> SNAPSHOT [%d]. DONE.\n", n.gid)
x.Printf("-------> SNAPSHOT [%d] from %d [SELF]. Ignoring.\n", n.gid, rc.Id)
Manish R Jain
committed
}
Manish R Jain
committed
if len(rd.CommittedEntries) > 0 {
if tr, ok := trace.FromContext(n.ctx); ok {
tr.LazyPrintf("Found %d committed entries", len(rd.CommittedEntries))
}
Manish R Jain
committed
}
// Now schedule or apply committed entries.
for _, entry := range rd.CommittedEntries {
// Need applied watermarks for schema mutation also for read linearizability
// Applied watermarks needs to be emitted as soon as possible sequentially.
// If we emit Mark{4, false} and Mark{4, true} before emitting Mark{3, false}
// then doneUntil would be set as 4 as soon as Mark{4,true} is done and before
// Mark{3, false} is emitted. So it's safer to emit watermarks as soon as
// possible sequentially
n.Applied.Begin(entry.Index)
posting.SyncMarkFor(n.gid).Begin(entry.Index)
if !leader && entry.Type == raftpb.EntryConfChange {
// Config changes in followers must be applied straight away.
n.applyConfChange(entry)
} else {
// Just queue up to be processed. Don't wait on them.
// TODO: Stop accepting requests when applyCh is full
// Just queue up to be processed. Don't wait on them.
n.applyCh <- entry
}
Manish R Jain
committed
}
if !leader {
// Followers should send messages later.
for _, msg := range rd.Messages {
// NOTE: We can do some optimizations here to drop messages.
msg.Context = rcBytes
n.Send(msg)
if firstRun && n.canCampaign {
go n.Raft().Campaign(n.ctx)
Manish R Jain
committed
if peerId, has := groups().Peer(n.gid, Config.RaftId); has && n.AmLeader() {
n.Raft().TransferLeadership(n.ctx, Config.RaftId, peerId)
go func() {
select {
case <-n.ctx.Done(): // time out
if tr, ok := trace.FromContext(n.ctx); ok {
tr.LazyPrintf("context timed out while transfering leadership")
}
if tr, ok := trace.FromContext(n.ctx); ok {
tr.LazyPrintf("Timed out transfering leadership")
}
}
n.Raft().Stop()
close(n.done)
}()
} else {
n.Raft().Stop()
close(n.done)
}
case <-n.done:
Manish R Jain
committed
return
}
}
}
// Stop signals Run through n.stop and then blocks until n.done becomes
// readable. If n.done is already readable it returns straight away.
func (n *node) Stop() {
	select {
	case <-n.done:
		// already stopped.
		return
	case n.stop <- struct{}{}:
	}
	// wait for Run to respond.
	<-n.done
}
func (n *node) snapshotPeriodically() {
if n.gid == 0 {
// Group zero is dedicated for membership information, whose state we don't persist.
// So, taking snapshots would end up deleting the RAFT entries that we need to
// regenerate the state on a crash. Therefore, don't take snapshots.
return
}
ticker := time.NewTicker(time.Minute)
defer ticker.Stop()
for {
select {
n.snapshot(Config.MaxPendingCount)
Manish R Jain
committed
case <-n.done:
return
}
Manish R Jain
committed
}
}
if n.gid == 0 {
// Group zero is dedicated for membership information, whose state we don't persist.
// So, taking snapshots would end up deleting the RAFT entries that we need to
// regenerate the state on a crash. Therefore, don't take snapshots.
return
}
water := posting.SyncMarkFor(n.gid)
le := water.DoneUntil()
existing, err := n.Store.Snapshot()
x.Checkf(err, "Unable to get existing snapshot")
si := existing.Metadata.Index
if le <= si+skip {
return
}
snapshotIdx := le - skip
if tr, ok := trace.FromContext(n.ctx); ok {
tr.LazyPrintf("Taking snapshot for group: %d at watermark: %d\n", n.gid, snapshotIdx)
}
rc, err := n.RaftContext.Marshal()
s, err := n.Store.CreateSnapshot(snapshotIdx, n.ConfState(), rc)
x.Checkf(n.Store.Compact(snapshotIdx), "While compacting snapshot")
x.Check(n.Wal.StoreSnapshot(n.gid, s))
// Get leader information for MY group.
Manish R Jain
committed
n.Connect(pid, paddr)
x.Printf("joinPeers connected with: %q with peer id: %d\n", paddr, pid)
Manish R Jain
committed
Manish R Jain
committed
pool, err := conn.Get().Get(paddr)
if err != nil {
log.Fatalf("Unable to get pool for addr: %q for peer: %d, error: %v\n", paddr, pid, err)
}
Manish R Jain
committed
defer conn.Get().Release(pool)
Manish R Jain
committed
Manish R Jain
committed
// Bring the instance up to speed first.
// Raft would decide whether a snapshot needs to be fetched or not,
// so populateShard is not needed
// _, err := populateShard(n.ctx, pool, n.gid)
// x.Checkf(err, "Error while populating shard")
Manish R Jain
committed
x.Printf("Calling JoinCluster")
_, err = c.JoinCluster(n.ctx, n.RaftContext)
// TODO: This should keep on indefinitely trying to join the cluster, instead of crashing.
Manish R Jain
committed
x.Checkf(err, "Error while joining cluster")
x.Printf("Done with JoinCluster call\n")
}
// InitAndStartNode gets called after having at least one membership sync with the cluster.
func (n *node) InitAndStartNode(wal *raftwal.Wal) {
idx, restart, err := n.InitFromWal(wal)
n.Applied.SetDoneUntil(idx)
posting.SyncMarkFor(n.gid).SetDoneUntil(idx)
x.Printf("Restarting node for group: %d\n", n.gid)
_, found := groups().Server(Config.RaftId, n.gid)
if !found && groups().HasPeer(n.gid) {
n.joinPeers()
}
n.SetRaft(raft.RestartNode(n.Cfg))
x.Printf("New Node for group: %d\n", n.gid)
if groups().HasPeer(n.gid) {
n.joinPeers()
n.SetRaft(raft.StartNode(n.Cfg, nil))
peers := []raft.Peer{{ID: n.Id}}
n.SetRaft(raft.StartNode(n.Cfg, peers))
// Trigger election, so this node can become the leader of this single-node cluster.
n.canCampaign = true
Manish R Jain
committed
go n.Run()
// TODO: Find a better way to snapshot, so we don't lose the membership
// state information, which isn't persisted.
go n.BatchAndSendMessages()
Manish R Jain
committed
}
func (n *node) AmLeader() bool {
r := n.Raft()
return r.Status().Lead == r.Status().ID
Manish R Jain
committed
}
// waitLinearizableRead blocks until a linearizable read is safe to serve.
//
// It requests a read index, waits for the reply, and then waits for the
// applied watermark to reach that index. It returns an error when the request
// cannot be submitted, when the reply is raft.None, when waiting on the
// watermark fails, or when ctx is done first.
//
// NOTE(review): n is not declared in this function as shown; presumably the
// node for gid is looked up before this point - confirm.
func waitLinearizableRead(ctx context.Context, gid uint32) error {
	replyCh, err := n.readIndex(ctx)
	if err != nil {
		return err
	}

	var index uint64
	select {
	case <-ctx.Done():
		return ctx.Err()
	case index = <-replyCh:
	}

	if index == raft.None {
		return x.Errorf("cannot get linearized read (time expired or no configured leader)")
	}
	if waitErr := n.Applied.WaitForMark(ctx, index); waitErr != nil {
		return waitErr
	}
	return nil
}