diff --git a/lex/lexer.go b/lex/lexer.go
index 0bd922b07a7e84078f33e313a05041548907f4db..ec88d5702822f8ed0f32b49f2bd0f666dfedea79 100644
--- a/lex/lexer.go
+++ b/lex/lexer.go
@@ -126,15 +126,29 @@ func (l *Lexer) Ignore() {
 	l.Start = l.Pos
 }
 
-type checkRune func(r rune) bool
+// CheckRune is a predicate over a single rune, used by AcceptRun and AcceptUntil.
+type CheckRune func(r rune) bool
 
-func (l *Lexer) AcceptRun(c checkRune) {
+// AcceptRun consumes runes while c reports true, stopping early at EOF, and
+// then calls Backup once.
+func (l *Lexer) AcceptRun(c CheckRune) {
 	for {
 		r := l.Next()
-		if !c(r) {
+		if r == EOF || !c(r) {
 			break
 		}
 	}
+	l.Backup()
+}
 
+// AcceptUntil consumes runes until c reports true or EOF is reached, and
+// then calls Backup once.
+func (l *Lexer) AcceptUntil(c CheckRune) {
+	for {
+		r := l.Next()
+		if r == EOF || c(r) {
+			break
+		}
+	}
 	l.Backup()
 }
diff --git a/rdf/state.go b/rdf/state.go
new file mode 100644
index 0000000000000000000000000000000000000000..66b8bf16a97b9b976c36526c0e8ec765b30db581
--- /dev/null
+++ b/rdf/state.go
@@ -0,0 +1,174 @@
+/*
+ * Copyright 2015 Manish R Jain <manishrjain@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Package rdf parses N-Quad statements based on
+// http://www.w3.org/TR/n-quads/
+package rdf
+
+import "github.com/dgraph-io/dgraph/lex"
+
+// Item types for the tokens of an N-Quad statement.
+const (
+	itemText      lex.ItemType = 5 + iota // plain text
+	itemSubject                           // subject
+	itemPredicate                         // predicate
+	itemObject                            // object
+	itemLabel                             // label
+)
+
+// Values compared against lex.Lexer.Depth in lexText to pick the state for
+// the next term of the statement.
+const (
+	AT_SUBJECT int = iota
+	AT_PREDICATE
+	AT_OBJECT
+	AT_LABEL
+)
+
+// run drives the state machine, starting at lexText, until a state function
+// returns nil. It then closes l.Items so receivers ranging over it terminate.
+func run(l *lex.Lexer) {
+	for state := lexText; state != nil; {
+		state = state(l)
+	}
+	close(l.Items) // No more tokens.
+}
+
+// lexText scans the text between terms. On '<' or '_' it emits the pending
+// text and hands over to the state for the term expected at the current depth.
+// On '.' or EOF it emits any pending text followed by lex.ItemEOF and stops
+// the state machine.
+func lexText(l *lex.Lexer) lex.StateFn {
+Loop:
+	for {
+		switch r := l.Next(); {
+		case r == '<' || r == '_':
+			var next lex.StateFn
+			switch l.Depth {
+			case AT_SUBJECT:
+				next = lexSubject
+			case AT_PREDICATE:
+				next = lexPredicate
+			case AT_OBJECT:
+				next = lexObject
+			default:
+				// %q prints the rune itself instead of its code point.
+				return l.Errorf("Invalid input: %q at lexText", r)
+			}
+			l.Backup()
+			l.Emit(itemText) // emit whatever we have so far.
+			return next
+		case r == '.' || r == lex.EOF:
+			break Loop
+		}
+	}
+	if l.Pos > l.Start {
+		l.Emit(itemText)
+	}
+	l.Emit(lex.ItemEOF)
+	return nil
+}
+
+// lexUntilClosing consumes input up to and including the next '>', emits it
+// as styp, increments l.Depth and returns sfn. It returns an error state when
+// the input ends before a '>' is found.
+func lexUntilClosing(l *lex.Lexer, styp lex.ItemType,
+	sfn lex.StateFn) lex.StateFn {
+
+	l.AcceptUntil(isClosingBracket)
+	r := l.Next()
+	if r == lex.EOF {
+		// Shared by subject, predicate and object, so report the item type
+		// rather than assuming a subject.
+		return l.Errorf("Unexpected end of input for itemType: %v", styp)
+	}
+	if r == '>' {
+		l.Emit(styp)
+		l.Depth++
+		return sfn
+	}
+	return l.Errorf("Invalid character %q found for itemType: %v", r, styp)
+}
+
+// lexBlankNode lexes an RDF blank node ("_:label") whose leading '_' has
+// already been consumed. Everything up to the next space or tab is emitted as
+// styp, l.Depth is incremented and sfn is returned.
+func lexBlankNode(l *lex.Lexer, styp lex.ItemType,
+	sfn lex.StateFn) lex.StateFn {
+
+	if r := l.Next(); r != ':' {
+		return l.Errorf("Invalid character %q in RDF Blank Node", r)
+	}
+	// TODO: At some point do checkings based on the guidelines. For now,
+	// just accept everything until space.
+	l.AcceptUntil(isSpace)
+	r := l.Peek()
+	if r == lex.EOF {
+		return l.Errorf("Unexpected end of input for itemType: %v", styp)
+	}
+	if !isSpace(r) {
+		return l.Errorf("Invalid character %q found for itemType: %v", r, styp)
+	}
+	l.Emit(styp)
+	l.Depth++
+	return sfn
+}
+
+// lexSubject lexes the subject term: an IRI in angle brackets or a blank node.
+func lexSubject(l *lex.Lexer) lex.StateFn {
+	switch r := l.Next(); r {
+	case '<':
+		return lexUntilClosing(l, itemSubject, lexText)
+	case '_':
+		return lexBlankNode(l, itemSubject, lexText)
+	default:
+		return l.Errorf("Invalid character during lexSubject: %q", r)
+	}
+}
+
+// lexPredicate lexes the predicate term, which must be an IRI in angle
+// brackets.
+func lexPredicate(l *lex.Lexer) lex.StateFn {
+	r := l.Next()
+	if r != '<' {
+		return l.Errorf("Invalid character in lexPredicate: %q", r)
+	}
+	return lexUntilClosing(l, itemPredicate, lexText)
+}
+
+// lexObject lexes the object term: an IRI in angle brackets or a blank node.
+// lexText also routes '_' here, so blank nodes are accepted like subjects.
+// TODO: literal objects are not handled yet.
+func lexObject(l *lex.Lexer) lex.StateFn {
+	switch r := l.Next(); r {
+	case '<':
+		return lexUntilClosing(l, itemObject, lexText)
+	case '_':
+		return lexBlankNode(l, itemObject, lexText)
+	default:
+		return l.Errorf("Invalid char: %q at lexObject", r)
+	}
+}
+
+// isClosingBracket reports whether r is '>', the rune that ends an IRI.
+func isClosingBracket(r rune) bool {
+	return r == '>'
+}
+
+// isSpace reports whether r is a tab (U+0009) or a space (U+0020).
+func isSpace(r rune) bool {
+	return r == '\u0009' || r == '\u0020'
+}
diff --git a/rdf/state_test.go b/rdf/state_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..ae8e3c93f1aaf950dc592beb3cc722be8be286c1
--- /dev/null
+++ b/rdf/state_test.go
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2015 Manish R Jain <manishrjain@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package rdf
+
+import (
+	"testing"
+
+	"github.com/dgraph-io/dgraph/lex"
+)
+
+// testNQuads lists N-Quad inputs with the subject, predicate and object
+// tokens the lexer is expected to emit for each.
+var testNQuads = []struct {
+	input   string
+	entity  string
+	attr    string
+	valueid string
+	value   interface{}
+}{
+	{
+		input:   `<some_subject_id> <predicate> <Object_id> .`,
+		entity:  "<some_subject_id>",
+		attr:    "<predicate>",
+		valueid: "<Object_id>",
+	},
+	{
+		input:   `_:alice <predicate> <Object_id> .`,
+		entity:  "_:alice",
+		attr:    "<predicate>",
+		valueid: "<Object_id>",
+	},
+	{
+		input:   `_:alice <predicate> _:bob .`,
+		entity:  "_:alice",
+		attr:    "<predicate>",
+		valueid: "_:bob",
+	},
+}
+
+// TestLex checks that every test input yields the expected subject,
+// predicate and object items, and that none of them is missing.
+func TestLex(t *testing.T) {
+	for _, test := range testNQuads {
+		l := lex.NewLexer(test.input)
+		go run(l)
+
+		// Track which terms were emitted so that a lexer that stops early
+		// or emits nothing fails the test instead of passing silently.
+		var gotSubject, gotPredicate, gotObject bool
+		for item := range l.Items {
+			t.Logf("Item: %v", item)
+			switch item.Typ {
+			case itemSubject:
+				gotSubject = true
+				if item.Val != test.entity {
+					t.Errorf("Expected: %v. Got: %v", test.entity, item.Val)
+				}
+			case itemPredicate:
+				gotPredicate = true
+				if item.Val != test.attr {
+					t.Errorf("Expected: %v. Got: %v", test.attr, item.Val)
+				}
+			case itemObject:
+				gotObject = true
+				if item.Val != test.valueid {
+					t.Errorf("Expected: %v. Got: %v", test.valueid, item.Val)
+				}
+			}
+		}
+		if !gotSubject || !gotPredicate || !gotObject {
+			t.Errorf("Input %q: missing items (subject: %v, predicate: %v, object: %v)",
+				test.input, gotSubject, gotPredicate, gotObject)
+		}
+	}
+}