diff --git a/rdf/state.go b/rdf/state.go index 66b8bf16a97b9b976c36526c0e8ec765b30db581..3ed0cdbe479d3dfb6aab7014bc43ebb2865f9ca7 100644 --- a/rdf/state.go +++ b/rdf/state.go @@ -21,11 +21,14 @@ package rdf import "github.com/dgraph-io/dgraph/lex" const ( - itemText lex.ItemType = 5 + iota // plain text - itemSubject // subject - itemPredicate // predicate - itemObject // object - itemLabel // label + itemText lex.ItemType = 5 + iota // plain text + itemSubject // subject, 6 + itemPredicate // predicate, 7 + itemObject // object, 8 + itemLabel // label, 9 + itemLiteral // literal, 10 + itemLanguage // language, 11 + itemObjectType // object type, 12 ) const ( @@ -65,6 +68,15 @@ Loop: } else { return l.Errorf("Invalid input: %v at lexText", r) } + + case r == '"': + if l.Depth != AT_OBJECT { + return l.Errorf("Invalid quote for non-object.") + } + l.Backup() + l.Emit(itemText) + return lexObject + case r == '.' || r == lex.EOF: break Loop } @@ -85,6 +97,29 @@ func lexUntilClosing(l *lex.Lexer, styp lex.ItemType, return l.Errorf("Unexpected end of subject") } if r == '>' { + l.Emit(styp) + return sfn + } + return l.Errorf("Invalid character %v found for itemType: %v", r, styp) +} + +// Assumes that the current rune is '_'. +func lexBlankNode(l *lex.Lexer, styp lex.ItemType, + sfn lex.StateFn) lex.StateFn { + + r := l.Next() + if r != ':' { + return l.Errorf("Invalid input RDF Blank Node found at pos: %v", r) + } + // RDF Blank Node. + // TODO: At some point do checkings based on the guidelines. For now, + // just accept everything until space. + l.AcceptUntil(isSpace) + r = l.Peek() + if r == lex.EOF { + return l.Errorf("Unexpected end of subject") + } + if isSpace(r) { l.Emit(styp) l.Depth += 1 return sfn @@ -95,27 +130,12 @@ func lexUntilClosing(l *lex.Lexer, styp lex.ItemType, func lexSubject(l *lex.Lexer) lex.StateFn { r := l.Next() if r == '<' { + l.Depth += 1 return lexUntilClosing(l, itemSubject, lexText) } if r == '_' { - r = l.Next() - if r != ':' { - return l.Errorf("Invalid input RDF Blank Node found at pos: %v", r) - } - // RDF Blank Node. - // TODO: At some point do checkings based on the guidelines. For now, - // just accept everything until space. - l.AcceptUntil(isSpace) - r = l.Peek() - if r == lex.EOF { - return l.Errorf("Unexpected end of subject") - } - if isSpace(r) { - l.Emit(itemSubject) - l.Depth += 1 - return lexText - } + return lexBlankNode(l, itemSubject, lexText) } return l.Errorf("Invalid character during lexSubject: %v", r) @@ -126,14 +146,76 @@ func lexPredicate(l *lex.Lexer) lex.StateFn { if r != '<' { return l.Errorf("Invalid character in lexPredicate: %v", r) } + l.Depth += 1 return lexUntilClosing(l, itemPredicate, lexText) } +func lexLanguage(l *lex.Lexer) lex.StateFn { + r := l.Next() + if r != '@' { + return l.Errorf("Expected @ prefix for lexLanguage") + } + l.Ignore() + r = l.Next() + if !isLangTagPrefix(r) { + return l.Errorf("Invalid language tag prefix: %v", r) + } + l.AcceptRun(isLangTag) + l.Emit(itemLanguage) + return lexText +} + +// Assumes '"' has already been encountered. +func lexLiteral(l *lex.Lexer) lex.StateFn { + l.AcceptUntil(isEndLiteral) + l.Emit(itemLiteral) + l.Next() // Move to end literal. + l.Ignore() // Ignore end literal. + l.Depth += 1 + + r := l.Peek() + if r == '@' { + return lexLanguage(l) + + } else if r == '^' { + return lexObjectType(l) + + } else { + return lexText + } +} + +func lexObjectType(l *lex.Lexer) lex.StateFn { + r := l.Next() + if r != '^' { + return l.Errorf("Expected ^ for lexObjectType") + } + r = l.Next() + if r != '^' { + return l.Errorf("Expected ^^ for lexObjectType") + } + l.Ignore() + r = l.Next() + if r != '<' { + return l.Errorf("Expected < for lexObjectType") + } + return lexUntilClosing(l, itemObjectType, lexText) +} + func lexObject(l *lex.Lexer) lex.StateFn { r := l.Next() if r == '<' { + l.Depth += 1 return lexUntilClosing(l, itemObject, lexText) } + if r == '_' { + return lexBlankNode(l, itemObject, lexText) + } + if r == '"' { + l.Ignore() + return lexLiteral(l) + } + return l.Errorf("Invalid char: %v at lexObject", r) } @@ -144,3 +226,33 @@ func isClosingBracket(r rune) bool { func isSpace(r rune) bool { return r == '\u0009' || r == '\u0020' } + +func isEndLiteral(r rune) bool { + return r == '"' || r == '\u000d' || r == '\u000a' +} + +func isLangTagPrefix(r rune) bool { + switch { + case r >= 'a' && r <= 'z': + return true + case r >= 'A' && r <= 'Z': + return true + default: + return false + } +} + +func isLangTag(r rune) bool { + if isLangTagPrefix(r) { + return true + } + + switch { + case r == '-': + return true + case r >= '0' && r <= '9': + return true + default: + return false + } +} diff --git a/rdf/state_test.go b/rdf/state_test.go index ae8e3c93f1aaf950dc592beb3cc722be8be286c1..97c33782f40c426184bacba83836173455a8e7db 100644 --- a/rdf/state_test.go +++ b/rdf/state_test.go @@ -41,6 +41,36 @@ var testNQuads = []struct { attr: "<predicate>", valueid: "<Object_id>", }, + { + input: `_:alice <follows> _:bob0 .`, + entity: "_:alice", + attr: "<follows>", + valueid: "_:bob0", + }, + { + input: `_:alice <name> "Alice In Wonderland" .`, + entity: "_:alice", + attr: "<name>", + valueid: "Alice In Wonderland", + }, + { + input: `_:alice <name> "Alice In Wonderland"@en-0 .`, + entity: "_:alice", + attr: "<name>", + valueid: "Alice In Wonderland", + }, + { + input: `_:alice <age> "013"^^<integer> .`, + entity: "_:alice", + attr: "<age>", + valueid: "Alice In Wonderland", + }, + { + input: `<http://www.w3.org/2001/sw/RDFCore/ntriples/> <http://purl.org/dc/terms/title> "N-Triples"@en-US .`, + entity: "<http://www.w3.org/2001/sw/RDFCore/ntriples/>", + attr: "<http://purl.org/dc/terms/title>", + valueid: "Alice In Wonderland", + }, } func TestLex(t *testing.T) {