C++ — Boost.Spirit SQL grammar/lexer failure
I have two problems with the following SQL grammar:
#define boost_spirit_qi_debug #include <boost/spirit/include/qi.hpp> #include <boost/spirit/include/lex_lexertl.hpp> #include <boost/spirit/include/phoenix.hpp> #include <boost/spirit/include/karma.hpp> #include <boost/fusion/include/adapt_struct.hpp> #include <boost/fusion/include/std_pair.hpp> #include <boost/algorithm/string.hpp> #include <boost/shared_ptr.hpp> #include <boost/make_shared.hpp> #include <boost/lexical_cast.hpp> #include <iostream> #include <fstream> #include <string> #include <set> #include <utility> namespace bs = boost::spirit; namespace lex = boost::spirit::lex; namespace qi = boost::spirit::qi; namespace phx = boost::phoenix; // token definition base, defines tokens base grammar below template <typename lexer> struct sql_tokens : lex::lexer<lexer> { public: // tokens no attributes. lex::token_def<lex::omit> type_smallint, type_int, type_varchar, type_text, type_date; lex::token_def<lex::omit> kw_not_null, kw_auto_increment, kw_unique, kw_default, kw_create, kw_table, kw_constraint, kw_primary_key; // attributed tokens. (if add new type, don't forget add lex::lexertl::token definition too). lex::token_def<int> signed_digit; lex::token_def<std::size_t> unsigned_digit; lex::token_def<std::string> identifier; lex::token_def<std::string> quoted_string; sql_tokens() { // column data types. type_smallint = "(?i:smallint)"; type_int = "(?i:int)"; type_varchar = "(?i:varchar)"; type_text = "(?i:text)"; type_date = "(?i:date)"; // keywords. kw_not_null = "(?i:not +null)"; kw_auto_increment = "(?i:auto_increment)"; kw_unique = "(?i:unique)"; kw_default = "(?i:default)"; kw_create = "(?i:create)"; kw_table = "(?i:table)"; kw_constraint = "(?i:constraint)"; kw_primary_key = "(?i:primary +key)"; // values. signed_digit = "[+-]?[0-9]+"; unsigned_digit = "[0-9]+"; quoted_string = "\\\"(\\\\.|[^\\\"])*\\\""; // \"(\\.|[^\"])*\" // identifier. identifier = "[a-za-z][a-za-z0-9_]*"; // token must added in priority order. 
this->self += lex::token_def<>('(') | ')' | ',' | ';'; this->self += type_smallint | type_int | type_varchar | type_text | type_date; this->self += kw_not_null | kw_auto_increment | kw_unique | kw_default | kw_create | kw_table | kw_constraint | kw_primary_key; this->self += identifier | unsigned_digit | signed_digit | quoted_string; // define whitespace ignore. this->self("ws") = lex::token_def<>("[ \\t\\n]+") | "--[^\\n]*\\n" // single line comments -- | "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/" // c-style comments ; } }; // grammar definition, define little part of sql language. template <typename iterator, typename lexer> struct sql_grammar : qi::grammar<iterator, qi::in_state_skipper<lexer> > { template <typename tokendef> sql_grammar(tokendef const& tok) : sql_grammar::base_type(program, "program") { program = (statement % ';') >> *qi::lit(';') ; statement = create_statement.alias() ; create_statement = tok.kw_create >> create_table ; create_table = tok.kw_table >> tok.identifier >> '(' >> create_table_columns >> -(',' >> table_constraints) >> ')' ; table_constraints = constraint_definition % ',' ; constraint_definition = tok.kw_constraint >> tok.identifier >> primary_key_constraint ; primary_key_constraint = tok.kw_primary_key >> '(' >> (tok.identifier % ',') >> ')' ; create_table_columns = column_definition % ',' ; column_definition = tok.identifier >> column_type >> *type_constraint ; type_constraint = tok.kw_not_null | tok.kw_auto_increment | tok.kw_unique | default_value ; default_value = tok.kw_default > tok.quoted_string ; column_type = tok.type_smallint | tok.type_int | (tok.type_varchar > '(' > tok.unsigned_digit > ')') | tok.type_text | tok.type_date ; program.name("program"); statement.name("statement"); create_statement.name("create statement"); create_table.name("create table"); create_table_columns.name("create table columns"); column_definition.name("column definition"); column_type.name("column type"); default_value.name("default value"); 
type_constraint.name("type constraint"); table_constraints.name("table constraints"); constraint_definition.name("constraint definition"); primary_key_constraint.name("primary key constraint"); boost_spirit_debug_node(program); boost_spirit_debug_node(statement); boost_spirit_debug_node(create_statement); boost_spirit_debug_node(create_table); boost_spirit_debug_node(create_table_columns); boost_spirit_debug_node(column_definition); boost_spirit_debug_node(column_type); boost_spirit_debug_node(default_value); boost_spirit_debug_node(type_constraint); boost_spirit_debug_node(table_constraints); boost_spirit_debug_node(constraint_definition); boost_spirit_debug_node(primary_key_constraint); using namespace qi::labels; qi::on_error<qi::fail> ( program, std::cout << phx::val("error! expecting ") << bs::_4 // failed? << phx::val(" here: \"") << phx::construct<std::string>(bs::_3, bs::_2) // iterators error-pos, end << phx::val("\"") << std::endl ); } private: typedef qi::in_state_skipper<lexer> skipper_type; typedef qi::rule<iterator, skipper_type> simple_rule; simple_rule program, statement, create_statement, create_table, table_constraints, constraint_definition; simple_rule primary_key_constraint, create_table_columns, column_definition, type_constraint, default_value, column_type; }; std::string file2string(const std::string& filename) { std::ifstream s(filename.c_str(), std::ios_base::binary); std::stringstream ss; ss << s.rdbuf(); return ss.str(); } int main(int argc, char* argv[]) { if(argc != 2) { std::cerr << "usage: " << argv[0] << " schema_filename\n"; return 1; } // iterator type used expose underlying input stream typedef std::string::iterator base_iterator_type; // lexer token type use. typedef lex::lexertl::token< base_iterator_type, boost::mpl::vector<int, std::size_t, std::string> > token_type; // here use lexertl based lexer engine. typedef lex::lexertl::lexer<token_type> lexer_type; // token definition type (derived given lexer type). 
typedef sql_tokens<lexer_type> sql_tokens; // iterator type exposed lexer typedef sql_tokens::iterator_type iterator_type; // type of grammar parse typedef sql_grammar<iterator_type, sql_tokens::lexer_def> sql_grammar; // use types defined above create lexer , grammar // object instances needed invoke parsing process sql_tokens tokens; // our lexer sql_grammar sql(tokens); // our parser std::string str(file2string(argv[1])); // @ point generate iterator pair used expose // tokenized input stream. base_iterator_type = str.begin(); iterator_type iter = tokens.begin(it, str.end()); iterator_type end = tokens.end(); // parsing done based on the token stream, not character // stream read input. // note how use lexer defined above skip parser. must // explicitly wrapped inside state directive, switching lexer // state duration of skipping whitespace. std::string ws("ws"); bool r = qi::phrase_parse(iter, end, sql, qi::in_state(ws)[tokens.self]); if (r && iter == end) { std::cout << "-------------------------\n"; std::cout << "parsing succeeded\n"; std::cout << "-------------------------\n"; } else { std::cout << "-------------------------\n"; std::cout << "parsing failed\n"; std::cout << "-------------------------\n"; } return 0; }
Problem 1: a comment at the start of the input
When the file starts with a comment, parsing fails:
/* bouh */ create table mytable ( id int not null auto_increment );
with failing tree:
<program> <try>[/]</try> <statement> <try>[/]</try> <create_statement> <try>[/]</try> <fail/> </create_statement> <fail/> </statement> <fail/> </program>
But if I add a line break before it, it works. Both types of comments ("--" and "/**/") fail this way.
Problem 2: the keyword UNIQUE is not recognized
Parsing fails under a specific condition involving the keyword UNIQUE: it does not work when UNIQUE is in upper case and is directly followed by a comma.
All of the following cases succeed:
-- success create table addon ( id int not null auto_increment, u smallint not null unique ); -- success create table addon ( id int not null auto_increment, u smallint not null unique, s int not null unique ); -- success create table addon ( id int not null auto_increment, u smallint not null unique , s int not null unique ); -- success create table addon ( id int not null auto_increment, u smallint unique not null, s int not null unique );
But this one doesn't:
-- fail create table addon ( id int not null auto_increment, u smallint not null unique, s int not null );
Do you have any idea what is wrong? Thanks!
Regarding the whitespace skipping, I can only conclude that pre-skipping is not being done (perhaps the lexer state is not being switched correctly).
Of course, you could try to remedy this with the `lex::tokenize_and_parse` API (passing the initial state "ws"), but — if I remember the API correctly — that performs manual tokenization, which precludes state switching from Qi in the first place.
However, I tend to make skipping the responsibility of the lexer itself:
ws = "[ \\t\\n]+"; comment = "--[^\\n]*\\n"; // single line comments -- cstyle_comment = "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"; // c-style comments this->self += ws [ lex::_pass = lex::pass_flags::pass_ignore ] | comment [ lex::_pass = lex::pass_flags::pass_ignore ] | cstyle_comment [ lex::_pass = lex::pass_flags::pass_ignore ] ;
Now there is no need to use a skipper at all, and this succeeds in parsing the first problem case (input starting with a comment).
Full code below (Coliru can't handle the compilation :().
Look for `#ifdef STATE_WS` to compare the two approaches:
//#define boost_spirit_qi_debug //#define state_ws #include <boost/spirit/include/qi.hpp> #include <boost/spirit/include/lex_lexertl.hpp> #include <boost/spirit/include/phoenix.hpp> #include <boost/spirit/include/karma.hpp> #include <boost/fusion/include/adapt_struct.hpp> #include <boost/fusion/include/std_pair.hpp> #include <boost/algorithm/string.hpp> #include <boost/shared_ptr.hpp> #include <boost/make_shared.hpp> #include <boost/lexical_cast.hpp> #include <iostream> #include <fstream> #include <string> #include <set> #include <utility> namespace bs = boost::spirit; namespace lex = boost::spirit::lex; namespace qi = boost::spirit::qi; namespace phx = boost::phoenix; // token definition base, defines tokens base grammar below template <typename lexer> struct sql_tokens : lex::lexer<lexer> { public: // tokens no attributes. lex::token_def<lex::omit> type_smallint; lex::token_def<lex::omit> type_int; lex::token_def<lex::omit> type_varchar; lex::token_def<lex::omit> type_text; lex::token_def<lex::omit> type_date; lex::token_def<lex::omit> kw_not_null; lex::token_def<lex::omit> kw_auto_increment; lex::token_def<lex::omit> kw_unique; lex::token_def<lex::omit> kw_default; lex::token_def<lex::omit> kw_create; lex::token_def<lex::omit> kw_table; lex::token_def<lex::omit> kw_constraint; lex::token_def<lex::omit> kw_primary_key; // attributed tokens. (if add new type, don't forget add lex::lexertl::token definition too). lex::token_def<int> signed_digit; lex::token_def<std::size_t> unsigned_digit; lex::token_def<std::string> identifier; lex::token_def<std::string> quoted_string; lex::token_def<lex::omit> ws, comment, cstyle_comment; sql_tokens() { // column data types. type_smallint = "(?i:smallint)"; type_int = "(?i:int)"; type_varchar = "(?i:varchar)"; type_text = "(?i:text)"; type_date = "(?i:date)"; // keywords. 
kw_not_null = "(?i:not +null)"; kw_auto_increment = "(?i:auto_increment)"; kw_unique = "(?i:unique)"; kw_default = "(?i:default)"; kw_create = "(?i:create)"; kw_table = "(?i:table)"; kw_constraint = "(?i:constraint)"; kw_primary_key = "(?i:primary +key)"; // values. signed_digit = "[+-]?[0-9]+"; unsigned_digit = "[0-9]+"; quoted_string = "\\\"(\\\\.|[^\\\"])*\\\""; // \"(\\.|[^\"])*\" // identifier. identifier = "[a-za-z][a-za-z0-9_]*"; // token must added in priority order. this->self += lex::token_def<>('(') | ')' | ',' | ';'; this->self += type_smallint | type_int | type_varchar | type_text | type_date; this->self += kw_not_null | kw_auto_increment | kw_unique | kw_default | kw_create | kw_table | kw_constraint | kw_primary_key; this->self += identifier | unsigned_digit | signed_digit | quoted_string; #ifdef state_ws // define whitespace ignore. this->self("ws") = ws | comment | cstyle_comment ; #else ws = "[ \\t\\n]+"; comment = "--[^\\n]*\\n"; // single line comments -- cstyle_comment = "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"; // c-style comments this->self += ws [ lex::_pass = lex::pass_flags::pass_ignore ] | comment [ lex::_pass = lex::pass_flags::pass_ignore ] | cstyle_comment [ lex::_pass = lex::pass_flags::pass_ignore ] ; #endif } }; // grammar definition, define little part of sql language. 
template <typename iterator, typename lexer> struct sql_grammar #ifdef state_ws : qi::grammar<iterator, qi::in_state_skipper<lexer> > #else : qi::grammar<iterator> #endif { template <typename tokendef> sql_grammar(tokendef const& tok) : sql_grammar::base_type(program, "program") { program = (statement % ';') >> *qi::lit(';') ; statement = create_statement.alias() ; create_statement = tok.kw_create >> create_table ; create_table = tok.kw_table >> tok.identifier >> '(' >> create_table_columns >> -(',' >> table_constraints) >> ')' ; table_constraints = constraint_definition % ',' ; constraint_definition = tok.kw_constraint >> tok.identifier >> primary_key_constraint ; primary_key_constraint = tok.kw_primary_key >> '(' >> (tok.identifier % ',') >> ')' ; create_table_columns = column_definition % ',' ; column_definition = tok.identifier >> column_type >> *type_constraint ; type_constraint = tok.kw_not_null | tok.kw_auto_increment | tok.kw_unique | default_value ; default_value = tok.kw_default > tok.quoted_string ; column_type = tok.type_smallint | tok.type_int | (tok.type_varchar > '(' > tok.unsigned_digit > ')') | tok.type_text | tok.type_date ; program.name("program"); statement.name("statement"); create_statement.name("create statement"); create_table.name("create table"); create_table_columns.name("create table columns"); column_definition.name("column definition"); column_type.name("column type"); default_value.name("default value"); type_constraint.name("type constraint"); table_constraints.name("table constraints"); constraint_definition.name("constraint definition"); primary_key_constraint.name("primary key constraint"); boost_spirit_debug_node(program); boost_spirit_debug_node(statement); boost_spirit_debug_node(create_statement); boost_spirit_debug_node(create_table); boost_spirit_debug_node(create_table_columns); boost_spirit_debug_node(column_definition); boost_spirit_debug_node(column_type); boost_spirit_debug_node(default_value); 
boost_spirit_debug_node(type_constraint); boost_spirit_debug_node(table_constraints); boost_spirit_debug_node(constraint_definition); boost_spirit_debug_node(primary_key_constraint); using namespace qi::labels; qi::on_error<qi::fail> ( program, std::cout << phx::val("error! expecting ") << bs::_4 // failed? << phx::val(" here: \"") << phx::construct<std::string>(bs::_3, bs::_2) // iterators error-pos, end << phx::val("\"") << std::endl ); } private: #ifdef state_ws typedef qi::in_state_skipper<lexer> skipper_type; #else typedef qi::unused_type skipper_type; #endif typedef qi::rule<iterator, skipper_type> simple_rule; simple_rule program, statement, create_statement, create_table, table_constraints, constraint_definition; simple_rule primary_key_constraint, create_table_columns, column_definition, type_constraint, default_value, column_type; }; std::string cin2string() { std::istreambuf_iterator<char> f(std::cin), l; std::string result; std::copy(f, l, std::back_inserter(result)); return result; } int main(int argc, char* argv[]) { // iterator type used expose underlying input stream typedef std::string::const_iterator base_iterator_type; // lexer token type use. typedef lex::lexertl::token< base_iterator_type, boost::mpl::vector<int, std::size_t, std::string> > token_type; #ifdef state_ws typedef lex::lexertl::lexer<token_type> lexer_type; #else typedef lex::lexertl::actor_lexer<token_type> lexer_type; #endif // token definition type (derived given lexer type). typedef sql_tokens<lexer_type> sql_tokens; // iterator type exposed lexer typedef sql_tokens::iterator_type iterator_type; // type of grammar parse typedef sql_grammar<iterator_type, sql_tokens::lexer_def> sql_grammar; // use types defined above create lexer , grammar // object instances needed invoke parsing process sql_tokens tokens; // our lexer sql_grammar sql(tokens); // our parser const std::string str = cin2string(); // @ point generate iterator pair used expose // tokenized input stream. 
base_iterator_type = str.begin(); iterator_type iter = tokens.begin(it, str.end()); iterator_type end = tokens.end(); // parsing done based on the token stream, not character // stream read input. // note how use lexer defined above skip parser. must // explicitly wrapped inside state directive, switching lexer // state duration of skipping whitespace. #ifdef state_ws std::string ws("ws"); bool r = qi::phrase_parse(iter, end, sql, qi::in_state(ws)[tokens.self]); #else bool r = qi::parse(iter, end, sql); #endif if (r && iter == end) { std::cout << "-------------------------\n"; std::cout << "parsing succeeded\n"; std::cout << "-------------------------\n"; } else { std::cout << "-------------------------\n"; std::cout << "parsing failed\n"; std::cout << "-------------------------\n"; } return 0; }
Comments
Post a Comment