C++ — Boost.Spirit SQL grammar/lexer failure
I have two problems with the following SQL grammar:
#define boost_spirit_qi_debug #include <boost/spirit/include/qi.hpp> #include <boost/spirit/include/lex_lexertl.hpp> #include <boost/spirit/include/phoenix.hpp> #include <boost/spirit/include/karma.hpp> #include <boost/fusion/include/adapt_struct.hpp> #include <boost/fusion/include/std_pair.hpp> #include <boost/algorithm/string.hpp> #include <boost/shared_ptr.hpp> #include <boost/make_shared.hpp> #include <boost/lexical_cast.hpp> #include <iostream> #include <fstream> #include <string> #include <set> #include <utility> namespace bs = boost::spirit; namespace lex = boost::spirit::lex; namespace qi = boost::spirit::qi; namespace phx = boost::phoenix; // token definition base, defines tokens base grammar below template <typename lexer> struct sql_tokens : lex::lexer<lexer> { public: // tokens no attributes. lex::token_def<lex::omit> type_smallint, type_int, type_varchar, type_text, type_date; lex::token_def<lex::omit> kw_not_null, kw_auto_increment, kw_unique, kw_default, kw_create, kw_table, kw_constraint, kw_primary_key; // attributed tokens. (if add new type, don't forget add lex::lexertl::token definition too). lex::token_def<int> signed_digit; lex::token_def<std::size_t> unsigned_digit; lex::token_def<std::string> identifier; lex::token_def<std::string> quoted_string; sql_tokens() { // column data types. type_smallint = "(?i:smallint)"; type_int = "(?i:int)"; type_varchar = "(?i:varchar)"; type_text = "(?i:text)"; type_date = "(?i:date)"; // keywords. kw_not_null = "(?i:not +null)"; kw_auto_increment = "(?i:auto_increment)"; kw_unique = "(?i:unique)"; kw_default = "(?i:default)"; kw_create = "(?i:create)"; kw_table = "(?i:table)"; kw_constraint = "(?i:constraint)"; kw_primary_key = "(?i:primary +key)"; // values. signed_digit = "[+-]?[0-9]+"; unsigned_digit = "[0-9]+"; quoted_string = "\\\"(\\\\.|[^\\\"])*\\\""; // \"(\\.|[^\"])*\" // identifier. identifier = "[a-za-z][a-za-z0-9_]*"; // token must added in priority order. 
this->self += lex::token_def<>('(') | ')' | ',' | ';'; this->self += type_smallint | type_int | type_varchar | type_text | type_date; this->self += kw_not_null | kw_auto_increment | kw_unique | kw_default | kw_create | kw_table | kw_constraint | kw_primary_key; this->self += identifier | unsigned_digit | signed_digit | quoted_string; // define whitespace ignore. this->self("ws") = lex::token_def<>("[ \\t\\n]+") | "--[^\\n]*\\n" // single line comments -- | "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/" // c-style comments ; } }; // grammar definition, define little part of sql language. template <typename iterator, typename lexer> struct sql_grammar : qi::grammar<iterator, qi::in_state_skipper<lexer> > { template <typename tokendef> sql_grammar(tokendef const& tok) : sql_grammar::base_type(program, "program") { program = (statement % ';') >> *qi::lit(';') ; statement = create_statement.alias() ; create_statement = tok.kw_create >> create_table ; create_table = tok.kw_table >> tok.identifier >> '(' >> create_table_columns >> -(',' >> table_constraints) >> ')' ; table_constraints = constraint_definition % ',' ; constraint_definition = tok.kw_constraint >> tok.identifier >> primary_key_constraint ; primary_key_constraint = tok.kw_primary_key >> '(' >> (tok.identifier % ',') >> ')' ; create_table_columns = column_definition % ',' ; column_definition = tok.identifier >> column_type >> *type_constraint ; type_constraint = tok.kw_not_null | tok.kw_auto_increment | tok.kw_unique | default_value ; default_value = tok.kw_default > tok.quoted_string ; column_type = tok.type_smallint | tok.type_int | (tok.type_varchar > '(' > tok.unsigned_digit > ')') | tok.type_text | tok.type_date ; program.name("program"); statement.name("statement"); create_statement.name("create statement"); create_table.name("create table"); create_table_columns.name("create table columns"); column_definition.name("column definition"); column_type.name("column type"); default_value.name("default value"); 
type_constraint.name("type constraint"); table_constraints.name("table constraints"); constraint_definition.name("constraint definition"); primary_key_constraint.name("primary key constraint"); boost_spirit_debug_node(program); boost_spirit_debug_node(statement); boost_spirit_debug_node(create_statement); boost_spirit_debug_node(create_table); boost_spirit_debug_node(create_table_columns); boost_spirit_debug_node(column_definition); boost_spirit_debug_node(column_type); boost_spirit_debug_node(default_value); boost_spirit_debug_node(type_constraint); boost_spirit_debug_node(table_constraints); boost_spirit_debug_node(constraint_definition); boost_spirit_debug_node(primary_key_constraint); using namespace qi::labels; qi::on_error<qi::fail> ( program, std::cout << phx::val("error! expecting ") << bs::_4 // failed? << phx::val(" here: \"") << phx::construct<std::string>(bs::_3, bs::_2) // iterators error-pos, end << phx::val("\"") << std::endl ); } private: typedef qi::in_state_skipper<lexer> skipper_type; typedef qi::rule<iterator, skipper_type> simple_rule; simple_rule program, statement, create_statement, create_table, table_constraints, constraint_definition; simple_rule primary_key_constraint, create_table_columns, column_definition, type_constraint, default_value, column_type; }; std::string file2string(const std::string& filename) { std::ifstream s(filename.c_str(), std::ios_base::binary); std::stringstream ss; ss << s.rdbuf(); return ss.str(); } int main(int argc, char* argv[]) { if(argc != 2) { std::cerr << "usage: " << argv[0] << " schema_filename\n"; return 1; } // iterator type used expose underlying input stream typedef std::string::iterator base_iterator_type; // lexer token type use. typedef lex::lexertl::token< base_iterator_type, boost::mpl::vector<int, std::size_t, std::string> > token_type; // here use lexertl based lexer engine. typedef lex::lexertl::lexer<token_type> lexer_type; // token definition type (derived given lexer type). 
typedef sql_tokens<lexer_type> sql_tokens; // iterator type exposed lexer typedef sql_tokens::iterator_type iterator_type; // type of grammar parse typedef sql_grammar<iterator_type, sql_tokens::lexer_def> sql_grammar; // use types defined above create lexer , grammar // object instances needed invoke parsing process sql_tokens tokens; // our lexer sql_grammar sql(tokens); // our parser std::string str(file2string(argv[1])); // @ point generate iterator pair used expose // tokenized input stream. base_iterator_type = str.begin(); iterator_type iter = tokens.begin(it, str.end()); iterator_type end = tokens.end(); // parsing done based on the token stream, not character // stream read input. // note how use lexer defined above skip parser. must // explicitly wrapped inside state directive, switching lexer // state duration of skipping whitespace. std::string ws("ws"); bool r = qi::phrase_parse(iter, end, sql, qi::in_state(ws)[tokens.self]); if (r && iter == end) { std::cout << "-------------------------\n"; std::cout << "parsing succeeded\n"; std::cout << "-------------------------\n"; } else { std::cout << "-------------------------\n"; std::cout << "parsing failed\n"; std::cout << "-------------------------\n"; } return 0; }
Problem 1: a comment at the start of the input
When the file starts with a comment, parsing fails:
/* bouh */ create table mytable ( id int not null auto_increment );
with failing tree:
<program> <try>[/]</try> <statement> <try>[/]</try> <create_statement> <try>[/]</try> <fail/> </create_statement> <fail/> </statement> <fail/> </program>
But if I add a line break before it, it works. Both types of comments ("--" and "/**/") fail this way.
Problem 2: the keyword UNIQUE is not recognized
Parsing fails under a specific condition involving the keyword UNIQUE: it does not work when UNIQUE is in upper case and is directly followed by a comma.
All of the following cases succeed:
-- success create table addon ( id int not null auto_increment, u smallint not null unique ); -- success create table addon ( id int not null auto_increment, u smallint not null unique, s int not null unique ); -- success create table addon ( id int not null auto_increment, u smallint not null unique , s int not null unique ); -- success create table addon ( id int not null auto_increment, u smallint unique not null, s int not null unique );
But this one doesn't:
-- fail create table addon ( id int not null auto_increment, u smallint not null unique, s int not null );
Do you have any idea what is wrong? Thanks!
Regarding the whitespace skipping, I can only conclude that pre-skipping is not being done (perhaps the lexer state is not being switched correctly).
Of course, you could try to remedy this with the `lex::tokenize_and_parse` API (passing the initial state "ws"), but — if I remember the API correctly — that performs manual tokenization, which precludes state switching from Qi in the first place.
However, I tend to make skipping the responsibility of the lexer itself:
ws = "[ \\t\\n]+"; comment = "--[^\\n]*\\n"; // single line comments -- cstyle_comment = "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"; // c-style comments this->self += ws [ lex::_pass = lex::pass_flags::pass_ignore ] | comment [ lex::_pass = lex::pass_flags::pass_ignore ] | cstyle_comment [ lex::_pass = lex::pass_flags::pass_ignore ] ;
Now there is no need to use a skipper at all, and this succeeds in parsing the first problem case (input starting with a comment).
Full code below (Coliru can't handle the compilation :().
Look for `#ifdef STATE_WS` to compare the two approaches:
//#define boost_spirit_qi_debug //#define state_ws #include <boost/spirit/include/qi.hpp> #include <boost/spirit/include/lex_lexertl.hpp> #include <boost/spirit/include/phoenix.hpp> #include <boost/spirit/include/karma.hpp> #include <boost/fusion/include/adapt_struct.hpp> #include <boost/fusion/include/std_pair.hpp> #include <boost/algorithm/string.hpp> #include <boost/shared_ptr.hpp> #include <boost/make_shared.hpp> #include <boost/lexical_cast.hpp> #include <iostream> #include <fstream> #include <string> #include <set> #include <utility> namespace bs = boost::spirit; namespace lex = boost::spirit::lex; namespace qi = boost::spirit::qi; namespace phx = boost::phoenix; // token definition base, defines tokens base grammar below template <typename lexer> struct sql_tokens : lex::lexer<lexer> { public: // tokens no attributes. lex::token_def<lex::omit> type_smallint; lex::token_def<lex::omit> type_int; lex::token_def<lex::omit> type_varchar; lex::token_def<lex::omit> type_text; lex::token_def<lex::omit> type_date; lex::token_def<lex::omit> kw_not_null; lex::token_def<lex::omit> kw_auto_increment; lex::token_def<lex::omit> kw_unique; lex::token_def<lex::omit> kw_default; lex::token_def<lex::omit> kw_create; lex::token_def<lex::omit> kw_table; lex::token_def<lex::omit> kw_constraint; lex::token_def<lex::omit> kw_primary_key; // attributed tokens. (if add new type, don't forget add lex::lexertl::token definition too). lex::token_def<int> signed_digit; lex::token_def<std::size_t> unsigned_digit; lex::token_def<std::string> identifier; lex::token_def<std::string> quoted_string; lex::token_def<lex::omit> ws, comment, cstyle_comment; sql_tokens() { // column data types. type_smallint = "(?i:smallint)"; type_int = "(?i:int)"; type_varchar = "(?i:varchar)"; type_text = "(?i:text)"; type_date = "(?i:date)"; // keywords. 
kw_not_null = "(?i:not +null)"; kw_auto_increment = "(?i:auto_increment)"; kw_unique = "(?i:unique)"; kw_default = "(?i:default)"; kw_create = "(?i:create)"; kw_table = "(?i:table)"; kw_constraint = "(?i:constraint)"; kw_primary_key = "(?i:primary +key)"; // values. signed_digit = "[+-]?[0-9]+"; unsigned_digit = "[0-9]+"; quoted_string = "\\\"(\\\\.|[^\\\"])*\\\""; // \"(\\.|[^\"])*\" // identifier. identifier = "[a-za-z][a-za-z0-9_]*"; // token must added in priority order. this->self += lex::token_def<>('(') | ')' | ',' | ';'; this->self += type_smallint | type_int | type_varchar | type_text | type_date; this->self += kw_not_null | kw_auto_increment | kw_unique | kw_default | kw_create | kw_table | kw_constraint | kw_primary_key; this->self += identifier | unsigned_digit | signed_digit | quoted_string; #ifdef state_ws // define whitespace ignore. this->self("ws") = ws | comment | cstyle_comment ; #else ws = "[ \\t\\n]+"; comment = "--[^\\n]*\\n"; // single line comments -- cstyle_comment = "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"; // c-style comments this->self += ws [ lex::_pass = lex::pass_flags::pass_ignore ] | comment [ lex::_pass = lex::pass_flags::pass_ignore ] | cstyle_comment [ lex::_pass = lex::pass_flags::pass_ignore ] ; #endif } }; // grammar definition, define little part of sql language. 
template <typename iterator, typename lexer> struct sql_grammar #ifdef state_ws : qi::grammar<iterator, qi::in_state_skipper<lexer> > #else : qi::grammar<iterator> #endif { template <typename tokendef> sql_grammar(tokendef const& tok) : sql_grammar::base_type(program, "program") { program = (statement % ';') >> *qi::lit(';') ; statement = create_statement.alias() ; create_statement = tok.kw_create >> create_table ; create_table = tok.kw_table >> tok.identifier >> '(' >> create_table_columns >> -(',' >> table_constraints) >> ')' ; table_constraints = constraint_definition % ',' ; constraint_definition = tok.kw_constraint >> tok.identifier >> primary_key_constraint ; primary_key_constraint = tok.kw_primary_key >> '(' >> (tok.identifier % ',') >> ')' ; create_table_columns = column_definition % ',' ; column_definition = tok.identifier >> column_type >> *type_constraint ; type_constraint = tok.kw_not_null | tok.kw_auto_increment | tok.kw_unique | default_value ; default_value = tok.kw_default > tok.quoted_string ; column_type = tok.type_smallint | tok.type_int | (tok.type_varchar > '(' > tok.unsigned_digit > ')') | tok.type_text | tok.type_date ; program.name("program"); statement.name("statement"); create_statement.name("create statement"); create_table.name("create table"); create_table_columns.name("create table columns"); column_definition.name("column definition"); column_type.name("column type"); default_value.name("default value"); type_constraint.name("type constraint"); table_constraints.name("table constraints"); constraint_definition.name("constraint definition"); primary_key_constraint.name("primary key constraint"); boost_spirit_debug_node(program); boost_spirit_debug_node(statement); boost_spirit_debug_node(create_statement); boost_spirit_debug_node(create_table); boost_spirit_debug_node(create_table_columns); boost_spirit_debug_node(column_definition); boost_spirit_debug_node(column_type); boost_spirit_debug_node(default_value); 
boost_spirit_debug_node(type_constraint); boost_spirit_debug_node(table_constraints); boost_spirit_debug_node(constraint_definition); boost_spirit_debug_node(primary_key_constraint); using namespace qi::labels; qi::on_error<qi::fail> ( program, std::cout << phx::val("error! expecting ") << bs::_4 // failed? << phx::val(" here: \"") << phx::construct<std::string>(bs::_3, bs::_2) // iterators error-pos, end << phx::val("\"") << std::endl ); } private: #ifdef state_ws typedef qi::in_state_skipper<lexer> skipper_type; #else typedef qi::unused_type skipper_type; #endif typedef qi::rule<iterator, skipper_type> simple_rule; simple_rule program, statement, create_statement, create_table, table_constraints, constraint_definition; simple_rule primary_key_constraint, create_table_columns, column_definition, type_constraint, default_value, column_type; }; std::string cin2string() { std::istreambuf_iterator<char> f(std::cin), l; std::string result; std::copy(f, l, std::back_inserter(result)); return result; } int main(int argc, char* argv[]) { // iterator type used expose underlying input stream typedef std::string::const_iterator base_iterator_type; // lexer token type use. typedef lex::lexertl::token< base_iterator_type, boost::mpl::vector<int, std::size_t, std::string> > token_type; #ifdef state_ws typedef lex::lexertl::lexer<token_type> lexer_type; #else typedef lex::lexertl::actor_lexer<token_type> lexer_type; #endif // token definition type (derived given lexer type). typedef sql_tokens<lexer_type> sql_tokens; // iterator type exposed lexer typedef sql_tokens::iterator_type iterator_type; // type of grammar parse typedef sql_grammar<iterator_type, sql_tokens::lexer_def> sql_grammar; // use types defined above create lexer , grammar // object instances needed invoke parsing process sql_tokens tokens; // our lexer sql_grammar sql(tokens); // our parser const std::string str = cin2string(); // @ point generate iterator pair used expose // tokenized input stream. 
base_iterator_type = str.begin(); iterator_type iter = tokens.begin(it, str.end()); iterator_type end = tokens.end(); // parsing done based on the token stream, not character // stream read input. // note how use lexer defined above skip parser. must // explicitly wrapped inside state directive, switching lexer // state duration of skipping whitespace. #ifdef state_ws std::string ws("ws"); bool r = qi::phrase_parse(iter, end, sql, qi::in_state(ws)[tokens.self]); #else bool r = qi::parse(iter, end, sql); #endif if (r && iter == end) { std::cout << "-------------------------\n"; std::cout << "parsing succeeded\n"; std::cout << "-------------------------\n"; } else { std::cout << "-------------------------\n"; std::cout << "parsing failed\n"; std::cout << "-------------------------\n"; } return 0; }
Comments
Post a Comment