How to Parse Mustache with Boost.Xpressive Correctly

How to parse mustache with Boost.Xpressive correctly?

Here is the correct full code from @sehe that now works under GCC >4.8 and CLANG under Linux and Windows. Again many thanks mate for this awesome help, even though this means that I can bury XPressive :D

The following lines have changed or been added:

// --
#define BOOST_RESULT_OF_USE_DECLTYPE
// --
// Function object that turns any value offering a to_string() member
// into a std::string; suitable for wrapping in phx::function.
struct to_string_f {
    template <typename Source>
    auto operator()(Source const& src) const -> std::string {
        std::string text = src.to_string();
        return text;
    }
};
// --
section     %= "{{" >> sense >> reference [ section_id = to_string(_1) ] >> "}}"
                >> sequence // contents
                > ("{{" >> ('/' >> lexeme [ lit(section_id) ]) >> "}}");
// --
phx::function<to_string_f> to_string;

//#define BOOST_SPIRIT_DEBUG
#define BOOST_RESULT_OF_USE_DECLTYPE
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/utility/string_ref.hpp>
#include <functional>
#include <map>

// AST types for a parsed mustache template.
namespace mustache {

    // any atom refers directly to source iterators for efficiency
    using boost::string_ref;

    // A leaf node of the template: a non-owning view of the matched source text.
    // `Kind` is only a tag type, so verbatim text, variables and partials become
    // distinct C++ types (and distinct variant alternatives) with the same layout.
    // NOTE: string_ref does not own its characters, so the parsed input has to
    // outlive every atom that refers to it.
    template <typename Kind> struct atom {
        string_ref value;

        atom() { }
        // NOTE(review): left implicit; presumably so a string_ref attribute can be
        // converted into an atom during attribute propagation - confirm before
        // marking this explicit.
        atom(string_ref const& value) : value(value) { }

        // Debug output: the (implementation-defined) type name followed by the text in brackets.
        friend std::ostream& operator<<(std::ostream& os, atom const& v) { return os << typeid(v).name() << "[" << v.value << "]"; }
    };

    // the atoms
    using verbatim = atom<struct verbatim_tag>;
    using variable = atom<struct variable_tag>;
    using partial  = atom<struct partial_tag>;

    // the template elements (any atom or a section)
    struct section;

    // section is still incomplete at this point, hence the recursive_wrapper.
    using melement = boost::variant<
            verbatim,
            variable,
            partial, // TODO comments and set-separators
            boost::recursive_wrapper<section>
        >;

    // the template: sequences of elements
    using sequence = std::vector<melement>;

    // section: recursively define to contain a template sequence
    struct section {
        bool       sense; // positive or negative
        string_ref control; // name written in the opening tag
        sequence   content; // elements between the opening and closing tag
    };
}

BOOST_FUSION_ADAPT_STRUCT(mustache::section, (bool, sense)(boost::string_ref, control)(mustache::sequence, content))

namespace qi = boost::spirit::qi;
namespace phx= boost::phoenix;

// Function object that turns any value offering a to_string() member
// into a std::string; suitable for wrapping in phx::function.
struct to_string_f {
    template <typename Source>
    auto operator()(Source const& src) const -> std::string {
        std::string text = src.to_string();
        return text;
    }
};

// Qi grammar that parses a mustache template into a mustache::sequence.
// No skipper is declared, so whitespace in the input is matched literally.
// The atoms it produces are string_ref views into the parsed input.
template <typename Iterator>
    struct mustache_grammar : qi::grammar<Iterator, mustache::sequence()>
{
    mustache_grammar() : mustache_grammar::base_type(sequence)
    {
        using namespace qi;
        // Readable name for the `section` rule's std::string local (qi::_a).
        static const _a_type section_id = {}; // local
        using boost::phoenix::construct;
        using boost::phoenix::begin;
        using boost::phoenix::size;

        // A template is zero or more elements.
        sequence     = *element;
        element      = 
                    !(lit("{{") >> '/') >> // section-end ends the current sequence
                    (partial | section | variable | verbatim);

        // A tag name: one or more graphical characters, stopping before "}}".
        // raw[] exposes the matched iterator range; the action builds a string_ref
        // from the address of its first character and its length.
        reference    = raw [ lexeme [ +(graph - "}}") ] ]
                        [ _val = construct<boost::string_ref>(&*begin(_1), size(_1)) ];

        // {{> name}} - the "> " (including the space) is matched literally.
        partial      = qi::lit("{{") >> "> " >> reference >> "}}";

        // '#' yields true (positive section), '^' yields false (inverted section).
        sense        = ('#' > attr(true))
                     | ('^' > attr(false));

        // Opening tag, contents, then a closing tag that must repeat the opening name.
        // The name is copied into the rule local so the closing tag can be matched
        // against it; `>` is an expectation point, so a missing or mismatched closing
        // tag raises qi::expectation_failure instead of merely failing the rule.
        section     %= "{{" >> sense >> reference [ section_id = to_string(_1) ] >> "}}"
                    >> sequence // contents
                    > ("{{" >> ('/' >> lexeme [ lit(section_id) ]) >> "}}");

        // {{name}} - tried after partial and section in `element`.
        variable     = "{{" >> reference >> "}}";

        // Literal text: one or more characters up to (not including) the next "{{".
        verbatim     = raw [ lexeme [ +(char_ - "{{") ] ]
                        [ _val = construct<boost::string_ref>(&*begin(_1), size(_1)) ];

        BOOST_SPIRIT_DEBUG_NODES(
                (sequence)(element)(partial)(variable)(section)(verbatim)
                (reference)(sense)
            )
    }
  private:
    // Lazy Phoenix wrapper around to_string_f, used in the section rule's action.
    phx::function<to_string_f> to_string;
    qi::rule<Iterator, mustache::sequence()> sequence;
    qi::rule<Iterator, mustache::melement()> element;
    qi::rule<Iterator, mustache::partial()>  partial;
    qi::rule<Iterator, mustache::section(), qi::locals<std::string> >  section;
    qi::rule<Iterator, bool()>                sense;                  // positive or negative
    qi::rule<Iterator, mustache::variable()> variable;
    qi::rule<Iterator, mustache::verbatim()> verbatim;
    qi::rule<Iterator, boost::string_ref()>   reference;
};

namespace Dumping {
    // Visitor that writes a parsed template back out in mustache notation.
    // The destination stream is passed as the first argument of every overload
    // and is also what each overload returns.
    struct dumper : boost::static_visitor<std::ostream&>
    {
        // Whole template: emit every element in order, dispatching on its variant alternative.
        std::ostream& operator()(std::ostream& out, mustache::sequence const& items) const {
            for (auto const& item : items)
                boost::apply_visitor(std::bind(dumper(), std::ref(out), std::placeholders::_1), item);
            return out;
        }

        // Literal text is copied through unchanged.
        std::ostream& operator()(std::ostream& out, mustache::verbatim const& text) const {
            out << text.value;
            return out;
        }

        // {{name}}
        std::ostream& operator()(std::ostream& out, mustache::variable const& var) const {
            out << "{{" << v_name(var) << "}}";
            return out;
        }

        // {{> name}}
        std::ostream& operator()(std::ostream& out, mustache::partial const& part) const {
            out << "{{> " << part.value << "}}";
            return out;
        }

        // {{#name}}...{{/name}} or {{^name}}...{{/name}}, with the contents dumped recursively.
        std::ostream& operator()(std::ostream& out, mustache::section const& sec) const {
            char const marker = sec.sense ? '#' : '^';
            out << "{{" << marker << sec.control << "}}";
            (*this)(out, sec.content);
            out << "{{/" << sec.control << "}}";
            return out;
        }

      private:
        // The text of a variable tag.
        static boost::string_ref const& v_name(mustache::variable const& var) { return var.value; }
    };
}

// Renders a parsed mustache template against a tree of context values.
namespace ContextExpander {

    // "No value" alternative; being the first alternative it is what Value{} holds.
    struct Nil { };

    // A context value: nothing, a number, a string, a dictionary or an array
    // (dictionaries and arrays nest further Values).
    using Value = boost::make_recursive_variant<
        Nil,
        double,
        std::string,
        std::map<std::string, boost::recursive_variant_>,
        std::vector<boost::recursive_variant_>
    >::type;

    using Dict  = std::map<std::string, Value>;
    using Array = std::vector<Value>;

    // Placeholder renderings used when a non-scalar value is streamed as a variable.
    static inline std::ostream& operator<<(std::ostream& os, Nil   const&)   { return os << "#NIL#"; }
    static inline std::ostream& operator<<(std::ostream& os, Dict  const& v) { return os << "#DICT("  << v.size() << ")#"; }
    static inline std::ostream& operator<<(std::ostream& os, Array const& v) { return os << "#ARRAY(" << v.size() << ")#"; }

    // Binary visitor over (context value, template element). The output stream is
    // bound as the first argument; every overload returns that same stream.
    struct expander : boost::static_visitor<std::ostream&>
    {
        // Entry point: expand every element of the template against ctx.
        std::ostream& operator()(std::ostream& os, Value const& ctx, mustache::sequence const& v) const {
            for(auto& element : v)
                boost::apply_visitor(std::bind(expander(), std::ref(os), std::placeholders::_1, std::placeholders::_2), ctx, element);
            return os;
        }

        // Literal text is emitted regardless of the context type.
        template <typename Ctx>
        std::ostream& operator()(std::ostream& os, Ctx const&/*ignored*/, mustache::verbatim const& v) const {
            return os << v.value;
        }

        // Variable in a dictionary context: print the entry if present, else nothing.
        std::ostream& operator()(std::ostream& os, Dict const& ctx, mustache::variable const& v) const {
            auto it = ctx.find(v.value.to_string());
            if (it != ctx.end())
                os << it->second;
            return os;
        }

        // Variable in any non-dictionary context: nothing to look up, print nothing.
        template <typename Ctx>
        std::ostream& operator()(std::ostream& os, Ctx const&, mustache::variable const&) const {
            return os;
        }

        // Partial: the dictionary entry holds template source, which is parsed and
        // expanded against the same dictionary. Prints "#ERROR#" when the entry is
        // missing, is not a string, or does not parse.
        std::ostream& operator()(std::ostream& os, Dict const& ctx, mustache::partial const& v) const {
            auto it = ctx.find(v.value.to_string());
            if (it != ctx.end())
            {
                // The pointer form of boost::get yields nullptr for a non-string entry
                // instead of throwing boost::bad_get, so that case reaches "#ERROR#" too.
                if (auto const* subtemplate = boost::get<std::string>(&it->second))
                {
                    static const mustache_grammar<std::string::const_iterator> p;

                    std::string::const_iterator first = subtemplate->begin(), last = subtemplate->end();

                    mustache::sequence dynamic_template;
                    if (qi::parse(first, last, p, dynamic_template))
                        return (*this)(os, Value{ctx}, dynamic_template);
                }
            }
            return os << "#ERROR#";
        }

        // Section in a dictionary context: dispatch on the type of the controlling
        // entry; a missing entry renders only inverted ('^') sections.
        std::ostream& operator()(std::ostream& os, Dict const& ctx, mustache::section const& v) const {
            auto it = ctx.find(v.control.to_string());
            if (it != ctx.end())
                boost::apply_visitor(std::bind(do_section(), std::ref(os), std::placeholders::_1, std::cref(v)), it->second);
            else if (!v.sense)
                (*this)(os, Value{/*Nil*/}, v.content);

            return os;
        }

        // Fallback for combinations that are not implemented: prints a marker.
        // __PRETTY_FUNCTION__ is a GCC/Clang extension.
        template <typename Ctx, typename T>
        std::ostream& operator()(std::ostream& os, Ctx const&/* ctx*/, T const&/* element*/) const {
            return os << "[TBI:" << __PRETTY_FUNCTION__ << "]";
        }

      private:
        // Visits the value that controls a section and renders the section body.
        struct do_section : boost::static_visitor<> {
            // List value: a positive section renders its body once per item, with
            // the item as context. An inverted section renders only for an empty list.
            void operator()(std::ostream& os, Array const& ctx, mustache::section const& v) const {
                if (v.sense) {
                    for(auto& item : ctx)
                        expander()(os, item, v.content);
                } else if (ctx.empty()) {
                    expander()(os, Value(ctx), v.content);
                }
            }
            // Any other value: render the body once when the value's truthiness
            // matches the section's sense.
            template <typename Ctx>
            void operator()(std::ostream& os, Ctx const& ctx, mustache::section const& v) const {
                if (v.sense == truthiness(ctx))
                    expander()(os, Value(ctx), v.content);
            }
          private:
            static bool truthiness(Nil)                              { return false; }
            // Non-zero numbers are truthy (the comparison used to be inverted).
            static bool truthiness(double d)                         { return 0. != d; }
            // Strings and dictionaries are truthy when non-empty.
            template <typename T> static bool truthiness(T const& v) { return !v.empty(); }
        };
    };

}

// Demo driver: parses a sample template, dumps it back out, then expands it
// against a sample context, writing everything to std::cout.
//
// Returns 0 when the template parsed, 1 when parsing failed or an expectation
// point in the grammar threw. The original flowed off the end of this
// int-returning function, which is undefined behaviour for anything but main().
int myMain()
{
    std::cout << std::unitbuf;
    std::string input = "<ul>{{#time}}\n\t<li>{{> partial}}</li>{{/time}}</ul>\n "
        "<i>for all good men</i> to come to the {007} aid of "
        "their</bold> {{country}}. Result: {{^Res2}}(absent){{/Res2}}{{#Res2}}{{Res2}}{{/Res2}}"
        ;
    // Parser setup --------------------------------------------------------
    using It = std::string::const_iterator;
    static const mustache_grammar<It> p;

    It first = input.begin(), last = input.end();

    try {
        // The parsed atoms are views into `input`, which stays alive for the whole function.
        mustache::sequence parsed_template;
        bool const parsed_ok = qi::parse(first, last, p, parsed_template);
        if (parsed_ok)
        {
            std::cout << "Parse success\n";
        } else
        {
            std::cout << "Parse failed\n";
        }

        if (first != last)
        {
            std::cout << "Remaing unparsed input: '" << std::string(first, last) << "'\n";
        }

        std::cout << "Input:      " << input << "\n";
        std::cout << "Dump:       ";
        Dumping::dumper()(std::cout, parsed_template) << "\n";

        std::cout << "Evaluation: ";

        {
            using namespace ContextExpander;
            expander engine;

            Value const ctx = Dict { 
                { "time", Array {
                    Dict { { "partial", "gugus {{zeit}} (a.k.a. <u>{{title}}</u>)"},             { "title", "noon" },    { "zeit", "12:00" } },
                    Dict { { "partial", "gugus {{zeit}} (a.k.a. <u>{{title}}</u>)"},             { "title", "evening" }, { "zeit", "19:30" } },
                    Dict { { "partial", "gugus <u>{{title}}</u> (expected at around {{zeit}})"}, { "title", "dawn" },    { "zeit", "06:00" } },
                } },
                { "country", "ESP" },
                { "Res3", "unused" }
            };

            engine(std::cout, ctx, parsed_template);
        }

        return parsed_ok ? 0 : 1;
    } catch(qi::expectation_failure<It> const& e)
    {
        // Raised by the `>` expectation points in the grammar, e.g. an unmatched section end.
        std::cout << "Unexpected: '" << std::string(e.first, e.last) << "'\n";
        return 1;
    }
}

How to parse using boost if it is not json, but similar to it?

It looks like Yet Another YAML/Mustache/JSON/... derivative.

Without a formal spec it's hard to actually assess what effort would be required, but here's a list of implementations of similar grammars in Boost Spirit, with varying amounts of feature completeness:

How to parse mustache with Boost.Xpressive correctly? <-- this is likely your best matching demonstration
Parse a substring as JSON using QJsonDocument (minimal subset, use something like this to transform the input to proper JSON, e.g.?)
Reading JSON file with C++ and BOOST A full featured JSON parser (with AST and escapes but no comments)

Applications of a toy JSON parser implementation:

replace only some value from json to json
How to manipulate leaves of a JSON tree
more

Parsing white-spaces in between lexemes using boost-spirit

Generation is a fundamentally different job than parsing.

Parsing removes redundancy and normalizes data. Generation adds redundancy and chooses (one of typically many) representations according to some goals (stylistic guides, efficiency goals etc).

By allowing yourself to get side-tracked by the similarity to BNF, you've lost sight of your goals: in BNF, many instances of whitespace are simply not significant.

This is manifest in the direct observation that the AST does not contain the whitespace.

Hacking It

The simplest way would be to represent the skipped whitespace instead as "string literals" inside your AST:

    _term       = _literal | _rule_name | _whitespace;

With

    _whitespace = +blank;

And then making the _list rule a lexeme as well (so as to not skip blanks):

    // lexemes
    qi::rule<Iterator, Ast::List()>   _list;
    qi::rule<Iterator, std::string()> _literal, _whitespace;

See it Live On Compiler Explorer

Clean Solution

The above leaves a few "warts": there are spots where whitespace is still not significant (namely around | and specifically before the list-attribute numbers):

<code>   ::=  <letter><digit> 34 | <letter><digit><code> 23
<letter> ::= "a" 1 | "b" 2 | "c" 3 | "d" 4 | "e" 5 | "f" 6 | "g" 7 | "h" 8 | "i" 9
<digit>  ::= "9" 10 | "1" 11 | "2" 12 | "3" 13 | "4" 14

I don't see how it would usefully be significant there, unless of course your input doesn't look like the input you've been using. E.g. if it looks like this instead:

<code>::=<letter><digit>34|<letter><digit><code>23
<letter>::="a"1|"b"2|"c"3|"d"4|"e"5|"f"6|"g"7|"h"8|"i"9
<digit>::="9"10|"1"11|"2"12|"3"13|"4"14

You could make all the rules lexeme. However, this doesn't add up with the presence of quoted strings, at all. The whole notion of quoted strings is to mark regions where normal whitespace (and comment) skipping is suspended.

I have a nagging feeling that you are much farther away from your actual problem (see https://meta.stackexchange.com/questions/66377/what-is-the-xy-problem) than we can even currently see, and you might even have stripped the whole quoted-string-literals concept from the "BNF" already.

A clean solution would be to forget about misleading similarities with BNF and just devise your own grammar from the ground up.

If the goal is simply to have a (recursive) macro/template expansion engine, it should really turn out a lot simpler than what you currently have. Maybe you can describe your real task (input, desired output and required behaviors) so we can help you achieve that?

Problems with boost::phoenix::bind and boost::phoenix::actors in a semantic action for boost::spirit::qi

A few observations:

You don't seem to be using a skipper, so using lexeme is redundant (see Boost spirit skipper issues)
You want to know how to detect the type of the attribute exposed by a parser expression: see Detecting the parameter types in a Spirit semantic action
The types are documented with the parser directives, though, so e.g. as_string[(qi::char_("1-9") >> +qi::char_("0-9"))] results in boost::fusion::vector2<char, std::vector<char> >, which is directly reflected in the error message on GCC:
```
boost/phoenix/bind/detail/preprocessed/function_ptr_10.hpp|50 col 39| error: could not convert ‘a0’ from ‘boost::fusion::vector2<char, std::vector<char> >’ to ‘std::vector<char>’
```
Prefer not to mix and match library placeholders/wrappers, e.g. boost::ref and boost::phoenix::ref
You seem to be reinventing integer parsing; consider using qi::int_parser instead
It seems that the case to parse 0 is missing :)

Assuming you want my_str to simply reflect the input string including number base prefix, I could suggest using:

number =
     as_string[(qi::char_("1-9") >> +qi::char_("0-9"))] [phx::bind(&fPushIntCV, qi::_1, phx::ref(code), phx::ref(variables))]
   | as_string[("0x" >> +qi::char_("0-9a-fA-F"))      ] [phx::bind(&fPushIntCV, qi::_1, phx::ref(code), phx::ref(variables))]
   | as_string[("0b" >> +qi::char_("0-1"))            ] [phx::bind(&fPushIntCV, qi::_1, phx::ref(code), phx::ref(variables))]
   | as_string[("0" >>  +qi::char_("0-7"))            ] [phx::bind(&fPushIntCV, qi::_1, phx::ref(code), phx::ref(variables))]
   //some other junk
   ;

However, this could be simplified to:

number = as_string[
       (qi::char_("1-9") >> +qi::char_("0-9"))
     | ("0x" >> +qi::char_("0-9a-fA-F"))
     | ("0b" >> +qi::char_("01"))
     | ("0"  >> +qi::char_("0-7"))
   ]        [phx::bind(&fPushIntCV, qi::_1, phx::ref(code), phx::ref(variables))]
   ;

Now, you would probably like to just parse an integer value instead:

number = 
     ( 
       ("0x" >> qi::int_parser<int, 16, 1>())
     | ("0b" >> qi::int_parser<int,  2, 1>())
     | ("0"  >> qi::int_parser<int,  8, 1>())
     | qi::int_ /* accepts "0" */) [phx::bind(&fPushIntCV, qi::_1, phx::ref(code), phx::ref(variables))]
   ;

Which handsomely does the conversions^[1], and you can just take an int:

// Semantic-action callback: reports the integer it is handed.
// The Code and Variables arguments are accepted but not used here.
void fPushIntCV (int my_number, Code& c, Variables& v) {
    auto& out = std::cout;
    out << "fPushIntCV: ";
    out << my_number;
    out << "\n";
}

^[1] (there's also uint_parser and you can parse long, long long etc.; even big integers like boost::multiprecision::cpp_int should be no issue)

Here's a demo program using this, showing that the values are converted correctly (and: "0" is accepted :)): Live On Coliru

int main()
{
    Code code;
    Variables variables;
    Calculator g(code, variables); 

    for (std::string const input : { "0", "0xef1A", "010", "0b10101" })
    {
        It f(input.begin()), l(input.end());

        if(qi::parse(f, l, g))
            std::cout << "Parse success ('" << input << "')\n";
        else std::cout << "Parse failed ('" << input << "')\n";

        if (f != l)
            std::cout << "Input remaining: '" << std::string(f, l) << "'\n";
    }
}

Prints

fPushIntCV: 0
Parse success ('0')

fPushIntCV: 61210
Parse success ('0xef1A')

fPushIntCV: 8
Parse success ('010')

fPushIntCV: 21
Parse success ('0b10101')

Strange static_cast compilation error while compile boost spirit parser

Firstly, I can only assume
```
data = (text | imgtag | vartag | inctag | blktag | reftag) >> *data;
```
was /meant/ as 1-or-more repeats of the (...) expression. Writing it as
```
data = +(text | imgtag | vartag | inctag | blktag | reftag);
```
expresses the same thing, but allows attribute propagation to match the exposed attribute type.
There are a number of lexeme[] directives that have no purpose when not using a skipper
There is a suspicious manual skipping of whitespace that might be better served by using a skipper
Insofar as you do wish to require a mandatory space after the "tag name", consider using the and-predicate (operator&). That way you can still use a skipper.
Anyhow, it's possible you were looking for something like the Qi Repository distinct()[] parser directive
Even with a skipper
```
*(+lit(' ') >> lexeme[+(char_ - '{' - '}')])
```
doesn't make sense as lexeme[...] would eat any space up to closing '}' and hence the second repeat of the *() would never apply.

See also Boost spirit skipper issues
There is a lot of manual repetition between rules. Consider using qi::symbols to map the input to tag types.
If you do, it becomes easier to avoid semantic actions (Good Thing: Boost Spirit: "Semantic actions are evil"?). Even if you didn't, you could use qi::attr to expose a specific value as the type value.
Consider adding debug information (see BOOST_SPIRIT_DEBUG in the demo below)

The grammar simplified

I'd reduce the whole grammar to just this:

data = +( ('{' >> tag >> '}') | text );

tag  = lexeme[type >> &char_(" {}")] >> lexeme[*~char_("{}")];
text = attr("text")                  >> lexeme[+~char_("{}")];

Done! No more semantic actions, no more dozens of rules doing basically the same. No more complicated nested repeats with unclear multiplicities. type is a qi::symbols parser now, that contains the mapping of tag names:

type.add
    ("img",   "img")
    ("var",   "var")
    ("inc",   "inc")
    ("blank", "blk")
    ("ref",   "ref");

And here's a complete demo:

DEMO

Live On Coliru

//#define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted.hpp>
#include <boost/spirit/include/qi.hpp>
namespace qi  = boost::spirit::qi;

// One parsed item: either a {tag} or a run of plain text.
struct CMyTag
{
    // Mapped tag type from the `type` symbol table, or "text" for plain text.
    std::string tagName;
    // Characters following the tag name inside the braces, or the plain text itself.
    std::string tagData;
};
BOOST_FUSION_ADAPT_STRUCT(::CMyTag, (std::string, tagName) (std::string, tagData))

// Qi grammar that splits input into a list of CMyTag items: brace-delimited
// tags and the plain text between them. Uses a skipper (qi::space by default).
template <typename Iterator, typename Skipper = qi::space_type>
struct testTag_grammar : qi::grammar<Iterator, std::vector<CMyTag>(), Skipper>
{
    testTag_grammar() :
        testTag_grammar::base_type(data)
    {
        using namespace qi;

        // One or more items; a braced tag is tried before plain text.
        data = +( ('{' >> tag >> '}') | text );

        // Maps the tag name as written in the input to the stored tag type
        // (note "blank" is stored as "blk").
        type.add
            ("img",   "img")
            ("var",   "var")
            ("inc",   "inc")
            ("blank", "blk")
            ("ref",   "ref");

        // tag:  a known type name that must be followed by a space or a brace (the
        //       &char_ predicate checks this without consuming it), then the data up
        //       to the next brace. lexeme[] suspends skipping inside each part.
        // text: exposes the constant name "text" plus one or more non-brace characters.
        tag  = lexeme[type >> &char_(" {}")] >> lexeme[*~char_("{}")];
        text = attr("text")                  >> lexeme[+~char_("{}")];

        BOOST_SPIRIT_DEBUG_NODES( (data) (tag) (text))
    }

  private:
    qi::symbols<char, std::string> type;
    qi::rule<Iterator, CMyTag(), Skipper>              tag, text;
    qi::rule<Iterator, std::vector<CMyTag>(), Skipper> data;
};

// Parses a fixed sample string with testTag_grammar and prints each resulting
// tag, followed by any input the parser did not consume.
int main() {
    std::string const l_test = "asd {img} {ref I}sdkflsdlk {img} wmrwerml";
    testTag_grammar<std::string::const_iterator> l_gramar;

    std::vector<CMyTag> l_result;
    auto cursor = l_test.begin();
    auto stop   = l_test.end();

    if (!qi::phrase_parse(cursor, stop, l_gramar, qi::space, l_result)) {
        std::cout << "Parse failed\n";
    }
    else {
        std::cout << "Parse success: " << l_result.size() << "\n";

        for (auto const& entry : l_result) {
            std::cout << "Name '" << entry.tagName << "', Data '" << entry.tagData << "'\n";
        }
    }

    // phrase_parse advances `cursor` past whatever it consumed.
    if (cursor != stop) {
        std::cout << "Remaining unparsed: '" << std::string(cursor, stop) << "'\n";
    }
}

Prints

Parse success: 6
Name 'text', Data 'asd '
Name 'img', Data ''
Name 'ref', Data 'I'
Name 'text', Data 'sdkflsdlk '
Name 'img', Data ''
Name 'text', Data 'wmrwerml'

How do I systematically replace text within mustache tags in a large body of text?

This is a tough (but not impossible) task to do with a single regular expression. Fortunately, there's no reason we have to do it with a single one. A much easier (and more robust) approach is to use two regular expressions: one to match replacement tags (things contained in {{curly brackets}}) and another to replace instances of array indexers with dot indexers. Here's my solution:

// Two-pass replacement. Note that replace() returns a new string and leaves `s` itself unchanged.
s.replace( /\{\{(.*?)\}\}/g, function(x){          // outer pass: x is each whole {{...}} tag (non-greedy match)
  return x.replace( /\[(\d+)\]/g,'.$1' )});      // inner pass: within that tag, rewrite [N] as .N

Note: I have not analyzed this solution with the entire mustache syntax, so I cannot guarantee it will work if you're using more than the standard tags.

How to Parse Mustache with Boost.Xpressive Correctly