Boost logo

Boost Users :

From: AJD (n.outpost_at_[hidden])
Date: 2007-12-05 17:10:03


Hi,

I've been trying out boost.proto lately and I've noticed that for my
expressions it tends to force my operands to be created on the stack. For
example, instead of optimizing out an object completely when everything
about it
is known at compile-time, the compiler (MSVC 2005 or 2008) is creating that
object on the stack anyway.. even though it's not necessary. Even worse, it
actually folds the simpler expressions at compile-time and never even
touches
the data that it just put on the stack. The problem is compounded with
larger
expressions involving many operands. It even seems to build the larger
expression objects on the stack unnecessarily (and this looks like it starts
to
break constant folding).

Here's some of the generated assembly:

int main()
{
00401800 sub esp,18h
        vector2< float[3] > v1 = { 0, 1, 2 };
00401803 xorps xmm0,xmm0
        vector2< float[3] > v2 = { 3, 4, 5 };
        vector2< float[3] > v3 = { 6, 7, 8 };

        // Add two vectors lazily and get the 2nd element.
#if 1
        // this sticks temporary junk all over the stack, even with 2 terms
        std::cout << (v1 + v2)[0];
00401806 fld dword ptr [__real_at_40400000 (402128h)]
0040180C movss dword ptr [esp],xmm0
00401811 movss xmm0,dword ptr [__real_at_3f800000 (402124h)]
00401819 movss dword ptr [esp+4],xmm0
0040181F movss xmm0,dword ptr [__real_at_40000000 (402120h)]
00401827 movss dword ptr [esp+8],xmm0
0040182D movss xmm0,dword ptr [__real_at_40400000 (402128h)]
00401835 movss dword ptr [esp+0Ch],xmm0
0040183B movss xmm0,dword ptr [__real_at_40800000 (40211Ch)]
00401843 push ecx
00401844 mov ecx,dword ptr [__imp_std::cout (402044h)]
0040184A fstp dword ptr [esp]
0040184D movss dword ptr [esp+14h],xmm0
00401853 movss xmm0,dword ptr [__real_at_40a00000 (402118h)]
0040185B movss dword ptr [esp+18h],xmm0
00401861 call dword ptr
[__imp_std::basic_ostream<char,std::char_traits<char> >::operator<<
(402040h)]

        std::cin.get();
00401867 mov ecx,dword ptr [__imp_std::cin (40203Ch)]
0040186D call dword ptr
[__imp_std::basic_istream<char,std::char_traits<char> >::get (402038h)]

        return 0;
00401873 xor eax,eax
}
00401875 add esp,18h
00401878 ret

I've come up with a couple workarounds but these only work for specific
cases:

// (1) This optimizes out nicely, but only works for adding 2 objects.
typedef proto::terminal< vector2< float[3] > >::type expr_type;
std::cout << (vector_expr< expr_type >( expr_type::make(v1) ) + vector_expr<
expr_type >( expr_type::make(v2) ))[0];

// (2) This never optimizes out
std::cout << (v1 + v2)[0];

I'd noticed that the reference at traits.hpp line 144 seemed to play a major
role (it was the only significant difference between the expression trees
generated by the above two snippets.)

// (3) This makes (2) optimize just like (1), but doesn't help 3+
// HACK TO REMOVE REF AT boost\xpressive\proto\traits.hpp LINE 144
namespace boost { namespace proto { namespace result_of {
template<typename T, typename EnableIf>
struct as_arg<T, VectorDomain, EnableIf>
{
        typedef expr<proto::tag::terminal, args0<T /*&*/> > expr_type;
        typedef typename VectorDomain::template apply<expr_type>::type type;

        template<typename T2>
        static type call(T2 &t)
        {
                return VectorDomain::make(expr_type::make(t));
        }
};
}}}

Finally, here is the entire source I'm using:

#include <cstddef>
#include <iostream>
#include <boost/xpressive/proto/proto.hpp>
#include <boost/xpressive/proto/context.hpp>
using namespace boost;

template< typename T>
struct vector2;

template< typename T >
struct vector2<T[3]>
{
        T v[3];

        typedef T value_type;

        T const & operator [](std::size_t i) const
        {
                return this->v[i];
        }

        T & operator [](std::size_t i)
        {
                return this->v[i];
        }
};

// This grammar describes which lazy vector expressions
// are allowed; namely, vector terminals and addition
// and subtraction of lazy vector expressions.
struct VectorGrammar
: proto::or_
         <
                 proto::terminal< vector2< proto::_ > >
         , proto::plus< VectorGrammar, VectorGrammar >
>
{
};

// Expressions in the lazy vector domain must conform
// to the lazy vector grammar
struct VectorDomain;

// Here is an evaluation context that indexes into a lazy vector
// expression, and combines the result.
struct subscript_context
{
private:
        std::size_t i;
public:
        subscript_context(std::size_t i) : i(i)
        {
        }

        // Use default_eval for all the operations ...
        template< typename Expr, typename Tag = typename Expr::proto_tag >
        struct eval : proto::default_eval< Expr, subscript_context const >
        {
        };

        // ... except for terminals, which we index with our subscript
        template< typename Expr >
        struct eval< Expr, proto::tag::terminal >
        {
                typedef typename proto::result_of::arg< Expr
>::type::value_type
result_type;

                result_type operator ()(Expr const & expr, subscript_context
const & ctx) const
                {
                        return proto::arg(expr)[ctx.i];
                }
        };
};

// Here is the domain-specific expression wrapper, which overrides
// operator [] to evaluate the expression using the subscript_context.
template< typename Expr >
struct vector_expr : proto::extends<Expr, vector_expr< Expr >, VectorDomain
>
{
        typedef proto::extends< Expr, vector_expr< Expr >, VectorDomain >
base_type;

        vector_expr(Expr const & expr) : base_type(expr)
        {
        }

        // Use the subscript_context to implement subscripting
        // of a lazy vector expression tree.
        typename proto::result_of::eval< Expr, subscript_context const
>::type
        operator [](std::size_t i) const
        {
                subscript_context const ctx(i);
                return proto::eval(*this, ctx);
        }
};

// Tell proto that in the VectorDomain, all
// expressions should be wrapped in vector_expr<>
struct VectorDomain : proto::domain< proto::generator< vector_expr >,
VectorGrammar >
{
};

template< typename T >
struct is_vector
: mpl::false_
{
};

template< typename T >
struct is_vector< vector2< T > >
: mpl::true_
{
};

BOOST_PROTO_DEFINE_BINARY_OPERATOR(+, boost::proto::tag::plus, is_vector,
VectorDomain);

int main()
{
        vector2< float[3] > v1 = { 0, 1, 2 };
        vector2< float[3] > v2 = { 3, 4, 5 };
        vector2< float[3] > v3 = { 6, 7, 8 };

        // Add two vectors lazily and get the 2nd element.
#if 1
        // this sticks write-only temporary junk all over the stack, even
with 2
terms
        std::cout << (v1 + v2)[0];// + v3 + v3 + v3 + v3 + v3)[0];
#else
        // this optimizes out completely for 2 terms... expanding to 3 terms
does not
        typedef proto::terminal< vector2< float[3] > >::type expr_type;
        std::cout << (vector_expr< expr_type >( expr_type::make(v2) )
                    + vector_expr< expr_type >( expr_type::make(v3) ))[0];
#endif

        std::cin.get();

        return 0;
}

Thanks in advance.
//

AJD < n.outpost_at_[hidden] >


Boost-users list run by williamkempf at hotmail.com, kalb at libertysoft.com, bjorn.karlsson at readsoft.com, gregod at cs.rpi.edu, wekempf at cox.net