31 #include "gsl/gsl_sf_coupling.h"
32 #include "gsl/gsl_sf_laguerre.h"
33 #include "gsl/gsl_sf_gamma.h"
34 #include "gsl/gsl_sf_legendre.h"
35 #include "gsl/gsl_sf_bessel.h"
59 #define M_PI 3.14159265358979323846
65 #include "sta_omp_threads.h"
// Status codes returned by the STA library routines:
// 0 on success, a distinct negative value per failure mode.
125 STA_RESULT_SUCCESS=0,
126 STA_RESULT_FAILED=-1,
127 STA_RESULT_SHAPE_MISMATCH=-2,
128 STA_RESULT_INVALID_PRODUCT=-3,
129 STA_RESULT_STORAGE_MISMATCH=-4,
130 STA_RESULT_INVALID_TENSOR_RANK=-5,
131 STA_RESULT_OFIELD_TYPE_MISMATCH=-6,
132 STA_RESULT_SAME_ADDRESS=-7,
133 STA_RESULT_NOT_IMPLEMENTED=-8,
// Library-wide verbosity switch; >0 enables the diagnostic printf output
// scattered through the routines below.
138 static int verbose=0;
// Logistic sigmoid with slope s and offset o: 1 / (1 + exp(-s*(x-o))).
141 T sigmoid(T x, T s, T o)
143 return (T)1/((T)1+std::exp(-s*(x-o)));
// Computes the n-th zero of the spherical Bessel function j_l (via the
// half-integer-order zero J_{l+1/2}) and the corresponding normalization
// factor Nln of a spherical Fourier-Bessel basis function.
148 void getFourierFreqAndNorm(
155 double zero=gsl_sf_bessel_zero_Jnu (l+0.5, n);
// Nln = 1 / sqrt( a^3 * j_{l+1}(zero)^2 ); the float epsilon guards
// against division by zero when the Bessel value vanishes numerically.
156 Nln=std::pow(a,3.0)*std::pow(gsl_sf_bessel_jl (l+1,zero),2.0);
157 Nln=1/(std::sqrt(Nln)+std::numeric_limits<float>::epsilon());
// Clebsch-Gordan coefficient <ja ma; jb mb | J M>, computed from the
// Wigner 3j symbol via GSL:
//   CG = (-1)^(ja-jb+M) * sqrt(2J+1) * (ja jb J; ma mb -M).
178 double clebschGordan (
const int ja,
// Fallback path when GSL is not linked.
187 return nogsl_clebschGordan<double>(ja,ma,jb,mb,J,M);
// gsl_sf_coupling_3j_e expects twice the angular momentum quantum
// numbers so that half-integer spins can be represented as integers.
206 gsl_sf_result _result;
207 gsl_sf_coupling_3j_e ( 2*ja, 2*jb, J2,
208 2*ma, 2*mb, -2*M, &_result );
211 double norm = sqrt ( (
double ) ( J2+1.0 ) );
// Phase factor (-1)^(ja-jb+M); parity taken from the lowest bit.
212 int phase = ( ja-jb+M );
213 double sign = ( phase & 1 ) ? -1.0 : 1.0;
217 return _result.val*sign*norm;
// Spherical harmonic Y_l^m(theta,phi) built from GSL's spherically
// normalized associated Legendre function times exp(i*m*phi).
// The final conjugation with an extra (-1)^m for odd m converts between
// phase conventions (Condon-Shortley related) — NOTE(review): confirm
// the intended convention against callers.
// NOTE(review): gsl_sf_legendre_sphPlm is deprecated in newer GSL
// releases in favor of the gsl_sf_legendre_array interface.
226 std::complex<double> basis_SphericalHarmonics(
int l,
int m,
double theta,
double phi)
233 double legendre=gsl_sf_legendre_sphPlm (l, m, std::cos(theta));
234 std::complex<double> tmp;
235 tmp=legendre*std::exp(std::complex<double>(0,1)*(
double)m*phi);
238 if (m%2==0)
return std::conj(tmp);
239 else return -std::conj(tmp);
// Semi-Schmidt normalized spherical harmonic: Y_l^m scaled by
// sqrt(4*pi/(2l+1)).
250 std::complex<double> basis_SphericalHarmonicsSemiSchmidt(
int l,
int m,
double theta,
double phi)
253 double norm=std::sqrt(4.0*M_PI/(2.0*l+1.0));;
254 return norm*basis_SphericalHarmonics(l,m,theta,phi);
// Precomputes the Clebsch-Gordan coefficient table used by the spherical
// tensor product (J1 x J2 -> J). Returns a newly allocated array that
// the caller must delete[]. If 'normalized', each coefficient is divided
// by <J1 0; J2 0 | J 0>; 'alpha' is an overall scale factor.
274 T * sta_product_precomputeCGcoefficients_C (
int J1,
int J2,
int J,
bool normalized=
false, T alpha=1 )
280 norm= ( T ) 1/ ( T ) hanalysis::clebschGordan ( J1,0,J2,0,J,0 );
// First pass: count the valid (m, m1, m2) index combinations
// (m2 is derived from m and m1 and must satisfy |m2| <= J2).
284 for (
int m=-J; m<=J; m++ )
286 for (
int m1=-J1; m1<=J1; m1++ )
289 if ( abs ( m2 ) <=J2 )
295 T * cg=
new T[count];
// Second pass: fill the table in exactly the same iteration order,
// so consumers can walk it with a running index.
297 for (
int m=-J; m<=J; m++ )
299 for (
int m1=-J1; m1<=J1; m1++ )
302 if ( abs ( m2 ) <=J2 )
304 cg[count++]=norm* ( T ) hanalysis::clebschGordan ( J1,m1,J2,m2,J,m );
// Scans a complex tensor field (reinterpreted as interleaved reals)
// for NaN entries; shape[] holds the three grid extents.
316 const std::complex<T> * stIn,
317 const std::size_t shape[],
324 std::size_t numvoxel=shape[0]*shape[1]*shape[2];
333 const T * stIn_r=(
const T*) stIn;
// NOTE(review): "%d" is used for a pointer argument (should be %p) and
// may truncate std::size_t-like values on 64-bit builds.
336 printf(
"isnan: stride: %d components: %d data ptr: %d",stride,components,stIn_r);
338 for (std::size_t i=0;i<numvoxel;i++)
340 for (std::size_t j=0;j<components;j++)
342 if (std::isnan(stIn_r[j]))
// Scans a complex tensor field (reinterpreted as interleaved reals)
// for infinite entries; same traversal pattern as the NaN scan.
352 const std::complex<T> * stIn,
353 const std::size_t shape[],
360 std::size_t numvoxel=shape[0]*shape[1]*shape[2];
364 const T * stIn_r=(
const T*) stIn;
366 for (std::size_t i=0;i<numvoxel;i++)
368 for (std::size_t j=0;j<components;j++)
370 if (std::isinf(stIn_r[j]))
// Voxel-wise spherical tensor product of two fields of ranks J1 and J2,
// accumulated into a rank-J output field using the precomputed CG table.
// Fails with STA_RESULT_INVALID_PRODUCT if (J1,J2,J) violate the
// triangle condition, or if a normalized product is requested for odd
// J1+J2+J (the normalizer <J1 0; J2 0 | J 0> vanishes in that case).
379 template<
typename T,
typename S>
381 const std::complex<T> * stIn1,
382 const std::complex<T> * stIn2,
383 std::complex<T> * stOut ,
384 const std::size_t shape[],
389 bool normalize =
false,
393 bool clear_field =
false )
397 if ( ( std::abs ( J1-J2 ) >J ) || ( J>std::abs ( J1+J2 ) ) )
398 return STA_RESULT_INVALID_PRODUCT;
400 if ( ( ( J1+J2+J ) %2!=0 ) && ( normalize ) )
401 return STA_RESULT_INVALID_PRODUCT;
404 S * cg= hanalysis::sta_product_precomputeCGcoefficients_C<S> ( J1,J2, J,normalize,alpha );
// Dense per-voxel vector lengths 2J+1; strides of -1 default to these.
406 std::size_t vectorLengthJ1=J1*2+1;
407 std::size_t vectorLengthJ2=J2*2+1;
408 std::size_t vectorLengthJ=J*2+1;
411 if ( stride_in1 == -1 )
412 stride_in1 = vectorLengthJ1;
413 if ( stride_in2 == -1 )
414 stride_in2 = vectorLengthJ2;
415 if ( stride_out == -1 )
416 stride_out = vectorLengthJ;
// Parallelize over z-slices; each thread walks its slice contiguously.
418 std::size_t jumpz=shape[1]*shape[2];
420 #pragma omp parallel for num_threads(get_numCPUs())
421 for ( std::size_t z=0; z<shape[0]; z++ )
425 const std::complex<T> * current_J1=&stIn1[Z*stride_in1];
426 const std::complex<T> * current_J2=&stIn2[Z*stride_in2];
427 std::complex<T> * current_J=&stOut[Z*stride_out];
432 for ( std::size_t i=0; i<jumpz; i++ )
435 for (
int m=-J; m<=J; m++ )
437 std::complex<T> & current=current_J[m];
438 if ( clear_field ) current=T ( 0 );
// Inner CG contraction: m2 is implied by m and m1; cg[] is walked
// in the same order it was precomputed.
440 for (
int m1=-J1; m1<=J1; m1++ )
443 if ( std::abs ( m2 ) <=J2 )
445 current+= ( current_J1[m1] ) * ( current_J2[m2] ) *cg[count++];
449 current_J1+=stride_in1;
450 current_J2+=stride_in2;
451 current_J+=stride_out;
455 return STA_RESULT_SUCCESS;
// First-order spherical derivative operator: maps a rank-J field to a
// rank-(J+Jupdown) field (Jupdown in {-1,0,+1}) using central finite
// differences in x/y/z combined through a 3-column Clebsch-Gordan table
// (spherical components -1, 0, +1 of the gradient).
690 template<
typename T,
typename S>
693 std::complex<T> * stOut ,
694 const std::size_t shape[],
697 bool conjugate=
false,
698 std::complex<T> alpha= ( T ) 1.0,
699 const T v_size[]=NULL,
702 bool clear_field =
false )
705 if ( abs ( Jupdown ) >1 )
return STA_RESULT_INVALID_TENSOR_RANK;
// NOTE(review): abs(...)<0 can never be true — this check is dead code.
706 if ( abs ( J+Jupdown ) <0 )
return STA_RESULT_INVALID_TENSOR_RANK;
// 'imag' carries -i (or +i when conjugate), used to combine the x/y
// differences into spherical gradient components.
709 std::complex<T> imag=-std::complex<T> ( 0,1 );
710 if (conjugate) imag*=T( -1 );
// Voxel sizes default to 1; a supplied v_size rescales the differences.
713 voxel_size[0]=voxel_size[1]=voxel_size[2]=T ( 1 );
716 voxel_size[0]/=v_size[0];
717 voxel_size[1]/=v_size[1];
718 voxel_size[2]/=v_size[2];
725 std::size_t vectorLengthJ=2*J+1;
726 std::size_t vectorLengthJ1=2* ( J1 ) +1;
728 if ( stride_in == -1 )
729 stride_in = vectorLengthJ;
730 if ( stride_out == -1 )
731 stride_out = vectorLengthJ1;
734 std::size_t jumpz=shape[1]*shape[2];
735 std::size_t jumpy=shape[2];
// CG table: three columns (gradient components m'=-1,0,+1), indexed by
// output order M in [-J1, J1]; normalized by <1 0; J 0 | J1 0>.
738 T * CGTable=
new T[3*vectorLengthJ1];
739 T shnorm=hanalysis::clebschGordan ( 1,0,J,0,J1,0 );
740 if ( Jupdown==0 ) shnorm=1;
741 for (
int M=- ( J1 ); M<= ( J1 ); M++ )
743 CGTable[M+ ( J1 ) ] =T ( 1.0/std::sqrt ( 2.0 ) ) *hanalysis::clebschGordan ( 1,-1,J,M+1,J1,M ) /shnorm;;
744 CGTable[M+ ( J1 ) +vectorLengthJ1] =voxel_size[0]*hanalysis::clebschGordan ( 1,0,J,M,J1,M ) /shnorm;
745 CGTable[M+ ( J1 ) +2*vectorLengthJ1]=T ( 1.0/std::sqrt ( 2.0 ) ) *hanalysis::clebschGordan ( 1,1,J,M-1,J1,M ) /shnorm;
747 T * CGTable0=&CGTable[0];
749 T * CGTable1=&CGTable[vectorLengthJ1];
751 T * CGTable2=&CGTable[2*vectorLengthJ1];
754 #pragma omp parallel for num_threads(get_numCPUs())
755 for ( std::size_t z=0; z<shape[0]; z++ )
778 for ( std::size_t y=0; y<shape[1]; y++ )
792 for ( std::size_t x=0; x<shape[2]; x++ )
// Neighbor pointers for central differences; Z/Y/X hold the
// previous/current/next slice offsets, +J centers the m index.
802 derivX1=&stIn[ ( Z[1]+Y[1]+X[0] ) *stride_in]+J;
803 derivX0=&stIn[ ( Z[1]+Y[1]+X[2] ) *stride_in]+J;
805 derivY1=&stIn[ ( Z[1]+Y[0]+X[1] ) *stride_in]+J;
806 derivY0=&stIn[ ( Z[1]+Y[2]+X[1] ) *stride_in]+J;
808 derivZ1=&stIn[ ( Z[0]+Y[1]+X[1] ) *stride_in]+J;
809 derivZ0=&stIn[ ( Z[2]+Y[1]+X[1] ) *stride_in]+J;
811 std::size_t offset= ( Z[1]+Y[1]+X[1] ) *stride_out+J1;
813 for (
int M=- ( J1 ); M<= ( J1 ); M++ )
815 std::complex<T> & current=stOut[offset+M];
816 if ( clear_field ) current=T ( 0 );
817 std::complex<T> tmp=T ( 0 );
// Spherical component -1: (d/dx + i d/dy)-type combination.
819 if ( abs ( M+1 ) <=J )
822 tmp+=CGTable0[M]* ( voxel_size[2]* ( derivX0[m2]-derivX1[m2] ) +imag* ( derivY0[m2]-derivY1[m2] ) );
// Spherical component 0: pure z-difference.
826 tmp+=CGTable1[M]* ( derivZ0[M]-derivZ1[M] );
// Spherical component +1: (-d/dx + i d/dy)-type combination.
828 if ( abs ( M-1 ) <=J )
831 tmp+=CGTable2[M]* ( -voxel_size[2]* ( derivX0[m2]-derivX1[m2] ) +imag* ( derivY0[m2]-derivY1[m2] ) );
839 return ( STA_RESULT_SUCCESS );
// Second-order spherical derivative operator: maps a rank-J field to a
// rank-(J+Jupdown) field with Jupdown in {-2, 0, +2} (rank change +/-1
// is rejected), combining second central differences (Dxx, Dyy, Dzz,
// Dxy, Dxz, Dyz) through a 5-column Clebsch-Gordan table for the five
// spherical components m' = -2..+2 of the rank-2 derivative.
875 template<
typename T,
typename S>
878 std::complex<T> * stOut ,
879 const std::size_t shape[],
882 bool conjugate=
false,
883 std::complex<T> alpha= ( T ) 1.0,
884 const T v_size[]=NULL,
887 bool clear_field =
false )
889 if ( abs ( Jupdown ) >2 )
return STA_RESULT_INVALID_TENSOR_RANK;
890 if ( abs ( Jupdown ) ==1 )
return STA_RESULT_INVALID_TENSOR_RANK;
// NOTE(review): abs(...)<0 can never be true — this check is dead code.
891 if ( abs ( J+Jupdown ) <0 )
return STA_RESULT_INVALID_TENSOR_RANK;
// Voxel sizes default to 1; anisotropic v_size is only warned about
// here, not yet applied to the second differences.
896 voxel_size[0]=voxel_size[1]=voxel_size[2]=T ( 1 );
899 voxel_size[0]/=v_size[0];
900 voxel_size[1]/=v_size[1];
901 voxel_size[2]/=v_size[2];
902 if (hanalysis::verbose>0)
903 printf(
"element size not considered yet: sta_derivatives2 \n");
906 std::complex<T> imag=-std::complex<T> ( 0,1 );
907 if (conjugate) imag*=T( -1 );
// Global scale sqrt(3/2) folded into alpha.
909 alpha*=T ( sqrt ( 3.0/2.0 ) );
913 int vectorLengthJ=J*2+1;
914 int vectorLengthJ1= ( J1 ) *2+1;
916 if ( stride_in == -1 )
917 stride_in = vectorLengthJ;
918 if ( stride_out == -1 )
919 stride_out = vectorLengthJ1;
921 std::size_t jumpz=shape[1]*shape[2];
922 std::size_t jumpy=shape[2];
// CG table: five columns for m' = -2,-1,0,+1,+2, indexed by M,
// normalized by <2 0; J 0 | J1 0>.
925 T * CGTable=
new T[5*vectorLengthJ1];
926 T shnorm=hanalysis::clebschGordan ( 2,0,J,0,J1,0 );
927 for (
int M=- ( J1 ); M<= ( J1 ); M++ )
929 CGTable[M+ ( J1 ) ] =hanalysis::clebschGordan ( 2,-2,J,M+2,J1,M ) /shnorm;
930 CGTable[M+ ( J1 ) +vectorLengthJ1] =hanalysis::clebschGordan ( 2,-1,J,M+1,J1,M ) /shnorm;;
931 CGTable[M+ ( J1 ) +2*vectorLengthJ1]=hanalysis::clebschGordan ( 2,0,J,M,J1,M ) /shnorm;
932 CGTable[M+ ( J1 ) +3*vectorLengthJ1]=hanalysis::clebschGordan ( 2,1,J,M-1,J1,M ) /shnorm;
933 CGTable[M+ ( J1 ) +4*vectorLengthJ1]=hanalysis::clebschGordan ( 2,2,J,M-2,J1,M ) /shnorm;
935 T * CGTable0=&CGTable[0];
937 T * CGTable1=&CGTable[vectorLengthJ1];
939 T * CGTable2=&CGTable[2*vectorLengthJ1];
941 T * CGTable3=&CGTable[3*vectorLengthJ1];
943 T * CGTable4=&CGTable[4*vectorLengthJ1];
946 #pragma omp parallel for num_threads(get_numCPUs())
947 for ( std::size_t z=0; z<shape[0]; z++ )
// NOTE(review): this 'current' is shadowed by the reference declared
// in the M-loop below (line 1079).
1002 std::complex<T> current;
1005 for ( std::size_t y=0; y<shape[1]; y++ )
1026 for ( std::size_t x=0; x<shape[2]; x++ )
// 19-point neighborhood pointers; XaYbZc addresses the voxel at
// x-offset a, y-offset b, z-offset c (1=prev, 2=center, 3=next),
// with +J centering the m index.
1042 X1Y1Z2=&stIn[ ( Z[2]+Y[1]+X[1] ) *stride_in]+J;
1044 X1Y2Z1=&stIn[ ( Z[1]+Y[2]+X[1] ) *stride_in]+J;
1045 X1Y2Z2=&stIn[ ( Z[2]+Y[2]+X[1] ) *stride_in]+J;
1046 X1Y2Z3=&stIn[ ( Z[3]+Y[2]+X[1] ) *stride_in]+J;
1048 X1Y3Z2=&stIn[ ( Z[2]+Y[3]+X[1] ) *stride_in]+J;
1052 X2Y1Z1=&stIn[ ( Z[1]+Y[1]+X[2] ) *stride_in]+J;
1053 X2Y1Z2=&stIn[ ( Z[2]+Y[1]+X[2] ) *stride_in]+J;
1054 X2Y1Z3=&stIn[ ( Z[3]+Y[1]+X[2] ) *stride_in]+J;
1055 X2Y2Z1=&stIn[ ( Z[1]+Y[2]+X[2] ) *stride_in]+J;
1056 X2Y2Z2=&stIn[ ( Z[2]+Y[2]+X[2] ) *stride_in]+J;
1057 X2Y2Z3=&stIn[ ( Z[3]+Y[2]+X[2] ) *stride_in]+J;
1058 X2Y3Z1=&stIn[ ( Z[1]+Y[3]+X[2] ) *stride_in]+J;
1059 X2Y3Z2=&stIn[ ( Z[2]+Y[3]+X[2] ) *stride_in]+J;
1060 X2Y3Z3=&stIn[ ( Z[3]+Y[3]+X[2] ) *stride_in]+J;
1064 X3Y1Z2=&stIn[ ( Z[2]+Y[1]+X[3] ) *stride_in]+J;
1066 X3Y2Z1=&stIn[ ( Z[1]+Y[2]+X[3] ) *stride_in]+J;
1067 X3Y2Z2=&stIn[ ( Z[2]+Y[2]+X[3] ) *stride_in]+J;
1068 X3Y2Z3=&stIn[ ( Z[3]+Y[2]+X[3] ) *stride_in]+J;
1070 X3Y3Z2=&stIn[ ( Z[2]+Y[3]+X[3] ) *stride_in]+J;
1074 std::size_t offset= ( Z[2]+Y[2]+X[2] ) *stride_out+J1;
1077 for (
int M=- ( J1 ); M<= ( J1 ); M++ )
1079 std::complex<T> & current=stOut[offset+M];
1080 if ( clear_field ) current=T ( 0 );
1081 std::complex<T> tmp=T ( 0 );
// m' = -2 component: (Dxx - Dyy) - 2i*Dxy.
1084 if ( abs ( M+2 ) <=J )
1087 std::complex<T> Dxx= ( X1Y2Z2[m2]- ( T ) 2*X2Y2Z2[m2]+X3Y2Z2[m2] );
1088 std::complex<T> Dyy= ( X2Y1Z2[m2]- ( T ) 2*X2Y2Z2[m2]+X2Y3Z2[m2] );
1089 std::complex<T> Dxy=- ( T ) 0.25* ( X1Y1Z2[m2]-X3Y1Z2[m2]-X1Y3Z2[m2]+X3Y3Z2[m2] );
1090 tmp+= ( T ) 0.5*CGTable0[M]* ( ( Dxx-Dyy )-imag* ( ( T ) 2.0*Dxy ) );
// m' = -1 component: Dxz - i*Dyz.
1094 if ( abs ( M+1 ) <=J )
1097 std::complex<T> Dxz= ( T ) 0.25* ( X1Y2Z1[m2]-X1Y2Z3[m2]-X3Y2Z1[m2]+X3Y2Z3[m2] );
1098 std::complex<T> Dyz=- ( T ) 0.25* ( X2Y1Z1[m2]-X2Y3Z1[m2]-X2Y1Z3[m2]+X2Y3Z3[m2] );
1099 tmp+=CGTable1[M]* ( ( Dxz )-imag* ( Dyz ) );
// m' = 0 component: traceless combination (Dxx + Dyy - 2*Dzz)/(-sqrt(6)).
1104 if ( abs ( M ) <=J )
1107 std::complex<T> Dxx= ( X1Y2Z2[m2]- ( T ) 2*X2Y2Z2[m2]+X3Y2Z2[m2] );
1108 std::complex<T> Dyy= ( X2Y1Z2[m2]- ( T ) 2*X2Y2Z2[m2]+X2Y3Z2[m2] );
1109 std::complex<T> Dzz= ( X2Y2Z1[m2]- ( T ) 2*X2Y2Z2[m2]+X2Y2Z3[m2] );
1110 const T SQRT6= ( T ) ( -1.0/std::sqrt ( 6.0 ) );
1111 tmp+=CGTable2[M]* ( ( Dxx+Dyy- ( T ) 2.0*Dzz ) * ( SQRT6 ) );
// m' = +1 component: -(Dxz + i*Dyz).
1114 if ( abs ( M-1 ) <=J )
1117 std::complex<T> Dxz= ( T ) 0.25* ( X1Y2Z1[m2]-X1Y2Z3[m2]-X3Y2Z1[m2]+X3Y2Z3[m2] );
1118 std::complex<T> Dyz=- ( T ) 0.25* ( X2Y1Z1[m2]-X2Y3Z1[m2]-X2Y1Z3[m2]+X2Y3Z3[m2] );
1119 tmp-=CGTable3[M]* ( ( Dxz ) +imag* ( Dyz ) );
// m' = +2 component: (Dxx - Dyy) + 2i*Dxy.
1123 if ( abs ( M-2 ) <=J )
1126 std::complex<T> Dxx= ( X1Y2Z2[m2]- ( T ) 2*X2Y2Z2[m2]+X3Y2Z2[m2] );
1127 std::complex<T> Dyy= ( X2Y1Z2[m2]- ( T ) 2*X2Y2Z2[m2]+X2Y3Z2[m2] );
1128 std::complex<T> Dxy=- ( T ) 0.25* ( X1Y1Z2[m2]-X3Y1Z2[m2]-X1Y3Z2[m2]+X3Y3Z2[m2] );
1129 tmp+= ( T ) 0.5*CGTable4[M]* ( ( Dxx-Dyy ) +imag* ( ( T ) 2.0*Dxy ) );
1140 return STA_RESULT_SUCCESS;
// Discrete 3D Laplacian of a single-component field, selectable stencil:
// an 18-neighbor stencil (weight -18 at the center), the classic
// 6-neighbor stencil (weight -6), or an anisotropic 6-neighbor stencil
// weighted by 1/v_size^2 per axis. Result is accumulated into stOut
// scaled by alpha.
1152 template<
typename T,
typename S>
1154 const S * stIn,std::complex<T> * stOut ,
1155 const std::size_t shape[],
1157 std::complex<T> alpha= ( T ) 1.0,
1158 const T v_size[]=NULL,
1159 bool clear_field=
false)
// Per-axis weights default to 1; voxel_weights[3] becomes the center
// weight 2*(w0+w1+w2) for the anisotropic stencil.
1173 voxel_weights[0]=voxel_weights[1]=voxel_weights[2]=voxel_weights[3]=1;
1180 if (hanalysis::verbose>0)
1181 printf (
"WARNING! element size is not considered yet!\n" );
1185 voxel_weights[0]/=v_size[0]*v_size[0];
1186 voxel_weights[1]/=v_size[1]*v_size[1];
1187 voxel_weights[2]/=v_size[2]*v_size[2];
1190 if (hanalysis::verbose>0)
1191 printf (
"v_size: [%f %f %f]\n",v_size[0],v_size[1],v_size[2] );
1194 voxel_weights[3]*=2*(voxel_weights[0]+voxel_weights[1]+voxel_weights[2]);
1197 if (hanalysis::verbose>0)
1198 printf (
"laplace_1: [%f %f %f %f]\n",voxel_weights[0],voxel_weights[1],voxel_weights[2],voxel_weights[3] );
1200 std::size_t jumpz=shape[1]*shape[2];
1201 std::size_t jumpy=shape[2];
// Variant 1: 18-neighbor stencil.
1208 #pragma omp parallel for num_threads(get_numCPUs())
1209 for ( std::size_t z=0; z<shape[0]; z++ )
1223 for ( std::size_t y=0; y<shape[1]; y++ )
1237 for ( std::size_t x=0; x<shape[2]; x++ )
1247 std::size_t offset= ( Z[1]+Y[1]+X[1] );
1248 std::complex<T> & current=stOut[offset];
1249 if ( clear_field ) current=T ( 0 );
1252 stIn[ ( Z[0]+Y[0]+X[1] ) ] +
1253 stIn[ ( Z[0]+Y[1]+X[0] ) ] +
1254 stIn[ ( Z[0]+Y[1]+X[1] ) ] +
1255 stIn[ ( Z[0]+Y[1]+X[2] ) ] +
1256 stIn[ ( Z[0]+Y[2]+X[1] ) ] +
1257 stIn[ ( Z[1]+Y[0]+X[0] ) ] +
1258 stIn[ ( Z[1]+Y[0]+X[1] ) ] +
1259 stIn[ ( Z[1]+Y[0]+X[2] ) ] +
1260 stIn[ ( Z[1]+Y[1]+X[0] ) ]-
1261 T ( 18 ) *stIn[ ( Z[1]+Y[1]+X[1] ) ] +
1262 stIn[ ( Z[1]+Y[1]+X[2] ) ] +
1263 stIn[ ( Z[1]+Y[2]+X[0] ) ] +
1264 stIn[ ( Z[1]+Y[2]+X[1] ) ] +
1265 stIn[ ( Z[1]+Y[2]+X[2] ) ] +
1266 stIn[ ( Z[2]+Y[0]+X[1] ) ] +
1267 stIn[ ( Z[2]+Y[1]+X[0] ) ] +
1268 stIn[ ( Z[2]+Y[1]+X[1] ) ] +
1269 stIn[ ( Z[2]+Y[1]+X[2] ) ] +
1270 stIn[ ( Z[2]+Y[2]+X[1] ) ]
// Variant 2: 6-neighbor stencil (isotropic).
1282 #pragma omp parallel for num_threads(get_numCPUs())
1283 for ( std::size_t z=0; z<shape[0]; z++ )
1297 for ( std::size_t y=0; y<shape[1]; y++ )
1311 for ( std::size_t x=0; x<shape[2]; x++ )
1322 std::complex<T> & current=stOut[Z[1]+Y[1]+X[1]];
1324 if ( clear_field ) current=T ( 0 );
1327 stIn[ ( Z[0]+Y[1]+X[1] ) ] +
1328 stIn[ ( Z[1]+Y[0]+X[1] ) ] +
1329 stIn[ ( Z[1]+Y[1]+X[0] ) ] -
1330 T ( 6 ) *stIn[ ( Z[1]+Y[1]+X[1] ) ] +
1331 stIn[ ( Z[1]+Y[1]+X[2] ) ] +
1332 stIn[ ( Z[1]+Y[2]+X[1] ) ] +
1333 stIn[ ( Z[2]+Y[1]+X[1] ) ]
// Variant 3: 6-neighbor stencil with per-axis anisotropic weights.
1347 #pragma omp parallel for num_threads(get_numCPUs())
1348 for ( std::size_t z=0; z<shape[0]; z++ )
1362 for ( std::size_t y=0; y<shape[1]; y++ )
1376 for ( std::size_t x=0; x<shape[2]; x++ )
1387 std::complex<T> & current=stOut[Z[1]+Y[1]+X[1]];
1389 if ( clear_field ) current=T ( 0 );
1392 voxel_weights[0]*stIn[ ( Z[0]+Y[1]+X[1] ) ] +
1393 voxel_weights[1]*stIn[ ( Z[1]+Y[0]+X[1] ) ] +
1394 voxel_weights[2]*stIn[ ( Z[1]+Y[1]+X[0] ) ] -
1395 voxel_weights[3]*stIn[ ( Z[1]+Y[1]+X[1] ) ] +
1396 voxel_weights[2]*stIn[ ( Z[1]+Y[1]+X[2] ) ] +
1397 voxel_weights[1]*stIn[ ( Z[1]+Y[2]+X[1] ) ] +
1398 voxel_weights[0]*stIn[ ( Z[2]+Y[1]+X[1] ) ]
// NOTE(review): message typo "unsoported" — the sibling routine below
// spells it "unsupported".
1409 printf (
"unsoported operator\n" );
1411 return STA_RESULT_SUCCESS;
// Discrete 3D Laplacian of a multi-component (strided) complex field:
// same stencil choices as the single-component version, applied per
// component j in [0, dim) with input/output strides.
1418 template<
typename T,
typename S>
1420 const S * stIn,std::complex<T> * stOut ,
1421 const std::size_t shape[],
1424 std::complex<T> alpha=1,
1425 const T v_size[]=NULL,
1427 int stride_out = -1,
1428 bool clear_field=
false)
1434 if (hanalysis::verbose>0)
1435 printf (
"WARNING! element size is not considered yet!\n" );
// Per-axis weights default to 1; voxel_weights[3] becomes the center
// weight 2*(w0+w1+w2) for the anisotropic stencil.
1443 voxel_weights[0]=voxel_weights[1]=voxel_weights[2]=voxel_weights[3]=1;
1456 if (hanalysis::verbose>0)
1457 printf (
"WARNING! element size is not considered yet!\n" );
1461 voxel_weights[0]/=v_size[0]*v_size[0];
1462 voxel_weights[1]/=v_size[1]*v_size[1];
1463 voxel_weights[2]/=v_size[2]*v_size[2];
1466 if (hanalysis::verbose>0)
1467 printf (
"v_size: [%f %f %f]\n",v_size[0],v_size[1],v_size[2] );
1470 voxel_weights[3]*=2*(voxel_weights[0]+voxel_weights[1]+voxel_weights[2]);
1473 if (hanalysis::verbose>0)
1474 printf (
"laplace_C: [%f %f %f %f]\n",voxel_weights[0],voxel_weights[1],voxel_weights[2],voxel_weights[3] );
1478 if ( stride_in == -1 )
1480 if ( stride_out == -1 )
1484 std::size_t jumpz=shape[1]*shape[2];
1485 std::size_t jumpy=shape[2];
// Variant 1: 18-neighbor stencil, per component j.
1492 #pragma omp parallel for num_threads(get_numCPUs())
1493 for ( std::size_t z=0; z<shape[0]; z++ )
1507 for ( std::size_t y=0; y<shape[1]; y++ )
1521 for ( std::size_t x=0; x<shape[2]; x++ )
1531 for (
int j=0; j<dim; j++ )
1533 std::size_t offset= ( Z[1]+Y[1]+X[1] ) *stride_out+j;
1534 std::complex<T> & current=stOut[offset];
1535 if ( clear_field ) current=T ( 0 );
1538 stIn[ ( Z[0]+Y[0]+X[1] ) *stride_in+j] +
1539 stIn[ ( Z[0]+Y[1]+X[0] ) *stride_in+j] +
1540 stIn[ ( Z[0]+Y[1]+X[1] ) *stride_in+j] +
1541 stIn[ ( Z[0]+Y[1]+X[2] ) *stride_in+j] +
1542 stIn[ ( Z[0]+Y[2]+X[1] ) *stride_in+j] +
1543 stIn[ ( Z[1]+Y[0]+X[0] ) *stride_in+j] +
1544 stIn[ ( Z[1]+Y[0]+X[1] ) *stride_in+j] +
1545 stIn[ ( Z[1]+Y[0]+X[2] ) *stride_in+j] +
1546 stIn[ ( Z[1]+Y[1]+X[0] ) *stride_in+j]-
1547 T ( 18 ) *stIn[ ( Z[1]+Y[1]+X[1] ) *stride_in+j] +
1548 stIn[ ( Z[1]+Y[1]+X[2] ) *stride_in+j] +
1549 stIn[ ( Z[1]+Y[2]+X[0] ) *stride_in+j] +
1550 stIn[ ( Z[1]+Y[2]+X[1] ) *stride_in+j] +
1551 stIn[ ( Z[1]+Y[2]+X[2] ) *stride_in+j]+
1552 stIn[ ( Z[2]+Y[0]+X[1] ) *stride_in+j] +
1553 stIn[ ( Z[2]+Y[1]+X[0] ) *stride_in+j] +
1554 stIn[ ( Z[2]+Y[1]+X[1] ) *stride_in+j] +
1555 stIn[ ( Z[2]+Y[1]+X[2] ) *stride_in+j] +
1556 stIn[ ( Z[2]+Y[2]+X[1] ) *stride_in+j]
// Variant 2: weighted 6-neighbor stencil, per component j.
1568 #pragma omp parallel for num_threads(get_numCPUs())
1569 for ( std::size_t z=0; z<shape[0]; z++ )
1585 for ( std::size_t y=0; y<shape[1]; y++ )
1599 for ( std::size_t x=0; x<shape[2]; x++ )
1609 for (
int j=0; j<dim; j++ )
1612 std::size_t offset= ( Z[1]+Y[1]+X[1] ) *stride_out+j;
1613 std::complex<T> & current=stOut[offset];
1614 if ( clear_field ) current=T ( 0 );
1627 voxel_weights[0]*stIn[ ( Z[0]+Y[1]+X[1] ) *stride_in+j] +
1628 voxel_weights[1]*stIn[ ( Z[1]+Y[0]+X[1] ) *stride_in+j] +
1629 voxel_weights[2]*stIn[ ( Z[1]+Y[1]+X[0] ) *stride_in+j] -
1630 voxel_weights[3]*stIn[ ( Z[1]+Y[1]+X[1] ) *stride_in+j] +
1631 voxel_weights[2]*stIn[ ( Z[1]+Y[1]+X[2] ) *stride_in+j] +
1632 voxel_weights[1]*stIn[ ( Z[1]+Y[2]+X[1] ) *stride_in+j] +
1633 voxel_weights[0]*stIn[ ( Z[2]+Y[1]+X[1] ) *stride_in+j]
1648 printf (
"unsupported operator\n" );
1654 return STA_RESULT_SUCCESS;
// Discrete 3D Laplacian over a complex field reinterpreted as
// interleaved reals (stIn_r/stOut_r), walking the component dimension
// with post-incremented neighbor pointers instead of index arithmetic.
1660 template<
typename T>
1662 const std::complex<T> * stIn,
1663 std::complex<T> * stOut ,
1664 const std::size_t shape[],
1668 const T v_size[]=NULL,
1670 int stride_out = -1,
1671 bool clear_field=
false)
// Per-axis weights default to 1; voxel_weights[3] becomes the center
// weight 2*(w0+w1+w2) for the anisotropic stencil.
1684 voxel_weights[0]=voxel_weights[1]=voxel_weights[2]=voxel_weights[3]=1;
1698 if (hanalysis::verbose>0)
1699 printf (
"WARNING! element size is not considered yet!\n" );
1703 voxel_weights[0]/=v_size[0]*v_size[0];
1704 voxel_weights[1]/=v_size[1]*v_size[1];
1705 voxel_weights[2]/=v_size[2]*v_size[2];
1707 if (hanalysis::verbose>0)
1708 printf (
"v_size: [%f %f %f]\n",v_size[0],v_size[1],v_size[2] );
1711 voxel_weights[3]*=2*(voxel_weights[0]+voxel_weights[1]+voxel_weights[2]);
1714 if (hanalysis::verbose>0)
1715 printf (
"laplace_R: [%f %f %f %f]\n",voxel_weights[0],voxel_weights[1],voxel_weights[2],voxel_weights[3] );
1720 if ( stride_in == -1 )
1722 if ( stride_out == -1 )
1726 std::size_t jumpz=shape[1]*shape[2];
1727 std::size_t jumpy=shape[2];
// Reinterpret the complex buffers as raw real arrays.
1733 const T * stIn_r=(
const T*) stIn;
1734 T * stOut_r=( T*) stOut;
// Variant 1: 18-neighbor stencil; pXYZ pointers name the neighbor by
// its z/y/x offset digits (0=prev, 1=center, 2=next).
1741 #pragma omp parallel for num_threads(get_numCPUs())
1742 for ( std::size_t z=0; z<shape[0]; z++ )
1782 for ( std::size_t y=0; y<shape[1]; y++ )
1796 for ( std::size_t x=0; x<shape[2]; x++ )
1830 p001=stIn_r+ ( Z[0]+Y[0]+X[1] ) *stride_in;
1831 p010=stIn_r+ ( Z[0]+Y[1]+X[0] ) *stride_in;
1832 p011=stIn_r+ ( Z[0]+Y[1]+X[1] ) *stride_in;
1833 p012=stIn_r+ ( Z[0]+Y[1]+X[2] ) *stride_in;
1834 p021=stIn_r+ ( Z[0]+Y[2]+X[1] ) *stride_in;
1835 p100=stIn_r+ ( Z[1]+Y[0]+X[0] ) *stride_in;
1836 p101=stIn_r+ ( Z[1]+Y[0]+X[1] ) *stride_in;
1837 p102=stIn_r+ ( Z[1]+Y[0]+X[2] ) *stride_in;
1838 p110=stIn_r+ ( Z[1]+Y[1]+X[0] ) *stride_in;
1840 p111=stIn_r+ ( Z[1]+Y[1]+X[1] ) *stride_in;
1842 p112=stIn_r+ ( Z[1]+Y[1]+X[2] ) *stride_in;
1843 p120=stIn_r+ ( Z[1]+Y[2]+X[0] ) *stride_in;
1844 p121=stIn_r+ ( Z[1]+Y[2]+X[1] ) *stride_in;
1845 p122=stIn_r+ ( Z[1]+Y[2]+X[2] ) *stride_in;
1846 p201=stIn_r+ ( Z[2]+Y[0]+X[1] ) *stride_in;
1847 p210=stIn_r+ ( Z[2]+Y[1]+X[0] ) *stride_in;
1848 p211=stIn_r+ ( Z[2]+Y[1]+X[1] ) *stride_in;
1849 p212=stIn_r+ ( Z[2]+Y[1]+X[2] ) *stride_in;
1850 p221=stIn_r+ ( Z[2]+Y[2]+X[1] ) *stride_in;
1853 T * current=stOut_r+( Z[1]+Y[1]+X[1] ) *stride_out;
// Walk all dim real components, advancing every pointer in lockstep.
1855 for (
int j=0; j<dim; j++ )
1858 if ( clear_field ) (*current)=T ( 0 );
1860 (*current++)+=alpha* (
1870 T ( 18 ) * (*p111++) +
// Variant 2: weighted 6-neighbor stencil, same pointer-walk scheme.
1921 #pragma omp parallel for num_threads(get_numCPUs())
1922 for ( std::size_t z=0; z<shape[0]; z++ )
1947 for ( std::size_t y=0; y<shape[1]; y++ )
1961 for ( std::size_t x=0; x<shape[2]; x++ )
1971 p011=stIn_r+ ( Z[0]+Y[1]+X[1] ) *stride_in;
1972 p101=stIn_r+ ( Z[1]+Y[0]+X[1] ) *stride_in;
1973 p110=stIn_r+ ( Z[1]+Y[1]+X[0] ) *stride_in;
1974 p111=stIn_r+ ( Z[1]+Y[1]+X[1] ) *stride_in;
1975 p112=stIn_r+ ( Z[1]+Y[1]+X[2] ) *stride_in;
1976 p121=stIn_r+ ( Z[1]+Y[2]+X[1] ) *stride_in;
1977 p211=stIn_r+ ( Z[2]+Y[1]+X[1] ) *stride_in;
1987 T * current=stOut_r+( Z[1]+Y[1]+X[1] ) *stride_out;
1989 for (
int j=0; j<dim; j++ )
1991 if ( clear_field ) (*current)=T ( 0 );
2003 (*current++)+=alpha* (
2004 voxel_weights[0]*(*p011++) +
2005 voxel_weights[1]*(*p101++) +
2006 voxel_weights[2]*(*p110++) -
2007 voxel_weights[3]*(*p111++) +
2008 voxel_weights[2]*(*p112++) +
2009 voxel_weights[1]*(*p121++) +
2010 voxel_weights[0]*(*p211++)
2042 printf (
"unsupported operator\n" );
2048 return STA_RESULT_SUCCESS;
2063 #ifdef _STA_LINK_FFTW
// Double-precision FFT wrapper around FFTW's advanced interface
// (fftw_plan_many_dft): transforms numComponents interleaved complex
// fields in one plan; direction chosen by 'forward'. On Linux, plan
// wisdom is cached in a per-host file under $HOME.
2068 STA_RESULT fft (
const std::complex<double> * IN,
2069 std::complex<double> * OUT,
2070 int shape[],
int numComponents,
2072 int flag=FFTW_ESTIMATE )
2074 #ifdef _STA_FFT_MULTI_THREAD
2075 fftw_init_threads();
2076 fftw_plan_with_nthreads ( get_numCPUs() );
2077 if ( verbose>0 ) printf (
"FFTW with %d threads \n",get_numCPUs() );
2079 if ( verbose>0 ) printf (
"FFTW is single threaded\n" );
// Advanced-interface parameters: components are interleaved, so the
// per-transform stride equals numComponents.
2087 int howmany=numComponents;
2088 fftw_complex * in = ( fftw_complex * ) IN;
2090 int istride=numComponents;
2092 fftw_complex * out = ( fftw_complex * ) OUT;
2093 int * onembed=inembed;
2095 int ostride=istride;
2096 int sign=FFTW_FORWARD;
2097 if ( !forward ) sign=FFTW_BACKWARD;
2098 unsigned flags=flag | FFTW_PRESERVE_INPUT;
// Wisdom cache: ~/.stensor_mywisdom_<hostname>.wisdom.
// NOTE(review): getenv("HOME") may return NULL, which would make the
// std::string constructor crash — worth guarding.
2112 #if defined (__linux__) && ! defined (_STA_MWFFTW)
2115 gethostname ( buffer,255 );
2117 s=std::string ( getenv (
"HOME" ) ) +std::string (
"/.stensor_mywisdom_" ) +std::string ( buffer ) +
".wisdom";
2123 ifp = fopen ( s.c_str(),
"r" );
2126 if ( 0==fftw_import_wisdom_from_file ( ifp ) )
2127 printf (
"Error reading wisdom file: %s!\n",s.c_str() );
2130 else printf (
"Wisdom file does not exist!\n" );
2134 fftw_plan plan=fftw_plan_many_dft ( rank,n,howmany,in,inembed,istride, idist,out,onembed, ostride, odist, sign, flags | FFTW_PRESERVE_INPUT );
2137 printf (
"no plan\n" );
2138 return STA_RESULT_FAILED;
2141 fftw_execute_dft ( plan,in, out );
// Re-export (possibly updated) wisdom after planning/execution.
2144 #if defined (__linux__) && ! defined (_STA_MWFFTW)
2146 ifp = fopen ( s.c_str(),
"w" );
2149 fftw_export_wisdom_to_file ( ifp );
2152 else printf (
"Error creating file!\n" );
2156 fftw_destroy_plan ( plan );
2158 return STA_RESULT_SUCCESS;
// Single-precision counterpart of the FFT wrapper above, using the
// fftwf_* API and a separate "_single" wisdom cache file.
2166 STA_RESULT fft (
const std::complex<float> * IN,
2167 std::complex<float> * OUT,
2171 int flag=FFTW_ESTIMATE )
2173 #ifdef _STA_FFT_MULTI_THREAD
2174 fftwf_init_threads();
2175 fftwf_plan_with_nthreads ( get_numCPUs() );
2176 if ( verbose>0 ) printf (
"FFTW with %d threads \n",get_numCPUs() );
2178 if ( verbose>0 ) printf (
"FFTW is single threaded\n" );
// Components are interleaved: per-transform stride = numComponents.
2183 int howmany=numComponents;
2184 fftwf_complex * in = ( fftwf_complex * ) IN;
2186 int istride=numComponents;
2188 fftwf_complex * out = ( fftwf_complex * ) OUT;
2189 int * onembed=inembed;
2191 int ostride=istride;
2192 int sign=FFTW_FORWARD;
2193 if ( !forward ) sign=FFTW_BACKWARD;
2194 unsigned flags=flag | FFTW_PRESERVE_INPUT;
// Wisdom cache: ~/.stensor_mywisdom_<hostname>_single.wisdom.
// NOTE(review): getenv("HOME") may return NULL — worth guarding.
2198 #if defined (__linux__) && ! defined (_STA_MWFFTW)
2200 gethostname ( buffer,255 );
2202 s=std::string ( getenv (
"HOME" ) ) +std::string (
"/.stensor_mywisdom_" ) +std::string ( buffer ) +
"_single.wisdom";
2209 ifp = fopen ( s.c_str(),
"r" );
2212 if ( 0==fftwf_import_wisdom_from_file ( ifp ) )
2213 printf (
"Error reading wisdom file: %s!\n",s.c_str() );
2216 else printf (
"Wisdom file does not exist!\n" );
2220 fftwf_plan plan=fftwf_plan_many_dft ( rank,n,howmany,in,inembed,istride, idist,out,onembed, ostride, odist, sign, flags | FFTW_PRESERVE_INPUT );
2223 printf (
"no plan\n" );
2224 return STA_RESULT_FAILED;
2227 fftwf_execute_dft ( plan,in, out );
2231 #if defined (__linux__) && ! defined (_STA_MWFFTW)
2233 ifp = fopen ( s.c_str(),
"w" );
2236 fftwf_export_wisdom_to_file ( ifp );
2239 else printf (
"Error creating file!\n" );
2243 fftwf_destroy_plan ( plan );
2244 return STA_RESULT_SUCCESS;
2250 #if ! defined (_STA_CUDA)
2252 #include "matlab_fft.icc"
// Precomputes the Clebsch-Gordan table for the real-storage spherical
// tensor transform (ranks J, L -> rank j related output): two passes,
// first counting entries for m in [-rank, 0] (conjugate symmetry makes
// the positive-m half redundant), then filling the newly allocated
// array (caller owns / must delete[]). Each entry carries the phase
// (-1)^(L+n) and the overall scale 'norm'.
2288 template<
typename T>
2289 T * sta_th_precomputeCGcoefficients_R(
int J,
int L,
int j, T norm)
2291 std::size_t count=0;
// Pass 1: count valid (m, M) combinations.
2294 for (
int m=-rank; m<=0; m++)
2296 for (
int M=-J; M<=J; M++)
2309 T * cg=
new T[count];
// Pass 2: fill in identical iteration order.
2311 for (
int m=-rank; m<=0; m++)
2313 for (
int M=-J; M<=J; M++)
2320 double sign=((L+n) & 1) ? -1.0 : 1.0;
2321 cg[count++]=sign*norm*(T)hanalysis::clebschGordan(L,-n,J,M,rank,m);
// Real-storage spherical tensor transform: contracts an array of J+1
// input fields (stIn[0..J]) against the precomputed CG table into one
// output field, exploiting the conjugate symmetry m <= 0 so only the
// non-positive-m half of each per-voxel vector is stored (interleaved
// real/imaginary parts accessed through raw T pointers).
2337 template<
typename T>
2339 const std::complex<T> ** stIn,
2340 std::complex<T> * stOut ,
2341 const std::size_t shape[],
2347 int stride_out = -1,
2348 bool clear_field=
false)
2351 return STA_RESULT_INVALID_TENSOR_RANK;
2353 return STA_RESULT_INVALID_TENSOR_RANK;
2355 return STA_RESULT_INVALID_TENSOR_RANK;
2366 printf(
"result in inv\n");
2372 T * Cg = sta_th_precomputeCGcoefficients_R<T> ( J,L,j,alpha);
// Real-storage vector lengths: only m<=0 components are kept.
2374 std::size_t vectorLengthIn= (2* L+1 );
2375 std::size_t vectorLengthOut= ( j+L+1 );
2377 if ( stride_in == -1 )
2378 stride_in = vectorLengthIn;
2379 if ( stride_out == -1 )
2380 stride_out = vectorLengthOut;
// Sanity check on the rank relations.
// NOTE(review): leftover German debug output below ("got here despite
// the test??") — should be removed or demoted to verbose-only.
2388 if (!((rankOut>=0)&&(std::abs(j)<=J)&&(J<=j+2*L)&&(L>=0)))
2390 printf(
"trotz test bis hierher??");
2391 return STA_RESULT_FAILED;
2408 int rankIn_times_2=rankIn*2;
2409 int rankOut_times_2=rankOut*2;
2411 std::size_t jumpz=shape[1]*shape[2];
2413 T * stOutR= ( T * ) stOut;
2415 #pragma omp parallel for num_threads(get_numCPUs())
2416 for ( std::size_t z=0; z<shape[0]; z++ )
// Per-thread pointer array into the J+1 input fields, positioned at
// the m=0 (center) component of this z-slice.
2421 const T ** current_In =
new const T *[J+1];
2422 for (
int a=0; a<=J; a++)
2423 current_In[a]=(
const T * )stIn[a]+ ( Z*stride_in+rankIn_times_2 );
2425 T * current_Out=stOutR+ ( Z*stride_out+rankOut_times_2 );
2432 for ( std::size_t i=0; i<jumpz; i++ )
2434 std::size_t count=0;
2435 for (
int m=-rankOut; m<=0; m++)
2439 current_Out[m*2]=T ( 0 );
2440 current_Out[m*2+1]=T ( 0 );
// Contract over M; for negative storage indices the conjugate
// component is read with the appropriate sign flips.
2445 for (
int M=-J; M<=J; M++)
2460 tmp0R=current_In[-M+J][-n*2];
2461 tmp0I=-current_In[-M+J][-n*2+1];
2465 tmp0R=-current_In[-M+J][-n*2];
2466 tmp0I=current_In[-M+J][-n*2+1];
2471 tmp0R=current_In[M+J][n*2];
2472 tmp0I=current_In[M+J][n*2+1];
2474 tmp1R+=Cg[count]*tmp0R;
2475 tmp1I+=Cg[count++]*tmp0I;
2495 current_Out[m*2]+=tmp1R;
2496 current_Out[m*2+1]-=tmp1I;
// Advance all input pointers and the output pointer to the next voxel.
2499 for (
int a=0; a<=J; a++)
2500 current_In[a]+=stride_in;
2501 current_Out+=stride_out;
2508 delete [] current_In;
2511 return STA_RESULT_SUCCESS;
// Real-storage variant of the product CG precomputation: only output
// orders m in [-J, 0] are tabulated (the positive-m half follows from
// conjugate symmetry). Returns a newly allocated array (caller owns).
2526 template<
typename T>
2527 T * sta_product_precomputeCGcoefficients_R(
int J1,
int J2,
int J,
bool normalized, T fact)
2533 norm=(T)1/(T)hanalysis::clebschGordan(J1,0,J2,0,J,0);
// Pass 1: count entries.
2538 std::size_t count=0;
2539 for (
int m=-J; m<=0; m++)
2541 for (
int m1=-J1; m1<=J1; m1++)
2550 T * cg=
new T[count];
// Pass 2: fill in identical iteration order.
2552 for (
int m=-J; m<=0; m++)
2554 for (
int m1=-J1; m1<=J1; m1++)
2559 cg[count++]=norm*(T)hanalysis::clebschGordan(J1,m1,J2,m2,J,m);
// CG table for the chained triple product (J1 x J2 -> Jprod1) x J3 ->
// Jprod2 in real storage (only m in [-Jprod2, 0]). Each entry is the
// product of the two CG coefficients along the coupling chain, with
// optional normalization by both zero-projection coefficients.
2569 template<
typename T>
2570 T * sta_tripleproduct_precomputeCGcoefficients_R(
int J1,
int J2,
int J3,
int Jprod1,
int Jprod2,
bool normalized, T fact)
2576 norm=(T)1/((T)(hanalysis::clebschGordan(J1,0,J2,0,Jprod1,0)*hanalysis::clebschGordan(Jprod1,0,J3,0,Jprod2,0)));
// Pass 1: count the valid (m, m3, mL, m1, m2) combinations.
2581 std::size_t count=0;
2583 for (
int m=-Jprod2; m<=0; m++ )
2585 for (
int m3=-J3; m3<=J3; m3++ )
2588 if ( abs ( mL ) <=Jprod1 )
2590 for (
int m1=-J1; m1<=J1; m1++ )
2593 if ( abs ( m2 ) <=J2 )
2602 T * cg=
new T[count];
// Pass 2: fill in identical iteration order.
2605 for (
int m=-Jprod2; m<=0; m++ )
2607 for (
int m3=-J3; m3<=J3; m3++ )
2610 if ( abs ( mL ) <=Jprod1 )
2612 for (
int m1=-J1; m1<=J1; m1++ )
2615 if ( abs ( m2 ) <=J2 )
2617 cg[count++]=norm*(T)hanalysis::clebschGordan(J1,m1,J2,m2,Jprod1,mL)*(T)hanalysis::clebschGordan(Jprod1,mL,J3,m3,Jprod2,m);
// Voxel-wise triple spherical tensor product (J1 x J2 -> Jprod1) x J3
// -> Jprod2 in real storage: only the m <= 0 half of each per-voxel
// vector is stored, with real/imaginary parts interleaved and accessed
// through raw T pointers. Validates both coupling steps against the
// triangle condition before contracting with the precomputed CG table.
2632 template<
typename T>
2634 const std::complex<T> * stIn1,
2635 const std::complex<T> * stIn2,
2636 const std::complex<T> * stIn3,
2637 std::complex<T> * stOut ,
2638 const std::size_t shape[],
2646 int stride_in1 = -1,
2647 int stride_in2 = -1,
2648 int stride_in3 = -1,
2649 int stride_out = -1,
2650 bool clear_field=
false)
2656 if ( ( std::abs ( J1-J2 ) >Jprod1 ) || ( Jprod1>std::abs ( J1+J2 ) ) )
2657 return STA_RESULT_INVALID_PRODUCT;
2658 if ( ( ( J1+J2+Jprod1 ) %2!=0 ) && ( normalize ) )
2659 return STA_RESULT_INVALID_PRODUCT;
2661 if ( ( std::abs ( Jprod1-J3 ) >Jprod2 ) || ( Jprod2>std::abs ( Jprod1+J3 ) ) )
2662 return STA_RESULT_INVALID_PRODUCT;
2663 if ( ( ( J3+Jprod1+Jprod2 ) %2!=0 ) && ( normalize ) )
2664 return STA_RESULT_INVALID_PRODUCT;
// Track whether each partial product lands in the imaginary-valued
// representation (odd coupling sum); the result is imaginary iff
// exactly one of the two steps is.
2672 bool tmp1InIv = ( ( J1+J2+Jprod1 ) %2!=0 );
2673 bool tmp2InIv = ( ( J3+Jprod2+Jprod1 ) %2!=0 );
2674 bool resultInIv= (tmp1InIv&&(!tmp2InIv))||((!tmp1InIv)&&(tmp2InIv));
2675 if (tmp1InIv) alpha*=-1;
2678 T * Cg= sta_tripleproduct_precomputeCGcoefficients_R(J1,J2,J3,Jprod1,Jprod2,normalize,alpha);
// Real-storage per-voxel vector lengths (m <= 0 half only).
2684 std::size_t vectorLengthJ1= ( J1+1 );
2685 std::size_t vectorLengthJ2= ( J2+1 );
2686 std::size_t vectorLengthJ3= ( J3+1 );
2687 std::size_t vectorLengthJprod2= ( Jprod2+1 );
2689 if ( stride_in1 == -1 )
2690 stride_in1 = vectorLengthJ1;
2691 if ( stride_in2 == -1 )
2692 stride_in2 = vectorLengthJ2;
2693 if ( stride_in3 == -1 )
2694 stride_in3 = vectorLengthJ3;
2695 if ( stride_out == -1 )
2696 stride_out = vectorLengthJprod2;
// Offsets (in interleaved reals) to the m=0 component of each vector.
2703 int J3_times_2=J3*2;
2704 int J2_times_2=J2*2;
2705 int J1_times_2=J1*2;
2706 int J_times_2=Jprod2*2;
2709 std::size_t jumpz=shape[1]*shape[2];
2711 const T * stIn1R= (
const T * ) stIn1;
2712 const T * stIn2R= (
const T * ) stIn2;
2713 const T * stIn3R= (
const T * ) stIn3;
2714 T * stOutR= ( T * ) stOut;
2716 #pragma omp parallel for num_threads(get_numCPUs())
2717 for ( std::size_t z=0; z<shape[0]; z++ )
2721 const T * current_J1R=stIn1R+ ( Z*stride_in1+J1_times_2 );
2722 const T * current_J2R=stIn2R+ ( Z*stride_in2+J2_times_2 );
2723 const T * current_J3R=stIn3R+ ( Z*stride_in3+J3_times_2 );
2724 T * current_JR=stOutR+ ( Z*stride_out+J_times_2 );
2735 for ( std::size_t i=0; i<jumpz; i++ )
2737 std::size_t count=0;
2738 for (
int m=-Jprod2; m<=0; m++ )
2742 current_JR[m*2]=T ( 0 );
2743 current_JR[m*2+1]=T ( 0 );
// (e,f): real/imag parts of the J3 factor; positive m3 is read from
// the stored conjugate with the appropriate sign flips.
2746 for (
int m3=-J3; m3<=J3; m3++ )
2749 if ( abs ( mL ) <=Jprod1 )
2755 e=current_J3R[-m3*2];
2756 f=-current_J3R[-m3*2+1];
2760 e=-current_J3R[-m3*2];
2761 f=current_J3R[-m3*2+1];
2766 e=current_J3R[m3*2];
2767 f=current_J3R[m3*2+1];
// (a,b): the J1 factor, same conjugate-read convention.
2770 for (
int m1=-J1; m1<=J1; m1++ )
2773 if ( abs ( m2 ) <=J2 )
2779 a=current_J1R[-m1*2];
2780 b=-current_J1R[-m1*2+1];
2784 a=-current_J1R[-m1*2];
2785 b=current_J1R[-m1*2+1];
2790 a=current_J1R[m1*2];
2791 b=current_J1R[m1*2+1];
// (c,d): the J2 factor.
2799 c=current_J2R[-m2*2];
2800 d=-current_J2R[-m2*2+1];
2804 c=-current_J2R[-m2*2];
2805 d=current_J2R[-m2*2+1];
2810 c=current_J2R[m2*2];
2811 d=current_J2R[m2*2+1];
// Accumulate the triple product; the two branches correspond to
// the imaginary-valued vs real-valued result representation.
2842 current_JR[m*2]-=Cg[count]* ( a*(e*d+c*f)+b*(e*c-d*f));
2843 current_JR[m*2+1]+=Cg[count++]* ( a*(e*c-d*f)-b*(e*d+c*f));
2850 current_JR[m*2]+=Cg[count]* ( a*(e*c-d*f)-b*(e*d+c*f));
2851 current_JR[m*2+1]+=Cg[count++]* ( a*(e*d+c*f)+b*(e*c-d*f));
2869 current_J1R+=stride_in1;
2870 current_J2R+=stride_in2;
2871 current_J3R+=stride_in3;
2872 current_JR+=stride_out;
2876 return STA_RESULT_SUCCESS;
// Voxel-wise spherical tensor product J1 x J2 -> J in real storage:
// per-voxel vectors keep only the m <= 0 half with interleaved
// real/imaginary parts; positive-m factors are reconstructed from the
// stored conjugates with sign flips. An odd J1+J2+J makes the result
// live in the imaginary-valued representation (resultInIv).
3077 template<
typename T>
3079 const std::complex<T> * stIn1,
3080 const std::complex<T> * stIn2,
3081 std::complex<T> * stOut ,
3082 const std::size_t shape[],
3088 int stride_in1 = -1,
3089 int stride_in2 = -1,
3090 int stride_out = -1,
3091 bool clear_field=
false)
3094 if ( ( std::abs ( J1-J2 ) >J ) || ( J>std::abs ( J1+J2 ) ) )
3095 return STA_RESULT_INVALID_PRODUCT;
3096 if ( ( ( J1+J2+J ) %2!=0 ) && ( normalize ) )
3097 return STA_RESULT_INVALID_PRODUCT;
3105 T * Cg= sta_product_precomputeCGcoefficients_R<T> ( J1,J2, J,normalize,alpha );
3107 bool resultInIv= ( ( J1+J2+J ) %2!=0 );
// Real-storage per-voxel vector lengths (m <= 0 half only).
3109 std::size_t vectorLengthJ1= ( J1+1 );
3110 std::size_t vectorLengthJ2= ( J2+1 );
3111 std::size_t vectorLengthJ= ( J+1 );
3113 if ( stride_in1 == -1 )
3114 stride_in1 = vectorLengthJ1;
3115 if ( stride_in2 == -1 )
3116 stride_in2 = vectorLengthJ2;
3117 if ( stride_out == -1 )
3118 stride_out = vectorLengthJ;
// Offsets (in interleaved reals) to the m=0 component.
3124 int J2_times_2=J2*2;
3125 int J1_times_2=J1*2;
3129 std::size_t jumpz=shape[1]*shape[2];
3131 const T * stIn1R= (
const T * ) stIn1;
3132 const T * stIn2R= (
const T * ) stIn2;
3133 T * stOutR= ( T * ) stOut;
3135 #pragma omp parallel for num_threads(get_numCPUs())
3136 for ( std::size_t z=0; z<shape[0]; z++ )
3140 const T * current_J1R=stIn1R+ ( Z*stride_in1+J1_times_2 );
3141 const T * current_J2R=stIn2R+ ( Z*stride_in2+J2_times_2 );
3142 T * current_JR=stOutR+ ( Z*stride_out+J_times_2 );
3150 for ( std::size_t i=0; i<jumpz; i++ )
3152 std::size_t count=0;
3153 for (
int m=-J; m<=0; m++ )
3157 current_JR[m*2]=T ( 0 );
3158 current_JR[m*2+1]=T ( 0 );
// (tmp0R, tmp0I): J1 factor; positive m1 read from the stored
// conjugate with sign flips.
3219 for (
int m1=-J1; m1<=J1; m1++ )
3222 if ( abs ( m2 ) <=J2 )
3228 tmp0R=current_J1R[-m1*2];
3229 tmp0I=-current_J1R[-m1*2+1];
3233 tmp0R=-current_J1R[-m1*2];
3234 tmp0I=current_J1R[-m1*2+1];
3239 tmp0R=current_J1R[m1*2];
3240 tmp0I=current_J1R[m1*2+1];
// (tmp1R, tmp1I): J2 factor, same convention.
3246 tmp1R=current_J2R[-m2*2];
3247 tmp1I=-current_J2R[-m2*2+1];
3251 tmp1R=-current_J2R[-m2*2];
3252 tmp1I=current_J2R[-m2*2+1];
3257 tmp1R=current_J2R[m2*2];
3258 tmp1I=current_J2R[m2*2+1];
// Accumulate; first branch for the imaginary-valued representation,
// second for the real-valued one.
3264 current_JR[m*2]-=Cg[count]* ( tmp0R*tmp1I+tmp0I*tmp1R );
3265 current_JR[m*2+1]+=Cg[count++]* ( tmp0R*tmp1R-tmp0I*tmp1I );
3271 current_JR[m*2]+=Cg[count]* ( tmp0R*tmp1R-tmp0I*tmp1I );
3272 current_JR[m*2+1]+=Cg[count++]* ( tmp0R*tmp1I+tmp0I*tmp1R );
3277 current_J1R+=stride_in1;
3278 current_J2R+=stride_in2;
3279 current_JR+=stride_out;
3283 return STA_RESULT_SUCCESS;
3315 template<
typename T>
3317 const std::complex<T> * stIn1,
3318 const std::complex<T> * stIn2,
3319 std::complex<T> * stOut ,
3320 const std::size_t shape[],
3326 int stride_in1 = -1,
3327 int stride_in2 = -1,
3328 int stride_out = -1,
3329 bool clear_field=
false)
3331 if ( ( std::abs ( J1-J2 ) >J ) || ( J>std::abs ( J1+J2 ) ) )
3332 return STA_RESULT_INVALID_PRODUCT;
3333 if ( ( ( J1+J2+J ) %2!=0 ) && ( normalize ) )
3334 return STA_RESULT_INVALID_PRODUCT;
3342 T * Cg= sta_product_precomputeCGcoefficients_R<T> ( J1,J2, J,normalize,alpha );
3344 bool resultInIv= ( ( J1+J2+J ) %2!=0 );
3346 std::size_t vectorLengthJ1= ( J1+1 );
3347 std::size_t vectorLengthJ2= ( J2+1 );
3348 std::size_t vectorLengthJ= ( J+1 );
3350 if ( stride_in1 == -1 )
3351 stride_in1 = vectorLengthJ1;
3352 if ( stride_in2 == -1 )
3353 stride_in2 = vectorLengthJ2;
3354 if ( stride_out == -1 )
3355 stride_out = vectorLengthJ;
3362 std::size_t jumpz=shape[1]*shape[2];
3363 std::size_t jumpy=shape[2];
3365 const T * stIn1R= (
const T * ) stIn1;
3366 const T * stIn2R= (
const T * ) stIn2;
3367 T * stOutR= ( T * ) stOut;
3369 #pragma omp parallel for num_threads(get_numCPUs())
3370 for ( std::size_t z=0; z<shape[0]; z++ )
3374 const T * current_J1R=stIn1R+ ( Z*stride_in1+2*J1 );
3375 const T * current_J2R=stIn2R+ ( Z*stride_in2+2*J2 );
3376 T * current_JR=stOutR+ ( Z*stride_out+2*J );
3384 for ( std::size_t i=0; i<jumpz; i++ )
3387 std::size_t Y=i/jumpy;
3388 std::size_t X=i%jumpy;
3389 std::size_t mpos= ((shape[0]-z)%shape[0])*jumpz+
3390 ((shape[1]-Y)%shape[1])*jumpy+
3391 ((shape[2]-X)%shape[2]);
3393 const T * current_J1Rmirrowed=stIn1R+ (mpos*stride_in1+2*J1 );
3394 const T * current_J2Rmirrowed=stIn2R+ (mpos*stride_in2+2*J2 );
3396 std::size_t count=0;
3397 for (
int m=-J; m<=0; m++ )
3401 current_JR[m*2]=T ( 0 );
3402 current_JR[m*2+1]=T ( 0 );
3406 for (
int m1=-J1; m1<=J1; m1++ )
3409 if ( abs ( m2 ) <=J2 )
3415 tmp0R=current_J1Rmirrowed[-m1*2];
3416 tmp0I=-current_J1Rmirrowed[-m1*2+1];
3420 tmp0R=-current_J1Rmirrowed[-m1*2];
3421 tmp0I=current_J1Rmirrowed[-m1*2+1];
3426 tmp0R=current_J1R[m1*2];
3427 tmp0I=current_J1R[m1*2+1];
3433 tmp1R=current_J2Rmirrowed[-m2*2];
3434 tmp1I=-current_J2Rmirrowed[-m2*2+1];
3438 tmp1R=-current_J2Rmirrowed[-m2*2];
3439 tmp1I=current_J2Rmirrowed[-m2*2+1];
3444 tmp1R=current_J2R[m2*2];
3445 tmp1I=current_J2R[m2*2+1];
3449 current_JR[m*2]-=Cg[count]* ( tmp0R*tmp1I+tmp0I*tmp1R );
3450 current_JR[m*2+1]+=Cg[count++]* ( tmp0R*tmp1R-tmp0I*tmp1I );
3454 current_JR[m*2]+=Cg[count]* ( tmp0R*tmp1R-tmp0I*tmp1I );
3455 current_JR[m*2+1]+=Cg[count++]* ( tmp0R*tmp1I+tmp0I*tmp1R );
3460 current_J1R+=stride_in1;
3461 current_J2R+=stride_in2;
3462 current_JR+=stride_out;
3466 return STA_RESULT_SUCCESS;
3472 template<
typename T,
typename S >
3474 const std::complex<T> * stIn1,
3475 const std::complex<T> * stIn2,
3477 const std::size_t shape[],
3481 int stride_in1 = -1,
3482 int stride_in2 = -1,
3483 int stride_out = -1,
3484 bool clear_field=
false)
3487 T * Cg= sta_product_precomputeCGcoefficients_R<T> ( J,J, 0,normalize,alpha );
3489 std::size_t vectorLengthJ1= ( J+1 );
3490 std::size_t vectorLengthJ2= ( J+1 );
3491 std::size_t vectorLengthJ= ( 1 );
3493 if ( stride_in1 == -1 )
3494 stride_in1 = vectorLengthJ1;
3495 if ( stride_in2 == -1 )
3496 stride_in2 = vectorLengthJ2;
3497 if ( stride_out == -1 )
3498 stride_out = vectorLengthJ;
3504 std::size_t jumpz=shape[1]*shape[2];
3506 const T * stIn1R= (
const T * ) stIn1;
3507 const T * stIn2R= (
const T * ) stIn2;
3508 S * stOutR= ( S * ) stOut;
3510 #pragma omp parallel for num_threads(get_numCPUs())
3511 for ( std::size_t z=0; z<shape[0]; z++ )
3515 const T * current_J1R=stIn1R+ ( Z*stride_in1+2*J );
3516 const T * current_J2R=stIn2R+ ( Z*stride_in2+2*J );
3517 S * current_JR=stOutR+ ( Z*stride_out );
3519 for ( std::size_t i=0; i<jumpz; i++ )
3521 std::size_t count=0;
3525 current_JR[0]=T ( 0 );
3528 for (
int m1=-J; m1<=0; m1++ )
3532 current_JR[0]+=2*Cg[count]* ( current_J1R[m1*2]*current_J2R[m1*2]
3533 +current_J1R[m1*2+1]*current_J2R[m1*2+1] );
3536 current_JR[0]+=Cg[count]* ( current_J1R[m1*2]*current_J2R[m1*2]
3537 +current_J1R[m1*2+1]*current_J2R[m1*2+1] );
3540 current_J1R+=stride_in1;
3541 current_J2R+=stride_in2;
3542 current_JR+=stride_out;
3546 return STA_RESULT_SUCCESS;
3718 #ifdef _STA_OLD_DERIV
3719 template<
typename T,
typename S>
3722 std::complex<T> * stOut ,
3723 const std::size_t shape[],
3726 bool conjugate=
false,
3729 const T v_size[]=NULL,
3731 int stride_out = -1,
3732 bool clear_field =
false)
3736 if ( abs ( Jupdown ) >1 )
return STA_RESULT_INVALID_TENSOR_RANK;
3737 if ( abs ( J+Jupdown ) <0 )
return STA_RESULT_INVALID_TENSOR_RANK;
3739 std::complex<T> imag=-std::complex<T>(0,1);
3740 if (conjugate) imag*=T( -1 );
3743 voxel_size[0]=voxel_size[1]=voxel_size[2]=T(1);
3746 voxel_size[0]/=v_size[0];
3747 voxel_size[1]/=v_size[1];
3748 voxel_size[2]/=v_size[2];
3751 imag*=voxel_size[1];
3753 int J1=(T)(J+Jupdown);
3755 std::size_t vectorLengthJ=J+1;
3756 std::size_t vectorLengthJ1=(J1)+1;
3758 std::size_t jumpz=shape[1]*shape[2];
3759 std::size_t jumpy=shape[2];
3761 if (stride_in == -1)
3762 stride_in = vectorLengthJ;
3763 if (stride_out == -1)
3764 stride_out = vectorLengthJ1;
3767 T * CGTable=
new T[3*vectorLengthJ1];
3768 T shnorm=hanalysis::clebschGordan(1,0,J,0,J1,0);
3769 if (Jupdown==0) shnorm=1;
3771 for (
int M=-(J1); M<=(0); M++)
3773 CGTable[M+(J1)] =T(1.0/std::sqrt(2.0))*hanalysis::clebschGordan(1,-1,J,M+1,J1,M)/shnorm;;
3774 CGTable[M+(J1)+vectorLengthJ1] =voxel_size[0]*hanalysis::clebschGordan(1,0,J,M,J1,M)/shnorm;
3775 CGTable[M+(J1)+2*vectorLengthJ1]=T(1.0/std::sqrt(2.0))*hanalysis::clebschGordan(1,1,J,M-1,J1,M)/shnorm;
3777 T * CGTable0=&CGTable[0];
3779 T * CGTable1=&CGTable[vectorLengthJ1];
3781 T * CGTable2=&CGTable[2*vectorLengthJ1];
3784 #pragma omp parallel for num_threads(get_numCPUs())
3785 for (std::size_t z=0; z<shape[0]; z++)
3809 for (std::size_t y=0; y<shape[1]; y++)
3823 for (std::size_t x=0; x<shape[2]; x++)
3833 derivX1=&stIn[(Z[1]+Y[1]+X[0])*stride_in]+J;
3834 derivX0=&stIn[(Z[1]+Y[1]+X[2])*stride_in]+J;
3836 derivY1=&stIn[(Z[1]+Y[0]+X[1])*stride_in]+J;
3837 derivY0=&stIn[(Z[1]+Y[2]+X[1])*stride_in]+J;
3839 derivZ1=&stIn[(Z[0]+Y[1]+X[1])*stride_in]+J;
3840 derivZ0=&stIn[(Z[2]+Y[1]+X[1])*stride_in]+J;
3842 std::size_t offset=(Z[1]+Y[1]+X[1])*stride_out+J1;
3844 for (
int M=-(J1); M<=(0); M++)
3847 std::complex<T> & current=stOut[offset+M];
3848 if ( clear_field ) current=T ( 0 );
3849 std::complex<T> tmp=T ( 0 );
3857 tmp-=CGTable0[M]*(voxel_size[2]*std::conj(derivX0[-m2]-derivX1[-m2])+imag*std::conj(derivY0[-m2]-derivY1[-m2]));
3859 tmp+=CGTable0[M]*(voxel_size[2]*(derivX0[m2]-derivX1[m2])+imag*(derivY0[m2]-derivY1[m2]));
3863 tmp+=CGTable1[M]*(derivZ0[M]-derivZ1[M]);
3868 tmp+=CGTable2[M]*(-voxel_size[2]*(derivX0[m2]-derivX1[m2])+imag*(derivY0[m2]-derivY1[m2]));
3878 return (STA_RESULT_SUCCESS);
3883 template<
typename T,
typename S>
3886 std::complex<T> * stOut ,
3887 const std::size_t shape[],
3890 bool conjugate=
false,
3892 const T v_size[]=NULL,
3894 int stride_out = -1,
3895 bool clear_field =
false )
3897 if ( abs ( Jupdown ) >2 )
return STA_RESULT_INVALID_TENSOR_RANK;
3898 if ( abs ( Jupdown ) ==1 )
return STA_RESULT_INVALID_TENSOR_RANK;
3899 if ( abs ( J+Jupdown ) <0 )
return STA_RESULT_INVALID_TENSOR_RANK;
3903 if (hanalysis::verbose>0)
3904 printf(
"WARNING! element size is not considered yet!\n");
3907 std::complex<T> imag=-std::complex<T> ( 0,1 );
3908 if (conjugate) imag*=T( -1 );
3910 alpha*=T(sqrt(3.0/2.0));
3915 int vectorLengthJ=J+1;
3916 int vectorLengthJ1=(J1)+1;
3918 if (stride_in == -1)
3919 stride_in = vectorLengthJ;
3920 if (stride_out == -1)
3921 stride_out = vectorLengthJ1;
3924 std::size_t jumpz=shape[1]*shape[2];
3925 std::size_t jumpy=shape[2];
3929 T * CGTable=
new T[5*vectorLengthJ1];
3930 T shnorm=hanalysis::clebschGordan(2,0,J,0,J1,0);
3933 for (
int M=-(J1); M<=(0); M++)
3935 CGTable[M+(J1)] =hanalysis::clebschGordan(2,-2,J,M+2,J1,M)/shnorm;
3936 CGTable[M+(J1)+vectorLengthJ1] =hanalysis::clebschGordan(2,-1,J,M+1,J1,M)/shnorm;;
3937 CGTable[M+(J1)+2*vectorLengthJ1]=hanalysis::clebschGordan(2,0,J,M,J1,M)/shnorm;
3938 CGTable[M+(J1)+3*vectorLengthJ1]=hanalysis::clebschGordan(2,1,J,M-1,J1,M)/shnorm;
3939 CGTable[M+(J1)+4*vectorLengthJ1]=hanalysis::clebschGordan(2,2,J,M-2,J1,M)/shnorm;
3941 T * CGTable0=&CGTable[0];
3943 T * CGTable1=&CGTable[vectorLengthJ1];
3945 T * CGTable2=&CGTable[2*vectorLengthJ1];
3947 T * CGTable3=&CGTable[3*vectorLengthJ1];
3949 T * CGTable4=&CGTable[4*vectorLengthJ1];
3952 #pragma omp parallel for num_threads(get_numCPUs())
3953 for (std::size_t z=0; z<shape[0]; z++)
4009 for (std::size_t y=0; y<shape[1]; y++)
4030 for (std::size_t x=0; x<shape[2]; x++)
4046 X1Y1Z2=&stIn[(Z[2]+Y[1]+X[1])*stride_in]+J;
4048 X1Y2Z1=&stIn[(Z[1]+Y[2]+X[1])*stride_in]+J;
4049 X1Y2Z2=&stIn[(Z[2]+Y[2]+X[1])*stride_in]+J;
4050 X1Y2Z3=&stIn[(Z[3]+Y[2]+X[1])*stride_in]+J;
4052 X1Y3Z2=&stIn[(Z[2]+Y[3]+X[1])*stride_in]+J;
4056 X2Y1Z1=&stIn[(Z[1]+Y[1]+X[2])*stride_in]+J;
4057 X2Y1Z2=&stIn[(Z[2]+Y[1]+X[2])*stride_in]+J;
4058 X2Y1Z3=&stIn[(Z[3]+Y[1]+X[2])*stride_in]+J;
4059 X2Y2Z1=&stIn[(Z[1]+Y[2]+X[2])*stride_in]+J;
4060 X2Y2Z2=&stIn[(Z[2]+Y[2]+X[2])*stride_in]+J;
4061 X2Y2Z3=&stIn[(Z[3]+Y[2]+X[2])*stride_in]+J;
4062 X2Y3Z1=&stIn[(Z[1]+Y[3]+X[2])*stride_in]+J;
4063 X2Y3Z2=&stIn[(Z[2]+Y[3]+X[2])*stride_in]+J;
4064 X2Y3Z3=&stIn[(Z[3]+Y[3]+X[2])*stride_in]+J;
4068 X3Y1Z2=&stIn[(Z[2]+Y[1]+X[3])*stride_in]+J;
4070 X3Y2Z1=&stIn[(Z[1]+Y[2]+X[3])*stride_in]+J;
4071 X3Y2Z2=&stIn[(Z[2]+Y[2]+X[3])*stride_in]+J;
4072 X3Y2Z3=&stIn[(Z[3]+Y[2]+X[3])*stride_in]+J;
4074 X3Y3Z2=&stIn[(Z[2]+Y[3]+X[3])*stride_in]+J;
4078 std::size_t offset=(Z[2]+Y[2]+X[2])*stride_out+J1;
4081 for (
int M=-(J1); M<=(0); M++)
4083 std::complex<T> & current=stOut[offset+M];
4084 if ( clear_field ) current=T ( 0 );
4085 std::complex<T> ctmp=T ( 0 );
4090 std::complex<T> tmp;
4093 std::complex<T> Dxx= (X1Y2Z2[-m2]-(T)2*X2Y2Z2[-m2]+X3Y2Z2[-m2]);
4094 std::complex<T> Dyy=(X2Y1Z2[-m2]-(T)2*X2Y2Z2[-m2]+X2Y3Z2[-m2]);
4095 std::complex<T> Dxy=-(T)0.25*(X1Y1Z2[-m2]-X3Y1Z2[-m2]-X1Y3Z2[-m2]+X3Y3Z2[-m2]);
4097 if (m2%2==0) tmp=(T)0.5*CGTable0[M]*(std::conj(Dxx-Dyy )-imag*std::conj((T)2.0*Dxy));
4098 else tmp=(T)0.5*CGTable0[M]*(-std::conj(Dxx-Dyy )-imag*(-std::conj((T)2.0*Dxy)));
4102 std::complex<T> Dxx= (X1Y2Z2[m2]-(T)2*X2Y2Z2[m2]+X3Y2Z2[m2]);
4103 std::complex<T> Dyy=(X2Y1Z2[m2]-(T)2*X2Y2Z2[m2]+X2Y3Z2[m2]);
4104 std::complex<T> Dxy=-(T)0.25*(X1Y1Z2[m2]-X3Y1Z2[m2]-X1Y3Z2[m2]+X3Y3Z2[m2]);
4105 tmp=(T)0.5*CGTable0[M]*((Dxx-Dyy )-imag*((T)2.0*Dxy));
4115 std::complex<T> tmp;
4119 std::complex<T> Dxz=(T)0.25*(X1Y2Z1[-m2]-X1Y2Z3[-m2]-X3Y2Z1[-m2]+X3Y2Z3[-m2]);
4120 std::complex<T> Dyz=-(T)0.25*(X2Y1Z1[-m2]-X2Y3Z1[-m2]-X2Y1Z3[-m2]+X2Y3Z3[-m2]);
4121 if (m2%2==0) tmp=CGTable1[M]*(std::conj(Dxz )-imag*std::conj(Dyz));
4122 else tmp=CGTable1[M]*(-std::conj(Dxz )-imag*(-std::conj(Dyz)));
4125 std::complex<T> Dxz=(T)0.25*(X1Y2Z1[m2]-X1Y2Z3[m2]-X3Y2Z1[m2]+X3Y2Z3[m2]);
4126 std::complex<T> Dyz=-(T)0.25*(X2Y1Z1[m2]-X2Y3Z1[m2]-X2Y1Z3[m2]+X2Y3Z3[m2]);
4127 tmp=CGTable1[M]*((Dxz )-imag*(Dyz));
4137 std::complex<T> Dxx= (X1Y2Z2[m2]-(T)2*X2Y2Z2[m2]+X3Y2Z2[m2]);
4138 std::complex<T> Dyy=(X2Y1Z2[m2]-(T)2*X2Y2Z2[m2]+X2Y3Z2[m2]);
4139 std::complex<T> Dzz=(X2Y2Z1[m2]-(T)2*X2Y2Z2[m2]+X2Y2Z3[m2]);
4140 const T SQRT6=(T)(-1.0/std::sqrt(6.0));
4141 ctmp+=CGTable2[M]*((Dxx+Dyy-(T)2.0*Dzz)*(SQRT6));
4147 std::complex<T> Dxz=(T)0.25*(X1Y2Z1[m2]-X1Y2Z3[m2]-X3Y2Z1[m2]+X3Y2Z3[m2]);
4148 std::complex<T> Dyz=-(T)0.25*(X2Y1Z1[m2]-X2Y3Z1[m2]-X2Y1Z3[m2]+X2Y3Z3[m2]);
4149 ctmp-=CGTable3[M]*((Dxz )+imag*(Dyz));
4156 std::complex<T> Dxx= (X1Y2Z2[m2]-(T)2*X2Y2Z2[m2]+X3Y2Z2[m2]);
4157 std::complex<T> Dyy=(X2Y1Z2[m2]-(T)2*X2Y2Z2[m2]+X2Y3Z2[m2]);
4158 std::complex<T> Dxy=-(T)0.25*(X1Y1Z2[m2]-X3Y1Z2[m2]-X1Y3Z2[m2]+X3Y3Z2[m2]);
4159 ctmp+=(T)0.5*CGTable4[M]*((Dxx-Dyy )+imag*((T)2.0*Dxy));
4162 current+=ctmp*alpha;
4170 return STA_RESULT_SUCCESS;
4178 template<
typename T>
4180 const std::complex<T> * stIn,
4181 std::complex<T> * stOut ,
4182 const std::size_t shape[],
4185 bool conjugate=
false,
4187 const T v_size[]=NULL,
4189 int stride_out = -1,
4190 bool clear_field =
false)
4193 if ( abs ( Jupdown ) >1 )
return STA_RESULT_INVALID_TENSOR_RANK;
4194 if ( abs ( J+Jupdown ) <0 )
return STA_RESULT_INVALID_TENSOR_RANK;
4200 voxel_size[0]=voxel_size[1]=voxel_size[2]=T(1);
4203 voxel_size[0]/=v_size[0];
4204 voxel_size[1]/=v_size[1];
4205 voxel_size[2]/=v_size[2];
4210 if (conjugate) voxel_size[1]*=T( -1 );
4212 int J1=(T)(J+Jupdown);
4214 std::size_t vectorLengthJ=J+1;
4215 std::size_t vectorLengthJ1=(J1)+1;
4218 std::size_t jumpz=shape[1]*shape[2];
4219 std::size_t jumpy=shape[2];
4221 if (stride_in == -1)
4222 stride_in = vectorLengthJ;
4223 if (stride_out == -1)
4224 stride_out = vectorLengthJ1;
4229 T * CGTable=
new T[3*vectorLengthJ1];
4231 T shnorm=hanalysis::clebschGordan(1,0,J,0,J1,0);
4233 if (Jupdown==0) shnorm=1;
4235 for (
int M=-(J1); M<=(0); M++)
4237 CGTable[M+(J1)] =T(1.0/std::sqrt(2.0))*hanalysis::clebschGordan(1,-1,J,M+1,J1,M)/shnorm;;
4238 CGTable[M+(J1)+vectorLengthJ1] =voxel_size[0]*hanalysis::clebschGordan(1,0,J,M,J1,M)/shnorm;
4239 CGTable[M+(J1)+2*vectorLengthJ1]=T(1.0/std::sqrt(2.0))*hanalysis::clebschGordan(1,1,J,M-1,J1,M)/shnorm;
4246 T * CGTable0=&CGTable[0];
4248 T * CGTable1=&CGTable[vectorLengthJ1];
4250 T * CGTable2=&CGTable[2*vectorLengthJ1];
4254 const T * stIn_r=(
const T *)stIn;
4255 T * stOut_r=(T*)stOut;
4264 #pragma omp parallel for num_threads(get_numCPUs())
4265 for (std::size_t z=0; z<shape[0]; z++)
4289 for (std::size_t y=0; y<shape[1]; y++)
4303 for (std::size_t x=0; x<shape[2]; x++)
4313 derivX1=&stIn_r[(Z[1]+Y[1]+X[0])*stride_in]+J_times_2;
4314 derivX0=&stIn_r[(Z[1]+Y[1]+X[2])*stride_in]+J_times_2;
4316 derivY1=&stIn_r[(Z[1]+Y[0]+X[1])*stride_in]+J_times_2;
4317 derivY0=&stIn_r[(Z[1]+Y[2]+X[1])*stride_in]+J_times_2;
4319 derivZ1=&stIn_r[(Z[0]+Y[1]+X[1])*stride_in]+J_times_2;
4320 derivZ0=&stIn_r[(Z[2]+Y[1]+X[1])*stride_in]+J_times_2;
4322 T * current_r=stOut_r+(Z[1]+Y[1]+X[1])*stride_out;
4324 for (
int M=-(J1); M<=(0); M++)
4335 tmp_r-=CGTable0[M]*(voxel_size[2]*(derivX0[m2]-derivX1[m2])+voxel_size[1]*(derivY0[m2+1]-derivY1[m2+1]));
4336 tmp_i-=CGTable0[M]*(voxel_size[2]*(derivX1[m2+1]-derivX0[m2+1])+voxel_size[1]*(derivY0[m2]-derivY1[m2]));
4339 tmp_r+=CGTable0[M]*(voxel_size[2]*(derivX0[m2]-derivX1[m2])+voxel_size[1]*(derivY1[m2+1]-derivY0[m2+1]));
4340 tmp_i+=CGTable0[M]*(voxel_size[2]*(derivX0[m2+1]-derivX1[m2+1])+voxel_size[1]*(derivY0[m2]-derivY1[m2]));
4345 tmp_r+=CGTable1[M]*(derivZ0[M*2]-derivZ1[M*2]);
4346 tmp_i+=CGTable1[M]*(derivZ0[M*2+1]-derivZ1[M*2+1]);
4351 tmp_r+=CGTable2[M]*(voxel_size[2]*(derivX1[m2]-derivX0[m2])+voxel_size[1]*(derivY1[m2+1]-derivY0[m2+1]));
4352 tmp_i+=CGTable2[M]*(voxel_size[2]*(derivX1[m2+1]-derivX0[m2+1])+voxel_size[1]*(derivY0[m2]-derivY1[m2]));
4363 (*current_r)=tmp_r*alpha;
4365 (*current_r)=tmp_i*alpha;
4369 (*current_r)+=tmp_r*alpha;
4371 (*current_r)+=tmp_i*alpha;
4381 return (STA_RESULT_SUCCESS);
4387 template<
typename T>
4389 const std::complex<T> * stIn,
4390 std::complex<T> * stOut ,
4391 const std::size_t shape[],
4394 bool conjugate=
false,
4396 const T v_size[]=NULL,
4398 int stride_out = -1,
4399 bool clear_field =
false )
4401 if ( abs ( Jupdown ) >2 )
return STA_RESULT_INVALID_TENSOR_RANK;
4402 if ( abs ( Jupdown ) ==1 )
return STA_RESULT_INVALID_TENSOR_RANK;
4403 if ( abs ( J+Jupdown ) <0 )
return STA_RESULT_INVALID_TENSOR_RANK;
4412 voxel_weights[0]=voxel_weights[1]=voxel_weights[2]
4413 =voxel_weights[3]=voxel_weights[4]=voxel_weights[5]=T(1);
4417 voxel_weights[0]/=(v_size[0]*v_size[0]);
4418 voxel_weights[1]/=(v_size[1]*v_size[1]);
4419 voxel_weights[2]/=(v_size[2]*v_size[2]);
4420 voxel_weights[3]/=(v_size[0]*v_size[1]);
4421 voxel_weights[4]/=(v_size[0]*v_size[2]);
4422 voxel_weights[5]/=(v_size[1]*v_size[2]);
4427 if (conjugate) conj*=T( -1 );
4429 alpha*=T(sqrt(3.0/2.0));
4434 int vectorLengthJ=J+1;
4435 int vectorLengthJ1=(J1)+1;
4437 if (stride_in == -1)
4438 stride_in = vectorLengthJ;
4439 if (stride_out == -1)
4440 stride_out = vectorLengthJ1;
4443 std::size_t jumpz=shape[1]*shape[2];
4444 std::size_t jumpy=shape[2];
4448 T * CGTable=
new T[5*vectorLengthJ1];
4449 T shnorm=hanalysis::clebschGordan(2,0,J,0,J1,0);
4452 for (
int M=-(J1); M<=(0); M++)
4454 CGTable[M+(J1)] =hanalysis::clebschGordan(2,-2,J,M+2,J1,M)/shnorm;
4455 CGTable[M+(J1)+vectorLengthJ1] =hanalysis::clebschGordan(2,-1,J,M+1,J1,M)/shnorm;
4456 CGTable[M+(J1)+2*vectorLengthJ1]=hanalysis::clebschGordan(2,0,J,M,J1,M)/shnorm;
4457 CGTable[M+(J1)+3*vectorLengthJ1]=hanalysis::clebschGordan(2,1,J,M-1,J1,M)/shnorm;
4458 CGTable[M+(J1)+4*vectorLengthJ1]=hanalysis::clebschGordan(2,2,J,M-2,J1,M)/shnorm;
4460 T * CGTable0=&CGTable[0];
4462 T * CGTable1=&CGTable[vectorLengthJ1];
4464 T * CGTable2=&CGTable[2*vectorLengthJ1];
4466 T * CGTable3=&CGTable[3*vectorLengthJ1];
4468 T * CGTable4=&CGTable[4*vectorLengthJ1];
4473 const T * stIn_r=(
const T *)stIn;
4474 T * stOut_r=(T*)stOut;
4484 #pragma omp parallel for num_threads(get_numCPUs())
4485 for (std::size_t z=0; z<shape[0]; z++)
4532 for (std::size_t y=0; y<shape[1]; y++)
4553 for (std::size_t x=0; x<shape[2]; x++)
4569 X1Y1Z2=stIn_r+(Z[2]+Y[1]+X[1])*stride_in+J_times_2;
4571 X1Y2Z1=stIn_r+(Z[1]+Y[2]+X[1])*stride_in+J_times_2;
4572 X1Y2Z2=stIn_r+(Z[2]+Y[2]+X[1])*stride_in+J_times_2;
4573 X1Y2Z3=stIn_r+(Z[3]+Y[2]+X[1])*stride_in+J_times_2;
4575 X1Y3Z2=stIn_r+(Z[2]+Y[3]+X[1])*stride_in+J_times_2;
4579 X2Y1Z1=stIn_r+(Z[1]+Y[1]+X[2])*stride_in+J_times_2;
4580 X2Y1Z2=stIn_r+(Z[2]+Y[1]+X[2])*stride_in+J_times_2;
4581 X2Y1Z3=stIn_r+(Z[3]+Y[1]+X[2])*stride_in+J_times_2;
4582 X2Y2Z1=stIn_r+(Z[1]+Y[2]+X[2])*stride_in+J_times_2;
4583 X2Y2Z2=stIn_r+(Z[2]+Y[2]+X[2])*stride_in+J_times_2;
4584 X2Y2Z3=stIn_r+(Z[3]+Y[2]+X[2])*stride_in+J_times_2;
4585 X2Y3Z1=stIn_r+(Z[1]+Y[3]+X[2])*stride_in+J_times_2;
4586 X2Y3Z2=stIn_r+(Z[2]+Y[3]+X[2])*stride_in+J_times_2;
4587 X2Y3Z3=stIn_r+(Z[3]+Y[3]+X[2])*stride_in+J_times_2;
4591 X3Y1Z2=stIn_r+(Z[2]+Y[1]+X[3])*stride_in+J_times_2;
4593 X3Y2Z1=stIn_r+(Z[1]+Y[2]+X[3])*stride_in+J_times_2;
4594 X3Y2Z2=stIn_r+(Z[2]+Y[2]+X[3])*stride_in+J_times_2;
4595 X3Y2Z3=stIn_r+(Z[3]+Y[2]+X[3])*stride_in+J_times_2;
4597 X3Y3Z2=stIn_r+(Z[2]+Y[3]+X[3])*stride_in+J_times_2;
4602 T * current_r=stOut_r+(Z[2]+Y[2]+X[2])*stride_out;
4603 for (
int M=-(J1); M<=(0); M++)
4619 T Dxx_r= voxel_weights[2]*(X1Y2Z2[-m2]-(T)2*X2Y2Z2[-m2]+X3Y2Z2[-m2]);
4620 T Dxx_i= voxel_weights[2]*(X1Y2Z2[-m2+1]-(T)2*X2Y2Z2[-m2+1]+X3Y2Z2[-m2+1]);
4622 T Dyy_r=voxel_weights[1]*(X2Y1Z2[-m2]-(T)2*X2Y2Z2[-m2]+X2Y3Z2[-m2]);
4623 T Dyy_i=voxel_weights[1]*(X2Y1Z2[-m2+1]-(T)2*X2Y2Z2[-m2+1]+X2Y3Z2[-m2+1]);
4625 T Dxy_r=voxel_weights[5]*(X1Y1Z2[-m2]-X3Y1Z2[-m2]-X1Y3Z2[-m2]+X3Y3Z2[-m2]);
4626 T Dxy_i=voxel_weights[5]*(X1Y1Z2[-m2+1]-X3Y1Z2[-m2+1]-X1Y3Z2[-m2+1]+X3Y3Z2[-m2+1]);
4631 ctmp_r+=(T)0.5*CGTable0[M]*((Dxx_r-Dyy_r )+conj*((T)0.5*Dxy_i));
4632 ctmp_i+=(T)0.5*CGTable0[M]*((Dyy_i-Dxx_i )+conj*((T)0.5*Dxy_r));
4637 ctmp_r+=(T)0.5*CGTable0[M]*((Dyy_r-Dxx_r )-conj*((T)0.5*Dxy_i));
4638 ctmp_i+=(T)0.5*CGTable0[M]*((Dxx_i-Dyy_i )-conj*((T)0.5*Dxy_r));
4644 T Dxx_r= voxel_weights[2]*(X1Y2Z2[m2]-(T)2*X2Y2Z2[m2]+X3Y2Z2[m2]);
4645 T Dxx_i= voxel_weights[2]*(X1Y2Z2[m2+1]-(T)2*X2Y2Z2[m2+1]+X3Y2Z2[m2+1]);
4647 T Dyy_r=voxel_weights[1]*(X2Y1Z2[m2]-(T)2*X2Y2Z2[m2]+X2Y3Z2[m2]);
4648 T Dyy_i=voxel_weights[1]*(X2Y1Z2[m2+1]-(T)2*X2Y2Z2[m2+1]+X2Y3Z2[m2+1]);
4650 T Dxy_r=voxel_weights[5]*(X1Y1Z2[m2]-X3Y1Z2[m2]-X1Y3Z2[m2]+X3Y3Z2[m2]);
4651 T Dxy_i=voxel_weights[5]*(X1Y1Z2[m2+1]-X3Y1Z2[m2+1]-X1Y3Z2[m2+1]+X3Y3Z2[m2+1]);
4653 ctmp_r+=(T)0.5*CGTable0[M]*((Dxx_r-Dyy_r )-conj*((T)0.5*Dxy_i));
4654 ctmp_i+=(T)0.5*CGTable0[M]*((Dxx_i-Dyy_i )+conj*((T)0.5*Dxy_r));
4672 T Dxz_r=(T)0.25*voxel_weights[4]*(X1Y2Z1[-m2]-X1Y2Z3[-m2]-X3Y2Z1[-m2]+X3Y2Z3[-m2]);
4673 T Dxz_i=(T)0.25*voxel_weights[4]*(X1Y2Z1[-m2+1]-X1Y2Z3[-m2+1]-X3Y2Z1[-m2+1]+X3Y2Z3[-m2+1]);
4676 T Dyz_r=-(T)0.25*voxel_weights[3]*(X2Y1Z1[-m2]-X2Y3Z1[-m2]-X2Y1Z3[-m2]+X2Y3Z3[-m2]);
4677 T Dyz_i=-(T)0.25*voxel_weights[3]*(X2Y1Z1[-m2+1]-X2Y3Z1[-m2+1]-X2Y1Z3[-m2+1]+X2Y3Z3[-m2+1]);
4682 ctmp_r+=CGTable1[M]*(conj*Dyz_i-Dxz_r);
4683 ctmp_i+=CGTable1[M]*(Dxz_i+conj*Dyz_r);
4690 ctmp_r+=CGTable1[M]*(Dxz_r+conj*Dyz_i);
4691 ctmp_i+=-CGTable1[M]*(Dxz_i+conj*Dyz_r);
4696 T Dxz_r=(T)0.25*voxel_weights[4]*(X1Y2Z1[m2]-X1Y2Z3[m2]-X3Y2Z1[m2]+X3Y2Z3[m2]);
4697 T Dxz_i=(T)0.25*voxel_weights[4]*(X1Y2Z1[m2+1]-X1Y2Z3[m2+1]-X3Y2Z1[m2+1]+X3Y2Z3[m2+1]);
4700 T Dyz_r=-(T)0.25*voxel_weights[3]*(X2Y1Z1[m2]-X2Y3Z1[m2]-X2Y1Z3[m2]+X2Y3Z3[m2]);
4701 T Dyz_i=-(T)0.25*voxel_weights[3]*(X2Y1Z1[m2+1]-X2Y3Z1[m2+1]-X2Y1Z3[m2+1]+X2Y3Z3[m2+1]);
4703 ctmp_r+=CGTable1[M]*(Dxz_r +conj*Dyz_i);
4704 ctmp_i+=CGTable1[M]*(Dxz_i -conj*Dyz_r);
4716 T Dxx_r= voxel_weights[2]*(X1Y2Z2[m2]-(T)2*X2Y2Z2[m2]+X3Y2Z2[m2]);
4717 T Dxx_i= voxel_weights[2]*(X1Y2Z2[m2+1]-(T)2*X2Y2Z2[m2+1]+X3Y2Z2[m2+1]);
4719 T Dyy_r= voxel_weights[1]*(X2Y1Z2[m2]-(T)2*X2Y2Z2[m2]+X2Y3Z2[m2]);
4720 T Dyy_i= voxel_weights[1]*(X2Y1Z2[m2+1]-(T)2*X2Y2Z2[m2+1]+X2Y3Z2[m2+1]);
4722 T Dzz_r= voxel_weights[0]*(X2Y2Z1[m2]-(T)2*X2Y2Z2[m2]+X2Y2Z3[m2]);
4723 T Dzz_i= voxel_weights[0]*(X2Y2Z1[m2+1]-(T)2*X2Y2Z2[m2+1]+X2Y2Z3[m2+1]);
4725 const T SQRT6=(T)(-1.0/std::sqrt(6.0));
4727 ctmp_r+=CGTable2[M]*((Dxx_r+Dyy_r-(T)2.0*Dzz_r)*(SQRT6));
4728 ctmp_i+=CGTable2[M]*((Dxx_i+Dyy_i-(T)2.0*Dzz_i)*(SQRT6));
4736 T Dxz_r=voxel_weights[4]*(T)0.25*(X1Y2Z1[m2]-X1Y2Z3[m2]-X3Y2Z1[m2]+X3Y2Z3[m2]);
4737 T Dxz_i=voxel_weights[4]*(T)0.25*(X1Y2Z1[m2+1]-X1Y2Z3[m2+1]-X3Y2Z1[m2+1]+X3Y2Z3[m2+1]);
4739 T Dyz_r=-(T)0.25*voxel_weights[3]*(X2Y1Z1[m2]-X2Y3Z1[m2]-X2Y1Z3[m2]+X2Y3Z3[m2]);
4740 T Dyz_i=-(T)0.25*voxel_weights[3]*(X2Y1Z1[m2+1]-X2Y3Z1[m2+1]-X2Y1Z3[m2+1]+X2Y3Z3[m2+1]);
4743 ctmp_r-=CGTable3[M]*(Dxz_r-conj*Dyz_i);
4744 ctmp_i-=CGTable3[M]*(Dxz_i+conj*Dyz_r);
4753 T Dxx_r= voxel_weights[2]*(X1Y2Z2[m2]-(T)2*X2Y2Z2[m2]+X3Y2Z2[m2]);
4754 T Dxx_i= voxel_weights[2]*(X1Y2Z2[m2+1]-(T)2*X2Y2Z2[m2+1]+X3Y2Z2[m2+1]);
4756 T Dyy_r= voxel_weights[1]*(X2Y1Z2[m2]-(T)2*X2Y2Z2[m2]+X2Y3Z2[m2]);
4757 T Dyy_i= voxel_weights[1]*(X2Y1Z2[m2+1]-(T)2*X2Y2Z2[m2+1]+X2Y3Z2[m2+1]);
4760 T Dxy_r= voxel_weights[5]*(X1Y1Z2[m2]-X3Y1Z2[m2]-X1Y3Z2[m2]+X3Y3Z2[m2]);
4761 T Dxy_i= voxel_weights[5]*(X1Y1Z2[m2+1]-X3Y1Z2[m2+1]-X1Y3Z2[m2+1]+X3Y3Z2[m2+1]);
4763 ctmp_r+=(T)0.5*CGTable4[M]*((Dxx_r-Dyy_r )+conj*((T)0.5*Dxy_i));
4764 ctmp_i+=(T)0.5*CGTable4[M]*((Dxx_i-Dyy_i )-conj*((T)0.5*Dxy_r));
4781 (*current_r)=ctmp_r*alpha;
4783 (*current_r)=ctmp_i*alpha;
4787 (*current_r)+=ctmp_r*alpha;
4789 (*current_r)+=ctmp_i*alpha;
4800 return STA_RESULT_SUCCESS;
4820 template<
typename T,
typename S>
4823 std::complex<T> * stOut ,
4824 const std::size_t shape[],
4827 bool conjugate=
false,
4828 std::complex<T> alpha=(T)1.0,
4829 const T v_size[]=NULL,
4831 int stride_out = -1,
4832 bool clear_field =
false)
4835 if ( abs ( Jupdown ) >1 )
return STA_RESULT_INVALID_TENSOR_RANK;
4836 if ( abs ( J+Jupdown ) <0 )
return STA_RESULT_INVALID_TENSOR_RANK;
4838 std::complex<T> imag=-std::complex<T>(0,1);
4839 if (conjugate) imag*=T( -1 );
4842 voxel_size[0]=voxel_size[1]=voxel_size[2]=T(1);
4845 voxel_size[0]/=v_size[0];
4846 voxel_size[1]/=v_size[1];
4847 voxel_size[2]/=v_size[2];
4850 imag*=voxel_size[1];
4852 int J1=(T)(J+Jupdown);
4854 std::size_t vectorLengthJ=J+1;
4855 std::size_t vectorLengthJ1=(J1)+1;
4858 if (stride_in == -1)
4859 stride_in = vectorLengthJ;
4860 if (stride_out == -1)
4861 stride_out = vectorLengthJ1;
4866 std::size_t jumpz=shape[1]*shape[2];
4867 std::size_t jumpy=shape[2];
4870 T * CGTable=
new T[3*vectorLengthJ1];
4871 T shnorm=hanalysis::clebschGordan(1,0,J,0,J1,0);
4872 if (Jupdown==0) shnorm=1;
4874 for (
int M=-(J1); M<=(0); M++)
4876 CGTable[M+(J1)] =T(1.0/std::sqrt(2.0))*hanalysis::clebschGordan(1,-1,J,M+1,J1,M)/shnorm;;
4877 CGTable[M+(J1)+vectorLengthJ1] =voxel_size[0]*hanalysis::clebschGordan(1,0,J,M,J1,M)/shnorm;
4878 CGTable[M+(J1)+2*vectorLengthJ1]=T(1.0/std::sqrt(2.0))*hanalysis::clebschGordan(1,1,J,M-1,J1,M)/shnorm;
4880 T * CGTable0=&CGTable[0];
4882 T * CGTable1=&CGTable[vectorLengthJ1];
4884 T * CGTable2=&CGTable[2*vectorLengthJ1];
4887 #pragma omp parallel for num_threads(get_numCPUs())
4888 for (std::size_t z=0; z<shape[0]; z++)
4912 const S * derivZ0p=stIn+Z[0]*stride_in+J;
4913 const S * derivZ1p=stIn+Z[1]*stride_in+J;
4914 const S * derivZ2p=stIn+Z[2]*stride_in+J;
4915 const S * derivZ3p=stIn+Z[3]*stride_in+J;
4916 const S * derivZ4p=stIn+Z[4]*stride_in+J;
4936 for (std::size_t y=0; y<shape[1]; y++)
4961 const S * derivZ2Y0p=derivZ2p+Y[0]*stride_in;
4962 const S * derivZ2Y1p=derivZ2p+Y[1]*stride_in;
4963 const S * derivZ2Y3p=derivZ2p+Y[3]*stride_in;
4964 const S * derivZ2Y4p=derivZ2p+Y[4]*stride_in;
4966 std::size_t tmp=Y[2]*stride_in;
4967 const S * derivZ0Y2p=derivZ0p+tmp;
4968 const S * derivZ1Y2p=derivZ1p+tmp;
4969 const S * derivZ2Y2p=derivZ2p+tmp;
4970 const S * derivZ3Y2p=derivZ3p+tmp;
4971 const S * derivZ4Y2p=derivZ4p+tmp;
4974 for (std::size_t x=0; x<shape[2]; x++)
4991 derivX0=derivZ2Y2p+(X[0])*stride_in;
4992 derivX1=derivZ2Y2p+(X[1])*stride_in;
4993 derivX2=derivZ2Y2p+X[3]*stride_in;
4994 derivX3=derivZ2Y2p+(X[4])*stride_in;
4997 std::size_t tmp=X[2]*stride_in;
4998 derivY0=derivZ2Y0p+tmp;
4999 derivY1=derivZ2Y1p+tmp;
5000 derivY2=derivZ2Y3p+tmp;
5001 derivY3=derivZ2Y4p+tmp;
5003 derivZ0=derivZ0Y2p+tmp;
5004 derivZ1=derivZ1Y2p+tmp;
5005 derivZ2=derivZ3Y2p+tmp;
5006 derivZ3=derivZ4Y2p+tmp;
5008 std::size_t offset=(Z[2]+Y[2]+X[2])*stride_out+J1;
5010 for (
int M=-(J1); M<=(0); M++)
5012 std::complex<T> & current=stOut[offset+M];
5013 if ( clear_field ) current=T ( 0 );
5014 std::complex<T> tmp=T ( 0 );
5021 tmp-=CGTable0[M]*(voxel_size[2]*std::conj(derivX0[-m2]+(T)8.0*(derivX2[-m2]-derivX1[-m2])-derivX3[-m2])+
5022 imag*std::conj(derivY0[-m2]+(T)8.0*(derivY2[-m2]-derivY1[-m2])-derivY3[-m2]));
5024 tmp+=CGTable0[M]*(voxel_size[2]*(derivX0[m2]+(T)8.0*(derivX2[m2]-derivX1[m2])-derivX3[m2])+
5025 imag*(derivY0[m2]+(T)8.0*(derivY2[m2]-derivY1[m2])-derivY3[m2]));
5029 tmp+=CGTable1[M]*(derivZ0[M]+(T)8.0*(derivZ2[M]-derivZ1[M])-derivZ3[M]);
5034 tmp+=CGTable2[M]*(-voxel_size[2]*(derivX0[m2]+(T)8.0*(derivX2[m2]-derivX1[m2])-derivX3[m2])+
5035 imag*(derivY0[m2]+(T)8.0*(derivY2[m2]-derivY1[m2])-derivY3[m2]));
5045 return (STA_RESULT_SUCCESS);
5050 template<
typename T,
typename S>
5052 const std::complex<T> * stIn1,
5053 const std::complex<T> * stIn2,
5054 std::complex<T> * stOut,
5055 const std::size_t shape[],
5057 int stride_in1 = -1,
5058 int stride_in2 = -1,
5059 int stride_out = -1,
5060 bool clear_field =
false )
5063 if (stride_in1 == -1)
5065 if (stride_in2 == -1)
5067 if (stride_out == -1)
5072 std::size_t jumpz=shape[1]*shape[2];
5075 #pragma omp parallel for num_threads(hanalysis::get_numCPUs())
5076 for (std::size_t z=0; z<shape[0]; z++)
5080 const std::complex<T> * current_J1=&stIn1[Z*stride_in1];
5081 const std::complex<T> * current_J2=&stIn2[Z*stride_in2];
5082 std::complex<T> * current_J=&stOut[Z*stride_out];
5084 for (std::size_t i=0; i<jumpz; i++)
5088 *current_J+=(*current_J1)* (*current_J2)*alpha;
5089 current_J+=stride_out;
5090 current_J1+=stride_in1;
5091 current_J2+=stride_in2;
5095 return STA_RESULT_SUCCESS;
5108 static std::string errortostring(
STA_RESULT error)
5112 case STA_RESULT_SUCCESS:
5113 return "STA_RESULT_SUCCESS";
5114 case STA_RESULT_FAILED:
5115 return "STA_RESULT_FAILED";
5116 case STA_RESULT_SHAPE_MISMATCH:
5117 return "STA_RESULT_SHAPE_MISMATCH";
5118 case STA_RESULT_INVALID_PRODUCT:
5119 return "STA_RESULT_INVALID_PRODUCT";
5120 case STA_RESULT_STORAGE_MISMATCH:
5121 return "STA_RESULT_STORAGE_MISMATCH";
5122 case STA_RESULT_INVALID_TENSOR_RANK:
5123 return "STA_RESULT_INVALID_TENSOR_RANK";
5124 case STA_RESULT_OFIELD_TYPE_MISMATCH:
5125 return "STA_RESULT_OFIELD_TYPE_MISMATCH";
5126 case STA_RESULT_SAME_ADDRESS:
5127 return "STA_RESULT_SAME_ADDRESS";
5128 case STA_RESULT_NOT_IMPLEMENTED:
5129 return "STA_RESULT_NOT_IMPLEMENTED";
5131 return "unkown error code";
5164 STA_FIELD_STORAGE_UNSUPPORTED=0,
5178 STA_OFIELD_UNSUPPORTED=0,
5194 if (s.compare(
"STA_FIELD_STORAGE_C")==0)
5197 if (s.compare(
"STA_FIELD_STORAGE_R")==0)
5200 if (s.compare(
"STA_FIELD_STORAGE_RF")==0)
5202 return STA_FIELD_STORAGE_UNSUPPORTED;
5208 if (s.compare(
"STA_OFIELD_EVEN")==0)
5211 if (s.compare(
"STA_OFIELD_ODD")==0)
5214 if (s.compare(
"STA_OFIELD_FULL")==0)
5217 if (s.compare(
"STA_OFIELD_SINGLE")==0)
5219 return STA_OFIELD_UNSUPPORTED;
5226 return "STA_FIELD_STORAGE_C";
5229 return "STA_FIELD_STORAGE_R";
5232 return "STA_FIELD_STORAGE_RF";
5234 return "STA_FIELD_STORAGE_UNSUPPORTED";
5243 return "STA_OFIELD_EVEN";
5246 return "STA_OFIELD_ODD";
5249 return "STA_OFIELD_FULL";
5252 return "STA_OFIELD_SINGLE";
5254 return "STA_OFIELD_UNSUPPORTED";
5259 static inline int numComponents2order(
5270 return (ncomponents-1)/2;
5274 return std::sqrt(1.0*ncomponents)-1;
5278 return (-3/2+std::sqrt((3.0/2.0)*(3.0/2.0)+2*ncomponents-2));
5282 return (-3/2+std::sqrt((3.0/2.0)*(3.0/2.0)+2*ncomponents-2));
5293 return ncomponents-1;
5297 return (-3/2+std::sqrt((3.0/2.0)*(3.0/2.0)+2*ncomponents-2));
5301 return std::sqrt(1+4.0*ncomponents)-2;
5305 return std::sqrt(4.0*ncomponents)-2;
5316 static inline int order2numComponents(
5331 return ((L+1)*(L+1));
5335 return ((L+1)*(L+2))/2;
5339 return ((L+1)*(L+2))/2;
5354 return ((L+1)*(L+2))/2;
5358 return ((L+1)*(L+3))/4;
5362 return ((L+2)*(L+2))/4;
5372 static inline int getComponentOffset(
5414 return ((L+1)*(L+3))/4-L-1;
5418 return ((L+2)*(L+2))/4-L-1;
5432 template<
typename T>
5434 const std::complex<T> * stIn1,
5435 const std::complex<T> * stIn2,
5436 const std::complex<T> * stIn3,
5437 std::complex<T> * stOut,
5438 const std::size_t shape[],
5444 std::complex<T> alpha,
5445 bool normalize =
false,
5447 int stride_in1 = -1,
5448 int stride_in2 = -1,
5449 int stride_in3 = -1,
5450 int stride_out = -1,
5451 bool clear_field =
false )
5454 printf(
"tripple product experimental ! not tested yet, and slow\n");
5455 if ((stIn1==stOut)||(stIn2==stOut))
5456 return STA_RESULT_SAME_ADDRESS;
5459 bool alpha_real=(alpha.imag()==0);
5462 switch (field_storage)
5468 return sta_tripleproduct_R (
5487 return STA_RESULT_STORAGE_MISMATCH;
5527 template<
typename T>
5529 const std::complex<T> * stIn1,
5530 const std::complex<T> * stIn2,
5531 std::complex<T> * stOut,
5532 const std::size_t shape[],
5536 std::complex<T> alpha = T( 1 ),
5537 bool normalize =
false,
5539 int stride_in1 = -1,
5540 int stride_in2 = -1,
5541 int stride_out = -1,
5542 bool clear_field =
false )
5544 if ((stIn1==stOut)||(stIn2==stOut))
5545 return STA_RESULT_SAME_ADDRESS;
5548 bool alpha_real=(alpha.imag()==0);
5555 if ((J1==0)&&(J2==0)&&(J==0))
5558 return sta_product0(stIn1,
5569 return sta_product0(stIn1,
5579 return STA_RESULT_STORAGE_MISMATCH;
5584 switch (field_storage)
5589 result=sta_product_C (stIn1,
5603 result=sta_product_C (stIn1,
5622 result=sta_product_R (stIn1,
5636 return STA_RESULT_STORAGE_MISMATCH;
5657 result=sta_product_Rft (stIn1,
5671 return STA_RESULT_STORAGE_MISMATCH;
5676 printf(
"unsupported\n");
5699 template<
typename T,
typename S>
5701 const std::complex<T> * stIn,
5702 std::complex<T> * stOut,
5703 const std::size_t shape[],
5706 bool conjugate=
false,
5708 int stride_out = -1,
5709 bool clear_field =
false )
5711 bool doalpha= ( alpha!=S ( 1 ) );
5714 stride_in=ncomponents;
5716 stride_out=ncomponents;
5719 std::size_t jumpz=shape[1]*shape[2];
5723 #pragma omp parallel for num_threads(hanalysis::get_numCPUs())
5724 for (std::size_t a=0; a<shape[0]; a++ )
5726 std::complex<T> *resultp=stOut+a*jumpz*stride_out;
5727 const std::complex<T> *inp=stIn+a*jumpz*stride_in;
5729 for ( std::size_t i=0; i<jumpz; i++ )
5731 for (
int b=0; b<ncomponents; b++)
5733 std::complex<T> tmp=inp[b];
5735 tmp=std::conj ( tmp );
5745 resultp+=stride_out;
5749 return STA_RESULT_SUCCESS;
5764 template<
typename T>
5766 const std::complex<T> * stIn,
5767 std::complex<T> * stOut,
5768 const std::size_t shape[],
5772 int stride_out = -1,
5773 bool clear_field =
false )
5776 return STA_RESULT_SAME_ADDRESS;
5788 std::size_t jumpz=shape[1]*shape[2];
5791 #pragma omp parallel for num_threads(hanalysis::get_numCPUs())
5792 for (std::size_t a=0; a<shape[0]; a++ )
5794 std::complex<T> *resultp=stOut+a*jumpz*stride_out;
5795 const std::complex<T> *inp=stIn+a*jumpz*stride_in+J;
5797 for ( std::size_t i=0; i<jumpz; i++ )
5799 const std::complex<T> * current=inp;
5805 for (
int b=-J; b<=J; b++)
5807 tmp+=std::norm(current[b]);
5811 for (
int b=-J; b<0; b++)
5813 tmp+=T( 2 )*std::norm(current[b]);
5815 tmp+=std::norm(current[0]);
5820 *resultp+=std::sqrt(tmp);
5822 resultp+=stride_out;
5827 return STA_RESULT_SUCCESS;
5863 template<
typename T>
5865 const std::complex<T> * stIn,
5866 std::complex<T> * stOut ,
5867 const std::size_t shape[],
5870 bool conjugate=
false,
5871 std::complex<T> alpha= ( T ) 1.0,
5873 const T v_size[]=NULL,
5875 int stride_out = -1,
5876 bool clear_field =
false,
5880 return STA_RESULT_SAME_ADDRESS;
5883 bool alpha_real=(alpha.imag()==0);
5885 switch (field_storage)
5889 result=sta_derivatives_C (stIn,
5909 result=sta_derivatives_R (stIn,
5929 result=sta_derivatives_R4th (stIn,
5942 return STA_RESULT_STORAGE_MISMATCH;
5948 printf(
"unsupported derivative\n");
5990 template<
typename T>
5992 const std::complex<T> * stIn,
5993 std::complex<T> * stOut ,
5994 const std::size_t shape[],
5997 bool conjugate=
false,
5998 std::complex<T> alpha= ( T ) 1.0,
6000 const T v_size[]=NULL,
6002 int stride_out = -1,
6003 bool clear_field =
false )
6006 return STA_RESULT_SAME_ADDRESS;
6009 bool alpha_real=(alpha.imag()==0);
6011 switch (field_storage)
6015 result=sta_derivatives2_C (stIn,
6032 result=sta_derivatives2_R (stIn,
6044 return STA_RESULT_STORAGE_MISMATCH;
6049 printf(
"unsupported\n");
6077 template<
typename T>
6079 const std::complex<T> * stIn,
6080 std::complex<T> * stOut ,
6081 const std::size_t shape[],
6084 std::complex<T> alpha=1,
6086 const T v_size[]=NULL,
6088 int stride_out = -1,
6089 bool clear_field =
false )
6092 return STA_RESULT_SAME_ADDRESS;
6095 bool alpha_real=(alpha.imag()==0);
6097 return STA_RESULT_STORAGE_MISMATCH;
6100 if ( components<=0 )
return STA_RESULT_INVALID_TENSOR_RANK;
6102 if (( components==1 )&&(stride_in<=1)&&(stride_out<=1))
6103 sta_laplace_1component ( stIn,stOut,shape,type,alpha,v_size,clear_field);
6107 sta_laplace_Ncomponents_R ( stIn,stOut,shape,components,type,alpha.real(),v_size,stride_in,stride_out,clear_field);
6109 sta_laplace_Ncomponents_C ( stIn,stOut,shape,components,type,alpha,v_size,stride_in,stride_out,clear_field);
6111 return STA_RESULT_SUCCESS;
6139 template<
typename T,
typename S>
6141 std::complex<T> * stOut,
6142 const std::size_t shape[],
6145 bool conjugate=
false,
6148 #ifdef _STA_LINK_FFTW
6149 int flag=FFTW_ESTIMATE )
6155 return STA_RESULT_SAME_ADDRESS;
6163 result=fft ( stIn,stOut,shape_,components,forward,flag );
6167 if (conjugate || (alpha!=T(1)) )
6189 template<
typename T>
6193 virtual std::complex<T> fspecial(
const std::complex<T> & value)
const =0;
6198 template<
typename T>
6200 const std::complex<T> * stIn,
6201 std::complex<T> * stOut,
6202 const std::size_t shape[],
6206 int stride_out = -1,
6207 bool clear_field =
false )
6211 stride_in=ncomponents;
6213 stride_out=ncomponents;
6216 std::size_t jumpz=shape[1]*shape[2];
6220 #pragma omp parallel for num_threads(hanalysis::get_numCPUs())
6221 for (std::size_t a=0; a<shape[0]; a++ )
6223 std::complex<T> *resultp=stOut+a*jumpz*stride_out;
6224 const std::complex<T> *inp=stIn+a*jumpz*stride_in;
6226 for ( std::size_t i=0; i<jumpz; i++ )
6228 for (
int b=0; b<ncomponents; b++)
6230 const std::complex<T> & in=inp[b];
6231 std::complex<T> & out=resultp[b];
6234 out+=fspecial.fspecial(in);
6236 resultp+=stride_out;
6240 return hanalysis::STA_RESULT_SUCCESS;
STA_FIELD_STORAGE
tensor field data storage
Definition: stensor.h:5163
tensor field has all components of even ranks :
Definition: stensor.h:5184
STA_RESULT sta_derivatives(const std::complex< T > *stIn, std::complex< T > *stOut, const std::size_t shape[], int J, int Jupdown, bool conjugate=false, std::complex< T > alpha=(T) 1.0, STA_FIELD_STORAGE field_storage=STA_FIELD_STORAGE_C, const T v_size[]=NULL, int stride_in=-1, int stride_out=-1, bool clear_field=false, int accuracy=0)
spherical tensor derivative:
Definition: stensor.h:5864
tensor field has one single component of rank :
Definition: stensor.h:5180
STA_FIELD_TYPE
tensor field data interpretations according to certain symmetries
Definition: stensor.h:5177
STA_RESULT
function return value
Definition: stensor.h:124
Definition: stensor.h:6190
STA_RESULT sta_norm(const std::complex< T > *stIn, std::complex< T > *stOut, const std::size_t shape[], int J, STA_FIELD_STORAGE field_storage=STA_FIELD_STORAGE_C, int stride_in=-1, int stride_out=-1, bool clear_field=false)
returns the length (norm) of the vectors, component by component
Definition: stensor.h:5765
Definition: stensor.h:5173
STA_RESULT sta_product(const std::complex< T > *stIn1, const std::complex< T > *stIn2, std::complex< T > *stOut, const std::size_t shape[], int J1, int J2, int J, std::complex< T > alpha=T(1), bool normalize=false, STA_FIELD_STORAGE field_storage=STA_FIELD_STORAGE_C, int stride_in1=-1, int stride_in2=-1, int stride_out=-1, bool clear_field=false)
spherical tensor product: the plain and the normalized product (selected by the normalize flag), respectively
Definition: stensor.h:5528
Definition: stensor.h:5167
STA_RESULT sta_fft(const std::complex< T > *stIn, std::complex< T > *stOut, const std::size_t shape[], int components, bool forward, bool conjugate=false, S alpha=(S) 1, int flag=0)
tensor fft component by component
Definition: stensor.h:6140
The STA-ImageAnalysisToolkit namespace.
Definition: stafield.h:55
tensor field has all components of odd ranks :
Definition: stensor.h:5186
STA_RESULT sta_derivatives2(const std::complex< T > *stIn, std::complex< T > *stOut, const std::size_t shape[], int J, int Jupdown, bool conjugate=false, std::complex< T > alpha=(T) 1.0, STA_FIELD_STORAGE field_storage=STA_FIELD_STORAGE_C, const T v_size[]=NULL, int stride_in=-1, int stride_out=-1, bool clear_field=false)
spherical tensor double-derivative:
Definition: stensor.h:5991
STA_RESULT sta_mult(const std::complex< T > *stIn, std::complex< T > *stOut, const std::size_t shape[], int ncomponents, S alpha=S(1), bool conjugate=false, int stride_in=-1, int stride_out=-1, bool clear_field=false)
computes alpha * stIn (optionally complex-conjugated), component by component
Definition: stensor.h:5700
tensor field has all components of ranks :
Definition: stensor.h:5182
STA_RESULT sta_laplace(const std::complex< T > *stIn, std::complex< T > *stOut, const std::size_t shape[], int components=1, int type=1, std::complex< T > alpha=1, STA_FIELD_STORAGE field_storage=STA_FIELD_STORAGE_C, const T v_size[]=NULL, int stride_in=-1, int stride_out=-1, bool clear_field=false)
Laplacian, applied to each tensor-field component.
Definition: stensor.h:6078
Definition: stensor.h:5170