/*******************************************************
                        PFTOOLS
 *******************************************************
  Sep 30, 2011 heuristic.c
 *******************************************************
 (C) 2011 Swiss Institute of Bioinformatics
     Thierry Schuepbach (thierry.schuepbach@isb-sib.ch)
 *******************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <stdbool.h>
#include <mmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
/* The compiler-defined macro is __SSE4_1__ (the one the SSE4.1 functions below
 * are guarded with); the historical spelling __SSE_4_1__ is still honoured. */
#if defined(__SSE4_1__) || defined(__SSE_4_1__)
# include <smmintrin.h>
#endif
#include <alloca.h>
#include "profile.h"

#if 0
/*
 * heuristic (scalar version, currently compiled out by the enclosing #if 0).
 *
 * Walks the profile position by position while keeping, for every sequence
 * position, a running score that is extended along the diagonal
 * (previous profile position, previous sequence position) and clamped at 0.
 * The best value found for each profile position is added to Score.
 *
 *   prf      : profile; only prf->Length and prf->Scores.Match
 *              (Alphabet, AlignStep) are read here.
 *   Sequence : sequence; only Sequence->Length and Sequence->ProfileIndex
 *              are read here.
 *   CutOff   : the function returns as soon as the accumulated Score
 *              reaches this value.
 *
 * Returns the accumulated score (possibly a partial sum when the cutoff
 * triggered the early exit).
 */
unsigned int heuristic(const struct Profile * const restrict prf, const PFSequence * const restrict Sequence,
                       const unsigned int CutOff)
{
  unsigned int Score = 0;
//   printf("Sequence length: %li\n", Sequence->Length);
  // Allocate vectors on the stack ( one for read, one for write )
  // NOTE(review): the mask is applied to the raw alloca() address, i.e. the
  // pointer is rounded DOWN to a 64-byte boundary and may therefore start
  // before the allocated storage; rounding up would need "+ 63" on the address.
  int * restrict v0 = (int *) ( ( (uintptr_t) alloca(Sequence->Length*sizeof(int) + 63)) & ~63);
  int * restrict v1 = (int *) ( ( (uintptr_t) alloca(Sequence->Length*sizeof(int) + 63)) & ~63);

  // Compiler-specific alignment hint (presumably the Intel compiler builtin -- TODO confirm)
  __assume_aligned(v0,64);
  __assume_aligned(v1,64);
  
  register int * restrict v_w = v0;
 
  // Initialize v with first profile
  register const short int * restrict Match = prf->Scores.Match.Alphabet;
  for (int iseq=0; iseq<Sequence->Length; ++iseq) {
//     printf("%i %i\n", iseq, (int) Sequence->ProfileIndex[iseq]);
    register const int Index = (int) Sequence->ProfileIndex[iseq];
    const int tmp = (int) Match[Index];
    // Stored value is clamped at zero
    v_w[iseq] = tmp > 0 ? tmp : 0;
    // NOTE(review): tmp is int and Score is unsigned, so the comparison is done
    // in unsigned arithmetic; a negative tmp compares as a very large value.
    Score = tmp > Score ? tmp : Score;
  }

  // Block size for alphabet
  const register size_t AlignStep = prf->Scores.Match.AlignStep;

  // Set read v pointer
  register const int * restrict v_r = v0;
  v_w = v1;
  
  // Run through the rest of the profile
  for (int iprf=1; iprf<prf->Length; ++iprf) {
    // Move to next profile Match alphabet
    Match += AlignStep;

    // First sequence position has no diagonal predecessor
    register int Index0 = (int) Sequence->ProfileIndex[0];
    register const int Match0 = (int) Match[Index0];
    // NOTE(review): v_w[0] is stored unclamped whereas max below is clamped at zero
    v_w[0] = Match0;
    register unsigned int max = Match0 > 0 ? Match0 : 0;
#pragma unroll(4)
    for (int iseq=1; iseq<Sequence->Length; ++iseq) {
      register const int Index = (int) Sequence->ProfileIndex[iseq];
      // Extend the diagonal: previous profile position, previous sequence position
      register int tmp   = v_r[iseq-1] + (int) Match[Index];
      tmp = tmp > 0 ? tmp : 0;
      v_w[iseq] = tmp;
      // Track the best value of this profile position
      if ((unsigned int) tmp > max) max = tmp;
    }
    
    // Swap pointers
    int * ptr = v_w;
    v_w = (int*) v_r;
    v_r = (const int*) ptr;

    // Update Score
    Score += max;
    // Early exit once the cutoff is reached
    if (Score >= CutOff) return Score;
  }

  return Score;
}
#endif
#ifdef __SSE4_1__
/*
 * TransposeHeuristic_sse41
 *
 * Integer heuristic score of a sequence against a transposed match matrix,
 * using SSE4.1 (_mm_max_epi32) and stack (alloca) work buffers.
 *
 * With M = TransposeMatch row selected by Sequence->ProfileIndex[i], the
 * code evaluates, for every sequence position i and profile position j,
 *
 *     v[i][j] = max(0, v[i-1][j-1] + M[j])      (v[i][-1] taken as 0)
 *
 * keeps Sc[j] = max over i of v[i][j] (never below 0), and returns the sum
 * of Sc[j] for j in [0, Profile_Length). The first sequence position is
 * stored unclamped in v (as in the scalar reference formulation), only its
 * contribution to Sc is clamped.
 *
 *   TransposeMatch  : one row per alphabet index, row stride
 *                     ((Profile_Length+1+15) & ~15) ints; accessed with
 *                     aligned 16-byte loads, so it must be 16-byte aligned.
 *   Alphabet_Length : unused, kept for interface compatibility.
 *   Profile_Length  : number of profile positions summed into the result.
 *   Sequence        : Sequence->ProfileIndex[0 .. Sequence->Length-1] is
 *                     read; index 0 is read unconditionally.
 *
 * Returns the accumulated score.
 */
unsigned int TransposeHeuristic_sse41(const int * const restrict TransposeMatch, const size_t Alphabet_Length,
                                const size_t Profile_Length, const PFSequence * const restrict Sequence)
// WARNING: Creation of the transpose matrix took care of zeroing extra data on cache line.
//          Remember that if you happen to change the code.
{
  (void) Alphabet_Length;

  // Row stride and work vector length: Profile_Length+1 rounded up to 16 ints
  const size_t Aligned_Profile_Length = (Profile_Length+1 + 15) & ~((size_t) 15);

  // Allocate vectors on the stack ( one for read, one for write, one for the maxima ).
  // 63 spare bytes are requested so that the address can be rounded UP to a
  // 64-byte boundary while staying inside the allocated storage.
  const size_t BufferBytes = Aligned_Profile_Length*sizeof(int) + 63;
  int * const v0 = (int *) ( ((uintptr_t) alloca(BufferBytes) + 63) & ~((uintptr_t) 63) );
  int * const v1 = (int *) ( ((uintptr_t) alloca(BufferBytes) + 63) & ~((uintptr_t) 63) );
  int * const Sc = (int *) ( ((uintptr_t) alloca(BufferBytes) + 63) & ~((uintptr_t) 63) );

  const __m128i Zero = _mm_setzero_si128();

  // Initialize v and Sc with the first sequence position
  const int * Row = &TransposeMatch[Aligned_Profile_Length * (size_t) Sequence->ProfileIndex[0]];
  size_t iprf = 0;
  do {
    const __m128i m0 = _mm_load_si128((const __m128i*) &Row[iprf     ]);
    const __m128i m1 = _mm_load_si128((const __m128i*) &Row[iprf +  4]);
    const __m128i m2 = _mm_load_si128((const __m128i*) &Row[iprf +  8]);
    const __m128i m3 = _mm_load_si128((const __m128i*) &Row[iprf + 12]);

    // Each quarter stores its own data (the former code stored the first quarter four times)
    _mm_store_si128((__m128i*) &v0[iprf     ], m0);
    _mm_store_si128((__m128i*) &v0[iprf +  4], m1);
    _mm_store_si128((__m128i*) &v0[iprf +  8], m2);
    _mm_store_si128((__m128i*) &v0[iprf + 12], m3);

    _mm_store_si128((__m128i*) &Sc[iprf     ], _mm_max_epi32(Zero, m0));
    _mm_store_si128((__m128i*) &Sc[iprf +  4], _mm_max_epi32(Zero, m1));
    _mm_store_si128((__m128i*) &Sc[iprf +  8], _mm_max_epi32(Zero, m2));
    _mm_store_si128((__m128i*) &Sc[iprf + 12], _mm_max_epi32(Zero, m3));
    iprf += 16;
  } while (iprf < Profile_Length);

  // Clear whatever is left of Sc up to the aligned length
  for (; iprf < Aligned_Profile_Length; iprf += 16) {
    _mm_store_si128((__m128i*) &Sc[iprf     ], Zero);
    _mm_store_si128((__m128i*) &Sc[iprf +  4], Zero);
    _mm_store_si128((__m128i*) &Sc[iprf +  8], Zero);
    _mm_store_si128((__m128i*) &Sc[iprf + 12], Zero);
  }

  // Set read and write v pointers
  const int * v_r = v0;
  int * v_w = v1;

  // Run through the rest of the sequence
  const unsigned int SequenceLength = (unsigned int) Sequence->Length;
  for (unsigned int iseq=1; iseq<SequenceLength; ++iseq) {
    Row = &TransposeMatch[Aligned_Profile_Length * (size_t) Sequence->ProfileIndex[iseq]];

    // Diagonal predecessor of the first 4 positions: v_r shifted by one lane,
    // zero entering lane 0 (position 0 has no predecessor).
    __m128i Diag0 = _mm_slli_si128(_mm_load_si128((const __m128i*) &v_r[0]), 4);

    iprf = 0;
    for (;;) {
      const __m128i Diag1 = _mm_loadu_si128((const __m128i*) &v_r[iprf +  3]);
      const __m128i Diag2 = _mm_loadu_si128((const __m128i*) &v_r[iprf +  7]);
      const __m128i Diag3 = _mm_loadu_si128((const __m128i*) &v_r[iprf + 11]);

      const __m128i Sum0 = _mm_add_epi32(Diag0, _mm_load_si128((const __m128i*) &Row[iprf     ]));
      const __m128i Sum1 = _mm_add_epi32(Diag1, _mm_load_si128((const __m128i*) &Row[iprf +  4]));
      const __m128i Sum2 = _mm_add_epi32(Diag2, _mm_load_si128((const __m128i*) &Row[iprf +  8]));
      const __m128i Sum3 = _mm_add_epi32(Diag3, _mm_load_si128((const __m128i*) &Row[iprf + 12]));

      // Clamp at zero
      const __m128i New0 = _mm_max_epi32(Zero, Sum0);
      const __m128i New1 = _mm_max_epi32(Zero, Sum1);
      const __m128i New2 = _mm_max_epi32(Zero, Sum2);
      const __m128i New3 = _mm_max_epi32(Zero, Sum3);

      _mm_store_si128((__m128i*) &v_w[iprf     ], New0);
      _mm_store_si128((__m128i*) &v_w[iprf +  4], New1);
      _mm_store_si128((__m128i*) &v_w[iprf +  8], New2);
      _mm_store_si128((__m128i*) &v_w[iprf + 12], New3);

      // Keep the per profile position maxima
      _mm_store_si128((__m128i*) &Sc[iprf     ],
                      _mm_max_epi32(_mm_load_si128((const __m128i*) &Sc[iprf     ]), New0));
      _mm_store_si128((__m128i*) &Sc[iprf +  4],
                      _mm_max_epi32(_mm_load_si128((const __m128i*) &Sc[iprf +  4]), New1));
      _mm_store_si128((__m128i*) &Sc[iprf +  8],
                      _mm_max_epi32(_mm_load_si128((const __m128i*) &Sc[iprf +  8]), New2));
      _mm_store_si128((__m128i*) &Sc[iprf + 12],
                      _mm_max_epi32(_mm_load_si128((const __m128i*) &Sc[iprf + 12]), New3));

      iprf += 16;
      if (iprf >= Profile_Length) break;

      // From the second chunk on the predecessor is a plain unaligned load
      Diag0 = _mm_loadu_si128((const __m128i*) &v_r[iprf - 1]);
    }

    // Swap pointers
    int * const Swap = v_w;
    v_w = (int *) v_r;
    v_r = Swap;
  }

  // Sum the maxima over the real profile positions only
  unsigned int Score = 0;
  #pragma unroll(16)
  for (size_t i=0; i<Profile_Length; ++i)
    Score += Sc[i];

  return Score;
}

/*
 * TransposeHeuristicGivenMemory_sse41
 *
 * Integer heuristic score of a sequence against a transposed match matrix,
 * using SSE4.1 (_mm_max_epi32) and caller supplied work memory.
 *
 * With M = TransposeMatch row selected by Sequence->ProfileIndex[i], the
 * code evaluates, for every sequence position i and profile position j,
 *
 *     v[i][j] = max(0, v[i-1][j-1] + M[j])      (v[i][-1] taken as 0)
 *
 * keeps Sc[j] = max over i of v[i][j] (never below 0), and returns the sum
 * of Sc[j] for j in [0, Profile_Length). The first sequence position is
 * stored unclamped in v (as in the scalar reference formulation), only its
 * contribution to Sc is clamped.
 *
 *   TransposeMatch  : one row per alphabet index, row stride
 *                     ((Profile_Length+1+15) & ~15) ints; accessed with
 *                     aligned 16-byte loads, so it must be 16-byte aligned.
 *   Memory          : work area split into three consecutive vectors of
 *                     ((Profile_Length+1+15) & ~15) ints each; accessed with
 *                     aligned 16-byte loads/stores, so it must be 16-byte
 *                     aligned. Its content is overwritten.
 *   Alphabet_Length : unused, kept for interface compatibility.
 *   Profile_Length  : number of profile positions summed into the result.
 *   Sequence        : Sequence->ProfileIndex[0 .. Sequence->Length-1] is
 *                     read; index 0 is read unconditionally.
 *
 * Returns the accumulated score.
 */
unsigned int TransposeHeuristicGivenMemory_sse41(const int * const restrict TransposeMatch, int * const Memory,
						 const size_t Alphabet_Length, const size_t Profile_Length,
						 const PFSequence * const restrict Sequence)
// WARNING: Creation of the transpose matrix took care of zeroing extra data on cache line.
//          Remember that if you happen to change the code.
{
  (void) Alphabet_Length;

  // Row stride and work vector length: Profile_Length+1 rounded up to 16 ints
  const size_t Aligned_Profile_Length = (Profile_Length+1 + 15) & ~((size_t) 15);

  // Carve the three work vectors ( read, write, maxima ) out of the given memory
  int * const v0 = Memory;
  int * const v1 = Memory + Aligned_Profile_Length;
  int * const Sc = Memory + 2*Aligned_Profile_Length;

  const __m128i Zero = _mm_setzero_si128();

  // Initialize v and Sc with the first sequence position
  const int * Row = &TransposeMatch[Aligned_Profile_Length * (size_t) Sequence->ProfileIndex[0]];
  size_t iprf = 0;
  do {
    const __m128i m0 = _mm_load_si128((const __m128i*) &Row[iprf     ]);
    const __m128i m1 = _mm_load_si128((const __m128i*) &Row[iprf +  4]);
    const __m128i m2 = _mm_load_si128((const __m128i*) &Row[iprf +  8]);
    const __m128i m3 = _mm_load_si128((const __m128i*) &Row[iprf + 12]);

    // Each quarter stores its own data (the former code stored the first quarter four times)
    _mm_store_si128((__m128i*) &v0[iprf     ], m0);
    _mm_store_si128((__m128i*) &v0[iprf +  4], m1);
    _mm_store_si128((__m128i*) &v0[iprf +  8], m2);
    _mm_store_si128((__m128i*) &v0[iprf + 12], m3);

    _mm_store_si128((__m128i*) &Sc[iprf     ], _mm_max_epi32(Zero, m0));
    _mm_store_si128((__m128i*) &Sc[iprf +  4], _mm_max_epi32(Zero, m1));
    _mm_store_si128((__m128i*) &Sc[iprf +  8], _mm_max_epi32(Zero, m2));
    _mm_store_si128((__m128i*) &Sc[iprf + 12], _mm_max_epi32(Zero, m3));
    iprf += 16;
  } while (iprf < Profile_Length);

  // Clear whatever is left of Sc up to the aligned length
  for (; iprf < Aligned_Profile_Length; iprf += 16) {
    _mm_store_si128((__m128i*) &Sc[iprf     ], Zero);
    _mm_store_si128((__m128i*) &Sc[iprf +  4], Zero);
    _mm_store_si128((__m128i*) &Sc[iprf +  8], Zero);
    _mm_store_si128((__m128i*) &Sc[iprf + 12], Zero);
  }

  // Set read and write v pointers
  const int * v_r = v0;
  int * v_w = v1;

  // Run through the rest of the sequence
  const unsigned int SequenceLength = (unsigned int) Sequence->Length;
  for (unsigned int iseq=1; iseq<SequenceLength; ++iseq) {
    Row = &TransposeMatch[Aligned_Profile_Length * (size_t) Sequence->ProfileIndex[iseq]];

    // Diagonal predecessor of the first 4 positions: v_r shifted by one lane,
    // zero entering lane 0 (position 0 has no predecessor).
    __m128i Diag0 = _mm_slli_si128(_mm_load_si128((const __m128i*) &v_r[0]), 4);

    iprf = 0;
    for (;;) {
      const __m128i Diag1 = _mm_loadu_si128((const __m128i*) &v_r[iprf +  3]);
      const __m128i Diag2 = _mm_loadu_si128((const __m128i*) &v_r[iprf +  7]);
      const __m128i Diag3 = _mm_loadu_si128((const __m128i*) &v_r[iprf + 11]);

      const __m128i Sum0 = _mm_add_epi32(Diag0, _mm_load_si128((const __m128i*) &Row[iprf     ]));
      const __m128i Sum1 = _mm_add_epi32(Diag1, _mm_load_si128((const __m128i*) &Row[iprf +  4]));
      const __m128i Sum2 = _mm_add_epi32(Diag2, _mm_load_si128((const __m128i*) &Row[iprf +  8]));
      const __m128i Sum3 = _mm_add_epi32(Diag3, _mm_load_si128((const __m128i*) &Row[iprf + 12]));

      // Clamp at zero
      const __m128i New0 = _mm_max_epi32(Zero, Sum0);
      const __m128i New1 = _mm_max_epi32(Zero, Sum1);
      const __m128i New2 = _mm_max_epi32(Zero, Sum2);
      const __m128i New3 = _mm_max_epi32(Zero, Sum3);

      _mm_store_si128((__m128i*) &v_w[iprf     ], New0);
      _mm_store_si128((__m128i*) &v_w[iprf +  4], New1);
      _mm_store_si128((__m128i*) &v_w[iprf +  8], New2);
      _mm_store_si128((__m128i*) &v_w[iprf + 12], New3);

      // Keep the per profile position maxima
      _mm_store_si128((__m128i*) &Sc[iprf     ],
                      _mm_max_epi32(_mm_load_si128((const __m128i*) &Sc[iprf     ]), New0));
      _mm_store_si128((__m128i*) &Sc[iprf +  4],
                      _mm_max_epi32(_mm_load_si128((const __m128i*) &Sc[iprf +  4]), New1));
      _mm_store_si128((__m128i*) &Sc[iprf +  8],
                      _mm_max_epi32(_mm_load_si128((const __m128i*) &Sc[iprf +  8]), New2));
      _mm_store_si128((__m128i*) &Sc[iprf + 12],
                      _mm_max_epi32(_mm_load_si128((const __m128i*) &Sc[iprf + 12]), New3));

      iprf += 16;
      if (iprf >= Profile_Length) break;

      // From the second chunk on the predecessor is a plain unaligned load
      Diag0 = _mm_loadu_si128((const __m128i*) &v_r[iprf - 1]);
    }

    // Swap pointers
    int * const Swap = v_w;
    v_w = (int *) v_r;
    v_r = Swap;
  }

  // Sum the maxima over the real profile positions only
  unsigned int Score = 0;
  #pragma unroll(4)
  for (iprf=0; iprf<Profile_Length; ++iprf)
    Score += Sc[iprf];

  return Score;
}
#endif

/*
 * TransposeHeuristic_sse2
 *
 * Single precision heuristic score of a sequence against a transposed match
 * matrix, using SSE/SSE2 and stack (alloca) work buffers.
 *
 * With M = TransposeMatch row selected by Sequence->ProfileIndex[i], the
 * code evaluates, for every sequence position i and profile position j,
 *
 *     v[i][j] = max(0, v[i-1][j-1] + M[j])      (v[i][-1] taken as 0)
 *
 * keeps Sc[j] = max over i of v[i][j] (never below 0), and returns the sum
 * of Sc[j] for j in [0, Profile_Length). The first sequence position is
 * stored unclamped in v (as in the scalar reference formulation), only its
 * contribution to Sc is clamped.
 *
 *   TransposeMatch  : one row per alphabet index, row stride
 *                     ((Profile_Length+1+15) & ~15) floats; accessed with
 *                     aligned 16-byte loads, so it must be 16-byte aligned.
 *   Alphabet_Length : unused, kept for interface compatibility.
 *   Profile_Length  : number of profile positions summed into the result.
 *   Sequence        : Sequence->ProfileIndex[0 .. Sequence->Length-1] is
 *                     read; index 0 is read unconditionally.
 *
 * Returns the accumulated score.
 */
float TransposeHeuristic_sse2(const float * const restrict TransposeMatch, const size_t Alphabet_Length,
	                      const size_t Profile_Length, const PFSequence * const restrict Sequence)
{
  (void) Alphabet_Length;

  // Row stride and work vector length: Profile_Length+1 rounded up to 16 floats
  const size_t Aligned_Profile_Length = (Profile_Length+1 + 15) & ~((size_t) 15);

  // Allocate vectors on the stack ( one for read, one for write, one for the maxima ).
  // 63 spare bytes are requested so that the address can be rounded UP to a
  // 64-byte boundary while staying inside the allocated storage.
  const size_t BufferBytes = Aligned_Profile_Length*sizeof(float) + 63;
  float * const v0 = (float *) ( ((uintptr_t) alloca(BufferBytes) + 63) & ~((uintptr_t) 63) );
  float * const v1 = (float *) ( ((uintptr_t) alloca(BufferBytes) + 63) & ~((uintptr_t) 63) );
  float * const Sc = (float *) ( ((uintptr_t) alloca(BufferBytes) + 63) & ~((uintptr_t) 63) );

  const __m128 Zero = _mm_setzero_ps();

  // Initialize v and Sc with the first sequence position
  const float * Row = &TransposeMatch[Aligned_Profile_Length * (size_t) Sequence->ProfileIndex[0]];
  size_t iprf = 0;
  do {
    const __m128 m0 = _mm_load_ps(&Row[iprf     ]);
    const __m128 m1 = _mm_load_ps(&Row[iprf +  4]);
    const __m128 m2 = _mm_load_ps(&Row[iprf +  8]);
    const __m128 m3 = _mm_load_ps(&Row[iprf + 12]);

    // Each quarter stores its own data (the former code stored the first quarter four times)
    _mm_store_ps(&v0[iprf     ], m0);
    _mm_store_ps(&v0[iprf +  4], m1);
    _mm_store_ps(&v0[iprf +  8], m2);
    _mm_store_ps(&v0[iprf + 12], m3);

    _mm_store_ps(&Sc[iprf     ], _mm_max_ps(Zero, m0));
    _mm_store_ps(&Sc[iprf +  4], _mm_max_ps(Zero, m1));
    _mm_store_ps(&Sc[iprf +  8], _mm_max_ps(Zero, m2));
    _mm_store_ps(&Sc[iprf + 12], _mm_max_ps(Zero, m3));
    iprf += 16;
  } while (iprf < Profile_Length);

  // Clear whatever is left of Sc up to the aligned length
  for (; iprf < Aligned_Profile_Length; iprf += 16) {
    _mm_store_ps(&Sc[iprf     ], Zero);
    _mm_store_ps(&Sc[iprf +  4], Zero);
    _mm_store_ps(&Sc[iprf +  8], Zero);
    _mm_store_ps(&Sc[iprf + 12], Zero);
  }

  // Set read and write v pointers
  const float * v_r = v0;
  float * v_w = v1;

  // Run through the rest of the sequence
  const unsigned int SequenceLength = (unsigned int) Sequence->Length;
  for (unsigned int iseq=1; iseq<SequenceLength; ++iseq) {
    Row = &TransposeMatch[Aligned_Profile_Length * (size_t) Sequence->ProfileIndex[iseq]];

    // Diagonal predecessor of the first 4 positions: v_r shifted by one lane,
    // zero entering lane 0 (position 0 has no predecessor). The byte shift is
    // an integer instruction, hence the bit-preserving cast intrinsics.
    __m128 Diag0 = _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(_mm_load_ps(&v_r[0])), 4));

    iprf = 0;
    for (;;) {
      const __m128 Diag1 = _mm_loadu_ps(&v_r[iprf +  3]);
      const __m128 Diag2 = _mm_loadu_ps(&v_r[iprf +  7]);
      const __m128 Diag3 = _mm_loadu_ps(&v_r[iprf + 11]);

      const __m128 Sum0 = _mm_add_ps(Diag0, _mm_load_ps(&Row[iprf     ]));
      const __m128 Sum1 = _mm_add_ps(Diag1, _mm_load_ps(&Row[iprf +  4]));
      const __m128 Sum2 = _mm_add_ps(Diag2, _mm_load_ps(&Row[iprf +  8]));
      const __m128 Sum3 = _mm_add_ps(Diag3, _mm_load_ps(&Row[iprf + 12]));

      // Clamp at zero
      const __m128 New0 = _mm_max_ps(Zero, Sum0);
      const __m128 New1 = _mm_max_ps(Zero, Sum1);
      const __m128 New2 = _mm_max_ps(Zero, Sum2);
      const __m128 New3 = _mm_max_ps(Zero, Sum3);

      _mm_store_ps(&v_w[iprf     ], New0);
      _mm_store_ps(&v_w[iprf +  4], New1);
      _mm_store_ps(&v_w[iprf +  8], New2);
      _mm_store_ps(&v_w[iprf + 12], New3);

      // Keep the per profile position maxima
      _mm_store_ps(&Sc[iprf     ], _mm_max_ps(_mm_load_ps(&Sc[iprf     ]), New0));
      _mm_store_ps(&Sc[iprf +  4], _mm_max_ps(_mm_load_ps(&Sc[iprf +  4]), New1));
      _mm_store_ps(&Sc[iprf +  8], _mm_max_ps(_mm_load_ps(&Sc[iprf +  8]), New2));
      _mm_store_ps(&Sc[iprf + 12], _mm_max_ps(_mm_load_ps(&Sc[iprf + 12]), New3));

      iprf += 16;
      if (iprf >= Profile_Length) break;

      // From the second chunk on the predecessor is a plain unaligned load
      Diag0 = _mm_loadu_ps(&v_r[iprf - 1]);
    }

    // Swap pointers
    float * const Swap = v_w;
    v_w = (float *) v_r;
    v_r = Swap;
  }

  // Sum the maxima over the real profile positions only
  float Score = 0.0f;
  #pragma unroll(4)
  for (iprf=0; iprf<Profile_Length; ++iprf)
    Score += Sc[iprf];

  return Score;
}

/*
 * TransposeHeuristicGivenMemory_sse2
 *
 * Single precision heuristic score of a sequence against a transposed match
 * matrix, using SSE/SSE2 and caller supplied work memory.
 *
 * With M = TransposeMatch row selected by Sequence->ProfileIndex[i], the
 * code evaluates, for every sequence position i and profile position j,
 *
 *     v[i][j] = max(0, v[i-1][j-1] + M[j])      (v[i][-1] taken as 0)
 *
 * keeps Sc[j] = max over i of v[i][j] (never below 0), and returns the sum
 * of Sc[j] for j in [0, Profile_Length). The first sequence position is
 * stored unclamped in v (as in the scalar reference formulation), only its
 * contribution to Sc is clamped.
 *
 *   TransposeMatch  : one row per alphabet index, row stride
 *                     ((Profile_Length+1+15) & ~15) floats; accessed with
 *                     aligned 16-byte loads, so it must be 16-byte aligned.
 *   Memory          : work area split into three consecutive vectors of
 *                     ((Profile_Length+1+15) & ~15) floats each; accessed
 *                     with aligned 16-byte loads/stores, so it must be
 *                     16-byte aligned. Its content is overwritten.
 *   Alphabet_Length : unused, kept for interface compatibility.
 *   Profile_Length  : number of profile positions summed into the result.
 *   Sequence        : Sequence->ProfileIndex[0 .. Sequence->Length-1] is
 *                     read; index 0 is read unconditionally.
 *
 * Returns the accumulated score.
 */
float TransposeHeuristicGivenMemory_sse2(const float * const restrict TransposeMatch, float * const Memory,
					 const size_t Alphabet_Length, const size_t Profile_Length,
					 const PFSequence * const restrict Sequence)
{
  (void) Alphabet_Length;

  // Row stride and work vector length: Profile_Length+1 rounded up to 16 floats
  const size_t Aligned_Profile_Length = (Profile_Length+1 + 15) & ~((size_t) 15);

  // Carve the three work vectors ( read, write, maxima ) out of the given memory
  float * const v0 = Memory;
  float * const v1 = Memory + Aligned_Profile_Length;
  float * const Sc = Memory + 2*Aligned_Profile_Length;

  const __m128 Zero = _mm_setzero_ps();

  // Initialize v and Sc with the first sequence position
  const float * Row = &TransposeMatch[Aligned_Profile_Length * (size_t) Sequence->ProfileIndex[0]];
  size_t iprf = 0;
  do {
    const __m128 m0 = _mm_load_ps(&Row[iprf     ]);
    const __m128 m1 = _mm_load_ps(&Row[iprf +  4]);
    const __m128 m2 = _mm_load_ps(&Row[iprf +  8]);
    const __m128 m3 = _mm_load_ps(&Row[iprf + 12]);

    // Each quarter stores its own data (the former code stored the first quarter four times)
    _mm_store_ps(&v0[iprf     ], m0);
    _mm_store_ps(&v0[iprf +  4], m1);
    _mm_store_ps(&v0[iprf +  8], m2);
    _mm_store_ps(&v0[iprf + 12], m3);

    _mm_store_ps(&Sc[iprf     ], _mm_max_ps(Zero, m0));
    _mm_store_ps(&Sc[iprf +  4], _mm_max_ps(Zero, m1));
    _mm_store_ps(&Sc[iprf +  8], _mm_max_ps(Zero, m2));
    _mm_store_ps(&Sc[iprf + 12], _mm_max_ps(Zero, m3));
    iprf += 16;
  } while (iprf < Profile_Length);

  // Clear whatever is left of Sc up to the aligned length
  for (; iprf < Aligned_Profile_Length; iprf += 16) {
    _mm_store_ps(&Sc[iprf     ], Zero);
    _mm_store_ps(&Sc[iprf +  4], Zero);
    _mm_store_ps(&Sc[iprf +  8], Zero);
    _mm_store_ps(&Sc[iprf + 12], Zero);
  }

  // Set read and write v pointers
  const float * v_r = v0;
  float * v_w = v1;

  // Run through the rest of the sequence
  const unsigned int SequenceLength = (unsigned int) Sequence->Length;
  for (unsigned int iseq=1; iseq<SequenceLength; ++iseq) {
    Row = &TransposeMatch[Aligned_Profile_Length * (size_t) Sequence->ProfileIndex[iseq]];

    // Diagonal predecessor of the first 4 positions: v_r shifted by one lane,
    // zero entering lane 0 (position 0 has no predecessor). The byte shift is
    // an integer instruction, hence the bit-preserving cast intrinsics.
    __m128 Diag0 = _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(_mm_load_ps(&v_r[0])), 4));

    iprf = 0;
    for (;;) {
      const __m128 Diag1 = _mm_loadu_ps(&v_r[iprf +  3]);
      const __m128 Diag2 = _mm_loadu_ps(&v_r[iprf +  7]);
      const __m128 Diag3 = _mm_loadu_ps(&v_r[iprf + 11]);

      const __m128 Sum0 = _mm_add_ps(Diag0, _mm_load_ps(&Row[iprf     ]));
      const __m128 Sum1 = _mm_add_ps(Diag1, _mm_load_ps(&Row[iprf +  4]));
      const __m128 Sum2 = _mm_add_ps(Diag2, _mm_load_ps(&Row[iprf +  8]));
      const __m128 Sum3 = _mm_add_ps(Diag3, _mm_load_ps(&Row[iprf + 12]));

      // Clamp at zero
      const __m128 New0 = _mm_max_ps(Zero, Sum0);
      const __m128 New1 = _mm_max_ps(Zero, Sum1);
      const __m128 New2 = _mm_max_ps(Zero, Sum2);
      const __m128 New3 = _mm_max_ps(Zero, Sum3);

      _mm_store_ps(&v_w[iprf     ], New0);
      _mm_store_ps(&v_w[iprf +  4], New1);
      _mm_store_ps(&v_w[iprf +  8], New2);
      _mm_store_ps(&v_w[iprf + 12], New3);

      // Keep the per profile position maxima
      _mm_store_ps(&Sc[iprf     ], _mm_max_ps(_mm_load_ps(&Sc[iprf     ]), New0));
      _mm_store_ps(&Sc[iprf +  4], _mm_max_ps(_mm_load_ps(&Sc[iprf +  4]), New1));
      _mm_store_ps(&Sc[iprf +  8], _mm_max_ps(_mm_load_ps(&Sc[iprf +  8]), New2));
      _mm_store_ps(&Sc[iprf + 12], _mm_max_ps(_mm_load_ps(&Sc[iprf + 12]), New3));

      iprf += 16;
      if (iprf >= Profile_Length) break;

      // From the second chunk on the predecessor is a plain unaligned load
      Diag0 = _mm_loadu_ps(&v_r[iprf - 1]);
    }

    // Swap pointers
    float * const Swap = v_w;
    v_w = (float *) v_r;
    v_r = Swap;
  }

  // Sum the maxima over the real profile positions only
  float Score = 0.0f;
  #pragma unroll(4)
  for (iprf=0; iprf<Profile_Length; ++iprf)
    Score += Sc[iprf];

  return Score;
}
