/*
u1.h version $Date: 2014/09/08 17:44:28 $
D. J. Bernstein
Romain Dolbeau
Public domain.
*/

// Modified by kerukuro for use in cppcrypto.

//  if (!bytes) return;
  /*
   * Process the input one 64-byte block at a time using 128-bit SSE
   * registers: the 16-word state at x is loaded into four vectors, mixed
   * for r loop iterations, added back to the original state, XORed with
   * 64 bytes read from m, and written to out.  The rotation amounts used
   * (16, 12, 8, 7) match the ChaCha quarter-round.
   *
   * Names used but not declared here: x, m, out, bytes, i, r.
   * NOTE(review): the m/out offsets (+16, +32, +48) only line up with the
   * 64-byte advance if m and out are byte pointers, and the x offsets
   * (+4, +8, +12) only if x points to 32-bit words -- confirm against the
   * enclosing function.
   * Any trailing bytes < 64 are left untouched by this loop.
   */
  while (bytes >=64) {
    __m128i x_0, x_1, x_2, x_3;
    __m128i t_1;
    /* Byte-shuffle masks for _mm_shuffle_epi8: rotate every 32-bit lane
       left by 16 bits and by 8 bits respectively. */
    const __m128i rot16 = _mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
    const __m128i rot8  = _mm_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3);
	uint32 in12, in13;

    /* Load the four state rows (4 words each).
       NOTE(review): _mm_load_si128 requires a 16-byte aligned address,
       while the reloads of the same addresses after the round loop use
       the unaligned variant -- confirm that x is always 16-byte aligned. */
    x_0 = _mm_load_si128((__m128i*)(x +  0));
    x_1 = _mm_load_si128((__m128i*)(x +  4));
    x_2 = _mm_load_si128((__m128i*)(x +  8));
    x_3 = _mm_load_si128((__m128i*)(x + 12));

    /* Each iteration performs two rounds: the first ends by rotating the
       lanes of x_0/x_3/x_2, the second ends with the inverse lane
       rotations, restoring the original lane order.
       NOTE(review): r therefore appears to count double-rounds -- confirm
       against the caller. */
    for (i = 0 ; i < r ; ++i) {
      /* ---- first round ---- */
      /* a += b; d ^= a; d <<<= 16 */
      x_0 = _mm_add_epi32(x_0, x_1);
      x_3 = _mm_xor_si128(x_3, x_0);
      x_3 = _mm_shuffle_epi8(x_3, rot16);

      /* c += d; b ^= c */
      x_2 = _mm_add_epi32(x_2, x_3);
      x_1 = _mm_xor_si128(x_1, x_2);

      /* b <<<= 12, built from a left shift, a right shift and an XOR */
      t_1 = x_1;
      x_1 = _mm_slli_epi32(x_1, 12);
      t_1 = _mm_srli_epi32(t_1, 20);
      x_1 = _mm_xor_si128(x_1, t_1);

      /* a += b; d ^= a; d <<<= 8.  The lane rotation of x_0 (0x93: each
         lane takes the value of the lane below it, lane 0 takes lane 3)
         is interleaved here; x_0 is not read again in this round. */
      x_0 = _mm_add_epi32(x_0, x_1);
      x_3 = _mm_xor_si128(x_3, x_0);
      x_0 = _mm_shuffle_epi32(x_0, 0x93);
      x_3 = _mm_shuffle_epi8(x_3, rot8);

      /* c += d; b ^= c.  x_3 has its 64-bit halves swapped (0x4e) and
         x_2 is lane-rotated the other way (0x39: each lane takes the
         value of the lane above it, lane 3 takes lane 0). */
      x_2 = _mm_add_epi32(x_2, x_3);
      x_3 = _mm_shuffle_epi32(x_3, 0x4e);
      x_1 = _mm_xor_si128(x_1, x_2);
      x_2 = _mm_shuffle_epi32(x_2, 0x39);

      /* b <<<= 7 */
      t_1 = x_1;
      x_1 = _mm_slli_epi32(x_1, 7);
      t_1 = _mm_srli_epi32(t_1, 25);
      x_1 = _mm_xor_si128(x_1, t_1);

      /* ---- second round (same arithmetic on the re-aligned lanes) ---- */
      /* a += b; d ^= a; d <<<= 16 */
      x_0 = _mm_add_epi32(x_0, x_1);
      x_3 = _mm_xor_si128(x_3, x_0);
      x_3 = _mm_shuffle_epi8(x_3, rot16);

      /* c += d; b ^= c */
      x_2 = _mm_add_epi32(x_2, x_3);
      x_1 = _mm_xor_si128(x_1, x_2);

      /* b <<<= 12 */
      t_1 = x_1;
      x_1 = _mm_slli_epi32(x_1, 12);
      t_1 = _mm_srli_epi32(t_1, 20);
      x_1 = _mm_xor_si128(x_1, t_1);

      /* a += b; d ^= a; d <<<= 8.  x_0 is rotated back with 0x39, the
         inverse of the 0x93 used in the first round. */
      x_0 = _mm_add_epi32(x_0, x_1);
      x_3 = _mm_xor_si128(x_3, x_0);
      x_0 = _mm_shuffle_epi32(x_0, 0x39);
      x_3 = _mm_shuffle_epi8(x_3, rot8);

      /* c += d; b ^= c.  x_3 halves are swapped back (0x4e is its own
         inverse) and x_2 is rotated back with 0x93. */
      x_2 = _mm_add_epi32(x_2, x_3);
      x_3 = _mm_shuffle_epi32(x_3, 0x4e);
      x_1 = _mm_xor_si128(x_1, x_2);
      x_2 = _mm_shuffle_epi32(x_2, 0x93);

      /* b <<<= 7 */
      t_1 = x_1;
      x_1 = _mm_slli_epi32(x_1, 7);
      t_1 = _mm_srli_epi32(t_1, 25);
      x_1 = _mm_xor_si128(x_1, t_1);
    }
    /* Feed-forward: add the unmodified input state back into the mixed
       state, word by word. */
    x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x +  0)));
    x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x +  4)));
    x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x +  8)));
    x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12)));
    /* XOR the resulting 64 bytes with the next 64 bytes of m ... */
    x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*)(m +  0)));
    x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*)(m + 16)));
    x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*)(m + 32)));
    x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*)(m + 48)));
    /* ... and write the result to out (unaligned stores). */
    _mm_storeu_si128((__m128i*)(out +  0),  x_0);
    _mm_storeu_si128((__m128i*)(out + 16),  x_1);
    _mm_storeu_si128((__m128i*)(out + 32),  x_2);
    _mm_storeu_si128((__m128i*)(out + 48),  x_3);
    
    /* Advance the block counter held in the state: x[12] is incremented
       and, when it wraps to zero, the carry goes into x[13]. */
    in12 = x[12];
    in13 = x[13];
    in12 ++;
    if (in12 == 0)
      in13 ++;
    x[12] = in12;
    x[13] = in13;

    /* Move on to the next 64-byte block. */
    bytes -= 64;
    out += 64;
    m += 64;
  }