remove early returns

Early returns within a kernel can have unexpected behavior, this strongly depends on the opencl runtime compiler. To avoid errors all early returns are removed. There is no negative effect(e.g.performance) if the threads stay alive up to the end of the kernel. One source for the early return is: http://al-key-opencl.blogspot.de/2014/09/be-careful-not-to-mix-early-return-and.html This pull request also fix a race condition where all fill the shared memory. Also a bug introduced with #16 is fixed, because of the early return not was possible that the last block works with an wrong initilized shared memory (result should be a wrong hash if the result target size is valid).

remove early returns
Early returns within a kernel can have unexpected behavior, this strongly depends on the opencl runtime compiler. To avoid errors all early returns are removed. There is no negative effect(e.g.performance) if the threads stay alive up to the end of the kernel. One source for the early return is: http://al-key-opencl.blogspot.de/2014/09/be-careful-not-to-mix-early-return-and.html This pull request also fix a race condition where all fill the shared memory. Also a bug introduced with #16 is fixed, because of the early return not was possible that the last block works with an wrong initilized shared memory (result should be a wrong hash if the result target size is valid).
2c2d05e5 · psychocrypt · e79debcf · 2c2d05e5
Commit 2c2d05e5 authored 7 years ago by psychocrypt
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -422,12 +422,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 	uint4 text;

 	const ulong gIdx = get_global_id(0) - get_global_offset(0);
-	if(gIdx >= Threads) return;

-	states += 25 * gIdx;
-	Scratchpad += gIdx * (0x80000 >> 2);
-
-	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
+	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
+		i < 256;
+		i += WORKSIZE * 8)
 	{
 		const uint tmp = AES0_C[i];
 		AES0[i] = tmp;
@@ -435,49 +433,64 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
 	}
+
 	barrier(CLK_LOCAL_MEM_FENCE);
-	
-	((ulong8 *)State)[0] = vload8(0, input);
-	State[8] = input[8];
-	State[9] = input[9];
-	State[10] = input[10];
-
-	((uint *)State)[9] &= 0x00FFFFFFU;
-	((uint *)State)[9] |= ((get_global_id(0)) & 0xFF) << 24;
-	((uint *)State)[10] &= 0xFF000000U;
-	((uint *)State)[10] |= ((get_global_id(0) >> 8));
-	
-	for(int i = 11; i < 25; ++i) State[i] = 0x00UL;
-	
-	// Last bit of padding
-	State[16] = 0x8000000000000000UL;
-	
-	keccakf1600_2(State);
-	
+
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		states += 25 * gIdx;
+		Scratchpad += gIdx * (0x80000 >> 2);
+
+		((ulong8 *)State)[0] = vload8(0, input);
+		State[8] = input[8];
+		State[9] = input[9];
+		State[10] = input[10];
+
+		((uint *)State)[9] &= 0x00FFFFFFU;
+		((uint *)State)[9] |= ((get_global_id(0)) & 0xFF) << 24;
+		((uint *)State)[10] &= 0xFF000000U;
+		((uint *)State)[10] |= ((get_global_id(0) >> 8));
+
+		for(int i = 11; i < 25; ++i) State[i] = 0x00UL;
+
+		// Last bit of padding
+		State[16] = 0x8000000000000000UL;
+
+		keccakf1600_2(State);
+	}
+
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
-	
-	#pragma unroll
-	for(int i = 0; i < 25; ++i) states[i] = State[i];
-	
-	text = vload4(get_local_id(1) + 4, (__global uint *)(states));
-	
-	#pragma unroll
-	for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey1)[i] = states[i];
-	
-	AESExpandKey256(ExpandedKey1);
-	
-	mem_fence(CLK_LOCAL_MEM_FENCE);
-	
-	#pragma unroll 2
-	for(int i = 0; i < 0x4000; ++i)
+
+	// do not use early return here
+	if(gIdx < Threads)
 	{
 		#pragma unroll
-		for(int j = 0; j < 10; ++j)
-			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
-		
-		Scratchpad[IDX((i << 3) + get_local_id(1))] = text;
+		for(int i = 0; i < 25; ++i) states[i] = State[i];
+
+		text = vload4(get_local_id(1) + 4, (__global uint *)(states));
+
+		#pragma unroll
+		for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey1)[i] = states[i];
+
+		AESExpandKey256(ExpandedKey1);
+	}
+
+	mem_fence(CLK_LOCAL_MEM_FENCE);
+
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		#pragma unroll 2
+		for(int i = 0; i < 0x4000; ++i)
+		{
+			#pragma unroll
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
+
+			Scratchpad[IDX((i << 3) + get_local_id(1))] = text;
+		}
 	}
-	
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }

@@ -488,11 +501,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];

 	const ulong gIdx = get_global_id(0) - get_global_offset(0);
-	if(gIdx >= Threads) return;

-	states += 25 * gIdx;
-	Scratchpad += gIdx * (0x80000 >> 2);
-	
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
 	{
 		const uint tmp = AES0_C[i];
@@ -501,41 +510,54 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
 	}
+
 	barrier(CLK_LOCAL_MEM_FENCE);
-	
-	a[0] = states[0] ^ states[4];
-	b[0] = states[2] ^ states[6];
-	a[1] = states[1] ^ states[5];
-	b[1] = states[3] ^ states[7];
-	
-	uint4 b_x = ((uint4 *)b)[0];
-	
+
+	uint4 b_x;
+		
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		states += 25 * gIdx;
+		Scratchpad += gIdx * (0x80000 >> 2);
+
+		a[0] = states[0] ^ states[4];
+		b[0] = states[2] ^ states[6];
+		a[1] = states[1] ^ states[5];
+		b[1] = states[3] ^ states[7];
+
+		b_x = ((uint4 *)b)[0];
+	}
+
 	mem_fence(CLK_LOCAL_MEM_FENCE);
-	
-	#pragma unroll 8
-	for(int i = 0; i < 0x80000; ++i)
+
+	// do not use early return here
+	if(gIdx < Threads)
 	{
-		ulong c[2];
-		
-		((uint4 *)c)[0] = Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)];
-		((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
-		//b_x ^= ((uint4 *)c)[0];
-		
-		Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)] = b_x ^ ((uint4 *)c)[0];
-		
-		uint4 tmp;
-		tmp = Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)];
-		
-		a[1] += c[0] * as_ulong2(tmp).s0;
-		a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
-		
-		Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)] = ((uint4 *)a)[0];
-		
-		((uint4 *)a)[0] ^= tmp;
-		
-		b_x = ((uint4 *)c)[0];
+		#pragma unroll 8
+		for(int i = 0; i < 0x80000; ++i)
+		{
+			ulong c[2];
+
+			((uint4 *)c)[0] = Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)];
+			((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
+			//b_x ^= ((uint4 *)c)[0];
+
+			Scratchpad[IDX((a[0] & 0x1FFFF0) >> 4)] = b_x ^ ((uint4 *)c)[0];
+
+			uint4 tmp;
+			tmp = Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)];
+
+			a[1] += c[0] * as_ulong2(tmp).s0;
+			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+
+			Scratchpad[IDX((c[0] & 0x1FFFF0) >> 4)] = ((uint4 *)a)[0];
+
+			((uint4 *)a)[0] ^= tmp;
+
+			b_x = ((uint4 *)c)[0];
+		}
 	}
-	
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }

@@ -548,12 +570,10 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 	uint4 text;
 	
 	const ulong gIdx = get_global_id(0) - get_global_offset(0);
-	if(gIdx >= Threads) return;
-
-	states += 25 * gIdx;
-	Scratchpad += gIdx * (0x80000 >> 2);

-	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
+	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
+		i < 256;
+		i += WORKSIZE * 8)
 	{
 		const uint tmp = AES0_C[i];
 		AES0[i] = tmp;
@@ -561,63 +581,78 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
 	}
+
 	barrier(CLK_LOCAL_MEM_FENCE);

-	#if defined(__Tahiti__) || defined(__Pitcairn__)
-	
-	for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4];
-	text = vload4(get_local_id(1) + 4, (__global uint *)states);
-	
-	#else
-	
-	text = vload4(get_local_id(1) + 4, (__global uint *)states);
-	((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states);
-	
-	#endif
-	
-	AESExpandKey256(ExpandedKey2);
-	
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		states += 25 * gIdx;
+		Scratchpad += gIdx * (0x80000 >> 2);
+
+		#if defined(__Tahiti__) || defined(__Pitcairn__)
+
+		for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4];
+		text = vload4(get_local_id(1) + 4, (__global uint *)states);
+
+		#else
+
+		text = vload4(get_local_id(1) + 4, (__global uint *)states);
+		((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states);
+
+		#endif
+
+		AESExpandKey256(ExpandedKey2);
+	}
+
 	barrier(CLK_LOCAL_MEM_FENCE);
-	
-	#pragma unroll 2
-	for(int i = 0; i < 0x4000; ++i)
-	{		
-		text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
-		
-		#pragma unroll
-		for(int j = 0; j < 10; ++j)
-			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		#pragma unroll 2
+		for(int i = 0; i < 0x4000; ++i)
+		{
+			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+
+			#pragma unroll
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+		}
+
+		vstore2(as_ulong2(text), get_local_id(1) + 4, states);
 	}
-	
-	vstore2(as_ulong2(text), get_local_id(1) + 4, states);
-	
+
 	barrier(CLK_GLOBAL_MEM_FENCE);
-	
-	if(!get_local_id(1))
+
+	// do not use early return here
+	if(gIdx < Threads)
 	{
-		for(int i = 0; i < 25; ++i) State[i] = states[i];
-		
-		keccakf1600_2(State);
-		
-		for(int i = 0; i < 25; ++i) states[i] = State[i];
-		
-		switch(State[0] & 3)
+		if(!get_local_id(1))
 		{
-			case 0:
-				Branch0[atomic_inc(Branch0 + Threads)] = get_global_id(0) - get_global_offset(0);
-				break;
-			case 1:
-				Branch1[atomic_inc(Branch1 + Threads)] = get_global_id(0) - get_global_offset(0);
-				break;
-			case 2:
-				Branch2[atomic_inc(Branch2 + Threads)] = get_global_id(0) - get_global_offset(0);
-				break;
-			case 3:
-				Branch3[atomic_inc(Branch3 + Threads)] = get_global_id(0) - get_global_offset(0);
-				break;
+			for(int i = 0; i < 25; ++i) State[i] = states[i];
+
+			keccakf1600_2(State);
+
+			for(int i = 0; i < 25; ++i) states[i] = State[i];
+
+			switch(State[0] & 3)
+			{
+				case 0:
+					Branch0[atomic_inc(Branch0 + Threads)] = get_global_id(0) - get_global_offset(0);
+					break;
+				case 1:
+					Branch1[atomic_inc(Branch1 + Threads)] = get_global_id(0) - get_global_offset(0);
+					break;
+				case 2:
+					Branch2[atomic_inc(Branch2 + Threads)] = get_global_id(0) - get_global_offset(0);
+					break;
+				case 3:
+					Branch3[atomic_inc(Branch3 + Threads)] = get_global_id(0) - get_global_offset(0);
+					break;
+			}
 		}
 	}
-	
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }

@@ -634,54 +669,56 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 {
 	const ulong idx = get_global_id(0) - get_global_offset(0);
 	
-	if(idx >= Threads) return;
-	
-	states += 25 * BranchBuf[idx];
-	
-	// skein
-	ulong8 h = vload8(0, SKEIN512_256_IV);
-	
-	// Type field begins with final bit, first bit, then six bits of type; the last 96
-	// bits are input processed (including in the block to be processed with that tweak)
-	// The output transform is only one run of UBI, since we need only 256 bits of output
-	// The tweak for the output transform is Type = Output with the Final bit set
-	// T[0] for the output is 8, and I don't know why - should be message size...
-	ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL };
-	ulong8 p, m;
-	
-	for(uint i = 0; i < 4; ++i)
+	// do not use early return here
+	if(idx < Threads)
 	{
-		if(i < 3) t[0] += 0x40UL;
-		else t[0] += 0x08UL;
-		
+		states += 25 * BranchBuf[idx];
+
+		// skein
+		ulong8 h = vload8(0, SKEIN512_256_IV);
+
+		// Type field begins with final bit, first bit, then six bits of type; the last 96
+		// bits are input processed (including in the block to be processed with that tweak)
+		// The output transform is only one run of UBI, since we need only 256 bits of output
+		// The tweak for the output transform is Type = Output with the Final bit set
+		// T[0] for the output is 8, and I don't know why - should be message size...
+		ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL };
+		ulong8 p, m;
+
+		for(uint i = 0; i < 4; ++i)
+		{
+			if(i < 3) t[0] += 0x40UL;
+			else t[0] += 0x08UL;
+
+			t[2] = t[0] ^ t[1];
+
+			m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
+			const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
+			p = Skein512Block(m, h, h8, t);
+
+			h = m ^ p;
+
+			if(i < 2) t[1] = 0x3000000000000000UL;
+			else t[1] = 0xB000000000000000UL;
+		}
+
+		t[0] = 0x08UL;
+		t[1] = 0xFF00000000000000UL;
 		t[2] = t[0] ^ t[1];
-		
-		m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
+
+		p = (ulong8)(0);
 		const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
-		p = Skein512Block(m, h, h8, t);
-		
-		h = m ^ p;
-		
-		if(i < 2) t[1] = 0x3000000000000000UL;
-		else t[1] = 0xB000000000000000UL;
-	}
-	
-	t[0] = 0x08UL;
-	t[1] = 0xFF00000000000000UL;
-	t[2] = t[0] ^ t[1];
-	
-	p = (ulong8)(0);
-	const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
-	
-	p = Skein512Block(p, h, h8, t);
-	
-	//vstore8(p, 0, output);
-	
-	if(as_uint16(p).s7 <= Target)
-	{
-		ulong outIdx = atomic_inc(output + 0xFF);
-		if(outIdx < 0xFF)
-			output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+
+		p = Skein512Block(p, h, h8, t);
+
+		//vstore8(p, 0, output);
+
+		if(as_uint16(p).s7 <= Target)
+		{
+			ulong outIdx = atomic_inc(output + 0xFF);
+			if(outIdx < 0xFF)
+				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);	
 }
@@ -692,66 +729,68 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
 {
 	const uint idx = get_global_id(0) - get_global_offset(0);
 	
-	if(idx >= Threads) return;
-	
-	states += 25 * BranchBuf[idx];
-	
-	sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL;
-	sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
-	sph_u64 tmp;
-	
-	for(int i = 0; i < 5; ++i)
+	// do not use early return here
+	if(idx < Threads)
 	{
-		ulong input[8];
-		
-		if(i < 3)
-		{
-			for(int x = 0; x < 8; ++x) input[x] = (states[(i << 3) + x]);
-		}
-		else if(i == 3)
+		states += 25 * BranchBuf[idx];
+
+		sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL;
+		sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
+		sph_u64 tmp;
+
+		for(int i = 0; i < 5; ++i)
 		{
-			input[0] = (states[24]);
-			input[1] = 0x80UL;
-			for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
+			ulong input[8];
+
+			if(i < 3)
+			{
+				for(int x = 0; x < 8; ++x) input[x] = (states[(i << 3) + x]);
+			}
+			else if(i == 3)
+			{
+				input[0] = (states[24]);
+				input[1] = 0x80UL;
+				for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
+			}
+			else
+			{
+				input[7] = 0x4006000000000000UL;
+
+				for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
+			}
+
+			h0h ^= input[0];
+			h0l ^= input[1];
+			h1h ^= input[2];
+			h1l ^= input[3];
+			h2h ^= input[4];
+			h2l ^= input[5];
+			h3h ^= input[6];
+			h3l ^= input[7];
+
+			E8;
+
+			h4h ^= input[0];
+			h4l ^= input[1];
+			h5h ^= input[2];
+			h5l ^= input[3];
+			h6h ^= input[4];
+			h6l ^= input[5];
+			h7h ^= input[6];
+			h7l ^= input[7];
 		}
-		else
+
+		//output[0] = h6h;
+		//output[1] = h6l;
+		//output[2] = h7h;
+		//output[3] = h7l;
+
+		if(as_uint2(h7l).s1 <= Target)
 		{
-			input[7] = 0x4006000000000000UL;
-						
-			for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
+			ulong outIdx = atomic_inc(output + 0xFF);
+			if(outIdx < 0xFF)
+				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 		}
-		
-		h0h ^= input[0];
-		h0l ^= input[1];
-		h1h ^= input[2];
-		h1l ^= input[3];
-		h2h ^= input[4];
-		h2l ^= input[5];
-		h3h ^= input[6];
-		h3l ^= input[7];
-		
-		E8;
-		
-		h4h ^= input[0];
-		h4l ^= input[1];
-		h5h ^= input[2];
-		h5l ^= input[3];
-		h6h ^= input[4];
-		h6l ^= input[5];
-		h7h ^= input[6];
-		h7l ^= input[7];
-	}
-	
-	//output[0] = h6h;
-	//output[1] = h6l;
-	//output[2] = h7h;
-	//output[3] = h7l;
-	
-	if(as_uint2(h7l).s1 <= Target)
-	{
-		ulong outIdx = atomic_inc(output + 0xFF);
-		if(outIdx < 0xFF)
-			output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 	}
 }

@@ -761,70 +800,71 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
 {
 	const uint idx = get_global_id(0) - get_global_offset(0);
 	
-	if(idx >= Threads) return;
-	
-	states += 25 * BranchBuf[idx];
-	
-	unsigned int m[16];
-	unsigned int v[16];
-	uint h[8];
-	
-	((uint8 *)h)[0] = vload8(0U, c_IV256);
-	
-	for(uint i = 0, bitlen = 0; i < 4; ++i)
+	// do not use early return here
+	if(idx < Threads)
 	{
-		if(i < 3)
+		states += 25 * BranchBuf[idx];
+	
+		unsigned int m[16];
+		unsigned int v[16];
+		uint h[8];
+
+		((uint8 *)h)[0] = vload8(0U, c_IV256);
+
+		for(uint i = 0, bitlen = 0; i < 4; ++i)
 		{
-			((uint16 *)m)[0] = vload16(i, (__global uint *)states);
-			for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]);
-			bitlen += 512;
+			if(i < 3)
+			{
+				((uint16 *)m)[0] = vload16(i, (__global uint *)states);
+				for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]);
+				bitlen += 512;
+			}
+			else
+			{
+				m[0] = SWAP4(((__global uint *)states)[48]);
+				m[1] = SWAP4(((__global uint *)states)[49]);
+				m[2] = 0x80000000U;
+
+				for(int i = 3; i < 13; ++i) m[i] = 0x00U;
+
+				m[13] = 1U;
+				m[14] = 0U;
+				m[15] = 0x640;
+				bitlen += 64;
+			}
+
+			((uint16 *)v)[0].lo = ((uint8 *)h)[0];
+			((uint16 *)v)[0].hi = vload8(0U, c_u256);
+
+			//v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U;
+			//v[13] ^= (i < 3) ? (i + 1) << 9 : 1600U;
+
+			v[12] ^= bitlen;
+			v[13] ^= bitlen;
+
+			for(int r = 0; r < 14; r++)
+			{
+				GS(0, 4, 0x8, 0xC, 0x0);
+				GS(1, 5, 0x9, 0xD, 0x2);
+				GS(2, 6, 0xA, 0xE, 0x4);
+				GS(3, 7, 0xB, 0xF, 0x6);
+				GS(0, 5, 0xA, 0xF, 0x8);
+				GS(1, 6, 0xB, 0xC, 0xA);
+				GS(2, 7, 0x8, 0xD, 0xC);
+				GS(3, 4, 0x9, 0xE, 0xE);
+			}
+
+			((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1];
 		}
-		else
+
+		for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]);
+
+		if(h[7] <= Target)
 		{
-			m[0] = SWAP4(((__global uint *)states)[48]);
-			m[1] = SWAP4(((__global uint *)states)[49]);
-			m[2] = 0x80000000U;
-			
-			for(int i = 3; i < 13; ++i) m[i] = 0x00U;
-			
-			m[13] = 1U;
-			m[14] = 0U;
-			m[15] = 0x640;
-			bitlen += 64;
+			ulong outIdx = atomic_inc(output + 0xFF);
+			if(outIdx < 0xFF)
+				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 		}
-		
-		((uint16 *)v)[0].lo = ((uint8 *)h)[0];
-		((uint16 *)v)[0].hi = vload8(0U, c_u256);
-		
-		//v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U;
-		//v[13] ^= (i < 3) ? (i + 1) << 9 : 1600U;
-		
-		v[12] ^= bitlen;
-		v[13] ^= bitlen;
-		
-		for(int r = 0; r < 14; r++)
-		{	
-			GS(0, 4, 0x8, 0xC, 0x0);
-			GS(1, 5, 0x9, 0xD, 0x2);
-			GS(2, 6, 0xA, 0xE, 0x4);
-			GS(3, 7, 0xB, 0xF, 0x6);
-			GS(0, 5, 0xA, 0xF, 0x8);
-			GS(1, 6, 0xB, 0xC, 0xA);
-			GS(2, 7, 0x8, 0xD, 0xC);
-			GS(3, 4, 0x9, 0xE, 0xE);
-		}
-		
-		((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1];
-	}
-	
-	for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]);
-	
-	//for(int i = 0; i < 4; ++i) output[i] = ((ulong *)h)[i];
-	if(h[7] <= Target)
-	{
-		ulong outIdx = atomic_inc(output + 0xFF);
-		if(outIdx < 0xFF)
-			output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 	}
 }

@@ -832,56 +872,57 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 {
 	const uint idx = get_global_id(0) - get_global_offset(0);
 	
-	if(idx >= Threads) return;
-	
-	states += 25 * BranchBuf[idx];
-	
-	ulong State[8];
-	
-	for(int i = 0; i < 7; ++i) State[i] = 0UL;
-	
-	State[7] = 0x0001000000000000UL;
-	
-	for(uint i = 0; i < 4; ++i)
+	// do not use early return here
+	if(idx < Threads)
 	{
-		ulong H[8], M[8];
-		
-		if(i < 3)
+		states += 25 * BranchBuf[idx];
+
+		ulong State[8];
+
+		for(int i = 0; i < 7; ++i) State[i] = 0UL;
+
+		State[7] = 0x0001000000000000UL;
+
+		for(uint i = 0; i < 4; ++i)
 		{
-			((ulong8 *)M)[0] = vload8(i, states);
+			ulong H[8], M[8];
+
+			if(i < 3)
+			{
+				((ulong8 *)M)[0] = vload8(i, states);
+			}
+			else
+			{
+				M[0] = states[24];
+				M[1] = 0x80UL;
+
+				for(int x = 2; x < 7; ++x) M[x] = 0UL;
+
+				M[7] = 0x0400000000000000UL;
+			}
+
+			for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x];
+
+			PERM_SMALL_P(H);
+			PERM_SMALL_Q(M);
+
+			for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x];
 		}
-		else
+
+		ulong tmp[8];
+
+		for(int i = 0; i < 8; ++i) tmp[i] = State[i];
+
+		PERM_SMALL_P(State);
+
+		for(int i = 0; i < 8; ++i) State[i] ^= tmp[i];
+
+		if(as_uint2(State[7]).s1 <= Target)
 		{
-			M[0] = states[24];
-			M[1] = 0x80UL;
-						
-			for(int x = 2; x < 7; ++x) M[x] = 0UL;
-			
-			M[7] = 0x0400000000000000UL;
+			ulong outIdx = atomic_inc(output + 0xFF);
+			if(outIdx < 0xFF)
+				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 		}
-		
-		for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x];
-		
-		PERM_SMALL_P(H);
-		PERM_SMALL_Q(M);
-		
-		for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x];
-	}
-	
-	ulong tmp[8];
-	
-	for(int i = 0; i < 8; ++i) tmp[i] = State[i];
-	
-	PERM_SMALL_P(State);
-	
-	for(int i = 0; i < 8; ++i) State[i] ^= tmp[i];
-	
-	//for(int i = 0; i < 4; ++i) output[i] = State[i + 4];
-	if(as_uint2(State[7]).s1 <= Target)
-	{
-		ulong outIdx = atomic_inc(output + 0xFF);
-		if(outIdx < 0xFF)
-			output[outIdx] = BranchBuf[idx] + get_global_offset(0);
 	}
 }