2015-10-23 08:23:09 +02:00
|
|
|
#ifndef CUDA_VECTOR_UINT2x4_H
|
|
|
|
#define CUDA_VECTOR_UINT2x4_H
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// __LDG_PTR is the inline-asm operand constraint used for pointer inputs in
// this header: "l" on 64-bit builds (LP64, or MSVC targeting Win64) and "r"
// otherwise.
#if defined(__LP64__) || (defined(_WIN64) && defined(_MSC_VER))
#define __LDG_PTR "l"
#else
#define __LDG_PTR "r"
#endif
|
|
|
|
|
|
|
|
#include "cuda_helper.h"
|
|
|
|
|
2015-10-23 12:17:11 +02:00
|
|
|
// Four uint2 lanes (x, y, z, w) grouped into a single aggregate that is
// declared with 16-byte alignment.
typedef struct __align__(16) uint2x4 {
	uint2 x;
	uint2 y;
	uint2 z;
	uint2 w;
} uint2x4;
|
2015-10-23 08:23:09 +02:00
|
|
|
|
2015-10-23 14:43:22 +02:00
|
|
|
|
2015-10-23 12:17:11 +02:00
|
|
|
// Assembles a uint2x4 from four uint2 values, assigned in order:
// s0 -> x, s1 -> y, s2 -> z, s3 -> w.
static __inline__ __device__ uint2x4 make_uint2x4(uint2 s0, uint2 s1, uint2 s2, uint2 s3)
{
	uint2x4 packed;
	packed.x = s0;
	packed.y = s1;
	packed.z = s2;
	packed.w = s3;
	return packed;
}
|
|
|
|
|
2015-10-23 12:17:11 +02:00
|
|
|
// Lane-wise XOR of two uint2x4 values: each of x, y, z, w in the result is
// the XOR of the matching lanes of a and b.
// NOTE(review): relies on a uint2 operator^ that is not declared in this
// header -- presumably provided by cuda_helper.h; confirm.
static __forceinline__ __device__ uint2x4 operator^ (const uint2x4 &a, const uint2x4 &b)
{
	const uint2 lane_x = a.x ^ b.x;
	const uint2 lane_y = a.y ^ b.y;
	const uint2 lane_z = a.z ^ b.z;
	const uint2 lane_w = a.w ^ b.w;
	return make_uint2x4(lane_x, lane_y, lane_z, lane_w);
}
|
|
|
|
|
2015-10-23 12:17:11 +02:00
|
|
|
// Lane-wise addition of two uint2x4 values: each of x, y, z, w in the result
// is the sum of the matching lanes of a and b.
// NOTE(review): relies on a uint2 operator+ that is not declared in this
// header -- presumably provided by cuda_helper.h; confirm.
static __forceinline__ __device__ uint2x4 operator+ (const uint2x4 &a, const uint2x4 &b)
{
	const uint2 lane_x = a.x + b.x;
	const uint2 lane_y = a.y + b.y;
	const uint2 lane_z = a.z + b.z;
	const uint2 lane_w = a.w + b.w;
	return make_uint2x4(lane_x, lane_y, lane_z, lane_w);
}
|
|
|
|
|
|
|
|
/////////////////////////
|
|
|
|
|
2015-10-23 12:17:11 +02:00
|
|
|
// In-place XOR: replaces a with (a ^ b). Returns void, so the expression
// cannot be chained.
static __forceinline__ __device__ void operator^= (uint2x4 &a, const uint2x4 &b)
{
	const uint2x4 mixed = a ^ b;
	a = mixed;
}
|
|
|
|
// In-place addition: replaces a with (a + b). Returns void, so the expression
// cannot be chained.
static __forceinline__ __device__ void operator+= (uint2x4 &a, const uint2x4 &b)
{
	const uint2x4 total = a + b;
	a = total;
}
|
2015-10-23 08:23:09 +02:00
|
|
|
|
2015-10-23 12:17:11 +02:00
|
|
|
#if __CUDA_ARCH__ >= 320
|
2015-10-23 08:23:09 +02:00
|
|
|
|
2015-10-23 12:17:11 +02:00
|
|
|
// Loads one uint2x4 (eight 32-bit words) from ptr with two
// ld.global.nc.v4.u32 instructions and returns it by value.
// The first instruction fills lanes x and y from [ptr], the second fills
// lanes z and w from [ptr+16].
// NOTE(review): ld.global.nc is the non-coherent global load, so the data at
// ptr is presumably expected to stay unmodified while the kernel runs --
// confirm against callers.
static __device__ __inline__ uint2x4 __ldg4(const uint2x4 *ptr)
{
	uint2x4 loaded;

	// Words 0..3 -> x.x, x.y, y.x, y.y
	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"
		: "=r"(loaded.x.x), "=r"(loaded.x.y), "=r"(loaded.y.x), "=r"(loaded.y.y)
		: __LDG_PTR(ptr));

	// Words 4..7 -> z.x, z.y, w.x, w.y
	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];"
		: "=r"(loaded.z.x), "=r"(loaded.z.y), "=r"(loaded.w.x), "=r"(loaded.w.y)
		: __LDG_PTR(ptr));

	return loaded;
}
|
|
|
|
|
2015-10-23 12:17:11 +02:00
|
|
|
// Loads three consecutive uint2x4 elements (byte offsets 0..95 from ptr) into
// ret[0], ret[1] and ret[2] using six ld.global.nc.v4.u32 instructions.
// Each instruction moves 16 bytes: the even ones fill lanes x and y of an
// element, the odd ones fill lanes z and w.
// ret must therefore point at storage for at least three uint2x4 values.
static __device__ __inline__ void ldg4(const uint2x4 *ptr, uint2x4 *ret)
{
	uint2x4 &first  = ret[0];
	uint2x4 &second = ret[1];
	uint2x4 &third  = ret[2];

	// Element 0: bytes 0..31
	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"
		: "=r"(first.x.x), "=r"(first.x.y), "=r"(first.y.x), "=r"(first.y.y)
		: __LDG_PTR(ptr));
	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];"
		: "=r"(first.z.x), "=r"(first.z.y), "=r"(first.w.x), "=r"(first.w.y)
		: __LDG_PTR(ptr));

	// Element 1: bytes 32..63
	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];"
		: "=r"(second.x.x), "=r"(second.x.y), "=r"(second.y.x), "=r"(second.y.y)
		: __LDG_PTR(ptr));
	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];"
		: "=r"(second.z.x), "=r"(second.z.y), "=r"(second.w.x), "=r"(second.w.y)
		: __LDG_PTR(ptr));

	// Element 2: bytes 64..95
	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];"
		: "=r"(third.x.x), "=r"(third.x.y), "=r"(third.y.x), "=r"(third.y.y)
		: __LDG_PTR(ptr));
	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];"
		: "=r"(third.z.x), "=r"(third.z.y), "=r"(third.w.x), "=r"(third.w.y)
		: __LDG_PTR(ptr));
}
|
2015-10-23 14:43:22 +02:00
|
|
|
#elif !defined(__ldg4)
|
|
|
|
// Fallback for __ldg4() when the inline-asm version is not compiled:
// a plain dereference of the pointer.
#define __ldg4(x) (*(x))
// Fallback for ldg4() when the inline-asm version is not compiled.
// The inline-asm ldg4() fills ret[0], ret[1] and ret[2] from three consecutive
// elements at ptr, so the fallback copies the same three elements; copying
// only *(ptr) would leave ret[1] and ret[2] unwritten on this path.
// Note: ptr and ret are each evaluated three times, so do not pass
// expressions with side effects.
#define ldg4(ptr, ret) { (ret)[0] = (ptr)[0]; (ret)[1] = (ptr)[1]; (ret)[2] = (ptr)[2]; }
|
2015-10-23 12:17:11 +02:00
|
|
|
#endif
|
2015-10-23 08:23:09 +02:00
|
|
|
|
|
|
|
#endif // H
|