Browse Source

99% done Windows ARM32 port

pull/260/head
Er2 1 year ago
parent
commit
ee04c66aa5
  1. 1
      .gitignore
  2. 60
      common/sse2neon.h
  3. 4
      engine/cmodel.cpp
  4. 2
      engine/l_studio.cpp
  5. 2
      engine/sys_engine.cpp
  6. 4
      game/client/detailobjectsystem.cpp
  7. 4
      inputsystem/inputsystem.cpp
  8. 9
      inputsystem/inputsystem.h
  9. 370
      inputsystem/joystick_win32.cpp
  10. 8
      inputsystem/wscript
  11. 2
      materialsystem/colorspace.h
  12. 2
      mathlib/3dnow.cpp
  13. 6
      mathlib/mathlib_base.cpp
  14. 20
      mathlib/sse.cpp
  15. 42
      mathlib/sseconst.cpp
  16. 2
      mathlib/ssenoise.cpp
  17. 23
      public/materialsystem/imesh.h
  18. 6
      public/mathlib/mathlib.h
  19. 3
      public/mathlib/simdvectormatrix.h
  20. 14
      public/mathlib/ssemath.h
  21. 2
      public/mathlib/vmatrix.h
  22. 4
      public/shaderapi/ishaderapi.h
  23. 2
      public/tier0/commonmacros.h
  24. 34
      public/tier0/platform.h
  25. 4
      public/tier0/threadtools.h
  26. 8
      studiorender/r_studiodraw.cpp
  27. 2
      studiorender/r_studiolight.cpp
  28. 4
      studiorender/r_studiolight.h
  29. 2
      studiorender/studiorender.h
  30. 2
      tier0/PMELib.cpp
  31. 2
      tier0/cpumonitoring.cpp
  32. 12
      tier0/threadtools.cpp
  33. 7
      tier0/wscript
  34. 2
      tier1/processor_detect.cpp
  35. 4
      vphysics/trace.cpp
  36. 8
      vstdlib/coroutine.cpp
  37. 22
      wscript

1
.gitignore vendored

@ -1,3 +1,4 @@
*~
*.mak *.mak
*.mak.vpc_crc *.mak.vpc_crc
*.vpc_crc *.vpc_crc

60
common/sse2neon.h

@ -89,9 +89,6 @@
#define _sse2neon_likely(x) __builtin_expect(!!(x), 1) #define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
#if _MSVC_TRADITIONAL
#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
#endif
#ifndef FORCE_INLINE #ifndef FORCE_INLINE
#define FORCE_INLINE static inline #define FORCE_INLINE static inline
#endif #endif
@ -184,6 +181,10 @@
} while (0) } while (0)
#endif #endif
#ifdef _M_ARM
#define vst1q_lane_s64(a, b, c)
#endif
/* Memory barriers /* Memory barriers
* __atomic_thread_fence does not include a compiler barrier; instead, * __atomic_thread_fence does not include a compiler barrier; instead,
* the barrier is part of __atomic_load/__atomic_store's "volatile-like" * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
@ -202,8 +203,12 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
#elif defined(__GNUC__) || defined(__clang__) #elif defined(__GNUC__) || defined(__clang__)
__atomic_thread_fence(__ATOMIC_SEQ_CST); __atomic_thread_fence(__ATOMIC_SEQ_CST);
#else /* MSVC */ #else /* MSVC */
#ifdef _M_ARM
__dmb(_ARM_BARRIER_ISH);
#else
__dmb(_ARM64_BARRIER_ISH); __dmb(_ARM64_BARRIER_ISH);
#endif #endif
#endif
} }
/* Architecture-specific build options */ /* Architecture-specific build options */
@ -268,7 +273,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
* we have to perform syscall instead. * we have to perform syscall instead.
*/ */
#if (!defined(__aarch64__) && !defined(_M_ARM64)) #if (!defined(__aarch64__) && !defined(_M_ARM64))
#include <sys/time.h> #include <time.h>
#endif #endif
/* "__has_builtin" can be used to query support for built-in functions /* "__has_builtin" can be used to query support for built-in functions
@ -574,10 +579,10 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
/* Backwards compatibility for compilers with lack of specific type support */ /* Backwards compatibility for compilers with lack of specific type support */
// Older gcc does not define vld1q_u8_x4 type // Older gcc does not define vld1q_u8_x4 type
#if defined(__GNUC__) && !defined(__clang__) && \ #if defined(_M_ARM) || (defined(__GNUC__) && !defined(__clang__) && \
((__GNUC__ <= 12 && defined(__arm__)) || \ ((__GNUC__ <= 12 && defined(__arm__)) || \
(__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
(__GNUC__ <= 9 && defined(__aarch64__))) (__GNUC__ <= 9 && defined(__aarch64__))))
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{ {
uint8x16x4_t ret; uint8x16x4_t ret;
@ -610,6 +615,9 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
} }
#endif #endif
#if defined(_M_ARM)
#pragma message("TODO: Windows ARM32: Port many SSE2NEON functions")
#else
#if !defined(__aarch64__) && !defined(_M_ARM64) #if !defined(__aarch64__) && !defined(_M_ARM64)
/* emulate vaddvq u8 variant */ /* emulate vaddvq u8 variant */
FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
@ -645,6 +653,7 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
return vaddvq_u16(a); return vaddvq_u16(a);
} }
#endif #endif
#endif
/* Function Naming Conventions /* Function Naming Conventions
* The naming convention of SSE intrinsics is straightforward. A generic SSE * The naming convention of SSE intrinsics is straightforward. A generic SSE
@ -1765,6 +1774,7 @@ FORCE_INLINE void _mm_free(void *addr)
} }
#endif #endif
#ifndef _M_ARM
FORCE_INLINE uint64_t _sse2neon_get_fpcr() FORCE_INLINE uint64_t _sse2neon_get_fpcr()
{ {
uint64_t value; uint64_t value;
@ -1808,6 +1818,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF; return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
} }
#endif
// Macro: Get the rounding mode bits from the MXCSR control and status register. // Macro: Get the rounding mode bits from the MXCSR control and status register.
// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
@ -1826,6 +1837,8 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
#if defined(__aarch64__) || defined(_M_ARM64) #if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr(); r.value = _sse2neon_get_fpcr();
#elif defined(_M_ARM)
r.value = _MoveFromCoprocessor(10,7, 1,0,0);
#else #else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif #endif
@ -2247,7 +2260,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
FORCE_INLINE void _mm_prefetch(char const *p, int i) FORCE_INLINE void _mm_prefetch(char const *p, int i)
{ {
(void) i; (void) i;
#if defined(_MSC_VER) #ifdef _M_ARM64
switch (i) { switch (i) {
case _MM_HINT_NTA: case _MM_HINT_NTA:
__prefetch2(p, 1); __prefetch2(p, 1);
@ -2262,6 +2275,8 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i)
__prefetch2(p, 4); __prefetch2(p, 4);
break; break;
} }
#elif defined(_M_ARM)
// TODO
#else #else
switch (i) { switch (i) {
case _MM_HINT_NTA: case _MM_HINT_NTA:
@ -2348,6 +2363,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0)); vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
} }
#ifndef _M_ARM
// Macro: Set the flush zero bits of the MXCSR control and status register to // Macro: Set the flush zero bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The flush zero may contain any of the // the value in unsigned 32-bit integer a. The flush zero may contain any of the
// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
@ -2379,6 +2395,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif #endif
} }
#endif
// Set packed single-precision (32-bit) floating-point elements in dst with the // Set packed single-precision (32-bit) floating-point elements in dst with the
// supplied values. // supplied values.
@ -2404,6 +2421,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{ {
#ifndef _M_ARM
union { union {
fpcr_bitfield field; fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64) #if defined(__aarch64__) || defined(_M_ARM64)
@ -2442,6 +2460,7 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
#else #else
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif #endif
#endif
} }
// Copy single-precision (32-bit) floating-point element a to the lower element // Copy single-precision (32-bit) floating-point element a to the lower element
@ -3206,6 +3225,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
} }
#ifndef _M_ARM
// Compare packed double-precision (64-bit) floating-point elements in a and b // Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than-or-equal, and store the results in dst. // for greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
@ -3247,6 +3267,7 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
return vreinterpretq_m128d_u64(vld1q_u64(d)); return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif #endif
} }
#endif
// Compare packed signed 16-bit integers in a and b for greater-than, and store // Compare packed signed 16-bit integers in a and b for greater-than, and store
// the results in dst. // the results in dst.
@ -3275,6 +3296,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
} }
#ifndef _M_ARM
// Compare packed double-precision (64-bit) floating-point elements in a and b // Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than, and store the results in dst. // for greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
@ -3358,6 +3380,7 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
return vreinterpretq_m128d_u64(vld1q_u64(d)); return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif #endif
} }
#endif
// Compare packed signed 16-bit integers in a and b for less-than, and store the // Compare packed signed 16-bit integers in a and b for less-than, and store the
// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the // results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
@ -3389,6 +3412,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
} }
#ifndef _M_ARM
// Compare packed double-precision (64-bit) floating-point elements in a and b // Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than, and store the results in dst. // for less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
@ -3429,6 +3453,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
return vreinterpretq_m128d_u64(vld1q_u64(d)); return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif #endif
} }
#endif
// Compare packed double-precision (64-bit) floating-point elements in a and b // Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-equal, and store the results in dst. // for not-equal, and store the results in dst.
@ -3456,6 +3481,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
} }
#ifndef _M_ARM
// Compare packed double-precision (64-bit) floating-point elements in a and b // Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than-or-equal, and store the results in dst. // for not-greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
@ -3756,6 +3782,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
return (*(double *) &a0 < *(double *) &b0); return (*(double *) &a0 < *(double *) &b0);
#endif #endif
} }
#endif
// Compare the lower double-precision (64-bit) floating-point element in a and b // Compare the lower double-precision (64-bit) floating-point element in a and b
// for equality, and return the boolean result (0 or 1). // for equality, and return the boolean result (0 or 1).
@ -4401,6 +4428,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
} }
#ifndef _M_ARM
// Compare packed double-precision (64-bit) floating-point elements in a and b, // Compare packed double-precision (64-bit) floating-point elements in a and b,
// and store packed maximum values in dst. // and store packed maximum values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
@ -4487,6 +4515,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
return vreinterpretq_m128d_u64(vld1q_u64(d)); return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif #endif
} }
#endif
// Compare the lower double-precision (64-bit) floating-point elements in a and // Compare the lower double-precision (64-bit) floating-point elements in a and
// b, store the minimum value in the lower element of dst, and copy the upper // b, store the minimum value in the lower element of dst, and copy the upper
@ -4793,7 +4822,11 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
FORCE_INLINE void _mm_pause() FORCE_INLINE void _mm_pause()
{ {
#if defined(_MSC_VER) #if defined(_MSC_VER)
#ifdef _M_ARM
__isb(_ARM_BARRIER_SY);
#else
__isb(_ARM64_BARRIER_SY); __isb(_ARM64_BARRIER_SY);
#endif
#else #else
__asm__ __volatile__("isb\n"); __asm__ __volatile__("isb\n");
#endif #endif
@ -7622,6 +7655,7 @@ FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
} }
/* SSE4.2 */ /* SSE4.2 */
#ifndef _M_ARM
const static uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { const static uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
@ -8463,9 +8497,11 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
return crc; return crc;
} }
#endif
/* AES */ /* AES */
#if !defined(__ARM_FEATURE_CRYPTO) && !defined(_M_ARM64) #if !defined(__ARM_FEATURE_CRYPTO) && !defined(_M_ARM64) && !defined(_M_ARM)
/* clang-format off */ /* clang-format off */
#define SSE2NEON_AES_SBOX(w) \ #define SSE2NEON_AES_SBOX(w) \
{ \ { \
@ -8913,6 +8949,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
#undef SSE2NEON_MULTIPLY #undef SSE2NEON_MULTIPLY
#endif #endif
#elif defined(_M_ARM)
#else /* __ARM_FEATURE_CRYPTO */ #else /* __ARM_FEATURE_CRYPTO */
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
// AESMC and then manually applying the real key as an xor operation. This // AESMC and then manually applying the real key as an xor operation. This
@ -9034,6 +9071,7 @@ FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
} }
} }
#ifndef _M_ARM
FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode() FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
{ {
union { union {
@ -9053,6 +9091,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
} }
#endif
// Count the number of bits set to 1 in unsigned 32-bit integer a, and // Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst. // return that count in dst.
@ -9113,6 +9152,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
#endif #endif
} }
#ifndef _M_ARM
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{ {
// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
@ -9140,6 +9180,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif #endif
} }
#endif
// Return the current 64-bit value of the processor's time-stamp counter. // Return the current 64-bit value of the processor's time-stamp counter.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
@ -9161,6 +9202,9 @@ FORCE_INLINE uint64_t _rdtsc(void)
#endif #endif
return val; return val;
#elif defined(_M_ARM)
uint32_t val = _MoveFromCoprocessor(15,0, 9,13,0);
return ((uint64_t)val) << 6;
#else #else
uint32_t pmccntr, pmuseren, pmcntenset; uint32_t pmccntr, pmuseren, pmcntenset;
// Read the user mode Performance Monitoring Unit (PMU) // Read the user mode Performance Monitoring Unit (PMU)

4
engine/cmodel.cpp

@ -862,7 +862,7 @@ BOX TRACING
// Custom SIMD implementation for box brushes // Custom SIMD implementation for box brushes
const fltx4 Four_DistEpsilons={DIST_EPSILON,DIST_EPSILON,DIST_EPSILON,DIST_EPSILON}; const fltx4 Four_DistEpsilons=FLTX4(DIST_EPSILON,DIST_EPSILON,DIST_EPSILON,DIST_EPSILON);
const int32 ALIGN16 g_CubeFaceIndex0[4] ALIGN16_POST = {0,1,2,-1}; const int32 ALIGN16 g_CubeFaceIndex0[4] ALIGN16_POST = {0,1,2,-1};
const int32 ALIGN16 g_CubeFaceIndex1[4] ALIGN16_POST = {3,4,5,-1}; const int32 ALIGN16 g_CubeFaceIndex1[4] ALIGN16_POST = {3,4,5,-1};
bool IntersectRayWithBoxBrush( TraceInfo_t *pTraceInfo, const cbrush_t *pBrush, cboxbrush_t *pBox ) bool IntersectRayWithBoxBrush( TraceInfo_t *pTraceInfo, const cbrush_t *pBrush, cboxbrush_t *pBox )
@ -1572,7 +1572,7 @@ void FASTCALL CM_TraceToLeaf( TraceInfo_t * RESTRICT pTraceInfo, int ndxLeaf, fl
fltx4 traceStart = LoadUnaligned3SIMD(pTraceInfo->m_start.Base()); fltx4 traceStart = LoadUnaligned3SIMD(pTraceInfo->m_start.Base());
fltx4 traceDelta = LoadUnaligned3SIMD(pTraceInfo->m_delta.Base()); fltx4 traceDelta = LoadUnaligned3SIMD(pTraceInfo->m_delta.Base());
fltx4 traceInvDelta = LoadUnaligned3SIMD(pTraceInfo->m_invDelta.Base()); fltx4 traceInvDelta = LoadUnaligned3SIMD(pTraceInfo->m_invDelta.Base());
static const fltx4 vecEpsilon = {DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON}; static const fltx4 vecEpsilon = FLTX4(DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON);
// only used in !IS_POINT version: // only used in !IS_POINT version:
fltx4 extents; fltx4 extents;
if (!IS_POINT) if (!IS_POINT)

2
engine/l_studio.cpp

@ -40,7 +40,7 @@
#include "materialsystem/materialsystem_config.h" #include "materialsystem/materialsystem_config.h"
#include "materialsystem/itexture.h" #include "materialsystem/itexture.h"
#include "IHammer.h" #include "IHammer.h"
#if defined( _WIN32 ) && !defined( _X360 ) #if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
#include <xmmintrin.h> #include <xmmintrin.h>
#endif #endif
#include "staticpropmgr.h" #include "staticpropmgr.h"

2
engine/sys_engine.cpp

@ -104,7 +104,7 @@ extern ConVar host_timer_spin_ms;
extern float host_nexttick; extern float host_nexttick;
extern IVEngineClient *engineClient; extern IVEngineClient *engineClient;
#ifdef WIN32 #if defined(_WIN32) && !defined(_M_ARM)
static void cpu_frequency_monitoring_callback( IConVar *var, const char *pOldValue, float flOldValue ) static void cpu_frequency_monitoring_callback( IConVar *var, const char *pOldValue, float flOldValue )
{ {
// Set the specified interval for CPU frequency monitoring // Set the specified interval for CPU frequency monitoring

4
game/client/detailobjectsystem.cpp

@ -2122,8 +2122,8 @@ int CDetailObjectSystem::SortSpritesBackToFront( int nLeaf, const Vector &viewOr
#else #else
#define MANTISSA_LSB_OFFSET 0 #define MANTISSA_LSB_OFFSET 0
#endif #endif
static fltx4 Four_MagicNumbers={ MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER }; static fltx4 Four_MagicNumbers=FLTX4( MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER );
static fltx4 Four_255s={ 255.0, 255.0, 255.0, 255.0 }; static fltx4 Four_255s=FLTX4( 255.0, 255.0, 255.0, 255.0 );
static ALIGN16 int32 And255Mask[4] ALIGN16_POST = {0xff,0xff,0xff,0xff}; static ALIGN16 int32 And255Mask[4] ALIGN16_POST = {0xff,0xff,0xff,0xff};
#define PIXMASK ( * ( reinterpret_cast< fltx4 *>( &And255Mask ) ) ) #define PIXMASK ( * ( reinterpret_cast< fltx4 *>( &And255Mask ) ) )

4
inputsystem/inputsystem.cpp

@ -167,8 +167,10 @@ InitReturnVal_t CInputSystem::Init()
joy_xcontroller_found.SetValue( 0 ); joy_xcontroller_found.SetValue( 0 );
#ifdef USE_SDL
if( !m_bConsoleTextMode ) if( !m_bConsoleTextMode )
InitializeTouch(); InitializeTouch();
#endif
if ( IsPC() && !m_bConsoleTextMode ) if ( IsPC() && !m_bConsoleTextMode )
{ {
@ -975,7 +977,9 @@ void CInputSystem::SetPrimaryUserId( int userId )
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
void CInputSystem::SetRumble( float fLeftMotor, float fRightMotor, int userId ) void CInputSystem::SetRumble( float fLeftMotor, float fRightMotor, int userId )
{ {
#ifdef USE_SDL
SetXDeviceRumble( fLeftMotor, fRightMotor, userId ); SetXDeviceRumble( fLeftMotor, fRightMotor, userId );
#endif
} }

9
inputsystem/inputsystem.h

@ -145,10 +145,16 @@ public:
struct JoystickInfo_t struct JoystickInfo_t
{ {
#ifdef USE_SDL
void *m_pDevice; // Really an SDL_GameController*, NULL if not present. void *m_pDevice; // Really an SDL_GameController*, NULL if not present.
void *m_pHaptic; // Really an SDL_Haptic* void *m_pHaptic; // Really an SDL_Haptic*
float m_fCurrentRumble; float m_fCurrentRumble;
bool m_bRumbleEnabled; bool m_bRumbleEnabled;
#elif defined(_WIN32)
JOYINFOEX m_JoyInfoEx;
#else
#error
#endif
int m_nButtonCount; int m_nButtonCount;
int m_nAxisFlags; int m_nAxisFlags;
int m_nDeviceId; int m_nDeviceId;
@ -271,6 +277,9 @@ public:
//Added called and set to true when binding input and set to false once bound //Added called and set to true when binding input and set to false once bound
void SetNovintPure( bool bPure ); void SetNovintPure( bool bPure );
#ifndef USE_SDL
unsigned int AxisValue( JoystickAxis_t axis, JOYINFOEX& ji );
#endif
#else #else
void SetNovintPure( bool bPure ) {} // to satify the IInput virtual interface void SetNovintPure( bool bPure ) {} // to satify the IInput virtual interface
#endif #endif

370
inputsystem/joystick_win32.cpp

@ -0,0 +1,370 @@
//========= Copyright Valve Corporation, All rights reserved. ============//
//
// Purpose: PC Joystick implementation for inputsystem.dll
//
//===========================================================================//
/* For force feedback testing. */
#include "inputsystem.h"
#include "tier1/convar.h"
#include "tier0/icommandline.h"
//-----------------------------------------------------------------------------
// Joystick helpers
//-----------------------------------------------------------------------------
#define JOY_POVFWDRIGHT ( ( JOY_POVFORWARD + JOY_POVRIGHT ) >> 1 ) // 4500
#define JOY_POVRIGHTBACK ( ( JOY_POVRIGHT + JOY_POVBACKWARD ) >> 1 ) // 13500
#define JOY_POVFBACKLEFT ( ( JOY_POVBACKWARD + JOY_POVLEFT ) >> 1 ) // 22500
#define JOY_POVLEFTFWD ( ( JOY_POVLEFT + JOY_POVFORWARD ) >> 1 ) // 31500
ConVar joy_wwhack1( "joy_wingmanwarrior_centerhack", "0", FCVAR_ARCHIVE, "Wingman warrior centering hack." );
ConVar joy_axisbutton_threshold( "joy_axisbutton_threshold", "0.3", FCVAR_ARCHIVE, "Analog axis range before a button press is registered." );
//-----------------------------------------------------------------------------
// Initialize all joysticks
//-----------------------------------------------------------------------------
void CInputSystem::InitializeJoysticks( void )
{
// assume no joystick
m_nJoystickCount = 0;
// abort startup if user requests no joystick
if ( CommandLine()->FindParm("-nojoy" ) )
return;
// verify joystick driver is present
int nMaxJoysticks = joyGetNumDevs();
if ( nMaxJoysticks > MAX_JOYSTICKS )
{
nMaxJoysticks = MAX_JOYSTICKS;
}
else if ( nMaxJoysticks <= 0 )
{
DevMsg( 1, "joystick not found -- driver not present\n");
return;
}
// cycle through the joysticks looking for valid ones
MMRESULT mmr;
for ( int i=0; i < nMaxJoysticks; i++ )
{
JOYINFOEX ji;
Q_memset( &ji, 0, sizeof( ji ) );
ji.dwSize = sizeof(ji);
ji.dwFlags = JOY_RETURNCENTERED;
mmr = joyGetPosEx( i, &ji );
if ( mmr != JOYERR_NOERROR )
continue;
// get the capabilities of the selected joystick
// abort startup if command fails
JOYCAPS jc;
Q_memset( &jc, 0, sizeof( jc ) );
mmr = joyGetDevCaps( i, &jc, sizeof( jc ) );
if ( mmr != JOYERR_NOERROR )
continue;
JoystickInfo_t &info = m_pJoystickInfo[m_nJoystickCount];
info.m_nDeviceId = i;
info.m_JoyInfoEx = ji;
info.m_nButtonCount = (int)jc.wNumButtons;
info.m_bHasPOVControl = ( jc.wCaps & JOYCAPS_HASPOV ) ? true : false;
info.m_bDiagonalPOVControlEnabled = false;
info.m_nFlags = JOY_RETURNCENTERED | JOY_RETURNBUTTONS | JOY_RETURNX | JOY_RETURNY;
info.m_nAxisFlags = 0;
if ( jc.wNumAxes >= 2 )
{
info.m_nAxisFlags |= 0x3;
}
if ( info.m_bHasPOVControl )
{
info.m_nFlags |= JOY_RETURNPOV;
}
if ( jc.wCaps & JOYCAPS_HASZ )
{
info.m_nFlags |= JOY_RETURNZ;
info.m_nAxisFlags |= 0x4;
}
if ( jc.wCaps & JOYCAPS_HASR )
{
info.m_nFlags |= JOY_RETURNR;
info.m_nAxisFlags |= 0x8;
}
if ( jc.wCaps & JOYCAPS_HASU )
{
info.m_nFlags |= JOY_RETURNU;
info.m_nAxisFlags |= 0x10;
}
if ( jc.wCaps & JOYCAPS_HASV )
{
info.m_nFlags |= JOY_RETURNV;
info.m_nAxisFlags |= 0x20;
}
info.m_nLastPolledButtons = 0;
info.m_nLastPolledAxisButtons = 0;
info.m_nLastPolledPOVState = 0;
memset( info.m_pLastPolledAxes, 0, sizeof(info.m_pLastPolledAxes) );
++m_nJoystickCount;
EnableJoystickInput( i, true );
}
}
void CInputSystem::ShutdownJoysticks()
{
}
//-----------------------------------------------------------------------------
// Process the event
//-----------------------------------------------------------------------------
void CInputSystem::JoystickButtonEvent( ButtonCode_t button, int sample )
{
// package the key
if ( sample )
{
PostButtonPressedEvent( IE_ButtonPressed, m_nLastSampleTick, button, button );
}
else
{
PostButtonReleasedEvent( IE_ButtonReleased, m_nLastSampleTick, button, button );
}
}
//-----------------------------------------------------------------------------
// Update the joystick button state
//-----------------------------------------------------------------------------
void CInputSystem::UpdateJoystickButtonState( int nJoystick )
{
JoystickInfo_t &info = m_pJoystickInfo[nJoystick];
JOYINFOEX& ji = info.m_JoyInfoEx;
// Standard joystick buttons
unsigned int buttons = ji.dwButtons ^ info.m_nLastPolledButtons;
if ( buttons )
{
for ( int j = 0 ; j < info.m_nButtonCount ; ++j )
{
int mask = buttons & ( 1 << j );
if ( !mask )
continue;
ButtonCode_t code = (ButtonCode_t)JOYSTICK_BUTTON( nJoystick, j );
if ( mask & ji.dwButtons )
{
// down event
JoystickButtonEvent( code, MAX_BUTTONSAMPLE );
}
else
{
// up event
JoystickButtonEvent( code, 0 );
}
}
info.m_nLastPolledButtons = (unsigned int)ji.dwButtons;
}
// Analog axis buttons
const float minValue = joy_axisbutton_threshold.GetFloat() * MAX_BUTTONSAMPLE;
for ( int j = 0 ; j < MAX_JOYSTICK_AXES; ++j )
{
if ( ( info.m_nAxisFlags & (1 << j) ) == 0 )
continue;
// Positive side of the axis
int mask = ( 1 << (j << 1) );
ButtonCode_t code = JOYSTICK_AXIS_BUTTON( nJoystick, (j << 1) );
float value = GetAnalogValue( JOYSTICK_AXIS( nJoystick, j ) );
if ( value > minValue && !(info.m_nLastPolledAxisButtons & mask) )
{
info.m_nLastPolledAxisButtons |= mask;
JoystickButtonEvent( code, MAX_BUTTONSAMPLE );
}
if ( value <= minValue && (info.m_nLastPolledAxisButtons & mask) )
{
info.m_nLastPolledAxisButtons &= ~mask;
JoystickButtonEvent( code, 0 );
}
// Negative side of the axis
mask <<= 1;
code = (ButtonCode_t)( code + 1 );
if ( value < -minValue && !(info.m_nLastPolledAxisButtons & mask) )
{
info.m_nLastPolledAxisButtons |= mask;
JoystickButtonEvent( code, MAX_BUTTONSAMPLE );
}
if ( value >= -minValue && (info.m_nLastPolledAxisButtons & mask) )
{
info.m_nLastPolledAxisButtons &= ~mask;
JoystickButtonEvent( code, 0 );
}
}
}
//-----------------------------------------------------------------------------
// Purpose: Get raw joystick sample along axis
//-----------------------------------------------------------------------------
unsigned int CInputSystem::AxisValue( JoystickAxis_t axis, JOYINFOEX& ji )
{
switch (axis)
{
case JOY_AXIS_X:
return (unsigned int)ji.dwXpos;
case JOY_AXIS_Y:
return (unsigned int)ji.dwYpos;
case JOY_AXIS_Z:
return (unsigned int)ji.dwZpos;
case JOY_AXIS_R:
return (unsigned int)ji.dwRpos;
case JOY_AXIS_U:
return (unsigned int)ji.dwUpos;
case JOY_AXIS_V:
return (unsigned int)ji.dwVpos;
}
// FIX: need to do some kind of error
return (unsigned int)ji.dwXpos;
}
//-----------------------------------------------------------------------------
// Update the joystick POV control
//-----------------------------------------------------------------------------
void CInputSystem::UpdateJoystickPOVControl( int nJoystick )
{
JoystickInfo_t &info = m_pJoystickInfo[nJoystick];
JOYINFOEX& ji = info.m_JoyInfoEx;
if ( !info.m_bHasPOVControl )
return;
// convert POV information into 4 bits of state information
// this avoids any potential problems related to moving from one
// direction to another without going through the center position
unsigned int povstate = 0;
if ( ji.dwPOV != JOY_POVCENTERED )
{
if (ji.dwPOV == JOY_POVFORWARD) // 0
{
povstate |= 0x01;
}
if (ji.dwPOV == JOY_POVRIGHT) // 9000
{
povstate |= 0x02;
}
if (ji.dwPOV == JOY_POVBACKWARD) // 18000
{
povstate |= 0x04;
}
if (ji.dwPOV == JOY_POVLEFT) // 27000
{
povstate |= 0x08;
}
// Deal with diagonals if user wants them
if ( info.m_bDiagonalPOVControlEnabled )
{
if (ji.dwPOV == JOY_POVFWDRIGHT) // 4500
{
povstate |= ( 0x01 | 0x02 );
}
if (ji.dwPOV == JOY_POVRIGHTBACK) // 13500
{
povstate |= ( 0x02 | 0x04 );
}
if (ji.dwPOV == JOY_POVFBACKLEFT) // 22500
{
povstate |= ( 0x04 | 0x08 );
}
if (ji.dwPOV == JOY_POVLEFTFWD) // 31500
{
povstate |= ( 0x08 | 0x01 );
}
}
}
// determine which bits have changed and key an auxillary event for each change
unsigned int buttons = povstate ^ info.m_nLastPolledPOVState;
if ( buttons )
{
for ( int i = 0; i < JOYSTICK_POV_BUTTON_COUNT; ++i )
{
unsigned int mask = buttons & ( 1 << i );
if ( !mask )
continue;
ButtonCode_t code = (ButtonCode_t)JOYSTICK_POV_BUTTON( nJoystick, i );
if ( mask & povstate )
{
// Keydown on POV buttons
JoystickButtonEvent( code, MAX_BUTTONSAMPLE );
}
else
{
// KeyUp on POV buttons
JoystickButtonEvent( code, 0 );
}
}
// Latch old values
info.m_nLastPolledPOVState = povstate;
}
}
//-----------------------------------------------------------------------------
// Purpose: Sample the joystick
//-----------------------------------------------------------------------------
void CInputSystem::PollJoystick( void )
{
if ( !m_JoysticksEnabled.IsAnyFlagSet() )
return;
InputState_t &state = m_InputState[ m_bIsPolling ];
for ( int i = 0; i < m_nJoystickCount; ++i )
{
if ( !m_JoysticksEnabled.IsFlagSet( 1 << i ) )
continue;
JoystickInfo_t &info = m_pJoystickInfo[i];
JOYINFOEX& ji = info.m_JoyInfoEx;
Q_memset( &ji, 0, sizeof( ji ) );
ji.dwSize = sizeof( ji );
ji.dwFlags = (DWORD)info.m_nFlags;
if ( joyGetPosEx( info.m_nDeviceId, &ji ) != JOYERR_NOERROR )
continue;
// This hack fixes a bug in the Logitech WingMan Warrior DirectInput Driver
// rather than having 32768 be the zero point, they have the zero point at 32668
// go figure -- anyway, now we get the full resolution out of the device
if ( joy_wwhack1.GetBool() )
{
ji.dwUpos += 100;
}
// Poll joystick axes
for ( int j = 0; j < MAX_JOYSTICK_AXES; ++j )
{
if ( ( info.m_nAxisFlags & ( 1 << j ) ) == 0 )
continue;
AnalogCode_t code = JOYSTICK_AXIS( i, j );
int nValue = AxisValue( (JoystickAxis_t)j, ji ) - MAX_BUTTONSAMPLE;
state.m_pAnalogDelta[ code ] = nValue - state.m_pAnalogValue[ code ];
state.m_pAnalogValue[ code ] = nValue;
if ( state.m_pAnalogDelta[ code ] != 0 )
{
PostEvent( IE_AnalogValueChanged, m_nLastSampleTick, code, state.m_pAnalogValue[ code ], state.m_pAnalogDelta[ code ] );
}
}
UpdateJoystickButtonState( i );
UpdateJoystickPOVControl( i );
}
}

8
inputsystem/wscript

@ -18,8 +18,6 @@ def configure(conf):
def build(bld): def build(bld):
source = [ source = [
'inputsystem.cpp', 'inputsystem.cpp',
'joystick_sdl.cpp',
'touch_sdl.cpp',
'key_translation.cpp', 'key_translation.cpp',
'steamcontroller.cpp', 'steamcontroller.cpp',
'../public/tier0/memoverride.cpp' '../public/tier0/memoverride.cpp'
@ -41,6 +39,12 @@ def build(bld):
libs = ['tier0','tier1','tier2','vstdlib','SDL2','steam_api'] libs = ['tier0','tier1','tier2','vstdlib','SDL2','steam_api']
if bld.options.SDL:
source += ['joystick_sdl.cpp', 'touch_sdl.cpp']
elif bld.env.DEST_OS == 'win32':
source += ['joystick_win32.cpp']
libs += ['WINMM']
if bld.env.DEST_OS == 'win32': if bld.env.DEST_OS == 'win32':
libs += ['USER32'] libs += ['USER32']

2
materialsystem/colorspace.h

@ -287,7 +287,7 @@ namespace ColorSpace
{ {
// preload 3.0f onto the returns so that we don't need to multiply the bumpAverage by it // preload 3.0f onto the returns so that we don't need to multiply the bumpAverage by it
// straight away (eg, reschedule this dependent op) // straight away (eg, reschedule this dependent op)
static const fltx4 vThree = { 3.0f, 3.0f, 3.0f, 0.0f }; static const fltx4 vThree = FLTX4( 3.0f, 3.0f, 3.0f, 0.0f );
fltx4 retValBump1 = MulSIMD( vThree, linearBumpColor1); fltx4 retValBump1 = MulSIMD( vThree, linearBumpColor1);
fltx4 retValBump2 = MulSIMD( vThree, linearBumpColor2); fltx4 retValBump2 = MulSIMD( vThree, linearBumpColor2);
fltx4 retValBump3 = MulSIMD( vThree, linearBumpColor3); fltx4 retValBump3 = MulSIMD( vThree, linearBumpColor3);

2
mathlib/3dnow.cpp

@ -16,7 +16,7 @@
// memdbgon must be the last include file in a .cpp file!!! // memdbgon must be the last include file in a .cpp file!!!
#include "tier0/memdbgon.h" #include "tier0/memdbgon.h"
#if !defined(COMPILER_MSVC64) && !defined(LINUX) && !defined(COMPILER_CLANG) #if defined(_M_IX86) && !defined(LINUX) && !defined(COMPILER_CLANG)
// Implement for 64-bit Windows if needed. // Implement for 64-bit Windows if needed.
// Clang hits "fatal error: error in backend:" and other errors when trying // Clang hits "fatal error: error in backend:" and other errors when trying
// to compile the inline assembly below. 3DNow support is highly unlikely to // to compile the inline assembly below. 3DNow support is highly unlikely to

6
mathlib/mathlib_base.cpp

@ -3258,7 +3258,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
// SSE Generally performs better than 3DNow when present, so this is placed // SSE Generally performs better than 3DNow when present, so this is placed
// first to allow SSE to override these settings. // first to allow SSE to override these settings.
#if !defined( OSX ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(LINUX) && !defined(PLATFORM_BSD) #ifdef _M_IX86
if ( bAllow3DNow && pi.m_b3DNow ) if ( bAllow3DNow && pi.m_b3DNow )
{ {
s_b3DNowEnabled = true; s_b3DNowEnabled = true;
@ -3291,7 +3291,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
pfRSqrt = _SSE_RSqrtAccurate; pfRSqrt = _SSE_RSqrtAccurate;
pfRSqrtFast = _SSE_RSqrtFast; pfRSqrtFast = _SSE_RSqrtFast;
#endif #endif
#ifdef PLATFORM_WINDOWS_PC32 #ifdef _M_IX86
pfFastSinCos = _SSE_SinCos; pfFastSinCos = _SSE_SinCos;
pfFastCos = _SSE_cos; pfFastCos = _SSE_cos;
#endif #endif
@ -3304,7 +3304,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
if ( bAllowSSE2 && pi.m_bSSE2 ) if ( bAllowSSE2 && pi.m_bSSE2 )
{ {
s_bSSE2Enabled = true; s_bSSE2Enabled = true;
#ifdef PLATFORM_WINDOWS_PC32 #ifdef _M_IX86
pfFastSinCos = _SSE2_SinCos; pfFastSinCos = _SSE2_SinCos;
pfFastCos = _SSE2_cos; pfFastCos = _SSE2_cos;
#endif #endif

20
mathlib/sse.cpp

@ -91,7 +91,7 @@ float _SSE_Sqrt(float x)
{ {
Assert( s_bMathlibInitialized ); Assert( s_bMathlibInitialized );
float root = 0.f; float root = 0.f;
#ifdef _WIN32 #if defined(_WIN32) && !defined(_M_ARM)
_asm _asm
{ {
sqrtss xmm0, x sqrtss xmm0, x
@ -122,7 +122,7 @@ float _SSE_RSqrtAccurate(float x)
} }
#else #else
#ifdef POSIX #if POSIX || defined(_M_ARM)
const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value
const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value
#endif #endif
@ -131,7 +131,7 @@ const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value
float _SSE_RSqrtAccurate(float a) float _SSE_RSqrtAccurate(float a)
{ {
#ifdef _WIN32 #if defined(_WIN32) && !defined(_M_ARM)
float x; float x;
float half = 0.5f; float half = 0.5f;
float three = 3.f; float three = 3.f;
@ -153,7 +153,7 @@ float _SSE_RSqrtAccurate(float a)
} }
return x; return x;
#elif POSIX #elif POSIX || defined(_M_ARM)
__m128 xx = _mm_load_ss( &a ); __m128 xx = _mm_load_ss( &a );
__m128 xr = _mm_rsqrt_ss( xx ); __m128 xr = _mm_rsqrt_ss( xx );
__m128 xt; __m128 xt;
@ -764,7 +764,7 @@ float _SSE_cos( float x )
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
// SSE2 implementations of optimized routines: // SSE2 implementations of optimized routines:
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
#ifdef PLATFORM_WINDOWS_PC32 #if defined(PLATFORM_WINDOWS_PC32) && !defined(_M_ARM)
void _SSE2_SinCos(float x, float* s, float* c) // any x void _SSE2_SinCos(float x, float* s, float* c) // any x
{ {
#ifdef _WIN32 #ifdef _WIN32
@ -850,9 +850,7 @@ void _SSE2_SinCos(float x, float* s, float* c) // any x
#error "Not Implemented" #error "Not Implemented"
#endif #endif
} }
#endif // PLATFORM_WINDOWS_PC32
#ifdef PLATFORM_WINDOWS_PC32
float _SSE2_cos(float x) float _SSE2_cos(float x)
{ {
#ifdef _WIN32 #ifdef _WIN32
@ -970,9 +968,7 @@ void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
#error "Not Implemented" #error "Not Implemented"
#endif #endif
} }
#endif
#if 0
void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 ) void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
{ {
Assert( s_bMathlibInitialized ); Assert( s_bMathlibInitialized );
@ -1026,9 +1022,7 @@ void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
#error "Not Implemented" #error "Not Implemented"
#endif #endif
} }
#endif
#ifdef _WIN32
void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest ) void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
{ {
// FIXME: This don't work!! It will overwrite memory in the write to dest // FIXME: This don't work!! It will overwrite memory in the write to dest
@ -1057,7 +1051,6 @@ void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const floa
#endif #endif
} }
} }
#endif
#ifdef _WIN32 #ifdef _WIN32
#ifdef PFN_VECTORMA #ifdef PFN_VECTORMA
@ -1101,7 +1094,6 @@ float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
// NJS: (Nov 1 2002) -NOT- faster. may time a couple cycles faster in a single function like // NJS: (Nov 1 2002) -NOT- faster. may time a couple cycles faster in a single function like
// this, but when inlined, and instruction scheduled, the C version is faster. // this, but when inlined, and instruction scheduled, the C version is faster.
// Verified this via VTune // Verified this via VTune
/*
vec_t DotProduct (const vec_t *a, const vec_t *c) vec_t DotProduct (const vec_t *a, const vec_t *c)
{ {
vec_t temp; vec_t temp;
@ -1124,6 +1116,6 @@ vec_t DotProduct (const vec_t *a, const vec_t *c)
ret ret
} }
} }
*/ #endif
#endif // COMPILER_MSVC64 #endif // COMPILER_MSVC64

42
mathlib/sseconst.cpp

@ -7,35 +7,35 @@
#include "mathlib/ssemath.h" #include "mathlib/ssemath.h"
#include "mathlib/ssequaternion.h" #include "mathlib/ssequaternion.h"
const fltx4 Four_PointFives={0.5,0.5,0.5,0.5}; const fltx4 Four_PointFives=FLTX4(0.5,0.5,0.5,0.5);
#ifndef _X360 #ifndef _X360
const fltx4 Four_Zeros={0.0,0.0,0.0,0.0}; const fltx4 Four_Zeros=FLTX4(0.0,0.0,0.0,0.0);
const fltx4 Four_Ones={1.0,1.0,1.0,1.0}; const fltx4 Four_Ones=FLTX4(1.0,1.0,1.0,1.0);
#endif #endif
const fltx4 Four_Twos={2.0,2.0,2.0,2.0}; const fltx4 Four_Twos=FLTX4(2.0,2.0,2.0,2.0);
const fltx4 Four_Threes={3.0,3.0,3.0,3.0}; const fltx4 Four_Threes=FLTX4(3.0,3.0,3.0,3.0);
const fltx4 Four_Fours={4.0,4.0,4.0,4.0}; const fltx4 Four_Fours=FLTX4(4.0,4.0,4.0,4.0);
const fltx4 Four_Origin={0,0,0,1}; const fltx4 Four_Origin=FLTX4(0,0,0,1);
const fltx4 Four_NegativeOnes={-1,-1,-1,-1}; const fltx4 Four_NegativeOnes=FLTX4(-1,-1,-1,-1);
const fltx4 Four_2ToThe21s={ (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) }; const fltx4 Four_2ToThe21s=FLTX4( (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) );
const fltx4 Four_2ToThe22s={ (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) }; const fltx4 Four_2ToThe22s=FLTX4( (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) );
const fltx4 Four_2ToThe23s={ (float) (1<<23), (float) (1<<23), (float) (1<<23), (float)(1<<23) }; const fltx4 Four_2ToThe23s=FLTX4( (float) (1<<23), (float) (1<<23), (float) (1<<23), (float)(1<<23) );
const fltx4 Four_2ToThe24s={ (float) (1<<24), (float) (1<<24), (float) (1<<24), (float)(1<<24) }; const fltx4 Four_2ToThe24s=FLTX4( (float) (1<<24), (float) (1<<24), (float) (1<<24), (float)(1<<24) );
const fltx4 Four_Point225s={ .225, .225, .225, .225 }; const fltx4 Four_Point225s=FLTX4( .225, .225, .225, .225 );
const fltx4 Four_Epsilons={FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON}; const fltx4 Four_Epsilons=FLTX4(FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON);
const fltx4 Four_FLT_MAX={FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; const fltx4 Four_FLT_MAX=FLTX4(FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX);
const fltx4 Four_Negative_FLT_MAX={-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX}; const fltx4 Four_Negative_FLT_MAX=FLTX4(-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX);
const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. }; const fltx4 g_SIMD_0123 = FLTX4( 0., 1., 2., 3. );
const fltx4 g_QuatMultRowSign[4] = const fltx4 g_QuatMultRowSign[4] =
{ {
{ 1.0f, 1.0f, -1.0f, 1.0f }, FLTX4( 1.0f, 1.0f, -1.0f, 1.0f ),
{ -1.0f, 1.0f, 1.0f, 1.0f }, FLTX4( -1.0f, 1.0f, 1.0f, 1.0f ),
{ 1.0f, -1.0f, 1.0f, 1.0f }, FLTX4( 1.0f, -1.0f, 1.0f, 1.0f ),
{ -1.0f, -1.0f, -1.0f, 1.0f } FLTX4( -1.0f, -1.0f, -1.0f, 1.0f )
}; };
const uint32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff}; const uint32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};

2
mathlib/ssenoise.cpp

@ -20,7 +20,7 @@
#define MAGIC_NUMBER (1<<15) // gives 8 bits of fraction #define MAGIC_NUMBER (1<<15) // gives 8 bits of fraction
static fltx4 Four_MagicNumbers = { MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER }; static fltx4 Four_MagicNumbers = FLTX4( MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER );
static ALIGN16 int32 idx_mask[4]= {0xffff, 0xffff, 0xffff, 0xffff}; static ALIGN16 int32 idx_mask[4]= {0xffff, 0xffff, 0xffff, 0xffff};

23
public/materialsystem/imesh.h

@ -1220,7 +1220,7 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX7_t &vertex )
Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed
Assert( m_nCurrentVertex < m_nMaxVertexCount ); Assert( m_nCurrentVertex < m_nMaxVertexCount );
#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS ) #if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS ) && !defined(_M_ARM)
const void *pRead = &vertex; const void *pRead = &vertex;
void *pCurrPos = m_pCurrPosition; void *pCurrPos = m_pCurrPosition;
__asm __asm
@ -1236,7 +1236,7 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX7_t &vertex )
movntps [edi + 16], xmm1 movntps [edi + 16], xmm1
movntps [edi + 32], xmm2 movntps [edi + 32], xmm2
} }
#elif defined(GNUC) || defined(PLATFORM_WINDOWS_PC64) #elif defined(GNUC) || defined(_WIN32)
const char *pRead = (char *)&vertex; const char *pRead = (char *)&vertex;
char *pCurrPos = (char *)m_pCurrPosition; char *pCurrPos = (char *)m_pCurrPosition;
__m128 m1 = _mm_load_ps( (float *)pRead ); __m128 m1 = _mm_load_ps( (float *)pRead );
@ -1267,7 +1267,7 @@ inline void CVertexBuilder::Fast4VerticesSSE(
Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed
Assert( m_nCurrentVertex < m_nMaxVertexCount-3 ); Assert( m_nCurrentVertex < m_nMaxVertexCount-3 );
#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS ) #if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS ) && !defined(_M_ARM)
void *pCurrPos = m_pCurrPosition; void *pCurrPos = m_pCurrPosition;
__asm __asm
{ {
@ -1309,7 +1309,7 @@ inline void CVertexBuilder::Fast4VerticesSSE(
movntps [edi + 80+96], xmm5 movntps [edi + 80+96], xmm5
} }
#elif defined(__arm__) || defined(PLATFORM_WINDOWS_PC64) #elif defined(__arm__) || defined(_WIN32)
const void *pReadA = &vtx_a; const void *pReadA = &vtx_a;
const void *pReadB = &vtx_b; const void *pReadB = &vtx_b;
const void *pReadC = &vtx_c; const void *pReadC = &vtx_c;
@ -1430,7 +1430,7 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX8_t &vertex )
Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed
Assert( m_nCurrentVertex < m_nMaxVertexCount ); Assert( m_nCurrentVertex < m_nMaxVertexCount );
#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS ) #if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS ) && !defined(_M_ARM)
const void *pRead = &vertex; const void *pRead = &vertex;
void *pCurrPos = m_pCurrPosition; void *pCurrPos = m_pCurrPosition;
__asm __asm
@ -1448,21 +1448,10 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX8_t &vertex )
movntps [edi + 32], xmm2 movntps [edi + 32], xmm2
movntps [edi + 48], xmm3 movntps [edi + 48], xmm3
} }
#elif defined(GNUC) || defined(PLATFORM_WINDOWS_PC64) #elif defined(GNUC) || defined(_WIN32)
const void *pRead = &vertex; const void *pRead = &vertex;
void *pCurrPos = m_pCurrPosition; void *pCurrPos = m_pCurrPosition;
/* __asm__ __volatile__ (
"movaps (%0), %%xmm0\n"
"movaps 16(%0), %%xmm1\n"
"movaps 32(%0), %%xmm2\n"
"movaps 48(%0), %%xmm3\n"
"movntps %%xmm0, (%1)\n"
"movntps %%xmm1, 16(%1)\n"
"movntps %%xmm2, 32(%1)\n"
"movntps %%xmm3, 48(%1)\n"
:: "r" (pRead), "r" (pCurrPos) : "memory"); */
__m128 m1 = _mm_load_ps( (float *)pRead ); __m128 m1 = _mm_load_ps( (float *)pRead );
__m128 m2 = _mm_load_ps( (float *)((intp)pRead + 16) ); __m128 m2 = _mm_load_ps( (float *)((intp)pRead + 16) );
__m128 m3 = _mm_load_ps( (float *)((intp)pRead + 32) ); __m128 m3 = _mm_load_ps( (float *)((intp)pRead + 32) );

6
public/mathlib/mathlib.h

@ -405,6 +405,9 @@ void inline SinCos( float radians, float *sine, float *cosine )
{ {
#if defined( _X360 ) #if defined( _X360 )
XMScalarSinCos( sine, cosine, radians ); XMScalarSinCos( sine, cosine, radians );
#elif defined( PLATFORM_WINDOWS_PC64 ) || defined(_M_ARM)
*sine = sin( radians );
*cosine = cos( radians );
#elif defined( PLATFORM_WINDOWS_PC32 ) #elif defined( PLATFORM_WINDOWS_PC32 )
_asm _asm
{ {
@ -417,9 +420,6 @@ void inline SinCos( float radians, float *sine, float *cosine )
fstp DWORD PTR [edx] fstp DWORD PTR [edx]
fstp DWORD PTR [eax] fstp DWORD PTR [eax]
} }
#elif defined( PLATFORM_WINDOWS_PC64 )
*sine = sin( radians );
*cosine = cos( radians );
#elif defined( OSX ) #elif defined( OSX )
__sincosf(radians, sine, cosine); __sincosf(radians, sine, cosine);
#elif defined( POSIX ) #elif defined( POSIX )

3
public/mathlib/simdvectormatrix.h

@ -135,7 +135,8 @@ public:
Assert( m_pData ); Assert( m_pData );
static FourVectors value{Four_Zeros, Four_Zeros, Four_Zeros}; static FourVectors value{Four_Zeros, Four_Zeros, Four_Zeros};
memutils::set( m_pData, value, m_nHeight*m_nPaddedWidth ); for (size_t n = m_nHeight * m_nPaddedWidth; n; n--)
*(m_pData+n) = value;
} }
void RaiseToPower( float power ); void RaiseToPower( float power );

14
public/mathlib/ssemath.h

@ -63,6 +63,12 @@ typedef __m128 fltx4;
typedef __m128 i32x4; typedef __m128 i32x4;
typedef __m128 u32x4; typedef __m128 u32x4;
#ifdef _M_ARM
#define FLTX4(w, x, y, z) {(w) + (unsigned long long(x) << 32), (y) + (unsigned long long(z) << 32)}
#else
#define FLTX4(w, x, y, z) {w, x, y, z}
#endif
#endif #endif
// The FLTX4 type is a fltx4 used as a parameter to a function. // The FLTX4 type is a fltx4 used as a parameter to a function.
@ -1828,7 +1834,7 @@ FORCEINLINE fltx4 ReplicateX4( float flValue )
FORCEINLINE float SubFloat( const fltx4 & a, int idx ) FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{ {
// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
#ifndef POSIX #if defined(_WIN32) && !defined(_M_ARM)
return a.m128_f32[ idx ]; return a.m128_f32[ idx ];
#else #else
return (reinterpret_cast<float const *>(&a))[idx]; return (reinterpret_cast<float const *>(&a))[idx];
@ -1837,7 +1843,7 @@ FORCEINLINE float SubFloat( const fltx4 & a, int idx )
FORCEINLINE float & SubFloat( fltx4 & a, int idx ) FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{ {
#ifndef POSIX #if defined(_WIN32) && !defined(_M_ARM)
return a.m128_f32[ idx ]; return a.m128_f32[ idx ];
#else #else
return (reinterpret_cast<float *>(&a))[idx]; return (reinterpret_cast<float *>(&a))[idx];
@ -1851,7 +1857,7 @@ FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
FORCEINLINE uint32 SubInt( const fltx4 & a, int idx ) FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{ {
#ifndef POSIX #if defined(_WIN32) && !defined(_M_ARM)
return a.m128_u32[ idx ]; return a.m128_u32[ idx ];
#else #else
return (reinterpret_cast<uint32 const *>(&a))[idx]; return (reinterpret_cast<uint32 const *>(&a))[idx];
@ -1860,7 +1866,7 @@ FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
FORCEINLINE uint32 & SubInt( fltx4 & a, int idx ) FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{ {
#ifndef POSIX #if defined(_WIN32) && !defined(_M_ARM)
return a.m128_u32[ idx ]; return a.m128_u32[ idx ];
#else #else
return (reinterpret_cast<uint32 *>(&a))[idx]; return (reinterpret_cast<uint32 *>(&a))[idx];

2
public/mathlib/vmatrix.h

@ -43,7 +43,7 @@ struct cplane_t;
#define M_PI 3.14159265358979323846 // matches value in gcc v2 math.h #define M_PI 3.14159265358979323846 // matches value in gcc v2 math.h
#endif #endif
class alignas(16) VMatrix class VMatrix
{ {
public: public:

4
public/shaderapi/ishaderapi.h

@ -175,7 +175,9 @@ public:
virtual void TexMagFilter( ShaderTexFilterMode_t texFilterMode ) = 0; virtual void TexMagFilter( ShaderTexFilterMode_t texFilterMode ) = 0;
virtual void TexWrap( ShaderTexCoordComponent_t coord, ShaderTexWrapMode_t wrapMode ) = 0; virtual void TexWrap( ShaderTexCoordComponent_t coord, ShaderTexWrapMode_t wrapMode ) = 0;
#ifndef SHADERAPIDX10
virtual void CopyRenderTargetToTexture( ShaderAPITextureHandle_t textureHandle ) = 0; virtual void CopyRenderTargetToTexture( ShaderAPITextureHandle_t textureHandle ) = 0;
#endif
// Binds a particular material to render with // Binds a particular material to render with
virtual void Bind( IMaterial* pMaterial ) = 0; virtual void Bind( IMaterial* pMaterial ) = 0;
@ -612,6 +614,7 @@ public:
//extended clear buffers function with alpha independent from color //extended clear buffers function with alpha independent from color
virtual void ClearBuffersObeyStencilEx( bool bClearColor, bool bClearAlpha, bool bClearDepth ) = 0; virtual void ClearBuffersObeyStencilEx( bool bClearColor, bool bClearAlpha, bool bClearDepth ) = 0;
#ifndef SHADERAPIDX10
// Allows copying a render target to another texture by specifying them both. // Allows copying a render target to another texture by specifying them both.
virtual void CopyRenderTargetToScratchTexture( ShaderAPITextureHandle_t srcRt, ShaderAPITextureHandle_t dstTex, Rect_t *pSrcRect = NULL, Rect_t *pDstRect = NULL ) = 0; virtual void CopyRenderTargetToScratchTexture( ShaderAPITextureHandle_t srcRt, ShaderAPITextureHandle_t dstTex, Rect_t *pSrcRect = NULL, Rect_t *pDstRect = NULL ) = 0;
@ -627,6 +630,7 @@ public:
virtual void CopyTextureToTexture( ShaderAPITextureHandle_t srcTex, ShaderAPITextureHandle_t dstTex ) = 0; virtual void CopyTextureToTexture( ShaderAPITextureHandle_t srcTex, ShaderAPITextureHandle_t dstTex ) = 0;
#endif
}; };

2
public/tier0/commonmacros.h

@ -68,7 +68,7 @@ inline bool IsPowerOfTwo( T value )
// From crtdefs.h // From crtdefs.h
#if !defined(UNALIGNED) #if !defined(UNALIGNED)
#if defined(_M_IA64) || defined(_M_AMD64) #if defined(_M_AMD64) || defined(_M_ARM)
#define UNALIGNED __unaligned #define UNALIGNED __unaligned
#else #else
#define UNALIGNED #define UNALIGNED

34
public/tier0/platform.h

@ -852,7 +852,9 @@ static FORCEINLINE double fsel(double fComparand, double fValGE, double fLT)
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
//#define CHECK_FLOAT_EXCEPTIONS 1 //#define CHECK_FLOAT_EXCEPTIONS 1
#if !defined( _X360 ) #if defined (__arm__) || defined (__aarch64__)
inline void SetupFPUControlWord() {}
#elif !defined( _X360 )
#if defined( _MSC_VER ) #if defined( _MSC_VER )
#if defined( PLATFORM_WINDOWS_PC64 ) #if defined( PLATFORM_WINDOWS_PC64 )
@ -898,8 +900,6 @@ static FORCEINLINE double fsel(double fComparand, double fValGE, double fLT)
#endif #endif
#endif #endif
#elif defined (__arm__) || defined (__aarch64__)
inline void SetupFPUControlWord() {}
#else #else
inline void SetupFPUControlWord() inline void SetupFPUControlWord()
{ {
@ -1025,7 +1025,7 @@ inline T QWordSwapC( T dw )
return output; return output;
} }
#elif defined( _MSC_VER ) && !defined( PLATFORM_WINDOWS_PC64 ) #elif defined( _MSC_VER ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(_M_ARM)
#define WordSwap WordSwapAsm #define WordSwap WordSwapAsm
#define DWordSwap DWordSwapAsm #define DWordSwap DWordSwapAsm
@ -1229,8 +1229,18 @@ PLATFORM_INTERFACE time_t Plat_timegm( struct tm *timeptr );
PLATFORM_INTERFACE struct tm * Plat_localtime( const time_t *timep, struct tm *result ); PLATFORM_INTERFACE struct tm * Plat_localtime( const time_t *timep, struct tm *result );
#if defined( _WIN32 ) && defined( _MSC_VER ) && ( _MSC_VER >= 1400 ) #if defined( _WIN32 ) && defined( _MSC_VER ) && ( _MSC_VER >= 1400 )
#ifdef _M_X64
extern "C" unsigned __int64 __rdtsc(); extern "C" unsigned __int64 __rdtsc();
#pragma intrinsic(__rdtsc) #pragma intrinsic(__rdtsc)
#else
#include <intrin.h>
#define MSVC_ARM_SYSREG(op0, op1, crn, crm, op2) \
( ((op0 & 1) << 14) | \
((op1 & 7) << 11) | \
((crn & 15) << 7) | \
((crm & 15) << 3) | \
((op2 & 7) << 0) )
#endif
#endif #endif
inline uint64 Plat_Rdtsc() inline uint64 Plat_Rdtsc()
@ -1241,15 +1251,15 @@ inline uint64 Plat_Rdtsc()
return t.tv_sec * 1000000000ULL + t.tv_nsec; return t.tv_sec * 1000000000ULL + t.tv_nsec;
#elif defined( _X360 ) #elif defined( _X360 )
return ( uint64 )__mftb32(); return ( uint64 )__mftb32();
#elif defined( _WIN64 ) #elif defined( _M_IX86 )
_asm rdtsc
#elif defined( _M_ARM )
uint32 val = _MoveFromCoprocessor(15,0, 9,13,0);
return ((uint64)val) << 6;
#elif defined( _M_ARM64 ) || defined( _M_ARM64EC )
return _ReadStatusReg(MSVC_ARM_SYSREG(3,3, 9,12,5));
#elif defined( COMPILER_MSVC )
return ( uint64 )__rdtsc(); return ( uint64 )__rdtsc();
#elif defined( _WIN32 )
#if defined( _MSC_VER ) && ( _MSC_VER >= 1400 )
return ( uint64 )__rdtsc();
#else
__asm rdtsc;
__asm ret;
#endif
#elif defined( __i386__ ) #elif defined( __i386__ )
uint64 val; uint64 val;
__asm__ __volatile__ ( "rdtsc" : "=A" (val) ); __asm__ __volatile__ ( "rdtsc" : "=A" (val) );

4
public/tier0/threadtools.h

@ -241,6 +241,8 @@ inline void ThreadPause()
_mm_pause(); _mm_pause();
#elif defined( COMPILER_MSVC32 ) #elif defined( COMPILER_MSVC32 )
__asm pause; __asm pause;
#elif defined(_M_ARM)
__yield();
#elif defined( COMPILER_MSVCX360 ) #elif defined( COMPILER_MSVCX360 )
YieldProcessor(); YieldProcessor();
__asm { or r0,r0,r0 } __asm { or r0,r0,r0 }
@ -445,7 +447,7 @@ PLATFORM_INTERFACE bool ThreadInterlockedAssignIf64( volatile int64 *pDest, int6
PLATFORM_INTERFACE int64 ThreadInterlockedExchange64( int64 volatile *, int64 value ) NOINLINE; PLATFORM_INTERFACE int64 ThreadInterlockedExchange64( int64 volatile *, int64 value ) NOINLINE;
#ifdef COMPILER_MSVC32 #if COMPILER_MSVC32 || _M_ARM
PLATFORM_INTERFACE int64 ThreadInterlockedIncrement64( int64 volatile * ) NOINLINE; PLATFORM_INTERFACE int64 ThreadInterlockedIncrement64( int64 volatile * ) NOINLINE;
PLATFORM_INTERFACE int64 ThreadInterlockedDecrement64( int64 volatile * ) NOINLINE; PLATFORM_INTERFACE int64 ThreadInterlockedDecrement64( int64 volatile * ) NOINLINE;
PLATFORM_INTERFACE int64 ThreadInterlockedExchangeAdd64( int64 volatile *, int64 value ) NOINLINE; PLATFORM_INTERFACE int64 ThreadInterlockedExchangeAdd64( int64 volatile *, int64 value ) NOINLINE;

8
studiorender/r_studiodraw.cpp

@ -657,7 +657,7 @@ static matrix3x4_t *ComputeSkinMatrix( mstudioboneweight_t &boneweights, matrix3
static matrix3x4_t *ComputeSkinMatrixSSE( mstudioboneweight_t &boneweights, matrix3x4_t *pPoseToWorld, matrix3x4_t &result ) static matrix3x4_t *ComputeSkinMatrixSSE( mstudioboneweight_t &boneweights, matrix3x4_t *pPoseToWorld, matrix3x4_t &result )
{ {
// NOTE: pPoseToWorld, being cache aligned, doesn't need explicit initialization // NOTE: pPoseToWorld, being cache aligned, doesn't need explicit initialization
#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS ) #if defined( _WIN32 ) && !defined( _X360 ) && defined(_M_IX86)
switch( boneweights.numbones ) switch( boneweights.numbones )
{ {
default: default:
@ -866,11 +866,9 @@ static matrix3x4_t *ComputeSkinMatrixSSE( mstudioboneweight_t &boneweights, matr
return &result; return &result;
#endif #endif
} }
#elif POSIX || PLATFORM_WINDOWS_PC64 #elif POSIX || _WIN32
// #warning "ComputeSkinMatrixSSE C implementation only" // #warning "ComputeSkinMatrixSSE C implementation only"
return ComputeSkinMatrix( boneweights, pPoseToWorld, result ); return ComputeSkinMatrix( boneweights, pPoseToWorld, result );
#elif defined( _X360 )
return ComputeSkinMatrix( boneweights, pPoseToWorld, result );
#else #else
#error #error
#endif #endif
@ -909,7 +907,7 @@ inline void CStudioRender::R_ComputeLightAtPoint3( const Vector &pos, const Vect
// define SPECIAL_SSE_MESH_PROCESSOR to enable code which contains a special optimized SSE lighting loop, significantly // define SPECIAL_SSE_MESH_PROCESSOR to enable code which contains a special optimized SSE lighting loop, significantly
// improving software vertex processing performace. // improving software vertex processing performace.
#if defined( _WIN32 ) && !defined( _X360 ) #if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
#define SPECIAL_SSE_MESH_PROCESSOR #define SPECIAL_SSE_MESH_PROCESSOR
#endif #endif

2
studiorender/r_studiolight.cpp

@ -66,7 +66,7 @@ void R_LightAmbient_4D( const Vector& normal, Vector4D* pLightBoxColor, Vector &
VectorMA( lv, normal[2]*normal[2], normal[2] > 0.f ? pLightBoxColor[4].AsVector3D() : pLightBoxColor[5].AsVector3D(), lv ); VectorMA( lv, normal[2]*normal[2], normal[2] > 0.f ? pLightBoxColor[4].AsVector3D() : pLightBoxColor[5].AsVector3D(), lv );
} }
#if defined( _WIN32 ) && !defined( _X360 ) #if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
void R_LightAmbient_4D( const FourVectors& normal, Vector4D* pLightBoxColor, FourVectors &lv ) void R_LightAmbient_4D( const FourVectors& normal, Vector4D* pLightBoxColor, FourVectors &lv )
{ {
// VPROF( "R_LightAmbient" ); // VPROF( "R_LightAmbient" );

4
studiorender/r_studiolight.h

@ -13,7 +13,7 @@
#include "tier0/platform.h" #include "tier0/platform.h"
#if defined( _WIN32 ) && !defined( _X360 ) #if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
#include <xmmintrin.h> #include <xmmintrin.h>
#endif #endif
@ -40,7 +40,7 @@ float FASTCALL R_WorldLightDistanceFalloff( const LightDesc_t *wl, const Vector&
// Copies lighting state into a buffer, returns number of lights copied // Copies lighting state into a buffer, returns number of lights copied
int CopyLocalLightingState( int nMaxLights, LightDesc_t *pDest, int nLightCount, const LightDesc_t *pSrc ); int CopyLocalLightingState( int nMaxLights, LightDesc_t *pDest, int nLightCount, const LightDesc_t *pSrc );
#if defined( _WIN32 ) && !defined( _X360 ) #if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
// SSE optimized versions // SSE optimized versions
void R_LightAmbient_4D( const FourVectors& normal, Vector4D* pLightBoxColor, FourVectors &lv ); void R_LightAmbient_4D( const FourVectors& normal, Vector4D* pLightBoxColor, FourVectors &lv );
__m128 FASTCALL R_WorldLightDistanceFalloff( const LightDesc_t *wl, const FourVectors& delta ); __m128 FASTCALL R_WorldLightDistanceFalloff( const LightDesc_t *wl, const FourVectors& delta );

2
studiorender/studiorender.h

@ -22,7 +22,7 @@
#include "flexrenderdata.h" #include "flexrenderdata.h"
#include "mathlib/compressed_vector.h" #include "mathlib/compressed_vector.h"
#include "r_studiolight.h" #include "r_studiolight.h"
#if defined( _WIN32 ) && !defined( _X360 ) #if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
#include <xmmintrin.h> #include <xmmintrin.h>
#endif #endif
#include "tier0/dbg.h" #include "tier0/dbg.h"

2
tier0/PMELib.cpp

@ -6,7 +6,7 @@
// //
//===========================================================================// //===========================================================================//
#ifdef _WIN32 #if defined(_WIN32) && !defined(_M_ARM)
#include <windows.h> #include <windows.h>
#pragma warning( disable : 4530 ) // warning: exception handler -GX option #pragma warning( disable : 4530 ) // warning: exception handler -GX option

2
tier0/cpumonitoring.cpp

@ -23,7 +23,7 @@
#include "pch_tier0.h" #include "pch_tier0.h"
#include "tier0/cpumonitoring.h" #include "tier0/cpumonitoring.h"
#ifdef PLATFORM_WINDOWS_PC32 #ifdef _M_IX86
#include "tier0/threadtools.h" #include "tier0/threadtools.h"
#define NOMINMAX #define NOMINMAX
#undef min #undef min

12
tier0/threadtools.cpp

@ -1740,7 +1740,7 @@ bool ThreadInterlockedAssignIf( int32 volatile *pDest, int32 value, int32 comper
{ {
Assert( (size_t)pDest % 4 == 0 ); Assert( (size_t)pDest % 4 == 0 );
#if !(defined(_WIN64) || defined (_X360)) #if !(defined(_WIN64) || defined (_X360) || defined(_M_ARM))
__asm __asm
{ {
mov eax,comperand mov eax,comperand
@ -1773,7 +1773,7 @@ void *ThreadInterlockedCompareExchangePointer( void * volatile *pDest, void *val
bool ThreadInterlockedAssignPointerIf( void * volatile *pDest, void *value, void *comperand ) bool ThreadInterlockedAssignPointerIf( void * volatile *pDest, void *value, void *comperand )
{ {
Assert( (size_t)pDest % 4 == 0 ); Assert( (size_t)pDest % 4 == 0 );
#if !(defined(_WIN64) || defined (_X360)) #if !(defined(_WIN64) || defined (_X360) || defined(_M_ARM))
__asm __asm
{ {
mov eax,comperand mov eax,comperand
@ -1807,13 +1807,19 @@ int64 ThreadInterlockedCompareExchange64( int64 volatile *pDest, int64 value, in
lock CMPXCHG8B [esi]; lock CMPXCHG8B [esi];
} }
} }
#elif defined(_M_ARM)
int64 ThreadInterlockedCompareExchange64( int64 volatile *pDest, int64 value, int64 comperand )
{
Assert( (size_t)pDest % 8 == 0 );
return InterlockedCompareExchange64( pDest, value, comperand );
}
#endif #endif
bool ThreadInterlockedAssignIf64(volatile int64 *pDest, int64 value, int64 comperand ) bool ThreadInterlockedAssignIf64(volatile int64 *pDest, int64 value, int64 comperand )
{ {
Assert( (size_t)pDest % 8 == 0 ); Assert( (size_t)pDest % 8 == 0 );
#if defined(_X360) || defined(_WIN64) #if defined(_X360) || defined(_WIN64) || defined(_M_ARM)
return ( ThreadInterlockedCompareExchange64( pDest, value, comperand ) == comperand ); return ( ThreadInterlockedCompareExchange64( pDest, value, comperand ) == comperand );
#else #else
__asm __asm

7
tier0/wscript

@ -22,8 +22,8 @@ def build(bld):
'assert_dialog.cpp', 'assert_dialog.cpp',
'commandline.cpp', 'commandline.cpp',
'cpu.cpp', 'cpu.cpp',
'cpumonitoring.cpp',
'cpu_usage.cpp', 'cpu_usage.cpp',
'cpumonitoring.cpp',
'dbg.cpp', 'dbg.cpp',
'dynfunction.cpp', 'dynfunction.cpp',
'fasttimer.cpp', 'fasttimer.cpp',
@ -54,10 +54,13 @@ def build(bld):
'assert_dialog.rc', 'assert_dialog.rc',
#'etwprof.cpp', [$WINDOWS] #'etwprof.cpp', [$WINDOWS]
'platform.cpp', 'platform.cpp',
'pme.cpp',
'vcrmode.cpp', 'vcrmode.cpp',
'win32consoleio.cpp' 'win32consoleio.cpp'
] ]
if bld.env.DEST_CPU == 'arm':
source += ['pme_posix.cpp']
else:
source += ['pme.cpp']
if bld.env.DEST_CPU == 'amd64': if bld.env.DEST_CPU == 'amd64':
source += [ source += [
'InterlockedCompareExchange128.masm' 'InterlockedCompareExchange128.masm'

2
tier1/processor_detect.cpp

@ -6,7 +6,7 @@
// $NoKeywords: $ // $NoKeywords: $
//=============================================================================// //=============================================================================//
#if defined( _X360 ) || defined( WIN64 ) #if defined( _X360 ) || defined( WIN64 ) || defined(_M_ARM)
bool CheckMMXTechnology(void) { return false; } bool CheckMMXTechnology(void) { return false; }
bool CheckSSETechnology(void) { return false; } bool CheckSSETechnology(void) { return false; }

4
vphysics/trace.cpp

@ -453,7 +453,7 @@ private:
#ifdef WIN32 #ifdef WIN32
static const static const
#endif #endif
fltx4 g_IVPToHLDir = { 1.0f, -1.0f, 1.0f, 1.0f }; fltx4 g_IVPToHLDir = FLTX4( 1.0f, -1.0f, 1.0f, 1.0f );
//static const fltx4 g_IVPToHLPosition = { IVP2HL(1.0f), -IVP2HL(1.0f), IVP2HL(1.0f), IVP2HL(1.0f) }; //static const fltx4 g_IVPToHLPosition = { IVP2HL(1.0f), -IVP2HL(1.0f), IVP2HL(1.0f), IVP2HL(1.0f) };
@ -680,7 +680,7 @@ bool CTraceIVP::BuildLeafmapCache( const leafmap_t * RESTRICT pLeafmap )
#endif #endif
} }
static const fltx4 g_IndexBase = {0,1,2,3}; static const fltx4 g_IndexBase =FLTX4(0,1,2,3);
int CTraceIVP::SupportMapCached( const Vector &dir, Vector *pOut ) const int CTraceIVP::SupportMapCached( const Vector &dir, Vector *pOut ) const
{ {
VPROF("SupportMapCached"); VPROF("SupportMapCached");

8
vstdlib/coroutine.cpp

@ -218,7 +218,11 @@ extern "C" byte *GetStackPtr64();
#define GetStackPtr( pStackPtr) byte *pStackPtr = GetStackPtr64(); #define GetStackPtr( pStackPtr) byte *pStackPtr = GetStackPtr64();
#else #else
#ifdef WIN32 #ifdef WIN32
# ifdef _M_ARM
# define GetStackPtr( pStackPtr ) byte x; byte *pStackPtr = &x
# else
# define GetStackPtr( pStackPtr ) byte *pStackPtr; __asm mov pStackPtr, esp # define GetStackPtr( pStackPtr ) byte *pStackPtr; __asm mov pStackPtr, esp
# endif
#elif defined(GNUC) #elif defined(GNUC)
// Apple's version of gcc/g++ doesn't return the expected value using the intrinsic, so // Apple's version of gcc/g++ doesn't return the expected value using the intrinsic, so
// do it the old fashioned way - this will also use asm on linux (since we don't compile // do it the old fashioned way - this will also use asm on linux (since we don't compile
@ -649,7 +653,7 @@ bool Internal_Coroutine_Continue( HCoroutine hCoroutine, const char *pchDebugMsg
bool bInCoroutineAlready = GCoroutineMgr().IsAnyCoroutineActive(); bool bInCoroutineAlready = GCoroutineMgr().IsAnyCoroutineActive();
#ifdef _WIN32 #ifdef _WIN32
#ifndef _WIN64 #if !defined( _WIN64 ) && !defined( _M_ARM )
// make sure nobody has a try/catch block and then yielded // make sure nobody has a try/catch block and then yielded
// because we hate that and we will crash // because we hate that and we will crash
uint32 topofexceptionchain; uint32 topofexceptionchain;
@ -897,7 +901,7 @@ void Coroutine_YieldToMain()
CoroutineDbgMsg( g_fmtstr.sprintf( "Coroutine_YieldToMain() %s#%x -> %s#%x\n", coroutine.m_pchName, coroutine.m_hCoroutine, coroutinePrev.m_pchName, coroutinePrev.m_hCoroutine ) ); CoroutineDbgMsg( g_fmtstr.sprintf( "Coroutine_YieldToMain() %s#%x -> %s#%x\n", coroutine.m_pchName, coroutine.m_hCoroutine, coroutinePrev.m_pchName, coroutinePrev.m_hCoroutine ) );
#ifdef _WIN32 #ifdef _WIN32
#ifndef _WIN64 #if !defined( _WIN64 ) && !defined( _M_ARM )
// make sure nobody has a try/catch block and then yielded // make sure nobody has a try/catch block and then yielded
// because we hate that and we will crash // because we hate that and we will crash
uint32 topofexceptionchain; uint32 topofexceptionchain;

22
wscript

@ -1,5 +1,6 @@
#! /usr/bin/env python #! /usr/bin/env python
# encoding: utf-8 # encoding: utf-8
# vim: noexpandtab
# nillerusr # nillerusr
from __future__ import print_function from __future__ import print_function
@ -222,6 +223,8 @@ def define_platform(conf):
'_ALLOW_MSC_VER_MISMATCH', '_ALLOW_MSC_VER_MISMATCH',
'NO_X360_XDK' 'NO_X360_XDK'
]) ])
if conf.env.DEST_CPU == 'arm':
conf.env.append_unique('DEFINES', ['__arm__=1'])
elif conf.env.DEST_OS == 'darwin': elif conf.env.DEST_OS == 'darwin':
conf.env.append_unique('DEFINES', [ conf.env.append_unique('DEFINES', [
'OSX=1', '_OSX=1', 'OSX=1', '_OSX=1',
@ -384,9 +387,11 @@ def check_deps(conf):
conf.check(lib='opus', uselib_store='OPUS') conf.check(lib='opus', uselib_store='OPUS')
if conf.env.DEST_OS == 'win32': if conf.env.DEST_OS == 'win32':
if conf.env.DEST_CPU == 'arm':
conf.check(lib='d3d9', uselib_store='D3D9')
conf.check(lib='d3dcompiler', uselib_store='D3DCOMPILER')
else:
conf.check(lib='libz', uselib_store='ZLIB', define_name='USE_ZLIB') conf.check(lib='libz', uselib_store='ZLIB', define_name='USE_ZLIB')
# conf.check(lib='nvtc', uselib_store='NVTC')
# conf.check(lib='ati_compress_mt_vc10', uselib_store='ATI_COMPRESS_MT_VC10')
conf.check(lib='SDL2', uselib_store='SDL2') conf.check(lib='SDL2', uselib_store='SDL2')
conf.check(lib='libjpeg', uselib_store='JPEG', define_name='HAVE_JPEG') conf.check(lib='libjpeg', uselib_store='JPEG', define_name='HAVE_JPEG')
conf.check(lib='libpng', uselib_store='PNG', define_name='HAVE_PNG') conf.check(lib='libpng', uselib_store='PNG', define_name='HAVE_PNG')
@ -405,10 +410,12 @@ def configure(conf):
# Force XP compability, all build targets should add # Force XP compability, all build targets should add
# subsystem=bld.env.MSVC_SUBSYSTEM # subsystem=bld.env.MSVC_SUBSYSTEM
# TODO: wrapper around bld.stlib, bld.shlib and so on? # TODO: wrapper around bld.stlib, bld.shlib and so on?
conf.env.MSVC_TARGETS = ['x86' if not conf.options.ALLOW64 else 'x64']
if conf.env.MSVC_TARGETS[0] == 'x86':
conf.env.MSVC_SUBSYSTEM = 'WINDOWS,5.01' conf.env.MSVC_SUBSYSTEM = 'WINDOWS,5.01'
conf.env.MSVC_TARGETS = ['x86'] # explicitly request x86 target for MSVC else:
if conf.options.ALLOW64: conf.env.MSVC_SUBSYSTEM = 'WINDOWS'
conf.env.MSVC_TARGETS = ['x64']
if sys.platform == 'win32': if sys.platform == 'win32':
conf.load('msvc_pdb_ext msdev msvs') conf.load('msvc_pdb_ext msdev msvs')
conf.load('subproject xcompile compiler_c compiler_cxx gitversion clang_compilation_database strip_on_install_v2 waf_unit_test enforce_pic') conf.load('subproject xcompile compiler_c compiler_cxx gitversion clang_compilation_database strip_on_install_v2 waf_unit_test enforce_pic')
@ -504,7 +511,6 @@ def configure(conf):
else: else:
cflags += [ cflags += [
'/I'+os.path.abspath('.')+'/thirdparty/SDL', '/I'+os.path.abspath('.')+'/thirdparty/SDL',
'/arch:SSE' if conf.env.DEST_CPU == 'x86' else '/arch:AVX',
'/GF', '/GF',
'/Gy', '/Gy',
'/fp:fast', '/fp:fast',
@ -514,6 +520,8 @@ def configure(conf):
'/TP', '/TP',
'/EHsc' '/EHsc'
] ]
if conf.env.DEST_CPU != 'arm':
cflags += ['/arch:SSE' if conf.env.DEST_CPU == 'x86' else '/arch:AVX']
if conf.options.BUILD_TYPE == 'debug': if conf.options.BUILD_TYPE == 'debug':
linkflags += [ linkflags += [
@ -593,7 +601,7 @@ def configure(conf):
def build(bld): def build(bld):
os.environ["CCACHE_DIR"] = os.path.abspath('.ccache/'+bld.env.COMPILER_CC+'/'+bld.env.DEST_OS+'/'+bld.env.DEST_CPU) os.environ["CCACHE_DIR"] = os.path.abspath('.ccache/'+bld.env.COMPILER_CC+'/'+bld.env.DEST_OS+'/'+bld.env.DEST_CPU)
if bld.env.DEST_OS in ['win32', 'android']: if bld.env.SDL and bld.env.DEST_OS in ['win32', 'android']:
sdl_name = 'SDL2.dll' if bld.env.DEST_OS == 'win32' else 'libSDL2.so' sdl_name = 'SDL2.dll' if bld.env.DEST_OS == 'win32' else 'libSDL2.so'
sdl_path = os.path.join('lib', bld.env.DEST_OS, bld.env.DEST_CPU, sdl_name) sdl_path = os.path.join('lib', bld.env.DEST_OS, bld.env.DEST_CPU, sdl_name)
bld.install_files(bld.env.LIBDIR, [sdl_path]) bld.install_files(bld.env.LIBDIR, [sdl_path])

Loading…
Cancel
Save