/*
	This program is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License version 2 
	as published by the Free Software Foundation.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA


	Copyright (C) 2006  Thierry Berger-Perrin <tbptbp@gmail.com>
*/

#include "specifics.h"
#include "sys_cpu.h"
#include "sys_log.h"

#include "math_utility.h"

#ifdef LINUX
	#include <cstdio>
	#include <sched.h>			// cpuset_t
	//#include <sys/time.h>
	#include <sys/resource.h>	// PRIO_*, setpriority etc...

	#include <unistd.h>			// pid_t etc...
	#include <linux/unistd.h>	// _syscall3


/*
 * provide the proper syscall information if our libc is not yet updated.
 * It is suggested you check your kernel to make sure these are right for
 * your architecture.
 */
#ifndef __NR_sched_setaffinity
#if defined(__i386__)
#define __NR_sched_setaffinity  241
#define __NR_sched_getaffinity  242
#endif
#endif

#define __NR_kludge_sched_setaffinity __NR_sched_setaffinity
#define __NR_kludge_sched_getaffinity __NR_sched_getaffinity
//_syscall3(int, sched_setaffinity, pid_t, pid, unsigned int, len, unsigned long *, user_mask_ptr)
//_syscall3(int, sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long *, user_mask_ptr)
//_syscall3(int, kludge_sched_setaffinity, pid_t, pid, unsigned int, len, unsigned long *, user_mask_ptr)
//_syscall3(int, kludge_sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long *, user_mask_ptr)
#endif

// Population count: how many of the 32 bits of x are set (result is 0..32).
static FINLINE int32_t bit_pop_count32(const uint32_t x) {
	#if defined __GCC__ || defined __ICC_GCC__
		// compiler builtin
		return __builtin_popcount(x);
	#elif defined __ICC__
		// compiler intrinsic
		return _popcnt32(x);
	#else
		// Portable fallback: every round clears the lowest set bit,
		// so the number of rounds equals the number of set bits.
		uint32_t remaining = x, ones = 0;
		while (remaining != 0) {
			remaining &= remaining - 1;
			++ones;
		}

		return ones;
	#endif
}
namespace sys {
    namespace cpu {
        // Number of cpus/cores; assigned by bootstrap() from get_num_cpu().
        int_t		num_cpu;
        // Clock frequency in Hz; assigned by bootstrap() from get_cpu_frequency().
        float64_t	cpu_frequency;


        // Reports the cpu clock in Hz without running any calibration loop:
        // the value is read from the registry on windows and from /proc/cpuinfo on linux.
        // When it cannot be obtained a fictitious 1 GHz is returned (and logged).
        static float64_t get_cpu_frequency() {
#if defined WIN32
            HKEY	cpu_key;
            DWORD	mhz;

            // "~MHz" value of the first processor's registry entry.
            int status = RegOpenKeyEx(HKEY_LOCAL_MACHINE, L"Hardware\\Description\\System\\CentralProcessor\\0", 0, KEY_READ, &cpu_key);
            if (status == ERROR_SUCCESS) {
                DWORD value_size = sizeof(DWORD);
                status = RegQueryValueEx(cpu_key, L"~MHz", 0, 0, (LPBYTE)(&mhz), &value_size);
                RegCloseKey(cpu_key);
            }

            if (status == ERROR_SUCCESS) {
                sys::log("CPU: Windoze said we're running at %d mhz.\n", mhz);
                return float64_t(int64_t(mhz)) * 1e6;
            }

            sys::log("CPU: failed to find the proper key, using fictitious freq of 1Ghz.\n");
            return 1. * 1e9;

#elif defined LINUX
            if (FILE *cpuinfo = fopen("/proc/cpuinfo", "r")) {
                char		text[256];
                float		mhz;
                bool_t		found = false;

                // stop at the first line that carries a "cpu MHz" entry
                while (!found && fgets(text, sizeof(text), cpuinfo))
                    found = sscanf(text, "cpu MHz : %f", &mhz) == 1;
                fclose(cpuinfo);

                if (found) {
                    sys::log("CPU: the penguin said we're running at %.3f mhz.\n", mhz);
                    return float64_t(mhz) * 1e6;
                }
            }

            // file missing or no usable line in it
            sys::log("CPU: couldn't parse cpu freq.\n");
            return 1. * 1e9;
#else
#error not implemented.
#endif
        }

        // borrowed from SDL. should check for cpuid availability first but i don't care
        //
        // Returns the feature flags that CPUID function 1 leaves in EDX.
        // CPUID function 0 is issued first; if the maximum function number it reports
        // in EAX is below 1, function 1 is skipped and 0 is returned.
        static uint32_t get_cpu_features() {
            uint32_t features = 0;
            //FIX: improper
            // NOTE(review): the GNU branch tests the bare `i386` macro; presumably it should also
            // accept `__i386__` for 32-bit builds in strict ISO modes -- confirm against the toolchain.
            // The GNU branch parks EBX in EDI around the two CPUID instructions and restores it afterwards.
#if defined(__GNUC__) && ( defined(i386) || defined(__x86_64__) )
            __asm__ (
                    "        movl    %%ebx,%%edi\n"
                    "        xorl    %%eax,%%eax         # Set up for CPUID instruction    \n"
                    "        cpuid                       # Get and save vendor ID          \n"
                    "        cmpl    $1,%%eax            # Make sure 1 is valid input for CPUID\n"
                    "        jl      1f                  # We dont have the CPUID instruction\n"
                    "        xorl    %%eax,%%eax                                           \n"
                    "        incl    %%eax                                                 \n"
                    "        cpuid                       # Get family/model/stepping/features\n"
                    "        movl    %%edx,%0                                              \n"
                    "1:                                                                    \n"
                    "        movl    %%edi,%%ebx\n"
                    : "=m" (features)
                    :
                    : "%eax", "%ebx", "%ecx", "%edx", "%edi"
                    );
#elif defined __MSVC__ || defined __ICC_MSVC__
            __asm {
                xor     eax, eax            ; Set up for CPUID instruction
                    cpuid                       ; Get and save vendor ID
                    cmp     eax, 1              ; Make sure 1 is valid input for CPUID
                    jl      done                ; We dont have the CPUID instruction
                    xor     eax, eax
                    inc     eax
                    cpuid                       ; Get family/model/stepping/features
                    mov     features, edx
                    done:
            }
#else
#error	not implemented.
#endif
            // still 0 here when function 1 was not available
            return features;
        }

        /*
           Programs the x87 control word: mantissa precision is set to 24 bits and
           all six floating point exceptions are masked (1 = handled internally,
           0 = fault).

           Exception mask bits:
           PM (bit 5) or Precision Mask
           UM (bit 4) or Underflow Mask
           OM (bit 3) or Overflow Mask
           ZM (bit 2) or Zero divide Mask
           DM (bit 1) or Denormalized operand Mask
           IM (bit 0) or Invalid operation Mask

           Mantissa Precision Control Bits (8 & 9)
           00 24 bits, 01 Reserved, 10 53 bits, 11 64 bits
           */
        static void set_fpu_handling() {
            // The control word is 16 bits wide; the 32-bit temporary starts at zero
            // so the half that fstcw does not write stays defined.
            uint32_t flags = 0;
#if defined(__MSVC__) || defined(__ICC_MSVC__)
            __asm fstcw [flags];
#else
            __asm("fstcw %0" : "=m" (flags));
#endif

            const uint32_t
                PRECISION_FIELD = 3u << 8,		// bits 8 & 9
                PRECISION_24    = 0u,
                EXCEPTION_IM = 1u, EXCEPTION_DM = 2u, EXCEPTION_ZM = 4u,
                EXCEPTION_OM = 8u, EXCEPTION_UM = 16u, EXCEPTION_PM = 32u,
                // All six mask bits (0..5). The previous expression, ~(1ul<<5)-1ul,
                // parsed as (~32)-1 and therefore only covered bits 0 and 5.
                EXCEPTION_FIELD = (1u << 6) - 1u;

            // precision: clear the field, then select 24 bits
            flags &= ~PRECISION_FIELD;
            flags |= PRECISION_24;

            // exceptions: clear the field, then mask every one of them
            flags &= ~EXCEPTION_FIELD;
            flags |= EXCEPTION_IM | EXCEPTION_DM | EXCEPTION_ZM;
            flags |= EXCEPTION_OM | EXCEPTION_UM | EXCEPTION_PM;

#if defined(__MSVC__) || defined(__ICC_MSVC__)
            __asm fldcw [flags];
#else
            __asm("fldcw %0" : : "m" (flags));
#endif
        }


        // Counts the cpus/cores of the machine.
        // Returns 1 (and logs) when the OS affinity query fails, so callers never
        // work from an uninitialised or made-up count.
        static int get_num_cpu() {
#if defined WIN32
            // GetProcessAffinityMask fills pointer-sized masks (DWORD_PTR); they are
            // zeroed so a failed call cannot leave garbage behind.
            DWORD_PTR proc_mask = 0, sys_mask = 0;
            if (!GetProcessAffinityMask(GetCurrentProcess(), &proc_mask, &sys_mask)) {
                sys::log("CPU: couldn't query the affinity mask, assuming 1 cpu.\n");
                return 1;
            }

            // bit_pop_count32 handles 32 bits at a time. The two 16-bit shifts keep the
            // upper-half extraction well defined when DWORD_PTR is only 32 bits wide.
            const int num =
                bit_pop_count32(uint32_t(sys_mask & 0xFFFFFFFFu)) +
                bit_pop_count32(uint32_t((sys_mask >> 16) >> 16));
            return num;
#elif defined LINUX
            /*
               If we're launched with, say, taskset 1 quadrille
               then the inherited affinity mask hides processors from us.
               So we first ask for every cpu up to max_cpu and then read back
               which ones were actually granted.

               note: the widened affinity is left in place afterwards.
            */
            cpu_set_t cpuset;

            CPU_ZERO(&cpuset);
            for (int i=0; i<sys::cpu::max_cpu; ++i) CPU_SET(i, &cpuset);

            // Best effort: if widening fails we still count whatever is currently allowed.
            sched_setaffinity(0, sizeof(cpuset), &cpuset);
            if (sched_getaffinity(0, sizeof(cpuset), &cpuset) != 0) {
                // cpuset would still hold the "everything" request, not a real answer.
                sys::log("CPU: couldn't query the affinity mask, assuming 1 cpu.\n");
                return 1;
            }

            int count = 0;
            // CPU_ISSET is only documented as zero/non-zero, so normalise before summing.
            for (int i=0; i<sys::cpu::max_cpu; ++i) count += CPU_ISSET(i, &cpuset) ? 1 : 0;

            return count;
#else
#error not implemented.
#endif
        }

        // Restricts the current process to a single cpu (cpu >= 0) or opens it up to
        // every detected cpu (cpu < 0).
        // Returns false when cpu is not a valid index (valid: 0 .. num_cpu-1) or when
        // the OS call fails. On linux nothing is bound and valid requests report success.
        bool_t bind_process(const int cpu) {
            // '>=' and not '>': num_cpu itself is one past the last valid index.
            if (cpu >= num_cpu) return false;

#if defined LINUX
            // Intentionally a no-op: the affinity call that used to live here went through
            // a raw-syscall kludge whose _syscall3 definitions are commented out at the
            // top of this file.
            // TODO: bind for real through sched_setaffinity/cpu_set_t.
            return true;
#else
            // Build the mask in the API's own pointer-sized type and never shift by
            // the full width of that type (or more).
            const int mask_bits = int(sizeof(DWORD_PTR) * 8);
            if (cpu >= mask_bits) return false;

            DWORD_PTR mask = 0;
            if (cpu >= 0)
                mask = DWORD_PTR(1) << cpu;
            else if (num_cpu >= mask_bits)
                mask = ~DWORD_PTR(0);
            else if (num_cpu > 0)
                mask = (DWORD_PTR(1) << num_cpu) - 1;
            // num_cpu <= 0 leaves an empty mask, which the API rejects.

            return SetProcessAffinityMask(GetCurrentProcess(), mask) != 0;
#endif
        }

		// Restricts the calling thread to a single cpu (cpu >= 0) or opens it up to
		// every detected cpu (cpu < 0).
		// Returns false when cpu is not a valid index (valid: 0 .. num_cpu-1) or when
		// the OS call fails. On linux this simply forwards to bind_process.
		bool_t bind_thread(const int cpu) {
			#if defined LINUX
				return bind_process(cpu);	// heh.
			#else
				// '>=' and not '>': num_cpu itself is one past the last valid index.
				// The second test keeps the shift below inside the mask type.
				const int mask_bits = int(sizeof(DWORD_PTR) * 8);
				if (cpu >= num_cpu || cpu >= mask_bits) return false;

				DWORD_PTR mask = 0;
				if (cpu >= 0)
					mask = DWORD_PTR(1) << cpu;
				else if (num_cpu >= mask_bits)
					mask = ~DWORD_PTR(0);
				else if (num_cpu > 0)
					mask = (DWORD_PTR(1) << num_cpu) - 1;
				// num_cpu <= 0 leaves an empty mask, which the API rejects.

				return SetThreadAffinityMask(GetCurrentThread(), mask) != 0;
			#endif
		}


		#if defined WINDOWS
			// Win32 priority constants, indexed by the clamped priority value (0..3)
			// in set_process_priority / set_thread_priority.
			static const int remap_priority_process[4] = {
				BELOW_NORMAL_PRIORITY_CLASS,
				NORMAL_PRIORITY_CLASS,
				ABOVE_NORMAL_PRIORITY_CLASS,
				HIGH_PRIORITY_CLASS
			};
			static const int remap_priority_thread[4] = {
				THREAD_PRIORITY_LOWEST,
				THREAD_PRIORITY_NORMAL,
				THREAD_PRIORITY_ABOVE_NORMAL,
				THREAD_PRIORITY_HIGHEST
			};
		#endif

		// Applies one of four priority levels to the current process.
		// prio is clamped to 0..3 and used as a table index: nice values
		// 10 / 0 / -10 / -19 on linux, remap_priority_process[] otherwise.
		// Returns true when the OS accepted the change.
		bool_t set_process_priority(const priority_t prio) {
			// clamp to a valid table index
			int level = (int)prio;
			if (level > 3) level = 3;
			else if (level < 0) level = 0;

			#if defined LINUX
				const int nice_for_level[4] = { 10, 0, -10, -19 };

				if (setpriority(PRIO_PROCESS, 0, nice_for_level[level]) != 0) {
					perror("sys::cpu::set_process_priority: setpriority failure");
					return false;
				}
				return true;
			#else
				return SetPriorityClass(GetCurrentProcess(), remap_priority_process[level]) != 0;
			#endif
		}

		// Applies one of four priority levels to the calling thread.
		// On linux this forwards to set_process_priority; otherwise prio is clamped
		// to 0..3 and looked up in remap_priority_thread[].
		// Returns true when the OS accepted the change.
		bool_t set_thread_priority(const priority_t prio) {
			#if defined LINUX
				return set_process_priority(prio);
			#else
				// clamp to a valid table index
				int level = (int)prio;
				if (level > 3) level = 3;
				else if (level < 0) level = 0;

				return SetThreadPriority(GetCurrentThread(), remap_priority_thread[level]) != 0;
			#endif
		}


		// One-time cpu setup, in this order: detect the cpu count, bind the process and
		// the calling thread, adjust priorities, read the clock frequency, check the
		// CPUID feature bits and finally program the x87 and SSE float modes.
		// Binding/priority failures are only logged; the sole failure return is a
		// missing SSE2 bit (reported before any float mode is touched).
		bool_t bootstrap() {
			num_cpu = get_num_cpu();
			sys::log("CPU: %d cpu/core detected.\n", num_cpu);

			// Both bind calls are always made; '&' merges the two results
			// without short-circuiting.
			const bool_t process_bound = bind_process(-1);
			const bool_t thread_bound = bind_thread(preferred_cpu);
			const bool_t bind_rc = process_bound & thread_bound;
			if (!bind_rc)
				sys::log("CPU: failed to bind to cpu?!\n");
			else
				sys::log("CPU: binded main thread to cpu #%d\n", preferred_cpu);

			// Same pattern for the priorities: both setters run, results are and-ed.
			const bool_t process_prio_set = set_process_priority(sys::cpu::process_priority);
			const bool_t thread_prio_set = set_thread_priority(sys::cpu::thread_priority_main);
			const bool_t prio_rc = process_prio_set & thread_prio_set;
			if (!prio_rc)
				sys::log("CPU: couldn't tweak priorities.\n");


			cpu_frequency = get_cpu_frequency();

			// feature bits, tested against the value get_cpu_features() hands back
			enum {
				FEAT_RDTSC = 0x00000010ul,
				FEAT_SSE   = 0x02000000ul,
				FEAT_SSE2  = 0x04000000ul
			};
			const int32_t feature_bits = get_cpu_features();
			const bool_t has_rdtsc = feature_bits & FEAT_RDTSC;
			const bool_t has_sse = feature_bits & FEAT_SSE;
			const bool_t has_sse2 = feature_bits & FEAT_SSE2;
			sys::log("CPU: rdtsc [%c] SSE [%c] SSE2 [%c]\n",
				has_rdtsc ? 'X' : ' ',
				has_sse ? 'X' : ' ',
				has_sse2 ? 'X' : ' ');
			if (!has_sse2) {
				sys::log("SSE2 required, sorry.\n");
				return false;
			}

			// x87 control word first...
			set_fpu_handling();
			// ...then the SSE side.
			#if 1
				// flush-to-zero and denormals-are-zero on, every SSE exception masked
				_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
				_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

				_MM_SET_EXCEPTION_MASK(
					_MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW	|
					_MM_MASK_INEXACT | _MM_MASK_INVALID	|
					_MM_MASK_DIV_ZERO);
			#else
				// NaN tracking mode: only the inexact exception stays masked.
				// Might blow where not due, say because we normalize a packet with a ray that will be ignored etc...
				// Still, it's useful.
				_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
				_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);

				_MM_SET_EXCEPTION_MASK(_MM_MASK_INEXACT);
			#endif

			return true;
		}
	}
}
