src/third_party/glm/test/core/core_func_integer_find_msb.cpp - cobalt - Git at Google

 // This has the programs for computing the number of leading zeros
 // in a word.
 // Max line length is 57, to fit in hacker.book.
 // Compile with g++, not gcc.
 #include <cstdio>
 #include <cstdlib>     // To define "exit", req'd by XLC.
 #include <ctime>

 #define LE 1            // 1 for little-endian, 0 for big-endian.

 int pop(unsigned x) {
    x = x - ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    x = x + (x << 8);
    x = x + (x << 16);
    return x >> 24;
 }

 int nlz1(unsigned x) {
    int n;

    if (x == 0) return(32);
    n = 0;
    if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
    if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
    if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
    if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
    if (x <= 0x7FFFFFFF) {n = n + 1;}
    return n;
 }

 int nlz1a(unsigned x) {
    int n;

 /* if (x == 0) return(32); */
    if ((int)x <= 0) return (~x >> 26) & 32;
    n = 1;
    if ((x >> 16) == 0) {n = n +16; x = x <<16;}
    if ((x >> 24) == 0) {n = n + 8; x = x << 8;}
    if ((x >> 28) == 0) {n = n + 4; x = x << 4;}
    if ((x >> 30) == 0) {n = n + 2; x = x << 2;}
    n = n - (x >> 31);
    return n;
 }
 // On basic Risc, 12 to 20 instructions.

 int nlz2(unsigned x) {
    unsigned y;
    int n;

    n = 32;
    y = x >>16;  if (y != 0) {n = n -16;  x = y;}
    y = x >> 8;  if (y != 0) {n = n - 8;  x = y;}
    y = x >> 4;  if (y != 0) {n = n - 4;  x = y;}
    y = x >> 2;  if (y != 0) {n = n - 2;  x = y;}
    y = x >> 1;  if (y != 0) return n - 2;
    return n - x;
 }

 // As above but coded as a loop for compactness:
 // 23 to 33 basic Risc instructions.
 int nlz2a(unsigned x) {
    unsigned y;
    int n, c;

    n = 32;
    c = 16;
    do {
       y = x >> c;  if (y != 0) {n = n - c;  x = y;}
       c = c >> 1;
    } while (c != 0);
    return n - x;
 }

 int nlz3(int x) {
    int y, n;

    n = 0;
    y = x;
 L: if (x < 0) return n;
    if (y == 0) return 32 - n;
    n = n + 1;
    x = x << 1;
    y = y >> 1;
    goto L;
 }

 int nlz4(unsigned x) {
    int y, m, n;

    y = -(x >> 16);      // If left half of x is 0,
    m = (y >> 16) & 16;  // set n = 16.  If left half
    n = 16 - m;          // is nonzero, set n = 0 and
    x = x >> m;          // shift x right 16.
                         // Now x is of the form 0000xxxx.
    y = x - 0x100;       // If positions 8-15 are 0,
    m = (y >> 16) & 8;   // add 8 to n and shift x left 8.
    n = n + m;
    x = x << m;

    y = x - 0x1000;      // If positions 12-15 are 0,
    m = (y >> 16) & 4;   // add 4 to n and shift x left 4.
    n = n + m;
    x = x << m;

    y = x - 0x4000;      // If positions 14-15 are 0,
    m = (y >> 16) & 2;   // add 2 to n and shift x left 2.
    n = n + m;
    x = x << m;

    y = x >> 14;         // Set y = 0, 1, 2, or 3.
    m = y & ~(y >> 1);   // Set m = 0, 1, 2, or 2 resp.
    return n + 2 - m;
 }

 int nlz5(unsigned x) {
    int pop(unsigned x);

    x = x | (x >> 1);
    x = x | (x >> 2);
    x = x | (x >> 4);
    x = x | (x >> 8);
    x = x | (x >>16);
    return pop(~x);
 }

 /* The four programs below are not valid ANSI C programs.  This is
 because they refer to the same storage locations as two different types.
 However, they work with xlc/AIX, gcc/AIX, and gcc/NT.  If you try to
 code them more compactly by declaring a variable xx to be "double," and
 then using

    n = 1054 - (*((unsigned *)&xx + LE) >> 20);

 then you are violating not only the rule above, but also the ANSI C
 rule that pointer arithmetic can be performed only on pointers to
 array elements.
    When coded with the above statement, the program fails with xlc,
 gcc/AIX, and gcc/NT, at some optimization levels.
    BTW, these programs use the "anonymous union" feature of C++, not
 available in C. */

 int nlz6(unsigned k) {
    union {
       unsigned asInt[2];
       double asDouble;
    };
    int n;

    asDouble = (double)k + 0.5;
    n = 1054 - (asInt[LE] >> 20);
    return n;
 }

 int nlz7(unsigned k) {
    union {
       unsigned asInt[2];
       double asDouble;
    };
    int n;

    asDouble = (double)k;
    n = 1054 - (asInt[LE] >> 20);
    n = (n & 31) + (n >> 9);
    return n;
 }

    /* In single precision, round-to-nearest mode, the basic method fails for:
    k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF,
                         07FFFFFC <= k <= 07FFFFFF,
                         0FFFFFF8 <= k <= 0FFFFFFF,
                                    ...
                         7FFFFFC0 <= k <= 7FFFFFFF.
                         FFFFFF80 <= k <= FFFFFFFF.
    For k = 0 it gives 158, and for the other values it is too low by 1. */

 int nlz8(unsigned k) {
    union {
       unsigned asInt;
       float asFloat;
    };
    int n;

    k = k & ~(k >> 1);           /* Fix problem with rounding. */
    asFloat = (float)k + 0.5f;
    n = 158 - (asInt >> 23);
    return n;
 }

 /* The example below shows how to make a macro for nlz.  It uses an
 extension to the C and C++ languages that is provided by the GNU C/C++
 compiler, namely, that of allowing statements and declarations in
 expressions (see "Using and Porting GNU CC", by Richard M. Stallman
 (1998).  The underscores are necessary to protect against the
 possibility that the macro argument will conflict with one of its local
 variables, e.g., NLZ(k). */

 int nlz9(unsigned k) {
    union {
       unsigned asInt;
       float asFloat;
    };
    int n;

    k = k & ~(k >> 1);           /* Fix problem with rounding. */
    asFloat = (float)k;
    n = 158 - (asInt >> 23);
    n = (n & 31) + (n >> 6);     /* Fix problem with k = 0. */
    return n;
 }

 /* Below are three nearly equivalent programs for computing the number
 of leading zeros in a word. This material is not in HD, but may be in a
 future edition.
    Immediately below is Robert Harley's algorithm, found at the
 comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert
 Juffa.
    Table entries marked "u" are unused. 14 ops including a multiply,
 plus an indexed load.
    The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all
 of form 2**k + 1). There are no multipliers of three terms of the form
 2**k +- 1 that work, with a table size of 64 or 128. There are some,
 with a table size of 64, if you precede the multiplication with x = x -
 (x >> 1), but that seems less elegant. There are also some if you use a
 table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would
 save two instructions in the form of this algorithm with the
 multiplication expanded into shifts and adds, but the table size is
 getting a bit large). */

 #define u 99
 int nlz10(unsigned x) {

    static char table[64] =
      {32,31, u,16, u,30, 3, u,  15, u, u, u,29,10, 2, u,
        u, u,12,14,21, u,19, u,   u,28, u,25, u, 9, 1, u,
       17, u, 4, u, u, u,11, u,  13,22,20, u,26, u, u,18,
        5, u, u,23, u,27, u, 6,   u,24, 7, u, 8, u, 0, u};

    x = x | (x >> 1);    // Propagate leftmost
    x = x | (x >> 2);    // 1-bit to the right.
    x = x | (x >> 4);
    x = x | (x >> 8);
    x = x | (x >>16);
    x = x*0x06EB14F9;    // Multiplier is 7*255**3.
    return table[x >> 26];
 }

 /* Harley's algorithm with multiply expanded.
 19 elementary ops plus an indexed load. */

 int nlz10a(unsigned x) {

    static char table[64] =
      {32,31, u,16, u,30, 3, u,  15, u, u, u,29,10, 2, u,
        u, u,12,14,21, u,19, u,   u,28, u,25, u, 9, 1, u,
       17, u, 4, u, u, u,11, u,  13,22,20, u,26, u, u,18,
        5, u, u,23, u,27, u, 6,   u,24, 7, u, 8, u, 0, u};

    x = x | (x >> 1);    // Propagate leftmost
    x = x | (x >> 2);    // 1-bit to the right.
    x = x | (x >> 4);
    x = x | (x >> 8);
    x = x | (x >> 16);
    x = (x << 3) - x;    // Multiply by 7.
    x = (x << 8) - x;    // Multiply by 255.
    x = (x << 8) - x;    // Again.
    x = (x << 8) - x;    // Again.
    return table[x >> 26];
 }

 /* Julius Goryavsky's version of Harley's algorithm.
 17 elementary ops plus an indexed load, if the machine
 has "and not." */

 int nlz10b(unsigned x) {

    static char table[64] =
      {32,20,19, u, u,18, u, 7,  10,17, u, u,14, u, 6, u,
        u, 9, u,16, u, u, 1,26,   u,13, u, u,24, 5, u, u,
        u,21, u, 8,11, u,15, u,   u, u, u, 2,27, 0,25, u,
       22, u,12, u, u, 3,28, u,  23, u, 4,29, u, u,30,31};

    x = x | (x >> 1);    // Propagate leftmost
    x = x | (x >> 2);    // 1-bit to the right.
    x = x | (x >> 4);
    x = x | (x >> 8);
    x = x & ~(x >> 16);
    x = x*0xFD7049FF;    // Activate this line or the following 3.
 // x = (x << 9) - x;    // Multiply by 511.
 // x = (x << 11) - x;   // Multiply by 2047.
 // x = (x << 14) - x;   // Multiply by 16383.
    return table[x >> 26];
 }

 int errors;
 void error(int x, int y) {
    errors = errors + 1;
    printf("Error for x = %08x, got %d\n", x, y);
 }

 int main()
 {
 #	ifdef NDEBUG

 	int i, n;
 	static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29,
 		7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23,
 		512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16,
 		65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11,
 		0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6,
 		0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3,
 		0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0,
 		0xFFFFFFFF,0};
 	std::size_t const Count = 1000;

 	n = sizeof(test)/4;

 	std::clock_t TimestampBeg = 0;
 	std::clock_t TimestampEnd = 0;

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz1: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz1a: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz2: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz2a: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz3: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz4: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz5: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz6: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz7: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz8: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz9: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz10: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz10a: %d clocks\n", TimestampEnd - TimestampBeg);

 	TimestampBeg = std::clock();
 	for (std::size_t k = 0; k < Count; ++k)
 	for (i = 0; i < n; i += 2) {
 		if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));}
 	TimestampEnd = std::clock();

 	printf("nlz10b: %d clocks\n", TimestampEnd - TimestampBeg);

 	if (errors == 0)
 		printf("Passed all %d cases.\n", sizeof(test)/8);

 #	endif//NDEBUG
 }
	// This has the programs for computing the number of leading zeros
	// in a word.
	// Max line length is 57, to fit in hacker.book.
	// Compile with g++, not gcc.
	#include <cstdio>
	#include <cstdlib> // To define "exit", req'd by XLC.
	#include <ctime>

	#define LE 1 // 1 for little-endian, 0 for big-endian.

	int pop(unsigned x) {
	x = x - ((x >> 1) & 0x55555555);
	x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
	x = (x + (x >> 4)) & 0x0F0F0F0F;
	x = x + (x << 8);
	x = x + (x << 16);
	return x >> 24;
	}

	int nlz1(unsigned x) {
	int n;

	if (x == 0) return(32);
	n = 0;
	if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
	if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
	if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
	if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
	if (x <= 0x7FFFFFFF) {n = n + 1;}
	return n;
	}

	int nlz1a(unsigned x) {
	int n;

	/* if (x == 0) return(32); */
	if ((int)x <= 0) return (~x >> 26) & 32;
	n = 1;
	if ((x >> 16) == 0) {n = n +16; x = x <<16;}
	if ((x >> 24) == 0) {n = n + 8; x = x << 8;}
	if ((x >> 28) == 0) {n = n + 4; x = x << 4;}
	if ((x >> 30) == 0) {n = n + 2; x = x << 2;}
	n = n - (x >> 31);
	return n;
	}
	// On basic Risc, 12 to 20 instructions.

	int nlz2(unsigned x) {
	unsigned y;
	int n;

	n = 32;
	y = x >>16; if (y != 0) {n = n -16; x = y;}
	y = x >> 8; if (y != 0) {n = n - 8; x = y;}
	y = x >> 4; if (y != 0) {n = n - 4; x = y;}
	y = x >> 2; if (y != 0) {n = n - 2; x = y;}
	y = x >> 1; if (y != 0) return n - 2;
	return n - x;
	}

	// As above but coded as a loop for compactness:
	// 23 to 33 basic Risc instructions.
	int nlz2a(unsigned x) {
	unsigned y;
	int n, c;

	n = 32;
	c = 16;
	do {
	y = x >> c; if (y != 0) {n = n - c; x = y;}
	c = c >> 1;
	} while (c != 0);
	return n - x;
	}

	int nlz3(int x) {
	int y, n;

	n = 0;
	y = x;
	L: if (x < 0) return n;
	if (y == 0) return 32 - n;
	n = n + 1;
	x = x << 1;
	y = y >> 1;
	goto L;
	}

	int nlz4(unsigned x) {
	int y, m, n;

	y = -(x >> 16); // If left half of x is 0,
	m = (y >> 16) & 16; // set n = 16. If left half
	n = 16 - m; // is nonzero, set n = 0 and
	x = x >> m; // shift x right 16.
	// Now x is of the form 0000xxxx.
	y = x - 0x100; // If positions 8-15 are 0,
	m = (y >> 16) & 8; // add 8 to n and shift x left 8.
	n = n + m;
	x = x << m;

	y = x - 0x1000; // If positions 12-15 are 0,
	m = (y >> 16) & 4; // add 4 to n and shift x left 4.
	n = n + m;
	x = x << m;

	y = x - 0x4000; // If positions 14-15 are 0,
	m = (y >> 16) & 2; // add 2 to n and shift x left 2.
	n = n + m;
	x = x << m;

	y = x >> 14; // Set y = 0, 1, 2, or 3.
	m = y & ~(y >> 1); // Set m = 0, 1, 2, or 2 resp.
	return n + 2 - m;
	}

	int nlz5(unsigned x) {
	int pop(unsigned x);

	x = x \| (x >> 1);
	x = x \| (x >> 2);
	x = x \| (x >> 4);
	x = x \| (x >> 8);
	x = x \| (x >>16);
	return pop(~x);
	}

	/* The four programs below are not valid ANSI C programs. This is
	because they refer to the same storage locations as two different types.
	However, they work with xlc/AIX, gcc/AIX, and gcc/NT. If you try to
	code them more compactly by declaring a variable xx to be "double," and
	then using

	n = 1054 - (((unsigned )&xx + LE) >> 20);

	then you are violating not only the rule above, but also the ANSI C
	rule that pointer arithmetic can be performed only on pointers to
	array elements.
	When coded with the above statement, the program fails with xlc,
	gcc/AIX, and gcc/NT, at some optimization levels.
	BTW, these programs use the "anonymous union" feature of C++, not
	available in C. */

	int nlz6(unsigned k) {
	union {
	unsigned asInt[2];
	double asDouble;
	};
	int n;

	asDouble = (double)k + 0.5;
	n = 1054 - (asInt[LE] >> 20);
	return n;
	}

	int nlz7(unsigned k) {
	union {
	unsigned asInt[2];
	double asDouble;
	};
	int n;

	asDouble = (double)k;
	n = 1054 - (asInt[LE] >> 20);
	n = (n & 31) + (n >> 9);
	return n;
	}

	/* In single precision, round-to-nearest mode, the basic method fails for:
	k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF,
	07FFFFFC <= k <= 07FFFFFF,
	0FFFFFF8 <= k <= 0FFFFFFF,
	...
	7FFFFFC0 <= k <= 7FFFFFFF.
	FFFFFF80 <= k <= FFFFFFFF.
	For k = 0 it gives 158, and for the other values it is too low by 1. */

	int nlz8(unsigned k) {
	union {
	unsigned asInt;
	float asFloat;
	};
	int n;

	k = k & ~(k >> 1); /* Fix problem with rounding. */
	asFloat = (float)k + 0.5f;
	n = 158 - (asInt >> 23);
	return n;
	}

	/* The example below shows how to make a macro for nlz. It uses an
	extension to the C and C++ languages that is provided by the GNU C/C++
	compiler, namely, that of allowing statements and declarations in
	expressions (see "Using and Porting GNU CC", by Richard M. Stallman
	(1998). The underscores are necessary to protect against the
	possibility that the macro argument will conflict with one of its local
	variables, e.g., NLZ(k). */

	int nlz9(unsigned k) {
	union {
	unsigned asInt;
	float asFloat;
	};
	int n;

	k = k & ~(k >> 1); /* Fix problem with rounding. */
	asFloat = (float)k;
	n = 158 - (asInt >> 23);
	n = (n & 31) + (n >> 6); /* Fix problem with k = 0. */
	return n;
	}

	/* Below are three nearly equivalent programs for computing the number
	of leading zeros in a word. This material is not in HD, but may be in a
	future edition.
	Immediately below is Robert Harley's algorithm, found at the
	comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert
	Juffa.
	Table entries marked "u" are unused. 14 ops including a multiply,
	plus an indexed load.
	The smallest multiplier that works is 0x045BCED1 = 1765129*513 (all
	of form 2**k + 1). There are no multipliers of three terms of the form
	2**k +- 1 that work, with a table size of 64 or 128. There are some,
	with a table size of 64, if you precede the multiplication with x = x -
	(x >> 1), but that seems less elegant. There are also some if you use a
	table size of 256, the smallest is 0x01033CBF = 652551025 (this would
	save two instructions in the form of this algorithm with the
	multiplication expanded into shifts and adds, but the table size is
	getting a bit large). */

	#define u 99
	int nlz10(unsigned x) {

	static char table[64] =
	{32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u,
	u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u,
	17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18,
	5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u};

	x = x \| (x >> 1); // Propagate leftmost
	x = x \| (x >> 2); // 1-bit to the right.
	x = x \| (x >> 4);
	x = x \| (x >> 8);
	x = x \| (x >>16);
	x = x0x06EB14F9; // Multiplier is 7255**3.
	return table[x >> 26];
	}

	/* Harley's algorithm with multiply expanded.
	19 elementary ops plus an indexed load. */

	int nlz10a(unsigned x) {

	static char table[64] =
	{32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u,
	u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u,
	17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18,
	5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u};

	x = x \| (x >> 1); // Propagate leftmost
	x = x \| (x >> 2); // 1-bit to the right.
	x = x \| (x >> 4);
	x = x \| (x >> 8);
	x = x \| (x >> 16);
	x = (x << 3) - x; // Multiply by 7.
	x = (x << 8) - x; // Multiply by 255.
	x = (x << 8) - x; // Again.
	x = (x << 8) - x; // Again.
	return table[x >> 26];
	}

	/* Julius Goryavsky's version of Harley's algorithm.
	17 elementary ops plus an indexed load, if the machine
	has "and not." */

	int nlz10b(unsigned x) {

	static char table[64] =
	{32,20,19, u, u,18, u, 7, 10,17, u, u,14, u, 6, u,
	u, 9, u,16, u, u, 1,26, u,13, u, u,24, 5, u, u,
	u,21, u, 8,11, u,15, u, u, u, u, 2,27, 0,25, u,
	22, u,12, u, u, 3,28, u, 23, u, 4,29, u, u,30,31};

	x = x \| (x >> 1); // Propagate leftmost
	x = x \| (x >> 2); // 1-bit to the right.
	x = x \| (x >> 4);
	x = x \| (x >> 8);
	x = x & ~(x >> 16);
	x = x*0xFD7049FF; // Activate this line or the following 3.
	// x = (x << 9) - x; // Multiply by 511.
	// x = (x << 11) - x; // Multiply by 2047.
	// x = (x << 14) - x; // Multiply by 16383.
	return table[x >> 26];
	}

	int errors;
	void error(int x, int y) {
	errors = errors + 1;
	printf("Error for x = %08x, got %d\n", x, y);
	}

	int main()
	{
	# ifdef NDEBUG

	int i, n;
	static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29,
	7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23,
	512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16,
	65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11,
	0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6,
	0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3,
	0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0,
	0xFFFFFFFF,0};
	std::size_t const Count = 1000;

	n = sizeof(test)/4;

	std::clock_t TimestampBeg = 0;
	std::clock_t TimestampEnd = 0;

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz1: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz1a: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz2: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz2a: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz3: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz4: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz5: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz6: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz7: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz8: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz9: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz10: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz10a: %d clocks\n", TimestampEnd - TimestampBeg);

	TimestampBeg = std::clock();
	for (std::size_t k = 0; k < Count; ++k)
	for (i = 0; i < n; i += 2) {
	if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));}
	TimestampEnd = std::clock();

	printf("nlz10b: %d clocks\n", TimestampEnd - TimestampBeg);

	if (errors == 0)
	printf("Passed all %d cases.\n", sizeof(test)/8);

	# endif//NDEBUG
	}