Re: [mpm@selenic.com: Re: buggy ia64_fls() ? (was Re: /dev/random problem on 2.6.12-rc1)]

From: David Mosberger <davidm_at_napali.hpl.hp.com>
Date: 2005-04-09 16:00:51
>>>>> On Fri, 8 Apr 2005 22:16:53 -0700, Grant Grundler <iod00d@hp.com> said:

  Grant> Results on my 1.5hz Madison are slightly different than what was
  Grant> previously posted:

  Grant> grundler@iota:/usr/src/userspace$ gcc-3.4 -O2 test_fls.c 
  Grant> grundler@iota:/usr/src/userspace$ ./a.out
  Grant> done with correctness test
  Grant>   overhead:     5.342 ns
  Grant>    generic:     8.013 ns
  Grant>     womack:     8.680 ns
  Grant>       arch:     8.681 ns
  Grant>   popcount:     7.345 ns
  Grant>   ia64_fls:     7.345 ns
  Grant> popcount64:     8.680 ns

Yes, the loop is very sensitive to small changes, so I'm not surprised
the timing changes.  I tried a different approach: summing up the
return values and printing the final value.  That's more portable and
almost guarantees that the loop won't get eliminated.  With that, I
get:

  overhead:     4.678 ns (sum=0)
   generic:     9.363 ns (sum=92ac3f77)
    womack:    10.045 ns (sum=8c1e2548)
  popcount:     8.718 ns (sum=99331f94)
      arch:    10.020 ns (sum=8c4a26b6)
       clz:    10.029 ns (sum=8c4d1e96)
  ia64_fls:     8.017 ns (sum=0xd319c7ee)
popcount64:    10.033 ns (sum=0xb5879d84)

whereas before it was:

  overhead:     4.677 ns
   generic:     8.687 ns
    womack:     9.358 ns
  popcount:     6.680 ns
      arch:     9.356 ns
       clz:     9.353 ns
  ia64_fls:     8.004 ns
popcount64:    10.004 ns

Fortunately, the changes don't affect the conclusions: ia64_fls() is
the right choice if you need a full 64-bit fls and popcount is best
for the Linux/portable fls().

(Note: I added a "clz" variant which builds on __builtin_clz(); it
       turns out this ends up generating the same code as the "arch"
       variant.)

	--david

-------------------
/*
 * Quick hack to verify ia64 fls() was (NOT) correct.
 * Use as framework for testing other arches.
 *
 * Copyright (C) 2005 Grant Grundler (grundler at parisc-linux.org)
 *
 *  Copies code from:
 *  Copyright (C) 1998-2003 Hewlett-Packard Co
 *		David Mosberger-Tang <davidm@hpl.hp.com>
 *
 * testfls.c may be redistributed under GPL 2.0.
 */
#include <stdlib.h>
#include <stdio.h>

#include <sys/time.h>

#ifdef __INTEL_COMPILER
# include <ia64intrin.h>
# define ia64_getf_exp(x)	__getf_exp(x)
# define popcount(x)		_m64_popcnt(x)
#else
# define ia64_getf_exp(x)                                       \
({                                                              \
        long ia64_intri_res;                                    \
                                                                \
        asm ("getf.exp %0=%1" : "=r"(ia64_intri_res) : "f"(x)); \
                                                                \
        ia64_intri_res;                                         \
})
# define ia64_popcnt(x)						\
({								\
	unsigned long ia64_intri_res;				\
	asm ("popcnt %0=%1" : "=r" (ia64_intri_res) : "r" (x));	\
								\
	ia64_intri_res;						\
})
# define popcount(x)		__builtin_popcountl(x)
#endif

static inline unsigned long
ia64_fls (unsigned long x)
{
        long double d = x;
        long exp;

        exp = ia64_getf_exp(d);
        return exp - 0xffff;
}

static inline int
arch_fls (int x)
{
	if (!x)
		return 0;
	return ia64_fls((unsigned int) x) + 1;
}

/*
 * fls: find last bit set.
 */
static __inline__ int generic_fls(int x)
{
	int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xffff0000u)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xff000000u)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xf0000000u)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xc0000000u)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000u)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static __inline__ int womack_fls(int x)
{
	unsigned int t = x;
	int r = 1;

	if (!t)
		return 0;
	if (t & 0xffff0000u) {
		t >>= 16;
		r += 16;
	}
	if (t & 0xff00) {
		t >>= 8;
		r += 8;
	}
	if (t & 0xf0u) {
		t >>= 4;
		r += 4;
	}
	/*
	 * This is based on an article by Paul Womack posted to
	 * comp.arch on May 15, 2000.
	 *
	 * Last 4 bits are handled by a lookup table.  Each
	 * table-entry occupies only 2 bits, so we can encode the
	 * table in 16*2 = 32 bits.  The table value is constructed
	 * like this:
	 *
	 * 0000 = don't care (not possible)
	 * 0001 = 1
	 * 0010 = 2
	 * 0011 = 2
	 * 0100 = 3
	 * 0101 = 3
	 * 0110 = 3
	 * 0111 = 3
	 * 1000 = 4
	 * 1001 = 4
	 * 1010 = 4
	 * 1011 = 4
	 * 1100 = 4
	 * 1101 = 4
	 * 1110 = 4
	 * 1111 = 4
	 *
	 * to make it fit in 2 bits, we subtract 1 before encoding, so
	 * in binary:
	 *
	 *  = 0b11111111111111111010101001010000
	 *       3 3 3 3 3 3 3 3 2 2 2 2 1 1 0 0
	 * hex (thanks, bc!)
	 *  = 0xffffaa50
	 */
	return r + ((0xffffaa50u >> (2 * t)) & 0x3);
}

static __inline__ int
popcount_fls(int t)
{
	unsigned long x = t & 0xffffffffu;

	if (!x)
		return 0;
	x |= x >> 1;
	x |= x >> 2;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;
	return popcount(x);
}

static __inline__ unsigned long
popcount_fls64(unsigned long x)
{
	x |= x >> 1;
	x |= x >> 2;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;
	x |= x >> 32;
	return popcount(x) - 1;
}

static __inline__ int
clz_fls (int x)
{
	if (!x)
		return 0;
	return 32 - __builtin_clz(x);
}

int testval[] = { -2, -1, 0, 6, 10 };

static inline void
check (int val)
{
	if (arch_fls(val) != generic_fls(val))
		printf ("arch_fls(%08x) %6d generic_fls(%08x) %6d\n",
			val, arch_fls(val), val, generic_fls(val));
	if (womack_fls(val) != generic_fls(val))
		printf ("arch_fls(%08x) %6d generic_fls(%08x) %6d\n",
			val, womack_fls(val), val, generic_fls(val));
	if (popcount_fls(val) != generic_fls(val))
		printf ("popcount_fls(%08x) %6d generic_fls(%08x) %6d\n",
			val, popcount_fls(val), val, generic_fls(val));
	if (clz_fls(val) != generic_fls(val))
		printf ("clz_fls(%08x) %6d generic_fls(%08x) %6d\n",
			val, clz_fls(val), val, generic_fls(val));
}

static inline void
check64 (unsigned long val)
{
	if (!val)
		return;

	if (popcount_fls64(val) != ia64_fls(val))
		printf ("popcount_fls64(%016lx) %6ld ia64_fls(%016lx) %6ld\n",
			val, popcount_fls64(val), val, ia64_fls(val));
}

static int
empty (int x)
{
	return 0;
}

static double
measure (const char *label, int (*func)(int), double offset)
{
	struct timeval start, stop;
	long i, count = 1000000, sum = 0;
	double t;

	while (1) {
		gettimeofday(&start, NULL);
		for (i = 0; i < count; ++i)
			(*func)(i | (i << 16));
		gettimeofday(&stop, NULL);

		t = ((stop.tv_sec + 1e-6*stop.tv_usec)
		     - (start.tv_sec + 1e-6*start.tv_usec));

		if (t >= 1.0)
			break;

		if (t <= 0.0)
			t = 1e-3;

		count = (1.1 / t) * count;
	}
	t = (t / count) * 1e9 - offset;

	printf ("%10s: %9.3f ns (sum=%lx)\n", label, t, sum);
	return t;
}

static double
measure64 (const char *label, unsigned long (*func)(unsigned long),
	   double offset)
{
	struct timeval start, stop;
	long i, count = 1000000, sum = 0;
	double t;

	while (1) {
		gettimeofday(&start, NULL);
		for (i = 0; i < count; ++i)
			(*func)(i | (i << 16));
		gettimeofday(&stop, NULL);

		t = ((stop.tv_sec + 1e-6*stop.tv_usec)
		     - (start.tv_sec + 1e-6*start.tv_usec));

		if (t >= 1.0)
			break;

		if (t <= 0.0)
			t = 1e-3;

		count = (1.1 / t) * count;
	}
	t = (t / count) * 1e9 - offset;

	printf ("%10s: %9.3f ns (sum=0x%lx)\n", label, t, sum);
	return t;
}

int
main (int argc, char **argv)
{
	double overhead;
	int j;

	for (j = 0; j < sizeof(testval)/sizeof(int); j++) {
		check(testval[j]);
		check64(testval[j]);
	}
	for (j = 0; j < 32; ++j) {
		check(1 << j);
		check((1 << j) - 1);
		check((1 << j) + 1);
	}
	for (j = 0; j < 64; ++j) {
		check(1UL << j);
		check((1UL << j) - 1);
		check((1UL << j) + 1);
	}

	printf ("done with correctness test\n");

	overhead = measure("overhead", empty, 0.0);
	measure("generic", generic_fls, overhead);
	measure("womack", womack_fls, overhead);
	measure("popcount", popcount_fls, overhead);
	measure("arch", arch_fls, overhead);
	measure("clz", clz_fls, overhead);
	measure64("ia64_fls", ia64_fls, overhead);
	measure64("popcount64", popcount_fls64, overhead);
	return 0;
}
-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Received on Sat Apr 9 02:01:25 2005

This archive was generated by hypermail 2.1.8 : 2005-08-02 09:20:37 EST