gcc 2.95.3 isn't that bad, actually. For the most part, it optimizes in the same way your template would.
/*
 * Rotate the low 32 bits of val left by shift bit positions.
 *
 * val   - value to rotate (unsigned is assumed to be 32 bits wide).
 * shift - rotation count; only its low five bits are used, so counts of
 *         32, 64, ... behave like 0 and 36 behaves like 4.
 *
 * Returns the rotated value.
 */
static inline unsigned _rotl(unsigned val, int shift)
{
    unsigned count = (unsigned)shift & 0x1fu;

    /*
     * The complementary count is masked as well: without the mask a
     * count of 0 would shift right by 32, which is undefined behaviour
     * for a 32-bit operand.  With it, count == 0 yields val | val == val.
     */
    return (val << count) | (val >> ((0x20u - count) & 0x1fu));
}
/*
 * Rotate the low 32 bits of val right by shift bit positions.
 *
 * val   - value to rotate (unsigned is assumed to be 32 bits wide).
 * shift - rotation count; only its low five bits are used, so counts of
 *         32, 64, ... behave like 0 and 36 behaves like 4.
 *
 * Returns the rotated value.
 */
static inline unsigned _rotr(unsigned val, int shift)
{
    unsigned count = (unsigned)shift & 0x1fu;

    /*
     * The complementary count is masked as well: without the mask a
     * count of 0 would shift left by 32, which is undefined behaviour
     * for a 32-bit operand.  With it, count == 0 yields val | val == val.
     */
    return (val >> count) | (val << ((0x20u - count) & 0x1fu));
}
/*
 * Test driver: rotates x left and right by a series of constant shift
 * counts so that the code the compiler generates for each individual
 * call can be inspected in a disassembly.  Always returns 0.
 */
int main()
{
/* volatile: x is re-read for every call and every result is stored,
 * so none of the rotations can be optimized away. */
volatile unsigned x = 1;
/* Left rotations, counts descending from 64 to 0.  Counts of 32 and
 * above exercise the "shift &= 0x1f" masking in _rotl. */
volatile unsigned c = _rotl(x, 64);
volatile unsigned d = _rotl(x, 48);
volatile unsigned e = _rotl(x, 41);
volatile unsigned f = _rotl(x, 36);
volatile unsigned g = _rotl(x, 32);
volatile unsigned h = _rotl(x, 24);
volatile unsigned i = _rotl(x, 16);
volatile unsigned j = _rotl(x, 9);
volatile unsigned k = _rotl(x, 4);
volatile unsigned l = _rotl(x, 0);
/* Right rotations, the same counts ascending from 0 to 64. */
volatile unsigned m = _rotr(x, 0);
volatile unsigned n = _rotr(x, 4);
volatile unsigned o = _rotr(x, 9);
volatile unsigned p = _rotr(x, 16);
volatile unsigned q = _rotr(x, 24);
volatile unsigned r = _rotr(x, 32);
volatile unsigned s = _rotr(x, 36);
volatile unsigned t = _rotr(x, 41);
volatile unsigned u = _rotr(x, 48);
volatile unsigned v = _rotr(x, 64);
return 0;
}
00000000 <main>:
0: 4e56 ffac linkw %fp,#-84
4: 4eb9 0000 0000 jsr 0
volatile unsigned x = 1;
a: 7001 moveq #1,%d0
c: 2d40 fffc movel %d0,%fp@(-4)
volatile unsigned c = _rotl(x, 64);
10: 202e fffc movel %fp@(-4),%d0
14: 2d40 fff8 movel %d0,%fp@(-8)
volatile unsigned d = _rotl(x, 48);
18: 202e fffc movel %fp@(-4),%d0
1c: 4840 swap %d0
1e: 2d40 fff4 movel %d0,%fp@(-12)
volatile unsigned e = _rotl(x, 41);
22: 202e fffc movel %fp@(-4),%d0
26: 7209 moveq #9,%d1
28: e3b8 roll %d1,%d0
2a: 2d40 fff0 movel %d0,%fp@(-16)
volatile unsigned f = _rotl(x, 36);
2e: 202e fffc movel %fp@(-4),%d0
32: e998 roll #4,%d0
34: 2d40 ffec movel %d0,%fp@(-20)
volatile unsigned g = _rotl(x, 32);
38: 202e fffc movel %fp@(-4),%d0
3c: 2d40 ffe8 movel %d0,%fp@(-24)
volatile unsigned h = _rotl(x, 24);
40: 202e fffc movel %fp@(-4),%d0
44: e098 rorl #8,%d0
46: 2d40 ffe4 movel %d0,%fp@(-28)
volatile unsigned i = _rotl(x, 16);
4a: 202e fffc movel %fp@(-4),%d0
4e: 4840 swap %d0
50: 2d40 ffe0 movel %d0,%fp@(-32)
volatile unsigned j = _rotl(x, 9);
54: 202e fffc movel %fp@(-4),%d0
58: e3b8 roll %d1,%d0
5a: 2d40 ffdc movel %d0,%fp@(-36)
volatile unsigned k = _rotl(x, 4);
5e: 202e fffc movel %fp@(-4),%d0
62: e998 roll #4,%d0
64: 2d40 ffd8 movel %d0,%fp@(-40)
volatile unsigned l = _rotl(x, 0);
68: 202e fffc movel %fp@(-4),%d0
6c: 2d40 ffd4 movel %d0,%fp@(-44)
volatile unsigned m = _rotr(x, 0);
70: 202e fffc movel %fp@(-4),%d0
74: 2d40 ffd0 movel %d0,%fp@(-48)
volatile unsigned n = _rotr(x, 4);
78: 202e fffc movel %fp@(-4),%d0
7c: e898 rorl #4,%d0
7e: 2d40 ffcc movel %d0,%fp@(-52)
volatile unsigned o = _rotr(x, 9);
82: 202e fffc movel %fp@(-4),%d0
86: e2b8 rorl %d1,%d0
88: 2d40 ffc8 movel %d0,%fp@(-56)
volatile unsigned p = _rotr(x, 16);
8c: 202e fffc movel %fp@(-4),%d0
90: 7210 moveq #16,%d1
92: e2b8 rorl %d1,%d0
94: 2d40 ffc4 movel %d0,%fp@(-60)
volatile unsigned q = _rotr(x, 24);
98: 202e fffc movel %fp@(-4),%d0
9c: 7218 moveq #24,%d1
9e: e2b8 rorl %d1,%d0
a0: 2d40 ffc0 movel %d0,%fp@(-64)
volatile unsigned r = _rotr(x, 32);
a4: 202e fffc movel %fp@(-4),%d0
a8: 2d40 ffbc movel %d0,%fp@(-68)
volatile unsigned s = _rotr(x, 36);
ac: 202e fffc movel %fp@(-4),%d0
b0: e898 rorl #4,%d0
b2: 2d40 ffb8 movel %d0,%fp@(-72)
volatile unsigned t = _rotr(x, 41);
b6: 202e fffc movel %fp@(-4),%d0
ba: 7209 moveq #9,%d1
bc: e2b8 rorl %d1,%d0
be: 2d40 ffb4 movel %d0,%fp@(-76)
volatile unsigned u = _rotr(x, 48);
c2: 202e fffc movel %fp@(-4),%d0
c6: 7210 moveq #16,%d1
c8: e2b8 rorl %d1,%d0
ca: 2d40 ffb0 movel %d0,%fp@(-80)
volatile unsigned v = _rotr(x, 64);
ce: 202e fffc movel %fp@(-4),%d0
d2: 2d40 ffac movel %d0,%fp@(-84)
return 0;
d6: 4280 clrl %d0
d8: 4e5e unlk %fp
da: 4e75 rts
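If I had to choose a compiler based on this alone, I'd go with gcc 2.95.3. Notice, though, how it does a swap on _rotl(x, 16) but not on _rotr(x, 16), where it loads the count into a register and uses rorl instead. The same goes for direction changes for large shifts: _rotl(x, 24) becomes rorl #8, but _rotr(x, 24) is not turned into roll #8.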
Your template is better in that regard, but as you noted, you might exclude the templated asm from further optimization. I think, though, that the code should be optimized (or at least scheduled) properly as long as you don't use asm volatile (...).