I suspect your template will be faster, but only because the optimizer never emits a `rol` instruction for the plain shift-and-or version:
/*
 * Rotate a 32-bit value by a compile-time constant with m68k inline asm.
 *
 * The template argument N is reduced to a right-rotate count in 0..31:
 * rotate<1> emits "rorl #1" and rotate<-1> emits "roll #1" (see the
 * disassembly of main further down in this file).
 *
 * Instruction selection, by right-rotate count:
 *   0       no instruction, val is returned unchanged
 *   1..8    rorl with an immediate count
 *   24..31  roll with the complementary immediate count (1..8)
 *   16      swap (exchanges the two 16-bit halves of the register)
 *   other   rorl with the count passed in a data register
 *
 * The "I" operands must be integer constant expressions, which is why the
 * counts are enumerators rather than run-time variables. Branches that are
 * not taken for a given N still contain asm statements, so this relies on
 * the compiler discarding them, exactly as the open-coded form did.
 */
template <signed N> static inline unsigned rotate(unsigned val)
{
    enum {
        ror_bits = (32 - (-(N % 32))) % 32, /* right-rotate count, 0..31 */
        rol_bits = 32 - ror_bits            /* equivalent left-rotate count */
    };

    if (ror_bits == 0) {
        return val;
    }

    if (ror_bits < 9) {
        /* Small right rotate: the count fits the immediate form. */
        asm("rorl %1, %0;" : "=d"(val) : "I"(ror_bits), "0"(val) : "cc");
    }
    else if (ror_bits > 23) {
        /* Large right rotate: cheaper as a small left rotate. */
        asm("roll %1, %0;" : "=d"(val) : "I"(rol_bits), "0"(val) : "cc");
    }
    else if (ror_bits == 16) {
        /* Half-width rotate: swapping the register halves does the job. */
        asm("swap %0;" : "=d"(val) : "0"(val) : "cc");
    }
    else {
        /* Mid-range counts do not fit the immediate form; use a register. */
        asm("rorl %1, %0;" : "=d"(val) : "d"((32 - (-(N % 32))) % 32), "0"(val) : "cc");
    }

    return val;
}
/*
 * Rotate val left by shift bit positions, treating val as a 32-bit quantity.
 *
 * Only the low five bits of shift are used, so the count is effectively
 * taken modulo 32 (a count of 32 behaves like 0). Assumes unsigned is
 * 32 bits wide.
 *
 * Returns the rotated value.
 */
static inline unsigned _rotl(unsigned val, int shift)
{
    shift &= 0x1f;
    /*
     * The complementary count is masked too: with shift == 0 an unmasked
     * (0x20 - shift) would shift by the full width of the type, which is
     * undefined behaviour. Masked, it becomes a shift by 0 and the result
     * is simply val.
     */
    return (val << shift) | (val >> ((0x20 - shift) & 0x1f));
}
/*
 * Rotate val right by shift bit positions, treating val as a 32-bit quantity.
 *
 * Only the low five bits of shift are used, so the count is effectively
 * taken modulo 32 (a count of 32 behaves like 0). Assumes unsigned is
 * 32 bits wide.
 *
 * Returns the rotated value.
 */
static inline unsigned _rotr(unsigned val, int shift)
{
    shift &= 0x1f;
    /*
     * The complementary count is masked too: with shift == 0 an unmasked
     * (0x20 - shift) would shift by the full width of the type, which is
     * undefined behaviour. Masked, it becomes a shift by 0 and the result
     * is simply val.
     */
    return (val >> shift) | (val << ((0x20 - shift) & 0x1f));
}
/*
 * Code-generation probe: performs the same one-bit rotations through the
 * portable helpers and through the asm-based template so the emitted
 * instructions can be compared in the disassembly listing that follows.
 * The results are never used; the program always returns 0.
 */
int main(void)
{
/* Every variable is volatile; in the listing each one is loaded from and
   stored to its own stack slot, so every rotation shows up separately. */
volatile unsigned x = 1;
/* Portable rotate-left by 1: compiled as "moveq #31,%d1; rorl %d1,%d0". */
volatile unsigned a = _rotl(x, 1);
/* Portable rotate-right by 1: compiled as "rorl #1,%d0". */
volatile unsigned b = _rotr(x, 1);
/* Template with a negative count: compiled as "roll #1,%d0". */
volatile unsigned c = rotate<-1>(x);
/* Template with a positive count: compiled as "rorl #1,%d0". */
volatile unsigned d = rotate<1>(x);
return 0;
}
/*
00000000 <main>:
0: 4e56 ffec linkw %fp,#-20
volatile unsigned x = 1;
4: 7001 moveq #1,%d0
volatile unsigned a = _rotl(x, 1);
6: 2d40 fffc movel %d0,%fp@(-4)
a: 202e fffc movel %fp@(-4),%d0
e: 721f moveq #31,%d1
10: e2b8 rorl %d1,%d0
12: 2d40 fff8 movel %d0,%fp@(-8)
volatile unsigned b = _rotr(x, 1);
16: 202e fffc movel %fp@(-4),%d0
1a: e298 rorl #1,%d0
1c: 2d40 fff4 movel %d0,%fp@(-12)
volatile unsigned c = rotate<-1>(x);
20: 202e fffc movel %fp@(-4),%d0
24: e398 roll #1,%d0
26: 2d40 fff0 movel %d0,%fp@(-16)
volatile unsigned d = rotate<1>(x);
2a: 202e fffc movel %fp@(-4),%d0
2e: e298 rorl #1,%d0
30: 2d40 ffec movel %d0,%fp@(-20)
return 0;
34: 4280 clrl %d0
36: 4e5e unlk %fp
38: 4e75 rts
*/
I don't really know how the optimizer works, so I can't say why it always opts for a `ror` (even loading a count of 31 into a register for `_rotl`) rather than the equivalent `rol #1`.