Looking at Microsoft's implementation, 8-bit and 16-bit rotations are intrinsics, while 32-bit and 64-bit rotations are functions. The functions are implemented using the traditional combination of bitwise shifts and a bitwise OR.
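On x86, 16-bit rotations load the operand into a 32-bit register (mov eax,1), but the rotate itself is performed on the 16-bit ax and only 16 bits are stored, so the result is truncated to 16 bits (the x64 build uses ax throughout):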
x86
volatile unsigned char a = _rotl8(1, 1);
013213DE mov al,1
013213E0 rol al,1
013213E2 mov byte ptr [a],al
volatile unsigned short b = _rotl16(1, 1);
013213E5 mov eax,1
013213EA rol ax,1
013213ED mov word ptr [b],ax
volatile unsigned c = _rotl(1, 1);
013213F1 push 1
013213F3 push 1
013213F5 call @ILT+155(__rotl) (13210A0h)
013213FA add esp,8
013213FD mov dword ptr [c],eax
volatile unsigned long d = _lrotl(1, 1);
01321400 push 1
01321402 push 1
01321404 call @ILT+65(__lrotl) (1321046h)
01321409 add esp,8
0132140C mov dword ptr [d],eax
volatile unsigned __int64 e = _rotl64(1, 1);
0132140F push 1
01321411 push 0
01321413 push 1
01321415 call @ILT+280(__rotl64) (132111Dh)
0132141A add esp,0Ch
0132141D mov dword ptr [e],eax
01321420 mov dword ptr [ebp-38h],edx
x64
volatile unsigned char a = _rotl8(1, 1);
000000013FBF102A mov al,1
000000013FBF102C rol al,1
000000013FBF102E mov byte ptr [a],al
volatile unsigned short b = _rotl16(1, 1);
000000013FBF1032 mov ax,1
000000013FBF1036 rol ax,1
000000013FBF1039 mov word ptr [b],ax
volatile unsigned c = _rotl(1, 1);
000000013FBF103E mov edx,1
000000013FBF1043 mov ecx,1
000000013FBF1048 call _rotl (13FBF10A8h)
000000013FBF104D mov dword ptr [c],eax
volatile unsigned long d = _lrotl(1, 1);
000000013FBF1051 mov edx,1
000000013FBF1056 mov ecx,1
000000013FBF105B call _lrotl (13FBF10A2h)
000000013FBF1060 mov dword ptr [d],eax
volatile unsigned __int64 e = _rotl64(1, 1);
000000013FBF1064 mov edx,1
000000013FBF1069 mov ecx,1
000000013FBF106E call _rotl64 (13FBF109Ch)
000000013FBF1073 mov qword ptr [e],rax
The good news--for Visual C++, anyway--is that the shift-or combination is correctly optimized into a single rotate instruction. Microsoft's implementation also reduces the shift count first with N & 31 (shift &= 0x1f):
___rotl PROC ; COMDAT
; 8 : shift &= 0x1f;
; 9 : val = (val>>(0x20 - shift)) | (val << shift);
00000 b8 01 00 00 00 mov eax, 1
00005 d1 c0 rol eax, 1
; 10 : return val;
; 11 : }
00007 c3 ret 0
___rotl ENDP
Anyhow, just food for thought. I know GCC's m68k optimizer needs work, even in current versions.