using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.InteropServices;
+using static System.IO.Hashing.VectorHelper;

namespace System.IO.Hashing
{
@@ -17,7 +18,9 @@ public partial class Crc32
        private static bool CanBeVectorized(ReadOnlySpan<byte> source) =>
            BitConverter.IsLittleEndian
            && VectorHelper.IsSupported
-            && source.Length >= Vector128<byte>.Count * 4;
+            // Vectorization can process spans as short as a single vector (16 bytes), but when the ARM CRC32
+            // intrinsics are supported they appear to be more performant for spans shorter than 8 vectors (128 bytes).
+            && source.Length >= Vector128<byte>.Count * (System.Runtime.Intrinsics.Arm.Crc32.IsSupported ? 8 : 1);

        // Processes the bytes in source in 64 byte chunks using carryless/polynomial multiplication intrinsics,
        // followed by processing 16 byte chunks, and then processing remaining bytes individually. Requires
@@ -35,102 +38,81 @@ private static uint UpdateVectorized(uint crc, ReadOnlySpan<byte> source)
            ref byte srcRef = ref MemoryMarshal.GetReference(source);
            int length = source.Length;

-            Vector128<ulong> x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-            Vector128<ulong> x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
-            Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
-            Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
-            Vector128<ulong> x5;
+            Vector128<ulong> kConstants;
+            Vector128<ulong> x1; // Accumulator for the new CRC
+            Vector128<ulong> x2;

-            x1 ^= Vector128.CreateScalar(crc).AsUInt64();
-            Vector128<ulong> x0 = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2
-
-            srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
-            length -= Vector128<byte>.Count * 4;
-
-            // Parallel fold blocks of 64, if any.
-            while (length >= Vector128<byte>.Count * 4)
+            if (length >= Vector128<byte>.Count * 8)
            {
-                x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-                Vector128<ulong> x6 = VectorHelper.CarrylessMultiplyLower(x2, x0);
-                Vector128<ulong> x7 = VectorHelper.CarrylessMultiplyLower(x3, x0);
-                Vector128<ulong> x8 = VectorHelper.CarrylessMultiplyLower(x4, x0);
-
-                x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-                x2 = VectorHelper.CarrylessMultiplyUpper(x2, x0);
-                x3 = VectorHelper.CarrylessMultiplyUpper(x3, x0);
-                x4 = VectorHelper.CarrylessMultiplyUpper(x4, x0);
-
-                Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-                Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
-                Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
-                Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
-
-                x1 ^= x5;
-                x2 ^= x6;
-                x3 ^= x7;
-                x4 ^= x8;
-
-                x1 ^= y5;
-                x2 ^= y6;
-                x3 ^= y7;
-                x4 ^= y8;
+                x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                x2 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
+                Vector128<ulong> x3 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
+                Vector128<ulong> x4 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();

                srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
                length -= Vector128<byte>.Count * 4;
-            }
-
-            // Fold into 128-bits.
-            x0 = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4

-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x2;
-            x1 ^= x5;
+                // Load and XOR the initial CRC value
+                x1 ^= Vector128.CreateScalar(crc).AsUInt64();
+
+                kConstants = Vector128.Create(0x0154442bd4UL, 0x01c6e41596UL); // k1, k2
+
+                // Parallel fold blocks of 64, if any.
+                do
+                {
+                    Vector128<ulong> y5 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                    Vector128<ulong> y6 = Vector128.LoadUnsafe(ref srcRef, 16).AsUInt64();
+                    Vector128<ulong> y7 = Vector128.LoadUnsafe(ref srcRef, 32).AsUInt64();
+                    Vector128<ulong> y8 = Vector128.LoadUnsafe(ref srcRef, 48).AsUInt64();
+
+                    x1 = FoldPolynomialPair(y5, x1, kConstants);
+                    x2 = FoldPolynomialPair(y6, x2, kConstants);
+                    x3 = FoldPolynomialPair(y7, x3, kConstants);
+                    x4 = FoldPolynomialPair(y8, x4, kConstants);
+
+                    srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count * 4);
+                    length -= Vector128<byte>.Count * 4;
+                } while (length >= Vector128<byte>.Count * 4);
+
+                // Fold into 128-bits.
+                kConstants = Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL); // k3, k4
+                x1 = FoldPolynomialPair(x2, x1, kConstants);
+                x1 = FoldPolynomialPair(x3, x1, kConstants);
+                x1 = FoldPolynomialPair(x4, x1, kConstants);
+            }
+            else
+            {
+                // For shorter sources just load the first vector and XOR with the CRC
+                Debug.Assert(length >= 16);

-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x3;
-            x1 ^= x5;
+                x1 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
+                x1 ^= Vector128.CreateScalar(crc).AsUInt64();

-            x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-            x1 ^= x4;
-            x1 ^= x5;
+                srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
+                length -= Vector128<byte>.Count;
+            }

            // Single fold blocks of 16, if any.
            while (length >= Vector128<byte>.Count)
            {
-                x2 = Vector128.LoadUnsafe(ref srcRef).AsUInt64();
-
-                x5 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-                x1 = VectorHelper.CarrylessMultiplyUpper(x1, x0);
-                x1 ^= x2;
-                x1 ^= x5;
+                x1 = FoldPolynomialPair(Vector128.LoadUnsafe(ref srcRef).AsUInt64(), x1,
+                    Vector128.Create(0x01751997d0UL, 0x00ccaa009eUL));

                srcRef = ref Unsafe.Add(ref srcRef, Vector128<byte>.Count);
                length -= Vector128<byte>.Count;
            }

            // Fold 128 bits to 64 bits.
-            x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x1, x0);
-            x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
-            x1 = VectorHelper.ShiftRightBytesInVector(x1, 8);
-            x1 ^= x2;
-
-            x0 = Vector128.CreateScalar(0x0163cd6124UL); // k5, k0
-
-            x2 = VectorHelper.ShiftRightBytesInVector(x1, 4);
-            x1 &= x3;
-            x1 = VectorHelper.CarrylessMultiplyLower(x1, x0);
-            x1 ^= x2;
+            Vector128<ulong> bitmask = Vector128.Create(~0, 0, ~0, 0).AsUInt64();
+            x1 = ShiftRightBytesInVector(x1, 8) ^
+                CarrylessMultiplyLower(x1, Vector128.CreateScalar(0x00ccaa009eUL));
+            x1 = CarrylessMultiplyLower(x1 & bitmask, Vector128.CreateScalar(0x0163cd6124UL)) ^ // k5, k0
+                ShiftRightBytesInVector(x1, 4);

            // Reduce to 32 bits.
-            x0 = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
-
-            x2 = x1 & x3;
-            x2 = VectorHelper.CarrylessMultiplyLeftLowerRightUpper(x2, x0);
-            x2 &= x3;
-            x2 = VectorHelper.CarrylessMultiplyLower(x2, x0);
+            kConstants = Vector128.Create(0x01db710641UL, 0x01f7011641UL); // polynomial
+            x2 = CarrylessMultiplyLeftLowerRightUpper(x1 & bitmask, kConstants) & bitmask;
+            x2 = CarrylessMultiplyLower(x2, kConstants);
            x1 ^= x2;

            // Process the remaining bytes, if any
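
Note: the rewritten method relies on a FoldPolynomialPair helper brought into scope by the new `using static System.IO.Hashing.VectorHelper;` directive, and that helper is not shown in this hunk. Judging from the lower/upper carryless-multiply-and-XOR sequences it replaces, it presumably looks something like the sketch below; the parameter names are illustrative, and it reuses the CarrylessMultiplyLower/CarrylessMultiplyUpper helpers that appear on the removed lines.

// Hypothetical sketch of the FoldPolynomialPair helper assumed by this diff.
// Folds the running 128-bit CRC state forward by one block: carryless-multiply the
// current accumulator by the pair of folding constants (lower and upper halves)
// and XOR in the freshly loaded data vector.
public static Vector128<ulong> FoldPolynomialPair(Vector128<ulong> data, Vector128<ulong> accumulator, Vector128<ulong> foldConstants)
{
    return CarrylessMultiplyLower(accumulator, foldConstants)
        ^ CarrylessMultiplyUpper(accumulator, foldConstants)
        ^ data;
}

With such a helper, the removed four-statement pattern (two carryless multiplies plus two XORs per block) collapses to a single call per block, e.g. x1 = FoldPolynomialPair(y5, x1, kConstants) in the 64-byte folding loop above.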