nexmon – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | /* strcspn with SSE4.2 intrinsics |
2 | Copyright (C) 2009-2014 Free Software Foundation, Inc. |
||
3 | Contributed by Intel Corporation. |
||
4 | This file is part of the GNU C Library. |
||
5 | |||
6 | The GNU C Library is free software; you can redistribute it and/or |
||
7 | modify it under the terms of the GNU Lesser General Public |
||
8 | License as published by the Free Software Foundation; either |
||
9 | version 2.1 of the License, or (at your option) any later version. |
||
10 | |||
11 | The GNU C Library is distributed in the hope that it will be useful, |
||
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
14 | Lesser General Public License for more details. |
||
15 | |||
16 | You should have received a copy of the GNU Lesser General Public |
||
17 | License along with the GNU C Library; if not, see |
||
18 | <http://www.gnu.org/licenses/>. */ |
||
19 | |||
20 | |||
21 | #include "config.h" |
||
22 | |||
23 | #ifdef HAVE_SSE4_2 |
||
24 | |||
25 | #include <glib.h> |
||
26 | #include "ws_cpuid.h" |
||
27 | |||
28 | #ifdef _WIN32 |
||
29 | #include <tmmintrin.h> |
||
30 | #endif |
||
31 | |||
32 | #include <nmmintrin.h> |
||
33 | #include <string.h> |
||
34 | #include "ws_mempbrk.h" |
||
35 | #include "ws_mempbrk_int.h" |
||
36 | |||
37 | /* __has_feature(address_sanitizer) is used later for Clang, this is for |
||
38 | * compatibility with other compilers (such as GCC and MSVC) */ |
||
39 | #ifndef __has_feature |
||
40 | # define __has_feature(x) 0 |
||
41 | #endif |
||
42 | |||
/* Reinterpret P as a pointer to const __m128i.  The cast is routed through
   `const void *', presumably to keep cast-alignment warnings quiet (TODO
   confirm).  The macro does not align anything itself: callers must pair it
   with a load that matches the real alignment of P (_mm_load_si128 for
   16-byte aligned addresses, _mm_loadu_si128 otherwise). */
#define cast_128aligned__m128i(p) ((const __m128i *) (const void *) (p))
||
44 | |||
45 | /* Helper for variable shifts of SSE registers. |
||
46 | Copyright (C) 2010 Free Software Foundation, Inc. |
||
47 | */ |
||
48 | |||
49 | static const gint8 ___m128i_shift_right[31] = |
||
50 | { |
||
51 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
||
52 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 |
||
53 | }; |
||
54 | |||
55 | static inline __m128i |
||
56 | __m128i_shift_right (__m128i value, unsigned long int offset) |
||
57 | { |
||
58 | /* _mm_loadu_si128() works with unaligned data, cast safe */ |
||
59 | return _mm_shuffle_epi8 (value, |
||
60 | _mm_loadu_si128 (cast_128aligned__m128i(___m128i_shift_right + offset))); |
||
61 | } |
||
62 | |||
63 | |||
64 | void |
||
65 | ws_mempbrk_sse42_compile(ws_mempbrk_pattern* pattern, const gchar *needles) |
||
66 | { |
||
67 | size_t length = strlen(needles); |
||
68 | |||
69 | pattern->use_sse42 = ws_cpuid_sse42() && (length <= 16); |
||
70 | |||
71 | if (pattern->use_sse42) { |
||
72 | pattern->mask = _mm_setzero_si128(); |
||
73 | memcpy(&(pattern->mask), needles, length); |
||
74 | } |
||
75 | } |
||
76 | |||
77 | /* We use 0x2: |
||
78 | _SIDD_SBYTE_OPS |
||
79 | | _SIDD_CMP_EQUAL_ANY |
||
80 | | _SIDD_POSITIVE_POLARITY |
||
81 | | _SIDD_LEAST_SIGNIFICANT |
||
82 | on pcmpistri to compare xmm/mem128 |
||
83 | |||
84 | |||
85 | X X X X X X X X X X X X X X X X |
||
86 | |||
87 | against xmm |
||
88 | |||
89 | |||
90 | A A A A A A A A A A A A A A A A |
||
91 | |||
92 | to find out if the first 16byte data element has any byte A and |
||
93 | the offset of the first byte. There are 3 cases: |
||
94 | |||
95 | 1. The first 16byte data element has the byte A at the offset X. |
||
96 | 2. The first 16byte data element has EOS and doesn't have the byte A. |
||
97 | 3. The first 16byte data element is valid and doesn't have the byte A. |
||
98 | |||
99 | Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases: |
||
100 | |||
101 | 1 X 1 0/1 0 |
||
102 | 2 16 0 1 0 |
||
103 | 3 16 0 0 0 |
||
104 | |||
105 | We exit from the loop for cases 1 and 2 with jbe which branches |
||
106 | when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset |
||
107 | X for case 1. */ |
||
108 | |||
109 | const char * |
||
110 | ws_mempbrk_sse42_exec(const char *s, size_t slen, const ws_mempbrk_pattern* pattern, guchar *found_needle) |
||
111 | { |
||
112 | const char *aligned; |
||
113 | int offset; |
||
114 | |||
115 | offset = (int) ((size_t) s & 15); |
||
116 | aligned = (const char *) ((size_t) s & -16L); |
||
117 | if (offset != 0) |
||
118 | { |
||
119 | /* Check partial string. cast safe it's 16B aligned */ |
||
120 | __m128i value = __m128i_shift_right (_mm_load_si128 (cast_128aligned__m128i(aligned)), offset); |
||
121 | |||
122 | int length = _mm_cmpistri (pattern->mask, value, 0x2); |
||
123 | /* No need to check ZFlag since ZFlag is always 1. */ |
||
124 | int cflag = _mm_cmpistrc (pattern->mask, value, 0x2); |
||
125 | /* XXX: why does this compare value with value? */ |
||
126 | int idx = _mm_cmpistri (value, value, 0x3a); |
||
127 | |||
128 | if (cflag) { |
||
129 | if (found_needle) |
||
130 | *found_needle = *(s + length); |
||
131 | return s + length; |
||
132 | } |
||
133 | |||
134 | /* Find where the NULL terminator is. */ |
||
135 | if (idx < 16 - offset) |
||
136 | { |
||
137 | /* found NUL @ 'idx', need to switch to slower mempbrk */ |
||
138 | return ws_mempbrk_portable_exec(s + idx + 1, slen - idx - 1, pattern, found_needle); /* slen is bigger than 16 & idx < 16 so no undeflow here */ |
||
139 | } |
||
140 | aligned += 16; |
||
141 | slen -= (16 - offset); |
||
142 | } |
||
143 | else |
||
144 | aligned = s; |
||
145 | |||
146 | while (slen >= 16) |
||
147 | { |
||
148 | __m128i value = _mm_load_si128 (cast_128aligned__m128i(aligned)); |
||
149 | int idx = _mm_cmpistri (pattern->mask, value, 0x2); |
||
150 | int cflag = _mm_cmpistrc (pattern->mask, value, 0x2); |
||
151 | int zflag = _mm_cmpistrz (pattern->mask, value, 0x2); |
||
152 | |||
153 | if (cflag) { |
||
154 | if (found_needle) |
||
155 | *found_needle = *(aligned + idx); |
||
156 | return aligned + idx; |
||
157 | } |
||
158 | |||
159 | if (zflag) |
||
160 | { |
||
161 | /* found NUL, need to switch to slower mempbrk */ |
||
162 | return ws_mempbrk_portable_exec(aligned, slen, pattern, found_needle); |
||
163 | } |
||
164 | aligned += 16; |
||
165 | slen -= 16; |
||
166 | } |
||
167 | |||
168 | /* XXX, use mempbrk_slow here? */ |
||
169 | return ws_mempbrk_portable_exec(aligned, slen, pattern, found_needle); |
||
170 | } |
||
171 | |||
172 | #endif /* HAVE_SSE4_2 */ |
||
173 | /* |
||
174 | * Editor modelines |
||
175 | * |
||
176 | * Local Variables: |
||
177 | * c-basic-offset: 2 |
||
178 | * tab-width: 8 |
||
179 | * indent-tabs-mode: nil |
||
180 | * End: |
||
181 | * |
||
182 | * ex: set shiftwidth=2 tabstop=8 expandtab: |
||
183 | * :indentSize=2:tabSize=8:noTabs=true: |
||
184 | */ |