mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
Compare commits
898 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
9e9a10ad01 | ||
|
c057c7f0f0 | ||
|
f7d5546fe5 | ||
|
689556d5f9 | ||
|
d90ab3ac1c | ||
|
7e0503c3b8 | ||
|
5e62255667 | ||
|
55a05e41a0 | ||
|
4951b6186d | ||
|
4f09e785c0 | ||
|
9a3268b047 | ||
|
5145b6d2ab | ||
|
de15179c57 | ||
|
e4c49f2aa2 | ||
|
aa4bc24439 | ||
|
6c8e33e597 | ||
|
1dc0600156 | ||
|
dd43c86658 | ||
|
cc11a3d738 | ||
|
aa832db892 | ||
|
0f4369bf22 | ||
|
0e0c9f8c63 | ||
|
a68845c82b | ||
|
834a329daa | ||
|
8fc1a7efff | ||
|
4113a1f150 | ||
|
9987ecd4a0 | ||
|
00a5ff1c67 | ||
|
e36203c323 | ||
|
85ffb2b2f1 | ||
|
c5c4c5d5f5 | ||
|
9c0beb57f8 | ||
|
de1697b467 | ||
|
faa9e7549f | ||
|
d23e4a12e7 | ||
|
aa6acaec84 | ||
|
c837925087 | ||
|
cebc6541c1 | ||
|
16467faf2b | ||
|
0e271ccf9a | ||
|
938c026256 | ||
|
af39f77461 | ||
|
94eff4aa60 | ||
|
b312112e87 | ||
|
fd46b72a18 | ||
|
2ec64b6f07 | ||
|
82067fd526 | ||
|
dfa72ffd50 | ||
|
7a5f271abe | ||
|
cd39f71e80 | ||
|
e111684bc2 | ||
|
dc18a14663 | ||
|
40da067b4f | ||
|
92d5db503e | ||
|
28adc07824 | ||
|
06339e65ad | ||
|
14deb313c1 | ||
|
80812ee5b3 | ||
|
a8373df48b | ||
|
cffe5095da | ||
|
fa358535be | ||
|
97a8519084 | ||
|
84dd8de656 | ||
|
e261f286da | ||
|
2fa06dd9ed | ||
|
3b01effaf7 | ||
|
22c3e3da6e | ||
|
c038f776b1 | ||
|
f2cecfd0e2 | ||
|
c482c05fa8 | ||
|
4abe31cbce | ||
|
da4f563a24 | ||
|
59b4e082a8 | ||
|
1290733c89 | ||
|
f9df732284 | ||
|
8339534a44 | ||
|
e819cb1100 | ||
|
b03699fade | ||
|
d653822a82 | ||
|
db92a42681 | ||
|
6970f8f7f4 | ||
|
59a098504e | ||
|
8260d7c906 | ||
|
6d6d4e1013 | ||
|
0cf72ef474 | ||
|
4a56b9c2e9 | ||
|
1d4a2b2b60 | ||
|
2dc4da7f2e | ||
|
9577fdc474 | ||
|
22166ed948 | ||
|
a255600773 | ||
|
bf794fafa0 | ||
|
4cefba5ced | ||
|
f8c576db15 | ||
|
3aa9c18e34 | ||
|
0258606df3 | ||
|
9070447260 | ||
|
3d60d4f3be | ||
|
ee8bc59ee0 | ||
|
96aca187bd | ||
|
9798b57f9e | ||
|
fc272868b8 | ||
|
0b8e863282 | ||
|
6830ce21ef | ||
|
d2d8a0bbde | ||
|
3a2ec524bd | ||
|
017acc4265 | ||
|
98bd92ddbc | ||
|
77cdcec359 | ||
|
cbd831e92a | ||
|
86645e9bee | ||
|
6989314295 | ||
|
2fa7eaded4 | ||
|
5affdf3a11 | ||
|
879cc0a183 | ||
|
a1fbe84660 | ||
|
c54acf0e04 | ||
|
05d86f0c3e | ||
|
5f2f343b8f | ||
|
9ee4c07964 | ||
|
e6595c72aa | ||
|
a2e66e31d2 | ||
|
0f61853a37 | ||
|
cd1e13d4d2 | ||
|
753c7de002 | ||
|
c70c09c961 | ||
|
bdffbde80f | ||
|
94b17ecaf2 | ||
|
4ff92f87d9 | ||
|
cc63087d06 | ||
|
8499055605 | ||
|
c38bb8bc1d | ||
|
7dd2135b80 | ||
|
82cf36724e | ||
|
bb056ab63f | ||
|
13e5183be2 | ||
|
c7f7d17ebc | ||
|
692a63c8ca | ||
|
5ad1f2127f | ||
|
a634d57b2d | ||
|
9fdbdac06c | ||
|
389b55c647 | ||
|
ea420114a7 | ||
|
d3dd448641 | ||
|
727cff3621 | ||
|
9902ca0e34 | ||
|
27bb2b9134 | ||
|
d6e185cd75 | ||
|
f2d8d63793 | ||
|
b5bf3d8d31 | ||
|
2921e50ecc | ||
|
aa1955a368 | ||
|
bb6464431f | ||
|
2a476df2c5 | ||
|
ec8cda3f49 | ||
|
987cd17160 | ||
|
f463357a38 | ||
|
62e3450eae | ||
|
49fd4f0047 | ||
|
131672d175 | ||
|
71fcade2ac | ||
|
7fd45f864c | ||
|
e291d498fa | ||
|
2e68780fb5 | ||
|
11e4968367 | ||
|
5dab841cea | ||
|
72371a05dc | ||
|
fd3e251afa | ||
|
db07cddaec | ||
|
adda613f51 | ||
|
e6c884358e | ||
|
9b9df1b397 | ||
|
08942348c6 | ||
|
d7006a7c85 | ||
|
91d9631c97 | ||
|
8dabc86a69 | ||
|
01dee390a9 | ||
|
52b0076f4f | ||
|
8d3a5d7cf1 | ||
|
73e00e2abc | ||
|
9316d65022 | ||
|
9d5753b215 | ||
|
182f7ddb47 | ||
|
06fc35321d | ||
|
c4bffd7cef | ||
|
53015b2bbf | ||
|
7590fdc8c7 | ||
|
c7439a605e | ||
|
9c52d3d6c1 | ||
|
cfa8397e97 | ||
|
acbef47c74 | ||
|
6a7574501d | ||
|
50fdcaf357 | ||
|
6546dd856a | ||
|
cdc0d47cde | ||
|
1e614dc861 | ||
|
51ac3a2287 | ||
|
f2db0cdf01 | ||
|
3b37add4d8 | ||
|
3c04ec25bb | ||
|
6e306a508e | ||
|
b6b0ab1a9b | ||
|
81722040f8 | ||
|
00b1a50977 | ||
|
12e95b2c5c | ||
|
b312438929 | ||
|
d96206a12f | ||
|
2e86b8524d | ||
|
b0916df825 | ||
|
0376319a93 | ||
|
d0b39914df | ||
|
372053ba4f | ||
|
773d57d890 | ||
|
c2700cafd9 | ||
|
3670e52c87 | ||
|
62a275e576 | ||
|
b5a29155e4 | ||
|
50a62a17ff | ||
|
e20ba37208 | ||
|
42653b8a31 | ||
|
e2ce866462 | ||
|
e239f482fd | ||
|
dc371fb682 | ||
|
17c78ff23c | ||
|
d7fb5f437a | ||
|
f5412b3509 | ||
|
c9b3a86908 | ||
|
1ea53768a6 | ||
|
b006d7f620 | ||
|
d0498f942d | ||
|
0045a2bdc7 | ||
|
226645eaf1 | ||
|
d6d7a96c44 | ||
|
6b459843e6 | ||
|
9db7b529e2 | ||
|
f9e254ab41 | ||
|
6bbd4821f0 | ||
|
0c57b6c894 | ||
|
943f198ebf | ||
|
2d23d24b67 | ||
|
d9a75dc3b9 | ||
|
bd7423f4f0 | ||
|
523db6051d | ||
|
afcbd28d3b | ||
|
f7a4d41c63 | ||
|
4739c76a11 | ||
|
a6a35e044c | ||
|
30ae8505c3 | ||
|
a05c891146 | ||
|
12f61d15ed | ||
|
71f3e7d994 | ||
|
24786ae332 | ||
|
98eb459ac2 | ||
|
01658be05d | ||
|
8cd365121d | ||
|
68dab83799 | ||
|
5fa5e142b9 | ||
|
528a165c20 | ||
|
1a4e878abe | ||
|
4fb8cee35f | ||
|
3d0df318b8 | ||
|
5d38d0d0a5 | ||
|
a1258680ac | ||
|
ffa6926608 | ||
|
634c884204 | ||
|
b5ae828e61 | ||
|
719e1c9be6 | ||
|
01d8a2d768 | ||
|
b6e3c66015 | ||
|
b106c10b4d | ||
|
9e4789d374 | ||
|
9134cd6250 | ||
|
593299e7bb | ||
|
f6387e34da | ||
|
73f70e3d2e | ||
|
6b9068db0f | ||
|
5e1972efce | ||
|
9fac2bf78d | ||
|
afb1a1705f | ||
|
4d2bcff7b4 | ||
|
ac02b589be | ||
|
f8fdd979f1 | ||
|
46488b8097 | ||
|
8b2ebeb06b | ||
|
e2439685a9 | ||
|
1988ff5a6d | ||
|
6e1c3a10fa | ||
|
6f4409365a | ||
|
f68a1e526c | ||
|
d9ebb20010 | ||
|
eca4049ce4 | ||
|
8e5abfebf0 | ||
|
17fb9f41f6 | ||
|
ad70693999 | ||
|
3113d1ca30 | ||
|
10d957477a | ||
|
ef37e6015a | ||
|
306e8612be | ||
|
a7a12844e7 | ||
|
44f19c1006 | ||
|
2aa5e1c710 | ||
|
1b915cfb93 | ||
|
49e6fe15a2 | ||
|
8cba258e7f | ||
|
c8ba7fa1d3 | ||
|
e15ad9308a | ||
|
a26bed96bc | ||
|
519bd64c65 | ||
|
d3f6d2ad06 | ||
|
9fd0ce5d44 | ||
|
6332cb91f5 | ||
|
3beda7e5e0 | ||
|
be9ce68767 | ||
|
f5e508b13f | ||
|
23aeaecf53 | ||
|
8c7b503ac4 | ||
|
f57928ea08 | ||
|
dfacf75855 | ||
|
20f4f542a5 | ||
|
62cb8d6c2d | ||
|
b32ca719d9 | ||
|
7c53b4e608 | ||
|
14c9222a48 | ||
|
a8e9b9069e | ||
|
b068087240 | ||
|
b5cde5ebf7 | ||
|
8455cba03d | ||
|
129015afc6 | ||
|
d24d67c28b | ||
|
44b893abfc | ||
|
c3a6bb3cb3 | ||
|
d611fcbaa8 | ||
|
343e523763 | ||
|
574e525c46 | ||
|
41fb015616 | ||
|
393dee3697 | ||
|
08b904b31c | ||
|
a97d576ac8 | ||
|
aecd920b57 | ||
|
d5cd29b333 | ||
|
9c92c7b081 | ||
|
8d1c7c49f0 | ||
|
5e5d6d2c17 | ||
|
b1522860d5 | ||
|
35acf49d5f | ||
|
44b026a8c9 | ||
|
80f84a1be5 | ||
|
b5f1a82258 | ||
|
9d0599a85e | ||
|
21c45f325c | ||
|
9c139c3a6d | ||
|
71bbf97b90 | ||
|
de94286fed | ||
|
02474c4f52 | ||
|
a659555781 | ||
|
aa8af2621b | ||
|
5a4d900675 | ||
|
1fdeedf151 | ||
|
c4b7a44cac | ||
|
7909b91ba4 | ||
|
1619dbaf35 | ||
|
9445f49172 | ||
|
4d539f2c87 | ||
|
981576a5fe | ||
|
5e4a1edb0c | ||
|
e85f7cc9c9 | ||
|
ee8a3c29cc | ||
|
0d5ce27df4 | ||
|
6beeb372bc | ||
|
3884f597d3 | ||
|
24ae1670d6 | ||
|
0e403103d6 | ||
|
9e1c43b9ec | ||
|
983a3a52bd | ||
|
1320d01035 | ||
|
6900806127 | ||
|
e8e2957344 | ||
|
7a2ccd7773 | ||
|
55cae8c807 | ||
|
a26661c849 | ||
|
98d7434cfd | ||
|
22a24f12ea | ||
|
e369681ce2 | ||
|
35c0711689 | ||
|
72afe16452 | ||
|
da88abfa39 | ||
|
2e88df1a89 | ||
|
354fda48fb | ||
|
b7d1bc0298 | ||
|
9aa61440ea | ||
|
93d3e7eb30 | ||
|
9a174745e4 | ||
|
db7b23a468 | ||
|
0d2f9ccbaa | ||
|
16604f9539 | ||
|
4918f81ea3 | ||
|
adedf2acf3 | ||
|
d85d306ff9 | ||
|
1d25f9b8f5 | ||
|
ad42abe7b4 | ||
|
85a6973060 | ||
|
a344cd30f7 | ||
|
68346631c7 | ||
|
e843ac80c9 | ||
|
4344d2fce7 | ||
|
5209c7978a | ||
|
c6523453d7 | ||
|
ab4f837607 | ||
|
dc78dc1633 | ||
|
4fb3a48dfd | ||
|
6765b35d48 | ||
|
91f0cb6cea | ||
|
7f2f7d2a1e | ||
|
659525480c | ||
|
941cc7144b | ||
|
fc5a423c7e | ||
|
b7ee9102ee | ||
|
684f0ce2cb | ||
|
7c1835c0e7 | ||
|
762f4050a0 | ||
|
978105a4c0 | ||
|
75dbedeebe | ||
|
b6b69a41eb | ||
|
4bc70b37a7 | ||
|
0ec7b4e77b | ||
|
68db36f4c4 | ||
|
38431d1117 | ||
|
4fbabb66d8 | ||
|
07305d18ae | ||
|
8a54576861 | ||
|
d05491117c | ||
|
8f26c5e65f | ||
|
b4bba94b1a | ||
|
8c7fdf1e7a | ||
|
eef3f06c94 | ||
|
e6b97a99b8 | ||
|
842e680650 | ||
|
66289cdacf | ||
|
101f6083b0 | ||
|
9f8758d270 | ||
|
1ce45a31c5 | ||
|
dbdbfe9473 | ||
|
0f967b9575 | ||
|
e6cfd11948 | ||
|
6d8599eece | ||
|
00d1807bb4 | ||
|
7133ac5be1 | ||
|
90ac746303 | ||
|
9d34941f13 | ||
|
48105cdd1d | ||
|
911a98d54f | ||
|
a4972aa191 | ||
|
0e0147ec5c | ||
|
6de45b4648 | ||
|
3fc6c8a532 | ||
|
1a43178eeb | ||
|
ef66877e9e | ||
|
88b1bec5b7 | ||
|
4934852003 | ||
|
bf6200ecc8 | ||
|
4ab0730dbe | ||
|
d0a017da99 | ||
|
69e6176e09 | ||
|
ee0c8f763f | ||
|
f6250ae3e5 | ||
|
361feb64e3 | ||
|
d0ae940261 | ||
|
67b414f2f9 | ||
|
db2a6d65f1 | ||
|
f4840adf3d | ||
|
0c97e5f2c2 | ||
|
e3c237a7e0 | ||
|
756ef409b4 | ||
|
1ae0d15181 | ||
|
0af2ba8616 | ||
|
02ae2a3cad | ||
|
305a041c73 | ||
|
a837cf3bee | ||
|
be20c2c519 | ||
|
dc6b8ae92d | ||
|
7295b9c718 | ||
|
94fe406f0c | ||
|
17467ff21b | ||
|
0e7874f122 | ||
|
026f761671 | ||
|
43c053a069 | ||
|
c043730675 | ||
|
74ab41897c | ||
|
c597f69c59 | ||
|
70b2a28386 | ||
|
4f27a70dd7 | ||
|
31afacc7be | ||
|
a9ca0e4de3 | ||
|
4d4940dfbe | ||
|
2731a3384b | ||
|
c1659b8544 | ||
|
decabdfede | ||
|
a119693a66 | ||
|
cafd5248b1 | ||
|
6259783d79 | ||
|
19947f70d2 | ||
|
b5e1384995 | ||
|
db52ce6f08 | ||
|
7e7f604f7d | ||
|
849846700a | ||
|
8a49e20bcd | ||
|
49eb18ee4f | ||
|
73695e419c | ||
|
85a77e3eff | ||
|
0a35a467e0 | ||
|
fc5059aa10 | ||
|
8739a6c2a7 | ||
|
6c24e61572 | ||
|
59ffac5721 | ||
|
2c78b770ea | ||
|
e71fb5cfeb | ||
|
b3d7174a93 | ||
|
fce10b53a0 | ||
|
630f7b2360 | ||
|
f441213d35 | ||
|
76b2b4b423 | ||
|
bd9113463d | ||
|
288491d6d9 | ||
|
edea9d12b1 | ||
|
5fa22e68ba | ||
|
b34aacdb94 | ||
|
d626381ad0 | ||
|
5f8729a085 | ||
|
b3e88e480f | ||
|
9af996b936 | ||
|
2819dc3d1b | ||
|
6d6c291769 | ||
|
e6f856407e | ||
|
f9b6526ef8 | ||
|
666e1c455e | ||
|
6cd6957a23 | ||
|
0949576693 | ||
|
2eaf6e5d31 | ||
|
f5960c81d9 | ||
|
312ae895b4 | ||
|
4c32b36f53 | ||
|
1155a9219c | ||
|
f304c3e7e1 | ||
|
4fdfb8c7f4 | ||
|
a315fae243 | ||
|
8c71238d60 | ||
|
90018f927f | ||
|
467db4a268 | ||
|
1718e33544 | ||
|
4589f1742e | ||
|
fd2eabd071 | ||
|
fec557c1f9 | ||
|
deeb113977 | ||
|
d3f0d8dd70 | ||
|
1b6f37d626 | ||
|
290eabbca0 | ||
|
58bfe5423e | ||
|
07ce6d8e7f | ||
|
7cad514366 | ||
|
6b364021d1 | ||
|
451d539f1d | ||
|
5aae719ecd | ||
|
4aa32275f1 | ||
|
5d23e6dab6 | ||
|
1f4143de81 | ||
|
0221dc1771 | ||
|
7d600c4fcb | ||
|
404a0ab0f4 | ||
|
6f20276b2f | ||
|
81fba99f3a | ||
|
8dec4e8d85 | ||
|
7ceca78db4 | ||
|
00384c9e37 | ||
|
cd95b1a38c | ||
|
725a8d8f1a | ||
|
35e5369c70 | ||
|
bfc8da1102 | ||
|
e13bfec734 | ||
|
0287724413 | ||
|
54158a1746 | ||
|
e09d8674b4 | ||
|
7e7c50bdd7 | ||
|
4114b8a480 | ||
|
942deb7d80 | ||
|
41b98d7d8f | ||
|
dcf6b59e8d | ||
|
82bea29f4e | ||
|
ba90cdeb5a | ||
|
24fa54081b | ||
|
210295a702 | ||
|
869d2bd53b | ||
|
16f3cca98b | ||
|
59505f98ba | ||
|
d55c74b6c4 | ||
|
f6fd845400 | ||
|
d47641c2fc | ||
|
bc1a1127cf | ||
|
5eabceddcf | ||
|
16e5e2ae64 | ||
|
713aaef799 | ||
|
4a569affbc | ||
|
2fa947af9c | ||
|
9abfdcaa84 | ||
|
7b65b298c1 | ||
|
44dc75a3ea | ||
|
f4a490ac00 | ||
|
d9d39d48c5 | ||
|
dd45bf0d35 | ||
|
8ae6e613cb | ||
|
70414574ee | ||
|
6e5a8353c5 | ||
|
70ddb11a72 | ||
|
8be8ed309f | ||
|
3f17750a27 | ||
|
bf54aae779 | ||
|
1eb3b19f63 | ||
|
d43d6733b6 | ||
|
57301721f1 | ||
|
24f149f239 | ||
|
b53b0a0fcd | ||
|
5abda15c26 | ||
|
7184ce9870 | ||
|
2b1db73326 | ||
|
558313a2c2 | ||
|
e084c2d6e4 | ||
|
b1f53f8e49 | ||
|
ba4472a61c | ||
|
d0a41252c8 | ||
|
4d2acd59e2 | ||
|
7888dd4418 | ||
|
2231f7c024 | ||
|
90d3db1776 | ||
|
0078c28ee6 | ||
|
079f3518d7 | ||
|
f1d781ffee | ||
|
1f55d419eb | ||
|
35a25fffd7 | ||
|
4e044d4142 | ||
|
b9801478b2 | ||
|
c3baf3d296 | ||
|
2d9f52d03e | ||
|
9d0c15c448 | ||
|
aea10b8ab0 | ||
|
623c64142b | ||
|
577e03e0c7 | ||
|
9c54412447 | ||
|
8b7ba89cb5 | ||
|
eebd6c97bc | ||
|
6ceab8435d | ||
|
db6354b787 | ||
|
a78f3789a9 | ||
|
96af3e8613 | ||
|
fad39b6058 | ||
|
456b1c6182 | ||
|
9e6c1c30cf | ||
|
fa3d509fad | ||
|
9ab18cf419 | ||
|
67e0674df8 | ||
|
e7161fdfec | ||
|
e5e2057ca9 | ||
|
bc57891aa0 | ||
|
b40899966f | ||
|
d7e9d2d915 | ||
|
cf1d72745c | ||
|
c774a76f24 | ||
|
a86d6c290d | ||
|
ee8fa17351 | ||
|
53b9034546 | ||
|
0e141ce700 | ||
|
5d4adf267d | ||
|
2e6c75c895 | ||
|
9901477bcf | ||
|
2b9636ccc0 | ||
|
91f58fb1ca | ||
|
be1551aa94 | ||
|
4027319d6c | ||
|
1009391d9f | ||
|
904a94fbe5 | ||
|
08357a096c | ||
|
8cff876962 | ||
|
67fa6d2738 | ||
|
b3a20afbbc | ||
|
de30471edd | ||
|
e5050c9373 | ||
|
7f5e859019 | ||
|
deae90f947 | ||
|
a879715953 | ||
|
6c6aee9682 | ||
|
25183089fd | ||
|
00fff3f53c | ||
|
c95a4c3dd1 | ||
|
56ef2d5f72 | ||
|
ab5d4d9279 | ||
|
8242f46ed7 | ||
|
df926ef62f | ||
|
c7086cb7f1 | ||
|
a38324a5a3 | ||
|
603bc14cdd | ||
|
e35b88f2c8 | ||
|
f5f37f3f40 | ||
|
6f44a1aa26 | ||
|
f2d9784979 | ||
|
a2e6143ea1 | ||
|
f8ce0bb922 | ||
|
cabd13d18a | ||
|
ebb1b84ae3 | ||
|
825460856f | ||
|
86accf41a3 | ||
|
b67cd7dfd0 | ||
|
6c51f7f591 | ||
|
051ceed0f9 | ||
|
4bc28272da | ||
|
9fb79ac3ec | ||
|
7162446358 | ||
|
b48ea2c1a6 | ||
|
89b123d003 | ||
|
6f88ecac44 | ||
|
ae6bc52076 | ||
|
32350cf9b1 | ||
|
7ae636dfe9 | ||
|
c44fa634d1 | ||
|
d04b899c29 | ||
|
b42b187712 | ||
|
dede600637 | ||
|
c45e72775f | ||
|
78e098661f | ||
|
d453a612dc | ||
|
ec3f108d71 | ||
|
0ed10082b1 | ||
|
f425951b49 | ||
|
845e533b66 | ||
|
41ff0962c4 | ||
|
6d8f3b9ff8 | ||
|
d7b247a949 | ||
|
28b2949396 | ||
|
9de3065e68 | ||
|
e0a45a354d | ||
|
2753dbb3b0 | ||
|
9685095379 | ||
|
1ce5e17ce9 | ||
|
d1009e8830 | ||
|
5297ed5038 | ||
|
8b09ecfe48 | ||
|
cceb599fc9 | ||
|
e49fa3a97a | ||
|
1e434a9b3d | ||
|
d6df8116a5 | ||
|
acca824dea | ||
|
5d9d958e74 | ||
|
6fbd18183a | ||
|
23b075cbd4 | ||
|
3ee7b75ee0 | ||
|
b6c3ab723b | ||
|
feb2d3ccf7 | ||
|
096fb55faa | ||
|
6526df81e4 | ||
|
d8b5eb5d17 | ||
|
273b9683ac | ||
|
e215157a21 | ||
|
05c7c8e576 | ||
|
6e63aafbea | ||
|
c6406bebde | ||
|
f77837130d | ||
|
e6c1fa04ce | ||
|
ede2b18564 | ||
|
5213ef579d | ||
|
7a9a2dd0dc | ||
|
2805ff038a | ||
|
52661f35e8 | ||
|
831091db9e | ||
|
556206f138 | ||
|
9f7088a9e0 | ||
|
48e9a17f0a | ||
|
ec5531a6b1 | ||
|
d3ff893871 | ||
|
521f233cfd | ||
|
92916e311f | ||
|
58cface115 | ||
|
e3e101b412 | ||
|
2f13ad0674 | ||
|
a0abf31a82 | ||
|
f2354537ff | ||
|
76bd21e521 | ||
|
5298333c73 | ||
|
27bd09454f | ||
|
741d8246c5 | ||
|
c3c68b1c3f | ||
|
e21305aa23 | ||
|
04567ab649 | ||
|
814045201f | ||
|
c078d355b6 | ||
|
9fd94e0062 | ||
|
d3e03ed88a | ||
|
be66cdb51d | ||
|
f541f75400 | ||
|
d9874898c7 | ||
|
70c54ef144 | ||
|
4cc93f5553 | ||
|
dfd39fadb0 | ||
|
d8cece7cd2 | ||
|
6a8a7a6c01 | ||
|
6377a73b2b | ||
|
52f658ac55 | ||
|
18f6aee5c2 | ||
|
5f930b267c | ||
|
001b7824d2 | ||
|
bb9ed60489 | ||
|
6fd77679d9 | ||
|
345446519b | ||
|
beaca7c7db | ||
|
9ea1e4be3d | ||
|
5ad3d64b4b | ||
|
b19a41528a | ||
|
d96f1ab505 | ||
|
dea7c4dc2e | ||
|
56cb107005 | ||
|
f5657ef7b7 | ||
|
83d03e97c5 | ||
|
a388a0f193 | ||
|
c41d33c53f | ||
|
ed4b0f713a | ||
|
6a42b37fca | ||
|
cc747013c4 | ||
|
d71515be04 | ||
|
7d21fc157c | ||
|
87413fbff0 | ||
|
e2f253d8ab | ||
|
a039089888 | ||
|
4686ac47b6 | ||
|
b62247a36e | ||
|
51dcfa8571 | ||
|
5b85589274 | ||
|
1c581e45e9 | ||
|
c238d627c9 | ||
|
f9ef98ce19 | ||
|
dfba9227e9 | ||
|
9bf5cac782 | ||
|
94739756b4 | ||
|
fc4338eca0 | ||
|
ef9bf02d00 | ||
|
6a11c83630 | ||
|
644aac5e1b | ||
|
752a42419b | ||
|
124455a4a8 | ||
|
0372a8120a | ||
|
61b963a717 | ||
|
e088c6ae2b | ||
|
773dc6fa69 | ||
|
39945b7775 | ||
|
c38722a68b | ||
|
38477b08bc | ||
|
259c2572c1 | ||
|
17ab42d891 | ||
|
d76365240b | ||
|
1c26f044a7 | ||
|
606c53a05f | ||
|
c4f1372814 | ||
|
62fed20ad0 | ||
|
501f60e930 | ||
|
33904180d8 | ||
|
7b8cf97546 | ||
|
18296eee47 | ||
|
592b1905af | ||
|
547f79b920 | ||
|
548242981d | ||
|
0bef151437 | ||
|
149ea938c4 | ||
|
c4db63665a | ||
|
4bce012570 | ||
|
83977db7ab | ||
|
e7e1308d7f | ||
|
45bfed9b9d | ||
|
c5a7f4b846 | ||
|
5b425bd5a6 | ||
|
31ac6718dd | ||
|
a9212174ee | ||
|
1c2c73becf | ||
|
d2cf1a7882 | ||
|
5d773dd9db | ||
|
4c924cc920 | ||
|
9a0494259e | ||
|
e91082d477 | ||
|
5952c64066 | ||
|
b1170bcc2e | ||
|
f0e70bc0ad | ||
|
04fbf24681 | ||
|
5333467249 | ||
|
f7a6b8934c | ||
|
e8e188acaf | ||
|
e915d84864 | ||
|
9f3ad89ed6 | ||
|
6581aae90e | ||
|
aac1f0f1dc | ||
|
8ed5f4ac75 | ||
|
956b001613 | ||
|
ea721c908f | ||
|
6a40793719 | ||
|
2d89df44ae |
11
.clang-tidy
Normal file
11
.clang-tidy
Normal file
@ -0,0 +1,11 @@
|
||||
#unit/gtest/gtest-all.cc,build/src/parser/Parser.cpp,build/src/parser/control_verbs.cpp
|
||||
#Dont change first comment ignores specific files from clang-tidy
|
||||
|
||||
|
||||
Checks: 'clang-analyzer-*,-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,performance-*,-performance-unnecessary-value-param,-performance-avoid-endl'
|
||||
WarningsAsErrors: ''
|
||||
HeaderFilterRegex: '.*'
|
||||
SystemHeaders: false
|
||||
FormatStyle: none
|
||||
InheritParentConfig: true
|
||||
User: user
|
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
[submodule "simde"]
|
||||
path = simde
|
||||
url = https://github.com/simd-everywhere/simde.git
|
66
CHANGELOG-vectorscan.md
Normal file
66
CHANGELOG-vectorscan.md
Normal file
@ -0,0 +1,66 @@
|
||||
# Vectorscan Change Log
|
||||
|
||||
This is a list of notable changes to Vectorscan, in reverse chronological order. For Hyperscan Changelog, check CHANGELOG.md
|
||||
|
||||
## [5.4.11] 2023-11-19
|
||||
|
||||
- Refactor CMake build system to be much more modular.
|
||||
- version in hs.h fell out of sync again #175
|
||||
- Fix compile failures with recent compilers, namely clang-15 and gcc-13
|
||||
- Fix clang 15,16 compilation errors on all platforms, refactor CMake build system #181
|
||||
- Fix signed/unsigned char issue on Arm with Ragel generated code.
|
||||
- Correct set_source_files_properties usage #189
|
||||
- Fix build failure on Ubuntu 20.04
|
||||
- Support building on Ubuntu 20.04 #180
|
||||
- Require pkg-config during Cmake
|
||||
- make pkgconfig a requirement #188
|
||||
- Fix segfault on Fat runtimes with SVE2 code
|
||||
- Move VERM16 enums to the end of the list #191
|
||||
- Update README.md, add CHANGELOG-vectorscan.md and Contributors-vectorscan.md files
|
||||
|
||||
## [5.4.10] 2023-09-23
|
||||
- Fix compilation with libcxx 16 by @rschu1ze in #144
|
||||
- Fix use-of-uninitialized-value due to getData128() by @azat in #148
|
||||
- Use std::vector instead of boost::container::small_vector under MSan by @azat in #149
|
||||
- Feature/enable fat runtime arm by @markos in #165
|
||||
- adding ifndef around HS_PUBLIC_API definition so that vectorscan can be statically linked into another shared library without exporting symbols by @jeffplaisance in #164
|
||||
- Feature/backport hyperscan 2023 q3 by @markos in #169
|
||||
- Prepare for 5.4.10 by @markos in #167
|
||||
|
||||
## [5.4.9] 2023-03-23
|
||||
- Major change: Enable SVE & SVE2 builds and make it a supported architecture! (thanks to @abondarev84)
|
||||
- Fix various clang-related bugs
|
||||
- Fix Aarch64 bug in Parser.rl because of char signedness. Make unsigned char the default in the Parser for all architectures.
|
||||
- Fix Power bug, multiple tests were failing.
|
||||
- C++20 related change, use prefixed assume_aligned to avoid conflict with C++20 std::assume_aligned.
|
||||
|
||||
## [5.4.8] 2022-09-13
|
||||
- CMake: Use non-deprecated method for finding python by @jth in #108
|
||||
- Optimize vectorscan for aarch64 by using shrn instruction by @danlark1 in #113
|
||||
- Fixed the PCRE download location by @pareenaverma in #116
|
||||
- Bugfix/hyperscan backport 202208 by @markos in #118
|
||||
- VSX optimizations by @markos in #119
|
||||
- when compiling with mingw64, use __mingw_aligned_malloc() and __mingw_aligned_free() by @liquidaty in #121
|
||||
- [NEON] simplify/optimize shift/align primitives by @markos in #123
|
||||
- Merge develop to master by @markos in #124
|
||||
|
||||
## [5.4.7] 2022-05-05
|
||||
- Fix word boundary assertions under C++20 by @BigRedEye in #90
|
||||
- Fix all ASAN issues in vectorscan by @danlark1 in #93
|
||||
- change FAT_RUNTIME to a normal option so it can be set to off by @a16bitsysop in #94
|
||||
- Optimized and correct version of movemask128 for ARM by @danlark1 in #102
|
||||
|
||||
## [5.4.6] 2022-01-21
|
||||
- Major refactoring of many engines to use internal SuperVector C++ templates library. Code size reduced to 1/3rd with no loss of performance in most cases.
|
||||
- Microbenchmarking tool added for performance finetuning
|
||||
- Arm Advanced SIMD/NEON fully ported. Initial work on SVE2 for a couple of engines.
|
||||
- Power9 VSX ppc64le fully ported. Initial port needs some optimization.
|
||||
- Clang compiler support added.
|
||||
- Apple M1 support added.
|
||||
- CI added, the following configurations are tested on every PR:
|
||||
gcc-debug, gcc-release, clang-debug, clang-release:
|
||||
Linux Intel: SSE4.2, AVX2, AVX512, FAT
|
||||
Linux Arm
|
||||
Linux Power9
|
||||
clang-debug, clang-release:
|
||||
MacOS Apple M1
|
49
CHANGELOG.md
49
CHANGELOG.md
@ -2,6 +2,55 @@
|
||||
|
||||
This is a list of notable changes to Hyperscan, in reverse chronological order.
|
||||
|
||||
## [5.4.2] 2023-04-19
|
||||
- Roll back bugfix for github issue #350: Besides using scratch for
|
||||
corresponding database, Hyperscan also allows user to use larger scratch
|
||||
allocated for another database. Users can leverage this property to achieve
|
||||
safe scratch usage in multi-database scenarios. Behaviors beyond these are
|
||||
discouraged and results are undefined.
|
||||
- Fix hsdump issue due to invalid nfa type.
|
||||
|
||||
## [5.4.1] 2023-02-20
|
||||
- The Intel Hyperscan team is pleased to provide a bug fix release to our open source library.
|
||||
Intel also maintains an upgraded version available through your Intel sales representative.
|
||||
- Bugfix for issue #184: fix random char value of UTF-8.
|
||||
- Bugfix for issue #291: bypass logical combination flag in hs_expression_info().
|
||||
- Bugfix for issue #292: fix build error due to libc symbol parsing.
|
||||
- Bugfix for issue #302/304: add empty string check for pure literal API.
|
||||
- Bugfix for issue #303: fix unknown instruction error in pure literal API.
|
||||
- Bugfix for issue #303: avoid memory leak in stream close stage.
|
||||
- Bugfix for issue #305: fix assertion failure in DFA construction.
|
||||
- Bugfix for issue #317: fix aligned allocator segment faults.
|
||||
- Bugfix for issue #350: add quick validity check for scratch.
|
||||
- Bugfix for issue #359: fix glibc-2.34 stack size issue.
|
||||
- Bugfix for issue #360: fix SKIP flag issue in chimera.
|
||||
- Bugfix for issue #362: fix one cotec check corner issue in UTF-8 validation.
|
||||
- Fix other compile issues.
|
||||
|
||||
## [5.4.0] 2020-12-31
|
||||
- Improvement on literal matcher "Fat Teddy" performance, including
|
||||
support for Intel(R) AVX-512 Vector Byte Manipulation Instructions (Intel(R)
|
||||
AVX-512 VBMI).
|
||||
- Introduce a new 32-state shuffle-based DFA engine ("Sheng32"). This improves
|
||||
scanning performance by leveraging AVX-512 VBMI.
|
||||
- Introduce a new 64-state shuffle-based DFA engine ("Sheng64"). This improves
|
||||
scanning performance by leveraging AVX-512 VBMI.
|
||||
- Introduce a new shuffle-based hybrid DFA engine ("McSheng64"). This improves
|
||||
scanning performance by leveraging AVX-512 VBMI.
|
||||
- Improvement on exceptional state handling performance for LimEx NFA, including
|
||||
support for AVX-512 VBMI.
|
||||
- Improvement on lookaround performance with new models, including support for
|
||||
AVX-512.
|
||||
- Improvement on DFA state space efficiency.
|
||||
- Optimization on decision of NFA/DFA generation.
|
||||
- hsbench: add CSV dump support for hsbench.
|
||||
- Bugfix for cmake error on Icelake under release mode.
|
||||
- Bugfix in find_vertices_in_cycles() to avoid self-loop checking in SCC.
|
||||
- Bugfix for issue #270: fix return value handling in chimera.
|
||||
- Bugfix for issue #284: use correct free function in logical combination.
|
||||
- Add BUILD_EXAMPLES cmake option to enable example code compilation. (#260)
|
||||
- Some typo fixing. (#242, #259)
|
||||
|
||||
## [5.3.0] 2020-05-15
|
||||
- Improvement on literal matcher "Teddy" performance, including support for
|
||||
Intel(R) AVX-512 Vector Byte Manipulation Instructions (Intel(R) AVX-512
|
||||
|
1096
CMakeLists.txt
1096
CMakeLists.txt
File diff suppressed because it is too large
Load Diff
1
COPYING
1
COPYING
@ -1,4 +1,5 @@
|
||||
Copyright (c) 2015, Intel Corporation
|
||||
Copyright (c) 2019-20, VectorCamp PC
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
25
Contributors-vectorscan.md
Normal file
25
Contributors-vectorscan.md
Normal file
@ -0,0 +1,25 @@
|
||||
394 Konstantinos Margaritis <konstantinos@vectorcamp.gr>
|
||||
59 apostolos <apostolos.tapsas@vectorcamp.gr>
|
||||
25 Hong, Yang A <yang.a.hong@intel.com>
|
||||
19 George Wort <george.wort@arm.com>
|
||||
16 Chang, Harry <harry.chang@intel.com>
|
||||
7 Danila Kutenin <danilak@google.com>
|
||||
7 Wang Xiang W <xiang.w.wang@intel.com>
|
||||
6 Alex Bondarev <abondarev84@gmail.com>
|
||||
5 Konstantinos Margaritis <konma@vectorcamp.gr>
|
||||
3 Duncan Bellamy <dunk@denkimushi.com>
|
||||
2 Azat Khuzhin <a3at.mail@gmail.com>
|
||||
2 Jan Henning <jan.thilo.henning@sap.com>
|
||||
1 BigRedEye <mail@bigredeye.me>
|
||||
1 Daniel Kutenin <kutdanila@yandex.ru>
|
||||
1 Danila Kutenin <kutdanila@yandex.ru>
|
||||
1 Liu Zixian <hdu_sdlzx@163.com>
|
||||
1 Mitchell Wasson <miwasson@cisco.com>
|
||||
1 Piotr Skamruk <piotr.skamruk@gmail.com>
|
||||
1 Robbie Williamson <robbie.williamson@arm.com>
|
||||
1 Robert Schulze <robert@clickhouse.com>
|
||||
1 Walt Stoneburner <wls@wwco.com>
|
||||
1 Zhu,Wenjun <wenjun.zhu@intel.com>
|
||||
1 hongyang7 <yang.a.hong@intel.com>
|
||||
1 jplaisance <jeffplaisance@gmail.com>
|
||||
1 liquidaty <info@liquidaty.com>
|
5
LICENSE
5
LICENSE
@ -2,6 +2,11 @@ Hyperscan is licensed under the BSD License.
|
||||
|
||||
Copyright (c) 2015, Intel Corporation
|
||||
|
||||
Vectorscan is licensed under the BSD License.
|
||||
|
||||
Copyright (c) 2020, VectorCamp PC
|
||||
Copyright (c) 2021, Arm Limited
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
|
251
README.md
251
README.md
@ -1,43 +1,252 @@
|
||||
# Hyperscan
|
||||
# About Vectorscan
|
||||
|
||||
Hyperscan is a high-performance multiple regex matching library. It follows the
|
||||
A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD
|
||||
and Power VSX are 100% functional. ARM SVE2 support is ongoing, with
|
||||
access to hardware now. More platforms will follow in the future.
|
||||
Furthermore, starting with 5.4.12 there is now a [SIMDe](https://github.com/simd-everywhere/simde)
|
||||
port, which can be either used for platforms without official SIMD support,
|
||||
as SIMDe can emulate SIMD instructions, or as an alternative backend for existing architectures,
|
||||
for reference and comparison purposes.
|
||||
|
||||
Vectorscan will follow Intel's API and internal algorithms where possible, but will not
|
||||
hesitate to make code changes where they are thought to give better performance or better
|
||||
portability. In addition, the code will be gradually simplified and made more uniform and
|
||||
all architecture specific -currently Intel- #ifdefs will be removed and abstracted away.
|
||||
|
||||
# Why was there a need for a fork?
|
||||
|
||||
Originally, the ARM porting was intended to be merged into Intel's own Hyperscan, and relevant
|
||||
Pull Requests were made to the project for this reason. Unfortunately, the
|
||||
PRs were rejected for now and the foreseeable future, thus we have created Vectorscan for
|
||||
our own multi-architectural and opensource collaborative needs.
|
||||
|
||||
The recent license change of Hyperscan makes Vectorscan even more relevant for the FLOSS ecosystem.
|
||||
|
||||
# What is Vectorscan/Hyperscan?
|
||||
|
||||
Hyperscan and by extension Vectorscan is a high-performance multiple regex matching library. It follows the
|
||||
regular expression syntax of the commonly-used libpcre library, but is a
|
||||
standalone library with its own C API.
|
||||
|
||||
Hyperscan uses hybrid automata techniques to allow simultaneous matching of
|
||||
Hyperscan/Vectorscan uses hybrid automata techniques to allow simultaneous matching of
|
||||
large numbers (up to tens of thousands) of regular expressions and for the
|
||||
matching of regular expressions across streams of data.
|
||||
|
||||
Hyperscan is typically used in a DPI library stack.
|
||||
|
||||
# Documentation
|
||||
|
||||
Information on building the Hyperscan library and using its API is available in
|
||||
the [Developer Reference Guide](http://intel.github.io/hyperscan/dev-reference/).
|
||||
Vectorscan is typically used in a DPI library stack, just like Hyperscan.
|
||||
|
||||
# License
|
||||
|
||||
Hyperscan is licensed under the BSD License. See the LICENSE file in the
|
||||
project repository.
|
||||
Vectorscan follows a BSD License like the original Hyperscan (up to 5.4).
|
||||
|
||||
Vectorscan continues to be an open source project and we are committed to keep it that way.
|
||||
See the LICENSE file in the project repository.
|
||||
|
||||
## Hyperscan License Change after 5.4
|
||||
|
||||
According to
|
||||
[Accelerate Snort Performance with Hyperscan and Intel Xeon Processors on Public Clouds](https://networkbuilders.intel.com/docs/networkbuilders/accelerate-snort-performance-with-hyperscan-and-intel-xeon-processors-on-public-clouds-1680176363.pdf) versions of Hyperscan later than 5.4 are
|
||||
going to be closed-source:
|
||||
|
||||
> The latest open-source version (BSD-3 license) of Hyperscan on Github is 5.4. Intel conducts continuous internal
|
||||
> development and delivers new Hyperscan releases under Intel Proprietary License (IPL) beginning from 5.5 for interested
|
||||
> customers. Please contact authors to learn more about getting new Hyperscan releases.
|
||||
|
||||
# Versioning
|
||||
|
||||
The `master` branch on Github will always contain the most recent release of
|
||||
The `master` branch on Github will always contain the most recent stable release of
|
||||
Hyperscan. Each version released to `master` goes through QA and testing before
|
||||
it is released; if you're a user, rather than a developer, this is the version
|
||||
you should be using.
|
||||
|
||||
Further development towards the next release takes place on the `develop`
|
||||
branch.
|
||||
branch. All PRs are first made against the develop branch and if they pass the [Vectorscan CI](https://buildbot-ci.vectorcamp.gr/#/grid), then they get merged. Similarly with PRs from develop to master.
|
||||
|
||||
# Get Involved
|
||||
# Compatibility with Hyperscan
|
||||
|
||||
The official homepage for Hyperscan is at [www.hyperscan.io](https://www.hyperscan.io).
|
||||
Vectorscan aims to be ABI and API compatible with the last open source version of Intel Hyperscan 5.4.
|
||||
After careful consideration we decided that we will **NOT** aim to achieve compatibility with later Hyperscan versions 5.5/5.6 that have extended Hyperscan's API.
|
||||
If you want to keep up to date with the latest Hyperscan API, you should talk to Intel and get a license to use it.
|
||||
However, we intend to extend Vectorscan's API with user requested changes or API extensions and improvements that we think are best for the project.
|
||||
|
||||
If you have questions or comments, we encourage you to [join the mailing
|
||||
list](https://lists.01.org/mailman/listinfo/hyperscan). Bugs can be filed by
|
||||
sending email to the list, or by creating an issue on Github.
|
||||
# Installation
|
||||
|
||||
If you wish to contact the Hyperscan team at Intel directly, without posting
|
||||
publicly to the mailing list, send email to
|
||||
[hyperscan@intel.com](mailto:hyperscan@intel.com).
|
||||
## Debian/Ubuntu
|
||||
|
||||
On recent Debian/Ubuntu systems, vectorscan should be directly available for installation:
|
||||
|
||||
```
|
||||
$ sudo apt install libvectorscan5
|
||||
```
|
||||
|
||||
Or to install the devel package you can install `libvectorscan-dev` package:
|
||||
|
||||
```
|
||||
$ sudo apt install libvectorscan-dev
|
||||
```
|
||||
|
||||
For other distributions/OSes please check the [Wiki](https://github.com/VectorCamp/vectorscan/wiki/Installation-from-package)
|
||||
|
||||
|
||||
# Build Instructions
|
||||
|
||||
The build system has recently been refactored to be more modular and easier to extend. For that reason,
|
||||
some small but necessary changes were made that might break compatibility with how Hyperscan was built.
|
||||
|
||||
## Install Common Dependencies
|
||||
|
||||
### Debian/Ubuntu
|
||||
In order to build on Debian/Ubuntu make sure you install the following build-dependencies
|
||||
|
||||
```
|
||||
$ sudo apt install build-essential cmake ragel pkg-config libsqlite3-dev libpcap-dev
|
||||
```
|
||||
|
||||
### Other distributions
|
||||
|
||||
TBD
|
||||
|
||||
### MacOS X (M1/M2/M3 CPUs only)
|
||||
|
||||
Assuming an existing HomeBrew installation:
|
||||
|
||||
```
|
||||
% brew install boost cmake gcc libpcap pkg-config ragel sqlite
|
||||
```
|
||||
|
||||
### *BSD
|
||||
In NetBSD you will almost certainly need to have a newer compiler installed.
|
||||
Also you will need to install cmake, sqlite, boost and ragel.
|
||||
Also, libpcap is necessary for some of the benchmarks, so let's install that
|
||||
as well.
|
||||
When using pkgsrc, you would typically do this using something
|
||||
similar to
|
||||
```
|
||||
pkg_add gcc12-12.3.0.tgz
|
||||
pkg_add boost-headers-1.83.0.tgz boost-jam-1.83.0.tgz boost-libs-1.83.0nb1.tgz
|
||||
pkg_add ragel-6.10.tgz
|
||||
pkg_add cmake-3.28.1.tgz
|
||||
pkg_add sqlite3-3.44.2.tgz
|
||||
pkg_add libpcap-1.10.4.tgz
|
||||
```
|
||||
Version numbers etc will of course vary. One would either download the
|
||||
binary packages or build them using pkgsrc. There exist some NetBSD pkg
|
||||
tools like ```pkgin``` which help download e.g. dependencies as binary packages,
|
||||
but overall NetBSD leaves a lot of detail exposed to the user.
|
||||
The main package system used in NetBSD is pkgsrc and one will probably
|
||||
want to read up more about it than is in the scope of this document.
|
||||
See https://www.netbsd.org/docs/software/packages.html for more information.
|
||||
|
||||
This will not replace the compiler in the standard base distribution, and
|
||||
cmake will probably find the base dist's compiler when it checks automatically.
|
||||
Using the example of gcc12 from pkgsrc, one will need to set two
|
||||
environment variables before starting:
|
||||
```
|
||||
export CC="/usr/pkg/gcc12/bin/cc"
|
||||
export CXX="/usr/pkg/gcc12/bin/g++"
|
||||
```
|
||||
|
||||
In FreeBSD similarly, you might want to install a different compiler.
|
||||
If you want to use gcc, it is recommended to use gcc12.
|
||||
You will also, as in NetBSD, need to install cmake, sqlite, boost and ragel packages.
|
||||
Using the example of gcc12 from pkg:
|
||||
installing the desired compiler:
|
||||
```
|
||||
pkg install gcc12
|
||||
pkg install boost-all
|
||||
pkg install ragel
|
||||
pkg install cmake
|
||||
pkg install sqlite
|
||||
pkg install libpcap
|
||||
pkg install ccache
|
||||
```
|
||||
and then before beginning the cmake and build process, set
|
||||
the environment variables to point to this compiler:
|
||||
```
|
||||
export CC="/usr/local/bin/gcc"
|
||||
export CXX="/usr/local/bin/g++"
|
||||
```
|
||||
A further note in FreeBSD, on the PowerPC and ARM platforms,
|
||||
the gcc12 package installs to a slightly different name, on FreeBSD/ppc,
|
||||
gcc12 will be found using:
|
||||
```
|
||||
export CC="/usr/local/bin/gcc12"
|
||||
export CXX="/usr/local/bin/g++12"
|
||||
```
|
||||
|
||||
Then continue with the build as below.
|
||||
|
||||
|
||||
## Configure & build
|
||||
|
||||
In order to configure with `cmake` first create and cd into a build directory:
|
||||
|
||||
```
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
```
|
||||
|
||||
Then call `cmake` from inside the `build` directory:
|
||||
|
||||
```
|
||||
$ cmake ../
|
||||
```
|
||||
|
||||
Common options for Cmake are:
|
||||
|
||||
* `-DBUILD_STATIC_LIBS=[On|Off]` Build static libraries
|
||||
* `-DBUILD_SHARED_LIBS=[On|Off]` Build shared libraries (if none are set static libraries are built by default)
|
||||
* `-DCMAKE_BUILD_TYPE=[Release|Debug|RelWithDebInfo|MinSizeRel]` Configure build type and determine optimizations and certain features.
|
||||
* `-DUSE_CPU_NATIVE=[On|Off]` Native CPU detection is off by default, however it is possible to build a performance-oriented non-fat library tuned to your CPU
|
||||
* `-DFAT_RUNTIME=[On|Off]` Fat Runtime is only available for X86 32-bit/64-bit and AArch64 architectures and only on Linux. It is incompatible with `Debug` type and `USE_CPU_NATIVE`.
|
||||
|
||||
### Specific options for X86 32-bit/64-bit (Intel/AMD) CPUs
|
||||
|
||||
* `-DBUILD_AVX2=[On|Off]` Enable code for AVX2.
|
||||
* `-DBUILD_AVX512=[On|Off]` Enable code for AVX512. Implies `BUILD_AVX2`.
|
||||
* `-DBUILD_AVX512VBMI=[On|Off]` Enable code for AVX512 with VBMI extension. Implies `BUILD_AVX512`.
|
||||
|
||||
### Specific options for Arm 64-bit CPUs
|
||||
|
||||
* `-DBUILD_SVE=[On|Off]` Enable code for SVE, like on AWS Graviton3 CPUs. Not much code is ported just for SVE , but enabling SVE code production, does improve code generation, see [Benchmarks](https://github.com/VectorCamp/vectorscan/wiki/Benchmarks).
|
||||
* `-DBUILD_SVE2=[On|Off]` Enable code for SVE2, implies `BUILD_SVE`. Most non-Neon code is written for SVE2
|
||||
* `-DBUILD_SVE2_BITPERM=[On|Off]` Enable code for SVE2_BITPERM harwdare feature, implies `BUILD_SVE2`.
|
||||
|
||||
## Other options
|
||||
|
||||
* `SANITIZE=[address|memory|undefined]` (experimental) Use `libasan` sanitizer to detect possible bugs. For now only `address` is tested. This will eventually be integrated in the CI.
|
||||
|
||||
## SIMDe options
|
||||
|
||||
* `SIMDE_BACKEND=[On|Off]` Enable SIMDe backend. If this is chosen all native (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be disabled and a SIMDe SSE4.2 emulation backend will be enabled. This will enable Vectorscan to build and run on architectures without SIMD.
|
||||
* `SIMDE_NATIVE=[On|Off]` Enable SIMDe native emulation of x86 SSE4.2 intrinsics on the building platform. That is, SSE4.2 intrinsics will be emulated using Neon on an Arm platform, or VSX on a Power platform, etc.
|
||||
|
||||
## Build
|
||||
|
||||
If `cmake` has completed successfully you can run `make` in the same directory, if you have a multi-core system with `N` cores, running
|
||||
|
||||
```
|
||||
$ make -j <N>
|
||||
```
|
||||
|
||||
will speed up the process. If all goes well, you should have the vectorscan library compiled.
|
||||
|
||||
|
||||
# Contributions
|
||||
|
||||
The official homepage for Vectorscan is at [www.github.com/VectorCamp/vectorscan](https://www.github.com/VectorCamp/vectorscan).
|
||||
|
||||
# Vectorscan Development
|
||||
|
||||
All development of Vectorscan is done in public.
|
||||
|
||||
# Original Hyperscan links
|
||||
For reference, the official homepage for Hyperscan is at [www.hyperscan.io](https://www.hyperscan.io).
|
||||
|
||||
# Hyperscan Documentation
|
||||
|
||||
Information on building the Hyperscan library and using its API is available in
|
||||
the [Developer Reference Guide](http://intel.github.io/hyperscan/dev-reference/).
|
||||
|
||||
And you can find the source code [on Github](https://github.com/intel/hyperscan).
|
||||
|
||||
For Intel Hyperscan related issues and questions, please follow the relevant links there.
|
||||
|
9
benchmarks/CMakeLists.txt
Normal file
9
benchmarks/CMakeLists.txt
Normal file
@ -0,0 +1,9 @@
|
||||
include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
include_directories(${PROJECT_SOURCE_DIR})
|
||||
|
||||
if (NOT FAT_RUNTIME AND (BUILD_SHARED_LIBS OR BUILD_STATIC_LIBS))
|
||||
add_executable(benchmarks benchmarks.cpp)
|
||||
set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS
|
||||
"-Wall -Wno-unused-variable")
|
||||
target_link_libraries(benchmarks hs)
|
||||
endif()
|
309
benchmarks/benchmarks.cpp
Normal file
309
benchmarks/benchmarks.cpp
Normal file
@ -0,0 +1,309 @@
|
||||
/*
|
||||
* Copyright (c) 2020, 2021, VectorCamp PC
|
||||
* Copyright (c) 2023, 2024, Arm Limited
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
#include "util/arch.h"
|
||||
#include "benchmarks.hpp"
|
||||
|
||||
#define MAX_LOOPS 1000000000
|
||||
#define MAX_MATCHES 5
|
||||
#define N 8
|
||||
|
||||
struct hlmMatchEntry {
|
||||
size_t to;
|
||||
u32 id;
|
||||
hlmMatchEntry(size_t end, u32 identifier) : to(end), id(identifier) {}
|
||||
};
|
||||
|
||||
std::vector<hlmMatchEntry> ctxt;
|
||||
|
||||
static hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
|
||||
UNUSED struct hs_scratch *scratch) { // cppcheck-suppress constParameterCallback
|
||||
DEBUG_PRINTF("match @%zu = %u\n", to, id);
|
||||
|
||||
ctxt.push_back(hlmMatchEntry(to, id));
|
||||
|
||||
return HWLM_CONTINUE_MATCHING;
|
||||
}
|
||||
|
||||
template <typename InitFunc, typename BenchFunc>
|
||||
static void run_benchmarks(int size, int loops, int max_matches,
|
||||
bool is_reverse, MicroBenchmark &bench,
|
||||
InitFunc &&init, BenchFunc &&func) {
|
||||
init(bench);
|
||||
double total_sec = 0.0;
|
||||
double max_bw = 0.0;
|
||||
double avg_time = 0.0;
|
||||
if (max_matches) {
|
||||
double avg_bw = 0.0;
|
||||
int pos = 0;
|
||||
for (int j = 0; j < max_matches - 1; j++) {
|
||||
bench.buf[pos] = 'b';
|
||||
pos = (j + 1) * size / max_matches;
|
||||
bench.buf[pos] = 'a';
|
||||
u64a actual_size = 0;
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
for (int i = 0; i < loops; i++) {
|
||||
const u8 *res = func(bench);
|
||||
if (is_reverse)
|
||||
actual_size += bench.buf.data() + size - res;
|
||||
else
|
||||
actual_size += res - bench.buf.data();
|
||||
}
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
double dt = std::chrono::duration_cast<std::chrono::microseconds>(
|
||||
end - start)
|
||||
.count();
|
||||
total_sec += dt;
|
||||
/*convert microseconds to seconds*/
|
||||
/*calculate bandwidth*/
|
||||
double bw = (actual_size / dt) * 1000000.0 / 1048576.0;
|
||||
/*std::cout << "act_size = " << act_size << std::endl;
|
||||
std::cout << "dt = " << dt << std::endl;
|
||||
std::cout << "bw = " << bw << std::endl;*/
|
||||
avg_bw += bw;
|
||||
/*convert to MB/s*/
|
||||
max_bw = std::max(bw, max_bw);
|
||||
/*calculate average time*/
|
||||
avg_time += total_sec / loops;
|
||||
}
|
||||
avg_time /= max_matches;
|
||||
avg_bw /= max_matches;
|
||||
total_sec /= 1000000.0;
|
||||
/*convert average time to us*/
|
||||
printf("%-18s, %-12d, %-10d, %-6d, %-10.3f, %-9.3f, %-8.3f, %-7.3f\n",
|
||||
bench.label, max_matches, size ,loops, total_sec, avg_time, max_bw, avg_bw);
|
||||
} else {
|
||||
u64a total_size = 0;
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
for (int i = 0; i < loops; i++) {
|
||||
func(bench);
|
||||
}
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
total_sec +=
|
||||
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
|
||||
.count();
|
||||
/*calculate transferred size*/
|
||||
total_size = (u64a)size * (u64a)loops;
|
||||
/*calculate average time*/
|
||||
avg_time = total_sec / loops;
|
||||
/*convert microseconds to seconds*/
|
||||
total_sec /= 1000000.0;
|
||||
/*calculate maximum bandwidth*/
|
||||
max_bw = total_size / total_sec;
|
||||
/*convert to MB/s*/
|
||||
max_bw /= 1048576.0;
|
||||
printf("%-18s, %-12s, %-10d, %-6d, %-10.3f, %-9.3f, %-8.3f, %-7s\n",
|
||||
bench.label, "0", size, loops, total_sec, avg_time, max_bw, "0");
|
||||
}
|
||||
}
|
||||
|
||||
int main(){
|
||||
const int matches[] = {0, MAX_MATCHES};
|
||||
std::vector<size_t> sizes;
|
||||
for (size_t i = 0; i < N; i++)
|
||||
sizes.push_back(16000 << i * 2);
|
||||
const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa";
|
||||
printf("%-18s, %-12s, %-10s, %-6s, %-10s, %-9s, %-8s, %-7s\n", "Matcher",
|
||||
"max_matches", "size", "loops", "total_sec", "avg_time", "max_bw",
|
||||
"avg_bw");
|
||||
for (int m = 0; m < 2; m++) {
|
||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||
MicroBenchmark bench("Shufti", sizes[i]);
|
||||
run_benchmarks(
|
||||
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
||||
[&](MicroBenchmark &b) {
|
||||
b.chars.set('a');
|
||||
ue2::shuftiBuildMasks(b.chars,
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||
memset(b.buf.data(), 'b', b.size);
|
||||
},
|
||||
[&](MicroBenchmark const &b) {
|
||||
return shuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
|
||||
b.buf.data() + b.size);
|
||||
});
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||
MicroBenchmark bench("Reverse Shufti", sizes[i]);
|
||||
run_benchmarks(
|
||||
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
||||
[&](MicroBenchmark &b) {
|
||||
b.chars.set('a');
|
||||
ue2::shuftiBuildMasks(b.chars,
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||
memset(b.buf.data(), 'b', b.size);
|
||||
},
|
||||
[&](MicroBenchmark const &b) {
|
||||
return rshuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
|
||||
b.buf.data() + b.size);
|
||||
});
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||
MicroBenchmark bench("Truffle", sizes[i]);
|
||||
run_benchmarks(
|
||||
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
||||
[&](MicroBenchmark &b) {
|
||||
b.chars.set('a');
|
||||
ue2::truffleBuildMasks(b.chars,
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||
memset(b.buf.data(), 'b', b.size);
|
||||
},
|
||||
[&](MicroBenchmark const &b) {
|
||||
return truffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
|
||||
b.buf.data() + b.size);
|
||||
});
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||
MicroBenchmark bench("Reverse Truffle", sizes[i]);
|
||||
run_benchmarks(
|
||||
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
||||
[&](MicroBenchmark &b) {
|
||||
b.chars.set('a');
|
||||
ue2::truffleBuildMasks(b.chars,
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||
memset(b.buf.data(), 'b', b.size);
|
||||
},
|
||||
[&](MicroBenchmark const &b) {
|
||||
return rtruffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
|
||||
b.buf.data() + b.size);
|
||||
});
|
||||
}
|
||||
#ifdef CAN_USE_WIDE_TRUFFLE
|
||||
if(CAN_USE_WIDE_TRUFFLE) {
|
||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||
MicroBenchmark bench("Truffle Wide", sizes[i]);
|
||||
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
||||
[&](MicroBenchmark &b) {
|
||||
b.chars.set('a');
|
||||
ue2::truffleBuildMasksWide(b.chars, reinterpret_cast<u8 *>(&b.truffle_mask));
|
||||
memset(b.buf.data(), 'b', b.size);
|
||||
},
|
||||
[&](MicroBenchmark const &b) {
|
||||
return truffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size);
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||
MicroBenchmark bench("Reverse Truffle Wide", sizes[i]);
|
||||
run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
||||
[&](MicroBenchmark &b) {
|
||||
b.chars.set('a');
|
||||
ue2::truffleBuildMasksWide(b.chars, reinterpret_cast<u8 *>(&b.truffle_mask));
|
||||
memset(b.buf.data(), 'b', b.size);
|
||||
},
|
||||
[&](MicroBenchmark const &b) {
|
||||
return rtruffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size);
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||
MicroBenchmark bench("Vermicelli", sizes[i]);
|
||||
run_benchmarks(
|
||||
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
||||
[&](MicroBenchmark &b) {
|
||||
b.chars.set('a');
|
||||
ue2::truffleBuildMasks(b.chars,
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||
memset(b.buf.data(), 'b', b.size);
|
||||
},
|
||||
[&](MicroBenchmark const &b) {
|
||||
return vermicelliExec('a', 'b', b.buf.data(),
|
||||
b.buf.data() + b.size);
|
||||
});
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||
MicroBenchmark bench("Reverse Vermicelli", sizes[i]);
|
||||
run_benchmarks(
|
||||
sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
|
||||
[&](MicroBenchmark &b) {
|
||||
b.chars.set('a');
|
||||
ue2::truffleBuildMasks(b.chars,
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_lo),
|
||||
reinterpret_cast<u8 *>(&b.truffle_mask_hi));
|
||||
memset(b.buf.data(), 'b', b.size);
|
||||
},
|
||||
[&](MicroBenchmark const &b) {
|
||||
return rvermicelliExec('a', 'b', b.buf.data(),
|
||||
b.buf.data() + b.size);
|
||||
});
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < std::size(sizes); i++) {
|
||||
// we imitate the noodle unit tests
|
||||
std::string str;
|
||||
const size_t char_len = 5;
|
||||
str.resize(char_len + 2);
|
||||
for (size_t j = 0; j < char_len; j++) {
|
||||
srand(time(NULL));
|
||||
int key = rand() % +36;
|
||||
str[char_len] = charset[key];
|
||||
str[char_len + 1] = '\0';
|
||||
}
|
||||
|
||||
MicroBenchmark bench("Noodle", sizes[i]);
|
||||
run_benchmarks(
|
||||
sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
|
||||
[&](MicroBenchmark &b) {
|
||||
ctxt.clear();
|
||||
memset(b.buf.data(), 'a', b.size);
|
||||
u32 id = 1000;
|
||||
ue2::hwlmLiteral lit(str, true, id);
|
||||
b.nt = ue2::noodBuildTable(lit);
|
||||
assert(b.nt.get() != nullptr);
|
||||
},
|
||||
[&](MicroBenchmark &b) { // cppcheck-suppress constParameterReference
|
||||
noodExec(b.nt.get(), b.buf.data(), b.size, 0,
|
||||
hlmSimpleCallback, &b.scratch);
|
||||
return b.buf.data() + b.size;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
67
benchmarks/benchmarks.hpp
Normal file
67
benchmarks/benchmarks.hpp
Normal file
@ -0,0 +1,67 @@
|
||||
/*
|
||||
* Copyright (c) 2020, 2021, VectorCamp PC
|
||||
* Copyright (c) 2024, Arm Limited
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "hwlm/hwlm_literal.h"
|
||||
#include "hwlm/noodle_build.h"
|
||||
#include "hwlm/noodle_engine.h"
|
||||
#include "hwlm/noodle_internal.h"
|
||||
#include "nfa/shufti.h"
|
||||
#include "nfa/shufticompile.h"
|
||||
#include "nfa/truffle.h"
|
||||
#include "nfa/trufflecompile.h"
|
||||
#include "nfa/vermicelli.hpp"
|
||||
#include "scratch.h"
|
||||
#include "util/bytecode_ptr.h"
|
||||
|
||||
class MicroBenchmark {
|
||||
public:
|
||||
struct hs_scratch scratch{};
|
||||
char const *label;
|
||||
size_t size;
|
||||
std::vector<u8> buf;
|
||||
ue2::bytecode_ptr<noodTable> nt;
|
||||
ue2::CharReach chars;
|
||||
|
||||
// Shufti/Truffle
|
||||
union {
|
||||
m256 truffle_mask;
|
||||
struct {
|
||||
#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
|
||||
m128 truffle_mask_lo;
|
||||
m128 truffle_mask_hi;
|
||||
#else
|
||||
m128 truffle_mask_hi;
|
||||
m128 truffle_mask_lo;
|
||||
#endif
|
||||
};
|
||||
};
|
||||
|
||||
MicroBenchmark(char const *label_, size_t size_)
|
||||
: label(label_), size(size_), buf(size_){};
|
||||
};
|
@ -33,17 +33,15 @@ target_link_libraries(chimera hs pcre)
|
||||
|
||||
install(TARGETS chimera DESTINATION ${CMAKE_INSTALL_LIBDIR})
|
||||
|
||||
if (NOT WIN32)
|
||||
# expand out library names for pkgconfig static link info
|
||||
foreach (LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES})
|
||||
# this is fragile, but protects us from toolchain specific files
|
||||
if (NOT EXISTS ${LIB})
|
||||
set(PRIVATE_LIBS "${PRIVATE_LIBS} -l${LIB}")
|
||||
endif()
|
||||
endforeach()
|
||||
set(PRIVATE_LIBS "${PRIVATE_LIBS} -L${LIBDIR} -lpcre")
|
||||
# expand out library names for pkgconfig static link info
|
||||
foreach (LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES})
|
||||
# this is fragile, but protects us from toolchain specific files
|
||||
if (NOT EXISTS ${LIB})
|
||||
set(PRIVATE_LIBS "${PRIVATE_LIBS} -l${LIB}")
|
||||
endif()
|
||||
endforeach()
|
||||
set(PRIVATE_LIBS "${PRIVATE_LIBS} -L${LIBDIR} -lpcre")
|
||||
|
||||
configure_file(libch.pc.in libch.pc @ONLY) # only replace @ quoted vars
|
||||
install(FILES ${CMAKE_BINARY_DIR}/chimera/libch.pc
|
||||
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
|
||||
endif()
|
||||
configure_file(libch.pc.in libch.pc @ONLY) # only replace @ quoted vars
|
||||
install(FILES ${CMAKE_BINARY_DIR}/chimera/libch.pc
|
||||
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2018, Intel Corporation
|
||||
* Copyright (c) 2018-2020, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -345,6 +345,16 @@ ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t alloc_func,
|
||||
*/
|
||||
#define CH_SCRATCH_IN_USE (-10)
|
||||
|
||||
/**
|
||||
* Unexpected internal error from Hyperscan.
|
||||
*
|
||||
* This error indicates that there was unexpected matching behaviors from
|
||||
* Hyperscan. This could be related to invalid usage of scratch space or
|
||||
* invalid memory operations by users.
|
||||
*
|
||||
*/
|
||||
#define CH_UNKNOWN_HS_ERROR (-13)
|
||||
|
||||
/**
|
||||
* Returned when pcre_exec (called for some expressions internally from @ref
|
||||
* ch_scan) failed due to a fatal error.
|
||||
|
@ -39,7 +39,6 @@
|
||||
#include "hs_internal.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/compile_error.h"
|
||||
#include "util/make_unique.h"
|
||||
#include "util/multibit_build.h"
|
||||
#include "util/target_info.h"
|
||||
|
||||
@ -495,7 +494,7 @@ void ch_compile_multi_int(const char *const *expressions, const unsigned *flags,
|
||||
// First, build with libpcre. A build failure from libpcre will throw
|
||||
// an exception up to the caller.
|
||||
auto patternData =
|
||||
ue2::make_unique<PatternData>(myExpr, myFlags, i, myId, mode, match_limit,
|
||||
std::make_unique<PatternData>(myExpr, myFlags, i, myId, mode, match_limit,
|
||||
match_limit_recursion, platform);
|
||||
pcres.push_back(move(patternData));
|
||||
PatternData &curr = *pcres.back();
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2018, Intel Corporation
|
||||
* Copyright (c) 2018-2022, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -326,6 +326,10 @@ ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id,
|
||||
} else if (cbrv == CH_CALLBACK_SKIP_PATTERN) {
|
||||
DEBUG_PRINTF("user callback told us to skip this pattern\n");
|
||||
pd->scanStart = hyctx->length;
|
||||
if (top_id == id) {
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (top_id == id) {
|
||||
@ -419,6 +423,7 @@ int HS_CDECL multiCallback(unsigned int id, unsigned long long from,
|
||||
DEBUG_PRINTF("user callback told us to skip this pattern\n");
|
||||
pd->scanStart = hyctx->length;
|
||||
ret = HS_SUCCESS;
|
||||
hyctx->scratch->ret = ret;
|
||||
} else if (ret == CH_FAIL_INTERNAL) {
|
||||
return ret;
|
||||
}
|
||||
@ -590,11 +595,24 @@ ch_error_t ch_scan_i(const ch_database_t *hydb,
|
||||
|
||||
if (!(db->flags & CHIMERA_FLAG_NO_MULTIMATCH)) {
|
||||
ret = scanHyperscan(&hyctx, data, length);
|
||||
if (ret != HS_SUCCESS && scratch->ret != CH_SUCCESS) {
|
||||
DEBUG_PRINTF("Hyperscan returned error %d\n", scratch->ret);
|
||||
// Errors from pcre scan.
|
||||
if (scratch->ret == CH_CALLBACK_TERMINATE) {
|
||||
DEBUG_PRINTF("Pcre terminates scan\n");
|
||||
unmarkScratchInUse(scratch);
|
||||
return CH_SCAN_TERMINATED;
|
||||
} else if (scratch->ret != CH_SUCCESS) {
|
||||
DEBUG_PRINTF("Pcre internal error\n");
|
||||
unmarkScratchInUse(scratch);
|
||||
return scratch->ret;
|
||||
}
|
||||
// Errors from Hyperscan scan. Note Chimera could terminate
|
||||
// Hyperscan callback on purpose so this is not counted as an error.
|
||||
if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
|
||||
assert(scratch->ret == CH_SUCCESS);
|
||||
DEBUG_PRINTF("Hyperscan returned error %d\n", ret);
|
||||
unmarkScratchInUse(scratch);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("Flush priority queue\n");
|
||||
|
@ -1,96 +0,0 @@
|
||||
# detect architecture features
|
||||
#
|
||||
# must be called after determining where compiler intrinsics are defined
|
||||
|
||||
if (HAVE_C_X86INTRIN_H)
|
||||
set (INTRIN_INC_H "x86intrin.h")
|
||||
elseif (HAVE_C_INTRIN_H)
|
||||
set (INTRIN_INC_H "intrin.h")
|
||||
else ()
|
||||
message (FATAL_ERROR "No intrinsics header found")
|
||||
endif ()
|
||||
|
||||
if (BUILD_AVX512)
|
||||
CHECK_C_COMPILER_FLAG(${SKYLAKE_FLAG} HAS_ARCH_SKYLAKE)
|
||||
if (NOT HAS_ARCH_SKYLAKE)
|
||||
message (FATAL_ERROR "AVX512 not supported by compiler")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (FAT_RUNTIME)
|
||||
# test the highest level microarch to make sure everything works
|
||||
if (BUILD_AVX512)
|
||||
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}")
|
||||
else ()
|
||||
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2")
|
||||
endif ()
|
||||
else (NOT FAT_RUNTIME)
|
||||
# if not fat runtime, then test given cflags
|
||||
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}")
|
||||
endif ()
|
||||
|
||||
# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
|
||||
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
|
||||
int main() {
|
||||
__m128i a = _mm_set1_epi8(1);
|
||||
(void)_mm_shuffle_epi8(a, a);
|
||||
}" HAVE_SSSE3)
|
||||
|
||||
# now look for AVX2
|
||||
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
|
||||
#if !defined(__AVX2__)
|
||||
#error no avx2
|
||||
#endif
|
||||
|
||||
int main(){
|
||||
__m256i z = _mm256_setzero_si256();
|
||||
(void)_mm256_xor_si256(z, z);
|
||||
}" HAVE_AVX2)
|
||||
|
||||
# and now for AVX512
|
||||
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
|
||||
#if !defined(__AVX512BW__)
|
||||
#error no avx512bw
|
||||
#endif
|
||||
|
||||
int main(){
|
||||
__m512i z = _mm512_setzero_si512();
|
||||
(void)_mm512_abs_epi8(z);
|
||||
}" HAVE_AVX512)
|
||||
|
||||
# and now for AVX512VBMI
|
||||
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
|
||||
#if !defined(__AVX512VBMI__)
|
||||
#error no avx512vbmi
|
||||
#endif
|
||||
|
||||
int main(){
|
||||
__m512i a = _mm512_set1_epi8(0xFF);
|
||||
__m512i idx = _mm512_set_epi64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
|
||||
(void)_mm512_permutexvar_epi8(idx, a);
|
||||
}" HAVE_AVX512VBMI)
|
||||
|
||||
if (FAT_RUNTIME)
|
||||
if (NOT HAVE_SSSE3)
|
||||
message(FATAL_ERROR "SSSE3 support required to build fat runtime")
|
||||
endif ()
|
||||
if (NOT HAVE_AVX2)
|
||||
message(FATAL_ERROR "AVX2 support required to build fat runtime")
|
||||
endif ()
|
||||
if (BUILD_AVX512 AND NOT HAVE_AVX512)
|
||||
message(FATAL_ERROR "AVX512 support requested but not supported")
|
||||
endif ()
|
||||
else (NOT FAT_RUNTIME)
|
||||
if (NOT HAVE_AVX2)
|
||||
message(STATUS "Building without AVX2 support")
|
||||
endif ()
|
||||
if (NOT HAVE_AVX512)
|
||||
message(STATUS "Building without AVX512 support")
|
||||
endif ()
|
||||
if (NOT HAVE_SSSE3)
|
||||
message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
unset (CMAKE_REQUIRED_FLAGS)
|
||||
unset (INTRIN_INC_H)
|
111
cmake/archdetect.cmake
Normal file
111
cmake/archdetect.cmake
Normal file
@ -0,0 +1,111 @@
|
||||
if (USE_CPU_NATIVE)
|
||||
# Detect best GNUCC_ARCH to tune for
|
||||
if (CMAKE_COMPILER_IS_GNUCC)
|
||||
message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
|
||||
|
||||
# If gcc doesn't recognise the host cpu, then mtune=native becomes
|
||||
# generic, which isn't very good in some cases. march=native looks at
|
||||
# cpuid info and then chooses the best microarch it can (and replaces
|
||||
# the flag), so use that for tune.
|
||||
|
||||
set(TUNE_FLAG "mtune")
|
||||
set(GNUCC_TUNE "")
|
||||
message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ")
|
||||
|
||||
# arg1 might exist if using ccache
|
||||
string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
|
||||
set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=native -${TUNE_FLAG}=native)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
|
||||
OUTPUT_VARIABLE _GCC_OUTPUT)
|
||||
set(_GCC_OUTPUT_TUNE ${_GCC_OUTPUT})
|
||||
string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}=" POS)
|
||||
string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
|
||||
string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}")
|
||||
|
||||
string(FIND "${_GCC_OUTPUT_TUNE}" "${TUNE_FLAG}=" POS_TUNE)
|
||||
string(SUBSTRING "${_GCC_OUTPUT_TUNE}" ${POS_TUNE} -1 _GCC_OUTPUT_TUNE)
|
||||
string(REGEX REPLACE "${TUNE_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_TUNE "${_GCC_OUTPUT_TUNE}")
|
||||
|
||||
message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ")
|
||||
|
||||
# test the parsed flag
|
||||
set (EXEC_ARGS ${CC_ARG1} -E - -${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE})
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
|
||||
OUTPUT_QUIET ERROR_QUIET
|
||||
INPUT_FILE /dev/null
|
||||
RESULT_VARIABLE GNUCC_TUNE_TEST)
|
||||
|
||||
if (NOT GNUCC_TUNE_TEST EQUAL 0)
|
||||
message(WARNING "Something went wrong determining gcc tune: -mtune=${GNUCC_TUNE} not valid, falling back to -mtune=native")
|
||||
set(GNUCC_TUNE native)
|
||||
else()
|
||||
set(GNUCC_TUNE ${GNUCC_TUNE})
|
||||
message(STATUS "gcc will tune for ${GNUCC_ARCH}, ${GNUCC_TUNE}")
|
||||
endif()
|
||||
elseif (CMAKE_COMPILER_IS_CLANG)
|
||||
if (ARCH_IA32 OR ARCH_X86_64)
|
||||
set(GNUCC_ARCH x86-64-v2)
|
||||
set(TUNE_FLAG generic)
|
||||
elseif(ARCH_AARCH64)
|
||||
if (BUILD_SVE2_BITPERM)
|
||||
set(GNUCC_ARCH ${SVE2_BITPERM_ARCH})
|
||||
elseif (BUILD_SVE2)
|
||||
set(GNUCC_ARCH ${SVE2_ARCH})
|
||||
elseif (BUILD_SVE)
|
||||
set(GNUCC_ARCH ${SVE_ARCH})
|
||||
else ()
|
||||
set(GNUCC_ARCH ${ARMV8_ARCH})
|
||||
endif()
|
||||
set(TUNE_FLAG generic)
|
||||
elseif(ARCH_ARM32)
|
||||
set(GNUCC_ARCH armv7a)
|
||||
set(TUNE_FLAG generic)
|
||||
else()
|
||||
set(GNUCC_ARCH native)
|
||||
set(TUNE_FLAG generic)
|
||||
endif()
|
||||
message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
|
||||
endif()
|
||||
else()
|
||||
if (SIMDE_BACKEND)
|
||||
if (ARCH_IA32 OR ARCH_X86_64)
|
||||
set(GNUCC_ARCH x86-64-v2)
|
||||
set(TUNE_FLAG generic)
|
||||
elseif(ARCH_AARCH64)
|
||||
set(GNUCC_ARCH armv8-a)
|
||||
set(TUNE_FLAG generic)
|
||||
elseif(ARCH_ARM32)
|
||||
set(GNUCC_ARCH armv7a)
|
||||
set(TUNE_FLAG generic)
|
||||
elseif(ARCH_PPC64EL)
|
||||
set(GNUCC_ARCH power8)
|
||||
set(TUNE_FLAG power8)
|
||||
else()
|
||||
set(GNUCC_ARCH x86-64-v2)
|
||||
set(TUNE_FLAG generic)
|
||||
endif()
|
||||
elseif (ARCH_IA32 OR ARCH_X86_64)
|
||||
set(GNUCC_ARCH ${X86_ARCH})
|
||||
set(TUNE_FLAG generic)
|
||||
elseif(ARCH_AARCH64)
|
||||
if (BUILD_SVE2_BITPERM)
|
||||
set(GNUCC_ARCH ${SVE2_BITPERM_ARCH})
|
||||
elseif (BUILD_SVE2)
|
||||
set(GNUCC_ARCH ${SVE2_ARCH})
|
||||
elseif (BUILD_SVE)
|
||||
set(GNUCC_ARCH ${SVE_ARCH})
|
||||
else ()
|
||||
set(GNUCC_ARCH ${ARMV8_ARCH})
|
||||
endif()
|
||||
set(TUNE_FLAG generic)
|
||||
elseif(ARCH_ARM32)
|
||||
set(GNUCC_ARCH armv7a)
|
||||
set(TUNE_FLAG generic)
|
||||
elseif(ARCH_PPC64EL)
|
||||
set(GNUCC_ARCH power8)
|
||||
set(TUNE_FLAG power8)
|
||||
else()
|
||||
set(GNUCC_ARCH native)
|
||||
set(TUNE_FLAG native)
|
||||
endif()
|
||||
endif()
|
@ -15,13 +15,21 @@ SYMSFILE=$(mktemp -p /tmp ${PREFIX}_rename.syms.XXXXX)
|
||||
KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX)
|
||||
# find the libc used by gcc
|
||||
LIBC_SO=$("$@" --print-file-name=libc.so.6)
|
||||
NM_FLAG="-f"
|
||||
if [ `uname` = "FreeBSD" ]; then
|
||||
# for freebsd, we will specify the name,
|
||||
# we will leave it work as is in linux
|
||||
LIBC_SO=/lib/libc.so.7
|
||||
# also, in BSD, the nm flag -F corresponds to the -f flag in linux.
|
||||
NM_FLAG="-F"
|
||||
fi
|
||||
cp ${KEEPSYMS_IN} ${KEEPSYMS}
|
||||
# get all symbols from libc and turn them into patterns
|
||||
nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ ]*\).*/^\1$/' >> ${KEEPSYMS}
|
||||
nm ${NM_FLAG} p -g -D ${LIBC_SO} | sed 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS}
|
||||
# build the object
|
||||
"$@"
|
||||
# rename the symbols in the object
|
||||
nm -f p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE}
|
||||
nm ${NM_FLAG} p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE}
|
||||
if test -s ${SYMSFILE}
|
||||
then
|
||||
objcopy --redefine-syms=${SYMSFILE} ${OUT}
|
||||
|
93
cmake/cflags-arm.cmake
Normal file
93
cmake/cflags-arm.cmake
Normal file
@ -0,0 +1,93 @@
|
||||
if (NOT FAT_RUNTIME)
|
||||
if (BUILD_SVE2_BITPERM)
|
||||
message (STATUS "SVE2_BITPERM implies SVE2, enabling BUILD_SVE2")
|
||||
set(BUILD_SVE2 ON)
|
||||
endif ()
|
||||
if (BUILD_SVE2)
|
||||
message (STATUS "SVE2 implies SVE, enabling BUILD_SVE")
|
||||
set(BUILD_SVE ON)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
||||
if (CMAKE_COMPILER_IS_GNUCXX)
|
||||
set(ARMV9BASE_MINVER "12")
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ARMV9BASE_MINVER)
|
||||
set(SVE2_ARCH "armv8-a+sve2")
|
||||
else()
|
||||
set(SVE2_ARCH "armv9-a")
|
||||
endif()
|
||||
else()
|
||||
set(SVE2_ARCH "armv9-a")
|
||||
endif()
|
||||
|
||||
set(ARMV8_ARCH "armv8-a")
|
||||
set(SVE_ARCH "${ARMV8_ARCH}+sve")
|
||||
set(SVE2_BITPERM_ARCH "${SVE2_ARCH}+sve2-bitperm")
|
||||
|
||||
CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H)
|
||||
if (BUILD_SVE OR BUILD_SVE2 OR BUILD_SVE2_BITPERM OR FAT_RUNTIME)
|
||||
set(CMAKE_REQUIRED_FLAGS "-march=${SVE_ARCH}")
|
||||
CHECK_INCLUDE_FILE_CXX(arm_sve.h HAVE_C_ARM_SVE_H)
|
||||
if (NOT HAVE_C_ARM_SVE_H)
|
||||
message(FATAL_ERROR "arm_sve.h is required to build for SVE.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("#include <arm_neon.h>
|
||||
int main() {
|
||||
int32x4_t a = vdupq_n_s32(1);
|
||||
(void)a;
|
||||
}" HAVE_NEON)
|
||||
|
||||
if (BUILD_SVE2_BITPERM)
|
||||
set(CMAKE_REQUIRED_FLAGS "-march=${SVE2_BITPERM_ARCH}")
|
||||
CHECK_C_SOURCE_COMPILES("#include <arm_sve.h>
|
||||
int main() {
|
||||
svuint8_t a = svbext(svdup_u8(1), svdup_u8(2));
|
||||
(void)a;
|
||||
}" HAVE_SVE2_BITPERM)
|
||||
endif()
|
||||
if (BUILD_SVE2)
|
||||
set(CMAKE_REQUIRED_FLAGS "-march=${SVE2_ARCH}")
|
||||
CHECK_C_SOURCE_COMPILES("#include <arm_sve.h>
|
||||
int main() {
|
||||
svuint8_t a = svbsl(svdup_u8(1), svdup_u8(2), svdup_u8(3));
|
||||
(void)a;
|
||||
}" HAVE_SVE2)
|
||||
endif()
|
||||
if (BUILD_SVE)
|
||||
set(CMAKE_REQUIRED_FLAGS "-march=${SVE_ARCH}")
|
||||
CHECK_C_SOURCE_COMPILES("#include <arm_sve.h>
|
||||
int main() {
|
||||
svuint8_t a = svdup_u8(1);
|
||||
(void)a;
|
||||
}" HAVE_SVE)
|
||||
endif ()
|
||||
|
||||
if (FAT_RUNTIME)
|
||||
if (NOT HAVE_NEON)
|
||||
message(FATAL_ERROR "NEON support required to build fat runtime")
|
||||
endif ()
|
||||
if (BUILD_SVE AND NOT HAVE_SVE)
|
||||
message(FATAL_ERROR "SVE support required to build fat runtime")
|
||||
endif ()
|
||||
if (BUILD_SVE2 AND NOT HAVE_SVE2)
|
||||
message(FATAL_ERROR "SVE2 support required to build fat runtime")
|
||||
endif ()
|
||||
if (BUILD_SVE2_BITPERM AND NOT HAVE_SVE2_BITPERM)
|
||||
message(FATAL_ERROR "SVE2 support required to build fat runtime")
|
||||
endif ()
|
||||
else (NOT FAT_RUNTIME)
|
||||
if (NOT BUILD_SVE)
|
||||
message(STATUS "Building without SVE support")
|
||||
endif ()
|
||||
if (NOT BUILD_SVE2)
|
||||
message(STATUS "Building without SVE2 support")
|
||||
endif ()
|
||||
if (NOT HAVE_NEON)
|
||||
message(FATAL_ERROR "Neon/ASIMD support required for Arm support")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
106
cmake/cflags-generic.cmake
Normal file
106
cmake/cflags-generic.cmake
Normal file
@ -0,0 +1,106 @@
|
||||
# set compiler flags - more are tested and added later
|
||||
set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra ")
|
||||
set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra ")
|
||||
if (NOT CMAKE_COMPILER_IS_CLANG)
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching")
|
||||
endif()
|
||||
|
||||
# Always use -Werror *also during release builds
|
||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wall -Werror")
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wall -Werror")
|
||||
|
||||
if (DISABLE_ASSERTS)
|
||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG")
|
||||
endif()
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
# spurious warnings?
|
||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds ") #-Wno-maybe-uninitialized")
|
||||
endif()
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized -Wno-uninitialized")
|
||||
endif()
|
||||
|
||||
CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
|
||||
CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
|
||||
CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
|
||||
|
||||
if(FREEBSD OR NETBSD)
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -gdwarf-4")
|
||||
endif()
|
||||
|
||||
if(NETBSD)
|
||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DHAVE_BUILTIN_POPCOUNT")
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DHAVE_BUILTIN_POPCOUNT")
|
||||
endif()
|
||||
|
||||
if(MACOSX)
|
||||
# Boost headers cause such complains on MacOS
|
||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-deprecated-declarations -Wno-unused-parameter")
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-deprecated-declarations -Wno-unused-parameter")
|
||||
endif()
|
||||
|
||||
# these end up in the config file
|
||||
CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN)
|
||||
CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN)
|
||||
|
||||
# are we using libc++
|
||||
CHECK_CXX_SYMBOL_EXISTS(_LIBCPP_VERSION ciso646 HAVE_LIBCPP)
|
||||
|
||||
if (RELEASE_BUILD)
|
||||
if (HAS_C_HIDDEN)
|
||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fvisibility=hidden")
|
||||
endif()
|
||||
if (HAS_CXX_HIDDEN)
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fvisibility=hidden")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# testing a builtin takes a little more work
|
||||
CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
|
||||
CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
|
||||
# Clang does not use __builtin_constant_p() the same way as gcc
|
||||
if (NOT CMAKE_COMPILER_IS_CLANG)
|
||||
CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
|
||||
endif()
|
||||
|
||||
# clang-14 complains about unused-but-set variable.
|
||||
CHECK_CXX_COMPILER_FLAG("-Wunused-but-set-variable" CXX_UNUSED_BUT_SET_VAR)
|
||||
if (CXX_UNUSED_BUT_SET_VAR)
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-but-set-variable")
|
||||
endif()
|
||||
|
||||
CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_IGNORED_ATTR)
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
if (CXX_IGNORED_ATTR)
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_NON_NULL)
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
if (CXX_NON_NULL)
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-nonnull")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# note this for later, g++ doesn't have this flag but clang does
|
||||
CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES)
|
||||
|
||||
CHECK_CXX_COMPILER_FLAG("-Wmissing-declarations" CXX_MISSING_DECLARATIONS)
|
||||
|
||||
CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS)
|
||||
|
||||
CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE)
|
||||
|
||||
# gcc complains about this
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW)
|
||||
CHECK_CXX_COMPILER_FLAG("-Wstringop-overflow" CXX_STRINGOP_OVERFLOW)
|
||||
if(CC_STRINGOP_OVERFLOW OR CXX_STRINGOP_OVERFLOW)
|
||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow -Wno-stringop-overread")
|
||||
set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-stringop-overflow -Wno-stringop-overread")
|
||||
endif()
|
||||
endif()
|
27
cmake/cflags-ppc64le.cmake
Normal file
27
cmake/cflags-ppc64le.cmake
Normal file
@ -0,0 +1,27 @@
|
||||
|
||||
CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H)
|
||||
|
||||
if (HAVE_C_PPC64EL_ALTIVEC_H)
|
||||
set (INTRIN_INC_H "altivec.h")
|
||||
else()
|
||||
message (FATAL_ERROR "No intrinsics header found for VSX")
|
||||
endif ()
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
|
||||
int main() {
|
||||
vector int a = vec_splat_s32(1);
|
||||
(void)a;
|
||||
}" HAVE_VSX)
|
||||
|
||||
if (NOT HAVE_VSX)
|
||||
message(FATAL_ERROR "VSX support required for Power support")
|
||||
endif ()
|
||||
|
||||
# fix unit-internal seg fault for freebsd and gcc13
|
||||
if (FREEBSD AND CMAKE_COMPILER_IS_GNUCXX)
|
||||
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "13")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -static-libstdc++")
|
||||
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
|
||||
endif ()
|
||||
endif ()
|
140
cmake/cflags-x86.cmake
Normal file
140
cmake/cflags-x86.cmake
Normal file
@ -0,0 +1,140 @@
|
||||
option(BUILD_AVX512 "Enabling support for AVX512" OFF)
|
||||
option(BUILD_AVX512VBMI "Enabling support for AVX512VBMI" OFF)
|
||||
|
||||
set(SKYLAKE_ARCH "skylake-avx512")
|
||||
set(ICELAKE_ARCH "icelake-server")
|
||||
set(SKYLAKE_FLAG "-march=${SKYLAKE_ARCH}")
|
||||
set(ICELAKE_FLAG "-march=${ICELAKE_ARCH}")
|
||||
|
||||
if (NOT FAT_RUNTIME)
|
||||
if (BUILD_AVX512VBMI)
|
||||
message (STATUS "AVX512VBMI implies AVX512, enabling BUILD_AVX512")
|
||||
set(BUILD_AVX512 ON)
|
||||
set(BUILD_AVX2 ON)
|
||||
set(ARCH_C_FLAGS "${ICELAKE_FLAG}")
|
||||
set(ARCH_CXX_FLAGS "${ICELAKE_FLAG}")
|
||||
set(X86_ARCH "${ICELAKE_ARCH}")
|
||||
elseif (BUILD_AVX512)
|
||||
message (STATUS "AVX512 implies AVX2, enabling BUILD_AVX2")
|
||||
set(BUILD_AVX2 ON)
|
||||
set(ARCH_C_FLAGS "${SKYLAKE_FLAG}")
|
||||
set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}")
|
||||
set(X86_ARCH "${SKYLAKE_ARCH}")
|
||||
elseif (BUILD_AVX2)
|
||||
message (STATUS "Enabling BUILD_AVX2")
|
||||
set(ARCH_C_FLAGS "-mavx2")
|
||||
set(ARCH_CXX_FLAGS "-mavx2")
|
||||
set(X86_ARCH "core-avx2")
|
||||
else()
|
||||
set(ARCH_C_FLAGS "-msse4.2")
|
||||
set(ARCH_CXX_FLAGS "-msse4.2")
|
||||
set(X86_ARCH "x86-64-v2")
|
||||
endif()
|
||||
else()
|
||||
set(BUILD_AVX512VBMI ON)
|
||||
set(BUILD_AVX512 ON)
|
||||
set(BUILD_AVX2 ON)
|
||||
set(ARCH_C_FLAGS "-msse4.2")
|
||||
set(ARCH_CXX_FLAGS "-msse4.2")
|
||||
set(X86_ARCH "x86-64-v2")
|
||||
endif()
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS "${ARCH_C_FLAGS}")
|
||||
CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
|
||||
CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
|
||||
CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
|
||||
CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
|
||||
|
||||
if (HAVE_C_X86INTRIN_H)
|
||||
set (INTRIN_INC_H "x86intrin.h")
|
||||
elseif (HAVE_C_INTRIN_H)
|
||||
set (INTRIN_INC_H "intrin.h")
|
||||
else()
|
||||
message (FATAL_ERROR "No intrinsics header found for SSE/AVX2/AVX512")
|
||||
endif ()
|
||||
|
||||
if (BUILD_AVX512)
|
||||
CHECK_C_COMPILER_FLAG(${SKYLAKE_FLAG} HAS_ARCH_SKYLAKE)
|
||||
if (NOT HAS_ARCH_SKYLAKE)
|
||||
message (FATAL_ERROR "AVX512 not supported by compiler")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (BUILD_AVX512VBMI)
|
||||
CHECK_C_COMPILER_FLAG(${ICELAKE_FLAG} HAS_ARCH_ICELAKE)
|
||||
if (NOT HAS_ARCH_ICELAKE)
|
||||
message (FATAL_ERROR "AVX512VBMI not supported by compiler")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
# ensure we have the minimum of SSE4.2 - call a SSE4.2 intrinsic
|
||||
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
|
||||
int main() {
|
||||
__m128i a = _mm_set1_epi8(1);
|
||||
(void)_mm_shuffle_epi8(a, a);
|
||||
}" HAVE_SSE42)
|
||||
|
||||
# now look for AVX2
|
||||
set(CMAKE_REQUIRED_FLAGS "-mavx2")
|
||||
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
|
||||
#if !defined(__AVX2__)
|
||||
#error no avx2
|
||||
#endif
|
||||
|
||||
int main(){
|
||||
__m256i z = _mm256_setzero_si256();
|
||||
(void)_mm256_xor_si256(z, z);
|
||||
}" HAVE_AVX2)
|
||||
|
||||
# and now for AVX512
|
||||
set(CMAKE_REQUIRED_FLAGS "${SKYLAKE_FLAG}")
|
||||
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
|
||||
#if !defined(__AVX512BW__)
|
||||
#error no avx512bw
|
||||
#endif
|
||||
|
||||
int main(){
|
||||
__m512i z = _mm512_setzero_si512();
|
||||
(void)_mm512_abs_epi8(z);
|
||||
}" HAVE_AVX512)
|
||||
|
||||
# and now for AVX512VBMI
|
||||
set(CMAKE_REQUIRED_FLAGS "${ICELAKE_FLAG}")
|
||||
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
|
||||
#if !defined(__AVX512VBMI__)
|
||||
#error no avx512vbmi
|
||||
#endif
|
||||
|
||||
int main(){
|
||||
__m512i a = _mm512_set1_epi8(0xFF);
|
||||
__m512i idx = _mm512_set_epi64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
|
||||
(void)_mm512_permutexvar_epi8(idx, a);
|
||||
}" HAVE_AVX512VBMI)
|
||||
|
||||
if (FAT_RUNTIME)
|
||||
if (NOT HAVE_SSE42)
|
||||
message(FATAL_ERROR "SSE4.2 support required to build fat runtime")
|
||||
endif ()
|
||||
if (BUILD_AVX2 AND NOT HAVE_AVX2)
|
||||
message(FATAL_ERROR "AVX2 support required to build fat runtime")
|
||||
endif ()
|
||||
if (BUILD_AVX512 AND NOT HAVE_AVX512)
|
||||
message(FATAL_ERROR "AVX512 support requested but not supported")
|
||||
endif ()
|
||||
if (BUILD_AVX512VBMI AND NOT HAVE_AVX512VBMI)
|
||||
message(FATAL_ERROR "AVX512VBMI support requested but not supported")
|
||||
endif ()
|
||||
else (NOT FAT_RUNTIME)
|
||||
if (NOT BUILD_AVX2)
|
||||
message(STATUS "Building without AVX2 support")
|
||||
endif ()
|
||||
if (NOT HAVE_AVX512)
|
||||
message(STATUS "Building without AVX512 support")
|
||||
endif ()
|
||||
if (NOT HAVE_AVX512VBMI)
|
||||
message(STATUS "Building without AVX512VBMI support")
|
||||
endif ()
|
||||
if (NOT HAVE_SSE42)
|
||||
message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required")
|
||||
endif ()
|
||||
endif ()
|
20
cmake/compiler.cmake
Normal file
20
cmake/compiler.cmake
Normal file
@ -0,0 +1,20 @@
|
||||
# determine compiler
|
||||
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
set(CMAKE_COMPILER_IS_CLANG TRUE)
|
||||
set(CLANGCXX_MINVER "5")
|
||||
message(STATUS "clang++ version ${CMAKE_CXX_COMPILER_VERSION}")
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS CLANGCXX_MINVER)
|
||||
message(FATAL_ERROR "A minimum of clang++ ${CLANGCXX_MINVER} is required for C++17 support")
|
||||
endif()
|
||||
string (REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)$" "\\1" CLANG_MAJOR_VERSION "${CMAKE_CXX_COMPILER_VERSION}")
|
||||
endif()
|
||||
|
||||
# compiler version checks TODO: test more compilers
|
||||
if (CMAKE_COMPILER_IS_GNUCXX)
|
||||
set(GNUCXX_MINVER "9")
|
||||
message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}")
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER)
|
||||
message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++17 support")
|
||||
endif()
|
||||
endif()
|
||||
|
@ -15,15 +15,42 @@
|
||||
/* "Define if building for EM64T" */
|
||||
#cmakedefine ARCH_X86_64
|
||||
|
||||
/* "Define if building for ARM32" */
|
||||
#cmakedefine ARCH_ARM32
|
||||
|
||||
/* "Define if building for AARCH64" */
|
||||
#cmakedefine ARCH_AARCH64
|
||||
|
||||
/* "Define if building for PPC64EL" */
|
||||
#cmakedefine ARCH_PPC64EL
|
||||
|
||||
/* "Define if cross compiling for AARCH64" */
|
||||
#cmakedefine CROSS_COMPILE_AARCH64
|
||||
|
||||
/* Define if building SVE for AARCH64. */
|
||||
#cmakedefine BUILD_SVE
|
||||
|
||||
/* Define if building SVE2 for AARCH64. */
|
||||
#cmakedefine BUILD_SVE2
|
||||
|
||||
/* Define if building SVE2+BITPERM for AARCH64. */
|
||||
#cmakedefine BUILD_SVE2_BITPERM
|
||||
|
||||
/* internal build, switch on dump support. */
|
||||
#cmakedefine DUMP_SUPPORT
|
||||
|
||||
/* Define if building "fat" runtime. */
|
||||
#cmakedefine FAT_RUNTIME
|
||||
|
||||
/* Define if building AVX2 in the fat runtime. */
|
||||
#cmakedefine BUILD_AVX2
|
||||
|
||||
/* Define if building AVX-512 in the fat runtime. */
|
||||
#cmakedefine BUILD_AVX512
|
||||
|
||||
/* Define if building AVX512VBMI in the fat runtime. */
|
||||
#cmakedefine BUILD_AVX512VBMI
|
||||
|
||||
/* Define to 1 if `backtrace' works. */
|
||||
#cmakedefine HAVE_BACKTRACE
|
||||
|
||||
@ -45,6 +72,15 @@
|
||||
/* C compiler has intrin.h */
|
||||
#cmakedefine HAVE_C_INTRIN_H
|
||||
|
||||
/* C compiler has arm_neon.h */
|
||||
#cmakedefine HAVE_C_ARM_NEON_H
|
||||
|
||||
/* C compiler has arm_sve.h */
|
||||
#cmakedefine HAVE_C_ARM_SVE_H
|
||||
|
||||
/* C compiler has arm_neon.h */
|
||||
#cmakedefine HAVE_C_PPC64EL_ALTIVEC_H
|
||||
|
||||
/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
|
||||
0 if you don't. */
|
||||
#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP
|
||||
|
@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
|
54
cmake/osdetection.cmake
Normal file
54
cmake/osdetection.cmake
Normal file
@ -0,0 +1,54 @@
|
||||
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
set(LINUX TRUE)
|
||||
endif(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
|
||||
if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
||||
set(FREEBSD true)
|
||||
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
|
||||
#FIXME: find a nicer and more general way of doing this
|
||||
if(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc13")
|
||||
set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc13")
|
||||
elseif(ARCH_AARCH64 AND (CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc12"))
|
||||
set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc12")
|
||||
endif()
|
||||
endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
||||
|
||||
if(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
|
||||
set(NETBSD true)
|
||||
endif(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
|
||||
|
||||
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
||||
set(MACOSX TRUE)
|
||||
endif()
|
||||
|
||||
if (ARCH_IA32 OR ARCH_X86_64)
|
||||
option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" ON)
|
||||
else()
|
||||
option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" OFF)
|
||||
endif()
|
||||
|
||||
if (FAT_RUNTIME)
|
||||
message("Checking Fat Runtime Requirements...")
|
||||
if (USE_CPU_NATIVE AND FAT_RUNTIME)
|
||||
message(FATAL_ERROR "Fat runtime is not compatible with Native CPU detection")
|
||||
endif()
|
||||
|
||||
if (NOT (ARCH_IA32 OR ARCH_X86_64 OR ARCH_AARCH64))
|
||||
message(FATAL_ERROR "Fat runtime is only supported on Intel and Aarch64 architectures")
|
||||
else()
|
||||
message(STATUS "Building Fat runtime for multiple microarchitectures")
|
||||
message(STATUS "generator is ${CMAKE_GENERATOR}")
|
||||
if (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR
|
||||
(CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja")))
|
||||
message (FATAL_ERROR "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher")
|
||||
else()
|
||||
include (${CMAKE_MODULE_PATH}/attrib.cmake)
|
||||
if (NOT HAS_C_ATTR_IFUNC)
|
||||
message(FATAL_ERROR "Compiler does not support ifunc attribute, cannot build fat runtime")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (NOT RELEASE_BUILD)
|
||||
message(FATAL_ERROR "Fat runtime is only built on Release builds")
|
||||
endif()
|
||||
endif ()
|
@ -30,7 +30,7 @@ if (PCRE_BUILD_SOURCE)
|
||||
#if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR < ${PCRE_REQUIRED_MINOR_VERSION}
|
||||
#error Incorrect pcre version
|
||||
#endif
|
||||
main() {}" CORRECT_PCRE_VERSION)
|
||||
int main(void) {return 0;}" CORRECT_PCRE_VERSION)
|
||||
set (CMAKE_REQUIRED_INCLUDES "${saved_INCLUDES}")
|
||||
|
||||
if (NOT CORRECT_PCRE_VERSION)
|
||||
|
@ -1,9 +1,12 @@
|
||||
# determine the target arch
|
||||
|
||||
# really only interested in the preprocessor here
|
||||
CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT)
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT)
|
||||
|
||||
set(ARCH_X86_64 ${ARCH_64_BIT})
|
||||
set(ARCH_IA32 ${ARCH_32_BIT})
|
||||
CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64)
|
||||
CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)
|
||||
CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
|
||||
CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)
|
||||
CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL)
|
||||
if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL)
|
||||
set(ARCH_64_BIT TRUE)
|
||||
else()
|
||||
set(ARCH_32_BIT TRUE)
|
||||
endif()
|
||||
|
@ -7,7 +7,7 @@ function(ragelmaker src_rl)
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${src_dir}/${src_file}.cpp
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${src_dir}
|
||||
COMMAND ${RAGEL} ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl} -o ${rl_out}
|
||||
COMMAND ${RAGEL} ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl} -o ${rl_out} -G0
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl}
|
||||
)
|
||||
add_custom_target(ragel_${src_file} DEPENDS ${rl_out})
|
||||
|
40
cmake/sanitize.cmake
Normal file
40
cmake/sanitize.cmake
Normal file
@ -0,0 +1,40 @@
|
||||
# Possible values:
|
||||
# - `address` (ASan)
|
||||
# - `memory` (MSan)
|
||||
# - `undefined` (UBSan)
|
||||
# - "" (no sanitizing)
|
||||
option (SANITIZE "Enable one of the code sanitizers" "")
|
||||
|
||||
set (SAN_FLAGS "${SAN_FLAGS} -g -fno-omit-frame-pointer -DSANITIZER")
|
||||
|
||||
if (SANITIZE)
|
||||
if (SANITIZE STREQUAL "address")
|
||||
set (ASAN_FLAGS "-fsanitize=address -fsanitize-address-use-after-scope")
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}")
|
||||
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}")
|
||||
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_FLAGS}")
|
||||
endif()
|
||||
|
||||
elseif (SANITIZE STREQUAL "memory")
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
set (FATAL_ERROR "GCC does not have memory sanitizer")
|
||||
endif()
|
||||
# MemorySanitizer flags are set according to the official documentation:
|
||||
# https://clang.llvm.org/docs/MemorySanitizer.html#usage
|
||||
set (MSAN_FLAGS "-fsanitize=memory -fsanitize-memory-use-after-dtor -fsanitize-memory-track-origins -fno-optimize-sibling-calls")
|
||||
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}")
|
||||
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}")
|
||||
elseif (SANITIZE STREQUAL "undefined")
|
||||
set (UBSAN_FLAGS "-fsanitize=undefined")
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}")
|
||||
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}")
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined")
|
||||
endif()
|
||||
else ()
|
||||
message (FATAL_ERROR "Unknown sanitizer type: ${SANITIZE}")
|
||||
endif ()
|
||||
endif()
|
40
cmake/simde.cmake
Normal file
40
cmake/simde.cmake
Normal file
@ -0,0 +1,40 @@
|
||||
LIST(APPEND CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/simde)
|
||||
|
||||
CHECK_INCLUDE_FILES(simde/x86/sse4.2.h SIMDE_SSE42_H_FOUND)
|
||||
|
||||
if (SIMDE_SSE42_H_FOUND)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND")
|
||||
include_directories(${PROJECT_SOURCE_DIR}/simde)
|
||||
|
||||
if (CMAKE_COMPILER_IS_CLANG)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
|
||||
if (ARCH_PPC64EL)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecated-altivec-src-compat")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-altivec-src-compat")
|
||||
if (CLANG_MAJOR_VERSION EQUAL 15)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecate-lax-vec-conv-all")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecate-lax-vec-conv-all")
|
||||
endif ()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (BUILD_SSE2_SIMDE)
|
||||
message("using BUILD_SSE2_SIMDE..")
|
||||
set(SIMDE_NATIVE true)
|
||||
set(ARCH_C_FLAGS "-msse2")
|
||||
set(ARCH_CXX_FLAGS "-msse2")
|
||||
set(X86_ARCH "x86-64")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DVS_SIMDE_BACKEND")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DVS_SIMDE_BACKEND")
|
||||
endif()
|
||||
|
||||
if (SIMDE_NATIVE AND NOT BUILD_SSE2_SIMDE)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
|
||||
endif()
|
||||
|
||||
else()
|
||||
message(FATAL_ERROR "SIMDe backend requested but SIMDe is not available on the system")
|
||||
endif()
|
@ -1,53 +1,19 @@
|
||||
#
|
||||
# a lot of noise to find sqlite
|
||||
# sqlite is only used in hsbench, no need to special case its build, depend only on OS installations using pkg-config
|
||||
#
|
||||
|
||||
option(SQLITE_PREFER_STATIC "Build sqlite3 statically instead of using an installed lib" OFF)
|
||||
|
||||
if(NOT WIN32 AND NOT SQLITE_PREFER_STATIC)
|
||||
find_package(PkgConfig QUIET)
|
||||
|
||||
# first check for sqlite on the system
|
||||
pkg_check_modules(SQLITE3 sqlite3)
|
||||
endif()
|
||||
|
||||
if (NOT SQLITE3_FOUND)
|
||||
message(STATUS "looking for sqlite3 in source tree")
|
||||
# look in the source tree
|
||||
if (EXISTS "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.h" AND
|
||||
EXISTS "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c")
|
||||
message(STATUS " found sqlite3 in source tree")
|
||||
set(SQLITE3_FOUND TRUE)
|
||||
set(SQLITE3_BUILD_SOURCE TRUE)
|
||||
set(SQLITE3_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/sqlite3")
|
||||
set(SQLITE3_LDFLAGS sqlite3_static)
|
||||
else()
|
||||
message(STATUS " no sqlite3 in source tree")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# now do version checks
|
||||
if (SQLITE3_FOUND)
|
||||
list(INSERT CMAKE_REQUIRED_INCLUDES 0 "${SQLITE3_INCLUDE_DIRS}")
|
||||
CHECK_C_SOURCE_COMPILES("#include <sqlite3.h>\n#if SQLITE_VERSION_NUMBER >= 3008007 && SQLITE_VERSION_NUMBER < 3008010\n#error broken sqlite\n#endif\nint main() {return 0;}" SQLITE_VERSION_OK)
|
||||
if (NOT SQLITE_VERSION_OK)
|
||||
if (SQLITE_VERSION LESS "3.8.10")
|
||||
message(FATAL_ERROR "sqlite3 is broken from 3.8.7 to 3.8.10 - please find a working version")
|
||||
endif()
|
||||
if (NOT SQLITE3_BUILD_SOURCE)
|
||||
set(_SAVED_FLAGS ${CMAKE_REQUIRED_FLAGS})
|
||||
|
||||
list(INSERT CMAKE_REQUIRED_LIBRARIES 0 ${SQLITE3_LDFLAGS})
|
||||
CHECK_SYMBOL_EXISTS(sqlite3_open_v2 sqlite3.h HAVE_SQLITE3_OPEN_V2)
|
||||
list(REMOVE_ITEM CMAKE_REQUIRED_INCLUDES "${SQLITE3_INCLUDE_DIRS}")
|
||||
list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES ${SQLITE3_LDFLAGS})
|
||||
else()
|
||||
if (NOT TARGET sqlite3_static)
|
||||
# build sqlite as a static lib to compile into our test programs
|
||||
add_library(sqlite3_static STATIC "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c")
|
||||
if (NOT WIN32)
|
||||
set_target_properties(sqlite3_static PROPERTIES COMPILE_FLAGS "-Wno-error -Wno-extra -Wno-unused -Wno-cast-qual -DSQLITE_OMIT_LOAD_EXTENSION")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# that's enough about sqlite
|
||||
|
15
cppcheck-suppression-list.txt
Normal file
15
cppcheck-suppression-list.txt
Normal file
@ -0,0 +1,15 @@
|
||||
unknownMacro:*gtest-all.cc
|
||||
knownConditionTrueFalse:*Parser.rl
|
||||
knownConditionTrueFalse:*Parser.cpp
|
||||
variableScope:*Parser.rl
|
||||
duplicateBreak:*.rl
|
||||
unreadVariable:*control_verbs.cpp
|
||||
unreachableCode:*rose_build_dump.cpp
|
||||
*:*simde/*
|
||||
assertWithSideEffect
|
||||
syntaxError
|
||||
internalError
|
||||
checkersReport
|
||||
missingInclude
|
||||
missingIncludeSystem
|
||||
unmatchedSuppression
|
@ -19,6 +19,7 @@ else()
|
||||
set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
|
||||
set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
|
||||
set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
|
||||
set(SPHINX_MAN_DIR "${CMAKE_CURRENT_BINARY_DIR}/man")
|
||||
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/conf.py" @ONLY)
|
||||
@ -32,4 +33,14 @@ add_custom_target(dev-reference
|
||||
"${SPHINX_HTML_DIR}"
|
||||
DEPENDS dev-reference-doxygen
|
||||
COMMENT "Building HTML dev reference with Sphinx")
|
||||
|
||||
add_custom_target(dev-reference-man
|
||||
${SPHINX_BUILD}
|
||||
-b man
|
||||
-c "${CMAKE_CURRENT_BINARY_DIR}"
|
||||
-d "${SPHINX_CACHE_DIR}"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}"
|
||||
"${SPHINX_MAN_DIR}"
|
||||
DEPENDS dev-reference-doxygen
|
||||
COMMENT "Building man page reference with Sphinx")
|
||||
endif()
|
||||
|
@ -11,10 +11,10 @@ Introduction
|
||||
************
|
||||
|
||||
Chimera is a software regular expression matching engine that is a hybrid of
|
||||
Hyperscan and PCRE. The design goals of Chimera are to fully support PCRE
|
||||
syntax as well as to take advantage of the high performance nature of Hyperscan.
|
||||
Vectorscan and PCRE. The design goals of Chimera are to fully support PCRE
|
||||
syntax as well as to take advantage of the high performance nature of Vectorscan.
|
||||
|
||||
Chimera inherits the design guideline of Hyperscan with C APIs for compilation
|
||||
Chimera inherits the design guideline of Vectorscan with C APIs for compilation
|
||||
and scanning.
|
||||
|
||||
The Chimera API itself is composed of two major components:
|
||||
@ -65,13 +65,13 @@ For a given database, Chimera provides several guarantees:
|
||||
.. note:: Chimera is designed to have the same matching behavior as PCRE,
|
||||
including greedy/ungreedy, capturing, etc. Chimera reports both
|
||||
**start offset** and **end offset** for each match like PCRE. Different
|
||||
from the fashion of reporting all matches in Hyperscan, Chimera only reports
|
||||
from the fashion of reporting all matches in Vectorscan, Chimera only reports
|
||||
non-overlapping matches. For example, the pattern :regexp:`/foofoo/` will
|
||||
match ``foofoofoofoo`` at offsets (0, 6) and (6, 12).
|
||||
|
||||
.. note:: Since Chimera is a hybrid of Hyperscan and PCRE in order to support
|
||||
.. note:: Since Chimera is a hybrid of Vectorscan and PCRE in order to support
|
||||
full PCRE syntax, there will be extra performance overhead compared to
|
||||
Hyperscan-only solution. Please always use Hyperscan for better performance
|
||||
Vectorscan-only solution. Please always use Vectorscan for better performance
|
||||
unless you must need full PCRE syntax support.
|
||||
|
||||
See :ref:`chruntime` for more details
|
||||
@ -83,12 +83,12 @@ Requirements
|
||||
The PCRE library (http://pcre.org/) version 8.41 is required for Chimera.
|
||||
|
||||
.. note:: Since Chimera needs to reference PCRE internal function, please place PCRE source
|
||||
directory under Hyperscan root directory in order to build Chimera.
|
||||
directory under Vectorscan root directory in order to build Chimera.
|
||||
|
||||
Beside this, both hardware and software requirements of Chimera are the same to Hyperscan.
|
||||
Beside this, both hardware and software requirements of Chimera are the same to Vectorscan.
|
||||
See :ref:`hardware` and :ref:`software` for more details.
|
||||
|
||||
.. note:: Building Hyperscan will automatically generate Chimera library.
|
||||
.. note:: Building Vectorscan will automatically generate Chimera library.
|
||||
Currently only static library is supported for Chimera, so please
|
||||
use static build type when configure CMake build options.
|
||||
|
||||
@ -119,7 +119,7 @@ databases:
|
||||
|
||||
Compilation allows the Chimera library to analyze the given pattern(s) and
|
||||
pre-determine how to scan for these patterns in an optimized fashion using
|
||||
Hyperscan and PCRE.
|
||||
Vectorscan and PCRE.
|
||||
|
||||
===============
|
||||
Pattern Support
|
||||
@ -134,7 +134,7 @@ Semantics
|
||||
=========
|
||||
|
||||
Chimera supports the exact same semantics of PCRE library. Moreover, it supports
|
||||
multiple simultaneous pattern matching like Hyperscan and the multiple matches
|
||||
multiple simultaneous pattern matching like Vectorscan and the multiple matches
|
||||
will be reported in order by end offset.
|
||||
|
||||
.. _chruntime:
|
||||
@ -212,7 +212,7 @@ space is required for that context.
|
||||
In the absence of recursive scanning, only one such space is required per thread
|
||||
and can (and indeed should) be allocated before data scanning is to commence.
|
||||
|
||||
In a scenario where a set of expressions are compiled by a single "master"
|
||||
In a scenario where a set of expressions are compiled by a single "main"
|
||||
thread and data will be scanned by multiple "worker" threads, the convenience
|
||||
function :c:func:`ch_clone_scratch` allows multiple copies of an existing
|
||||
scratch space to be made for each thread (rather than forcing the caller to pass
|
||||
|
@ -9,7 +9,7 @@ Compiling Patterns
|
||||
Building a Database
|
||||
*******************
|
||||
|
||||
The Hyperscan compiler API accepts regular expressions and converts them into a
|
||||
The Vectorscan compiler API accepts regular expressions and converts them into a
|
||||
compiled pattern database that can then be used to scan data.
|
||||
|
||||
The API provides three functions that compile regular expressions into
|
||||
@ -24,7 +24,7 @@ databases:
|
||||
#. :c:func:`hs_compile_ext_multi`: compiles an array of expressions as above,
|
||||
but allows :ref:`extparam` to be specified for each expression.
|
||||
|
||||
Compilation allows the Hyperscan library to analyze the given pattern(s) and
|
||||
Compilation allows the Vectorscan library to analyze the given pattern(s) and
|
||||
pre-determine how to scan for these patterns in an optimized fashion that would
|
||||
be far too expensive to compute at run-time.
|
||||
|
||||
@ -48,10 +48,10 @@ To compile patterns to be used in streaming mode, the ``mode`` parameter of
|
||||
block mode requires the use of :c:member:`HS_MODE_BLOCK` and vectored mode
|
||||
requires the use of :c:member:`HS_MODE_VECTORED`. A pattern database compiled
|
||||
for one mode (streaming, block or vectored) can only be used in that mode. The
|
||||
version of Hyperscan used to produce a compiled pattern database must match the
|
||||
version of Hyperscan used to scan with it.
|
||||
version of Vectorscan used to produce a compiled pattern database must match the
|
||||
version of Vectorscan used to scan with it.
|
||||
|
||||
Hyperscan provides support for targeting a database at a particular CPU
|
||||
Vectorscan provides support for targeting a database at a particular CPU
|
||||
platform; see :ref:`instr_specialization` for details.
|
||||
|
||||
=====================
|
||||
@ -64,25 +64,25 @@ interpreted independently. No syntax association happens between any adjacent
|
||||
characters.
|
||||
|
||||
For example, given an expression written as :regexp:`/bc?/`. We could say it is
|
||||
a regluar expression, with the meaning that character ``b`` followed by nothing
|
||||
a regular expression, with the meaning that character ``b`` followed by nothing
|
||||
or by one character ``c``. On the other view, we could also say it is a pure
|
||||
literal expression, with the meaning that this is a character sequence of 3-byte
|
||||
length, containing characters ``b``, ``c`` and ``?``. In regular case, the
|
||||
question mark character ``?`` has a particular syntax role called 0-1 quantifier,
|
||||
which has an syntax association with the character ahead of it. Similar
|
||||
characters exist in regular grammer like ``[``, ``]``, ``(``, ``)``, ``{``,
|
||||
which has a syntax association with the character ahead of it. Similar
|
||||
characters exist in regular grammar like ``[``, ``]``, ``(``, ``)``, ``{``,
|
||||
``}``, ``-``, ``*``, ``+``, ``\``, ``|``, ``/``, ``:``, ``^``, ``.``, ``$``.
|
||||
While in pure literal case, all these meta characters lost extra meanings
|
||||
expect for that they are just common ASCII codes.
|
||||
|
||||
Hyperscan is initially designed to process common regular expressions. It is
|
||||
hence embedded with a complex parser to do comprehensive regular grammer
|
||||
interpretion. Particularly, the identification of above meta characters is the
|
||||
basic step for the interpretion of far more complex regular grammers.
|
||||
Vectorscan is initially designed to process common regular expressions. It is
|
||||
hence embedded with a complex parser to do comprehensive regular grammar
|
||||
interpretation. Particularly, the identification of above meta characters is the
|
||||
basic step for the interpretation of far more complex regular grammars.
|
||||
|
||||
However in real cases, patterns may not always be regular expressions. They
|
||||
could just be pure literals. Problem will come if the pure literals contain
|
||||
regular meta characters. Supposing fed directly into traditional Hyperscan
|
||||
regular meta characters. Supposing fed directly into traditional Vectorscan
|
||||
compile API, all these meta characters will be interpreted in predefined ways,
|
||||
which is unnecessary and the result is totally out of expectation. To avoid
|
||||
such misunderstanding by traditional API, users have to preprocess these
|
||||
@ -90,7 +90,7 @@ literal patterns by converting the meta characters into some other formats:
|
||||
either by adding a backslash ``\`` before certain meta characters, or by
|
||||
converting all the characters into a hexadecimal representation.
|
||||
|
||||
In ``v5.2.0``, Hyperscan introduces 2 new compile APIs for pure literal patterns:
|
||||
In ``v5.2.0``, Vectorscan introduces 2 new compile APIs for pure literal patterns:
|
||||
|
||||
#. :c:func:`hs_compile_lit`: compiles a single pure literal into a pattern
|
||||
database.
|
||||
@ -106,7 +106,7 @@ content directly into these APIs without worrying about writing regular meta
|
||||
characters in their patterns. No preprocessing work is needed any more.
|
||||
|
||||
For new APIs, the ``length`` of each literal pattern is a newly added parameter.
|
||||
Hyperscan needs to locate the end position of the input expression via clearly
|
||||
Vectorscan needs to locate the end position of the input expression via clearly
|
||||
knowing each literal's length, not by simply identifying character ``\0`` of a
|
||||
string.
|
||||
|
||||
@ -127,19 +127,19 @@ Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_SINGLEMATCH`,
|
||||
Pattern Support
|
||||
***************
|
||||
|
||||
Hyperscan supports the pattern syntax used by the PCRE library ("libpcre"),
|
||||
Vectorscan supports the pattern syntax used by the PCRE library ("libpcre"),
|
||||
described at <http://www.pcre.org/>. However, not all constructs available in
|
||||
libpcre are supported. The use of unsupported constructs will result in
|
||||
compilation errors.
|
||||
|
||||
The version of PCRE used to validate Hyperscan's interpretation of this syntax
|
||||
The version of PCRE used to validate Vectorscan's interpretation of this syntax
|
||||
is 8.41 or above.
|
||||
|
||||
====================
|
||||
Supported Constructs
|
||||
====================
|
||||
|
||||
The following regex constructs are supported by Hyperscan:
|
||||
The following regex constructs are supported by Vectorscan:
|
||||
|
||||
* Literal characters and strings, with all libpcre quoting and character
|
||||
escapes.
|
||||
@ -165,7 +165,7 @@ The following regex constructs are supported by Hyperscan:
|
||||
:regexp:`{n,}` are supported with limitations.
|
||||
|
||||
* For arbitrary repeated sub-patterns: *n* and *m* should be either small
|
||||
or infinite, e.g. :regexp:`(a|b}{4}`, :regexp:`(ab?c?d){4,10}` or
|
||||
or infinite, e.g. :regexp:`(a|b){4}`, :regexp:`(ab?c?d){4,10}` or
|
||||
:regexp:`(ab(cd)*){6,}`.
|
||||
|
||||
* For single-character width sub-patterns such as :regexp:`[^\\a]` or
|
||||
@ -177,7 +177,7 @@ The following regex constructs are supported by Hyperscan:
|
||||
:c:member:`HS_FLAG_SINGLEMATCH` flag is on for that pattern.
|
||||
|
||||
* Lazy modifiers (:regexp:`?` appended to another quantifier, e.g.
|
||||
:regexp:`\\w+?`) are supported but ignored (as Hyperscan reports all
|
||||
:regexp:`\\w+?`) are supported but ignored (as Vectorscan reports all
|
||||
matches).
|
||||
|
||||
* Parenthesization, including the named and unnamed capturing and
|
||||
@ -219,15 +219,15 @@ The following regex constructs are supported by Hyperscan:
|
||||
.. note:: At this time, not all patterns can be successfully compiled with the
|
||||
:c:member:`HS_FLAG_SOM_LEFTMOST` flag, which enables per-pattern support for
|
||||
:ref:`som`. The patterns that support this flag are a subset of patterns that
|
||||
can be successfully compiled with Hyperscan; notably, many bounded repeat
|
||||
forms that can be compiled with Hyperscan without the Start of Match flag
|
||||
can be successfully compiled with Vectorscan; notably, many bounded repeat
|
||||
forms that can be compiled with Vectorscan without the Start of Match flag
|
||||
enabled cannot be compiled with the flag enabled.
|
||||
|
||||
======================
|
||||
Unsupported Constructs
|
||||
======================
|
||||
|
||||
The following regex constructs are not supported by Hyperscan:
|
||||
The following regex constructs are not supported by Vectorscan:
|
||||
|
||||
* Backreferences and capturing sub-expressions.
|
||||
* Arbitrary zero-width assertions.
|
||||
@ -246,32 +246,32 @@ The following regex constructs are not supported by Hyperscan:
|
||||
Semantics
|
||||
*********
|
||||
|
||||
While Hyperscan follows libpcre syntax, it provides different semantics. The
|
||||
While Vectorscan follows libpcre syntax, it provides different semantics. The
|
||||
major departures from libpcre semantics are motivated by the requirements of
|
||||
streaming and multiple simultaneous pattern matching.
|
||||
|
||||
The major departures from libpcre semantics are:
|
||||
|
||||
#. **Multiple pattern matching**: Hyperscan allows matches to be reported for
|
||||
#. **Multiple pattern matching**: Vectorscan allows matches to be reported for
|
||||
several patterns simultaneously. This is not equivalent to separating the
|
||||
patterns by :regexp:`|` in libpcre, which evaluates alternations
|
||||
left-to-right.
|
||||
|
||||
#. **Lack of ordering**: the multiple matches that Hyperscan produces are not
|
||||
#. **Lack of ordering**: the multiple matches that Vectorscan produces are not
|
||||
guaranteed to be ordered, although they will always fall within the bounds of
|
||||
the current scan.
|
||||
|
||||
#. **End offsets only**: Hyperscan's default behaviour is only to report the end
|
||||
#. **End offsets only**: Vectorscan's default behaviour is only to report the end
|
||||
offset of a match. Reporting of the start offset can be enabled with
|
||||
per-expression flags at pattern compile time. See :ref:`som` for details.
|
||||
|
||||
#. **"All matches" reported**: scanning :regexp:`/foo.*bar/` against
|
||||
``fooxyzbarbar`` will return two matches from Hyperscan -- at the points
|
||||
``fooxyzbarbar`` will return two matches from Vectorscan -- at the points
|
||||
corresponding to the ends of ``fooxyzbar`` and ``fooxyzbarbar``. In contrast,
|
||||
libpcre semantics by default would report only one match at ``fooxyzbarbar``
|
||||
(greedy semantics) or, if non-greedy semantics were switched on, one match at
|
||||
``fooxyzbar``. This means that switching between greedy and non-greedy
|
||||
semantics is a no-op in Hyperscan.
|
||||
semantics is a no-op in Vectorscan.
|
||||
|
||||
To support libpcre quantifier semantics while accurately reporting streaming
|
||||
matches at the time they occur is impossible. For example, consider the pattern
|
||||
@ -299,7 +299,7 @@ as in block 3 -- which would constitute a better match for the pattern.
|
||||
Start of Match
|
||||
==============
|
||||
|
||||
In standard operation, Hyperscan will only provide the end offset of a match
|
||||
In standard operation, Vectorscan will only provide the end offset of a match
|
||||
when the match callback is called. If the :c:member:`HS_FLAG_SOM_LEFTMOST` flag
|
||||
is specified for a particular pattern, then the same set of matches is
|
||||
returned, but each match will also provide the leftmost possible start offset
|
||||
@ -308,7 +308,7 @@ corresponding to its end offset.
|
||||
Using the SOM flag entails a number of trade-offs and limitations:
|
||||
|
||||
* Reduced pattern support: For many patterns, tracking SOM is complex and can
|
||||
result in Hyperscan failing to compile a pattern with a "Pattern too
|
||||
result in Vectorscan failing to compile a pattern with a "Pattern too
|
||||
large" error, even if the pattern is supported in normal operation.
|
||||
* Increased stream state: At scan time, state space is required to track
|
||||
potential SOM offsets, and this must be stored in persistent stream state in
|
||||
@ -316,20 +316,20 @@ Using the SOM flag entails a number of trade-offs and limitations:
|
||||
required to match a pattern.
|
||||
* Performance overhead: Similarly, there is generally a performance cost
|
||||
associated with tracking SOM.
|
||||
* Incompatible features: Some other Hyperscan pattern flags (such as
|
||||
* Incompatible features: Some other Vectorscan pattern flags (such as
|
||||
:c:member:`HS_FLAG_SINGLEMATCH` and :c:member:`HS_FLAG_PREFILTER`) can not be
|
||||
used in combination with SOM. Specifying them together with
|
||||
:c:member:`HS_FLAG_SOM_LEFTMOST` will result in a compilation error.
|
||||
|
||||
In streaming mode, the amount of precision delivered by SOM can be controlled
|
||||
with the SOM horizon flags. These instruct Hyperscan to deliver accurate SOM
|
||||
with the SOM horizon flags. These instruct Vectorscan to deliver accurate SOM
|
||||
information within a certain distance of the end offset, and return a special
|
||||
start offset of :c:member:`HS_OFFSET_PAST_HORIZON` otherwise. Specifying a
|
||||
small or medium SOM horizon will usually reduce the stream state required for a
|
||||
given database.
|
||||
|
||||
.. note:: In streaming mode, the start offset returned for a match may refer to
|
||||
a point in the stream *before* the current block being scanned. Hyperscan
|
||||
a point in the stream *before* the current block being scanned. Vectorscan
|
||||
provides no facility for accessing earlier blocks; if the calling application
|
||||
needs to inspect historical data, then it must store it itself.
|
||||
|
||||
@ -341,7 +341,7 @@ Extended Parameters
|
||||
|
||||
In some circumstances, more control over the matching behaviour of a pattern is
|
||||
required than can be specified easily using regular expression syntax. For
|
||||
these scenarios, Hyperscan provides the :c:func:`hs_compile_ext_multi` function
|
||||
these scenarios, Vectorscan provides the :c:func:`hs_compile_ext_multi` function
|
||||
that allows a set of "extended parameters" to be set on a per-pattern basis.
|
||||
|
||||
Extended parameters are specified using an :c:type:`hs_expr_ext_t` structure,
|
||||
@ -383,18 +383,18 @@ section.
|
||||
Prefiltering Mode
|
||||
=================
|
||||
|
||||
Hyperscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
|
||||
be used to implement a prefilter for a pattern than Hyperscan would not
|
||||
Vectorscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
|
||||
be used to implement a prefilter for a pattern than Vectorscan would not
|
||||
ordinarily support.
|
||||
|
||||
This flag instructs Hyperscan to compile an "approximate" version of this
|
||||
pattern for use in a prefiltering application, even if Hyperscan does not
|
||||
This flag instructs Vectorscan to compile an "approximate" version of this
|
||||
pattern for use in a prefiltering application, even if Vectorscan does not
|
||||
support the pattern in normal operation.
|
||||
|
||||
The set of matches returned when this flag is used is guaranteed to be a
|
||||
superset of the matches specified by the non-prefiltering expression.
|
||||
|
||||
If the pattern contains pattern constructs not supported by Hyperscan (such as
|
||||
If the pattern contains pattern constructs not supported by Vectorscan (such as
|
||||
zero-width assertions, back-references or conditional references) these
|
||||
constructs will be replaced internally with broader constructs that may match
|
||||
more often.
|
||||
@ -404,7 +404,7 @@ back-reference :regexp:`\\1`. In prefiltering mode, this pattern might be
|
||||
approximated by having its back-reference replaced with its referent, forming
|
||||
:regexp:`/\\w+ again \\w+/`.
|
||||
|
||||
Furthermore, in prefiltering mode Hyperscan may simplify a pattern that would
|
||||
Furthermore, in prefiltering mode Vectorscan may simplify a pattern that would
|
||||
otherwise return a "Pattern too large" error at compile time, or for performance
|
||||
reasons (subject to the matching guarantee above).
|
||||
|
||||
@ -422,22 +422,22 @@ matches for the pattern.
|
||||
Instruction Set Specialization
|
||||
******************************
|
||||
|
||||
Hyperscan is able to make use of several modern instruction set features found
|
||||
Vectorscan is able to make use of several modern instruction set features found
|
||||
on x86 processors to provide improvements in scanning performance.
|
||||
|
||||
Some of these features are selected when the library is built; for example,
|
||||
Hyperscan will use the native ``POPCNT`` instruction on processors where it is
|
||||
Vectorscan will use the native ``POPCNT`` instruction on processors where it is
|
||||
available and the library has been optimized for the host architecture.
|
||||
|
||||
.. note:: By default, the Hyperscan runtime is built with the ``-march=native``
|
||||
.. note:: By default, the Vectorscan runtime is built with the ``-march=native``
|
||||
compiler flag and (where possible) will make use of all instructions known by
|
||||
the host's C compiler.
|
||||
|
||||
To use some instruction set features, however, Hyperscan must build a
|
||||
To use some instruction set features, however, Vectorscan must build a
|
||||
specialized database to support them. This means that the target platform must
|
||||
be specified at pattern compile time.
|
||||
|
||||
The Hyperscan compiler API functions all accept an optional
|
||||
The Vectorscan compiler API functions all accept an optional
|
||||
:c:type:`hs_platform_info_t` argument, which describes the target platform
|
||||
for the database to be built. If this argument is NULL, the database will be
|
||||
targeted at the current host platform.
|
||||
@ -467,7 +467,7 @@ See :ref:`api_constants` for the full list of CPU tuning and feature flags.
|
||||
Approximate matching
|
||||
********************
|
||||
|
||||
Hyperscan provides an experimental approximate matching mode, which will match
|
||||
Vectorscan provides an experimental approximate matching mode, which will match
|
||||
patterns within a given edit distance. The exact matching behavior is defined as
|
||||
follows:
|
||||
|
||||
@ -492,7 +492,7 @@ follows:
|
||||
|
||||
Here are a few examples of approximate matching:
|
||||
|
||||
* Pattern :regexp:`/foo/` can match ``foo`` when using regular Hyperscan
|
||||
* Pattern :regexp:`/foo/` can match ``foo`` when using regular Vectorscan
|
||||
matching behavior. With approximate matching within edit distance 2, the
|
||||
pattern will produce matches when scanned against ``foo``, ``foooo``, ``f00``,
|
||||
``f``, and anything else that lies within edit distance 2 of matching corpora
|
||||
@ -513,7 +513,7 @@ matching support. Here they are, in a nutshell:
|
||||
* Reduced pattern support:
|
||||
|
||||
* For many patterns, approximate matching is complex and can result in
|
||||
Hyperscan failing to compile a pattern with a "Pattern too large" error,
|
||||
Vectorscan failing to compile a pattern with a "Pattern too large" error,
|
||||
even if the pattern is supported in normal operation.
|
||||
* Additionally, some patterns cannot be approximately matched because they
|
||||
reduce to so-called "vacuous" patterns (patterns that match everything). For
|
||||
@ -548,7 +548,7 @@ Logical Combinations
|
||||
********************
|
||||
|
||||
For situations when a user requires behaviour that depends on the presence or
|
||||
absence of matches from groups of patterns, Hyperscan provides support for the
|
||||
absence of matches from groups of patterns, Vectorscan provides support for the
|
||||
logical combination of patterns in a given pattern set, with three operators:
|
||||
``NOT``, ``AND`` and ``OR``.
|
||||
|
||||
@ -561,7 +561,7 @@ offset is *true* if the expression it refers to is *false* at this offset.
|
||||
For example, ``NOT 101`` means that expression 101 has not yet matched at this
|
||||
offset.
|
||||
|
||||
A logical combination is passed to Hyperscan at compile time as an expression.
|
||||
A logical combination is passed to Vectorscan at compile time as an expression.
|
||||
This combination expression will raise matches at every offset where one of its
|
||||
sub-expressions matches and the logical value of the whole expression is *true*.
|
||||
|
||||
@ -603,7 +603,7 @@ In a logical combination expression:
|
||||
* Whitespace is ignored.
|
||||
|
||||
To use a logical combination expression, it must be passed to one of the
|
||||
Hyperscan compile functions (:c:func:`hs_compile_multi`,
|
||||
Vectorscan compile functions (:c:func:`hs_compile_multi`,
|
||||
:c:func:`hs_compile_ext_multi`) along with the :c:member:`HS_FLAG_COMBINATION` flag,
|
||||
which identifies the pattern as a logical combination expression. The patterns
|
||||
referred to in the logical combination expression must be compiled together in
|
||||
@ -613,7 +613,7 @@ When an expression has the :c:member:`HS_FLAG_COMBINATION` flag set, it ignores
|
||||
all other flags except the :c:member:`HS_FLAG_SINGLEMATCH` flag and the
|
||||
:c:member:`HS_FLAG_QUIET` flag.
|
||||
|
||||
Hyperscan will accept logical combination expressions at compile time that
|
||||
Vectorscan will accept logical combination expressions at compile time that
|
||||
evaluate to *true* when no patterns have matched, and report the match for
|
||||
combination at end of data if no patterns have matched; for example: ::
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Hyperscan documentation build configuration file, created by
|
||||
# Vectorscan documentation build configuration file, created by
|
||||
# sphinx-quickstart on Tue Sep 29 15:59:19 2015.
|
||||
#
|
||||
# This file is execfile()d with the current directory set to its
|
||||
@ -43,8 +43,8 @@ source_suffix = '.rst'
|
||||
master_doc = 'index'
|
||||
|
||||
# General information about the project.
|
||||
project = u'Hyperscan'
|
||||
copyright = u'2015-2018, Intel Corporation'
|
||||
project = u'Vectorscan'
|
||||
copyright = u'2015-2020, Intel Corporation; 2020-2024, VectorCamp; and other contributors'
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement for
|
||||
# |version| and |release|, also used in various other places throughout the
|
||||
@ -202,7 +202,7 @@ latex_elements = {
|
||||
# (source start file, target name, title,
|
||||
# author, documentclass [howto, manual, or own class]).
|
||||
latex_documents = [
|
||||
('index', 'Hyperscan.tex', u'Hyperscan Documentation',
|
||||
('index', 'Hyperscan.tex', u'Vectorscan Documentation',
|
||||
u'Intel Corporation', 'manual'),
|
||||
]
|
||||
|
||||
@ -232,8 +232,8 @@ latex_documents = [
|
||||
# One entry per manual page. List of tuples
|
||||
# (source start file, name, description, authors, manual section).
|
||||
man_pages = [
|
||||
('index', 'hyperscan', u'Hyperscan Documentation',
|
||||
[u'Intel Corporation'], 1)
|
||||
('index', 'vectorscan', u'Vectorscan Documentation',
|
||||
[u'Intel Corporation'], 7)
|
||||
]
|
||||
|
||||
# If true, show URL addresses after external links.
|
||||
@ -246,8 +246,8 @@ man_pages = [
|
||||
# (source start file, target name, title, author,
|
||||
# dir menu entry, description, category)
|
||||
texinfo_documents = [
|
||||
('index', 'Hyperscan', u'Hyperscan Documentation',
|
||||
u'Intel Corporation', 'Hyperscan', 'High-performance regular expression matcher.',
|
||||
('index', 'Vectorscan', u'Vectorscan Documentation',
|
||||
u'Intel Corporation; VectorCamp', 'Vectorscan', 'High-performance regular expression matcher.',
|
||||
'Miscellaneous'),
|
||||
]
|
||||
|
||||
|
@ -7,43 +7,41 @@ Getting Started
|
||||
Very Quick Start
|
||||
****************
|
||||
|
||||
#. Clone Hyperscan ::
|
||||
#. Clone Vectorscan ::
|
||||
|
||||
cd <where-you-want-hyperscan-source>
|
||||
git clone git://github.com/intel/hyperscan
|
||||
cd <where-you-want-vectorscan-source>
|
||||
git clone https://github.com/VectorCamp/vectorscan
|
||||
|
||||
#. Configure Hyperscan
|
||||
#. Configure Vectorscan
|
||||
|
||||
Ensure that you have the correct :ref:`dependencies <software>` present,
|
||||
and then:
|
||||
|
||||
::
|
||||
|
||||
cd <where-you-want-to-build-hyperscan>
|
||||
cd <where-you-want-to-build-vectorscan>
|
||||
mkdir <build-dir>
|
||||
cd <build-dir>
|
||||
cmake [-G <generator>] [options] <hyperscan-source-path>
|
||||
cmake [-G <generator>] [options] <vectorscan-source-path>
|
||||
|
||||
Known working generators:
|
||||
* ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X)
|
||||
* ``Ninja`` --- `Ninja <http://martine.github.io/ninja/>`_ build files.
|
||||
* ``Visual Studio 15 2017`` --- Visual Studio projects
|
||||
|
||||
Generators that might work include:
|
||||
Unsupported generators that might work include:
|
||||
* ``Xcode`` --- OS X Xcode projects.
|
||||
|
||||
#. Build Hyperscan
|
||||
#. Build Vectorscan
|
||||
|
||||
Depending on the generator used:
|
||||
* ``cmake --build .`` --- will build everything
|
||||
* ``make -j<jobs>`` --- use makefiles in parallel
|
||||
* ``ninja`` --- use Ninja build
|
||||
* ``MsBuild.exe`` --- use Visual Studio MsBuild
|
||||
* etc.
|
||||
|
||||
#. Check Hyperscan
|
||||
#. Check Vectorscan
|
||||
|
||||
Run the Hyperscan unit tests: ::
|
||||
Run the Vectorscan unit tests: ::
|
||||
|
||||
bin/unit-hyperscan
|
||||
|
||||
@ -55,20 +53,23 @@ Requirements
|
||||
Hardware
|
||||
========
|
||||
|
||||
Hyperscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
|
||||
32-bit (IA-32 Architecture) modes.
|
||||
Vectorscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
|
||||
32-bit (IA-32 Architecture) modes as well as Arm v8.0+ aarch64, and POWER 8+ ppc64le
|
||||
machines.
|
||||
|
||||
Hyperscan is a high performance software library that takes advantage of recent
|
||||
Intel architecture advances. At a minimum, support for Supplemental Streaming
|
||||
SIMD Extensions 3 (SSSE3) is required, which should be available on any modern
|
||||
x86 processor.
|
||||
architecture advances.
|
||||
|
||||
Additionally, Hyperscan can make use of:
|
||||
Additionally, Vectorscan can make use of:
|
||||
|
||||
* Intel Streaming SIMD Extensions 4.2 (SSE4.2)
|
||||
* the POPCNT instruction
|
||||
* Bit Manipulation Instructions (BMI, BMI2)
|
||||
* Intel Advanced Vector Extensions 2 (Intel AVX2)
|
||||
* Arm NEON
|
||||
* Arm SVE and SVE2
|
||||
* Arm SVE2 BITPERM
|
||||
* IBM Power8/Power9 VSX
|
||||
|
||||
if present.
|
||||
|
||||
@ -79,40 +80,34 @@ These can be determined at library compile time, see :ref:`target_arch`.
|
||||
Software
|
||||
========
|
||||
|
||||
As a software library, Hyperscan doesn't impose any particular runtime
|
||||
software requirements, however to build the Hyperscan library we require a
|
||||
modern C and C++ compiler -- in particular, Hyperscan requires C99 and C++11
|
||||
As a software library, Vectorscan doesn't impose any particular runtime
|
||||
software requirements, however to build the Vectorscan library we require a
|
||||
modern C and C++ compiler -- in particular, Vectorscan requires C99 and C++17
|
||||
compiler support. The supported compilers are:
|
||||
|
||||
* GCC, v4.8.1 or higher
|
||||
* Clang, v3.4 or higher (with libstdc++ or libc++)
|
||||
* Intel C++ Compiler v15 or higher
|
||||
* Visual C++ 2017 Build Tools
|
||||
* GCC, v9 or higher
|
||||
* Clang, v5 or higher (with libstdc++ or libc++)
|
||||
|
||||
Examples of operating systems that Hyperscan is known to work on include:
|
||||
Examples of operating systems that Vectorscan is known to work on include:
|
||||
|
||||
Linux:
|
||||
|
||||
* Ubuntu 14.04 LTS or newer
|
||||
* Ubuntu 20.04 LTS or newer
|
||||
* RedHat/CentOS 7 or newer
|
||||
* Fedora 38 or newer
|
||||
* Debian 10
|
||||
|
||||
FreeBSD:
|
||||
|
||||
* 10.0 or newer
|
||||
|
||||
Windows:
|
||||
|
||||
* 8 or newer
|
||||
|
||||
Mac OS X:
|
||||
|
||||
* 10.8 or newer, using XCode/Clang
|
||||
|
||||
Hyperscan *may* compile and run on other platforms, but there is no guarantee.
|
||||
We currently have experimental support for Windows using Intel C++ Compiler
|
||||
or Visual Studio 2017.
|
||||
Vectorscan *may* compile and run on other platforms, but there is no guarantee.
|
||||
|
||||
In addition, the following software is required for compiling the Hyperscan library:
|
||||
In addition, the following software is required for compiling the Vectorscan library:
|
||||
|
||||
======================================================= =========== ======================================
|
||||
Dependency Version Notes
|
||||
@ -132,20 +127,20 @@ Ragel, you may use Cygwin to build it from source.
|
||||
Boost Headers
|
||||
-------------
|
||||
|
||||
Compiling Hyperscan depends on a recent version of the Boost C++ header
|
||||
Compiling Vectorscan depends on a recent version of the Boost C++ header
|
||||
library. If the Boost libraries are installed on the build machine in the
|
||||
usual paths, CMake will find them. If the Boost libraries are not installed,
|
||||
the location of the Boost source tree can be specified during the CMake
|
||||
configuration step using the ``BOOST_ROOT`` variable (described below).
|
||||
|
||||
Another alternative is to put a copy of (or a symlink to) the boost
|
||||
subdirectory in ``<hyperscan-source-path>/include/boost``.
|
||||
subdirectory in ``<vectorscan-source-path>/include/boost``.
|
||||
|
||||
For example: for the Boost-1.59.0 release: ::
|
||||
|
||||
ln -s boost_1_59_0/boost <hyperscan-source-path>/include/boost
|
||||
ln -s boost_1_59_0/boost <vectorscan-source-path>/include/boost
|
||||
|
||||
As Hyperscan uses the header-only parts of Boost, it is not necessary to
|
||||
As Vectorscan uses the header-only parts of Boost, it is not necessary to
|
||||
compile the Boost libraries.
|
||||
|
||||
CMake Configuration
|
||||
@ -168,11 +163,12 @@ Common options for CMake include:
|
||||
| | Valid options are Debug, Release, RelWithDebInfo, |
|
||||
| | and MinSizeRel. Default is RelWithDebInfo. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| BUILD_SHARED_LIBS | Build Hyperscan as a shared library instead of |
|
||||
| BUILD_SHARED_LIBS | Build Vectorscan as a shared library instead of |
|
||||
| | the default static library. |
|
||||
| | Default: Off |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| BUILD_STATIC_AND_SHARED| Build both static and shared Hyperscan libs. |
|
||||
| | Default off. |
|
||||
| BUILD_STATIC_LIBS | Build Vectorscan as a static library. |
|
||||
| | Default: On |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| BOOST_ROOT | Location of Boost source tree. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
@ -180,12 +176,64 @@ Common options for CMake include:
|
||||
+------------------------+----------------------------------------------------+
|
||||
| FAT_RUNTIME | Build the :ref:`fat runtime<fat_runtime>`. Default |
|
||||
| | true on Linux, not available elsewhere. |
|
||||
| | Default: Off |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| USE_CPU_NATIVE | Native CPU detection is off by default, however it |
|
||||
| | is possible to build a performance-oriented non-fat|
|
||||
| | library tuned to your CPU. |
|
||||
| | Default: Off |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| SANITIZE | Use libasan sanitizer to detect possible bugs. |
|
||||
| | Valid options are address, memory and undefined. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| SIMDE_BACKEND | Enable SIMDe backend. If this is chosen all native |
|
||||
| | (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be |
|
||||
| | disabled and a SIMDe SSE4.2 emulation backend will |
|
||||
| | be enabled. This will enable Vectorscan to build |
|
||||
| | and run on architectures without SIMD. |
|
||||
| | Default: Off |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| SIMDE_NATIVE | Enable SIMDe native emulation of x86 SSE4.2 |
|
||||
| | intrinsics on the building platform. That is, |
|
||||
| | SSE4.2 intrinsics will be emulated using Neon on |
|
||||
| | an Arm platform, or VSX on a Power platform, etc. |
|
||||
| | Default: Off |
|
||||
+------------------------+----------------------------------------------------+
|
||||
|
||||
X86 platform specific options include:
|
||||
|
||||
+------------------------+----------------------------------------------------+
|
||||
| Variable | Description |
|
||||
+========================+====================================================+
|
||||
| BUILD_AVX2 | Enable code for AVX2. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| BUILD_AVX512 | Enable code for AVX512. Implies BUILD_AVX2. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| BUILD_AVX512VBMI | Enable code for AVX512 with VBMI extension. Implies|
|
||||
| | BUILD_AVX512. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
|
||||
Arm platform specific options include:
|
||||
|
||||
+------------------------+----------------------------------------------------+
|
||||
| Variable | Description |
|
||||
+========================+====================================================+
|
||||
| BUILD_SVE | Enable code for SVE, like on AWS Graviton3 CPUs. |
|
||||
| | Not much code is ported just for SVE, but enabling |
|
||||
| | SVE code production, does improve code generation, |
|
||||
| | see Benchmarks. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| BUILD_SVE2 | Enable code for SVE2, implies BUILD_SVE. Most |
|
||||
| | non-Neon code is written for SVE2. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
| BUILD_SVE2_BITPERM | Enable code for SVE2_BITPERM hardware feature, |
|
||||
| | implies BUILD_SVE2. |
|
||||
+------------------------+----------------------------------------------------+
|
||||
|
||||
For example, to generate a ``Debug`` build: ::
|
||||
|
||||
cd <build-dir>
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug <hyperscan-source-path>
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug <vectorscan-source-path>
|
||||
|
||||
|
||||
|
||||
@ -193,7 +241,7 @@ Build Type
|
||||
----------
|
||||
|
||||
CMake determines a number of features for a build based on the Build Type.
|
||||
Hyperscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
|
||||
Vectorscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
|
||||
information". This is a performance optimized build without runtime assertions
|
||||
but with debug symbols enabled.
|
||||
|
||||
@ -201,7 +249,7 @@ The other types of builds are:
|
||||
|
||||
* ``Release``: as above, but without debug symbols
|
||||
* ``MinSizeRel``: a stripped release build
|
||||
* ``Debug``: used when developing Hyperscan. Includes runtime assertions
|
||||
* ``Debug``: used when developing Vectorscan. Includes runtime assertions
|
||||
(which has a large impact on runtime performance), and will also enable
|
||||
some other build features like building internal unit
|
||||
tests.
|
||||
@ -211,7 +259,7 @@ The other types of builds are:
|
||||
Target Architecture
|
||||
-------------------
|
||||
|
||||
Unless using the :ref:`fat runtime<fat_runtime>`, by default Hyperscan will be
|
||||
Unless using the :ref:`fat runtime<fat_runtime>`, by default Vectorscan will be
|
||||
compiled to target the instruction set of the processor of the machine that
|
||||
being used for compilation. This is done via the use of ``-march=native``. The
|
||||
result of this means that a library built on one machine may not work on a
|
||||
@ -223,7 +271,7 @@ CMake, or ``CMAKE_C_FLAGS`` and ``CMAKE_CXX_FLAGS`` on the CMake command line. F
|
||||
example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: ::
|
||||
|
||||
cmake -DCMAKE_C_FLAGS="-march=corei7" \
|
||||
-DCMAKE_CXX_FLAGS="-march=corei7" <hyperscan-source-path>
|
||||
-DCMAKE_CXX_FLAGS="-march=corei7" <vectorscan-source-path>
|
||||
|
||||
For more information, refer to :ref:`instr_specialization`.
|
||||
|
||||
@ -232,17 +280,17 @@ For more information, refer to :ref:`instr_specialization`.
|
||||
Fat Runtime
|
||||
-----------
|
||||
|
||||
A feature introduced in Hyperscan v4.4 is the ability for the Hyperscan
|
||||
A feature introduced in Hyperscan v4.4 is the ability for the Vectorscan
|
||||
library to dispatch the most appropriate runtime code for the host processor.
|
||||
This feature is called the "fat runtime", as a single Hyperscan library
|
||||
This feature is called the "fat runtime", as a single Vectorscan library
|
||||
contains multiple copies of the runtime code for different instruction sets.
|
||||
|
||||
.. note::
|
||||
|
||||
The fat runtime feature is only available on Linux. Release builds of
|
||||
Hyperscan will default to having the fat runtime enabled where supported.
|
||||
Vectorscan will default to having the fat runtime enabled where supported.
|
||||
|
||||
When building the library with the fat runtime, the Hyperscan runtime code
|
||||
When building the library with the fat runtime, the Vectorscan runtime code
|
||||
will be compiled multiple times for these different instruction sets, and
|
||||
these compiled objects are combined into one library. There are no changes to
|
||||
how user applications are built against this library.
|
||||
@ -254,26 +302,28 @@ resolved so that the right version of each API function is used. There is no
|
||||
impact on function call performance, as this check and resolution is performed
|
||||
by the ELF loader once when the binary is loaded.
|
||||
|
||||
If the Hyperscan library is used on x86 systems without ``SSSE3``, the runtime
|
||||
If the Vectorscan library is used on x86 systems without ``SSE4.2``, the runtime
|
||||
API functions will resolve to functions that return :c:member:`HS_ARCH_ERROR`
|
||||
instead of potentially executing illegal instructions. The API function
|
||||
:c:func:`hs_valid_platform` can be used by application writers to determine if
|
||||
the current platform is supported by Hyperscan.
|
||||
the current platform is supported by Vectorscan.
|
||||
|
||||
As of this release, the variants of the runtime that are built, and the CPU
|
||||
capability that is required, are the following:
|
||||
|
||||
+----------+-------------------------------+---------------------------+
|
||||
| Variant | CPU Feature Flag(s) Required | gcc arch flag |
|
||||
+==========+===============================+===========================+
|
||||
| Core 2 | ``SSSE3`` | ``-march=core2`` |
|
||||
+----------+-------------------------------+---------------------------+
|
||||
| Core i7 | ``SSE4_2`` and ``POPCNT`` | ``-march=corei7`` |
|
||||
+----------+-------------------------------+---------------------------+
|
||||
| AVX 2 | ``AVX2`` | ``-march=core-avx2`` |
|
||||
+----------+-------------------------------+---------------------------+
|
||||
| AVX 512 | ``AVX512BW`` (see note below) | ``-march=skylake-avx512`` |
|
||||
+----------+-------------------------------+---------------------------+
|
||||
+--------------+---------------------------------+---------------------------+
|
||||
| Variant | CPU Feature Flag(s) Required | gcc arch flag |
|
||||
+==============+=================================+===========================+
|
||||
| Core 2 | ``SSSE3`` | ``-march=core2`` |
|
||||
+--------------+---------------------------------+---------------------------+
|
||||
| Core i7 | ``SSE4_2`` and ``POPCNT`` | ``-march=corei7`` |
|
||||
+--------------+---------------------------------+---------------------------+
|
||||
| AVX 2 | ``AVX2`` | ``-march=core-avx2`` |
|
||||
+--------------+---------------------------------+---------------------------+
|
||||
| AVX 512 | ``AVX512BW`` (see note below) | ``-march=skylake-avx512`` |
|
||||
+--------------+---------------------------------+---------------------------+
|
||||
| AVX 512 VBMI | ``AVX512VBMI`` (see note below) | ``-march=icelake-server`` |
|
||||
+--------------+---------------------------------+---------------------------+
|
||||
|
||||
.. note::
|
||||
|
||||
@ -287,6 +337,21 @@ capability that is required, are the following:
|
||||
|
||||
cmake -DBUILD_AVX512=on <...>
|
||||
|
||||
Hyperscan v5.3 adds support for AVX512VBMI instructions - in particular the
|
||||
``AVX512VBMI`` instruction set that was introduced on Intel "Icelake" Xeon
|
||||
processors - however the AVX512VBMI runtime variant is **not** enabled by
|
||||
default in fat runtime builds as not all toolchains support AVX512VBMI
|
||||
instruction sets. To build an AVX512VBMI runtime, the CMake variable
|
||||
``BUILD_AVX512VBMI`` must be enabled manually during configuration. For
|
||||
example: ::
|
||||
|
||||
cmake -DBUILD_AVX512VBMI=on <...>
|
||||
|
||||
Vectorscan adds support for Arm processors and SVE, SVE2 and SVE2_BITPERM. For
|
||||
example: ::
|
||||
|
||||
cmake -DBUILD_SVE=ON -DBUILD_SVE2=ON -DBUILD_SVE2_BITPERM=ON <...>
|
||||
|
||||
As the fat runtime requires compiler, libc, and binutils support, at this time
|
||||
it will only be enabled for Linux builds where the compiler supports the
|
||||
`indirect function "ifunc" function attribute
|
||||
|
@ -1,5 +1,5 @@
|
||||
###############################################
|
||||
Hyperscan |version| Developer's Reference Guide
|
||||
Vectorscan |version| Developer's Reference Guide
|
||||
###############################################
|
||||
|
||||
-------
|
||||
|
@ -5,11 +5,11 @@
|
||||
Introduction
|
||||
############
|
||||
|
||||
Hyperscan is a software regular expression matching engine designed with
|
||||
Vectorscan is a software regular expression matching engine designed with
|
||||
high performance and flexibility in mind. It is implemented as a library that
|
||||
exposes a straightforward C API.
|
||||
|
||||
The Hyperscan API itself is composed of two major components:
|
||||
The Vectorscan API itself is composed of two major components:
|
||||
|
||||
***********
|
||||
Compilation
|
||||
@ -17,7 +17,7 @@ Compilation
|
||||
|
||||
These functions take a group of regular expressions, along with identifiers and
|
||||
option flags, and compile them into an immutable database that can be used by
|
||||
the Hyperscan scanning API. This compilation process performs considerable
|
||||
the Vectorscan scanning API. This compilation process performs considerable
|
||||
analysis and optimization work in order to build a database that will match the
|
||||
given expressions efficiently.
|
||||
|
||||
@ -36,8 +36,8 @@ See :ref:`compilation` for more detail.
|
||||
Scanning
|
||||
********
|
||||
|
||||
Once a Hyperscan database has been created, it can be used to scan data in
|
||||
memory. Hyperscan provides several scanning modes, depending on whether the
|
||||
Once a Vectorscan database has been created, it can be used to scan data in
|
||||
memory. Vectorscan provides several scanning modes, depending on whether the
|
||||
data to be scanned is available as a single contiguous block, whether it is
|
||||
distributed amongst several blocks in memory at the same time, or whether it is
|
||||
to be scanned as a sequence of blocks in a stream.
|
||||
@ -45,7 +45,7 @@ to be scanned as a sequence of blocks in a stream.
|
||||
Matches are delivered to the application via a user-supplied callback function
|
||||
that is called synchronously for each match.
|
||||
|
||||
For a given database, Hyperscan provides several guarantees:
|
||||
For a given database, Vectorscan provides several guarantees:
|
||||
|
||||
* No memory allocations occur at runtime with the exception of two
|
||||
fixed-size allocations, both of which should be done ahead of time for
|
||||
@ -56,7 +56,7 @@ For a given database, Hyperscan provides several guarantees:
|
||||
call.
|
||||
- **Stream state**: in streaming mode only, some state space is required to
|
||||
store data that persists between scan calls for each stream. This allows
|
||||
Hyperscan to track matches that span multiple blocks of data.
|
||||
Vectorscan to track matches that span multiple blocks of data.
|
||||
|
||||
* The sizes of the scratch space and stream state (in streaming mode) required
|
||||
for a given database are fixed and determined at database compile time. This
|
||||
@ -64,7 +64,7 @@ For a given database, Hyperscan provides several guarantees:
|
||||
time, and these structures can be pre-allocated if required for performance
|
||||
reasons.
|
||||
|
||||
* Any pattern that has successfully been compiled by the Hyperscan compiler can
|
||||
* Any pattern that has successfully been compiled by the Vectorscan compiler can
|
||||
be scanned against any input. There are no internal resource limits or other
|
||||
limitations at runtime that could cause a scan call to return an error.
|
||||
|
||||
@ -74,12 +74,12 @@ See :ref:`runtime` for more detail.
|
||||
Tools
|
||||
*****
|
||||
|
||||
Some utilities for testing and benchmarking Hyperscan are included with the
|
||||
Some utilities for testing and benchmarking Vectorscan are included with the
|
||||
library. See :ref:`tools` for more information.
|
||||
|
||||
************
|
||||
Example Code
|
||||
************
|
||||
|
||||
Some simple example code demonstrating the use of the Hyperscan API is
|
||||
available in the ``examples/`` subdirectory of the Hyperscan distribution.
|
||||
Some simple example code demonstrating the use of the Vectorscan API is
|
||||
available in the ``examples/`` subdirectory of the Vectorscan distribution.
|
||||
|
@ -4,7 +4,7 @@
|
||||
Performance Considerations
|
||||
##########################
|
||||
|
||||
Hyperscan supports a wide range of patterns in all three scanning modes. It is
|
||||
Vectorscan supports a wide range of patterns in all three scanning modes. It is
|
||||
capable of extremely high levels of performance, but certain patterns can
|
||||
reduce performance markedly.
|
||||
|
||||
@ -25,7 +25,7 @@ For example, caseless matching of :regexp:`/abc/` can be written as:
|
||||
* :regexp:`/(?i)abc(?-i)/`
|
||||
* :regexp:`/abc/i`
|
||||
|
||||
Hyperscan is capable of handling all these constructs. Unless there is a
|
||||
Vectorscan is capable of handling all these constructs. Unless there is a
|
||||
specific reason otherwise, do not rewrite patterns from one form to another.
|
||||
|
||||
As another example, matching of :regexp:`/foo(bar|baz)(frotz)?/` can be
|
||||
@ -41,24 +41,24 @@ Library usage
|
||||
|
||||
.. tip:: Do not hand-optimize library usage.
|
||||
|
||||
The Hyperscan library is capable of dealing with small writes, unusually large
|
||||
The Vectorscan library is capable of dealing with small writes, unusually large
|
||||
and small pattern sets, etc. Unless there is a specific performance problem
|
||||
with some usage of the library, it is best to use Hyperscan in a simple and
|
||||
with some usage of the library, it is best to use Vectorscan in a simple and
|
||||
direct fashion. For example, it is unlikely for there to be much benefit in
|
||||
buffering input to the library into larger blocks unless streaming writes are
|
||||
tiny (say, 1-2 bytes at a time).
|
||||
|
||||
Unlike many other pattern matching products, Hyperscan will run faster with
|
||||
Unlike many other pattern matching products, Vectorscan will run faster with
|
||||
small numbers of patterns and slower with large numbers of patterns in a smooth
|
||||
fashion (as opposed to, typically, running at a moderate speed up to some fixed
|
||||
limit then either breaking or running half as fast).
|
||||
|
||||
Hyperscan also provides high-throughput matching with a single thread of
|
||||
control per core; if a database runs at 3.0 Gbps in Hyperscan it means that a
|
||||
Vectorscan also provides high-throughput matching with a single thread of
|
||||
control per core; if a database runs at 3.0 Gbps in Vectorscan it means that a
|
||||
3000-bit block of data will be scanned in 1 microsecond in a single thread of
|
||||
control, not that it is required to scan 22 3000-bit blocks of data in 22
|
||||
microseconds. Thus, it is not usually necessary to buffer data to supply
|
||||
Hyperscan with available parallelism.
|
||||
Vectorscan with available parallelism.
|
||||
|
||||
********************
|
||||
Block-based matching
|
||||
@ -72,7 +72,7 @@ accumulated before processing, it should be scanned in block rather than in
|
||||
streaming mode.
|
||||
|
||||
Unnecessary use of streaming mode reduces the number of optimizations that can
|
||||
be applied in Hyperscan and may make some patterns run slower.
|
||||
be applied in Vectorscan and may make some patterns run slower.
|
||||
|
||||
If there is a mixture of 'block' and 'streaming' mode patterns, these should be
|
||||
scanned in separate databases except in the case that the streaming patterns
|
||||
@ -107,7 +107,7 @@ Allocate scratch ahead of time
|
||||
|
||||
Scratch allocation is not necessarily a cheap operation. Since it is the first
|
||||
time (after compilation or deserialization) that a pattern database is used,
|
||||
Hyperscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
|
||||
Vectorscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
|
||||
must also allocate memory.
|
||||
|
||||
Therefore, it is important to ensure that :c:func:`hs_alloc_scratch` is not
|
||||
@ -329,7 +329,7 @@ Consequently, :regexp:`/foo.*bar/L` with a check on start of match values after
|
||||
the callback is considerably more expensive and general than
|
||||
:regexp:`/foo.{300}bar/`.
|
||||
|
||||
Similarly, the :c:member:`hs_expr_ext::min_length` extended parameter can be
|
||||
Similarly, the :cpp:member:`hs_expr_ext::min_length` extended parameter can be
|
||||
used to specify a lower bound on the length of the matches for a pattern. Using
|
||||
this facility may be more lightweight in some circumstances than using the SOM
|
||||
flag and post-confirming match length in the calling application.
|
||||
|
@ -6,35 +6,35 @@ Preface
|
||||
Overview
|
||||
********
|
||||
|
||||
Hyperscan is a regular expression engine designed to offer high performance, the
|
||||
Vectorscan is a regular expression engine designed to offer high performance, the
|
||||
ability to match multiple expressions simultaneously and flexibility in
|
||||
scanning operation.
|
||||
|
||||
Patterns are provided to a compilation interface which generates an immutable
|
||||
pattern database. The scan interface then can be used to scan a target data
|
||||
buffer for the given patterns, returning any matching results from that data
|
||||
buffer. Hyperscan also provides a streaming mode, in which matches that span
|
||||
buffer. Vectorscan also provides a streaming mode, in which matches that span
|
||||
several blocks in a stream are detected.
|
||||
|
||||
This document is designed to facilitate code-level integration of the Hyperscan
|
||||
This document is designed to facilitate code-level integration of the Vectorscan
|
||||
library with existing or new applications.
|
||||
|
||||
:ref:`intro` is a short overview of the Hyperscan library, with more detail on
|
||||
the Hyperscan API provided in the subsequent sections: :ref:`compilation` and
|
||||
:ref:`intro` is a short overview of the Vectorscan library, with more detail on
|
||||
the Vectorscan API provided in the subsequent sections: :ref:`compilation` and
|
||||
:ref:`runtime`.
|
||||
|
||||
:ref:`perf` provides details on various factors which may impact the
|
||||
performance of a Hyperscan integration.
|
||||
performance of a Vectorscan integration.
|
||||
|
||||
:ref:`api_constants` and :ref:`api_files` provides a detailed summary of the
|
||||
Hyperscan Application Programming Interface (API).
|
||||
Vectorscan Application Programming Interface (API).
|
||||
|
||||
********
|
||||
Audience
|
||||
********
|
||||
|
||||
This guide is aimed at developers interested in integrating Hyperscan into an
|
||||
application. For information on building the Hyperscan library, see the Quick
|
||||
This guide is aimed at developers interested in integrating Vectorscan into an
|
||||
application. For information on building the Vectorscan library, see the Quick
|
||||
Start Guide.
|
||||
|
||||
***********
|
||||
|
@ -4,7 +4,7 @@
|
||||
Scanning for Patterns
|
||||
#####################
|
||||
|
||||
Hyperscan provides three different scanning modes, each with its own scan
|
||||
Vectorscan provides three different scanning modes, each with its own scan
|
||||
function beginning with ``hs_scan``. In addition, streaming mode has a number
|
||||
of other API functions for managing stream state.
|
||||
|
||||
@ -33,8 +33,8 @@ See :c:type:`match_event_handler` for more information.
|
||||
Streaming Mode
|
||||
**************
|
||||
|
||||
The core of the Hyperscan streaming runtime API consists of functions to open,
|
||||
scan, and close Hyperscan data streams:
|
||||
The core of the Vectorscan streaming runtime API consists of functions to open,
|
||||
scan, and close Vectorscan data streams:
|
||||
|
||||
* :c:func:`hs_open_stream`: allocates and initializes a new stream for scanning.
|
||||
|
||||
@ -57,14 +57,14 @@ will return immediately with :c:member:`HS_SCAN_TERMINATED`. The caller must
|
||||
still call :c:func:`hs_close_stream` to complete the clean-up process for that
|
||||
stream.
|
||||
|
||||
Streams exist in the Hyperscan library so that pattern matching state can be
|
||||
Streams exist in the Vectorscan library so that pattern matching state can be
|
||||
maintained across multiple blocks of target data -- without maintaining this
|
||||
state, it would not be possible to detect patterns that span these blocks of
|
||||
data. This, however, does come at the cost of requiring an amount of storage
|
||||
per-stream (the size of this storage is fixed at compile time), and a slight
|
||||
performance penalty in some cases to manage the state.
|
||||
|
||||
While Hyperscan does always support a strict ordering of multiple matches,
|
||||
While Vectorscan does always support a strict ordering of multiple matches,
|
||||
streaming matches will not be delivered at offsets before the current stream
|
||||
write, with the exception of zero-width asserts, where constructs such as
|
||||
:regexp:`\\b` and :regexp:`$` can cause a match on the final character of a
|
||||
@ -76,7 +76,7 @@ Stream Management
|
||||
=================
|
||||
|
||||
In addition to :c:func:`hs_open_stream`, :c:func:`hs_scan_stream`, and
|
||||
:c:func:`hs_close_stream`, the Hyperscan API provides a number of other
|
||||
:c:func:`hs_close_stream`, the Vectorscan API provides a number of other
|
||||
functions for the management of streams:
|
||||
|
||||
* :c:func:`hs_reset_stream`: resets a stream to its initial state; this is
|
||||
@ -98,10 +98,10 @@ A stream object is allocated as a fixed size region of memory which has been
|
||||
sized to ensure that no memory allocations are required during scan
|
||||
operations. When the system is under memory pressure, it may be useful to reduce
|
||||
the memory consumed by streams that are not expected to be used soon. The
|
||||
Hyperscan API provides calls for translating a stream to and from a compressed
|
||||
Vectorscan API provides calls for translating a stream to and from a compressed
|
||||
representation for this purpose. The compressed representation differs from the
|
||||
full stream object as it does not reserve space for components which are not
|
||||
required given the current stream state. The Hyperscan API functions for this
|
||||
required given the current stream state. The Vectorscan API functions for this
|
||||
functionality are:
|
||||
|
||||
* :c:func:`hs_compress_stream`: fills the provided buffer with a compressed
|
||||
@ -157,7 +157,7 @@ scanned in block mode.
|
||||
Scratch Space
|
||||
*************
|
||||
|
||||
While scanning data, Hyperscan needs a small amount of temporary memory to store
|
||||
While scanning data, Vectorscan needs a small amount of temporary memory to store
|
||||
on-the-fly internal data. This amount is unfortunately too large to fit on the
|
||||
stack, particularly for embedded applications, and allocating memory dynamically
|
||||
is too expensive, so a pre-allocated "scratch" space must be provided to the
|
||||
@ -170,7 +170,7 @@ databases, only a single scratch region is necessary: in this case, calling
|
||||
will ensure that the scratch space is large enough to support scanning against
|
||||
any of the given databases.
|
||||
|
||||
While the Hyperscan library is re-entrant, the use of scratch spaces is not.
|
||||
While the Vectorscan library is re-entrant, the use of scratch spaces is not.
|
||||
For example, if by design it is deemed necessary to run recursive or nested
|
||||
scanning (say, from the match callback function), then an additional scratch
|
||||
space is required for that context.
|
||||
@ -178,7 +178,7 @@ space is required for that context.
|
||||
In the absence of recursive scanning, only one such space is required per thread
|
||||
and can (and indeed should) be allocated before data scanning is to commence.
|
||||
|
||||
In a scenario where a set of expressions are compiled by a single "master"
|
||||
In a scenario where a set of expressions are compiled by a single "main"
|
||||
thread and data will be scanned by multiple "worker" threads, the convenience
|
||||
function :c:func:`hs_clone_scratch` allows multiple copies of an existing
|
||||
scratch space to be made for each thread (rather than forcing the caller to pass
|
||||
@ -219,11 +219,11 @@ For example:
|
||||
Custom Allocators
|
||||
*****************
|
||||
|
||||
By default, structures used by Hyperscan at runtime (scratch space, stream
|
||||
By default, structures used by Vectorscan at runtime (scratch space, stream
|
||||
state, etc) are allocated with the default system allocators, usually
|
||||
``malloc()`` and ``free()``.
|
||||
|
||||
The Hyperscan API provides a facility for changing this behaviour to support
|
||||
The Vectorscan API provides a facility for changing this behaviour to support
|
||||
applications that use custom memory allocators.
|
||||
|
||||
These functions are:
|
||||
|
@ -4,7 +4,7 @@
|
||||
Serialization
|
||||
#############
|
||||
|
||||
For some applications, compiling Hyperscan pattern databases immediately prior
|
||||
For some applications, compiling Vectorscan pattern databases immediately prior
|
||||
to use is not an appropriate design. Some users may wish to:
|
||||
|
||||
* Compile pattern databases on a different host;
|
||||
@ -14,9 +14,9 @@ to use is not an appropriate design. Some users may wish to:
|
||||
|
||||
* Control the region of memory in which the compiled database is located.
|
||||
|
||||
Hyperscan pattern databases are not completely flat in memory: they contain
|
||||
Vectorscan pattern databases are not completely flat in memory: they contain
|
||||
pointers and have specific alignment requirements. Therefore, they cannot be
|
||||
copied (or otherwise relocated) directly. To enable these use cases, Hyperscan
|
||||
copied (or otherwise relocated) directly. To enable these use cases, Vectorscan
|
||||
provides functionality for serializing and deserializing compiled pattern
|
||||
databases.
|
||||
|
||||
@ -40,10 +40,10 @@ The API provides the following functions:
|
||||
returns a string containing information about the database. This call is
|
||||
analogous to :c:func:`hs_database_info`.
|
||||
|
||||
.. note:: Hyperscan performs both version and platform compatibility checks
|
||||
.. note:: Vectorscan performs both version and platform compatibility checks
|
||||
upon deserialization. The :c:func:`hs_deserialize_database` and
|
||||
:c:func:`hs_deserialize_database_at` functions will only permit the
|
||||
deserialization of databases compiled with (a) the same version of Hyperscan
|
||||
deserialization of databases compiled with (a) the same version of Vectorscan
|
||||
and (b) platform features supported by the current host platform. See
|
||||
:ref:`instr_specialization` for more information on platform specialization.
|
||||
|
||||
@ -51,17 +51,17 @@ The API provides the following functions:
|
||||
The Runtime Library
|
||||
===================
|
||||
|
||||
The main Hyperscan library (``libhs``) contains both the compiler and runtime
|
||||
portions of the library. This means that in order to support the Hyperscan
|
||||
The main Vectorscan library (``libhs``) contains both the compiler and runtime
|
||||
portions of the library. This means that in order to support the Vectorscan
|
||||
compiler, which is written in C++, it requires C++ linkage and has a
|
||||
dependency on the C++ standard library.
|
||||
|
||||
Many embedded applications require only the scanning ("runtime") portion of the
|
||||
Hyperscan library. In these cases, pattern compilation generally takes place on
|
||||
Vectorscan library. In these cases, pattern compilation generally takes place on
|
||||
another host, and serialized pattern databases are delivered to the application
|
||||
for use.
|
||||
|
||||
To support these applications without requiring the C++ dependency, a
|
||||
runtime-only version of the Hyperscan library, called ``libhs_runtime``, is also
|
||||
runtime-only version of the Vectorscan library, called ``libhs_runtime``, is also
|
||||
distributed. This library does not depend on the C++ standard library and
|
||||
provides all Hyperscan functions other that those used to compile databases.
|
||||
provides all Vectorscan functions other that those used to compile databases.
|
||||
|
@ -4,14 +4,14 @@
|
||||
Tools
|
||||
#####
|
||||
|
||||
This section describes the set of utilities included with the Hyperscan library.
|
||||
This section describes the set of utilities included with the Vectorscan library.
|
||||
|
||||
********************
|
||||
Quick Check: hscheck
|
||||
********************
|
||||
|
||||
The ``hscheck`` tool allows the user to quickly check whether Hyperscan supports
|
||||
a group of patterns. If a pattern is rejected by Hyperscan's compiler, the
|
||||
The ``hscheck`` tool allows the user to quickly check whether Vectorscan supports
|
||||
a group of patterns. If a pattern is rejected by Vectorscan's compiler, the
|
||||
compile error is provided on standard output.
|
||||
|
||||
For example, given the following three patterns (the last of which contains a
|
||||
@ -34,7 +34,7 @@ syntax error) in a file called ``/tmp/test``::
|
||||
Benchmarker: hsbench
|
||||
********************
|
||||
|
||||
The ``hsbench`` tool provides an easy way to measure Hyperscan's performance
|
||||
The ``hsbench`` tool provides an easy way to measure Vectorscan's performance
|
||||
for a particular set of patterns and corpus of data to be scanned.
|
||||
|
||||
Patterns are supplied in the format described below in
|
||||
@ -44,7 +44,7 @@ easy control of how a corpus is broken into blocks and streams.
|
||||
|
||||
.. note:: A group of Python scripts for constructing corpora databases from
|
||||
various input types, such as PCAP network traffic captures or text files, can
|
||||
be found in the Hyperscan source tree in ``tools/hsbench/scripts``.
|
||||
be found in the Vectorscan source tree in ``tools/hsbench/scripts``.
|
||||
|
||||
Running hsbench
|
||||
===============
|
||||
@ -56,7 +56,7 @@ produce output like this::
|
||||
$ hsbench -e /tmp/patterns -c /tmp/corpus.db
|
||||
|
||||
Signatures: /tmp/patterns
|
||||
Hyperscan info: Version: 4.3.1 Features: AVX2 Mode: STREAM
|
||||
Vectorscan info: Version: 5.4.11 Features: AVX2 Mode: STREAM
|
||||
Expression count: 200
|
||||
Bytecode size: 342,540 bytes
|
||||
Database CRC: 0x6cd6b67c
|
||||
@ -77,7 +77,7 @@ takes to perform all twenty scans. The number of repeats can be changed with the
|
||||
``-n`` argument, and the results of each scan will be displayed if the
|
||||
``--per-scan`` argument is specified.
|
||||
|
||||
To benchmark Hyperscan on more than one core, you can supply a list of cores
|
||||
To benchmark Vectorscan on more than one core, you can supply a list of cores
|
||||
with the ``-T`` argument, which will instruct ``hsbench`` to start one
|
||||
benchmark thread per core given and compute the throughput from the time taken
|
||||
to complete all of them.
|
||||
@ -91,17 +91,17 @@ Correctness Testing: hscollider
|
||||
*******************************
|
||||
|
||||
The ``hscollider`` tool, or Pattern Collider, provides a way to verify
|
||||
Hyperscan's matching behaviour. It does this by compiling and scanning patterns
|
||||
Vectorscan's matching behaviour. It does this by compiling and scanning patterns
|
||||
(either singly or in groups) against known corpora and comparing the results
|
||||
against another engine (the "ground truth"). Two sources of ground truth for
|
||||
comparison are available:
|
||||
|
||||
* The PCRE library (http://pcre.org/).
|
||||
* An NFA simulation run on Hyperscan's compile-time graph representation. This
|
||||
* An NFA simulation run on Vectorscan's compile-time graph representation. This
|
||||
is used if PCRE cannot support the pattern or if PCRE execution fails due to
|
||||
a resource limit.
|
||||
|
||||
Much of Hyperscan's testing infrastructure is built on ``hscollider``, and the
|
||||
Much of Vectorscan's testing infrastructure is built on ``hscollider``, and the
|
||||
tool is designed to take advantage of multiple cores and provide considerable
|
||||
flexibility in controlling the test. These options are described in the help
|
||||
(``hscollider -h``) and include:
|
||||
@ -116,11 +116,11 @@ flexibility in controlling the test. These options are described in the help
|
||||
Using hscollider to debug a pattern
|
||||
===================================
|
||||
|
||||
One common use-case for ``hscollider`` is to determine whether Hyperscan will
|
||||
One common use-case for ``hscollider`` is to determine whether Vectorscan will
|
||||
match a pattern in the expected location, and whether this accords with PCRE's
|
||||
behaviour for the same case.
|
||||
|
||||
Here is an example. We put our pattern in a file in Hyperscan's pattern
|
||||
Here is an example. We put our pattern in a file in Vectorscan's pattern
|
||||
format::
|
||||
|
||||
$ cat /tmp/pat
|
||||
@ -172,7 +172,7 @@ individual matches are displayed in the output::
|
||||
|
||||
Total elapsed time: 0.00522815 secs.
|
||||
|
||||
We can see from this output that both PCRE and Hyperscan find matches ending at
|
||||
We can see from this output that both PCRE and Vectorscan find matches ending at
|
||||
offset 33 and 45, and so ``hscollider`` considers this test case to have
|
||||
passed.
|
||||
|
||||
@ -180,13 +180,13 @@ passed.
|
||||
corpus alignment 0, and ``-T 1`` instructs us to only use one thread.)
|
||||
|
||||
.. note:: In default operation, PCRE produces only one match for a scan, unlike
|
||||
Hyperscan's automata semantics. The ``hscollider`` tool uses libpcre's
|
||||
"callout" functionality to match Hyperscan's semantics.
|
||||
Vectorscan's automata semantics. The ``hscollider`` tool uses libpcre's
|
||||
"callout" functionality to match Vectorscan's semantics.
|
||||
|
||||
Running a larger scan test
|
||||
==========================
|
||||
|
||||
A set of patterns for testing purposes are distributed with Hyperscan, and these
|
||||
A set of patterns for testing purposes are distributed with Vectorscan, and these
|
||||
can be tested via ``hscollider`` on an in-tree build. Two CMake targets are
|
||||
provided to do this easily:
|
||||
|
||||
@ -202,10 +202,10 @@ Debugging: hsdump
|
||||
*****************
|
||||
|
||||
When built in debug mode (using the CMake directive ``CMAKE_BUILD_TYPE`` set to
|
||||
``Debug``), Hyperscan includes support for dumping information about its
|
||||
``Debug``), Vectorscan includes support for dumping information about its
|
||||
internals during pattern compilation with the ``hsdump`` tool.
|
||||
|
||||
This information is mostly of use to Hyperscan developers familiar with the
|
||||
This information is mostly of use to Vectorscan developers familiar with the
|
||||
library's internal structure, but can be used to diagnose issues with patterns
|
||||
and provide more information in bug reports.
|
||||
|
||||
@ -215,7 +215,7 @@ and provide more information in bug reports.
|
||||
Pattern Format
|
||||
**************
|
||||
|
||||
All of the Hyperscan tools accept patterns in the same format, read from plain
|
||||
All of the Vectorscan tools accept patterns in the same format, read from plain
|
||||
text files with one pattern per line. Each line looks like this:
|
||||
|
||||
* ``<integer id>:/<regex>/<flags>``
|
||||
@ -227,12 +227,12 @@ For example::
|
||||
3:/^.{10,20}hatstand/m
|
||||
|
||||
The integer ID is the value that will be reported when a match is found by
|
||||
Hyperscan and must be unique.
|
||||
Vectorscan and must be unique.
|
||||
|
||||
The pattern itself is a regular expression in PCRE syntax; see
|
||||
:ref:`compilation` for more information on supported features.
|
||||
|
||||
The flags are single characters that map to Hyperscan flags as follows:
|
||||
The flags are single characters that map to Vectorscan flags as follows:
|
||||
|
||||
========= ================================= ===========
|
||||
Character API Flag Description
|
||||
@ -256,7 +256,7 @@ between braces, separated by commas. For example::
|
||||
|
||||
1:/hatstand.*teakettle/s{min_offset=50,max_offset=100}
|
||||
|
||||
All Hyperscan tools will accept a pattern file (or a directory containing
|
||||
All Vectorscan tools will accept a pattern file (or a directory containing
|
||||
pattern files) with the ``-e`` argument. If no further arguments constraining
|
||||
the pattern set are given, all patterns in those files are used.
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2024, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -112,6 +113,7 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <random>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <chrono>
|
||||
@ -133,7 +135,12 @@
|
||||
#include <netinet/tcp.h>
|
||||
#include <netinet/udp.h>
|
||||
#include <netinet/ip_icmp.h>
|
||||
#ifdef __NetBSD__
|
||||
#include <net/ethertypes.h>
|
||||
#include <net/if_ether.h>
|
||||
#else
|
||||
#include <net/ethernet.h>
|
||||
#endif /* __NetBSD__ */
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include <pcap.h>
|
||||
@ -151,6 +158,8 @@ using std::set;
|
||||
using std::min;
|
||||
using std::max;
|
||||
using std::copy;
|
||||
using std::random_device;
|
||||
using std::mt19937;
|
||||
|
||||
enum Criterion {
|
||||
CRITERION_THROUGHPUT,
|
||||
@ -193,15 +202,15 @@ struct FiveTuple {
|
||||
unsigned int dstPort;
|
||||
|
||||
// Construct a FiveTuple from a TCP or UDP packet.
|
||||
FiveTuple(const struct ip *iphdr) {
|
||||
explicit FiveTuple(const struct ip *iphdr) {
|
||||
// IP fields
|
||||
protocol = iphdr->ip_p;
|
||||
srcAddr = iphdr->ip_src.s_addr;
|
||||
dstAddr = iphdr->ip_dst.s_addr;
|
||||
|
||||
// UDP/TCP ports
|
||||
const struct udphdr *uh = (const struct udphdr *)
|
||||
(((const char *)iphdr) + (iphdr->ip_hl * 4));
|
||||
const struct udphdr *uh = reinterpret_cast<const struct udphdr *>
|
||||
((reinterpret_cast<const char *>(iphdr)) + (iphdr->ip_hl * 4));
|
||||
srcPort = uh->uh_sport;
|
||||
dstPort = uh->uh_dport;
|
||||
}
|
||||
@ -230,7 +239,7 @@ static
|
||||
int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
|
||||
unsigned int flags, void *ctx) {
|
||||
// Our context points to a size_t storing the match count
|
||||
size_t *matches = (size_t *)ctx;
|
||||
size_t *matches = static_cast<size_t *>(ctx);
|
||||
(*matches)++;
|
||||
return 0; // continue matching
|
||||
}
|
||||
@ -292,7 +301,7 @@ public:
|
||||
// database.
|
||||
hs_error_t err = hs_alloc_scratch(db, &scratch);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: could not allocate scratch space. Exiting." << endl;
|
||||
cerr << "ERROR: could not allocate scratch space. Exiting.\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
@ -304,8 +313,7 @@ public:
|
||||
size_t scratch_size;
|
||||
hs_error_t err = hs_scratch_size(scratch, &scratch_size);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: could not query scratch space size. Exiting."
|
||||
<< endl;
|
||||
cerr << "ERROR: could not query scratch space size. Exiting.\n";
|
||||
exit(-1);
|
||||
}
|
||||
return scratch_size;
|
||||
@ -331,9 +339,9 @@ public:
|
||||
}
|
||||
|
||||
// Valid TCP or UDP packet
|
||||
const struct ip *iphdr = (const struct ip *)(pktData
|
||||
const struct ip *iphdr = reinterpret_cast<const struct ip *>(pktData
|
||||
+ sizeof(struct ether_header));
|
||||
const char *payload = (const char *)pktData + offset;
|
||||
const char *payload = reinterpret_cast<const char *>(pktData) + offset;
|
||||
|
||||
size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
|
||||
stream_map.size())).first->second;
|
||||
@ -349,9 +357,8 @@ public:
|
||||
// Return the number of bytes scanned
|
||||
size_t bytes() const {
|
||||
size_t sum = 0;
|
||||
for (const auto &packet : packets) {
|
||||
sum += packet.size();
|
||||
}
|
||||
auto packs = [](size_t z, const string &packet) { return z + packet.size(); };
|
||||
sum += std::accumulate(packets.begin(), packets.end(), 0, packs);
|
||||
return sum;
|
||||
}
|
||||
|
||||
@ -371,7 +378,7 @@ public:
|
||||
for (auto &stream : streams) {
|
||||
hs_error_t err = hs_open_stream(db, 0, &stream);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to open stream. Exiting." << endl;
|
||||
cerr << "ERROR: Unable to open stream. Exiting.\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
@ -380,11 +387,11 @@ public:
|
||||
// Close all open Hyperscan streams (potentially generating any
|
||||
// end-anchored matches)
|
||||
void closeStreams() {
|
||||
for (auto &stream : streams) {
|
||||
for (const auto &stream : streams) {
|
||||
hs_error_t err =
|
||||
hs_close_stream(stream, scratch, onMatch, &matchCount);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to close stream. Exiting." << endl;
|
||||
cerr << "ERROR: Unable to close stream. Exiting.\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
@ -399,7 +406,7 @@ public:
|
||||
pkt.c_str(), pkt.length(), 0,
|
||||
scratch, onMatch, &matchCount);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to scan packet. Exiting." << endl;
|
||||
cerr << "ERROR: Unable to scan packet. Exiting.\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
@ -413,7 +420,7 @@ public:
|
||||
hs_error_t err = hs_scan(db, pkt.c_str(), pkt.length(), 0,
|
||||
scratch, onMatch, &matchCount);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: Unable to scan packet. Exiting." << endl;
|
||||
cerr << "ERROR: Unable to scan packet. Exiting.\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
@ -433,7 +440,7 @@ class Sigdata {
|
||||
|
||||
public:
|
||||
Sigdata() {}
|
||||
Sigdata(const char *filename) {
|
||||
explicit Sigdata(const char *filename) {
|
||||
parseFile(filename, patterns, flags, ids, originals);
|
||||
|
||||
}
|
||||
@ -451,9 +458,8 @@ public:
|
||||
// dynamic storage.)
|
||||
vector<const char *> cstrPatterns;
|
||||
cstrPatterns.reserve(patterns.size());
|
||||
for (const auto &pattern : patterns) {
|
||||
cstrPatterns.push_back(pattern.c_str());
|
||||
}
|
||||
auto pstr = [](const string &pattern) { return pattern.c_str(); };
|
||||
std::transform(patterns.begin(), patterns.end(), std::back_inserter(cstrPatterns), pstr);
|
||||
|
||||
Clock clock;
|
||||
clock.start();
|
||||
@ -502,29 +508,29 @@ public:
|
||||
|
||||
static
|
||||
void usage(const char *) {
|
||||
cerr << "Usage:" << endl << endl;
|
||||
cerr << " patbench [-n repeats] [ -G generations] [ -C criterion ]" << endl
|
||||
cerr << "Usage:\n\n";
|
||||
cerr << " patbench [-n repeats] [ -G generations] [ -C criterion ]\n"
|
||||
<< " [ -F factor_group_size ] [ -N | -S ] "
|
||||
<< "<pattern file> <pcap file>" << endl << endl
|
||||
<< "<pattern file> <pcap file>\n\n"
|
||||
<< " -n repeats sets the number of times the PCAP is repeatedly "
|
||||
"scanned" << endl << " with the pattern." << endl
|
||||
"scanned\n" << " with the pattern.\n"
|
||||
<< " -G generations sets the number of generations that the "
|
||||
"algorithm is" << endl << " run for." << endl
|
||||
"algorithm is\n" << " run for.\n"
|
||||
<< " -N sets non-streaming mode, -S sets streaming mode (default)."
|
||||
<< endl << " -F sets the factor group size (must be >0); this "
|
||||
"allows the detection" << endl
|
||||
<< " of multiple interacting factors." << endl << "" << endl
|
||||
<< " -C sets the 'criterion', which can be either:" << endl
|
||||
"allows the detection\n"
|
||||
<< " of multiple interacting factors.\n" << "\n"
|
||||
<< " -C sets the 'criterion', which can be either:\n"
|
||||
<< " t throughput (the default) - this requires a pcap file"
|
||||
<< endl << " r scratch size" << endl
|
||||
<< " s stream state size" << endl
|
||||
<< " c compile time" << endl << " b bytecode size"
|
||||
<< endl << " r scratch size\n"
|
||||
<< " s stream state size\n"
|
||||
<< " c compile time\n" << " b bytecode size"
|
||||
<< endl << endl
|
||||
<< "We recommend the use of a utility like 'taskset' on "
|
||||
"multiprocessor hosts to" << endl
|
||||
"multiprocessor hosts to\n"
|
||||
<< "lock execution to a single processor: this will remove processor "
|
||||
"migration" << endl
|
||||
<< "by the scheduler as a source of noise in the results." << endl;
|
||||
"migration\n"
|
||||
<< "by the scheduler as a source of noise in the results.\n";
|
||||
}
|
||||
|
||||
static
|
||||
@ -556,7 +562,7 @@ double measure_block_time(Benchmark &bench, unsigned int repeatCount) {
|
||||
}
|
||||
|
||||
static
|
||||
double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
|
||||
double eval_set(Benchmark &bench, const Sigdata &sigs, unsigned int mode,
|
||||
unsigned repeatCount, Criterion criterion,
|
||||
bool diagnose = true) {
|
||||
double compileTime = 0;
|
||||
@ -567,7 +573,7 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
|
||||
size_t dbSize;
|
||||
hs_error_t err = hs_database_size(bench.getDatabase(), &dbSize);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: could not retrieve bytecode size" << endl;
|
||||
cerr << "ERROR: could not retrieve bytecode size\n";
|
||||
exit(1);
|
||||
}
|
||||
return dbSize;
|
||||
@ -578,7 +584,7 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
|
||||
size_t streamStateSize;
|
||||
hs_error_t err = hs_stream_size(bench.getDatabase(), &streamStateSize);
|
||||
if (err != HS_SUCCESS) {
|
||||
cerr << "ERROR: could not retrieve stream state size" << endl;
|
||||
cerr << "ERROR: could not retrieve stream state size\n";
|
||||
exit(1);
|
||||
}
|
||||
return streamStateSize;
|
||||
@ -596,8 +602,9 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
|
||||
scan_time = measure_stream_time(bench, repeatCount);
|
||||
}
|
||||
size_t bytes = bench.bytes();
|
||||
size_t matches = bench.matches();
|
||||
|
||||
if (diagnose) {
|
||||
size_t matches = bench.matches();
|
||||
std::ios::fmtflags f(cout.flags());
|
||||
cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time
|
||||
<< " sec, Scanned " << bytes * repeatCount << " bytes, Throughput "
|
||||
@ -676,14 +683,13 @@ int main(int argc, char **argv) {
|
||||
Benchmark bench;
|
||||
if (criterion == CRITERION_THROUGHPUT) {
|
||||
if (!bench.readStreams(pcapFile)) {
|
||||
cerr << "Unable to read packets from PCAP file. Exiting." << endl;
|
||||
cerr << "Unable to read packets from PCAP file. Exiting.\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
if ((criterion == CRITERION_STREAM_STATE) && (mode != HS_MODE_STREAM)) {
|
||||
cerr << "Cannot evaluate stream state for block mode compile. Exiting."
|
||||
<< endl;
|
||||
cerr << "Cannot evaluate stream state for block mode compile. Exiting.\n";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@ -721,7 +727,7 @@ int main(int argc, char **argv) {
|
||||
unsigned generations = min(gen_max, (sigs.size() - 1) / factor_max);
|
||||
|
||||
cout << "Cutting signatures cumulatively for " << generations
|
||||
<< " generations" << endl;
|
||||
<< " generations\n";
|
||||
for (unsigned gen = 0; gen < generations; ++gen) {
|
||||
cout << "Generation " << gen << " ";
|
||||
set<unsigned> s(work_sigs.begin(), work_sigs.end());
|
||||
@ -731,7 +737,9 @@ int main(int argc, char **argv) {
|
||||
count++;
|
||||
cout << "." << std::flush;
|
||||
vector<unsigned> sv(s.begin(), s.end());
|
||||
random_shuffle(sv.begin(), sv.end());
|
||||
random_device rng;
|
||||
mt19937 urng(rng());
|
||||
shuffle(sv.begin(), sv.end(), urng);
|
||||
unsigned groups = factor_max + 1;
|
||||
for (unsigned current_group = 0; current_group < groups;
|
||||
current_group++) {
|
||||
@ -763,7 +771,7 @@ int main(int argc, char **argv) {
|
||||
cout << "Performance: ";
|
||||
print_criterion(criterion, best);
|
||||
cout << " (" << std::fixed << std::setprecision(3) << (best / score_base)
|
||||
<< "x) after cutting:" << endl;
|
||||
<< "x) after cutting:\n";
|
||||
cout.flags(out_f);
|
||||
|
||||
// s now has factor_max signatures
|
||||
@ -786,7 +794,7 @@ int main(int argc, char **argv) {
|
||||
static
|
||||
bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
||||
unsigned int *length) {
|
||||
const ip *iph = (const ip *)(pkt_data + sizeof(ether_header));
|
||||
const ip *iph = reinterpret_cast<const ip *>(pkt_data + sizeof(ether_header));
|
||||
const tcphdr *th = nullptr;
|
||||
|
||||
// Ignore packets that aren't IPv4
|
||||
@ -805,7 +813,7 @@ bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
||||
|
||||
switch (iph->ip_p) {
|
||||
case IPPROTO_TCP:
|
||||
th = (const tcphdr *)((const char *)iph + ihlen);
|
||||
th = reinterpret_cast<const tcphdr *>(reinterpret_cast<const char *>(iph) + ihlen);
|
||||
thlen = th->th_off * 4;
|
||||
break;
|
||||
case IPPROTO_UDP:
|
||||
@ -842,7 +850,7 @@ static unsigned parseFlags(const string &flagsStr) {
|
||||
case '\r': // stray carriage-return
|
||||
break;
|
||||
default:
|
||||
cerr << "Unsupported flag \'" << c << "\'" << endl;
|
||||
cerr << "Unsupported flag \'" << c << "\'\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
@ -854,7 +862,7 @@ static void parseFile(const char *filename, vector<string> &patterns,
|
||||
vector<string> &originals) {
|
||||
ifstream inFile(filename);
|
||||
if (!inFile.good()) {
|
||||
cerr << "ERROR: Can't open pattern file \"" << filename << "\"" << endl;
|
||||
cerr << "ERROR: Can't open pattern file \"" << filename << "\"\n";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@ -884,7 +892,7 @@ static void parseFile(const char *filename, vector<string> &patterns,
|
||||
|
||||
size_t flagsStart = expr.find_last_of('/');
|
||||
if (flagsStart == string::npos) {
|
||||
cerr << "ERROR: no trailing '/' char" << endl;
|
||||
cerr << "ERROR: no trailing '/' char\n";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2016, Intel Corporation
|
||||
* Copyright (c) 2024, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -54,6 +55,7 @@
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
@ -68,7 +70,12 @@
|
||||
#include <netinet/tcp.h>
|
||||
#include <netinet/udp.h>
|
||||
#include <netinet/ip_icmp.h>
|
||||
#ifdef __NetBSD__
|
||||
#include <net/ethertypes.h>
|
||||
#include <net/if_ether.h>
|
||||
#else
|
||||
#include <net/ethernet.h>
|
||||
#endif /* __NetBSD__ */
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include <pcap.h>
|
||||
@ -93,15 +100,15 @@ struct FiveTuple {
|
||||
unsigned int dstPort;
|
||||
|
||||
// Construct a FiveTuple from a TCP or UDP packet.
|
||||
FiveTuple(const struct ip *iphdr) {
|
||||
explicit FiveTuple(const struct ip *iphdr) {
|
||||
// IP fields
|
||||
protocol = iphdr->ip_p;
|
||||
srcAddr = iphdr->ip_src.s_addr;
|
||||
dstAddr = iphdr->ip_dst.s_addr;
|
||||
|
||||
// UDP/TCP ports
|
||||
const struct udphdr *uh =
|
||||
(const struct udphdr *)(((const char *)iphdr) + (iphdr->ip_hl * 4));
|
||||
const char * iphdr_base = reinterpret_cast<const char *>(iphdr);
|
||||
const struct udphdr *uh = reinterpret_cast<const struct udphdr *>(iphdr_base + (iphdr->ip_hl * 4));
|
||||
srcPort = uh->uh_sport;
|
||||
dstPort = uh->uh_dport;
|
||||
}
|
||||
@ -130,7 +137,7 @@ static
|
||||
int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
|
||||
unsigned int flags, void *ctx) {
|
||||
// Our context points to a size_t storing the match count
|
||||
size_t *matches = (size_t *)ctx;
|
||||
size_t *matches = static_cast<size_t *>(ctx);
|
||||
(*matches)++;
|
||||
return 0; // continue matching
|
||||
}
|
||||
@ -226,9 +233,8 @@ public:
|
||||
}
|
||||
|
||||
// Valid TCP or UDP packet
|
||||
const struct ip *iphdr = (const struct ip *)(pktData
|
||||
+ sizeof(struct ether_header));
|
||||
const char *payload = (const char *)pktData + offset;
|
||||
const struct ip *iphdr = reinterpret_cast<const struct ip *>(pktData + sizeof(struct ether_header));
|
||||
const char *payload = reinterpret_cast<const char *>(pktData) + offset;
|
||||
|
||||
size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
|
||||
stream_map.size())).first->second;
|
||||
@ -244,9 +250,8 @@ public:
|
||||
// Return the number of bytes scanned
|
||||
size_t bytes() const {
|
||||
size_t sum = 0;
|
||||
for (const auto &packet : packets) {
|
||||
sum += packet.size();
|
||||
}
|
||||
auto packs = [](size_t z, const string &packet) { return z + packet.size(); };
|
||||
sum += std::accumulate(packets.begin(), packets.end(), 0, packs);
|
||||
return sum;
|
||||
}
|
||||
|
||||
@ -275,7 +280,7 @@ public:
|
||||
// Close all open Hyperscan streams (potentially generating any
|
||||
// end-anchored matches)
|
||||
void closeStreams() {
|
||||
for (auto &stream : streams) {
|
||||
for (const auto &stream : streams) {
|
||||
hs_error_t err = hs_close_stream(stream, scratch, onMatch,
|
||||
&matchCount);
|
||||
if (err != HS_SUCCESS) {
|
||||
@ -427,7 +432,8 @@ static void databasesFromFile(const char *filename,
|
||||
// storage.)
|
||||
vector<const char*> cstrPatterns;
|
||||
for (const auto &pattern : patterns) {
|
||||
cstrPatterns.push_back(pattern.c_str());
|
||||
// cppcheck-suppress useStlAlgorithm
|
||||
cstrPatterns.push_back(pattern.c_str()); //NOLINT (performance-inefficient-vector-operation)
|
||||
}
|
||||
|
||||
cout << "Compiling Hyperscan databases with " << patterns.size()
|
||||
@ -568,7 +574,8 @@ int main(int argc, char **argv) {
|
||||
*/
|
||||
static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
||||
unsigned int *length) {
|
||||
const ip *iph = (const ip *)(pkt_data + sizeof(ether_header));
|
||||
const ip *iph = reinterpret_cast<const ip *>(pkt_data + sizeof(ether_header));
|
||||
const char *iph_base = reinterpret_cast<const char *>(iph);
|
||||
const tcphdr *th = nullptr;
|
||||
|
||||
// Ignore packets that aren't IPv4
|
||||
@ -587,7 +594,7 @@ static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
|
||||
|
||||
switch (iph->ip_p) {
|
||||
case IPPROTO_TCP:
|
||||
th = (const tcphdr *)((const char *)iph + ihlen);
|
||||
th = reinterpret_cast<const tcphdr *>(iph_base + ihlen);
|
||||
thlen = th->th_off * 4;
|
||||
break;
|
||||
case IPPROTO_UDP:
|
||||
|
@ -67,7 +67,7 @@
|
||||
* to pass in the pattern that was being searched for so we can print it out.
|
||||
*/
|
||||
static int eventHandler(unsigned int id, unsigned long long from,
|
||||
unsigned long long to, unsigned int flags, void *ctx) {
|
||||
unsigned long long to, unsigned int flags, void *ctx) { // cppcheck-suppress constParameterCallback
|
||||
printf("Match for pattern \"%s\" at offset %llu\n", (char *)ctx, to);
|
||||
return 0;
|
||||
}
|
||||
@ -150,7 +150,7 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
|
||||
char *pattern = argv[1];
|
||||
char *inputFN = argv[2];
|
||||
const char *inputFN = argv[2];
|
||||
|
||||
/* First, we attempt to compile the pattern provided on the command line.
|
||||
* We assume 'DOTALL' semantics, meaning that the '.' meta-character will
|
||||
|
@ -4,8 +4,7 @@ libdir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@
|
||||
includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
|
||||
Name: libhs
|
||||
Description: Intel(R) Hyperscan Library
|
||||
Description: A portable fork of the high-performance regular expression matching library
|
||||
Version: @HS_VERSION@
|
||||
Libs: -L${libdir} -lhs
|
||||
Libs.private: @PRIVATE_LIBS@
|
||||
Cflags: -I${includedir}/hs
|
||||
|
53
scripts/change_command.py
Normal file
53
scripts/change_command.py
Normal file
@ -0,0 +1,53 @@
|
||||
#
|
||||
# Copyright (c) 2020-2023, VectorCamp PC
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of Intel Corporation nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software
|
||||
# without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
import json
|
||||
import sys
|
||||
|
||||
#reads from the clang-tidy config file the first comment to ignore specific files
|
||||
# Get the paths from the command-line arguments
|
||||
# python3 ../source/scripts/change_command.py ../source/.clang-tidy ./compile_commands.json
|
||||
clang_tidy_config_path = sys.argv[1]
|
||||
compile_commands_path = sys.argv[2]
|
||||
|
||||
# Load the data from the file
|
||||
with open(compile_commands_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Open the clang-tidy config file and read the first comment
|
||||
with open(clang_tidy_config_path, 'r') as f:
|
||||
for line in f:
|
||||
if line.startswith('#'):
|
||||
ignore_files = line[1:].strip().split(',')
|
||||
break
|
||||
|
||||
# Filter out the entries for the ignored files
|
||||
data = [entry for entry in data if not any(ignore_file in entry['file'] for ignore_file in ignore_files)]
|
||||
|
||||
# Write the result to the same file
|
||||
with open(compile_commands_path, 'w') as f:
|
||||
json.dump(data, f, indent=2)
|
1
simde
Submodule
1
simde
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 416091ebdb9e901b29d026633e73167d6353a0b0
|
@ -176,7 +176,8 @@ void replaceAssertVertex(NGHolder &g, NFAVertex t, const ExpressionInfo &expr,
|
||||
auto ecit = edge_cache.find(cache_key);
|
||||
if (ecit == edge_cache.end()) {
|
||||
DEBUG_PRINTF("adding edge %zu %zu\n", g[u].index, g[v].index);
|
||||
NFAEdge e = add_edge(u, v, g);
|
||||
NFAEdge e;
|
||||
std::tie(e, std::ignore) = add_edge(u, v, g);
|
||||
edge_cache.emplace(cache_key, e);
|
||||
g[e].assert_flags = flags;
|
||||
if (++assert_edge_count > MAX_ASSERT_EDGES) {
|
||||
@ -229,11 +230,12 @@ void checkForMultilineStart(ReportManager &rm, NGHolder &g,
|
||||
|
||||
/* we need to interpose a dummy dot vertex between v and accept if
|
||||
* required so that ^ doesn't match trailing \n */
|
||||
for (const auto &e : out_edges_range(v, g)) {
|
||||
if (target(e, g) == g.accept) {
|
||||
dead.push_back(e);
|
||||
}
|
||||
}
|
||||
auto deads = [&g=g](const NFAEdge &e) {
|
||||
return (target(e, g) == g.accept);
|
||||
};
|
||||
const auto &er = out_edges_range(v, g);
|
||||
std::copy_if(begin(er), end(er), std::back_inserter(dead), deads);
|
||||
|
||||
/* assert has been resolved; clear flag */
|
||||
g[v].assert_flags &= ~POS_FLAG_MULTILINE_START;
|
||||
}
|
||||
@ -251,6 +253,7 @@ void checkForMultilineStart(ReportManager &rm, NGHolder &g,
|
||||
|
||||
static
|
||||
bool hasAssertVertices(const NGHolder &g) {
|
||||
// cppcheck-suppress useStlAlgorithm
|
||||
for (auto v : vertices_range(g)) {
|
||||
int flags = g[v].assert_flags;
|
||||
if (flags & WORDBOUNDARY_FLAGS) {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
* Copyright (c) 2015-2021, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -323,7 +323,8 @@ void addExpression(NG &ng, unsigned index, const char *expression,
|
||||
}
|
||||
|
||||
// Ensure that our pattern isn't too long (in characters).
|
||||
if (strlen(expression) > cc.grey.limitPatternLength) {
|
||||
size_t maxlen = cc.grey.limitPatternLength + 1;
|
||||
if (strnlen(expression, maxlen) >= maxlen) {
|
||||
throw CompileError("Pattern length exceeds limit.");
|
||||
}
|
||||
|
||||
@ -416,6 +417,10 @@ void addLitExpression(NG &ng, unsigned index, const char *expression,
|
||||
"HS_FLAG_SOM_LEFTMOST are supported in literal API.");
|
||||
}
|
||||
|
||||
if (expLength == 0) {
|
||||
throw CompileError("Pure literal API doesn't support empty string.");
|
||||
}
|
||||
|
||||
// This expression must be a pure literal, we can build ue2_literal
|
||||
// directly based on expression text.
|
||||
ParsedLitExpression ple(index, expression, expLength, flags, id);
|
||||
@ -438,7 +443,7 @@ bytecode_ptr<RoseEngine> generateRoseEngine(NG &ng) {
|
||||
if (!rose) {
|
||||
DEBUG_PRINTF("error building rose\n");
|
||||
assert(0);
|
||||
return nullptr;
|
||||
return bytecode_ptr<RoseEngine>(nullptr);
|
||||
}
|
||||
|
||||
dumpReportManager(ng.rm, ng.cc.grey);
|
||||
@ -458,6 +463,9 @@ platform_t target_to_platform(const target_t &target_info) {
|
||||
if (!target_info.has_avx512()) {
|
||||
p |= HS_PLATFORM_NOAVX512;
|
||||
}
|
||||
if (!target_info.has_avx512vbmi()) {
|
||||
p |= HS_PLATFORM_NOAVX512VBMI;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
@ -470,7 +478,7 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
|
||||
DEBUG_PRINTF("db size %zu\n", db_len);
|
||||
DEBUG_PRINTF("db platform %llx\n", platform);
|
||||
|
||||
struct hs_database *db = (struct hs_database *)hs_database_alloc(db_len);
|
||||
struct hs_database *db = static_cast<struct hs_database *>(hs_database_alloc(db_len));
|
||||
if (hs_check_alloc(db) != HS_SUCCESS) {
|
||||
hs_database_free(db);
|
||||
return nullptr;
|
||||
@ -484,7 +492,7 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
|
||||
DEBUG_PRINTF("shift is %zu\n", shift);
|
||||
|
||||
db->bytecode = offsetof(struct hs_database, bytes) - shift;
|
||||
char *bytecode = (char *)db + db->bytecode;
|
||||
char *bytecode = reinterpret_cast<char *>(db) + db->bytecode;
|
||||
assert(ISALIGNED_CL(bytecode));
|
||||
|
||||
db->magic = HS_DB_MAGIC;
|
||||
@ -517,7 +525,7 @@ struct hs_database *build(NG &ng, unsigned int *length, u8 pureFlag) {
|
||||
throw CompileError("Internal error.");
|
||||
}
|
||||
|
||||
const char *bytecode = (const char *)(rose.get());
|
||||
const char *bytecode = reinterpret_cast<const char *>(rose.get());
|
||||
const platform_t p = target_to_platform(ng.cc.target_info);
|
||||
struct hs_database *db = dbCreate(bytecode, *length, p);
|
||||
if (!db) {
|
||||
|
@ -57,15 +57,14 @@ extern const hs_compile_error_t hs_badalloc = {
|
||||
namespace ue2 {
|
||||
|
||||
hs_compile_error_t *generateCompileError(const string &err, int expression) {
|
||||
hs_compile_error_t *ret =
|
||||
(struct hs_compile_error *)hs_misc_alloc(sizeof(hs_compile_error_t));
|
||||
hs_compile_error_t *ret = static_cast<struct hs_compile_error *>(hs_misc_alloc(sizeof(hs_compile_error_t)));
|
||||
if (ret) {
|
||||
hs_error_t e = hs_check_alloc(ret);
|
||||
if (e != HS_SUCCESS) {
|
||||
hs_misc_free(ret);
|
||||
return const_cast<hs_compile_error_t *>(&hs_badalloc);
|
||||
}
|
||||
char *msg = (char *)hs_misc_alloc(err.size() + 1);
|
||||
char *msg = static_cast<char *>(hs_misc_alloc(err.size() + 1));
|
||||
if (msg) {
|
||||
e = hs_check_alloc(msg);
|
||||
if (e != HS_SUCCESS) {
|
||||
|
54
src/crc32.c
54
src/crc32.c
@ -30,7 +30,6 @@
|
||||
#include "config.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/arch.h"
|
||||
#include "util/intrinsics.h"
|
||||
|
||||
#if !defined(HAVE_SSE42)
|
||||
|
||||
@ -543,14 +542,13 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
|
||||
|
||||
// Main aligned loop, processes eight bytes at a time.
|
||||
|
||||
u32 term1, term2;
|
||||
for (size_t li = 0; li < running_length/8; li++) {
|
||||
u32 block = *(const u32 *)p_buf;
|
||||
crc ^= block;
|
||||
p_buf += 4;
|
||||
term1 = crc_tableil8_o88[crc & 0x000000FF] ^
|
||||
u32 term1 = crc_tableil8_o88[crc & 0x000000FF] ^
|
||||
crc_tableil8_o80[(crc >> 8) & 0x000000FF];
|
||||
term2 = crc >> 16;
|
||||
u32 term2 = crc >> 16;
|
||||
crc = term1 ^
|
||||
crc_tableil8_o72[term2 & 0x000000FF] ^
|
||||
crc_tableil8_o64[(term2 >> 8) & 0x000000FF];
|
||||
@ -579,53 +577,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
|
||||
}
|
||||
|
||||
#else // HAVE_SSE42
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
#define CRC_WORD 8
|
||||
#define CRC_TYPE u64a
|
||||
#define CRC_FUNC _mm_crc32_u64
|
||||
#else
|
||||
#define CRC_WORD 4
|
||||
#define CRC_TYPE u32
|
||||
#define CRC_FUNC _mm_crc32_u32
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Use the crc32 instruction from SSE4.2 to compute our checksum - same
|
||||
* polynomial as the above function.
|
||||
*/
|
||||
static really_inline
|
||||
u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,
|
||||
const size_t length) {
|
||||
u32 crc = running_crc;
|
||||
|
||||
// Process byte-by-byte until p_buf is aligned
|
||||
|
||||
const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD);
|
||||
size_t init_bytes = aligned_buf - p_buf;
|
||||
size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD;
|
||||
size_t end_bytes = length - init_bytes - running_length;
|
||||
|
||||
while (p_buf < aligned_buf) {
|
||||
crc = _mm_crc32_u8(crc, *p_buf++);
|
||||
}
|
||||
|
||||
// Main aligned loop, processes a word at a time.
|
||||
|
||||
for (size_t li = 0; li < running_length/CRC_WORD; li++) {
|
||||
CRC_TYPE block = *(const CRC_TYPE *)p_buf;
|
||||
crc = CRC_FUNC(crc, block);
|
||||
p_buf += CRC_WORD;
|
||||
}
|
||||
|
||||
// Remaining bytes
|
||||
|
||||
for(size_t li = 0; li < end_bytes; li++) {
|
||||
crc = _mm_crc32_u8(crc, *p_buf++);
|
||||
}
|
||||
|
||||
return crc;
|
||||
}
|
||||
#include "util/arch/x86/crc32.h"
|
||||
#endif
|
||||
|
||||
#ifdef VERIFY_ASSERTION
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -115,7 +115,8 @@ static
|
||||
hs_error_t db_check_platform(const u64a p) {
|
||||
if (p != hs_current_platform
|
||||
&& p != (hs_current_platform | hs_current_platform_no_avx2)
|
||||
&& p != (hs_current_platform | hs_current_platform_no_avx512)) {
|
||||
&& p != (hs_current_platform | hs_current_platform_no_avx512)
|
||||
&& p != (hs_current_platform | hs_current_platform_no_avx512vbmi)) {
|
||||
return HS_DB_PLATFORM_ERROR;
|
||||
}
|
||||
// passed all checks
|
||||
@ -352,12 +353,6 @@ hs_error_t dbIsValid(const hs_database_t *db) {
|
||||
return HS_SUCCESS;
|
||||
}
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define SNPRINTF_COMPAT _snprintf
|
||||
#else
|
||||
#define SNPRINTF_COMPAT snprintf
|
||||
#endif
|
||||
|
||||
/** Allocate a buffer and prints the database info into it. Returns an
|
||||
* appropriate error code on failure, or HS_SUCCESS on success. */
|
||||
static
|
||||
@ -370,9 +365,11 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat,
|
||||
u8 minor = (version >> 16) & 0xff;
|
||||
u8 major = (version >> 24) & 0xff;
|
||||
|
||||
const char *features = (plat & HS_PLATFORM_NOAVX512)
|
||||
? (plat & HS_PLATFORM_NOAVX2) ? "" : "AVX2"
|
||||
: "AVX512";
|
||||
const char *features = (plat & HS_PLATFORM_NOAVX512VBMI)
|
||||
? (plat & HS_PLATFORM_NOAVX512)
|
||||
? (plat & HS_PLATFORM_NOAVX2) ? "" : "AVX2"
|
||||
: "AVX512"
|
||||
: "AVX512VBMI";
|
||||
|
||||
const char *mode = NULL;
|
||||
|
||||
@ -397,9 +394,7 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat,
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Note: SNPRINTF_COMPAT is a macro defined above, to cope with systems
|
||||
// that don't have snprintf but have a workalike.
|
||||
int p_len = SNPRINTF_COMPAT(
|
||||
int p_len = snprintf(
|
||||
buf, len, "Version: %u.%u.%u Features: %s Mode: %s",
|
||||
major, minor, release, features, mode);
|
||||
if (p_len < 0) {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -51,10 +51,12 @@ extern "C"
|
||||
// CPU type is the low 6 bits (we can't need more than 64, surely!)
|
||||
|
||||
#define HS_PLATFORM_INTEL 1
|
||||
#define HS_PLATFORM_ARM 2
|
||||
#define HS_PLATFORM_CPU_MASK 0x3F
|
||||
|
||||
#define HS_PLATFORM_NOAVX2 (4<<13)
|
||||
#define HS_PLATFORM_NOAVX512 (8<<13)
|
||||
#define HS_PLATFORM_NOAVX512VBMI (0x10<<13)
|
||||
|
||||
/** \brief Platform features bitmask. */
|
||||
typedef u64a platform_t;
|
||||
@ -66,6 +68,9 @@ const platform_t hs_current_platform = {
|
||||
#endif
|
||||
#if !defined(HAVE_AVX512)
|
||||
HS_PLATFORM_NOAVX512 |
|
||||
#endif
|
||||
#if !defined(HAVE_AVX512VBMI)
|
||||
HS_PLATFORM_NOAVX512VBMI |
|
||||
#endif
|
||||
0,
|
||||
};
|
||||
@ -74,13 +79,18 @@ static UNUSED
|
||||
const platform_t hs_current_platform_no_avx2 = {
|
||||
HS_PLATFORM_NOAVX2 |
|
||||
HS_PLATFORM_NOAVX512 |
|
||||
0,
|
||||
HS_PLATFORM_NOAVX512VBMI
|
||||
};
|
||||
|
||||
static UNUSED
|
||||
const platform_t hs_current_platform_no_avx512 = {
|
||||
HS_PLATFORM_NOAVX512 |
|
||||
0,
|
||||
HS_PLATFORM_NOAVX512VBMI
|
||||
};
|
||||
|
||||
static UNUSED
|
||||
const platform_t hs_current_platform_no_avx512vbmi = {
|
||||
HS_PLATFORM_NOAVX512VBMI
|
||||
};
|
||||
|
||||
/*
|
||||
@ -102,6 +112,7 @@ struct hs_database {
|
||||
|
||||
static really_inline
|
||||
const void *hs_get_bytecode(const struct hs_database *db) {
|
||||
// cppcheck-suppress cstyleCast
|
||||
return ((const char *)db + db->bytecode);
|
||||
}
|
||||
|
||||
|
252
src/dispatcher.c
252
src/dispatcher.c
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2016-2017, Intel Corporation
|
||||
* Copyright (c) 2016-2020, Intel Corporation
|
||||
* Copyright (c) 2024, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -30,7 +31,41 @@
|
||||
#include "hs_common.h"
|
||||
#include "hs_runtime.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/cpuid_inline.h"
|
||||
|
||||
/* Streamlining the dispatch to eliminate runtime checking/branching:
|
||||
* What we want to do is, first call to the function will run the resolve
|
||||
* code and set the static resolved/dispatch pointer to point to the
|
||||
* correct function. Subsequent calls to the function will go directly to
|
||||
* the resolved ptr. The simplest way to accomplish this is, to
|
||||
* initially set the pointer to the resolve function.
|
||||
* To accomplish this in a manner invisible to the user,
|
||||
* we do involve some rather ugly/confusing macros in here.
|
||||
* There are four macros that assemble the code for each function
|
||||
* we want to dispatch in this manner:
|
||||
* CREATE_DISPATCH
|
||||
* this generates the declarations for the candidate target functions,
|
||||
* for the fat_dispatch function pointer, for the resolve_ function,
|
||||
* points the function pointer to the resolve function, and contains
|
||||
* most of the definition of the resolve function. The very end of the
|
||||
* resolve function is completed by the next macro, because in the
|
||||
* CREATE_DISPATCH macro we have the argument list with the arg declarations,
|
||||
* which is needed to generate correct function signatures, but we
|
||||
* can't generate from this, in a macro, a _call_ to one of those functions.
|
||||
* CONNECT_ARGS_1
|
||||
* this macro fills in the actual call at the end of the resolve function,
|
||||
* with the correct arg list. hence the name connect args.
|
||||
* CONNECT_DISPATCH_2
|
||||
* this macro likewise gives up the beginning of the definition of the
|
||||
* actual entry point function (the 'real name' that's called by the user)
|
||||
* but again in the pass-through call, cannot invoke the target without
|
||||
* getting the arg list , which is supplied by the final macro,
|
||||
* CONNECT_ARGS_3
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||
#include "util/arch/x86/cpuid_inline.h"
|
||||
#include "util/join.h"
|
||||
|
||||
#if defined(DISABLE_AVX512_DISPATCH)
|
||||
@ -38,8 +73,14 @@
|
||||
#define check_avx512() (0)
|
||||
#endif
|
||||
|
||||
#if defined(DISABLE_AVX512VBMI_DISPATCH)
|
||||
#define avx512vbmi_ disabled_
|
||||
#define check_avx512vbmi() (0)
|
||||
#endif
|
||||
|
||||
#define CREATE_DISPATCH(RTYPE, NAME, ...) \
|
||||
/* create defns */ \
|
||||
RTYPE JOIN(avx512vbmi_, NAME)(__VA_ARGS__); \
|
||||
RTYPE JOIN(avx512_, NAME)(__VA_ARGS__); \
|
||||
RTYPE JOIN(avx2_, NAME)(__VA_ARGS__); \
|
||||
RTYPE JOIN(corei7_, NAME)(__VA_ARGS__); \
|
||||
@ -50,93 +91,274 @@
|
||||
return (RTYPE)HS_ARCH_ERROR; \
|
||||
} \
|
||||
\
|
||||
/* dispatch routing pointer for this function */ \
|
||||
/* initially point it at the resolve function */ \
|
||||
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__); \
|
||||
static RTYPE (* JOIN(fat_dispatch_, NAME))(__VA_ARGS__) = \
|
||||
&JOIN(resolve_, NAME); \
|
||||
\
|
||||
/* resolver */ \
|
||||
static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \
|
||||
if (check_avx512()) { \
|
||||
return JOIN(avx512_, NAME); \
|
||||
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__) { \
|
||||
if (check_avx512vbmi()) { \
|
||||
fat_dispatch_ ## NAME = &JOIN(avx512vbmi_, NAME); \
|
||||
} \
|
||||
if (check_avx2()) { \
|
||||
return JOIN(avx2_, NAME); \
|
||||
else if (check_avx512()) { \
|
||||
fat_dispatch_ ## NAME = &JOIN(avx512_, NAME); \
|
||||
} \
|
||||
if (check_sse42() && check_popcnt()) { \
|
||||
return JOIN(corei7_, NAME); \
|
||||
else if (check_avx2()) { \
|
||||
fat_dispatch_ ## NAME = &JOIN(avx2_, NAME); \
|
||||
} \
|
||||
if (check_ssse3()) { \
|
||||
return JOIN(core2_, NAME); \
|
||||
else if (check_sse42() && check_popcnt()) { \
|
||||
fat_dispatch_ ## NAME = &JOIN(corei7_, NAME); \
|
||||
} \
|
||||
/* anything else is fail */ \
|
||||
return JOIN(error_, NAME); \
|
||||
else if (check_ssse3()) { \
|
||||
fat_dispatch_ ## NAME = &JOIN(core2_, NAME); \
|
||||
} else { \
|
||||
/* anything else is fail */ \
|
||||
fat_dispatch_ ## NAME = &JOIN(error_, NAME); \
|
||||
} \
|
||||
|
||||
|
||||
|
||||
/* the rest of the function is completed in the CONNECT_ARGS_1 macro. */
|
||||
|
||||
|
||||
|
||||
#elif defined(ARCH_AARCH64)
|
||||
#include "util/arch/arm/cpuid_inline.h"
|
||||
#include "util/join.h"
|
||||
|
||||
#define CREATE_DISPATCH(RTYPE, NAME, ...) \
|
||||
/* create defns */ \
|
||||
RTYPE JOIN(sve2_, NAME)(__VA_ARGS__); \
|
||||
RTYPE JOIN(sve_, NAME)(__VA_ARGS__); \
|
||||
RTYPE JOIN(neon_, NAME)(__VA_ARGS__); \
|
||||
\
|
||||
/* error func */ \
|
||||
static inline RTYPE JOIN(error_, NAME)(__VA_ARGS__) { \
|
||||
return (RTYPE)HS_ARCH_ERROR; \
|
||||
} \
|
||||
\
|
||||
/* function */ \
|
||||
/* dispatch routing pointer for this function */ \
|
||||
/* initially point it at the resolve function */ \
|
||||
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__); \
|
||||
static RTYPE (* JOIN(fat_dispatch_, NAME))(__VA_ARGS__) = \
|
||||
&JOIN(resolve_, NAME); \
|
||||
\
|
||||
/* resolver */ \
|
||||
static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__) { \
|
||||
if (check_sve2()) { \
|
||||
fat_dispatch_ ## NAME = &JOIN(sve2_, NAME); \
|
||||
} \
|
||||
else if (check_sve()) { \
|
||||
fat_dispatch_ ## NAME = &JOIN(sve_, NAME); \
|
||||
} \
|
||||
else if (check_neon()) { \
|
||||
fat_dispatch_ ## NAME = &JOIN(neon_, NAME); \
|
||||
} else { \
|
||||
/* anything else is fail */ \
|
||||
fat_dispatch_ ## NAME = &JOIN(error_, NAME); \
|
||||
} \
|
||||
|
||||
|
||||
/* the rest of the function is completed in the CONNECT_ARGS_1 macro. */
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#define CONNECT_ARGS_1(RTYPE, NAME, ...) \
|
||||
return (*fat_dispatch_ ## NAME)(__VA_ARGS__); \
|
||||
} \
|
||||
|
||||
|
||||
#define CONNECT_DISPATCH_2(RTYPE, NAME, ...) \
|
||||
/* new function */ \
|
||||
HS_PUBLIC_API \
|
||||
RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME)))
|
||||
RTYPE NAME(__VA_ARGS__) { \
|
||||
|
||||
|
||||
#define CONNECT_ARGS_3(RTYPE, NAME, ...) \
|
||||
return (*fat_dispatch_ ## NAME)(__VA_ARGS__); \
|
||||
} \
|
||||
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wunused-parameter"
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wunused-function"
|
||||
|
||||
/* this gets a bit ugly to compose the static redirect functions,
|
||||
* as we necessarily need first the typed arg list and then just the arg
|
||||
* names, twice in a row, to define the redirect function and the
|
||||
* dispatch function call */
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
|
||||
unsigned length, unsigned flags, hs_scratch_t *scratch,
|
||||
match_event_handler onEvent, void *userCtx);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_scan, db, data, length, flags, scratch, onEvent, userCtx);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
|
||||
unsigned length, unsigned flags, hs_scratch_t *scratch,
|
||||
match_event_handler onEvent, void *userCtx);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_scan, db, data, length, flags, scratch, onEvent, userCtx);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_stream_size, const hs_database_t *database,
|
||||
size_t *stream_size);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_stream_size, database, stream_size);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_stream_size, const hs_database_t *database,
|
||||
size_t *stream_size);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_stream_size, database, stream_size);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_database_size, const hs_database_t *db,
|
||||
size_t *size);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_database_size, db, size);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_database_size, const hs_database_t *db,
|
||||
size_t *size);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_database_size, db, size);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, dbIsValid, const hs_database_t *db);
|
||||
CONNECT_ARGS_1(hs_error_t, dbIsValid, db);
|
||||
CONNECT_DISPATCH_2(hs_error_t, dbIsValid, const hs_database_t *db);
|
||||
CONNECT_ARGS_3(hs_error_t, dbIsValid, db);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_free_database, hs_database_t *db);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_free_database, db);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_free_database, hs_database_t *db);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_free_database, db);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_open_stream, const hs_database_t *db,
|
||||
unsigned int flags, hs_stream_t **stream);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_open_stream, db, flags, stream);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_open_stream, const hs_database_t *db,
|
||||
unsigned int flags, hs_stream_t **stream);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_open_stream, db, flags, stream);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
|
||||
unsigned int length, unsigned int flags, hs_scratch_t *scratch,
|
||||
match_event_handler onEvent, void *ctxt);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_scan_stream, id, data, length, flags, scratch, onEvent, ctxt);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
|
||||
unsigned int length, unsigned int flags, hs_scratch_t *scratch,
|
||||
match_event_handler onEvent, void *ctxt);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_scan_stream, id, data, length, flags, scratch, onEvent, ctxt);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_close_stream, hs_stream_t *id,
|
||||
hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_close_stream, id, scratch, onEvent, ctxt);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_close_stream, hs_stream_t *id,
|
||||
hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_close_stream, id, scratch, onEvent, ctxt);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_scan_vector, const hs_database_t *db,
|
||||
const char *const *data, const unsigned int *length,
|
||||
unsigned int count, unsigned int flags, hs_scratch_t *scratch,
|
||||
match_event_handler onevent, void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_scan_vector, db, data, length, count, flags, scratch, onevent, context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_scan_vector, const hs_database_t *db,
|
||||
const char *const *data, const unsigned int *length,
|
||||
unsigned int count, unsigned int flags, hs_scratch_t *scratch,
|
||||
match_event_handler onevent, void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_scan_vector, db, data, length, count, flags, scratch, onevent, context);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_database_info, db, info);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_database_info, db, info);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
|
||||
const hs_stream_t *from_id);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_copy_stream, to_id, from_id);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
|
||||
const hs_stream_t *from_id);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_copy_stream, to_id, from_id);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_reset_stream, hs_stream_t *id,
|
||||
unsigned int flags, hs_scratch_t *scratch,
|
||||
match_event_handler onEvent, void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_reset_stream, id, flags, scratch, onEvent, context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_reset_stream, hs_stream_t *id,
|
||||
unsigned int flags, hs_scratch_t *scratch,
|
||||
match_event_handler onEvent, void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_reset_stream, id, flags, scratch, onEvent, context);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
|
||||
const hs_stream_t *from_id, hs_scratch_t *scratch,
|
||||
match_event_handler onEvent, void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_reset_and_copy_stream, to_id, from_id, scratch, onEvent, context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
|
||||
const hs_stream_t *from_id, hs_scratch_t *scratch,
|
||||
match_event_handler onEvent, void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_reset_and_copy_stream, to_id, from_id, scratch, onEvent, context);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_serialize_database, const hs_database_t *db,
|
||||
char **bytes, size_t *length);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_serialize_database, db, bytes, length);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_serialize_database, const hs_database_t *db,
|
||||
char **bytes, size_t *length);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_serialize_database, db, bytes, length);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_deserialize_database, const char *bytes,
|
||||
const size_t length, hs_database_t **db);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_deserialize_database, bytes, length, db);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_deserialize_database, const char *bytes,
|
||||
const size_t length, hs_database_t **db);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_deserialize_database, bytes, length, db);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_deserialize_database_at, const char *bytes,
|
||||
const size_t length, hs_database_t *db);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_deserialize_database_at, bytes, length, db);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_deserialize_database_at, const char *bytes,
|
||||
const size_t length, hs_database_t *db);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_deserialize_database_at, bytes, length, db);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_serialized_database_info, const char *bytes,
|
||||
size_t length, char **info);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_serialized_database_info, bytes, length, info);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_serialized_database_info, const char *bytes,
|
||||
size_t length, char **info);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_serialized_database_info, bytes, length, info);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_serialized_database_size, const char *bytes,
|
||||
const size_t length, size_t *deserialized_size);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_serialized_database_size, bytes, length, deserialized_size);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_serialized_database_size, const char *bytes,
|
||||
const size_t length, size_t *deserialized_size);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_serialized_database_size, bytes, length, deserialized_size);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
|
||||
char *buf, size_t buf_space, size_t *used_space);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_compress_stream, stream,
|
||||
buf, buf_space, used_space);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
|
||||
char *buf, size_t buf_space, size_t *used_space);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_compress_stream, stream,
|
||||
buf, buf_space, used_space);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_expand_stream, const hs_database_t *db,
|
||||
hs_stream_t **stream, const char *buf,size_t buf_size);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_expand_stream, db, stream, buf,buf_size);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_expand_stream, const hs_database_t *db,
|
||||
hs_stream_t **stream, const char *buf,size_t buf_size);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_expand_stream, db, stream, buf,buf_size);
|
||||
|
||||
CREATE_DISPATCH(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
|
||||
const char *buf, size_t buf_size, hs_scratch_t *scratch,
|
||||
match_event_handler onEvent, void *context);
|
||||
CONNECT_ARGS_1(hs_error_t, hs_reset_and_expand_stream, to_stream,
|
||||
buf, buf_size, scratch, onEvent, context);
|
||||
CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
|
||||
const char *buf, size_t buf_size, hs_scratch_t *scratch,
|
||||
match_event_handler onEvent, void *context);
|
||||
CONNECT_ARGS_3(hs_error_t, hs_reset_and_expand_stream, to_stream,
|
||||
buf, buf_size, scratch, onEvent, context);
|
||||
|
||||
/** INTERNALS **/
|
||||
|
||||
CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
|
||||
CONNECT_ARGS_1(u32, Crc32c_ComputeBuf, inCrc32, buf, bufLen);
|
||||
CONNECT_DISPATCH_2(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
|
||||
CONNECT_ARGS_3(u32, Crc32c_ComputeBuf, inCrc32, buf, bufLen);
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
|
132
src/fdr/fdr.c
132
src/fdr/fdr.c
@ -36,6 +36,7 @@
|
||||
#include "teddy.h"
|
||||
#include "teddy_internal.h"
|
||||
#include "util/arch.h"
|
||||
#include "util/bitutils.h"
|
||||
#include "util/simd_utils.h"
|
||||
#include "util/uniform_ops.h"
|
||||
|
||||
@ -119,20 +120,6 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
|
||||
};
|
||||
|
||||
/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
|
||||
* so we force its generation.
|
||||
*/
|
||||
static really_inline
|
||||
u64a andn(const u32 a, const u8 *b) {
|
||||
u64a r;
|
||||
#if defined(HAVE_BMI) && !defined(NO_ASM)
|
||||
__asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
|
||||
#else
|
||||
r = unaligned_load_u32(b) & ~a;
|
||||
#endif
|
||||
return r;
|
||||
}
|
||||
|
||||
/* generates an initial state mask based on the last byte-ish of history rather
|
||||
* than being all accepting. If there is no history to consider, the state is
|
||||
* generated based on the minimum length of each bucket in order to prevent
|
||||
@ -160,33 +147,43 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
||||
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
|
||||
/* +1: the zones ensure that we can read the byte at z->end */
|
||||
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
|
||||
u64a reach0 = andn(domain_mask_flipped, itPtr);
|
||||
u64a reach1 = andn(domain_mask_flipped, itPtr + 1);
|
||||
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
|
||||
u64a reach3 = andn(domain_mask_flipped, itPtr + 3);
|
||||
u64a domain_mask = ~domain_mask_flipped;
|
||||
|
||||
m128 st0 = load_m128_from_u64a(ft + reach0);
|
||||
m128 st1 = load_m128_from_u64a(ft + reach1);
|
||||
m128 st2 = load_m128_from_u64a(ft + reach2);
|
||||
m128 st3 = load_m128_from_u64a(ft + reach3);
|
||||
u64a it_hi = *(const u64a *)itPtr;
|
||||
u64a it_lo = *(const u64a *)(itPtr + 8);
|
||||
u64a reach0 = domain_mask & it_hi;
|
||||
u64a reach1 = domain_mask & (it_hi >> 8);
|
||||
u64a reach2 = domain_mask & (it_hi >> 16);
|
||||
u64a reach3 = domain_mask & (it_hi >> 24);
|
||||
u64a reach4 = domain_mask & (it_hi >> 32);
|
||||
u64a reach5 = domain_mask & (it_hi >> 40);
|
||||
u64a reach6 = domain_mask & (it_hi >> 48);
|
||||
u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8));
|
||||
u64a reach8 = domain_mask & it_lo;
|
||||
u64a reach9 = domain_mask & (it_lo >> 8);
|
||||
u64a reach10 = domain_mask & (it_lo >> 16);
|
||||
u64a reach11 = domain_mask & (it_lo >> 24);
|
||||
u64a reach12 = domain_mask & (it_lo >> 32);
|
||||
u64a reach13 = domain_mask & (it_lo >> 40);
|
||||
u64a reach14 = domain_mask & (it_lo >> 48);
|
||||
u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
|
||||
|
||||
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
|
||||
u64a reach5 = andn(domain_mask_flipped, itPtr + 5);
|
||||
u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
|
||||
u64a reach7 = andn(domain_mask_flipped, itPtr + 7);
|
||||
|
||||
m128 st4 = load_m128_from_u64a(ft + reach4);
|
||||
m128 st5 = load_m128_from_u64a(ft + reach5);
|
||||
m128 st6 = load_m128_from_u64a(ft + reach6);
|
||||
m128 st7 = load_m128_from_u64a(ft + reach7);
|
||||
|
||||
st1 = lshiftbyte_m128(st1, 1);
|
||||
st2 = lshiftbyte_m128(st2, 2);
|
||||
st3 = lshiftbyte_m128(st3, 3);
|
||||
st4 = lshiftbyte_m128(st4, 4);
|
||||
st5 = lshiftbyte_m128(st5, 5);
|
||||
st6 = lshiftbyte_m128(st6, 6);
|
||||
st7 = lshiftbyte_m128(st7, 7);
|
||||
m128 st0 = load_m128_from_u64a(ft + reach0);
|
||||
m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1);
|
||||
m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2);
|
||||
m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3);
|
||||
m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4);
|
||||
m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5);
|
||||
m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6);
|
||||
m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7);
|
||||
m128 st8 = load_m128_from_u64a(ft + reach8);
|
||||
m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1);
|
||||
m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2);
|
||||
m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3);
|
||||
m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4);
|
||||
m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5);
|
||||
m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6);
|
||||
m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7);
|
||||
|
||||
st0 = or128(st0, st1);
|
||||
st2 = or128(st2, st3);
|
||||
@ -195,39 +192,6 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
||||
st0 = or128(st0, st2);
|
||||
st4 = or128(st4, st6);
|
||||
st0 = or128(st0, st4);
|
||||
*s = or128(*s, st0);
|
||||
|
||||
*conf0 = movq(*s);
|
||||
*s = rshiftbyte_m128(*s, 8);
|
||||
*conf0 ^= ~0ULL;
|
||||
|
||||
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
|
||||
u64a reach9 = andn(domain_mask_flipped, itPtr + 9);
|
||||
u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
|
||||
u64a reach11 = andn(domain_mask_flipped, itPtr + 11);
|
||||
|
||||
m128 st8 = load_m128_from_u64a(ft + reach8);
|
||||
m128 st9 = load_m128_from_u64a(ft + reach9);
|
||||
m128 st10 = load_m128_from_u64a(ft + reach10);
|
||||
m128 st11 = load_m128_from_u64a(ft + reach11);
|
||||
|
||||
u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
|
||||
u64a reach13 = andn(domain_mask_flipped, itPtr + 13);
|
||||
u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
|
||||
u64a reach15 = andn(domain_mask_flipped, itPtr + 15);
|
||||
|
||||
m128 st12 = load_m128_from_u64a(ft + reach12);
|
||||
m128 st13 = load_m128_from_u64a(ft + reach13);
|
||||
m128 st14 = load_m128_from_u64a(ft + reach14);
|
||||
m128 st15 = load_m128_from_u64a(ft + reach15);
|
||||
|
||||
st9 = lshiftbyte_m128(st9, 1);
|
||||
st10 = lshiftbyte_m128(st10, 2);
|
||||
st11 = lshiftbyte_m128(st11, 3);
|
||||
st12 = lshiftbyte_m128(st12, 4);
|
||||
st13 = lshiftbyte_m128(st13, 5);
|
||||
st14 = lshiftbyte_m128(st14, 6);
|
||||
st15 = lshiftbyte_m128(st15, 7);
|
||||
|
||||
st8 = or128(st8, st9);
|
||||
st10 = or128(st10, st11);
|
||||
@ -236,11 +200,14 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
||||
st8 = or128(st8, st10);
|
||||
st12 = or128(st12, st14);
|
||||
st8 = or128(st8, st12);
|
||||
*s = or128(*s, st8);
|
||||
|
||||
*conf8 = movq(*s);
|
||||
*s = rshiftbyte_m128(*s, 8);
|
||||
*conf8 ^= ~0ULL;
|
||||
m128 st = or128(*s, st0);
|
||||
*conf0 = movq(st) ^ ~0ULL;
|
||||
st = rshiftbyte_m128(st, 8);
|
||||
st = or128(st, st8);
|
||||
|
||||
*conf8 = movq(st) ^ ~0ULL;
|
||||
*s = rshiftbyte_m128(st, 8);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
@ -248,6 +215,7 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
||||
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
|
||||
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
|
||||
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
|
||||
|
||||
u64a reach0 = andn(domain_mask_flipped, itPtr);
|
||||
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
|
||||
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
|
||||
@ -300,6 +268,7 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
||||
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
|
||||
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
|
||||
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
|
||||
|
||||
u64a reach0 = andn(domain_mask_flipped, itPtr);
|
||||
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
|
||||
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
|
||||
@ -329,7 +298,7 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
|
||||
static really_inline
|
||||
void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
|
||||
const u32 *confBase, const struct FDR_Runtime_Args *a,
|
||||
const u8 *ptr, u32 *last_match_id, struct zone *z) {
|
||||
const u8 *ptr, u32 *last_match_id, const struct zone *z) {
|
||||
const u8 bucket = 8;
|
||||
|
||||
if (likely(!*conf)) {
|
||||
@ -339,7 +308,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
|
||||
/* ptr is currently referring to a location in the zone's buffer, we also
|
||||
* need a pointer in the original, main buffer for the final string compare.
|
||||
*/
|
||||
const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);
|
||||
const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr)
|
||||
|
||||
const u8 *confLoc = ptr;
|
||||
|
||||
@ -364,7 +333,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
|
||||
}
|
||||
|
||||
static really_inline
|
||||
void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
|
||||
void dumpZoneInfo(UNUSED const struct zone *z, UNUSED size_t zone_id) {
|
||||
#ifdef DEBUG
|
||||
DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
|
||||
DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
|
||||
@ -696,6 +665,10 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
|
||||
const u8 *tryFloodDetect = zz->floodPtr; \
|
||||
const u8 *start_ptr = zz->start; \
|
||||
const u8 *end_ptr = zz->end; \
|
||||
for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \
|
||||
itPtr += 4*ITER_BYTES) { \
|
||||
__builtin_prefetch(itPtr); \
|
||||
} \
|
||||
\
|
||||
for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
|
||||
itPtr += ITER_BYTES) { \
|
||||
@ -739,6 +712,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
|
||||
assert(ISALIGNED_CL(confBase));
|
||||
struct zone zones[ZONE_MAX];
|
||||
assert(fdr->domain > 8 && fdr->domain < 16);
|
||||
memset(zones, 0, sizeof(zones));
|
||||
|
||||
size_t numZone = prepareZones(a->buf, a->len,
|
||||
a->buf_history + a->len_history,
|
||||
|
@ -44,7 +44,6 @@
|
||||
#include "util/compare.h"
|
||||
#include "util/container.h"
|
||||
#include "util/dump_mask.h"
|
||||
#include "util/make_unique.h"
|
||||
#include "util/math.h"
|
||||
#include "util/noncopyable.h"
|
||||
#include "util/target_info.h"
|
||||
@ -99,7 +98,7 @@ public:
|
||||
const FDREngineDescription &eng_in,
|
||||
bool make_small_in, const Grey &grey_in)
|
||||
: eng(eng_in), grey(grey_in), tab(eng_in.getTabSizeBytes()),
|
||||
lits(move(lits_in)), bucketToLits(move(bucketToLits_in)),
|
||||
lits(std::move(lits_in)), bucketToLits(std::move(bucketToLits_in)),
|
||||
make_small(make_small_in) {}
|
||||
|
||||
bytecode_ptr<FDR> build();
|
||||
@ -128,7 +127,7 @@ void andMask(u8 *dest, const u8 *a, const u8 *b, u32 num_bytes) {
|
||||
}
|
||||
|
||||
void FDRCompiler::createInitialState(FDR *fdr) {
|
||||
u8 *start = (u8 *)&fdr->start;
|
||||
u8 *start = reinterpret_cast<u8 *>(&fdr->start);
|
||||
|
||||
/* initial state should to be 1 in each slot in the bucket up to bucket
|
||||
* minlen - 1, and 0 thereafter */
|
||||
@ -136,8 +135,9 @@ void FDRCompiler::createInitialState(FDR *fdr) {
|
||||
// Find the minimum length for the literals in this bucket.
|
||||
const vector<LiteralIndex> &bucket_lits = bucketToLits[b];
|
||||
u32 min_len = ~0U;
|
||||
for (const LiteralIndex &lit_idx : bucket_lits) {
|
||||
min_len = min(min_len, verify_u32(lits[lit_idx].s.length()));
|
||||
for (const LiteralIndex &lit_idx : bucket_lits) {
|
||||
// cppcheck-suppress useStlAlgorithm
|
||||
min_len = min(min_len, verify_u32(lits[lit_idx].s.length()));
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("bucket %u has min_len=%u\n", b, min_len);
|
||||
@ -176,7 +176,7 @@ bytecode_ptr<FDR> FDRCompiler::setupFDR() {
|
||||
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
|
||||
assert(fdr); // otherwise would have thrown std::bad_alloc
|
||||
|
||||
u8 *fdr_base = (u8 *)fdr.get();
|
||||
u8 *fdr_base = reinterpret_cast<u8 *>(fdr.get());
|
||||
|
||||
// Write header.
|
||||
fdr->size = size;
|
||||
@ -206,8 +206,7 @@ bytecode_ptr<FDR> FDRCompiler::setupFDR() {
|
||||
assert(ISALIGNED_CL(ptr));
|
||||
fdr->floodOffset = verify_u32(ptr - fdr_base);
|
||||
memcpy(ptr, floodTable.get(), floodTable.size());
|
||||
ptr += floodTable.size(); // last write, no need to round up
|
||||
|
||||
|
||||
return fdr;
|
||||
}
|
||||
|
||||
@ -494,18 +493,18 @@ map<BucketIndex, vector<LiteralIndex>> assignStringsToBuckets(
|
||||
u32 cnt = last_id - first_id;
|
||||
// long literals first for included literals checking
|
||||
for (u32 k = 0; k < cnt; k++) {
|
||||
litIds.push_back(last_id - k - 1);
|
||||
litIds.emplace_back(last_id - k - 1);
|
||||
}
|
||||
|
||||
i = j;
|
||||
buckets.push_back(litIds);
|
||||
buckets.emplace_back(litIds);
|
||||
}
|
||||
|
||||
// reverse bucket id, longer literals come first
|
||||
map<BucketIndex, vector<LiteralIndex>> bucketToLits;
|
||||
size_t bucketCnt = buckets.size();
|
||||
for (size_t i = 0; i < bucketCnt; i++) {
|
||||
bucketToLits.emplace(bucketCnt - i - 1, move(buckets[i]));
|
||||
bucketToLits.emplace(bucketCnt - i - 1, std::move(buckets[i]));
|
||||
}
|
||||
|
||||
return bucketToLits;
|
||||
@ -868,7 +867,7 @@ unique_ptr<HWLMProto> fdrBuildProtoInternal(u8 engType,
|
||||
auto bucketToLits = assignStringsToBuckets(lits, *des);
|
||||
addIncludedInfo(lits, des->getNumBuckets(), bucketToLits);
|
||||
auto proto =
|
||||
ue2::make_unique<HWLMProto>(engType, move(des), lits, bucketToLits,
|
||||
std::make_unique<HWLMProto>(engType, std::move(des), lits, bucketToLits,
|
||||
make_small);
|
||||
return proto;
|
||||
}
|
||||
|
@ -39,6 +39,7 @@ namespace ue2 {
|
||||
size_t maxLen(const vector<hwlmLiteral> &lits) {
|
||||
size_t rv = 0;
|
||||
for (const auto &lit : lits) {
|
||||
// cppcheck-suppress useStlAlgorithm
|
||||
rv = max(rv, lit.s.size());
|
||||
}
|
||||
return rv;
|
||||
|
@ -84,9 +84,10 @@ struct FDRConfirm {
|
||||
|
||||
static really_inline
|
||||
const u32 *getConfirmLitIndex(const struct FDRConfirm *fdrc) {
|
||||
// cppcheck-suppress cstyleCast
|
||||
const u8 *base = (const u8 *)fdrc;
|
||||
const u32 *litIndex =
|
||||
(const u32 *)(base + ROUNDUP_N(sizeof(*fdrc), alignof(u32)));
|
||||
// cppcheck-suppress cstyleCast
|
||||
const u32 *litIndex =(const u32 *)(base + ROUNDUP_N(sizeof(*fdrc), alignof(u32)));
|
||||
assert(ISALIGNED(litIndex));
|
||||
return litIndex;
|
||||
}
|
||||
|
@ -58,7 +58,7 @@ u64a make_u64a_mask(const vector<u8> &v) {
|
||||
u64a mask = 0;
|
||||
size_t vlen = v.size();
|
||||
size_t len = std::min(vlen, sizeof(mask));
|
||||
unsigned char *m = (unsigned char *)&mask;
|
||||
u8 *m = reinterpret_cast<u8 *>(&mask);
|
||||
memcpy(m + sizeof(mask) - len, &v[vlen - len], len);
|
||||
return mask;
|
||||
}
|
||||
@ -159,10 +159,10 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
|
||||
map<u32, vector<LiteralIndex> > res2lits;
|
||||
hwlm_group_t gm = 0;
|
||||
for (LiteralIndex i = 0; i < lits.size(); i++) {
|
||||
LitInfo & li = tmpLitInfo[i];
|
||||
const LitInfo & li = tmpLitInfo[i];
|
||||
u32 hash = CONF_HASH_CALL(li.v, andmsk, mult, nBits);
|
||||
DEBUG_PRINTF("%016llx --> %u\n", li.v, hash);
|
||||
res2lits[hash].push_back(i);
|
||||
res2lits[hash].emplace_back(i);
|
||||
gm |= li.groups;
|
||||
}
|
||||
|
||||
@ -245,10 +245,10 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
|
||||
fdrc->groups = gm;
|
||||
|
||||
// After the FDRConfirm, we have the lit index array.
|
||||
u8 *fdrc_base = (u8 *)fdrc.get();
|
||||
u8 *fdrc_base = reinterpret_cast<u8 *>(fdrc.get());
|
||||
u8 *ptr = fdrc_base + sizeof(*fdrc);
|
||||
ptr = ROUNDUP_PTR(ptr, alignof(u32));
|
||||
u32 *bitsToLitIndex = (u32 *)ptr;
|
||||
u32 *bitsToLitIndex = reinterpret_cast<u32 *>(ptr);
|
||||
ptr += bitsToLitIndexSize;
|
||||
|
||||
// After the lit index array, we have the LitInfo structures themselves,
|
||||
@ -265,7 +265,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
|
||||
LiteralIndex litIdx = *i;
|
||||
|
||||
// Write LitInfo header.
|
||||
LitInfo &finalLI = *(LitInfo *)ptr;
|
||||
LitInfo &finalLI = *(reinterpret_cast<LitInfo *>(ptr));
|
||||
finalLI = tmpLitInfo[litIdx];
|
||||
|
||||
ptr += sizeof(LitInfo); // String starts directly after LitInfo.
|
||||
@ -294,22 +294,20 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
|
||||
const EngineDescription &eng,
|
||||
const map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
|
||||
bool make_small) {
|
||||
unique_ptr<TeddyEngineDescription> teddyDescr =
|
||||
getTeddyDescription(eng.getID());
|
||||
|
||||
BC2CONF bc2Conf;
|
||||
u32 totalConfirmSize = 0;
|
||||
for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
|
||||
if (contains(bucketToLits, b)) {
|
||||
vector<hwlmLiteral> vl;
|
||||
for (const LiteralIndex &lit_idx : bucketToLits.at(b)) {
|
||||
vl.push_back(lits[lit_idx]);
|
||||
// cppcheck-suppress useStlAlgorithm
|
||||
vl.emplace_back(lits[lit_idx]);
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("b %d sz %zu\n", b, vl.size());
|
||||
auto fc = getFDRConfirm(vl, make_small);
|
||||
totalConfirmSize += fc.size();
|
||||
bc2Conf.emplace(b, move(fc));
|
||||
bc2Conf.emplace(b, std::move(fc));
|
||||
}
|
||||
}
|
||||
|
||||
@ -320,7 +318,7 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
|
||||
auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 64);
|
||||
assert(buf); // otherwise would have thrown std::bad_alloc
|
||||
|
||||
u32 *confBase = (u32 *)buf.get();
|
||||
u32 *confBase = reinterpret_cast<u32 *>(buf.get());
|
||||
u8 *ptr = buf.get() + totalConfSwitchSize;
|
||||
assert(ISALIGNED_CL(ptr));
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2019, Intel Corporation
|
||||
* Copyright (c) 2024, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -54,9 +55,14 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
|
||||
if (likely(!start)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// these cplusplus checks are needed because this is included in both fdr.c and teddy.cpp
|
||||
#ifdef __cplusplus
|
||||
const struct LitInfo *li
|
||||
= reinterpret_cast<const struct LitInfo *>(reinterpret_cast<const u8 *>(fdrc) + start);
|
||||
#else
|
||||
const struct LitInfo *li
|
||||
= (const struct LitInfo *)((const u8 *)fdrc + start);
|
||||
#endif
|
||||
|
||||
struct hs_scratch *scratch = a->scratch;
|
||||
assert(!scratch->fdr_conf);
|
||||
@ -74,18 +80,20 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
|
||||
goto out;
|
||||
}
|
||||
|
||||
const u8 *loc = buf + i - li->size + 1;
|
||||
do{ // this do while is to block off the line below from the goto
|
||||
const u8 *loc = buf + i - li->size + 1;
|
||||
|
||||
if (loc < buf) {
|
||||
u32 full_overhang = buf - loc;
|
||||
size_t len_history = a->len_history;
|
||||
|
||||
if (loc < buf) {
|
||||
u32 full_overhang = buf - loc;
|
||||
size_t len_history = a->len_history;
|
||||
|
||||
// can't do a vectored confirm either if we don't have
|
||||
// the bytes
|
||||
if (full_overhang > len_history) {
|
||||
goto out;
|
||||
// can't do a vectored confirm either if we don't have
|
||||
// the bytes
|
||||
if (full_overhang > len_history) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
}while(0);
|
||||
assert(li->size <= sizeof(CONF_TYPE));
|
||||
|
||||
if (unlikely(!(li->groups & *control))) {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -74,9 +74,9 @@ void dumpLitIndex(const FDRConfirm *fdrc, FILE *f) {
|
||||
static
|
||||
void dumpConfirms(const void *fdr_base, u32 conf_offset, u32 num_confirms,
|
||||
FILE *f) {
|
||||
const u32 *conf = (const u32 *)((const char *)fdr_base + conf_offset);
|
||||
const u32 *conf = reinterpret_cast<const u32 *>(reinterpret_cast<const char *>(fdr_base) + conf_offset);
|
||||
for (u32 i = 0; i < num_confirms; i++) {
|
||||
const auto *fdrc = (const FDRConfirm *)((const char *)conf + conf[i]);
|
||||
const auto *fdrc = reinterpret_cast<const FDRConfirm *>(reinterpret_cast<const char *>(conf) + conf[i]);
|
||||
fprintf(f, " confirm %u\n", i);
|
||||
fprintf(f, " andmsk 0x%016llx\n", fdrc->andmsk);
|
||||
fprintf(f, " mult 0x%016llx\n", fdrc->mult);
|
||||
@ -107,12 +107,31 @@ void dumpTeddyReinforced(const u8 *rmsk, const u32 num_tables, FILE *f) {
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void dumpTeddyDupMasks(const u8 *dmsk, u32 numMasks, FILE *f) {
|
||||
// dump nibble masks
|
||||
u32 maskWidth = 2;
|
||||
fprintf(f, " dup nibble masks:\n");
|
||||
for (u32 i = 0; i < numMasks * 2; i++) {
|
||||
fprintf(f, " -%u%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
|
||||
for (u32 j = 0; j < 16 * maskWidth * 2; j++) {
|
||||
u8 val = dmsk[i * 16 * maskWidth * 2 + j];
|
||||
for (u32 k = 0; k < 8; k++) {
|
||||
fprintf(f, "%s", ((val >> k) & 0x1) ? "1" : "0");
|
||||
}
|
||||
fprintf(f, " ");
|
||||
}
|
||||
fprintf(f, "\n");
|
||||
}
|
||||
fprintf(f, "\n");
|
||||
}
|
||||
|
||||
static
|
||||
void dumpTeddyMasks(const u8 *baseMsk, u32 numMasks, u32 maskWidth, FILE *f) {
|
||||
// dump nibble masks
|
||||
fprintf(f, " nibble masks:\n");
|
||||
for (u32 i = 0; i < numMasks * 2; i++) {
|
||||
fprintf(f, " -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
|
||||
fprintf(f, " -%u%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
|
||||
for (u32 j = 0; j < 16 * maskWidth; j++) {
|
||||
u8 val = baseMsk[i * 16 * maskWidth + j];
|
||||
for (u32 k = 0; k < 8; k++) {
|
||||
@ -138,7 +157,7 @@ void dumpTeddy(const Teddy *teddy, FILE *f) {
|
||||
fprintf(f, " buckets %u\n", des->getNumBuckets());
|
||||
fprintf(f, " packed %s\n", des->packed ? "true" : "false");
|
||||
fprintf(f, " strings %u\n", teddy->numStrings);
|
||||
fprintf(f, " size %zu bytes\n", fdrSize((const FDR *)teddy));
|
||||
fprintf(f, " size %zu bytes\n", fdrSize(reinterpret_cast<const FDR *>(teddy)));
|
||||
fprintf(f, " max length %u\n", teddy->maxStringLen);
|
||||
fprintf(f, " floodoff %u (%x)\n", teddy->floodOffset,
|
||||
teddy->floodOffset);
|
||||
@ -146,12 +165,17 @@ void dumpTeddy(const Teddy *teddy, FILE *f) {
|
||||
|
||||
u32 maskWidth = des->getNumBuckets() / 8;
|
||||
size_t headerSize = sizeof(Teddy);
|
||||
size_t maskLen = des->numMasks * 16 * 2 * maskWidth;
|
||||
const u8 *teddy_base = (const u8 *)teddy;
|
||||
const u8 *teddy_base = reinterpret_cast<const u8 *>(teddy);
|
||||
const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
|
||||
const u8 *rmsk = baseMsk + ROUNDUP_CL(maskLen);
|
||||
dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f);
|
||||
dumpTeddyReinforced(rmsk, maskWidth, f);
|
||||
size_t maskLen = des->numMasks * 16 * 2 * maskWidth;
|
||||
const u8 *rdmsk = baseMsk + ROUNDUP_CL(maskLen);
|
||||
if (maskWidth == 1) { // reinforcement table in Teddy
|
||||
dumpTeddyReinforced(rdmsk, maskWidth, f);
|
||||
} else { // dup nibble mask table in Fat Teddy
|
||||
assert(maskWidth == 2);
|
||||
dumpTeddyDupMasks(rdmsk, des->numMasks, f);
|
||||
}
|
||||
dumpConfirms(teddy, teddy->confOffset, des->getNumBuckets(), f);
|
||||
}
|
||||
|
||||
@ -177,7 +201,7 @@ void dumpFDR(const FDR *fdr, FILE *f) {
|
||||
|
||||
void fdrPrintStats(const FDR *fdr, FILE *f) {
|
||||
if (fdrIsTeddy(fdr)) {
|
||||
dumpTeddy((const Teddy *)fdr, f);
|
||||
dumpTeddy(reinterpret_cast<const Teddy *>(fdr), f);
|
||||
} else {
|
||||
dumpFDR(fdr, f);
|
||||
}
|
||||
|
@ -31,7 +31,6 @@
|
||||
#include "hs_compile.h"
|
||||
#include "util/target_info.h"
|
||||
#include "util/compare.h" // for ourisalpha()
|
||||
#include "util/make_unique.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
@ -72,7 +71,7 @@ u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) {
|
||||
} else if (num_lits < 5000) {
|
||||
// for larger but not huge sizes, go to stride 2 only if we have at
|
||||
// least minlen 3
|
||||
desiredStride = MIN(min_len - 1, 2);
|
||||
desiredStride = std::min(min_len - 1, 2UL);
|
||||
}
|
||||
}
|
||||
|
||||
@ -196,7 +195,7 @@ unique_ptr<FDREngineDescription> chooseEngine(const target_t &target,
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("using engine %u\n", best->getID());
|
||||
return ue2::make_unique<FDREngineDescription>(*best);
|
||||
return std::make_unique<FDREngineDescription>(*best);
|
||||
}
|
||||
|
||||
SchemeBitIndex FDREngineDescription::getSchemeBit(BucketIndex b,
|
||||
@ -222,7 +221,7 @@ unique_ptr<FDREngineDescription> getFdrDescription(u32 engineID) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return ue2::make_unique<FDREngineDescription>(allDescs[engineID]);
|
||||
return std::make_unique<FDREngineDescription>(allDescs[engineID]);
|
||||
}
|
||||
|
||||
} // namespace ue2
|
||||
|
@ -208,8 +208,8 @@ bytecode_ptr<u8> setupFDRFloodControl(const vector<hwlmLiteral> &lits,
|
||||
auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 16);
|
||||
assert(buf); // otherwise would have thrown std::bad_alloc
|
||||
|
||||
u32 *floodHeader = (u32 *)buf.get();
|
||||
FDRFlood *layoutFlood = (FDRFlood *)(buf.get() + floodHeaderSize);
|
||||
u32 *floodHeader = reinterpret_cast<u32 *>(buf.get());
|
||||
FDRFlood *layoutFlood = reinterpret_cast<FDRFlood *>(buf.get() + floodHeaderSize);
|
||||
|
||||
u32 currentFloodIndex = 0;
|
||||
for (const auto &m : flood2chars) {
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2024, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -37,6 +38,13 @@
|
||||
#define FLOOD_MINIMUM_SIZE 256
|
||||
#define FLOOD_BACKOFF_START 32
|
||||
|
||||
// this is because this file is included in both fdr.c and teddy.cpp
|
||||
#if defined __cplusplus
|
||||
#define CU64A_P_CAST(X) reinterpret_cast<const u64a*>(X)
|
||||
#else
|
||||
#define CU64A_P_CAST(X) (const u64a *)(X)
|
||||
#endif
|
||||
|
||||
static really_inline
|
||||
const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
|
||||
// if we don't have a flood at either the start or end,
|
||||
@ -47,18 +55,18 @@ const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
|
||||
|
||||
/* entry points in runtime.c prefetch relevant data */
|
||||
#ifndef FLOOD_32
|
||||
u64a x11 = *(const u64a *)ROUNDUP_PTR(buf, 8);
|
||||
u64a x12 = *(const u64a *)ROUNDUP_PTR(buf+8, 8);
|
||||
u64a x11 = *CU64A_P_CAST(ROUNDUP_PTR(buf, 8));
|
||||
u64a x12 = *CU64A_P_CAST(ROUNDUP_PTR(buf+8, 8));
|
||||
if (x11 == x12) {
|
||||
return buf + floodBackoff;
|
||||
}
|
||||
u64a x21 = *(const u64a *)ROUNDUP_PTR(buf + len/2, 8);
|
||||
u64a x22 = *(const u64a *)ROUNDUP_PTR(buf + len/2 + 8, 8);
|
||||
u64a x21 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len/2, 8));
|
||||
u64a x22 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len/2 + 8, 8));
|
||||
if (x21 == x22) {
|
||||
return buf + floodBackoff;
|
||||
}
|
||||
u64a x31 = *(const u64a *)ROUNDUP_PTR(buf + len - 24, 8);
|
||||
u64a x32 = *(const u64a *)ROUNDUP_PTR(buf + len - 16, 8);
|
||||
u64a x31 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len - 24, 8));
|
||||
u64a x32 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len - 16, 8));
|
||||
if (x31 == x32) {
|
||||
return buf + floodBackoff;
|
||||
}
|
||||
@ -106,9 +114,15 @@ const u8 * floodDetect(const struct FDR * fdr,
|
||||
|
||||
// go from c to our FDRFlood structure
|
||||
u8 c = buf[i];
|
||||
#ifdef __cplusplus
|
||||
const u8 * fBase = (reinterpret_cast<const u8 *>(fdr)) + fdr->floodOffset;
|
||||
u32 fIdx = (reinterpret_cast<const u32 *>(fBase))[c];
|
||||
const struct FDRFlood * fsb = reinterpret_cast<const struct FDRFlood *>(fBase + sizeof(u32) * 256);
|
||||
#else
|
||||
const u8 * fBase = ((const u8 *)fdr) + fdr->floodOffset;
|
||||
u32 fIdx = ((const u32 *)fBase)[c];
|
||||
const struct FDRFlood * fsb = (const struct FDRFlood *)(fBase + sizeof(u32) * 256);
|
||||
#endif
|
||||
const struct FDRFlood * fl = &fsb[fIdx];
|
||||
|
||||
#ifndef FLOOD_32
|
||||
@ -116,7 +130,7 @@ const u8 * floodDetect(const struct FDR * fdr,
|
||||
cmpVal |= cmpVal << 8;
|
||||
cmpVal |= cmpVal << 16;
|
||||
cmpVal |= cmpVal << 32;
|
||||
u64a probe = *(const u64a *)ROUNDUP_PTR(buf+i, 8);
|
||||
u64a probe = *CU64A_P_CAST(ROUNDUP_PTR(buf+i, 8));
|
||||
#else
|
||||
u32 cmpVal = c;
|
||||
cmpVal |= cmpVal << 8;
|
||||
@ -139,16 +153,16 @@ const u8 * floodDetect(const struct FDR * fdr,
|
||||
#ifndef FLOOD_32
|
||||
j -= (u32)((uintptr_t)buf + j) & 0x7; // push j back to yield 8-aligned addrs
|
||||
for (; j + 32 < mainLoopLen; j += 32) {
|
||||
u64a v = *(const u64a *)(buf + j);
|
||||
u64a v2 = *(const u64a *)(buf + j + 8);
|
||||
u64a v3 = *(const u64a *)(buf + j + 16);
|
||||
u64a v4 = *(const u64a *)(buf + j + 24);
|
||||
u64a v = *CU64A_P_CAST(buf + j);
|
||||
u64a v2 = *CU64A_P_CAST(buf + j + 8);
|
||||
u64a v3 = *CU64A_P_CAST(buf + j + 16);
|
||||
u64a v4 = *CU64A_P_CAST(buf + j + 24);
|
||||
if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (; j + 8 < mainLoopLen; j += 8) {
|
||||
u64a v = *(const u64a *)(buf + j);
|
||||
u64a v = *CU64A_P_CAST(buf + j);
|
||||
if (v != cmpVal) {
|
||||
break;
|
||||
}
|
||||
@ -172,7 +186,11 @@ const u8 * floodDetect(const struct FDR * fdr,
|
||||
}
|
||||
#endif
|
||||
for (; j < mainLoopLen; j++) {
|
||||
#ifdef __cplusplus
|
||||
u8 v = *(reinterpret_cast<const u8 *>(buf + j));
|
||||
#else
|
||||
u8 v = *(const u8 *)(buf + j);
|
||||
#endif
|
||||
if (v != c) {
|
||||
break;
|
||||
}
|
||||
|
1122
src/fdr/teddy.c
1122
src/fdr/teddy.c
File diff suppressed because it is too large
Load Diff
862
src/fdr/teddy.cpp
Normal file
862
src/fdr/teddy.cpp
Normal file
@ -0,0 +1,862 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
* Copyright (c) 2024, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Teddy literal matcher: SSSE3 engine runtime.
|
||||
*/
|
||||
|
||||
#include "fdr_internal.h"
|
||||
#include "flood_runtime.h"
|
||||
#include "teddy.h"
|
||||
#include "teddy_internal.h"
|
||||
#include "teddy_runtime_common.h"
|
||||
#include "util/arch.h"
|
||||
#include "util/simd_utils.h"
|
||||
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
static really_inline
|
||||
hwlm_error_t conf_chunk_64(u64a chunk, u8 bucket, u8 offset,
|
||||
CautionReason reason, const u8 *pt,
|
||||
const u32* confBase,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t *control,
|
||||
u32 *last_match) {
|
||||
if (unlikely(chunk != ones_u64a)) {
|
||||
chunk = ~chunk;
|
||||
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
|
||||
control, last_match);
|
||||
// adapted from CHECK_HWLM_TERMINATE_MATCHING
|
||||
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
#define CONF_CHUNK_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
|
||||
if(conf_chunk_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||
|
||||
#else // 32/64
|
||||
|
||||
/** \brief 32-bit counterpart of conf_chunk_64, used on 32-bit targets.
 *
 * All-ones means "no candidate in these 4 bytes"; otherwise invert and
 * confirm each candidate bit. Returns HWLM_TERMINATED when the confirm
 * callback requested that matching stop, HWLM_SUCCESS otherwise.
 */
static really_inline
hwlm_error_t conf_chunk_32(u32 chunk, u8 bucket, u8 offset,
                           CautionReason reason, const u8 *pt,
                           const u32* confBase,
                           const struct FDR_Runtime_Args *a,
                           hwlm_group_t *control,
                           u32 *last_match) {
    if (unlikely(chunk != ones_u32)) {
        chunk = ~chunk; // invert: candidate positions become 1-bits
        do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
                             control, last_match);
        // adapted from CHECK_HWLM_TERMINATE_MATCHING
        if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
            return HWLM_TERMINATED;
        }
    }
    return HWLM_SUCCESS;
}

// Wrapper macro: propagate a termination request out of the *enclosing*
// scan function as soon as conf_chunk_32() reports it.
#define CONF_CHUNK_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
if(conf_chunk_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX512VBMI) || defined(HAVE_AVX512) // common to both 512b's
|
||||
|
||||
/** \brief Locate the duplicated 512-bit shufti masks inside the Teddy
 *  bytecode blob.
 *
 * Layout (each region rounded up to a cache line): Teddy header, then
 * 2 * numMask m256 masks, then the duplicated m512 masks returned here.
 */
static really_inline
const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
    return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
                          + ROUNDUP_CL(2 * numMask * sizeof(m256)));
}
|
||||
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
|
||||
/** \brief Confirm step for a 512-bit Teddy state vector, 64-bit targets.
 *
 * An all-ones vector means no candidates anywhere in these 64 bytes.
 * Otherwise the vector is split into eight 64-bit chunks (low/high halves
 * of each 128-bit lane) and each chunk is confirmed at its byte offset.
 */
static really_inline
hwlm_error_t confirm_teddy_64_512(m512 var, u8 bucket, u8 offset,
                                  CautionReason reason, const u8 *ptr,
                                  const struct FDR_Runtime_Args *a,
                                  const u32* confBase, hwlm_group_t *control,
                                  u32 *last_match) {
    if (unlikely(diff512(var, ones512()))) {
        m128 p128_0 = extract128from512(var, 0);
        m128 p128_1 = extract128from512(var, 1);
        m128 p128_2 = extract128from512(var, 2);
        m128 p128_3 = extract128from512(var, 3);
        // split each 128-bit lane into its low and high 64-bit halves
        u64a part1 = movq(p128_0);
        u64a part2 = movq(rshiftbyte_m128(p128_0, 8));
        u64a part3 = movq(p128_1);
        u64a part4 = movq(rshiftbyte_m128(p128_1, 8));
        u64a part5 = movq(p128_2);
        u64a part6 = movq(rshiftbyte_m128(p128_2, 8));
        u64a part7 = movq(p128_3);
        u64a part8 = movq(rshiftbyte_m128(p128_3, 8));
        CONF_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_64(part2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_64(part3, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_64(part4, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_64(part5, bucket, offset + 32, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_64(part6, bucket, offset + 40, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_64(part7, bucket, offset + 48, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_64(part8, bucket, offset + 56, reason, ptr, confBase, a, control, last_match);
    }
    return HWLM_SUCCESS;
}
|
||||
|
||||
#define confirm_teddy_512_f confirm_teddy_64_512
|
||||
|
||||
#else // 32/64
|
||||
|
||||
/** \brief Confirm step for a 512-bit Teddy state vector, 32-bit targets.
 *
 * Same contract as confirm_teddy_64_512, but the vector is split into
 * sixteen 32-bit chunks (four per 128-bit lane), each confirmed at its
 * byte offset within the 64-byte block.
 */
static really_inline
hwlm_error_t confirm_teddy_32_512(m512 var, u8 bucket, u8 offset,
                                  CautionReason reason, const u8 *ptr,
                                  const struct FDR_Runtime_Args *a,
                                  const u32* confBase, hwlm_group_t *control,
                                  u32 *last_match) {
    if (unlikely(diff512(var, ones512()))) {
        m128 p128_0 = extract128from512(var, 0);
        m128 p128_1 = extract128from512(var, 1);
        m128 p128_2 = extract128from512(var, 2);
        m128 p128_3 = extract128from512(var, 3);
        // four 32-bit words per 128-bit lane
        u32 part1 = movd(p128_0);
        u32 part2 = movd(rshiftbyte_m128(p128_0, 4));
        u32 part3 = movd(rshiftbyte_m128(p128_0, 8));
        u32 part4 = movd(rshiftbyte_m128(p128_0, 12));
        u32 part5 = movd(p128_1);
        u32 part6 = movd(rshiftbyte_m128(p128_1, 4));
        u32 part7 = movd(rshiftbyte_m128(p128_1, 8));
        u32 part8 = movd(rshiftbyte_m128(p128_1, 12));
        u32 part9 = movd(p128_2);
        u32 part10 = movd(rshiftbyte_m128(p128_2, 4));
        u32 part11 = movd(rshiftbyte_m128(p128_2, 8));
        u32 part12 = movd(rshiftbyte_m128(p128_2, 12));
        u32 part13 = movd(p128_3);
        u32 part14 = movd(rshiftbyte_m128(p128_3, 4));
        u32 part15 = movd(rshiftbyte_m128(p128_3, 8));
        u32 part16 = movd(rshiftbyte_m128(p128_3, 12));
        CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part9, bucket, offset + 32, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part10, bucket, offset + 36, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part11, bucket, offset + 40, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part12, bucket, offset + 44, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part13, bucket, offset + 48, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part14, bucket, offset + 52, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part15, bucket, offset + 56, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part16, bucket, offset + 60, reason, ptr, confBase, a, control, last_match);
    }
    return HWLM_SUCCESS;
}
|
||||
|
||||
#define confirm_teddy_512_f confirm_teddy_32_512
|
||||
|
||||
|
||||
#endif // 32/64
|
||||
|
||||
#define CONFIRM_TEDDY_512(...) if(confirm_teddy_512_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||
|
||||
#endif // AVX512VBMI or AVX512
|
||||
|
||||
|
||||
#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
|
||||
|
||||
#define TEDDY_VBMI_SL1_MASK 0xfffffffffffffffeULL
|
||||
#define TEDDY_VBMI_SL2_MASK 0xfffffffffffffffcULL
|
||||
#define TEDDY_VBMI_SL3_MASK 0xfffffffffffffff8ULL
|
||||
|
||||
/** \brief Compute the combined Teddy state for one 64-byte block using
 *  AVX-512VBMI.
 *
 * Each input byte is split into low/high nibbles which index the shufti
 * mask pairs in dup_mask. For NMSK > 1, the per-mask results are shifted
 * back by 1..3 bytes with masked vpermb (sl_msk / TEDDY_VBMI_SLn_MASK
 * zero the bytes shifted in) and OR'd together, so a candidate survives
 * only if all NMSK consecutive masks agree.
 */
template<int NMSK>
static really_inline
m512 prep_conf_teddy_512vbmi_templ(const m512 *lo_mask, const m512 *dup_mask,
                                   const m512 *sl_msk, const m512 val) {
    m512 lo = and512(val, *lo_mask);                   // low nibble of each byte
    m512 hi = and512(rshift64_m512(val, 4), *lo_mask); // high nibble of each byte
    m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),
                            pshufb_m512(dup_mask[1], hi));

    if constexpr (NMSK == 1) return shuf_or_b0;
    m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),
                            pshufb_m512(dup_mask[3], hi));
    m512 sl1 = maskz_vpermb512(TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
    if constexpr (NMSK == 2) return (or512(sl1, shuf_or_b0));
    m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),
                            pshufb_m512(dup_mask[5], hi));
    m512 sl2 = maskz_vpermb512(TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
    if constexpr (NMSK == 3) return (or512(sl2, or512(sl1, shuf_or_b0)));
    m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),
                            pshufb_m512(dup_mask[7], hi));
    m512 sl3 = maskz_vpermb512(TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
    return (or512(sl3, or512(sl2, or512(sl1, shuf_or_b0))));
}
|
||||
|
||||
|
||||
#define TEDDY_VBMI_SL1_POS 15
|
||||
#define TEDDY_VBMI_SL2_POS 14
|
||||
#define TEDDY_VBMI_SL3_POS 13
|
||||
|
||||
#define TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffffffffffULL >> n_sh)
|
||||
#define TEDDY_VBMI_CONF_MASK_FULL (0xffffffffffffffffULL << n_sh)
|
||||
#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap)
|
||||
#define TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffffffffffULL >> (64 - n_sh))
|
||||
|
||||
/** \brief Teddy scan loop, AVX-512VBMI flavour, templated on mask count.
 *
 * Processes the buffer in (64 - NMSK + 1)-byte steps so that consecutive
 * blocks overlap by NMSK-1 bytes, which lets the multi-mask shift/OR in
 * prep_conf_teddy_512vbmi_templ see full context at block boundaries.
 * Head and tail blocks are neutralised with confirm masks (set bits mean
 * "no candidate") built from the TEDDY_VBMI_CONF_MASK_* macros.
 */
template<int NMSK>
hwlm_error_t fdr_exec_teddy_512vbmi_templ(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = ones_u32;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 64; // referenced by the CHECK_FLOOD machinery
    u32 n_sh = NMSK - 1;
    const size_t loopBytes = 64 - n_sh; // stride; blocks overlap by n_sh bytes
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m128 *maskBase = getMaskBase(teddy);

    m512 lo_mask = set1_64x8(0xf); // nibble extraction mask
    m512 dup_mask[NMSK * 2];       // lo/hi shufti mask pair per Teddy mask
    m512 sl_msk[NMSK - 1];         // vpermb shift patterns for masks 2..4
    dup_mask[0] = set1_4x128(maskBase[0]);
    dup_mask[1] = set1_4x128(maskBase[1]);
    if constexpr (NMSK > 1){
        dup_mask[2] = set1_4x128(maskBase[2]);
        dup_mask[3] = set1_4x128(maskBase[3]);
        sl_msk[0] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL1_POS);
    }
    if constexpr (NMSK > 2){
        dup_mask[4] = set1_4x128(maskBase[4]);
        dup_mask[5] = set1_4x128(maskBase[5]);
        sl_msk[1] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL2_POS);
    }
    if constexpr (NMSK > 3){
        dup_mask[6] = set1_4x128(maskBase[6]);
        dup_mask[7] = set1_4x128(maskBase[7]);
        sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);
    }
    const u32 *confBase = getConfBase(teddy);

    // steady-state confirm mask: suppress the first n_sh (overlap) bytes
    u64a k = TEDDY_VBMI_CONF_MASK_FULL;
    m512 p_mask = set_mask_m512(~k);
    u32 overlap = 0;
    u64a patch = 0;
    if (likely(ptr + loopBytes <= buf_end)) {
        // first block: suppress the trailing n_sh bytes instead
        m512 p_mask0 = set_mask_m512(~TEDDY_VBMI_CONF_MASK_HEAD);
        m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, loadu512(ptr));
        r_0 = or512(r_0, p_mask0);
        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
        ptr += loopBytes;
        overlap = n_sh;
        patch = TEDDY_VBMI_LOAD_MASK_PATCH;
    }

    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {
        __builtin_prefetch(ptr - n_sh + (64 * 2));
        CHECK_FLOOD;
        // load includes the n_sh overlap bytes before ptr
        m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, loadu512(ptr - n_sh));
        r_0 = or512(r_0, p_mask);
        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr - n_sh);
    }

    assert(ptr + loopBytes > buf_end);
    if (ptr < buf_end) {
        // tail: masked load of the remaining bytes plus any overlap
        u32 left = (u32)(buf_end - ptr);
        u64a k1 = TEDDY_VBMI_CONF_MASK_VAR(left);
        m512 p_mask1 = set_mask_m512(~k1);
        m512 val_0 = loadu_maskz_m512(k1 | patch, ptr - overlap);
        m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, val_0);
        r_0 = or512(r_0, p_mask1);
        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr - overlap);
    }

    return HWLM_SUCCESS;
}
|
||||
|
||||
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_512vbmi_templ
|
||||
|
||||
#elif defined(HAVE_AVX512) // AVX512 reinforced teddy
|
||||
|
||||
/* both 512b versions use the same confirm teddy */
|
||||
|
||||
/** \brief Recursive OR of the per-mask shufti results for 512-bit Teddy.
 *
 * For mask i (1-based), the lo/hi shufti results are OR'd and shifted left
 * by i-1 bytes within each 128-bit lane, then OR'd with the accumulation
 * for masks 1..i-1. Recursion terminates at the NMSK == 1 specialization.
 */
template <int NMSK>
static inline
m512 shift_or_512_templ(const m512 *dup_mask, m512 lo, m512 hi) {
    return or512(lshift128_m512(or512(pshufb_m512(dup_mask[(NMSK - 1) * 2], lo),
                                      pshufb_m512(dup_mask[(NMSK * 2) - 1], hi)),
                                NMSK - 1), shift_or_512_templ<NMSK - 1>(dup_mask, lo, hi));
}

// Base case: a single mask needs no shifting, just the lo/hi combination.
template <>
m512 shift_or_512_templ<1>(const m512 *dup_mask, m512 lo, m512 hi){
    return or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi));
}
|
||||
|
||||
/** \brief Compute the Teddy state for an already-loaded 512-bit value,
 *  without applying the reinforced (boundary) mask.
 *
 * Used for head/tail blocks where the caller supplies a masked load and a
 * separate confirm mask instead of reinforcement data.
 */
template <int NMSK>
static really_inline
m512 prep_conf_teddy_no_reinforcement_512_templ(const m512 *lo_mask,
                                                const m512 *dup_mask,
                                                const m512 val) {
    m512 lo = and512(val, *lo_mask);                   // low nibbles
    m512 hi = and512(rshift64_m512(val, 4), *lo_mask); // high nibbles
    return shift_or_512_templ<NMSK>(dup_mask, lo, hi);
}
|
||||
|
||||
/** \brief Compute the Teddy state for one aligned 64-byte block, applying
 *  the reinforced mask at 128-bit lane boundaries.
 *
 * c_0/c_16/c_32/c_48 carry the byte preceding each 128-bit lane: c_0 holds
 * the last byte of the *previous* block (updated at the end of this call),
 * while c_16/c_32/c_48 are read from within this block. These bytes index
 * r_msk_base to patch cross-lane shift effects; the initial c_0 value of
 * 0x100 selects the "no previous byte" reinforcement entry.
 */
template <int NMSK>
static really_inline
m512 prep_conf_teddy_512_templ(const m512 *lo_mask, const m512 *dup_mask,
                               const u8 *ptr, const u64a *r_msk_base,
                               u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
    m512 lo = and512(load512(ptr), *lo_mask);
    m512 hi = and512(rshift64_m512(load512(ptr), 4), *lo_mask);
    *c_16 = *(ptr + 15);
    *c_32 = *(ptr + 31);
    *c_48 = *(ptr + 47);
    m512 r_msk = set8x64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],
                         0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);
    *c_0 = *(ptr + 63); // carried into the next block's first lane
    return or512(shift_or_512_templ<NMSK>(dup_mask, lo, hi), r_msk);
}

// Convenience wrapper; expects lo_mask, dup_mask, r_msk_base and the c_*
// carry variables to be in scope in the calling scan function.
#define PREP_CONF_FN_512(ptr, n) \
    prep_conf_teddy_512_templ<n>(&lo_mask, dup_mask, ptr, r_msk_base, \
                                 &c_0, &c_16, &c_32, &c_48)
|
||||
|
||||
/** \brief Teddy scan loop, AVX-512 (reinforced) flavour, templated on mask
 *  count.
 *
 * Aligns to a 64-byte boundary with a cautious vectored load, then runs the
 * main loop two 64-byte blocks per iteration using reinforced masks, and
 * finishes with another cautious vectored load for the tail.
 */
template <int NMSK>
hwlm_error_t fdr_exec_teddy_512_templ(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = ones_u32;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 128; // two 64-byte blocks per main-loop pass
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m128 *maskBase = getMaskBase(teddy);

    m512 lo_mask = set1_64x8(0xf); // nibble extraction mask
    m512 dup_mask[NMSK * 2];       // lo/hi shufti mask pair per Teddy mask

    dup_mask[0] = set1_4x128(maskBase[0]);
    dup_mask[1] = set1_4x128(maskBase[1]);
    if constexpr (NMSK > 1){
        dup_mask[2] = set1_4x128(maskBase[2]);
        dup_mask[3] = set1_4x128(maskBase[3]);
    }
    if constexpr (NMSK > 2){
        dup_mask[4] = set1_4x128(maskBase[4]);
        dup_mask[5] = set1_4x128(maskBase[5]);
    }
    if constexpr (NMSK > 3){
        dup_mask[6] = set1_4x128(maskBase[6]);
        dup_mask[7] = set1_4x128(maskBase[7]);
    }
    const u32 *confBase = getConfBase(teddy);

    const u64a *r_msk_base = getReinforcedMaskBase(teddy, NMSK);
    // 0x100 selects the "no previous byte" reinforcement entry
    u32 c_0 = 0x100;
    u32 c_16 = 0x100;
    u32 c_32 = 0x100;
    u32 c_48 = 0x100;
    const u8 *mainStart = ROUNDUP_PTR(ptr, 64);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
    if (ptr < mainStart) {
        // cautious head block to reach 64-byte alignment
        ptr = mainStart - 64;
        m512 p_mask;
        m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset,
                                     a->buf, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m512 r_0 = prep_conf_teddy_no_reinforcement_512_templ<NMSK>(&lo_mask, dup_mask, val_0);
        r_0 = or512(r_0, p_mask);
        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
        ptr += 64;
    }

    if (ptr + 64 <= buf_end) {
        // first aligned block, still cautious w.r.t. history
        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
        ptr += 64;
    }

    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes * 4));
        CHECK_FLOOD;
        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        m512 r_1 = PREP_CONF_FN_512(ptr + 64, NMSK);
        CONFIRM_TEDDY_512(r_1, 8, 64, NOT_CAUTIOUS, ptr);
    }

    if (ptr + 64 <= buf_end) {
        // one leftover full block after the unrolled loop
        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        ptr += 64;
    }

    assert(ptr + 64 > buf_end);
    if (ptr < buf_end) {
        // cautious tail block: partial vectored load with confirm mask
        m512 p_mask;
        m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m512 r_0 = prep_conf_teddy_no_reinforcement_512_templ<NMSK>(&lo_mask, dup_mask,val_0);
        r_0 = or512(r_0, p_mask);
        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
    }

    return HWLM_SUCCESS;
}
|
||||
|
||||
|
||||
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_512_templ
|
||||
|
||||
/* #endif // AVX512 vs AVX512VBMI * back to the original fully exclusive logic */
|
||||
|
||||
#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
|
||||
hwlm_error_t confirm_teddy_64_256(m256 var, u8 bucket, u8 offset,
|
||||
CautionReason reason, const u8 *ptr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
const u32* confBase, hwlm_group_t *control,
|
||||
u32 *last_match) {
|
||||
if (unlikely(diff256(var, ones256()))) {
|
||||
m128 lo = movdq_lo(var);
|
||||
m128 hi = movdq_hi(var);
|
||||
u64a part1 = movq(lo);
|
||||
u64a part2 = movq(rshiftbyte_m128(lo, 8));
|
||||
u64a part3 = movq(hi);
|
||||
u64a part4 = movq(rshiftbyte_m128(hi, 8));
|
||||
CONF_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_CHUNK_64(part2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_CHUNK_64(part3, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_CHUNK_64(part4, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
#define confirm_teddy_256_f confirm_teddy_64_256
|
||||
|
||||
#else
|
||||
|
||||
hwlm_error_t confirm_teddy_32_256(m256 var, u8 bucket, u8 offset,
|
||||
CautionReason reason, const u8 *ptr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
const u32* confBase, hwlm_group_t *control,
|
||||
u32 *last_match) {
|
||||
if (unlikely(diff256(var, ones256()))) {
|
||||
m128 lo = movdq_lo(var);
|
||||
m128 hi = movdq_hi(var);
|
||||
u32 part1 = movd(lo);
|
||||
u32 part2 = movd(rshiftbyte_m128(lo, 4));
|
||||
u32 part3 = movd(rshiftbyte_m128(lo, 8));
|
||||
u32 part4 = movd(rshiftbyte_m128(lo, 12));
|
||||
u32 part5 = movd(hi);
|
||||
u32 part6 = movd(rshiftbyte_m128(hi, 4));
|
||||
u32 part7 = movd(rshiftbyte_m128(hi, 8));
|
||||
u32 part8 = movd(rshiftbyte_m128(hi, 12));
|
||||
CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_CHUNK_32(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_CHUNK_32(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_CHUNK_32(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_CHUNK_32(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
#define confirm_teddy_256_f confirm_teddy_32_256
|
||||
|
||||
#endif
|
||||
|
||||
#define CONFIRM_TEDDY_256(...) if(confirm_teddy_256_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||
|
||||
/*
|
||||
static really_inline
|
||||
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
const u8 *lo, const u8 *hi,
|
||||
const u8 *buf_history, size_t len_history,
|
||||
const u32 nMasks) {
|
||||
m128 p_mask128;
|
||||
m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
|
||||
buf_history, len_history, nMasks));
|
||||
*p_mask = set1_2x128(p_mask128);
|
||||
return ret;
|
||||
}
|
||||
*/
|
||||
|
||||
/** \brief Recursive OR of the per-mask shufti results for 256-bit Teddy.
 *
 * Mask i's lo/hi shufti results are OR'd, shifted left by i-1 bytes within
 * each 128-bit lane, and OR'd with the accumulation for masks 1..i-1.
 * Recursion terminates at the NMSK == 1 specialization.
 */
template <int NMSK>
static inline
m256 shift_or_256_templ(const m256 *dup_mask, m256 lo, m256 hi){
    return or256(lshift128_m256(or256(pshufb_m256(dup_mask[(NMSK-1)*2], lo),
                                      pshufb_m256(dup_mask[(NMSK*2)-1], hi)),
                                (NMSK-1)), shift_or_256_templ<NMSK-1>(dup_mask, lo, hi));
}

// Base case: a single mask needs no shifting, just the lo/hi combination.
template<>
m256 shift_or_256_templ<1>(const m256 *dup_mask, m256 lo, m256 hi){
    return or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi));
}
|
||||
|
||||
/** \brief Compute the Teddy state for an already-loaded 256-bit value,
 *  without applying the reinforced (boundary) mask.
 *
 * Used for head/tail blocks where the caller supplies a masked vectored
 * load and a separate confirm mask instead of reinforcement data.
 */
template <int NMSK>
static really_inline
m256 prep_conf_teddy_no_reinforcement_256_templ(const m256 *lo_mask,
                                                const m256 *dup_mask,
                                                const m256 val) {
    m256 lo = and256(val, *lo_mask);                   // low nibbles
    m256 hi = and256(rshift64_m256(val, 4), *lo_mask); // high nibbles
    return shift_or_256_templ<NMSK>(dup_mask, lo, hi);
}
|
||||
|
||||
/** \brief Compute the Teddy state for one aligned 32-byte block, applying
 *  the reinforced mask at 128-bit lane boundaries.
 *
 * c_0 carries the last byte of the previous block; c_128 is read from byte
 * 15 of this block (the byte preceding the second 128-bit lane). Both index
 * r_msk_base to patch cross-lane shift effects; the initial value 0x100
 * selects the "no previous byte" reinforcement entry.
 */
template <int NMSK>
static really_inline
m256 prep_conf_teddy_256_templ(const m256 *lo_mask, const m256 *dup_mask,
                               const u8 *ptr, const u64a *r_msk_base,
                               u32 *c_0, u32 *c_128) {
    m256 lo = and256(load256(ptr), *lo_mask);
    m256 hi = and256(rshift64_m256(load256(ptr), 4), *lo_mask);
    *c_128 = *(ptr + 15);
    m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]);
    *c_0 = *(ptr + 31); // carried into the next block's first lane
    return or256(shift_or_256_templ<NMSK>(dup_mask, lo, hi), r_msk);
}

// Wrapper for head/tail blocks (no reinforcement); expects lo_mask and
// dup_mask in scope in the calling scan function.
#define PREP_CONF_FN_256_NO_REINFORCEMENT(val, n) \
    prep_conf_teddy_no_reinforcement_256_templ<n>(&lo_mask, dup_mask, val)

// Wrapper for aligned main-loop blocks; additionally expects r_msk_base
// and the c_0/c_128 carry variables in scope.
#define PREP_CONF_FN_256(ptr, n) \
    prep_conf_teddy_256_templ<n>(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
|
||||
|
||||
/** \brief Teddy scan loop, AVX2 (reinforced) flavour, templated on mask
 *  count.
 *
 * Aligns to a 32-byte boundary with a cautious vectored load, runs the main
 * loop two 32-byte blocks per iteration using reinforced masks, then handles
 * the tail with another cautious vectored load.
 */
template <int NMSK>
hwlm_error_t fdr_exec_teddy_256_templ(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = ones_u32;
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 64; // two 32-byte blocks per main-loop pass
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m128 *maskBase = getMaskBase(teddy);
    //PREPARE_MASKS_256;

    m256 lo_mask = set1_32x8(0xf); // nibble extraction mask
    m256 dup_mask[NMSK * 2];       // lo/hi shufti mask pair per Teddy mask
    dup_mask[0] = set1_2x128(maskBase[0]);
    dup_mask[1] = set1_2x128(maskBase[1]);
    if constexpr (NMSK > 1){
        dup_mask[2] = set1_2x128(maskBase[2]);
        dup_mask[3] = set1_2x128(maskBase[3]);
    }
    if constexpr (NMSK > 2){
        dup_mask[4] = set1_2x128(maskBase[4]);
        dup_mask[5] = set1_2x128(maskBase[5]);
    }
    if constexpr (NMSK > 3){
        dup_mask[6] = set1_2x128(maskBase[6]);
        dup_mask[7] = set1_2x128(maskBase[7]);
    }
    const u32 *confBase = getConfBase(teddy);

    const u64a *r_msk_base = getReinforcedMaskBase(teddy, NMSK);
    // 0x100 selects the "no previous byte" reinforcement entry
    u32 c_0 = 0x100;
    u32 c_128 = 0x100;
    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
    if (ptr < mainStart) {
        // cautious head block to reach 32-byte alignment
        ptr = mainStart - 32;
        m256 p_mask;
        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset,
                                     a->buf, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m256 r_0 = PREP_CONF_FN_256_NO_REINFORCEMENT(val_0, NMSK);
        r_0 = or256(r_0, p_mask);
        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
        ptr += 32;
    }

    if (ptr + 32 <= buf_end) {
        // first aligned block, still cautious w.r.t. history
        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
        ptr += 32;
    }

    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes * 4));
        CHECK_FLOOD;
        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
        CONFIRM_TEDDY_256(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        m256 r_1 = PREP_CONF_FN_256(ptr + 32, NMSK);
        CONFIRM_TEDDY_256(r_1, 8, 32, NOT_CAUTIOUS, ptr);
    }

    if (ptr + 32 <= buf_end) {
        // one leftover full block after the unrolled loop
        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
        CONFIRM_TEDDY_256(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        ptr += 32;
    }

    assert(ptr + 32 > buf_end);
    if (ptr < buf_end) {
        // cautious tail block: partial vectored load with confirm mask
        m256 p_mask;
        m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m256 r_0 = PREP_CONF_FN_256_NO_REINFORCEMENT(val_0, NMSK);
        r_0 = or256(r_0, p_mask);
        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
    }

    return HWLM_SUCCESS;
}
|
||||
|
||||
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_256_templ
|
||||
|
||||
#else // not defined HAVE_AVX2
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
/** \brief Confirm step for a 128-bit Teddy state vector, 64-bit targets.
 *
 * An all-ones vector means no candidates in these 16 bytes. Otherwise the
 * vector is stored to an aligned scratch array and its two 64-bit halves
 * are confirmed via CONF_CHUNK_64.
 */
static really_inline
hwlm_error_t confirm_teddy_64_128(m128 var, u8 bucket, u8 offset,
                                  CautionReason reason, const u8 *ptr,
                                  const struct FDR_Runtime_Args *a,
                                  const u32* confBase, hwlm_group_t *control,
                                  u32 *last_match) {
    if (unlikely(diff128(var, ones128()))) {
        u64a lo = 0;
        u64a hi = 0;
        // spill the vector via an aligned store rather than lane extracts
        u64a __attribute__((aligned(16))) vec[2];
        store128(vec, var);
        lo = vec[0];
        hi = vec[1];
        CONF_CHUNK_64(lo, bucket, offset, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_64(hi, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
    }
    return HWLM_SUCCESS;
}
|
||||
|
||||
#define confirm_teddy_128_f confirm_teddy_64_128
|
||||
|
||||
#else // 32/64
|
||||
|
||||
/** \brief Confirm step for a 128-bit Teddy state vector, 32-bit targets.
 *
 * Same contract as confirm_teddy_64_128, but the vector is split into four
 * 32-bit words confirmed via CONF_CHUNK_32.
 */
static really_inline
hwlm_error_t confirm_teddy_32_128(m128 var, u8 bucket, u8 offset,
                                  CautionReason reason, const u8 *ptr,
                                  const struct FDR_Runtime_Args *a,
                                  const u32* confBase, hwlm_group_t *control,
                                  u32 *last_match) {
    if (unlikely(diff128(var, ones128()))) {
        u32 part1 = movd(var);
        u32 part2 = movd(rshiftbyte_m128(var, 4));
        u32 part3 = movd(rshiftbyte_m128(var, 8));
        u32 part4 = movd(rshiftbyte_m128(var, 12));
        CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
        CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
    }
    return HWLM_SUCCESS;
}
|
||||
#define confirm_teddy_128_f confirm_teddy_32_128
|
||||
|
||||
#endif // 32/64
|
||||
|
||||
|
||||
#define CONFIRM_TEDDY_128(...) if(confirm_teddy_128_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||
|
||||
/** \brief Compute the combined Teddy state for one 16-byte block (SSSE3
 *  baseline), templated on mask count.
 *
 * Each input byte's low/high nibbles index the shufti mask pairs in
 * maskBase. For masks 2..NMSK, the result is shifted back by i-1 bytes with
 * palignr against a zero vector (bytes shifted in are zero, i.e. "match"
 * in this inverted encoding is not asserted there) and OR'd into the
 * accumulated state, so a candidate survives only if all masks agree.
 */
template <int NMSK>
static really_inline
m128 prep_conf_teddy_128_templ(const m128 *maskBase, m128 val) {
    m128 mask = set1_16x8(0xf);
    m128 lo = and128(val, mask);                   // low nibbles
    m128 hi = and128(rshift64_m128(val, 4), mask); // high nibbles
    m128 r1 = or128(pshufb_m128(maskBase[0 * 2], lo),
                    pshufb_m128(maskBase[0 * 2 + 1], hi));
    if constexpr (NMSK == 1) return r1;
    m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
                       pshufb_m128(maskBase[1 * 2 + 1], hi));

    m128 old_1 = zeroes128(); // zero context for bytes shifted in
    m128 res_shifted_1 = palignr(res_1, old_1, 16 - 1);
    m128 r2 = or128(r1, res_shifted_1);
    if constexpr (NMSK == 2) return r2;
    m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
                       pshufb_m128(maskBase[2 * 2 + 1], hi));
    m128 res_shifted_2 = palignr(res_2, old_1, 16 - 2);
    m128 r3 = or128(r2, res_shifted_2);
    if constexpr (NMSK == 3) return r3;
    m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
                       pshufb_m128(maskBase[3 * 2 + 1], hi));
    m128 res_shifted_3 = palignr(res_3, old_1, 16 - 3);
    return or128(r3, res_shifted_3);
}
|
||||
|
||||
/** \brief Teddy scan loop, 128-bit SSSE3 baseline, templated on mask count.
 *
 * Aligns to a 16-byte boundary with a cautious vectored load, runs the main
 * loop two 16-byte blocks per iteration, then handles the tail with another
 * cautious vectored load.
 */
template <int NMSK>
hwlm_error_t fdr_exec_teddy_128_templ(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;
    const u8 *tryFloodDetect = a->firstFloodDetect;
    u32 last_match = ones_u32;
    const struct Teddy *teddy = reinterpret_cast<const struct Teddy *>(fdr);
    const size_t iterBytes = 32; // two 16-byte blocks per main-loop pass
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m128 *maskBase = getMaskBase(teddy);
    const u32 *confBase = getConfBase(teddy);

    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
    if (ptr < mainStart) {
        // cautious head block to reach 16-byte alignment
        ptr = mainStart - 16;
        m128 p_mask;
        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset,
                                     a->buf, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, val_0);
        r_0 = or128(r_0, p_mask);
        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
        ptr += 16;
    }

    if (ptr + 16 <= buf_end) {
        // first aligned block, still cautious w.r.t. history
        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
        ptr += 16;
    }

    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes * 4));
        CHECK_FLOOD;
        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
        CONFIRM_TEDDY_128(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        m128 r_1 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr + 16));
        CONFIRM_TEDDY_128(r_1, 8, 16, NOT_CAUTIOUS, ptr);
    }

    if (ptr + 16 <= buf_end) {
        // one leftover full block after the unrolled loop
        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
        CONFIRM_TEDDY_128(r_0, 8, 0, NOT_CAUTIOUS, ptr);
        ptr += 16;
    }

    assert(ptr + 16 > buf_end);
    if (ptr < buf_end) {
        // cautious tail block: partial vectored load with confirm mask
        m128 p_mask;
        m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end,
                                     a->buf_history, a->len_history, NMSK);
        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, val_0);
        r_0 = or128(r_0, p_mask);
        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
    }

    return HWLM_SUCCESS;
}
|
||||
|
||||
#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_128_templ
|
||||
|
||||
|
||||
#endif // HAVE_AVX2 HAVE_AVX512
|
||||
|
||||
|
||||
|
||||
// C-linkage entry points dispatched by the FDR runtime. FDR_EXEC_TEDDY_FN
// resolves to the widest scan loop the build supports (128/256/512-bit),
// templated on the number of Teddy masks (1-4). The _pck ("packed")
// variants currently forward to the same template instantiation as their
// plain counterparts; any difference in confirm behaviour comes from the
// compiled bytecode (confBase), not from separate code paths here.
extern "C" {

hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<1>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<1>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<2>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<2>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<3>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<3>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<4>(fdr, a, control);
}

hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_TEDDY_FN<4>(fdr, a, control);
}

} // extern
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2016-2017, Intel Corporation
|
||||
* Copyright (c) 2024, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -39,6 +40,10 @@
|
||||
struct FDR; // forward declaration from fdr_internal.h
|
||||
struct FDR_Runtime_Args;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control);
|
||||
@ -106,5 +111,8 @@ hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
|
||||
hwlm_group_t control);
|
||||
|
||||
#endif /* HAVE_AVX2 */
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* TEDDY_H_ */
|
||||
|
@ -1,712 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2016-2020, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Teddy literal matcher: AVX2 engine runtime.
|
||||
*/
|
||||
|
||||
#include "fdr_internal.h"
|
||||
#include "flood_runtime.h"
|
||||
#include "teddy.h"
|
||||
#include "teddy_internal.h"
|
||||
#include "teddy_runtime_common.h"
|
||||
#include "util/arch.h"
|
||||
#include "util/simd_utils.h"
|
||||
|
||||
#if defined(HAVE_AVX2)
|
||||
|
||||
const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
|
||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
|
||||
};
|
||||
|
||||
#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \
|
||||
do { \
|
||||
if (unlikely(chunk != ones_u64a)) { \
|
||||
chunk = ~chunk; \
|
||||
conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn) \
|
||||
do { \
|
||||
if (unlikely(chunk != ones_u32)) { \
|
||||
chunk = ~chunk; \
|
||||
conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
|
||||
&control, &last_match); \
|
||||
CHECK_HWLM_TERMINATE_MATCHING; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
static really_inline
|
||||
const m256 *getMaskBase_fat(const struct Teddy *teddy) {
|
||||
return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
|
||||
}
|
||||
|
||||
#if defined(HAVE_AVX512_REVERT) // revert to AVX2 Fat Teddy
|
||||
|
||||
static really_inline
|
||||
const u64a *getReinforcedMaskBase_fat(const struct Teddy *teddy, u8 numMask) {
|
||||
return (const u64a *)((const u8 *)getMaskBase_fat(teddy)
|
||||
+ ROUNDUP_CL(2 * numMask * sizeof(m256)));
|
||||
}
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||
do { \
|
||||
if (unlikely(diff512(var, ones512()))) { \
|
||||
m512 swap = swap256in512(var); \
|
||||
m512 r = interleave512lo(var, swap); \
|
||||
m128 r0 = extract128from512(r, 0); \
|
||||
m128 r1 = extract128from512(r, 1); \
|
||||
u64a part1 = movq(r0); \
|
||||
u64a part2 = extract64from128(r0, 1); \
|
||||
u64a part5 = movq(r1); \
|
||||
u64a part6 = extract64from128(r1, 1); \
|
||||
r = interleave512hi(var, swap); \
|
||||
r0 = extract128from512(r, 0); \
|
||||
r1 = extract128from512(r, 1); \
|
||||
u64a part3 = movq(r0); \
|
||||
u64a part4 = extract64from128(r0, 1); \
|
||||
u64a part7 = movq(r1); \
|
||||
u64a part8 = extract64from128(r1, 1); \
|
||||
CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, conf_fn); \
|
||||
} \
|
||||
} while(0)
|
||||
#else
|
||||
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||
do { \
|
||||
if (unlikely(diff512(var, ones512()))) { \
|
||||
m512 swap = swap256in512(var); \
|
||||
m512 r = interleave512lo(var, swap); \
|
||||
m128 r0 = extract128from512(r, 0); \
|
||||
m128 r1 = extract128from512(r, 1); \
|
||||
u32 part1 = movd(r0); \
|
||||
u32 part2 = extract32from128(r0, 1); \
|
||||
u32 part3 = extract32from128(r0, 2); \
|
||||
u32 part4 = extract32from128(r0, 3); \
|
||||
u32 part9 = movd(r1); \
|
||||
u32 part10 = extract32from128(r1, 1); \
|
||||
u32 part11 = extract32from128(r1, 2); \
|
||||
u32 part12 = extract32from128(r1, 3); \
|
||||
r = interleave512hi(var, swap); \
|
||||
r0 = extract128from512(r, 0); \
|
||||
r1 = extract128from512(r, 1); \
|
||||
u32 part5 = movd(r0); \
|
||||
u32 part6 = extract32from128(r0, 1); \
|
||||
u32 part7 = extract32from128(r0, 2); \
|
||||
u32 part8 = extract32from128(r0, 3); \
|
||||
u32 part13 = movd(r1); \
|
||||
u32 part14 = extract32from128(r1, 1); \
|
||||
u32 part15 = extract32from128(r1, 2); \
|
||||
u32 part16 = extract32from128(r1, 3); \
|
||||
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, conf_fn); \
|
||||
} \
|
||||
} while(0)
|
||||
#endif
|
||||
|
||||
static really_inline
|
||||
m512 vectoredLoad2x256(m512 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
const u8 *lo, const u8 *hi,
|
||||
const u8 *buf_history, size_t len_history,
|
||||
const u32 nMasks) {
|
||||
m256 p_mask256;
|
||||
m512 ret = set2x256(vectoredLoad256(&p_mask256, ptr, start_offset, lo, hi,
|
||||
buf_history, len_history, nMasks));
|
||||
*p_mask = set2x256(p_mask256);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val) \
|
||||
m512 lo = and512(val, *lo_mask); \
|
||||
m512 hi = and512(rshift64_m512(val, 4), *lo_mask)
|
||||
|
||||
#define PREP_FAT_SHUF_MASK \
|
||||
PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(set2x256(load256(ptr))); \
|
||||
*c_16 = *(ptr + 15); \
|
||||
m512 r_msk = set512_64(0ULL, r_msk_base_hi[*c_16], \
|
||||
0ULL, r_msk_base_hi[*c_0], \
|
||||
0ULL, r_msk_base_lo[*c_16], \
|
||||
0ULL, r_msk_base_lo[*c_0]); \
|
||||
*c_0 = *(ptr + 31)
|
||||
|
||||
#define FAT_SHIFT_OR_M1 \
|
||||
or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi))
|
||||
|
||||
#define FAT_SHIFT_OR_M2 \
|
||||
or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \
|
||||
pshufb_m512(dup_mask[3], hi)), \
|
||||
1), FAT_SHIFT_OR_M1)
|
||||
|
||||
#define FAT_SHIFT_OR_M3 \
|
||||
or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \
|
||||
pshufb_m512(dup_mask[5], hi)), \
|
||||
2), FAT_SHIFT_OR_M2)
|
||||
|
||||
#define FAT_SHIFT_OR_M4 \
|
||||
or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \
|
||||
pshufb_m512(dup_mask[7], hi)), \
|
||||
3), FAT_SHIFT_OR_M3)
|
||||
|
||||
static really_inline
|
||||
m512 prep_conf_fat_teddy_no_reinforcement_m1(const m512 *lo_mask,
|
||||
const m512 *dup_mask,
|
||||
const m512 val) {
|
||||
PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
|
||||
return FAT_SHIFT_OR_M1;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 prep_conf_fat_teddy_no_reinforcement_m2(const m512 *lo_mask,
|
||||
const m512 *dup_mask,
|
||||
const m512 val) {
|
||||
PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
|
||||
return FAT_SHIFT_OR_M2;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 prep_conf_fat_teddy_no_reinforcement_m3(const m512 *lo_mask,
|
||||
const m512 *dup_mask,
|
||||
const m512 val) {
|
||||
PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
|
||||
return FAT_SHIFT_OR_M3;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 prep_conf_fat_teddy_no_reinforcement_m4(const m512 *lo_mask,
|
||||
const m512 *dup_mask,
|
||||
const m512 val) {
|
||||
PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val);
|
||||
return FAT_SHIFT_OR_M4;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
|
||||
const u8 *ptr, const u64a *r_msk_base_lo,
|
||||
const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
|
||||
PREP_FAT_SHUF_MASK;
|
||||
return or512(FAT_SHIFT_OR_M1, r_msk);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
|
||||
const u8 *ptr, const u64a *r_msk_base_lo,
|
||||
const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
|
||||
PREP_FAT_SHUF_MASK;
|
||||
return or512(FAT_SHIFT_OR_M2, r_msk);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
|
||||
const u8 *ptr, const u64a *r_msk_base_lo,
|
||||
const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
|
||||
PREP_FAT_SHUF_MASK;
|
||||
return or512(FAT_SHIFT_OR_M3, r_msk);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
|
||||
const u8 *ptr, const u64a *r_msk_base_lo,
|
||||
const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) {
|
||||
PREP_FAT_SHUF_MASK;
|
||||
return or512(FAT_SHIFT_OR_M4, r_msk);
|
||||
}
|
||||
|
||||
#define PREP_CONF_FAT_FN_NO_REINFORCEMENT(val, n) \
|
||||
prep_conf_fat_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
|
||||
|
||||
#define PREP_CONF_FAT_FN(ptr, n) \
|
||||
prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, ptr, \
|
||||
r_msk_base_lo, r_msk_base_hi, &c_0, &c_16)
|
||||
|
||||
/*
|
||||
* In FAT teddy, it needs 2 bytes to represent result of each position,
|
||||
* so each nibble's(for example, lo nibble of last byte) FAT teddy mask
|
||||
* has 16x2 bytes:
|
||||
* |----------------------------------|----------------------------------|
|
||||
* 16bytes (bucket 0..7 in each byte) 16bytes (bucket 8..15 in each byte)
|
||||
* A B
|
||||
* at runtime FAT teddy reads 16 bytes once and duplicate them to 32 bytes:
|
||||
* |----------------------------------|----------------------------------|
|
||||
* 16bytes input data (lo nibbles) 16bytes duplicated data (lo nibbles)
|
||||
* X X
|
||||
* then do pshufb_m256(AB, XX).
|
||||
*
|
||||
* In AVX512 reinforced FAT teddy, it reads 32 bytes once and duplicate them
|
||||
* to 64 bytes:
|
||||
* |----------------|----------------|----------------|----------------|
|
||||
* X Y X Y
|
||||
* in this case we need DUP_FAT_MASK to construct AABB:
|
||||
* |----------------|----------------|----------------|----------------|
|
||||
* A A B B
|
||||
* then do pshufb_m512(AABB, XYXY).
|
||||
*/
|
||||
|
||||
#define DUP_FAT_MASK(a) mask_set2x256(set2x256(swap128in256(a)), 0xC3, a)
|
||||
|
||||
#define PREPARE_FAT_MASKS_1 \
|
||||
dup_mask[0] = DUP_FAT_MASK(maskBase[0]); \
|
||||
dup_mask[1] = DUP_FAT_MASK(maskBase[1]);
|
||||
|
||||
#define PREPARE_FAT_MASKS_2 \
|
||||
PREPARE_FAT_MASKS_1 \
|
||||
dup_mask[2] = DUP_FAT_MASK(maskBase[2]); \
|
||||
dup_mask[3] = DUP_FAT_MASK(maskBase[3]);
|
||||
|
||||
#define PREPARE_FAT_MASKS_3 \
|
||||
PREPARE_FAT_MASKS_2 \
|
||||
dup_mask[4] = DUP_FAT_MASK(maskBase[4]); \
|
||||
dup_mask[5] = DUP_FAT_MASK(maskBase[5]);
|
||||
|
||||
#define PREPARE_FAT_MASKS_4 \
|
||||
PREPARE_FAT_MASKS_3 \
|
||||
dup_mask[6] = DUP_FAT_MASK(maskBase[6]); \
|
||||
dup_mask[7] = DUP_FAT_MASK(maskBase[7]);
|
||||
|
||||
#define PREPARE_FAT_MASKS(n) \
|
||||
m512 lo_mask = set64x8(0xf); \
|
||||
m512 dup_mask[n * 2]; \
|
||||
PREPARE_FAT_MASKS_##n
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \
|
||||
do { \
|
||||
const u8 *buf_end = a->buf + a->len; \
|
||||
const u8 *ptr = a->buf + a->start_offset; \
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START; \
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect; \
|
||||
u32 last_match = ones_u32; \
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr; \
|
||||
const size_t iterBytes = 64; \
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
|
||||
a->buf, a->len, a->start_offset); \
|
||||
\
|
||||
const m256 *maskBase = getMaskBase_fat(teddy); \
|
||||
PREPARE_FAT_MASKS(n_msk); \
|
||||
const u32 *confBase = getConfBase(teddy); \
|
||||
\
|
||||
const u64a *r_msk_base_lo = getReinforcedMaskBase_fat(teddy, n_msk); \
|
||||
const u64a *r_msk_base_hi = r_msk_base_lo + (N_CHARS + 1); \
|
||||
u32 c_0 = 0x100; \
|
||||
u32 c_16 = 0x100; \
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
|
||||
if (ptr < mainStart) { \
|
||||
ptr = mainStart - 32; \
|
||||
m512 p_mask; \
|
||||
m512 val_0 = vectoredLoad2x256(&p_mask, ptr, a->start_offset, \
|
||||
a->buf, buf_end, \
|
||||
a->buf_history, a->len_history, n_msk); \
|
||||
m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \
|
||||
r_0 = or512(r_0, p_mask); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
|
||||
ptr += 32; \
|
||||
} \
|
||||
\
|
||||
if (ptr + 32 <= buf_end) { \
|
||||
m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
|
||||
ptr += 32; \
|
||||
} \
|
||||
\
|
||||
for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
|
||||
__builtin_prefetch(ptr + (iterBytes * 4)); \
|
||||
CHECK_FLOOD; \
|
||||
m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
|
||||
m512 r_1 = PREP_CONF_FAT_FN(ptr + 32, n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 32, NOT_CAUTIOUS, conf_fn); \
|
||||
} \
|
||||
\
|
||||
if (ptr + 32 <= buf_end) { \
|
||||
m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
|
||||
ptr += 32; \
|
||||
} \
|
||||
\
|
||||
assert(ptr + 32 > buf_end); \
|
||||
if (ptr < buf_end) { \
|
||||
m512 p_mask; \
|
||||
m512 val_0 = vectoredLoad2x256(&p_mask, ptr, 0, ptr, buf_end, \
|
||||
a->buf_history, a->len_history, n_msk); \
|
||||
m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \
|
||||
r_0 = or512(r_0, p_mask); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
|
||||
} \
|
||||
\
|
||||
return HWLM_SUCCESS; \
|
||||
} while(0)
|
||||
|
||||
#else // HAVE_AVX512
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||
do { \
|
||||
if (unlikely(diff256(var, ones256()))) { \
|
||||
m256 swap = swap128in256(var); \
|
||||
m256 r = interleave256lo(var, swap); \
|
||||
u64a part1 = extractlow64from256(r); \
|
||||
u64a part2 = extract64from256(r, 1); \
|
||||
r = interleave256hi(var, swap); \
|
||||
u64a part3 = extractlow64from256(r); \
|
||||
u64a part4 = extract64from256(r, 1); \
|
||||
CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \
|
||||
} \
|
||||
} while(0)
|
||||
#else
|
||||
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
|
||||
do { \
|
||||
if (unlikely(diff256(var, ones256()))) { \
|
||||
m256 swap = swap128in256(var); \
|
||||
m256 r = interleave256lo(var, swap); \
|
||||
u32 part1 = extractlow32from256(r); \
|
||||
u32 part2 = extract32from256(r, 1); \
|
||||
u32 part3 = extract32from256(r, 2); \
|
||||
u32 part4 = extract32from256(r, 3); \
|
||||
r = interleave256hi(var, swap); \
|
||||
u32 part5 = extractlow32from256(r); \
|
||||
u32 part6 = extract32from256(r, 1); \
|
||||
u32 part7 = extract32from256(r, 2); \
|
||||
u32 part8 = extract32from256(r, 3); \
|
||||
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \
|
||||
CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \
|
||||
} \
|
||||
} while(0)
|
||||
#endif
|
||||
|
||||
static really_inline
|
||||
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
const u8 *lo, const u8 *hi,
|
||||
const u8 *buf_history, size_t len_history,
|
||||
const u32 nMasks) {
|
||||
m128 p_mask128;
|
||||
m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
|
||||
buf_history, len_history, nMasks));
|
||||
*p_mask = set2x128(p_mask128);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
|
||||
m256 mask = set32x8(0xf);
|
||||
m256 lo = and256(val, mask);
|
||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
||||
return or256(pshufb_m256(maskBase[0 * 2], lo),
|
||||
pshufb_m256(maskBase[0 * 2 + 1], hi));
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
|
||||
m256 mask = set32x8(0xf);
|
||||
m256 lo = and256(val, mask);
|
||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
||||
m256 r = prep_conf_fat_teddy_m1(maskBase, val);
|
||||
|
||||
m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
|
||||
pshufb_m256(maskBase[1 * 2 + 1], hi));
|
||||
m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
|
||||
*old_1 = res_1;
|
||||
return or256(r, res_shifted_1);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
|
||||
m256 val) {
|
||||
m256 mask = set32x8(0xf);
|
||||
m256 lo = and256(val, mask);
|
||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
||||
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
|
||||
|
||||
m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
|
||||
pshufb_m256(maskBase[2 * 2 + 1], hi));
|
||||
m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
|
||||
*old_2 = res_2;
|
||||
return or256(r, res_shifted_2);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
|
||||
m256 *old_3, m256 val) {
|
||||
m256 mask = set32x8(0xf);
|
||||
m256 lo = and256(val, mask);
|
||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
||||
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
|
||||
|
||||
m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
|
||||
pshufb_m256(maskBase[3 * 2 + 1], hi));
|
||||
m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
|
||||
*old_3 = res_3;
|
||||
return or256(r, res_shifted_3);
|
||||
}
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \
|
||||
do { \
|
||||
} while(0)
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD_2 \
|
||||
m256 res_old_1 = zeroes256();
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD_3 \
|
||||
m256 res_old_1 = zeroes256(); \
|
||||
m256 res_old_2 = zeroes256();
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD_4 \
|
||||
m256 res_old_1 = zeroes256(); \
|
||||
m256 res_old_2 = zeroes256(); \
|
||||
m256 res_old_3 = zeroes256();
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n
|
||||
|
||||
#define PREP_CONF_FAT_FN_1(mask_base, val) \
|
||||
prep_conf_fat_teddy_m1(mask_base, val)
|
||||
|
||||
#define PREP_CONF_FAT_FN_2(mask_base, val) \
|
||||
prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)
|
||||
|
||||
#define PREP_CONF_FAT_FN_3(mask_base, val) \
|
||||
prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
|
||||
|
||||
#define PREP_CONF_FAT_FN_4(mask_base, val) \
|
||||
prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
|
||||
|
||||
#define PREP_CONF_FAT_FN(mask_base, val, n) \
|
||||
PREP_CONF_FAT_FN_##n(mask_base, val)
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \
|
||||
do { \
|
||||
const u8 *buf_end = a->buf + a->len; \
|
||||
const u8 *ptr = a->buf + a->start_offset; \
|
||||
u32 floodBackoff = FLOOD_BACKOFF_START; \
|
||||
const u8 *tryFloodDetect = a->firstFloodDetect; \
|
||||
u32 last_match = ones_u32; \
|
||||
const struct Teddy *teddy = (const struct Teddy *)fdr; \
|
||||
const size_t iterBytes = 32; \
|
||||
DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
|
||||
a->buf, a->len, a->start_offset); \
|
||||
\
|
||||
const m256 *maskBase = getMaskBase_fat(teddy); \
|
||||
const u32 *confBase = getConfBase(teddy); \
|
||||
\
|
||||
FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk); \
|
||||
const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \
|
||||
DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
|
||||
if (ptr < mainStart) { \
|
||||
ptr = mainStart - 16; \
|
||||
m256 p_mask; \
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset, \
|
||||
a->buf, buf_end, \
|
||||
a->buf_history, a->len_history, \
|
||||
n_msk); \
|
||||
m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
|
||||
r_0 = or256(r_0, p_mask); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
|
||||
ptr += 16; \
|
||||
} \
|
||||
\
|
||||
if (ptr + 16 <= buf_end) { \
|
||||
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
|
||||
ptr += 16; \
|
||||
} \
|
||||
\
|
||||
for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
|
||||
__builtin_prefetch(ptr + (iterBytes * 4)); \
|
||||
CHECK_FLOOD; \
|
||||
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
|
||||
m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn); \
|
||||
} \
|
||||
\
|
||||
if (ptr + 16 <= buf_end) { \
|
||||
m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
|
||||
ptr += 16; \
|
||||
} \
|
||||
\
|
||||
assert(ptr + 16 > buf_end); \
|
||||
if (ptr < buf_end) { \
|
||||
m256 p_mask; \
|
||||
m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end, \
|
||||
a->buf_history, a->len_history, \
|
||||
n_msk); \
|
||||
m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
|
||||
r_0 = or256(r_0, p_mask); \
|
||||
CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
|
||||
} \
|
||||
\
|
||||
return HWLM_SUCCESS; \
|
||||
} while(0)
|
||||
|
||||
#endif // HAVE_AVX512
|
||||
|
||||
hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t control) {
|
||||
FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
|
||||
}
|
||||
|
||||
#endif // HAVE_AVX2
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -46,7 +46,6 @@
|
||||
#include "util/alloc.h"
|
||||
#include "util/compare.h"
|
||||
#include "util/container.h"
|
||||
#include "util/make_unique.h"
|
||||
#include "util/noncopyable.h"
|
||||
#include "util/popcount.h"
|
||||
#include "util/small_vector.h"
|
||||
@ -89,7 +88,7 @@ public:
|
||||
const TeddyEngineDescription &eng_in, bool make_small_in,
|
||||
const Grey &grey_in)
|
||||
: eng(eng_in), grey(grey_in), lits(lits_in),
|
||||
bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {}
|
||||
bucketToLits(std::move(bucketToLits_in)), make_small(make_small_in) {}
|
||||
|
||||
bytecode_ptr<FDR> build();
|
||||
};
|
||||
@ -166,7 +165,7 @@ public:
|
||||
nibbleSets[i * 2] = nibbleSets[i * 2 + 1] = 0xffff;
|
||||
}
|
||||
}
|
||||
litIds.push_back(lit_id);
|
||||
litIds.emplace_back(lit_id);
|
||||
sort_and_unique(litIds);
|
||||
}
|
||||
|
||||
@ -329,7 +328,7 @@ bool pack(const vector<hwlmLiteral> &lits,
|
||||
|
||||
static
|
||||
void initReinforcedTable(u8 *rmsk) {
|
||||
u64a *mask = (u64a *)rmsk;
|
||||
u64a *mask = reinterpret_cast<u64a *>(rmsk);
|
||||
fill_n(mask, N_CHARS, 0x00ffffffffffffffULL);
|
||||
}
|
||||
|
||||
@ -353,6 +352,89 @@ void fillReinforcedMsk(u8 *rmsk, u16 c, u32 j, u8 bmsk) {
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void fillDupNibbleMasks(const map<BucketIndex,
|
||||
vector<LiteralIndex>> &bucketToLits,
|
||||
const vector<hwlmLiteral> &lits,
|
||||
u32 numMasks, size_t maskLen,
|
||||
u8 *baseMsk) {
|
||||
u32 maskWidth = 2;
|
||||
memset(baseMsk, 0xff, maskLen);
|
||||
|
||||
for (const auto &b2l : bucketToLits) {
|
||||
const u32 &bucket_id = b2l.first;
|
||||
const vector<LiteralIndex> &ids = b2l.second;
|
||||
const u8 bmsk = 1U << (bucket_id % 8);
|
||||
|
||||
for (const LiteralIndex &lit_id : ids) {
|
||||
const hwlmLiteral &l = lits[lit_id];
|
||||
DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
|
||||
const u32 sz = verify_u32(l.s.size());
|
||||
|
||||
// fill in masks
|
||||
for (u32 j = 0; j < numMasks; j++) {
|
||||
const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8);
|
||||
const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8);
|
||||
const u32 lo_base0 = msk_id_lo * 32;
|
||||
const u32 lo_base1 = msk_id_lo * 32 + 16;
|
||||
const u32 hi_base0 = msk_id_hi * 32;
|
||||
const u32 hi_base1 = msk_id_hi * 32 + 16;
|
||||
|
||||
// if we don't have a char at this position, fill in i
|
||||
// locations in these masks with '1'
|
||||
if (j >= sz) {
|
||||
for (u32 n = 0; n < 16; n++) {
|
||||
baseMsk[lo_base0 + n] &= ~bmsk;
|
||||
baseMsk[lo_base1 + n] &= ~bmsk;
|
||||
baseMsk[hi_base0 + n] &= ~bmsk;
|
||||
baseMsk[hi_base1 + n] &= ~bmsk;
|
||||
}
|
||||
} else {
|
||||
u8 c = l.s[sz - 1 - j];
|
||||
// if we do have a char at this position
|
||||
const u32 hiShift = 4;
|
||||
u32 n_hi = (c >> hiShift) & 0xf;
|
||||
u32 n_lo = c & 0xf;
|
||||
|
||||
if (j < l.msk.size() && l.msk[l.msk.size() - 1 - j]) {
|
||||
u8 m = l.msk[l.msk.size() - 1 - j];
|
||||
u8 m_hi = (m >> hiShift) & 0xf;
|
||||
u8 m_lo = m & 0xf;
|
||||
u8 cmp = l.cmp[l.msk.size() - 1 - j];
|
||||
u8 cmp_lo = cmp & 0xf;
|
||||
u8 cmp_hi = (cmp >> hiShift) & 0xf;
|
||||
|
||||
for (u8 cm = 0; cm < 0x10; cm++) {
|
||||
if ((cm & m_lo) == (cmp_lo & m_lo)) {
|
||||
baseMsk[lo_base0 + cm] &= ~bmsk;
|
||||
baseMsk[lo_base1 + cm] &= ~bmsk;
|
||||
}
|
||||
if ((cm & m_hi) == (cmp_hi & m_hi)) {
|
||||
baseMsk[hi_base0 + cm] &= ~bmsk;
|
||||
baseMsk[hi_base1 + cm] &= ~bmsk;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (l.nocase && ourisalpha(c)) {
|
||||
u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
|
||||
u32 cmHalfSet = (0x20 >> hiShift) & 0xf;
|
||||
baseMsk[hi_base0 + (n_hi & cmHalfClear)] &= ~bmsk;
|
||||
baseMsk[hi_base1 + (n_hi & cmHalfClear)] &= ~bmsk;
|
||||
baseMsk[hi_base0 + (n_hi | cmHalfSet)] &= ~bmsk;
|
||||
baseMsk[hi_base1 + (n_hi | cmHalfSet)] &= ~bmsk;
|
||||
} else {
|
||||
baseMsk[hi_base0 + n_hi] &= ~bmsk;
|
||||
baseMsk[hi_base1 + n_hi] &= ~bmsk;
|
||||
}
|
||||
baseMsk[lo_base0 + n_lo] &= ~bmsk;
|
||||
baseMsk[lo_base1 + n_lo] &= ~bmsk;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void fillNibbleMasks(const map<BucketIndex,
|
||||
vector<LiteralIndex>> &bucketToLits,
|
||||
@ -432,7 +514,7 @@ void fillReinforcedTable(const map<BucketIndex,
|
||||
u8 *rtable_base, const u32 num_tables) {
|
||||
vector<u8 *> tables;
|
||||
for (u32 i = 0; i < num_tables; i++) {
|
||||
tables.push_back(rtable_base + i * RTABLE_SIZE);
|
||||
tables.emplace_back(rtable_base + i * RTABLE_SIZE);
|
||||
}
|
||||
|
||||
for (auto t : tables) {
|
||||
@ -479,20 +561,23 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
||||
|
||||
size_t headerSize = sizeof(Teddy);
|
||||
size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
|
||||
size_t reinforcedMaskLen = RTABLE_SIZE * maskWidth;
|
||||
size_t reinforcedDupMaskLen = RTABLE_SIZE * maskWidth;
|
||||
if (maskWidth == 2) { // dup nibble mask table in Fat Teddy
|
||||
reinforcedDupMaskLen = maskLen * 2;
|
||||
}
|
||||
|
||||
auto floodTable = setupFDRFloodControl(lits, eng, grey);
|
||||
auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small);
|
||||
|
||||
// Note: we place each major structure here on a cacheline boundary.
|
||||
size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
|
||||
ROUNDUP_CL(reinforcedMaskLen) +
|
||||
ROUNDUP_CL(reinforcedDupMaskLen) +
|
||||
ROUNDUP_CL(confirmTable.size()) + floodTable.size();
|
||||
|
||||
auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
|
||||
assert(fdr); // otherwise would have thrown std::bad_alloc
|
||||
Teddy *teddy = (Teddy *)fdr.get(); // ugly
|
||||
u8 *teddy_base = (u8 *)teddy;
|
||||
Teddy *teddy = reinterpret_cast<Teddy *>(fdr.get()); // ugly
|
||||
u8 *teddy_base = reinterpret_cast<u8 *>(teddy);
|
||||
|
||||
// Write header.
|
||||
teddy->size = size;
|
||||
@ -502,7 +587,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
||||
|
||||
// Write confirm structures.
|
||||
u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
|
||||
ROUNDUP_CL(reinforcedMaskLen);
|
||||
ROUNDUP_CL(reinforcedDupMaskLen);
|
||||
assert(ISALIGNED_CL(ptr));
|
||||
teddy->confOffset = verify_u32(ptr - teddy_base);
|
||||
memcpy(ptr, confirmTable.get(), confirmTable.size());
|
||||
@ -512,16 +597,23 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
||||
assert(ISALIGNED_CL(ptr));
|
||||
teddy->floodOffset = verify_u32(ptr - teddy_base);
|
||||
memcpy(ptr, floodTable.get(), floodTable.size());
|
||||
ptr += floodTable.size();
|
||||
|
||||
|
||||
// Write teddy masks.
|
||||
u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
|
||||
fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen,
|
||||
baseMsk);
|
||||
|
||||
// Write reinforcement masks.
|
||||
u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen);
|
||||
fillReinforcedTable(bucketToLits, lits, reinforcedMsk, maskWidth);
|
||||
if (maskWidth == 1) { // reinforcement table in Teddy
|
||||
// Write reinforcement masks.
|
||||
u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen);
|
||||
fillReinforcedTable(bucketToLits, lits, reinforcedMsk, maskWidth);
|
||||
} else { // dup nibble mask table in Fat Teddy
|
||||
assert(maskWidth == 2);
|
||||
u8 *dupMsk = baseMsk + ROUNDUP_CL(maskLen);
|
||||
fillDupNibbleMasks(bucketToLits, lits, eng.numMasks,
|
||||
reinforcedDupMaskLen, dupMsk);
|
||||
}
|
||||
|
||||
return fdr;
|
||||
}
|
||||
@ -530,7 +622,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
|
||||
static
|
||||
bool assignStringsToBuckets(
|
||||
const vector<hwlmLiteral> &lits,
|
||||
TeddyEngineDescription &eng,
|
||||
const TeddyEngineDescription &eng,
|
||||
map<BucketIndex, vector<LiteralIndex>> &bucketToLits) {
|
||||
assert(eng.numMasks <= MAX_NUM_MASKS);
|
||||
if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
|
||||
@ -584,7 +676,7 @@ unique_ptr<HWLMProto> teddyBuildProtoHinted(
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return ue2::make_unique<HWLMProto>(engType, move(des), lits,
|
||||
return std::make_unique<HWLMProto>(engType, std::move(des), lits,
|
||||
bucketToLits, make_small);
|
||||
}
|
||||
|
||||
|
@ -34,7 +34,6 @@
|
||||
#include "fdr_engine_description.h"
|
||||
#include "teddy_internal.h"
|
||||
#include "teddy_engine_description.h"
|
||||
#include "util/make_unique.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
@ -53,14 +52,14 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
|
||||
|
||||
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
|
||||
static const TeddyEngineDef defns[] = {
|
||||
{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },
|
||||
{ 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true },
|
||||
{ 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false },
|
||||
{ 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true },
|
||||
{ 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false },
|
||||
{ 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true },
|
||||
{ 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false },
|
||||
{ 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true },
|
||||
{ 3, HS_CPU_FEATURES_AVX2, 1, 16, false },
|
||||
{ 4, HS_CPU_FEATURES_AVX2, 1, 16, true },
|
||||
{ 5, HS_CPU_FEATURES_AVX2, 2, 16, false },
|
||||
{ 6, HS_CPU_FEATURES_AVX2, 2, 16, true },
|
||||
{ 7, HS_CPU_FEATURES_AVX2, 3, 16, false },
|
||||
{ 8, HS_CPU_FEATURES_AVX2, 3, 16, true },
|
||||
{ 9, HS_CPU_FEATURES_AVX2, 4, 16, false },
|
||||
{ 10, HS_CPU_FEATURES_AVX2, 4, 16, true },
|
||||
{ 11, 0, 1, 8, false },
|
||||
{ 12, 0, 1, 8, true },
|
||||
{ 13, 0, 2, 8, false },
|
||||
@ -72,6 +71,7 @@ void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
|
||||
};
|
||||
out->clear();
|
||||
for (const auto &def : defns) {
|
||||
// cppcheck-suppress useStlAlgorithm
|
||||
out->emplace_back(def);
|
||||
}
|
||||
}
|
||||
@ -124,6 +124,7 @@ bool isAllowed(const vector<hwlmLiteral> &vl, const TeddyEngineDescription &eng,
|
||||
u32 n_small_lits = 0;
|
||||
for (const auto &lit : vl) {
|
||||
if (lit.s.length() < eng.numMasks) {
|
||||
// cppcheck-suppress useStlAlgorithm
|
||||
n_small_lits++;
|
||||
}
|
||||
}
|
||||
@ -197,7 +198,7 @@ chooseTeddyEngine(const target_t &target, const vector<hwlmLiteral> &vl) {
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("using engine %u\n", best->getID());
|
||||
return ue2::make_unique<TeddyEngineDescription>(*best);
|
||||
return std::make_unique<TeddyEngineDescription>(*best);
|
||||
}
|
||||
|
||||
unique_ptr<TeddyEngineDescription> getTeddyDescription(u32 engineID) {
|
||||
@ -205,8 +206,9 @@ unique_ptr<TeddyEngineDescription> getTeddyDescription(u32 engineID) {
|
||||
getTeddyDescriptions(&descs);
|
||||
|
||||
for (const auto &desc : descs) {
|
||||
// cppcheck-suppress useStlAlgorithm
|
||||
if (desc.getID() == engineID) {
|
||||
return ue2::make_unique<TeddyEngineDescription>(desc);
|
||||
return std::make_unique<TeddyEngineDescription>(desc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -39,7 +39,7 @@ namespace ue2 {
|
||||
|
||||
#define TEDDY_BUCKET_LOAD 6
|
||||
|
||||
struct TeddyEngineDef {
|
||||
struct TeddyEngineDef { //NOLINT (clang-analyzer-optin.performance.Padding)
|
||||
u32 id;
|
||||
u64a cpu_features;
|
||||
u32 numMasks;
|
||||
|
570
src/fdr/teddy_fat.cpp
Normal file
570
src/fdr/teddy_fat.cpp
Normal file
@ -0,0 +1,570 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
* Copyright (c) 2024, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* fat teddy for AVX2 and AVX512VBMI */
|
||||
|
||||
#include "fdr_internal.h"
|
||||
#include "flood_runtime.h"
|
||||
#include "teddy.h"
|
||||
#include "teddy_internal.h"
|
||||
#include "teddy_runtime_common.h"
|
||||
#include "util/arch.h"
|
||||
#include "util/simd_utils.h"
|
||||
|
||||
#if defined(HAVE_AVX2)
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
static really_inline
|
||||
hwlm_error_t conf_chunk_64(u64a chunk, u8 bucket, u8 offset,
|
||||
CautionReason reason, const u8 *pt,
|
||||
const u32* confBase,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t *control,
|
||||
u32 *last_match) {
|
||||
if (unlikely(chunk != ones_u64a)) {
|
||||
chunk = ~chunk;
|
||||
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
|
||||
control, last_match);
|
||||
// adapted from CHECK_HWLM_TERMINATE_MATCHING
|
||||
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
|
||||
if(conf_chunk_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||
#else
|
||||
static really_inline
|
||||
hwlm_error_t conf_chunk_32(u32 chunk, u8 bucket, u8 offset,
|
||||
CautionReason reason, const u8 *pt,
|
||||
const u32* confBase,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
hwlm_group_t *control,
|
||||
u32 *last_match) {
|
||||
if (unlikely(chunk != ones_u32)) {
|
||||
chunk = ~chunk;
|
||||
do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
|
||||
control, last_match);
|
||||
// adapted from CHECK_HWLM_TERMINATE_MATCHING
|
||||
if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
|
||||
if(conf_chunk_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
|
||||
|
||||
// fat 512 teddy is only with vbmi
|
||||
|
||||
static really_inline
|
||||
const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
|
||||
return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
|
||||
+ ROUNDUP_CL(2 * numMask * sizeof(m256)));
|
||||
}
|
||||
|
||||
|
||||
const u8 ALIGN_CL_DIRECTIVE p_mask_interleave[64] = {
|
||||
0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
|
||||
8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
|
||||
16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
|
||||
24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
|
||||
};
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
hwlm_error_t confirm_fat_teddy_64_512(m512 var, u8 bucket, u8 offset,
|
||||
CautionReason reason, const u8 *ptr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
const u32* confBase, hwlm_group_t *control,
|
||||
u32 *last_match) {
|
||||
if (unlikely(diff512(var, ones512()))) {
|
||||
m512 msk_interleave = load512(p_mask_interleave);
|
||||
m512 r = vpermb512(msk_interleave, var);
|
||||
m128 r0 = extract128from512(r, 0);
|
||||
m128 r1 = extract128from512(r, 1);
|
||||
m128 r2 = extract128from512(r, 2);
|
||||
m128 r3 = extract128from512(r, 3);
|
||||
u64a part1 = movq(r0);
|
||||
u64a part2 = extract64from128(r0, 1);
|
||||
u64a part3 = movq(r1);
|
||||
u64a part4 = extract64from128(r1, 1);
|
||||
u64a part5 = movq(r2);
|
||||
u64a part6 = extract64from128(r2, 1);
|
||||
u64a part7 = movq(r3);
|
||||
u64a part8 = extract64from128(r3, 1);
|
||||
CONF_FAT_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
#define confirm_fat_teddy_512_f confirm_fat_teddy_64_512
|
||||
#else // 32-64
|
||||
|
||||
hwlm_error_t confirm_fat_teddy_32_512(m512 var, u8 bucket, u8 offset,
|
||||
CautionReason reason, const u8 *ptr,
|
||||
const struct FDR_Runtime_Args *a,
|
||||
const u32* confBase, hwlm_group_t *control,
|
||||
u32 *last_match) {
|
||||
if (unlikely(diff512(var, ones512()))) {
|
||||
m512 msk_interleave = load512(p_mask_interleave);
|
||||
m512 r = vpermb512(msk_interleave, var);
|
||||
m128 r0 = extract128from512(r, 0);
|
||||
m128 r1 = extract128from512(r, 1);
|
||||
m128 r2 = extract128from512(r, 2);
|
||||
m128 r3 = extract128from512(r, 3);
|
||||
u32 part1 = movd(r0);
|
||||
u32 part2 = extract32from128(r0, 1);
|
||||
u32 part3 = extract32from128(r0, 2);
|
||||
u32 part4 = extract32from128(r0, 3);
|
||||
u32 part5 = movd(r1);
|
||||
u32 part6 = extract32from128(r1, 1);
|
||||
u32 part7 = extract32from128(r1, 2);
|
||||
u32 part8 = extract32from128(r1, 3);
|
||||
u32 part9 = movd(r2);
|
||||
u32 part10 = extract32from128(r2, 1);
|
||||
u32 part11 = extract32from128(r2, 2);
|
||||
u32 part12 = extract32from128(r2, 3);
|
||||
u32 part13 = movd(r3);
|
||||
u32 part14 = extract32from128(r3, 1);
|
||||
u32 part15 = extract32from128(r3, 2);
|
||||
u32 part16 = extract32from128(r3, 3);
|
||||
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
|
||||
CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, ptr, confBase, a, control, last_match);
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
#define confirm_fat_teddy_512_f confirm_fat_teddy_32_512
|
||||
#endif // 32/64
|
||||
|
||||
#define CONFIRM_FAT_TEDDY_512(...) if(confirm_fat_teddy_512_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||
|
||||
#define TEDDY_VBMI_SL1_MASK 0xfffffffffffffffeULL
|
||||
#define TEDDY_VBMI_SL2_MASK 0xfffffffffffffffcULL
|
||||
#define TEDDY_VBMI_SL3_MASK 0xfffffffffffffff8ULL
|
||||
|
||||
#define FAT_TEDDY_VBMI_SL1_MASK 0xfffffffefffffffeULL
|
||||
#define FAT_TEDDY_VBMI_SL2_MASK 0xfffffffcfffffffcULL
|
||||
#define FAT_TEDDY_VBMI_SL3_MASK 0xfffffff8fffffff8ULL
|
||||
|
||||
#define FAT_TEDDY_VBMI_SL1_POS 15
|
||||
#define FAT_TEDDY_VBMI_SL2_POS 14
|
||||
#define FAT_TEDDY_VBMI_SL3_POS 13
|
||||
|
||||
#define FAT_TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffULL >> n_sh)
|
||||
#define FAT_TEDDY_VBMI_CONF_MASK_FULL ((0xffffffffULL << n_sh) & 0xffffffffULL)
|
||||
#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap)
|
||||
#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffULL >> (32 - n_sh))
|
||||
|
||||
template<int NMSK>
|
||||
static really_inline
|
||||
m512 prep_conf_fat_teddy_512vbmi_templ(const m512 *lo_mask, const m512 *dup_mask,
|
||||
const m512 *sl_msk, const m512 val) {
|
||||
m512 lo = and512(val, *lo_mask);
|
||||
m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
|
||||
m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),
|
||||
pshufb_m512(dup_mask[1], hi));
|
||||
|
||||
if constexpr (NMSK == 1) return shuf_or_b0;
|
||||
m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),
|
||||
pshufb_m512(dup_mask[3], hi));
|
||||
m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
|
||||
if constexpr (NMSK == 2) return (or512(sl1, shuf_or_b0));
|
||||
m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),
|
||||
pshufb_m512(dup_mask[5], hi));
|
||||
m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
|
||||
if constexpr (NMSK == 3) return (or512(sl2, or512(sl1, shuf_or_b0)));
|
||||
m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),
|
||||
pshufb_m512(dup_mask[7], hi));
|
||||
m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
|
||||
return (or512(sl3, or512(sl2, or512(sl1, shuf_or_b0))));
|
||||
}
|
||||
|
||||
|
||||
#define TEDDY_VBMI_SL1_POS 15
|
||||
#define TEDDY_VBMI_SL2_POS 14
|
||||
#define TEDDY_VBMI_SL3_POS 13
|
||||
|
||||
#define TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffffffffffULL >> n_sh)
|
||||
#define TEDDY_VBMI_CONF_MASK_FULL (0xffffffffffffffffULL << n_sh)
|
||||
#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap)
|
||||
#define TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffffffffffULL >> (64 - n_sh))
|
||||
|
||||
/*
 * Fat teddy scan loop, AVX512VBMI variant, specialised on the number of
 * masks NMSK (1..4).  The buffer is consumed in chunks of
 * loopBytes = 32 - (NMSK - 1): consecutive iterations deliberately re-read
 * n_sh = NMSK - 1 bytes of overlap so that multi-mask matches straddling a
 * chunk boundary are still seen.  Each 32-byte load is duplicated into both
 * 256-bit lanes with set2x256 before being fed to the prep/confirm stage.
 *
 * Returns HWLM_SUCCESS, or terminates early via the CONFIRM_FAT_TEDDY_512 /
 * CHECK_FLOOD macros (which reference several of the locals below).
 */
template<int NMSK>
hwlm_error_t fdr_exec_fat_teddy_512vbmi_templ(const struct FDR *fdr,
                                              const struct FDR_Runtime_Args *a,
                                              hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;         // consumed by CHECK_FLOOD
    const u8 *tryFloodDetect = a->firstFloodDetect; // consumed by CHECK_FLOOD
    u32 last_match = ones_u32;          // consumed by CONFIRM_FAT_TEDDY_512
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 32;        // NOTE(review): not used directly below;
                                        // presumably referenced by a macro — confirm
    u32 n_sh = NMSK - 1;                // inter-iteration overlap in bytes
    const size_t loopBytes = 32 - n_sh;
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m512 *dup_mask = getDupMaskBase(teddy, NMSK);
    m512 lo_mask = set1_64x8(0xf);      // nibble-extraction mask
    m512 sl_msk[NMSK - 1];              // realignment permutations for masks 1..NMSK-1
    if constexpr (NMSK > 1){
        sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);
    }
    if constexpr (NMSK > 2){
        sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);
    }
    if constexpr (NMSK > 3){
        sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);
    }

    const u32 *confBase = getConfBase(teddy);   // consumed by CONFIRM_FAT_TEDDY_512

    /* steady-state suppression mask, replicated into both 256-bit lanes;
     * set bits in p_mask squash candidate bits when OR'd into the result
     * (presumably covering the n_sh bytes re-read from the previous
     * iteration — confirm against FAT_TEDDY_VBMI_CONF_MASK_FULL) */
    u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL;
    m512 p_mask = set_mask_m512(~((k << 32) | k));
    u32 overlap = 0;    // referenced by the FAT_TEDDY_VBMI_CONF_MASK_VAR macro below
    u64a patch = 0;     // extra load-mask bits keeping the overlap region readable
    /* head chunk: masked load from the true buffer start */
    if (likely(ptr + loopBytes <= buf_end)) {
        u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD;
        m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0));
        m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, set2x256(loadu_maskz_m256(k0, ptr)));

        r_0 = or512(r_0, p_mask0);
        CONFIRM_FAT_TEDDY_512(r_0, 16, 0, VECTORING, ptr);
        ptr += loopBytes;
        overlap = n_sh;
        patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH;
    }

    /* main loop: full chunks, re-reading n_sh overlap bytes each time */
    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {
        CHECK_FLOOD;
        m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, set2x256(loadu256(ptr - n_sh)));
        r_0 = or512(r_0, p_mask);
        CONFIRM_FAT_TEDDY_512(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh);
    }

    /* tail: fewer than loopBytes remain; masked load of the remainder plus
     * the overlap bytes */
    assert(ptr + loopBytes > buf_end);
    if (ptr < buf_end) {
        u32 left = (u32)(buf_end - ptr);
        u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left);
        m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1));
        m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap));
        m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, val_0);

        r_0 = or512(r_0, p_mask1);
        CONFIRM_FAT_TEDDY_512(r_0, 16, 0, VECTORING, ptr - overlap);
    }

    return HWLM_SUCCESS;
}
|
||||
|
||||
#define FDR_EXEC_FAT_TEDDY_FN fdr_exec_fat_teddy_512vbmi_templ
|
||||
|
||||
|
||||
#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
|
||||
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
extern "C" {
|
||||
/*
 * Confirm stage for the AVX2 fat teddy engine, 64-bit chunk flavour.
 * A cleared bit anywhere in 'var' marks a candidate match; the vector is
 * decomposed into four 64-bit chunks (lane-interleaved so each chunk pairs
 * the two 128-bit halves) and each chunk is handed to CONF_FAT_CHUNK_64,
 * which may terminate the scan via *control / *last_match.
 */
hwlm_error_t confirm_fat_teddy_64_256(m256 var, u8 bucket, u8 offset,
                                      CautionReason reason, const u8 *ptr,
                                      const struct FDR_Runtime_Args *a,
                                      const u32* confBase, hwlm_group_t *control,
                                      u32 *last_match) {
    /* all-ones means no candidate anywhere: skip the expensive work */
    if (unlikely(diff256(var, ones256()))) {
        m256 lanes_swapped = swap128in256(var);
        m256 interleaved = interleave256lo(var, lanes_swapped);
        u64a chunk0 = extractlow64from256(interleaved);
        u64a chunk1 = extract64from256(interleaved, 1);
        interleaved = interleave256hi(var, lanes_swapped);
        u64a chunk2 = extractlow64from256(interleaved);
        u64a chunk3 = extract64from256(interleaved, 1);
        CONF_FAT_CHUNK_64(chunk0, bucket, offset, reason, ptr, confBase, a, control, last_match);
        CONF_FAT_CHUNK_64(chunk1, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
        CONF_FAT_CHUNK_64(chunk2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
        CONF_FAT_CHUNK_64(chunk3, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
    }
    return HWLM_SUCCESS;
}
|
||||
} // extern C
|
||||
|
||||
#define confirm_fat_teddy_256_f confirm_fat_teddy_64_256
|
||||
|
||||
#else
|
||||
extern "C" {
|
||||
/*
 * Confirm stage for the AVX2 fat teddy engine, 32-bit chunk flavour
 * (non-64-bit builds).  A cleared bit anywhere in 'var' marks a candidate
 * match; the vector is decomposed into eight 32-bit chunks (lane-interleaved
 * so each chunk pairs the two 128-bit halves) and each chunk is handed to
 * CONF_FAT_CHUNK_32, which may terminate the scan via *control / *last_match.
 */
hwlm_error_t confirm_fat_teddy_32_256(m256 var, u8 bucket, u8 offset,
                                      CautionReason reason, const u8 *ptr,
                                      const struct FDR_Runtime_Args *a,
                                      const u32* confBase, hwlm_group_t *control,
                                      u32 *last_match) {
    /* all-ones means no candidate anywhere: skip the expensive work */
    if (unlikely(diff256(var, ones256()))) {
        m256 lanes_swapped = swap128in256(var);
        /* low halves of both lanes -> chunks for offsets 0..7 */
        m256 interleaved = interleave256lo(var, lanes_swapped);
        u32 chunk0 = extractlow32from256(interleaved);
        u32 chunk1 = extract32from256(interleaved, 1);
        u32 chunk2 = extract32from256(interleaved, 2);
        u32 chunk3 = extract32from256(interleaved, 3);
        /* high halves of both lanes -> chunks for offsets 8..15 */
        interleaved = interleave256hi(var, lanes_swapped);
        u32 chunk4 = extractlow32from256(interleaved);
        u32 chunk5 = extract32from256(interleaved, 1);
        u32 chunk6 = extract32from256(interleaved, 2);
        u32 chunk7 = extract32from256(interleaved, 3);
        CONF_FAT_CHUNK_32(chunk0, bucket, offset, reason, ptr, confBase, a, control, last_match);
        CONF_FAT_CHUNK_32(chunk1, bucket, offset + 2, reason, ptr, confBase, a, control, last_match);
        CONF_FAT_CHUNK_32(chunk2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
        CONF_FAT_CHUNK_32(chunk3, bucket, offset + 6, reason, ptr, confBase, a, control, last_match);
        CONF_FAT_CHUNK_32(chunk4, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
        CONF_FAT_CHUNK_32(chunk5, bucket, offset + 10, reason, ptr, confBase, a, control, last_match);
        CONF_FAT_CHUNK_32(chunk6, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
        CONF_FAT_CHUNK_32(chunk7, bucket, offset + 14, reason, ptr, confBase, a, control, last_match);
    }
    return HWLM_SUCCESS;
}
|
||||
|
||||
} // extern C
|
||||
|
||||
#define confirm_fat_teddy_256_f confirm_fat_teddy_32_256
|
||||
|
||||
#endif
|
||||
|
||||
#define CONFIRM_FAT_TEDDY_256(...) if(confirm_fat_teddy_256_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
|
||||
|
||||
/*
 * Returns the base of the fat teddy shuffle-mask tables, which are laid
 * out immediately after the (cacheline-rounded) Teddy header.
 *
 * Fix: use reinterpret_cast instead of a C-style cast, matching the other
 * accessors in this header (getMaskBase/getConfBase use reinterpret_cast
 * under __cplusplus); this region is C++-only (templates), so no C
 * fallback branch is needed.
 */
static really_inline
const m256 *getMaskBase_fat(const struct Teddy *teddy) {
    return reinterpret_cast<const m256 *>(reinterpret_cast<const u8 *>(teddy) + ROUNDUP_CL(sizeof(struct Teddy)));
}
|
||||
|
||||
|
||||
/*
 * Safe (bounds/history-aware) 16-byte load for the fat teddy engine.
 * Delegates to vectoredLoad128 and duplicates both the loaded data and the
 * resulting validity mask into the two 128-bit lanes of an m256, since fat
 * teddy processes the same 16 input bytes in both lanes.
 */
static really_inline
m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                       const u8 *lo, const u8 *hi,
                       const u8 *buf_history, size_t len_history,
                       const u32 nMasks) {
    m128 mask128;
    m128 data = vectoredLoad128(&mask128, ptr, start_offset, lo, hi,
                                buf_history, len_history, nMasks);
    *p_mask = set1_2x128(mask128);
    return set1_2x128(data);
}
|
||||
|
||||
template<int NMSK>
|
||||
static really_inline
|
||||
m256 prep_conf_fat_teddy_256_templ(const m256 *maskBase, m256 val,
|
||||
m256* old_1, m256* old_2, m256* old_3){
|
||||
m256 mask = set1_32x8(0xf);
|
||||
m256 lo = and256(val, mask);
|
||||
m256 hi = and256(rshift64_m256(val, 4), mask);
|
||||
m256 r = or256(pshufb_m256(maskBase[0 * 2], lo),
|
||||
pshufb_m256(maskBase[0 * 2 + 1], hi));
|
||||
if constexpr (NMSK == 1) return r;
|
||||
m256 res_1 = or256(pshufb_m256(maskBase[(NMSK-1) * 2], lo),
|
||||
pshufb_m256(maskBase[(NMSK-1) * 2 + 1], hi));
|
||||
m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - (NMSK-1));
|
||||
*old_1 = res_1;
|
||||
r = or256(r, res_shifted_1);
|
||||
if constexpr (NMSK == 2) return r;
|
||||
m256 res_2 = or256(pshufb_m256(maskBase[(NMSK-1) * 2], lo),
|
||||
pshufb_m256(maskBase[(NMSK-1) * 2 + 1], hi));
|
||||
m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - (NMSK-1));
|
||||
*old_2 = res_2;
|
||||
r = or256(r, res_shifted_2);
|
||||
if constexpr (NMSK == 3) return r;
|
||||
m256 res_3 = or256(pshufb_m256(maskBase[(NMSK-1) * 2], lo),
|
||||
pshufb_m256(maskBase[(NMSK-1) * 2 + 1], hi));
|
||||
m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - (NMSK-1));
|
||||
*old_3 = res_3;
|
||||
return or256(r, res_shifted_3);
|
||||
}
|
||||
|
||||
/*
 * Fat teddy scan loop, AVX2 variant, specialised on the number of masks
 * NMSK (1..4).  The buffer is processed in 16-byte steps (each 16-byte
 * load is duplicated into both 128-bit lanes via load2x128 /
 * vectoredLoad2x128); the main loop unrolls two steps per iteration.
 * res_old_1..3 carry each extra mask's previous-step result so that
 * multi-byte matches spanning a step boundary are still detected.
 *
 * Returns HWLM_SUCCESS, or terminates early via the CONFIRM_FAT_TEDDY_256 /
 * CHECK_FLOOD macros (which reference several of the locals below).
 */
template<int NMSK>
hwlm_error_t fdr_exec_fat_teddy_256_templ(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    const u8 *buf_end = a->buf + a->len;
    const u8 *ptr = a->buf + a->start_offset;
    u32 floodBackoff = FLOOD_BACKOFF_START;         // consumed by CHECK_FLOOD
    const u8 *tryFloodDetect = a->firstFloodDetect; // consumed by CHECK_FLOOD
    u32 last_match = ones_u32;          // consumed by CONFIRM_FAT_TEDDY_256
    const struct Teddy *teddy = (const struct Teddy *)fdr;
    const size_t iterBytes = 32;        // two 16-byte steps per main-loop iteration
    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
                 a->buf, a->len, a->start_offset);

    const m256 *maskBase = getMaskBase_fat(teddy);
    const u32 *confBase = getConfBase(teddy);   // consumed by CONFIRM_FAT_TEDDY_256

    /* per-mask carry between consecutive 16-byte steps */
    m256 res_old_1 = zeroes256();
    m256 res_old_2 = zeroes256();
    m256 res_old_3 = zeroes256();
    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
    /* unaligned head: step back to the 16-byte boundary below mainStart and
     * do a cautious, bounds/history-aware load */
    if (ptr < mainStart) {
        ptr = mainStart - 16;
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset,
                                       a->buf, buf_end,
                                       a->buf_history, a->len_history,
                                       NMSK);
        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, val_0, &res_old_1, &res_old_2, &res_old_3);
        r_0 = or256(r_0, p_mask);   // suppress candidates in out-of-range bytes
        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
        ptr += 16;
    }

    /* one aligned warm-up step before the unrolled main loop */
    if (ptr + 16 <= buf_end) {
        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
        ptr += 16;
    }

    /* main loop: two 16-byte steps per iteration */
    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
        __builtin_prefetch(ptr + (iterBytes * 4));
        CHECK_FLOOD;
        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, NOT_CAUTIOUS, ptr);
        m256 r_1 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr + 16), &res_old_1, &res_old_2, &res_old_3);
        CONFIRM_FAT_TEDDY_256(r_1, 16, 16, NOT_CAUTIOUS, ptr);
    }

    /* possible single remaining full 16-byte step */
    if (ptr + 16 <= buf_end) {
        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, NOT_CAUTIOUS, ptr);
        ptr += 16;
    }

    /* tail: fewer than 16 bytes remain; cautious, masked load */
    assert(ptr + 16 > buf_end);
    if (ptr < buf_end) {
        m256 p_mask;
        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end,
                                       a->buf_history, a->len_history,
                                       NMSK);
        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, val_0, &res_old_1, &res_old_2, &res_old_3);
        r_0 = or256(r_0, p_mask);   // suppress candidates in out-of-range bytes
        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
    }
    return HWLM_SUCCESS;
}
|
||||
|
||||
// this check is because it is possible to build with both AVX512VBMI and AVX2 defined,
|
||||
// to replicate the behaviour of the original flow of control we give preference
|
||||
// to the former. If we're building for both then this will be compiled multiple times
|
||||
// with the desired variant defined by itself.
|
||||
#ifndef FDR_EXEC_FAT_TEDDY_FN
|
||||
#define FDR_EXEC_FAT_TEDDY_FN fdr_exec_fat_teddy_256_templ
|
||||
#endif
|
||||
|
||||
#endif // HAVE_AVX2 for fat teddy
|
||||
|
||||
/* we only have fat teddy in these two modes */
|
||||
// #if (defined(HAVE_AVX2) || defined(HAVE_AVX512VBMI)) && defined(FDR_EXEC_FAT_TEDDY_FN)
|
||||
// #if defined(FDR_EXEC_FAT_TEDDY_FN)
|
||||
|
||||
/*
 * C-linkage entry points for the fat teddy scanners, one pair per mask
 * count (1..4).  Each simply instantiates the templated scan loop selected
 * above (FDR_EXEC_FAT_TEDDY_FN: the AVX512VBMI loop when available,
 * otherwise the AVX2 loop) with a compile-time mask count.
 *
 * NOTE(review): the "_pck" variants currently dispatch to exactly the same
 * instantiation as their plain counterparts — confirm this is intentional
 * for these engines.
 */
extern "C" {
hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_FAT_TEDDY_FN<1>(fdr, a, control);
}

hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    return FDR_EXEC_FAT_TEDDY_FN<1>(fdr, a, control);
}

hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_FAT_TEDDY_FN<2>(fdr, a, control);
}

hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    return FDR_EXEC_FAT_TEDDY_FN<2>(fdr, a, control);
}

hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_FAT_TEDDY_FN<3>(fdr, a, control);
}

hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    return FDR_EXEC_FAT_TEDDY_FN<3>(fdr, a, control);
}

hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
                                      const struct FDR_Runtime_Args *a,
                                      hwlm_group_t control) {
    return FDR_EXEC_FAT_TEDDY_FN<4>(fdr, a, control);
}

hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
                                          const struct FDR_Runtime_Args *a,
                                          hwlm_group_t control) {
    return FDR_EXEC_FAT_TEDDY_FN<4>(fdr, a, control);
}

} // extern c
|
||||
|
||||
#endif // HAVE_AVX2 from the beginning
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2016-2020, Intel Corporation
|
||||
* Copyright (c) 2024, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -40,9 +41,15 @@
|
||||
#include "util/simd_utils.h"
|
||||
#include "util/uniform_ops.h"
|
||||
|
||||
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
|
||||
#if defined(HAVE_AVX2)
|
||||
extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
|
||||
|
||||
#if defined(HAVE_AVX512VBMI)
|
||||
static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
|
||||
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
|
||||
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
|
||||
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f
|
||||
};
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_64_BIT
|
||||
@ -132,6 +139,37 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
|
||||
// |----------|-------|----------------|............|
|
||||
// 0 start start+offset end(<=16)
|
||||
// p_mask ffff.....ffffff..ff0000...........00ffff..........
|
||||
|
||||
// replace the p_mask_arr table.
|
||||
// m is the length of the zone of bytes==0 , n is
|
||||
// the offset where that zone begins. more specifically, there are
|
||||
// 16-n bytes of 1's before the zone begins.
|
||||
// m,n 4,7 - 4 bytes of 0s, and 16-7 bytes of 1's before that.
|
||||
// 00 00 00 00 ff..ff
|
||||
// ff ff ff ff ff ff ff ff 00 00 00 00 ff..ff
|
||||
// m,n 15,15 - 15 bytes of 0s , f's high, but also with 16-15=1 byte of 1s
|
||||
// in the beginning - which push the ff at the end off the high end , leaving
|
||||
// ff 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
|
||||
// m,n 15,16 - 15 bytes of 0s, ff high , with 16-16 = 0 ones on the low end
|
||||
// before that, so,
|
||||
// 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ff
|
||||
// so to get the one part, with the f's high, we start out with 1's and
|
||||
// shift them up (right) by m+n.
|
||||
// now to fill in any ones that belong on the low end we have to take
|
||||
// some 1's and shift them down. the ones zone there needs to be 16-n long,
|
||||
// meaning shifted down by 16-(16-n) , or of course just n.
|
||||
// then we should be able to or these together.
|
||||
/*
 * Build a 128-bit validity byte-mask on the fly (replacing the old
 * p_mask_arr lookup table): the result contains a run of m zero bytes
 * beginning at byte 16 - n (so 16 - n bytes of 0xff below the zero zone),
 * with 0xff everywhere else.  Arguments are reduced mod 17 so shift
 * amounts stay in range.
 */
static really_inline
m128 p_mask_gen(u8 m, u8 n){
    m %= 17;
    n %= 17;
    /* first all-ones byte position above the zero zone */
    m = (m + (16 - n)) % 17;
    /* ones below the zone: drop the top n bytes */
    m128 low_ones = rshiftbyte_m128(ones128(), n);
    /* ones above the zone: drop the bottom m bytes */
    m128 high_ones = lshiftbyte_m128(ones128(), m);
    return or128(low_ones, high_ones);
}
|
||||
|
||||
static really_inline
|
||||
m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
const u8 *lo, const u8 *hi,
|
||||
@ -151,13 +189,11 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
uintptr_t avail = (uintptr_t)(hi - ptr);
|
||||
if (avail >= 16) {
|
||||
assert(start_offset - start <= 16);
|
||||
*p_mask = loadu128(p_mask_arr[16 - start_offset + start]
|
||||
+ 16 - start_offset + start);
|
||||
*p_mask = p_mask_gen(16 - start_offset + start, 16 - start_offset + start);
|
||||
return loadu128(ptr);
|
||||
}
|
||||
assert(start_offset - start <= avail);
|
||||
*p_mask = loadu128(p_mask_arr[avail - start_offset + start]
|
||||
+ 16 - start_offset + start);
|
||||
*p_mask = p_mask_gen(avail - start_offset + start, 16 - start_offset + start);
|
||||
copy_start = 0;
|
||||
copy_len = avail;
|
||||
} else { // start zone
|
||||
@ -170,8 +206,7 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
}
|
||||
uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
|
||||
assert(start + start_offset <= end);
|
||||
*p_mask = loadu128(p_mask_arr[end - start - start_offset]
|
||||
+ 16 - start - start_offset);
|
||||
*p_mask = p_mask_gen(end - start - start_offset, 16 - start - start_offset);
|
||||
copy_start = start;
|
||||
copy_len = end - start;
|
||||
}
|
||||
@ -260,6 +295,20 @@ void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
|
||||
// |----------|-------|----------------|............|
|
||||
// 0 start start+offset end(<=32)
|
||||
// p_mask ffff.....ffffff..ff0000...........00ffff..........
|
||||
|
||||
// like the pmask gen above this replaces the large array.
|
||||
/*
 * 256-bit analogue of p_mask_gen, replacing the p_mask_arr256 table: the
 * result contains a run of m zero bytes beginning at byte 32 - n (so
 * 32 - n bytes of 0xff below the zero zone), with 0xff everywhere else.
 *
 * Fix: the shift operands were swapped relative to p_mask_gen — 'a' was
 * right-shifted by the adjusted m and 'b' left-shifted by n, which placed
 * the zero zone at the wrong offset for any asymmetric call (e.g. the
 * end-zone call fat_pmask_gen(end - start - start_offset,
 * 32 - start - start_offset)).  Mirroring the 128-bit version, the
 * low-ones vector must drop its top n bytes and the high-ones vector its
 * bottom (m + 32 - n) % 33 bytes.
 */
static really_inline
m256 fat_pmask_gen(u8 m, u8 n){
    m256 a = ones256();
    m256 b = ones256();
    m %= 33; n %= 33;
    /* adjust m to the first all-ones byte position above the zero zone */
    m += (32 - n); m %= 33;

    a = rshift_byte_m256(a, n);   /* ones below the zone */
    b = lshift_byte_m256(b, m);   /* ones above the zone */
    return or256(a, b);
}
|
||||
|
||||
static really_inline
|
||||
m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
const u8 *lo, const u8 *hi,
|
||||
@ -279,13 +328,11 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
uintptr_t avail = (uintptr_t)(hi - ptr);
|
||||
if (avail >= 32) {
|
||||
assert(start_offset - start <= 32);
|
||||
*p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
|
||||
+ 32 - start_offset + start);
|
||||
*p_mask = fat_pmask_gen(32 - start_offset + start, 32 - start_offset + start);
|
||||
return loadu256(ptr);
|
||||
}
|
||||
assert(start_offset - start <= avail);
|
||||
*p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
|
||||
+ 32 - start_offset + start);
|
||||
*p_mask = fat_pmask_gen(avail - start_offset + start, 32 - start_offset + start);
|
||||
copy_start = 0;
|
||||
copy_len = avail;
|
||||
} else { //start zone
|
||||
@ -298,8 +345,7 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
}
|
||||
uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
|
||||
assert(start + start_offset <= end);
|
||||
*p_mask = loadu256(p_mask_arr256[end - start - start_offset]
|
||||
+ 32 - start - start_offset);
|
||||
*p_mask = fat_pmask_gen(end - start - start_offset, 32 - start - start_offset);
|
||||
copy_start = start;
|
||||
copy_len = end - start;
|
||||
}
|
||||
@ -338,7 +384,7 @@ static really_inline
|
||||
m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset,
|
||||
const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen,
|
||||
const u32 nMasks) {
|
||||
m512 val;
|
||||
m512 val = zeroes512();
|
||||
|
||||
uintptr_t copy_start;
|
||||
uintptr_t copy_len;
|
||||
@ -418,8 +464,13 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
|
||||
if (!cf) {
|
||||
continue;
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
const struct FDRConfirm *fdrc = reinterpret_cast<const struct FDRConfirm *>
|
||||
(reinterpret_cast<const u8 *>(confBase) + cf);
|
||||
#else
|
||||
const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
|
||||
((const u8 *)confBase + cf);
|
||||
#endif
|
||||
if (!(fdrc->groups & *control)) {
|
||||
continue;
|
||||
}
|
||||
@ -432,18 +483,31 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
|
||||
|
||||
/* Returns the base of the teddy shuffle-mask tables, which are laid out
 * immediately after the (cacheline-rounded) Teddy header.  This header is
 * compiled as both C and C++, hence the cast split. */
static really_inline
const m128 *getMaskBase(const struct Teddy *teddy) {
#ifdef __cplusplus
    return reinterpret_cast<const m128 *>(reinterpret_cast<const u8 *>(teddy) + ROUNDUP_CL(sizeof(struct Teddy)));
#else
    return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
#endif
}
|
||||
|
||||
/* Returns the table that follows the 2 * numMask shuffle masks (i.e. the
 * reinforced-mask region, at a cacheline-rounded offset past getMaskBase).
 * Compiled as both C and C++, hence the cast split. */
static really_inline
const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
#ifdef __cplusplus
    return reinterpret_cast<const u64a *>(reinterpret_cast<const u8 *>(getMaskBase(teddy))
                                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
#else
    return (const u64a *)((const u8 *)getMaskBase(teddy)
                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
#endif
}
|
||||
|
||||
/* Returns the confirm data, located confOffset bytes past the start of the
 * Teddy structure.  Compiled as both C and C++, hence the cast split. */
static really_inline
const u32 *getConfBase(const struct Teddy *teddy) {
#ifdef __cplusplus
    return reinterpret_cast<const u32 *>(reinterpret_cast<const u8 *>(teddy) + teddy->confOffset);
#else
    return (const u32 *)((const u8 *)teddy + teddy->confOffset);
#endif
}
|
||||
|
||||
#endif /* TEDDY_RUNTIME_COMMON_H_ */
|
||||
|
31
src/hs.cpp
31
src/hs.cpp
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2019, Intel Corporation
|
||||
* Copyright (c) 2015-2021, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -44,8 +44,11 @@
|
||||
#include "parser/prefilter.h"
|
||||
#include "parser/unsupported.h"
|
||||
#include "util/compile_error.h"
|
||||
#include "util/cpuid_flags.h"
|
||||
#include "util/cpuid_inline.h"
|
||||
#include "util/arch/common/cpuid_flags.h"
|
||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||
#include "util/arch/x86/cpuid_inline.h"
|
||||
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
|
||||
#endif
|
||||
#include "util/depth.h"
|
||||
#include "util/popcount.h"
|
||||
#include "util/target_info.h"
|
||||
@ -120,9 +123,10 @@ bool checkMode(unsigned int mode, hs_compile_error **comp_error) {
|
||||
|
||||
static
|
||||
bool checkPlatform(const hs_platform_info *p, hs_compile_error **comp_error) {
|
||||
static constexpr u32 HS_TUNE_LAST = HS_TUNE_FAMILY_GLM;
|
||||
static constexpr u32 HS_TUNE_LAST = HS_TUNE_FAMILY_ICX;
|
||||
static constexpr u32 HS_CPU_FEATURES_ALL =
|
||||
HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512;
|
||||
HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512 |
|
||||
HS_CPU_FEATURES_AVX512VBMI;
|
||||
|
||||
if (!p) {
|
||||
return true;
|
||||
@ -195,11 +199,13 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags,
|
||||
}
|
||||
|
||||
#if defined(FAT_RUNTIME)
|
||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||
if (!check_ssse3()) {
|
||||
*db = nullptr;
|
||||
*comp_error = generateCompileError("Unsupported architecture", -1);
|
||||
return HS_ARCH_ERROR;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!checkMode(mode, comp_error)) {
|
||||
@ -316,13 +322,14 @@ hs_compile_lit_multi_int(const char *const *expressions, const unsigned *flags,
|
||||
*comp_error = generateCompileError("Invalid parameter: elements is zero", -1);
|
||||
return HS_COMPILER_ERROR;
|
||||
}
|
||||
|
||||
#if defined(FAT_RUNTIME)
|
||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||
if (!check_ssse3()) {
|
||||
*db = nullptr;
|
||||
*comp_error = generateCompileError("Unsupported architecture", -1);
|
||||
return HS_ARCH_ERROR;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!checkMode(mode, comp_error)) {
|
||||
@ -496,10 +503,12 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
|
||||
}
|
||||
|
||||
#if defined(FAT_RUNTIME)
|
||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||
if (!check_ssse3()) {
|
||||
*error = generateCompileError("Unsupported architecture", -1);
|
||||
return HS_ARCH_ERROR;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!info) {
|
||||
@ -513,6 +522,12 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
|
||||
return HS_COMPILER_ERROR;
|
||||
}
|
||||
|
||||
if (flags & HS_FLAG_COMBINATION) {
|
||||
*error = generateCompileError("Invalid parameter: unsupported "
|
||||
"logical combination expression", -1);
|
||||
return HS_COMPILER_ERROR;
|
||||
}
|
||||
|
||||
*info = nullptr;
|
||||
*error = nullptr;
|
||||
|
||||
@ -574,7 +589,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
|
||||
return HS_COMPILER_ERROR;
|
||||
}
|
||||
|
||||
hs_expr_info *rv = (hs_expr_info *)hs_misc_alloc(sizeof(*rv));
|
||||
hs_expr_info *rv = static_cast<hs_expr_info *>(hs_misc_alloc(sizeof(*rv)));
|
||||
if (!rv) {
|
||||
*error = const_cast<hs_compile_error_t *>(&hs_enomem);
|
||||
return HS_COMPILER_ERROR;
|
||||
@ -621,9 +636,11 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform) {
|
||||
extern "C" HS_PUBLIC_API
|
||||
hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error) {
|
||||
#if defined(FAT_RUNTIME)
|
||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||
if (!check_ssse3()) {
|
||||
return HS_ARCH_ERROR;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
freeCompileError(error);
|
||||
return HS_SUCCESS;
|
||||
|
7
src/hs.h
7
src/hs.h
@ -39,12 +39,7 @@
|
||||
* the individual component headers for documentation.
|
||||
*/
|
||||
|
||||
/* The current Hyperscan version information. */
|
||||
|
||||
#define HS_MAJOR 5
|
||||
#define HS_MINOR 3
|
||||
#define HS_PATCH 0
|
||||
|
||||
#include "hs_version.h"
|
||||
#include "hs_compile.h"
|
||||
#include "hs_runtime.h"
|
||||
|
||||
|
@ -29,11 +29,7 @@
|
||||
#ifndef HS_COMMON_H_
|
||||
#define HS_COMMON_H_
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define HS_CDECL __cdecl
|
||||
#else
|
||||
#define HS_CDECL
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
|
||||
/**
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2020, Intel Corporation
|
||||
* Copyright (c) 2015-2021, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -748,10 +748,7 @@ hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error);
|
||||
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
|
||||
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
|
||||
* when a match is found.
|
||||
* - HS_FLAG_COMBINATION - Parse the expression in logical combination
|
||||
* syntax.
|
||||
* - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for
|
||||
* the sub-expressions in logical combinations.
|
||||
* - HS_FLAG_QUIET - This flag will be ignored.
|
||||
*
|
||||
* @param info
|
||||
* On success, a pointer to the pattern information will be returned in
|
||||
@ -814,10 +811,7 @@ hs_error_t HS_CDECL hs_expression_info(const char *expression,
|
||||
* - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode.
|
||||
* - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset
|
||||
* when a match is found.
|
||||
* - HS_FLAG_COMBINATION - Parse the expression in logical combination
|
||||
* syntax.
|
||||
* - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for
|
||||
* the sub-expressions in logical combinations.
|
||||
* - HS_FLAG_QUIET - This flag will be ignored.
|
||||
*
|
||||
* @param ext
|
||||
* A pointer to a filled @ref hs_expr_ext_t structure that defines
|
||||
@ -1034,6 +1028,15 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform);
|
||||
*/
|
||||
#define HS_CPU_FEATURES_AVX512 (1ULL << 3)
|
||||
|
||||
/**
|
||||
* CPU features flag - Intel(R) Advanced Vector Extensions 512
|
||||
* Vector Byte Manipulation Instructions (Intel(R) AVX512VBMI)
|
||||
*
|
||||
* Setting this flag indicates that the target platform supports AVX512VBMI
|
||||
* instructions. Using AVX512VBMI implies the use of AVX512.
|
||||
*/
|
||||
#define HS_CPU_FEATURES_AVX512VBMI (1ULL << 4)
|
||||
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
@ -1114,6 +1117,22 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform);
|
||||
*/
|
||||
#define HS_TUNE_FAMILY_GLM 8
|
||||
|
||||
/**
|
||||
* Tuning Parameter - Intel(R) microarchitecture code name Icelake
|
||||
*
|
||||
* This indicates that the compiled database should be tuned for the
|
||||
* Icelake microarchitecture.
|
||||
*/
|
||||
#define HS_TUNE_FAMILY_ICL 9
|
||||
|
||||
/**
|
||||
* Tuning Parameter - Intel(R) microarchitecture code name Icelake Server
|
||||
*
|
||||
* This indicates that the compiled database should be tuned for the
|
||||
* Icelake Server microarchitecture.
|
||||
*/
|
||||
#define HS_TUNE_FAMILY_ICX 10
|
||||
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2019, Intel Corporation
|
||||
* Copyright (c) 2019-2021, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -80,7 +80,9 @@ extern "C"
|
||||
| HS_FLAG_PREFILTER \
|
||||
| HS_FLAG_SINGLEMATCH \
|
||||
| HS_FLAG_ALLOWEMPTY \
|
||||
| HS_FLAG_SOM_LEFTMOST)
|
||||
| HS_FLAG_SOM_LEFTMOST \
|
||||
| HS_FLAG_COMBINATION \
|
||||
| HS_FLAG_QUIET)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2016-2017, Intel Corporation
|
||||
* Copyright (c) 2020-2023, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -26,16 +27,36 @@
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "hs_common.h"
|
||||
#include "util/cpuid_flags.h"
|
||||
#include "util/cpuid_inline.h"
|
||||
#include "ue2common.h"
|
||||
#if !defined(VS_SIMDE_BACKEND)
|
||||
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
|
||||
#include "util/arch/x86/cpuid_inline.h"
|
||||
#elif defined(ARCH_AARCH64)
|
||||
#include "util/arch/arm/cpuid_inline.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
HS_PUBLIC_API
|
||||
hs_error_t HS_CDECL hs_valid_platform(void) {
|
||||
/* Hyperscan requires SSSE3, anything else is a bonus */
|
||||
if (check_ssse3()) {
|
||||
/* Vectorscan requires SSE4.2, anything else is a bonus */
|
||||
#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_IA32) || defined(ARCH_X86_64))
|
||||
// cppcheck-suppress knownConditionTrueFalse
|
||||
if (check_sse42()) {
|
||||
return HS_SUCCESS;
|
||||
} else {
|
||||
return HS_ARCH_ERROR;
|
||||
}
|
||||
#elif !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
|
||||
//check_neon returns true for now
|
||||
// cppcheck-suppress knownConditionTrueFalse
|
||||
if (check_neon()) {
|
||||
return HS_SUCCESS;
|
||||
} else {
|
||||
return HS_ARCH_ERROR;
|
||||
}
|
||||
#elif defined(ARCH_PPC64EL) || defined(VS_SIMDE_BACKEND)
|
||||
return HS_SUCCESS;
|
||||
#endif
|
||||
}
|
||||
|
@ -36,5 +36,9 @@
|
||||
|
||||
#define HS_VERSION_32BIT ((@HS_MAJOR_VERSION@ << 24) | (@HS_MINOR_VERSION@ << 16) | (@HS_PATCH_VERSION@ << 8) | 0)
|
||||
|
||||
#define HS_MAJOR @HS_MAJOR_VERSION@
|
||||
#define HS_MINOR @HS_MINOR_VERSION@
|
||||
#define HS_PATCH @HS_PATCH_VERSION@
|
||||
|
||||
#endif /* HS_VERSION_H_C6428FAF8E3713 */
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2021, Arm Limited
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -38,7 +39,7 @@
|
||||
#include "nfa/accel.h"
|
||||
#include "nfa/shufti.h"
|
||||
#include "nfa/truffle.h"
|
||||
#include "nfa/vermicelli.h"
|
||||
#include "nfa/vermicelli.hpp"
|
||||
#include <string.h>
|
||||
|
||||
#define MIN_ACCEL_LEN_BLOCK 16
|
||||
@ -62,12 +63,22 @@ const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr,
|
||||
DEBUG_PRINTF("double vermicelli-nocase for 0x%02hhx%02hhx\n",
|
||||
aux->dverm.c1, aux->dverm.c2);
|
||||
return vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 1, ptr, end);
|
||||
#ifdef HAVE_SVE2
|
||||
case ACCEL_VERM16:
|
||||
DEBUG_PRINTF("single vermicelli16\n");
|
||||
return vermicelli16Exec(aux->verm16.mask, ptr, end);
|
||||
#endif // HAVE_SVE2
|
||||
case ACCEL_SHUFTI:
|
||||
DEBUG_PRINTF("single shufti\n");
|
||||
return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end);
|
||||
case ACCEL_TRUFFLE:
|
||||
DEBUG_PRINTF("truffle\n");
|
||||
return truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end);
|
||||
return truffleExec(aux->truffle.mask_lo, aux->truffle.mask_hi, ptr, end);
|
||||
#ifdef CAN_USE_WIDE_TRUFFLE
|
||||
case ACCEL_TRUFFLE_WIDE:
|
||||
DEBUG_PRINTF("truffle wide\n");
|
||||
return truffleExecWide(aux->truffle.mask, ptr, end);
|
||||
#endif // CAN_USE_WIDE_TRUFFLE
|
||||
default:
|
||||
/* no acceleration, fall through and return current ptr */
|
||||
DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type);
|
||||
@ -164,8 +175,7 @@ void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen,
|
||||
DEBUG_PRINTF("got %zu/%zu in 2nd buffer\n", delta, len);
|
||||
*start += delta;
|
||||
} else if (hlen) {
|
||||
UNUSED size_t remaining = offset + ptr2 - found;
|
||||
DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", remaining, hlen);
|
||||
DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", offset + ptr2 - found, hlen);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -46,7 +46,6 @@
|
||||
#include "fdr/teddy_engine_description.h"
|
||||
#include "util/compile_context.h"
|
||||
#include "util/compile_error.h"
|
||||
#include "util/make_unique.h"
|
||||
#include "util/ue2string.h"
|
||||
|
||||
#include <cassert>
|
||||
@ -58,24 +57,24 @@ using namespace std;
|
||||
namespace ue2 {
|
||||
|
||||
HWLMProto::HWLMProto(u8 engType_in, vector<hwlmLiteral> lits_in)
|
||||
: engType(engType_in), lits(move(lits_in)) {}
|
||||
: engType(engType_in), lits(std::move(lits_in)) {}
|
||||
|
||||
HWLMProto::HWLMProto(u8 engType_in,
|
||||
unique_ptr<FDREngineDescription> eng_in,
|
||||
vector<hwlmLiteral> lits_in,
|
||||
map<u32, vector<u32>> bucketToLits_in,
|
||||
bool make_small_in)
|
||||
: engType(engType_in), fdrEng(move(eng_in)), lits(move(lits_in)),
|
||||
bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {}
|
||||
: engType(engType_in), fdrEng(std::move(eng_in)), lits(std::move(lits_in)),
|
||||
bucketToLits(std::move(bucketToLits_in)), make_small(make_small_in) {}
|
||||
|
||||
HWLMProto::HWLMProto(u8 engType_in,
|
||||
unique_ptr<TeddyEngineDescription> eng_in,
|
||||
vector<hwlmLiteral> lits_in,
|
||||
map<u32, vector<u32>> bucketToLits_in,
|
||||
bool make_small_in)
|
||||
: engType(engType_in), teddyEng(move(eng_in)),
|
||||
lits(move(lits_in)),
|
||||
bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {}
|
||||
: engType(engType_in), teddyEng(std::move(eng_in)),
|
||||
lits(std::move(lits_in)),
|
||||
bucketToLits(std::move(bucketToLits_in)), make_small(make_small_in) {}
|
||||
|
||||
HWLMProto::~HWLMProto() {}
|
||||
|
||||
@ -94,6 +93,7 @@ void dumpLits(UNUSED const vector<hwlmLiteral> &lits) {
|
||||
// Called by an assertion.
|
||||
static
|
||||
bool everyoneHasGroups(const vector<hwlmLiteral> &lits) {
|
||||
// cppcheck-suppress useStlAlgorithm
|
||||
for (const auto &lit : lits) {
|
||||
if (!lit.groups) {
|
||||
return false;
|
||||
@ -133,18 +133,18 @@ bytecode_ptr<HWLM> hwlmBuild(const HWLMProto &proto, const CompileContext &cc,
|
||||
if (noodle) {
|
||||
engSize = noodle.size();
|
||||
}
|
||||
eng = move(noodle);
|
||||
eng = std::move(noodle);
|
||||
} else {
|
||||
DEBUG_PRINTF("building a new deal\n");
|
||||
auto fdr = fdrBuildTable(proto, cc.grey);
|
||||
if (fdr) {
|
||||
engSize = fdr.size();
|
||||
}
|
||||
eng = move(fdr);
|
||||
eng = std::move(fdr);
|
||||
}
|
||||
|
||||
if (!eng) {
|
||||
return nullptr;
|
||||
return bytecode_ptr<HWLM>(nullptr);
|
||||
}
|
||||
|
||||
assert(engSize);
|
||||
@ -156,6 +156,7 @@ bytecode_ptr<HWLM> hwlmBuild(const HWLMProto &proto, const CompileContext &cc,
|
||||
auto h = make_zeroed_bytecode_ptr<HWLM>(hwlm_len, 64);
|
||||
|
||||
h->type = proto.engType;
|
||||
// cppcheck-suppress cstyleCast
|
||||
memcpy(HWLM_DATA(h.get()), eng.get(), engSize);
|
||||
|
||||
return h;
|
||||
@ -201,7 +202,7 @@ hwlmBuildProto(vector<hwlmLiteral> &lits, bool make_small,
|
||||
|
||||
if (isNoodleable(lits, cc)) {
|
||||
DEBUG_PRINTF("build noodle table\n");
|
||||
proto = ue2::make_unique<HWLMProto>(HWLM_ENGINE_NOOD, lits);
|
||||
proto = std::make_unique<HWLMProto>(HWLM_ENGINE_NOOD, lits);
|
||||
} else {
|
||||
DEBUG_PRINTF("building a new deal\n");
|
||||
proto = fdrBuildProto(HWLM_ENGINE_FDR, lits, make_small,
|
||||
@ -219,10 +220,12 @@ size_t hwlmSize(const HWLM *h) {
|
||||
|
||||
switch (h->type) {
|
||||
case HWLM_ENGINE_NOOD:
|
||||
engSize = noodSize((const noodTable *)HWLM_C_DATA(h));
|
||||
// cppcheck-suppress cstyleCast
|
||||
engSize = noodSize(reinterpret_cast<const noodTable *>(HWLM_C_DATA(h)));
|
||||
break;
|
||||
case HWLM_ENGINE_FDR:
|
||||
engSize = fdrSize((const FDR *)HWLM_C_DATA(h));
|
||||
// cppcheck-suppress cstyleCast
|
||||
engSize = fdrSize(reinterpret_cast<const FDR *>(HWLM_C_DATA(h)));
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -53,10 +53,12 @@ void hwlmGenerateDumpFiles(const HWLM *h, const string &base) {
|
||||
|
||||
switch (h->type) {
|
||||
case HWLM_ENGINE_NOOD:
|
||||
noodPrintStats((const noodTable *)HWLM_C_DATA(h), f);
|
||||
// cppcheck-suppress cstyleCast
|
||||
noodPrintStats(reinterpret_cast<const noodTable *>(HWLM_C_DATA(h)), f);
|
||||
break;
|
||||
case HWLM_ENGINE_FDR:
|
||||
fdrPrintStats((const FDR *)HWLM_C_DATA(h), f);
|
||||
// cppcheck-suppress cstyleCast
|
||||
fdrPrintStats(reinterpret_cast<const FDR *>(HWLM_C_DATA(h)), f);
|
||||
break;
|
||||
default:
|
||||
fprintf(f, "<unknown hwlm subengine>\n");
|
||||
|
@ -56,7 +56,7 @@ u64a make_u64a_mask(const vector<u8> &v) {
|
||||
|
||||
u64a mask = 0;
|
||||
size_t len = v.size();
|
||||
unsigned char *m = (unsigned char *)&mask;
|
||||
u8 *m = reinterpret_cast<u8 *>(&mask);
|
||||
DEBUG_PRINTF("making mask len %zu\n", len);
|
||||
memcpy(m, &v[0], len);
|
||||
return mask;
|
||||
@ -156,7 +156,7 @@ void noodPrintStats(const noodTable *n, FILE *f) {
|
||||
n->msk_len);
|
||||
fprintf(f, "String: ");
|
||||
for (u32 i = 0; i < n->msk_len; i++) {
|
||||
const u8 *m = (const u8 *)&n->cmp;
|
||||
const u8 *m = reinterpret_cast<const u8 *>(&n->cmp);
|
||||
if (isgraph(m[i]) && m[i] != '\\') {
|
||||
fprintf(f, "%c", m[i]);
|
||||
} else {
|
||||
|
@ -1,442 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Noodle literal matcher: runtime.
|
||||
*/
|
||||
#include "hwlm.h"
|
||||
#include "noodle_engine.h"
|
||||
#include "noodle_internal.h"
|
||||
#include "scratch.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/arch.h"
|
||||
#include "util/bitutils.h"
|
||||
#include "util/compare.h"
|
||||
#include "util/intrinsics.h"
|
||||
#include "util/join.h"
|
||||
#include "util/masked_move.h"
|
||||
#include "util/partial_store.h"
|
||||
#include "util/simd_utils.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
|
||||
/** \brief Noodle runtime context. */
|
||||
struct cb_info {
|
||||
HWLMCallback cb; //!< callback function called on match
|
||||
u32 id; //!< ID to pass to callback on match
|
||||
struct hs_scratch *scratch; //!< scratch to pass to callback
|
||||
size_t offsetAdj; //!< used in streaming mode
|
||||
};
|
||||
|
||||
#if defined(HAVE_AVX512)
|
||||
#define CHUNKSIZE 64
|
||||
#define MASK_TYPE m512
|
||||
#define Z_BITS 64
|
||||
#define Z_TYPE u64a
|
||||
#elif defined(HAVE_AVX2)
|
||||
#define CHUNKSIZE 32
|
||||
#define MASK_TYPE m256
|
||||
#define Z_BITS 32
|
||||
#define Z_TYPE u32
|
||||
#else
|
||||
#define CHUNKSIZE 16
|
||||
#define MASK_TYPE m128
|
||||
#define Z_BITS 32
|
||||
#define Z_TYPE u32
|
||||
#endif
|
||||
|
||||
|
||||
#define RETURN_IF_TERMINATED(x) \
|
||||
{ \
|
||||
if ((x) == HWLM_TERMINATED) { \
|
||||
return HWLM_TERMINATED; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define SINGLE_ZSCAN() \
|
||||
do { \
|
||||
while (unlikely(z)) { \
|
||||
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \
|
||||
size_t matchPos = d - buf + pos; \
|
||||
DEBUG_PRINTF("match pos %zu\n", matchPos); \
|
||||
hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos); \
|
||||
RETURN_IF_TERMINATED(rv); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define DOUBLE_ZSCAN() \
|
||||
do { \
|
||||
while (unlikely(z)) { \
|
||||
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \
|
||||
size_t matchPos = d - buf + pos - 1; \
|
||||
DEBUG_PRINTF("match pos %zu\n", matchPos); \
|
||||
hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos); \
|
||||
RETURN_IF_TERMINATED(rv); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static really_inline
|
||||
u8 caseClear8(u8 x, bool noCase) {
|
||||
return (u8)(noCase ? (x & (u8)0xdf) : x);
|
||||
}
|
||||
|
||||
// Make sure the rest of the string is there. The single character scanner
|
||||
// is used only for single chars with case insensitivity used correctly,
|
||||
// so it can go straight to the callback if we get this far.
|
||||
static really_inline
|
||||
hwlm_error_t final(const struct noodTable *n, const u8 *buf, UNUSED size_t len,
|
||||
char single, const struct cb_info *cbi, size_t pos) {
|
||||
if (single) {
|
||||
if (n->msk_len == 1) {
|
||||
goto match;
|
||||
}
|
||||
}
|
||||
assert(len >= n->msk_len);
|
||||
u64a v =
|
||||
partial_load_u64a(buf + pos + n->key_offset - n->msk_len, n->msk_len);
|
||||
DEBUG_PRINTF("v %016llx msk %016llx cmp %016llx\n", v, n->msk, n->cmp);
|
||||
if ((v & n->msk) != n->cmp) {
|
||||
/* mask didn't match */
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
match:
|
||||
pos -= cbi->offsetAdj;
|
||||
DEBUG_PRINTF("match @ %zu\n", pos + n->key_offset);
|
||||
hwlmcb_rv_t rv = cbi->cb(pos + n->key_offset - 1, cbi->id, cbi->scratch);
|
||||
if (rv == HWLM_TERMINATE_MATCHING) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
#if defined(HAVE_AVX512)
|
||||
#define CHUNKSIZE 64
|
||||
#define MASK_TYPE m512
|
||||
#include "noodle_engine_avx512.c"
|
||||
#elif defined(HAVE_AVX2)
|
||||
#define CHUNKSIZE 32
|
||||
#define MASK_TYPE m256
|
||||
#include "noodle_engine_avx2.c"
|
||||
#else
|
||||
#define CHUNKSIZE 16
|
||||
#define MASK_TYPE m128
|
||||
#include "noodle_engine_sse.c"
|
||||
#endif
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t start, bool noCase,
|
||||
const struct cb_info *cbi) {
|
||||
|
||||
const MASK_TYPE mask1 = getMask(n->key0, noCase);
|
||||
const MASK_TYPE caseMask = getCaseMask();
|
||||
|
||||
size_t offset = start + n->msk_len - 1;
|
||||
size_t end = len;
|
||||
assert(offset < end);
|
||||
|
||||
#if !defined(HAVE_AVX512)
|
||||
hwlm_error_t rv;
|
||||
|
||||
if (end - offset < CHUNKSIZE) {
|
||||
rv = scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, offset,
|
||||
end);
|
||||
return rv;
|
||||
}
|
||||
|
||||
if (end - offset == CHUNKSIZE) {
|
||||
rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
|
||||
cbi, offset, end);
|
||||
return rv;
|
||||
}
|
||||
|
||||
uintptr_t data = (uintptr_t)buf;
|
||||
uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data;
|
||||
uintptr_t last = data + end;
|
||||
uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data;
|
||||
uintptr_t s3Start = end - CHUNKSIZE;
|
||||
|
||||
if (offset != s2Start) {
|
||||
// first scan out to the fast scan starting point
|
||||
DEBUG_PRINTF("stage 1: -> %zu\n", s2Start);
|
||||
rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
|
||||
cbi, offset, s2Start);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
|
||||
if (likely(s2Start != s2End)) {
|
||||
// scan as far as we can, bounded by the last point this key can
|
||||
// possibly match
|
||||
DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End);
|
||||
rv = scanSingleFast(n, buf, len, noCase, caseMask, mask1, cbi, s2Start,
|
||||
s2End);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
|
||||
// if we are done bail out
|
||||
if (s2End == len) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len);
|
||||
rv = scanSingleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, cbi,
|
||||
s2End, len);
|
||||
|
||||
return rv;
|
||||
#else // HAVE_AVX512
|
||||
return scanSingle512(n, buf, len, noCase, caseMask, mask1, cbi, offset,
|
||||
end);
|
||||
#endif
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t start, bool noCase,
|
||||
const struct cb_info *cbi) {
|
||||
// we stop scanning for the key-fragment when the rest of the key can't
|
||||
// possibly fit in the remaining buffer
|
||||
size_t end = len - n->key_offset + 2;
|
||||
|
||||
// the first place the key can match
|
||||
size_t offset = start + n->msk_len - n->key_offset;
|
||||
|
||||
const MASK_TYPE caseMask = getCaseMask();
|
||||
const MASK_TYPE mask1 = getMask(n->key0, noCase);
|
||||
const MASK_TYPE mask2 = getMask(n->key1, noCase);
|
||||
|
||||
#if !defined(HAVE_AVX512)
|
||||
hwlm_error_t rv;
|
||||
|
||||
if (end - offset < CHUNKSIZE) {
|
||||
rv = scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
|
||||
offset, end);
|
||||
return rv;
|
||||
}
|
||||
if (end - offset == CHUNKSIZE) {
|
||||
rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
|
||||
mask2, cbi, offset, end);
|
||||
return rv;
|
||||
}
|
||||
|
||||
uintptr_t data = (uintptr_t)buf;
|
||||
uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data;
|
||||
uintptr_t s1End = s2Start + 1;
|
||||
uintptr_t last = data + end;
|
||||
uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data;
|
||||
uintptr_t s3Start = end - CHUNKSIZE;
|
||||
uintptr_t off = offset;
|
||||
|
||||
if (s2Start != off) {
|
||||
// first scan out to the fast scan starting point plus one char past to
|
||||
// catch the key on the overlap
|
||||
DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start);
|
||||
rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1,
|
||||
mask2, cbi, off, s1End);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
off = s1End;
|
||||
|
||||
if (s2Start >= end) {
|
||||
DEBUG_PRINTF("s2 == mL %zu\n", end);
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
if (likely(s2Start != s2End)) {
|
||||
// scan as far as we can, bounded by the last point this key can
|
||||
// possibly match
|
||||
DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start);
|
||||
rv = scanDoubleFast(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
|
||||
s2Start, s2End);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
off = s2End;
|
||||
}
|
||||
|
||||
// if there isn't enough data left to match the key, bail out
|
||||
if (s2End == end) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end);
|
||||
rv = scanDoubleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1,
|
||||
mask2, cbi, off, end);
|
||||
|
||||
return rv;
|
||||
#else // AVX512
|
||||
return scanDouble512(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
|
||||
offset, end);
|
||||
#endif // AVX512
|
||||
}
|
||||
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleNoCase(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t start,
|
||||
const struct cb_info *cbi) {
|
||||
return scanSingleMain(n, buf, len, start, 1, cbi);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleCase(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t start,
|
||||
const struct cb_info *cbi) {
|
||||
return scanSingleMain(n, buf, len, start, 0, cbi);
|
||||
}
|
||||
|
||||
// Single-character specialisation, used when keyLen = 1
|
||||
static really_inline
|
||||
hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
size_t start, bool noCase, const struct cb_info *cbi) {
|
||||
if (!ourisalpha(n->key0)) {
|
||||
noCase = 0; // force noCase off if we don't have an alphabetic char
|
||||
}
|
||||
|
||||
// kinda ugly, but this forces constant propagation
|
||||
if (noCase) {
|
||||
return scanSingleNoCase(n, buf, len, start, cbi);
|
||||
} else {
|
||||
return scanSingleCase(n, buf, len, start, cbi);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleNoCase(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t start,
|
||||
const struct cb_info *cbi) {
|
||||
return scanDoubleMain(n, buf, len, start, 1, cbi);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleCase(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t start,
|
||||
const struct cb_info *cbi) {
|
||||
return scanDoubleMain(n, buf, len, start, 0, cbi);
|
||||
}
|
||||
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
size_t start, bool noCase, const struct cb_info *cbi) {
|
||||
// kinda ugly, but this forces constant propagation
|
||||
if (noCase) {
|
||||
return scanDoubleNoCase(n, buf, len, start, cbi);
|
||||
} else {
|
||||
return scanDoubleCase(n, buf, len, start, cbi);
|
||||
}
|
||||
}
|
||||
|
||||
// main entry point for the scan code
|
||||
static really_inline
|
||||
hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
size_t start, char single, bool noCase,
|
||||
const struct cb_info *cbi) {
|
||||
if (len - start < n->msk_len) {
|
||||
// can't find string of length keyLen in a shorter buffer
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
if (single) {
|
||||
return scanSingle(n, buf, len, start, noCase, cbi);
|
||||
} else {
|
||||
return scanDouble(n, buf, len, start, noCase, cbi);
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Block-mode scanner. */
|
||||
hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
size_t start, HWLMCallback cb,
|
||||
struct hs_scratch *scratch) {
|
||||
assert(n && buf);
|
||||
|
||||
struct cb_info cbi = {cb, n->id, scratch, 0};
|
||||
DEBUG_PRINTF("nood scan of %zu bytes for %*s @ %p\n", len, n->msk_len,
|
||||
(const char *)&n->cmp, buf);
|
||||
|
||||
return scan(n, buf, len, start, n->single, n->nocase, &cbi);
|
||||
}
|
||||
|
||||
/** \brief Streaming-mode scanner. */
|
||||
hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf,
|
||||
size_t hlen, const u8 *buf, size_t len,
|
||||
HWLMCallback cb, struct hs_scratch *scratch) {
|
||||
assert(n);
|
||||
|
||||
if (len + hlen < n->msk_len) {
|
||||
DEBUG_PRINTF("not enough bytes for a match\n");
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
struct cb_info cbi = {cb, n->id, scratch, 0};
|
||||
DEBUG_PRINTF("nood scan of %zu bytes (%zu hlen) for %*s @ %p\n", len, hlen,
|
||||
n->msk_len, (const char *)&n->cmp, buf);
|
||||
|
||||
if (hlen && n->msk_len > 1) {
|
||||
/*
|
||||
* we have history, so build up a buffer from enough of the history
|
||||
* buffer plus what we've been given to scan. Since this is relatively
|
||||
* short, just check against msk+cmp per byte offset for matches.
|
||||
*/
|
||||
assert(hbuf);
|
||||
u8 ALIGN_DIRECTIVE temp_buf[HWLM_LITERAL_MAX_LEN * 2];
|
||||
memset(temp_buf, 0, sizeof(temp_buf));
|
||||
|
||||
assert(n->msk_len);
|
||||
size_t tl1 = MIN((size_t)n->msk_len - 1, hlen);
|
||||
size_t tl2 = MIN((size_t)n->msk_len - 1, len);
|
||||
|
||||
assert(tl1 + tl2 <= sizeof(temp_buf));
|
||||
assert(tl1 + tl2 >= n->msk_len);
|
||||
assert(tl1 <= sizeof(u64a));
|
||||
assert(tl2 <= sizeof(u64a));
|
||||
DEBUG_PRINTF("using %zu bytes of hist and %zu bytes of buf\n", tl1, tl2);
|
||||
|
||||
unaligned_store_u64a(temp_buf,
|
||||
partial_load_u64a(hbuf + hlen - tl1, tl1));
|
||||
unaligned_store_u64a(temp_buf + tl1, partial_load_u64a(buf, tl2));
|
||||
|
||||
for (size_t i = 0; i <= tl1 + tl2 - n->msk_len; i++) {
|
||||
u64a v = unaligned_load_u64a(temp_buf + i);
|
||||
if ((v & n->msk) == n->cmp) {
|
||||
size_t m_end = -tl1 + i + n->msk_len - 1;
|
||||
DEBUG_PRINTF("match @ %zu (i %zu)\n", m_end, i);
|
||||
hwlmcb_rv_t rv = cb(m_end, n->id, scratch);
|
||||
if (rv == HWLM_TERMINATE_MATCHING) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert(buf);
|
||||
|
||||
cbi.offsetAdj = 0;
|
||||
return scan(n, buf, len, 0, n->single, n->nocase, &cbi);
|
||||
}
|
191
src/hwlm/noodle_engine.cpp
Normal file
191
src/hwlm/noodle_engine.cpp
Normal file
@ -0,0 +1,191 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2020, 2021, VectorCamp PC
|
||||
* Copyright (c) 2021, Arm Limited
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* \brief Noodle literal matcher: runtime.
|
||||
*/
|
||||
#include "hwlm.h"
|
||||
#include "noodle_engine.h"
|
||||
#include "noodle_internal.h"
|
||||
#include "scratch.h"
|
||||
#include "ue2common.h"
|
||||
#include "util/arch.h"
|
||||
#include "util/bitutils.h"
|
||||
#include "util/compare.h"
|
||||
#include "util/intrinsics.h"
|
||||
#include "util/join.h"
|
||||
#include "util/partial_store.h"
|
||||
#include "util/simd_utils.h"
|
||||
|
||||
#if defined(HAVE_AVX2)
|
||||
#include "util/arch/x86/masked_move.h"
|
||||
#endif
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
|
||||
/** \brief Noodle runtime context. */
|
||||
struct cb_info {
|
||||
HWLMCallback cb; //!< callback function called on match
|
||||
u32 id; //!< ID to pass to callback on match
|
||||
struct hs_scratch *scratch; //!< scratch to pass to callback
|
||||
size_t offsetAdj; //!< used in streaming mode
|
||||
};
|
||||
|
||||
|
||||
#define RETURN_IF_TERMINATED(x) \
|
||||
{ \
|
||||
if ((x) == HWLM_TERMINATED) { \
|
||||
return HWLM_TERMINATED; \
|
||||
} \
|
||||
}
|
||||
|
||||
// Make sure the rest of the string is there. The single character scanner
|
||||
// is used only for single chars with case insensitivity used correctly,
|
||||
// so it can go straight to the callback if we get this far.
|
||||
static really_inline
|
||||
hwlm_error_t final(const struct noodTable *n, const u8 *buf, UNUSED size_t len,
|
||||
bool needsConfirm, const struct cb_info *cbi, size_t pos) {
|
||||
u64a v{0};
|
||||
if (!needsConfirm) {
|
||||
goto match;
|
||||
}
|
||||
assert(len >= n->msk_len);
|
||||
v = partial_load_u64a(buf + pos + n->key_offset - n->msk_len, n->msk_len);
|
||||
DEBUG_PRINTF("v %016llx msk %016llx cmp %016llx\n", v, n->msk, n->cmp);
|
||||
if ((v & n->msk) != n->cmp) {
|
||||
/* mask didn't match */
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
match:
|
||||
pos -= cbi->offsetAdj;
|
||||
DEBUG_PRINTF("match @ %zu\n", pos + n->key_offset);
|
||||
hwlmcb_rv_t rv = cbi->cb(pos + n->key_offset - 1, cbi->id, cbi->scratch);
|
||||
if (rv == HWLM_TERMINATE_MATCHING) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
#ifdef HAVE_SVE2
|
||||
#include "noodle_engine_sve.hpp"
|
||||
#else
|
||||
#include "noodle_engine_simd.hpp"
|
||||
#endif
|
||||
|
||||
// main entry point for the scan code
|
||||
static really_inline
|
||||
hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
size_t start, char single, bool noCase,
|
||||
const struct cb_info *cbi) {
|
||||
if (len - start < n->msk_len) {
|
||||
// can't find string of length keyLen in a shorter buffer
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
if (single) {
|
||||
return scanSingle(n, buf, len, start, noCase, cbi);
|
||||
} else {
|
||||
return scanDouble(n, buf, len, start, noCase, cbi);
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Block-mode scanner. */
|
||||
hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
size_t start, HWLMCallback cb,
|
||||
struct hs_scratch *scratch) {
|
||||
assert(n && buf);
|
||||
|
||||
struct cb_info cbi = {cb, n->id, scratch, 0};
|
||||
DEBUG_PRINTF("nood scan of %zu bytes for %*s @ %p\n", len, n->msk_len,
|
||||
(const char *)&n->cmp, buf);
|
||||
|
||||
return scan(n, buf, len, start, n->single, n->nocase, &cbi);
|
||||
}
|
||||
|
||||
/** \brief Streaming-mode scanner. */
|
||||
hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf,
|
||||
size_t hlen, const u8 *buf, size_t len,
|
||||
HWLMCallback cb, struct hs_scratch *scratch) {
|
||||
assert(n);
|
||||
|
||||
if (len + hlen < n->msk_len) {
|
||||
DEBUG_PRINTF("not enough bytes for a match\n");
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
struct cb_info cbi = {cb, n->id, scratch, 0};
|
||||
DEBUG_PRINTF("nood scan of %zu bytes (%zu hlen) for %*s @ %p\n", len, hlen,
|
||||
n->msk_len, (const char *)&n->cmp, buf);
|
||||
|
||||
if (hlen && n->msk_len > 1) {
|
||||
/*
|
||||
* we have history, so build up a buffer from enough of the history
|
||||
* buffer plus what we've been given to scan. Since this is relatively
|
||||
* short, just check against msk+cmp per byte offset for matches.
|
||||
*/
|
||||
assert(hbuf);
|
||||
u8 ALIGN_DIRECTIVE temp_buf[HWLM_LITERAL_MAX_LEN * 2];
|
||||
memset(temp_buf, 0, sizeof(temp_buf));
|
||||
|
||||
assert(n->msk_len);
|
||||
size_t tl1 = MIN((size_t)n->msk_len - 1, hlen);
|
||||
size_t tl2 = MIN((size_t)n->msk_len - 1, len);
|
||||
|
||||
assert(tl1 + tl2 <= sizeof(temp_buf));
|
||||
assert(tl1 + tl2 >= n->msk_len);
|
||||
assert(tl1 <= sizeof(u64a));
|
||||
assert(tl2 <= sizeof(u64a));
|
||||
DEBUG_PRINTF("using %zu bytes of hist and %zu bytes of buf\n", tl1, tl2);
|
||||
|
||||
unaligned_store_u64a(temp_buf,
|
||||
partial_load_u64a(hbuf + hlen - tl1, tl1));
|
||||
unaligned_store_u64a(temp_buf + tl1, partial_load_u64a(buf, tl2));
|
||||
|
||||
for (size_t i = 0; i <= tl1 + tl2 - n->msk_len; i++) {
|
||||
u64a v = unaligned_load_u64a(temp_buf + i);
|
||||
if ((v & n->msk) == n->cmp) {
|
||||
size_t m_end = -tl1 + i + n->msk_len - 1;
|
||||
DEBUG_PRINTF("match @ %zu (i %zu)\n", m_end, i);
|
||||
hwlmcb_rv_t rv = cb(m_end, n->id, scratch);
|
||||
if (rv == HWLM_TERMINATE_MATCHING) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert(buf);
|
||||
|
||||
cbi.offsetAdj = 0;
|
||||
return scan(n, buf, len, 0, n->single, n->nocase, &cbi);
|
||||
}
|
@ -1,233 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* noodle scan parts for AVX */
|
||||
|
||||
static really_inline m256 getMask(u8 c, bool noCase) {
|
||||
u8 k = caseClear8(c, noCase);
|
||||
return set32x8(k);
|
||||
}
|
||||
|
||||
static really_inline m256 getCaseMask(void) {
|
||||
return set32x8(0xdf);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t offset, bool noCase,
|
||||
m256 caseMask, m256 mask1,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + offset;
|
||||
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
|
||||
const size_t l = end - start;
|
||||
|
||||
m256 v = loadu256(d);
|
||||
|
||||
if (noCase) {
|
||||
v = and256(v, caseMask);
|
||||
}
|
||||
|
||||
u32 z = movemask256(eq256(mask1, v));
|
||||
|
||||
u32 buf_off = start - offset;
|
||||
u32 mask = (u32)((u64a)(1ULL << l) - 1) << buf_off;
|
||||
DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z);
|
||||
|
||||
z &= mask;
|
||||
|
||||
SINGLE_ZSCAN();
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t offset, bool noCase,
|
||||
m256 caseMask, m256 mask1, m256 mask2,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + offset;
|
||||
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
|
||||
size_t l = end - start;
|
||||
|
||||
m256 v = loadu256(d);
|
||||
|
||||
if (noCase) {
|
||||
v = and256(v, caseMask);
|
||||
}
|
||||
|
||||
u32 z0 = movemask256(eq256(mask1, v));
|
||||
u32 z1 = movemask256(eq256(mask2, v));
|
||||
u32 z = (z0 << 1) & z1;
|
||||
|
||||
// mask out where we can't match
|
||||
u32 buf_off = start - offset;
|
||||
u32 mask = (u32)((u64a)(1ULL << l) - 1) << buf_off;
|
||||
DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z);
|
||||
z &= mask;
|
||||
|
||||
DOUBLE_ZSCAN();
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
// The short scan routine. It is used both to scan data up to an
|
||||
// alignment boundary if needed and to finish off data that the aligned scan
|
||||
// function can't handle (due to small/unaligned chunk at end)
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, bool noCase, m256 caseMask, m256 mask1,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + start;
|
||||
size_t l = end - start;
|
||||
DEBUG_PRINTF("l %zu\n", l);
|
||||
assert(l <= 32);
|
||||
if (!l) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
m256 v;
|
||||
|
||||
if (l < 4) {
|
||||
u8 *vp = (u8*)&v;
|
||||
switch (l) {
|
||||
case 3: vp[2] = d[2]; // fallthrough
|
||||
case 2: vp[1] = d[1]; // fallthrough
|
||||
case 1: vp[0] = d[0]; // fallthrough
|
||||
}
|
||||
} else {
|
||||
v = masked_move256_len(d, l);
|
||||
}
|
||||
|
||||
if (noCase) {
|
||||
v = and256(v, caseMask);
|
||||
}
|
||||
|
||||
// mask out where we can't match
|
||||
u32 mask = (0xFFFFFFFF >> (32 - l));
|
||||
|
||||
u32 z = mask & movemask256(eq256(mask1, v));
|
||||
|
||||
SINGLE_ZSCAN();
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, bool noCase, m256 caseMask, m256 mask1,
|
||||
m256 mask2, const struct cb_info *cbi,
|
||||
size_t start, size_t end) {
|
||||
const u8 *d = buf + start;
|
||||
size_t l = end - start;
|
||||
if (!l) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
assert(l <= 32);
|
||||
m256 v;
|
||||
|
||||
DEBUG_PRINTF("d %zu\n", d - buf);
|
||||
if (l < 4) {
|
||||
u8 *vp = (u8*)&v;
|
||||
switch (l) {
|
||||
case 3: vp[2] = d[2]; // fallthrough
|
||||
case 2: vp[1] = d[1]; // fallthrough
|
||||
case 1: vp[0] = d[0]; // fallthrough
|
||||
}
|
||||
} else {
|
||||
v = masked_move256_len(d, l);
|
||||
}
|
||||
if (noCase) {
|
||||
v = and256(v, caseMask);
|
||||
}
|
||||
|
||||
u32 z0 = movemask256(eq256(mask1, v));
|
||||
u32 z1 = movemask256(eq256(mask2, v));
|
||||
u32 z = (z0 << 1) & z1;
|
||||
|
||||
// mask out where we can't match
|
||||
u32 mask = (0xFFFFFFFF >> (32 - l));
|
||||
z &= mask;
|
||||
|
||||
DOUBLE_ZSCAN();
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, bool noCase, m256 caseMask, m256 mask1,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + start, *e = buf + end;
|
||||
assert(d < e);
|
||||
|
||||
for (; d < e; d += 32) {
|
||||
m256 v = noCase ? and256(load256(d), caseMask) : load256(d);
|
||||
|
||||
u32 z = movemask256(eq256(mask1, v));
|
||||
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
__builtin_prefetch(d + 128);
|
||||
|
||||
SINGLE_ZSCAN();
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, bool noCase, m256 caseMask, m256 mask1,
|
||||
m256 mask2, const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + start, *e = buf + end;
|
||||
DEBUG_PRINTF("start %zu end %zu \n", start, end);
|
||||
assert(d < e);
|
||||
u32 lastz0 = 0;
|
||||
|
||||
for (; d < e; d += 32) {
|
||||
m256 v = noCase ? and256(load256(d), caseMask) : load256(d);
|
||||
|
||||
// we have to pull the masks out of the AVX registers because we can't
|
||||
// byte shift between the lanes
|
||||
u32 z0 = movemask256(eq256(mask1, v));
|
||||
u32 z1 = movemask256(eq256(mask2, v));
|
||||
u32 z = (lastz0 | (z0 << 1)) & z1;
|
||||
lastz0 = z0 >> 31;
|
||||
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
__builtin_prefetch(d + 128);
|
||||
|
||||
DOUBLE_ZSCAN();
|
||||
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
@ -1,191 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* noodle scan parts for AVX512 */
|
||||
|
||||
static really_inline
|
||||
m512 getMask(u8 c, bool noCase) {
|
||||
u8 k = caseClear8(c, noCase);
|
||||
return set64x8(k);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
m512 getCaseMask(void) {
|
||||
return set64x8(CASE_CLEAR);
|
||||
}
|
||||
|
||||
// The short scan routine. It is used both to scan data up to an
|
||||
// alignment boundary if needed and to finish off data that the aligned scan
|
||||
// function can't handle (due to small/unaligned chunk at end)
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, bool noCase, m512 caseMask, m512 mask1,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + start;
|
||||
ptrdiff_t scan_len = end - start;
|
||||
DEBUG_PRINTF("scan_len %zu\n", scan_len);
|
||||
assert(scan_len <= 64);
|
||||
if (!scan_len) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
__mmask64 k = (~0ULL) >> (64 - scan_len);
|
||||
DEBUG_PRINTF("load mask 0x%016llx\n", k);
|
||||
|
||||
m512 v = loadu_maskz_m512(k, d);
|
||||
|
||||
if (noCase) {
|
||||
v = and512(v, caseMask);
|
||||
}
|
||||
|
||||
// reuse the load mask to indicate valid bytes
|
||||
u64a z = masked_eq512mask(k, mask1, v);
|
||||
|
||||
SINGLE_ZSCAN();
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingle512(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
bool noCase, m512 caseMask, m512 mask1,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + start;
|
||||
const u8 *e = buf + end;
|
||||
DEBUG_PRINTF("start %p end %p \n", d, e);
|
||||
assert(d < e);
|
||||
if (d + 64 >= e) {
|
||||
goto tail;
|
||||
}
|
||||
|
||||
// peel off first part to cacheline boundary
|
||||
const u8 *d1 = ROUNDUP_PTR(d, 64);
|
||||
if (scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, start,
|
||||
d1 - buf) == HWLM_TERMINATED) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
d = d1;
|
||||
|
||||
for (; d + 64 < e; d += 64) {
|
||||
DEBUG_PRINTF("d %p e %p \n", d, e);
|
||||
m512 v = noCase ? and512(load512(d), caseMask) : load512(d);
|
||||
|
||||
u64a z = eq512mask(mask1, v);
|
||||
__builtin_prefetch(d + 128);
|
||||
|
||||
SINGLE_ZSCAN();
|
||||
}
|
||||
|
||||
tail:
|
||||
DEBUG_PRINTF("d %p e %p \n", d, e);
|
||||
// finish off tail
|
||||
|
||||
return scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, d - buf,
|
||||
e - buf);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, bool noCase, m512 caseMask, m512 mask1,
|
||||
m512 mask2, const struct cb_info *cbi,
|
||||
u64a *lastz0, size_t start, size_t end) {
|
||||
DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0);
|
||||
const u8 *d = buf + start;
|
||||
ptrdiff_t scan_len = end - start;
|
||||
if (!scan_len) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
assert(scan_len <= 64);
|
||||
__mmask64 k = (~0ULL) >> (64 - scan_len);
|
||||
DEBUG_PRINTF("load mask 0x%016llx scan_len %zu\n", k, scan_len);
|
||||
|
||||
m512 v = loadu_maskz_m512(k, d);
|
||||
if (noCase) {
|
||||
v = and512(v, caseMask);
|
||||
}
|
||||
|
||||
u64a z0 = masked_eq512mask(k, mask1, v);
|
||||
u64a z1 = masked_eq512mask(k, mask2, v);
|
||||
u64a z = (*lastz0 | (z0 << 1)) & z1;
|
||||
DEBUG_PRINTF("z 0x%016llx\n", z);
|
||||
|
||||
DOUBLE_ZSCAN();
|
||||
*lastz0 = z0 >> (scan_len - 1);
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDouble512(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
bool noCase, m512 caseMask, m512 mask1, m512 mask2,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + start;
|
||||
const u8 *e = buf + end;
|
||||
u64a lastz0 = 0;
|
||||
DEBUG_PRINTF("start %zu end %zu \n", start, end);
|
||||
assert(d < e);
|
||||
if (d + 64 >= e) {
|
||||
goto tail;
|
||||
}
|
||||
|
||||
// peel off first part to cacheline boundary
|
||||
const u8 *d1 = ROUNDUP_PTR(d, 64);
|
||||
if (scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
|
||||
&lastz0, start, d1 - buf) == HWLM_TERMINATED) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
d = d1;
|
||||
|
||||
for (; d + 64 < e; d += 64) {
|
||||
DEBUG_PRINTF("d %p e %p 0x%016llx\n", d, e, lastz0);
|
||||
m512 v = noCase ? and512(load512(d), caseMask) : load512(d);
|
||||
|
||||
/* we have to pull the masks out of the AVX registers because we can't
|
||||
byte shift between the lanes */
|
||||
u64a z0 = eq512mask(mask1, v);
|
||||
u64a z1 = eq512mask(mask2, v);
|
||||
u64a z = (lastz0 | (z0 << 1)) & z1;
|
||||
lastz0 = z0 >> 63;
|
||||
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
__builtin_prefetch(d + 256);
|
||||
|
||||
DEBUG_PRINTF("z 0x%016llx\n", z);
|
||||
|
||||
DOUBLE_ZSCAN();
|
||||
}
|
||||
|
||||
tail:
|
||||
DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf);
|
||||
// finish off tail
|
||||
|
||||
return scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
|
||||
&lastz0, d - buf, end);
|
||||
}
|
310
src/hwlm/noodle_engine_simd.hpp
Normal file
310
src/hwlm/noodle_engine_simd.hpp
Normal file
@ -0,0 +1,310 @@
|
||||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2020-2021, VectorCamp PC
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* SIMD engine agnostic noodle scan parts */
|
||||
|
||||
#include "util/supervector/supervector.hpp"
|
||||
#include "util/supervector/casemask.hpp"
|
||||
|
||||
static really_really_inline
|
||||
hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
|
||||
Z_TYPE z, size_t len, const struct cb_info *cbi) {
|
||||
while (unlikely(z)) {
|
||||
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
|
||||
size_t matchPos = d - buf + pos;
|
||||
DEBUG_PRINTF("match pos %zu\n", matchPos);
|
||||
hwlmcb_rv_t rv = final(n, buf, len, n->msk_len != 1, cbi, matchPos);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_really_inline
|
||||
hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
|
||||
Z_TYPE z, size_t len, const struct cb_info *cbi) {
|
||||
while (unlikely(z)) {
|
||||
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
|
||||
size_t matchPos = d - buf + pos - 1;
|
||||
DEBUG_PRINTF("match pos %zu\n", matchPos);
|
||||
hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
template<uint16_t S>
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
|
||||
SuperVector<S> caseMask, SuperVector<S> mask1,
|
||||
const struct cb_info *cbi, size_t len, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + start;
|
||||
DEBUG_PRINTF("start %zu end %zu\n", start, end);
|
||||
const size_t l = end - start;
|
||||
DEBUG_PRINTF("l = %ld\n", l);
|
||||
//assert(l <= 64);
|
||||
if (!l) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
SuperVector<S> v = SuperVector<S>::Zeroes();
|
||||
memcpy(&v.u, d, l);
|
||||
|
||||
typename SuperVector<S>::comparemask_type mask =
|
||||
SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width());
|
||||
v = v & caseMask;
|
||||
typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
|
||||
z = SuperVector<S>::iteration_mask(z);
|
||||
|
||||
return single_zscan(n, d, buf, z, len, cbi);
|
||||
}
|
||||
|
||||
// The short scan routine. It is used both to scan data up to an
|
||||
// alignment boundary if needed and to finish off data that the aligned scan
|
||||
// function can't handle (due to small/unaligned chunk at end)
|
||||
template<uint16_t S>
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
|
||||
SuperVector<S> caseMask, SuperVector<S> mask1,
|
||||
const struct cb_info *cbi, size_t len, size_t offset,
|
||||
size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + offset;
|
||||
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
|
||||
const size_t l = end - start;
|
||||
DEBUG_PRINTF("l = %ld\n", l);
|
||||
assert(l <= 64);
|
||||
if (!l) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
size_t buf_off = start - offset;
|
||||
typename SuperVector<S>::comparemask_type mask =
|
||||
SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width())
|
||||
<< (buf_off * SuperVector<S>::mask_width());
|
||||
SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
|
||||
typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
|
||||
z = SuperVector<S>::iteration_mask(z);
|
||||
|
||||
return single_zscan(n, d, buf, z, len, cbi);
|
||||
}
|
||||
|
||||
template<uint16_t S>
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
|
||||
SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
|
||||
const struct cb_info *cbi, size_t len, size_t start, size_t end) {
|
||||
const u8 *d = buf + start;
|
||||
DEBUG_PRINTF("start %zu end %zu\n", start, end);
|
||||
const size_t l = end - start;
|
||||
assert(l <= S);
|
||||
if (!l) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
SuperVector<S> v = SuperVector<S>::Zeroes();
|
||||
memcpy(&v.u, d, l);
|
||||
v = v & caseMask;
|
||||
|
||||
typename SuperVector<S>::comparemask_type mask =
|
||||
DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width());
|
||||
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
|
||||
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
|
||||
typename SuperVector<S>::comparemask_type z =
|
||||
mask & (z1 << (SuperVector<S>::mask_width())) & z2;
|
||||
z = SuperVector<S>::iteration_mask(z);
|
||||
|
||||
return double_zscan(n, d, buf, z, len, cbi);
|
||||
}
|
||||
|
||||
template<uint16_t S>
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
|
||||
SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
|
||||
const struct cb_info *cbi, size_t len, size_t offset, size_t start, size_t end) {
|
||||
const u8 *d = buf + offset;
|
||||
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
|
||||
const size_t l = end - start;
|
||||
assert(l <= S);
|
||||
if (!l) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
|
||||
size_t buf_off = start - offset;
|
||||
typename SuperVector<S>::comparemask_type mask =
|
||||
DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width())
|
||||
<< (buf_off * SuperVector<S>::mask_width());
|
||||
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
|
||||
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
|
||||
typename SuperVector<S>::comparemask_type z =
|
||||
mask & (z1 << SuperVector<S>::mask_width()) & z2;
|
||||
z = SuperVector<S>::iteration_mask(z);
|
||||
|
||||
return double_zscan(n, d, buf, z, len, cbi);
|
||||
}
|
||||
|
||||
template <uint16_t S>
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t offset,
|
||||
SuperVector<S> caseMask, SuperVector<S> mask1,
|
||||
const struct cb_info *cbi) {
|
||||
size_t start = offset + n->msk_len - 1;
|
||||
size_t end = len;
|
||||
|
||||
const u8 *d = buf + start;
|
||||
const u8 *e = buf + end;
|
||||
DEBUG_PRINTF("start %p end %p \n", d, e);
|
||||
assert(d < e);
|
||||
if (e - d < S) {
|
||||
return scanSingleShort(n, buf, caseMask, mask1, cbi, len, start, end);
|
||||
}
|
||||
if (d + S <= e) {
|
||||
// peel off first part to cacheline boundary
|
||||
const u8 *d1 = ROUNDUP_PTR(d, S);
|
||||
DEBUG_PRINTF("until aligned %p \n", d1);
|
||||
if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
d = d1;
|
||||
|
||||
size_t loops = (end - (d - buf)) / S;
|
||||
DEBUG_PRINTF("loops %ld \n", loops);
|
||||
|
||||
for (size_t i = 0; i < loops; i++, d+= S) {
|
||||
DEBUG_PRINTF("d %p \n", d);
|
||||
const u8 *base = ROUNDUP_PTR(d, 64);
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
__builtin_prefetch(base + 256);
|
||||
|
||||
SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
|
||||
typename SuperVector<S>::comparemask_type z = mask1.eqmask(v);
|
||||
z = SuperVector<S>::iteration_mask(z);
|
||||
|
||||
hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_PRINTF("d %p e %p \n", d, e);
|
||||
// finish off tail
|
||||
size_t s2End = ROUNDDOWN_PTR(e, S) - buf;
|
||||
if (s2End == end) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, end - S, s2End, len);
|
||||
}
|
||||
|
||||
template <uint16_t S>
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t offset,
|
||||
SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
|
||||
const struct cb_info *cbi) {
|
||||
// we stop scanning for the key-fragment when the rest of the key can't
|
||||
// possibly fit in the remaining buffer
|
||||
size_t end = len - n->key_offset + 2;
|
||||
|
||||
size_t start = offset + n->msk_len - n->key_offset;
|
||||
|
||||
typename SuperVector<S>::comparemask_type lastz1{0};
|
||||
|
||||
const u8 *d = buf + start;
|
||||
const u8 *e = buf + end;
|
||||
DEBUG_PRINTF("start %p end %p \n", d, e);
|
||||
assert(d < e);
|
||||
if (e - d < S) {
|
||||
return scanDoubleShort(n, buf, caseMask, mask1, mask2, cbi, len, d - buf, end);
|
||||
}
|
||||
if (d + S <= e) {
|
||||
// peel off first part to cacheline boundary
|
||||
const u8 *d1 = ROUNDUP_PTR(d, S) + 1;
|
||||
DEBUG_PRINTF("until aligned %p \n", d1);
|
||||
if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) {
|
||||
return HWLM_TERMINATED;
|
||||
}
|
||||
d = d1 - 1;
|
||||
|
||||
size_t loops = (end - (d - buf)) / S;
|
||||
DEBUG_PRINTF("loops %ld \n", loops);
|
||||
|
||||
for (size_t i = 0; i < loops; i++, d+= S) {
|
||||
DEBUG_PRINTF("d %p \n", d);
|
||||
const u8 *base = ROUNDUP_PTR(d, 64);
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
__builtin_prefetch(base + 256);
|
||||
|
||||
SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
|
||||
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
|
||||
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
|
||||
typename SuperVector<S>::comparemask_type z =
|
||||
(z1 << SuperVector<S>::mask_width() | lastz1) & z2;
|
||||
lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width());
|
||||
z = SuperVector<S>::iteration_mask(z);
|
||||
|
||||
hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
if (loops == 0) {
|
||||
d = d1;
|
||||
}
|
||||
}
|
||||
// finish off tail
|
||||
size_t s2End = ROUNDDOWN_PTR(e, S) - buf;
|
||||
if (s2End == end) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, end - S, d - buf, end);
|
||||
}
|
||||
|
||||
// Single-character specialisation, used when keyLen = 1
|
||||
static really_inline
|
||||
hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
size_t start, bool noCase, const struct cb_info *cbi) {
|
||||
if (!ourisalpha(n->key0)) {
|
||||
noCase = 0; // force noCase off if we don't have an alphabetic char
|
||||
}
|
||||
|
||||
const SuperVector<VECTORSIZE> caseMask{noCase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
|
||||
const SuperVector<VECTORSIZE> mask1{getMask<VECTORSIZE>(n->key0, noCase)};
|
||||
|
||||
return scanSingleMain(n, buf, len, start, caseMask, mask1, cbi);
|
||||
}
|
||||
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
size_t start, bool noCase, const struct cb_info *cbi) {
|
||||
|
||||
const SuperVector<VECTORSIZE> caseMask{noCase ? getCaseMask<VECTORSIZE>() : SuperVector<VECTORSIZE>::Ones()};
|
||||
const SuperVector<VECTORSIZE> mask1{getMask<VECTORSIZE>(n->key0, noCase)};
|
||||
const SuperVector<VECTORSIZE> mask2{getMask<VECTORSIZE>(n->key1, noCase)};
|
||||
|
||||
return scanDoubleMain(n, buf, len, start, caseMask, mask1, mask2, cbi);
|
||||
}
|
@ -1,203 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* noodle scan parts for SSE */
|
||||
|
||||
static really_inline m128 getMask(u8 c, bool noCase) {
|
||||
u8 k = caseClear8(c, noCase);
|
||||
return set16x8(k);
|
||||
}
|
||||
|
||||
static really_inline m128 getCaseMask(void) {
|
||||
return set16x8(0xdf);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, bool noCase, m128 caseMask, m128 mask1,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + start;
|
||||
size_t l = end - start;
|
||||
DEBUG_PRINTF("l %zu\n", l);
|
||||
assert(l <= 16);
|
||||
if (!l) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
m128 v = zeroes128();
|
||||
// we don't have a clever way of doing this move yet
|
||||
memcpy(&v, d, l);
|
||||
if (noCase) {
|
||||
v = and128(v, caseMask);
|
||||
}
|
||||
|
||||
// mask out where we can't match
|
||||
u32 mask = (0xFFFF >> (16 - l));
|
||||
|
||||
u32 z = mask & movemask128(eq128(mask1, v));
|
||||
|
||||
SINGLE_ZSCAN();
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t offset, bool noCase,
|
||||
m128 caseMask, m128 mask1,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + offset;
|
||||
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
|
||||
const size_t l = end - start;
|
||||
|
||||
m128 v = loadu128(d);
|
||||
|
||||
if (noCase) {
|
||||
v = and128(v, caseMask);
|
||||
}
|
||||
|
||||
u32 buf_off = start - offset;
|
||||
u32 mask = ((1 << l) - 1) << buf_off;
|
||||
|
||||
u32 z = mask & movemask128(eq128(mask1, v));
|
||||
|
||||
DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z);
|
||||
|
||||
z &= mask;
|
||||
|
||||
SINGLE_ZSCAN();
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, bool noCase, m128 caseMask, m128 mask1,
|
||||
m128 mask2, const struct cb_info *cbi,
|
||||
size_t start, size_t end) {
|
||||
const u8 *d = buf + start;
|
||||
size_t l = end - start;
|
||||
if (!l) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
assert(l <= 32);
|
||||
|
||||
DEBUG_PRINTF("d %zu\n", d - buf);
|
||||
m128 v = zeroes128();
|
||||
memcpy(&v, d, l);
|
||||
if (noCase) {
|
||||
v = and128(v, caseMask);
|
||||
}
|
||||
|
||||
u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
|
||||
eq128(mask2, v)));
|
||||
|
||||
// mask out where we can't match
|
||||
u32 mask = (0xFFFF >> (16 - l));
|
||||
z &= mask;
|
||||
|
||||
DOUBLE_ZSCAN();
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, size_t offset, bool noCase,
|
||||
m128 caseMask, m128 mask1, m128 mask2,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + offset;
|
||||
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
|
||||
size_t l = end - start;
|
||||
|
||||
m128 v = loadu128(d);
|
||||
|
||||
if (noCase) {
|
||||
v = and128(v, caseMask);
|
||||
}
|
||||
|
||||
u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
|
||||
eq128(mask2, v)));
|
||||
|
||||
// mask out where we can't match
|
||||
u32 buf_off = start - offset;
|
||||
u32 mask = ((1 << l) - 1) << buf_off;
|
||||
DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z);
|
||||
z &= mask;
|
||||
|
||||
DOUBLE_ZSCAN();
|
||||
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, bool noCase, m128 caseMask, m128 mask1,
|
||||
const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + start, *e = buf + end;
|
||||
assert(d < e);
|
||||
|
||||
for (; d < e; d += 16) {
|
||||
m128 v = noCase ? and128(load128(d), caseMask) : load128(d);
|
||||
|
||||
u32 z = movemask128(eq128(mask1, v));
|
||||
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
__builtin_prefetch(d + 128);
|
||||
|
||||
SINGLE_ZSCAN();
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, bool noCase, m128 caseMask, m128 mask1,
|
||||
m128 mask2, const struct cb_info *cbi, size_t start,
|
||||
size_t end) {
|
||||
const u8 *d = buf + start, *e = buf + end;
|
||||
assert(d < e);
|
||||
m128 lastz1 = zeroes128();
|
||||
|
||||
for (; d < e; d += 16) {
|
||||
m128 v = noCase ? and128(load128(d), caseMask) : load128(d);
|
||||
m128 z1 = eq128(mask1, v);
|
||||
m128 z2 = eq128(mask2, v);
|
||||
u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2));
|
||||
lastz1 = z1;
|
||||
|
||||
// On large packet buffers, this prefetch appears to get us about 2%.
|
||||
__builtin_prefetch(d + 128);
|
||||
DEBUG_PRINTF("z 0x%08x\n", z);
|
||||
DOUBLE_ZSCAN();
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
259
src/hwlm/noodle_engine_sve.hpp
Normal file
259
src/hwlm/noodle_engine_sve.hpp
Normal file
@ -0,0 +1,259 @@
|
||||
/*
|
||||
* Copyright (c) 2021, Arm Limited
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t checkMatched(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
const struct cb_info *cbi, const u8 *d,
|
||||
svbool_t matched, bool needsConfirm) {
|
||||
assert(d >= buf);
|
||||
size_t basePos = d - buf;
|
||||
svbool_t next_match = svpnext_b8(matched, svpfalse());
|
||||
do {
|
||||
svbool_t brk = svbrkb_z(svptrue_b8(), next_match);
|
||||
size_t matchPos = basePos + svcntp_b8(svptrue_b8(), brk);
|
||||
DEBUG_PRINTF("match pos %zu\n", matchPos);
|
||||
assert(matchPos < len);
|
||||
hwlmcb_rv_t rv = final(n, buf, len, needsConfirm, cbi, matchPos);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
next_match = svpnext_b8(matched, next_match);
|
||||
} while (unlikely(svptest_any(svptrue_b8(), next_match)));
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t singleCheckMatched(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, const struct cb_info *cbi,
|
||||
const u8 *d, svbool_t matched) {
|
||||
if (unlikely(svptest_any(svptrue_b8(), matched))) {
|
||||
hwlmcb_rv_t rv = checkMatched(n, buf, len, cbi, d, matched,
|
||||
n->msk_len != 1);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
svbool_t singleMatched(svuint8_t chars, const u8 *d, svbool_t pg) {
|
||||
return svmatch(pg, svld1_u8(pg, d), chars);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleOnce(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, const struct cb_info *cbi,
|
||||
svuint8_t chars, const u8 *d, const u8 *e) {
|
||||
DEBUG_PRINTF("start %p end %p\n", d, e);
|
||||
assert(d < e);
|
||||
assert(d >= buf);
|
||||
DEBUG_PRINTF("l = %td\n", e - d);
|
||||
svbool_t pg = svwhilelt_b8_s64(0, e - d);
|
||||
svbool_t matched = singleMatched(chars, d, pg);
|
||||
return singleCheckMatched(n, buf, len, cbi, d, matched);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingleLoop(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, const struct cb_info *cbi,
|
||||
svuint8_t chars, const u8 *d, const u8 *e) {
|
||||
assert(d < e);
|
||||
assert(d >= buf);
|
||||
size_t loops = (e - d) / svcntb();
|
||||
DEBUG_PRINTF("loops %zu \n", loops);
|
||||
assert(d + (loops * svcntb()) <= e);
|
||||
|
||||
for (size_t i = 0; i < loops; i++, d += svcntb()) {
|
||||
DEBUG_PRINTF("d %p \n", d);
|
||||
svbool_t matched = singleMatched(chars, d, svptrue_b8());
|
||||
hwlmcb_rv_t rv = singleCheckMatched(n, buf, len, cbi, d, matched);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
DEBUG_PRINTF("d %p e %p \n", d, e);
|
||||
return d == e ? HWLM_SUCCESS
|
||||
: scanSingleOnce(n, buf, len, cbi, chars, d, e);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
size_t offset, bool noCase, const struct cb_info *cbi) {
|
||||
if (!ourisalpha(n->key0)) {
|
||||
noCase = false; // force noCase off if we don't have an alphabetic char
|
||||
}
|
||||
|
||||
size_t start = offset + n->msk_len - 1;
|
||||
const u8 *d = buf + start;
|
||||
const u8 *e = buf + len;
|
||||
DEBUG_PRINTF("start %p end %p \n", d, e);
|
||||
assert(d < e);
|
||||
assert(d >= buf);
|
||||
|
||||
svuint8_t chars = getCharMaskSingle(n->key0, noCase);
|
||||
|
||||
size_t scan_len = e - d;
|
||||
if (scan_len <= svcntb()) {
|
||||
return scanSingleOnce(n, buf, len, cbi, chars, d, e);
|
||||
}
|
||||
// peel off first part to align to the vector size
|
||||
const u8 *d1 = ROUNDUP_PTR(d, svcntb_pat(SV_POW2));
|
||||
if (d != d1) {
|
||||
DEBUG_PRINTF("until aligned %p \n", d1);
|
||||
hwlmcb_rv_t rv = scanSingleOnce(n, buf, len, cbi, chars, d, d1);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
return scanSingleLoop(n, buf, len, cbi, chars, d1, e);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t doubleCheckMatched(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, const struct cb_info *cbi,
|
||||
const u8 *d, svbool_t matched,
|
||||
svbool_t matched_rot, svbool_t any) {
|
||||
if (unlikely(svptest_any(svptrue_b8(), any))) {
|
||||
// Project predicate onto vector.
|
||||
svuint8_t matched_vec = svdup_u8_z(matched, 1);
|
||||
// Shift vector to right by one and project back to the predicate.
|
||||
matched = svcmpeq_n_u8(svptrue_b8(), svinsr_n_u8(matched_vec, 0), 1);
|
||||
matched = svorr_z(svptrue_b8(), matched, matched_rot);
|
||||
// d - 1 won't underflow as the first position in buf has been dealt
|
||||
// with meaning that d > buf
|
||||
assert(d > buf);
|
||||
hwlmcb_rv_t rv = checkMatched(n, buf, len, cbi, d - 1, matched,
|
||||
n->msk_len != 2);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
|
||||
static really_inline
|
||||
svbool_t doubleMatchedLoop(svuint16_t chars, const u8 *d,
|
||||
svbool_t * const matched, svbool_t * const matched_rot) {
|
||||
svuint16_t vec = svreinterpret_u16(svld1_u8(svptrue_b8(), d));
|
||||
// d - 1 won't underflow as the first position in buf has been dealt
|
||||
// with meaning that d > buf
|
||||
svuint16_t vec_rot = svreinterpret_u16(svld1_u8(svptrue_b8(), d - 1));
|
||||
*matched = svmatch(svptrue_b8(), vec, chars);
|
||||
*matched_rot = svmatch(svptrue_b8(), vec_rot, chars);
|
||||
return svorr_z(svptrue_b8(), *matched, *matched_rot);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleOnce(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, const struct cb_info *cbi,
|
||||
svuint8_t chars, const u8 *d, const u8 *e) {
|
||||
DEBUG_PRINTF("start %p end %p\n", d, e);
|
||||
assert(d < e);
|
||||
assert(d > buf);
|
||||
const ptrdiff_t size = e - d;
|
||||
svbool_t pg = svwhilelt_b8_s64(0, size);
|
||||
svbool_t pg_rot = svwhilelt_b8_s64(0, size + 1);
|
||||
|
||||
svuint16_t vec = svreinterpret_u16(svld1_u8(pg, d));
|
||||
// d - 1 won't underflow as the first position in buf has been dealt
|
||||
// with meaning that d > buf
|
||||
svuint16_t vec_rot = svreinterpret_u16(svld1_u8(pg_rot, d - 1));
|
||||
|
||||
// we reuse u8 predicates for u16 lanes. This means that we will check against one
|
||||
// extra \0 character at the end of the vector.
|
||||
if(unlikely(n->key1 == '\0')) {
|
||||
if (size % 2) {
|
||||
// if odd, vec has an odd number of lanes and has the spurious \0
|
||||
svbool_t lane_to_disable = svrev_b8(svpfirst(svrev_b8(pg), svpfalse()));
|
||||
pg = sveor_z(svptrue_b8(), pg, lane_to_disable);
|
||||
} else {
|
||||
// if even, vec_rot has an odd number of lanes and has the spurious \0
|
||||
// we need to disable the last active lane as well, but we know pg is
|
||||
// the same as pg_rot without the last lane
|
||||
pg_rot = pg;
|
||||
}
|
||||
}
|
||||
|
||||
svbool_t matched = svmatch(pg, vec, svreinterpret_u16(chars));
|
||||
svbool_t matched_rot = svmatch(pg_rot, vec_rot, svreinterpret_u16(chars));
|
||||
svbool_t any = svorr_z(svptrue_b8(), matched, matched_rot);
|
||||
|
||||
return doubleCheckMatched(n, buf, len, cbi, d, matched, matched_rot, any);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDoubleLoop(const struct noodTable *n, const u8 *buf,
|
||||
size_t len, const struct cb_info *cbi,
|
||||
svuint8_t chars, const u8 *d, const u8 *e) {
|
||||
assert(d < e);
|
||||
assert(d > buf);
|
||||
size_t loops = (e - d) / svcntb();
|
||||
DEBUG_PRINTF("loops %zu \n", loops);
|
||||
assert(d + (loops * svcntb()) <= e);
|
||||
|
||||
for (size_t i = 0; i < loops; i++, d += svcntb()) {
|
||||
DEBUG_PRINTF("d %p \n", d);
|
||||
svbool_t matched, matched_rot;
|
||||
svbool_t any = doubleMatchedLoop(svreinterpret_u16(chars), d, &matched, &matched_rot);
|
||||
hwlm_error_t rv = doubleCheckMatched(n, buf, len, cbi, d,
|
||||
matched, matched_rot, any);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
DEBUG_PRINTF("d %p e %p \n", d, e);
|
||||
|
||||
return d == e ? HWLM_SUCCESS
|
||||
: scanDoubleOnce(n, buf, len, cbi, chars, d, e);
|
||||
}
|
||||
|
||||
static really_inline
|
||||
hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len,
|
||||
size_t offset, bool noCase, const struct cb_info *cbi) {
|
||||
// we stop scanning for the key-fragment when the rest of the key can't
|
||||
// possibly fit in the remaining buffer
|
||||
size_t end = len - n->key_offset + 2;
|
||||
|
||||
size_t start = offset + n->msk_len - n->key_offset;
|
||||
|
||||
const u8 *d = buf + start;
|
||||
const u8 *e = buf + end;
|
||||
DEBUG_PRINTF("start %p end %p \n", d, e);
|
||||
assert(d < e);
|
||||
assert(d >= buf);
|
||||
|
||||
size_t scan_len = e - d;
|
||||
if (scan_len < 2) {
|
||||
return HWLM_SUCCESS;
|
||||
}
|
||||
++d;
|
||||
|
||||
svuint8_t chars = svreinterpret_u8(getCharMaskDouble(n->key0, n->key1, noCase));
|
||||
|
||||
if (scan_len <= svcntb()) {
|
||||
return scanDoubleOnce(n, buf, len, cbi, chars, d, e);
|
||||
}
|
||||
// peel off first part to align to the vector size
|
||||
const u8 *d1 = ROUNDUP_PTR(d, svcntb_pat(SV_POW2));
|
||||
if (d != d1) {
|
||||
DEBUG_PRINTF("until aligned %p \n", d1);
|
||||
hwlmcb_rv_t rv = scanDoubleOnce(n, buf, len, cbi, chars,
|
||||
d, d1);
|
||||
RETURN_IF_TERMINATED(rv);
|
||||
}
|
||||
return scanDoubleLoop(n, buf, len, cbi, chars, d1, e);
|
||||
}
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2021, Arm Limited
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -29,7 +30,7 @@
|
||||
#include "accel.h"
|
||||
#include "shufti.h"
|
||||
#include "truffle.h"
|
||||
#include "vermicelli.h"
|
||||
#include "vermicelli.hpp"
|
||||
#include "ue2common.h"
|
||||
|
||||
const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
|
||||
@ -81,6 +82,39 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
|
||||
c_end - 1);
|
||||
break;
|
||||
|
||||
#ifdef HAVE_SVE2
|
||||
case ACCEL_VERM16:
|
||||
DEBUG_PRINTF("accel verm16 %p %p\n", c, c_end);
|
||||
if (c_end - c < 16) {
|
||||
return c;
|
||||
}
|
||||
|
||||
rv = vermicelli16Exec(accel->verm16.mask, c, c_end);
|
||||
break;
|
||||
|
||||
case ACCEL_DVERM16:
|
||||
DEBUG_PRINTF("accel dverm16 %p %p\n", c, c_end);
|
||||
if (c_end - c < 18) {
|
||||
return c;
|
||||
}
|
||||
|
||||
/* need to stop one early to get an accurate end state */
|
||||
rv = vermicelliDouble16Exec(accel->dverm16.mask, accel->dverm16.firsts,
|
||||
c, c_end - 1);
|
||||
break;
|
||||
|
||||
case ACCEL_DVERM16_MASKED:
|
||||
DEBUG_PRINTF("accel dverm16 masked %p %p\n", c, c_end);
|
||||
if (c_end - c < 18) {
|
||||
return c;
|
||||
}
|
||||
|
||||
/* need to stop one early to get an accurate end state */
|
||||
rv = vermicelliDoubleMasked16Exec(accel->mdverm16.mask, accel->mdverm16.c1,
|
||||
accel->mdverm16.m1, c, c_end - 1);
|
||||
break;
|
||||
#endif // HAVE_SVE2
|
||||
|
||||
case ACCEL_DVERM_MASKED:
|
||||
DEBUG_PRINTF("accel dverm masked %p %p\n", c, c_end);
|
||||
if (c + 16 + 1 >= c_end) {
|
||||
@ -108,9 +142,18 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
|
||||
return c;
|
||||
}
|
||||
|
||||
rv = truffleExec(accel->truffle.mask1, accel->truffle.mask2, c, c_end);
|
||||
rv = truffleExec(accel->truffle.mask_lo, accel->truffle.mask_hi, c, c_end);
|
||||
break;
|
||||
#ifdef CAN_USE_WIDE_TRUFFLE
|
||||
case ACCEL_TRUFFLE_WIDE:
|
||||
DEBUG_PRINTF("accel Truffle Wide %p %p\n", c, c_end);
|
||||
if (c + 15 >= c_end) {
|
||||
return c;
|
||||
}
|
||||
|
||||
rv = truffleExecWide(accel->truffle.mask, c, c_end);
|
||||
break;
|
||||
#endif
|
||||
case ACCEL_DSHUFTI:
|
||||
DEBUG_PRINTF("accel dshufti %p %p\n", c, c_end);
|
||||
if (c + 15 + 1 >= c_end) {
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2021, Arm Limited
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -62,6 +63,10 @@ enum AccelType {
|
||||
ACCEL_TRUFFLE,
|
||||
ACCEL_RED_TAPE,
|
||||
ACCEL_DVERM_MASKED,
|
||||
ACCEL_VERM16,
|
||||
ACCEL_DVERM16,
|
||||
ACCEL_DVERM16_MASKED,
|
||||
ACCEL_TRUFFLE_WIDE,
|
||||
};
|
||||
|
||||
/** \brief Structure for accel framework. */
|
||||
@ -97,6 +102,24 @@ union AccelAux {
|
||||
u8 len1;
|
||||
u8 len2;
|
||||
} mdverm;
|
||||
struct {
|
||||
u8 accel_type;
|
||||
u8 offset;
|
||||
m128 mask;
|
||||
} verm16;
|
||||
struct {
|
||||
u8 accel_type;
|
||||
u8 offset;
|
||||
u64a firsts;
|
||||
m128 mask;
|
||||
} dverm16;
|
||||
struct {
|
||||
u8 accel_type;
|
||||
u8 offset;
|
||||
u8 c1; // used for partial match
|
||||
u8 m1; // used for partial match
|
||||
m128 mask;
|
||||
} mdverm16;
|
||||
struct {
|
||||
u8 accel_type;
|
||||
u8 offset;
|
||||
@ -114,8 +137,18 @@ union AccelAux {
|
||||
struct {
|
||||
u8 accel_type;
|
||||
u8 offset;
|
||||
m128 mask1;
|
||||
m128 mask2;
|
||||
union {
|
||||
m256 mask;
|
||||
struct {
|
||||
#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
|
||||
m128 mask_lo;
|
||||
m128 mask_hi;
|
||||
#else
|
||||
m128 mask_hi;
|
||||
m128 mask_lo;
|
||||
#endif
|
||||
};
|
||||
};
|
||||
} truffle;
|
||||
};
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2017, Intel Corporation
|
||||
* Copyright (c) 2021, Arm Limited
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
@ -33,6 +34,7 @@
|
||||
#include "nfagraph/ng_limex_accel.h"
|
||||
#include "shufticompile.h"
|
||||
#include "trufflecompile.h"
|
||||
#include "vermicellicompile.h"
|
||||
#include "util/accel_scheme.h"
|
||||
#include "util/charreach.h"
|
||||
#include "util/container.h"
|
||||
@ -105,7 +107,7 @@ static
|
||||
path append(const path &orig, const CharReach &cr, u32 new_dest) {
|
||||
path p(new_dest);
|
||||
p.reach = orig.reach;
|
||||
p.reach.push_back(cr);
|
||||
p.reach.emplace_back(cr);
|
||||
|
||||
return p;
|
||||
}
|
||||
@ -117,25 +119,25 @@ void extend(const raw_dfa &rdfa, const vector<CharReach> &rev_map,
|
||||
const dstate &s = rdfa.states[p.dest];
|
||||
|
||||
if (!p.reach.empty() && p.reach.back().none()) {
|
||||
out.push_back(p);
|
||||
out.emplace_back(p);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!s.reports.empty()) {
|
||||
if (generates_callbacks(rdfa.kind)) {
|
||||
out.push_back(p);
|
||||
out.emplace_back(p);
|
||||
return;
|
||||
} else {
|
||||
path pp = append(p, CharReach(), p.dest);
|
||||
all[p.dest].push_back(pp);
|
||||
out.push_back(move(pp));
|
||||
all[p.dest].emplace_back(pp);
|
||||
out.emplace_back(std::move(pp));
|
||||
}
|
||||
}
|
||||
|
||||
if (!s.reports_eod.empty()) {
|
||||
path pp = append(p, CharReach(), p.dest);
|
||||
all[p.dest].push_back(pp);
|
||||
out.push_back(move(pp));
|
||||
all[p.dest].emplace_back(pp);
|
||||
out.emplace_back(std::move(pp));
|
||||
}
|
||||
|
||||
flat_map<u32, CharReach> dest;
|
||||
@ -154,8 +156,8 @@ void extend(const raw_dfa &rdfa, const vector<CharReach> &rev_map,
|
||||
|
||||
DEBUG_PRINTF("----good: [%s] -> %u\n",
|
||||
describeClasses(pp.reach).c_str(), pp.dest);
|
||||
all[e.first].push_back(pp);
|
||||
out.push_back(move(pp));
|
||||
all[e.first].emplace_back(pp);
|
||||
out.emplace_back(std::move(pp));
|
||||
}
|
||||
}
|
||||
|
||||
@ -165,14 +167,14 @@ vector<vector<CharReach>> generate_paths(const raw_dfa &rdfa,
|
||||
const vector<CharReach> rev_map = reverse_alpha_remapping(rdfa);
|
||||
vector<path> paths{path(base)};
|
||||
unordered_map<u32, vector<path>> all;
|
||||
all[base].push_back(path(base));
|
||||
all[base].emplace_back(path(base));
|
||||
for (u32 i = 0; i < len && paths.size() < PATHS_LIMIT; i++) {
|
||||
vector<path> next_gen;
|
||||
for (const auto &p : paths) {
|
||||
extend(rdfa, rev_map, p, all, next_gen);
|
||||
}
|
||||
|
||||
paths = move(next_gen);
|
||||
paths = std::move(next_gen);
|
||||
}
|
||||
|
||||
dump_paths(paths);
|
||||
@ -180,7 +182,8 @@ vector<vector<CharReach>> generate_paths(const raw_dfa &rdfa,
|
||||
vector<vector<CharReach>> rv;
|
||||
rv.reserve(paths.size());
|
||||
for (auto &p : paths) {
|
||||
rv.push_back(vector<CharReach>(std::make_move_iterator(p.reach.begin()),
|
||||
// cppcheck-suppress useStlAlgorithm
|
||||
rv.emplace_back(vector<CharReach>(std::make_move_iterator(p.reach.begin()),
|
||||
std::make_move_iterator(p.reach.end())));
|
||||
}
|
||||
return rv;
|
||||
@ -318,7 +321,7 @@ set<dstate_id_t> find_region(const raw_dfa &rdfa, dstate_id_t base,
|
||||
|
||||
DEBUG_PRINTF(" %hu is in region\n", t);
|
||||
region.insert(t);
|
||||
pending.push_back(t);
|
||||
pending.emplace_back(t);
|
||||
}
|
||||
}
|
||||
|
||||
@ -424,10 +427,11 @@ void
|
||||
accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
||||
const AccelScheme &info,
|
||||
void *accel_out) {
|
||||
AccelAux *accel = (AccelAux *)accel_out;
|
||||
AccelAux *accel = reinterpret_cast<AccelAux *>(accel_out);
|
||||
|
||||
DEBUG_PRINTF("accelerations scheme has offset s%u/d%u\n", info.offset,
|
||||
info.double_offset);
|
||||
// cppcheck-suppress redundantInitialization
|
||||
accel->generic.offset = verify_u8(info.offset);
|
||||
|
||||
if (double_byte_ok(info) && info.double_cr.none() &&
|
||||
@ -440,52 +444,87 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
||||
return;
|
||||
}
|
||||
|
||||
if (double_byte_ok(info) && info.double_cr.none() &&
|
||||
(info.double_byte.size() == 2 || info.double_byte.size() == 4)) {
|
||||
bool ok = true;
|
||||
if (double_byte_ok(info) && info.double_cr.none()) {
|
||||
if ((info.double_byte.size() == 2 || info.double_byte.size() == 4)) {
|
||||
bool ok = true;
|
||||
|
||||
assert(!info.double_byte.empty());
|
||||
u8 firstC = info.double_byte.begin()->first & CASE_CLEAR;
|
||||
u8 secondC = info.double_byte.begin()->second & CASE_CLEAR;
|
||||
assert(!info.double_byte.empty());
|
||||
u8 firstC = info.double_byte.begin()->first & CASE_CLEAR;
|
||||
u8 secondC = info.double_byte.begin()->second & CASE_CLEAR;
|
||||
|
||||
for (const pair<u8, u8> &p : info.double_byte) {
|
||||
if ((p.first & CASE_CLEAR) != firstC ||
|
||||
(p.second & CASE_CLEAR) != secondC) {
|
||||
ok = false;
|
||||
break;
|
||||
for (const pair<u8, u8> &p : info.double_byte) {
|
||||
if ((p.first & CASE_CLEAR) != firstC ||
|
||||
(p.second & CASE_CLEAR) != secondC) {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ok) {
|
||||
accel->accel_type = ACCEL_DVERM_NOCASE;
|
||||
accel->dverm.c1 = firstC;
|
||||
accel->dverm.c2 = secondC;
|
||||
accel->dverm.offset = verify_u8(info.double_offset);
|
||||
DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx);
|
||||
return;
|
||||
}
|
||||
|
||||
u8 m1;
|
||||
u8 m2;
|
||||
if (buildDvermMask(info.double_byte, &m1, &m2)) {
|
||||
u8 c1 = info.double_byte.begin()->first & m1;
|
||||
u8 c2 = info.double_byte.begin()->second & m2;
|
||||
#ifdef HAVE_SVE2
|
||||
if (vermicelliDoubleMasked16Build(c1, c2, m1, m2,
|
||||
reinterpret_cast<u8 *>(&accel->mdverm16.mask))) {
|
||||
accel->accel_type = ACCEL_DVERM16_MASKED;
|
||||
accel->mdverm16.offset = verify_u8(info.double_offset);
|
||||
accel->mdverm16.c1 = c1;
|
||||
accel->mdverm16.m1 = m1;
|
||||
DEBUG_PRINTF("building maskeddouble16-vermicelli for 0x%02hhx%02hhx\n",
|
||||
c1, c2);
|
||||
return;
|
||||
} else if (info.double_byte.size() <= 8 &&
|
||||
vermicelliDouble16Build(info.double_byte,
|
||||
reinterpret_cast<u8 *>(&accel->dverm16.mask),
|
||||
reinterpret_cast<u8 *>(&accel->dverm16.firsts))) {
|
||||
accel->accel_type = ACCEL_DVERM16;
|
||||
accel->dverm16.offset = verify_u8(info.double_offset);
|
||||
DEBUG_PRINTF("building double16-vermicelli\n");
|
||||
return;
|
||||
}
|
||||
#endif // HAVE_SVE2
|
||||
accel->accel_type = ACCEL_DVERM_MASKED;
|
||||
accel->dverm.offset = verify_u8(info.double_offset);
|
||||
accel->dverm.c1 = c1;
|
||||
accel->dverm.c2 = c2;
|
||||
accel->dverm.m1 = m1;
|
||||
accel->dverm.m2 = m2;
|
||||
DEBUG_PRINTF(
|
||||
"building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", c1, c2);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (ok) {
|
||||
accel->accel_type = ACCEL_DVERM_NOCASE;
|
||||
accel->dverm.c1 = firstC;
|
||||
accel->dverm.c2 = secondC;
|
||||
accel->dverm.offset = verify_u8(info.double_offset);
|
||||
DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx);
|
||||
return;
|
||||
}
|
||||
|
||||
u8 m1;
|
||||
u8 m2;
|
||||
if (buildDvermMask(info.double_byte, &m1, &m2)) {
|
||||
accel->accel_type = ACCEL_DVERM_MASKED;
|
||||
accel->dverm.offset = verify_u8(info.double_offset);
|
||||
accel->dverm.c1 = info.double_byte.begin()->first & m1;
|
||||
accel->dverm.c2 = info.double_byte.begin()->second & m2;
|
||||
accel->dverm.m1 = m1;
|
||||
accel->dverm.m2 = m2;
|
||||
DEBUG_PRINTF(
|
||||
"building maskeddouble-vermicelli for 0x%02hhx%02hhx\n",
|
||||
accel->dverm.c1, accel->dverm.c2);
|
||||
#ifdef HAVE_SVE2
|
||||
if (info.double_byte.size() <= 8 &&
|
||||
vermicelliDouble16Build(info.double_byte,
|
||||
reinterpret_cast<u8 *>(&accel->dverm16.mask),
|
||||
reinterpret_cast<u8 *>(&accel->dverm16.firsts))) {
|
||||
accel->accel_type = ACCEL_DVERM16;
|
||||
accel->dverm16.offset = verify_u8(info.double_offset);
|
||||
DEBUG_PRINTF("building double16-vermicelli\n");
|
||||
return;
|
||||
}
|
||||
#endif // HAVE_SVE2
|
||||
}
|
||||
|
||||
if (double_byte_ok(info) &&
|
||||
shuftiBuildDoubleMasks(
|
||||
info.double_cr, info.double_byte, (u8 *)&accel->dshufti.lo1,
|
||||
(u8 *)&accel->dshufti.hi1, (u8 *)&accel->dshufti.lo2,
|
||||
(u8 *)&accel->dshufti.hi2)) {
|
||||
info.double_cr, info.double_byte,
|
||||
reinterpret_cast<u8 *>(&accel->dshufti.lo1),
|
||||
reinterpret_cast<u8 *>(&accel->dshufti.hi1),
|
||||
reinterpret_cast<u8 *>(&accel->dshufti.lo2),
|
||||
reinterpret_cast<u8 *>(&accel->dshufti.hi2))) {
|
||||
accel->accel_type = ACCEL_DSHUFTI;
|
||||
accel->dshufti.offset = verify_u8(info.double_offset);
|
||||
DEBUG_PRINTF("state %hu is double shufti\n", this_idx);
|
||||
@ -514,6 +553,15 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef HAVE_SVE2
|
||||
if (info.cr.count() <= 16) {
|
||||
accel->accel_type = ACCEL_VERM16;
|
||||
vermicelli16Build(info.cr, reinterpret_cast<u8 *>(&accel->verm16.mask));
|
||||
DEBUG_PRINTF("state %hu is vermicelli16\n", this_idx);
|
||||
return;
|
||||
}
|
||||
#endif // HAVE_SVE2
|
||||
|
||||
if (info.cr.count() > max_floating_stop_char()) {
|
||||
accel->accel_type = ACCEL_NONE;
|
||||
DEBUG_PRINTF("state %hu is too broad\n", this_idx);
|
||||
@ -521,16 +569,27 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
|
||||
}
|
||||
|
||||
accel->accel_type = ACCEL_SHUFTI;
|
||||
if (-1 != shuftiBuildMasks(info.cr, (u8 *)&accel->shufti.lo,
|
||||
(u8 *)&accel->shufti.hi)) {
|
||||
if (-1 != shuftiBuildMasks(info.cr,
|
||||
reinterpret_cast<u8 *>(&accel->shufti.lo),
|
||||
reinterpret_cast<u8 *>(&accel->shufti.hi))) {
|
||||
DEBUG_PRINTF("state %hu is shufti\n", this_idx);
|
||||
return;
|
||||
}
|
||||
|
||||
assert(!info.cr.none());
|
||||
accel->accel_type = ACCEL_TRUFFLE;
|
||||
truffleBuildMasks(info.cr, (u8 *)&accel->truffle.mask1,
|
||||
(u8 *)&accel->truffle.mask2);
|
||||
#if defined(CAN_USE_WIDE_TRUFFLE)
|
||||
if(CAN_USE_WIDE_TRUFFLE) {
|
||||
accel->accel_type = ACCEL_TRUFFLE_WIDE;
|
||||
truffleBuildMasksWide(info.cr,
|
||||
reinterpret_cast<u8 *>(&accel->truffle.mask));
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
accel->accel_type = ACCEL_TRUFFLE;
|
||||
truffleBuildMasks(info.cr,
|
||||
reinterpret_cast<u8 *>(&accel->truffle.mask_lo),
|
||||
reinterpret_cast<u8 *>(&accel->truffle.mask_hi));
|
||||
}
|
||||
DEBUG_PRINTF("state %hu is truffle\n", this_idx);
|
||||
}
|
||||
|
||||
|
@ -93,6 +93,8 @@ const char *accelName(u8 accel_type) {
|
||||
return "double-shufti";
|
||||
case ACCEL_TRUFFLE:
|
||||
return "truffle";
|
||||
case ACCEL_TRUFFLE_WIDE:
|
||||
return "truffle wide";
|
||||
case ACCEL_RED_TAPE:
|
||||
return "red tape";
|
||||
default:
|
||||
@ -178,6 +180,13 @@ void dumpTruffleCharReach(FILE *f, const u8 *hiset, const u8 *hiclear) {
|
||||
describeClass(cr).c_str());
|
||||
}
|
||||
|
||||
static
|
||||
void dumpWideTruffleCharReach(FILE *f, const u8 *mask) {
|
||||
CharReach cr = truffle2crWide(mask);
|
||||
fprintf(f, "count %zu class %s\n", cr.count(),
|
||||
describeClass(cr).c_str());
|
||||
}
|
||||
|
||||
static
|
||||
void dumpTruffleMasks(FILE *f, const u8 *hiset, const u8 *hiclear) {
|
||||
fprintf(f, "lo %s\n", dumpMask(hiset, 128).c_str());
|
||||
@ -210,31 +219,38 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) {
|
||||
break;
|
||||
case ACCEL_SHUFTI: {
|
||||
fprintf(f, "\n");
|
||||
dumpShuftiMasks(f, (const u8 *)&accel.shufti.lo,
|
||||
(const u8 *)&accel.shufti.hi);
|
||||
dumpShuftiCharReach(f, (const u8 *)&accel.shufti.lo,
|
||||
(const u8 *)&accel.shufti.hi);
|
||||
dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.shufti.lo),
|
||||
reinterpret_cast<const u8 *>(&accel.shufti.hi));
|
||||
dumpShuftiCharReach(f, reinterpret_cast<const u8 *>(&accel.shufti.lo),
|
||||
reinterpret_cast<const u8 *>(&accel.shufti.hi));
|
||||
break;
|
||||
}
|
||||
case ACCEL_DSHUFTI:
|
||||
fprintf(f, "\n");
|
||||
fprintf(f, "mask 1\n");
|
||||
dumpShuftiMasks(f, (const u8 *)&accel.dshufti.lo1,
|
||||
(const u8 *)&accel.dshufti.hi1);
|
||||
dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo1),
|
||||
reinterpret_cast<const u8 *>(&accel.dshufti.hi1));
|
||||
fprintf(f, "mask 2\n");
|
||||
dumpShuftiMasks(f, (const u8 *)&accel.dshufti.lo2,
|
||||
(const u8 *)&accel.dshufti.hi2);
|
||||
dumpDShuftiCharReach(f, (const u8 *)&accel.dshufti.lo1,
|
||||
(const u8 *)&accel.dshufti.hi1,
|
||||
(const u8 *)&accel.dshufti.lo2,
|
||||
(const u8 *)&accel.dshufti.hi2);
|
||||
dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo2),
|
||||
reinterpret_cast<const u8 *>(&accel.dshufti.hi2));
|
||||
dumpDShuftiCharReach(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo1),
|
||||
reinterpret_cast<const u8 *>(&accel.dshufti.hi1),
|
||||
reinterpret_cast<const u8 *>(&accel.dshufti.lo2),
|
||||
reinterpret_cast<const u8 *>(&accel.dshufti.hi2));
|
||||
break;
|
||||
case ACCEL_TRUFFLE: {
|
||||
fprintf(f, "\n");
|
||||
dumpTruffleMasks(f, (const u8 *)&accel.truffle.mask1,
|
||||
(const u8 *)&accel.truffle.mask2);
|
||||
dumpTruffleCharReach(f, (const u8 *)&accel.truffle.mask1,
|
||||
(const u8 *)&accel.truffle.mask2);
|
||||
dumpTruffleMasks(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo),
|
||||
reinterpret_cast<const u8 *>(&accel.truffle.mask_hi));
|
||||
dumpTruffleCharReach(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo),
|
||||
reinterpret_cast<const u8 *>(&accel.truffle.mask_hi));
|
||||
break;
|
||||
}
|
||||
case ACCEL_TRUFFLE_WIDE: {
|
||||
fprintf(f, "\n");
|
||||
dumpTruffleMasks(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo),
|
||||
reinterpret_cast<const u8 *>(&accel.truffle.mask_hi));
|
||||
dumpWideTruffleCharReach(f, reinterpret_cast<const u8 *>(&accel.truffle.mask));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user